2018-03-20 02:46:42

by Sinan Kaya

[permalink] [raw]
Subject: [PATCH v4 12/17] net: cxgb4/cxgb4vf: Eliminate duplicate barriers on weakly-ordered archs

Code includes wmb() followed by writel(). writel() already has a barrier on
some architectures like arm64.

As a result, the CPU executes two barriers back to back before performing
the register write.

Create a new wrapper function with relaxed write operator. Use the new
wrapper when a write is following a wmb().

Signed-off-by: Sinan Kaya <[email protected]>
---
drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 6 ++++++
drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 13 +++++++------
drivers/net/ethernet/chelsio/cxgb4/sge.c | 12 ++++++------
drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 2 +-
drivers/net/ethernet/chelsio/cxgb4vf/adapter.h | 14 ++++++++++++++
drivers/net/ethernet/chelsio/cxgb4vf/sge.c | 18 ++++++++++--------
6 files changed, 44 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 9040e13..6bde0b9 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -1202,6 +1202,12 @@ static inline void t4_write_reg(struct adapter *adap, u32 reg_addr, u32 val)
writel(val, adap->regs + reg_addr);
}

+static inline void t4_write_reg_relaxed(struct adapter *adap, u32 reg_addr,
+ u32 val)
+{
+ writel_relaxed(val, adap->regs + reg_addr);
+}
+
#ifndef readq
static inline u64 readq(const volatile void __iomem *addr)
{
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 7b452e8..276472d 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -1723,8 +1723,8 @@ int cxgb4_sync_txq_pidx(struct net_device *dev, u16 qid, u16 pidx,
else
val = PIDX_T5_V(delta);
wmb();
- t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL_A),
- QID_V(qid) | val);
+ t4_write_reg_relaxed(adap, MYPF_REG(SGE_PF_KDOORBELL_A),
+ QID_V(qid) | val);
}
out:
return ret;
@@ -1902,8 +1902,9 @@ static void enable_txq_db(struct adapter *adap, struct sge_txq *q)
* are committed before we tell HW about them.
*/
wmb();
- t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL_A),
- QID_V(q->cntxt_id) | PIDX_V(q->db_pidx_inc));
+ t4_write_reg_relaxed(adap, MYPF_REG(SGE_PF_KDOORBELL_A),
+ QID_V(q->cntxt_id) |
+ PIDX_V(q->db_pidx_inc));
q->db_pidx_inc = 0;
}
q->db_disabled = 0;
@@ -2003,8 +2004,8 @@ static void sync_txq_pidx(struct adapter *adap, struct sge_txq *q)
else
val = PIDX_T5_V(delta);
wmb();
- t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL_A),
- QID_V(q->cntxt_id) | val);
+ t4_write_reg_relaxed(adap, MYPF_REG(SGE_PF_KDOORBELL_A),
+ QID_V(q->cntxt_id) | val);
}
out:
q->db_disabled = 0;
diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c
index 6e310a0..7388aac 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c
@@ -530,11 +530,11 @@ static inline void ring_fl_db(struct adapter *adap, struct sge_fl *q)
* mechanism.
*/
if (unlikely(q->bar2_addr == NULL)) {
- t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL_A),
- val | QID_V(q->cntxt_id));
+ t4_write_reg_relaxed(adap, MYPF_REG(SGE_PF_KDOORBELL_A),
+ val | QID_V(q->cntxt_id));
} else {
- writel(val | QID_V(q->bar2_qid),
- q->bar2_addr + SGE_UDB_KDOORBELL);
+ writel_relaxed(val | QID_V(q->bar2_qid),
+ q->bar2_addr + SGE_UDB_KDOORBELL);

/* This Write memory Barrier will force the write to
* the User Doorbell area to be flushed.
@@ -986,8 +986,8 @@ inline void cxgb4_ring_tx_db(struct adapter *adap, struct sge_txq *q, int n)
(q->bar2_addr + SGE_UDB_WCDOORBELL),
wr);
} else {
- writel(val | QID_V(q->bar2_qid),
- q->bar2_addr + SGE_UDB_KDOORBELL);
+ writel_relaxed(val | QID_V(q->bar2_qid),
+ q->bar2_addr + SGE_UDB_KDOORBELL);
}

/* This Write Memory Barrier will force the write to the User
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index 920bccd..8b723a0 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -139,7 +139,7 @@ void t4_write_indirect(struct adapter *adap, unsigned int addr_reg,
{
while (nregs--) {
t4_write_reg(adap, addr_reg, start_idx++);
- t4_write_reg(adap, data_reg, *vals++);
+ t4_write_reg_relaxed(adap, data_reg, *vals++);
}
}

diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/adapter.h b/drivers/net/ethernet/chelsio/cxgb4vf/adapter.h
index 5883f09..00247be4 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/adapter.h
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/adapter.h
@@ -442,6 +442,20 @@ static inline void t4_write_reg(struct adapter *adapter, u32 reg_addr, u32 val)
writel(val, adapter->regs + reg_addr);
}

+/**
+ * t4_write_reg_relaxed - write a HW register without ordering guarantees
+ * @adapter: the adapter
+ * @reg_addr: the register address
+ * @val: the value to write
+ *
+ * Write a 32-bit value into the given HW register.
+ */
+static inline void t4_write_reg_relaxed(struct adapter *adapter, u32 reg_addr,
+ u32 val)
+{
+ writel_relaxed(val, adapter->regs + reg_addr);
+}
+
#ifndef readq
static inline u64 readq(const volatile void __iomem *addr)
{
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
index dfce5df..a3a420b 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
@@ -546,12 +546,13 @@ static inline void ring_fl_db(struct adapter *adapter, struct sge_fl *fl)
* mechanism.
*/
if (unlikely(fl->bar2_addr == NULL)) {
- t4_write_reg(adapter,
- T4VF_SGE_BASE_ADDR + SGE_VF_KDOORBELL,
- QID_V(fl->cntxt_id) | val);
+ t4_write_reg_relaxed(adapter,
+ T4VF_SGE_BASE_ADDR +
+ SGE_VF_KDOORBELL,
+ QID_V(fl->cntxt_id) | val);
} else {
- writel(val | QID_V(fl->bar2_qid),
- fl->bar2_addr + SGE_UDB_KDOORBELL);
+ writel_relaxed(val | QID_V(fl->bar2_qid),
+ fl->bar2_addr + SGE_UDB_KDOORBELL);

/* This Write memory Barrier will force the write to
* the User Doorbell area to be flushed.
@@ -980,8 +981,9 @@ static inline void ring_tx_db(struct adapter *adapter, struct sge_txq *tq,
if (unlikely(tq->bar2_addr == NULL)) {
u32 val = PIDX_V(n);

- t4_write_reg(adapter, T4VF_SGE_BASE_ADDR + SGE_VF_KDOORBELL,
- QID_V(tq->cntxt_id) | val);
+ t4_write_reg_relaxed(adapter,
+ T4VF_SGE_BASE_ADDR + SGE_VF_KDOORBELL,
+ QID_V(tq->cntxt_id) | val);
} else {
u32 val = PIDX_T5_V(n);

@@ -1026,7 +1028,7 @@ static inline void ring_tx_db(struct adapter *adapter, struct sge_txq *tq,
count--;
}
} else
- writel(val | QID_V(tq->bar2_qid),
+ writel_relaxed(val | QID_V(tq->bar2_qid),
tq->bar2_addr + SGE_UDB_KDOORBELL);

/* This Write Memory Barrier will force the write to the User
--
2.7.4



2018-03-21 23:04:31

by Casey Leedom

[permalink] [raw]
Subject: Re: [PATCH v4 12/17] net: cxgb4/cxgb4vf: Eliminate duplicate barriers on weakly-ordered archs

[[ Apologies for the DUPLICATE email. I forgot to tell my Mail Agent to
use Plain Text. -- Casey ]]

I feel very uncomfortable with these proposed changes. Our team is right
in the middle of trying to tease our way through the various platform
implementations of writel(), writel_relaxed(), __raw_writel(), etc. in order
to support x86, PowerPC, ARM, etc. with a single code base. This is
complicated by the somewhat ... "fuzzily defined" semantics and varying
platform implementations of all of these APIs. (And note that I'm just
picking writel() as an example.)

Additionally, many of the changes aren't even in fast paths and are thus
unneeded for performance.

Please don't make these changes. We're trying to get this all sussed out.

Casey



From: Sinan Kaya <[email protected]>
Sent: Monday, March 19, 2018 7:42:27 PM
To: [email protected]; [email protected]; [email protected]
Cc: [email protected]; [email protected]; Sinan Kaya; Ganesh GR; Casey Leedom; [email protected]
Subject: [PATCH v4 12/17] net: cxgb4/cxgb4vf: Eliminate duplicate barriers on weakly-ordered archs

Code includes wmb() followed by writel(). writel() already has a barrier on
some architectures like arm64.

This ends up CPU observing two barriers back to back before executing the
register write.

Create a new wrapper function with relaxed write operator. Use the new
wrapper when a write is following a wmb().

Signed-off-by: Sinan Kaya <[email protected]>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h      |  6 ++++++
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 13 +++++++------
 drivers/net/ethernet/chelsio/cxgb4/sge.c        | 12 ++++++------
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c      |  2 +-
 drivers/net/ethernet/chelsio/cxgb4vf/adapter.h  | 14 ++++++++++++++
 drivers/net/ethernet/chelsio/cxgb4vf/sge.c      | 18 ++++++++++--------
 6 files changed, 44 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 9040e13..6bde0b9 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -1202,6 +1202,12 @@ static inline void t4_write_reg(struct adapter *adap, u32 reg_addr, u32 val)
         writel(val, adap->regs + reg_addr);
 }
 
+static inline void t4_write_reg_relaxed(struct adapter *adap, u32 reg_addr,
+                                       u32 val)
+{
+       writel_relaxed(val, adap->regs + reg_addr);
+}
+
 #ifndef readq
 static inline u64 readq(const volatile void __iomem *addr)
 {
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 7b452e8..276472d 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -1723,8 +1723,8 @@ int cxgb4_sync_txq_pidx(struct net_device *dev, u16 qid, u16 pidx,
                 else
                         val = PIDX_T5_V(delta);
                 wmb();
-               t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL_A),
-                            QID_V(qid) | val);
+               t4_write_reg_relaxed(adap, MYPF_REG(SGE_PF_KDOORBELL_A),
+                                    QID_V(qid) | val);
         }
 out:
         return ret;
@@ -1902,8 +1902,9 @@ static void enable_txq_db(struct adapter *adap, struct sge_txq *q)
                  * are committed before we tell HW about them.
                  */
                 wmb();
-               t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL_A),
-                            QID_V(q->cntxt_id) | PIDX_V(q->db_pidx_inc));
+               t4_write_reg_relaxed(adap, MYPF_REG(SGE_PF_KDOORBELL_A),
+                                    QID_V(q->cntxt_id) |
+                                               PIDX_V(q->db_pidx_inc));
                 q->db_pidx_inc = 0;
         }
         q->db_disabled = 0;
@@ -2003,8 +2004,8 @@ static void sync_txq_pidx(struct adapter *adap, struct sge_txq *q)
                 else
                         val = PIDX_T5_V(delta);
                 wmb();
-               t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL_A),
-                            QID_V(q->cntxt_id) | val);
+               t4_write_reg_relaxed(adap, MYPF_REG(SGE_PF_KDOORBELL_A),
+                                    QID_V(q->cntxt_id) | val);
         }
 out:
         q->db_disabled = 0;
diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c
index 6e310a0..7388aac 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c
@@ -530,11 +530,11 @@ static inline void ring_fl_db(struct adapter *adap, struct sge_fl *q)
                  * mechanism.
                  */
                 if (unlikely(q->bar2_addr == NULL)) {
-                       t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL_A),
-                                    val | QID_V(q->cntxt_id));
+                       t4_write_reg_relaxed(adap, MYPF_REG(SGE_PF_KDOORBELL_A),
+                                            val | QID_V(q->cntxt_id));
                 } else {
-                       writel(val | QID_V(q->bar2_qid),
-                              q->bar2_addr + SGE_UDB_KDOORBELL);
+                       writel_relaxed(val | QID_V(q->bar2_qid),
+                                      q->bar2_addr + SGE_UDB_KDOORBELL);
 
                         /* This Write memory Barrier will force the write to
                          * the User Doorbell area to be flushed.
@@ -986,8 +986,8 @@ inline void cxgb4_ring_tx_db(struct adapter *adap, struct sge_txq *q, int n)
                                       (q->bar2_addr + SGE_UDB_WCDOORBELL),
                                       wr);
                 } else {
-                       writel(val | QID_V(q->bar2_qid),
-                              q->bar2_addr + SGE_UDB_KDOORBELL);
+                       writel_relaxed(val | QID_V(q->bar2_qid),
+                                      q->bar2_addr + SGE_UDB_KDOORBELL);
                 }
 
                 /* This Write Memory Barrier will force the write to the User
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index 920bccd..8b723a0 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -139,7 +139,7 @@ void t4_write_indirect(struct adapter *adap, unsigned int addr_reg,
 {
         while (nregs--) {
                 t4_write_reg(adap, addr_reg, start_idx++);
-               t4_write_reg(adap, data_reg, *vals++);
+               t4_write_reg_relaxed(adap, data_reg, *vals++);
         }
 }
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/adapter.h b/drivers/net/ethernet/chelsio/cxgb4vf/adapter.h
index 5883f09..00247be4 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/adapter.h
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/adapter.h
@@ -442,6 +442,20 @@ static inline void t4_write_reg(struct adapter *adapter, u32 reg_addr, u32 val)
         writel(val, adapter->regs + reg_addr);
 }
 
+/**
+ * t4_write_reg_relaxed - write a HW register without ordering guarantees
+ * @adapter: the adapter
+ * @reg_addr: the register address
+ * @val: the value to write
+ *
+ * Write a 32-bit value into the given HW register.
+ */
+static inline void t4_write_reg_relaxed(struct adapter *adapter, u32 reg_addr,
+                                       u32 val)
+{
+       writel_relaxed(val, adapter->regs + reg_addr);
+}
+
 #ifndef readq
 static inline u64 readq(const volatile void __iomem *addr)
 {
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
index dfce5df..a3a420b 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
@@ -546,12 +546,13 @@ static inline void ring_fl_db(struct adapter *adapter, struct sge_fl *fl)
                  * mechanism.
                  */
                 if (unlikely(fl->bar2_addr == NULL)) {
-                       t4_write_reg(adapter,
-                                    T4VF_SGE_BASE_ADDR + SGE_VF_KDOORBELL,
-                                    QID_V(fl->cntxt_id) | val);
+                       t4_write_reg_relaxed(adapter,
+                                            T4VF_SGE_BASE_ADDR +
+                                                       SGE_VF_KDOORBELL,
+                                            QID_V(fl->cntxt_id) | val);
                 } else {
-                       writel(val | QID_V(fl->bar2_qid),
-                              fl->bar2_addr + SGE_UDB_KDOORBELL);
+                       writel_relaxed(val | QID_V(fl->bar2_qid),
+                                      fl->bar2_addr + SGE_UDB_KDOORBELL);
 
                         /* This Write memory Barrier will force the write to
                          * the User Doorbell area to be flushed.
@@ -980,8 +981,9 @@ static inline void ring_tx_db(struct adapter *adapter, struct sge_txq *tq,
         if (unlikely(tq->bar2_addr == NULL)) {
                 u32 val = PIDX_V(n);
 
-               t4_write_reg(adapter, T4VF_SGE_BASE_ADDR + SGE_VF_KDOORBELL,
-                            QID_V(tq->cntxt_id) | val);
+               t4_write_reg_relaxed(adapter,
+                                    T4VF_SGE_BASE_ADDR + SGE_VF_KDOORBELL,
+                                    QID_V(tq->cntxt_id) | val);
         } else {
                 u32 val = PIDX_T5_V(n);
 
@@ -1026,7 +1028,7 @@ static inline void ring_tx_db(struct adapter *adapter, struct sge_txq *tq,
                                 count--;
                         }
                 } else
-                       writel(val | QID_V(tq->bar2_qid),
+                       writel_relaxed(val | QID_V(tq->bar2_qid),
                                tq->bar2_addr + SGE_UDB_KDOORBELL);
 
                 /* This Write Memory Barrier will force the write to the User
--
2.7.4


2018-03-22 00:03:17

by Sinan Kaya

[permalink] [raw]
Subject: Re: [PATCH v4 12/17] net: cxgb4/cxgb4vf: Eliminate duplicate barriers on weakly-ordered archs

On 2018-03-21 19:03, Casey Leedom wrote:
> [[ Apologies for the DUPLICATE email. I forgot to tell my Mail Agent
> to
> use Plain Text. -- Casey ]]
>
> I feel very uncomfortable with these proposed changes. Our team is
> right
> in the middle of trying to tease our way through the various platform
> implementations of writel(), writel_relaxed(), __raw_writel(), etc. in
> order
> to support x86, PowerPC, ARM, etc. with a single code base. This is
> complicated by the somewhat ... "fuzzily defined" semantics and varying
> platform implementations of all of these APIs. (And note that I'm just
> picking writel() as an example.)
>
> Additionally, many of the changes aren't even in fast paths and are
> thus
> unneeded for performance.
>
> Please don't make these changes. We're trying to get this all sussed
> out.
>

I was also given the feedback to look at performance critical path only.
I am in the process of revisiting the patches.

If you can point me to the ones that are important, I can try to limit
the changes to those only.

If your team wants to do it, I can drop this patch as well.

I think the semantics of the write API are clear. What was actually
implemented is another story.

I can share a few of my findings.

A portable driver needs to do this.

descriptor update in mem
wmb ()
writel_relaxed ()
mmiowb ()

Using __raw_writel() is wrong as it can get reordered.

Using wmb()+writel() is also wrong for performance reasons.

If something is unclear, please ask.

>