Currently for each timestamp frame, the SW needs to go and read the
received timestamp over the MDIO bus. But the HW has the capability
to store the received nanoseconds part and the least significant two
bits of the seconds in the reserved field of the PTP header. In this
way we could save few mdio transactions (actually a little more
transactions because the access to the PTP registers are indirect)
for each received frame. But is still required to read the seconds
part of each received frame.
Doing these changes to start to get the received timestamp in the
reserved field of the header, will give a great CPU usage performance.
Running ptp4l with logSyncInterval of -9 will give a ~50% CPU
improvment.
Signed-off-by: Horatiu Vultur <[email protected]>
---
drivers/net/phy/micrel.c | 231 +++++++++++++++++++++++----------------
1 file changed, 139 insertions(+), 92 deletions(-)
diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index 0ff4fd9f1a183..28365006b2067 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -3502,6 +3502,9 @@ static int lan8814_probe(struct phy_device *phydev)
#define LAN8841_PTP_CMD_CTL_PTP_RESET BIT(0)
#define LAN8841_PTP_RX_PARSE_CONFIG 368
#define LAN8841_PTP_TX_PARSE_CONFIG 432
+#define LAN8841_PTP_RX_MODE 381
+#define LAN8841_PTP_INSERT_TS_EN BIT(0)
+#define LAN8841_PTP_INSERT_TS_32BIT BIT(1)
static int lan8841_config_init(struct phy_device *phydev)
{
@@ -3650,68 +3653,18 @@ static void lan8841_ptp_process_tx_ts(struct kszphy_ptp_priv *ptp_priv)
lan8814_match_tx_skb(ptp_priv, sec, nsec, seq);
}
-#define LAN8841_PTP_RX_INGRESS_SEC_LO 389
-#define LAN8841_PTP_RX_INGRESS_SEC_HI 388
-#define LAN8841_PTP_RX_INGRESS_NS_LO 387
-#define LAN8841_PTP_RX_INGRESS_NS_HI 386
-#define LAN8841_PTP_RX_INGRESS_NSEC_HI_VALID BIT(15)
-#define LAN8841_PTP_RX_MSG_HEADER2 391
-
-static struct lan8814_ptp_rx_ts *lan8841_ptp_get_rx_ts(struct kszphy_ptp_priv *ptp_priv)
-{
- struct phy_device *phydev = ptp_priv->phydev;
- struct lan8814_ptp_rx_ts *rx_ts;
- u32 sec, nsec;
- u16 seq;
-
- nsec = phy_read_mmd(phydev, 2, LAN8841_PTP_RX_INGRESS_NS_HI);
- if (!(nsec & LAN8841_PTP_RX_INGRESS_NSEC_HI_VALID))
- return NULL;
-
- nsec = ((nsec & 0x3fff) << 16);
- nsec = nsec | phy_read_mmd(phydev, 2, LAN8841_PTP_RX_INGRESS_NS_LO);
-
- sec = phy_read_mmd(phydev, 2, LAN8841_PTP_RX_INGRESS_SEC_HI);
- sec = sec << 16;
- sec = sec | phy_read_mmd(phydev, 2, LAN8841_PTP_RX_INGRESS_SEC_LO);
-
- seq = phy_read_mmd(phydev, 2, LAN8841_PTP_RX_MSG_HEADER2);
-
- rx_ts = kzalloc(sizeof(*rx_ts), GFP_KERNEL);
- if (!rx_ts)
- return NULL;
-
- rx_ts->seconds = sec;
- rx_ts->nsec = nsec;
- rx_ts->seq_id = seq;
-
- return rx_ts;
-}
-
-static void lan8841_ptp_process_rx_ts(struct kszphy_ptp_priv *ptp_priv)
-{
- struct lan8814_ptp_rx_ts *rx_ts;
-
- while ((rx_ts = lan8841_ptp_get_rx_ts(ptp_priv)) != NULL)
- lan8814_match_rx_ts(ptp_priv, rx_ts);
-}
-
#define LAN8841_PTP_INT_STS 259
#define LAN8841_PTP_INT_STS_PTP_TX_TS_OVRFL_INT BIT(13)
#define LAN8841_PTP_INT_STS_PTP_TX_TS_INT BIT(12)
-#define LAN8841_PTP_INT_STS_PTP_RX_TS_OVRFL_INT BIT(9)
-#define LAN8841_PTP_INT_STS_PTP_RX_TS_INT BIT(8)
#define LAN8841_PTP_INT_STS_PTP_GPIO_CAP_INT BIT(2)
-static void lan8841_ptp_flush_fifo(struct kszphy_ptp_priv *ptp_priv, bool egress)
+static void lan8841_ptp_flush_fifo(struct kszphy_ptp_priv *ptp_priv)
{
struct phy_device *phydev = ptp_priv->phydev;
int i;
for (i = 0; i < FIFO_SIZE; ++i)
- phy_read_mmd(phydev, 2,
- egress ? LAN8841_PTP_TX_MSG_HEADER2 :
- LAN8841_PTP_RX_MSG_HEADER2);
+ phy_read_mmd(phydev, 2, LAN8841_PTP_TX_MSG_HEADER2);
phy_read_mmd(phydev, 2, LAN8841_PTP_INT_STS);
}
@@ -3789,23 +3742,17 @@ static void lan8841_handle_ptp_interrupt(struct phy_device *phydev)
if (status & LAN8841_PTP_INT_STS_PTP_TX_TS_INT)
lan8841_ptp_process_tx_ts(ptp_priv);
- if (status & LAN8841_PTP_INT_STS_PTP_RX_TS_INT)
- lan8841_ptp_process_rx_ts(ptp_priv);
-
if (status & LAN8841_PTP_INT_STS_PTP_GPIO_CAP_INT)
lan8841_gpio_process_cap(ptp_priv);
if (status & LAN8841_PTP_INT_STS_PTP_TX_TS_OVRFL_INT) {
- lan8841_ptp_flush_fifo(ptp_priv, true);
+ lan8841_ptp_flush_fifo(ptp_priv);
skb_queue_purge(&ptp_priv->tx_queue);
}
- if (status & LAN8841_PTP_INT_STS_PTP_RX_TS_OVRFL_INT) {
- lan8841_ptp_flush_fifo(ptp_priv, false);
- skb_queue_purge(&ptp_priv->rx_queue);
- }
-
- } while (status);
+ } while (status & (LAN8841_PTP_INT_STS_PTP_TX_TS_INT |
+ LAN8841_PTP_INT_STS_PTP_GPIO_CAP_INT |
+ LAN8841_PTP_INT_STS_PTP_TX_TS_OVRFL_INT));
}
#define LAN8841_INTS_PTP BIT(9)
@@ -3869,32 +3816,44 @@ static int lan8841_ts_info(struct mii_timestamper *mii_ts,
#define LAN8841_PTP_INT_EN 260
#define LAN8841_PTP_INT_EN_PTP_TX_TS_OVRFL_EN BIT(13)
#define LAN8841_PTP_INT_EN_PTP_TX_TS_EN BIT(12)
-#define LAN8841_PTP_INT_EN_PTP_RX_TS_OVRFL_EN BIT(9)
-#define LAN8841_PTP_INT_EN_PTP_RX_TS_EN BIT(8)
-static void lan8841_ptp_enable_int(struct kszphy_ptp_priv *ptp_priv,
- bool enable)
+static void lan8841_ptp_enable_processing(struct kszphy_ptp_priv *ptp_priv,
+ bool enable)
{
struct phy_device *phydev = ptp_priv->phydev;
- if (enable)
- /* Enable interrupts */
+ if (enable) {
+ /* Enable interrupts on the TX side */
phy_modify_mmd(phydev, 2, LAN8841_PTP_INT_EN,
LAN8841_PTP_INT_EN_PTP_TX_TS_OVRFL_EN |
- LAN8841_PTP_INT_EN_PTP_RX_TS_OVRFL_EN |
- LAN8841_PTP_INT_EN_PTP_TX_TS_EN |
- LAN8841_PTP_INT_EN_PTP_RX_TS_EN,
+ LAN8841_PTP_INT_EN_PTP_TX_TS_EN,
LAN8841_PTP_INT_EN_PTP_TX_TS_OVRFL_EN |
- LAN8841_PTP_INT_EN_PTP_RX_TS_OVRFL_EN |
- LAN8841_PTP_INT_EN_PTP_TX_TS_EN |
- LAN8841_PTP_INT_EN_PTP_RX_TS_EN);
- else
- /* Disable interrupts */
+ LAN8841_PTP_INT_EN_PTP_TX_TS_EN);
+
+ /* Enable the modification of the frame on RX side,
+ * this will add the ns and 2 bits of sec in the reserved field
+ * of the PTP header
+ */
+ phy_modify_mmd(phydev, KSZ9131RN_MMD_COMMON_CTRL_REG,
+ LAN8841_PTP_RX_MODE,
+ LAN8841_PTP_INSERT_TS_EN |
+ LAN8841_PTP_INSERT_TS_32BIT,
+ LAN8841_PTP_INSERT_TS_EN |
+ LAN8841_PTP_INSERT_TS_32BIT);
+ } else {
+ /* Disable interrupts on the TX side */
phy_modify_mmd(phydev, 2, LAN8841_PTP_INT_EN,
LAN8841_PTP_INT_EN_PTP_TX_TS_OVRFL_EN |
- LAN8841_PTP_INT_EN_PTP_RX_TS_OVRFL_EN |
- LAN8841_PTP_INT_EN_PTP_TX_TS_EN |
- LAN8841_PTP_INT_EN_PTP_RX_TS_EN, 0);
+ LAN8841_PTP_INT_EN_PTP_TX_TS_EN, 0);
+
+ /* Disable modification of the RX frames */
+ phy_modify_mmd(phydev, KSZ9131RN_MMD_COMMON_CTRL_REG,
+ LAN8841_PTP_RX_MODE,
+ LAN8841_PTP_INSERT_TS_EN |
+ LAN8841_PTP_INSERT_TS_32BIT, 0);
+
+ ptp_cancel_worker_sync(ptp_priv->ptp_clock);
+ }
}
#define LAN8841_PTP_RX_TIMESTAMP_EN 379
@@ -3905,7 +3864,6 @@ static int lan8841_hwtstamp(struct mii_timestamper *mii_ts, struct ifreq *ifr)
{
struct kszphy_ptp_priv *ptp_priv = container_of(mii_ts, struct kszphy_ptp_priv, mii_ts);
struct phy_device *phydev = ptp_priv->phydev;
- struct lan8814_ptp_rx_ts *rx_ts, *tmp;
struct hwtstamp_config config;
int txcfg = 0, rxcfg = 0;
int pkt_ts_enable;
@@ -3969,24 +3927,47 @@ static int lan8841_hwtstamp(struct mii_timestamper *mii_ts, struct ifreq *ifr)
PTP_TX_MOD_TX_PTP_SYNC_TS_INSERT_ : 0);
/* Now enable/disable the timestamping */
- lan8841_ptp_enable_int(ptp_priv,
- config.rx_filter != HWTSTAMP_FILTER_NONE);
-
- /* In case of multiple starts and stops, these needs to be cleared */
- list_for_each_entry_safe(rx_ts, tmp, &ptp_priv->rx_ts_list, list) {
- list_del(&rx_ts->list);
- kfree(rx_ts);
- }
+ lan8841_ptp_enable_processing(ptp_priv,
+ config.rx_filter != HWTSTAMP_FILTER_NONE);
skb_queue_purge(&ptp_priv->rx_queue);
skb_queue_purge(&ptp_priv->tx_queue);
- lan8841_ptp_flush_fifo(ptp_priv, false);
- lan8841_ptp_flush_fifo(ptp_priv, true);
+ lan8841_ptp_flush_fifo(ptp_priv);
return copy_to_user(ifr->ifr_data, &config, sizeof(config)) ? -EFAULT : 0;
}
+#define LAN8841_SKB_CB(skb) ((struct lan8841_skb_cb *)(skb)->cb)
+
+struct lan8841_skb_cb {
+ struct ptp_header *header;
+};
+
+static bool lan8841_rxtstamp(struct mii_timestamper *mii_ts,
+ struct sk_buff *skb, int type)
+{
+ struct kszphy_ptp_priv *ptp_priv =
+ container_of(mii_ts, struct kszphy_ptp_priv, mii_ts);
+ struct ptp_header *header = ptp_parse_header(skb, type);
+
+ if (!header)
+ return false;
+
+ if (ptp_priv->rx_filter == HWTSTAMP_FILTER_NONE ||
+ type == PTP_CLASS_NONE)
+ return false;
+
+ if ((type & ptp_priv->version) == 0 || (type & ptp_priv->layer) == 0)
+ return false;
+
+ LAN8841_SKB_CB(skb)->header = header;
+ skb_queue_tail(&ptp_priv->rx_queue, skb);
+ ptp_schedule_worker(ptp_priv->ptp_clock, 0);
+
+ return true;
+}
+
#define LAN8841_EVENT_A 0
#define LAN8841_EVENT_B 1
#define LAN8841_PTP_LTC_TARGET_SEC_HI(event) ((event) == LAN8841_EVENT_A ? 278 : 288)
@@ -4127,6 +4108,30 @@ static int lan8841_ptp_gettime64(struct ptp_clock_info *ptp,
return 0;
}
+static void lan8841_ptp_getseconds(struct ptp_clock_info *ptp,
+ struct timespec64 *ts)
+{
+ struct kszphy_ptp_priv *ptp_priv = container_of(ptp, struct kszphy_ptp_priv,
+ ptp_clock_info);
+ struct phy_device *phydev = ptp_priv->phydev;
+ time64_t s;
+
+ mutex_lock(&ptp_priv->ptp_lock);
+ /* Issue the command to read the LTC */
+ phy_write_mmd(phydev, 2, LAN8841_PTP_CMD_CTL,
+ LAN8841_PTP_CMD_CTL_PTP_LTC_READ);
+
+ /* Read the LTC */
+ s = phy_read_mmd(phydev, 2, LAN8841_PTP_LTC_RD_SEC_HI);
+ s <<= 16;
+ s |= phy_read_mmd(phydev, 2, LAN8841_PTP_LTC_RD_SEC_MID);
+ s <<= 16;
+ s |= phy_read_mmd(phydev, 2, LAN8841_PTP_LTC_RD_SEC_LO);
+ mutex_unlock(&ptp_priv->ptp_lock);
+
+ set_normalized_timespec64(ts, s, 0);
+}
+
#define LAN8841_PTP_LTC_STEP_ADJ_LO 276
#define LAN8841_PTP_LTC_STEP_ADJ_HI 275
#define LAN8841_PTP_LTC_STEP_ADJ_DIR BIT(15)
@@ -4629,6 +4634,37 @@ static int lan8841_ptp_enable(struct ptp_clock_info *ptp,
return 0;
}
+static long lan8841_ptp_do_aux_work(struct ptp_clock_info *ptp)
+{
+ struct kszphy_ptp_priv *ptp_priv = container_of(ptp, struct kszphy_ptp_priv,
+ ptp_clock_info);
+ struct skb_shared_hwtstamps *shhwtstamps;
+ struct timespec64 ts;
+ struct sk_buff *skb;
+ u32 ts_header;
+
+ while ((skb = skb_dequeue(&ptp_priv->rx_queue)) != NULL) {
+ lan8841_ptp_getseconds(ptp, &ts);
+ ts_header = __be32_to_cpu(LAN8841_SKB_CB(skb)->header->reserved2);
+
+ shhwtstamps = skb_hwtstamps(skb);
+ memset(shhwtstamps, 0, sizeof(*shhwtstamps));
+
+ /* Check for any wrap arounds for the second part */
+ if ((ts.tv_sec & GENMASK(1, 0)) < ts_header >> 30)
+ ts.tv_sec -= GENMASK(1, 0) + 1;
+
+ shhwtstamps->hwtstamp =
+ ktime_set((ts.tv_sec & ~(GENMASK(1, 0))) | ts_header >> 30,
+ ts_header & GENMASK(29, 0));
+ LAN8841_SKB_CB(skb)->header->reserved2 = 0;
+
+ netif_rx(skb);
+ }
+
+ return -1;
+}
+
static struct ptp_clock_info lan8841_ptp_clock_info = {
.owner = THIS_MODULE,
.name = "lan8841 ptp",
@@ -4639,6 +4675,7 @@ static struct ptp_clock_info lan8841_ptp_clock_info = {
.adjfine = lan8841_ptp_adjfine,
.verify = lan8841_ptp_verify,
.enable = lan8841_ptp_enable,
+ .do_aux_work = lan8841_ptp_do_aux_work,
.n_per_out = LAN8841_PTP_GPIO_NUM,
.n_ext_ts = LAN8841_PTP_GPIO_NUM,
.n_pins = LAN8841_PTP_GPIO_NUM,
@@ -4705,7 +4742,7 @@ static int lan8841_probe(struct phy_device *phydev)
ptp_priv->phydev = phydev;
mutex_init(&ptp_priv->ptp_lock);
- ptp_priv->mii_ts.rxtstamp = lan8814_rxtstamp;
+ ptp_priv->mii_ts.rxtstamp = lan8841_rxtstamp;
ptp_priv->mii_ts.txtstamp = lan8814_txtstamp;
ptp_priv->mii_ts.hwtstamp = lan8841_hwtstamp;
ptp_priv->mii_ts.ts_info = lan8841_ts_info;
@@ -4715,6 +4752,16 @@ static int lan8841_probe(struct phy_device *phydev)
return 0;
}
+static int lan8841_suspend(struct phy_device *phydev)
+{
+ struct kszphy_priv *priv = phydev->priv;
+ struct kszphy_ptp_priv *ptp_priv = &priv->ptp_priv;
+
+ ptp_cancel_worker_sync(ptp_priv->ptp_clock);
+
+ return genphy_suspend(phydev);
+}
+
static struct phy_driver ksphy_driver[] = {
{
.phy_id = PHY_ID_KS8737,
@@ -4938,7 +4985,7 @@ static struct phy_driver ksphy_driver[] = {
.get_sset_count = kszphy_get_sset_count,
.get_strings = kszphy_get_strings,
.get_stats = kszphy_get_stats,
- .suspend = genphy_suspend,
+ .suspend = lan8841_suspend,
.resume = genphy_resume,
.cable_test_start = lan8814_cable_test_start,
.cable_test_get_status = ksz886x_cable_test_get_status,
--
2.38.0
On Wed, Jun 07, 2023 at 09:09:48AM +0200, Horatiu Vultur wrote:
> Doing these changes to start to get the received timestamp in the
> reserved field of the header, will give a great CPU usage performance.
> Running ptp4l with logSyncInterval of -9 will give a ~50% CPU
> improvment.
Really?
> -static struct lan8814_ptp_rx_ts *lan8841_ptp_get_rx_ts(struct kszphy_ptp_priv *ptp_priv)
> -{
> - struct phy_device *phydev = ptp_priv->phydev;
> - struct lan8814_ptp_rx_ts *rx_ts;
> - u32 sec, nsec;
> - u16 seq;
> -
> - nsec = phy_read_mmd(phydev, 2, LAN8841_PTP_RX_INGRESS_NS_HI);
> - if (!(nsec & LAN8841_PTP_RX_INGRESS_NSEC_HI_VALID))
> - return NULL;
> -
> - nsec = ((nsec & 0x3fff) << 16);
> - nsec = nsec | phy_read_mmd(phydev, 2, LAN8841_PTP_RX_INGRESS_NS_LO);
> -
> - sec = phy_read_mmd(phydev, 2, LAN8841_PTP_RX_INGRESS_SEC_HI);
> - sec = sec << 16;
> - sec = sec | phy_read_mmd(phydev, 2, LAN8841_PTP_RX_INGRESS_SEC_LO);
> -
> - seq = phy_read_mmd(phydev, 2, LAN8841_PTP_RX_MSG_HEADER2);
Before: 5x phy_read_mmd() per frame ...
> - rx_ts = kzalloc(sizeof(*rx_ts), GFP_KERNEL);
> - if (!rx_ts)
> - return NULL;
> -
> - rx_ts->seconds = sec;
> - rx_ts->nsec = nsec;
> - rx_ts->seq_id = seq;
> -
> - return rx_ts;
> -}
> +static void lan8841_ptp_getseconds(struct ptp_clock_info *ptp,
> + struct timespec64 *ts)
> +{
> + struct kszphy_ptp_priv *ptp_priv = container_of(ptp, struct kszphy_ptp_priv,
> + ptp_clock_info);
> + struct phy_device *phydev = ptp_priv->phydev;
> + time64_t s;
> +
> + mutex_lock(&ptp_priv->ptp_lock);
> + /* Issue the command to read the LTC */
> + phy_write_mmd(phydev, 2, LAN8841_PTP_CMD_CTL,
> + LAN8841_PTP_CMD_CTL_PTP_LTC_READ);
> +
> + /* Read the LTC */
> + s = phy_read_mmd(phydev, 2, LAN8841_PTP_LTC_RD_SEC_HI);
> + s <<= 16;
> + s |= phy_read_mmd(phydev, 2, LAN8841_PTP_LTC_RD_SEC_MID);
> + s <<= 16;
> + s |= phy_read_mmd(phydev, 2, LAN8841_PTP_LTC_RD_SEC_LO);
After: 4x phy_read_mmd() per frame. How does that save 50% cpu?
> + mutex_unlock(&ptp_priv->ptp_lock);
> +
> + set_normalized_timespec64(ts, s, 0);
> +}
> +static long lan8841_ptp_do_aux_work(struct ptp_clock_info *ptp)
> +{
> + struct kszphy_ptp_priv *ptp_priv = container_of(ptp, struct kszphy_ptp_priv,
> + ptp_clock_info);
> + struct skb_shared_hwtstamps *shhwtstamps;
> + struct timespec64 ts;
> + struct sk_buff *skb;
> + u32 ts_header;
> +
> + while ((skb = skb_dequeue(&ptp_priv->rx_queue)) != NULL) {
> + lan8841_ptp_getseconds(ptp, &ts);
No need to call this once per frame. It would be sufficent to call it
once every 2 seconds and cache the result.
> + ts_header = __be32_to_cpu(LAN8841_SKB_CB(skb)->header->reserved2);
> +
> + shhwtstamps = skb_hwtstamps(skb);
> + memset(shhwtstamps, 0, sizeof(*shhwtstamps));
> +
> + /* Check for any wrap arounds for the second part */
> + if ((ts.tv_sec & GENMASK(1, 0)) < ts_header >> 30)
> + ts.tv_sec -= GENMASK(1, 0) + 1;
> +
> + shhwtstamps->hwtstamp =
> + ktime_set((ts.tv_sec & ~(GENMASK(1, 0))) | ts_header >> 30,
> + ts_header & GENMASK(29, 0));
> + LAN8841_SKB_CB(skb)->header->reserved2 = 0;
> +
> + netif_rx(skb);
> + }
> +
> + return -1;
> +}
Thanks,
Richard
On Wed, Jun 07, 2023 at 10:47:16AM -0700, Richard Cochran wrote:
> On Wed, Jun 07, 2023 at 09:09:48AM +0200, Horatiu Vultur wrote:
> > +static long lan8841_ptp_do_aux_work(struct ptp_clock_info *ptp)
> > +{
> > + struct kszphy_ptp_priv *ptp_priv = container_of(ptp, struct kszphy_ptp_priv,
> > + ptp_clock_info);
> > + struct skb_shared_hwtstamps *shhwtstamps;
> > + struct timespec64 ts;
> > + struct sk_buff *skb;
> > + u32 ts_header;
> > +
> > + while ((skb = skb_dequeue(&ptp_priv->rx_queue)) != NULL) {
> > + lan8841_ptp_getseconds(ptp, &ts);
>
> No need to call this once per frame. It would be sufficent to call it
> once every 2 seconds and cache the result.
Okay, this is tricky.
- If you call lan8841_ptp_getseconds() after gathering the received
frames, then the frame timestamps are clearly in the past WRT the
call to getseconds. That makes the wrap check simpler. But the
getseconds call really should be placed before the 'while' loop.
- If the Rx frame rate exceeds 1/second, then it would be more
efficient to call getseconds every second, and cache the result.
But then the wrap around check needs to account for the fact that
the cached value may have occurred either before or after the frame
timestamp.
I'll explain that second point when my brain wakes again...
Thanks,
Richard
The 06/07/2023 22:51, Richard Cochran wrote:
>
> On Wed, Jun 07, 2023 at 10:47:16AM -0700, Richard Cochran wrote:
> > On Wed, Jun 07, 2023 at 09:09:48AM +0200, Horatiu Vultur wrote:
> > > +static long lan8841_ptp_do_aux_work(struct ptp_clock_info *ptp)
> > > +{
> > > + struct kszphy_ptp_priv *ptp_priv = container_of(ptp, struct kszphy_ptp_priv,
> > > + ptp_clock_info);
> > > + struct skb_shared_hwtstamps *shhwtstamps;
> > > + struct timespec64 ts;
> > > + struct sk_buff *skb;
> > > + u32 ts_header;
> > > +
> > > + while ((skb = skb_dequeue(&ptp_priv->rx_queue)) != NULL) {
> > > + lan8841_ptp_getseconds(ptp, &ts);
> >
> > No need to call this once per frame. It would be sufficent to call it
> > once every 2 seconds and cache the result.
Funny thing is that I have already a patch for this, it is not 100%
complete, because of one of the possible issues that you mention below.
But in my case I read the seconds twice per second, regardless of the RX
frame rate.
>
> Okay, this is tricky.
>
> - If you call lan8841_ptp_getseconds() after gathering the received
> frames, then the frame timestamps are clearly in the past WRT the
> call to getseconds. That makes the wrap check simpler. But the
> getseconds call really should be placed before the 'while' loop.
>
> - If the Rx frame rate exceeds 1/second, then it would be more
> efficient to call getseconds every second, and cache the result.
> But then the wrap around check needs to account for the fact that
> the cached value may have occurred either before or after the frame
> timestamp.
>
> I'll explain that second point when my brain wakes again...
I am looking forward for your explanation.
In the end don't you think it should be 2 different patches, first patch
which changes to mode to get the RX timestamp, while the second one
optimize this?
Bellow just for reference I added the patch which it tries to get the second
part twice per second.
---
Subject: [PATCH] net: micrel: Schedule work to read seconds for lan8841
Instead of reading the seconds part of the received frame for each of
the frames, schedule a workqueue to read the seconds part every 500ms
and then for each of the received frames use this information instead of
reading again the seconds part. Because if for example running with 512
frames per second, there is no point to read 512 times the second part.
Of course care needs to be taken in case of the less two significant
bits are 0 or 3, to make sure there are no seconds wraparound.
This will improve the CPU usage by ~20% and also it is possible to receive
1024 Sync frames per second.
Signed-off-by: Horatiu Vultur <[email protected]>
---
drivers/net/phy/micrel.c | 49 ++++++++++++++++++++++++++++++++++++----
1 file changed, 45 insertions(+), 4 deletions(-)
diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index 28365006b2067..9832eea404377 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -33,6 +33,7 @@
#include <linux/ptp_classify.h>
#include <linux/net_tstamp.h>
#include <linux/gpio/consumer.h>
+#include <linux/workqueue.h>
/* Operation Mode Strap Override */
#define MII_KSZPHY_OMSO 0x16
@@ -254,6 +255,9 @@
#define PS_TO_REG 200
#define FIFO_SIZE 8
+/* Delay used to get the second part from the LTC */
+#define LAN8841_GET_SEC_LTC_DELAY (500 * NSEC_PER_MSEC)
+
struct kszphy_hw_stat {
const char *string;
u8 reg;
@@ -321,6 +325,9 @@ struct kszphy_ptp_priv {
/* Lock for ptp_clock */
struct mutex ptp_lock;
struct ptp_pin_desc *pin_config;
+
+ s64 seconds;
+ struct delayed_work seconds_work;
};
struct kszphy_priv {
@@ -3840,6 +3847,12 @@ static void lan8841_ptp_enable_processing(struct kszphy_ptp_priv *ptp_priv,
LAN8841_PTP_INSERT_TS_32BIT,
LAN8841_PTP_INSERT_TS_EN |
LAN8841_PTP_INSERT_TS_32BIT);
+
+ /* Schedule the work to read the seconds, which will be used in
+ * the received timestamp
+ */
+ schedule_delayed_work(&ptp_priv->seconds_work,
+ nsecs_to_jiffies(LAN8841_GET_SEC_LTC_DELAY));
} else {
/* Disable interrupts on the TX side */
phy_modify_mmd(phydev, 2, LAN8841_PTP_INT_EN,
@@ -3853,6 +3866,11 @@ static void lan8841_ptp_enable_processing(struct kszphy_ptp_priv *ptp_priv,
LAN8841_PTP_INSERT_TS_32BIT, 0);
ptp_cancel_worker_sync(ptp_priv->ptp_clock);
+
+ /* Stop the work, as there is no reason to continue to read the
+ * seconds if there is no timestamping enabled
+ */
+ cancel_delayed_work_sync(&ptp_priv->seconds_work);
}
}
@@ -4062,6 +4080,8 @@ static int lan8841_ptp_settime64(struct ptp_clock_info *ptp,
phy_write_mmd(phydev, 2, LAN8841_PTP_LTC_SET_NS_LO, lower_16_bits(ts->tv_nsec));
phy_write_mmd(phydev, 2, LAN8841_PTP_LTC_SET_NS_HI, upper_16_bits(ts->tv_nsec) & 0x3fff);
+ ptp_priv->seconds = ts->tv_sec;
+
/* Set the command to load the LTC */
phy_write_mmd(phydev, 2, LAN8841_PTP_CMD_CTL,
LAN8841_PTP_CMD_CTL_PTP_LTC_LOAD);
@@ -4116,7 +4136,6 @@ static void lan8841_ptp_getseconds(struct ptp_clock_info *ptp,
struct phy_device *phydev = ptp_priv->phydev;
time64_t s;
- mutex_lock(&ptp_priv->ptp_lock);
/* Issue the command to read the LTC */
phy_write_mmd(phydev, 2, LAN8841_PTP_CMD_CTL,
LAN8841_PTP_CMD_CTL_PTP_LTC_READ);
@@ -4127,7 +4146,6 @@ static void lan8841_ptp_getseconds(struct ptp_clock_info *ptp,
s |= phy_read_mmd(phydev, 2, LAN8841_PTP_LTC_RD_SEC_MID);
s <<= 16;
s |= phy_read_mmd(phydev, 2, LAN8841_PTP_LTC_RD_SEC_LO);
- mutex_unlock(&ptp_priv->ptp_lock);
set_normalized_timespec64(ts, s, 0);
}
@@ -4644,15 +4662,20 @@ static long lan8841_ptp_do_aux_work(struct ptp_clock_info *ptp)
u32 ts_header;
while ((skb = skb_dequeue(&ptp_priv->rx_queue)) != NULL) {
- lan8841_ptp_getseconds(ptp, &ts);
+ mutex_lock(&ptp_priv->ptp_lock);
+ ts.tv_sec = ptp_priv->seconds;
+ mutex_unlock(&ptp_priv->ptp_lock);
+
ts_header = __be32_to_cpu(LAN8841_SKB_CB(skb)->header->reserved2);
shhwtstamps = skb_hwtstamps(skb);
memset(shhwtstamps, 0, sizeof(*shhwtstamps));
/* Check for any wrap arounds for the second part */
- if ((ts.tv_sec & GENMASK(1, 0)) < ts_header >> 30)
+ if ((ts.tv_sec & GENMASK(1, 0)) == 0 && (ts_header >> 30) == 3)
ts.tv_sec -= GENMASK(1, 0) + 1;
+ else if ((ts.tv_sec & GENMASK(1, 0)) == 3 && (ts_header >> 30) == 0)
+ ts.tv_sec += 1;
shhwtstamps->hwtstamp =
ktime_set((ts.tv_sec & ~(GENMASK(1, 0))) | ts_header >> 30,
@@ -4665,6 +4688,21 @@ static long lan8841_ptp_do_aux_work(struct ptp_clock_info *ptp)
return -1;
}
+static void lan8841_ptp_seconds_work(struct work_struct *seconds_work)
+{
+ struct kszphy_ptp_priv *ptp_priv =
+ container_of(seconds_work, struct kszphy_ptp_priv, seconds_work.work);
+ struct timespec64 ts;
+
+ mutex_lock(&ptp_priv->ptp_lock);
+ lan8841_ptp_getseconds(&ptp_priv->ptp_clock_info, &ts);
+ ptp_priv->seconds = ts.tv_sec;
+ mutex_unlock(&ptp_priv->ptp_lock);
+
+ schedule_delayed_work(&ptp_priv->seconds_work,
+ nsecs_to_jiffies(LAN8841_GET_SEC_LTC_DELAY));
+}
+
static struct ptp_clock_info lan8841_ptp_clock_info = {
.owner = THIS_MODULE,
.name = "lan8841 ptp",
@@ -4749,6 +4787,8 @@ static int lan8841_probe(struct phy_device *phydev)
phydev->mii_ts = &ptp_priv->mii_ts;
+ INIT_DELAYED_WORK(&ptp_priv->seconds_work, lan8841_ptp_seconds_work);
+
return 0;
}
@@ -4758,6 +4798,7 @@ static int lan8841_suspend(struct phy_device *phydev)
struct kszphy_ptp_priv *ptp_priv = &priv->ptp_priv;
ptp_cancel_worker_sync(ptp_priv->ptp_clock);
+ cancel_delayed_work_sync(&ptp_priv->seconds_work);
return genphy_suspend(phydev);
}
--
---
>
> Thanks,
> Richard
--
/Horatiu
Hi Richard,
The 06/07/2023 10:47, Richard Cochran wrote:
>
> On Wed, Jun 07, 2023 at 09:09:48AM +0200, Horatiu Vultur wrote:
>
> > Doing these changes to start to get the received timestamp in the
> > reserved field of the header, will give a great CPU usage performance.
> > Running ptp4l with logSyncInterval of -9 will give a ~50% CPU
> > improvment.
>
> Really?
I have run a simple top on the PC while running ptp4l.
This is the output without this patch:
PID PPID USER STAT VSZ %VSZ %CPU COMMAND
136 2 root DW 0 0% 58% [irq/33-e2004118]
141 133 root R 2232 0% 23% ptp4l -f /tmp/linux.cfg
And this is the output with the patch:
142 134 root S 2232 0% 15% ptp4l -f /tmp/linux.cfg
36 2 root DW 0 0% 15% [ptp0]
137 2 root SW 0 0% 0% [irq/33-e2004118]
If you think I should do better measurements, I am open to suggestions.
>
> > -static struct lan8814_ptp_rx_ts *lan8841_ptp_get_rx_ts(struct kszphy_ptp_priv *ptp_priv)
> > -{
> > - struct phy_device *phydev = ptp_priv->phydev;
> > - struct lan8814_ptp_rx_ts *rx_ts;
> > - u32 sec, nsec;
> > - u16 seq;
> > -
> > - nsec = phy_read_mmd(phydev, 2, LAN8841_PTP_RX_INGRESS_NS_HI);
> > - if (!(nsec & LAN8841_PTP_RX_INGRESS_NSEC_HI_VALID))
> > - return NULL;
> > -
> > - nsec = ((nsec & 0x3fff) << 16);
> > - nsec = nsec | phy_read_mmd(phydev, 2, LAN8841_PTP_RX_INGRESS_NS_LO);
> > -
> > - sec = phy_read_mmd(phydev, 2, LAN8841_PTP_RX_INGRESS_SEC_HI);
> > - sec = sec << 16;
> > - sec = sec | phy_read_mmd(phydev, 2, LAN8841_PTP_RX_INGRESS_SEC_LO);
> > -
> > - seq = phy_read_mmd(phydev, 2, LAN8841_PTP_RX_MSG_HEADER2);
>
> Before: 5x phy_read_mmd() per frame ...
>
> > - rx_ts = kzalloc(sizeof(*rx_ts), GFP_KERNEL);
> > - if (!rx_ts)
> > - return NULL;
> > -
> > - rx_ts->seconds = sec;
> > - rx_ts->nsec = nsec;
> > - rx_ts->seq_id = seq;
> > -
> > - return rx_ts;
> > -}
>
> > +static void lan8841_ptp_getseconds(struct ptp_clock_info *ptp,
> > + struct timespec64 *ts)
> > +{
> > + struct kszphy_ptp_priv *ptp_priv = container_of(ptp, struct kszphy_ptp_priv,
> > + ptp_clock_info);
> > + struct phy_device *phydev = ptp_priv->phydev;
> > + time64_t s;
> > +
> > + mutex_lock(&ptp_priv->ptp_lock);
> > + /* Issue the command to read the LTC */
> > + phy_write_mmd(phydev, 2, LAN8841_PTP_CMD_CTL,
> > + LAN8841_PTP_CMD_CTL_PTP_LTC_READ);
> > +
> > + /* Read the LTC */
> > + s = phy_read_mmd(phydev, 2, LAN8841_PTP_LTC_RD_SEC_HI);
> > + s <<= 16;
> > + s |= phy_read_mmd(phydev, 2, LAN8841_PTP_LTC_RD_SEC_MID);
> > + s <<= 16;
> > + s |= phy_read_mmd(phydev, 2, LAN8841_PTP_LTC_RD_SEC_LO);
>
> After: 4x phy_read_mmd() per frame. How does that save 50% cpu?
>
> > + mutex_unlock(&ptp_priv->ptp_lock);
> > +
> > + set_normalized_timespec64(ts, s, 0);
> > +}
>
>
> > +static long lan8841_ptp_do_aux_work(struct ptp_clock_info *ptp)
> > +{
> > + struct kszphy_ptp_priv *ptp_priv = container_of(ptp, struct kszphy_ptp_priv,
> > + ptp_clock_info);
> > + struct skb_shared_hwtstamps *shhwtstamps;
> > + struct timespec64 ts;
> > + struct sk_buff *skb;
> > + u32 ts_header;
> > +
> > + while ((skb = skb_dequeue(&ptp_priv->rx_queue)) != NULL) {
> > + lan8841_ptp_getseconds(ptp, &ts);
>
> No need to call this once per frame. It would be sufficent to call it
> once every 2 seconds and cache the result.
>
> > + ts_header = __be32_to_cpu(LAN8841_SKB_CB(skb)->header->reserved2);
> > +
> > + shhwtstamps = skb_hwtstamps(skb);
> > + memset(shhwtstamps, 0, sizeof(*shhwtstamps));
> > +
> > + /* Check for any wrap arounds for the second part */
> > + if ((ts.tv_sec & GENMASK(1, 0)) < ts_header >> 30)
> > + ts.tv_sec -= GENMASK(1, 0) + 1;
> > +
> > + shhwtstamps->hwtstamp =
> > + ktime_set((ts.tv_sec & ~(GENMASK(1, 0))) | ts_header >> 30,
> > + ts_header & GENMASK(29, 0));
> > + LAN8841_SKB_CB(skb)->header->reserved2 = 0;
> > +
> > + netif_rx(skb);
> > + }
> > +
> > + return -1;
> > +}
>
> Thanks,
> Richard
--
/Horatiu