This patch series is meant to add busy polling support to epoll when all of
the sockets on a given epoll are either local or are being sourced by the
same NAPI ID.
In order to support this the first two patches clean up a few issues we
found with the NAPI ID tracking and infrastructure.
In the third patch we introduce SO_INCOMING_NAPI_ID so that applications
have a means of trying to sort their incoming sockets to identify which
requests should be routed where in order to keep the epoll listener aligned
to a given Rx queue without having to rely on IRQ pinning.
Finally the last two patches refactor the existing busy poll infrastructure
to make it so that we can call it without necessarily needing a socket, and
enable the bits needed to support epoll when all of the sockets on the
epoll either share the same NAPI ID, or simply are reporting no NAPI ID.
---
Sridhar Samudrala (5):
net: Do not record sender_cpu as napi_id in socket receive paths
net: Call sk_mark_napi_id() in the ACK receive path
net: Introduce SO_INCOMING_NAPI_ID
net: Commonize busy polling code to focus on napi_id instead of socket
epoll: Add busy poll support to epoll with socket fds.
arch/alpha/include/uapi/asm/socket.h | 2 +
arch/avr32/include/uapi/asm/socket.h | 2 +
arch/frv/include/uapi/asm/socket.h | 2 +
arch/ia64/include/uapi/asm/socket.h | 2 +
arch/m32r/include/uapi/asm/socket.h | 2 +
arch/mips/include/uapi/asm/socket.h | 2 +
arch/mn10300/include/uapi/asm/socket.h | 2 +
arch/parisc/include/uapi/asm/socket.h | 2 +
arch/powerpc/include/uapi/asm/socket.h | 2 +
arch/s390/include/uapi/asm/socket.h | 2 +
arch/sparc/include/uapi/asm/socket.h | 2 +
arch/xtensa/include/uapi/asm/socket.h | 2 +
fs/eventpoll.c | 115 ++++++++++++++++++++++++++++++++
include/net/busy_poll.h | 14 +++-
include/uapi/asm-generic/socket.h | 2 +
net/core/dev.c | 16 ++--
net/core/sock.c | 22 ++++++
net/ipv4/tcp_ipv4.c | 1
18 files changed, 183 insertions(+), 11 deletions(-)
--
From: Sridhar Samudrala <[email protected]>
Fix sk_mark_napi_id() and sk_mark_napi_id_once() to set sk_napi_id only if
skb->napi_id is a valid value.
This can happen in loopback paths, where skb->napi_id is not updated in
the Rx path and still holds the sender_cpu value set in the xmit path.
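For context, a rough sketch of the relevant sk_buff fields as assumed by this
patch (layout and config guard approximately as in include/linux/skbuff.h of
this era): napi_id and sender_cpu share storage, which is why a purely local
skb can carry a CPU number where a NAPI ID is expected.
#if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS)
	union {
		unsigned int	napi_id;	/* id of the NAPI struct this skb came from (Rx path) */
		unsigned int	sender_cpu;	/* source CPU, filled in the xmit path */
	};
#endif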
Signed-off-by: Sridhar Samudrala <[email protected]>
Signed-off-by: Alexander Duyck <[email protected]>
---
include/net/busy_poll.h | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index c0452de83086..67991635953e 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -116,7 +116,8 @@ static inline bool sk_busy_loop(struct sock *sk, int nonblock)
static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
- sk->sk_napi_id = skb->napi_id;
+ if (skb->napi_id > (u32)NR_CPUS)
+ sk->sk_napi_id = skb->napi_id;
#endif
}
@@ -125,7 +126,7 @@ static inline void sk_mark_napi_id_once(struct sock *sk,
const struct sk_buff *skb)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
- if (!sk->sk_napi_id)
+ if (!sk->sk_napi_id && (skb->napi_id > (u32)NR_CPUS))
sk->sk_napi_id = skb->napi_id;
#endif
}
From: Sridhar Samudrala <[email protected]>
This socket option returns the NAPI ID associated with the queue on which
the last frame was received. Applications can use this information to
split incoming flows among worker threads based on the Rx queue on which
they are received.
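A minimal user-space sketch of reading the new option (hypothetical helper,
error handling trimmed; the SO_INCOMING_NAPI_ID define mirrors the asm-generic
value added below):

#include <stdio.h>
#include <sys/socket.h>

#ifndef SO_INCOMING_NAPI_ID
#define SO_INCOMING_NAPI_ID 55	/* asm-generic value proposed by this patch */
#endif

/* Hypothetical helper: return the NAPI ID of the queue that delivered the
 * last frame for this socket, or 0 if none has been recorded yet.
 */
static unsigned int incoming_napi_id(int fd)
{
	unsigned int napi_id = 0;
	socklen_t len = sizeof(napi_id);

	if (getsockopt(fd, SOL_SOCKET, SO_INCOMING_NAPI_ID, &napi_id, &len) < 0)
		perror("getsockopt(SO_INCOMING_NAPI_ID)");

	return napi_id;
}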
Signed-off-by: Sridhar Samudrala <[email protected]>
Signed-off-by: Alexander Duyck <[email protected]>
---
arch/alpha/include/uapi/asm/socket.h | 2 ++
arch/avr32/include/uapi/asm/socket.h | 2 ++
arch/frv/include/uapi/asm/socket.h | 2 ++
arch/ia64/include/uapi/asm/socket.h | 2 ++
arch/m32r/include/uapi/asm/socket.h | 2 ++
arch/mips/include/uapi/asm/socket.h | 2 ++
arch/mn10300/include/uapi/asm/socket.h | 2 ++
arch/parisc/include/uapi/asm/socket.h | 2 ++
arch/powerpc/include/uapi/asm/socket.h | 2 ++
arch/s390/include/uapi/asm/socket.h | 2 ++
arch/sparc/include/uapi/asm/socket.h | 2 ++
arch/xtensa/include/uapi/asm/socket.h | 2 ++
include/uapi/asm-generic/socket.h | 2 ++
net/core/sock.c | 4 ++++
14 files changed, 30 insertions(+)
diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index afc901b7a6f6..c90a4eb8fe2d 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -99,4 +99,6 @@
#define SCM_TIMESTAMPING_OPT_STATS 54
+#define SO_INCOMING_NAPI_ID 55
+
#endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h
index 5a650426f357..bee51e585ff6 100644
--- a/arch/avr32/include/uapi/asm/socket.h
+++ b/arch/avr32/include/uapi/asm/socket.h
@@ -92,4 +92,6 @@
#define SCM_TIMESTAMPING_OPT_STATS 54
+#define SO_INCOMING_NAPI_ID 55
+
#endif /* _UAPI__ASM_AVR32_SOCKET_H */
diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h
index 81e03530ed39..4ac94eff4a64 100644
--- a/arch/frv/include/uapi/asm/socket.h
+++ b/arch/frv/include/uapi/asm/socket.h
@@ -92,5 +92,7 @@
#define SCM_TIMESTAMPING_OPT_STATS 54
+#define SO_INCOMING_NAPI_ID 55
+
#endif /* _ASM_SOCKET_H */
diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h
index 57feb0c1f7d7..1bb0c810495e 100644
--- a/arch/ia64/include/uapi/asm/socket.h
+++ b/arch/ia64/include/uapi/asm/socket.h
@@ -101,4 +101,6 @@
#define SCM_TIMESTAMPING_OPT_STATS 54
+#define SO_INCOMING_NAPI_ID 55
+
#endif /* _ASM_IA64_SOCKET_H */
diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h
index 5853f8e92c20..234119518dc1 100644
--- a/arch/m32r/include/uapi/asm/socket.h
+++ b/arch/m32r/include/uapi/asm/socket.h
@@ -92,4 +92,6 @@
#define SCM_TIMESTAMPING_OPT_STATS 54
+#define SO_INCOMING_NAPI_ID 55
+
#endif /* _ASM_M32R_SOCKET_H */
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 566ecdcb5b4b..0b6d2f0b82b4 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -110,4 +110,6 @@
#define SCM_TIMESTAMPING_OPT_STATS 54
+#define SO_INCOMING_NAPI_ID 55
+
#endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h
index 0e12527c4b0e..881fa88cbf60 100644
--- a/arch/mn10300/include/uapi/asm/socket.h
+++ b/arch/mn10300/include/uapi/asm/socket.h
@@ -92,4 +92,6 @@
#define SCM_TIMESTAMPING_OPT_STATS 54
+#define SO_INCOMING_NAPI_ID 55
+
#endif /* _ASM_SOCKET_H */
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index 7a109b73ddf7..70d6f5ffebef 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -91,4 +91,6 @@
#define SCM_TIMESTAMPING_OPT_STATS 0x402F
+#define SO_INCOMING_NAPI_ID 0x4030
+
#endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h
index 44583a52f882..8188f0ebae5f 100644
--- a/arch/powerpc/include/uapi/asm/socket.h
+++ b/arch/powerpc/include/uapi/asm/socket.h
@@ -99,4 +99,6 @@
#define SCM_TIMESTAMPING_OPT_STATS 54
+#define SO_INCOMING_NAPI_ID 55
+
#endif /* _ASM_POWERPC_SOCKET_H */
diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h
index b24a64cbfeb1..22ef97b26377 100644
--- a/arch/s390/include/uapi/asm/socket.h
+++ b/arch/s390/include/uapi/asm/socket.h
@@ -98,4 +98,6 @@
#define SCM_TIMESTAMPING_OPT_STATS 54
+#define SO_INCOMING_NAPI_ID 55
+
#endif /* _ASM_SOCKET_H */
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index a25dc32f5d6a..6b7e10019763 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -88,6 +88,8 @@
#define SCM_TIMESTAMPING_OPT_STATS 0x0038
+#define SO_INCOMING_NAPI_ID 0x0039
+
/* Security levels - as per NRL IPv6 - don't actually do anything */
#define SO_SECURITY_AUTHENTICATION 0x5001
#define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002
diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h
index 9fdbe1fe0473..77ba8be3738c 100644
--- a/arch/xtensa/include/uapi/asm/socket.h
+++ b/arch/xtensa/include/uapi/asm/socket.h
@@ -103,4 +103,6 @@
#define SCM_TIMESTAMPING_OPT_STATS 54
+#define SO_INCOMING_NAPI_ID 55
+
#endif /* _XTENSA_SOCKET_H */
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 2c748ddad5f8..e1ffc8d4aca3 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -94,4 +94,6 @@
#define SCM_TIMESTAMPING_OPT_STATS 54
+#define SO_INCOMING_NAPI_ID 55
+
#endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index a83731c36761..74288b2d4b3d 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1313,6 +1313,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.val = sk->sk_incoming_cpu;
break;
+ case SO_INCOMING_NAPI_ID:
+ v.val = sk->sk_napi_id;
+ break;
+
default:
/* We implement the SO_SNDLOWAT etc to not be settable
* (1003.1g 7).
From: Sridhar Samudrala <[email protected]>
This patch adds busy poll support to epoll if all the sockets attached
to an epoll fd receive packets from the same receive queue (NAPI ID). The
NAPI ID is maintained per epoll and is set from sk when the first event is
received for a socket with a non-zero NAPI ID. It is then validated to make
sure that all subsequent events for sockets carry the same NAPI ID. Busy
polling is disabled if an event is received for a socket whose NAPI ID
differs from the epoll NAPI ID.
An application can use the SO_INCOMING_CPU or SO_ATTACH_REUSEPORT_CBPF/EBPF
socket options to spread incoming connections across specific worker threads
based on the incoming queue. This allows the epoll of each worker thread to
contain only sockets that receive packets from a single queue. So when an
application calls epoll_wait() and there are no events available to report,
busy polling is done on the associated queue to pull in packets.
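As a rough illustration of this model, a hypothetical acceptor-side sketch
that hands each connection to the worker whose epoll set is bound to that
connection's Rx queue (worker_for_napi_id() is an assumed application-side
lookup, not part of this series; error handling omitted):

#include <sys/epoll.h>
#include <sys/socket.h>

#ifndef SO_INCOMING_NAPI_ID
#define SO_INCOMING_NAPI_ID 55	/* as proposed for asm-generic in this series */
#endif

/* Hypothetical per-worker state: one epoll fd per worker thread, each
 * dedicated to a single NAPI ID (i.e. a single Rx queue).
 */
struct worker {
	int epfd;		/* epoll instance busy-polled by this worker */
	unsigned int napi_id;	/* NAPI ID this worker is bound to, 0 if unset */
};

/* Hypothetical lookup: pick (or lazily create) the worker that owns this
 * NAPI ID so every socket on its epoll shares a single receive queue.
 */
struct worker *worker_for_napi_id(unsigned int napi_id);

static void dispatch_connection(int fd)
{
	unsigned int napi_id = 0;
	socklen_t len = sizeof(napi_id);
	struct epoll_event ev = { .events = EPOLLIN, .data.fd = fd };
	struct worker *w;

	getsockopt(fd, SOL_SOCKET, SO_INCOMING_NAPI_ID, &napi_id, &len);
	w = worker_for_napi_id(napi_id);
	epoll_ctl(w->epfd, EPOLL_CTL_ADD, fd, &ev);
}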
Signed-off-by: Sridhar Samudrala <[email protected]>
Signed-off-by: Alexander Duyck <[email protected]>
---
fs/eventpoll.c | 115 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 114 insertions(+), 1 deletion(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 341251421ced..304e1592be83 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -42,6 +42,7 @@
#include <linux/seq_file.h>
#include <linux/compat.h>
#include <linux/rculist.h>
+#include <net/busy_poll.h>
/*
* LOCKING:
@@ -224,6 +225,11 @@ struct eventpoll {
/* used to optimize loop detection check */
int visited;
struct list_head visited_list_link;
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ /* used to track busy poll napi_id */
+ unsigned int napi_id;
+#endif
};
/* Wait structure used by the poll hooks */
@@ -384,8 +390,109 @@ static inline int ep_events_available(struct eventpoll *ep)
return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
}
+#ifdef CONFIG_NET_RX_BUSY_POLL
+
+/*
+ * NAPI ID value used to indicate busy poll is disabled. 2 or more sockets
+ * associated with different NAPI IDs are attached to epoll.
+ */
+#define BUSY_POLL_DISABLED_NAPI_ID 1
+
+/*
+ * If busy polling is on and the file is a socket, return a pointer to
+ * struct sock
+ */
+static inline struct sock *ep_sk_from_file(struct file *file)
+{
+ struct inode *inode = file_inode(file);
+
+ if (!S_ISSOCK(inode->i_mode))
+ return NULL;
+
+ return ((struct socket *)file->private_data)->sk;
+}
+
+/*
+ * If busy polling is on and the file for this pwq is a socket,
+ * return a pointer to struct sock
+ */
+static inline struct sock *ep_sk_from_pwq(struct eppoll_entry *pwq)
+{
+ return ep_sk_from_file(pwq->base->ffd.file);
+}
+
+static inline bool epoll_can_busy_loop(struct eventpoll *ep)
+{
+ return net_busy_loop_on() && (ep->napi_id > BUSY_POLL_DISABLED_NAPI_ID);
+}
+
+/*
+ * Set epoll busy poll napi id from sk if it is not already set.
+ * If it is already set and is not equal to the sk napi id, set it
+ * to BUSY_POLL_DISABLED_NAPI_ID so that busy polling gets disabled
+ * on this epoll.
+ */
+static inline void ep_set_busy_poll_napi_id(struct eventpoll *ep,
+ struct eppoll_entry *pwq)
+{
+ struct sock *sk;
+
+ if ((ep->napi_id == BUSY_POLL_DISABLED_NAPI_ID) || !net_busy_loop_on())
+ return;
+
+ sk = ep_sk_from_pwq(pwq);
+ if (!sk || !sk->sk_napi_id)
+ return;
+
+ /* epoll has a matching napi id, return */
+ if (sk->sk_napi_id == ep->napi_id)
+ return;
+
+ /* disable busy polling if napi id already set, else set it. */
+ ep->napi_id = ep->napi_id ? BUSY_POLL_DISABLED_NAPI_ID :
+ sk->sk_napi_id;
+}
+
+static bool epoll_napi_busy_loop_end(void *p)
+{
+ struct eventpoll *ep = p;
+
+ return ep_events_available(ep);
+}
+
+/*
+ * Busy poll if globally on and supporting sockets found && no events,
+ * busy loop will return if need_resched or ep_events_available.
+ *
+ * we must do our busy polling with irqs enabled
+ */
+static bool epoll_busy_loop(struct eventpoll *ep, int nonblock)
+{
+ unsigned long end_time = !nonblock ? busy_loop_end_time() : 0;
+
+ if (!epoll_can_busy_loop(ep) || ep_events_available(ep))
+ return false;
+
+ return napi_busy_loop(ep->napi_id, end_time, nonblock,
+ epoll_napi_busy_loop_end, ep);
+}
+
+#else /* CONFIG_NET_RX_BUSY_POLL */
+
+static inline void ep_set_busy_poll_napi_id(struct eventpoll *ep,
+ struct eppoll_entry *pwq)
+{
+}
+
+static inline bool epoll_busy_loop(struct eventpoll *ep, int nonblock)
+{
+ return false;
+}
+
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
/**
- * ep_call_nested - Perform a bound (possibly) nested call, by checking
+ * ep_call_nested - Perform a bound (possibly) nested call, by checking
* that the recursion limit is not exceeded, and that
* the same nested call (by the meaning of same cookie) is
* no re-entered.
@@ -1022,6 +1129,8 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
spin_lock_irqsave(&ep->lock, flags);
+ ep_set_busy_poll_napi_id(ep, ep_pwq_from_wait(wait));
+
/*
* If the event mask does not contain any poll(2) event, we consider the
* descriptor to be disabled. This condition is likely the effect of the
@@ -1127,6 +1236,7 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
add_wait_queue(whead, &pwq->wait);
list_add_tail(&pwq->llink, &epi->pwqlist);
epi->nwait++;
+ ep_set_busy_poll_napi_id(epi->ep, pwq);
} else {
/* We have to signal that an error occurred */
epi->nwait = -1;
@@ -1637,6 +1747,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
}
fetch_events:
+
+ epoll_busy_loop(ep, timed_out);
+
spin_lock_irqsave(&ep->lock, flags);
if (!ep_events_available(ep)) {
From: Sridhar Samudrala <[email protected]>
Move the core functionality in sk_busy_loop() to napi_busy_loop() and
make it independent of sk.
This enables re-using this function in the epoll busy loop implementation.
Signed-off-by: Sridhar Samudrala <[email protected]>
Signed-off-by: Alexander Duyck <[email protected]>
---
include/net/busy_poll.h | 9 +++++++++
net/core/dev.c | 16 ++++++++--------
net/core/sock.c | 18 ++++++++++++++++++
3 files changed, 35 insertions(+), 8 deletions(-)
diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index 67991635953e..24bc10d56b49 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -71,6 +71,9 @@ static inline bool busy_loop_timeout(unsigned long end_time)
bool sk_busy_loop(struct sock *sk, int nonblock);
+bool napi_busy_loop(unsigned int napi_id, unsigned long end_time, int nonblock,
+ bool (*loop_end)(void *), void *loop_end_arg);
+
/* used in the NIC receive handler to mark the skb */
static inline void skb_mark_napi_id(struct sk_buff *skb,
struct napi_struct *napi)
@@ -110,6 +113,12 @@ static inline bool sk_busy_loop(struct sock *sk, int nonblock)
return false;
}
+static inline bool napi_busy_loop(unsigned int napi_id, unsigned long end_time,
+ int nonblock, bool (*loop_end)(void *),
+ void *loop_end_arg)
+{
+ return false;
+}
#endif /* CONFIG_NET_RX_BUSY_POLL */
/* used in the protocol hanlder to propagate the napi_id to the socket */
diff --git a/net/core/dev.c b/net/core/dev.c
index 7869ae3837ca..4e55765c6998 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5060,9 +5060,9 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
do_softirq();
}
-bool sk_busy_loop(struct sock *sk, int nonblock)
+bool napi_busy_loop(unsigned int napi_id, unsigned long end_time, int nonblock,
+ bool (*loop_end)(void *), void *loop_end_arg)
{
- unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
int (*napi_poll)(struct napi_struct *napi, int budget);
void *have_poll_lock = NULL;
struct napi_struct *napi;
@@ -5074,7 +5074,7 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
rcu_read_lock();
- napi = napi_by_id(sk->sk_napi_id);
+ napi = napi_by_id(napi_id);
if (!napi)
goto out;
@@ -5102,11 +5102,11 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
count:
if (rc > 0)
- __NET_ADD_STATS(sock_net(sk),
+ __NET_ADD_STATS(dev_net(napi->dev),
LINUX_MIB_BUSYPOLLRXPACKETS, rc);
local_bh_enable();
- if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
+ if (nonblock || loop_end(loop_end_arg) ||
busy_loop_timeout(end_time))
break;
@@ -5116,7 +5116,7 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
preempt_enable();
rcu_read_unlock();
cond_resched();
- rc = !skb_queue_empty(&sk->sk_receive_queue);
+ rc = loop_end(loop_end_arg);
if (rc || busy_loop_timeout(end_time))
return rc;
goto restart;
@@ -5126,12 +5126,12 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
if (napi_poll)
busy_poll_stop(napi, have_poll_lock);
preempt_enable();
- rc = !skb_queue_empty(&sk->sk_receive_queue);
+ rc = loop_end(loop_end_arg);
out:
rcu_read_unlock();
return rc;
}
-EXPORT_SYMBOL(sk_busy_loop);
+EXPORT_SYMBOL(napi_busy_loop);
#endif /* CONFIG_NET_RX_BUSY_POLL */
diff --git a/net/core/sock.c b/net/core/sock.c
index 74288b2d4b3d..f606d89f044a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3205,3 +3205,21 @@ static int __init proto_init(void)
subsys_initcall(proto_init);
#endif /* PROC_FS */
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+static bool sk_napi_busy_loop_end(void *p)
+{
+ struct sock *sk = p;
+
+ return !skb_queue_empty(&sk->sk_receive_queue);
+}
+
+bool sk_busy_loop(struct sock *sk, int nonblock)
+{
+ unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
+
+ return napi_busy_loop(sk->sk_napi_id, end_time, nonblock,
+ sk_napi_busy_loop_end, sk);
+}
+EXPORT_SYMBOL(sk_busy_loop);
+#endif /* CONFIG_NET_RX_BUSY_POLL */
From: Sridhar Samudrala <[email protected]>
Call sk_mark_napi_id() in the ACK receive path of a TCP_NEW_SYN_RECV
socket, so that sk->napi_id is set even if the socket hasn't yet received
any data. With this change we should be able to start busy polling
slightly earlier.
Signed-off-by: Sridhar Samudrala <[email protected]>
Signed-off-by: Alexander Duyck <[email protected]>
---
net/ipv4/tcp_ipv4.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 08d870e45658..b86002a296f1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1687,6 +1687,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
tcp_v4_send_reset(nsk, skb);
goto discard_and_relse;
} else {
+ sk_mark_napi_id(nsk, skb);
sock_put(sk);
return 0;
}
On Thu, 2017-03-16 at 11:32 -0700, Alexander Duyck wrote:
> From: Sridhar Samudrala <[email protected]>
>
> Call sk_mark_napi_id() in the ACK receive path of a TCP_NEW_SYN_RECV
> socket, so that sk->napi_id is set even if the socket hasn't yet received
> any data. With this change we should be able to start busy polling
> slightly earlier.
>
> Signed-off-by: Sridhar Samudrala <[email protected]>
> Signed-off-by: Alexander Duyck <[email protected]>
> ---
> net/ipv4/tcp_ipv4.c | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 08d870e45658..b86002a296f1 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1687,6 +1687,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
> tcp_v4_send_reset(nsk, skb);
> goto discard_and_relse;
> } else {
> + sk_mark_napi_id(nsk, skb);
> sock_put(sk);
> return 0;
> }
>
Seems good, but what about IPv6 ?
Frankly this calls for the sk_mark_napi_id() being done in
tcp_child_process() instead of its four callers.
On Thu, 2017-03-16 at 11:32 -0700, Alexander Duyck wrote:
> From: Sridhar Samudrala <[email protected]>
>
> Fix sk_mark_napi_id() and sk_mark_napi_id_once() to set sk_napi_id only if
> skb->napi_id is a valid value.
>
> This happens in loopback paths where skb->napi_id is not updated in
> rx path and holds sender_cpu that is set in xmit path.
>
> Signed-off-by: Sridhar Samudrala <[email protected]>
> Signed-off-by: Alexander Duyck <[email protected]>
> ---
> include/net/busy_poll.h | 5 +++--
> 1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
> index c0452de83086..67991635953e 100644
> --- a/include/net/busy_poll.h
> +++ b/include/net/busy_poll.h
> @@ -116,7 +116,8 @@ static inline bool sk_busy_loop(struct sock *sk, int nonblock)
> static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb)
> {
> #ifdef CONFIG_NET_RX_BUSY_POLL
> - sk->sk_napi_id = skb->napi_id;
> + if (skb->napi_id > (u32)NR_CPUS)
> + sk->sk_napi_id = skb->napi_id;
> #endif
> }
>
> @@ -125,7 +126,7 @@ static inline void sk_mark_napi_id_once(struct sock *sk,
> const struct sk_buff *skb)
> {
> #ifdef CONFIG_NET_RX_BUSY_POLL
> - if (!sk->sk_napi_id)
> + if (!sk->sk_napi_id && (skb->napi_id > (u32)NR_CPUS))
> sk->sk_napi_id = skb->napi_id;
> #endif
> }
>
It is not clear why this patch is needed .
What you describe here is the case we might receive packets for a socket
coming from different interfaces ?
If skb->napi_id is a sender_cpu, why should we prevent overwriting the
sk_napi_id with it, knowing that busy polling will simply ignore the
invalid value ?
Do not get me wrong :
I simply try to understand why the test about napi_id validity is now
done twice :
1) At the time we are writing into sk->sk_napi_id
2) At busy polling time when we read sk->sk_napi_id
On Thu, 2017-03-16 at 11:33 -0700, Alexander Duyck wrote:
> From: Sridhar Samudrala <[email protected]>
> +/*
> + * If busy polling is on and the file is a socket, return a pointer to
> + * struct sock
> + */
> +static inline struct sock *ep_sk_from_file(struct file *file)
> +{
> + struct inode *inode = file_inode(file);
> +
> + if (!S_ISSOCK(inode->i_mode))
> + return NULL;
> +
> + return ((struct socket *)file->private_data)->sk;
> +}
I believe a more standard way is to use sock_from_file()
(This does not have to fetch a cache line to read i_mode ))
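For reference, a hypothetical rework of ep_sk_from_file() along those lines,
assuming the sock_from_file(file, &err) helper provided by net/socket.c
(which checks file->f_op, so no inode cache line is touched):

static inline struct sock *ep_sk_from_file(struct file *file)
{
	int err;
	struct socket *sock = sock_from_file(file, &err);

	/* not a socket: err holds -ENOTSOCK, which we do not need here */
	return sock ? sock->sk : NULL;
}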
On Thu, Mar 16, 2017 at 3:04 PM, Eric Dumazet <[email protected]> wrote:
> On Thu, 2017-03-16 at 11:32 -0700, Alexander Duyck wrote:
>> From: Sridhar Samudrala <[email protected]>
>>
>> Call sk_mark_napi_id() in the ACK receive path of a TCP_NEW_SYN_RECV
>> socket, so that sk->napi_id is set even if the socket hasn't yet received
>> any data. With this change we should be able to start busy polling
>> slightly earlier.
>>
>> Signed-off-by: Sridhar Samudrala <[email protected]>
>> Signed-off-by: Alexander Duyck <[email protected]>
>> ---
>> net/ipv4/tcp_ipv4.c | 1 +
>> 1 file changed, 1 insertion(+)
>>
>> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
>> index 08d870e45658..b86002a296f1 100644
>> --- a/net/ipv4/tcp_ipv4.c
>> +++ b/net/ipv4/tcp_ipv4.c
>> @@ -1687,6 +1687,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
>> tcp_v4_send_reset(nsk, skb);
>> goto discard_and_relse;
>> } else {
>> + sk_mark_napi_id(nsk, skb);
>> sock_put(sk);
>> return 0;
>> }
>>
>
> Seems good, but what about IPv6 ?
Sorry, I spaced out and overlooked that this would also be an issue for IPv6.
> Frankly this calls for the sk_mark_napi_id() being done in
> tcp_child_process() instead of its four callers.
We can look into that for the next version.
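For illustration only, a hypothetical sketch of that approach, marking the
child once in tcp_child_process() so the IPv4 and IPv6 callers are both
covered; this is not what the posted series does:

int tcp_child_process(struct sock *parent, struct sock *child,
		      struct sk_buff *skb)
{
	/* record the Rx queue (NAPI ID) the handshake-completing ACK
	 * arrived on, covering both the IPv4 and IPv6 receive paths
	 */
	sk_mark_napi_id(child, skb);

	/* ... existing body of tcp_child_process() unchanged ... */
}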
On Thu, Mar 16, 2017 at 3:11 PM, Eric Dumazet <[email protected]> wrote:
> On Thu, 2017-03-16 at 11:33 -0700, Alexander Duyck wrote:
>> From: Sridhar Samudrala <[email protected]>
>
>> +/*
>> + * If busy polling is on and the file is a socket, return a pointer to
>> + * struct sock
>> + */
>> +static inline struct sock *ep_sk_from_file(struct file *file)
>> +{
>> + struct inode *inode = file_inode(file);
>> +
>> + if (!S_ISSOCK(inode->i_mode))
>> + return NULL;
>> +
>> + return ((struct socket *)file->private_data)->sk;
>> +}
>
> I believe a more standard way is to use sock_from_file()
>
> (This does not have to fetch a cache line to read i_mode ))
Thanks. Will make sure to update to use that for v2.
- Alex
On 3/16/2017 3:05 PM, Eric Dumazet wrote:
> On Thu, 2017-03-16 at 11:32 -0700, Alexander Duyck wrote:
>> From: Sridhar Samudrala <[email protected]>
>>
>> Fix sk_mark_napi_id() and sk_mark_napi_id_once() to set sk_napi_id only if
>> skb->napi_id is a valid value.
>>
>> This happens in loopback paths where skb->napi_id is not updated in
>> rx path and holds sender_cpu that is set in xmit path.
>>
>> Signed-off-by: Sridhar Samudrala <[email protected]>
>> Signed-off-by: Alexander Duyck <[email protected]>
>> ---
>> include/net/busy_poll.h | 5 +++--
>> 1 file changed, 3 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
>> index c0452de83086..67991635953e 100644
>> --- a/include/net/busy_poll.h
>> +++ b/include/net/busy_poll.h
>> @@ -116,7 +116,8 @@ static inline bool sk_busy_loop(struct sock *sk, int nonblock)
>> static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb)
>> {
>> #ifdef CONFIG_NET_RX_BUSY_POLL
>> - sk->sk_napi_id = skb->napi_id;
>> + if (skb->napi_id > (u32)NR_CPUS)
>> + sk->sk_napi_id = skb->napi_id;
>> #endif
>> }
>>
>> @@ -125,7 +126,7 @@ static inline void sk_mark_napi_id_once(struct sock *sk,
>> const struct sk_buff *skb)
>> {
>> #ifdef CONFIG_NET_RX_BUSY_POLL
>> - if (!sk->sk_napi_id)
>> + if (!sk->sk_napi_id && (skb->napi_id > (u32)NR_CPUS))
>> sk->sk_napi_id = skb->napi_id;
>> #endif
>> }
>>
> It is not clear why this patch is needed .
>
> What you describe here is the case we might receive packets for a socket
> coming from different interfaces ?
This is seen with AF_UNIX or AF_INET sockets over loopback.
>
> If skb->napi_id is a sender_cpu, why should we prevent overwriting the
> sk_napi_id with it, knowing that busy polling will simply ignore the
> invalid value ?
We are not checking for an invalid value (< NR_CPUS) in the busy poll code;
any non-zero sk->sk_napi_id is considered valid. If we don't want to add
this check while setting sk->sk_napi_id, we could change the check in
ep_set_busy_poll_napi_id() to test for an invalid value rather than a
non-zero value.
>
> Do not get me wrong :
>
> I simply try to understand why the test about napi_id validity is now
> done twice :
>
> 1) At the time we are writing into sk->sk_napi_id
>
> 2) At busy polling time when we read sk->sk_napi_id
>
>
>
On Thu, Mar 16, 2017 at 3:05 PM, Eric Dumazet <[email protected]> wrote:
> On Thu, 2017-03-16 at 11:32 -0700, Alexander Duyck wrote:
>> From: Sridhar Samudrala <[email protected]>
>>
>> Fix sk_mark_napi_id() and sk_mark_napi_id_once() to set sk_napi_id only if
>> skb->napi_id is a valid value.
>>
>> This happens in loopback paths where skb->napi_id is not updated in
>> rx path and holds sender_cpu that is set in xmit path.
>>
>> Signed-off-by: Sridhar Samudrala <[email protected]>
>> Signed-off-by: Alexander Duyck <[email protected]>
>> ---
>> include/net/busy_poll.h | 5 +++--
>> 1 file changed, 3 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
>> index c0452de83086..67991635953e 100644
>> --- a/include/net/busy_poll.h
>> +++ b/include/net/busy_poll.h
>> @@ -116,7 +116,8 @@ static inline bool sk_busy_loop(struct sock *sk, int nonblock)
>> static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb)
>> {
>> #ifdef CONFIG_NET_RX_BUSY_POLL
>> - sk->sk_napi_id = skb->napi_id;
>> + if (skb->napi_id > (u32)NR_CPUS)
>> + sk->sk_napi_id = skb->napi_id;
>> #endif
>> }
>>
>> @@ -125,7 +126,7 @@ static inline void sk_mark_napi_id_once(struct sock *sk,
>> const struct sk_buff *skb)
>> {
>> #ifdef CONFIG_NET_RX_BUSY_POLL
>> - if (!sk->sk_napi_id)
>> + if (!sk->sk_napi_id && (skb->napi_id > (u32)NR_CPUS))
>> sk->sk_napi_id = skb->napi_id;
>> #endif
>> }
>>
>
> It is not clear why this patch is needed .
>
> What you describe here is the case we might receive packets for a socket
> coming from different interfaces ?
>
> If skb->napi_id is a sender_cpu, why should we prevent overwriting the
> sk_napi_id with it, knowing that busy polling will simply ignore the
> invalid value ?
>
> Do not get me wrong :
>
> I simply try to understand why the test about napi_id validity is now
> done twice :
>
> 1) At the time we are writing into sk->sk_napi_id
I would argue that this is the one piece we were missing.
> 2) At busy polling time when we read sk->sk_napi_id
Unless there was something recently added I don't think this was ever
checked. Instead we start digging into the hash looking for the ID
that won't ever be there. Maybe we should add something to napi_by_id
that just returns NULL in these cases.
On top of that I think there end up being several spots where once we
lock in a non-NAPI ID it is stuck such as the sk_mark_napi_id_once
call. I figure we are better off locking in an actual NAPI ID rather
than getting a sender_cpu stuck in there.
On Thu, 2017-03-16 at 15:33 -0700, Alexander Duyck wrote:
> On Thu, Mar 16, 2017 at 3:05 PM, Eric Dumazet <[email protected]> wrote:
> > It is not clear why this patch is needed .
> >
> > What you describe here is the case we might receive packets for a socket
> > coming from different interfaces ?
> >
> > If skb->napi_id is a sender_cpu, why should we prevent overwriting the
> > sk_napi_id with it, knowing that busy polling will simply ignore the
> > invalid value ?
> >
> > Do not get me wrong :
> >
> > I simply try to understand why the test about napi_id validity is now
> > done twice :
> >
> > 1) At the time we are writing into sk->sk_napi_id
>
> I would argue that this is the one piece we were missing.
>
> > 2) At busy polling time when we read sk->sk_napi_id
>
> Unless there was something recently added I don't think this was ever
> checked. Instead we start digging into the hash looking for the ID
> that won't ever be there. Maybe we should add something to napi_by_id
> that just returns NULL in these cases.
But this is exactly what should happen.
For invalid ID, we return NULL from napi_by_id()
No need to add code for that, since the function is meant to deal with
valid cases.
> On top of that I think there end up being several spots where once we
> lock in a non-NAPI ID it is stuck such as the sk_mark_napi_id_once
> call. I figure we are better off locking in an actual NAPI ID rather
> than getting a sender_cpu stuck in there.
Are you referring to sk_mark_napi_id_once() ?
Since this is only used by UDP, I would be OK to avoid the 'locking' for
'sender_cpu' ids.
On Thu, 2017-03-16 at 11:32 -0700, Alexander Duyck wrote:
>
> + case SO_INCOMING_NAPI_ID:
> + v.val = sk->sk_napi_id;
> + break;
I guess that here you should filter out invalid values
(so that you no longer need the first patch in this series).
Also, it looks like eBPF will need access to skb->napi_id for
efficient SO_REUSEPORT support?
Thanks.
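For illustration, a hypothetical variant of the getsockopt() handler that
filters at read time, reusing the same "valid IDs are above NR_CPUS"
convention as patch 1 (not part of the posted series):

	case SO_INCOMING_NAPI_ID:
		v.val = sk->sk_napi_id;
		/* an unset field or a sender_cpu value is not a real NAPI ID */
		if (v.val <= (u32)NR_CPUS)
			v.val = 0;
		break;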
On Thu, Mar 16, 2017 at 3:50 PM, Eric Dumazet <[email protected]> wrote:
> On Thu, 2017-03-16 at 15:33 -0700, Alexander Duyck wrote:
>> On Thu, Mar 16, 2017 at 3:05 PM, Eric Dumazet <[email protected]> wrote:
>
>> > It is not clear why this patch is needed .
>> >
>> > What you describe here is the case we might receive packets for a socket
>> > coming from different interfaces ?
>> >
>> > If skb->napi_id is a sender_cpu, why should we prevent overwriting the
>> > sk_napi_id with it, knowing that busy polling will simply ignore the
>> > invalid value ?
>> >
>> > Do not get me wrong :
>> >
>> > I simply try to understand why the test about napi_id validity is now
>> > done twice :
>> >
>> > 1) At the time we are writing into sk->sk_napi_id
>>
>> I would argue that this is the one piece we were missing.
>>
>> > 2) At busy polling time when we read sk->sk_napi_id
>>
>> Unless there was something recently added I don't think this was ever
>> checked. Instead we start digging into the hash looking for the ID
>> that won't ever be there. Maybe we should add something to napi_by_id
>> that just returns NULL in these cases.
>
> But this is exactly what should happen.
>
> For invalid ID, we return NULL from napi_by_id()
>
> No need to add code for that, since the function is meant to deal with
> valid cases.
I don't know. My concern here is about the cost of going through all
that code just for something that we know shouldn't be valid. If
nothing else I might update sk_can_busy_loop so that it doesn't try
busy looping on a sk_napi_id that is NR_CPUS or less.
>> On top of that I think there end up being several spots where once we
>> lock in a non-NAPI ID it is stuck such as the sk_mark_napi_id_once
>> call. I figure we are better off locking in an actual NAPI ID rather
>> than getting a sender_cpu stuck in there.
>
> Are you referring to sk_mark_napi_id_once() ?
>
> Since this is only used by UDP, I would be OK to avoid the 'locking' for
> 'sender_cpu' ids.
What I probably can do is go through and replace all the spots where
we were checking for sk_napi_id being 0, and instead replace it with
a check against NR_CPUS.
On Thu, 2017-03-16 at 19:40 -0700, Alexander Duyck wrote:
> I don't know. My concern here is about the cost of going through all
> that code just for something that we know shouldn't be valid. If
> nothing else I might update sk_can_busy_loop so that it doesn't try
> busy looping on a sk_napi_id that is NR_CPUS or less.
But why would that be a win?
If napi_by_id() returns NULL, we immediately give up (goto out;).
So why should we add code for something that will not be useful for the
vast majority of cases, where the ID is valid and we need to do the hash
lookup anyway?
Is libc trying to avoid doing syscalls like close(-1)?
On Thu, Mar 16, 2017 at 7:57 PM, Eric Dumazet <[email protected]> wrote:
> On Thu, 2017-03-16 at 19:40 -0700, Alexander Duyck wrote:
>
>> What I probably can do is go through and replace all the spots where
>> we were checking for sk_napi_id being 0, and instead replace it with
>> a check against NR_CPUS.
>
> This seems a good idea.
Right. This is the path I am planning to go with. It will require about
the same amount of code change, but it replaces those checks. I just have
to make sure I catch all the spots where we were checking against 0, but
that shouldn't be too difficult.
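A hypothetical helper capturing that plan (name and placement are assumptions,
not part of the posted series):

static inline bool sk_napi_id_is_valid(const struct sock *sk)
{
	/* values in 0..NR_CPUS can only be an unset field or a sender_cpu
	 * leaked from the xmit path, never a real NAPI ID
	 */
	return sk->sk_napi_id > (u32)NR_CPUS;
}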
On Thu, 2017-03-16 at 19:40 -0700, Alexander Duyck wrote:
> What I probably can do is go through and replace all the spots where
> we were checking for sk_napi_id being 0, and instead replace it with
> a check against NR_CPUS.
This seems a good idea.
[CC += [email protected]]
Hello Alexander,
Since this is a kernel-user-space API change, please CC linux-api@ on this
and any future iterations of the patch. The kernel source file
Documentation/SubmitChecklist notes that all Linux kernel patches that
change userspace interfaces should be CCed to
[email protected], so that the various parties who are
interested in API changes are informed. For further information, see
https://www.kernel.org/doc/man-pages/linux-api-ml.html
Thanks,
Michael
On Thu, Mar 16, 2017 at 7:32 PM, Alexander Duyck
<[email protected]> wrote:
> This patch series is meant to add busy polling support to epoll when all of
> the sockets on a given epoll are either local or are being sourced by the
> same NAPI ID.
>
> In order to support this the first two patches clean up a few issues we
> found with the NAPI ID tracking and infrastructure.
>
> In the third patch we introduce SO_INCOMING_NAPI_ID so that applications
> have a means of trying to sort their incoming sockets to identify which
> requests should be routed where in order to keep the epoll listener aligned
> to a given Rx queue without having to rely on IRQ pinning.
>
> Finally the last two patches refactor the existing busy poll infrastructure
> to make it so that we can call it without necessarily needing a socket, and
> enable the bits needed to support epoll when all of the sockets on the
> epoll either share the same NAPI ID, or simply are reporting no NAPI ID.
>
> ---
>
> Sridhar Samudrala (5):
> net: Do not record sender_cpu as napi_id in socket receive paths
> net: Call sk_mark_napi_id() in the ACK receive path
> net: Introduce SO_INCOMING_NAPI_ID
> net: Commonize busy polling code to focus on napi_id instead of socket
> epoll: Add busy poll support to epoll with socket fds.
>
>
> arch/alpha/include/uapi/asm/socket.h | 2 +
> arch/avr32/include/uapi/asm/socket.h | 2 +
> arch/frv/include/uapi/asm/socket.h | 2 +
> arch/ia64/include/uapi/asm/socket.h | 2 +
> arch/m32r/include/uapi/asm/socket.h | 2 +
> arch/mips/include/uapi/asm/socket.h | 2 +
> arch/mn10300/include/uapi/asm/socket.h | 2 +
> arch/parisc/include/uapi/asm/socket.h | 2 +
> arch/powerpc/include/uapi/asm/socket.h | 2 +
> arch/s390/include/uapi/asm/socket.h | 2 +
> arch/sparc/include/uapi/asm/socket.h | 2 +
> arch/xtensa/include/uapi/asm/socket.h | 2 +
> fs/eventpoll.c | 115 ++++++++++++++++++++++++++++++++
> include/net/busy_poll.h | 14 +++-
> include/uapi/asm-generic/socket.h | 2 +
> net/core/dev.c | 16 ++--
> net/core/sock.c | 22 ++++++
> net/ipv4/tcp_ipv4.c | 1
> 18 files changed, 183 insertions(+), 11 deletions(-)
>
> --
--
Michael Kerrisk, Linux man-pages maintainer;
http://www.kernel.org/doc/man-pages/
Author of "The Linux Programming Interface", http://blog.man7.org/