In order to support IP_PKTINFO on those packets, we need to call
ipv4_pktinfo_prepare(), so this patch introduces minor changes to
that function to support this flow.
When sending mrouted/pimd daemons a cache report IGMP msg, it is
unnecessary to set dst on the newly created skb.
It used to be necessary on older versions until
commit d826eb14ecef ("ipv4: PKTINFO doesnt need dst reference") which
changed the way the IP_PKTINFO struct is retrieved.
Fixes: d826eb14ecef ("ipv4: PKTINFO doesnt need dst reference")
Signed-off-by: Leone Fernando <[email protected]>
---
include/net/ip.h | 10 +++++++++-
net/ipv4/ip_sockglue.c | 25 ++++++++++++++++---------
net/ipv4/ipmr.c | 12 +++++-------
net/ipv4/raw.c | 2 +-
net/ipv4/udp.c | 2 +-
5 files changed, 32 insertions(+), 19 deletions(-)
diff --git a/include/net/ip.h b/include/net/ip.h
index b31be912489a..1b40b7386c56 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -767,7 +767,15 @@ int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev);
* Functions provided by ip_sockglue.c
*/
-void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb);
+void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *iskb,
+ struct sk_buff *oskb);
+
+
+static inline void ipv4_pktinfo_input_prepare(const struct sock *sk, struct sk_buff *skb)
+{
+ ipv4_pktinfo_prepare(sk, skb, NULL);
+}
+
void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
struct sk_buff *skb, int tlen, int offset);
int ip_cmsg_send(struct sock *sk, struct msghdr *msg,
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index d7d13940774e..fb26963e3869 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1364,19 +1364,26 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
/**
* ipv4_pktinfo_prepare - transfer some info from rtable to skb
* @sk: socket
- * @skb: buffer
+ * @iskb: input buffer
+ * @oskb: out buffer
*
* To support IP_CMSG_PKTINFO option, we store rt_iif and specific
* destination in skb->cb[] before dst drop.
* This way, receiver doesn't make cache line misses to read rtable.
*/
-void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
+void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *iskb,
+ struct sk_buff *oskb)
{
- struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb);
+ struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(iskb);
bool prepare = inet_test_bit(PKTINFO, sk) ||
ipv6_sk_rxinfo(sk);
- if (prepare && skb_rtable(skb)) {
+ if (oskb) {
+ memcpy(oskb->cb, iskb->cb, sizeof(iskb->cb));
+ pktinfo = PKTINFO_SKB_CB(oskb);
+ }
+
+ if (prepare && skb_rtable(iskb)) {
/* skb->cb is overloaded: prior to this point it is IP{6}CB
* which has interface index (iif) as the first member of the
* underlying inet{6}_skb_parm struct. This code then overlays
@@ -1386,20 +1393,20 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
* (e.g., process binds socket to eth0 for Tx which is
* redirected to loopback in the rtable/dst).
*/
- struct rtable *rt = skb_rtable(skb);
- bool l3slave = ipv4_l3mdev_skb(IPCB(skb)->flags);
+ struct rtable *rt = skb_rtable(iskb);
+ bool l3slave = ipv4_l3mdev_skb(IPCB(iskb)->flags);
if (pktinfo->ipi_ifindex == LOOPBACK_IFINDEX)
- pktinfo->ipi_ifindex = inet_iif(skb);
+ pktinfo->ipi_ifindex = inet_iif(iskb);
else if (l3slave && rt && rt->rt_iif)
pktinfo->ipi_ifindex = rt->rt_iif;
- pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
+ pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(iskb);
} else {
pktinfo->ipi_ifindex = 0;
pktinfo->ipi_spec_dst.s_addr = 0;
}
- skb_dst_drop(skb);
+ skb_dst_drop(iskb);
}
int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 9e222a57bc2b..6ed7c88743f9 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1025,6 +1025,10 @@ static int ipmr_cache_report(const struct mr_table *mrt,
struct sk_buff *skb;
int ret;
+ mroute_sk = rcu_dereference(mrt->mroute_sk);
+ if (!mroute_sk)
+ return -EINVAL;
+
if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE)
skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
else
@@ -1069,7 +1073,7 @@ static int ipmr_cache_report(const struct mr_table *mrt,
msg = (struct igmpmsg *)skb_network_header(skb);
msg->im_vif = vifi;
msg->im_vif_hi = vifi >> 8;
- skb_dst_set(skb, dst_clone(skb_dst(pkt)));
+ ipv4_pktinfo_prepare(mroute_sk, pkt, skb);
/* Add our header */
igmp = skb_put(skb, sizeof(struct igmphdr));
igmp->type = assert;
@@ -1079,12 +1083,6 @@ static int ipmr_cache_report(const struct mr_table *mrt,
skb->transport_header = skb->network_header;
}
- mroute_sk = rcu_dereference(mrt->mroute_sk);
- if (!mroute_sk) {
- kfree_skb(skb);
- return -EINVAL;
- }
-
igmpmsg_netlink_event(mrt, skb);
/* Deliver to mrouted */
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 27da9d7294c0..cde60c8deed4 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -292,7 +292,7 @@ static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
/* Charge it to the socket. */
- ipv4_pktinfo_prepare(sk, skb);
+ ipv4_pktinfo_input_prepare(sk, skb);
if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) {
kfree_skb_reason(skb, reason);
return NET_RX_DROP;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 89e5a806b82e..3e5a418c96c3 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2169,7 +2169,7 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
udp_csum_pull_header(skb);
- ipv4_pktinfo_prepare(sk, skb);
+ ipv4_pktinfo_input_prepare(sk, skb);
return __udp_queue_rcv_skb(sk, skb);
csum_error:
--
2.34.1
On Wed, Dec 13, 2023 at 3:35 PM Leone Fernando <[email protected]> wrote:
>
> In order to support IP_PKTINFO on those packets, we need to call
> ipv4_pktinfo_prepare, so introduced minor changes to this
> function to support this flow.
>
> When sending mrouted/pimd daemons a cache report IGMP msg, it is
> unnecessary to set dst on the newly created skb.
> It used to be necessary on older versions until
> commit d826eb14ecef ("ipv4: PKTINFO doesnt need dst reference") which
> changed the way IP_PKTINFO struct is been retrieved.
>
Given this is a 12-year-old bug, I would rather target the net-next tree.
> Fixes: d826eb14ecef ("ipv4: PKTINFO doesnt need dst reference")
> Signed-off-by: Leone Fernando <[email protected]>
> ---
> include/net/ip.h | 10 +++++++++-
> net/ipv4/ip_sockglue.c | 25 ++++++++++++++++---------
> net/ipv4/ipmr.c | 12 +++++-------
> net/ipv4/raw.c | 2 +-
> net/ipv4/udp.c | 2 +-
> 5 files changed, 32 insertions(+), 19 deletions(-)
>
> diff --git a/include/net/ip.h b/include/net/ip.h
> index b31be912489a..1b40b7386c56 100644
> --- a/include/net/ip.h
> +++ b/include/net/ip.h
> @@ -767,7 +767,15 @@ int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev);
> * Functions provided by ip_sockglue.c
> */
>
> -void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb);
> +void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *iskb,
> + struct sk_buff *oskb);
> +
> +
> +static inline void ipv4_pktinfo_input_prepare(const struct sock *sk, struct sk_buff *skb)
> +{
> + ipv4_pktinfo_prepare(sk, skb, NULL);
> +}
> +
> void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
> struct sk_buff *skb, int tlen, int offset);
> int ip_cmsg_send(struct sock *sk, struct msghdr *msg,
> diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
> index d7d13940774e..fb26963e3869 100644
> --- a/net/ipv4/ip_sockglue.c
> +++ b/net/ipv4/ip_sockglue.c
> @@ -1364,19 +1364,26 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
> /**
> * ipv4_pktinfo_prepare - transfer some info from rtable to skb
> * @sk: socket
> - * @skb: buffer
> + * @iskb: input buffer
> + * @oskb: out buffer
> *
> * To support IP_CMSG_PKTINFO option, we store rt_iif and specific
> * destination in skb->cb[] before dst drop.
> * This way, receiver doesn't make cache line misses to read rtable.
> */
> -void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
> +void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *iskb,
> + struct sk_buff *oskb)
This looks more complicated than needed.
I am pretty sure we can fix the bug without touching this function...
> {
> - struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb);
> + struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(iskb);
> bool prepare = inet_test_bit(PKTINFO, sk) ||
> ipv6_sk_rxinfo(sk);
>
> - if (prepare && skb_rtable(skb)) {
> + if (oskb) {
> + memcpy(oskb->cb, iskb->cb, sizeof(iskb->cb));
> + pktinfo = PKTINFO_SKB_CB(oskb);
> + }
> +
> + if (prepare && skb_rtable(iskb)) {
> /* skb->cb is overloaded: prior to this point it is IP{6}CB
> * which has interface index (iif) as the first member of the
> * underlying inet{6}_skb_parm struct. This code then overlays
> @@ -1386,20 +1393,20 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
> * (e.g., process binds socket to eth0 for Tx which is
> * redirected to loopback in the rtable/dst).
> */
> - struct rtable *rt = skb_rtable(skb);
> - bool l3slave = ipv4_l3mdev_skb(IPCB(skb)->flags);
> + struct rtable *rt = skb_rtable(iskb);
> + bool l3slave = ipv4_l3mdev_skb(IPCB(iskb)->flags);
>
> if (pktinfo->ipi_ifindex == LOOPBACK_IFINDEX)
> - pktinfo->ipi_ifindex = inet_iif(skb);
> + pktinfo->ipi_ifindex = inet_iif(iskb);
> else if (l3slave && rt && rt->rt_iif)
> pktinfo->ipi_ifindex = rt->rt_iif;
>
> - pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
> + pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(iskb);
> } else {
> pktinfo->ipi_ifindex = 0;
> pktinfo->ipi_spec_dst.s_addr = 0;
> }
> - skb_dst_drop(skb);
> + skb_dst_drop(iskb);
> }
>
> int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
> diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
> index 9e222a57bc2b..6ed7c88743f9 100644
> --- a/net/ipv4/ipmr.c
> +++ b/net/ipv4/ipmr.c
> @@ -1025,6 +1025,10 @@ static int ipmr_cache_report(const struct mr_table *mrt,
> struct sk_buff *skb;
> int ret;
>
> + mroute_sk = rcu_dereference(mrt->mroute_sk);
> + if (!mroute_sk)
> + return -EINVAL;
> +
> if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE)
> skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
> else
> @@ -1069,7 +1073,7 @@ static int ipmr_cache_report(const struct mr_table *mrt,
> msg = (struct igmpmsg *)skb_network_header(skb);
> msg->im_vif = vifi;
> msg->im_vif_hi = vifi >> 8;
> - skb_dst_set(skb, dst_clone(skb_dst(pkt)));
> + ipv4_pktinfo_prepare(mroute_sk, pkt, skb);
All we need is to call ipv4_pktinfo_prepare(sk, pkt);
then copy pkt->cb to skb->cb ?
> /* Add our header */
> igmp = skb_put(skb, sizeof(struct igmphdr));
> igmp->type = assert;
> @@ -1079,12 +1083,6 @@ static int ipmr_cache_report(const struct mr_table *mrt,
> skb->transport_header = skb->network_header;
> }
>
> - mroute_sk = rcu_dereference(mrt->mroute_sk);
> - if (!mroute_sk) {
> - kfree_skb(skb);
> - return -EINVAL;
> - }
> -
> igmpmsg_netlink_event(mrt, skb);
>
> /* Deliver to mrouted */
> diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
> index 27da9d7294c0..cde60c8deed4 100644
> --- a/net/ipv4/raw.c
> +++ b/net/ipv4/raw.c
> @@ -292,7 +292,7 @@ static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
>
> /* Charge it to the socket. */
>
> - ipv4_pktinfo_prepare(sk, skb);
> + ipv4_pktinfo_input_prepare(sk, skb);
> if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) {
> kfree_skb_reason(skb, reason);
> return NET_RX_DROP;
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index 89e5a806b82e..3e5a418c96c3 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -2169,7 +2169,7 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
>
> udp_csum_pull_header(skb);
>
> - ipv4_pktinfo_prepare(sk, skb);
> + ipv4_pktinfo_input_prepare(sk, skb);
> return __udp_queue_rcv_skb(sk, skb);
>
> csum_error:
> --
> 2.34.1
>
Thank you Eric. I will submit a v2.