Subject: Re: [PATCH net-next v4 2/3] udp: implement memory accounting helpers
From: Eric Dumazet
To: Paolo Abeni
Cc: netdev@vger.kernel.org, "David S. Miller", James Morris, Trond Myklebust, Alexander Duyck, Daniel Borkmann, Eric Dumazet, Tom Herbert, Hannes Frederic Sowa, Edward Cree, linux-nfs@vger.kernel.org
Date: Wed, 19 Oct 2016 21:43:42 -0700
Message-ID: <1476938622.5650.111.camel@edumazet-glaptop3.roam.corp.google.com>

On Wed, 2016-10-19 at 14:47 +0200, Paolo Abeni wrote:
> Avoid using the generic helpers.
> Add a new spinlock_t to the udp_sock structure and use
> it to protect the memory accounting operations, both on
> enqueue and on dequeue.
>
> On dequeue perform partial memory reclaiming, trying to
> leave a quantum of forward allocated memory.
>
> On enqueue use a custom helper, to allow some optimizations:
> - use a plain spin_lock() variant instead of the slightly
>   costly spin_lock_irqsave(),
> - avoid the dst_force check, since the calling code has already
>   dropped the skb dst
>
> The above needs custom memory reclaiming on shutdown, provided
> by udp_destruct_sock().
>
> v3 -> v4:
>  - rework memory accounting, simplifying the schema
>  - provide a helper for both memory scheduling and enqueuing
>
> v1 -> v2:
>  - use a udp specific destructor to perform memory reclaiming
>  - remove a couple of helpers, unneeded after the above cleanup
>  - do not reclaim memory on dequeue if not under memory
>    pressure
>  - rework the fwd accounting schema to avoid potential
>    integer overflow
>
> Acked-by: Hannes Frederic Sowa
> Signed-off-by: Paolo Abeni
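As background for the discussion below: the dequeue-side reclaiming described above rounds the reclaimable amount down to whole quanta. Here is a minimal standalone sketch of that arithmetic (userspace C, assuming SK_MEM_QUANTUM equals a 4096-byte page; reclaimable() and the main() harness are illustrative names only, not part of the patch):

#include <stdio.h>

#define SK_MEM_QUANTUM 4096	/* assumed: one page, as in the kernel */

/* Illustrative copy of the rounding done in udp_rmem_release():
 * reclaim whole quanta only; a non-zero 'partial' keeps (up to) one
 * quantum forward allocated for the next packet.
 */
static int reclaimable(int fwd_alloc, int partial)
{
	return (fwd_alloc - partial) & ~(SK_MEM_QUANTUM - 1);
}

int main(void)
{
	/* dequeue path (partial != 0): of 8192 forward-allocated bytes,
	 * 4096 are returned to the global accounting, 4096 are kept
	 */
	printf("%d\n", reclaimable(8192, 1));	/* prints 4096 */

	/* socket destruction (partial == 0): everything is returned */
	printf("%d\n", reclaimable(8192, 0));	/* prints 8192 */
	return 0;
}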
> ---
>  include/linux/udp.h |   1 +
>  include/net/udp.h   |   4 ++
>  net/ipv4/udp.c      | 111 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 116 insertions(+)
>
> diff --git a/include/linux/udp.h b/include/linux/udp.h
> index d1fd8cd..8ea7f8b 100644
> --- a/include/linux/udp.h
> +++ b/include/linux/udp.h
> @@ -66,6 +66,7 @@ struct udp_sock {
>  #define UDPLITE_RECV_CC  0x4		/* set via udplite setsocktopt */
>  	__u8		 pcflag;        /* marks socket as UDP-Lite if > 0 */
>  	__u8		 unused[3];
> +	spinlock_t	 mem_lock;	/* protects memory accounting */
>  	/*
>  	 * For encapsulation sockets.
>  	 */
> diff --git a/include/net/udp.h b/include/net/udp.h
> index ea53a87..18f1e6b 100644
> --- a/include/net/udp.h
> +++ b/include/net/udp.h
> @@ -246,6 +246,9 @@ static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb,
>  }
>
>  /* net/ipv4/udp.c */
> +void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len);
> +int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb);
> +
>  void udp_v4_early_demux(struct sk_buff *skb);
>  int udp_get_port(struct sock *sk, unsigned short snum,
>  		 int (*saddr_cmp)(const struct sock *,
> @@ -258,6 +261,7 @@ int udp_get_port(struct sock *sk, unsigned short snum,
>  void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst);
>  int udp_rcv(struct sk_buff *skb);
>  int udp_ioctl(struct sock *sk, int cmd, unsigned long arg);
> +int udp_init_sock(struct sock *sk);
>  int udp_disconnect(struct sock *sk, int flags);
>  unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait);
>  struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index 7d96dc2..7047bb3 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -1172,6 +1172,117 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,
>  	return ret;
>  }
>
> +static void udp_rmem_release(struct sock *sk, int size, int partial)
> +{
> +	struct udp_sock *up = udp_sk(sk);
> +	int amt;
> +
> +	atomic_sub(size, &sk->sk_rmem_alloc);
> +
> +	spin_lock_bh(&up->mem_lock);
> +	sk->sk_forward_alloc += size;
> +	amt = (sk->sk_forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1);
> +	sk->sk_forward_alloc -= amt;
> +	spin_unlock_bh(&up->mem_lock);
> +
> +	if (amt)
> +		__sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT);
> +}
> +
> +static void udp_rmem_free(struct sk_buff *skb)
> +{
> +	udp_rmem_release(skb->sk, skb->truesize, 1);
> +}
> +
> +int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
> +{
> +	struct sk_buff_head *list = &sk->sk_receive_queue;
> +	int rmem, delta, amt, err = -ENOMEM;
> +	struct udp_sock *up = udp_sk(sk);
> +	int size = skb->truesize;
> +
> +	/* try to avoid the costly atomic add/sub pair when the receive
> +	 * queue is full
> +	 */
> +	if (atomic_read(&sk->sk_rmem_alloc) + size > sk->sk_rcvbuf)
> +		goto drop;
> +
> +	rmem = atomic_add_return(size, &sk->sk_rmem_alloc);
> +	if (rmem > sk->sk_rcvbuf)
> +		goto uncharge_drop;

This is dangerous: we used to accept one packet in the receive queue,
even if sk_rcvbuf is too small (check __sock_queue_rcv_skb()).

With your code we might drop all packets even if the receive queue is
empty; this might add regressions for some apps.
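For illustration, one conceivable way to keep that guarantee — not necessarily the fix Eric has in mind — is to accept a charge that finds the queue empty even when it exceeds rcvbuf. A standalone simulation (udp_admit(), rmem_alloc and the parameters are hypothetical names, not kernel API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int rmem_alloc;	/* stands in for sk->sk_rmem_alloc */

static bool udp_admit(int truesize, int rcvbuf)
{
	int rmem;

	/* cheap early drop while the queue is already over the limit */
	if (atomic_load(&rmem_alloc) > rcvbuf)
		return false;

	rmem = atomic_fetch_add(&rmem_alloc, truesize) + truesize;
	if (rmem > rcvbuf && rmem != truesize) {
		atomic_fetch_sub(&rmem_alloc, truesize);	/* uncharge */
		return false;
	}
	/* rmem == truesize means the queue was empty: always accept */
	return true;
}

int main(void)
{
	/* a 3000-byte skb against a tiny 2048-byte rcvbuf */
	printf("%d\n", udp_admit(3000, 2048));	/* 1: queue was empty */
	printf("%d\n", udp_admit(3000, 2048));	/* 0: already over limit */
	return 0;
}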
> +
> +	spin_lock(&up->mem_lock);
> +	if (size < sk->sk_forward_alloc)
> +		goto unlock;

Strange label name, since you do "sk->sk_forward_alloc -= size;" before
the unlock.

You could avoid the goto with:

	if (size >= sk->sk_forward_alloc) {
		amt = sk_mem_pages(size);
		...
	}
	sk->sk_forward_alloc += delta;

> +
> +	amt = sk_mem_pages(size);
> +	delta = amt << SK_MEM_QUANTUM_SHIFT;
> +	if (!__sk_mem_raise_allocated(sk, delta, amt, SK_MEM_RECV)) {
> +		err = -ENOBUFS;
> +		spin_unlock(&up->mem_lock);
> +		goto uncharge_drop;
> +	}
> +
> +	sk->sk_forward_alloc += delta;
> +
> +unlock:
> +	sk->sk_forward_alloc -= size;
> +	spin_unlock(&up->mem_lock);
> +
> +	/* the skb owner is now the udp socket */
> +	skb_orphan(skb);
> +	skb->sk = sk;
> +	skb->destructor = udp_rmem_free;
> +
> +	skb->dev = NULL;
> +	sock_skb_set_dropcount(sk, skb);
> +
> +	spin_lock(&list->lock);

Since we acquire this lock anyway to store the packet, why not protect
sk_forward_alloc with this spinlock as well, and get rid of
up->mem_lock? Ideally a single spin_lock() should be enough; then almost
all the other atomic ops could be turned into non-atomic ops. (A rough
sketch of this layout appears at the end of this mail.)

> +	__skb_queue_tail(list, skb);
> +	spin_unlock(&list->lock);
> +
> +	if (!sock_flag(sk, SOCK_DEAD))
> +		sk->sk_data_ready(sk);
> +
> +	return 0;
> +
> +uncharge_drop:
> +	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
> +
> +drop:
> +	atomic_inc(&sk->sk_drops);
> +	return err;
> +}
> +EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb);
> +
> +static void udp_destruct_sock(struct sock *sk)
> +{
> +	/* reclaim completely the forward allocated memory */
> +	__skb_queue_purge(&sk->sk_receive_queue);
> +	udp_rmem_release(sk, 0, 0);
> +	inet_sock_destruct(sk);
> +}
> +
> +int udp_init_sock(struct sock *sk)
> +{
> +	sk->sk_destruct = udp_destruct_sock;
> +	spin_lock_init(&udp_sk(sk)->mem_lock);
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(udp_init_sock);
> +
> +void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
> +{
> +	if (unlikely(READ_ONCE(sk->sk_peek_off) >= 0)) {
> +		bool slow = lock_sock_fast(sk);
> +
> +		sk_peek_offset_bwd(sk, len);
> +		unlock_sock_fast(sk, slow);
> +	}
> +	consume_skb(skb);
> +}
> +EXPORT_SYMBOL_GPL(skb_consume_udp);
> +
>  /**
>   * first_packet_length - return length of first packet in receive queue
>   * @sk: socket
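To close, here is the rough, untested sketch of the single-lock layout mentioned above, applied to the enqueue path quoted earlier: sk_receive_queue.lock serializes both the queue and sk_forward_alloc, so up->mem_lock disappears. Illustrative only, not the actual patch; error paths are simplified and the skb ownership setup (skb_orphan(), destructor) is assumed to happen before the lock is taken:

	/* single lock: list->lock (i.e. sk_receive_queue.lock) protects
	 * the queue and the forward-allocation bookkeeping together
	 */
	spin_lock(&list->lock);
	if (size >= sk->sk_forward_alloc) {
		amt = sk_mem_pages(size);
		delta = amt << SK_MEM_QUANTUM_SHIFT;
		if (!__sk_mem_raise_allocated(sk, delta, amt, SK_MEM_RECV)) {
			err = -ENOBUFS;
			spin_unlock(&list->lock);
			goto uncharge_drop;
		}
		sk->sk_forward_alloc += delta;
	}
	sk->sk_forward_alloc -= size;
	__skb_queue_tail(list, skb);
	spin_unlock(&list->lock);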