Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1754370Ab0DYJSI (ORCPT ); Sun, 25 Apr 2010 05:18:08 -0400 Received: from mga11.intel.com ([192.55.52.93]:14281 "EHLO mga11.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752854Ab0DYJRT (ORCPT ); Sun, 25 Apr 2010 05:17:19 -0400 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="4.52,269,1270450800"; d="scan'208";a="561295896" From: xiaohui.xin@intel.com To: netdev@vger.kernel.org, kvm@vger.kernel.org, linux-kernel@vger.kernel.org, mst@redhat.com, mingo@elte.hu, davem@davemloft.net, jdike@linux.intel.com Cc: Xin Xiaohui Subject: [RFC][PATCH v4 16/18] Export proto_ops to vhost-net driver. Date: Sun, 25 Apr 2010 17:20:03 +0800 Message-Id: <1272187206-18534-16-git-send-email-xiaohui.xin@intel.com> X-Mailer: git-send-email 1.5.4.4 In-Reply-To: <1272187206-18534-15-git-send-email-xiaohui.xin@intel.com> References: <1272187206-18534-1-git-send-email-xiaohui.xin@intel.com> <1272187206-18534-2-git-send-email-xiaohui.xin@intel.com> <1272187206-18534-3-git-send-email-xiaohui.xin@intel.com> <1272187206-18534-4-git-send-email-xiaohui.xin@intel.com> <1272187206-18534-5-git-send-email-xiaohui.xin@intel.com> <1272187206-18534-6-git-send-email-xiaohui.xin@intel.com> <1272187206-18534-7-git-send-email-xiaohui.xin@intel.com> <1272187206-18534-8-git-send-email-xiaohui.xin@intel.com> <1272187206-18534-9-git-send-email-xiaohui.xin@intel.com> <1272187206-18534-10-git-send-email-xiaohui.xin@intel.com> <1272187206-18534-11-git-send-email-xiaohui.xin@intel.com> <1272187206-18534-12-git-send-email-xiaohui.xin@intel.com> <1272187206-18534-13-git-send-email-xiaohui.xin@intel.com> <1272187206-18534-14-git-send-email-xiaohui.xin@intel.com> <1272187206-18534-15-git-send-email-xiaohui.xin@intel.com> Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 9398 Lines: 360 From: Xin Xiaohui Currently, vhost-net is the only user of the mp device. 
Signed-off-by: Xin Xiaohui Signed-off-by: Zhao Yu Reviewed-by: Jeff Dike --- drivers/vhost/mpassthru.c | 321 ++++++++++++++++++++++++++++++++++++++++++++- 1 files changed, 317 insertions(+), 4 deletions(-) diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c index b171f21..0ac1a71 100644 --- a/drivers/vhost/mpassthru.c +++ b/drivers/vhost/mpassthru.c @@ -563,8 +563,321 @@ failed: return NULL; } +static void mp_sock_destruct(struct sock *sk) +{ + struct mp_struct *mp = container_of(sk, struct mp_sock, sk)->mp; + kfree(mp); +} + +static void mp_sock_state_change(struct sock *sk) +{ + if (sk_has_sleeper(sk)) + wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN); +} + +static void mp_sock_write_space(struct sock *sk) +{ + if (sk_has_sleeper(sk)) + wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT); +} + +static void mp_sock_data_ready(struct sock *sk, int coming) +{ + struct mp_struct *mp = container_of(sk, struct mp_sock, sk)->mp; + struct page_ctor *ctor = NULL; + struct sk_buff *skb = NULL; + struct page_info *info = NULL; + struct ethhdr *eth; + struct kiocb *iocb = NULL; + int len, i; + + struct virtio_net_hdr hdr = { + .flags = 0, + .gso_type = VIRTIO_NET_HDR_GSO_NONE + }; + + ctor = rcu_dereference(mp->ctor); + if (!ctor) + return; + + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + if (skb_shinfo(skb)->destructor_arg) { + info = container_of(skb_shinfo(skb)->destructor_arg, + struct page_info, ext_page); + info->skb = skb; + if (skb->len > info->len) { + mp->dev->stats.rx_dropped++; + DBG(KERN_INFO "Discarded truncated rx packet: " + " len %d > %zd\n", skb->len, info->len); + info->total = skb->len; + goto clean; + } else { + int i; + struct skb_shared_info *gshinfo = + (struct skb_shared_info *) + (&info->ushinfo); + struct skb_shared_info *hshinfo = + skb_shinfo(skb); + + if (gshinfo->nr_frags < hshinfo->nr_frags) + goto clean; + eth = eth_hdr(skb); + skb_push(skb, ETH_HLEN); + + hdr.hdr_len = skb_headlen(skb); + info->total 
= skb->len; + + for (i = 0; i < gshinfo->nr_frags; i++) + gshinfo->frags[i].size = 0; + for (i = 0; i < hshinfo->nr_frags; i++) + gshinfo->frags[i].size = + hshinfo->frags[i].size; + memcpy(skb_shinfo(skb), &info->ushinfo, + sizeof(struct skb_shared_info)); + } + } else { + /* The skb composed with kernel buffers + * in case external buffers are not sufficent. + * The case should be rare. + */ + unsigned long flags; + int i; + struct skb_shared_info *gshinfo = NULL; + + info = NULL; + + spin_lock_irqsave(&ctor->read_lock, flags); + if (!list_empty(&ctor->readq)) { + info = list_first_entry(&ctor->readq, + struct page_info, list); + list_del(&info->list); + } + spin_unlock_irqrestore(&ctor->read_lock, flags); + if (!info) { + DBG(KERN_INFO + "No external buffer avaliable %p\n", + skb); + skb_queue_head(&sk->sk_receive_queue, + skb); + break; + } + info->skb = skb; + /* compute the guest skb frags info */ + gshinfo = (struct skb_shared_info *) + (info->ext_page.start + + SKB_DATA_ALIGN(info->ext_page.size)); + + if (gshinfo->nr_frags < skb_shinfo(skb)->nr_frags) + goto clean; + + eth = eth_hdr(skb); + skb_push(skb, ETH_HLEN); + info->total = skb->len; + + for (i = 0; i < gshinfo->nr_frags; i++) + gshinfo->frags[i].size = 0; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + gshinfo->frags[i].size = + skb_shinfo(skb)->frags[i].size; + hdr.hdr_len = min_t(int, skb->len, + info->iov[1].iov_len); + skb_copy_datagram_iovec(skb, 0, info->iov, skb->len); + } + + len = memcpy_toiovec(info->hdr, (unsigned char *)&hdr, + sizeof hdr); + if (len) { + DBG(KERN_INFO + "Unable to write vnet_hdr at addr %p: %d\n", + info->hdr->iov_base, len); + goto clean; + } + + iocb = create_iocb(info, skb->len + sizeof(hdr)); + continue; + +clean: + kfree_skb(skb); + for (i = 0; info->pages[i]; i++) + put_page(info->pages[i]); + kmem_cache_free(ctor->cache, info); + } + return; +} + +static int mp_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t total_len) +{ + struct 
mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp; + struct page_ctor *ctor; + struct iovec *iov = m->msg_iov; + struct page_info *info = NULL; + struct frag frags[MAX_SKB_FRAGS]; + struct sk_buff *skb; + int count = m->msg_iovlen; + int total = 0, header, n, i, len, rc; + unsigned long base; + + ctor = rcu_dereference(mp->ctor); + if (!ctor) + return -ENODEV; + + total = iov_length(iov, count); + + if (total < ETH_HLEN) + return -EINVAL; + + if (total <= COPY_THRESHOLD) + goto copy; + + n = 0; + for (i = 0; i < count; i++) { + base = (unsigned long)iov[i].iov_base; + len = iov[i].iov_len; + if (!len) + continue; + n += ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; + if (n > MAX_SKB_FRAGS) + return -EINVAL; + } + +copy: + header = total > COPY_THRESHOLD ? COPY_HDR_LEN : total; + + skb = alloc_skb(header + NET_IP_ALIGN, GFP_ATOMIC); + if (!skb) + goto drop; + + skb_reserve(skb, NET_IP_ALIGN); + + skb_set_network_header(skb, ETH_HLEN); + + memcpy_fromiovec(skb->data, iov, header); + skb_put(skb, header); + skb->protocol = *((__be16 *)(skb->data) + ETH_ALEN); + + if (header == total) { + rc = total; + info = alloc_small_page_info(ctor, iocb, total); + } else { + info = alloc_page_info(ctor, iocb, iov, count, frags, 0, total); + if (info) + for (i = 0; info->pages[i]; i++) { + skb_add_rx_frag(skb, i, info->pages[i], + frags[i].offset, frags[i].size); + info->pages[i] = NULL; + } + } + if (info != NULL) { + info->desc_pos = iocb->ki_pos; + info->total = total; + info->skb = skb; + skb_shinfo(skb)->destructor_arg = &info->ext_page; + skb->dev = mp->dev; + dev_queue_xmit(skb); + return 0; + } +drop: + kfree_skb(skb); + if (info) { + for (i = 0; info->pages[i]; i++) + put_page(info->pages[i]); + kmem_cache_free(info->ctor->cache, info); + } + mp->dev->stats.tx_dropped++; + return -ENOMEM; +} + +static int mp_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t total_len, + int flags) +{ + struct mp_struct *mp = 
container_of(sock->sk, struct mp_sock, sk)->mp; + struct page_ctor *ctor; + struct iovec *iov = m->msg_iov; + int count = m->msg_iovlen; + int npages, payload; + struct page_info *info; + struct frag frags[MAX_SKB_FRAGS]; + unsigned long base; + int i, len; + unsigned long flag; + + if (!(flags & MSG_DONTWAIT)) + return -EINVAL; + + ctor = rcu_dereference(mp->ctor); + if (!ctor) + return -EINVAL; + + /* Error detections in case invalid external buffer */ + if (count > 2 && iov[1].iov_len < ctor->port.hdr_len && + mp->dev->features & NETIF_F_SG) { + return -EINVAL; + } + + npages = ctor->port.npages; + payload = ctor->port.data_len; + + /* If KVM guest virtio-net FE driver use SG feature */ + if (count > 2) { + for (i = 2; i < count; i++) { + base = (unsigned long)iov[i].iov_base & ~PAGE_MASK; + len = iov[i].iov_len; + if (npages == 1) + len = min_t(int, len, PAGE_SIZE - base); + else if (base) + break; + payload -= len; + if (payload <= 0) + goto proceed; + if (npages == 1 || (len & ~PAGE_MASK)) + break; + } + } + + if ((((unsigned long)iov[1].iov_base & ~PAGE_MASK) + - NET_SKB_PAD - NET_IP_ALIGN) >= 0) + goto proceed; + + return -EINVAL; + +proceed: + /* skip the virtnet head */ + iov++; + count--; + + if (!ctor->lock_pages) + set_memlock_rlimit(ctor, RLIMIT_MEMLOCK, + iocb->ki_user_data * 4096, + iocb->ki_user_data * 4096); + + /* Translate address to kernel */ + info = alloc_page_info(ctor, iocb, iov, count, frags, npages, 0); + if (!info) + return -ENOMEM; + info->len = total_len; + info->hdr[0].iov_base = iocb->ki_iovec[0].iov_base; + info->hdr[0].iov_len = iocb->ki_iovec[0].iov_len; + info->offset = frags[0].offset; + info->desc_pos = iocb->ki_pos; + + iov--; + count++; + + memcpy(info->iov, iov, sizeof(struct iovec) * count); + + spin_lock_irqsave(&ctor->read_lock, flag); + list_add_tail(&info->list, &ctor->readq); + spin_unlock_irqrestore(&ctor->read_lock, flag); + + return 0; +} + /* Ops structure to mimic raw sockets with mp device */ static const struct 
proto_ops mp_socket_ops = { + .sendmsg = mp_sendmsg, + .recvmsg = mp_recvmsg, }; static struct proto mp_proto = { @@ -687,10 +1000,10 @@ static long mp_chr_ioctl(struct file *file, unsigned int cmd, sk->sk_sndbuf = INT_MAX; container_of(sk, struct mp_sock, sk)->mp = mp; - sk->sk_destruct = NULL; - sk->sk_data_ready = NULL; - sk->sk_write_space = NULL; - sk->sk_state_change = NULL; + sk->sk_destruct = mp_sock_destruct; + sk->sk_data_ready = mp_sock_data_ready; + sk->sk_write_space = mp_sock_write_space; + sk->sk_state_change = mp_sock_state_change; ret = mp_attach(mp, file); if (ret < 0) goto err_free_sk; -- 1.5.4.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/