Received: by 2002:a25:ab43:0:0:0:0:0 with SMTP id u61csp4566704ybi; Tue, 11 Jun 2019 08:42:44 -0700 (PDT) X-Google-Smtp-Source: APXvYqyWk3dXGi9ZiXvJblynnWru6KkSETJJP7qenPA6YmNKEdhdDeKe11FHntXIn/jfp+nvU1XU X-Received: by 2002:a17:902:7c8e:: with SMTP id y14mr8265957pll.298.1560267764186; Tue, 11 Jun 2019 08:42:44 -0700 (PDT) ARC-Seal: i=1; a=rsa-sha256; t=1560267764; cv=none; d=google.com; s=arc-20160816; b=KjHTMHz1ybuJrMVXaEfWseNnfolBJUk/4lHJi2eUFEsRRzWevDTh2TGs+jAr1wkBj7 SyiBrn70qcJ3nPXUCj6dhCdXVu8IUn7R199TZm8U/eo5QI0RyAgVMWg+sdy9jbhk7pAN fPo7RAMILw65wI9pIW0XK6SurNDtcJbHmUX3LbmZsS7Vs5TxIUF8g5Lx8yHAU9c1xZum uNWe55np3QNvjryPu/51FROsvHuYaXzClYdelEozH4oIUIH90JBNpLjPUwJAC8DHqmGZ 4V3uZIw6KbQYPkETciM5kt0KfDqiTghDN8AfnGKd8JrV6G8kU1j9I+qe9P+mr3baMgJE sRUA== ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=arc-20160816; h=list-id:precedence:sender:to:content-transfer-encoding:mime-version :references:in-reply-to:message-id:date:subject:cc:from; bh=O659x6BgNgETBnvBL243TemPD7NJ6Ipo3YobGdShhw4=; b=YneY5f/xfKdGv6Rat2gQIWorn5XjmImqAPHCWLg23JcbqmQsoZTvXgk1HlHuIVquyh ZqH43TpZ9LITcZ/sv3m+eyOCAtpn5qJ1an6sLDLgyLAzIQwxHwPxZign+j5z7L51PzMD +0CiFcoVYDa3XSEBShvBX8r/2BSWo4YDf9/ogQ83VO1D5+xp82195P6Ou7z20Kkqs+eU HhYzEw5GJJPHLoBudSHdFQ4HH6BtYIRD3vV2RJSmLW7NZbLKEbFgBrX5EKUW/afD07ft ea1QG1z06rpn83Z1Z17B1WoiJuBjytqcpDv5RNear8ladNx4aXhhv8ncB3MZsPCbaIG/ e11A== ARC-Authentication-Results: i=1; mx.google.com; spf=pass (google.com: best guess record for domain of linux-kernel-owner@vger.kernel.org designates 209.132.180.67 as permitted sender) smtp.mailfrom=linux-kernel-owner@vger.kernel.org Return-Path: Received: from vger.kernel.org (vger.kernel.org. [209.132.180.67]) by mx.google.com with ESMTP id o6si13257835plh.197.2019.06.11.08.42.28; Tue, 11 Jun 2019 08:42:44 -0700 (PDT) Received-SPF: pass (google.com: best guess record for domain of linux-kernel-owner@vger.kernel.org designates 209.132.180.67 as permitted sender) client-ip=209.132.180.67; Authentication-Results: mx.google.com; spf=pass (google.com: best guess record for domain of linux-kernel-owner@vger.kernel.org designates 209.132.180.67 as permitted sender) smtp.mailfrom=linux-kernel-owner@vger.kernel.org Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S2404147AbfFKOzV (ORCPT + 99 others); Tue, 11 Jun 2019 10:55:21 -0400 Received: from mx2.suse.de ([195.135.220.15]:52598 "EHLO mx1.suse.de" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S2404070AbfFKOzT (ORCPT ); Tue, 11 Jun 2019 10:55:19 -0400 X-Virus-Scanned: by amavisd-new at test-mx.suse.de Received: from relay2.suse.de (unknown [195.135.220.254]) by mx1.suse.de (Postfix) with ESMTP id 76E02AF0C; Tue, 11 Jun 2019 14:55:17 +0000 (UTC) From: Roman Penyaev Cc: Roman Penyaev , Andrew Morton , Al Viro , Linus Torvalds , linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org Subject: [PATCH v4 08/14] epoll: support polling from userspace for ep_insert() Date: Tue, 11 Jun 2019 16:54:52 +0200 Message-Id: <20190611145458.9540-9-rpenyaev@suse.de> X-Mailer: git-send-email 2.21.0 In-Reply-To: <20190611145458.9540-1-rpenyaev@suse.de> References: <20190611145458.9540-1-rpenyaev@suse.de> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit To: unlisted-recipients:; (no To-header on input) Sender: linux-kernel-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org When epfd is polled by userspace and new item is inserted new bit should be get from a bitmap and then user item is set accordingly. Signed-off-by: Roman Penyaev Cc: Andrew Morton Cc: Al Viro Cc: Linus Torvalds Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- fs/eventpoll.c | 118 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 100 insertions(+), 18 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index bcd57ca47564..4f541f85c7e5 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -885,6 +885,23 @@ static void epi_rcu_free(struct rcu_head *head) kmem_cache_free(epi_cache, epi); } +static inline int ep_get_bit(struct eventpoll *ep) +{ + bool was_set; + int bit; + + lockdep_assert_held(&ep->mtx); + + bit = find_first_zero_bit(ep->items_bm, ep->max_items_nr); + if (bit >= ep->max_items_nr) + return -ENOSPC; + + was_set = test_and_set_bit(bit, ep->items_bm); + WARN_ON(was_set); + + return bit; +} + #define set_unless_zero_atomically(ptr, flags) \ ({ \ typeof(ptr) _ptr = (ptr); \ @@ -1996,6 +2013,33 @@ static noinline void ep_destroy_wakeup_source(struct epitem *epi) wakeup_source_unregister(ws); } +static inline struct epitem *epi_alloc(struct eventpoll *ep) +{ + struct epitem *epi; + + if (ep_polled_by_user(ep)) { + struct uepitem *uepi; + + uepi = kmem_cache_alloc(uepi_cache, GFP_KERNEL); + if (likely(uepi)) + epi = &uepi->epi; + else + epi = NULL; + } else { + epi = kmem_cache_alloc(epi_cache, GFP_KERNEL); + } + + return epi; +} + +static inline void epi_free(struct eventpoll *ep, struct epitem *epi) +{ + if (ep_polled_by_user(ep)) + kmem_cache_free(uepi_cache, uep_item_from_epi(epi)); + else + kmem_cache_free(epi_cache, epi); +} + /* * Must be called with "mtx" held. */ @@ -2008,29 +2052,55 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, struct epitem *epi; struct ep_pqueue epq; + lockdep_assert_held(&ep->mtx); lockdep_assert_irqs_enabled(); user_watches = atomic_long_read(&ep->user->epoll_watches); if (unlikely(user_watches >= max_user_watches)) return -ENOSPC; - if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) + epi = epi_alloc(ep); + if (unlikely(!epi)) return -ENOMEM; /* Item initialization follow here ... */ INIT_LIST_HEAD(&epi->rdllink); INIT_LIST_HEAD(&epi->fllink); INIT_LIST_HEAD(&epi->pwqlist); + RCU_INIT_POINTER(epi->ws, NULL); epi->ep = ep; ep_set_ffd(&epi->ffd, tfile, fd); epi->event = *event; epi->nwait = 0; epi->next = EP_UNACTIVE_PTR; - if (epi->event.events & EPOLLWAKEUP) { + + if (ep_polled_by_user(ep)) { + struct uepitem *uepi = uep_item_from_epi(epi); + struct epoll_uitem *uitem; + int bit; + + INIT_WORK(&uepi->work, ep_poll_callback_work); + + bit = ep_get_bit(ep); + if (unlikely(bit < 0)) { + error = bit; + goto error_get_bit; + } + uepi->bit = bit; + + /* + * Now fill-in user item. Do not touch ready_events, since + * it can be EPOLLREMOVED (has been set by previous user + * item), thus user index entry can be not yet consumed + * by userspace. See ep_remove_user_item() and + * ep_add_event_to_uring() for details. + */ + uitem = &ep->user_header->items[uepi->bit]; + uitem->events = event->events; + uitem->data = event->data; + } else if (epi->event.events & EPOLLWAKEUP) { error = ep_create_wakeup_source(epi); if (error) goto error_create_wakeup_source; - } else { - RCU_INIT_POINTER(epi->ws, NULL); } /* Initialize the poll table using the queue callback */ @@ -2077,16 +2147,23 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, /* record NAPI ID of new item if present */ ep_set_busy_poll_napi_id(epi); - /* If the file is already "ready" we drop it inside the ready list */ - if (revents && !ep_is_linked(epi)) { - list_add_tail(&epi->rdllink, &ep->rdllist); - ep_pm_stay_awake(epi); + if (revents) { + bool added = false; - /* Notify waiting tasks that events are available */ - if (waitqueue_active(&ep->wq)) - wake_up(&ep->wq); - if (waitqueue_active(&ep->poll_wait)) - pwake++; + if (ep_polled_by_user(ep)) { + added = ep_add_event_to_uring(epi, revents); + } else if (!ep_is_linked(epi)) { + list_add_tail(&epi->rdllink, &ep->rdllist); + ep_pm_stay_awake(epi); + added = true; + } + if (added) { + /* Notify waiting tasks that events are available */ + if (waitqueue_active(&ep->wq)) + wake_up(&ep->wq); + if (waitqueue_active(&ep->poll_wait)) + pwake++; + } } write_unlock_irq(&ep->lock); @@ -2115,15 +2192,20 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, * list, since that is used/cleaned only inside a section bound by "mtx". * And ep_insert() is called with "mtx" held. */ - write_lock_irq(&ep->lock); - if (ep_is_linked(epi)) - list_del_init(&epi->rdllink); - write_unlock_irq(&ep->lock); + if (ep_polled_by_user(ep)) { + ep_remove_user_item(epi); + } else { + write_lock_irq(&ep->lock); + if (ep_is_linked(epi)) + list_del_init(&epi->rdllink); + write_unlock_irq(&ep->lock); + } wakeup_source_unregister(ep_wakeup_source(epi)); +error_get_bit: error_create_wakeup_source: - kmem_cache_free(epi_cache, epi); + epi_free(ep, epi); return error; } -- 2.21.0