Hi,
Running inotify on a very large hierarchy of directories and
files can result in major lock contention between the producer and
consumer of events.
The current code takes the spinlock when pulling each event off the
queue, which isn't overly ideal. This patchset reduces the lock
contention by splicing off the event queue while processing reads and
putting the leftover events back on the queue once the read buffer is
full. In order to ensure ordering when reading events, a mutex is
added to read().
While this should reduce the number of locks taken on read when
reading in bulk, it does increase the overhead for the case where the
user tries to read one event at a time.
Last, the first patch of the series adds the missing mutex_destroy()
for mark_mutex.
Thoughts?
Jes
Jes Sorensen (5):
notify: Call mutex_destroy() before freeing mutex memory
inotify: Use mutex to prevent threaded clients reading events out of
order
notify: Split up some fsnotify functions
inotify: switch get_one_event() to use fsnotify_list_*() helpers
inotify: Avoid taking spinlock for each event processed in read()
fs/notify/group.c | 1 +
fs/notify/inotify/inotify_fsnotify.c | 1 +
fs/notify/inotify/inotify_user.c | 42 ++++++++++++++++++++++++++++--------
fs/notify/notification.c | 38 ++++++++++++++++++++++++++------
include/linux/fsnotify_backend.h | 7 ++++++
5 files changed, 73 insertions(+), 16 deletions(-)
--
2.9.3
Introduces a mutex in the read() path to prevent a threaded client reading
from the same fd consuming events out of order.
This will matter when avoiding taking the spinlock when consuming each
event in the read() path.
Signed-off-by: Jes Sorensen <[email protected]>
Reviewed-by: Josef Bacik <[email protected]>
---
fs/notify/inotify/inotify_fsnotify.c | 1 +
fs/notify/inotify/inotify_user.c | 4 ++++
include/linux/fsnotify_backend.h | 3 +++
3 files changed, 8 insertions(+)
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 1aeb837..63c071f 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -168,6 +168,7 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
idr_destroy(&group->inotify_data.idr);
if (group->inotify_data.ucounts)
dec_inotify_instances(group->inotify_data.ucounts);
+ mutex_destroy(&group->inotify_data.consumer_mutex);
}
static void inotify_free_event(struct fsnotify_event *fsn_event)
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 498d609..6f5b360 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -230,6 +230,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
start = buf;
group = file->private_data;
+ mutex_lock(&group->inotify_data.consumer_mutex);
add_wait_queue(&group->notification_waitq, &wait);
while (1) {
spin_lock(&group->notification_lock);
@@ -264,6 +265,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
remove_wait_queue(&group->notification_waitq, &wait);
+ mutex_unlock(&group->inotify_data.consumer_mutex);
if (start != buf && ret != -EFAULT)
ret = buf - start;
@@ -650,6 +652,8 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
group->max_events = max_events;
+ mutex_init(&group->inotify_data.consumer_mutex);
+
spin_lock_init(&group->inotify_data.idr_lock);
idr_init(&group->inotify_data.idr);
group->inotify_data.ucounts = inc_ucount(current_user_ns(),
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index e6e689b..e0686ed 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -172,6 +172,9 @@ struct fsnotify_group {
spinlock_t idr_lock;
struct idr idr;
struct ucounts *ucounts;
+ struct mutex consumer_mutex; /* Prevent out of order
+ * delivery of events
+ * to threaded consumers */
} inotify_data;
#endif
#ifdef CONFIG_FANOTIFY
--
2.9.3
Splice off the notification list while processing read events, which allows
it to be processed without taking and releasing the spinlock for each
event.
Signed-off-by: Jes Sorensen <[email protected]>
Reviewed-by: Josef Bacik <[email protected]>
---
fs/notify/inotify/inotify_user.c | 28 ++++++++++++++++++++++++----
1 file changed, 24 insertions(+), 4 deletions(-)
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 4a93750..45b1f69 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -224,22 +224,27 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
struct fsnotify_group *group;
struct fsnotify_event *kevent;
char __user *start;
- int ret;
+ int ret, consumed = 0;
DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ LIST_HEAD(tmp_list);
start = buf;
group = file->private_data;
mutex_lock(&group->inotify_data.consumer_mutex);
add_wait_queue(&group->notification_waitq, &wait);
+
+ spin_lock(&group->notification_lock);
+ list_splice_init(&group->notification_list, &tmp_list);
+ spin_unlock(&group->notification_lock);
+
while (1) {
- spin_lock(&group->notification_lock);
- kevent = get_one_event(&group->notification_list, count);
- spin_unlock(&group->notification_lock);
+ kevent = get_one_event(&tmp_list, count);
pr_debug("%s: group=%p kevent=%p\n", __func__, group, kevent);
if (kevent) {
+ consumed++;
ret = PTR_ERR(kevent);
if (IS_ERR(kevent))
break;
@@ -262,8 +267,23 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
if (start != buf)
break;
+ spin_lock(&group->notification_lock);
+ list_splice_init(&tmp_list, &group->notification_list);
+ group->q_len -= consumed;
+ consumed = 0;
+ spin_unlock(&group->notification_lock);
+
wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+
+ spin_lock(&group->notification_lock);
+ list_splice_init(&group->notification_list, &tmp_list);
+ spin_unlock(&group->notification_lock);
}
+ spin_lock(&group->notification_lock);
+ list_splice(&tmp_list, &group->notification_list);
+ group->q_len -= consumed;
+ spin_unlock(&group->notification_lock);
+
remove_wait_queue(&group->notification_waitq, &wait);
mutex_unlock(&group->inotify_data.consumer_mutex);
--
2.9.3
This splits up fsnotify_remove_first_event() and fsnotify_peek_first_event()
into a helper function with a wrapper, and introduces two new versions that
take a list instead of the group as argument.
Signed-off-by: Jes Sorensen <[email protected]>
Reviewed-by: Josef Bacik <[email protected]>
---
fs/notify/notification.c | 38 +++++++++++++++++++++++++++++++-------
include/linux/fsnotify_backend.h | 4 ++++
2 files changed, 35 insertions(+), 7 deletions(-)
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 66f85c6..7b38cf8 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -144,26 +144,43 @@ int fsnotify_add_event(struct fsnotify_group *group,
* Remove and return the first event from the notification list. It is the
* responsibility of the caller to destroy the obtained event
*/
-struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
+struct fsnotify_event *
+__fsnotify_remove_first_event(struct list_head *notification_list)
{
struct fsnotify_event *event;
- assert_spin_locked(&group->notification_lock);
-
- pr_debug("%s: group=%p\n", __func__, group);
-
- event = list_first_entry(&group->notification_list,
+ event = list_first_entry(notification_list,
struct fsnotify_event, list);
/*
* We need to init list head for the case of overflow event so that
* check in fsnotify_add_event() works
*/
list_del_init(&event->list);
- group->q_len--;
return event;
}
+struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
+{
+ struct list_head *notification_list = &group->notification_list;
+
+ assert_spin_locked(&group->notification_lock);
+
+ pr_debug("%s: group=%p\n", __func__, group);
+
+ group->q_len--;
+ return __fsnotify_remove_first_event(notification_list);
+}
+
+/*
+ * Note this version doesn't update the queue depth counter.
+ */
+struct fsnotify_event *
+fsnotify_list_remove_first_event(struct list_head *notification_list)
+{
+ return __fsnotify_remove_first_event(notification_list);
+}
+
/*
* This will not remove the event, that must be done with
* fsnotify_remove_first_event()
@@ -176,6 +193,13 @@ struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group)
struct fsnotify_event, list);
}
+struct fsnotify_event *
+fsnotify_list_peek_first_event(struct list_head *notification_list)
+{
+ return list_first_entry(notification_list,
+ struct fsnotify_event, list);
+}
+
/*
* Called when a group is being torn down to clean up any outstanding
* event notifications.
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index e0686ed..80d4c3b 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -313,8 +313,12 @@ extern int fsnotify_add_event(struct fsnotify_group *group,
extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group);
/* return, but do not dequeue the first event on the notification queue */
extern struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group);
+/* return, but do not dequeue the first event on the notification list */
+extern struct fsnotify_event *fsnotify_list_peek_first_event(struct list_head *notification_list);
/* return AND dequeue the first event on the notification queue */
extern struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group);
+/* return AND dequeue the first event on the notification list */
+extern struct fsnotify_event *fsnotify_list_remove_first_event(struct list_head *notification_list);
/* functions used to manipulate the marks attached to inodes */
--
2.9.3
Preparing to switch inotify to code not taking the spinlock for each
event in read()
Signed-off-by: Jes Sorensen <[email protected]>
Reviewed-by: Josef Bacik <[email protected]>
---
fs/notify/inotify/inotify_user.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 6f5b360..4a93750 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -138,18 +138,18 @@ static int round_event_name_len(struct fsnotify_event *fsn_event)
*
* Called with the group->notification_lock held.
*/
-static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
+static struct fsnotify_event *get_one_event(struct list_head *notification_list,
size_t count)
{
size_t event_size = sizeof(struct inotify_event);
struct fsnotify_event *event;
- if (fsnotify_notify_queue_is_empty(group))
+ if (list_empty(notification_list))
return NULL;
- event = fsnotify_peek_first_event(group);
+ event = fsnotify_list_peek_first_event(notification_list);
- pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+ pr_debug("%s: event=%p\n", __func__, event);
event_size += round_event_name_len(event);
if (event_size > count)
@@ -157,7 +157,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
/* held the notification_lock the whole time, so this is the
* same event we peeked above */
- fsnotify_remove_first_event(group);
+ fsnotify_list_remove_first_event(notification_list);
return event;
}
@@ -234,7 +234,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
add_wait_queue(&group->notification_waitq, &wait);
while (1) {
spin_lock(&group->notification_lock);
- kevent = get_one_event(group, count);
+ kevent = get_one_event(&group->notification_list, count);
spin_unlock(&group->notification_lock);
pr_debug("%s: group=%p kevent=%p\n", __func__, group, kevent);
--
2.9.3
The conversion of notify from using a spinlock to a mutex missed
adding a call to mutex_destroy().
Fixes: 986ab09 ("fsnotify: use a mutex instead of a spinlock to protect a groups mark list")
Signed-off-by: Jes Sorensen <[email protected]>
Reviewed-by: Josef Bacik <[email protected]>
---
fs/notify/group.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/fs/notify/group.c b/fs/notify/group.c
index fbe3cbe..d231be3 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -36,6 +36,7 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
if (group->ops->free_group_priv)
group->ops->free_group_priv(group);
+ mutex_destroy(&group->mark_mutex);
kfree(group);
}
--
2.9.3