Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1754663AbcJGKqT (ORCPT ); Fri, 7 Oct 2016 06:46:19 -0400 Received: from mail-wm0-f52.google.com ([74.125.82.52]:33538 "EHLO mail-wm0-f52.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753953AbcJGKqR (ORCPT ); Fri, 7 Oct 2016 06:46:17 -0400 From: Nikolay Borisov To: ebiederm@xmission.com Cc: john@johnmccutchan.com, eparis@parisplace.org, viro@zeniv.linux.org.uk, jack@suse.cz, serge@hallyn.com, avagin@openvz.org, linux-kernel@vger.kernel.org, containers@lists.linux-foundation.org, Nikolay Borisov Subject: [PATCH] inotify: Convert to using per-namespace limits Date: Fri, 7 Oct 2016 13:46:01 +0300 Message-Id: <1475837161-4626-1-git-send-email-kernel@kyup.com> X-Mailer: git-send-email 1.7.1 Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 8748 Lines: 256 This patchset converts inotify to using the newly introduced per-userns sysctl infrastructure. Currently the inotify instances/watches are being accounted in the user_struct structure. This means that in setups where multiple users in unprivileged containers map to the same underlying real user (i.e. pointing to the same user_struct) the inotify limits are going to be shared as well, allowing one user(or application) to exhaust all others limits. Fix this by switching the inotify sysctls to using the per-namespace/per-user limits. This will allow the server admin to set sensible global limits, which can further be tuned inside every individual user namespace. Signed-off-by: Nikolay Borisov --- Hello Eric, I saw you've finally sent your pull request for 4.9 and it includes your implementatino of the ucount infrastructure. So here is my respin of the inotify patches using that. fs/notify/inotify/inotify.h | 17 +++++++++++++ fs/notify/inotify/inotify_fsnotify.c | 6 ++--- fs/notify/inotify/inotify_user.c | 46 ++++++++++++------------------------ include/linux/fsnotify_backend.h | 3 ++- include/linux/sched.h | 4 ---- include/linux/user_namespace.h | 4 ++++ kernel/ucount.c | 6 ++++- 7 files changed, 45 insertions(+), 41 deletions(-) diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index ed855ef6f077..b5536f8ad3e0 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -30,3 +30,20 @@ extern int inotify_handle_event(struct fsnotify_group *group, const unsigned char *file_name, u32 cookie); extern const struct fsnotify_ops inotify_fsnotify_ops; + +#ifdef CONFIG_INOTIFY_USER +static void dec_inotify_instances(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_INOTIFY_INSTANCES); +} + +static struct ucounts *inc_inotify_watches(struct ucounts *ucounts) +{ + return inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_INOTIFY_WATCHES); +} + +static void dec_inotify_watches(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_INOTIFY_WATCHES); +} +#endif diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 2cd900c2c737..1e6b3cfc2cfd 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -165,10 +165,8 @@ static void inotify_free_group_priv(struct fsnotify_group *group) /* ideally the idr is empty and we won't hit the BUG in the callback */ idr_for_each(&group->inotify_data.idr, idr_callback, group); idr_destroy(&group->inotify_data.idr); - if (group->inotify_data.user) { - atomic_dec(&group->inotify_data.user->inotify_devs); - free_uid(group->inotify_data.user); - } + if (group->inotify_data.ucounts) + dec_inotify_instances(group->inotify_data.ucounts); } static void inotify_free_event(struct fsnotify_event *fsn_event) diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index b8d08d0d0a4d..cfe48bb511c6 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -44,10 +44,8 @@ #include -/* these are configurable via /proc/sys/fs/inotify/ */ -static int inotify_max_user_instances __read_mostly; +/* configurable via /proc/sys/fs/inotify/ */ static int inotify_max_queued_events __read_mostly; -static int inotify_max_user_watches __read_mostly; static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; @@ -59,22 +57,6 @@ static int zero; struct ctl_table inotify_table[] = { { - .procname = "max_user_instances", - .data = &inotify_max_user_instances, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - }, - { - .procname = "max_user_watches", - .data = &inotify_max_user_watches, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - }, - { .procname = "max_queued_events", .data = &inotify_max_queued_events, .maxlen = sizeof(int), @@ -500,7 +482,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, /* remove this mark from the idr */ inotify_remove_from_idr(group, i_mark); - atomic_dec(&group->inotify_data.user->inotify_watches); + dec_inotify_watches(group->inotify_data.ucounts); } /* ding dong the mark is dead */ @@ -584,14 +566,17 @@ static int inotify_new_watch(struct fsnotify_group *group, tmp_i_mark->fsn_mark.mask = mask; tmp_i_mark->wd = -1; - ret = -ENOSPC; - if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) - goto out_err; - ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark); if (ret) goto out_err; + /* increment the number of watches the user has */ + if (!inc_inotify_watches(group->inotify_data.ucounts)) { + inotify_remove_from_idr(group, tmp_i_mark); + ret = -ENOSPC; + goto out_err; + } + /* we are on the idr, now get on the inode */ ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode, NULL, 0); @@ -601,8 +586,6 @@ static int inotify_new_watch(struct fsnotify_group *group, goto out_err; } - /* increment the number of watches the user has */ - atomic_inc(&group->inotify_data.user->inotify_watches); /* return the watch descriptor for this new mark */ ret = tmp_i_mark->wd; @@ -653,10 +636,11 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events) spin_lock_init(&group->inotify_data.idr_lock); idr_init(&group->inotify_data.idr); - group->inotify_data.user = get_current_user(); + group->inotify_data.ucounts = inc_ucount(current_user_ns(), + current_euid(), + UCOUNT_INOTIFY_INSTANCES); - if (atomic_inc_return(&group->inotify_data.user->inotify_devs) > - inotify_max_user_instances) { + if (!group->inotify_data.ucounts) { fsnotify_destroy_group(group); return ERR_PTR(-EMFILE); } @@ -819,8 +803,8 @@ static int __init inotify_user_setup(void) inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); inotify_max_queued_events = 16384; - inotify_max_user_instances = 128; - inotify_max_user_watches = 8192; + init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128; + init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = 8192; return 0; } diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 58205f33af02..892959de1162 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -16,6 +16,7 @@ #include #include #include +#include /* * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily @@ -169,7 +170,7 @@ struct fsnotify_group { struct inotify_group_private_data { spinlock_t idr_lock; struct idr idr; - struct user_struct *user; + struct ucounts *ucounts; } inotify_data; #endif #ifdef CONFIG_FANOTIFY diff --git a/include/linux/sched.h b/include/linux/sched.h index 62c68e513e39..35230a2c73ac 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -840,10 +840,6 @@ struct user_struct { atomic_t __count; /* reference count */ atomic_t processes; /* How many processes does this user have? */ atomic_t sigpending; /* How many pending signals does this user have? */ -#ifdef CONFIG_INOTIFY_USER - atomic_t inotify_watches; /* How many inotify watches does this user have? */ - atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ -#endif #ifdef CONFIG_FANOTIFY atomic_t fanotify_listeners; #endif diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index eb209d4523f5..cbcac7a5ec41 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -32,6 +32,10 @@ enum ucount_type { UCOUNT_NET_NAMESPACES, UCOUNT_MNT_NAMESPACES, UCOUNT_CGROUP_NAMESPACES, +#ifdef CONFIG_INOTIFY_USER + UCOUNT_INOTIFY_INSTANCES, + UCOUNT_INOTIFY_WATCHES, +#endif UCOUNT_COUNTS, }; diff --git a/kernel/ucount.c b/kernel/ucount.c index 9d20d5dd298a..a6bcea47306b 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -57,7 +57,7 @@ static struct ctl_table_root set_root = { static int zero = 0; static int int_max = INT_MAX; -#define UCOUNT_ENTRY(name) \ +#define UCOUNT_ENTRY(name) \ { \ .procname = name, \ .maxlen = sizeof(int), \ @@ -74,6 +74,10 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_net_namespaces"), UCOUNT_ENTRY("max_mnt_namespaces"), UCOUNT_ENTRY("max_cgroup_namespaces"), +#ifdef CONFIG_INOTIFY_USER_ + UCOUNT_ENTRY("max_inotify_instances"), + UCOUNT_ENTRY("max_inotify_watches"), +#endif { } }; #endif /* CONFIG_SYSCTL */ -- 2.5.0