Cgroup v1's memory controller contains a pretty complicated event
notification mechanism which is not used on cgroup v2. Let's move the
corresponding code into memcontrol-v1.c.

Please note that mem_cgroup_event_ratelimit() remains in memcontrol.c;
otherwise it would require exporting too many memcg stats details
outside of memcontrol.c.
Signed-off-by: Roman Gushchin <[email protected]>
---
include/linux/memcontrol.h | 12 -
mm/memcontrol-v1.c | 653 +++++++++++++++++++++++++++++++++++
mm/memcontrol-v1.h | 51 +++
mm/memcontrol.c | 687 +------------------------------------
4 files changed, 709 insertions(+), 694 deletions(-)
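
Not part of the patch, just for context: the legacy interface implemented
by the code being moved is driven from userspace roughly as sketched below.
The cgroup path and the 64M threshold are made up for illustration; the
"<event_fd> <control_fd> <args>" protocol is the one parsed by
memcg_write_event_control().

/*
 * Illustrative userspace consumer of the v1 threshold notifications.
 * Assumes a v1 memory hierarchy with a "test" cgroup; error handling
 * is omitted for brevity.
 */
#include <sys/eventfd.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int efd = eventfd(0, 0);
	int ufd = open("/sys/fs/cgroup/memory/test/memory.usage_in_bytes",
		       O_RDONLY);
	int cfd = open("/sys/fs/cgroup/memory/test/cgroup.event_control",
		       O_WRONLY);
	char cmd[64];
	uint64_t cnt;

	/* "<event_fd> <fd of memory.usage_in_bytes> <threshold in bytes>" */
	snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, ufd, 64ULL << 20);
	write(cfd, cmd, strlen(cmd));

	/* read() blocks until memory usage crosses the 64M threshold */
	read(efd, &cnt, sizeof(cnt));
	printf("threshold crossed %llu time(s)\n", (unsigned long long)cnt);
	return 0;
}
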
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 04f9f7b0e7c2f..394b92b620d9c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -69,18 +69,6 @@ struct mem_cgroup_id {
refcount_t ref;
};
-/*
- * Per memcg event counter is incremented at every pagein/pageout. With THP,
- * it will be incremented by the number of pages. This counter is used
- * to trigger some periodic events. This is straightforward and better
- * than using jiffies etc. to handle periodic memcg event.
- */
-enum mem_cgroup_events_target {
- MEM_CGROUP_TARGET_THRESH,
- MEM_CGROUP_TARGET_SOFTLIMIT,
- MEM_CGROUP_NTARGETS,
-};
-
struct memcg_vmstats_percpu;
struct memcg_vmstats;
struct lruvec_stats_percpu;
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index f3ea01a17eea9..c47ffb6105931 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -6,6 +6,10 @@
#include <linux/pagewalk.h>
#include <linux/backing-dev.h>
#include <linux/swap_cgroup.h>
+#include <linux/eventfd.h>
+#include <linux/poll.h>
+#include <linux/sort.h>
+#include <linux/file.h>
#include "internal.h"
#include "swap.h"
@@ -60,6 +64,54 @@ static struct move_charge_struct {
.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};
+/* for OOM */
+struct mem_cgroup_eventfd_list {
+ struct list_head list;
+ struct eventfd_ctx *eventfd;
+};
+
+/*
+ * cgroup_event represents events which userspace want to receive.
+ */
+struct mem_cgroup_event {
+ /*
+ * memcg which the event belongs to.
+ */
+ struct mem_cgroup *memcg;
+ /*
+ * eventfd to signal userspace about the event.
+ */
+ struct eventfd_ctx *eventfd;
+ /*
+ * Each of these stored in a list by the cgroup.
+ */
+ struct list_head list;
+ /*
+ * register_event() callback will be used to add new userspace
+ * waiter for changes related to this event. Use eventfd_signal()
+ * on eventfd to send notification to userspace.
+ */
+ int (*register_event)(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args);
+ /*
+ * unregister_event() callback will be called when userspace closes
+ * the eventfd or on cgroup removing. This callback must be set,
+ * if you want provide notification functionality.
+ */
+ void (*unregister_event)(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd);
+ /*
+ * All fields below needed to unregister event when
+ * userspace closes eventfd.
+ */
+ poll_table pt;
+ wait_queue_head_t *wqh;
+ wait_queue_entry_t wait;
+ struct work_struct remove;
+};
+
+extern spinlock_t memcg_oom_lock;
+
static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
struct mem_cgroup_tree_per_node *mctz,
unsigned long new_usage_in_excess)
@@ -1306,6 +1358,607 @@ static void mem_cgroup_move_task(void)
}
#endif
+static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
+{
+ struct mem_cgroup_threshold_ary *t;
+ unsigned long usage;
+ int i;
+
+ rcu_read_lock();
+ if (!swap)
+ t = rcu_dereference(memcg->thresholds.primary);
+ else
+ t = rcu_dereference(memcg->memsw_thresholds.primary);
+
+ if (!t)
+ goto unlock;
+
+ usage = mem_cgroup_usage(memcg, swap);
+
+ /*
+ * current_threshold points to threshold just below or equal to usage.
+ * If it's not true, a threshold was crossed after last
+ * call of __mem_cgroup_threshold().
+ */
+ i = t->current_threshold;
+
+ /*
+ * Iterate backward over array of thresholds starting from
+ * current_threshold and check if a threshold is crossed.
+ * If none of thresholds below usage is crossed, we read
+ * only one element of the array here.
+ */
+ for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
+ eventfd_signal(t->entries[i].eventfd);
+
+ /* i = current_threshold + 1 */
+ i++;
+
+ /*
+ * Iterate forward over array of thresholds starting from
+ * current_threshold+1 and check if a threshold is crossed.
+ * If none of thresholds above usage is crossed, we read
+ * only one element of the array here.
+ */
+ for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
+ eventfd_signal(t->entries[i].eventfd);
+
+ /* Update current_threshold */
+ t->current_threshold = i - 1;
+unlock:
+ rcu_read_unlock();
+}
+
+static void mem_cgroup_threshold(struct mem_cgroup *memcg)
+{
+ while (memcg) {
+ __mem_cgroup_threshold(memcg, false);
+ if (do_memsw_account())
+ __mem_cgroup_threshold(memcg, true);
+
+ memcg = parent_mem_cgroup(memcg);
+ }
+}
+
+/*
+ * Check events in order.
+ *
+ */
+void memcg_check_events(struct mem_cgroup *memcg, int nid)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ return;
+
+ /* threshold event is triggered in finer grain than soft limit */
+ if (unlikely(mem_cgroup_event_ratelimit(memcg,
+ MEM_CGROUP_TARGET_THRESH))) {
+ bool do_softlimit;
+
+ do_softlimit = mem_cgroup_event_ratelimit(memcg,
+ MEM_CGROUP_TARGET_SOFTLIMIT);
+ mem_cgroup_threshold(memcg);
+ if (unlikely(do_softlimit))
+ memcg1_update_tree(memcg, nid);
+ }
+}
+
+static int compare_thresholds(const void *a, const void *b)
+{
+ const struct mem_cgroup_threshold *_a = a;
+ const struct mem_cgroup_threshold *_b = b;
+
+ if (_a->threshold > _b->threshold)
+ return 1;
+
+ if (_a->threshold < _b->threshold)
+ return -1;
+
+ return 0;
+}
+
+static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup_eventfd_list *ev;
+
+ spin_lock(&memcg_oom_lock);
+
+ list_for_each_entry(ev, &memcg->oom_notify, list)
+ eventfd_signal(ev->eventfd);
+
+ spin_unlock(&memcg_oom_lock);
+ return 0;
+}
+
+void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *iter;
+
+ for_each_mem_cgroup_tree(iter, memcg)
+ mem_cgroup_oom_notify_cb(iter);
+}
+
+static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args, enum res_type type)
+{
+ struct mem_cgroup_thresholds *thresholds;
+ struct mem_cgroup_threshold_ary *new;
+ unsigned long threshold;
+ unsigned long usage;
+ int i, size, ret;
+
+ ret = page_counter_memparse(args, "-1", &threshold);
+ if (ret)
+ return ret;
+
+ mutex_lock(&memcg->thresholds_lock);
+
+ if (type == _MEM) {
+ thresholds = &memcg->thresholds;
+ usage = mem_cgroup_usage(memcg, false);
+ } else if (type == _MEMSWAP) {
+ thresholds = &memcg->memsw_thresholds;
+ usage = mem_cgroup_usage(memcg, true);
+ } else
+ BUG();
+
+ /* Check if a threshold crossed before adding a new one */
+ if (thresholds->primary)
+ __mem_cgroup_threshold(memcg, type == _MEMSWAP);
+
+ size = thresholds->primary ? thresholds->primary->size + 1 : 1;
+
+ /* Allocate memory for new array of thresholds */
+ new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
+ if (!new) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+ new->size = size;
+
+ /* Copy thresholds (if any) to new array */
+ if (thresholds->primary)
+ memcpy(new->entries, thresholds->primary->entries,
+ flex_array_size(new, entries, size - 1));
+
+ /* Add new threshold */
+ new->entries[size - 1].eventfd = eventfd;
+ new->entries[size - 1].threshold = threshold;
+
+ /* Sort thresholds. Registering of new threshold isn't time-critical */
+ sort(new->entries, size, sizeof(*new->entries),
+ compare_thresholds, NULL);
+
+ /* Find current threshold */
+ new->current_threshold = -1;
+ for (i = 0; i < size; i++) {
+ if (new->entries[i].threshold <= usage) {
+ /*
+ * new->current_threshold will not be used until
+ * rcu_assign_pointer(), so it's safe to increment
+ * it here.
+ */
+ ++new->current_threshold;
+ } else
+ break;
+ }
+
+ /* Free old spare buffer and save old primary buffer as spare */
+ kfree(thresholds->spare);
+ thresholds->spare = thresholds->primary;
+
+ rcu_assign_pointer(thresholds->primary, new);
+
+ /* To be sure that nobody uses thresholds */
+ synchronize_rcu();
+
+unlock:
+ mutex_unlock(&memcg->thresholds_lock);
+
+ return ret;
+}
+
+static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args)
+{
+ return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
+}
+
+static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args)
+{
+ return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
+}
+
+static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, enum res_type type)
+{
+ struct mem_cgroup_thresholds *thresholds;
+ struct mem_cgroup_threshold_ary *new;
+ unsigned long usage;
+ int i, j, size, entries;
+
+ mutex_lock(&memcg->thresholds_lock);
+
+ if (type == _MEM) {
+ thresholds = &memcg->thresholds;
+ usage = mem_cgroup_usage(memcg, false);
+ } else if (type == _MEMSWAP) {
+ thresholds = &memcg->memsw_thresholds;
+ usage = mem_cgroup_usage(memcg, true);
+ } else
+ BUG();
+
+ if (!thresholds->primary)
+ goto unlock;
+
+ /* Check if a threshold crossed before removing */
+ __mem_cgroup_threshold(memcg, type == _MEMSWAP);
+
+ /* Calculate new number of threshold */
+ size = entries = 0;
+ for (i = 0; i < thresholds->primary->size; i++) {
+ if (thresholds->primary->entries[i].eventfd != eventfd)
+ size++;
+ else
+ entries++;
+ }
+
+ new = thresholds->spare;
+
+ /* If no items related to eventfd have been cleared, nothing to do */
+ if (!entries)
+ goto unlock;
+
+ /* Set thresholds array to NULL if we don't have thresholds */
+ if (!size) {
+ kfree(new);
+ new = NULL;
+ goto swap_buffers;
+ }
+
+ new->size = size;
+
+ /* Copy thresholds and find current threshold */
+ new->current_threshold = -1;
+ for (i = 0, j = 0; i < thresholds->primary->size; i++) {
+ if (thresholds->primary->entries[i].eventfd == eventfd)
+ continue;
+
+ new->entries[j] = thresholds->primary->entries[i];
+ if (new->entries[j].threshold <= usage) {
+ /*
+ * new->current_threshold will not be used
+ * until rcu_assign_pointer(), so it's safe to increment
+ * it here.
+ */
+ ++new->current_threshold;
+ }
+ j++;
+ }
+
+swap_buffers:
+ /* Swap primary and spare array */
+ thresholds->spare = thresholds->primary;
+
+ rcu_assign_pointer(thresholds->primary, new);
+
+ /* To be sure that nobody uses thresholds */
+ synchronize_rcu();
+
+ /* If all events are unregistered, free the spare array */
+ if (!new) {
+ kfree(thresholds->spare);
+ thresholds->spare = NULL;
+ }
+unlock:
+ mutex_unlock(&memcg->thresholds_lock);
+}
+
+static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd)
+{
+ return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
+}
+
+static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd)
+{
+ return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
+}
+
+static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args)
+{
+ struct mem_cgroup_eventfd_list *event;
+
+ event = kmalloc(sizeof(*event), GFP_KERNEL);
+ if (!event)
+ return -ENOMEM;
+
+ spin_lock(&memcg_oom_lock);
+
+ event->eventfd = eventfd;
+ list_add(&event->list, &memcg->oom_notify);
+
+ /* already in OOM ? */
+ if (memcg->under_oom)
+ eventfd_signal(eventfd);
+ spin_unlock(&memcg_oom_lock);
+
+ return 0;
+}
+
+static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd)
+{
+ struct mem_cgroup_eventfd_list *ev, *tmp;
+
+ spin_lock(&memcg_oom_lock);
+
+ list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
+ if (ev->eventfd == eventfd) {
+ list_del(&ev->list);
+ kfree(ev);
+ }
+ }
+
+ spin_unlock(&memcg_oom_lock);
+}
+
+/*
+ * DO NOT USE IN NEW FILES.
+ *
+ * "cgroup.event_control" implementation.
+ *
+ * This is way over-engineered. It tries to support fully configurable
+ * events for each user. Such level of flexibility is completely
+ * unnecessary especially in the light of the planned unified hierarchy.
+ *
+ * Please deprecate this and replace with something simpler if at all
+ * possible.
+ */
+
+/*
+ * Unregister event and free resources.
+ *
+ * Gets called from workqueue.
+ */
+static void memcg_event_remove(struct work_struct *work)
+{
+ struct mem_cgroup_event *event =
+ container_of(work, struct mem_cgroup_event, remove);
+ struct mem_cgroup *memcg = event->memcg;
+
+ remove_wait_queue(event->wqh, &event->wait);
+
+ event->unregister_event(memcg, event->eventfd);
+
+ /* Notify userspace the event is going away. */
+ eventfd_signal(event->eventfd);
+
+ eventfd_ctx_put(event->eventfd);
+ kfree(event);
+ css_put(&memcg->css);
+}
+
+/*
+ * Gets called on EPOLLHUP on eventfd when user closes it.
+ *
+ * Called with wqh->lock held and interrupts disabled.
+ */
+static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
+ int sync, void *key)
+{
+ struct mem_cgroup_event *event =
+ container_of(wait, struct mem_cgroup_event, wait);
+ struct mem_cgroup *memcg = event->memcg;
+ __poll_t flags = key_to_poll(key);
+
+ if (flags & EPOLLHUP) {
+ /*
+ * If the event has been detached at cgroup removal, we
+ * can simply return knowing the other side will cleanup
+ * for us.
+ *
+ * We can't race against event freeing since the other
+ * side will require wqh->lock via remove_wait_queue(),
+ * which we hold.
+ */
+ spin_lock(&memcg->event_list_lock);
+ if (!list_empty(&event->list)) {
+ list_del_init(&event->list);
+ /*
+ * We are in atomic context, but cgroup_event_remove()
+ * may sleep, so we have to call it in workqueue.
+ */
+ schedule_work(&event->remove);
+ }
+ spin_unlock(&memcg->event_list_lock);
+ }
+
+ return 0;
+}
+
+static void memcg_event_ptable_queue_proc(struct file *file,
+ wait_queue_head_t *wqh, poll_table *pt)
+{
+ struct mem_cgroup_event *event =
+ container_of(pt, struct mem_cgroup_event, pt);
+
+ event->wqh = wqh;
+ add_wait_queue(wqh, &event->wait);
+}
+
+/*
+ * DO NOT USE IN NEW FILES.
+ *
+ * Parse input and register new cgroup event handler.
+ *
+ * Input must be in format '<event_fd> <control_fd> <args>'.
+ * Interpretation of args is defined by control file implementation.
+ */
+ssize_t memcg_write_event_control(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup_subsys_state *css = of_css(of);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup_event *event;
+ struct cgroup_subsys_state *cfile_css;
+ unsigned int efd, cfd;
+ struct fd efile;
+ struct fd cfile;
+ struct dentry *cdentry;
+ const char *name;
+ char *endp;
+ int ret;
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ return -EOPNOTSUPP;
+
+ buf = strstrip(buf);
+
+ efd = simple_strtoul(buf, &endp, 10);
+ if (*endp != ' ')
+ return -EINVAL;
+ buf = endp + 1;
+
+ cfd = simple_strtoul(buf, &endp, 10);
+ if ((*endp != ' ') && (*endp != '\0'))
+ return -EINVAL;
+ buf = endp + 1;
+
+ event = kzalloc(sizeof(*event), GFP_KERNEL);
+ if (!event)
+ return -ENOMEM;
+
+ event->memcg = memcg;
+ INIT_LIST_HEAD(&event->list);
+ init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
+ init_waitqueue_func_entry(&event->wait, memcg_event_wake);
+ INIT_WORK(&event->remove, memcg_event_remove);
+
+ efile = fdget(efd);
+ if (!efile.file) {
+ ret = -EBADF;
+ goto out_kfree;
+ }
+
+ event->eventfd = eventfd_ctx_fileget(efile.file);
+ if (IS_ERR(event->eventfd)) {
+ ret = PTR_ERR(event->eventfd);
+ goto out_put_efile;
+ }
+
+ cfile = fdget(cfd);
+ if (!cfile.file) {
+ ret = -EBADF;
+ goto out_put_eventfd;
+ }
+
+ /* the process need read permission on control file */
+ /* AV: shouldn't we check that it's been opened for read instead? */
+ ret = file_permission(cfile.file, MAY_READ);
+ if (ret < 0)
+ goto out_put_cfile;
+
+ /*
+ * The control file must be a regular cgroup1 file. As a regular cgroup
+ * file can't be renamed, it's safe to access its name afterwards.
+ */
+ cdentry = cfile.file->f_path.dentry;
+ if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
+ ret = -EINVAL;
+ goto out_put_cfile;
+ }
+
+ /*
+ * Determine the event callbacks and set them in @event. This used
+ * to be done via struct cftype but cgroup core no longer knows
+ * about these events. The following is crude but the whole thing
+ * is for compatibility anyway.
+ *
+ * DO NOT ADD NEW FILES.
+ */
+ name = cdentry->d_name.name;
+
+ if (!strcmp(name, "memory.usage_in_bytes")) {
+ event->register_event = mem_cgroup_usage_register_event;
+ event->unregister_event = mem_cgroup_usage_unregister_event;
+ } else if (!strcmp(name, "memory.oom_control")) {
+ event->register_event = mem_cgroup_oom_register_event;
+ event->unregister_event = mem_cgroup_oom_unregister_event;
+ } else if (!strcmp(name, "memory.pressure_level")) {
+ event->register_event = vmpressure_register_event;
+ event->unregister_event = vmpressure_unregister_event;
+ } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
+ event->register_event = memsw_cgroup_usage_register_event;
+ event->unregister_event = memsw_cgroup_usage_unregister_event;
+ } else {
+ ret = -EINVAL;
+ goto out_put_cfile;
+ }
+
+ /*
+ * Verify @cfile should belong to @css. Also, remaining events are
+ * automatically removed on cgroup destruction but the removal is
+ * asynchronous, so take an extra ref on @css.
+ */
+ cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
+ &memory_cgrp_subsys);
+ ret = -EINVAL;
+ if (IS_ERR(cfile_css))
+ goto out_put_cfile;
+ if (cfile_css != css) {
+ css_put(cfile_css);
+ goto out_put_cfile;
+ }
+
+ ret = event->register_event(memcg, event->eventfd, buf);
+ if (ret)
+ goto out_put_css;
+
+ vfs_poll(efile.file, &event->pt);
+
+ spin_lock_irq(&memcg->event_list_lock);
+ list_add(&event->list, &memcg->event_list);
+ spin_unlock_irq(&memcg->event_list_lock);
+
+ fdput(cfile);
+ fdput(efile);
+
+ return nbytes;
+
+out_put_css:
+ css_put(css);
+out_put_cfile:
+ fdput(cfile);
+out_put_eventfd:
+ eventfd_ctx_put(event->eventfd);
+out_put_efile:
+ fdput(efile);
+out_kfree:
+ kfree(event);
+
+ return ret;
+}
+
+void memcg1_css_offline(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup_event *event, *tmp;
+
+ /*
+ * Unregister events and notify userspace.
+ * Notify userspace about cgroup removing only after rmdir of cgroup
+ * directory to avoid race between userspace and kernelspace.
+ */
+ spin_lock_irq(&memcg->event_list_lock);
+ list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
+ list_del_init(&event->list);
+ schedule_work(&event->remove);
+ }
+ spin_unlock_irq(&memcg->event_list_lock);
+}
+
static int __init memcg1_init(void)
{
int node;
diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
index d377c0be9880b..524a2c76ffc97 100644
--- a/mm/memcontrol-v1.h
+++ b/mm/memcontrol-v1.h
@@ -41,4 +41,55 @@ u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val);
+/*
+ * Per memcg event counter is incremented at every pagein/pageout. With THP,
+ * it will be incremented by the number of pages. This counter is used
+ * to trigger some periodic events. This is straightforward and better
+ * than using jiffies etc. to handle periodic memcg event.
+ */
+enum mem_cgroup_events_target {
+ MEM_CGROUP_TARGET_THRESH,
+ MEM_CGROUP_TARGET_SOFTLIMIT,
+ MEM_CGROUP_NTARGETS,
+};
+
+/* Whether legacy memory+swap accounting is active */
+static bool do_memsw_account(void)
+{
+ return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
+}
+
+/*
+ * Iteration constructs for visiting all cgroups (under a tree). If
+ * loops are exited prematurely (break), mem_cgroup_iter_break() must
+ * be used for reference counting.
+ */
+#define for_each_mem_cgroup_tree(iter, root) \
+ for (iter = mem_cgroup_iter(root, NULL, NULL); \
+ iter != NULL; \
+ iter = mem_cgroup_iter(root, iter, NULL))
+
+#define for_each_mem_cgroup(iter) \
+ for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
+ iter != NULL; \
+ iter = mem_cgroup_iter(NULL, iter, NULL))
+
+void memcg1_css_offline(struct mem_cgroup *memcg);
+
+/* for encoding cft->private value on file */
+enum res_type {
+ _MEM,
+ _MEMSWAP,
+ _KMEM,
+ _TCP,
+};
+
+bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
+ enum mem_cgroup_events_target target);
+unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap);
+void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
+ssize_t memcg_write_event_control(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off);
+
+
#endif /* __MM_MEMCONTROL_V1_H */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b983cb6d1d228..6bc9009bee517 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -46,9 +46,6 @@
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
-#include <linux/eventfd.h>
-#include <linux/poll.h>
-#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/parser.h>
@@ -59,7 +56,6 @@
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
-#include <linux/file.h>
#include <linux/resume_user_mode.h>
#include <linux/psi.h>
#include <linux/seq_buf.h>
@@ -97,91 +93,13 @@ static bool cgroup_memory_nobpf __ro_after_init;
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif
-/* Whether legacy memory+swap accounting is active */
-static bool do_memsw_account(void)
-{
- return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
-}
-
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
-/* for OOM */
-struct mem_cgroup_eventfd_list {
- struct list_head list;
- struct eventfd_ctx *eventfd;
-};
-
-/*
- * cgroup_event represents events which userspace want to receive.
- */
-struct mem_cgroup_event {
- /*
- * memcg which the event belongs to.
- */
- struct mem_cgroup *memcg;
- /*
- * eventfd to signal userspace about the event.
- */
- struct eventfd_ctx *eventfd;
- /*
- * Each of these stored in a list by the cgroup.
- */
- struct list_head list;
- /*
- * register_event() callback will be used to add new userspace
- * waiter for changes related to this event. Use eventfd_signal()
- * on eventfd to send notification to userspace.
- */
- int (*register_event)(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd, const char *args);
- /*
- * unregister_event() callback will be called when userspace closes
- * the eventfd or on cgroup removing. This callback must be set,
- * if you want provide notification functionality.
- */
- void (*unregister_event)(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd);
- /*
- * All fields below needed to unregister event when
- * userspace closes eventfd.
- */
- poll_table pt;
- wait_queue_head_t *wqh;
- wait_queue_entry_t wait;
- struct work_struct remove;
-};
-
-static void mem_cgroup_threshold(struct mem_cgroup *memcg);
-static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
-
-/* for encoding cft->private value on file */
-enum res_type {
- _MEM,
- _MEMSWAP,
- _KMEM,
- _TCP,
-};
-
#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
-/*
- * Iteration constructs for visiting all cgroups (under a tree). If
- * loops are exited prematurely (break), mem_cgroup_iter_break() must
- * be used for reference counting.
- */
-#define for_each_mem_cgroup_tree(iter, root) \
- for (iter = mem_cgroup_iter(root, NULL, NULL); \
- iter != NULL; \
- iter = mem_cgroup_iter(root, iter, NULL))
-
-#define for_each_mem_cgroup(iter) \
- for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
- iter != NULL; \
- iter = mem_cgroup_iter(NULL, iter, NULL))
-
static inline bool task_is_dying(void)
{
return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
@@ -940,8 +858,8 @@ void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
}
-static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
- enum mem_cgroup_events_target target)
+bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
+ enum mem_cgroup_events_target target)
{
unsigned long val, next;
@@ -965,28 +883,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
return false;
}
-/*
- * Check events in order.
- *
- */
-void memcg_check_events(struct mem_cgroup *memcg, int nid)
-{
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- return;
-
- /* threshold event is triggered in finer grain than soft limit */
- if (unlikely(mem_cgroup_event_ratelimit(memcg,
- MEM_CGROUP_TARGET_THRESH))) {
- bool do_softlimit;
-
- do_softlimit = mem_cgroup_event_ratelimit(memcg,
- MEM_CGROUP_TARGET_SOFTLIMIT);
- mem_cgroup_threshold(memcg);
- if (unlikely(do_softlimit))
- memcg1_update_tree(memcg, nid);
- }
-}
-
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
/*
@@ -1726,7 +1622,7 @@ static struct lockdep_map memcg_oom_lock_dep_map = {
};
#endif
-static DEFINE_SPINLOCK(memcg_oom_lock);
+DEFINE_SPINLOCK(memcg_oom_lock);
/*
* Check OOM-Killer is already running under our hierarchy.
@@ -3547,7 +3443,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
return -EINVAL;
}
-static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
+unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
unsigned long val;
@@ -4048,331 +3944,6 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
return 0;
}
-static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
-{
- struct mem_cgroup_threshold_ary *t;
- unsigned long usage;
- int i;
-
- rcu_read_lock();
- if (!swap)
- t = rcu_dereference(memcg->thresholds.primary);
- else
- t = rcu_dereference(memcg->memsw_thresholds.primary);
-
- if (!t)
- goto unlock;
-
- usage = mem_cgroup_usage(memcg, swap);
-
- /*
- * current_threshold points to threshold just below or equal to usage.
- * If it's not true, a threshold was crossed after last
- * call of __mem_cgroup_threshold().
- */
- i = t->current_threshold;
-
- /*
- * Iterate backward over array of thresholds starting from
- * current_threshold and check if a threshold is crossed.
- * If none of thresholds below usage is crossed, we read
- * only one element of the array here.
- */
- for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
- eventfd_signal(t->entries[i].eventfd);
-
- /* i = current_threshold + 1 */
- i++;
-
- /*
- * Iterate forward over array of thresholds starting from
- * current_threshold+1 and check if a threshold is crossed.
- * If none of thresholds above usage is crossed, we read
- * only one element of the array here.
- */
- for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
- eventfd_signal(t->entries[i].eventfd);
-
- /* Update current_threshold */
- t->current_threshold = i - 1;
-unlock:
- rcu_read_unlock();
-}
-
-static void mem_cgroup_threshold(struct mem_cgroup *memcg)
-{
- while (memcg) {
- __mem_cgroup_threshold(memcg, false);
- if (do_memsw_account())
- __mem_cgroup_threshold(memcg, true);
-
- memcg = parent_mem_cgroup(memcg);
- }
-}
-
-static int compare_thresholds(const void *a, const void *b)
-{
- const struct mem_cgroup_threshold *_a = a;
- const struct mem_cgroup_threshold *_b = b;
-
- if (_a->threshold > _b->threshold)
- return 1;
-
- if (_a->threshold < _b->threshold)
- return -1;
-
- return 0;
-}
-
-static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
-{
- struct mem_cgroup_eventfd_list *ev;
-
- spin_lock(&memcg_oom_lock);
-
- list_for_each_entry(ev, &memcg->oom_notify, list)
- eventfd_signal(ev->eventfd);
-
- spin_unlock(&memcg_oom_lock);
- return 0;
-}
-
-static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
-{
- struct mem_cgroup *iter;
-
- for_each_mem_cgroup_tree(iter, memcg)
- mem_cgroup_oom_notify_cb(iter);
-}
-
-static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd, const char *args, enum res_type type)
-{
- struct mem_cgroup_thresholds *thresholds;
- struct mem_cgroup_threshold_ary *new;
- unsigned long threshold;
- unsigned long usage;
- int i, size, ret;
-
- ret = page_counter_memparse(args, "-1", &threshold);
- if (ret)
- return ret;
-
- mutex_lock(&memcg->thresholds_lock);
-
- if (type == _MEM) {
- thresholds = &memcg->thresholds;
- usage = mem_cgroup_usage(memcg, false);
- } else if (type == _MEMSWAP) {
- thresholds = &memcg->memsw_thresholds;
- usage = mem_cgroup_usage(memcg, true);
- } else
- BUG();
-
- /* Check if a threshold crossed before adding a new one */
- if (thresholds->primary)
- __mem_cgroup_threshold(memcg, type == _MEMSWAP);
-
- size = thresholds->primary ? thresholds->primary->size + 1 : 1;
-
- /* Allocate memory for new array of thresholds */
- new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
- if (!new) {
- ret = -ENOMEM;
- goto unlock;
- }
- new->size = size;
-
- /* Copy thresholds (if any) to new array */
- if (thresholds->primary)
- memcpy(new->entries, thresholds->primary->entries,
- flex_array_size(new, entries, size - 1));
-
- /* Add new threshold */
- new->entries[size - 1].eventfd = eventfd;
- new->entries[size - 1].threshold = threshold;
-
- /* Sort thresholds. Registering of new threshold isn't time-critical */
- sort(new->entries, size, sizeof(*new->entries),
- compare_thresholds, NULL);
-
- /* Find current threshold */
- new->current_threshold = -1;
- for (i = 0; i < size; i++) {
- if (new->entries[i].threshold <= usage) {
- /*
- * new->current_threshold will not be used until
- * rcu_assign_pointer(), so it's safe to increment
- * it here.
- */
- ++new->current_threshold;
- } else
- break;
- }
-
- /* Free old spare buffer and save old primary buffer as spare */
- kfree(thresholds->spare);
- thresholds->spare = thresholds->primary;
-
- rcu_assign_pointer(thresholds->primary, new);
-
- /* To be sure that nobody uses thresholds */
- synchronize_rcu();
-
-unlock:
- mutex_unlock(&memcg->thresholds_lock);
-
- return ret;
-}
-
-static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd, const char *args)
-{
- return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
-}
-
-static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd, const char *args)
-{
- return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
-}
-
-static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd, enum res_type type)
-{
- struct mem_cgroup_thresholds *thresholds;
- struct mem_cgroup_threshold_ary *new;
- unsigned long usage;
- int i, j, size, entries;
-
- mutex_lock(&memcg->thresholds_lock);
-
- if (type == _MEM) {
- thresholds = &memcg->thresholds;
- usage = mem_cgroup_usage(memcg, false);
- } else if (type == _MEMSWAP) {
- thresholds = &memcg->memsw_thresholds;
- usage = mem_cgroup_usage(memcg, true);
- } else
- BUG();
-
- if (!thresholds->primary)
- goto unlock;
-
- /* Check if a threshold crossed before removing */
- __mem_cgroup_threshold(memcg, type == _MEMSWAP);
-
- /* Calculate new number of threshold */
- size = entries = 0;
- for (i = 0; i < thresholds->primary->size; i++) {
- if (thresholds->primary->entries[i].eventfd != eventfd)
- size++;
- else
- entries++;
- }
-
- new = thresholds->spare;
-
- /* If no items related to eventfd have been cleared, nothing to do */
- if (!entries)
- goto unlock;
-
- /* Set thresholds array to NULL if we don't have thresholds */
- if (!size) {
- kfree(new);
- new = NULL;
- goto swap_buffers;
- }
-
- new->size = size;
-
- /* Copy thresholds and find current threshold */
- new->current_threshold = -1;
- for (i = 0, j = 0; i < thresholds->primary->size; i++) {
- if (thresholds->primary->entries[i].eventfd == eventfd)
- continue;
-
- new->entries[j] = thresholds->primary->entries[i];
- if (new->entries[j].threshold <= usage) {
- /*
- * new->current_threshold will not be used
- * until rcu_assign_pointer(), so it's safe to increment
- * it here.
- */
- ++new->current_threshold;
- }
- j++;
- }
-
-swap_buffers:
- /* Swap primary and spare array */
- thresholds->spare = thresholds->primary;
-
- rcu_assign_pointer(thresholds->primary, new);
-
- /* To be sure that nobody uses thresholds */
- synchronize_rcu();
-
- /* If all events are unregistered, free the spare array */
- if (!new) {
- kfree(thresholds->spare);
- thresholds->spare = NULL;
- }
-unlock:
- mutex_unlock(&memcg->thresholds_lock);
-}
-
-static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd)
-{
- return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
-}
-
-static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd)
-{
- return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
-}
-
-static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd, const char *args)
-{
- struct mem_cgroup_eventfd_list *event;
-
- event = kmalloc(sizeof(*event), GFP_KERNEL);
- if (!event)
- return -ENOMEM;
-
- spin_lock(&memcg_oom_lock);
-
- event->eventfd = eventfd;
- list_add(&event->list, &memcg->oom_notify);
-
- /* already in OOM ? */
- if (memcg->under_oom)
- eventfd_signal(eventfd);
- spin_unlock(&memcg_oom_lock);
-
- return 0;
-}
-
-static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd)
-{
- struct mem_cgroup_eventfd_list *ev, *tmp;
-
- spin_lock(&memcg_oom_lock);
-
- list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
- if (ev->eventfd == eventfd) {
- list_del(&ev->list);
- kfree(ev);
- }
- }
-
- spin_unlock(&memcg_oom_lock);
-}
-
static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
@@ -4613,243 +4184,6 @@ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
#endif /* CONFIG_CGROUP_WRITEBACK */
-/*
- * DO NOT USE IN NEW FILES.
- *
- * "cgroup.event_control" implementation.
- *
- * This is way over-engineered. It tries to support fully configurable
- * events for each user. Such level of flexibility is completely
- * unnecessary especially in the light of the planned unified hierarchy.
- *
- * Please deprecate this and replace with something simpler if at all
- * possible.
- */
-
-/*
- * Unregister event and free resources.
- *
- * Gets called from workqueue.
- */
-static void memcg_event_remove(struct work_struct *work)
-{
- struct mem_cgroup_event *event =
- container_of(work, struct mem_cgroup_event, remove);
- struct mem_cgroup *memcg = event->memcg;
-
- remove_wait_queue(event->wqh, &event->wait);
-
- event->unregister_event(memcg, event->eventfd);
-
- /* Notify userspace the event is going away. */
- eventfd_signal(event->eventfd);
-
- eventfd_ctx_put(event->eventfd);
- kfree(event);
- css_put(&memcg->css);
-}
-
-/*
- * Gets called on EPOLLHUP on eventfd when user closes it.
- *
- * Called with wqh->lock held and interrupts disabled.
- */
-static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
- int sync, void *key)
-{
- struct mem_cgroup_event *event =
- container_of(wait, struct mem_cgroup_event, wait);
- struct mem_cgroup *memcg = event->memcg;
- __poll_t flags = key_to_poll(key);
-
- if (flags & EPOLLHUP) {
- /*
- * If the event has been detached at cgroup removal, we
- * can simply return knowing the other side will cleanup
- * for us.
- *
- * We can't race against event freeing since the other
- * side will require wqh->lock via remove_wait_queue(),
- * which we hold.
- */
- spin_lock(&memcg->event_list_lock);
- if (!list_empty(&event->list)) {
- list_del_init(&event->list);
- /*
- * We are in atomic context, but cgroup_event_remove()
- * may sleep, so we have to call it in workqueue.
- */
- schedule_work(&event->remove);
- }
- spin_unlock(&memcg->event_list_lock);
- }
-
- return 0;
-}
-
-static void memcg_event_ptable_queue_proc(struct file *file,
- wait_queue_head_t *wqh, poll_table *pt)
-{
- struct mem_cgroup_event *event =
- container_of(pt, struct mem_cgroup_event, pt);
-
- event->wqh = wqh;
- add_wait_queue(wqh, &event->wait);
-}
-
-/*
- * DO NOT USE IN NEW FILES.
- *
- * Parse input and register new cgroup event handler.
- *
- * Input must be in format '<event_fd> <control_fd> <args>'.
- * Interpretation of args is defined by control file implementation.
- */
-static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- struct cgroup_subsys_state *css = of_css(of);
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- struct mem_cgroup_event *event;
- struct cgroup_subsys_state *cfile_css;
- unsigned int efd, cfd;
- struct fd efile;
- struct fd cfile;
- struct dentry *cdentry;
- const char *name;
- char *endp;
- int ret;
-
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- return -EOPNOTSUPP;
-
- buf = strstrip(buf);
-
- efd = simple_strtoul(buf, &endp, 10);
- if (*endp != ' ')
- return -EINVAL;
- buf = endp + 1;
-
- cfd = simple_strtoul(buf, &endp, 10);
- if ((*endp != ' ') && (*endp != '\0'))
- return -EINVAL;
- buf = endp + 1;
-
- event = kzalloc(sizeof(*event), GFP_KERNEL);
- if (!event)
- return -ENOMEM;
-
- event->memcg = memcg;
- INIT_LIST_HEAD(&event->list);
- init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
- init_waitqueue_func_entry(&event->wait, memcg_event_wake);
- INIT_WORK(&event->remove, memcg_event_remove);
-
- efile = fdget(efd);
- if (!efile.file) {
- ret = -EBADF;
- goto out_kfree;
- }
-
- event->eventfd = eventfd_ctx_fileget(efile.file);
- if (IS_ERR(event->eventfd)) {
- ret = PTR_ERR(event->eventfd);
- goto out_put_efile;
- }
-
- cfile = fdget(cfd);
- if (!cfile.file) {
- ret = -EBADF;
- goto out_put_eventfd;
- }
-
- /* the process need read permission on control file */
- /* AV: shouldn't we check that it's been opened for read instead? */
- ret = file_permission(cfile.file, MAY_READ);
- if (ret < 0)
- goto out_put_cfile;
-
- /*
- * The control file must be a regular cgroup1 file. As a regular cgroup
- * file can't be renamed, it's safe to access its name afterwards.
- */
- cdentry = cfile.file->f_path.dentry;
- if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
- ret = -EINVAL;
- goto out_put_cfile;
- }
-
- /*
- * Determine the event callbacks and set them in @event. This used
- * to be done via struct cftype but cgroup core no longer knows
- * about these events. The following is crude but the whole thing
- * is for compatibility anyway.
- *
- * DO NOT ADD NEW FILES.
- */
- name = cdentry->d_name.name;
-
- if (!strcmp(name, "memory.usage_in_bytes")) {
- event->register_event = mem_cgroup_usage_register_event;
- event->unregister_event = mem_cgroup_usage_unregister_event;
- } else if (!strcmp(name, "memory.oom_control")) {
- event->register_event = mem_cgroup_oom_register_event;
- event->unregister_event = mem_cgroup_oom_unregister_event;
- } else if (!strcmp(name, "memory.pressure_level")) {
- event->register_event = vmpressure_register_event;
- event->unregister_event = vmpressure_unregister_event;
- } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
- event->register_event = memsw_cgroup_usage_register_event;
- event->unregister_event = memsw_cgroup_usage_unregister_event;
- } else {
- ret = -EINVAL;
- goto out_put_cfile;
- }
-
- /*
- * Verify @cfile should belong to @css. Also, remaining events are
- * automatically removed on cgroup destruction but the removal is
- * asynchronous, so take an extra ref on @css.
- */
- cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
- &memory_cgrp_subsys);
- ret = -EINVAL;
- if (IS_ERR(cfile_css))
- goto out_put_cfile;
- if (cfile_css != css) {
- css_put(cfile_css);
- goto out_put_cfile;
- }
-
- ret = event->register_event(memcg, event->eventfd, buf);
- if (ret)
- goto out_put_css;
-
- vfs_poll(efile.file, &event->pt);
-
- spin_lock_irq(&memcg->event_list_lock);
- list_add(&event->list, &memcg->event_list);
- spin_unlock_irq(&memcg->event_list_lock);
-
- fdput(cfile);
- fdput(efile);
-
- return nbytes;
-
-out_put_css:
- css_put(css);
-out_put_cfile:
- fdput(cfile);
-out_put_eventfd:
- eventfd_ctx_put(event->eventfd);
-out_put_efile:
- fdput(efile);
-out_kfree:
- kfree(event);
-
- return ret;
-}
-
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_SLUB_DEBUG)
static int mem_cgroup_slab_show(struct seq_file *m, void *p)
{
@@ -5316,19 +4650,8 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- struct mem_cgroup_event *event, *tmp;
- /*
- * Unregister events and notify userspace.
- * Notify userspace about cgroup removing only after rmdir of cgroup
- * directory to avoid race between userspace and kernelspace.
- */
- spin_lock_irq(&memcg->event_list_lock);
- list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
- list_del_init(&event->list);
- schedule_work(&event->remove);
- }
- spin_unlock_irq(&memcg->event_list_lock);
+ memcg1_css_offline(memcg);
page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);
--
2.45.1
On Tue, May 28, 2024 at 01:20:58PM GMT, Roman Gushchin wrote:
> Cgroup v1's memory controller contains a pretty complicated event
> notification mechanism which is not used on cgroup v2. Let's move the
> corresponding code into memcontrol-v1.c.
>
> Please note that mem_cgroup_event_ratelimit() remains in memcontrol.c;
> otherwise it would require exporting too many memcg stats details
> outside of memcontrol.c.
In the follow-up work, we should decouple the v1-only fields from the
v2 fields, and then we should be able to move mem_cgroup_event_ratelimit()
to memcontrol-v1.c as well.
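
As a rough sketch of what that decoupling could look like (the struct,
field and function names below are made up for illustration, not a
concrete proposal; the *_EVENTS_TARGET defines would move along with it):

/* Hypothetical v1-only per-cpu event bookkeeping */
struct memcg1_events_percpu {
	unsigned long nr_page_events;
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

/* struct mem_cgroup would carry a
 * "struct memcg1_events_percpu __percpu *events_percpu" pointer. */
static bool memcg1_event_ratelimit(struct mem_cgroup *memcg,
				   enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->events_percpu->nr_page_events);
	next = __this_cpu_read(memcg->events_percpu->targets[target]);
	if ((long)(next - val) < 0) {
		next = val + (target == MEM_CGROUP_TARGET_THRESH ?
			      THRESHOLDS_EVENTS_TARGET :
			      SOFTLIMIT_EVENTS_TARGET);
		__this_cpu_write(memcg->events_percpu->targets[target], next);
		return true;
	}
	return false;
}

With the event counters out of memcg_vmstats_percpu, the ratelimit helper
no longer needs access to any stats internals.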
On Tue, May 28, 2024 at 01:20:58PM GMT, Roman Gushchin wrote:
> Cgroup v1's memory controller contains a pretty complicated event
> notification mechanism which is not used on cgroup v2. Let's move the
> corresponding code into memcontrol-v1.c.
>
> Please note that mem_cgroup_event_ratelimit() remains in memcontrol.c;
> otherwise it would require exporting too many memcg stats details
> outside of memcontrol.c.
>
> Signed-off-by: Roman Gushchin <[email protected]>
Acked-by: Shakeel Butt <[email protected]>
On Fri, May 31, 2024 at 11:33:46PM -0700, Shakeel Butt wrote:
> On Tue, May 28, 2024 at 01:20:58PM GMT, Roman Gushchin wrote:
> > Cgroup v1's memory controller contains a pretty complicated event
> > notification mechanism which is not used on cgroup v2. Let's move the
> > corresponding code into memcontrol-v1.c.
> >
> > Please note that mem_cgroup_event_ratelimit() remains in memcontrol.c;
> > otherwise it would require exporting too many memcg stats details
> > outside of memcontrol.c.
>
> In the follow-up work, we should decouple the v1-only fields from the
> v2 fields, and then we should be able to move mem_cgroup_event_ratelimit()
> to memcontrol-v1.c as well.
Right, agreed, this is my plan. The RFC version had a lot of #ifdefs
in mm/memcontrol.c; now I plan to do it in a nicer way by moving the
cgroup v1-specific code into separate functions.
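
As a rough illustration (function and config names are placeholders,
not final):

/* mm/memcontrol-v1.h */
#ifdef CONFIG_MEMCG_V1
void memcg1_check_events(struct mem_cgroup *memcg, int nid);
#else
static inline void memcg1_check_events(struct mem_cgroup *memcg, int nid)
{
}
#endif

Then memcontrol.c can call memcg1_check_events() unconditionally, and the
v1-only logic compiles away on builds without the legacy controller.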
Thanks!