From: Suren Baghdasaryan <surenb@google.com>
To: akpm@linux-foundation.org
Cc: hannes@cmpxchg.org, mhocko@suse.com, pmladek@suse.com, peterz@infradead.org,
    guro@fb.com, shakeelb@google.com, minchan@kernel.org, timmurray@google.com,
    linux-mm@kvack.org, linux-kernel@vger.kernel.org, kernel-team@android.com,
    surenb@google.com
Subject: [RFC 1/1] mm: page_alloc: replace mm_percpu_wq with kthreads in drain_all_pages
Date: Thu, 24 Feb 2022 17:28:19 -0800
Message-Id: <20220225012819.1807147-1-surenb@google.com>
Sending this as an RFC to confirm whether this is the right direction and to
clarify whether other tasks currently executed on mm_percpu_wq should also be
moved to kthreads. The patch has been stable in testing, but I want to collect
more performance data before submitting a non-RFC version.

Currently drain_all_pages uses mm_percpu_wq to drain pages from the pcp lists
during direct reclaim. Tasks on a workqueue can be delayed by other tasks in
workqueues sharing the same per-cpu worker pool. This results in sizable
delays in drain_all_pages when CPUs are highly contended.

Memory management operations designed to relieve memory pressure should not
be allowed to be blocked by other tasks, especially if the task in direct
reclaim has a higher priority than the blocking tasks. Replace the usage of
mm_percpu_wq with per-cpu low-priority FIFO kthreads to execute the draining
tasks.

Suggested-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 mm/page_alloc.c | 84 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 70 insertions(+), 14 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3589febc6d31..c9ab2cf4b05b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -153,7 +153,8 @@ EXPORT_PER_CPU_SYMBOL(_numa_mem_);
 /* work_structs for global per-cpu drains */
 struct pcpu_drain {
 	struct zone *zone;
-	struct work_struct work;
+	struct kthread_work work;
+	struct kthread_worker *worker;
 };
 static DEFINE_MUTEX(pcpu_drain_mutex);
 static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
@@ -2209,6 +2210,58 @@ _deferred_grow_zone(struct zone *zone, unsigned int order)
 
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
+static void drain_local_pages_func(struct kthread_work *work);
+
+static int alloc_drain_worker(unsigned int cpu)
+{
+	struct pcpu_drain *drain;
+
+	mutex_lock(&pcpu_drain_mutex);
+	drain = per_cpu_ptr(&pcpu_drain, cpu);
+	drain->worker = kthread_create_worker_on_cpu(cpu, 0, "pg_drain/%u", cpu);
+	if (IS_ERR(drain->worker)) {
+		drain->worker = NULL;
+		pr_err("Failed to create pg_drain/%u\n", cpu);
+		goto out;
+	}
+	/* Ensure the thread is not blocked by normal priority tasks */
+	sched_set_fifo_low(drain->worker->task);
+	kthread_init_work(&drain->work, drain_local_pages_func);
+out:
+	mutex_unlock(&pcpu_drain_mutex);
+
+	return 0;
+}
+
+static int free_drain_worker(unsigned int cpu)
+{
+	struct pcpu_drain *drain;
+
+	mutex_lock(&pcpu_drain_mutex);
+	drain = per_cpu_ptr(&pcpu_drain, cpu);
+	kthread_cancel_work_sync(&drain->work);
+	kthread_destroy_worker(drain->worker);
+	drain->worker = NULL;
+	mutex_unlock(&pcpu_drain_mutex);
+
+	return 0;
+}
+
+static void __init init_drain_workers(void)
+{
+	unsigned int cpu;
+
+	for_each_online_cpu(cpu)
+		alloc_drain_worker(cpu);
+
+	if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+				      "page_alloc/drain:online",
+				      alloc_drain_worker,
+				      free_drain_worker)) {
+		pr_err("page_alloc_drain: Failed to allocate a hotplug state\n");
+	}
+}
+
 void __init page_alloc_init_late(void)
 {
 	struct zone *zone;
@@ -2245,6 +2298,8 @@ void __init page_alloc_init_late(void)
 
 	for_each_populated_zone(zone)
 		set_zone_contiguous(zone);
+
+	init_drain_workers();
 }
 
 #ifdef CONFIG_CMA
@@ -3144,7 +3199,7 @@ void drain_local_pages(struct zone *zone)
 		drain_pages(cpu);
 }
 
-static void drain_local_pages_wq(struct work_struct *work)
+static void drain_local_pages_func(struct kthread_work *work)
 {
 	struct pcpu_drain *drain;
 
@@ -3175,6 +3230,7 @@ static void drain_local_pages_wq(struct work_struct *work)
 static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
 {
 	int cpu;
+	struct pcpu_drain *drain;
 
 	/*
 	 * Allocate in the BSS so we won't require allocation in
@@ -3182,13 +3238,6 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
 	 */
 	static cpumask_t cpus_with_pcps;
 
-	/*
-	 * Make sure nobody triggers this path before mm_percpu_wq is fully
-	 * initialized.
-	 */
-	if (WARN_ON_ONCE(!mm_percpu_wq))
-		return;
-
 	/*
 	 * Do not drain if one is already in progress unless it's specific to
 	 * a zone. Such callers are primarily CMA and memory hotplug and need
@@ -3238,14 +3287,21 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
 	}
 
 	for_each_cpu(cpu, &cpus_with_pcps) {
-		struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
+		drain = per_cpu_ptr(&pcpu_drain, cpu);
 
 		drain->zone = zone;
-		INIT_WORK(&drain->work, drain_local_pages_wq);
-		queue_work_on(cpu, mm_percpu_wq, &drain->work);
+		if (likely(drain->worker))
+			kthread_queue_work(drain->worker, &drain->work);
+	}
+	/* Wait for kthreads to finish or drain itself */
+	for_each_cpu(cpu, &cpus_with_pcps) {
+		drain = per_cpu_ptr(&pcpu_drain, cpu);
+
+		if (likely(drain->worker))
+			kthread_flush_work(&drain->work);
+		else
+			drain_local_pages_func(&drain->work);
 	}
-	for_each_cpu(cpu, &cpus_with_pcps)
-		flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
 
 	mutex_unlock(&pcpu_drain_mutex);
 }
-- 
2.35.1.574.g5d30c73bfb-goog
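
For reference, the core kthread_worker pattern the patch relies on, shown in
isolation as a minimal sketch in a trivial module context. The demo_* names
are hypothetical and not part of the patch; only the kernel APIs
(kthread_create_worker_on_cpu, sched_set_fifo_low, kthread_init_work,
kthread_queue_work, kthread_flush_work, kthread_destroy_worker) come from the
patch above:

/*
 * Minimal illustrative sketch of the kthread_worker pattern.
 * All demo_* identifiers are invented for this example.
 */
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/sched.h>

static struct kthread_worker *demo_worker;
static struct kthread_work demo_work;

static void demo_func(struct kthread_work *work)
{
	pr_info("running on a dedicated kthread, not a shared worker pool\n");
}

static int __init demo_init(void)
{
	/* Create a dedicated worker thread bound to CPU 0. */
	demo_worker = kthread_create_worker_on_cpu(0, 0, "demo/%u", 0);
	if (IS_ERR(demo_worker))
		return PTR_ERR(demo_worker);

	/* Low SCHED_FIFO priority: normal-priority tasks cannot delay it. */
	sched_set_fifo_low(demo_worker->task);

	kthread_init_work(&demo_work, demo_func);
	kthread_queue_work(demo_worker, &demo_work);
	kthread_flush_work(&demo_work);	/* wait for the work to complete */
	return 0;
}

static void __exit demo_exit(void)
{
	kthread_destroy_worker(demo_worker);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

Unlike queue_work_on()/flush_work(), the work item here runs on a thread the
caller owns outright, so its scheduling policy can be raised without affecting
unrelated work items that share a pooled workqueue worker.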