Received-SPF: pass (google.com: domain of linux-kernel-owner@vger.kernel.org designates 23.128.96.18 as permitted sender) client-ip=23.128.96.18;
IronPort-SDR: w/7CJ+4awTzPK1F6xQ1zGRil1cY83RAKfs3dFeYOkCeVbu6FCHCK4T1WXDuJCPqoly/R5ZJ/9l
 X7xkRmC/eQ8Q==
IronPort-SDR: uRJWNIEvIVenX3zhCfRavpX69HZihlo15E+HdqNsuhc8s2r9sEnAo//jTyEmVjEzmIsM3NyzhF
 6er8odyQGP3w==
From:   Tim Chen <tim.c.chen@linux.intel.com>
To:     Michal Hocko <mhocko@suse.cz>
Cc:     Tim Chen <tim.c.chen@linux.intel.com>,
        Johannes Weiner <hannes@cmpxchg.org>,
        Andrew Morton <akpm@linux-foundation.org>,
        Dave Hansen <dave.hansen@intel.com>,
        Ying Huang <ying.huang@intel.com>,
        Dan Williams <dan.j.williams@intel.com>,
        David Rientjes <rientjes@google.com>,
        Shakeel Butt <shakeelb@google.com>, linux-mm@kvack.org,
        cgroups@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [RFC PATCH v1 09/11] mm: Use kswapd to demote pages when toptier memory is tight
Date:   Mon,  5 Apr 2021 10:08:33 -0700
Message-Id: <83c06bf70e38360358c84daab399f18f57e7eba4.1617642417.git.tim.c.chen@linux.intel.com>
In-Reply-To: <cover.1617642417.git.tim.c.chen@linux.intel.com>
References: <cover.1617642417.git.tim.c.chen@linux.intel.com>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
Precedence: bulk

Demote pages from memory cgroup that has excess
toptier memory usage when top tier memory is tight.

When free top tier memory falls below the fraction
"toptier_scale_factor/10000" of overall toptier memory in a node, kswapd
reclaims top tier memory from those mem cgroups that exceeded their
toptier memory soft limit by demoting the top tier pages to a
lower memory tier.

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---
 Documentation/admin-guide/sysctl/vm.rst | 12 +++++
 include/linux/mmzone.h                  |  2 +
 mm/page_alloc.c                         | 14 +++++
 mm/vmscan.c                             | 69 ++++++++++++++++++++++++-
 4 files changed, 96 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 9de3847c3469..6b49e2e90953 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -74,6 +74,7 @@ Currently, these files are in /proc/sys/vm:
 - vfs_cache_pressure
 - watermark_boost_factor
 - watermark_scale_factor
+- toptier_scale_factor
 - zone_reclaim_mode
 
 
@@ -962,6 +963,17 @@ too small for the allocation bursts occurring in the system. This knob
 can then be used to tune kswapd aggressiveness accordingly.
 
 
+toptier_scale_factor
+====================
+
+This factor controls when kswapd wakes up to demote pages of those
+cgroups that have exceeded their memory soft limit.
+
+The unit is in fractions of 10,000. The default value of 2000 means
+that if less than 20% of the top tier memory in the node/system is
+free, we will start to demote pages of those memory cgroups
+that have exceeded their soft memory limit.
+
 zone_reclaim_mode
 =================
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index bbe649c4fdee..4ee0073d255f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -332,12 +332,14 @@ enum zone_watermarks {
 	WMARK_MIN,
 	WMARK_LOW,
 	WMARK_HIGH,
+	WMARK_TOPTIER,
 	NR_WMARK
 };
 
 #define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost)
 #define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost)
 #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
+#define toptier_wmark_pages(z) (z->_watermark[WMARK_TOPTIER] + z->watermark_boost)
 #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
 
 struct per_cpu_pages {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 471a2c342c4f..20f3caee60f3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7964,6 +7964,20 @@ static void __setup_per_zone_wmarks(void)
 		zone->_watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
 		zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
 
+		tmp = mult_frac(zone_managed_pages(zone),
+				toptier_scale_factor, 10000);
+		/*
+		 * Clamp toptier watermark between twice high watermark
+		 * and max managed pages.
+		 */
+		if (tmp < 2 * zone->_watermark[WMARK_HIGH])
+			tmp = 2 * zone->_watermark[WMARK_HIGH];
+		if (tmp > zone_managed_pages(zone))
+			tmp = zone_managed_pages(zone);
+		zone->_watermark[WMARK_TOPTIER] = tmp;
+
+		zone->watermark_boost = 0;
+
 		spin_unlock_irqrestore(&zone->lock, flags);
 	}
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 11bb0c6fa524..270880c8baef 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -185,6 +185,7 @@ static void set_task_reclaim_state(struct task_struct *task,
 
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
+int toptier_scale_factor = 2000;
 
 #ifdef CONFIG_MEMCG
 /*
@@ -3624,6 +3625,34 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
 	return false;
 }
 
+/*
+ * Check whether the top tier memory of @pgdat needs no demotion.
+ *
+ * Returns true if the node is not in N_TOPTIER, has no demotion target,
+ * @order is non-zero, @classzone_idx is below ZONE_NORMAL, or ZONE_NORMAL
+ * is not managed.  Otherwise returns true only when the free pages of
+ * ZONE_NORMAL are at or above the toptier watermark.
+ */
+static bool pgdat_toptier_balanced(pg_data_t *pgdat, int order, int classzone_idx)
+{
+	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
+	unsigned long mark;
+
+	if (!node_state(pgdat->node_id, N_TOPTIER) ||
+	    next_demotion_node(pgdat->node_id) == -1 ||
+	    order > 0 || classzone_idx < ZONE_NORMAL)
+		return true;
+
+	if (!managed_zone(zone))
+		return true;
+
+	/* The watermark is capped at the zone's managed pages. */
+	mark = min(toptier_wmark_pages(zone),
+		   zone_managed_pages(zone));
+
+	return zone_page_state(zone, NR_FREE_PAGES) >= mark;
+}
+
 /* Clear pgdat state for congested, dirty or under writeback. */
 static void clear_pgdat_congested(pg_data_t *pgdat)
 {
@@ -4049,6 +4078,39 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
 	finish_wait(&pgdat->kswapd_wait, &wait);
 }
 
+/*
+ * Let kswapd trim top tier memory usage on @pgdat: if it is an N_TOPTIER
+ * node that fails pgdat_toptier_balanced(), run
+ * mem_cgroup_soft_limit_reclaim() against N_TOPTIER.  Returns true when
+ * prepare_kswapd_sleep() succeeds and the thread is not being stopped.
+ */
+static bool toptier_soft_reclaim(pg_data_t *pgdat,
+			      unsigned int reclaim_order,
+			      unsigned int classzone_idx)
+{
+	unsigned long nr_soft_scanned = 0;
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.order = reclaim_order,
+		.may_unmap = 1,
+	};
+
+	if (!node_state(pgdat->node_id, N_TOPTIER) || kthread_should_stop())
+		return false;
+
+	set_task_reclaim_state(current, &sc.reclaim_state);
+
+	/* The number of reclaimed pages is not used here. */
+	if (!pgdat_toptier_balanced(pgdat, 0, classzone_idx))
+		mem_cgroup_soft_limit_reclaim(pgdat, 0, GFP_KERNEL,
+					      &nr_soft_scanned, N_TOPTIER);
+
+	set_task_reclaim_state(current, NULL);
+
+	return prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx) &&
+	       !kthread_should_stop();
+}
+
 /*
  * The background pageout daemon, started as a kernel thread
  * from the init process.
@@ -4108,6 +4170,10 @@ static int kswapd(void *p)
 		WRITE_ONCE(pgdat->kswapd_order, 0);
 		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
 
+		if (toptier_soft_reclaim(pgdat, 0,
+					highest_zoneidx))
+			goto kswapd_try_sleep;
+
 		ret = try_to_freeze();
 		if (kthread_should_stop())
 			break;
@@ -4173,7 +4239,8 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
 
 	/* Hopeless node, leave it to direct reclaim if possible */
 	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
-	    (pgdat_balanced(pgdat, order, highest_zoneidx) &&
+	    (pgdat_toptier_balanced(pgdat, 0, highest_zoneidx) &&
+	     pgdat_balanced(pgdat, order, highest_zoneidx) &&
 	     !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
 		/*
 		 * There may be plenty of free memory available, but it's too
-- 
2.20.1