Received-SPF: pass (google.com: domain of linux-kernel-owner@vger.kernel.org designates 23.128.96.18 as permitted sender) client-ip=23.128.96.18;
From:   Uladzislau Rezki <urezki@gmail.com>
Date:   Mon, 1 Feb 2021 20:57:53 +0100
To:     qiang.zhang@windriver.com
Cc:     urezki@gmail.com, paulmck@kernel.org, joel@joelfernandes.org,
        rcu@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: Re: [PATCH v3] kvfree_rcu: Release page cache under memory pressure
Message-ID: <20210201195753.GA2026@pc638.lan>
References: <20210130131851.23285-1-qiang.zhang@windriver.com>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <20210130131851.23285-1-qiang.zhang@windriver.com>
User-Agent: Mutt/1.10.1 (2018-07-13)
Precedence: bulk

Hello, Zqiang.

> From: Zqiang <qiang.zhang@windriver.com>
> 
> Add free per-cpu existing krcp's page cache operation, when
> the system is under memory pressure.
> 
> Signed-off-by: Zqiang <qiang.zhang@windriver.com>
> ---
>  kernel/rcu/tree.c | 26 ++++++++++++++++++++++++++
>  1 file changed, 26 insertions(+)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index c1ae1e52f638..644b0f3c7b9f 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -3571,17 +3571,41 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
>  }
>  EXPORT_SYMBOL_GPL(kvfree_call_rcu);
>  
> +static int free_krc_page_cache(struct kfree_rcu_cpu *krcp)
> +{
> +	unsigned long flags;
> +	struct llist_node *page_list, *pos, *n;
> +	int freed = 0;
> +
> +	raw_spin_lock_irqsave(&krcp->lock, flags);
> +	page_list = llist_del_all(&krcp->bkvcache);
> +	krcp->nr_bkv_objs = 0;
> +	raw_spin_unlock_irqrestore(&krcp->lock, flags);
> +
> +	llist_for_each_safe(pos, n, page_list) {
> +		free_page((unsigned long)pos);
> +		freed++;
> +	}
> +
> +	return freed;
> +}
> +
>  static unsigned long
>  kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
>  {
>  	int cpu;
>  	unsigned long count = 0;
> +	unsigned long flags;
>  
>  	/* Snapshot count of all CPUs */
>  	for_each_possible_cpu(cpu) {
>  		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
>  
>  		count += READ_ONCE(krcp->count);
> +
> +		raw_spin_lock_irqsave(&krcp->lock, flags);
> +		count += krcp->nr_bkv_objs;
> +		raw_spin_unlock_irqrestore(&krcp->lock, flags);
>  	}
>  
>  	return count;
> @@ -3598,6 +3622,8 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
>  		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
>  
>  		count = krcp->count;
> +		count += free_krc_page_cache(krcp);
> +
>  		raw_spin_lock_irqsave(&krcp->lock, flags);
>  		if (krcp->monitor_todo)
>  			kfree_rcu_drain_unlock(krcp, flags);
> -- 
> 2.17.1
> 
Thank you for your patch!

I spent some time to see how the patch behaves under low memory condition.
To simulate it, i used "rcuscale" tool with below parameters:

../rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 10 --kconfig CONFIG_NR_CPUS=64 \
--bootargs "rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 rcuscale.holdoff=20 rcuscale.kfree_loops=10000 \
torture.disable_onoff_at_boot" --trust-make

64 CPUs + 512 MB of memory. In general, my test system was running on edge
hitting an out of memory sometimes, but could be considered as stable in
regards to a test completion and taken time, so both were pretty solid.

You can find a comparison on a plot, that can be downloaded following
a link: wget ftp://vps418301.ovh.net/incoming/release_page_cache_under_low_memory.png

In short, i see that a patched version can lead to longer test completion,
whereas the default variant is stable on almost all runs. After some analysis
and further digging i came to conclusion that a shrinker free_krc_page_cache()
concurs with run_page_cache_worker(krcp) running from kvfree_rcu() context.

i.e. During the test a page shrinker is pretty active, because of low memory
condition. Our callback drains it whereas kvfree_rcu() part refill it right
away making kind of vicious circle.

So, a run_page_cache_worker() should be backoff for some time when a system
runs into a low memory condition or high pressure:

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 7077d73fcb53..446723b9646b 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3163,7 +3163,7 @@ struct kfree_rcu_cpu {
 	bool initialized;
 	int count;
 
-	struct work_struct page_cache_work;
+	struct delayed_work page_cache_work;
 	atomic_t work_in_progress;
 	struct hrtimer hrtimer;
 
@@ -3419,7 +3419,7 @@ schedule_page_work_fn(struct hrtimer *t)
 	struct kfree_rcu_cpu *krcp =
 		container_of(t, struct kfree_rcu_cpu, hrtimer);
 
-	queue_work(system_highpri_wq, &krcp->page_cache_work);
+	queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
 	return HRTIMER_NORESTART;
 }
 
@@ -3428,7 +3428,7 @@ static void fill_page_cache_func(struct work_struct *work)
 	struct kvfree_rcu_bulk_data *bnode;
 	struct kfree_rcu_cpu *krcp =
 		container_of(work, struct kfree_rcu_cpu,
-			page_cache_work);
+			page_cache_work.work);
 	unsigned long flags;
 	bool pushed;
 	int i;
@@ -3452,15 +3452,22 @@ static void fill_page_cache_func(struct work_struct *work)
 	atomic_set(&krcp->work_in_progress, 0);
 }
 
+static bool backoff_page_cache_fill;
+
 static void
 run_page_cache_worker(struct kfree_rcu_cpu *krcp)
 {
 	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
 			!atomic_xchg(&krcp->work_in_progress, 1)) {
-		hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
-			HRTIMER_MODE_REL);
-		krcp->hrtimer.function = schedule_page_work_fn;
-		hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
+		if (READ_ONCE(backoff_page_cache_fill)) {
+			queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, HZ);
+			WRITE_ONCE(backoff_page_cache_fill, false);
+		} else {
+			hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
+				HRTIMER_MODE_REL);
+			krcp->hrtimer.function = schedule_page_work_fn;
+			hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
+		}
 	}
 }
 
@@ -3644,6 +3651,8 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
 		raw_spin_unlock_irqrestore(&krcp->lock, flags);
 	}
 
+	// Low memory condition, limit a page cache worker activity.
+	WRITE_ONCE(backoff_page_cache_fill, true);
 	return count;
 }
 
@@ -4634,7 +4643,7 @@ static void __init kfree_rcu_batch_init(void)
 		}
 
 		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
-		INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
+		INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
 		krcp->initialized = true;
 	}
 	if (register_shrinker(&kfree_rcu_shrinker))

below patch fixes it. We should backoff the page fill worker keeping it empty
until the situation with memory consumption is normalized.

Any thoughts ideas?

--
Vlad Rezki