Subject: [ANNOUNCE] v5.4.26-rt17

Dear RT folks!

I'm pleased to announce the v5.4.26-rt17 patch set.

Changes since v5.4.26-rt16:

- Revert the "cross CPU pagevec" access with a static key switch. The
old behaviour has been almost fully restored: There is no cross CPU
access of the local_lock. Instead the workqueue is scheduled on
remote CPU like in the !RT case.
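
  As an illustration, a condensed sketch of the restored pattern, lifted
  from the mm/swap.c hunk below (not a complete listing): each pagevec is
  only touched by its owning CPU, protected by a local_lock on RT and by
  plain get_cpu_var()/IRQ disabling otherwise.

    DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
    static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);

    static void __lru_cache_add(struct page *page)
    {
            struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);

            get_page(page);
            if (!pagevec_add(pvec, page) || PageCompound(page))
                    __pagevec_lru_add(pvec);
            put_locked_var(swapvec_lock, lru_add_pvec);
    }

  The page allocator side is restored the same way: drain_all_pages()
  again queues drain_local_pages_wq on each remote CPU via mm_percpu_wq
  instead of draining the remote per-CPU lists directly.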

- Make the `sched_clock_timer' hrtimer expire in hardirq context. Patch
  by Ahmed S. Darwish.
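
  The change boils down to using the hard-interrupt expiry mode, which
  keeps the timer from being deferred to softirq context on RT (mirroring
  the kernel/time/sched_clock.c hunk below):

    hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
    sched_clock_timer.function = sched_clock_poll;
    hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD);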

- The preempt-lazy code on 32-bit PowerPC used the wrong mask and
eventually crashed. Patch by Thomas Graziadei.
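
  For reference, and assuming the preempt-lazy TIF bit is what pushed the
  per-syscall flags into the upper halfword: the non-shifted andi./li
  forms operate on the lower 16 bits, so pairing them with the @h (high
  halfword) selector tested and cleared the wrong bits. The shifted forms
  fix that (see the arch/powerpc/kernel/entry_32.S hunk below):

    andi.  r0,r9,(_TIF_PERSYSCALL_MASK)@h  /* wrong: ANDs the low 16 bits */
    andis. r0,r9,(_TIF_PERSYSCALL_MASK)@h  /* fixed: ANDs the upper 16 bits */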

- Remove the warning that was emitted when a completion woken via
  complete_all() had more than two waiters.
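
  In this series completions are backed by simple waitqueues, so
  complete_all() ends up in swake_up_all_locked(). With the warning (and
  the pm_in_action escape hatch in the suspend/hibernate code) gone, the
  function simply wakes every waiter, as in the kernel/sched/swait.c hunk
  below:

    void swake_up_all_locked(struct swait_queue_head *q)
    {
            struct swait_queue *curr;

            while (!list_empty(&q->task_list)) {
                    curr = list_first_entry(&q->task_list, typeof(*curr),
                                            task_list);
                    wake_up_process(curr->task);
                    list_del_init(&curr->task_list);
            }
    }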

Known issues
- It has been pointed out that due to changes to the printk code the
internal buffer representation changed. This is only an issue if tools
like `crash' are used to extract the printk buffer from a kernel memory
image.

The delta patch against v5.4.26-rt16 is appended below and can be found here:

https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.4/incr/patch-5.4.26-rt16-rt17.patch.xz

You can get this release via the git tree at:

git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v5.4.26-rt17

The RT patch against v5.4.26 can be found here:

https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.4/older/patch-5.4.26-rt17.patch.xz

The split quilt queue is available at:

https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.4/older/patches-5.4.26-rt17.tar.xz

Sebastian

diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 172dfb567c25c..ab609d63d6443 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -533,12 +533,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)

1: stw r6,RESULT(r1) /* Save result */
stw r3,GPR3(r1) /* Update return value */
-2: andi. r0,r9,(_TIF_PERSYSCALL_MASK)@h
+2: andis. r0,r9,(_TIF_PERSYSCALL_MASK)@h
beq 4f

/* Clear per-syscall TIF flags if any are set. */

- li r11,_TIF_PERSYSCALL_MASK@h
+ lis r11,(_TIF_PERSYSCALL_MASK)@h
addi r12,r2,TI_FLAGS
3: lwarx r8,0,r12
andc r8,r8,r11
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 6ce61770ef343..61f2f6ff94673 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -580,7 +580,6 @@ extern void page_frag_free(void *addr);
void page_alloc_init(void);
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
void drain_all_pages(struct zone *zone);
-void drain_cpu_pages(unsigned int cpu, struct zone *zone);
void drain_local_pages(struct zone *zone);

void page_alloc_init_late(void);
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 5030ab13db060..cd97d2c8840cc 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -197,12 +197,6 @@ struct platform_s2idle_ops {
void (*end)(void);
};

-#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
-extern bool pm_in_action;
-#else
-# define pm_in_action false
-#endif
-
#ifdef CONFIG_SUSPEND
extern suspend_state_t mem_sleep_current;
extern suspend_state_t mem_sleep_default;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 063c0c1e112bd..1ddf6a825468e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -12,6 +12,7 @@
#include <linux/fs.h>
#include <linux/atomic.h>
#include <linux/page-flags.h>
+#include <linux/locallock.h>
#include <asm/page.h>

struct notifier_block;
@@ -328,6 +329,7 @@ extern unsigned long nr_free_pagecache_pages(void);


/* linux/mm/swap.c */
+DECLARE_LOCAL_IRQ_LOCK(swapvec_lock);
extern void lru_cache_add(struct page *);
extern void lru_cache_add_anon(struct page *page);
extern void lru_cache_add_file(struct page *page);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index c8713bbc44810..3c0a5a8170b02 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -689,10 +689,6 @@ static int load_image_and_restore(void)
return error;
}

-#ifndef CONFIG_SUSPEND
-bool pm_in_action;
-#endif
-
/**
* hibernate - Carry out system hibernation, including saving the image.
*/
@@ -706,8 +702,6 @@ int hibernate(void)
return -EPERM;
}

- pm_in_action = true;
-
lock_system_sleep();
/* The snapshot device should not be opened while we're running */
if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
@@ -784,7 +778,6 @@ int hibernate(void)
atomic_inc(&snapshot_device_available);
Unlock:
unlock_system_sleep();
- pm_in_action = false;
pr_info("hibernation exit\n");

return error;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index a9a6ada0c8e47..27f149f5d4a9f 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -595,8 +595,6 @@ static int enter_state(suspend_state_t state)
return error;
}

-bool pm_in_action;
-
/**
* pm_suspend - Externally visible function for suspending the system.
* @state: System sleep state to enter.
@@ -611,7 +609,6 @@ int pm_suspend(suspend_state_t state)
if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
return -EINVAL;

- pm_in_action = true;
pr_info("suspend entry (%s)\n", mem_sleep_labels[state]);
error = enter_state(state);
if (error) {
@@ -621,7 +618,6 @@ int pm_suspend(suspend_state_t state)
suspend_stats.success++;
}
pr_info("suspend exit\n");
- pm_in_action = false;
return error;
}
EXPORT_SYMBOL(pm_suspend);
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index a1f9c3f66304c..9fcb2a695a412 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -8,7 +8,6 @@
*
*/
#include "sched.h"
-#include "../../mm/internal.h"

DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
EXPORT_SYMBOL_GPL(housekeeping_overridden);
@@ -140,21 +139,10 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)
static int __init housekeeping_nohz_full_setup(char *str)
{
unsigned int flags;
- int ret;

flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;

- ret = housekeeping_setup(str, flags);
-
- /*
- * Protect struct pagevec with a lock instead using preemption disable;
- * with lock protection, remote handling of events instead of queue
- * work on remote cpu is default behavior.
- */
- if (ret)
- static_branch_enable(&use_pvec_lock);
-
- return ret;
+ return housekeeping_setup(str, flags);
}
__setup("nohz_full=", housekeeping_nohz_full_setup);

diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index c58068d2ee06c..c56c315524ae6 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -35,7 +35,6 @@ EXPORT_SYMBOL(swake_up_locked);
void swake_up_all_locked(struct swait_queue_head *q)
{
struct swait_queue *curr;
- int wakes = 0;

while (!list_empty(&q->task_list)) {

@@ -43,11 +42,7 @@ void swake_up_all_locked(struct swait_queue_head *q)
task_list);
wake_up_process(curr->task);
list_del_init(&curr->task_list);
- wakes++;
}
- if (pm_in_action)
- return;
- WARN(wakes > 2, "complete_all() with %d waiters\n", wakes);
}
EXPORT_SYMBOL(swake_up_all_locked);

diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index dbd69052eaa66..a5538dd76a819 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -207,7 +207,8 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)

if (sched_clock_timer.function != NULL) {
/* update timeout for clock wrap */
- hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
+ hrtimer_start(&sched_clock_timer, cd.wrap_kt,
+ HRTIMER_MODE_REL_HARD);
}

r = rate;
@@ -251,9 +252,9 @@ void __init generic_sched_clock_init(void)
* Start the timer to keep sched_clock() properly updated and
* sets the initial epoch.
*/
- hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
sched_clock_timer.function = sched_clock_poll;
- hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
+ hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD);
}

/*
@@ -290,7 +291,7 @@ void sched_clock_resume(void)
struct clock_read_data *rd = &cd.read_data[0];

rd->epoch_cyc = cd.actual_read_sched_clock();
- hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
+ hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD);
rd->read_sched_clock = cd.actual_read_sched_clock;
}

diff --git a/localversion-rt b/localversion-rt
index 1199ebade17b4..1e584b47c987e 100644
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt16
+-rt17
diff --git a/mm/compaction.c b/mm/compaction.c
index 03c9e335fa8b0..83cc3d1e5df7b 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2244,16 +2244,12 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
block_start_pfn(cc->migrate_pfn, cc->order);

if (last_migrated_pfn < current_block_start) {
- if (static_branch_likely(&use_pvec_lock)) {
- cpu = raw_smp_processor_id();
- lru_add_drain_cpu(cpu);
- drain_cpu_pages(cpu, cc->zone);
- } else {
- cpu = get_cpu();
- lru_add_drain_cpu(cpu);
- drain_local_pages(cc->zone);
- put_cpu();
- }
+ cpu = get_cpu_light();
+ local_lock_irq(swapvec_lock);
+ lru_add_drain_cpu(cpu);
+ local_unlock_irq(swapvec_lock);
+ drain_local_pages(cc->zone);
+ put_cpu_light();
/* No more flushing until we migrate again */
last_migrated_pfn = 0;
}
diff --git a/mm/internal.h b/mm/internal.h
index 874741f333cf8..7dd7fbb577a9a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -32,12 +32,6 @@
/* Do not use these with a slab allocator */
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)

-#ifdef CONFIG_PREEMPT_RT
-extern struct static_key_true use_pvec_lock;
-#else
-extern struct static_key_false use_pvec_lock;
-#endif
-
void page_writeback_init(void);

vm_fault_t do_swap_page(struct vm_fault *vmf);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b9bbc72c073e1..7df9930628c2b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2883,14 +2883,6 @@ static void drain_pages(unsigned int cpu)
}
}

-void drain_cpu_pages(unsigned int cpu, struct zone *zone)
-{
- if (zone)
- drain_pages_zone(cpu, zone);
- else
- drain_pages(cpu);
-}
-
/*
* Spill all of this CPU's per-cpu pages back into the buddy allocator.
*
@@ -2901,7 +2893,10 @@ void drain_local_pages(struct zone *zone)
{
int cpu = smp_processor_id();

- drain_cpu_pages(cpu, zone);
+ if (zone)
+ drain_pages_zone(cpu, zone);
+ else
+ drain_pages(cpu);
}

static void drain_local_pages_wq(struct work_struct *work)
@@ -2917,9 +2912,9 @@ static void drain_local_pages_wq(struct work_struct *work)
* cpu which is allright but we also have to make sure to not move to
* a different one.
*/
- preempt_disable();
+ migrate_disable();
drain_local_pages(drain->zone);
- preempt_enable();
+ migrate_enable();
}

/*
@@ -2988,20 +2983,15 @@ void drain_all_pages(struct zone *zone)
cpumask_clear_cpu(cpu, &cpus_with_pcps);
}

- if (static_branch_likely(&use_pvec_lock)) {
- for_each_cpu(cpu, &cpus_with_pcps)
- drain_cpu_pages(cpu, zone);
- } else {
- for_each_cpu(cpu, &cpus_with_pcps) {
- struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
+ for_each_cpu(cpu, &cpus_with_pcps) {
+ struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);

- drain->zone = zone;
- INIT_WORK(&drain->work, drain_local_pages_wq);
- queue_work_on(cpu, mm_percpu_wq, &drain->work);
- }
- for_each_cpu(cpu, &cpus_with_pcps)
- flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
+ drain->zone = zone;
+ INIT_WORK(&drain->work, drain_local_pages_wq);
+ queue_work_on(cpu, mm_percpu_wq, &drain->work);
}
+ for_each_cpu(cpu, &cpus_with_pcps)
+ flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);

mutex_unlock(&pcpu_drain_mutex);
}
@@ -7686,8 +7676,9 @@ void __init free_area_init(unsigned long *zones_size)

static int page_alloc_cpu_dead(unsigned int cpu)
{
-
+ local_lock_irq_on(swapvec_lock, cpu);
lru_add_drain_cpu(cpu);
+ local_unlock_irq_on(swapvec_lock, cpu);
drain_pages(cpu);

/*
diff --git a/mm/swap.c b/mm/swap.c
index 98accaf51fce8..cdb4f1fa3a483 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -33,6 +33,7 @@
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
+#include <linux/locallock.h>
#include <linux/hugetlb.h>
#include <linux/page_idle.h>

@@ -44,111 +45,16 @@
/* How many pages do we try to swap or page in/out together? */
int page_cluster;

-#ifdef CONFIG_PREEMPT_RT
-DEFINE_STATIC_KEY_TRUE(use_pvec_lock);
-#else
-DEFINE_STATIC_KEY_FALSE(use_pvec_lock);
-#endif
-
-struct swap_pagevec {
- spinlock_t lock;
- struct pagevec pvec;
-};
-
-#define DEFINE_PER_CPU_PAGEVEC(lvar) \
- DEFINE_PER_CPU(struct swap_pagevec, lvar) = { \
- .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) }
-
-static DEFINE_PER_CPU_PAGEVEC(lru_add_pvec);
-static DEFINE_PER_CPU_PAGEVEC(lru_rotate_pvecs);
-static DEFINE_PER_CPU_PAGEVEC(lru_deactivate_file_pvecs);
-static DEFINE_PER_CPU_PAGEVEC(lru_deactivate_pvecs);
-static DEFINE_PER_CPU_PAGEVEC(lru_lazyfree_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
+static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
#ifdef CONFIG_SMP
-static DEFINE_PER_CPU_PAGEVEC(activate_page_pvecs);
+static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
#endif
-
-static inline
-struct swap_pagevec *lock_swap_pvec(struct swap_pagevec __percpu *p)
-{
- struct swap_pagevec *swpvec;
-
- if (static_branch_likely(&use_pvec_lock)) {
- swpvec = raw_cpu_ptr(p);
-
- spin_lock(&swpvec->lock);
- } else {
- swpvec = &get_cpu_var(*p);
- }
- return swpvec;
-}
-
-static inline struct swap_pagevec *
-lock_swap_pvec_cpu(struct swap_pagevec __percpu *p, int cpu)
-{
- struct swap_pagevec *swpvec = per_cpu_ptr(p, cpu);
-
- if (static_branch_likely(&use_pvec_lock))
- spin_lock(&swpvec->lock);
-
- return swpvec;
-}
-
-static inline struct swap_pagevec *
-lock_swap_pvec_irqsave(struct swap_pagevec __percpu *p, unsigned long *flags)
-{
- struct swap_pagevec *swpvec;
-
- if (static_branch_likely(&use_pvec_lock)) {
- swpvec = raw_cpu_ptr(p);
-
- spin_lock_irqsave(&swpvec->lock, (*flags));
- } else {
- local_irq_save(*flags);
-
- swpvec = this_cpu_ptr(p);
- }
- return swpvec;
-}
-
-static inline struct swap_pagevec *
-lock_swap_pvec_cpu_irqsave(struct swap_pagevec __percpu *p, int cpu,
- unsigned long *flags)
-{
- struct swap_pagevec *swpvec = per_cpu_ptr(p, cpu);
-
- if (static_branch_likely(&use_pvec_lock))
- spin_lock_irqsave(&swpvec->lock, *flags);
- else
- local_irq_save(*flags);
-
- return swpvec;
-}
-
-static inline void unlock_swap_pvec(struct swap_pagevec *swpvec,
- struct swap_pagevec __percpu *p)
-{
- if (static_branch_likely(&use_pvec_lock))
- spin_unlock(&swpvec->lock);
- else
- put_cpu_var(*p);
-
-}
-
-static inline void unlock_swap_pvec_cpu(struct swap_pagevec *swpvec)
-{
- if (static_branch_likely(&use_pvec_lock))
- spin_unlock(&swpvec->lock);
-}
-
-static inline void
-unlock_swap_pvec_irqrestore(struct swap_pagevec *swpvec, unsigned long flags)
-{
- if (static_branch_likely(&use_pvec_lock))
- spin_unlock_irqrestore(&swpvec->lock, flags);
- else
- local_irq_restore(flags);
-}
+static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
+DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);

/*
* This path almost never happens for VM activity - pages are normally
@@ -347,17 +253,15 @@ void rotate_reclaimable_page(struct page *page)
{
if (!PageLocked(page) && !PageDirty(page) &&
!PageUnevictable(page) && PageLRU(page)) {
- struct swap_pagevec *swpvec;
struct pagevec *pvec;
unsigned long flags;

get_page(page);
-
- swpvec = lock_swap_pvec_irqsave(&lru_rotate_pvecs, &flags);
- pvec = &swpvec->pvec;
+ local_lock_irqsave(rotate_lock, flags);
+ pvec = this_cpu_ptr(&lru_rotate_pvecs);
if (!pagevec_add(pvec, page) || PageCompound(page))
pagevec_move_tail(pvec);
- unlock_swap_pvec_irqrestore(swpvec, flags);
+ local_unlock_irqrestore(rotate_lock, flags);
}
}

@@ -392,32 +296,28 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
#ifdef CONFIG_SMP
static void activate_page_drain(int cpu)
{
- struct swap_pagevec *swpvec = lock_swap_pvec_cpu(&activate_page_pvecs, cpu);
- struct pagevec *pvec = &swpvec->pvec;
+ struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);

if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, __activate_page, NULL);
- unlock_swap_pvec_cpu(swpvec);
}

static bool need_activate_page_drain(int cpu)
{
- return pagevec_count(per_cpu_ptr(&activate_page_pvecs.pvec, cpu)) != 0;
+ return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
}

void activate_page(struct page *page)
{
page = compound_head(page);
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
- struct swap_pagevec *swpvec;
- struct pagevec *pvec;
+ struct pagevec *pvec = &get_locked_var(swapvec_lock,
+ activate_page_pvecs);

get_page(page);
- swpvec = lock_swap_pvec(&activate_page_pvecs);
- pvec = &swpvec->pvec;
if (!pagevec_add(pvec, page) || PageCompound(page))
pagevec_lru_move_fn(pvec, __activate_page, NULL);
- unlock_swap_pvec(swpvec, &activate_page_pvecs);
+ put_locked_var(swapvec_lock, activate_page_pvecs);
}
}

@@ -439,8 +339,7 @@ void activate_page(struct page *page)

static void __lru_cache_activate_page(struct page *page)
{
- struct swap_pagevec *swpvec = lock_swap_pvec(&lru_add_pvec);
- struct pagevec *pvec = &swpvec->pvec;
+ struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
int i;

/*
@@ -462,7 +361,7 @@ static void __lru_cache_activate_page(struct page *page)
}
}

- unlock_swap_pvec(swpvec, &lru_add_pvec);
+ put_locked_var(swapvec_lock, lru_add_pvec);
}

/*
@@ -504,13 +403,12 @@ EXPORT_SYMBOL(mark_page_accessed);

static void __lru_cache_add(struct page *page)
{
- struct swap_pagevec *swpvec = lock_swap_pvec(&lru_add_pvec);
- struct pagevec *pvec = &swpvec->pvec;
+ struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);

get_page(page);
if (!pagevec_add(pvec, page) || PageCompound(page))
__pagevec_lru_add(pvec);
- unlock_swap_pvec(swpvec, &lru_add_pvec);
+ put_locked_var(swapvec_lock, lru_add_pvec);
}

/**
@@ -694,40 +592,38 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
*/
void lru_add_drain_cpu(int cpu)
{
- struct swap_pagevec *swpvec = lock_swap_pvec_cpu(&lru_add_pvec, cpu);
- struct pagevec *pvec = &swpvec->pvec;
- unsigned long flags;
+ struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu);

if (pagevec_count(pvec))
__pagevec_lru_add(pvec);
- unlock_swap_pvec_cpu(swpvec);

- swpvec = lock_swap_pvec_cpu_irqsave(&lru_rotate_pvecs, cpu, &flags);
- pvec = &swpvec->pvec;
+ pvec = &per_cpu(lru_rotate_pvecs, cpu);
if (pagevec_count(pvec)) {
+ unsigned long flags;

/* No harm done if a racing interrupt already did this */
+#ifdef CONFIG_PREEMPT_RT
+ local_lock_irqsave_on(rotate_lock, flags, cpu);
pagevec_move_tail(pvec);
+ local_unlock_irqrestore_on(rotate_lock, flags, cpu);
+#else
+ local_lock_irqsave(rotate_lock, flags);
+ pagevec_move_tail(pvec);
+ local_unlock_irqrestore(rotate_lock, flags);
+#endif
}
- unlock_swap_pvec_irqrestore(swpvec, flags);

- swpvec = lock_swap_pvec_cpu(&lru_deactivate_file_pvecs, cpu);
- pvec = &swpvec->pvec;
+ pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
- unlock_swap_pvec_cpu(swpvec);

- swpvec = lock_swap_pvec_cpu(&lru_deactivate_pvecs, cpu);
- pvec = &swpvec->pvec;
+ pvec = &per_cpu(lru_deactivate_pvecs, cpu);
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
- unlock_swap_pvec_cpu(swpvec);

- swpvec = lock_swap_pvec_cpu(&lru_lazyfree_pvecs, cpu);
- pvec = &swpvec->pvec;
+ pvec = &per_cpu(lru_lazyfree_pvecs, cpu);
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
- unlock_swap_pvec_cpu(swpvec);

activate_page_drain(cpu);
}
@@ -742,9 +638,6 @@ void lru_add_drain_cpu(int cpu)
*/
void deactivate_file_page(struct page *page)
{
- struct swap_pagevec *swpvec;
- struct pagevec *pvec;
-
/*
* In a workload with many unevictable page such as mprotect,
* unevictable page deactivation for accelerating reclaim is pointless.
@@ -753,12 +646,12 @@ void deactivate_file_page(struct page *page)
return;

if (likely(get_page_unless_zero(page))) {
- swpvec = lock_swap_pvec(&lru_deactivate_file_pvecs);
- pvec = &swpvec->pvec;
+ struct pagevec *pvec = &get_locked_var(swapvec_lock,
+ lru_deactivate_file_pvecs);

if (!pagevec_add(pvec, page) || PageCompound(page))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
- unlock_swap_pvec(swpvec, &lru_deactivate_file_pvecs);
+ put_locked_var(swapvec_lock, lru_deactivate_file_pvecs);
}
}

@@ -773,16 +666,12 @@ void deactivate_file_page(struct page *page)
void deactivate_page(struct page *page)
{
if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
- struct swap_pagevec *swpvec;
- struct pagevec *pvec;
-
- swpvec = lock_swap_pvec(&lru_deactivate_pvecs);
- pvec = &swpvec->pvec;
+ struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);

get_page(page);
if (!pagevec_add(pvec, page) || PageCompound(page))
pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
- unlock_swap_pvec(swpvec, &lru_deactivate_pvecs);
+ put_cpu_var(lru_deactivate_pvecs);
}
}

@@ -795,33 +684,36 @@ void deactivate_page(struct page *page)
*/
void mark_page_lazyfree(struct page *page)
{
- struct swap_pagevec *swpvec;
- struct pagevec *pvec;
-
if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
!PageSwapCache(page) && !PageUnevictable(page)) {
- swpvec = lock_swap_pvec(&lru_lazyfree_pvecs);
- pvec = &swpvec->pvec;
+ struct pagevec *pvec = &get_locked_var(swapvec_lock,
+ lru_lazyfree_pvecs);

get_page(page);
if (!pagevec_add(pvec, page) || PageCompound(page))
pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
- unlock_swap_pvec(swpvec, &lru_lazyfree_pvecs);
+ put_locked_var(swapvec_lock, lru_lazyfree_pvecs);
}
}

void lru_add_drain(void)
{
- if (static_branch_likely(&use_pvec_lock)) {
- lru_add_drain_cpu(raw_smp_processor_id());
- } else {
- lru_add_drain_cpu(get_cpu());
- put_cpu();
- }
+ lru_add_drain_cpu(local_lock_cpu(swapvec_lock));
+ local_unlock_cpu(swapvec_lock);
}

#ifdef CONFIG_SMP

+#ifdef CONFIG_PREEMPT_RT
+static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
+{
+ local_lock_on(swapvec_lock, cpu);
+ lru_add_drain_cpu(cpu);
+ local_unlock_on(swapvec_lock, cpu);
+}
+
+#else
+
static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);

static void lru_add_drain_per_cpu(struct work_struct *dummy)
@@ -829,6 +721,16 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
lru_add_drain();
}

+static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
+{
+ struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
+
+ INIT_WORK(work, lru_add_drain_per_cpu);
+ queue_work_on(cpu, mm_percpu_wq, work);
+ cpumask_set_cpu(cpu, has_work);
+}
+#endif
+
/*
* Doesn't need any cpu hotplug locking because we do rely on per-cpu
* kworkers being shut down before our page_alloc_cpu_dead callback is
@@ -838,54 +740,37 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
*/
void lru_add_drain_all(void)
{
- if (static_branch_likely(&use_pvec_lock)) {
- int cpu;
+ static DEFINE_MUTEX(lock);
+ static struct cpumask has_work;
+ int cpu;

- for_each_online_cpu(cpu) {
- if (pagevec_count(&per_cpu(lru_add_pvec.pvec, cpu)) ||
- pagevec_count(&per_cpu(lru_rotate_pvecs.pvec, cpu)) ||
- pagevec_count(&per_cpu(lru_deactivate_file_pvecs.pvec, cpu)) ||
- pagevec_count(&per_cpu(lru_deactivate_pvecs.pvec, cpu)) ||
- pagevec_count(&per_cpu(lru_lazyfree_pvecs.pvec, cpu)) ||
- need_activate_page_drain(cpu)) {
- lru_add_drain_cpu(cpu);
- }
- }
- } else {
- static DEFINE_MUTEX(lock);
- static struct cpumask has_work;
- int cpu;
+ /*
+ * Make sure nobody triggers this path before mm_percpu_wq is fully
+ * initialized.
+ */
+ if (WARN_ON(!mm_percpu_wq))
+ return;

- /*
- * Make sure nobody triggers this path before mm_percpu_wq
- * is fully initialized.
- */
- if (WARN_ON(!mm_percpu_wq))
- return;
+ mutex_lock(&lock);
+ cpumask_clear(&has_work);

- mutex_lock(&lock);
- cpumask_clear(&has_work);
+ for_each_online_cpu(cpu) {

- for_each_online_cpu(cpu) {
- struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
-
- if (pagevec_count(&per_cpu(lru_add_pvec.pvec, cpu)) ||
- pagevec_count(&per_cpu(lru_rotate_pvecs.pvec, cpu)) ||
- pagevec_count(&per_cpu(lru_deactivate_file_pvecs.pvec, cpu)) ||
- pagevec_count(&per_cpu(lru_deactivate_pvecs.pvec, cpu)) ||
- pagevec_count(&per_cpu(lru_lazyfree_pvecs.pvec, cpu)) ||
- need_activate_page_drain(cpu)) {
- INIT_WORK(work, lru_add_drain_per_cpu);
- queue_work_on(cpu, mm_percpu_wq, work);
- cpumask_set_cpu(cpu, &has_work);
- }
- }
-
- for_each_cpu(cpu, &has_work)
- flush_work(&per_cpu(lru_add_drain_work, cpu));
-
- mutex_unlock(&lock);
+ if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
+ pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
+ pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
+ pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
+ pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
+ need_activate_page_drain(cpu))
+ remote_lru_add_drain(cpu, &has_work);
}
+
+#ifndef CONFIG_PREEMPT_RT
+ for_each_cpu(cpu, &has_work)
+ flush_work(&per_cpu(lru_add_drain_work, cpu));
+#endif
+
+ mutex_unlock(&lock);
}
#else
void lru_add_drain_all(void)


Subject: Re: [ANNOUNCE] v5.4.26-rt17
From: Joe Korty
Date: 2020-03-30 16:33:54

On Fri, Mar 20, 2020 at 08:28:30PM +0100, Sebastian Andrzej Siewior wrote:
> Dear RT folks!
>
> I'm pleased to announce the v5.4.26-rt17 patch set.
>
> Changes since v5.4.26-rt16:
>
> - Revert the "cross CPU pagevec" handling that was toggled via a static
>   key. The old behaviour is almost fully restored: there is no cross-CPU
>   access of the local_lock. Instead, the drain work is queued on the
>   remote CPU as in the !RT case.

[try #2]
This reversion fixes the oops I was getting with CONFIG_PREEMPT_RT=n,
CONFIG_PREEMPT=y, and nohz_full operational, when the system is under
heavy memory pressure (mem=4G on the kernel command line, an 8-CPU setup,
make -j11 kernel build).