2024-03-22 13:46:45

by Raghavendra K T

Subject: [RFC PATCH 2 1/1] sched/numa: Increase the VMA accessing PID bits

Currently we use 64 bits to track the tasks accessing a VMA: each
task's PID is hashed onto one of the 64 bits.

With only 64 bits, hash collisions between PIDs are likely. A
collision makes a VMA look as if it had been accessed by a task that
never touched it, i.e. a false positive, and so triggers unnecessary
scanning of that VMA. Increase the tracking width to 128 bits to
reduce the collision probability.

Suggested-by: Ingo Molnar <[email protected]>
Signed-off-by: Raghavendra K T <[email protected]>
---
include/linux/mm.h | 29 ++++++++++++++++++++++++++---
include/linux/mm_types.h | 7 ++++++-
kernel/sched/fair.c | 21 ++++++++++++++++-----
3 files changed, 48 insertions(+), 9 deletions(-)
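
As a sanity check of the false-positive argument in the changelog,
here is a userspace toy (not kernel code: hash_32() is re-implemented
from include/linux/hash.h, and the 32 consecutive PIDs are a made-up
workload) that counts how many PIDs land on an already-set bit in a
64-bit vs. a 128-bit window:

#include <stdio.h>
#include <stdint.h>

/* Same multiplicative hash as hash_32() in include/linux/hash.h. */
#define GOLDEN_RATIO_32 0x61C88647u

static uint32_t hash_32(uint32_t val, unsigned int bits)
{
	return (uint32_t)(val * GOLDEN_RATIO_32) >> (32 - bits);
}

int main(void)
{
	enum { NR_TASKS = 32, BASE_PID = 1000 };	/* made-up workload */
	unsigned int bits;

	for (bits = 6; bits <= 7; bits++) {	/* ilog2(64), ilog2(128) */
		uint64_t used[2] = { 0, 0 };	/* room for 128 bits */
		int collisions = 0, i;

		for (i = 0; i < NR_TASKS; i++) {
			uint32_t b = hash_32(BASE_PID + i, bits);

			/* A bit set by an earlier PID is a false positive. */
			if (used[b / 64] & (1ULL << (b % 64)))
				collisions++;
			else
				used[b / 64] |= 1ULL << (b % 64);
		}
		printf("%3u tracking bits: %d of %d PIDs collided\n",
		       1u << bits, collisions, NR_TASKS);
	}
	return 0;
}

For an ideal hash, the expected number of colliding PIDs in such a
toy roughly halves when going from 64 to 128 bits. To see how a bit
number maps onto the new layout: hash_32(2, 7) == 97, which
pid_array_idx()/pid_bit_idx() place at word 1, bit 33 on a 64-bit
machine (97 / 64 == 1, 97 % 64 == 33).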

Is there a better idea than using an array of two long variables for
the 128 bits? One candidate, the bitmap API, is sketched below.
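
On that question: one candidate (sketch only, untested) is to lean on
the bitmap API from <linux/bitmap.h>, which already knows how to index
multi-word bitmaps, so the pid_array_idx()/pid_bit_idx() helpers and
the open-coded [window][word] indexing would go away:

/* In struct vma_numab_state (mm_types.h) the field would become: */
unsigned long pids_active[NR_ACCESS_PID_HIST][BITS_TO_LONGS(NR_TRACKED_PIDS)];

/*
 * include/linux/mm.h: test_bit()/__set_bit() accept bit numbers
 * beyond BITS_PER_LONG and find the right word themselves.
 */
static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
{
	unsigned int pid_bit = hash_32(current->pid, ilog2(NR_TRACKED_PIDS));

	if (vma->numab_state &&
	    !test_bit(pid_bit, vma->numab_state->pids_active[1]))
		__set_bit(pid_bit, vma->numab_state->pids_active[1]);
}

/* kernel/sched/fair.c, vma_is_accessed(): OR the two windows. */
DECLARE_BITMAP(pids, NR_TRACKED_PIDS);

bitmap_or(pids, vma->numab_state->pids_active[0],
	  vma->numab_state->pids_active[1], NR_TRACKED_PIDS);
if (test_bit(hash_32(current->pid, ilog2(NR_TRACKED_PIDS)), pids))
	return true;

/*
 * kernel/sched/fair.c, task_numa_work() reset path; note this drops
 * the per-word READ_ONCE() of the original and would need a closer
 * look.
 */
bitmap_copy(vma->numab_state->pids_active[0],
	    vma->numab_state->pids_active[1], NR_TRACKED_PIDS);
bitmap_zero(vma->numab_state->pids_active[1], NR_TRACKED_PIDS);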

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f5a97dec5169..d8ff7233cf9b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1741,13 +1741,26 @@ static inline int folio_xchg_access_time(struct folio *folio, int time)
 	return last_time << PAGE_ACCESS_TIME_BUCKETS;
 }
 
+static inline int pid_array_idx(int pid_bit)
+{
+	return pid_bit / BITS_PER_LONG;
+}
+
+static inline int pid_bit_idx(int pid_bit)
+{
+	return pid_bit % BITS_PER_LONG;
+}
+
 static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 {
 	unsigned int pid_bit;
 
-	pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
-	if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->pids_active[1])) {
-		__set_bit(pid_bit, &vma->numab_state->pids_active[1]);
+	pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG * NR_PID_ARRAY));
+
+	if (vma->numab_state && !test_bit(pid_bit_idx(pid_bit),
+	    &vma->numab_state->pids_active[1][pid_array_idx(pid_bit)])) {
+		__set_bit(pid_bit_idx(pid_bit),
+			  &vma->numab_state->pids_active[1][pid_array_idx(pid_bit)]);
 	}
 }
 #else /* !CONFIG_NUMA_BALANCING */
@@ -1800,6 +1813,16 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
 	return false;
 }
 
+static inline int pid_array_idx(int pid_bit)
+{
+	return 0;
+}
+
+static inline int pid_bit_idx(int pid_bit)
+{
+	return 0;
+}
+
 static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 {
 }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 8b611e13153e..34bb8e1f0e1c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -574,6 +574,11 @@ struct vma_lock {
 	struct rw_semaphore lock;
 };
 
+#define NR_PID_ARRAY 2
+#define NR_TRACKED_PIDS (BITS_PER_LONG * NR_PID_ARRAY)
+
+#define NR_ACCESS_PID_HIST 2
+
 struct vma_numab_state {
 	/*
 	 * Initialised as time in 'jiffies' after which VMA
@@ -598,7 +603,7 @@ struct vma_numab_state {
 	 * Window moves after next_pid_reset has expired approximately
 	 * every VMA_PID_RESET_PERIOD jiffies:
 	 */
-	unsigned long pids_active[2];
+	unsigned long pids_active[NR_ACCESS_PID_HIST][NR_PID_ARRAY];
 
 	/* MM scan sequence ID when scan first started after VMA creation */
 	int start_scan_seq;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6a16129f9a5c..63086ca00430 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3159,7 +3159,8 @@ static void reset_ptenuma_scan(struct task_struct *p)
 
 static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
 {
-	unsigned long pids;
+	int pid_bit, pid_aidx, i;
+	unsigned long pids = 0;
 	/*
 	 * Allow unconditional access first two times, so that all the (pages)
 	 * of VMAs get prot_none fault introduced irrespective of accesses.
@@ -3169,8 +3170,13 @@ static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
 	if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2)
 		return true;
 
-	pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
-	if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids))
+	pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG * NR_PID_ARRAY));
+	pid_aidx = pid_array_idx(pid_bit);
+
+	for (i = 0; i < NR_ACCESS_PID_HIST; i++)
+		pids |= vma->numab_state->pids_active[i][pid_aidx];
+
+	if (test_bit(pid_bit_idx(pid_bit), &pids))
 		return true;
 
 	/*
@@ -3204,6 +3210,7 @@ static void task_numa_work(struct callback_head *work)
 	struct vma_iterator vmi;
 	bool vma_pids_skipped;
 	bool vma_pids_forced = false;
+	int i;
 
 	SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
 
@@ -3341,8 +3348,12 @@ static void task_numa_work(struct callback_head *work)
 		    time_after(jiffies, vma->numab_state->pids_active_reset)) {
 			vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset +
 				msecs_to_jiffies(VMA_PID_RESET_PERIOD);
-			vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]);
-			vma->numab_state->pids_active[1] = 0;
+
+			for (i = 0; i < NR_PID_ARRAY; i++) {
+				vma->numab_state->pids_active[0][i] =
+					READ_ONCE(vma->numab_state->pids_active[1][i]);
+				vma->numab_state->pids_active[1][i] = 0;
+			}
 		}
 
 		/* Do not rescan VMAs twice within the same sequence. */
--
2.34.1