Subject: [PATCH v2 0/8] kernel/fork: Move thread stack freeing out of the scheduler path.

This is a follow-up to the patch
sched: Delay task stack freeing on RT
https://lkml.kernel.org/r/[email protected]

It addresses the review feedback:
- Decouple stack accounting from its free invocation. The accounting
happens in do_exit(); the final free call happens later.

- Add put_task_stack_sched() to finish_task_switch(). Here the VMAP
stack is cached only. If that fails, or in the !VMAP case, the final
free happens in delayed_put_task_struct(). This is also an opportunity
to cache the stack.
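
In code, the resulting free path boils down to this (VMAP case, lifted
from patch 7/8 of this series; comments added):

static void free_thread_stack(struct task_struct *tsk)
{
	/* Try to park the stack in the per-CPU cache first ... */
	if (!try_release_thread_stack_to_cache(tsk->stack_vm_area))
		/* ... otherwise hand it to an RCU callback for the final free. */
		thread_stack_delayed_free(tsk);

	tsk->stack = NULL;
	tsk->stack_vm_area = NULL;
}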

Changes since v1:
- Drop the bit marking and use an additional RCU head to free the stack in
case it can't be cached. Suggested by Andy Lutomirski.

v1: https://lore.kernel.org/all/[email protected]

From testing I observe the following:

| bash-1715 [006] ..... 124.901510: copy_process: allocC ffffc90007e70000
| sh-cmds.sh-1746 [007] ..... 124.907389: copy_process: allocC ffffc90007dc4000
| <idle>-0 [019] ...1. 124.918126: free_thread_stack: cache ffffc90007dc4000
| sh-cmds.sh-1746 [007] ..... 124.918279: copy_process: allocC ffffc90007de8000
| <idle>-0 [004] ...1. 124.920121: free_thread_stack: delay ffffc90007de8001
| <idle>-0 [007] ...1. 124.920299: free_thread_stack: cache ffffc90007e70000
| <idle>-0 [007] ..s1. 124.945433: free_thread_stack: cache ffffc90007de8000

TS 124.901510, bash started sh-cmds.sh and obtained the stack from the
cache.
TS 124.907389, the script invokes its first command and obtains the stack
from the cache. As you can see, bash was running on CPU6 but its child was
moved to CPU7.
TS 124.918126, the first command is done, its stack is cached on CPU19.
TS 124.918279, the script's second command obtains its stack from the
cache.
TS 124.920121, the command is done. The stack cache on CPU4 is full, so
the free is RCU-delayed.
TS 124.920299, the script is done, its stack is cached on CPU7.
TS 124.945433, the RCU callback for the last command's stack runs on
CPU7, which is where the command was invoked (but not where it ran).
Instead of being freed, the stack was cached since CPU7 had an empty slot.

If I pin the script to CPU5 and run it with multiple commands, then it
works as expected:

| bash-1799 [005] ..... 993.608131: copy_process: allocC ffffc90007fa0000
| sh-cmds.sh-1827 [005] ..... 993.608888: copy_process: allocC ffffc90007fa8000
| sh-cmds.sh-1827 [005] ..... 993.610734: copy_process: allocV ffffc90007ff4000
| sh-cmds.sh-1829 [005] ...1. 993.610757: free_thread_stack: cache ffffc90007fa8000
| sh-cmds.sh-1827 [005] ..... 993.612401: copy_process: allocC ffffc90007fa8000
| <...>-1830 [005] ...1. 993.612416: free_thread_stack: cache ffffc90007ff4000
| sh-cmds.sh-1827 [005] ..... 993.613707: copy_process: allocC ffffc90007ff4000
| sh-cmds.sh-1831 [005] ...1. 993.613723: free_thread_stack: cache ffffc90007fa8000
| sh-cmds.sh-1827 [005] ..... 993.615024: copy_process: allocC ffffc90007fa8000
| <...>-1832 [005] ...1. 993.615040: free_thread_stack: cache ffffc90007ff4000
| sh-cmds.sh-1827 [005] ..... 993.616380: copy_process: allocC ffffc90007ff4000
| <...>-1833 [005] ...1. 993.616397: free_thread_stack: cache ffffc90007fa8000
| bash-1799 [005] ...1. 993.617759: free_thread_stack: cache ffffc90007fa0000
| <idle>-0 [005] ...1. 993.617871: free_thread_stack: delay ffffc90007ff4001
| <idle>-0 [005] ..s1. 993.638311: free_thread_stack: free ffffc90007ff4000

and no new stack is allocated during its runtime; a cached stack is used.

Sebastian



Subject: [PATCH v2 3/8] kernel/fork, IA64: Provide an alloc_thread_stack_node() for IA64.

Provide a generic alloc_thread_stack_node() for IA64 and
CONFIG_ARCH_THREAD_STACK_ALLOCATOR which returns the stack pointer and sets
task_struct::stack so it behaves exactly like the other implementations.

Rename IA64's alloc_thread_stack_node() and add the generic version to
the fork code so it is in one place _and_ to drastically lower the chances
of fat fingering the IA64 code.
Do the same for free_thread_stack().

Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
---
arch/ia64/include/asm/thread_info.h | 6 +++---
kernel/fork.c | 17 +++++++++++++++++
2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h
index 51d20cb377062..1684716f08201 100644
--- a/arch/ia64/include/asm/thread_info.h
+++ b/arch/ia64/include/asm/thread_info.h
@@ -55,15 +55,15 @@ struct thread_info {
#ifndef ASM_OFFSETS_C
/* how to get the thread information struct from C */
#define current_thread_info() ((struct thread_info *) ((char *) current + IA64_TASK_SIZE))
-#define alloc_thread_stack_node(tsk, node) \
+#define arch_alloc_thread_stack_node(tsk, node) \
((unsigned long *) ((char *) (tsk) + IA64_TASK_SIZE))
#define task_thread_info(tsk) ((struct thread_info *) ((char *) (tsk) + IA64_TASK_SIZE))
#else
#define current_thread_info() ((struct thread_info *) 0)
-#define alloc_thread_stack_node(tsk, node) ((unsigned long *) 0)
+#define arch_alloc_thread_stack_node(tsk, node) ((unsigned long *) 0)
#define task_thread_info(tsk) ((struct thread_info *) 0)
#endif
-#define free_thread_stack(tsk) /* nothing */
+#define arch_free_thread_stack(tsk) /* nothing */
#define task_stack_page(tsk) ((void *)(tsk))

#define __HAVE_THREAD_FUNCTIONS
diff --git a/kernel/fork.c b/kernel/fork.c
index c47dcba5d66d2..a6697215fe663 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -330,6 +330,23 @@ void thread_stack_cache_init(void)
}

# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
+#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
+
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
+{
+ unsigned long *stack;
+
+ stack = arch_alloc_thread_stack_node(tsk, node);
+ tsk->stack = stack;
+ return stack;
+}
+
+static void free_thread_stack(struct task_struct *tsk)
+{
+ arch_free_thread_stack(tsk);
+ tsk->stack = NULL;
+}
+
#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */

/* SLAB cache for signal_struct structures (tsk->signal) */
--
2.34.1

Subject: [PATCH v2 4/8] kernel/fork: Don't assign the stack pointer in dup_task_struct().

All four versions of alloc_thread_stack_node() now assign
task_struct::stack if the allocation was successful.

Let alloc_thread_stack_node() return an error code instead of the stack
pointer and remove the stack assignment in dup_task_struct().

Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
---
kernel/fork.c | 47 ++++++++++++++++-------------------------------
1 file changed, 16 insertions(+), 31 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index a6697215fe663..546bea2e3b28a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -211,7 +211,7 @@ static int free_vm_stack_cache(unsigned int cpu)
return 0;
}

-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
void *stack;
int i;
@@ -232,7 +232,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)

tsk->stack_vm_area = s;
tsk->stack = s->addr;
- return s->addr;
+ return 0;
}

/*
@@ -245,17 +245,16 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
THREADINFO_GFP & ~__GFP_ACCOUNT,
PAGE_KERNEL,
0, node, __builtin_return_address(0));
-
+ if (!stack)
+ return -ENOMEM;
/*
* We can't call find_vm_area() in interrupt context, and
* free_thread_stack() can be called in interrupt context,
* so cache the vm_struct.
*/
- if (stack) {
- tsk->stack_vm_area = find_vm_area(stack);
- tsk->stack = stack;
- }
- return stack;
+ tsk->stack_vm_area = find_vm_area(stack);
+ tsk->stack = stack;
+ return 0;
}

static void free_thread_stack(struct task_struct *tsk)
@@ -282,16 +281,16 @@ static void free_thread_stack(struct task_struct *tsk)

# else /* !CONFIG_VMAP_STACK */

-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
struct page *page = alloc_pages_node(node, THREADINFO_GFP,
THREAD_SIZE_ORDER);

if (likely(page)) {
tsk->stack = kasan_reset_tag(page_address(page));
- return tsk->stack;
+ return 0;
}
- return NULL;
+ return -ENOMEM;
}

static void free_thread_stack(struct task_struct *tsk)
@@ -305,14 +304,13 @@ static void free_thread_stack(struct task_struct *tsk)

static struct kmem_cache *thread_stack_cache;

-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
- int node)
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
unsigned long *stack;
stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
stack = kasan_reset_tag(stack);
tsk->stack = stack;
- return stack;
+ return stack ? 0 : -ENOMEM;
}

static void free_thread_stack(struct task_struct *tsk)
@@ -332,13 +330,13 @@ void thread_stack_cache_init(void)
# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */

-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
unsigned long *stack;

stack = arch_alloc_thread_stack_node(tsk, node);
tsk->stack = stack;
- return stack;
+ return stack ? 0 : -ENOMEM;
}

static void free_thread_stack(struct task_struct *tsk)
@@ -895,8 +893,6 @@ void set_task_stack_end_magic(struct task_struct *tsk)
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
struct task_struct *tsk;
- unsigned long *stack;
- struct vm_struct *stack_vm_area __maybe_unused;
int err;

if (node == NUMA_NO_NODE)
@@ -909,24 +905,13 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
if (err)
goto free_tsk;

- stack = alloc_thread_stack_node(tsk, node);
- if (!stack)
+ err = alloc_thread_stack_node(tsk, node);
+ if (err)
goto free_tsk;

if (memcg_charge_kernel_stack(tsk))
goto free_stack;

- stack_vm_area = task_stack_vm_area(tsk);
-
- /*
- * arch_dup_task_struct() clobbers the stack-related fields. Make
- * sure they're properly initialized before using any stack-related
- * functions again.
- */
- tsk->stack = stack;
-#ifdef CONFIG_VMAP_STACK
- tsk->stack_vm_area = stack_vm_area;
-#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
refcount_set(&tsk->stack_refcount, 1);
#endif
--
2.34.1

Subject: [PATCH v2 7/8] kernel/fork: Only cache the VMAP stack in finish_task_switch().

The task stack could be deallocated later, but for fork()/exec() kind of
workloads (say a shell script executing several commands) it is important
that the stack is released in finish_task_switch() so that in the
VMAP_STACK case it can be cached and reused in the new task.

For PREEMPT_RT it would be good if the wake-up in vfree_atomic() could
be avoided in the scheduling path. Far worse are the other
free_thread_stack() implementations which invoke __free_pages()/
kmem_cache_free() with preemption disabled.

Cache the stack in free_thread_stack() in the VMAP_STACK case and
RCU-delay the free path otherwise. Free the stack in the RCU callback.
In the VMAP_STACK case this is another opportunity to fill the cache.
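
Note that the free path needs no extra allocation for the RCU bookkeeping:
the task is dead by the time free_thread_stack() runs, so the stack memory
itself can carry it. In the VMAP case the patch overlays a small struct at
tsk->stack (from the diff below; comments added here for illustration):

struct vm_stack {
	struct rcu_head rcu;			/* lives in the dead task's stack */
	struct vm_struct *stack_vm_area;	/* needed by the RCU callback */
};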

Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
---
kernel/fork.c | 76 ++++++++++++++++++++++++++++++++++++++++++---------
1 file changed, 63 insertions(+), 13 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 984f69d6f211f..aa17ed2a2afc7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -193,6 +193,41 @@ static inline void free_task_struct(struct task_struct *tsk)
#define NR_CACHED_STACKS 2
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);

+struct vm_stack {
+ struct rcu_head rcu;
+ struct vm_struct *stack_vm_area;
+};
+
+static bool try_release_thread_stack_to_cache(struct vm_struct *vm)
+{
+ unsigned int i;
+
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+ if (this_cpu_cmpxchg(cached_stacks[i], NULL, vm) != NULL)
+ continue;
+ return true;
+ }
+ return false;
+}
+
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+ struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu);
+
+ if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area))
+ return;
+
+ vfree(vm_stack);
+}
+
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct vm_stack *vm_stack = tsk->stack;
+
+ vm_stack->stack_vm_area = tsk->stack_vm_area;
+ call_rcu(&vm_stack->rcu, thread_stack_free_rcu);
+}
+
static int free_vm_stack_cache(unsigned int cpu)
{
struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
@@ -296,24 +331,27 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)

static void free_thread_stack(struct task_struct *tsk)
{
- int i;
+ if (!try_release_thread_stack_to_cache(tsk->stack_vm_area))
+ thread_stack_delayed_free(tsk);

- for (i = 0; i < NR_CACHED_STACKS; i++) {
- if (this_cpu_cmpxchg(cached_stacks[i], NULL,
- tsk->stack_vm_area) != NULL)
- continue;
-
- tsk->stack = NULL;
- tsk->stack_vm_area = NULL;
- return;
- }
- vfree_atomic(tsk->stack);
tsk->stack = NULL;
tsk->stack_vm_area = NULL;
}

# else /* !CONFIG_VMAP_STACK */

+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+ __free_pages(virt_to_page(rh), THREAD_SIZE_ORDER);
+}
+
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct rcu_head *rh = tsk->stack;
+
+ call_rcu(rh, thread_stack_free_rcu);
+}
+
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
struct page *page = alloc_pages_node(node, THREADINFO_GFP,
@@ -328,7 +366,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)

static void free_thread_stack(struct task_struct *tsk)
{
- __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
+ thread_stack_delayed_free(tsk);
tsk->stack = NULL;
}

@@ -337,6 +375,18 @@ static void free_thread_stack(struct task_struct *tsk)

static struct kmem_cache *thread_stack_cache;

+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+ kmem_cache_free(thread_stack_cache, rh);
+}
+
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct rcu_head *rh = tsk->stack;
+
+ call_rcu(rh, thread_stack_free_rcu);
+}
+
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
unsigned long *stack;
@@ -348,7 +398,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)

static void free_thread_stack(struct task_struct *tsk)
{
- kmem_cache_free(thread_stack_cache, tsk->stack);
+ thread_stack_delayed_free(tsk);
tsk->stack = NULL;
}

--
2.34.1

Subject: [PATCH v2 6/8] kernel/fork: Move task stack accounting to do_exit().

There is no need to perform the stack accounting of the outgoing task in
its final schedule() invocation which happens with preemption disabled.
The task is leaving, the resources will be freed and the accounting can
happen in do_exit() before the actual schedule invocation which
frees the stack memory.

Move the accounting of the stack memory from release_task_stack() to
exit_task_stack_account() which then can be invoked from do_exit().
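
With this change the tail of do_exit() looks roughly as follows
(abbreviated call flow, only the stack-related steps shown):

	exit_task_stack_account(tsk);	/* unaccount + memcg uncharge, preemptible */
	check_stack_usage();
	preempt_disable();
	...
	do_task_dead();			/* final schedule(); stack freed later */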

Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
---
include/linux/sched/task_stack.h | 2 ++
kernel/exit.c | 1 +
kernel/fork.c | 35 +++++++++++++++++++++-----------
3 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
index d10150587d819..892562ebbd3aa 100644
--- a/include/linux/sched/task_stack.h
+++ b/include/linux/sched/task_stack.h
@@ -79,6 +79,8 @@ static inline void *try_get_task_stack(struct task_struct *tsk)
static inline void put_task_stack(struct task_struct *tsk) {}
#endif

+void exit_task_stack_account(struct task_struct *tsk);
+
#define task_stack_end_corrupted(task) \
(*(end_of_stack(task)) != STACK_END_MAGIC)

diff --git a/kernel/exit.c b/kernel/exit.c
index b00a25bb4ab93..c303cffe7fdb4 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -845,6 +845,7 @@ void __noreturn do_exit(long code)
put_page(tsk->task_frag.page);

validate_creds_for_do_exit(tsk);
+ exit_task_stack_account(tsk);

check_stack_usage();
preempt_disable();
diff --git a/kernel/fork.c b/kernel/fork.c
index 919bdcf21b8e5..984f69d6f211f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -211,9 +211,8 @@ static int free_vm_stack_cache(unsigned int cpu)
return 0;
}

-static int memcg_charge_kernel_stack(struct task_struct *tsk)
+static int memcg_charge_kernel_stack(struct vm_struct *vm)
{
- struct vm_struct *vm = task_stack_vm_area(tsk);
int i;
int ret;

@@ -239,6 +238,7 @@ static int memcg_charge_kernel_stack(struct task_struct *tsk)

static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
+ struct vm_struct *vm;
void *stack;
int i;

@@ -256,7 +256,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
/* Clear stale pointers from reused stack. */
memset(s->addr, 0, THREAD_SIZE);

- if (memcg_charge_kernel_stack(tsk)) {
+ if (memcg_charge_kernel_stack(s)) {
vfree(s->addr);
return -ENOMEM;
}
@@ -279,7 +279,8 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
if (!stack)
return -ENOMEM;

- if (memcg_charge_kernel_stack(tsk)) {
+ vm = find_vm_area(stack);
+ if (memcg_charge_kernel_stack(vm)) {
vfree(stack);
return -ENOMEM;
}
@@ -288,19 +289,15 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
* free_thread_stack() can be called in interrupt context,
* so cache the vm_struct.
*/
- tsk->stack_vm_area = find_vm_area(stack);
+ tsk->stack_vm_area = vm;
tsk->stack = stack;
return 0;
}

static void free_thread_stack(struct task_struct *tsk)
{
- struct vm_struct *vm = task_stack_vm_area(tsk);
int i;

- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
- memcg_kmem_uncharge_page(vm->pages[i], 0);
-
for (i = 0; i < NR_CACHED_STACKS; i++) {
if (this_cpu_cmpxchg(cached_stacks[i], NULL,
tsk->stack_vm_area) != NULL)
@@ -454,12 +451,25 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
}
}

+void exit_task_stack_account(struct task_struct *tsk)
+{
+ account_kernel_stack(tsk, -1);
+
+ if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+ struct vm_struct *vm;
+ int i;
+
+ vm = task_stack_vm_area(tsk);
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+ memcg_kmem_uncharge_page(vm->pages[i], 0);
+ }
+}
+
static void release_task_stack(struct task_struct *tsk)
{
if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
return; /* Better to leak the stack than to free prematurely */

- account_kernel_stack(tsk, -1);
free_thread_stack(tsk);
}

@@ -918,6 +928,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_THREAD_INFO_IN_TASK
refcount_set(&tsk->stack_refcount, 1);
#endif
+ account_kernel_stack(tsk, 1);

err = scs_prepare(tsk, node);
if (err)
@@ -961,8 +972,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->wake_q.next = NULL;
tsk->worker_private = NULL;

- account_kernel_stack(tsk, 1);
-
kcov_task_init(tsk);
kmap_local_fork(tsk);

@@ -981,6 +990,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
return tsk;

free_stack:
+ exit_task_stack_account(tsk);
free_thread_stack(tsk);
free_tsk:
free_task_struct(tsk);
@@ -2449,6 +2459,7 @@ static __latent_entropy struct task_struct *copy_process(
exit_creds(p);
bad_fork_free:
WRITE_ONCE(p->__state, TASK_DEAD);
+ exit_task_stack_account(p);
put_task_stack(p);
delayed_free_task(p);
fork_out:
--
2.34.1

Subject: [PATCH v2 5/8] kernel/fork: Move memcg_charge_kernel_stack() into CONFIG_VMAP_STACK.

memcg_charge_kernel_stack() is only used in the CONFIG_VMAP_STACK case.

Move memcg_charge_kernel_stack() into the CONFIG_VMAP_STACK block and
invoke it from within alloc_thread_stack_node().

Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
---
kernel/fork.c | 69 +++++++++++++++++++++++++++------------------------
1 file changed, 36 insertions(+), 33 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 546bea2e3b28a..919bdcf21b8e5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -211,6 +211,32 @@ static int free_vm_stack_cache(unsigned int cpu)
return 0;
}

+static int memcg_charge_kernel_stack(struct task_struct *tsk)
+{
+ struct vm_struct *vm = task_stack_vm_area(tsk);
+ int i;
+ int ret;
+
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+ BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+ ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0);
+ if (ret)
+ goto err;
+ }
+ return 0;
+err:
+ /*
+ * If memcg_kmem_charge_page() fails, page's memory cgroup pointer is
+ * NULL, and memcg_kmem_uncharge_page() in free_thread_stack() will
+ * ignore this page.
+ */
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+ memcg_kmem_uncharge_page(vm->pages[i], 0);
+ return ret;
+}
+
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
void *stack;
@@ -230,6 +256,11 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
/* Clear stale pointers from reused stack. */
memset(s->addr, 0, THREAD_SIZE);

+ if (memcg_charge_kernel_stack(tsk)) {
+ vfree(s->addr);
+ return -ENOMEM;
+ }
+
tsk->stack_vm_area = s;
tsk->stack = s->addr;
return 0;
@@ -247,6 +278,11 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
0, node, __builtin_return_address(0));
if (!stack)
return -ENOMEM;
+
+ if (memcg_charge_kernel_stack(tsk)) {
+ vfree(stack);
+ return -ENOMEM;
+ }
/*
* We can't call find_vm_area() in interrupt context, and
* free_thread_stack() can be called in interrupt context,
@@ -418,36 +454,6 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
}
}

-static int memcg_charge_kernel_stack(struct task_struct *tsk)
-{
-#ifdef CONFIG_VMAP_STACK
- struct vm_struct *vm = task_stack_vm_area(tsk);
- int ret;
-
- BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
-
- if (vm) {
- int i;
-
- BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
-
- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
- /*
- * If memcg_kmem_charge_page() fails, page's
- * memory cgroup pointer is NULL, and
- * memcg_kmem_uncharge_page() in free_thread_stack()
- * will ignore this page.
- */
- ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
- 0);
- if (ret)
- return ret;
- }
- }
-#endif
- return 0;
-}
-
static void release_task_stack(struct task_struct *tsk)
{
if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
@@ -909,9 +915,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
if (err)
goto free_tsk;

- if (memcg_charge_kernel_stack(tsk))
- goto free_stack;
-
#ifdef CONFIG_THREAD_INFO_IN_TASK
refcount_set(&tsk->stack_refcount, 1);
#endif
--
2.34.1

Subject: [PATCH v2 1/8] kernel/fork: Redo ifdefs around task stack handling.

The use of ifdef CONFIG_VMAP_STACK is confusing in terms of what is
actually happening and what can happen.
For instance from reading free_thread_stack() it appears that in the
CONFIG_VMAP_STACK case it may receive a non-NULL vm pointer but it may
also be NULL in which case __free_pages() is used to free the stack.
This is however not the case because in the VMAP case a non-NULL pointer
is always returned here.
Since it looks like this might happen, the compiler creates the correct
dead code with the invocation to __free_pages() and everything around
it. Twice.

Add spaces between the ifdef and the identifier to recognize the ifdef
level which is currently in scope.
Add the current identifier as a comment behind #else and #endif.
Move the code within free_thread_stack() and alloc_thread_stack_node()
into the relevant ifdef blocks.
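
The resulting nesting then reads (outline only, function bodies elided;
the CONFIG_ARCH_THREAD_STACK_ALLOCATOR branch is filled in by a later
patch in this series):

#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
# ifdef CONFIG_VMAP_STACK
	/* vmalloc()-backed stacks plus the per-CPU stack cache */
# else /* !CONFIG_VMAP_STACK */
	/* page-allocator-backed stacks */
# endif /* CONFIG_VMAP_STACK */
# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */
	/* kmem_cache-backed stacks */
# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
	/* arch-provided stacks (IA64) */
#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */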

Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
---
kernel/fork.c | 74 +++++++++++++++++++++++++++------------------------
1 file changed, 39 insertions(+), 35 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index d75a528f7b219..f63c0af6002da 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -185,7 +185,7 @@ static inline void free_task_struct(struct task_struct *tsk)
*/
# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)

-#ifdef CONFIG_VMAP_STACK
+# ifdef CONFIG_VMAP_STACK
/*
* vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
* flush. Try to minimize the number of calls by caching stacks.
@@ -210,11 +210,9 @@ static int free_vm_stack_cache(unsigned int cpu)

return 0;
}
-#endif

static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
{
-#ifdef CONFIG_VMAP_STACK
void *stack;
int i;

@@ -258,7 +256,34 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
tsk->stack = stack;
}
return stack;
-#else
+}
+
+static void free_thread_stack(struct task_struct *tsk)
+{
+ struct vm_struct *vm = task_stack_vm_area(tsk);
+ int i;
+
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+ memcg_kmem_uncharge_page(vm->pages[i], 0);
+
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+ if (this_cpu_cmpxchg(cached_stacks[i], NULL,
+ tsk->stack_vm_area) != NULL)
+ continue;
+
+ tsk->stack = NULL;
+ tsk->stack_vm_area = NULL;
+ return;
+ }
+ vfree_atomic(tsk->stack);
+ tsk->stack = NULL;
+ tsk->stack_vm_area = NULL;
+}
+
+# else /* !CONFIG_VMAP_STACK */
+
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
+{
struct page *page = alloc_pages_node(node, THREADINFO_GFP,
THREAD_SIZE_ORDER);

@@ -267,36 +292,17 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
return tsk->stack;
}
return NULL;
-#endif
}

-static inline void free_thread_stack(struct task_struct *tsk)
+static void free_thread_stack(struct task_struct *tsk)
{
-#ifdef CONFIG_VMAP_STACK
- struct vm_struct *vm = task_stack_vm_area(tsk);
-
- if (vm) {
- int i;
-
- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
- memcg_kmem_uncharge_page(vm->pages[i], 0);
-
- for (i = 0; i < NR_CACHED_STACKS; i++) {
- if (this_cpu_cmpxchg(cached_stacks[i],
- NULL, tsk->stack_vm_area) != NULL)
- continue;
-
- return;
- }
-
- vfree_atomic(tsk->stack);
- return;
- }
-#endif
-
__free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
+ tsk->stack = NULL;
}
-# else
+
+# endif /* CONFIG_VMAP_STACK */
+# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */
+
static struct kmem_cache *thread_stack_cache;

static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
@@ -312,6 +318,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
static void free_thread_stack(struct task_struct *tsk)
{
kmem_cache_free(thread_stack_cache, tsk->stack);
+ tsk->stack = NULL;
}

void thread_stack_cache_init(void)
@@ -321,8 +328,9 @@ void thread_stack_cache_init(void)
THREAD_SIZE, NULL);
BUG_ON(thread_stack_cache == NULL);
}
-# endif
-#endif
+
+# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
+#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */

/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;
@@ -432,10 +440,6 @@ static void release_task_stack(struct task_struct *tsk)

account_kernel_stack(tsk, -1);
free_thread_stack(tsk);
- tsk->stack = NULL;
-#ifdef CONFIG_VMAP_STACK
- tsk->stack_vm_area = NULL;
-#endif
}

#ifdef CONFIG_THREAD_INFO_IN_TASK
--
2.34.1

2022-02-21 19:21:00

by Andy Lutomirski

Subject: Re: [PATCH v2 0/8] kernel/fork: Move thread stack freeing out of the scheduler path.

On 2/17/22 02:23, Sebastian Andrzej Siewior wrote:
> This is a follow-up to the patch
> sched: Delay task stack freeing on RT
> https://lkml.kernel.org/r/[email protected]
>
> It addresses the review feedback:
> - Decouple stack accounting from its free invocation. The accounting
> happens in do_exit(); the final free call happens later.
>
> - Add put_task_stack_sched() to finish_task_switch(). Here the VMAP
> stack is cached only. If that fails, or in the !VMAP case, the final
> free happens in delayed_put_task_struct(). This is also an opportunity
> to cache the stack.

My first two tries to apply this series to something failed. What's it
based on?

The rest of my review will be based on diffs, not real code, since I
failed to apply it.

2022-02-22 01:16:16

by Andy Lutomirski

Subject: Re: [PATCH v2 0/8] kernel/fork: Move thread stack freeing out of the scheduler path.

On 2/17/22 02:23, Sebastian Andrzej Siewior wrote:
> This is a follow-up to the patch
> sched: Delay task stack freeing on RT
> https://lkml.kernel.org/r/[email protected]
>
> It addresses the review feedback:
> - Decouple stack accounting from its free invocation. The accounting
> happens in do_exit(); the final free call happens later.
>
> - Add put_task_stack_sched() to finish_task_switch(). Here the VMAP
> stack is cached only. If that fails, or in the !VMAP case, the final
> free happens in delayed_put_task_struct(). This is also an opportunity
> to cache the stack.
>
> Changes since v1:
> - Drop the bit marking and use an additional RCU head to free the stack in
> case it can't be cached. Suggested by Andy Lutomirski.

Acked-by: Andy Lutomirski <[email protected]>

This version is much nicer. Thanks!

--Andy

2022-02-22 22:18:47

by tip-bot2 for Sebastian Andrzej Siewior

Subject: [tip: core/core] fork: Don't assign the stack pointer in dup_task_struct()

The following commit has been merged into the core/core branch of tip:

Commit-ID: 7865aba3ade4cf30f0ac08e015550084a50d9afb
Gitweb: https://git.kernel.org/tip/7865aba3ade4cf30f0ac08e015550084a50d9afb
Author: Sebastian Andrzej Siewior <[email protected]>
AuthorDate: Thu, 17 Feb 2022 11:24:02 +01:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Tue, 22 Feb 2022 22:25:01 +01:00

fork: Don't assign the stack pointer in dup_task_struct()

All four versions of alloc_thread_stack_node() now assign
task_struct::stack if the allocation was successful.

Let alloc_thread_stack_node() return an error code instead of the stack
pointer and remove the stack assignment in dup_task_struct().

Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Acked-by: Andy Lutomirski <[email protected]>
Link: https://lore.kernel.org/r/[email protected]

---
kernel/fork.c | 47 ++++++++++++++++-------------------------------
1 file changed, 16 insertions(+), 31 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 7b70c47..875bd43 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -211,7 +211,7 @@ static int free_vm_stack_cache(unsigned int cpu)
return 0;
}

-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
void *stack;
int i;
@@ -232,7 +232,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)

tsk->stack_vm_area = s;
tsk->stack = s->addr;
- return s->addr;
+ return 0;
}

/*
@@ -245,17 +245,16 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
THREADINFO_GFP & ~__GFP_ACCOUNT,
PAGE_KERNEL,
0, node, __builtin_return_address(0));
-
+ if (!stack)
+ return -ENOMEM;
/*
* We can't call find_vm_area() in interrupt context, and
* free_thread_stack() can be called in interrupt context,
* so cache the vm_struct.
*/
- if (stack) {
- tsk->stack_vm_area = find_vm_area(stack);
- tsk->stack = stack;
- }
- return stack;
+ tsk->stack_vm_area = find_vm_area(stack);
+ tsk->stack = stack;
+ return 0;
}

static void free_thread_stack(struct task_struct *tsk)
@@ -282,16 +281,16 @@ static void free_thread_stack(struct task_struct *tsk)

# else /* !CONFIG_VMAP_STACK */

-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
struct page *page = alloc_pages_node(node, THREADINFO_GFP,
THREAD_SIZE_ORDER);

if (likely(page)) {
tsk->stack = kasan_reset_tag(page_address(page));
- return tsk->stack;
+ return 0;
}
- return NULL;
+ return -ENOMEM;
}

static void free_thread_stack(struct task_struct *tsk)
@@ -305,14 +304,13 @@ static void free_thread_stack(struct task_struct *tsk)

static struct kmem_cache *thread_stack_cache;

-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
- int node)
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
unsigned long *stack;
stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
stack = kasan_reset_tag(stack);
tsk->stack = stack;
- return stack;
+ return stack ? 0 : -ENOMEM;
}

static void free_thread_stack(struct task_struct *tsk)
@@ -332,13 +330,13 @@ void thread_stack_cache_init(void)
# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */

-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
unsigned long *stack;

stack = arch_alloc_thread_stack_node(tsk, node);
tsk->stack = stack;
- return stack;
+ return stack ? 0 : -ENOMEM;
}

static void free_thread_stack(struct task_struct *tsk)
@@ -895,8 +893,6 @@ void set_task_stack_end_magic(struct task_struct *tsk)
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
struct task_struct *tsk;
- unsigned long *stack;
- struct vm_struct *stack_vm_area __maybe_unused;
int err;

if (node == NUMA_NO_NODE)
@@ -909,24 +905,13 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
if (err)
goto free_tsk;

- stack = alloc_thread_stack_node(tsk, node);
- if (!stack)
+ err = alloc_thread_stack_node(tsk, node);
+ if (err)
goto free_tsk;

if (memcg_charge_kernel_stack(tsk))
goto free_stack;

- stack_vm_area = task_stack_vm_area(tsk);
-
- /*
- * arch_dup_task_struct() clobbers the stack-related fields. Make
- * sure they're properly initialized before using any stack-related
- * functions again.
- */
- tsk->stack = stack;
-#ifdef CONFIG_VMAP_STACK
- tsk->stack_vm_area = stack_vm_area;
-#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
refcount_set(&tsk->stack_refcount, 1);
#endif

2022-02-22 23:55:57

by tip-bot2 for Sebastian Andrzej Siewior

Subject: [tip: core/core] fork, IA64: Provide alloc_thread_stack_node() for IA64

The following commit has been merged into the core/core branch of tip:

Commit-ID: 2bb0529c0bc0698f3baf3e88ffd61a18eef252a7
Gitweb: https://git.kernel.org/tip/2bb0529c0bc0698f3baf3e88ffd61a18eef252a7
Author: Sebastian Andrzej Siewior <[email protected]>
AuthorDate: Thu, 17 Feb 2022 11:24:01 +01:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Tue, 22 Feb 2022 22:25:01 +01:00

fork, IA64: Provide alloc_thread_stack_node() for IA64

Provide a generic alloc_thread_stack_node() for IA64 and
CONFIG_ARCH_THREAD_STACK_ALLOCATOR which returns the stack pointer and sets
task_struct::stack so it behaves exactly like the other implementations.

Rename IA64's alloc_thread_stack_node() and add the generic version to the
fork code so it is in one place _and_ to drastically lower the chances of
fat fingering the IA64 code. Do the same for free_thread_stack().

Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Acked-by: Andy Lutomirski <[email protected]>
Link: https://lore.kernel.org/r/[email protected]

---
arch/ia64/include/asm/thread_info.h | 6 +++---
kernel/fork.c | 17 +++++++++++++++++
2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h
index 51d20cb..1684716 100644
--- a/arch/ia64/include/asm/thread_info.h
+++ b/arch/ia64/include/asm/thread_info.h
@@ -55,15 +55,15 @@ struct thread_info {
#ifndef ASM_OFFSETS_C
/* how to get the thread information struct from C */
#define current_thread_info() ((struct thread_info *) ((char *) current + IA64_TASK_SIZE))
-#define alloc_thread_stack_node(tsk, node) \
+#define arch_alloc_thread_stack_node(tsk, node) \
((unsigned long *) ((char *) (tsk) + IA64_TASK_SIZE))
#define task_thread_info(tsk) ((struct thread_info *) ((char *) (tsk) + IA64_TASK_SIZE))
#else
#define current_thread_info() ((struct thread_info *) 0)
-#define alloc_thread_stack_node(tsk, node) ((unsigned long *) 0)
+#define arch_alloc_thread_stack_node(tsk, node) ((unsigned long *) 0)
#define task_thread_info(tsk) ((struct thread_info *) 0)
#endif
-#define free_thread_stack(tsk) /* nothing */
+#define arch_free_thread_stack(tsk) /* nothing */
#define task_stack_page(tsk) ((void *)(tsk))

#define __HAVE_THREAD_FUNCTIONS
diff --git a/kernel/fork.c b/kernel/fork.c
index 30c01ce..7b70c47 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -330,6 +330,23 @@ void thread_stack_cache_init(void)
}

# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
+#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
+
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
+{
+ unsigned long *stack;
+
+ stack = arch_alloc_thread_stack_node(tsk, node);
+ tsk->stack = stack;
+ return stack;
+}
+
+static void free_thread_stack(struct task_struct *tsk)
+{
+ arch_free_thread_stack(tsk);
+ tsk->stack = NULL;
+}
+
#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */

/* SLAB cache for signal_struct structures (tsk->signal) */

2022-02-23 00:53:29

by tip-bot2 for Sebastian Andrzej Siewior

Subject: [tip: core/core] fork: Move task stack accounting to do_exit()

The following commit has been merged into the core/core branch of tip:

Commit-ID: 1a03d3f13ffe5dd24142d6db629e72c11b704d99
Gitweb: https://git.kernel.org/tip/1a03d3f13ffe5dd24142d6db629e72c11b704d99
Author: Sebastian Andrzej Siewior <[email protected]>
AuthorDate: Thu, 17 Feb 2022 11:24:04 +01:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Tue, 22 Feb 2022 22:25:02 +01:00

fork: Move task stack accounting to do_exit()

There is no need to perform the stack accounting of the outgoing task in
its final schedule() invocation which happens with preemption disabled.
The task is leaving, the resources will be freed and the accounting can
happen in do_exit() before the actual schedule invocation which
frees the stack memory.

Move the accounting of the stack memory from release_task_stack() to
exit_task_stack_account() which then can be invoked from do_exit().

Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Acked-by: Andy Lutomirski <[email protected]>
Link: https://lore.kernel.org/r/[email protected]

---
include/linux/sched/task_stack.h | 2 ++-
kernel/exit.c | 1 +-
kernel/fork.c | 35 ++++++++++++++++++++-----------
3 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
index d101505..892562e 100644
--- a/include/linux/sched/task_stack.h
+++ b/include/linux/sched/task_stack.h
@@ -79,6 +79,8 @@ static inline void *try_get_task_stack(struct task_struct *tsk)
static inline void put_task_stack(struct task_struct *tsk) {}
#endif

+void exit_task_stack_account(struct task_struct *tsk);
+
#define task_stack_end_corrupted(task) \
(*(end_of_stack(task)) != STACK_END_MAGIC)

diff --git a/kernel/exit.c b/kernel/exit.c
index b00a25b..c303cff 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -845,6 +845,7 @@ void __noreturn do_exit(long code)
put_page(tsk->task_frag.page);

validate_creds_for_do_exit(tsk);
+ exit_task_stack_account(tsk);

check_stack_usage();
preempt_disable();
diff --git a/kernel/fork.c b/kernel/fork.c
index ac63e7f..2582812 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -211,9 +211,8 @@ static int free_vm_stack_cache(unsigned int cpu)
return 0;
}

-static int memcg_charge_kernel_stack(struct task_struct *tsk)
+static int memcg_charge_kernel_stack(struct vm_struct *vm)
{
- struct vm_struct *vm = task_stack_vm_area(tsk);
int i;
int ret;

@@ -239,6 +238,7 @@ err:

static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
+ struct vm_struct *vm;
void *stack;
int i;

@@ -256,7 +256,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
/* Clear stale pointers from reused stack. */
memset(s->addr, 0, THREAD_SIZE);

- if (memcg_charge_kernel_stack(tsk)) {
+ if (memcg_charge_kernel_stack(s)) {
vfree(s->addr);
return -ENOMEM;
}
@@ -279,7 +279,8 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
if (!stack)
return -ENOMEM;

- if (memcg_charge_kernel_stack(tsk)) {
+ vm = find_vm_area(stack);
+ if (memcg_charge_kernel_stack(vm)) {
vfree(stack);
return -ENOMEM;
}
@@ -288,19 +289,15 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
* free_thread_stack() can be called in interrupt context,
* so cache the vm_struct.
*/
- tsk->stack_vm_area = find_vm_area(stack);
+ tsk->stack_vm_area = vm;
tsk->stack = stack;
return 0;
}

static void free_thread_stack(struct task_struct *tsk)
{
- struct vm_struct *vm = task_stack_vm_area(tsk);
int i;

- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
- memcg_kmem_uncharge_page(vm->pages[i], 0);
-
for (i = 0; i < NR_CACHED_STACKS; i++) {
if (this_cpu_cmpxchg(cached_stacks[i], NULL,
tsk->stack_vm_area) != NULL)
@@ -454,12 +451,25 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
}
}

+void exit_task_stack_account(struct task_struct *tsk)
+{
+ account_kernel_stack(tsk, -1);
+
+ if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+ struct vm_struct *vm;
+ int i;
+
+ vm = task_stack_vm_area(tsk);
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+ memcg_kmem_uncharge_page(vm->pages[i], 0);
+ }
+}
+
static void release_task_stack(struct task_struct *tsk)
{
if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
return; /* Better to leak the stack than to free prematurely */

- account_kernel_stack(tsk, -1);
free_thread_stack(tsk);
}

@@ -918,6 +928,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_THREAD_INFO_IN_TASK
refcount_set(&tsk->stack_refcount, 1);
#endif
+ account_kernel_stack(tsk, 1);

err = scs_prepare(tsk, node);
if (err)
@@ -961,8 +972,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->wake_q.next = NULL;
tsk->worker_private = NULL;

- account_kernel_stack(tsk, 1);
-
kcov_task_init(tsk);
kmap_local_fork(tsk);

@@ -981,6 +990,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
return tsk;

free_stack:
+ exit_task_stack_account(tsk);
free_thread_stack(tsk);
free_tsk:
free_task_struct(tsk);
@@ -2459,6 +2469,7 @@ bad_fork_cleanup_count:
exit_creds(p);
bad_fork_free:
WRITE_ONCE(p->__state, TASK_DEAD);
+ exit_task_stack_account(p);
put_task_stack(p);
delayed_free_task(p);
fork_out:

2022-02-23 01:43:18

by tip-bot2 for Sebastian Andrzej Siewior

Subject: [tip: core/core] fork: Only cache the VMAP stack in finish_task_switch()

The following commit has been merged into the core/core branch of tip:

Commit-ID: e540bf3162e822d7a1f07e69e3bb1b4f925ca368
Gitweb: https://git.kernel.org/tip/e540bf3162e822d7a1f07e69e3bb1b4f925ca368
Author: Sebastian Andrzej Siewior <[email protected]>
AuthorDate: Thu, 17 Feb 2022 11:24:05 +01:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Tue, 22 Feb 2022 22:25:02 +01:00

fork: Only cache the VMAP stack in finish_task_switch()

The task stack could be deallocated later, but for fork()/exec() kind of
workloads (say a shell script executing several commands) it is important
that the stack is released in finish_task_switch() so that in the VMAP_STACK
case it can be cached and reused in the new task.

For PREEMPT_RT it would be good if the wake-up in vfree_atomic() could
be avoided in the scheduling path. Far worse are the other
free_thread_stack() implementations which invoke __free_pages()/
kmem_cache_free() with preemption disabled.

Cache the stack in free_thread_stack() in the VMAP_STACK case and
RCU-delay the free path otherwise. Free the stack in the RCU callback.
In the VMAP_STACK case this is another opportunity to fill the cache.

Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Acked-by: Andy Lutomirski <[email protected]>
Link: https://lore.kernel.org/r/[email protected]

---
kernel/fork.c | 76 +++++++++++++++++++++++++++++++++++++++++---------
1 file changed, 63 insertions(+), 13 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 2582812..177bc64 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -193,6 +193,41 @@ static inline void free_task_struct(struct task_struct *tsk)
#define NR_CACHED_STACKS 2
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);

+struct vm_stack {
+ struct rcu_head rcu;
+ struct vm_struct *stack_vm_area;
+};
+
+static bool try_release_thread_stack_to_cache(struct vm_struct *vm)
+{
+ unsigned int i;
+
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+ if (this_cpu_cmpxchg(cached_stacks[i], NULL, vm) != NULL)
+ continue;
+ return true;
+ }
+ return false;
+}
+
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+ struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu);
+
+ if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area))
+ return;
+
+ vfree(vm_stack);
+}
+
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct vm_stack *vm_stack = tsk->stack;
+
+ vm_stack->stack_vm_area = tsk->stack_vm_area;
+ call_rcu(&vm_stack->rcu, thread_stack_free_rcu);
+}
+
static int free_vm_stack_cache(unsigned int cpu)
{
struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
@@ -296,24 +331,27 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)

static void free_thread_stack(struct task_struct *tsk)
{
- int i;
+ if (!try_release_thread_stack_to_cache(tsk->stack_vm_area))
+ thread_stack_delayed_free(tsk);

- for (i = 0; i < NR_CACHED_STACKS; i++) {
- if (this_cpu_cmpxchg(cached_stacks[i], NULL,
- tsk->stack_vm_area) != NULL)
- continue;
-
- tsk->stack = NULL;
- tsk->stack_vm_area = NULL;
- return;
- }
- vfree_atomic(tsk->stack);
tsk->stack = NULL;
tsk->stack_vm_area = NULL;
}

# else /* !CONFIG_VMAP_STACK */

+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+ __free_pages(virt_to_page(rh), THREAD_SIZE_ORDER);
+}
+
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct rcu_head *rh = tsk->stack;
+
+ call_rcu(rh, thread_stack_free_rcu);
+}
+
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
struct page *page = alloc_pages_node(node, THREADINFO_GFP,
@@ -328,7 +366,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)

static void free_thread_stack(struct task_struct *tsk)
{
- __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
+ thread_stack_delayed_free(tsk);
tsk->stack = NULL;
}

@@ -337,6 +375,18 @@ static void free_thread_stack(struct task_struct *tsk)

static struct kmem_cache *thread_stack_cache;

+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+ kmem_cache_free(thread_stack_cache, rh);
+}
+
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct rcu_head *rh = tsk->stack;
+
+ call_rcu(rh, thread_stack_free_rcu);
+}
+
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
unsigned long *stack;
@@ -348,7 +398,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)

static void free_thread_stack(struct task_struct *tsk)
{
- kmem_cache_free(thread_stack_cache, tsk->stack);
+ thread_stack_delayed_free(tsk);
tsk->stack = NULL;
}

2022-02-23 02:32:15

by tip-bot2 for Sebastian Andrzej Siewior

Subject: [tip: core/core] fork: Move memcg_charge_kernel_stack() into CONFIG_VMAP_STACK

The following commit has been merged into the core/core branch of tip:

Commit-ID: f1c1a9ee00e4c53c9ccc03ec1aff4792948a25eb
Gitweb: https://git.kernel.org/tip/f1c1a9ee00e4c53c9ccc03ec1aff4792948a25eb
Author: Sebastian Andrzej Siewior <[email protected]>
AuthorDate: Thu, 17 Feb 2022 11:24:03 +01:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Tue, 22 Feb 2022 22:25:01 +01:00

fork: Move memcg_charge_kernel_stack() into CONFIG_VMAP_STACK

memcg_charge_kernel_stack() is only used in the CONFIG_VMAP_STACK case.

Move memcg_charge_kernel_stack() into the CONFIG_VMAP_STACK block and
invoke it from within alloc_thread_stack_node().

Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Acked-by: Andy Lutomirski <[email protected]>
Link: https://lore.kernel.org/r/[email protected]

---
kernel/fork.c | 69 ++++++++++++++++++++++++++------------------------
1 file changed, 36 insertions(+), 33 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 875bd43..ac63e7f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -211,6 +211,32 @@ static int free_vm_stack_cache(unsigned int cpu)
return 0;
}

+static int memcg_charge_kernel_stack(struct task_struct *tsk)
+{
+ struct vm_struct *vm = task_stack_vm_area(tsk);
+ int i;
+ int ret;
+
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+ BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+ ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0);
+ if (ret)
+ goto err;
+ }
+ return 0;
+err:
+ /*
+ * If memcg_kmem_charge_page() fails, page's memory cgroup pointer is
+ * NULL, and memcg_kmem_uncharge_page() in free_thread_stack() will
+ * ignore this page.
+ */
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+ memcg_kmem_uncharge_page(vm->pages[i], 0);
+ return ret;
+}
+
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
void *stack;
@@ -230,6 +256,11 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
/* Clear stale pointers from reused stack. */
memset(s->addr, 0, THREAD_SIZE);

+ if (memcg_charge_kernel_stack(tsk)) {
+ vfree(s->addr);
+ return -ENOMEM;
+ }
+
tsk->stack_vm_area = s;
tsk->stack = s->addr;
return 0;
@@ -247,6 +278,11 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
0, node, __builtin_return_address(0));
if (!stack)
return -ENOMEM;
+
+ if (memcg_charge_kernel_stack(tsk)) {
+ vfree(stack);
+ return -ENOMEM;
+ }
/*
* We can't call find_vm_area() in interrupt context, and
* free_thread_stack() can be called in interrupt context,
@@ -418,36 +454,6 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
}
}

-static int memcg_charge_kernel_stack(struct task_struct *tsk)
-{
-#ifdef CONFIG_VMAP_STACK
- struct vm_struct *vm = task_stack_vm_area(tsk);
- int ret;
-
- BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
-
- if (vm) {
- int i;
-
- BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
-
- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
- /*
- * If memcg_kmem_charge_page() fails, page's
- * memory cgroup pointer is NULL, and
- * memcg_kmem_uncharge_page() in free_thread_stack()
- * will ignore this page.
- */
- ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
- 0);
- if (ret)
- return ret;
- }
- }
-#endif
- return 0;
-}
-
static void release_task_stack(struct task_struct *tsk)
{
if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
@@ -909,9 +915,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
if (err)
goto free_tsk;

- if (memcg_charge_kernel_stack(tsk))
- goto free_stack;
-
#ifdef CONFIG_THREAD_INFO_IN_TASK
refcount_set(&tsk->stack_refcount, 1);
#endif

2022-02-23 06:37:04

by tip-bot2 for Sebastian Andrzej Siewior

Subject: [tip: core/core] fork: Redo ifdefs around task stack handling

The following commit has been merged into the core/core branch of tip:

Commit-ID: be9a2277cafd318976d59c41a7f45a934ec43b26
Gitweb: https://git.kernel.org/tip/be9a2277cafd318976d59c41a7f45a934ec43b26
Author: Sebastian Andrzej Siewior <[email protected]>
AuthorDate: Thu, 17 Feb 2022 11:23:59 +01:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Tue, 22 Feb 2022 22:25:01 +01:00

fork: Redo ifdefs around task stack handling

The use of ifdef CONFIG_VMAP_STACK is confusing in terms of what is
actually happening and what can happen.

For instance from reading free_thread_stack() it appears that in the
CONFIG_VMAP_STACK case it may receive a non-NULL vm pointer but it may also
be NULL in which case __free_pages() is used to free the stack. This is
however not the case because in the VMAP case a non-NULL pointer is always
returned here. Since it looks like this might happen, the compiler creates
the correct dead code with the invocation to __free_pages() and everything
around it. Twice.

Add spaces between the ifdef and the identifier to recognize the ifdef
level which is currently in scope.

Add the current identifier as a comment behind #else and #endif.
Move the code within free_thread_stack() and alloc_thread_stack_node()
into the relevant ifdef blocks.

Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Acked-by: Andy Lutomirski <[email protected]>
Link: https://lore.kernel.org/r/[email protected]

---
kernel/fork.c | 74 ++++++++++++++++++++++++++------------------------
1 file changed, 39 insertions(+), 35 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index a024bf6..f5cc101 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -185,7 +185,7 @@ static inline void free_task_struct(struct task_struct *tsk)
*/
# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)

-#ifdef CONFIG_VMAP_STACK
+# ifdef CONFIG_VMAP_STACK
/*
* vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
* flush. Try to minimize the number of calls by caching stacks.
@@ -210,11 +210,9 @@ static int free_vm_stack_cache(unsigned int cpu)

return 0;
}
-#endif

static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
{
-#ifdef CONFIG_VMAP_STACK
void *stack;
int i;

@@ -258,45 +256,53 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
tsk->stack = stack;
}
return stack;
-#else
- struct page *page = alloc_pages_node(node, THREADINFO_GFP,
- THREAD_SIZE_ORDER);
-
- if (likely(page)) {
- tsk->stack = kasan_reset_tag(page_address(page));
- return tsk->stack;
- }
- return NULL;
-#endif
}

-static inline void free_thread_stack(struct task_struct *tsk)
+static void free_thread_stack(struct task_struct *tsk)
{
-#ifdef CONFIG_VMAP_STACK
struct vm_struct *vm = task_stack_vm_area(tsk);
+ int i;

- if (vm) {
- int i;
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+ memcg_kmem_uncharge_page(vm->pages[i], 0);

- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
- memcg_kmem_uncharge_page(vm->pages[i], 0);
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+ if (this_cpu_cmpxchg(cached_stacks[i], NULL,
+ tsk->stack_vm_area) != NULL)
+ continue;

- for (i = 0; i < NR_CACHED_STACKS; i++) {
- if (this_cpu_cmpxchg(cached_stacks[i],
- NULL, tsk->stack_vm_area) != NULL)
- continue;
+ tsk->stack = NULL;
+ tsk->stack_vm_area = NULL;
+ return;
+ }
+ vfree_atomic(tsk->stack);
+ tsk->stack = NULL;
+ tsk->stack_vm_area = NULL;
+}

- return;
- }
+# else /* !CONFIG_VMAP_STACK */

- vfree_atomic(tsk->stack);
- return;
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
+{
+ struct page *page = alloc_pages_node(node, THREADINFO_GFP,
+ THREAD_SIZE_ORDER);
+
+ if (likely(page)) {
+ tsk->stack = kasan_reset_tag(page_address(page));
+ return tsk->stack;
}
-#endif
+ return NULL;
+}

+static void free_thread_stack(struct task_struct *tsk)
+{
__free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
+ tsk->stack = NULL;
}
-# else
+
+# endif /* CONFIG_VMAP_STACK */
+# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */
+
static struct kmem_cache *thread_stack_cache;

static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
@@ -312,6 +318,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
static void free_thread_stack(struct task_struct *tsk)
{
kmem_cache_free(thread_stack_cache, tsk->stack);
+ tsk->stack = NULL;
}

void thread_stack_cache_init(void)
@@ -321,8 +328,9 @@ void thread_stack_cache_init(void)
THREAD_SIZE, NULL);
BUG_ON(thread_stack_cache == NULL);
}
-# endif
-#endif
+
+# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
+#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */

/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;
@@ -432,10 +440,6 @@ static void release_task_stack(struct task_struct *tsk)

account_kernel_stack(tsk, -1);
free_thread_stack(tsk);
- tsk->stack = NULL;
-#ifdef CONFIG_VMAP_STACK
- tsk->stack_vm_area = NULL;
-#endif
}

#ifdef CONFIG_THREAD_INFO_IN_TASK