2022-09-22 11:10:03

by Mathieu Desnoyers

[permalink] [raw]
Subject: [PATCH v4 03/25] rseq: Extend struct rseq with numa node id

Adding the NUMA node id to struct rseq is a straightforward thing to do,
and a good way to figure out if anything in the user-space ecosystem
prevents extending struct rseq.

This NUMA node id field allows memory allocators such as tcmalloc to
take advantage of fast access to the current NUMA node id to perform
NUMA-aware memory allocation.

It can also be useful for implementing fast-paths for NUMA-aware
user-space mutexes.

It also allows implementing getcpu(2) purely in user-space.

Signed-off-by: Mathieu Desnoyers <[email protected]>
---
include/trace/events/rseq.h | 4 +++-
include/uapi/linux/rseq.h | 8 ++++++++
kernel/rseq.c | 19 +++++++++++++------
3 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/include/trace/events/rseq.h b/include/trace/events/rseq.h
index a04a64bc1a00..6bd442697354 100644
--- a/include/trace/events/rseq.h
+++ b/include/trace/events/rseq.h
@@ -16,13 +16,15 @@ TRACE_EVENT(rseq_update,

TP_STRUCT__entry(
__field(s32, cpu_id)
+ __field(s32, node_id)
),

TP_fast_assign(
__entry->cpu_id = raw_smp_processor_id();
+ __entry->node_id = cpu_to_node(raw_smp_processor_id());
),

- TP_printk("cpu_id=%d", __entry->cpu_id)
+ TP_printk("cpu_id=%d node_id=%d", __entry->cpu_id, __entry->node_id)
);

TRACE_EVENT(rseq_ip_fixup,
diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
index 05d3c4cdeb40..1cb90a435c5c 100644
--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -131,6 +131,14 @@ struct rseq {
*/
__u32 flags;

+ /*
+ * Restartable sequences node_id field. Updated by the kernel. Read by
+ * user-space with single-copy atomicity semantics. This field should
+ * only be read by the thread which registered this data structure.
+ * Aligned on 32-bit. Contains the current NUMA node ID.
+ */
+ __u32 node_id;
+
/*
* Flexible array member at end of structure, after last feature field.
*/
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 46dc5c2ce2b7..cb7d8a5afc82 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -84,15 +84,17 @@
* F1. <failure>
*/

-static int rseq_update_cpu_id(struct task_struct *t)
+static int rseq_update_cpu_node_id(struct task_struct *t)
{
- u32 cpu_id = raw_smp_processor_id();
struct rseq __user *rseq = t->rseq;
+ u32 cpu_id = raw_smp_processor_id();
+ u32 node_id = cpu_to_node(cpu_id);

if (!user_write_access_begin(rseq, t->rseq_len))
goto efault;
unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end);
unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end);
+ unsafe_put_user(node_id, &rseq->node_id, efault_end);
/*
* Additional feature fields added after ORIG_RSEQ_SIZE
* need to be conditionally updated only if
@@ -108,9 +110,9 @@ static int rseq_update_cpu_id(struct task_struct *t)
return -EFAULT;
}

-static int rseq_reset_rseq_cpu_id(struct task_struct *t)
+static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
{
- u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
+ u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0;

/*
* Reset cpu_id_start to its initial state (0).
@@ -124,6 +126,11 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t)
*/
if (put_user(cpu_id, &t->rseq->cpu_id))
return -EFAULT;
+ /*
+ * Reset node_id to its initial state (0).
+ */
+ if (put_user(node_id, &t->rseq->node_id))
+ return -EFAULT;
/*
* Additional feature fields added after ORIG_RSEQ_SIZE
* need to be conditionally reset only if
@@ -306,7 +313,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
if (unlikely(ret < 0))
goto error;
}
- if (unlikely(rseq_update_cpu_id(t)))
+ if (unlikely(rseq_update_cpu_node_id(t)))
goto error;
return;

@@ -353,7 +360,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
return -EINVAL;
if (current->rseq_sig != sig)
return -EPERM;
- ret = rseq_reset_rseq_cpu_id(current);
+ ret = rseq_reset_rseq_cpu_node_id(current);
if (ret)
return ret;
current->rseq = NULL;
--
2.25.1


2022-09-23 11:23:13

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v4 03/25] rseq: Extend struct rseq with numa node id

On Thu, Sep 22, 2022 at 06:59:18AM -0400, Mathieu Desnoyers wrote:
> diff --git a/include/trace/events/rseq.h b/include/trace/events/rseq.h
> index a04a64bc1a00..6bd442697354 100644
> --- a/include/trace/events/rseq.h
> +++ b/include/trace/events/rseq.h
> @@ -16,13 +16,15 @@ TRACE_EVENT(rseq_update,
>
> TP_STRUCT__entry(
> __field(s32, cpu_id)
> + __field(s32, node_id)
> ),
>
> TP_fast_assign(
> __entry->cpu_id = raw_smp_processor_id();
> + __entry->node_id = cpu_to_node(raw_smp_processor_id());

Very minor nit: perhaps:

_entry->node_id = cpu_to_node(__entry->cpu_id);

just in case it does a reload and finds a different value.

> ),

2022-09-23 13:08:23

by Mathieu Desnoyers

[permalink] [raw]
Subject: Re: [PATCH v4 03/25] rseq: Extend struct rseq with numa node id

On 2022-09-23 07:13, Peter Zijlstra wrote:
> On Thu, Sep 22, 2022 at 06:59:18AM -0400, Mathieu Desnoyers wrote:
>> diff --git a/include/trace/events/rseq.h b/include/trace/events/rseq.h
>> index a04a64bc1a00..6bd442697354 100644
>> --- a/include/trace/events/rseq.h
>> +++ b/include/trace/events/rseq.h
>> @@ -16,13 +16,15 @@ TRACE_EVENT(rseq_update,
>>
>> TP_STRUCT__entry(
>> __field(s32, cpu_id)
>> + __field(s32, node_id)
>> ),
>>
>> TP_fast_assign(
>> __entry->cpu_id = raw_smp_processor_id();
>> + __entry->node_id = cpu_to_node(raw_smp_processor_id());
>
> Very minor nit: perhaps:
>
> _entry->node_id = cpu_to_node(__entry->cpu_id);
>
> just in case it does a reload and finds a different value.

I agree with your proposed change, but I don't think it will have any
effect, nor that it matters, for the following reasons:

1) Tracepoint probes are executed with preemption disabled, therefore
preventing preemption and migration within the probe callback. So
reloading the processor ID should not matter.

2) trace_rseq_update is done in the __rseq_handle_notify_resume() called
from the exit to usermode loop. If a preemption/migration happens at any
point in this code called from the exit to usermode loop, the loop will
observe that the thread flags have been set again, and iterate on the
loop once more before going back to userspace. So indeed, even if we
*could* observe inconsistent cpu_id vs node_id in the trace, it would
not matter because it would never be observable by user-space.

But nevertheless, I agree with the change you propose, just not the
justification. :)

Thanks,

Mathieu


>
>> ),


--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com

2022-09-23 13:51:03

by Mathieu Desnoyers

[permalink] [raw]
Subject: [PATCH v4.1 03/25 1/1] rseq: Extend struct rseq with numa node id

Adding the NUMA node id to struct rseq is a straightforward thing to do,
and a good way to figure out if anything in the user-space ecosystem
prevents extending struct rseq.

This NUMA node id field allows memory allocators such as tcmalloc to
take advantage of fast access to the current NUMA node id to perform
NUMA-aware memory allocation.

It can also be useful for implementing fast-paths for NUMA-aware
user-space mutexes.

It also allows implementing getcpu(2) purely in user-space.

Signed-off-by: Mathieu Desnoyers <[email protected]>
---
Changes since v4:
- Use __entry->cpu_id as argument for cpu_to_node() in the rseq_update
tracepoint.
---
include/trace/events/rseq.h | 4 +++-
include/uapi/linux/rseq.h | 8 ++++++++
kernel/rseq.c | 19 +++++++++++++------
3 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/include/trace/events/rseq.h b/include/trace/events/rseq.h
index a04a64bc1a00..dde7a359b4ef 100644
--- a/include/trace/events/rseq.h
+++ b/include/trace/events/rseq.h
@@ -16,13 +16,15 @@ TRACE_EVENT(rseq_update,

TP_STRUCT__entry(
__field(s32, cpu_id)
+ __field(s32, node_id)
),

TP_fast_assign(
__entry->cpu_id = raw_smp_processor_id();
+ __entry->node_id = cpu_to_node(__entry->cpu_id);
),

- TP_printk("cpu_id=%d", __entry->cpu_id)
+ TP_printk("cpu_id=%d node_id=%d", __entry->cpu_id, __entry->node_id)
);

TRACE_EVENT(rseq_ip_fixup,
diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
index 05d3c4cdeb40..1cb90a435c5c 100644
--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -131,6 +131,14 @@ struct rseq {
*/
__u32 flags;

+ /*
+ * Restartable sequences node_id field. Updated by the kernel. Read by
+ * user-space with single-copy atomicity semantics. This field should
+ * only be read by the thread which registered this data structure.
+ * Aligned on 32-bit. Contains the current NUMA node ID.
+ */
+ __u32 node_id;
+
/*
* Flexible array member at end of structure, after last feature field.
*/
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 46dc5c2ce2b7..cb7d8a5afc82 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -84,15 +84,17 @@
* F1. <failure>
*/

-static int rseq_update_cpu_id(struct task_struct *t)
+static int rseq_update_cpu_node_id(struct task_struct *t)
{
- u32 cpu_id = raw_smp_processor_id();
struct rseq __user *rseq = t->rseq;
+ u32 cpu_id = raw_smp_processor_id();
+ u32 node_id = cpu_to_node(cpu_id);

if (!user_write_access_begin(rseq, t->rseq_len))
goto efault;
unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end);
unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end);
+ unsafe_put_user(node_id, &rseq->node_id, efault_end);
/*
* Additional feature fields added after ORIG_RSEQ_SIZE
* need to be conditionally updated only if
@@ -108,9 +110,9 @@ static int rseq_update_cpu_id(struct task_struct *t)
return -EFAULT;
}

-static int rseq_reset_rseq_cpu_id(struct task_struct *t)
+static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
{
- u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
+ u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0;

/*
* Reset cpu_id_start to its initial state (0).
@@ -124,6 +126,11 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t)
*/
if (put_user(cpu_id, &t->rseq->cpu_id))
return -EFAULT;
+ /*
+ * Reset node_id to its initial state (0).
+ */
+ if (put_user(node_id, &t->rseq->node_id))
+ return -EFAULT;
/*
* Additional feature fields added after ORIG_RSEQ_SIZE
* need to be conditionally reset only if
@@ -306,7 +313,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
if (unlikely(ret < 0))
goto error;
}
- if (unlikely(rseq_update_cpu_id(t)))
+ if (unlikely(rseq_update_cpu_node_id(t)))
goto error;
return;

@@ -353,7 +360,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
return -EINVAL;
if (current->rseq_sig != sig)
return -EPERM;
- ret = rseq_reset_rseq_cpu_id(current);
+ ret = rseq_reset_rseq_cpu_node_id(current);
if (ret)
return ret;
current->rseq = NULL;
--
2.25.1