2019-08-15 16:43:59

by Dmitry Safonov

Subject: [PATCHv6 23/36] x86/vdso: Allocate timens vdso

As discussed in the timens RFC, adding a new conditional branch
`if (inside_time_ns)` to the vdso for all processes is undesirable.
It would add a penalty for everybody, as the branch predictor may
mispredict the jump, and instruction cache lines are wasted on the cmp/jmp.

These side effects of introducing time namespaces are very much unwanted
given how much work has been spent on micro-optimising the vdso code.

The proposal is to allocate a second copy of the vdso code in which the
timens code is dynamically patched out (disabled by static_branch) at
boot time.

Allocate another vdso and copy the original code into it.
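
For reference, the shape of the timens tail code that the static branch
disables looks roughly like the sketch below (the struct layout and the
clk_to_ns() signature are illustrative, reconstructed from the discussion
in this thread, not the exact code of the series):

	struct timens_offset {
		s64 sec;
		u64 nsec;
	};

	struct timens_offsets {
		struct timens_offset mono;
		struct timens_offset boot;
	};

	static notrace void clk_to_ns(clockid_t clk,
				      struct __kernel_timespec *ts,
				      const struct timens_offsets *ns)
	{
		const struct timens_offset *off;

		switch (clk) {
		case CLOCK_MONOTONIC:
			off = &ns->mono;
			break;
		case CLOCK_BOOTTIME:
			off = &ns->boot;
			break;
		default:
			return;
		}

		/* Apply the namespace offset and normalize nanoseconds */
		ts->tv_sec += off->sec;
		ts->tv_nsec += off->nsec;
		if (ts->tv_nsec >= NSEC_PER_SEC) {
			ts->tv_nsec -= NSEC_PER_SEC;
			ts->tv_sec++;
		}
	}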

Co-developed-by: Andrei Vagin <[email protected]>
Signed-off-by: Andrei Vagin <[email protected]>
Signed-off-by: Dmitry Safonov <[email protected]>
---
arch/x86/entry/vdso/vdso2c.h | 2 +-
arch/x86/entry/vdso/vma.c | 113 +++++++++++++++++++++++++++++++++--
arch/x86/include/asm/vdso.h | 9 +--
3 files changed, 114 insertions(+), 10 deletions(-)

diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h
index 7556bb70ed8b..885b988aea19 100644
--- a/arch/x86/entry/vdso/vdso2c.h
+++ b/arch/x86/entry/vdso/vdso2c.h
@@ -157,7 +157,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
}
fprintf(outfile, "\n};\n\n");

- fprintf(outfile, "const struct vdso_image %s = {\n", image_name);
+ fprintf(outfile, "struct vdso_image %s __ro_after_init = {\n", image_name);
fprintf(outfile, "\t.text = raw_data,\n");
fprintf(outfile, "\t.size = %lu,\n", mapping_size);
if (alt_sec) {
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 9bd66f84db5e..8a8211fd4cfc 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -30,26 +30,128 @@
unsigned int __read_mostly vdso64_enabled = 1;
#endif

-void __init init_vdso_image(const struct vdso_image *image)
+void __init init_vdso_image(struct vdso_image *image)
{
BUG_ON(image->size % PAGE_SIZE != 0);

apply_alternatives((struct alt_instr *)(image->text + image->alt),
(struct alt_instr *)(image->text + image->alt +
image->alt_len));
+#ifdef CONFIG_TIME_NS
+ image->text_timens = vmalloc_32(image->size);
+ if (WARN_ON(image->text_timens == NULL))
+ return;
+
+ memcpy(image->text_timens, image->text, image->size);
+#endif
}

struct linux_binprm;

+#ifdef CONFIG_TIME_NS
+static inline struct timens_offsets *current_timens_offsets(void)
+{
+ return current->nsproxy->time_ns->offsets;
+}
+
+static int vdso_check_timens(struct vm_area_struct *vma, bool *in_timens)
+{
+ struct task_struct *tsk;
+
+ if (likely(vma->vm_mm == current->mm)) {
+ *in_timens = !!current_timens_offsets();
+ return 0;
+ }
+
+ /*
+ * The .fault() handler can be called on a remote process through
+ * interfaces like /proc/$pid/mem or process_vm_{readv,writev}().
+ * Consider such access to the vdso a slow path.
+ */
+
+#ifdef CONFIG_MEMCG
+ rcu_read_lock();
+
+ tsk = rcu_dereference(vma->vm_mm->owner);
+ if (tsk) {
+ task_lock(tsk);
+ /*
+ * Shouldn't happen: nsproxy is only unset in exit_mm(),
+ * which holds mmap_sem while setting mm = NULL.
+ * A fault is impossible in a task without an mm,
+ * and mmap_sem is taken during the fault.
+ */
+ if (WARN_ON_ONCE(tsk->nsproxy == NULL)) {
+ task_unlock(tsk);
+ rcu_read_unlock();
+ return -EIO;
+ }
+ *in_timens = !!tsk->nsproxy->time_ns->offsets;
+ task_unlock(tsk);
+ rcu_read_unlock();
+ return 0;
+ }
+ rcu_read_unlock();
+#endif
+
+ read_lock(&tasklist_lock);
+ for_each_process(tsk) {
+ struct task_struct *c;
+
+ if (tsk->flags & PF_KTHREAD)
+ continue;
+ for_each_thread(tsk, c) {
+ if (c->mm == vma->vm_mm)
+ goto found;
+ if (c->mm)
+ break;
+ }
+ }
+ read_unlock(&tasklist_lock);
+ return -ESRCH;
+
+found:
+ task_lock(tsk);
+ read_unlock(&tasklist_lock);
+ *in_timens = !!tsk->nsproxy->time_ns->offsets;
+ task_unlock(tsk);
+
+ return 0;
+}
+#else /* CONFIG_TIME_NS */
+static inline int vdso_check_timens(struct vm_area_struct *vma, bool *in_timens)
+{
+ *in_timens = false;
+ return 0;
+}
+static inline struct timens_offsets *current_timens_offsets(void)
+{
+ return NULL;
+}
+#endif /* CONFIG_TIME_NS */
+
static vm_fault_t vdso_fault(const struct vm_special_mapping *sm,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
const struct vdso_image *image = vma->vm_mm->context.vdso_image;
+ unsigned long offset = vmf->pgoff << PAGE_SHIFT;
+ bool in_timens;
+ int err;

if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size)
return VM_FAULT_SIGBUS;

- vmf->page = virt_to_page(image->text + (vmf->pgoff << PAGE_SHIFT));
+ err = vdso_check_timens(vma, &in_timens);
+ if (err)
+ return VM_FAULT_SIGBUS;
+
+ WARN_ON_ONCE(in_timens && !image->text_timens);
+
+ if (in_timens && image->text_timens)
+ vmf->page = vmalloc_to_page(image->text_timens + offset);
+ else
+ vmf->page = virt_to_page(image->text + offset);
+
get_page(vmf->page);
return 0;
}
@@ -138,13 +240,14 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
return vmf_insert_pfn(vma, vmf->address,
vmalloc_to_pfn(tsc_pg));
} else if (sym_offset == image->sym_timens_page) {
- struct time_namespace *ns = current->nsproxy->time_ns;
+ /* A VM_PFNMAP mapping can fault only in the current task's context */
+ struct timens_offsets *offsets = current_timens_offsets();
unsigned long pfn;

- if (!ns->offsets)
+ if (!offsets)
pfn = page_to_pfn(ZERO_PAGE(0));
else
- pfn = page_to_pfn(virt_to_page(ns->offsets));
+ pfn = page_to_pfn(virt_to_page(offsets));

return vmf_insert_pfn(vma, vmf->address, pfn);
}
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 9d420c545607..03f468c63a24 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -12,6 +12,7 @@

struct vdso_image {
void *text;
+ void *text_timens;
unsigned long size; /* Always a multiple of PAGE_SIZE */

unsigned long alt, alt_len;
@@ -30,18 +31,18 @@ struct vdso_image {
};

#ifdef CONFIG_X86_64
-extern const struct vdso_image vdso_image_64;
+extern struct vdso_image vdso_image_64;
#endif

#ifdef CONFIG_X86_X32
-extern const struct vdso_image vdso_image_x32;
+extern struct vdso_image vdso_image_x32;
#endif

#if defined CONFIG_X86_32 || defined CONFIG_COMPAT
-extern const struct vdso_image vdso_image_32;
+extern struct vdso_image vdso_image_32;
#endif

-extern void __init init_vdso_image(const struct vdso_image *image);
+extern void __init init_vdso_image(struct vdso_image *image);

extern int map_vdso_once(const struct vdso_image *image, unsigned long addr);

--
2.22.0


2019-08-16 15:24:08

by Andy Lutomirski

Subject: Re: [PATCHv6 23/36] x86/vdso: Allocate timens vdso

On 8/15/19 9:38 AM, Dmitry Safonov wrote:
> As discussed in the timens RFC, adding a new conditional branch
> `if (inside_time_ns)` to the vdso for all processes is undesirable.
> It would add a penalty for everybody, as the branch predictor may
> mispredict the jump, and instruction cache lines are wasted on the cmp/jmp.
>
> These side effects of introducing time namespaces are very much unwanted
> given how much work has been spent on micro-optimising the vdso code.
>
> The proposal is to allocate a second copy of the vdso code in which the
> timens code is dynamically patched out (disabled by static_branch) at
> boot time.
>
> Allocate another vdso and copy the original code into it.


I'm unconvinced that any of this magic is wise. I think you should make
a special timens vvar page that causes the normal fastpath to fail
(using a special vclock mode, a special seq value, or a special "last"
value) and then make the failure path detect that timens is in use and
use the timens path.


--Andy

2019-08-16 20:12:54

by Thomas Gleixner

Subject: Re: [PATCHv6 23/36] x86/vdso: Allocate timens vdso

On Fri, 16 Aug 2019, Andy Lutomirski wrote:
> On 8/15/19 9:38 AM, Dmitry Safonov wrote:
> > As discussed in the timens RFC, adding a new conditional branch
> > `if (inside_time_ns)` to the vdso for all processes is undesirable.
> > It would add a penalty for everybody, as the branch predictor may
> > mispredict the jump, and instruction cache lines are wasted on the cmp/jmp.
> >
> > These side effects of introducing time namespaces are very much unwanted
> > given how much work has been spent on micro-optimising the vdso code.
> >
> > The proposal is to allocate a second copy of the vdso code in which the
> > timens code is dynamically patched out (disabled by static_branch) at
> > boot time.
> >
> > Allocate another vdso and copy the original code into it.
>
>
> I'm unconvinced that any of this magic is wise. I think you should make a
> special timens vvar page that causes the normal fastpath to fail (using a
> special vclock mode, a special seq value, or a special "last" value) and then
> make the failure path detect that timens is in use and use the timens path.

My initial suggestion still stands. Do that at compile time. It really does
not matter whether we have another 2 or 3 variants of vdso binaries.

Use it and be done with it. No special magic, just a straightforward
decision to use a timens-capable VDSO or not.

Thanks,

tglx

2019-08-16 22:48:25

by Dmitry Safonov

Subject: Re: [PATCHv6 23/36] x86/vdso: Allocate timens vdso

Hi Andy, Thomas,

thank you very much for your time and the reviews, appreciate that.

On 8/16/19 9:10 PM, Thomas Gleixner wrote:
> On Fri, 16 Aug 2019, Andy Lutomirski wrote:
[..]
>> I'm unconvinced that any of this magic is wise. I think you should make a
>> special timens vvar page that causes the normal fastpath to fail (using a
>> special vclock mode, a special seq value, or a special "last" value) and then
>> make the failure path detect that timens is in use and use the timens path.

I see. That's clever; it hadn't come to my mind.
Hmm, is that better because of the price of the 5-byte NOP?
I'm a bit afraid of complicating that seq/vclock logic further.

So, what I'm driving at is: would you change your mind if timens still
used boot-time dynamic patching, but without introducing the NOP?

We got your point [from the RFC reply] that you want no penalty at all
for host tasks from introducing an `if`, as it trashes the cache and
branch predictor, but I wasn't sure whether a NOP is also unacceptable.

At that moment we had a "plan B" with something we half-jokingly called
"retcalls". The basic idea is that all timens brings into the vdso are
calls to clk_to_ns(), which are all placed at the tails of functions.
So if we could just memcpy() the function returns of the host vdso over
the introduced timens tail calls, the result would be the very same code
as before. There is a draft of this [1] that actually works on x86 on
both my and Andrei's machines.
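
A sketch of that idea (illustrative only - the actual draft is in [1],
and the helper below is a made-up name):

	/*
	 * Each timens-affected vdso function ends with a tail call to
	 * clk_to_ns() followed by the original epilogue.  For the host
	 * copy of the vdso, boot code can overwrite the tail-call bytes
	 * with the original return sequence, restoring the exact
	 * pre-timens code.
	 */
	static void __init patch_out_retcall(void *text,
					     unsigned long call_off,
					     unsigned long ret_off,
					     size_t epilogue_len)
	{
		/* Copy the function epilogue over the timens tail call */
		memcpy(text + call_off, text + ret_off, epilogue_len);
	}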

After consulting with Andrei, I decided that we had better stick to
static_branches, as they are well known and already have backends for
other architectures. We probably mistakenly assumed that the price of a
NOP on scalar machines is negligible and would be acceptable.

Would those self-invented "retcalls" be something that could be reviewed
and potentially accepted in further iterations?

[1]
https://github.com/0x7f454c46/linux/commit/ab0eeb646f43#diff-c22e1e73e7367f371e1f12e3877ea12f

> My initial suggestion still stands. Do that at compile time. It really does
> not matter whether we have another 2 or 3 variants of vdso binaries.
>
> Use it and be done with it. No special magic, just a straightforward
> decision to use a timens-capable VDSO or not.

I believe that is what we did in version 1 of the patch set.
It doesn't sound like rocket science, but it resulted in a
couple of ugly patches.

The post-attempt notes on the downsides of doing it at compile time:

1. There is an additional .so for each vdso: 64-bit, ia32, x32 - and the
same for every architecture to be supported. It adds rules to the
Makefiles. [2]
2. If we still intend to keep setns() working without exec(), function
entries in the host and namespace vdso must be aligned with each other [3].
That results in a patch to vdso2c to generate offsets [4, 5] and in
linker magic to align the other vdso [6].
3. As an unexpected consequence, we also need to align local functions in
the vdso [7].

So, it might all be down to my lack of skill, but it seems to bring a
large amount of complexity into the build process. And in my view, the
major issue is that it would not scale easily when the day comes that
another vdso.so needs to be introduced. As I didn't want to be
remembered as the guy who "wrote this unmaintainable pile of garbage",
I took the dynamic patching approach, which is done once at boot time.

Regardless, Andrei and I both want to improve the patch set and make it
acceptable and easy to maintain in the future. I hope that our effort to
do so is visible through the evolution of the patches, and we're very
glad to have constructive critics and such patient maintainers.
So, if I'm mistaken on those points about compile-time vdso(s), or you
have a plan in mind for how to avoid them, I'd appreciate it and would
rework in that direction.

[2] lkml.kernel.org/r/[email protected]
[3] lkml.kernel.org/r/[email protected]
[4] lkml.kernel.org/r/[email protected]
[5] lkml.kernel.org/r/[email protected]
[6] lkml.kernel.org/r/[email protected]
[7] lkml.kernel.org/r/[email protected]

Thanks,
Dmitry

2019-08-18 16:24:28

by Thomas Gleixner

Subject: Re: [PATCHv6 23/36] x86/vdso: Allocate timens vdso

Dmitry,

On Fri, 16 Aug 2019, Dmitry Safonov wrote:
> On 8/16/19 9:10 PM, Thomas Gleixner wrote:
> > On Fri, 16 Aug 2019, Andy Lutomirski wrote:
> [..]
> >> I'm unconvinced that any of this magic is wise. I think you should make a
> >> special timens vvar page that causes the normal fastpath to fail (using a
> >> special vclock mode, a special seq value, or a special "last" value) and then
> >> make the failure path detect that timens is in use and use the timens path.
>
> I see. That's clever; it hadn't come to my mind.
> Hmm, is that better because of the price of the 5-byte NOP?
> I'm a bit afraid of complicating that seq/vclock logic further.

See below.

> > My initial suggestion still stands. Do that at compile time. It really does
> > not matter whether we have another 2 or 3 variants of vdso binaries.
> >
> > Use it and be done with it. No special magic, just a straightforward
> > decision to use a timens-capable VDSO or not.
>
> I believe that is what we did in version 1 of the patch set.
> It doesn't sound like rocket science, but it resulted in a
> couple of ugly patches.
>
> The post-attempt notes on the downsides of doing it at compile time:
>
> 1. There is an additional .so for each vdso: 64-bit, ia32, x32 - and the
> same for every architecture to be supported. It adds rules to the
> Makefiles. [2]
> 2. If we still intend to keep setns() working without exec(), function
> entries in the host and namespace vdso must be aligned with each other [3].
> That results in a patch to vdso2c to generate offsets [4, 5] and in
> linker magic to align the other vdso [6].
> 3. As an unexpected consequence, we also need to align local functions in
> the vdso [7].
>
> So, it might all be down to my lack of skill, but it seems to bring a
> large amount of complexity into the build process. And in my view, the
> major issue is that it would not scale easily when the day comes that
> another vdso.so needs to be introduced. As I didn't want to be
> remembered as the guy who "wrote this unmaintainable pile of garbage",
> I took the dynamic patching approach, which is done once at boot time.

Fair enough. Did not think about that part.

> Regardless, Andrei and I both want to improve the patch set and make it
> acceptable and easy to maintain in the future. I hope that our effort to
> do so is visible through the evolution of the patches, and we're very
> glad to have constructive critics and such patient maintainers.

I'm happy to review well written stuff which makes progress and takes
review comments into account or the submitter discusses them for
resolution.

> So, if I'm mistaken on those points about compile-time vdso(s), or you
> have a plan in mind for how to avoid them, I'd appreciate it and would
> rework in that direction.

Coming back to Andy's idea. Create your time namespace page as an exact
copy of the vdso data page. When the page is created do:

memset(p->vdso_data, 0, sizeof(p->vdso_data));
p->vdso_data[0].clock_mode = VCLOCK_TIMENS;
p->vdso_data[0].seq = 1;

/* Store the namespace offsets in basetime */
p->vdso_data[0].basetime[CLOCK_MONOTONIC].sec = myns->mono_sec;
p->vdso_data[0].basetime[CLOCK_MONOTONIC].nsec = myns->mono_nsec;
p->vdso_data[0].basetime[CLOCK_BOOTTIME].sec = myns->boot_sec;
p->vdso_data[0].basetime[CLOCK_BOOTTIME].nsec = myns->boot_nsec;

p->vdso_data[1].clock_mode = VCLOCK_TIMENS;
p->vdso_data[1].seq = 1;

For a normal task the VVAR pages are installed in the normal ordering:

VVAR
PVCLOCK
HVCLOCK
TIMENS <- Not really required

Now for a timens task you install the pages in the following order

TIMENS
PVCLOCK
HVCLOCK
VVAR

The check for vdso_data->clock_mode is in the unlikely path of the now
open-coded seq begin magic. For the non-timens case, 'seq' is even most
of the time, so the branch is not taken.

If 'seq' is odd, i.e. a concurrent update is in progress, the extra check
for vdso_data->clock_mode is a non-issue. The task is spin waiting for the
update to finish and for 'seq' to become even anyway.

Patch below. I tested this with the normal order and by installing a
'timens' page unconditionally for all processes. I'll reply with the timens
testing hacks so you can see what I did.

The test results are pretty good.

Base (upstream) + VDSO patch + timens page

MONO 30ns 30ns 32ns
REAL 30ns 30ns 32ns
BOOT 30ns 30ns 32ns
MONOCOARSE 7ns 8ns 10ns
REALCOARSE 7ns 8ns 10ns
TAI 30ns 30ns 32ns
MONORAW 30ns 30ns 32ns

So except for the coarse clocks there is no change when the timens page is
not used, i.e. when the regular VVAR page is at the proper place. But
that's on one machine; a different one showed an effect in the noise
range. I'm not worried about that, as the VDSO behaviour varies depending
on the microarchitecture anyway.

With timens enabled the performance hit (cache-hot microbenchmark) is
somewhere in the range of 5-7% when looking at the perf counter
numbers. The hit for the coarse accessors is obviously larger because the
overhead is constant time.

I did a quick comparison of the array vs. switch case (what you used for
your clk_to_ns() helper). The switch case is slower.

So I would rather go for the array-based approach. It's simpler code, the
I-cache footprint is smaller, and no conditional branches are involved.

That means your timens_to_host() and host_to_timens() conversion functions
should just use that special VDSO page and do the same array-based
unconditional add/sub of the clock-specific offset.
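
A sketch of what such an array-based helper could look like (the helper
name and its exact signature are assumptions; the vdso_data/offset layout
follows the patch below):

	static __always_inline void
	host_to_timens(const struct vdso_data *vdns, clockid_t clk,
		       struct __kernel_timespec *ts)
	{
		const struct timens_offset *offs = &vdns->offset[clk];
		u64 nsec = ts->tv_nsec + offs->nsec;
		s64 sec = ts->tv_sec + offs->sec;

		/* Both inputs are normalized, so one step is enough */
		if (nsec >= NSEC_PER_SEC) {
			nsec -= NSEC_PER_SEC;
			sec++;
		}
		ts->tv_sec = sec;
		ts->tv_nsec = nsec;
	}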

Thanks,

tglx

8<-----------------
Subject: lib/vdso: Prepare for time namespace support
From: Thomas Gleixner <[email protected]>
Date: Sun, 18 Aug 2019 16:53:06 +0200

To support time namespaces in the vdso with minimal impact on regular
tasks not affected by time namespaces, the namespace handling needs to be
hidden in a slow path.

The most obvious place is vdso_seq_begin(). If a task belongs to a time
namespace, then the VVAR page, which contains the system-wide vdso data,
is replaced with a namespace-specific page that has the same layout as
the VVAR page. That page has vdso_data->seq set to 1 to force the slow
path and vdso_data->clock_mode set to VCLOCK_TIMENS to force the time
namespace handling path.

The extra check in the case that vdso_data->seq is odd, i.e. a concurrent
update of the vdso data is in progress, does not really affect regular
tasks which are not part of a time namespace, as the task is spin-waiting
for the update to finish and for vdso_data->seq to become even again anyway.

If a time namespace task hits that code path, it invokes the corresponding
time getter function, which retrieves the real VVAR page, reads the host
time and then adds the offset for the requested clock, which is stored in
the special VVAR page.

If VDSO time namespace support is disabled, the whole magic is compiled out.

Initial testing shows that the disabled case is almost identical to the
host case, which does not take the slow timens path. With the special
timens page installed, the performance hit is constant time and in the
range of 5-7%.

For the vdso functions which do not use the sequence count, an
unconditional check for vdso_data->clock_mode is added, which switches to
the real vdso data when the clock_mode is VCLOCK_TIMENS.

Suggested-by: Andy Lutomirski <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
---
include/linux/time.h | 6 ++
include/vdso/datapage.h | 19 ++++++-
lib/vdso/Kconfig | 6 ++
lib/vdso/gettimeofday.c | 128 ++++++++++++++++++++++++++++++++++++++++++++++--
4 files changed, 155 insertions(+), 4 deletions(-)

--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -96,4 +96,10 @@ static inline bool itimerspec64_valid(co
*/
#define time_after32(a, b) ((s32)((u32)(b) - (u32)(a)) < 0)
#define time_before32(b, a) time_after32(a, b)
+
+struct timens_offset {
+ s64 sec;
+ u64 nsec;
+};
+
#endif
--- a/include/vdso/datapage.h
+++ b/include/vdso/datapage.h
@@ -21,6 +21,8 @@
#define CS_RAW 1
#define CS_BASES (CS_RAW + 1)

+#define VCLOCK_TIMENS UINT_MAX
+
/**
* struct vdso_timestamp - basetime per clock_id
* @sec: seconds
@@ -48,6 +50,7 @@ struct vdso_timestamp {
* @mult: clocksource multiplier
* @shift: clocksource shift
* @basetime[clock_id]: basetime per clock_id
+ * @offset[clock_id]: time namespace offset per clock_id
* @tz_minuteswest: minutes west of Greenwich
* @tz_dsttime: type of DST correction
* @hrtimer_res: hrtimer resolution
@@ -55,6 +58,17 @@ struct vdso_timestamp {
*
* vdso_data will be accessed by 64 bit and compat code at the same time
* so we should be careful before modifying this structure.
+ *
+ * @basetime is used to store the base time for the system-wide time getter
+ * VVAR page.
+ *
+ * @offset is used by the special time namespace VVAR pages which are
+ * installed instead of the real VVAR page. These namespace pages must set
+ * @seq to 1 and @clock_mode to VCLOCK_TIMENS to force the code into the
+ * time namespace slow path. The namespace-aware functions retrieve the
+ * real system-wide VVAR page, read the host time and add the per-clock
+ * offset. For clocks which are not affected by the time namespace
+ * adjustment, the offset must be zero.
*/
struct vdso_data {
u32 seq;
@@ -65,7 +79,10 @@ struct vdso_data {
u32 mult;
u32 shift;

- struct vdso_timestamp basetime[VDSO_BASES];
+ union {
+ struct vdso_timestamp basetime[VDSO_BASES];
+ struct timens_offset offset[VDSO_BASES];
+ };

s32 tz_minuteswest;
s32 tz_dsttime;
--- a/lib/vdso/Kconfig
+++ b/lib/vdso/Kconfig
@@ -33,4 +33,10 @@ config CROSS_COMPILE_COMPAT_VDSO
If a 64 bit compiler (i.e. x86_64) can compile the VDSO for
32 bit, it does not need to define this parameter.

+config VDSO_TIMENS
+ bool
+ help
+ Selected by architectures which support time namespaces in the
+ VDSO
+
endif
--- a/lib/vdso/gettimeofday.c
+++ b/lib/vdso/gettimeofday.c
@@ -38,6 +38,51 @@ u64 vdso_calc_delta(u64 cycles, u64 last
}
#endif

+#ifndef CONFIG_VDSO_TIMENS
+static __always_inline
+const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd)
+{
+ return NULL;
+}
+#endif
+
+static int do_hres_ns(const struct vdso_data *vdns, clockid_t clk,
+ struct __kernel_timespec *ts)
+{
+ const struct vdso_data *vd = __arch_get_timens_vdso_data(vdns);
+ const struct vdso_timestamp *vdso_ts = &vd->basetime[clk];
+ const struct timens_offset *offs = &vdns->offset[clk];
+ u64 cycles, last, ns;
+ s64 sec;
+ u32 seq;
+
+ do {
+ seq = vdso_read_begin(vd);
+ cycles = __arch_get_hw_counter(vd->clock_mode);
+ ns = vdso_ts->nsec;
+ last = vd->cycle_last;
+ if (unlikely((s64)cycles < 0))
+ return -1;
+
+ ns += vdso_calc_delta(cycles, last, vd->mask, vd->mult);
+ ns >>= vd->shift;
+ sec = vdso_ts->sec;
+ } while (unlikely(vdso_read_retry(vd, seq)));
+
+ /* Add the namespace offset */
+ sec += offs->sec;
+ ns += offs->nsec;
+
+ /*
+ * Do this outside the loop: a race inside the loop could result
+ * in __iter_div_u64_rem() being extremely slow.
+ */
+ ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
+ ts->tv_nsec = ns;
+
+ return 0;
+}
+
static int do_hres(const struct vdso_data *vd, clockid_t clk,
struct __kernel_timespec *ts)
{
@@ -46,7 +91,28 @@ static int do_hres(const struct vdso_dat
u32 seq;

do {
- seq = vdso_read_begin(vd);
+ /*
+ * Open coded to handle VCLOCK_TIMENS. Time namespace
+ * enabled tasks have a special VVAR page installed which
+ * has vd->seq set to 1 and vd->clock_mode set to
+ * VCLOCK_TIMENS. For non time namespace affected tasks
+ * this does not affect performance because if vd->seq is
+ * odd, i.e. a concurrent update is in progress the extra
+ * check for vd->clock_mode is just a few extra
+ * instructions while spin waiting for vd->seq to become
+ * even again.
+ */
+ while (1) {
+ seq = READ_ONCE(vd->seq);
+ if (likely(!(seq & 1)))
+ break;
+ if (IS_ENABLED(CONFIG_VDSO_TIMENS) &&
+ vd->clock_mode == VCLOCK_TIMENS)
+ return do_hres_ns(vd, clk, ts);
+ cpu_relax();
+ }
+ smp_rmb();
+
cycles = __arch_get_hw_counter(vd->clock_mode);
ns = vdso_ts->nsec;
last = vd->cycle_last;
@@ -68,6 +134,34 @@ static int do_hres(const struct vdso_dat
return 0;
}

+static void do_coarse_ns(const struct vdso_data *vdns, clockid_t clk,
+ struct __kernel_timespec *ts)
+{
+ const struct vdso_data *vd = __arch_get_timens_vdso_data(vdns);
+ const struct vdso_timestamp *vdso_ts = &vd->basetime[clk];
+ const struct timens_offset *offs = &vdns->offset[clk];
+ u64 nsec;
+ s64 sec;
+ u32 seq;
+
+ do {
+ seq = vdso_read_begin(vd);
+ sec = vdso_ts->sec;
+ nsec = vdso_ts->nsec;
+ } while (unlikely(vdso_read_retry(vd, seq)));
+
+ /* Add the namespace offset */
+ sec += offs->sec;
+ nsec += offs->nsec;
+
+ /*
+ * Do this outside the loop: a race inside the loop could result
+ * in __iter_div_u64_rem() being extremely slow.
+ */
+ ts->tv_sec = sec + __iter_div_u64_rem(nsec, NSEC_PER_SEC, &nsec);
+ ts->tv_nsec = nsec;
+}
+
static void do_coarse(const struct vdso_data *vd, clockid_t clk,
struct __kernel_timespec *ts)
{
@@ -75,7 +169,23 @@ static void do_coarse(const struct vdso_
u32 seq;

do {
- seq = vdso_read_begin(vd);
+ /*
+ * Open coded to handle VCLOCK_TIMENS. See comment in
+ * do_hres().
+ */
+ while (1) {
+ seq = READ_ONCE(vd->seq);
+ if (likely(!(seq & 1)))
+ break;
+ if (IS_ENABLED(CONFIG_VDSO_TIMENS) &&
+ vd->clock_mode == VCLOCK_TIMENS) {
+ do_coarse_ns(vd, clk, ts);
+ return;
+ }
+ cpu_relax();
+ }
+ smp_rmb();
+
ts->tv_sec = vdso_ts->sec;
ts->tv_nsec = vdso_ts->nsec;
} while (unlikely(vdso_read_retry(vd, seq)));
@@ -156,6 +266,10 @@ static __maybe_unused int
}

if (unlikely(tz != NULL)) {
+ if (IS_ENABLED(CONFIG_VDSO_TIMENS) &&
+ vd->clock_mode == VCLOCK_TIMENS)
+ vd = __arch_get_timens_vdso_data(vd);
+
tz->tz_minuteswest = vd[CS_HRES_COARSE].tz_minuteswest;
tz->tz_dsttime = vd[CS_HRES_COARSE].tz_dsttime;
}
@@ -167,7 +281,12 @@ static __maybe_unused int
static __maybe_unused time_t __cvdso_time(time_t *time)
{
const struct vdso_data *vd = __arch_get_vdso_data();
- time_t t = READ_ONCE(vd[CS_HRES_COARSE].basetime[CLOCK_REALTIME].sec);
+ time_t t;
+
+ if (IS_ENABLED(CONFIG_VDSO_TIMENS) && vd->clock_mode == VCLOCK_TIMENS)
+ vd = __arch_get_timens_vdso_data(vd);
+
+ t = READ_ONCE(vd[CS_HRES_COARSE].basetime[CLOCK_REALTIME].sec);

if (time)
*time = t;
@@ -189,6 +308,9 @@ int __cvdso_clock_getres_common(clockid_
if (unlikely((u32) clock >= MAX_CLOCKS))
return -1;

+ if (IS_ENABLED(CONFIG_VDSO_TIMENS) && vd->clock_mode == VCLOCK_TIMENS)
+ vd = __arch_get_timens_vdso_data(vd);
+
hrtimer_res = READ_ONCE(vd[CS_HRES_COARSE].hrtimer_res);
/*
* Convert the clockid to a bitmask and use it to check which

2019-08-18 16:25:48

by Thomas Gleixner

Subject: Re: [PATCHv6 23/36] x86/vdso: Allocate timens vdso

On Sun, 18 Aug 2019, Thomas Gleixner wrote:
>
> Patch below. I tested this with the normal order and by installing a
> 'timens' page unconditionally for all processes. I'll reply with the timens
> testing hacks so you can see what I did.

First hack...

Thanks,

tglx

8<-----------------

Subject: x86/vdso: Hack to enable time name space for testing
From: Thomas Gleixner <[email protected]>
Date: Sun, 18 Aug 2019 16:42:26 +0200

Select CONFIG_VDSO_TIMENS and prepare for the extra magic time namespace
vvar page. The fault handler does not handle it yet, as the path is never
taken (hopefully).

Not-Signed-off-by: Thomas Gleixner <[email protected]>
---
arch/x86/Kconfig | 1 +
arch/x86/entry/vdso/vdso-layout.lds.S | 3 ++-
arch/x86/entry/vdso/vdso2c.c | 3 +++
arch/x86/include/asm/vdso.h | 1 +
arch/x86/include/asm/vdso/gettimeofday.h | 9 +++++++++
5 files changed, 16 insertions(+), 1 deletion(-)

--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -224,6 +224,7 @@ config X86
select VIRT_TO_BUS
select X86_FEATURE_NAMES if PROC_FS
select PROC_PID_ARCH_STATUS if PROC_FS
+ select VDSO_TIMENS

config INSTRUCTION_DECODER
def_bool y
--- a/arch/x86/entry/vdso/vdso-layout.lds.S
+++ b/arch/x86/entry/vdso/vdso-layout.lds.S
@@ -16,7 +16,7 @@ SECTIONS
* segment.
*/

- vvar_start = . - 3 * PAGE_SIZE;
+ vvar_start = . - 4 * PAGE_SIZE;
vvar_page = vvar_start;

/* Place all vvars at the offsets in asm/vvar.h. */
@@ -28,6 +28,7 @@ SECTIONS

pvclock_page = vvar_start + PAGE_SIZE;
hvclock_page = vvar_start + 2 * PAGE_SIZE;
+ timens_page = vvar_start + 3 * PAGE_SIZE;

. = SIZEOF_HEADERS;

--- a/arch/x86/entry/vdso/vdso2c.c
+++ b/arch/x86/entry/vdso/vdso2c.c
@@ -75,12 +75,14 @@ enum {
sym_vvar_page,
sym_pvclock_page,
sym_hvclock_page,
+ sym_timens_page,
};

const int special_pages[] = {
sym_vvar_page,
sym_pvclock_page,
sym_hvclock_page,
+ sym_timens_page,
};

struct vdso_sym {
@@ -93,6 +95,7 @@ struct vdso_sym required_syms[] = {
[sym_vvar_page] = {"vvar_page", true},
[sym_pvclock_page] = {"pvclock_page", true},
[sym_hvclock_page] = {"hvclock_page", true},
+ [sym_timens_page] = {"timens_page", true},
{"VDSO32_NOTE_MASK", true},
{"__kernel_vsyscall", true},
{"__kernel_sigreturn", true},
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -21,6 +21,7 @@ struct vdso_image {
long sym_vvar_page;
long sym_pvclock_page;
long sym_hvclock_page;
+ long sym_timens_page;
long sym_VDSO32_NOTE_MASK;
long sym___kernel_sigreturn;
long sym___kernel_rt_sigreturn;
--- a/arch/x86/include/asm/vdso/gettimeofday.h
+++ b/arch/x86/include/asm/vdso/gettimeofday.h
@@ -265,6 +265,15 @@ static __always_inline const struct vdso
return __vdso_data;
}

+/* HACK .... */
+#define VDSO_TIMENS_PAGEOFFSET (3 * PAGE_SIZE)
+
+static __always_inline
+const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd)
+{
+ return (void *)vd + VDSO_TIMENS_PAGEOFFSET;
+}
+
/*
* x86 specific delta calculation.
*

2019-08-18 16:30:43

by Thomas Gleixner

Subject: Re: [PATCHv6 23/36] x86/vdso: Allocate timens vdso

On Sun, 18 Aug 2019, Thomas Gleixner wrote:

> On Sun, 18 Aug 2019, Thomas Gleixner wrote:
> >
> > Patch below. I tested this with the normal order and by installing a
> > 'timens' page unconditionally for all processes. I'll reply with the timens
> > testing hacks so you can see what I did.
>
> First hack...

And the second one.

Thanks,

tglx

8<-----------------

Subject: x86/vdso: Hack to test the time namespace path
From: Thomas Gleixner <[email protected]>
Date: Sun, 18 Aug 2019 16:49:00 +0200

Install a special TIMENS vvar page which forces the VDSO to take the time
namespace path for testing.

Not-Signed-off-by: Thomas Gleixner <[email protected]>
---
arch/x86/entry/vdso/vma.c | 34 +++++++++++++++++++++++++++++++++-
1 file changed, 33 insertions(+), 1 deletion(-)

--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -84,6 +84,33 @@ static int vdso_mremap(const struct vm_s
return 0;
}

+/* Hack for testing */
+static struct page *vdso_timens_page;
+
+static int __init init_vdso_timens(void)
+{
+ struct vdso_data *vdata;
+ void *va;
+
+ vdso_timens_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!vdso_timens_page)
+ return -ENOMEM;
+
+ /* Hack: vdso data is at offset 0x80 in the page ... */
+ va = page_address(vdso_timens_page);
+ vdata = (struct vdso_data *)(va + 0x80);
+
+ vdata[0].seq = 1;
+ vdata[0].clock_mode = UINT_MAX;
+ vdata[1].seq = 1;
+ vdata[1].clock_mode = UINT_MAX;
+
+ /* All offsets are zero */
+
+ return 0;
+}
+subsys_initcall(init_vdso_timens);
+
static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
@@ -106,7 +133,7 @@ static vm_fault_t vvar_fault(const struc
if (sym_offset == 0)
return VM_FAULT_SIGBUS;

- if (sym_offset == image->sym_vvar_page) {
+ if (sym_offset == image->sym_timens_page) {
return vmf_insert_pfn(vma, vmf->address,
__pa_symbol(&__vvar_page) >> PAGE_SHIFT);
} else if (sym_offset == image->sym_pvclock_page) {
@@ -123,6 +150,11 @@ static vm_fault_t vvar_fault(const struc
if (tsc_pg && vclock_was_used(VCLOCK_HVCLOCK))
return vmf_insert_pfn(vma, vmf->address,
vmalloc_to_pfn(tsc_pg));
+ } else if (sym_offset == image->sym_vvar_page) {
+ unsigned long pfn;
+
+ pfn = page_to_pfn(vdso_timens_page);
+ return vmf_insert_pfn(vma, vmf->address, pfn);
}

return VM_FAULT_SIGBUS;

2019-08-19 14:18:17

by Dmitry Safonov

Subject: Re: [PATCHv6 23/36] x86/vdso: Allocate timens vdso

Hi Thomas,

On 8/18/19 5:21 PM, Thomas Gleixner wrote:
[..]
> I'm happy to review well written stuff which makes progress and takes
> review comments into account or the submitter discusses them for
> resolution.

Thanks again for both your and Andy's time!

[..]
> Coming back to Andy's idea. Create your time namespace page as an exact
> copy of the vdso data page. When the page is created do:
>
> memset(p->vdso_data, 0, sizeof(p->vdso_data));
> p->vdso_data[0].clock_mode = VCLOCK_TIMENS;
> p->vdso_data[0].seq = 1;
>
> /* Store the namespace offsets in basetime */
> p->vdso_data[0].basetime[CLOCK_MONOTONIC].sec = myns->mono_sec;
> p->vdso_data[0].basetime[CLOCK_MONOTONIC].nsec = myns->mono_nsec;
> p->vdso_data[0].basetime[CLOCK_BOOTTIME].sec = myns->boot_sec;
> p->vdso_data[0].basetime[CLOCK_BOOTTIME].nsec = myns->boot_nsec;
>
> p->vdso_data[1].clock_mode = VCLOCK_TIMENS;
> p->vdso_data[1].seq = 1;
>
> For a normal task the VVAR pages are installed in the normal ordering:
>
> VVAR
> PVCLOCK
> HVCLOCK
> TIMENS <- Not really required
>
> Now for a timens task you install the pages in the following order
>
> TIMENS
> PVCLOCK
> HVCLOCK
> VVAR
>
> The check for vdso_data->clock_mode is in the unlikely path of the now
> open-coded seq begin magic. For the non-timens case, 'seq' is even most
> of the time, so the branch is not taken.
>
> If 'seq' is odd, i.e. a concurrent update is in progress, the extra check
> for vdso_data->clock_mode is a non-issue. The task is spin waiting for the
> update to finish and for 'seq' to become even anyway.
>
> Patch below. I tested this with the normal order and by installing a
> 'timens' page unconditionally for all processes. I'll reply with the timens
> testing hacks so you can see what I did.
>
> The test results are pretty good.
>
> Base (upstream) + VDSO patch + timens page
>
> MONO 30ns 30ns 32ns
> REAL 30ns 30ns 32ns
> BOOT 30ns 30ns 32ns
> MONOCOARSE 7ns 8ns 10ns
> REALCOARSE 7ns 8ns 10ns
> TAI 30ns 30ns 32ns
> MONORAW 30ns 30ns 32ns
>
> So except for the coarse clocks there is no change when the timens page is
> not used, i.e. when the regular VVAR page is at the proper place. But
> that's on one machine; a different one showed an effect in the noise
> range. I'm not worried about that, as the VDSO behaviour varies depending
> on the microarchitecture anyway.
>
> With timens enabled the performance hit (cache-hot microbenchmark) is
> somewhere in the range of 5-7% when looking at the perf counter
> numbers. The hit for the coarse accessors is obviously larger because the
> overhead is constant time.
>
> I did a quick comparison of the array vs. switch case (what you used for
> your clk_to_ns() helper). The switch case is slower.
>
> So I would rather go for the array-based approach. It's simpler code, the
> I-cache footprint is smaller, and no conditional branches are involved.
>
> That means your timens_to_host() and host_to_timens() conversion functions
> should just use that special VDSO page and do the same array-based
> unconditional add/sub of the clock-specific offset.

I was a bit scared that the clock_mode change would result in some complex
logic, but your patch showed me that it's definitely not as black as I
was painting it.
Andrei and I will rework the patch set based on your and Andy's
suggestions and patches.

Thanks,
Dmitry

2019-08-19 14:47:31

by Thomas Gleixner

Subject: Re: [PATCHv6 23/36] x86/vdso: Allocate timens vdso

Dmitry,

On Mon, 19 Aug 2019, Dmitry Safonov wrote:
> On 8/18/19 5:21 PM, Thomas Gleixner wrote:
> > That means your timens_to_host() and host_to_timens() conversion functions
> > should just use that special VDSO page and do the same array-based
> > unconditional add/sub of the clock-specific offset.
>
> I was a bit scared that the clock_mode change would result in some complex
> logic, but your patch showed me that it's definitely not as black as I
> was painting it.

Right. It took me a while to find the right spot which does not affect the
non-timens path and at the same time gives a reasonable result for the
timens case.

One thing occurred to me while doing that vvar_fault() hack for testing. For
the timens case it will hit

if (sym_offset == image->sym_vvar_page) {

first, which is then installing the special vvar page.

It's clear that the code will hit the next fault immediately when trying to
access the real vvar page at the timens offset. So it might be sensible to
map that one in one go to avoid the immediate second page fault. But that
should be a separate patch after the initial 'functional' one.
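
A rough sketch of that follow-up (illustrative only; 'in_timens',
'timens_page' and the error handling are assumptions, not code from this
series):

	/*
	 * In vvar_fault(): when installing the special timens page at
	 * the sym_vvar_page slot of a timens task, pre-map the real
	 * vvar page at the sym_timens_page slot in the same fault to
	 * avoid the immediate second page fault.
	 */
	if (sym_offset == image->sym_vvar_page && in_timens) {
		vm_fault_t err;

		err = vmf_insert_pfn(vma, vmf->address,
				     page_to_pfn(timens_page));
		if (err != VM_FAULT_NOPAGE)
			return err;

		return vmf_insert_pfn(vma,
				      vmf->address + image->sym_timens_page -
						     image->sym_vvar_page,
				      __pa_symbol(&__vvar_page) >> PAGE_SHIFT);
	}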

Thanks,

tglx