Allocate the time namespace page among VVAR pages and add the logic
to handle faults on VVAR properly.
If a task belongs to a time namespace then the VVAR page which contains
the system wide VDSO data is replaced with a namespace specific page
which has the same layout as the VVAR page. That page has vdso_data->seq
set to 1 to enforce the slow path and vdso_data->clock_mode set to
VCLOCK_TIMENS to enforce the time namespace handling path.
The extra check in the case that vdso_data->seq is odd, e.g. a concurrent
update of the VDSO data is in progress, is not really affecting regular
tasks which are not part of a time namespace as the task is spin waiting
for the update to finish and vdso_data->seq to become even again.
If a time namespace task hits that code path, it invokes the corresponding
time getter function which retrieves the real VVAR page, reads host time
and then adds the offset for the requested clock which is stored in the
special VVAR page.
v2: Code cleanups suggested by Vincenzo.
Cc: Vincenzo Frascino <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Dmitry Safonov <[email protected]>
Andrei Vagin (6):
arm64/vdso: use the fault callback to map vvar pages
arm64/vdso: Zap vvar pages when switching to a time namespace
arm64/vdso: Add time napespace page
arm64/vdso: Handle faults on timens page
arm64/vdso: Restrict splitting VVAR VMA
arm64: enable time namespace support
arch/arm64/Kconfig | 1 +
.../include/asm/vdso/compat_gettimeofday.h | 11 ++
arch/arm64/include/asm/vdso/gettimeofday.h | 8 ++
arch/arm64/kernel/vdso.c | 134 ++++++++++++++++--
arch/arm64/kernel/vdso/vdso.lds.S | 3 +-
arch/arm64/kernel/vdso32/vdso.lds.S | 3 +-
include/vdso/datapage.h | 1 +
7 files changed, 147 insertions(+), 14 deletions(-)
--
2.24.1
The VVAR page layout depends on whether a task belongs to the root or
non-root time namespace. Whenever a task changes its namespace, the VVAR
page tables are cleared and then they will be re-faulted with a
corresponding layout.
Signed-off-by: Andrei Vagin <[email protected]>
---
arch/arm64/kernel/vdso.c | 32 ++++++++++++++++++++++++++++++++
1 file changed, 32 insertions(+)
diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c
index 290c36d74e03..6ac9cdeac5be 100644
--- a/arch/arm64/kernel/vdso.c
+++ b/arch/arm64/kernel/vdso.c
@@ -131,6 +131,38 @@ static int __vdso_init(enum arch_vdso_type arch_index)
return 0;
}
+#ifdef CONFIG_TIME_NS
+/*
+ * The vvar page layout depends on whether a task belongs to the root or
+ * non-root time namespace. Whenever a task changes its namespace, the VVAR
+ * page tables are cleared and then they will re-faulted with a
+ * corresponding layout.
+ * See also the comment near timens_setup_vdso_data() for details.
+ */
+int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
+{
+ struct mm_struct *mm = task->mm;
+ struct vm_area_struct *vma;
+
+ if (down_write_killable(&mm->mmap_sem))
+ return -EINTR;
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ unsigned long size = vma->vm_end - vma->vm_start;
+
+ if (vma_is_special_mapping(vma, vdso_lookup[ARM64_VDSO].dm))
+ zap_page_range(vma, vma->vm_start, size);
+#ifdef CONFIG_COMPAT_VDSO
+ if (vma_is_special_mapping(vma, vdso_lookup[ARM64_VDSO32].dm))
+ zap_page_range(vma, vma->vm_start, size);
+#endif
+ }
+
+ up_write(&mm->mmap_sem);
+ return 0;
+}
+#endif
+
static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
--
2.24.1
This is required to support time namespaces where a time namespace data
page is different for each namespace.
Signed-off-by: Andrei Vagin <[email protected]>
---
arch/arm64/kernel/vdso.c | 24 +++++++++++++++---------
1 file changed, 15 insertions(+), 9 deletions(-)
diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c
index 354b11e27c07..290c36d74e03 100644
--- a/arch/arm64/kernel/vdso.c
+++ b/arch/arm64/kernel/vdso.c
@@ -114,28 +114,32 @@ static int __vdso_init(enum arch_vdso_type arch_index)
PAGE_SHIFT;
/* Allocate the vDSO pagelist, plus a page for the data. */
- vdso_pagelist = kcalloc(vdso_lookup[arch_index].vdso_pages + 1,
+ vdso_pagelist = kcalloc(vdso_lookup[arch_index].vdso_pages,
sizeof(struct page *),
GFP_KERNEL);
if (vdso_pagelist == NULL)
return -ENOMEM;
- /* Grab the vDSO data page. */
- vdso_pagelist[0] = phys_to_page(__pa_symbol(vdso_data));
-
-
/* Grab the vDSO code pages. */
pfn = sym_to_pfn(vdso_lookup[arch_index].vdso_code_start);
for (i = 0; i < vdso_lookup[arch_index].vdso_pages; i++)
- vdso_pagelist[i + 1] = pfn_to_page(pfn + i);
+ vdso_pagelist[i] = pfn_to_page(pfn + i);
- vdso_lookup[arch_index].dm->pages = &vdso_pagelist[0];
- vdso_lookup[arch_index].cm->pages = &vdso_pagelist[1];
+ vdso_lookup[arch_index].cm->pages = vdso_pagelist;
return 0;
}
+static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
+ struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ if (vmf->pgoff == 0)
+ return vmf_insert_pfn(vma, vmf->address,
+ sym_to_pfn(vdso_data));
+ return VM_FAULT_SIGBUS;
+}
+
static int __setup_additional_pages(enum arch_vdso_type arch_index,
struct mm_struct *mm,
struct linux_binprm *bprm,
@@ -155,7 +159,7 @@ static int __setup_additional_pages(enum arch_vdso_type arch_index,
}
ret = _install_special_mapping(mm, vdso_base, PAGE_SIZE,
- VM_READ|VM_MAYREAD,
+ VM_READ|VM_MAYREAD|VM_PFNMAP,
vdso_lookup[arch_index].dm);
if (IS_ERR(ret))
goto up_fail;
@@ -215,6 +219,7 @@ static struct vm_special_mapping aarch32_vdso_spec[C_PAGES] = {
#ifdef CONFIG_COMPAT_VDSO
{
.name = "[vvar]",
+ .fault = vvar_fault,
},
{
.name = "[vdso]",
@@ -396,6 +401,7 @@ static int vdso_mremap(const struct vm_special_mapping *sm,
static struct vm_special_mapping vdso_spec[A_PAGES] __ro_after_init = {
{
.name = "[vvar]",
+ .fault = vvar_fault,
},
{
.name = "[vdso]",
--
2.24.1
Hi Vincenzo,
Have you had a chance to look at this series? Let me know if I need to
rebase this patch set and resend it again.
Thanks,
Andrei
Hi Andrei,
On 4/1/20 7:50 AM, Andrei Vagin wrote:
> Hi Vincenzo,
>
> Have you had a chance to look at this series? Let me know if I need to
> rebase this patch set and resend it again.
>
Sorry I still did not get the chance to have a look at your v2.
I will try to do it by the end of this week, beginning of next.
> Thanks,
> Andrei
>
--
Regards,
Vincenzo
On 2/25/20 7:37 AM, Andrei Vagin wrote:
> This is required to support time namespaces where a time namespace data
> page is different for each namespace.
>
> Signed-off-by: Andrei Vagin <[email protected]>
Reviewed-by: Vincenzo Frascino <[email protected]>
> ---
> arch/arm64/kernel/vdso.c | 24 +++++++++++++++---------
> 1 file changed, 15 insertions(+), 9 deletions(-)
>
> diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c
> index 354b11e27c07..290c36d74e03 100644
> --- a/arch/arm64/kernel/vdso.c
> +++ b/arch/arm64/kernel/vdso.c
> @@ -114,28 +114,32 @@ static int __vdso_init(enum arch_vdso_type arch_index)
> PAGE_SHIFT;
>
> /* Allocate the vDSO pagelist, plus a page for the data. */
> - vdso_pagelist = kcalloc(vdso_lookup[arch_index].vdso_pages + 1,
> + vdso_pagelist = kcalloc(vdso_lookup[arch_index].vdso_pages,
> sizeof(struct page *),
> GFP_KERNEL);
> if (vdso_pagelist == NULL)
> return -ENOMEM;
>
> - /* Grab the vDSO data page. */
> - vdso_pagelist[0] = phys_to_page(__pa_symbol(vdso_data));
> -
> -
> /* Grab the vDSO code pages. */
> pfn = sym_to_pfn(vdso_lookup[arch_index].vdso_code_start);
>
> for (i = 0; i < vdso_lookup[arch_index].vdso_pages; i++)
> - vdso_pagelist[i + 1] = pfn_to_page(pfn + i);
> + vdso_pagelist[i] = pfn_to_page(pfn + i);
>
> - vdso_lookup[arch_index].dm->pages = &vdso_pagelist[0];
> - vdso_lookup[arch_index].cm->pages = &vdso_pagelist[1];
> + vdso_lookup[arch_index].cm->pages = vdso_pagelist;
>
> return 0;
> }
>
> +static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
> + struct vm_area_struct *vma, struct vm_fault *vmf)
> +{
> + if (vmf->pgoff == 0)
> + return vmf_insert_pfn(vma, vmf->address,
> + sym_to_pfn(vdso_data));
> + return VM_FAULT_SIGBUS;
> +}
> +
> static int __setup_additional_pages(enum arch_vdso_type arch_index,
> struct mm_struct *mm,
> struct linux_binprm *bprm,
> @@ -155,7 +159,7 @@ static int __setup_additional_pages(enum arch_vdso_type arch_index,
> }
>
> ret = _install_special_mapping(mm, vdso_base, PAGE_SIZE,
> - VM_READ|VM_MAYREAD,
> + VM_READ|VM_MAYREAD|VM_PFNMAP,
> vdso_lookup[arch_index].dm);
> if (IS_ERR(ret))
> goto up_fail;
> @@ -215,6 +219,7 @@ static struct vm_special_mapping aarch32_vdso_spec[C_PAGES] = {
> #ifdef CONFIG_COMPAT_VDSO
> {
> .name = "[vvar]",
> + .fault = vvar_fault,
> },
> {
> .name = "[vdso]",
> @@ -396,6 +401,7 @@ static int vdso_mremap(const struct vm_special_mapping *sm,
> static struct vm_special_mapping vdso_spec[A_PAGES] __ro_after_init = {
> {
> .name = "[vvar]",
> + .fault = vvar_fault,
> },
> {
> .name = "[vdso]",
>
--
Regards,
Vincenzo
On 2/25/20 7:37 AM, Andrei Vagin wrote:
> The VVAR page layout depends on whether a task belongs to the root or
> non-root time namespace. Whenever a task changes its namespace, the VVAR
> page tables are cleared and then they will be re-faulted with a
> corresponding layout.
>
> Signed-off-by: Andrei Vagin <[email protected]>
Reviewed-by: Vincenzo Frascino <[email protected]>
> ---
> arch/arm64/kernel/vdso.c | 32 ++++++++++++++++++++++++++++++++
> 1 file changed, 32 insertions(+)
>
> diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c
> index 290c36d74e03..6ac9cdeac5be 100644
> --- a/arch/arm64/kernel/vdso.c
> +++ b/arch/arm64/kernel/vdso.c
> @@ -131,6 +131,38 @@ static int __vdso_init(enum arch_vdso_type arch_index)
> return 0;
> }
>
> +#ifdef CONFIG_TIME_NS
> +/*
> + * The vvar page layout depends on whether a task belongs to the root or
> + * non-root time namespace. Whenever a task changes its namespace, the VVAR
> + * page tables are cleared and then they will re-faulted with a
> + * corresponding layout.
> + * See also the comment near timens_setup_vdso_data() for details.
> + */
> +int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
> +{
> + struct mm_struct *mm = task->mm;
> + struct vm_area_struct *vma;
> +
> + if (down_write_killable(&mm->mmap_sem))
> + return -EINTR;
> +
> + for (vma = mm->mmap; vma; vma = vma->vm_next) {
> + unsigned long size = vma->vm_end - vma->vm_start;
> +
> + if (vma_is_special_mapping(vma, vdso_lookup[ARM64_VDSO].dm))
> + zap_page_range(vma, vma->vm_start, size);
> +#ifdef CONFIG_COMPAT_VDSO
> + if (vma_is_special_mapping(vma, vdso_lookup[ARM64_VDSO32].dm))
> + zap_page_range(vma, vma->vm_start, size);
> +#endif
> + }
> +
> + up_write(&mm->mmap_sem);
> + return 0;
> +}
> +#endif
> +
> static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
> struct vm_area_struct *vma, struct vm_fault *vmf)
> {
>
--
Regards,
Vincenzo
Hi Andrei,
On 2/25/20 7:37 AM, Andrei Vagin wrote:
> Allocate the time namespace page among VVAR pages and add the logic
> to handle faults on VVAR properly.
>
> If a task belongs to a time namespace then the VVAR page which contains
> the system wide VDSO data is replaced with a namespace specific page
> which has the same layout as the VVAR page. That page has vdso_data->seq
> set to 1 to enforce the slow path and vdso_data->clock_mode set to
> VCLOCK_TIMENS to enforce the time namespace handling path.
>
> The extra check in the case that vdso_data->seq is odd, e.g. a concurrent
> update of the VDSO data is in progress, is not really affecting regular
> tasks which are not part of a time namespace as the task is spin waiting
> for the update to finish and vdso_data->seq to become even again.
>
> If a time namespace task hits that code path, it invokes the corresponding
> time getter function which retrieves the real VVAR page, reads host time
> and then adds the offset for the requested clock which is stored in the
> special VVAR page.
>
> v2: Code cleanups suggested by Vincenzo.
>
Sorry for the delay, I completed this morning the review of your patches and I
do not have anymore comments on them. Thank you for making the effort and
bringing the time namespace support to arm64.
I have though a question on something I encountered during the testing of the
patches: I noticed that all the tests related to CLOCK_BOOTTIME_ALARM fail on
arm64 (please find the results below the scissors). Is this expected?
--->8---
1..4
ok 1 clockid: 1 abs:0
ok 2 clockid: 1 abs:1
not ok 3 # error clock_gettime: Invalid argument
not ok 4 # error clock_gettime: Invalid argument
Bail out!
# Pass 2 Fail 0 Xfail 0 Xpass 0 Skip 0 Error 2
1..1
ok 1 exec
# Pass 1 Fail 0 Xfail 0 Xpass 0 Skip 0 Error 0
1..8
not ok 1 # error Warning: failed to find clock_gettime in vDSO
./timens.sh: line 5: 15382 Segmentation fault (core dumped) ./gettime_perf
1..1
ok 1 Passed for /proc/uptime
# Pass 1 Fail 0 Xfail 0 Xpass 0 Skip 0 Error 0
1..10
ok 1 Passed for CLOCK_BOOTTIME (syscall)
ok 2 Passed for CLOCK_BOOTTIME (vdso)
not ok 3 # error syscall(SYS_clock_gettime(9)): Invalid argument
not ok 4 # error clock_gettime(9): Invalid argument
ok 5 Passed for CLOCK_MONOTONIC (syscall)
ok 6 Passed for CLOCK_MONOTONIC (vdso)
ok 7 Passed for CLOCK_MONOTONIC_COARSE (syscall)
ok 8 Passed for CLOCK_MONOTONIC_COARSE (vdso)
ok 9 Passed for CLOCK_MONOTONIC_RAW (syscall)
ok 10 Passed for CLOCK_MONOTONIC_RAW (vdso)
Bail out!
# Pass 8 Fail 0 Xfail 0 Xpass 0 Skip 0 Error 2
1..3
ok 1 clockid=7
ok 2 clockid=1
not ok 3 # error timerfd_create: Operation not supported
Bail out!
# Pass 2 Fail 0 Xfail 0 Xpass 0 Skip 0 Error 1
1..3
ok 1 clockid=7
ok 2 clockid=1
ok 3 clockid=9
# Pass 3 Fail 0 Xfail 0 Xpass 0 Skip 0 Error 0
[...]
--
Regards,
Vincenzo
On Thu, Apr 9, 2020 at 6:23 AM Vincenzo Frascino
<[email protected]> wrote:
>
> Hi Andrei,
>
> On 2/25/20 7:37 AM, Andrei Vagin wrote:
> > Allocate the time namespace page among VVAR pages and add the logic
> > to handle faults on VVAR properly.
> >
> > If a task belongs to a time namespace then the VVAR page which contains
> > the system wide VDSO data is replaced with a namespace specific page
> > which has the same layout as the VVAR page. That page has vdso_data->seq
> > set to 1 to enforce the slow path and vdso_data->clock_mode set to
> > VCLOCK_TIMENS to enforce the time namespace handling path.
> >
> > The extra check in the case that vdso_data->seq is odd, e.g. a concurrent
> > update of the VDSO data is in progress, is not really affecting regular
> > tasks which are not part of a time namespace as the task is spin waiting
> > for the update to finish and vdso_data->seq to become even again.
> >
> > If a time namespace task hits that code path, it invokes the corresponding
> > time getter function which retrieves the real VVAR page, reads host time
> > and then adds the offset for the requested clock which is stored in the
> > special VVAR page.
> >
> > v2: Code cleanups suggested by Vincenzo.
> >
>
> Sorry for the delay, I completed this morning the review of your patches and I
> do not have anymore comments on them. Thank you for making the effort and
> bringing the time namespace support to arm64.
Thank you for the review of these patches.
>
> I have though a question on something I encountered during the testing of the
> patches: I noticed that all the tests related to CLOCK_BOOTTIME_ALARM fail on
> arm64 (please find the results below the scissors). Is this expected?
static int alarm_clock_get_timespec(clockid_t which_clock, struct
timespec64 *tp)
{
struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
if (!alarmtimer_get_rtcdev())
return -EINVAL;
It is probably that you get EINVAL from here ^^^. I will send a
separate patch to handle this case in tests properly.
Thanks,
Andrei
Hi Andrei,
On 4/11/20 8:33 AM, Andrei Vagin wrote:
> On Thu, Apr 9, 2020 at 6:23 AM Vincenzo Frascino
> <[email protected]> wrote:
>>
>> Hi Andrei,
>>
[...]
>> Sorry for the delay, I completed this morning the review of your patches and I
>> do not have anymore comments on them. Thank you for making the effort and
>> bringing the time namespace support to arm64.
>
> Thank you for the review of these patches.
>
>>
>> I have though a question on something I encountered during the testing of the
>> patches: I noticed that all the tests related to CLOCK_BOOTTIME_ALARM fail on
>> arm64 (please find the results below the scissors). Is this expected?
>
> static int alarm_clock_get_timespec(clockid_t which_clock, struct
> timespec64 *tp)
> {
> struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
>
> if (!alarmtimer_get_rtcdev())
> return -EINVAL;
>
> It is probably that you get EINVAL from here ^^^. I will send a
> separate patch to handle this case in tests properly.
>
This makes sense :) Please let me know when you post the fix so I can test it again.
Are you planning as well to rebase this set?
> Thanks,
> Andrei
>
--
Regards,
Vincenzo
On Tue, Apr 14, 2020 at 2:02 AM Vincenzo Frascino
<[email protected]> wrote:
>
> Hi Andrei,
>
> On 4/11/20 8:33 AM, Andrei Vagin wrote:
> > On Thu, Apr 9, 2020 at 6:23 AM Vincenzo Frascino
> > <[email protected]> wrote:
> >>
> >> I have though a question on something I encountered during the testing of the
> >> patches: I noticed that all the tests related to CLOCK_BOOTTIME_ALARM fail on
> >> arm64 (please find the results below the scissors). Is this expected?
> >
> > static int alarm_clock_get_timespec(clockid_t which_clock, struct
> > timespec64 *tp)
> > {
> > struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
> >
> > if (!alarmtimer_get_rtcdev())
> > return -EINVAL;
> >
> > It is probably that you get EINVAL from here ^^^. I will send a
> > separate patch to handle this case in tests properly.
> >
>
> This makes sense :) Please let me know when you post the fix so I can test it again.
I have sent this fix: https://lkml.org/lkml/2020/4/15/72
>
> Are you planning as well to rebase this set?
What is the right tree to rebase on?
Thanks,
Andrei
Hi Andrei,
On 4/15/20 5:14 PM, Andrei Vagin wrote:
> On Tue, Apr 14, 2020 at 2:02 AM Vincenzo Frascino
> <[email protected]> wrote:
>>
>> Hi Andrei,
>>
>> On 4/11/20 8:33 AM, Andrei Vagin wrote:
>>> On Thu, Apr 9, 2020 at 6:23 AM Vincenzo Frascino
>>> <[email protected]> wrote:
>>>>
>>>> I have though a question on something I encountered during the testing of the
>>>> patches: I noticed that all the tests related to CLOCK_BOOTTIME_ALARM fail on
>>>> arm64 (please find the results below the scissors). Is this expected?
>>>
>>> static int alarm_clock_get_timespec(clockid_t which_clock, struct
>>> timespec64 *tp)
>>> {
>>> struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
>>>
>>> if (!alarmtimer_get_rtcdev())
>>> return -EINVAL;
>>>
>>> It is probably that you get EINVAL from here ^^^. I will send a
>>> separate patch to handle this case in tests properly.
>>>
>>
>> This makes sense :) Please let me know when you post the fix so I can test it again.
>
> I have sent this fix: https://lkml.org/lkml/2020/4/15/72
>
That's good, I will try it by the end of this week or beginning of next and let
you know the results.
>>
>> Are you planning as well to rebase this set?>
> What is the right tree to rebase on?
>
I guess master, I was asking because it would make easier my testing :)
> Thanks,
> Andrei
>
--
Regards,
Vincenzo