Some user space applications need to read a couple of different clocks.
Each read requires moving from user space to kernel space.
Reading each clock separately (syscall) introduces extra
unpredictable/unmeasurable delay. Minimizing this delay contributes to user
space actions on these clocks (e.g. synchronization etc).
Introduce a new system call clock_compare, which can be used to measure
the offset between two clocks, from variety of types: PHC, virtual PHC
and various system clocks (CLOCK_REALTIME, CLOCK_MONOTONIC, etc).
The system call returns the clocks timestamps.
When possible, use crosstimespec to sync read values.
Else, read clock A twice (before, and after reading clock B) and average these
times – to be as close as possible to the time we read clock B.
Signed-off-by: Sagi Maimon <[email protected]>
---
Addressed comments from:
- Arnd Bergman : https://www.spinics.net/lists/netdev/msg980859.html
Changes since version 6:
- cheaper implantation regarding timespec64 operations.
arch/x86/entry/syscalls/syscall_64.tbl | 1 +
drivers/ptp/ptp_clock.c | 34 ++++--
include/linux/posix-clock.h | 2 +
include/linux/syscalls.h | 4 +
include/uapi/asm-generic/unistd.h | 5 +-
kernel/time/posix-clock.c | 25 +++++
kernel/time/posix-timers.c | 138 +++++++++++++++++++++++++
kernel/time/posix-timers.h | 2 +
8 files changed, 200 insertions(+), 11 deletions(-)
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 7e8d46f4147f..727930d27e05 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -383,6 +383,7 @@
459 common lsm_get_self_attr sys_lsm_get_self_attr
460 common lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules
+462 common clock_compare sys_clock_compare
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c
index 15b804ba4868..37ce66d4159f 100644
--- a/drivers/ptp/ptp_clock.c
+++ b/drivers/ptp/ptp_clock.c
@@ -156,17 +156,31 @@ static int ptp_clock_adjtime(struct posix_clock *pc, struct __kernel_timex *tx)
return err;
}
+static int ptp_clock_getcrosstime(struct posix_clock *pc, struct system_device_crosststamp *xtstamp)
+{
+ struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock);
+ int err;
+
+ if (!ptp->info->getcrosststamp)
+ err = -EOPNOTSUPP;
+ else
+ err = ptp->info->getcrosststamp(ptp->info, xtstamp);
+
+ return err;
+}
+
static struct posix_clock_operations ptp_clock_ops = {
- .owner = THIS_MODULE,
- .clock_adjtime = ptp_clock_adjtime,
- .clock_gettime = ptp_clock_gettime,
- .clock_getres = ptp_clock_getres,
- .clock_settime = ptp_clock_settime,
- .ioctl = ptp_ioctl,
- .open = ptp_open,
- .release = ptp_release,
- .poll = ptp_poll,
- .read = ptp_read,
+ .owner = THIS_MODULE,
+ .clock_adjtime = ptp_clock_adjtime,
+ .clock_gettime = ptp_clock_gettime,
+ .clock_getres = ptp_clock_getres,
+ .clock_settime = ptp_clock_settime,
+ .clock_getcrosstime = ptp_clock_getcrosstime,
+ .ioctl = ptp_ioctl,
+ .open = ptp_open,
+ .release = ptp_release,
+ .poll = ptp_poll,
+ .read = ptp_read,
};
static void ptp_clock_release(struct device *dev)
diff --git a/include/linux/posix-clock.h b/include/linux/posix-clock.h
index ef8619f48920..3a5b4bb3f56b 100644
--- a/include/linux/posix-clock.h
+++ b/include/linux/posix-clock.h
@@ -47,6 +47,8 @@ struct posix_clock_operations {
int (*clock_settime)(struct posix_clock *pc,
const struct timespec64 *ts);
+ int (*clock_getcrosstime)(struct posix_clock *pc,
+ struct system_device_crosststamp *xtstamp);
/*
* Optional character device methods:
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 77eb9b0e7685..47c5de3bdb18 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1188,6 +1188,10 @@ asmlinkage long sys_ni_syscall(void);
asmlinkage long sys_ni_posix_timers(void);
+asmlinkage long clock_compare(const clockid_t clock_a, const clockid_t clock_b,
+ struct __kernel_timespec __user *tp_a,
+ struct __kernel_timespec __user *tp_b,
+ s64 __user *offs_err);
/*
* Kernel code should not call syscalls (i.e., sys_xyzyyz()) directly.
* Instead, use one of the functions which work equivalently, such as
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 75f00965ab15..537a35afd237 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -842,8 +842,11 @@ __SYSCALL(__NR_lsm_set_self_attr, sys_lsm_set_self_attr)
#define __NR_lsm_list_modules 461
__SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules)
+#define __NR_clock_compare 462
+__SYSCALL(__NR_clock_compare, sys_clock_compare)
+
#undef __NR_syscalls
-#define __NR_syscalls 462
+#define __NR_syscalls 463
/*
* 32 bit systems traditionally used different
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 9de66bbbb3d1..68b2d6741036 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -327,9 +327,34 @@ static int pc_clock_settime(clockid_t id, const struct timespec64 *ts)
return err;
}
+static int pc_clock_get_crosstime(clockid_t id, struct system_device_crosststamp *xtstamp)
+{
+ struct posix_clock_desc cd;
+ int err;
+
+ err = get_clock_desc(id, &cd);
+ if (err)
+ return err;
+
+ if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
+ err = -EACCES;
+ goto out;
+ }
+
+ if (cd.clk->ops.clock_getcrosstime)
+ err = cd.clk->ops.clock_getcrosstime(cd.clk, xtstamp);
+ else
+ err = -EOPNOTSUPP;
+out:
+ put_clock_desc(&cd);
+
+ return err;
+}
+
const struct k_clock clock_posix_dynamic = {
.clock_getres = pc_clock_getres,
.clock_set = pc_clock_settime,
.clock_get_timespec = pc_clock_gettime,
.clock_adj = pc_clock_adjtime,
+ .clock_get_crosstimespec = pc_clock_get_crosstime,
};
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index b924f0f096fa..ea43527cd5e9 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1426,6 +1426,144 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
#endif
+/**
+ * clock_compare - Get couple of clocks time stamps
+ * @clock_a: clock a ID
+ * @clock_b: clock b ID
+ * @tp_a: Pointer to a user space timespec64 for clock a storage
+ * @tp_b: Pointer to a user space timespec64 for clock b storage
+ *
+ * clock_compare gets time sample of two clocks.
+ * Supported clocks IDs: PHC, virtual PHC and various system clocks.
+ *
+ * In case of PHC that supports crosstimespec and the other clock is Monotonic raw
+ * or system time, crosstimespec will be used to synchronously capture
+ * system/device time stamp.
+ *
+ * In other cases: Read clock_a twice (before, and after reading clock_b) and
+ * average these times – to be as close as possible to the time we read clock_b.
+ *
+ * Returns:
+ * 0 Success. @tp_a and @tp_b contains the time stamps
+ * -EINVAL @clock a or b ID is not a valid clock ID
+ * -EFAULT Copying the time stamp to @tp_a or @tp_b faulted
+ * -EOPNOTSUPP Dynamic POSIX clock does not support crosstimespec()
+ **/
+SYSCALL_DEFINE5(clock_compare, const clockid_t, clock_a, const clockid_t, clock_b,
+ struct __kernel_timespec __user *, tp_a, struct __kernel_timespec __user *,
+ tp_b, s64 __user *, offs_err)
+{
+ struct timespec64 ts_a, ts_a1, ts_b, ts_a2;
+ struct system_device_crosststamp xtstamp_a1, xtstamp_a2, xtstamp_b;
+ const struct k_clock *kc_a, *kc_b;
+ ktime_t ktime_a;
+ s64 ts_offs_err = 0;
+ int error = 0;
+ bool crosstime_support_a = false;
+ bool crosstime_support_b = false;
+
+ kc_a = clockid_to_kclock(clock_a);
+ if (!kc_a) {
+ error = -EINVAL;
+ return error;
+ }
+
+ kc_b = clockid_to_kclock(clock_b);
+ if (!kc_b) {
+ error = -EINVAL;
+ return error;
+ }
+
+ // In case crosstimespec supported and b clock is Monotonic raw or system
+ // time, synchronously capture system/device time stamp
+ if (clock_a < 0) {
+ error = kc_a->clock_get_crosstimespec(clock_a, &xtstamp_a1);
+ if (!error) {
+ if (clock_b == CLOCK_MONOTONIC_RAW) {
+ ts_b = ktime_to_timespec64(xtstamp_a1.sys_monoraw);
+ ts_a1 = ktime_to_timespec64(xtstamp_a1.device);
+ goto out;
+ } else if (clock_b == CLOCK_REALTIME) {
+ ts_b = ktime_to_timespec64(xtstamp_a1.sys_realtime);
+ ts_a1 = ktime_to_timespec64(xtstamp_a1.device);
+ goto out;
+ } else {
+ crosstime_support_a = true;
+ }
+ }
+ }
+
+ // In case crosstimespec supported and a clock is Monotonic raw or system
+ // time, synchronously capture system/device time stamp
+ if (clock_b < 0) {
+ // Synchronously capture system/device time stamp
+ error = kc_b->clock_get_crosstimespec(clock_b, &xtstamp_b);
+ if (!error) {
+ if (clock_a == CLOCK_MONOTONIC_RAW) {
+ ts_a1 = ktime_to_timespec64(xtstamp_b.sys_monoraw);
+ ts_b = ktime_to_timespec64(xtstamp_b.device);
+ goto out;
+ } else if (clock_a == CLOCK_REALTIME) {
+ ts_a1 = ktime_to_timespec64(xtstamp_b.sys_realtime);
+ ts_b = ktime_to_timespec64(xtstamp_b.device);
+ goto out;
+ } else {
+ crosstime_support_b = true;
+ }
+ }
+ }
+
+ if (crosstime_support_a)
+ error = kc_a->clock_get_crosstimespec(clock_a, &xtstamp_a1);
+ else
+ error = kc_a->clock_get_timespec(clock_a, &ts_a1);
+
+ if (error)
+ return error;
+
+ if (crosstime_support_b)
+ error = kc_b->clock_get_crosstimespec(clock_b, &xtstamp_b);
+ else
+ error = kc_b->clock_get_timespec(clock_b, &ts_b);
+
+ if (error)
+ return error;
+
+ if (crosstime_support_a)
+ error = kc_a->clock_get_crosstimespec(clock_a, &xtstamp_a2);
+ else
+ error = kc_a->clock_get_timespec(clock_a, &ts_a2);
+
+ if (error)
+ return error;
+
+ if (crosstime_support_a) {
+ ktime_a = ktime_sub(xtstamp_a2.device, xtstamp_a1.device);
+ ts_offs_err = ktime_divns(ktime_a, 2);
+ ktime_a = ktime_add_ns(xtstamp_a1.device, (u64)ts_offs_err);
+ ts_a1 = ktime_to_timespec64(ktime_a);
+ } else {
+ ts_a = timespec64_sub(ts_a2, ts_a1);
+ ktime_a = timespec64_to_ktime(ts_a);
+ ts_offs_err = ktime_divns(ktime_a, 2);
+ timespec64_add_ns(&ts_a1, (u64)ts_offs_err);
+ }
+
+ if (crosstime_support_b)
+ ts_b = ktime_to_timespec64(xtstamp_b.device);
+out:
+ if (put_timespec64(&ts_a1, tp_a))
+ error = -EFAULT;
+
+ if (!error && put_timespec64(&ts_b, tp_b))
+ error = -EFAULT;
+
+ if (!error && copy_to_user(offs_err, &ts_offs_err, sizeof(ts_offs_err)))
+ error = -EFAULT;
+
+ return error;
+}
+
static const struct k_clock clock_realtime = {
.clock_getres = posix_get_hrtimer_res,
.clock_get_timespec = posix_get_realtime_timespec,
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index f32a2ebba9b8..b1f6075f35bb 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -11,6 +11,8 @@ struct k_clock {
struct timespec64 *tp);
/* Returns the clock value in the root time namespace. */
ktime_t (*clock_get_ktime)(const clockid_t which_clock);
+ int (*clock_get_crosstimespec)(const clockid_t which_clock,
+ struct system_device_crosststamp *xtstamp);
int (*clock_adj)(const clockid_t which_clock, struct __kernel_timex *tx);
int (*timer_create)(struct k_itimer *timer);
int (*nsleep)(const clockid_t which_clock, int flags,
--
2.26.3
On Thu, Mar 14 2024 at 11:05, Sagi Maimon wrote:
> Some user space applications need to read a couple of different clocks.
> Each read requires moving from user space to kernel space.
> Reading each clock separately (syscall) introduces extra
> unpredictable/unmeasurable delay. Minimizing this delay contributes to user
> space actions on these clocks (e.g. synchronization etc).
I asked for a proper description of the actual problem several times now
and you still provide some handwaving blurb. Feel free to ignore me, but
then please don't be surprised if I ignore you too.
Also why does reading two random clocks make any sense at all? Your code
allows to read CLOCK_TAI and CLOCK_THREAD_CPUTIME_ID. What for?
This is about PTP, no?
Again you completely fail to explain why the existing PTP ioctls are not
sufficient for the purpose and why you need to provide a new interface
which is completely ill defined.
> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
> drivers/ptp/ptp_clock.c | 34 ++++--
> include/linux/posix-clock.h | 2 +
> include/linux/syscalls.h | 4 +
> include/uapi/asm-generic/unistd.h | 5 +-
> kernel/time/posix-clock.c | 25 +++++
> kernel/time/posix-timers.c | 138 +++++++++++++++++++++++++
> kernel/time/posix-timers.h | 2 +
This needs to be split up into:
1) Infrastructure in posix-timers.c
2) Wire up the syscall in x86
3) Infrastructure in posix-clock.c
4) Usage in ptp_clock.c
and not as a big lump of everything.
> +/**
> + * clock_compare - Get couple of clocks time stamps
So the name of the syscall suggest that it compares two clocks, but the
actual functionality is to read two clocks.
This does not make any sense at all. Naming matters.
> + * @clock_a: clock a ID
> + * @clock_b: clock b ID
> + * @tp_a: Pointer to a user space timespec64 for clock a storage
> + * @tp_b: Pointer to a user space timespec64 for clock b storage
> + *
> + * clock_compare gets time sample of two clocks.
> + * Supported clocks IDs: PHC, virtual PHC and various system clocks.
> + *
> + * In case of PHC that supports crosstimespec and the other clock is Monotonic raw
> + * or system time, crosstimespec will be used to synchronously capture
> + * system/device time stamp.
> + *
> + * In other cases: Read clock_a twice (before, and after reading clock_b) and
> + * average these times – to be as close as possible to the time we read clock_b.
> + *
> + * Returns:
> + * 0 Success. @tp_a and @tp_b contains the time stamps
> + * -EINVAL @clock a or b ID is not a valid clock ID
> + * -EFAULT Copying the time stamp to @tp_a or @tp_b faulted
> + * -EOPNOTSUPP Dynamic POSIX clock does not support crosstimespec()
> + **/
> +SYSCALL_DEFINE5(clock_compare, const clockid_t, clock_a, const clockid_t, clock_b,
> + struct __kernel_timespec __user *, tp_a, struct __kernel_timespec __user *,
> + tp_b, s64 __user *, offs_err)
> +{
> + struct timespec64 ts_a, ts_a1, ts_b, ts_a2;
> + struct system_device_crosststamp xtstamp_a1, xtstamp_a2, xtstamp_b;
> + const struct k_clock *kc_a, *kc_b;
> + ktime_t ktime_a;
> + s64 ts_offs_err = 0;
> + int error = 0;
> + bool crosstime_support_a = false;
> + bool crosstime_support_b = false;
Please read and follow the documentation provided at:
https://www.kernel.org/doc/html/latest/process/maintainer-tip.html
> + kc_a = clockid_to_kclock(clock_a);
> + if (!kc_a) {
> + error = -EINVAL;
> + return error;
What's wrong about 'return -EINVAL;' ?
> + }
> +
> + kc_b = clockid_to_kclock(clock_b);
> + if (!kc_b) {
> + error = -EINVAL;
> + return error;
> + }
> +
> + // In case crosstimespec supported and b clock is Monotonic raw or system
> + // time, synchronously capture system/device time stamp
Please don't use C++ comments.
> + if (clock_a < 0) {
This is just wrong. posix-clocks ar not the only ones which have a
negative clock id. See clockid_to_kclock()
> + error = kc_a->clock_get_crosstimespec(clock_a, &xtstamp_a1);
What's a crosstimespec?
This function fills in a system_device_crosststamp, so why not name it
so that the purpose of the function is obvious?
ptp_clock::info::getcrosststamp() has a reasonable name. So why do you
need to come up with something which makes the code obfuscated?
> + if (!error) {
> + if (clock_b == CLOCK_MONOTONIC_RAW) {
> + ts_b = ktime_to_timespec64(xtstamp_a1.sys_monoraw);
> + ts_a1 = ktime_to_timespec64(xtstamp_a1.device);
> + goto out;
> + } else if (clock_b == CLOCK_REALTIME) {
> + ts_b = ktime_to_timespec64(xtstamp_a1.sys_realtime);
> + ts_a1 = ktime_to_timespec64(xtstamp_a1.device);
> + goto out;
> + } else {
> + crosstime_support_a = true;
Right. If clock_b is anything else than CLOCK_MONOTONIC_RAW or
CLOCK_REALTIME then this is true.
> + }
> + }
So in case of an error, this just keeps going with an uninitialized
xtstamp_a1 and if the clock_b part succeeds it continues and operates on
garbage.
> + }
> +
> + // In case crosstimespec supported and a clock is Monotonic raw or system
> + // time, synchronously capture system/device time stamp
> + if (clock_b < 0) {
> + // Synchronously capture system/device time stamp
> + error = kc_b->clock_get_crosstimespec(clock_b, &xtstamp_b);
> + if (!error) {
> + if (clock_a == CLOCK_MONOTONIC_RAW) {
> + ts_a1 = ktime_to_timespec64(xtstamp_b.sys_monoraw);
> + ts_b = ktime_to_timespec64(xtstamp_b.device);
> + goto out;
> + } else if (clock_a == CLOCK_REALTIME) {
> + ts_a1 = ktime_to_timespec64(xtstamp_b.sys_realtime);
> + ts_b = ktime_to_timespec64(xtstamp_b.device);
> + goto out;
> + } else {
> + crosstime_support_b = true;
> + }
> + }
> + }
> +
> + if (crosstime_support_a)
> + error = kc_a->clock_get_crosstimespec(clock_a, &xtstamp_a1);
What? crosstime_support_a is only true when the exactly same call
returned success. Why does it need to be called here again?
> + else
> + error = kc_a->clock_get_timespec(clock_a, &ts_a1);
> +
> + if (error)
> + return error;
> +
> + if (crosstime_support_b)
> + error = kc_b->clock_get_crosstimespec(clock_b, &xtstamp_b);
> + else
> + error = kc_b->clock_get_timespec(clock_b, &ts_b);
> +
> + if (error)
> + return error;
> +
> + if (crosstime_support_a)
> + error = kc_a->clock_get_crosstimespec(clock_a, &xtstamp_a2);
> + else
> + error = kc_a->clock_get_timespec(clock_a, &ts_a2);
> +
> + if (error)
> + return error;
The logic and the code flow here are unreadable garbage and there are
zero comments what this is supposed to do.
> + if (crosstime_support_a) {
> + ktime_a = ktime_sub(xtstamp_a2.device, xtstamp_a1.device);
> + ts_offs_err = ktime_divns(ktime_a, 2);
> + ktime_a = ktime_add_ns(xtstamp_a1.device, (u64)ts_offs_err);
> + ts_a1 = ktime_to_timespec64(ktime_a);
This is just wrong.
read(a1);
read(b);
read(a2);
You _CANNOT_ assume that (a1 + ((a2 - a1) / 2) is anywhere close to the
point in time where 'b' is read. This code is preemtible and
interruptible. I explained this to you before.
Your explanation in the comment above the function is just wishful
thinking.
> + * In other cases: Read clock_a twice (before, and after reading clock_b) and
> + * average these times – to be as close as possible to the time we read clock_b.
Can you please sit down and provide a precise technical description of
the problem you are trying to solve and explain your proposed solution
at the conceptual level instead of throwing out random implementations
every few days?
Thanks,
tglx
hi Thomas
On Thu, Mar 14, 2024 at 1:12 PM Thomas Gleixner <[email protected]> wrote:
>
> On Thu, Mar 14 2024 at 11:05, Sagi Maimon wrote:
> > Some user space applications need to read a couple of different clocks.
> > Each read requires moving from user space to kernel space.
> > Reading each clock separately (syscall) introduces extra
> > unpredictable/unmeasurable delay. Minimizing this delay contributes to user
> > space actions on these clocks (e.g. synchronization etc).
>
> I asked for a proper description of the actual problem several times now
> and you still provide some handwaving blurb. Feel free to ignore me, but
> then please don't be surprised if I ignore you too.
>
Nobody is ignoring your notes, and I address any notes given by any
maintainer in the most serious way.
As far as explaining the actual problem this is the best that I can,
but let me try to explain better:
We did many tests with different CPU loading and compared sampling the
same clock twice,
once in user space and once by using the system call.
We have noticed an improvement up to hundreds of nanoseconds while
using the system call.
Those results improved our ability to sync different PHCs
> Also why does reading two random clocks make any sense at all? Your code
> allows to read CLOCK_TAI and CLOCK_THREAD_CPUTIME_ID. What for?
>
Initially we needed to sync some different PHCs for our user space
application, that is why we came with this idea.
The first idea was an IOCTL that returned samples of several PHCs for
the need of synchronization.
Richard Cochran suggested a system call instead, which will add the
ability to get various system clocks, while this
implementation is more complex then IOCTL, I think that he was right,
for future usage.
> This is about PTP, no?
>
yes
> Again you completely fail to explain why the existing PTP ioctls are not
> sufficient for the purpose and why you need to provide a new interface
> which is completely ill defined.
>
The same as my answer above .....
> > arch/x86/entry/syscalls/syscall_64.tbl | 1 +
> > drivers/ptp/ptp_clock.c | 34 ++++--
> > include/linux/posix-clock.h | 2 +
> > include/linux/syscalls.h | 4 +
> > include/uapi/asm-generic/unistd.h | 5 +-
> > kernel/time/posix-clock.c | 25 +++++
> > kernel/time/posix-timers.c | 138 +++++++++++++++++++++++++
> > kernel/time/posix-timers.h | 2 +
>
> This needs to be split up into:
>
> 1) Infrastructure in posix-timers.c
> 2) Wire up the syscall in x86
> 3) Infrastructure in posix-clock.c
> 4) Usage in ptp_clock.c
>
> and not as a big lump of everything.
>
I know, but I think the benefit worth it
I agree that an IOCTL won't require such a big changes in the kernel code
> > +/**
> > + * clock_compare - Get couple of clocks time stamps
>
> So the name of the syscall suggest that it compares two clocks, but the
> actual functionality is to read two clocks.
>
> This does not make any sense at all. Naming matters.
>
you are right the name was suggested by Richard when the main idea was
returning the offset between the clocks.
If you have a better name, please tell me
> > + * @clock_a: clock a ID
> > + * @clock_b: clock b ID
> > + * @tp_a: Pointer to a user space timespec64 for clock a storage
> > + * @tp_b: Pointer to a user space timespec64 for clock b storage
> > + *
> > + * clock_compare gets time sample of two clocks.
> > + * Supported clocks IDs: PHC, virtual PHC and various system clocks.
> > + *
> > + * In case of PHC that supports crosstimespec and the other clock is Monotonic raw
> > + * or system time, crosstimespec will be used to synchronously capture
> > + * system/device time stamp.
> > + *
> > + * In other cases: Read clock_a twice (before, and after reading clock_b) and
> > + * average these times – to be as close as possible to the time we read clock_b.
> > + *
> > + * Returns:
> > + * 0 Success. @tp_a and @tp_b contains the time stamps
> > + * -EINVAL @clock a or b ID is not a valid clock ID
> > + * -EFAULT Copying the time stamp to @tp_a or @tp_b faulted
> > + * -EOPNOTSUPP Dynamic POSIX clock does not support crosstimespec()
> > + **/
> > +SYSCALL_DEFINE5(clock_compare, const clockid_t, clock_a, const clockid_t, clock_b,
> > + struct __kernel_timespec __user *, tp_a, struct __kernel_timespec __user *,
> > + tp_b, s64 __user *, offs_err)
> > +{
> > + struct timespec64 ts_a, ts_a1, ts_b, ts_a2;
> > + struct system_device_crosststamp xtstamp_a1, xtstamp_a2, xtstamp_b;
> > + const struct k_clock *kc_a, *kc_b;
> > + ktime_t ktime_a;
> > + s64 ts_offs_err = 0;
> > + int error = 0;
> > + bool crosstime_support_a = false;
> > + bool crosstime_support_b = false;
>
> Please read and follow the documentation provided at:
>
> https://www.kernel.org/doc/html/latest/process/maintainer-tip.html
>
> > + kc_a = clockid_to_kclock(clock_a);
> > + if (!kc_a) {
> > + error = -EINVAL;
> > + return error;
>
> What's wrong about 'return -EINVAL;' ?
>
correct will be fixed on next patch
> > + }
> > +
> > + kc_b = clockid_to_kclock(clock_b);
> > + if (!kc_b) {
> > + error = -EINVAL;
> > + return error;
> > + }
> > +
> > + // In case crosstimespec supported and b clock is Monotonic raw or system
> > + // time, synchronously capture system/device time stamp
>
> Please don't use C++ comments.
>
will be fixed on next patch
> > + if (clock_a < 0) {
>
> This is just wrong. posix-clocks ar not the only ones which have a
> negative clock id. See clockid_to_kclock()
>
> > + error = kc_a->clock_get_crosstimespec(clock_a, &xtstamp_a1);
>
> What's a crosstimespec?
>
> This function fills in a system_device_crosststamp, so why not name it
> so that the purpose of the function is obvious?
>
correct , it will be fixed on next patch
> ptp_clock::info::getcrosststamp() has a reasonable name. So why do you
> need to come up with something which makes the code obfuscated?
>
correct , it will be fixed on next patch
> > + if (!error) {
> > + if (clock_b == CLOCK_MONOTONIC_RAW) {
> > + ts_b = ktime_to_timespec64(xtstamp_a1.sys_monoraw);
> > + ts_a1 = ktime_to_timespec64(xtstamp_a1.device);
> > + goto out;
> > + } else if (clock_b == CLOCK_REALTIME) {
> > + ts_b = ktime_to_timespec64(xtstamp_a1.sys_realtime);
> > + ts_a1 = ktime_to_timespec64(xtstamp_a1.device);
> > + goto out;
> > + } else {
> > + crosstime_support_a = true;
>
> Right. If clock_b is anything else than CLOCK_MONOTONIC_RAW or
> CLOCK_REALTIME then this is true.
>
> > + }
> > + }
>
> So in case of an error, this just keeps going with an uninitialized
> xtstamp_a1 and if the clock_b part succeeds it continues and operates on
> garbage.
>
On error xtstamp_a1 will be taken again using clock_get_crosstimespec
so no one will be operating on garbage.
Please explain the problem better, because I don't see it.
> > + }
> > +
> > + // In case crosstimespec supported and a clock is Monotonic raw or system
> > + // time, synchronously capture system/device time stamp
> > + if (clock_b < 0) {
> > + // Synchronously capture system/device time stamp
> > + error = kc_b->clock_get_crosstimespec(clock_b, &xtstamp_b);
> > + if (!error) {
> > + if (clock_a == CLOCK_MONOTONIC_RAW) {
> > + ts_a1 = ktime_to_timespec64(xtstamp_b.sys_monoraw);
> > + ts_b = ktime_to_timespec64(xtstamp_b.device);
> > + goto out;
> > + } else if (clock_a == CLOCK_REALTIME) {
> > + ts_a1 = ktime_to_timespec64(xtstamp_b.sys_realtime);
> > + ts_b = ktime_to_timespec64(xtstamp_b.device);
> > + goto out;
> > + } else {
> > + crosstime_support_b = true;
> > + }
> > + }
> > + }
> > +
> > + if (crosstime_support_a)
> > + error = kc_a->clock_get_crosstimespec(clock_a, &xtstamp_a1);
>
> What? crosstime_support_a is only true when the exactly same call
> returned success. Why does it need to be called here again?
>
I wanted to take the three samples as close as possible to each other.
minimum code between the calls, that is why the call is done again.
> > + else
> > + error = kc_a->clock_get_timespec(clock_a, &ts_a1);
> > +
> > + if (error)
> > + return error;
> > +
> > + if (crosstime_support_b)
> > + error = kc_b->clock_get_crosstimespec(clock_b, &xtstamp_b);
> > + else
> > + error = kc_b->clock_get_timespec(clock_b, &ts_b);
> > +
> > + if (error)
> > + return error;
> > +
> > + if (crosstime_support_a)
> > + error = kc_a->clock_get_crosstimespec(clock_a, &xtstamp_a2);
> > + else
> > + error = kc_a->clock_get_timespec(clock_a, &ts_a2);
> > +
> > + if (error)
> > + return error;
>
> The logic and the code flow here are unreadable garbage and there are
> zero comments what this is supposed to do.
>
I will add comments.
please no need to use negative words like "garbage" (not the first time),
please keep it professional and civilized.
> > + if (crosstime_support_a) {
> > + ktime_a = ktime_sub(xtstamp_a2.device, xtstamp_a1.device);
> > + ts_offs_err = ktime_divns(ktime_a, 2);
> > + ktime_a = ktime_add_ns(xtstamp_a1.device, (u64)ts_offs_err);
> > + ts_a1 = ktime_to_timespec64(ktime_a);
>
> This is just wrong.
>
> read(a1);
> read(b);
> read(a2);
>
> You _CANNOT_ assume that (a1 + ((a2 - a1) / 2) is anywhere close to the
> point in time where 'b' is read. This code is preemtible and
> interruptible. I explained this to you before.
>
> Your explanation in the comment above the function is just wishful
> thinking.
>
you explained it before, but still it is better then two consecutive
user space calls which are also preemptible
and the userspace to kernel context switch time is added.
> > + * In other cases: Read clock_a twice (before, and after reading clock_b) and
> > + * average these times – to be as close as possible to the time we read clock_b.
>
> Can you please sit down and provide a precise technical description of
> the problem you are trying to solve and explain your proposed solution
> at the conceptual level instead of throwing out random implementations
> every few days?
>
> Thanks,
>
> tglx
>
>
On Thu, Mar 14, 2024 at 1:12 PM Thomas Gleixner <[email protected]> wrote:
>
> On Thu, Mar 14 2024 at 11:05, Sagi Maimon wrote:
> > Some user space applications need to read a couple of different clocks.
> > Each read requires moving from user space to kernel space.
> > Reading each clock separately (syscall) introduces extra
> > unpredictable/unmeasurable delay. Minimizing this delay contributes to user
> > space actions on these clocks (e.g. synchronization etc).
>
> I asked for a proper description of the actual problem several times now
> and you still provide some handwaving blurb. Feel free to ignore me, but
> then please don't be surprised if I ignore you too.
>
> Also why does reading two random clocks make any sense at all? Your code
> allows to read CLOCK_TAI and CLOCK_THREAD_CPUTIME_ID. What for?
>
> This is about PTP, no?
>
> Again you completely fail to explain why the existing PTP ioctls are not
> sufficient for the purpose and why you need to provide a new interface
> which is completely ill defined.
>
> > arch/x86/entry/syscalls/syscall_64.tbl | 1 +
> > drivers/ptp/ptp_clock.c | 34 ++++--
> > include/linux/posix-clock.h | 2 +
> > include/linux/syscalls.h | 4 +
> > include/uapi/asm-generic/unistd.h | 5 +-
> > kernel/time/posix-clock.c | 25 +++++
> > kernel/time/posix-timers.c | 138 +++++++++++++++++++++++++
> > kernel/time/posix-timers.h | 2 +
>
> This needs to be split up into:
>
> 1) Infrastructure in posix-timers.c
> 2) Wire up the syscall in x86
> 3) Infrastructure in posix-clock.c
> 4) Usage in ptp_clock.c
>
> and not as a big lump of everything.
>
> > +/**
> > + * clock_compare - Get couple of clocks time stamps
>
> So the name of the syscall suggest that it compares two clocks, but the
> actual functionality is to read two clocks.
>
> This does not make any sense at all. Naming matters.
>
> > + * @clock_a: clock a ID
> > + * @clock_b: clock b ID
> > + * @tp_a: Pointer to a user space timespec64 for clock a storage
> > + * @tp_b: Pointer to a user space timespec64 for clock b storage
> > + *
> > + * clock_compare gets time sample of two clocks.
> > + * Supported clocks IDs: PHC, virtual PHC and various system clocks.
> > + *
> > + * In case of PHC that supports crosstimespec and the other clock is Monotonic raw
> > + * or system time, crosstimespec will be used to synchronously capture
> > + * system/device time stamp.
> > + *
> > + * In other cases: Read clock_a twice (before, and after reading clock_b) and
> > + * average these times – to be as close as possible to the time we read clock_b.
> > + *
> > + * Returns:
> > + * 0 Success. @tp_a and @tp_b contains the time stamps
> > + * -EINVAL @clock a or b ID is not a valid clock ID
> > + * -EFAULT Copying the time stamp to @tp_a or @tp_b faulted
> > + * -EOPNOTSUPP Dynamic POSIX clock does not support crosstimespec()
> > + **/
> > +SYSCALL_DEFINE5(clock_compare, const clockid_t, clock_a, const clockid_t, clock_b,
> > + struct __kernel_timespec __user *, tp_a, struct __kernel_timespec __user *,
> > + tp_b, s64 __user *, offs_err)
> > +{
> > + struct timespec64 ts_a, ts_a1, ts_b, ts_a2;
> > + struct system_device_crosststamp xtstamp_a1, xtstamp_a2, xtstamp_b;
> > + const struct k_clock *kc_a, *kc_b;
> > + ktime_t ktime_a;
> > + s64 ts_offs_err = 0;
> > + int error = 0;
> > + bool crosstime_support_a = false;
> > + bool crosstime_support_b = false;
>
> Please read and follow the documentation provided at:
>
> https://www.kernel.org/doc/html/latest/process/maintainer-tip.html
>
I have missed this part on prviews reply.
I have read the documentation above and I think that the variable
declarations at the beginning of a function is in reverse fir tree
order
meaning from big to small, but I guess that I am missing something,
can you please explain what is wrong with the variable declaration,
so I can fix it.
> > + kc_a = clockid_to_kclock(clock_a);
> > + if (!kc_a) {
> > + error = -EINVAL;
> > + return error;
>
> What's wrong about 'return -EINVAL;' ?
>
> > + }
> > +
> > + kc_b = clockid_to_kclock(clock_b);
> > + if (!kc_b) {
> > + error = -EINVAL;
> > + return error;
> > + }
> > +
> > + // In case crosstimespec supported and b clock is Monotonic raw or system
> > + // time, synchronously capture system/device time stamp
>
> Please don't use C++ comments.
>
> > + if (clock_a < 0) {
>
> This is just wrong. posix-clocks ar not the only ones which have a
> negative clock id. See clockid_to_kclock()
>
> > + error = kc_a->clock_get_crosstimespec(clock_a, &xtstamp_a1);
>
> What's a crosstimespec?
>
> This function fills in a system_device_crosststamp, so why not name it
> so that the purpose of the function is obvious?
>
> ptp_clock::info::getcrosststamp() has a reasonable name. So why do you
> need to come up with something which makes the code obfuscated?
>
> > + if (!error) {
> > + if (clock_b == CLOCK_MONOTONIC_RAW) {
> > + ts_b = ktime_to_timespec64(xtstamp_a1.sys_monoraw);
> > + ts_a1 = ktime_to_timespec64(xtstamp_a1.device);
> > + goto out;
> > + } else if (clock_b == CLOCK_REALTIME) {
> > + ts_b = ktime_to_timespec64(xtstamp_a1.sys_realtime);
> > + ts_a1 = ktime_to_timespec64(xtstamp_a1.device);
> > + goto out;
> > + } else {
> > + crosstime_support_a = true;
>
> Right. If clock_b is anything else than CLOCK_MONOTONIC_RAW or
> CLOCK_REALTIME then this is true.
>
> > + }
> > + }
>
> So in case of an error, this just keeps going with an uninitialized
> xtstamp_a1 and if the clock_b part succeeds it continues and operates on
> garbage.
>
> > + }
> > +
> > + // In case crosstimespec supported and a clock is Monotonic raw or system
> > + // time, synchronously capture system/device time stamp
> > + if (clock_b < 0) {
> > + // Synchronously capture system/device time stamp
> > + error = kc_b->clock_get_crosstimespec(clock_b, &xtstamp_b);
> > + if (!error) {
> > + if (clock_a == CLOCK_MONOTONIC_RAW) {
> > + ts_a1 = ktime_to_timespec64(xtstamp_b.sys_monoraw);
> > + ts_b = ktime_to_timespec64(xtstamp_b.device);
> > + goto out;
> > + } else if (clock_a == CLOCK_REALTIME) {
> > + ts_a1 = ktime_to_timespec64(xtstamp_b.sys_realtime);
> > + ts_b = ktime_to_timespec64(xtstamp_b.device);
> > + goto out;
> > + } else {
> > + crosstime_support_b = true;
> > + }
> > + }
> > + }
> > +
> > + if (crosstime_support_a)
> > + error = kc_a->clock_get_crosstimespec(clock_a, &xtstamp_a1);
>
> What? crosstime_support_a is only true when the exactly same call
> returned success. Why does it need to be called here again?
>
> > + else
> > + error = kc_a->clock_get_timespec(clock_a, &ts_a1);
> > +
> > + if (error)
> > + return error;
> > +
> > + if (crosstime_support_b)
> > + error = kc_b->clock_get_crosstimespec(clock_b, &xtstamp_b);
> > + else
> > + error = kc_b->clock_get_timespec(clock_b, &ts_b);
> > +
> > + if (error)
> > + return error;
> > +
> > + if (crosstime_support_a)
> > + error = kc_a->clock_get_crosstimespec(clock_a, &xtstamp_a2);
> > + else
> > + error = kc_a->clock_get_timespec(clock_a, &ts_a2);
> > +
> > + if (error)
> > + return error;
>
> The logic and the code flow here are unreadable garbage and there are
> zero comments what this is supposed to do.
>
> > + if (crosstime_support_a) {
> > + ktime_a = ktime_sub(xtstamp_a2.device, xtstamp_a1.device);
> > + ts_offs_err = ktime_divns(ktime_a, 2);
> > + ktime_a = ktime_add_ns(xtstamp_a1.device, (u64)ts_offs_err);
> > + ts_a1 = ktime_to_timespec64(ktime_a);
>
> This is just wrong.
>
> read(a1);
> read(b);
> read(a2);
>
> You _CANNOT_ assume that (a1 + ((a2 - a1) / 2) is anywhere close to the
> point in time where 'b' is read. This code is preemtible and
> interruptible. I explained this to you before.
>
> Your explanation in the comment above the function is just wishful
> thinking.
>
> > + * In other cases: Read clock_a twice (before, and after reading clock_b) and
> > + * average these times – to be as close as possible to the time we read clock_b.
>
> Can you please sit down and provide a precise technical description of
> the problem you are trying to solve and explain your proposed solution
> at the conceptual level instead of throwing out random implementations
> every few days?
>
> Thanks,
>
> tglx
>
>
On Thu, Mar 14, 2024 at 02:19:39PM +0200, Sagi Maimon wrote:
> On Thu, Mar 14, 2024 at 1:12 PM Thomas Gleixner <[email protected]> wrote:
> > On Thu, Mar 14 2024 at 11:05, Sagi Maimon wrote:
> > > + if (crosstime_support_a) {
> > > + ktime_a = ktime_sub(xtstamp_a2.device, xtstamp_a1.device);
> > > + ts_offs_err = ktime_divns(ktime_a, 2);
> > > + ktime_a = ktime_add_ns(xtstamp_a1.device, (u64)ts_offs_err);
> > > + ts_a1 = ktime_to_timespec64(ktime_a);
> >
> > This is just wrong.
> >
> > read(a1);
> > read(b);
> > read(a2);
> >
> > You _CANNOT_ assume that (a1 + ((a2 - a1) / 2) is anywhere close to the
> > point in time where 'b' is read. This code is preemtible and
> > interruptible. I explained this to you before.
> >
> > Your explanation in the comment above the function is just wishful
> > thinking.
> >
> you explained it before, but still it is better then two consecutive
> user space calls which are also preemptible
> and the userspace to kernel context switch time is added.
How much "better" is that in reality?
The time for a user<->kernel transition should be trivial relative to the time
a task spends not running after having been preempted.
Either:
(a) Your userspace application can handle the arbitrary delta resulting from a
preemption, in which case the trivial cost shouldn't matter.
i.e. this patch *is not necessary* to solve your problem.
(b) Your userspace application cannot handle the arbitrary delta resulting from
a preemption, in which case you need to do something to handle that, which
you haven't described at all.
i.e. with the information you have provided so far, this patch is
*insufficient* to solve your problem.
> > > + * In other cases: Read clock_a twice (before, and after reading clock_b) and
> > > + * average these times – to be as close as possible to the time we read clock_b.
> >
> > Can you please sit down and provide a precise technical description of
> > the problem you are trying to solve and explain your proposed solution
> > at the conceptual level instead of throwing out random implementations
> > every few days?
100% agreed.
Please, explain the actual problem you are solving here. What *specifically*
are you trying to do in userspace with these values? "Synchronization" is too
vague a description.
Making what is already the best case *marginally better* without handling the
common and worst cases is a waste of time. It doesn't actually solve the
problem, and it misleads people into thinknig that a problem is solved when it
is not.
Mark.
Sagi!
On Thu, Mar 14 2024 at 14:19, Sagi Maimon wrote:
> On Thu, Mar 14, 2024 at 1:12 PM Thomas Gleixner <[email protected]> wrote:
>> On Thu, Mar 14 2024 at 11:05, Sagi Maimon wrote:
>> > Some user space applications need to read a couple of different clocks.
>> > Each read requires moving from user space to kernel space.
>> > Reading each clock separately (syscall) introduces extra
>> > unpredictable/unmeasurable delay. Minimizing this delay contributes to user
>> > space actions on these clocks (e.g. synchronization etc).
>>
>> I asked for a proper description of the actual problem several times now
>> and you still provide some handwaving blurb. Feel free to ignore me, but
>> then please don't be surprised if I ignore you too.
>>
> Nobody is ignoring your notes, and I address any notes given by any
> maintainer in the most serious way.
> As far as explaining the actual problem this is the best that I can,
> but let me try to explain better:
> We did many tests with different CPU loading and compared sampling the
> same clock twice,
> once in user space and once by using the system call.
> We have noticed an improvement up to hundreds of nanoseconds while
> using the system call.
> Those results improved our ability to sync different PHCs
So let me express how I understand the problem - as far as I decoded it
from your writeups:
Synchronizing two PHCs requires to read timestamps from both and
correlate them. Currently this requires several seperate system calls.
This is subject to unmeasurable delays due to system call overhead,
preemption and interrupts which makes the correlation imprecise.
Therefore you want a system call, which samples two clocks at once, to
make the correlation more precise.
Right? For the further comments I assume this is what you are trying to
say and to solve.
So far so good, except that I do not agree with that reasoning at all:
1. The delays are measurable and as precise as the cross time stamp
mechanism (hardware or software based) allows.
2. The system call overhead is completely irrelevant.
3. The time deltas between the sample points are irrelevant within a
reasonable upper bound to the time delta between the two outer
sample points.
4. The alledged higher precision is based on a guesstimate and not on
correctness. Just because it behaves slightly better in testing
does not make it any more correct.
5. The problem can be solved with maximal possible accuracy by using
the existing PTP IOCTLs.
See below.
>> Also why does reading two random clocks make any sense at all? Your code
>> allows to read CLOCK_TAI and CLOCK_THREAD_CPUTIME_ID. What for?
>>
> Initially we needed to sync some different PHCs for our user space
> application, that is why we came with this idea.
> The first idea was an IOCTL that returned samples of several PHCs for
> the need of synchronization.
> Richard Cochran suggested a system call instead, which will add the
> ability to get various system clocks, while this
> implementation is more complex then IOCTL, I think that he was right,
> for future usage.
Which future usage? We are not introducing swiss army knife interfaces
just because there might be an illusional use case somewhere in the
unspecified future.
Adding a system call needs a proper design and justification. Handwaving
future usage is not enough.
Documentation/process/adding-syscalls.rst is very clear about what is
required for a new system call.
>> This needs to be split up into:
>>
>> 1) Infrastructure in posix-timers.c
>> 2) Wire up the syscall in x86
>> 3) Infrastructure in posix-clock.c
>> 4) Usage in ptp_clock.c
>>
>> and not as a big lump of everything.
>>
> I know, but I think the benefit worth it
It's worth it because it makes review easier. It's well documented in
the process documentation that patches should do one thing and not a
whole lump of changes.
>> > + if (!error) {
>> > + if (clock_b == CLOCK_MONOTONIC_RAW) {
>> > + ts_b = ktime_to_timespec64(xtstamp_a1.sys_monoraw);
>> > + ts_a1 = ktime_to_timespec64(xtstamp_a1device);
>> > + goto out;
>> > + } else if (clock_b == CLOCK_REALTIME) {
>> > + ts_b = ktime_to_timespec64(xtstamp_a1.sys_realtime);
>> > + ts_a1 = ktime_to_timespec64(xtstamp_a1device);
>> > + goto out;
>> > + } else {
>> > + crosstime_support_a = true;
>>
>> Right. If clock_b is anything else than CLOCK_MONOTONIC_RAW or
>> CLOCK_REALTIME then this is true.
>>
>> > + }
>> > + }
>>
>> So in case of an error, this just keeps going with an uninitialized
>> xtstamp_a1 and if the clock_b part succeeds it continues and operates on
>> garbage.
>>
> On error xtstamp_a1 will be taken again using clock_get_crosstimespec
> so no one will be operating on garbage.
It will not, because crosstime_support_a == false. It will therefore
fall back to kc_a->clock_get_timespec(), no?
Sorry, I misread the code vs. using the uninitialized value, but this is
just unneccesary hard to follow.
>> > + if (crosstime_support_a)
>> > + error = kc_a->clock_get_crosstimespec(clock_a, &xtstamp_a2);
>> > + else
>> > + error = kc_a->clock_get_timespec(clock_a, &ts_a2);
>> > +
>> > + if (error)
>> > + return error;
>>
>> The logic and the code flow here are unreadable garbage and there are
>> zero comments what this is supposed to do.
>>
> I will add comments.
> please no need to use negative words like "garbage" (not the first time),
> please keep it professional and civilized.
Let me rephrase:
The code and the logic is incomprehensible unless I waste an unjustified
amount of time to decode it. Sorry, I don't have that time.
>> > + if (crosstime_support_a) {
>> > + ktime_a = ktime_sub(xtstamp_a2.device, xtstamp_a1.device);
>> > + ts_offs_err = ktime_divns(ktime_a, 2);
>> > + ktime_a = ktime_add_ns(xtstamp_a1.device, (u64)ts_offs_err);
>> > + ts_a1 = ktime_to_timespec64(ktime_a);
>>
>> This is just wrong.
>>
>> read(a1);
>> read(b);
>> read(a2);
>>
>> You _CANNOT_ assume that (a1 + ((a2 - a1) / 2) is anywhere close to the
>> point in time where 'b' is read. This code is preemtible and
>> interruptible. I explained this to you before.
>>
>> Your explanation in the comment above the function is just wishful
>> thinking.
>>
> you explained it before, but still it is better then two consecutive
> user space calls which are also preemptible
> and the userspace to kernel context switch time is added.
It might be marginally better, but it is still just _pretending_ that it
does the right thing, is correct and better than the existing IOCTLs.
If your user space implementation has the same algorithm, then I'm
absolutely not surprised that the results are not useful. Why?
You simply cannot use the midpoint of the outer samples if you want to
have precise results if there is no guarantee that b was sampled exactly
in the midpoint of a1 and a2. A hardware implementation might give that
guarantee, but the kernel cannot.
But why using the midpoint in the first place?
There is absolutely no reason to do so because the sampling points a1, b
and a2 can be precisely determined with the precision of the cross time
stamp mechanism, which is best with a hardware based cross time stamp
obviously.
The whole point of ptp::info::getcrosststamp() is to get properly
correlated clock samples of
1) PHC clock
2) CLOCK_MONOTONIC_RAW
3) CLOCK_REALTIME
So if you take 3 samples:
get_cross_timestamp(a1);
get_cross_timestamp(b);
get_cross_timestamp(a2);
then each of them provides:
- device time
- correlated CLOCK_MONOTONIC_RAW
- correlated CLOCK_REALTIME
Ergo the obvious thing to do is:
d1 = b.sys_monoraw - a1.sys_monoraw;
d2 = a2.sys_monoraw - a1.sys_monoraw;
tsa = a1.device + ((a2.device - a1.device) * d1) / d2;
Which is maximaly precise under the assumption that in the time between
the sample points a1 and a2 neither the system clock nor the PCH clocks
are changing their frequency significantly. That is a valid assumption
when you put a reasonable upper bound on d2.
Even when the device does not implement getcrosststamp() then loop based
sampling like it is implemented in the PTP_SYS_OFFSET[_EXTENDED] IOTCLs
is providing reasonably accurate results to the extent possible.
Your algorithm is imprecise by definition and you can apply as much
testing as you want, it won't become magically correct. It's still a
guesstimate, i.e. an estimate made without using adequate or complete
information.
Now why a new syscall?
This can be done from user space with existing interfaces and the very
same precision today:
ioctl(fda, PTP_SYS_OFFSET*, &a1);
ioctl(fdb, PTP_SYS_OFFSET*, &b);
ioctl(fda, PTP_SYS_OFFSET*, &a2);
u64 d1 = timespec_delta_ns(b.sys_monoraw, a1.sys_monoraw);
u64 d2 = timespec_delta_ns(a2.sys_monoraw, a1.sys_monoraw);
u64 td = (timespec_delta_ns(a2.device, a1.device) * d1) / d2
tsa = timespec_add_ns(a1.device, td);
tsb = b.device;
with the extra benefit of:
1) The correct CLOCK_REALTIME at that sample point,
i.e. b.sys_realtime
2) The correct CLOCK_MONOTONIC_RAW at that sample point,
i.e. b.sys_monoraw
It works with PTP_SYS_OFFSET_PRECISE and PTP_SYS_OFFSET[_EXTENDED], with
the obvious limitations of PTP_SYS_OFFSET[_EXTENDED], which are still
vastly superior to your proposed (a2 - a1) / 2 guestimate which is just
reading the PCH clocks with clock_get_timespec().
It is completely independent of the load, the syscall overhead and the
actual time delta between the sample points when you apply a reasonable
upper bound for d2, i.e. the time delta between the sample points a1 and
a2 to eliminate the issue that system clock and/or the PCH clocks change
their frequency significantly during that time. You'd need to do that in
the kernel too.
The actual frequency difference between PCH A and system clock is
completely irrelevant when the frequencies of both are stable accross
the sample period.
You might still argue that the time delta between the sample points a1
and a2 matters and is slightly shorter in the kernel, but that is a
non-argument because:
1) The kernel implementation does not guarantee atomicity of the
consecutive samples either. The time delta is just statistically
better, which is obviously useless when you want to make
guarantees.
2) It does not matter when the time delta is slightly larger because
you need a large frequency change of the involved clocks in the
sample interval between the sample points a1 and a2 to make an
actual difference in the resulting accuracy.
A typical temperature drift of a high quality cyrstal is less than
1ppm per degree Celsius and even if you assume that the overall
system drift is 10ppm per degree Celsius then still the actual
error for a bound time delta between the sample points a1 and a2 is
just somewhere in the irrelevant noise, unless you manage to blow
torch or ice spray your crystals during the sample interval.
If your clocks are not stable enough then nothing can cure it and
you cannot do high precision timekeeping with them.
So what is your new syscall solving that can't be done with the existing
IOCTLs other than providing worse precision results based on
guesstimates and some handwavy future use for random clock ids?
Nothing as far as I can tell, but I might be missing something important
here.
Thanks,
tglx
---
arch/x86/kernel/tsc.c:119: "Math is hard, let's go shopping." - John Stultz
On Thu, Mar 14 2024 at 17:46, Sagi Maimon wrote:
Can you please trim your replies? I really have better things to do than
doing detective work to find 10 new lines within 200+ irrelevant ones.
> On Thu, Mar 14, 2024 at 1:12 PM Thomas Gleixner <[email protected]> wrote:
>> Please read and follow the documentation provided at:
>>
>> https://www.kernel.org/doc/html/latest/process/maintainer-tip.html
>>
> I have missed this part on prviews reply.
> I have read the documentation above and I think that the variable
> declarations at the beginning of a function is in reverse fir tree
> order meaning from big to small, but I guess that I am missing something,
> can you please explain what is wrong with the variable declaration,
> so I can fix it.
>> > + struct timespec64 ts_a, ts_a1, ts_b, ts_a2;
>> > + struct system_device_crosststamp xtstamp_a1, xtstamp_a2, xtstamp_b;
>> > + const struct k_clock *kc_a, *kc_b;
>> > + ktime_t ktime_a;
>> > + s64 ts_offs_err = 0;
>> > + int error = 0;
>> > + bool crosstime_support_a = false;
>> > + bool crosstime_support_b = false;
It's not about the data type. Look at the three layouts and figure out
which one is better to parse.
On Thu, Mar 14, 2024 at 8:08 PM Thomas Gleixner <[email protected]> wrote:
>
> Sagi!
>
> On Thu, Mar 14 2024 at 14:19, Sagi Maimon wrote:
> > On Thu, Mar 14, 2024 at 1:12 PM Thomas Gleixner <[email protected]> wrote:
> >> On Thu, Mar 14 2024 at 11:05, Sagi Maimon wrote:
> >> > Some user space applications need to read a couple of different clocks.
> >> > Each read requires moving from user space to kernel space.
> >> > Reading each clock separately (syscall) introduces extra
> >> > unpredictable/unmeasurable delay. Minimizing this delay contributes to user
> >> > space actions on these clocks (e.g. synchronization etc).
> >>
> >> I asked for a proper description of the actual problem several times now
> >> and you still provide some handwaving blurb. Feel free to ignore me, but
> >> then please don't be surprised if I ignore you too.
> >>
> > Nobody is ignoring your notes, and I address any notes given by any
> > maintainer in the most serious way.
> > As far as explaining the actual problem this is the best that I can,
> > but let me try to explain better:
> > We did many tests with different CPU loading and compared sampling the
> > same clock twice,
> > once in user space and once by using the system call.
> > We have noticed an improvement up to hundreds of nanoseconds while
> > using the system call.
> > Those results improved our ability to sync different PHCs
>
> So let me express how I understand the problem - as far as I decoded it
> from your writeups:
>
> Synchronizing two PHCs requires to read timestamps from both and
> correlate them. Currently this requires several seperate system calls.
> This is subject to unmeasurable delays due to system call overhead,
> preemption and interrupts which makes the correlation imprecise.
>
> Therefore you want a system call, which samples two clocks at once, to
> make the correlation more precise.
>
> Right? For the further comments I assume this is what you are trying to
> say and to solve
You are right.
>
> So far so good, except that I do not agree with that reasoning at all:
>
> 1. The delays are measurable and as precise as the cross time stamp
> mechanism (hardware or software based) allows.
>
Most of the PHCs do not support crosstime stamps.
> 2. The system call overhead is completely irrelevant.
>
You are right in case of long preemption, but in other cases it is relevant.
> 3. The time deltas between the sample points are irrelevant within a
> reasonable upper bound to the time delta between the two outer
> sample points.
>
See below
> 4. The alledged higher precision is based on a guesstimate and not on
> correctness. Just because it behaves slightly better in testing
> does not make it any more correct.
>
See below
> 5. The problem can be solved with maximal possible accuracy by using
> the existing PTP IOCTLs.
>
> See below.
>
> >> Also why does reading two random clocks make any sense at all? Your code
> >> allows to read CLOCK_TAI and CLOCK_THREAD_CPUTIME_ID. What for?
> >>
> > Initially we needed to sync some different PHCs for our user space
> > application, that is why we came with this idea.
> > The first idea was an IOCTL that returned samples of several PHCs for
> > the need of synchronization.
> > Richard Cochran suggested a system call instead, which will add the
> > ability to get various system clocks, while this
> > implementation is more complex then IOCTL, I think that he was right,
> > for future usage.
>
> Which future usage? We are not introducing swiss army knife interfaces
> just because there might be an illusional use case somewhere in the
> unspecified future.
>
> Adding a system call needs a proper design and justification. Handwaving
> future usage is not enough.
>
> Documentation/process/adding-syscalls.rst is very clear about what is
> required for a new system call.
>
> >> This needs to be split up into:
> >>
> >> 1) Infrastructure in posix-timers.c
> >> 2) Wire up the syscall in x86
> >> 3) Infrastructure in posix-clock.c
> >> 4) Usage in ptp_clock.c
> >>
> >> and not as a big lump of everything.
> >>
> > I know, but I think the benefit worth it
>
> It's worth it because it makes review easier. It's well documented in
> the process documentation that patches should do one thing and not a
> whole lump of changes.
>
No problem it will be split into two different patches.
> >> > + if (!error) {
> >> > + if (clock_b == CLOCK_MONOTONIC_RAW) {
> >> > + ts_b = ktime_to_timespec64(xtstamp_a1.sys_monoraw);
> >> > + ts_a1 = ktime_to_timespec64(xtstamp_a1.device);
> >> > + goto out;
> >> > + } else if (clock_b == CLOCK_REALTIME) {
> >> > + ts_b = ktime_to_timespec64(xtstamp_a1.sys_realtime);
> >> > + ts_a1 = ktime_to_timespec64(xtstamp_a1.device);
> >> > + goto out;
> >> > + } else {
> >> > + crosstime_support_a = true;
> >>
> >> Right. If clock_b is anything else than CLOCK_MONOTONIC_RAW or
> >> CLOCK_REALTIME then this is true.
> >>
> >> > + }
> >> > + }
> >>
> >> So in case of an error, this just keeps going with an uninitialized
> >> xtstamp_a1 and if the clock_b part succeeds it continues and operates on
> >> garbage.
> >>
> > On error xtstamp_a1 will be taken again using clock_get_crosstimespec
> > so no one will be operating on garbage.
>
> It will not, because crosstime_support_a == false. It will therefore
> fall back to kc_a->clock_get_timespec(), no?
>
> Sorry, I misread the code vs. using the uninitialized value, but this is
> just unneccesary hard to follow.
>
> >> > + if (crosstime_support_a)
> >> > + error = kc_a->clock_get_crosstimespec(clock_a, &xtstamp_a2);
> >> > + else
> >> > + error = kc_a->clock_get_timespec(clock_a, &ts_a2);
> >> > +
> >> > + if (error)
> >> > + return error;
> >>
> >> The logic and the code flow here are unreadable garbage and there are
> >> zero comments what this is supposed to do.
> >>
> > I will add comments.
> > please no need to use negative words like "garbage" (not the first time),
> > please keep it professional and civilized.
>
> Let me rephrase:
>
> The code and the logic is incomprehensible unless I waste an unjustified
> amount of time to decode it. Sorry, I don't have that time.
>
> >> > + if (crosstime_support_a) {
> >> > + ktime_a = ktime_sub(xtstamp_a2.device, xtstamp_a1.device);
> >> > + ts_offs_err = ktime_divns(ktime_a, 2);
> >> > + ktime_a = ktime_add_ns(xtstamp_a1.device, (u64)ts_offs_err);
> >> > + ts_a1 = ktime_to_timespec64(ktime_a);
> >>
> >> This is just wrong.
> >>
> >> read(a1);
> >> read(b);
> >> read(a2);
> >>
> >> You _CANNOT_ assume that (a1 + ((a2 - a1) / 2) is anywhere close to the
> >> point in time where 'b' is read. This code is preemtible and
> >> interruptible. I explained this to you before.
> >>
> >> Your explanation in the comment above the function is just wishful
> >> thinking.
> >>
> > you explained it before, but still it is better then two consecutive
> > user space calls which are also preemptible
> > and the userspace to kernel context switch time is added.
>
> It might be marginally better, but it is still just _pretending_ that it
> does the right thing, is correct and better than the existing IOCTLs.
>
> If your user space implementation has the same algorithm, then I'm
> absolutely not surprised that the results are not useful. Why?
>
> You simply cannot use the midpoint of the outer samples if you want to
> have precise results if there is no guarantee that b was sampled exactly
> in the midpoint of a1 and a2. A hardware implementation might give that
> guarantee, but the kernel cannot.
>
> But why using the midpoint in the first place?
>
> There is absolutely no reason to do so because the sampling points a1, b
> and a2 can be precisely determined with the precision of the cross time
> stamp mechanism, which is best with a hardware based cross time stamp
> obviously.
>
> The whole point of ptp::info::getcrosststamp() is to get properly
> correlated clock samples of
>
> 1) PHC clock
> 2) CLOCK_MONOTONIC_RAW
> 3) CLOCK_REALTIME
>
> So if you take 3 samples:
>
> get_cross_timestamp(a1);
> get_cross_timestamp(b);
> get_cross_timestamp(a2);
>
> then each of them provides:
>
> - device time
> - correlated CLOCK_MONOTONIC_RAW
> - correlated CLOCK_REALTIME
>
> Ergo the obvious thing to do is:
>
> d1 = b.sys_monoraw - a1.sys_monoraw;
> d2 = a2.sys_monoraw - a1.sys_monoraw;
>
> tsa = a1.device + ((a2.device - a1.device) * d1) / d2;
>
> Which is maximaly precise under the assumption that in the time between
> the sample points a1 and a2 neither the system clock nor the PCH clocks
> are changing their frequency significantly. That is a valid assumption
> when you put a reasonable upper bound on d2.
>
You are right.
In fact, we are running this calculation on a user space application.
We use the new system call to get pairs of mono and PHC and then run
that calculation in user space.
That is why the system call returns pairs of clock samples and not the
diff between them.
> Even when the device does not implement getcrosststamp() then loop based
> sampling like it is implemented in the PTP_SYS_OFFSET[_EXTENDED] IOTCLs
> is providing reasonably accurate results to the extent possible.
>
> Your algorithm is imprecise by definition and you can apply as much
> testing as you want, it won't become magically correct. It's still a
> guesstimate, i.e. an estimate made without using adequate or complete
> information.
>
> Now why a new syscall?
>
> This can be done from user space with existing interfaces and the very
> same precision today:
>
> ioctl(fda, PTP_SYS_OFFSET*, &a1);
> ioctl(fdb, PTP_SYS_OFFSET*, &b);
> ioctl(fda, PTP_SYS_OFFSET*, &a2);
>
> u64 d1 = timespec_delta_ns(b.sys_monoraw, a1.sys_monoraw);
> u64 d2 = timespec_delta_ns(a2.sys_monoraw, a1.sys_monoraw);
> u64 td = (timespec_delta_ns(a2.device, a1.device) * d1) / d2
>
> tsa = timespec_add_ns(a1.device, td);
> tsb = b.device;
>
> with the extra benefit of:
>
> 1) The correct CLOCK_REALTIME at that sample point,
> i.e. b.sys_realtime
>
> 2) The correct CLOCK_MONOTONIC_RAW at that sample point,
> i.e. b.sys_monoraw
>
If PTP_SYS_OFFSET IOCTL returns sys_monoraw, then you are right, but
unfortunately the only IOCTL that returns sys_monoraw is
PTP_SYS_OFFSET_PRECISE (getcrosststamp)
And most of the drivers does not support it.
> It works with PTP_SYS_OFFSET_PRECISE and PTP_SYS_OFFSET[_EXTENDED], with
> the obvious limitations of PTP_SYS_OFFSET[_EXTENDED], which are still
> vastly superior to your proposed (a2 - a1) / 2 guestimate which is just
> reading the PCH clocks with clock_get_timespec().
>
It only works with PTP_SYS_OFFSET_PRECISE (which most of the NIC
drivers does not support and I take it under consideration in my
system call),
PTP_SYS_OFFSET_EXTENDED ioctl returns system time before, PHC, system
time after , and no monotic raw.
> It is completely independent of the load, the syscall overhead and the
> actual time delta between the sample points when you apply a reasonable
> upper bound for d2, i.e. the time delta between the sample points a1 and
> a2 to eliminate the issue that system clock and/or the PCH clocks change
> their frequency significantly during that time. You'd need to do that in
> the kernel too.
>
> The actual frequency difference between PCH A and system clock is
> completely irrelevant when the frequencies of both are stable accross
> the sample period.
>
> You might still argue that the time delta between the sample points a1
> and a2 matters and is slightly shorter in the kernel, but that is a
> non-argument because:
>
> 1) The kernel implementation does not guarantee atomicity of the
> consecutive samples either. The time delta is just statistically
> better, which is obviously useless when you want to make
> guarantees.
>
> 2) It does not matter when the time delta is slightly larger because
> you need a large frequency change of the involved clocks in the
> sample interval between the sample points a1 and a2 to make an
> actual difference in the resulting accuracy.
>
> A typical temperature drift of a high quality cyrstal is less than
> 1ppm per degree Celsius and even if you assume that the overall
> system drift is 10ppm per degree Celsius then still the actual
> error for a bound time delta between the sample points a1 and a2 is
> just somewhere in the irrelevant noise, unless you manage to blow
> torch or ice spray your crystals during the sample interval.
>
> If your clocks are not stable enough then nothing can cure it and
> you cannot do high precision timekeeping with them.
>
You are right in case of long preemption (which still the system call
is better), but in other cases it is relevant.
> So what is your new syscall solving that can't be done with the existing
> IOCTLs other than providing worse precision results based on
> guesstimates and some handwavy future use for random clock ids?
>
> Nothing as far as I can tell, but I might be missing something important
> here.
>
Few points to consider:
1) Most of the PHCs do not support cross time stamping.
2) Users can implement your suggested code in user space while using
the new system call to get pairs of mono and PHC
. This is what we did already in user space.
3) User with less tight requirement will benefit high accuracy with
the new system call
> Thanks,
>
> tglx
> ---
> arch/x86/kernel/tsc.c:119: "Math is hard, let's go shopping." - John Stultz
Sagi!
On Wed, Mar 20 2024 at 16:42, Sagi Maimon wrote:
> On Thu, Mar 14, 2024 at 8:08 PM Thomas Gleixner <[email protected]> wrote:
>> Which is maximaly precise under the assumption that in the time between
>> the sample points a1 and a2 neither the system clock nor the PCH clocks
>> are changing their frequency significantly. That is a valid assumption
>> when you put a reasonable upper bound on d2.
>>
>
> You are right.
> In fact, we are running this calculation on a user space application.
> We use the new system call to get pairs of mono and PHC and then run
> that calculation in user space.
> That is why the system call returns pairs of clock samples and not the
> diff between them.
Please stop claiming things which are fundamentally wrong:
The proposed system call returns the PHC sample and the midpoint of
two CLOCK_WHATEVER samples which have been sampled before and after
the PHC sample.
That is fundamentally different from a pair of samples as I explained
to you in great length more than once by now.
I understand that you can't rely on the PTP_SYS_OFFSET_PRECISE IOCTL
alone because there is not much hardware support, but what you are
proposing is way worse than the other two PTP_SYS_OFFSET variants.
PTP_SYS_OFFSET at least gives the caller a choice of analysis of the
interleaving system timestamps.
PTP_SYS_OFFSET_EXTENDED moves the outer sample points as close as
possible to the actual PCH read and provides both outer samples to user
space for analysis. It was introduced for a reason, no?
Your proposed system call is just declaring arbitrarily that the
CLOCK_WHATEVER sample is exactly the midpoint of the two outer samples
and is therefore superior and correct.
It is neither superior nor correct because the midpoint is an
ill-defined assumption as I explained to you multiple times now.
Aside of that the approach loses the extended information of
PTP_SYS_OFFSET and PTP_SYS_OFFSET_EXTENDED including the more precise
sampling behaviour of the latter. IOW, it is ignoring and throwing away
the effort of people who cared about making the best out of the
limitations of hardware including the already existing algorithms to
make sense out of it.
The P at the beginning of PTP does not mean 'Potentially precise' and
the lack of C in PTP does not mean that Correctness is overrated.
The problem is that these non hardware assisted IOCTL variants sample
only CLOCK_REALTIME and not CLOCK_MONOTONIC_RAW, which is all you need
to solve your problem at hand, no?
That's absolutely not rocket science to solve. The below sketch does
exactly that without creating an ill-defined syscall monstrosity, at the
same time it is fully preserving the benefits of the existing IOCTL
variants and therefore allows to apply already existing algorithms to
analyse that data. That's too simple and too obviously correct, right?
The thing is a sketch and it's neither compiled nor tested. It's just
for illustration and you can keep all bugs you might find in it.
On top this needs an analyis whether any of the gettimex64()
implementations does something special instead of invoking the
ptp_read_system_prets() and ptp_read_system_postts() helpers as close as
possible to the PCH readout, but that's not rocket science either. It's
just 21 callbacks to look at.
It might also require a new set of variant '3' IOTCLS to make that flag
field work, but that's not going to make the change more complex and
it's an exercise left to the experts of that IOCTL interface.
Thanks,
tglx
---
drivers/ptp/ptp_chardev.c | 36 ++++++++++++++++++++++--------------
include/linux/ptp_clock_kernel.h | 28 +++++++++++++++++++---------
include/uapi/linux/ptp_clock.h | 10 ++++++++--
3 files changed, 49 insertions(+), 25 deletions(-)
--- a/drivers/ptp/ptp_chardev.c
+++ b/drivers/ptp/ptp_chardev.c
@@ -164,9 +164,9 @@ long ptp_ioctl(struct posix_clock_contex
struct ptp_sys_offset_precise precise_offset;
struct system_device_crosststamp xtstamp;
struct ptp_clock_info *ops = ptp->info;
+ struct ptp_system_timestamp sts = { };
struct ptp_sys_offset *sysoff = NULL;
struct timestamp_event_queue *tsevq;
- struct ptp_system_timestamp sts;
struct ptp_clock_request req;
struct ptp_clock_caps caps;
struct ptp_clock_time *pct;
@@ -358,11 +358,13 @@ long ptp_ioctl(struct posix_clock_contex
extoff = NULL;
break;
}
- if (extoff->n_samples > PTP_MAX_SAMPLES
- || extoff->rsv[0] || extoff->rsv[1] || extoff->rsv[2]) {
+ if (!extoff->n_samples || extoff->n_samples > PTP_MAX_SAMPLES ||
+ (extoff->flags & ~PTP_SYS_OFFSET_VALID_FLAGS) ||
+ extoff->rsv[0] || extoff->rsv[1]) {
err = -EINVAL;
break;
}
+ sts.flags = extoff->flags;
for (i = 0; i < extoff->n_samples; i++) {
err = ptp->info->gettimex64(ptp->info, &ts, &sts);
if (err)
@@ -386,29 +388,35 @@ long ptp_ioctl(struct posix_clock_contex
sysoff = NULL;
break;
}
- if (sysoff->n_samples > PTP_MAX_SAMPLES) {
+ if (!sysoff->n_samples || sysoff->n_samples > PTP_MAX_SAMPLES ||
+ (sysoff->flags & ~PTP_SYS_OFFSET_VALID_FLAGS) ||
+ sysoff->rsv[0] || sysoff->rsv[1]) {
err = -EINVAL;
break;
}
+ sts.flags = sysoff->flags;
pct = &sysoff->ts[0];
for (i = 0; i < sysoff->n_samples; i++) {
- ktime_get_real_ts64(&ts);
- pct->sec = ts.tv_sec;
- pct->nsec = ts.tv_nsec;
- pct++;
- if (ops->gettimex64)
- err = ops->gettimex64(ops, &ts, NULL);
- else
+ if (ops->gettimex64) {
+ err = ops->gettimex64(ops, &ts, &sts);
+ } else {
+ ptp_read_system_prets(&sts);
err = ops->gettime64(ops, &ts);
+ }
if (err)
goto out;
+
+ pct->sec = sts.pre_ts.tv_sec;
+ pct->nsec = sts.pre_ts.tv_nsec;
+ pct++;
pct->sec = ts.tv_sec;
pct->nsec = ts.tv_nsec;
pct++;
}
- ktime_get_real_ts64(&ts);
- pct->sec = ts.tv_sec;
- pct->nsec = ts.tv_nsec;
+ if (!ops->gettimex64)
+ ptp_read_system_postts(&sts);
+ pct->sec = sts.post_ts.tv_sec;
+ pct->nsec = sts.post_ts.tv_nsec;
if (copy_to_user((void __user *)arg, sysoff, sizeof(*sysoff)))
err = -EFAULT;
break;
--- a/include/linux/ptp_clock_kernel.h
+++ b/include/linux/ptp_clock_kernel.h
@@ -44,13 +44,15 @@ struct ptp_clock_request {
struct system_device_crosststamp;
/**
- * struct ptp_system_timestamp - system time corresponding to a PHC timestamp
- * @pre_ts: system timestamp before capturing PHC
- * @post_ts: system timestamp after capturing PHC
+ * struct ptp_system_timestamp - System time corresponding to a PHC timestamp
+ * @flags: Flags to select the system clock to sample
+ * @pre_ts: System timestamp before capturing PHC
+ * @post_ts: System timestamp after capturing PHC
*/
struct ptp_system_timestamp {
- struct timespec64 pre_ts;
- struct timespec64 post_ts;
+ unsigned int flags;
+ struct timespec64 pre_ts;
+ struct timespec64 post_ts;
};
/**
@@ -457,14 +459,22 @@ static inline ktime_t ptp_convert_timest
static inline void ptp_read_system_prets(struct ptp_system_timestamp *sts)
{
- if (sts)
- ktime_get_real_ts64(&sts->pre_ts);
+ if (sts) {
+ if (sts->flags & PTP_SYS_OFFSET_MONO_RAW)
+ ktime_get_raw_ts64(&sts->pre_ts);
+ else
+ ktime_get_real_ts64(&sts->pre_ts);
+ }
}
static inline void ptp_read_system_postts(struct ptp_system_timestamp *sts)
{
- if (sts)
- ktime_get_real_ts64(&sts->post_ts);
+ if (sts) {
+ if (sts->flags & PTP_SYS_OFFSET_MONO_RAW)
+ ktime_get_raw_ts64(&sts->post_ts);
+ else
+ ktime_get_real_ts64(&sts->post_ts);
+ }
}
#endif
--- a/include/uapi/linux/ptp_clock.h
+++ b/include/uapi/linux/ptp_clock.h
@@ -76,6 +76,10 @@
*/
#define PTP_PEROUT_V1_VALID_FLAGS (0)
+/* Bits for PTP_SYS_OFFSET and PTP_SYS_OFFSET_EXTENDED */
+#define PTP_SYS_OFFSET_MONO_RAW (1U << 0)
+#define PTP_SYS_OFFSET_VALID_FLAGS (PTP_SYS_OFFSET_MONO_RAW)
+
/*
* struct ptp_clock_time - represents a time value
*
@@ -146,7 +150,8 @@ struct ptp_perout_request {
struct ptp_sys_offset {
unsigned int n_samples; /* Desired number of measurements. */
- unsigned int rsv[3]; /* Reserved for future use. */
+ unsigned int flags;
+ unsigned int rsv[2]; /* Reserved for future use. */
/*
* Array of interleaved system/phc time stamps. The kernel
* will provide 2*n_samples + 1 time stamps, with the last
@@ -157,7 +162,8 @@ struct ptp_sys_offset {
struct ptp_sys_offset_extended {
unsigned int n_samples; /* Desired number of measurements. */
- unsigned int rsv[3]; /* Reserved for future use. */
+ unsigned int flags;
+ unsigned int rsv[2]; /* Reserved for future use. */
/*
* Array of [system, phc, system] time stamps. The kernel will provide
* 3*n_samples time stamps.
On Sat, Mar 23 2024 at 01:38, Thomas Gleixner wrote:
> PTP_SYS_OFFSET_EXTENDED moves the outer sample points as close as
> possible to the actual PCH read and provides both outer samples to user
> space for analysis. It was introduced for a reason, no?
That said, it's a sad state of affairs that 16 drivers which did exist
before the introduction of the gettimex64() callback have not been
converted over to it within 4.5 years.
What's even worse is that 14 drivers have been merged _after_ the
gettimex64() callback got introduced without implementing it:
2019-02-12 drivers/net/ethernet/freescale/enetc/enetc_ptp.c
2019-10-24 drivers/net/ethernet/aquantia/atlantic/aq_ptp.c
2019-11-15 drivers/net/dsa/ocelot/felix_vsc9959.c
2020-01-12 drivers/net/ethernet/xscale/ptp_ixp46x.c
2020-06-20 drivers/net/ethernet/mscc/ocelot_vsc7514.c
2020-06-24 drivers/net/phy/mscc/mscc_ptp.c
2020-08-24 drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c
2020-11-05 drivers/net/dsa/hirschmann/hellcreek_ptp.c
2022-02-01 drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c
2022-03-04 drivers/net/ethernet/microchip/sparx5/sparx5_ptp.c
2022-05-10 drivers/net/ethernet/sfc/siena/ptp.c
2022-11-02 drivers/net/ethernet/renesas/rcar_gen4_ptp.c
2023-01-13 drivers/net/dsa/microchip/ksz_ptp.c
2023-03-22 drivers/net/wireless/intel/iwlwifi/mvm/ptp.c
Not Sagi's fault at all, but it's telling and coherent with the approach
to solve the problem at hand.
See the previous reply for the observation on the letters P and C in PTP.
Oh well,
tglx
On Sat Mar 23 2024, Thomas Gleixner wrote:
> On Sat, Mar 23 2024 at 01:38, Thomas Gleixner wrote:
>> PTP_SYS_OFFSET_EXTENDED moves the outer sample points as close as
>> possible to the actual PCH read and provides both outer samples to user
>> space for analysis. It was introduced for a reason, no?
>
> That said, it's a sad state of affairs that 16 drivers which did exist
> before the introduction of the gettimex64() callback have not been
> converted over to it within 4.5 years.
>
> What's even worse is that 14 drivers have been merged _after_ the
> gettimex64() callback got introduced without implementing it:
>
[...]
> 2020-11-05 drivers/net/dsa/hirschmann/hellcreek_ptp.c
Oh, my bad. Let me switch this one to gettimex64() then.
Thanks,
Kurt
Hi Thomas
On Sat, Mar 23, 2024 at 2:38 AM Thomas Gleixner <[email protected]> wrote:
>
> Sagi!
>
> On Wed, Mar 20 2024 at 16:42, Sagi Maimon wrote:
> > On Thu, Mar 14, 2024 at 8:08 PM Thomas Gleixner <[email protected]> wrote:
> >> Which is maximaly precise under the assumption that in the time between
> >> the sample points a1 and a2 neither the system clock nor the PCH clocks
> >> are changing their frequency significantly. That is a valid assumption
> >> when you put a reasonable upper bound on d2.
> >>
> >
> > You are right.
> > In fact, we are running this calculation on a user space application.
> > We use the new system call to get pairs of mono and PHC and then run
> > that calculation in user space.
> > That is why the system call returns pairs of clock samples and not the
> > diff between them.
>
> Please stop claiming things which are fundamentally wrong:
>
> The proposed system call returns the PHC sample and the midpoint of
> two CLOCK_WHATEVER samples which have been sampled before and after
> the PHC sample.
>
> That is fundamentally different from a pair of samples as I explained
> to you in great length more than once by now.
>
> I understand that you can't rely on the PTP_SYS_OFFSET_PRECISE IOCTL
> alone because there is not much hardware support, but what you are
> proposing is way worse than the other two PTP_SYS_OFFSET variants.
>
> PTP_SYS_OFFSET at least gives the caller a choice of analysis of the
> interleaving system timestamps.
>
> PTP_SYS_OFFSET_EXTENDED moves the outer sample points as close as
> possible to the actual PCH read and provides both outer samples to user
> space for analysis. It was introduced for a reason, no?
>
> Your proposed system call is just declaring arbitrarily that the
> CLOCK_WHATEVER sample is exactly the midpoint of the two outer samples
> and is therefore superior and correct.
>
> It is neither superior nor correct because the midpoint is an
> ill-defined assumption as I explained to you multiple times now.
>
> Aside of that the approach loses the extended information of
> PTP_SYS_OFFSET and PTP_SYS_OFFSET_EXTENDED including the more precise
> sampling behaviour of the latter. IOW, it is ignoring and throwing away
> the effort of people who cared about making the best out of the
> limitations of hardware including the already existing algorithms to
> make sense out of it.
>
> The P at the beginning of PTP does not mean 'Potentially precise' and
> the lack of C in PTP does not mean that Correctness is overrated.
>
> The problem is that these non hardware assisted IOCTL variants sample
> only CLOCK_REALTIME and not CLOCK_MONOTONIC_RAW, which is all you need
> to solve your problem at hand, no?
>
> That's absolutely not rocket science to solve. The below sketch does
> exactly that without creating an ill-defined syscall monstrosity, at the
> same time it is fully preserving the benefits of the existing IOCTL
> variants and therefore allows to apply already existing algorithms to
> analyse that data. That's too simple and too obviously correct, right?
>
> The thing is a sketch and it's neither compiled nor tested. It's just
> for illustration and you can keep all bugs you might find in it.
>
> On top this needs an analyis whether any of the gettimex64()
> implementations does something special instead of invoking the
> ptp_read_system_prets() and ptp_read_system_postts() helpers as close as
> possible to the PCH readout, but that's not rocket science either. It's
> just 21 callbacks to look at.
>
I like your suggestion, thanks!
it is what our user space needs from the kernel and with minimum kernel changes.
I will write it, test it and upload it with your permission (it is you
idea after all).
> It might also require a new set of variant '3' IOTCLS to make that flag
> field work, but that's not going to make the change more complex and
> it's an exercise left to the experts of that IOCTL interface.
>
I think that I understand your meaning.
There is a backward compatibility problem here.
Existing user space application using PTP_SYS_OFFSET_EXTENDED ioctl
won't have any problems
because of the "extoff->rsv[0] || extoff->rsv[1] || extoff->rsv[2]"
test, but what about all
old user space applications using: PTP_SYS_OFFSET ?
How can it be solved?
Can you explain what you meant above regarding the IOCTL?
Thanks,
Sagi
> Thanks,
>
> tglx
> ---
> drivers/ptp/ptp_chardev.c | 36 ++++++++++++++++++++++--------------
> include/linux/ptp_clock_kernel.h | 28 +++++++++++++++++++---------
> include/uapi/linux/ptp_clock.h | 10 ++++++++--
> 3 files changed, 49 insertions(+), 25 deletions(-)
>
> --- a/drivers/ptp/ptp_chardev.c
> +++ b/drivers/ptp/ptp_chardev.c
> @@ -164,9 +164,9 @@ long ptp_ioctl(struct posix_clock_contex
> struct ptp_sys_offset_precise precise_offset;
> struct system_device_crosststamp xtstamp;
> struct ptp_clock_info *ops = ptp->info;
> + struct ptp_system_timestamp sts = { };
> struct ptp_sys_offset *sysoff = NULL;
> struct timestamp_event_queue *tsevq;
> - struct ptp_system_timestamp sts;
> struct ptp_clock_request req;
> struct ptp_clock_caps caps;
> struct ptp_clock_time *pct;
> @@ -358,11 +358,13 @@ long ptp_ioctl(struct posix_clock_contex
> extoff = NULL;
> break;
> }
> - if (extoff->n_samples > PTP_MAX_SAMPLES
> - || extoff->rsv[0] || extoff->rsv[1] || extoff->rsv[2]) {
> + if (!extoff->n_samples || extoff->n_samples > PTP_MAX_SAMPLES ||
> + (extoff->flags & ~PTP_SYS_OFFSET_VALID_FLAGS) ||
> + extoff->rsv[0] || extoff->rsv[1]) {
> err = -EINVAL;
> break;
> }
> + sts.flags = extoff->flags;
> for (i = 0; i < extoff->n_samples; i++) {
> err = ptp->info->gettimex64(ptp->info, &ts, &sts);
> if (err)
> @@ -386,29 +388,35 @@ long ptp_ioctl(struct posix_clock_contex
> sysoff = NULL;
> break;
> }
> - if (sysoff->n_samples > PTP_MAX_SAMPLES) {
> + if (!sysoff->n_samples || sysoff->n_samples > PTP_MAX_SAMPLES ||
> + (sysoff->flags & ~PTP_SYS_OFFSET_VALID_FLAGS) ||
> + sysoff->rsv[0] || sysoff->rsv[1]) {
> err = -EINVAL;
> break;
> }
> + sts.flags = sysoff->flags;
> pct = &sysoff->ts[0];
> for (i = 0; i < sysoff->n_samples; i++) {
> - ktime_get_real_ts64(&ts);
> - pct->sec = ts.tv_sec;
> - pct->nsec = ts.tv_nsec;
> - pct++;
> - if (ops->gettimex64)
> - err = ops->gettimex64(ops, &ts, NULL);
> - else
> + if (ops->gettimex64) {
> + err = ops->gettimex64(ops, &ts, &sts);
> + } else {
> + ptp_read_system_prets(&sts);
> err = ops->gettime64(ops, &ts);
> + }
> if (err)
> goto out;
> +
> + pct->sec = sts.pre_ts.tv_sec;
> + pct->nsec = sts.pre_ts.tv_nsec;
> + pct++;
> pct->sec = ts.tv_sec;
> pct->nsec = ts.tv_nsec;
> pct++;
> }
> - ktime_get_real_ts64(&ts);
> - pct->sec = ts.tv_sec;
> - pct->nsec = ts.tv_nsec;
> + if (!ops->gettimex64)
> + ptp_read_system_postts(&sts);
> + pct->sec = sts.post_ts.tv_sec;
> + pct->nsec = sts.post_ts.tv_nsec;
> if (copy_to_user((void __user *)arg, sysoff, sizeof(*sysoff)))
> err = -EFAULT;
> break;
> --- a/include/linux/ptp_clock_kernel.h
> +++ b/include/linux/ptp_clock_kernel.h
> @@ -44,13 +44,15 @@ struct ptp_clock_request {
> struct system_device_crosststamp;
>
> /**
> - * struct ptp_system_timestamp - system time corresponding to a PHC timestamp
> - * @pre_ts: system timestamp before capturing PHC
> - * @post_ts: system timestamp after capturing PHC
> + * struct ptp_system_timestamp - System time corresponding to a PHC timestamp
> + * @flags: Flags to select the system clock to sample
> + * @pre_ts: System timestamp before capturing PHC
> + * @post_ts: System timestamp after capturing PHC
> */
> struct ptp_system_timestamp {
> - struct timespec64 pre_ts;
> - struct timespec64 post_ts;
> + unsigned int flags;
> + struct timespec64 pre_ts;
> + struct timespec64 post_ts;
> };
>
> /**
> @@ -457,14 +459,22 @@ static inline ktime_t ptp_convert_timest
>
> static inline void ptp_read_system_prets(struct ptp_system_timestamp *sts)
> {
> - if (sts)
> - ktime_get_real_ts64(&sts->pre_ts);
> + if (sts) {
> + if (sts->flags & PTP_SYS_OFFSET_MONO_RAW)
> + ktime_get_raw_ts64(&sts->pre_ts);
> + else
> + ktime_get_real_ts64(&sts->pre_ts);
> + }
> }
>
> static inline void ptp_read_system_postts(struct ptp_system_timestamp *sts)
> {
> - if (sts)
> - ktime_get_real_ts64(&sts->post_ts);
> + if (sts) {
> + if (sts->flags & PTP_SYS_OFFSET_MONO_RAW)
> + ktime_get_raw_ts64(&sts->post_ts);
> + else
> + ktime_get_real_ts64(&sts->post_ts);
> + }
> }
>
> #endif
> --- a/include/uapi/linux/ptp_clock.h
> +++ b/include/uapi/linux/ptp_clock.h
> @@ -76,6 +76,10 @@
> */
> #define PTP_PEROUT_V1_VALID_FLAGS (0)
>
> +/* Bits for PTP_SYS_OFFSET and PTP_SYS_OFFSET_EXTENDED */
> +#define PTP_SYS_OFFSET_MONO_RAW (1U << 0)
> +#define PTP_SYS_OFFSET_VALID_FLAGS (PTP_SYS_OFFSET_MONO_RAW)
> +
> /*
> * struct ptp_clock_time - represents a time value
> *
> @@ -146,7 +150,8 @@ struct ptp_perout_request {
>
> struct ptp_sys_offset {
> unsigned int n_samples; /* Desired number of measurements. */
> - unsigned int rsv[3]; /* Reserved for future use. */
> + unsigned int flags;
> + unsigned int rsv[2]; /* Reserved for future use. */
> /*
> * Array of interleaved system/phc time stamps. The kernel
> * will provide 2*n_samples + 1 time stamps, with the last
> @@ -157,7 +162,8 @@ struct ptp_sys_offset {
>
> struct ptp_sys_offset_extended {
> unsigned int n_samples; /* Desired number of measurements. */
> - unsigned int rsv[3]; /* Reserved for future use. */
> + unsigned int flags;
> + unsigned int rsv[2]; /* Reserved for future use. */
> /*
> * Array of [system, phc, system] time stamps. The kernel will provide
> * 3*n_samples time stamps.
Sagi!
On Thu, Mar 28 2024 at 17:40, Sagi Maimon wrote:
> On Sat, Mar 23, 2024 at 2:38 AM Thomas Gleixner <[email protected]> wrote:
>> On top this needs an analyis whether any of the gettimex64()
>> implementations does something special instead of invoking the
>> ptp_read_system_prets() and ptp_read_system_postts() helpers as close as
>> possible to the PCH readout, but that's not rocket science either. It's
>> just 21 callbacks to look at.
>>
> I like your suggestion, thanks!
> it is what our user space needs from the kernel and with minimum kernel changes.
> I will write it, test it and upload it with your permission (it is you
> idea after all).
You don't need permission. I made a suggestion and when you are doing the
work I'm not in a position to veto posting it. We have an explicit tag
for that 'Suggested-by:', which only says that someone suggested it to
you, but then you went and implemented it, made sure it works etc.
>> It might also require a new set of variant '3' IOTCLS to make that flag
>> field work, but that's not going to make the change more complex and
>> it's an exercise left to the experts of that IOCTL interface.
>>
> I think that I understand your meaning.
> There is a backward compatibility problem here.
>
> Existing user space application using PTP_SYS_OFFSET_EXTENDED ioctl
> won't have any problems because of the "extoff->rsv[0] ||
> extoff->rsv[1] || extoff->rsv[2]" test, but what about all old user
> space applications using: PTP_SYS_OFFSET ?
So if there is a backwards compability issue with PTP_SYS_OFFSET2, then
you need to introduce PTP_SYS_OFFSET3. The PTP_SYS_*2 variants were
introduced to avoid backwards compatibility issues as well, but
unfortunately that did not address the reserved fields problem for
PTP_SYS_OFFSET2. PTP_SYS_OFFSET_EXTENDED2 should just work, but maybe
the PTP maintainers want a full extension to '3'. Either way is fine.
Thanks,
tglx
On Mon, Apr 1, 2024 at 1:46 PM Thomas Gleixner <[email protected]> wrote:
>
> Sagi!
>
> On Thu, Mar 28 2024 at 17:40, Sagi Maimon wrote:
> > On Sat, Mar 23, 2024 at 2:38 AM Thomas Gleixner <[email protected]> wrote:
> >> On top this needs an analyis whether any of the gettimex64()
> >> implementations does something special instead of invoking the
> >> ptp_read_system_prets() and ptp_read_system_postts() helpers as close as
> >> possible to the PCH readout, but that's not rocket science either. It's
> >> just 21 callbacks to look at.
> >>
> > I like your suggestion, thanks!
> > it is what our user space needs from the kernel and with minimum kernel changes.
> > I will write it, test it and upload it with your permission (it is you
> > idea after all).
>
> You don't need permission. I made a suggestion and when you are doing the
> work I'm not in a position to veto posting it. We have an explicit tag
> for that 'Suggested-by:', which only says that someone suggested it to
> you, but then you went and implemented it, made sure it works etc.
>
> >> It might also require a new set of variant '3' IOTCLS to make that flag
> >> field work, but that's not going to make the change more complex and
> >> it's an exercise left to the experts of that IOCTL interface.
> >>
> > I think that I understand your meaning.
> > There is a backward compatibility problem here.
> >
> > Existing user space application using PTP_SYS_OFFSET_EXTENDED ioctl
> > won't have any problems because of the "extoff->rsv[0] ||
> > extoff->rsv[1] || extoff->rsv[2]" test, but what about all old user
> > space applications using: PTP_SYS_OFFSET ?
>
> So if there is a backwards compability issue with PTP_SYS_OFFSET2, then
> you need to introduce PTP_SYS_OFFSET3. The PTP_SYS_*2 variants were
> introduced to avoid backwards compatibility issues as well, but
> unfortunately that did not address the reserved fields problem for
> PTP_SYS_OFFSET2. PTP_SYS_OFFSET_EXTENDED2 should just work, but maybe
> the PTP maintainers want a full extension to '3'. Either way is fine.
>
https://patchwork.kernel.org/project/netdevbpf/patch/[email protected]/
This was my attempt to solve a similar issue with the new ioctl op to
avoid backward compatibility issues. Instead of flags I used the
clockid_t in a similar fashion.
Thanks,
> Thanks,
>
> tglx
>
>
On Mon, Apr 01 2024 at 22:42, Mahesh Bandewar (महेश बंडेवार) wrote:
> On Mon, Apr 1, 2024 at 1:46 PM Thomas Gleixner <[email protected]> wrote:
>> So if there is a backwards compability issue with PTP_SYS_OFFSET2, then
>> you need to introduce PTP_SYS_OFFSET3. The PTP_SYS_*2 variants were
>> introduced to avoid backwards compatibility issues as well, but
>> unfortunately that did not address the reserved fields problem for
>> PTP_SYS_OFFSET2. PTP_SYS_OFFSET_EXTENDED2 should just work, but maybe
>> the PTP maintainers want a full extension to '3'. Either way is fine.
>>
> https://patchwork.kernel.org/project/netdevbpf/patch/[email protected]/
>
> This was my attempt to solve a similar issue with the new ioctl op to
> avoid backward compatibility issues. Instead of flags I used the
> clockid_t in a similar fashion.
Works as well. I'm not seing the point for CLOCK_MONOTONIC and the
change logs are not really telling anything about the problem being
solved....
Thanks,
tglx
On Tue, Apr 02 2024 at 14:16, Mahesh Bandewar (महेश बंडेवार) wrote:
> On Tue, Apr 2, 2024 at 2:25 AM Thomas Gleixner <[email protected]> wrote:
>> Works as well. I'm not seing the point for CLOCK_MONOTONIC and the
>> change logs are not really telling anything about the problem being
>> solved....
>>
> https://lore.kernel.org/lkml/[email protected]/T/#:~:text=*%20[PATCHv3%20net%2Dnext%200/3]%20add%20ptp_gettimex64any()%20API,21:24%20Mahesh%20Bandewar%200%20siblings%2C%200%20replies;
>
> This is the cover letter where I tried to explain the need for this.
The justification for a patch needs to be in the change log and not in
the cover letter because the cover letter is not part of the git
history.
> Granted, my current use case is for CLOCK_MONOTONIC_RAW but just
> because I don't have a use case doesn't mean someone else may not have
> it and hence added it.
Then why did you not five other clock IDs? Someone else might have a
use case, no?
While a syscall/ioctl should be flexible for future use, the kernel does
not add features just because there might be some use case. It's
documented how this works.
Thanks,
tglx
On Tue, Apr 02 2024 at 16:37, Mahesh Bandewar (महेश बंडेवार) wrote:
> On Tue, Apr 2, 2024 at 3:37 PM Thomas Gleixner <[email protected]> wrote:
> The modification that you have proposed (in a couple of posts back)
> would work but it's still not ideal since the pre/post ts are not
> close enough as they are currently (properly implemented!)
> gettimex64() would have. The only way to do that would be to have
> another ioctl as I have proposed which is a superset of current
> gettimex64 and pre-post collection is the closest possible.
Errm. What I posted as sketch _is_ using gettimex64() with the extra
twist of the flag vs. a clockid (which is an implementation detail) and
the difference that I carry the information in ptp_system_timestamp
instead of needing a new argument clockid to all existing callbacks
because the modification to ptp_read_prets() and postts() will just be
sufficient, no?
For the case where the driver does not provide gettimex64() then the
extension of the original offset ioctl is still providing a better
mechanism than the proposed syscall.
I also clearly said that all drivers should be converted over to
gettimex64().
> Having said that, the 'flag' modification proposal is a good backup
> for the drivers that don't have good implementation (close enough but
> not ideal). Also, you don't need a new ioctl-op. So if we really want
> precision, I believe, we need a new ioctl op (with supporting
> implementation similar to the mlx4 code above). but we want to save
> the new ioctl-op and have less precision then proposed modification
> would work fine.
I disagree. The existing gettimex64() is good enough if the driver
implements it correctly today. If not then those drivers need to be
fixed independent of this.
So assumed that a driver does:
gettimex64()
ptp_prets(sts);
read_clock();
ptp_postts(sts);
today then having:
static inline void ptp_read_system_prets(struct ptp_system_timestamp *sts)
{
if (sts) {
if (sts->flags & PTP_SYS_OFFSET_MONO_RAW)
ktime_get_raw_ts64(&sts->pre_ts);
else
ktime_get_real_ts64(&sts->pre_ts);
}
}
static inline void ptp_read_system_postts(struct ptp_system_timestamp *sts)
{
if (sts) {
if (sts->flags & PTP_SYS_OFFSET_MONO_RAW)
ktime_get_raw_ts64(&sts->post_ts);
else
ktime_get_real_ts64(&sts->post_ts);
}
}
or
static inline void ptp_read_system_prets(struct ptp_system_timestamp *sts)
{
if (sts) {
switch (sts->clockid) {
case CLOCK_MONOTONIC_RAW:
time_get_raw_ts64(&sts->pre_ts);
break;
case CLOCK_REALTIME:
ktime_get_real_ts64(&sts->pre_ts);
break;
}
}
}
static inline void ptp_read_system_postts(struct ptp_system_timestamp *sts)
{
if (sts) {
switch (sts->clockid) {
case CLOCK_MONOTONIC_RAW:
time_get_raw_ts64(&sts->post_ts);
break;
case CLOCK_REALTIME:
ktime_get_real_ts64(&sts->post_ts);
break;
}
}
}
is doing the exact same thing as your proposal but without touching any
driver which implements gettimex64() correctly at all.
While your proposal requires to touch every single driver for no reason,
no?
It is just an implementation detail whether you use a flag or a
clockid. You can carry the clockid for the clocks which actually can be
read in that context in a reserved field of PTP_SYS_OFFSET_EXTENDED:
struct ptp_sys_offset_extended {
unsigned int n_samples; /* Desired number of measurements. */
clockid_t clockid;
unsigned int rsv[2]; /* Reserved for future use. */
};
and in the IOCTL:
if (extoff->clockid != CLOCK_MONOTONIC_RAW)
return -EINVAL;
sts.clockid = extoff->clockid;
and it all just works, no?
I have no problem to decide that PTP_SYS_OFFSET will not get this
treatment and the drivers have to be converted over to
PTP_SYS_OFFSET_EXTENDED.
But adding yet another callback just to carry a clockid as argument is a
more than pointless exercise as I demonstrated.
Thanks,
tglx
On Wed, Apr 03 2024 at 15:48, Thomas Gleixner wrote:
> On Tue, Apr 02 2024 at 16:37, Mahesh Bandewar (महेश बंडेवार) wrote:
> It is just an implementation detail whether you use a flag or a
> clockid. You can carry the clockid for the clocks which actually can be
> read in that context in a reserved field of PTP_SYS_OFFSET_EXTENDED:
>
> struct ptp_sys_offset_extended {
> unsigned int n_samples; /* Desired number of measurements. */
> clockid_t clockid;
> unsigned int rsv[2]; /* Reserved for future use. */
> };
>
> and in the IOCTL:
>
> if (extoff->clockid != CLOCK_MONOTONIC_RAW)
> return -EINVAL;
That should obviously be:
switch (extoff->clockid) {
case CLOCK_REALTIME:
case CLOCK_MONOTONIC_RAW:
break;
default:
return -EINVAL;
}
..
> sts.clockid = extoff->clockid;
>
> and it all just works, no?
>
> I have no problem to decide that PTP_SYS_OFFSET will not get this
> treatment and the drivers have to be converted over to
> PTP_SYS_OFFSET_EXTENDED.
>
> But adding yet another callback just to carry a clockid as argument is a
> more than pointless exercise as I demonstrated.
>
> Thanks,
>
> tglx
On Tue, Apr 2, 2024 at 2:25 AM Thomas Gleixner <[email protected]> wrote:
>
> On Mon, Apr 01 2024 at 22:42, Mahesh Bandewar (महेश बंडेवार) wrote:
> > On Mon, Apr 1, 2024 at 1:46 PM Thomas Gleixner <tglx@linutronixde> wrote:
> >> So if there is a backwards compability issue with PTP_SYS_OFFSET2, then
> >> you need to introduce PTP_SYS_OFFSET3. The PTP_SYS_*2 variants were
> >> introduced to avoid backwards compatibility issues as well, but
> >> unfortunately that did not address the reserved fields problem for
> >> PTP_SYS_OFFSET2. PTP_SYS_OFFSET_EXTENDED2 should just work, but maybe
> >> the PTP maintainers want a full extension to '3'. Either way is fine.
> >>
> > https://patchwork.kernel.org/project/netdevbpf/patch/[email protected]/
> >
> > This was my attempt to solve a similar issue with the new ioctl op to
> > avoid backward compatibility issues. Instead of flags I used the
> > clockid_t in a similar fashion.
>
> Works as well. I'm not seing the point for CLOCK_MONOTONIC and the
> change logs are not really telling anything about the problem being
> solved....
>
https://lore.kernel.org/lkml/[email protected]/T/#:~:text=*%20[PATCHv3%20net%2Dnext%200/3]%20add%20ptp_gettimex64any()%20API,21:24%20Mahesh%20Bandewar%200%20siblings%2C%200%20replies;
This is the cover letter where I tried to explain the need for this.
Granted, my current use case is for CLOCK_MONOTONIC_RAW but just
because I don't have a use case doesn't mean someone else may not have
it and hence added it. In either way it's just a matter of
adding/removing another flag/clock-id.
Thanks,
--mahesh..
> Thanks,
>
> tglx
On Tue, Apr 2, 2024 at 3:37 PM Thomas Gleixner <[email protected]> wrote:
>
> On Tue, Apr 02 2024 at 14:16, Mahesh Bandewar (महेश बंडेवार) wrote:
> > On Tue, Apr 2, 2024 at 2:25 AM Thomas Gleixner <tglx@linutronixde> wrote:
> >> Works as well. I'm not seing the point for CLOCK_MONOTONIC and the
> >> change logs are not really telling anything about the problem being
> >> solved....
> >>
> > https://lore.kernel.org/lkml/[email protected]/T/#:~:text=*%20[PATCHv3%20net%2Dnext%200/3]%20add%20ptp_gettimex64any()%20API,21:24%20Mahesh%20Bandewar%200%20siblings%2C%200%20replies;
> >
> > This is the cover letter where I tried to explain the need for this.
>
> The justification for a patch needs to be in the change log and not in
> the cover letter because the cover letter is not part of the git
> history.
>
ack
> > Granted, my current use case is for CLOCK_MONOTONIC_RAW but just
> > because I don't have a use case doesn't mean someone else may not have
> > it and hence added it.
>
> Then why did you not five other clock IDs? Someone else might have a
> use case, no?
>
> While a syscall/ioctl should be flexible for future use, the kernel does
> not add features just because there might be some use case. It's
> documented how this works.
>
I see your point. I don't mind removing the CLOCK_MONOTONIC for now
and just have CLOCK_REALTIME and CLOCK_MONOTONIC_RAW support. Also as
I mentioned, it will be just a matter of adding new clock-ids and
support for the pre/post-ts for respective clock-ids if needed in the
future.
The modification that you have proposed (in a couple of posts back)
would work but it's still not ideal since the pre/post ts are not
close enough as they are currently (properly implemented!)
gettimex64() would have. The only way to do that would be to have
another ioctl as I have proposed which is a superset of current
gettimex64 and pre-post collection is the closest possible.
Here is my sample mlx4 (since I use that) of the new ioctl method
(just for the reference)
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -43,6 +43,7 @@
#include <linux/io-mapping.h>
#include <linux/delay.h>
#include <linux/etherdevice.h>
+#include <linux/ptp_clock_kernel.h>
#include <net/devlink.h>
#include <uapi/rdma/mlx4-abi.h>
@@ -1929,7 +1930,7 @@ static void unmap_bf_area(struct mlx4_dev *dev)
io_mapping_free(mlx4_priv(dev)->bf_mapping);
}
-u64 mlx4_read_clock(struct mlx4_dev *dev)
+u64 mlx4_read_clock(struct mlx4_dev *dev, struct ptp_system_timestamp
*sts, int clkid)
{
u32 clockhi, clocklo, clockhi1;
u64 cycles;
@@ -1937,7 +1938,13 @@ u64 mlx4_read_clock(struct mlx4_dev *dev)
struct mlx4_priv *priv = mlx4_priv(dev);
for (i = 0; i < 10; i++) {
- clockhi = swab32(readl(priv->clock_mapping));
+ if (sts) {
+ ptp_read_any_prets(sts, clkid);
+ clockhi = swab32(readl(priv->clock_mapping));
+ ptp_read_any_postts(sts, clkid);
+ } else {
+ clockhi = swab32(readl(priv->clock_mapping));
+ }
clocklo = swab32(readl(priv->clock_mapping + 4));
clockhi1 = swab32(readl(priv->clock_mapping));
if (clockhi == clockhi1)
Having said that, the 'flag' modification proposal is a good backup
for the drivers that don't have good implementation (close enough but
not ideal). Also, you don't need a new ioctl-op. So if we really want
precision, I believe, we need a new ioctl op (with supporting
implementation similar to the mlx4 code above). but we want to save
the new ioctl-op and have less precision then proposed modification
would work fine.
> Thanks,
>
> tglx
>
Hi Mahesh
What is the status of your patch?
if your patch is upstreamed , then it will have all I need.
But, If not , I will upstream my patch.
BR,
On Thu, Apr 11, 2024 at 5:56 AM Mahesh Bandewar (महेश बंडेवार)
<[email protected]> wrote:
>
> On Wed, Apr 3, 2024 at 6:48 AM Thomas Gleixner <[email protected]> wrote:
> >
> > On Tue, Apr 02 2024 at 16:37, Mahesh Bandewar (महेश बंडेवार) wrote:
> > > On Tue, Apr 2, 2024 at 3:37 PM Thomas Gleixner <[email protected]> wrote:
> > > The modification that you have proposed (in a couple of posts back)
> > > would work but it's still not ideal since the pre/post ts are not
> > > close enough as they are currently (properly implemented!)
> > > gettimex64() would have. The only way to do that would be to have
> > > another ioctl as I have proposed which is a superset of current
> > > gettimex64 and pre-post collection is the closest possible.
> >
> > Errm. What I posted as sketch _is_ using gettimex64() with the extra
> > twist of the flag vs. a clockid (which is an implementation detail) and
> > the difference that I carry the information in ptp_system_timestamp
> > instead of needing a new argument clockid to all existing callbacks
> > because the modification to ptp_read_prets() and postts() will just be
> > sufficient, no?
> >
> OK, that makes sense.
>
> > For the case where the driver does not provide gettimex64() then the
> > extension of the original offset ioctl is still providing a better
> > mechanism than the proposed syscall.
> >
> > I also clearly said that all drivers should be converted over to
> > gettimex64().
> >
> I agree. Honestly that should have been mandatory and
> ptp_register_clock() should fail otherwise! Probably should have been
> part of gettimex64 implementation :(
>
> I don't think we can do anything other than just hoping all driver
> implementations include gettimex64 implementation.
>
> > > Having said that, the 'flag' modification proposal is a good backup
> > > for the drivers that don't have good implementation (close enough but
> > > not ideal). Also, you don't need a new ioctl-op. So if we really want
> > > precision, I believe, we need a new ioctl op (with supporting
> > > implementation similar to the mlx4 code above). but we want to save
> > > the new ioctl-op and have less precision then proposed modification
> > > would work fine.
> >
> > I disagree. The existing gettimex64() is good enough if the driver
> > implements it correctly today. If not then those drivers need to be
> > fixed independent of this.
> >
> > So assumed that a driver does:
> >
> > gettimex64()
> > ptp_prets(sts);
> > read_clock();
> > ptp_postts(sts);
> >
> > today then having:
> >
> > static inline void ptp_read_system_prets(struct ptp_system_timestamp *sts)
> > {
> > if (sts) {
> > if (sts->flags & PTP_SYS_OFFSET_MONO_RAW)
> > ktime_get_raw_ts64(&sts->pre_ts);
> > else
> > ktime_get_real_ts64(&sts->pre_ts);
> > }
> > }
> >
> > static inline void ptp_read_system_postts(struct ptp_system_timestamp *sts)
> > {
> > if (sts) {
> > if (sts->flags & PTP_SYS_OFFSET_MONO_RAW)
> > ktime_get_raw_ts64(&sts->post_ts);
> > else
> > ktime_get_real_ts64(&sts->post_ts);
> > }
> > }
> >
> > or
> >
> > static inline void ptp_read_system_prets(struct ptp_system_timestamp *sts)
> > {
> > if (sts) {
> > switch (sts->clockid) {
> > case CLOCK_MONOTONIC_RAW:
> > time_get_raw_ts64(&sts->pre_ts);
> > break;
> > case CLOCK_REALTIME:
> > ktime_get_real_ts64(&sts->pre_ts);
> > break;
> > }
> > }
> > }
> >
> > static inline void ptp_read_system_postts(struct ptp_system_timestamp *sts)
> > {
> > if (sts) {
> > switch (sts->clockid) {
> > case CLOCK_MONOTONIC_RAW:
> > time_get_raw_ts64(&sts->post_ts);
> > break;
> > case CLOCK_REALTIME:
> > ktime_get_real_ts64(&sts->post_ts);
> > break;
> > }
> > }
> > }
> >
> > is doing the exact same thing as your proposal but without touching any
> > driver which implements gettimex64() correctly at all.
> >
> I see. Yes, this makes sense.
>
> > While your proposal requires to touch every single driver for no reason,
> > no?
> >
> > It is just an implementation detail whether you use a flag or a
> > clockid. You can carry the clockid for the clocks which actually can be
> > read in that context in a reserved field of PTP_SYS_OFFSET_EXTENDED:
> >
> > struct ptp_sys_offset_extended {
> > unsigned int n_samples; /* Desired number of measurements. */
> > clockid_t clockid;
> > unsigned int rsv[2]; /* Reserved for future use. */
> > };
> >
> > and in the IOCTL:
> >
> > if (extoff->clockid != CLOCK_MONOTONIC_RAW)
> > return -EINVAL;
> >
> > sts.clockid = extoff->clockid;
> >
> > and it all just works, no?
> >
> Yes, this should work. However, I didn't check if struct
> ptp_system_timestamp is used in some other context.
>
> > I have no problem to decide that PTP_SYS_OFFSET will not get this
> > treatment and the drivers have to be converted over to
> > PTP_SYS_OFFSET_EXTENDED.
> >
> > But adding yet another callback just to carry a clockid as argument is a
> > more than pointless exercise as I demonstrated.
> >
> Agreed. As I said, I thought we cannot change the gettimex64() without
> breaking the compatibility but the fact that CLOCK_REALTIME is "0"
> works well for the backward compatibility case.
>
> I can spin up an updated patch/series that updates gettimex64
> implementation instead of adding a new ioctl-op If you all agree.
>
> thanks,
> --mahesh..
>
> > Thanks,
> >
> > tglx