This patch makes enabling the vector extension on a RISC-V machine
optional by adding an option to prctl() that allows a process to enable,
disable, or query its vector context state.
- Added a prctl() option to enable/disable/query the current vector state
- Added a function in the RISC-V-specific code to change/query the
  process state
- Fixed a problem with the initial set of patches
  (missing some EXPORT_SYMBOL() macro calls)
- Rebased to 6.0-rc1
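
Illustrative userspace usage (not part of the patch; error handling is
elided, and the PR_RVV_* constants mirror the uapi additions below):

  #include <stdio.h>
  #include <sys/prctl.h>

  #ifndef PR_RVV_STATE
  # define PR_RVV_STATE   65
  # define PR_RVV_DISABLE 0
  # define PR_RVV_ENABLE  1
  # define PR_RVV_QUERY   2
  #endif

  int main(void)
  {
          if (prctl(PR_RVV_STATE, PR_RVV_ENABLE) != 0)
                  perror("PR_RVV_ENABLE");

          /* PR_RVV_QUERY returns 1 if the vector context is on, 0 if off */
          printf("V enabled: %d\n", prctl(PR_RVV_STATE, PR_RVV_QUERY));

          prctl(PR_RVV_STATE, PR_RVV_DISABLE);
          return 0;
  }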
---
arch/riscv/configs/defconfig | 3 +++
arch/riscv/include/asm/kvm_vcpu_vector.h | 8 ++++----
arch/riscv/include/asm/processor.h | 6 ++++++
arch/riscv/include/asm/switch_to.h | 11 +++++++++++
arch/riscv/kernel/cpufeature.c | 3 ++-
arch/riscv/kernel/process.c | 19 ++++++++++++++++++-
arch/riscv/kvm/vcpu_vector.c | 14 +++++++-------
include/uapi/linux/prctl.h | 6 ++++++
kernel/sys.c | 7 +++++++
9 files changed, 64 insertions(+), 13 deletions(-)
diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig
index aed332a9d4ea..fce054286b1f 100644
--- a/arch/riscv/configs/defconfig
+++ b/arch/riscv/configs/defconfig
@@ -209,3 +209,6 @@ CONFIG_RCU_EQS_DEBUG=y
# CONFIG_FTRACE is not set
# CONFIG_RUNTIME_TESTING_MENU is not set
CONFIG_MEMTEST=y
+CONFIG_ARCH_RV64I=y
+CONFIG_64BIT=y
+CONFIG_VECTOR=y
diff --git a/arch/riscv/include/asm/kvm_vcpu_vector.h b/arch/riscv/include/asm/kvm_vcpu_vector.h
index 1dcc1b2e05bb..c7101ff943a0 100644
--- a/arch/riscv/include/asm/kvm_vcpu_vector.h
+++ b/arch/riscv/include/asm/kvm_vcpu_vector.h
@@ -22,9 +22,9 @@ void __kvm_riscv_vector_save(struct kvm_cpu_context *context);
void __kvm_riscv_vector_restore(struct kvm_cpu_context *context);
void kvm_riscv_vcpu_vector_reset(struct kvm_vcpu *vcpu);
void kvm_riscv_vcpu_guest_vector_save(struct kvm_cpu_context *cntx,
- unsigned long isa);
+ unsigned long *isa);
void kvm_riscv_vcpu_guest_vector_restore(struct kvm_cpu_context *cntx,
- unsigned long isa);
+ unsigned long *isa);
void kvm_riscv_vcpu_host_vector_save(struct kvm_cpu_context *cntx);
void kvm_riscv_vcpu_host_vector_restore(struct kvm_cpu_context *cntx);
void kvm_riscv_vcpu_free_vector_context(struct kvm_vcpu *vcpu);
@@ -34,12 +34,12 @@ static inline void kvm_riscv_vcpu_vector_reset(struct kvm_vcpu *vcpu)
}
static inline void kvm_riscv_vcpu_guest_vector_save(struct kvm_cpu_context *cntx,
- unsigned long isa)
+ unsigned long *isa)
{
}
static inline void kvm_riscv_vcpu_guest_vector_restore(struct kvm_cpu_context *cntx,
- unsigned long isa)
+ unsigned long *isa)
{
}
diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
index a09141ecf6aa..f2d0a91ce174 100644
--- a/arch/riscv/include/asm/processor.h
+++ b/arch/riscv/include/asm/processor.h
@@ -88,6 +88,12 @@ extern void riscv_fill_hwcap(void);
extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
extern unsigned long signal_minsigstksz __ro_after_init;
+
+#ifdef CONFIG_VECTOR
+extern int rvv_proc_enable(unsigned long x);
+#define RVV_PROC_ENABLE(x) rvv_proc_enable(x)
+#endif
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_RISCV_PROCESSOR_H */
diff --git a/arch/riscv/include/asm/switch_to.h b/arch/riscv/include/asm/switch_to.h
index 527951c033d4..d9747450311c 100644
--- a/arch/riscv/include/asm/switch_to.h
+++ b/arch/riscv/include/asm/switch_to.h
@@ -80,6 +80,17 @@ extern unsigned long riscv_vsize;
extern void __vstate_save(struct __riscv_v_state *save_to, void *datap);
extern void __vstate_restore(struct __riscv_v_state *restore_from, void *datap);
+static inline bool vstate_query(struct pt_regs *regs)
+{
+ return (regs->status & SR_VS) != 0;
+}
+
+static inline void vstate_on(struct task_struct *task,
+ struct pt_regs *regs)
+{
+ regs->status = (regs->status & ~(SR_VS)) | SR_VS_INITIAL;
+}
+
static inline void __vstate_clean(struct pt_regs *regs)
{
regs->status = (regs->status & ~(SR_VS)) | SR_VS_CLEAN;
diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
index 0487ab19b234..3be469cb9266 100644
--- a/arch/riscv/kernel/cpufeature.c
+++ b/arch/riscv/kernel/cpufeature.c
@@ -37,6 +37,8 @@ __ro_after_init DEFINE_STATIC_KEY_FALSE(cpu_hwcap_fpu);
#include <asm/vector.h>
__ro_after_init DEFINE_STATIC_KEY_FALSE(cpu_hwcap_vector);
unsigned long riscv_vsize __read_mostly;
+EXPORT_SYMBOL(cpu_hwcap_vector);
+EXPORT_SYMBOL(riscv_vsize);
#endif
/**
@@ -346,4 +348,3 @@ void __init_or_module riscv_cpufeature_patch_func(struct alt_entry *begin,
}
}
#endif
-}
diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c
index e88a37fc77ed..a5a76d1374ec 100644
--- a/arch/riscv/kernel/process.c
+++ b/arch/riscv/kernel/process.c
@@ -15,6 +15,7 @@
#include <linux/tick.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
+#include <linux/prctl.h>
#include <asm/unistd.h>
#include <asm/processor.h>
@@ -134,7 +135,6 @@ void start_thread(struct pt_regs *regs, unsigned long pc,
if (WARN_ON(!vstate->datap))
return;
}
- regs->status |= SR_VS_INITIAL;
/*
* Restore the initial value to the vector register
@@ -230,3 +229,21 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
p->thread.sp = (unsigned long)childregs; /* kernel sp */
return 0;
}
+
+#ifdef CONFIG_VECTOR
+int rvv_proc_enable(unsigned long x)
+{
+ switch (x) {
+ case PR_RVV_DISABLE:
+ vstate_off(current, task_pt_regs(current));
+ return 0;
+ case PR_RVV_ENABLE:
+ vstate_on(current, task_pt_regs(current));
+ return 0;
+ case PR_RVV_QUERY:
+ return vstate_query(task_pt_regs(current));
+ default:
+ return -EINVAL;
+ }
+}
+#endif
diff --git a/arch/riscv/kvm/vcpu_vector.c b/arch/riscv/kvm/vcpu_vector.c
index 37bf4ffd47dd..9d1613da561a 100644
--- a/arch/riscv/kvm/vcpu_vector.c
+++ b/arch/riscv/kvm/vcpu_vector.c
@@ -20,7 +20,7 @@
extern unsigned long riscv_vsize;
void kvm_riscv_vcpu_vector_reset(struct kvm_vcpu *vcpu)
{
- unsigned long isa = vcpu->arch.isa;
+ unsigned long isa = *vcpu->arch.isa;
struct kvm_cpu_context *cntx = &vcpu->arch.guest_context;
cntx->sstatus &= ~SR_VS;
@@ -39,20 +39,20 @@ static void kvm_riscv_vcpu_vector_clean(struct kvm_cpu_context *cntx)
}
void kvm_riscv_vcpu_guest_vector_save(struct kvm_cpu_context *cntx,
- unsigned long isa)
+ unsigned long *isa)
{
if ((cntx->sstatus & SR_VS) == SR_VS_DIRTY) {
- if (riscv_isa_extension_available(&isa, v))
+ if (riscv_isa_extension_available(isa, v))
__kvm_riscv_vector_save(cntx);
kvm_riscv_vcpu_vector_clean(cntx);
}
}
void kvm_riscv_vcpu_guest_vector_restore(struct kvm_cpu_context *cntx,
- unsigned long isa)
+ unsigned long *isa)
{
if ((cntx->sstatus & SR_VS) != SR_VS_OFF) {
- if (riscv_isa_extension_available(&isa, v))
+ if (riscv_isa_extension_available(isa, v))
__kvm_riscv_vector_restore(cntx);
kvm_riscv_vcpu_vector_clean(cntx);
}
@@ -122,7 +122,7 @@ int kvm_riscv_vcpu_get_reg_vector(struct kvm_vcpu *vcpu,
const struct kvm_one_reg *reg,
unsigned long rtype)
{
- unsigned long isa = vcpu->arch.isa;
+ unsigned long isa = *vcpu->arch.isa;
unsigned long __user *uaddr =
(unsigned long __user *)(unsigned long)reg->addr;
unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
@@ -149,7 +149,7 @@ int kvm_riscv_vcpu_set_reg_vector(struct kvm_vcpu *vcpu,
const struct kvm_one_reg *reg,
unsigned long rtype)
{
- unsigned long isa = vcpu->arch.isa;
+ unsigned long isa = *vcpu->arch.isa;
unsigned long __user *uaddr =
(unsigned long __user *)(unsigned long)reg->addr;
unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index a5e06dcbba13..8ea56e4c48f8 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -281,6 +281,12 @@ struct prctl_mm_map {
# define PR_SME_VL_LEN_MASK 0xffff
# define PR_SME_VL_INHERIT (1 << 17) /* inherit across exec */
+/* RISC-V V vector extension */
+#define PR_RVV_STATE 65
+# define PR_RVV_DISABLE 0
+# define PR_RVV_ENABLE 1
+# define PR_RVV_QUERY 2
+
#define PR_SET_VMA 0x53564d41
# define PR_SET_VMA_ANON_NAME 0
diff --git a/kernel/sys.c b/kernel/sys.c
index b911fa6d81ab..3049b1823273 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -138,6 +138,9 @@
#ifndef GET_TAGGED_ADDR_CTRL
# define GET_TAGGED_ADDR_CTRL() (-EINVAL)
#endif
+#ifndef RVV_PROC_ENABLE
+# define RVV_PROC_ENABLE(x) (-EINVAL)
+#endif
/*
* this is where the system-wide overflow UID and GID are defined, for
@@ -2620,6 +2623,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = sched_core_share_pid(arg2, arg3, arg4, arg5);
break;
#endif
+ case PR_RVV_STATE:
+ error = RVV_PROC_ENABLE(arg2);
+ break;
+
case PR_SET_VMA:
error = prctl_set_vma(arg2, arg3, arg4, arg5);
break;
--
2.25.1
Hi Darius, Andrew, Palmer
On 9/21/22 14:43, Chris Stillson wrote:
> diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c
>
> @@ -134,7 +135,6 @@ void start_thread(struct pt_regs *regs, unsigned long pc,
> if (WARN_ON(!vstate->datap))
> return;
> }
> - regs->status |= SR_VS_INITIAL;
>
Perhaps not obvious from the patch, but this is a major user experience
change: the V unit would be turned off for a new task, and we will rely
on a userspace prctl (also introduced in this patch) to enable V.
I know some of you had different opinions on this in the past [1], so
this is to make sure everyone's on the same page.
And if we agree this is the way to go, how exactly will this be done in
userspace?
Will the glibc dynamic loader invoke the prctl()? How will it decide
whether to do this (or not) - will it be unconditional, or will it use
the hwcap? Does the latter plumbing exist already? If so, is it AT_HWCAP /
HWCAP2?
Also, for statically linked executables, where will the prctl be called from?
[1] https://sourceware.org/pipermail/libc-alpha/2021-November/132883.html
On Thu, 08 Dec 2022 21:16:06 PST (-0800), Vineet Gupta wrote:
> Hi Darius, Andrew, Palmer
>
> On 9/21/22 14:43, Chris Stillson wrote:
>> diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c
>>
>> @@ -134,7 +135,6 @@ void start_thread(struct pt_regs *regs, unsigned long pc,
>> if (WARN_ON(!vstate->datap))
>> return;
>> }
>> - regs->status |= SR_VS_INITIAL;
>>
>
> Perhaps not obvious from the patch, but this is a major user experience
> change: As in V unit would be turned off for a new task and we will rely
> on a userspace prctl (also introduced in this patch) to enable V.
IMO that's the only viable option: enabling V adds more user-visible
state, which is a uABI break. I haven't really had time to poke through
all the versions here, but I'd have the call look something like
prctl(RISCV_ENABLE_V, min_vlenb, max_vlenb, flags);
where
* min_vlenb is the smallest VLENB that userspace can support. There's
already an LLVM argument for this; I haven't dug into the generated
code, but I assume it'll blow up on smaller VLENB systems somehow.
* max_vlenb is the largest VLENB that userspace can support.
* flags is just a placeholder for now, with 0 meaning "V as defined by
1.0 for all threads in this process". That should give us an out if
something more complicated happens in the future.
That way VLA code can call `prctl(RISCV_ENABLE_V, 128, 8192, 0)` as it
supports any V 1.0 implementation, while code with other constraints can
avoid having V turned on in an unsupported configuration.
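(For concreteness, a sketch of how VLA code might use this proposed
call; RISCV_ENABLE_V and its number are hypothetical here, nothing in
this sketch is an existing uapi:)

  #include <sys/prctl.h>

  /* Hypothetical constant for the proposal above; no such uapi exists. */
  #define RISCV_ENABLE_V 70

  int enable_v_for_vla(void)
  {
          /*
           * VLA code supports any V 1.0 implementation, so pass the
           * example bounds from above: min_vlenb=128, max_vlenb=8192,
           * flags=0.
           */
          return prctl(RISCV_ENABLE_V, 128UL, 8192UL, 0UL);
  }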
I think we can start out with no flags, but there's a few I could see
being useful already:
* Cross process/thread enabling. I think a reasonable default is
"enable V for all current and future threads in this process", but one
could imagine flags for "just this thread" vs "all current threads", a
default for new threads, and a default for child processes. I don't
think it matters so much what we pick as a default, just that it's
written down.
* Setting the VLENB bounds vs updating them. I'm thinking for shared
libraries, where they'd only want to enable V in the shared library if
it's already in a supported configuration. I'm not sure what the
right rules are here, but again it's best to write that down.
* Some way to disable V. Maybe we just say `prctl(RISCV_ENABLE_V, 0, 0,
...)` disables V, or maybe it's a flag? Again, it should just be
written down.
* What exactly we're enabling -- is it the V extension, or just the V
registers?
There's a bunch of subtlety here, though, so I think we'd at least want
glibc and gdb support posted before committing to any uABI. It's
probably also worth looking at what the Arm folks did for SVE: I gave it
a quick glance and it seems like there's a lot of similarities with what
I'm suggesting here, but again a lot of this is pretty subtle stuff so
it's hard to tell just at a glance.
> I know some of you had different opinion on this in the past [1], so
> this is to make sure everyone's on same page.
> And if we agree this is the way to go, how exactly will this be done in
> userspace.
>
> glibc dynamic loader will invoke the prctl() ? How will it decide
> whether to do this (or not) - will it be unconditional or will it use
> the hwcap - does latter plumbing exist already ? If so is it AT_HWCAP /
> HWCAP2.
That part I haven't sorted out yet, and I don't think it's sufficient to
just say "userspace should enable what it can support" because of how
pervasive V instructions are going to be.
I don't think we need HWCAP, as userspace will need to call the prctl()
anyway to turn on V and thus can just use the success/failure of that to
sort things out.
Maybe it's sufficient to rely on some sort of sticky prctl() (or sysctl
type thing, the differences there would be pretty subtle) and just not
worry about it, but having some way of encoding this in the ELF seems
nice. That said, we've had a bunch of trouble sorting out the ISA
encoding in ELFs so maybe it's just not worth bothering?
> Also for static linked executables, where will the prctl be called from ?
I guess that's pretty far in the weeds, but we could at least hook CRT
to insert the relevant code. We'd really need to sort out how we're
going to encode the V support in binaries, though.
> [1] https://sourceware.org/pipermail/libc-alpha/2021-November/132883.html
Requiring application programmers (i.e. those who write main()) to
make a prctl() call is obviously completely unacceptable, because
application programmers don't know whether the V extension is being
used. Auto-vectorization and libc-function implementations will use
the V extension without any application-programmer knowledge or
intervention. And obviously we don't want to preclude that.
This suggests that ld.so, early-stage libc, or possibly both will need
to make this prctl() call, perhaps by parsing the ELF headers of the
binary and each library to determine if the V extension is used.
Personally, I'm agnostic to whether we put this onus on the kernel or
on user-space--I just want to make sure we're all on the same page
that it needs to be hidden behind libc/ld.so/etc. The onus can't be
on the application programmer.
On Thu, Dec 8, 2022 at 8:27 PM Palmer Dabbelt <[email protected]> wrote:
>
> On Thu, 08 Dec 2022 21:16:06 PST (-0800), Vineet Gupta wrote:
> > Hi Darius, Andrew, Palmer
> >
> > On 9/21/22 14:43, Chris Stillson wrote:
> >> diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c
> >>
> >> @@ -134,7 +135,6 @@ void start_thread(struct pt_regs *regs, unsigned long pc,
> >> if (WARN_ON(!vstate->datap))
> >> return;
> >> }
> >> - regs->status |= SR_VS_INITIAL;
> >>
> >
> > Perhaps not obvious from the patch, but this is a major user experience
> > change: As in V unit would be turned off for a new task and we will rely
> > on a userspace prctl (also introduced in this patch) to enable V.
>
> IMO that's the only viable option: enabling V adds more user-visible
> state, which is a uABI break. I haven't really had time to poke through
> all the versions here, but I'd have the call look something like
>
> prctl(RISCV_ENABLE_V, min_vlenb, max_vlenb, flags);
>
> where
>
> * min_vlenb is the smallest VLENB that userspace can support. There's
> alreday an LLVM argument for this, I haven't dug into the generated
> code but I assume it'll blow up on smaller VLENB systems somehow.
> * max_vlenb is the largest VLENB that userspace can support.
> * flags is just a placeholder for now, with 0 meaning "V as defined by
> 1.0 for all threads in this proces". That should give us an out if
> something more complicated happens in the future.
>
> That way VLA code can call `prctl(RISCV_ENABLE_V, 128, 8192, 0)` as it
> supports any V 1.0 implementation, while code with other constraints can
> avoid having V turned on in an unsupported configuration.
VLA code needs to read the vlenb CSR; it can't assume 8192 (or any
other small number) is a safe upper bound.
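(For reference, reading it is a single unprivileged CSR access - a
sketch, valid only once V is enabled for the calling thread:)

  /* VLENB is the vector register length in bytes; reading it traps
   * with an illegal-instruction exception while V is disabled. */
  static inline unsigned long read_vlenb(void)
  {
          unsigned long vlenb;

          asm volatile ("csrr %0, vlenb" : "=r" (vlenb));
          return vlenb;
  }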
* Andrew Waterman:
> This suggests that ld.so, early-stage libc, or possibly both will need
> to make this prctl() call, perhaps by parsing the ELF headers of the
> binary and each library to determine if the V extension is used.
If the string functions use the V extension, it will be enabled
unconditionally. So I don't see why it's okay for libc to trigger this
alleged UAPI change, when the kernel can't do it by default.
Thanks,
Florian
* Darius Rad:
> On Fri, Dec 09, 2022 at 11:02:57AM +0100, Florian Weimer wrote:
>> * Andrew Waterman:
>>
>> > This suggests that ld.so, early-stage libc, or possibly both will need
>> > to make this prctl() call, perhaps by parsing the ELF headers of the
>> > binary and each library to determine if the V extension is used.
>>
>> If the string functions use the V extension, it will be enabled
>> unconditionally. So I don't see why it's okay for libc to trigger this
>> alleged UAPI change, when the kernel can't do it by default.
>>
>
> Because the call to enable can fail and userspace needs to deal with that.
Failure is usually indicated by an AT_HWCAP or AT_HWCAP2 bit remaining
zero, or perhaps a special CPU register (although that is more unusual).
It's possible to do this differently, but all the mid-level startup
code will have to replicate it (the libcs, other run-time environments
like Go, and so on).
Still it's much better than executing the instruction to see if it
traps, so I won't complain too much.
Thanks,
Florian
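(For reference, the trap-probe approach Florian would rather avoid
looks roughly like this sketch - exactly the boilerplate a HWCAP bit
makes unnecessary:)

  #include <setjmp.h>
  #include <signal.h>

  static sigjmp_buf probe_env;

  static void on_sigill(int sig)
  {
          siglongjmp(probe_env, 1);
  }

  /* Returns 1 if a V CSR access executes, 0 if it traps with SIGILL. */
  static int v_probe(void)
  {
          struct sigaction sa = { .sa_handler = on_sigill };
          struct sigaction old;
          int ok = 0;

          sigaction(SIGILL, &sa, &old);
          if (sigsetjmp(probe_env, 1) == 0) {
                  asm volatile ("csrr zero, vlenb");
                  ok = 1;
          }
          sigaction(SIGILL, &old, NULL);
          return ok;
  }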
On Fri, Dec 09, 2022 at 11:02:57AM +0100, Florian Weimer wrote:
> * Andrew Waterman:
>
> > This suggests that ld.so, early-stage libc, or possibly both will need
> > to make this prctl() call, perhaps by parsing the ELF headers of the
> > binary and each library to determine if the V extension is used.
>
> If the string functions use the V extension, it will be enabled
> unconditionally. So I don't see why it's okay for libc to trigger this
> alleged UAPI change, when the kernel can't do it by default.
>
Because the call to enable can fail and userspace needs to deal with that.
// darius
On Fri, Dec 09, 2022 at 01:32:33PM +0100, Florian Weimer via Libc-alpha wrote:
> * Darius Rad:
>
> > On Fri, Dec 09, 2022 at 11:02:57AM +0100, Florian Weimer wrote:
> >> * Andrew Waterman:
> >>
> >> > This suggests that ld.so, early-stage libc, or possibly both will need
> >> > to make this prctl() call, perhaps by parsing the ELF headers of the
> >> > binary and each library to determine if the V extension is used.
> >>
> >> If the string functions use the V extension, it will be enabled
> >> unconditionally. So I don't see why it's okay for libc to trigger this
> >> alleged UAPI change, when the kernel can't do it by default.
> >>
> >
> > Because the call to enable can fail and userspace needs to deal with that.
>
> Failure is usually indicated by an AT_HWCAP or AT_HWCAP2 bit remaining
> zero, or perhaps a special CPU register (although that is more unusual).
That would indicate that the extension is not present, which is one way
it can fail, but not the only one.
The vector extension relies on dynamically allocated memory in the kernel,
which can fail.
It also provides the opportunity for the kernel to deny access to the
vector extension, perhaps due to administrative policy or other future
mechanism.
// darius
* Darius Rad:
> On Fri, Dec 09, 2022 at 01:32:33PM +0100, Florian Weimer via Libc-alpha wrote:
>> * Darius Rad:
>>
>> > On Fri, Dec 09, 2022 at 11:02:57AM +0100, Florian Weimer wrote:
>> >> * Andrew Waterman:
>> >>
>> >> > This suggests that ld.so, early-stage libc, or possibly both will need
>> >> > to make this prctl() call, perhaps by parsing the ELF headers of the
>> >> > binary and each library to determine if the V extension is used.
>> >>
>> >> If the string functions use the V extension, it will be enabled
>> >> unconditionally. So I don't see why it's okay for libc to trigger this
>> >> alleged UAPI change, when the kernel can't do it by default.
>> >>
>> >
>> > Because the call to enable can fail and userspace needs to deal with that.
>>
>> Failure is usually indicated by an AT_HWCAP or AT_HWCAP2 bit remaining
>> zero, or perhaps a special CPU register (although that is more unusual).
>
> That would indicate that the extension is not present, which is one of, but
> not the only way it can fail.
I think you should bring down the number of failure modes. HWCAP has
the advantage that it communicates kernel/hypervisor/firmware/CPU
support in a single bit, which simplifies the programming model and
avoids hard-to-detect bugs. It's not clear why it would be beneficial
to continue on ENOMEM failures here because the system must clearly be
in bad shape at this point, and launching a new process is very unlikely
to improve matters. So I think the simpler programming model is the way
to go here.
> The vector extension relies on dynamically allocated memory in the kernel,
> which can fail.
But this failure can be reported as part of execve and clone.
> It also provides the opportunity for the kernel to deny access to the
> vector extension, perhaps due to administrative policy or other future
> mechanism.
HWCAP can do this, too.
Thanks,
Florian
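(In userspace, the single-bit model Florian describes reduces to a
check like this sketch; the V bit shown is hypothetical, as none had
been allocated at the time:)

  #include <sys/auxv.h>

  /* Hypothetical V bit, following RISC-V's one-bit-per-letter scheme. */
  #define HWCAP_RV_V (1UL << ('V' - 'A'))

  static int v_supported(void)
  {
          return (getauxval(AT_HWCAP) & HWCAP_RV_V) != 0;
  }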
On Thu, 2022-12-08 at 22:27 -0800, Palmer Dabbelt wrote:
> On Thu, 08 Dec 2022 21:16:06 PST (-0800), Vineet Gupta wrote:
> > Hi Darius, Andrew, Palmer
> >
> > On 9/21/22 14:43, Chris Stillson wrote:
> > > diff --git a/arch/riscv/kernel/process.c
> > > b/arch/riscv/kernel/process.c
> > >
> > > @@ -134,7 +135,6 @@ void start_thread(struct pt_regs *regs,
> > > unsigned long pc,
> > > if (WARN_ON(!vstate->datap))
> > > return;
> > > }
> > > - regs->status |= SR_VS_INITIAL;
> > >
> >
> > Perhaps not obvious from the patch, but this is a major user
> > experience
> > change: As in V unit would be turned off for a new task and we will
> > rely
> > on a userspace prctl (also introduced in this patch) to enable V.
>
> IMO that's the only viable option: enabling V adds more user-visible
> state, which is a uABI break. I haven't really had time to poke
> through
> all the versions here, but I'd have the call look something like
>
> prctl(RISCV_ENABLE_V, min_vlenb, max_vlenb, flags);
Should we make this extra switch more future-proof by not limiting it
to V only, but also covering other extensions that will introduce extra
state, e.g. P?
On Fri, 09 Dec 2022 05:04:23 PST (-0800), [email protected] wrote:
> * Darius Rad:
>
>> On Fri, Dec 09, 2022 at 01:32:33PM +0100, Florian Weimer via Libc-alpha wrote:
>>> * Darius Rad:
>>>
>>> > On Fri, Dec 09, 2022 at 11:02:57AM +0100, Florian Weimer wrote:
>>> >> * Andrew Waterman:
>>> >>
>>> >> > This suggests that ld.so, early-stage libc, or possibly both will need
>>> >> > to make this prctl() call, perhaps by parsing the ELF headers of the
>>> >> > binary and each library to determine if the V extension is used.
>>> >>
>>> >> If the string functions use the V extension, it will be enabled
>>> >> unconditionally. So I don't see why it's okay for libc to trigger this
>>> >> alleged UAPI change, when the kernel can't do it by default.
>>> >>
>>> >
>>> > Because the call to enable can fail and userspace needs to deal with that.
>>>
>>> Failure is usually indicated by an AT_HWCAP or AT_HWCAP2 bit remaining
>>> zero, or perhaps a special CPU register (although that is more unusual).
>>
>> That would indicate that the extension is not present, which is one of, but
>> not the only way it can fail.
>
> I think you should bring down the number of failure modes. HWCAP has
> the advantage that it communicates kernel/hypervisor/firmware/CPU
> support in a single bit, which simplifies the programming model and
> avoids hard-to-detect bugs. It's not clear why it would be beneficial
> to continue on ENOMEM failures here because the system must clearly be
> in bad shape at this point, and launching a new process is very unlikely
> to improve matters. So I think the simpler programming model is the way
> to go here.
>
>> The vector extension relies on dynamically allocated memory in the kernel,
>> which can fail.
The issue I'm worried about is that V needs more space in the
ucontext-type structures. We have an extensibility scheme there so we
think it can be made to work, but IIUC we'll need glibc to be updated to
handle the extended contexts in order to avoid losing state when doing
ucontext-related operations like signal handling.
I don't see a way to handle that with just HWCAP, as we essentially need
some bi-directional communication between userspace and the kernel so
they can both decide to turn on V. I don't think we strictly need a
system call to do that, we kicked around the idea of encoding this in
the ELF, but there's a lot of flavors of vector in RISC-V and we've had
trouble trying to encode these in binaries before so it seems easier to
just use the syscall.
> But this failure can be reported as part of execve and clone.
>
>> It also provides the opportunity for the kernel to deny access to the
>> vector extension, perhaps due to administrative policy or other future
>> mechanism.
>
> HWCAP can do this, too.
>
> Thanks,
> Florian
On 12/9/22 09:21, Palmer Dabbelt wrote:
> On Fri, 09 Dec 2022 05:04:23 PST (-0800), [email protected] wrote:
>> * Darius Rad:
>>
>>> On Fri, Dec 09, 2022 at 01:32:33PM +0100, Florian Weimer via
>>> Libc-alpha wrote:
>>>> * Darius Rad:
>>>>
>>>> > On Fri, Dec 09, 2022 at 11:02:57AM +0100, Florian Weimer wrote:
>>>> >> * Andrew Waterman:
>>>> >>
>>>> >> > This suggests that ld.so, early-stage libc, or possibly both
>>>> will need
>>>> >> > to make this prctl() call, perhaps by parsing the ELF headers
>>>> of the
>>>> >> > binary and each library to determine if the V extension is used.
>>>> >>
>>>> >> If the string functions use the V extension, it will be enabled
>>>> >> unconditionally. So I don't see why it's okay for libc to
>>>> trigger this
>>>> >> alleged UAPI change, when the kernel can't do it by default.
>>>> >>
>>>> >
>>>> > Because the call to enable can fail and userspace needs to deal
>>>> with that.
>>>>
>>>> Failure is usually indicated by an AT_HWCAP or AT_HWCAP2 bit remaining
>>>> zero, or perhaps a special CPU register (although that is more
>>>> unusual).
>>>
>>> That would indicate that the extension is not present, which is one
>>> of, but
>>> not the only way it can fail.
>>
>> I think you should bring down the number of failure modes. HWCAP has
>> the advantage that it communicates kernel/hypervisor/firmware/CPU
>> support in a single bit, which simplifies the programming model and
>> avoids hard-to-detect bugs. It's not clear why it would be beneficial
>> to continue on ENOMEM failures here because the system must clearly be
>> in bad shape at this point, and launching a new process is very unlikely
>> to improve matters. So I think the simpler programming model is the way
>> to go here.
>>
>>> The vector extension relies on dynamically allocated memory in the
>>> kernel,
>>> which can fail.
>
> The issue I'm worried about is that V needs more space in the
> ucontext-type structures. We have an extensibility scheme there so we
> think it can be made to work, but IIUC we'll need glibc to be updated
> to handle the extended contexts in order to avoid losing state when
> doing ucontext-related operations like signal handling.
Sorry, this is not relevant to this thread. I started a different thread
on the ABI/sigcontext aspects; let's discuss it there.
>
> I don't see a way to handle that with just HWCAP, as we essentially
> need some bi-directional communicaton between userspace and the kernel
> so they can both decide to turn on V. I don't think we strictly need
> a system call to do that, we kicked around the idea of encoding this
> in the ELF, but there's a lot of flavors of vector in RISC-V and we've
> had trouble trying to encode these in binaries before so it seems
> easier to just use the syscall.
>
>> But this failure can be reported as part of execve and clone.
>>
>>> It also provides the opportunity for the kernel to deny access to the
>>> vector extension, perhaps due to administrative policy or other future
>>> mechanism.
>>
>> HWCAP can do this, too.
Having the prctl as a general-purpose knob to disable the V unit for
various reasons makes sense.
But keeping the V unit disabled by default and using prctl as a
gatekeeper to enable it feels unnecessary and tedious.
Here's my reasoning below (I'm collating comments from prior msgs as well).
1. Doesn't it add another userspace ABI, which is already a headache for
this feature? And that needs to be built into not just libc but
potentially other runtimes too. Even after implementation there will be
interim pain as the new prctl takes time to trickle down into tooling
and headers. Besides, the new stuff will never be compatible with older
kernels, but that is a minor point since an older kernel not supporting
V is a deal breaker anyway.
2. People want the prctl gatekeeping for the ability to gracefully handle
memory allocation failure for the extra V-state within the kernel. But
that is only an additional 4K (for typical 128-bit-wide V regs) per task.
If that is failing, the system is not doing well anyway. Besides, it is
not an issue at all, since ENOMEM in clone/execve for the additional
space should handle the failure anyway. Only very sophisticated apps
would downgrade from executing V to scalar code if the prctl failed.
Instead, memory allocation is more likely to be an issue when copying V
state on a deep user stack across signal handling, but there's nothing
we can do about it.
3. Another argument for prctl gatekeeping is ensuring user space conforms
to the vector length. But isn't the holy grail of the RV V-extension VLA
(Vector Length Agnostic) programming? I expect most implementations to
follow that. If there are super sophisticated (or dumb) apps that don't,
they will fail randomly. I think of vector length like any other ISA
extension - it's not that apps are currently required to prctl() for the
bitmanip extension if they want to use it. Sure, they could use AT_HWCAP
(or /proc/cpuinfo or any other portable way) to query the capability;
the same can be done for V as well. Besides, vlen is readable from user
space, so the app can read it to make sure. My worry is we are providing
an additional safety net to a small category of apps at the expense of
making it tiresome for everyone else.
HWCAP solves the kernel-to-user-space communication of capabilities. The
prctl is for user-space-to-kernel communication, but I'm not convinced it
is really solving problems for the common case.
Thx,
-Vineet
On Fri, Dec 9, 2022 at 11:42 AM Vineet Gupta <[email protected]> wrote:
>
>
> On 12/9/22 09:21, Palmer Dabbelt wrote:
> > On Fri, 09 Dec 2022 05:04:23 PST (-0800), [email protected] wrote:
> >> * Darius Rad:
> >>
> >>> On Fri, Dec 09, 2022 at 01:32:33PM +0100, Florian Weimer via
> >>> Libc-alpha wrote:
> >>>> * Darius Rad:
> >>>>
> >>>> > On Fri, Dec 09, 2022 at 11:02:57AM +0100, Florian Weimer wrote:
> >>>> >> * Andrew Waterman:
> >>>> >>
> >>>> >> > This suggests that ld.so, early-stage libc, or possibly both
> >>>> will need
> >>>> >> > to make this prctl() call, perhaps by parsing the ELF headers
> >>>> of the
> >>>> >> > binary and each library to determine if the V extension is used.
> >>>> >>
> >>>> >> If the string functions use the V extension, it will be enabled
> >>>> >> unconditionally. So I don't see why it's okay for libc to
> >>>> trigger this
> >>>> >> alleged UAPI change, when the kernel can't do it by default.
> >>>> >>
> >>>> >
> >>>> > Because the call to enable can fail and userspace needs to deal
> >>>> with that.
> >>>>
> >>>> Failure is usually indicated by an AT_HWCAP or AT_HWCAP2 bit remaining
> >>>> zero, or perhaps a special CPU register (although that is more
> >>>> unusual).
> >>>
> >>> That would indicate that the extension is not present, which is one
> >>> of, but
> >>> not the only way it can fail.
> >>
> >> I think you should bring down the number of failure modes. HWCAP has
> >> the advantage that it communicates kernel/hypervisor/firmware/CPU
> >> support in a single bit, which simplifies the programming model and
> >> avoids hard-to-detect bugs. It's not clear why it would be beneficial
> >> to continue on ENOMEM failures here because the system must clearly be
> >> in bad shape at this point, and launching a new process is very unlikely
> >> to improve matters. So I think the simpler programming model is the way
> >> to go here.
> >>
> >>> The vector extension relies on dynamically allocated memory in the
> >>> kernel,
> >>> which can fail.
> >
> > The issue I'm worried about is that V needs more space in the
> > ucontext-type structures. We have an extensibility scheme there so we
> > think it can be made to work, but IIUC we'll need glibc to be updated
> > to handle the extended contexts in order to avoid losing state when
> > doing ucontext-related operations like signal handling.
>
> Sorry this is not relevant to this thread. I started a different thread
> on ABI/sigcontext aspects, lets discuss it there.
>
> >
> > I don't see a way to handle that with just HWCAP, as we essentially
> > need some bi-directional communicaton between userspace and the kernel
> > so they can both decide to turn on V. I don't think we strictly need
> > a system call to do that, we kicked around the idea of encoding this
> > in the ELF, but there's a lot of flavors of vector in RISC-V and we've
> > had trouble trying to encode these in binaries before so it seems
> > easier to just use the syscall.
> >
> >> But this failure can be reported as part of execve and clone.
> >>
> >>> It also provides the opportunity for the kernel to deny access to the
> >>> vector extension, perhaps due to administrative policy or other future
> >>> mechanism.
> >>
> >> HWCAP can do this, too.
>
> Having the prctl as general purpose knob to disable the V unit for
> various reasons makes sense.
>
> But keeping the V unit disabled by default and using prctl as a
> gatekeeper to enable it feels unnecessary and tedious.
> Here's my reasoning below (I'm collating comments from prior msgs as well).
>
> 1. Doesn't it add another userspace ABI which is already a headache for
> this feature. And that needs to be built into not just libc but
> potentially other runtimes too. Even after implemention there will be an
> interim pain as the new prctl takes time to trickle down into tooling
> and headers. Besides the new stuff will never be compatible with older
> kernel but that is a minor point since older kernel not supporting V is
> a deal breaker anyways.
>
> 2. People want the prctl gatekeeping for ability to gracefully handle
> memory allocation failure for the extra V-state within kernel. But that
> is only additional 4K (for typical 128 wide V regs) per task. If that is
> failing, the system is not doing well anyways. Besides it is not an
> issue at all since ENOMEM in clone/execve for the additional space
> should handle the failure anyways. Only very sophisticated apps would
> downgrade from executing V to Scalar code if the prctl failed. Instead
> memory allocation is more likely to be an issue when copying V state on
> a deep user stack across signal handling but there's nothing we can do
> about it.
>
> 3. Another argument to prctl gatekeeping is ensuring user-space conforms
> to vector length. But isn't the holy grail of RV V-extension VLA (Vector
> Length Agnostic) programming.
Yes, a suitable ABI for the V extension should cater cleanly to the
VLA model, since that's a major selling point of this ISA. The
baseline assumption should be that programs will execute correctly
regardless of VLEN (subject to the constraint that the V extension
requires VLEN >= 128, of course).
It's of course valid to construct programs with VLEN-dependent
behavior (e.g. dynamic dispatch to routines optimized differently for
different VLEN), but it should be considered the program's
responsibility to get that right. I don't think the ABI needs to
furnish guard rails.
On Fri, Dec 09, 2022 at 11:42:19AM -0800, Vineet Gupta wrote:
>
> But keeping the V unit disabled by default and using prctl as a gatekeeper
> to enable it feels unnecessary and tedious.
> Here's my reasoning below (I'm collating comments from prior msgs as well).
Please reference the previous discussion [1] which has covered topics that
have not been discussed recently.
[1] https://lists.infradead.org/pipermail/linux-riscv/2021-September/thread.html#8361
>
> 1. Doesn't it add another userspace ABI which is already a headache for this
> feature. And that needs to be built into not just libc but potentially other
> runtimes too. Even after implemention there will be an interim pain as the
> new prctl takes time to trickle down into tooling and headers. Besides the
> new stuff will never be compatible with older kernel but that is a minor
> point since older kernel not supporting V is a deal breaker anyways.
>
None of this is relevant because there is no existing user space ABI for
vector. It is being invented now. If this is done poorly, for example, by
missing this opportunity to add a mechanism for user space to request use
of the vector extension, it will be much more painful to add later.
> 2. People want the prctl gatekeeping for ability to gracefully handle memory
> allocation failure for the extra V-state within kernel. But that is only
> additional 4K (for typical 128 wide V regs) per task.
But vector state scales up to as much as 256k (32 registers at the
architectural maximum VLEN of 65536 bits each). Are you suggesting that
there is no possibility that future systems would support more than
VLEN=128?
> If that is failing,
> the system is not doing well anyways. Besides it is not an issue at all
> since ENOMEM in clone/execve for the additional space should handle the
> failure anyways. Only very sophisticated apps would downgrade from executing
> V to Scalar code if the prctl failed.
This seems unlikely. As vector support does not exist in any present
hardware, and the vector extension is only optional in the RISC-V profiles
that include it, I would think that it is almost certain that any
application that supports V would have a fallback path for when the V
extension is not available.
Another motivation for requiring that user space request use of the vector
extension is that the vector unit may be shared between multiple harts
and/or have power or performance implications in the system. By requiring
that user space request access, it allows the system to decline that
access, and user space can handle this gracefully.
If we add a mechanism for user space to request access to the vector
extension, and it turns out that it was unnecessary, the worst that has
happened is a slight inconvenience.
If we do not add such a mechanism, and later determine that it is
necessary, we have a much greater problem. There would be backward
compatibility issues with the ABI, and such a mechanism could probably not
be fully implemented at all due to the desire to support potential future
legacy vector code.
This is a similar problem on x86. According to some, it was handled poorly
with AVX-512 by missing this type of mechanism, and improved with AMX [2].
There is opportunity to learn from that experience and do things better on
RISC-V.
[2] https://lore.kernel.org/lkml/[email protected]/
// darius
On 12/13/22 08:43, Darius Rad wrote:
> On Fri, Dec 09, 2022 at 11:42:19AM -0800, Vineet Gupta wrote:
>> But keeping the V unit disabled by default and using prctl as a gatekeeper
>> to enable it feels unnecessary and tedious.
>> Here's my reasoning below (I'm collating comments from prior msgs as well).
> Please reference the previous discussion [1] which has covered topics that
> have not been discussed recently.
>
> [1] https://lists.infradead.org/pipermail/linux-riscv/2021-September/thread.html#8361
I sure read through that thread, and many more :-) to get context.
The highlight is that we should do something because AVX/AMX did so (or
failed to do so).
But on the flip side, ARM SVE is not disabling this by default.
Your other concerns seem to be potential power implications of leaving
it on, and sharing of the V unit across harts (see more on that below).
Maybe leaving it on all the time will be motivation for hw designers to
be more considerate of the idle power draw.
>
>> 2. People want the prctl gatekeeping for ability to gracefully handle memory
>> allocation failure for the extra V-state within kernel. But that is only
>> additional 4K (for typical 128 wide V regs) per task.
> But vector state scales up to as much as 256k. Are you suggesting that
> there is no possibility that future systems would support more than
> VLEN=128?
I mentioned "typical". And below I also said that memory allocation
concerns are moot, since fork/execve failures due to failing to allocate
would take care of those anyway.
>> If that is failing,
>> the system is not doing well anyways. Besides it is not an issue at all
>> since ENOMEM in clone/execve for the additional space should handle the
>> failure anyways. Only very sophisticated apps would downgrade from executing
>> V to Scalar code if the prctl failed.
> This seems unlikely. As vector support does not exist in any present
> hardware, and the vector extension is only optional in the RISC-V profiles
> that include it, I would think that it is almost certain that any
> application that supports V would have a fallback path for when the V
> extension is not available.
For specialized cases, sure, we would expect fat binaries with
IFUNC-based dispatch (glibc mem*/str* are obvious examples); see the
sketch below. But with newer compilers autovectorization is increasingly
becoming the default even at medium optimization levels such as -O2. So
V code littered all over is just a matter of time for the
profiles/variants which allow V. For less capable systems w/o V this is
all but a moot discussion, since the kernel itself need not be built
with V enabled.
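(A sketch of that IFUNC-style dispatch; the symbol names are made up
and the real glibc plumbing differs:)

  #include <stddef.h>
  #include <string.h>

  /* Stand-in variants; a real libc would provide optimized versions. */
  static void *memcpy_scalar(void *d, const void *s, size_t n)
  {
          return memmove(d, s, n);
  }

  static void *memcpy_vector(void *d, const void *s, size_t n)
  {
          return memmove(d, s, n);
  }

  /* Stand-in for an AT_HWCAP or prctl()-based capability check. */
  static int v_available(void)
  {
          return 0;
  }

  /* The resolver runs once at relocation time and picks the variant. */
  static void *(*resolve_my_memcpy(void))(void *, const void *, size_t)
  {
          return v_available() ? memcpy_vector : memcpy_scalar;
  }

  void *my_memcpy(void *dst, const void *src, size_t n)
          __attribute__((ifunc("resolve_my_memcpy")));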
> Another motivation for requiring that user space request use of the vector
> extension is that the vector unit may be shared between multiple harts
> and/or have power or performance implications in the system. By requiring
> that user space request access, it allows the system to decline that
> access, and user space can handle this gracefully.
But in this specific example, won't the prctl cause more pain? Say two
concurrent processes on two different harts with a shared V unit: one
sends a prctl to enable and the other wants to disable - what would the
kernel do? Will it just be that whoever ends up running later wins?
Granted, I'm not too familiar with how such cross-hart sharing would
work with vector instructions being part of the ISA (vs. a vector
accelerator with a job push/pull approach).
Honestly, I'm sympathetic to your power concerns with keeping V enabled
all the time. But the mechanics of implementing this prctl make me
wary. Assuming this is done from the dynamic loader, it implies the
loader itself needs to be built with V disabled. This also leaves a
bunch of perf on the table, since the loader does tons of string and
memory operations which could potentially benefit from V-enabled code;
granted, that is not a deal breaker.
> If we add a mechanism for user space to request access to the vector
> extension, and it turns out that it was unnecessary, the worst that has
> happened is a slight inconvenience.
>
> If we do not add such a mechanism, and later determine that it is
> necessary, we have a much greater problem. There would be backward
> compatibility issues with the ABI, and such a mechanism could probably not
> be fully implemented at all due to the desire to support potential future
> legacy vector code.
Very true, but this in itself is not a sufficient reason to warrant
adding it now.
> This is a similar problem on x86. According to some, it was handled poorly
> with AVX-512 by missing this type of mechanism, and improved with AMX [2].
> There is opportunity to learn from that experience and do things better on
> RISC-V.
>
> [2] https://lore.kernel.org/lkml/[email protected]/
Right, but then why did the ARM SVE folks choose not to take this path?
-Vineet
On 12/14/22 14:07, Vineet Gupta wrote:
> On 12/13/22 08:43, Darius Rad wrote:
>> On Fri, Dec 09, 2022 at 11:42:19AM -0800, Vineet Gupta wrote:
>>> If that is failing,
>>> the system is not doing well anyways. Besides it is not an issue at all
>>> since ENOMEM in clone/execve for the additional space should handle the
>>> failure anyways. Only very sophisticated apps would downgrade from
>>> executing
>>> V to Scalar code if the prctl failed.
>> This seems unlikely. As vector support does not exist in any present
>> hardware, and the vector extension is only optional in the RISC-V
>> profiles
>> that include it, I would think that it is almost certain that any
>> application that supports V would have a fallback path for when the V
>> extension is not available.
>
> For specialized cases sure we would expect fat binaries with IFUNC based
> dispatches (glibc mem*/str* are obvious examples). But with newer
> compilers autovec is increasing becoming default even at medium
> optimization levels such as -O2. So V code littered all over is just a
> matter of time for the profiles/variants which allow V. For less capable
Autovectorization is only possible where the profile *requires* V. And
no existing profile does that.
> systems w/o V this is all but moot discussion since kernel itself need
> not be built with V enabled.
Distro kernels will be built with V enabled, and will spend most of
their time running on systems without V.
>> Another motivation for requiring that user space request use of the
>> vector
>> extension is that the vector unit may be shared between multiple harts
>> and/or have power or performance implications in the system. By
>> requiring
>> that user space request access, it allows the system to decline that
>> access, and user space can handle this gracefully.
>
> But in this specific example won't the prctl cause more pain. So 2
> concurrent processes on 2 different harts with shared V unit. One sends
> prctl to enable and other wants to disable, what would the kernel do.
> Will it just be whoever ends up running later wins. Granted I'm not too
> familiar with how such a cross-hart sharing would work in a Vector
> instructions being part of ISA (vs. Vector accelerator with job
> push/pull approach)
The vector execution unit can be shared between harts, not the vector
state. Think SMT.
> Honestly I'm sympathetic to your power concerns with keeping V enabled
> all the time. But the mechanics of implementing this prctl makes me
> wary. Assuming this is done from dynamic loader, it implies loader
> itself needs to be built with V disabled. This also leaves bunch of perf
> on table since loader does tons of of string and memory operations which
> could potentially benefit from V enabled code, granted it is not deal
> breaker.
The dynamic loader can do dynamic dispatch based on the result of the
prctl() just as well as any other library.
And the distro's dynamic loader can't be compiled with autovectorization
enabled anyway, because again the profiles support processors without V.
>> If we add a mechanism for user space to request access to the vector
>> extension, and it turns out that it was unnecessary, the worst that has
>> happened is a slight inconvenience.
>>
>> If we do not add such a mechanism, and later determine that it is
>> necessary, we have a much greater problem. There would be backward
>> compatibility issues with the ABI, and such a mechanism could probably
>> not
>> be fully implemented at all due to the desire to support potential future
>> legacy vector code.
>
> Very true, but this in itself is not sufficient of a reason to warrant
> adding it now.
It is, because changing the sigcontext layout without an opt-in is
already an ABI break (it completely blows past MINSIGSTKSZ).
Regards,
Samuel
On Wed, Dec 14, 2022 at 12:07:03PM -0800, Vineet Gupta wrote:
> On 12/13/22 08:43, Darius Rad wrote:
> > On Fri, Dec 09, 2022 at 11:42:19AM -0800, Vineet Gupta wrote:
> > > But keeping the V unit disabled by default and using prctl as a gatekeeper
> > > to enable it feels unnecessary and tedious.
> > > Here's my reasoning below (I'm collating comments from prior msgs as well).
> > Please reference the previous discussion [1] which has covered topics that
> > have not been discussed recently.
> >
> > [1] https://lists.infradead.org/pipermail/linux-riscv/2021-September/thread.html#8361
>
> I sure read through that thread, and many more :-) to get context.
> The highlight is that we should do something because AVX/AMX did so
> (or failed to do so).
> But on the flip side, ARM SVE is not disabling this by default.
> Your other concerns seem to be potential power implications of leaving
> it on, and sharing of the V unit across harts (see more on that below).
> Maybe leaving it on all the time will motivate hw designers to be more
> considerate of the idle power draw.
>
That is not quite the same takeaway I had from the AMX discussion. I
would also disagree that ARM SVE is not disabling things by default,
although the behavior is somewhat different.
The significant point that I see from that discussion, which also applies
to SVE, and also applies to RISC-V vector, is that the extension is
necessarily dependent upon a functional unit that is reasonably large with
respect to the size of the processor and has a significant amount of
additional architectural state. The argument there is that AMX/SVE/RVV is
a significant system resource and should be treated as such: by having the
kernel track usage of it and by having a process specifically request
access to it.
For any of AMX/SVE/RVV, use of the extension operates as follows:
1. A process requests access to the extension,
2. The kernel allocates memory for the additional state for this process,
3. The kernel enables the extension for the process, and finally
4. The process is able to use the instructions.
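In kernel terms, steps 2 and 3 might look roughly like the sketch below;
every identifier here is illustrative rather than taken from any posted
patch, and the point is only that an explicit request gives the kernel a
place to decline or to return -ENOMEM:

/* illustrative kernel-side handler for an "enable vector" request */
static int vector_request(struct task_struct *tsk)
{
	void *datap;

	if (!cpu_has_vector())		/* step 1 lands here; HW may lack V */
		return -EINVAL;

	datap = kzalloc(vector_state_size(), GFP_KERNEL);	/* step 2 */
	if (!datap)
		return -ENOMEM;		/* reportable, unlike a lazy trap */

	tsk->thread.vstate_datap = datap;
	set_tsk_thread_flag(tsk, TIF_VECTOR);	/* step 3: live at sched-in */
	return 0;			/* step 4: insns now usable */
}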
I don't recall the exact details, but my understanding is that AMX is going
to use an x86-specific mechanism and require an explicit request from user
space for access to AMX.
For SVE, it is in fact disabled by default in the kernel. When a thread
executes the first SVE instruction, it will cause an exception, the kernel
will allocate memory for SVE state and enable TIF_SVE. Further use of SVE
instructions will proceed without exceptions. Although SVE is disabled by
default, it is enabled automatically. Since this is done automatically
during an exception handler, there is no opportunity for memory allocation
errors to be reported, as there are in the AMX case.
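Paraphrased as a sketch (the real logic lives around do_sve_acc() in
arch/arm64/kernel/fpsimd.c; names and error handling here are
approximate), the lazy path looks like:

/* first SVE instruction traps here; EL0 access was disabled until now */
void sve_access_trap(struct pt_regs *regs)
{
	void *state = kzalloc(sve_state_size(current), GFP_KERNEL);

	if (!state) {
		force_sig(SIGKILL);	/* no errno path inside a trap */
		return;
	}
	current->thread.sve_state = state;
	set_thread_flag(TIF_SVE);	/* the retried insn runs trap-free */
}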
For RVV, I do not recall ever seeing Linux patches that automatically enable
vector. I have seen it enabled unconditionally, or with a manual enable
(i.e., prctl).
It is possible to write a program that does not ever use AMX, and when that
program is run, the process will not incur the power or memory overhead of
AMX. It is also possible to do that with SVE. This is simply not possible
if RISC-V will, by default for every process, enable and allocate state
memory for vector.
So my thought would be: what is the motivation for being even less
flexible than SVE, if you feel that the AMX mechanism is too onerous?
> >
> > > 2. People want the prctl gatekeeping for ability to gracefully handle memory
> > > allocation failure for the extra V-state within kernel. But that is only
> > > additional 4K (for typical 128 wide V regs) per task.
> > But vector state scales up to as much as 256k. Are you suggesting that
> > there is no possibility that future systems would support more than
> > VLEN=128?
>
> I mentioned "typical". And below also said that memory allocation concerns
> are moot, since fork/execve failures due to failing to allocate would take
> care of those anyways.
>
But again, what if one were using such an admittedly atypical system? Why
should such a user be compelled to take a memory hit for every process,
even if they specifically go out of their way to avoid using vector
instructions?
> > > If that is failing,
> > > the system is not doing well anyways. Besides it is not an issue at all
> > > since ENOMEM in clone/execve for the additional space should handle the
> > > failure anyways. Only very sophisticated apps would downgrade from executing
> > > V to Scalar code if the prctl failed.
> > This seems unlikely. As vector support does not exist in any present
> > hardware, and the vector extension is only optional in the RISC-V profiles
> > that include it, I would think that it is almost certain that any
> > application that supports V would have a fallback path for when the V
> > extension is not available.
>
> For specialized cases, sure, we would expect fat binaries with
> IFUNC-based dispatch (glibc mem*/str* are obvious examples). But with
> newer compilers, autovectorization is increasingly becoming the default
> even at medium optimization levels such as -O2. So V code littered all
> over is just a matter of time for the profiles/variants which allow V.
> For less capable systems w/o V this is a largely moot discussion, since
> the kernel itself need not be built with V enabled.
>
To me, that seems like a lot of speculation, and certainly not worth
relying on to limit functionality.
>
> > Another motivation for requiring that user space request use of the vector
> > extension is that the vector unit may be shared between multiple harts
> > and/or have power or performance implications in the system. By requiring
> > that user space request access, it allows the system to decline that
> > access, and user space can handle this gracefully.
>
> But in this specific example, won't the prctl cause more pain? Say two
> concurrent processes on two different harts share a V unit. One sends a
> prctl to enable and the other wants to disable; what would the kernel do?
> Will it just be that whoever ends up running later wins? Granted, I'm not
> too familiar with how such cross-hart sharing would work when Vector
> instructions are part of the ISA (vs. a Vector accelerator with a job
> push/pull approach)
>
I think you are misunderstanding both the mechanism and the potential
implementation.
The mechanism is not a switch per hart, it is a switch per process (or
thread). It indicates to the kernel that this process (or thread) is using
the resource, and the kernel will allocate memory, enable instructions,
etc., as needed when that process is scheduled on a hart.
The theoretical implementation is something like what was done with
floating point in the AMD Bulldozer architecture. All the sharing of
resources is done automatically in hardware, but obviously performance
measurements could demonstrate that each thread/hart does not have its own
execution unit. I would imagine that in such an implementation, the vector
unit would not be able to, for example, enter a low power mode unless both
harts that depend on it are not using it. I would also imagine that single
thread performance for vector would be optimized if only one thread were
using the vector unit at a time. If the vector unit is enabled
unconditionally by default for all processes, this is not possible.
// darius
Darius Rad <[email protected]> writes:
> On Wed, Dec 14, 2022 at 12:07:03PM -0800, Vineet Gupta wrote:
>> On 12/13/22 08:43, Darius Rad wrote:
>> > On Fri, Dec 09, 2022 at 11:42:19AM -0800, Vineet Gupta wrote:
>> > > But keeping the V unit disabled by default and using prctl as a gatekeeper
>> > > to enable it feels unnecessary and tedious.
>> > > Here's my reasoning below (I'm collating comments from prior msgs as well).
>> > Please reference the previous discussion [1] which has covered topics that
>> > have not been discussed recently.
>> >
>> > [1] https://lists.infradead.org/pipermail/linux-riscv/2021-September/thread.html#8361
>>
>> I sure read through that thread, and many more :-) to get context.
>> The highlight is that we should do something because AVX/AMX did so
>> (or failed to do so).
>> But on the flip side, ARM SVE is not disabling this by default.
>> Your other concerns seem to be potential power implications of leaving
>> it on, and sharing of the V unit across harts (see more on that below).
>> Maybe leaving it on all the time will motivate hw designers to be more
>> considerate of the idle power draw.
>>
>
> That is not quite the same takeaway I had from the AMX discussion. I
> would also disagree that ARM SVE is not disabling things by default,
> although the behavior is somewhat different.
>
> The significant point that I see from that discussion, which also applies
> to SVE, and also applies to RISC-V vector, is that the extension is
> necessarily dependent upon a functional unit that is reasonably large with
> respect to the size of the processor and has a significant amount of
> additional architectural state. The argument there is that AMX/SVE/RVV is
> a significant system resource and should be treated as such: by having the
> kernel track usage of it and by having a process specifically request
> access to it.
>
> For any of AMX/SVE/RVV, use of the extension operates as follows:
>
> 1. A process requests access to the extension,
>
> 2. The kernel allocates memory for the additional state for this process,
>
> 3. The kernel enables the extension for the process, and finally
>
> 4. The process is able to use the instructions.
>
> I don't recall the exact details, but my understanding is that AMX is going
> to use an x86-specific mechanism and require an explicit request from user
> space for access to AMX.
Yes, it uses arch_prctl, and on top of that a "lazy trigger" (AFAIK) like
SVE (first-use enable via trap).
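For reference, the x86 request path as I understand it, using the uapi
values from the AMX patches (glibc has no wrapper for this, so it goes
through syscall(2)):

#include <sys/syscall.h>
#include <unistd.h>

#define ARCH_REQ_XCOMP_PERM	0x1023	/* from asm/prctl.h */
#define XFEATURE_XTILEDATA	18	/* AMX tile data state */

/* returns 0 on success; the kernel may refuse, and the process is
 * expected to fall back to non-AMX code in that case */
static long request_amx(void)
{
	return syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM,
		       XFEATURE_XTILEDATA);
}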
> For SVE, it is in fact disabled by default in the kernel. When a thread
> executes the first SVE instruction, it will cause an exception, the kernel
> will allocate memory for SVE state and enable TIF_SVE. Further use of SVE
> instructions will proceed without exceptions. Although SVE is disabled by
> default, it is enabled automatically. Since this is done automatically
> during an exception handler, there is no opportunity for memory allocation
> errors to be reported, as there are in the AMX case.
Glibc has an SVE optimized memcpy, right? Doesn't that mean that pretty
much all processes on an SVE capable system will enable SVE (lazily)? If
so, that's close to "enabled by default" (unless SVE is disabled system
wide).
> For RVV, I do not recall ever seeing Linux patches that automatically enable
> vector. I have seen it enabled unconditionally, or with a manual enable
> (i.e., prctl).
>
> It is possible to write a program that does not ever use AMX, and when that
> program is run, the process will not incur the power or memory overhead of
> AMX. It is also possible to do that with SVE. This is simply not possible
> if RISC-V will, by default for every process, enable and allocate state
> memory for vector.
>
> So my thought would be what is the motivation for being even less flexible
> than SVE, if you feel that the AMX mechanism is too onerous?
AMX is a bit different from SVE and V; SVE/V is/would be used by glibc
for memcpy and such, whereas I doubt AMX would be used there. Then
again, there's AVX512, where many argue that "turned on by default" was
a mistake (ABI breakage/power consumption).
>> >
>> > > 2. People want the prctl gatekeeping for ability to gracefully handle memory
>> > > allocation failure for the extra V-state within kernel. But that is only
>> > > additional 4K (for typical 128 wide V regs) per task.
>> > But vector state scales up to as much as 256k. Are you suggesting that
>> > there is no possibility that future systems would support more than
>> > VLEN=128?
>>
>> I mentioned "typical". And below also said that memory allocation concerns
>> are moot, since fork/execve failures due to failing to allocate would take
>> care of those anyways.
>>
>
> But again, what if one were using such an admittedly atypical system? Why
> should such a user be compelled to take a memory hit for every process,
> even if they specifically go out of their way to avoid using vector
> instructions?
For the sake of discussion: nobody is arguing against having knobs to
turn V on/off per-process/per-system, right? The discussion is about the
default, and more broadly what a "typical" RV system looks like. If most
systems that fold in the A profile have V, it might make sense not to
require users to explicitly enable it, and vice versa.
Using RVA23 as a crystal-ball-gazing aid, [1] states that it might
mandate V. If so, assuming that "most systems will be designed for V
usage" is not crazy.
Now, moving on! The thread is leaning towards "disabled by default" (the
"AMX way"), so let's try to move the discussion forward.
The Linux kernel itself would benefit from using V
(hashing/crypto). What kind of policy would determine if the kernel is
allowed to use V? Default off, with an explicit enable kernel knob
(cmdline/sysctl/sysfs/...)?
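Purely as a sketch of the "default off, explicit knob" option, and with
a made-up parameter name, a boot-time switch could be as small as:

/* hypothetical: gate in-kernel V usage behind a cmdline knob */
static bool kernel_v_allowed;	/* consulted before any in-kernel V use */

static int __init kernel_v_setup(char *str)
{
	kernel_v_allowed = true;
	return 1;
}
__setup("riscv.kernel_v", kernel_v_setup);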
There will likely be V support in glibc (str*/mem*). For systems that
prefer having V "always-on", the UX of requiring all binaries to
explicitly call prctl() is not great (as Andrew pointed out in earlier
posts). A V knob based on some system policy in crt0? :-P
Björn
[1] https://lists.riscv.org/g/tech-profiles/message/48
* Björn Töpel:
>> For SVE, it is in fact disabled by default in the kernel. When a thread
>> executes the first SVE instruction, it will cause an exception, the kernel
>> will allocate memory for SVE state and enable TIF_SVE. Further use of SVE
>> instructions will proceed without exceptions. Although SVE is disabled by
>> default, it is enabled automatically. Since this is done automatically
>> during an exception handler, there is no opportunity for memory allocation
>> errors to be reported, as there are in the AMX case.
>
> Glibc has an SVE optimized memcpy, right? Doesn't that mean that pretty
> much all processes on an SVE capable system will enable SVE (lazily)? If
> so, that's close to "enabled by default" (unless SVE is disabled system
> wide).
Yes, see sysdeps/aarch64/multiarch/memcpy.c:
static inline __typeof (__redirect_memcpy) *
select_memcpy_ifunc (void)
{
  INIT_ARCH ();

  if (sve && HAVE_AARCH64_SVE_ASM)
    {
      if (IS_A64FX (midr))
        return __memcpy_a64fx;
      return __memcpy_sve;
    }

  if (IS_THUNDERX (midr))
    return __memcpy_thunderx;

  if (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr))
    return __memcpy_thunderx2;

  if (IS_FALKOR (midr) || IS_PHECDA (midr))
    return __memcpy_falkor;

  return __memcpy_generic;
}
And the __memcpy_sve implementation actually uses SVE.
If there were a prctl to select the vector width and enable the vector
extension, we'd have to pick a width in glibc anyway. Likewise for any
other libc, the Go runtime, and so on. That's why I think the kernel is
in a better position to handle this.
> AMX is a bit different from SVE and V; SVE/V is/would be used by glibc
> for memcpy and such, where I doubt that AMX would be used there. Then
> again, there's AVX512 which many argue that "turned on by default" was a
> mistake (ABI breakage/power consumption).
I don't think AMX is useful for string operations or the math functions
currently implemented in glibc.
Not everything in AVX-512 has high power consumption on relevant CPUs.
Furthermore, the extra registers that don't need VZEROUPPER help us to
avoid transaction aborts in RTM mode. If we had to enable AVX-512
explicitly in every process, I'm not sure if we would be using it today.
The complicated choices around AVX-512 (and AVX2 for earlier CPUs)
aren't particularly unique. These functions have different trade-offs
(optimizing for single thread/single process usage vs global system
behavior) on other architectures, too.
> There will likely be V support in glibc (str*/mem*). For systems that
> prefer having V "always-on", the UX of requiring all binaries to
> explicitly call prctl() is not great (as Andrew pointed out in earlier
> posts). A V knob based on some system policy in crt0? :-P
It wouldn't be in crt0 (statically linked), it would be in the dynamic
loader. So not quite as bad if policy revisions are required. But
glibc is not the only provider of userspace startup code, so future
tuning of userspace policy will remain complicated.
Thanks,
Florian
On 12/15/22 04:28, Florian Weimer via Libc-alpha wrote:
> * Björn Töpel:
>
>>> For SVE, it is in fact disabled by default in the kernel. When a thread
>>> executes the first SVE instruction, it will cause an exception, the kernel
>>> will allocate memory for SVE state and enable TIF_SVE. Further use of SVE
>>> instructions will proceed without exceptions. Although SVE is disabled by
>>> default, it is enabled automatically. Since this is done automatically
>>> during an exception handler, there is no opportunity for memory allocation
>>> errors to be reported, as there are in the AMX case.
>>
>> Glibc has an SVE optimized memcpy, right? Doesn't that mean that pretty
>> much all processes on an SVE capable system will enable SVE (lazily)? If
>> so, that's close to "enabled by default" (unless SVE is disabled system
>> wide).
>
> Yes, see sysdeps/aarch64/multiarch/memcpy.c:
>
> static inline __typeof (__redirect_memcpy) *
> select_memcpy_ifunc (void)
> {
> INIT_ARCH ();
>
> if (sve && HAVE_AARCH64_SVE_ASM)
> {
> if (IS_A64FX (midr))
> return __memcpy_a64fx;
> return __memcpy_sve;
> }
>
> if (IS_THUNDERX (midr))
> return __memcpy_thunderx;
>
> if (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr))
> return __memcpy_thunderx2;
>
> if (IS_FALKOR (midr) || IS_PHECDA (midr))
> return __memcpy_falkor;
>
> return __memcpy_generic;
> }
>
> And the __memcpy_sve implementation actually uses SVE.
>
> If there were a prctl to select the vector width and enable the vector
> extension, we'd have to pick a width in glibc anyway.
There *is* a prctl to adjust the SVE vector width, but glibc does not need to select
one, because SVE code dynamically adjusts to the currently enabled width. The kernel
selects a default width that fits within the default signal frame size.
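Those prctls are PR_SVE_SET_VL and PR_SVE_GET_VL (see
Documentation/arm64/sve.rst); rough usage from a thread, noting that the
kernel rounds a requested length to something the hardware supports:

#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	long vl = prctl(PR_SVE_GET_VL, 0, 0, 0, 0);

	if (vl >= 0)	/* low bits carry the vector length in bytes */
		printf("current VL: %ld bytes\n", vl & PR_SVE_VL_LEN_MASK);

	/* request a 32-byte (256-bit) VL, inherited across execve() */
	if (prctl(PR_SVE_SET_VL, 32 | PR_SVE_VL_INHERIT, 0, 0, 0) < 0)
		perror("PR_SVE_SET_VL");
	return 0;
}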
The other thing of note for SVE is that, with the default function ABI all of the SVE
state is call-clobbered, which allows the kernel to drop instead of save state across
system calls. (There is a separate vector function call ABI when SVE types are used.)
So while strcpy may enable SVE for the thread, the next syscall may disable it again.
r~
On Thu, Dec 15, 2022 at 10:57 AM Vineet Gupta <[email protected]> wrote:
>
>
>
> On 12/15/22 07:33, Richard Henderson wrote:
> > On 12/15/22 04:28, Florian Weimer via Libc-alpha wrote:
> >> * Björn Töpel:
> >>
> >>>> For SVE, it is in fact disabled by default in the kernel. When a
> >>>> thread
> >>>> executes the first SVE instruction, it will cause an exception, the
> >>>> kernel
> >>>> will allocate memory for SVE state and enable TIF_SVE. Further use
> >>>> of SVE
> >>>> instructions will proceed without exceptions. Although SVE is
> >>>> disabled by
> >>>> default, it is enabled automatically. Since this is done
> >>>> automatically
> >>>> during an exception handler, there is no opportunity for memory
> >>>> allocation
> >>>> errors to be reported, as there are in the AMX case.
> >>>
> >>> Glibc has an SVE optimized memcpy, right? Doesn't that mean that pretty
> >>> much all processes on an SVE capable system will enable SVE
> >>> (lazily)? If
> >>> so, that's close to "enabled by default" (unless SVE is disabled system
> >>> wide).
> >>
> >> Yes, see sysdeps/aarch64/multiarch/memcpy.c:
> >>
> >> static inline __typeof (__redirect_memcpy) *
> >> select_memcpy_ifunc (void)
> >> {
> >> INIT_ARCH ();
> >> if (sve && HAVE_AARCH64_SVE_ASM)
> >> {
> >> if (IS_A64FX (midr))
> >> return __memcpy_a64fx;
> >> return __memcpy_sve;
> >> }
> >> if (IS_THUNDERX (midr))
> >> return __memcpy_thunderx;
> >> if (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr))
> >> return __memcpy_thunderx2;
> >> if (IS_FALKOR (midr) || IS_PHECDA (midr))
> >> return __memcpy_falkor;
> >> return __memcpy_generic;
> >> }
> >> And the __memcpy_sve implementation actually uses SVE.
> >>
> >> If there were a prctl to select the vector width and enable the vector
> >> extension, we'd have to pick a width in glibc anyway.
> >
> > There *is* a prctl to adjust the SVE vector width, but glibc does not
> > need to select because SVE dynamically adjusts to the currently
> > enabled width. The kernel selects a default width that fits within
> > the default signal frame size.
> >
> > The other thing of note for SVE is that, with the default function ABI
> > all of the SVE state is call-clobbered, which allows the kernel to
> > drop instead of save state across system calls. (There is a separate
> > vector function call ABI when SVE types are used.)
>
> For the RV psABI, it is similar - all V regs are
> caller-saved/call-clobbered [1] and syscalls are not required to
> preserve V regs [2]
> However last I checked ARM documentation the ABI doc seemed to suggest
> that some (parts) of the SVE regs are callee-saved [3]
Yes, the lower 64 bits, which overlap with the floating-point registers.
Thanks,
Andrew Pinski
>
> >
> > So while strcpy may enable SVE for the thread, the next syscall may
> > disable it again.
>
> Next syscall could trash them, but will it disable SVE ? Despite
> syscall/function-call clobbers, using V in tight loops such as mem*/str*
> still is a win.
>
>
> [1]
> https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-cc.adoc
> [2]
> https://github.com/riscv/riscv-v-spec/blob/master/calling-convention.adoc
> [3]
> https://github.com/ARM-software/abi-aa/blob/2982a9f3b512a5bfdc9e3fea5d3b298f9165c36b/aapcs64/aapcs64.rst#the-base-procedure-call-standard
> Sec 6.1.3 ".... In other cases it need only preserve the low 64 bits of
> z8-z15"
>
On Thu, Dec 15, 2022 at 10:59 AM Andrew Pinski <[email protected]> wrote:
>
> On Thu, Dec 15, 2022 at 10:57 AM Vineet Gupta <[email protected]> wrote:
> >
> >
> >
> > On 12/15/22 07:33, Richard Henderson wrote:
> > > On 12/15/22 04:28, Florian Weimer via Libc-alpha wrote:
> > >> * Björn Töpel:
> > >>
> > >>>> For SVE, it is in fact disabled by default in the kernel. When a
> > >>>> thread
> > >>>> executes the first SVE instruction, it will cause an exception, the
> > >>>> kernel
> > >>>> will allocate memory for SVE state and enable TIF_SVE. Further use
> > >>>> of SVE
> > >>>> instructions will proceed without exceptions. Although SVE is
> > >>>> disabled by
> > >>>> default, it is enabled automatically. Since this is done
> > >>>> automatically
> > >>>> during an exception handler, there is no opportunity for memory
> > >>>> allocation
> > >>>> errors to be reported, as there are in the AMX case.
> > >>>
> > >>> Glibc has an SVE optimized memcpy, right? Doesn't that mean that pretty
> > >>> much all processes on an SVE capable system will enable SVE
> > >>> (lazily)? If
> > >>> so, that's close to "enabled by default" (unless SVE is disabled system
> > >>> wide).
> > >>
> > >> Yes, see sysdeps/aarch64/multiarch/memcpy.c:
> > >>
> > >> static inline __typeof (__redirect_memcpy) *
> > >> select_memcpy_ifunc (void)
> > >> {
> > >> INIT_ARCH ();
> > >> if (sve && HAVE_AARCH64_SVE_ASM)
> > >> {
> > >> if (IS_A64FX (midr))
> > >> return __memcpy_a64fx;
> > >> return __memcpy_sve;
> > >> }
> > >> if (IS_THUNDERX (midr))
> > >> return __memcpy_thunderx;
> > >> if (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr))
> > >> return __memcpy_thunderx2;
> > >> if (IS_FALKOR (midr) || IS_PHECDA (midr))
> > >> return __memcpy_falkor;
> > >> return __memcpy_generic;
> > >> }
> > >> And the __memcpy_sve implementation actually uses SVE.
> > >>
> > >> If there were a prctl to select the vector width and enable the vector
> > >> extension, we'd have to pick a width in glibc anyway.
> > >
> > > There *is* a prctl to adjust the SVE vector width, but glibc does not
> > > need to select because SVE dynamically adjusts to the currently
> > > enabled width. The kernel selects a default width that fits within
> > > the default signal frame size.
> > >
> > > The other thing of note for SVE is that, with the default function ABI
> > > all of the SVE state is call-clobbered, which allows the kernel to
> > > drop instead of save state across system calls. (There is a separate
> > > vector function call ABI when SVE types are used.)
> >
> > For the RV psABI, it is similar - all V regs are
> > caller-saved/call-clobbered [1] and syscalls are not required to
> > preserve V regs [2]
> > However last I checked ARM documentation the ABI doc seemed to suggest
> > that some (parts) of the SVE regs are callee-saved [3]
>
> Yes the lower 64 bits which overlap with the floating point registers.
I should expand on that: only the specific callee-saved registers have
to preserve the lower 64 bits, because they overlap with the
floating-point registers.
Thanks,
Andrew
>
> Thanks,
> Andrew Pinski
>
>
> >
> > >
> > > So while strcpy may enable SVE for the thread, the next syscall may
> > > disable it again.
> >
> > Next syscall could trash them, but will it disable SVE ? Despite
> > syscall/function-call clobbers, using V in tight loops such as mem*/str*
> > still is a win.
> >
> >
> > [1]
> > https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-cc.adoc
> > [2]
> > https://github.com/riscv/riscv-v-spec/blob/master/calling-convention.adoc
> > [3]
> > https://github.com/ARM-software/abi-aa/blob/2982a9f3b512a5bfdc9e3fea5d3b298f9165c36b/aapcs64/aapcs64.rst#the-base-procedure-call-standard
> > Sec 6.1.3 ".... In other cases it need only preserve the low 64 bits of
> > z8-z15"
> >
On 12/15/22 07:33, Richard Henderson wrote:
> On 12/15/22 04:28, Florian Weimer via Libc-alpha wrote:
>> * Björn Töpel:
>>
>>>> For SVE, it is in fact disabled by default in the kernel. When a
>>>> thread
>>>> executes the first SVE instruction, it will cause an exception, the
>>>> kernel
>>>> will allocate memory for SVE state and enable TIF_SVE. Further use
>>>> of SVE
>>>> instructions will proceed without exceptions. Although SVE is
>>>> disabled by
>>>> default, it is enabled automatically. Since this is done
>>>> automatically
>>>> during an exception handler, there is no opportunity for memory
>>>> allocation
>>>> errors to be reported, as there are in the AMX case.
>>>
>>> Glibc has an SVE optimized memcpy, right? Doesn't that mean that pretty
>>> much all processes on an SVE capable system will enable SVE
>>> (lazily)? If
>>> so, that's close to "enabled by default" (unless SVE is disabled system
>>> wide).
>>
>> Yes, see sysdeps/aarch64/multiarch/memcpy.c:
>>
>> static inline __typeof (__redirect_memcpy) *
>> select_memcpy_ifunc (void)
>> {
>> INIT_ARCH ();
>> if (sve && HAVE_AARCH64_SVE_ASM)
>> {
>> if (IS_A64FX (midr))
>> return __memcpy_a64fx;
>> return __memcpy_sve;
>> }
>> if (IS_THUNDERX (midr))
>> return __memcpy_thunderx;
>> if (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr))
>> return __memcpy_thunderx2;
>> if (IS_FALKOR (midr) || IS_PHECDA (midr))
>> return __memcpy_falkor;
>> return __memcpy_generic;
>> }
>> And the __memcpy_sve implementation actually uses SVE.
>>
>> If there were a prctl to select the vector width and enable the vector
>> extension, we'd have to pick a width in glibc anyway.
>
> There *is* a prctl to adjust the SVE vector width, but glibc does not
> need to select because SVE dynamically adjusts to the currently
> enabled width. The kernel selects a default width that fits within
> the default signal frame size.
>
> The other thing of note for SVE is that, with the default function ABI
> all of the SVE state is call-clobbered, which allows the kernel to
> drop instead of save state across system calls. (There is a separate
> vector function call ABI when SVE types are used.)
For the RV psABI, it is similar - all V regs are
caller-saved/call-clobbered [1] and syscalls are not required to
preserve V regs [2].
However, last I checked the ARM documentation, the ABI doc seemed to
suggest that some (parts) of the SVE regs are callee-saved [3].
>
> So while strcpy may enable SVE for the thread, the next syscall may
> disable it again.
The next syscall could trash them, but will it disable SVE? Despite
syscall/function-call clobbers, using V in tight loops such as mem*/str*
is still a win.
[1]
https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-cc.adoc
[2]
https://github.com/riscv/riscv-v-spec/blob/master/calling-convention.adoc
[3]
https://github.com/ARM-software/abi-aa/blob/2982a9f3b512a5bfdc9e3fea5d3b298f9165c36b/aapcs64/aapcs64.rst#the-base-procedure-call-standard
Sec 6.1.3 ".... In other cases it need only preserve the low 64 bits of
z8-z15"
On 12/15/22 10:57, Vineet Gupta wrote:
>> The other thing of note for SVE is that, with the default function ABI all of the SVE
>> state is call-clobbered, which allows the kernel to drop instead of save state across
>> system calls. (There is a separate vector function call ABI when SVE types are used.)
>
> For the RV psABI, it is similar - all V regs are caller-saved/call-clobbered [1] and
> syscalls are not required to preserve V regs [2]
> However last I checked ARM documentation the ABI doc seemed to suggest that some (parts)
> of the SVE regs are callee-saved [3]
As Pinski mentioned, just some low bits that overlap with scalar fp state; the high bits
and the predicate registers get zeroed when re-enabling.
>> So while strcpy may enable SVE for the thread, the next syscall may disable it again.
>
> Next syscall could trash them, but will it disable SVE ?
Yes. See fp_user_discard() in arch/arm64/kernel/syscall.c.
r~
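The gist of that function, paraphrased as a sketch (system_supports_sve()
and sve_user_disable() are the actual arm64 helpers; the real code
handles more cases):

/* on syscall entry: the ABI says SVE state is dead, so stop tracking it */
static void discard_sve_state(void)
{
	if (!system_supports_sve())
		return;

	clear_thread_flag(TIF_SVE);	/* thread no longer "has" SVE state */
	sve_user_disable();		/* trap the next EL0 SVE instruction */
}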