2023-01-11 13:29:32

by Kirill A. Shutemov

[permalink] [raw]
Subject: [PATCHv14 08/17] x86/mm: Reduce untagged_addr() overhead until the first LAM user

Use static key to reduce untagged_addr() overhead.

The key only gets enabled when the first process enables LAM.

Signed-off-by: Kirill A. Shutemov <[email protected]>
---
arch/x86/include/asm/uaccess.h | 8 ++++++--
arch/x86/kernel/process_64.c | 4 ++++
2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 32c9dd052e43..f9f85d596581 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -24,6 +24,8 @@ static inline bool pagefault_disabled(void);
#endif

#ifdef CONFIG_ADDRESS_MASKING
+DECLARE_STATIC_KEY_FALSE(tagged_addr_key);
+
/*
* Mask out tag bits from the address.
*
@@ -32,8 +34,10 @@ static inline bool pagefault_disabled(void);
*/
#define __untagged_addr(untag_mask, addr) ({ \
u64 __addr = (__force u64)(addr); \
- s64 sign = (s64)__addr >> 63; \
- __addr &= untag_mask | sign; \
+ if (static_branch_likely(&tagged_addr_key)) { \
+ s64 sign = (s64)__addr >> 63; \
+ __addr &= untag_mask | sign; \
+ } \
(__force __typeof__(addr))__addr; \
})

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 88aae519c8f8..1b41c60ebf6e 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -745,6 +745,9 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)

#ifdef CONFIG_ADDRESS_MASKING

+DEFINE_STATIC_KEY_FALSE(tagged_addr_key);
+EXPORT_SYMBOL_GPL(tagged_addr_key);
+
#define LAM_U57_BITS 6

static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
@@ -781,6 +784,7 @@ static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)

mmap_write_unlock(mm);

+ static_branch_enable(&tagged_addr_key);
return 0;
}
#endif
--
2.38.2


2023-01-17 13:17:43

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCHv14 08/17] x86/mm: Reduce untagged_addr() overhead until the first LAM user

On Wed, Jan 11, 2023 at 03:37:27PM +0300, Kirill A. Shutemov wrote:

> #define __untagged_addr(untag_mask, addr) ({ \
> u64 __addr = (__force u64)(addr); \
> - s64 sign = (s64)__addr >> 63; \
> - __addr &= untag_mask | sign; \
> + if (static_branch_likely(&tagged_addr_key)) { \
> + s64 sign = (s64)__addr >> 63; \
> + __addr &= untag_mask | sign; \
> + } \
> (__force __typeof__(addr))__addr; \
> })
>
> #define untagged_addr(addr) __untagged_addr(current_untag_mask(), addr)

Is the compiler clever enough to put the memop inside the branch?

2023-01-17 15:17:02

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCHv14 08/17] x86/mm: Reduce untagged_addr() overhead until the first LAM user

On Tue, Jan 17, 2023 at 04:57:03PM +0300, Kirill A. Shutemov wrote:
> On Tue, Jan 17, 2023 at 02:05:22PM +0100, Peter Zijlstra wrote:
> > On Wed, Jan 11, 2023 at 03:37:27PM +0300, Kirill A. Shutemov wrote:
> >
> > > #define __untagged_addr(untag_mask, addr)
> > > u64 __addr = (__force u64)(addr); \
> > > - s64 sign = (s64)__addr >> 63; \
> > > - __addr &= untag_mask | sign; \
> > > + if (static_branch_likely(&tagged_addr_key)) { \
> > > + s64 sign = (s64)__addr >> 63; \
> > > + __addr &= untag_mask | sign; \
> > > + } \
> > > (__force __typeof__(addr))__addr; \
> > > })
> > >
> > > #define untagged_addr(addr) __untagged_addr(current_untag_mask(), addr)
> >
> > Is the compiler clever enough to put the memop inside the branch?
>
> Hm. You mean current_untag_mask() inside static_branch_likely()?
>
> But it is preprocessor who does this, not compiler. So, yes, the memop is
> inside the branch.
>
> Or I didn't understand your question.

Nah, call it a pre-lunch dip, I overlooked the whole CPP angle -- d'0h.

That said, I did just put it through a compiler to see wth it did and it
is pretty gross:

GCC-12.2:

0000 00000000000023b0 <write_ok_or_segv>:
0000 23b0: 48 89 fa mov %rdi,%rdx
0003 23b3: eb 76 jmp 242b <write_ok_or_segv+0x7b>
0005 23b5: 65 48 8b 0d 00 00 00 00 mov %gs:0x0(%rip),%rcx # 23bd <write_ok_or_segv+0xd> 23b9: R_X86_64_PC32 tlbstate_untag_mask-0x4
000d 23bd: 48 89 f8 mov %rdi,%rax
0010 23c0: 48 c1 f8 3f sar $0x3f,%rax
0014 23c4: 48 09 c8 or %rcx,%rax
0017 23c7: 48 21 f8 and %rdi,%rax
001a 23ca: 48 b9 00 f0 ff ff ff 7f 00 00 movabs $0x7ffffffff000,%rcx
0024 23d4: 48 39 f1 cmp %rsi,%rcx
0027 23d7: 72 14 jb 23ed <write_ok_or_segv+0x3d>
0029 23d9: 48 29 f1 sub %rsi,%rcx
002c 23dc: be 01 00 00 00 mov $0x1,%esi
0031 23e1: 48 39 c1 cmp %rax,%rcx
0034 23e4: 72 07 jb 23ed <write_ok_or_segv+0x3d>
0036 23e6: 89 f0 mov %esi,%eax
0038 23e8: e9 00 00 00 00 jmp 23ed <write_ok_or_segv+0x3d> 23e9: R_X86_64_PLT32 __x86_return_thunk-0x4
003d 23ed: 65 48 8b 04 25 00 00 00 00 mov %gs:0x0,%rax 23f2: R_X86_64_32S pcpu_hot
0046 23f6: 48 89 90 68 0b 00 00 mov %rdx,0xb68(%rax)
004d 23fd: be 01 00 00 00 mov $0x1,%esi
0052 2402: bf 0b 00 00 00 mov $0xb,%edi
0057 2407: 48 c7 80 78 0b 00 00 06 00 00 00 movq $0x6,0xb78(%rax)
0062 2412: 48 c7 80 70 0b 00 00 0e 00 00 00 movq $0xe,0xb70(%rax)
006d 241d: e8 00 00 00 00 call 2422 <write_ok_or_segv+0x72> 241e: R_X86_64_PLT32 force_sig_fault-0x4
0072 2422: 31 f6 xor %esi,%esi
0074 2424: 89 f0 mov %esi,%eax
0076 2426: e9 00 00 00 00 jmp 242b <write_ok_or_segv+0x7b> 2427: R_X86_64_PLT32 __x86_return_thunk-0x4
007b 242b: 48 89 f8 mov %rdi,%rax
007e 242e: eb 9a jmp 23ca <write_ok_or_segv+0x1a>

Note the stupid jump to the end and back. Not all sites do this mind
you, but a fair number seem to do it.

Let me try llvm to see if it is any smarter.

CLANG-16:

0000 0000000000002d50 <write_ok_or_segv>:
0000 2d50: 41 57 push %r15
0002 2d52: 41 56 push %r14
0004 2d54: 41 54 push %r12
0006 2d56: 53 push %rbx
0007 2d57: 48 89 f3 mov %rsi,%rbx
000a 2d5a: 48 89 fa mov %rdi,%rdx
000d 2d5d: 49 89 fe mov %rdi,%r14
0010 2d60: eb 15 jmp 2d77 <write_ok_or_segv+0x27>
0012 2d62: 48 89 d0 mov %rdx,%rax
0015 2d65: 48 c1 f8 3f sar $0x3f,%rax
0019 2d69: 65 4c 8b 35 00 00 00 00 mov %gs:0x0(%rip),%r14 # 2d71 <write_ok_or_segv+0x21> 2d6d: R_X86_64_PC32 tlbstate_untag_mask-0x4
0021 2d71: 49 09 c6 or %rax,%r14
0024 2d74: 49 21 d6 and %rdx,%r14
0027 2d77: f3 0f 1e fa endbr64
002b 2d7b: 49 bf 00 f0 ff ff ff 7f 00 00 movabs $0x7ffffffff000,%r15
0035 2d85: 4d 89 fc mov %r15,%r12
0038 2d88: 49 29 dc sub %rbx,%r12
003b 2d8b: 72 05 jb 2d92 <write_ok_or_segv+0x42>
003d 2d8d: 4d 39 f4 cmp %r14,%r12
0040 2d90: 73 34 jae 2dc6 <write_ok_or_segv+0x76>
0042 2d92: 65 48 8b 05 00 00 00 00 mov %gs:0x0(%rip),%rax # 2d9a <write_ok_or_segv+0x4a> 2d96: R_X86_64_PC32 pcpu_hot-0x4
004a 2d9a: 48 c7 80 78 0b 00 00 06 00 00 00 movq $0x6,0xb78(%rax)
0055 2da5: 48 89 90 68 0b 00 00 mov %rdx,0xb68(%rax)
005c 2dac: 48 c7 80 70 0b 00 00 0e 00 00 00 movq $0xe,0xb70(%rax)
0067 2db7: bf 0b 00 00 00 mov $0xb,%edi
006c 2dbc: be 01 00 00 00 mov $0x1,%esi
0071 2dc1: e8 00 00 00 00 call 2dc6 <write_ok_or_segv+0x76> 2dc2: R_X86_64_PLT32 force_sig_fault-0x4
0076 2dc6: 4d 39 f4 cmp %r14,%r12
0079 2dc9: 0f 93 c1 setae %cl
007c 2dcc: 49 39 df cmp %rbx,%r15
007f 2dcf: 0f 93 c0 setae %al
0082 2dd2: 20 c8 and %cl,%al
0084 2dd4: 5b pop %rbx
0085 2dd5: 41 5c pop %r12
0087 2dd7: 41 5e pop %r14
0089 2dd9: 41 5f pop %r15
008b 2ddb: e9 00 00 00 00 jmp 2de0 <__pfx_get_gate_vma> 2ddc: R_X86_64_PLT32 __x86_return_thunk-0x4

Well, it got the untag right, but OMG.. :-( Joao, Sami, any idea why it
put an ENDBR in there?

2023-01-17 15:22:55

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCHv14 08/17] x86/mm: Reduce untagged_addr() overhead until the first LAM user

On Tue, Jan 17, 2023 at 02:05:22PM +0100, Peter Zijlstra wrote:
> On Wed, Jan 11, 2023 at 03:37:27PM +0300, Kirill A. Shutemov wrote:
>
> > #define __untagged_addr(untag_mask, addr)
> > u64 __addr = (__force u64)(addr); \
> > - s64 sign = (s64)__addr >> 63; \
> > - __addr &= untag_mask | sign; \
> > + if (static_branch_likely(&tagged_addr_key)) { \
> > + s64 sign = (s64)__addr >> 63; \
> > + __addr &= untag_mask | sign; \
> > + } \
> > (__force __typeof__(addr))__addr; \
> > })
> >
> > #define untagged_addr(addr) __untagged_addr(current_untag_mask(), addr)
>
> Is the compiler clever enough to put the memop inside the branch?

Hm. You mean current_untag_mask() inside static_branch_likely()?

But it is preprocessor who does this, not compiler. So, yes, the memop is
inside the branch.

Or I didn't understand your question.

--
Kiryl Shutsemau / Kirill A. Shutemov

2023-01-17 17:49:18

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCHv14 08/17] x86/mm: Reduce untagged_addr() overhead until the first LAM user

On Tue, Jan 17, 2023 at 7:02 AM Peter Zijlstra <[email protected]> wrote:
>
> On Tue, Jan 17, 2023 at 04:57:03PM +0300, Kirill A. Shutemov wrote:
> > On Tue, Jan 17, 2023 at 02:05:22PM +0100, Peter Zijlstra wrote:
> > > On Wed, Jan 11, 2023 at 03:37:27PM +0300, Kirill A. Shutemov wrote:
> > >
> > > > #define __untagged_addr(untag_mask, addr)
> > > > u64 __addr = (__force u64)(addr); \
> > > > - s64 sign = (s64)__addr >> 63; \
> > > > - __addr &= untag_mask | sign; \
> > > > + if (static_branch_likely(&tagged_addr_key)) { \
> > > > + s64 sign = (s64)__addr >> 63; \
> > > > + __addr &= untag_mask | sign; \
> > > > + } \
> > > > (__force __typeof__(addr))__addr; \
> > > > })
> > > >
> > > > #define untagged_addr(addr) __untagged_addr(current_untag_mask(), addr)
> > >
> > > Is the compiler clever enough to put the memop inside the branch?
> >
> > Hm. You mean current_untag_mask() inside static_branch_likely()?
> >
> > But it is preprocessor who does this, not compiler. So, yes, the memop is
> > inside the branch.
> >
> > Or I didn't understand your question.
>
> Nah, call it a pre-lunch dip, I overlooked the whole CPP angle -- d'0h.
>
> That said, I did just put it through a compiler to see wth it did and it
> is pretty gross:

Yeah, I think the static branch likely just makes things worse.

And if we really want to make the "no untag mask exists" case better,
I think the code should probably use static_branch_unlikely() rather
than *_likely(). That should make it jump to the masking code, and
leave the unmasked code as a fallthrough, no?

The reason clang seems to generate saner code is that clang seems to
largely ignore the whole "__builtin_expect()", at least not to the
point where it tries to make the unlikely case be out-of-line.

But on the whole, I think we'd be better off without this whole static branch.

The cost of "untagged_addr()" generally shouldn't be worth this. There
are few performance-crticial users - the most common case is, I think,
just mmap() and friends, and the single load is going to be a
non-issue there.

Looking around, I think the only situation where we may care is
strnlen_user() and strncpy_from_user(). Those *can* be
performance-critical. They're used for paths and for execve() strings,
and can be a bit hot.

And both of those cases actually just use it because of the whole
"maximum address" calculation to avoid traversing into kernel
addresses, so I wonder if we could use alternatives there, kind of
like the get_user/put_user cases did. Except it's generic code, so ..

But maybe even those aren't worth worrying about. At least they do the
unmasking outside the loop - although then in the case of execve(),
the string copies themselves are obviously done in a loop anyway.

Kirill, do you have clear numbers for that static key being a noticeable win?

Linus

2023-01-17 19:06:46

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCHv14 08/17] x86/mm: Reduce untagged_addr() overhead until the first LAM user

On Tue, Jan 17, 2023 at 9:18 AM Linus Torvalds
<[email protected]> wrote:
>
> The reason clang seems to generate saner code is that clang seems to
> largely ignore the whole "__builtin_expect()", at least not to the
> point where it tries to make the unlikely case be out-of-line.

Side note: that's not something new or unusual. It's been the case
since I started testing clang - we have several code-paths where we
use "unlikely()" to try to get very unlikely cases to be out-of-line,
and clang just mostly ignores it, or treats it as a very weak hint. I
think the only way to get clang to treat it as a *strong* hint is to
use PGO.

And in this case it actually made code generation look better,
probably because this particular use of static_branch_likely() is a
bit confused about which side should be the preferred one. It's using
the static branch to make the old case not have the masked load, but
then it's saying that the new case is the likely one.

So clang ignoring the likely() hint is probably the right thing here,
and then the wrong thing in some other places.

Linus

2023-01-17 19:08:46

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCHv14 08/17] x86/mm: Reduce untagged_addr() overhead until the first LAM user

On Tue, Jan 17, 2023 at 09:18:01AM -0800, Linus Torvalds wrote:

> The reason clang seems to generate saner code is that clang seems to
> largely ignore the whole "__builtin_expect()", at least not to the
> point where it tries to make the unlikely case be out-of-line.

So in this case there is only a 'likely' hint, we're explicitly trying
to keep the thing in-line so we can jump over it.

It is GCC that generated an implicit else (and marked it 'unlikely' --
which we didn't ask for), but worse, it failed to spot the else case is
in fact shared with the normal case and it could've simply lifted that
mov instruction.

That is, instead of this:

0003 23b3: eb 76 jmp 242b <write_ok_or_segv+0x7b>
0005 23b5: 65 48 8b 0d 00 00 00 00 mov %gs:0x0(%rip),%rcx # 23bd <write_ok_or_segv+0xd> 23b9: R_X86_64_PC32 tlbstate_untag_mask-0x4
000d 23bd: 48 89 f8 mov %rdi,%rax
0010 23c0: 48 c1 f8 3f sar $0x3f,%rax
0014 23c4: 48 09 c8 or %rcx,%rax
0017 23c7: 48 21 f8 and %rdi,%rax

001a 23ca: 48 b9 00 f0 ff ff ff 7f 00 00 movabs $0x7ffffffff000,%rcx

007b 242b: 48 89 f8 mov %rdi,%rax
007e 242e: eb 9a jmp 23ca <write_ok_or_segv+0x1a>

It could've just done:

0003 48 89 f8 mov %rdi,%rax
0006 eb 76 jmp +18
0008 65 48 8b 0d 00 00 00 00 mov %gs:0x0(%rip),%rcx # 23bd <write_ok_or_segv+0xd> 23b9: R_X86_64_PC32 tlbstate_untag_mask-0x4
0010 48 c1 f8 3f sar $0x3f,%rax
0014 48 09 c8 or %rcx,%rax
0017 48 21 f8 and %rdi,%rax

001a 48 b9 00 f0 ff ff ff 7f 00 00 movabs $0x7ffffffff000,%rcx

and everything would've been good. In all the cases I've seen it do
this, it was the same, it has this silly move out of line that's also
part of the regular branch.

That is, I like __builtin_expect() to be a strong hint. If I don't want
things out of line, I shouldn't have put unlikely on it. What I don't
like is that implicit else branches get the opposite strong hint.

What I like even less is that it found it needed that else branch at
all.

2023-01-17 19:51:25

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCHv14 08/17] x86/mm: Reduce untagged_addr() overhead until the first LAM user

On Tue, Jan 17, 2023 at 10:26 AM Nick Desaulniers
<[email protected]> wrote:
>
> On Tue, Jan 17, 2023 at 9:29 AM Linus Torvalds
> <[email protected]> wrote:
> >
> > Side note: that's not something new or unusual. It's been the case
> > since I started testing clang - we have several code-paths where we
> > use "unlikely()" to try to get very unlikely cases to be out-of-line,
> > and clang just mostly ignores it, or treats it as a very weak hint. I
> > think the only way to get clang to treat it as a *strong* hint is to
> > use PGO.
>
> I'd be surprised if that were intentional or by design.
>
> Do you guys have a bug report we could look at?

Heh. I actually sent you an example long ago. Let me go fish it out of
my mail archives and quote some of it below so that you can find it in
yours..

Linus

[ Time passes. Found this email to you and Bill Wendling from Feb 16,
2020, Message-ID
CAHk-=wigVshsByCMjkUiZyQSR5N5zi2aAeQc+VJCzQV=nm8E7g@mail.gmail.com ]:

Anyway, I'm looking at clang code generation, and comparing it with
gcc on one of my "this has been optimized to hell and back" functions:
__d_lookup_rcu().

It looks like clang does frame pointers, and ignores our
likely/unlikely annotations.

So this code:

if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
int tlen;
const char *tname;
......

doesn't actually jump out of line, but instead generates the unlikely
case as the fallthrough:

testb $2, (%r12)
je .LBB50_9
... unlikely code goes here...

and then the likely case ends up having unfortunate reloads inside the
hot loop. Possibly because it has one fewer free registers than gcc
because of the frame pointer.

I didn't look into _why_ clang generates frame pointers but gcc
doesn't. It may be just a compiler default, I think we don't end up
explicitly asking either way.

[ And then Bill replied with this ]

It's not a no-op. We add branch probabilities to the IR, whether
they're honored or not depends on the transformation. But they
*should* be honored when available. I've seen in the past that instead
of moving unlikely blocks out of the way (like gcc, which moves them
below the function's "ret" instruction), LLVM will do something like
this:

<normal code>
<jmp to loop conditional test>
<unlikely code>
<more likely code>
<loop conditional test>
<...>

I.e. the loop is rotated and the unlikely code is first and the hotter
code is closer together but between the unlikely and conditional test.
Could this be what's going on? Otherwise, maybe clang decided that
it's not beneficial to move the code out-of-line because the benefit
was minimal? (I'm guessing here.)

2023-01-17 20:41:06

by Nick Desaulniers

[permalink] [raw]
Subject: Re: [PATCHv14 08/17] x86/mm: Reduce untagged_addr() overhead until the first LAM user

On Tue, Jan 17, 2023 at 9:29 AM Linus Torvalds
<[email protected]> wrote:
>
> On Tue, Jan 17, 2023 at 9:18 AM Linus Torvalds
> <[email protected]> wrote:
> >
> > The reason clang seems to generate saner code is that clang seems to
> > largely ignore the whole "__builtin_expect()", at least not to the
> > point where it tries to make the unlikely case be out-of-line.
>
> Side note: that's not something new or unusual. It's been the case
> since I started testing clang - we have several code-paths where we
> use "unlikely()" to try to get very unlikely cases to be out-of-line,
> and clang just mostly ignores it, or treats it as a very weak hint. I
> think the only way to get clang to treat it as a *strong* hint is to
> use PGO.

I'd be surprised if that were intentional or by design.

Do you guys have a bug report we could look at?

> So clang ignoring the likely() hint is probably the right thing here,
> and then the wrong thing in some other places.

--
Thanks,
~Nick Desaulniers

2023-01-17 20:43:06

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCHv14 08/17] x86/mm: Reduce untagged_addr() overhead until the first LAM user

On Tue, Jan 17, 2023 at 09:18:01AM -0800, Linus Torvalds wrote:

> But maybe even those aren't worth worrying about. At least they do the
> unmasking outside the loop - although then in the case of execve(),
> the string copies themselves are obviously done in a loop anyway.
>
> Kirill, do you have clear numbers for that static key being a noticeable win?

Numbers would be good; I think this all started as a precaution, but
clearly that's not really working out so well :/

2023-01-17 21:24:44

by Nick Desaulniers

[permalink] [raw]
Subject: Re: [PATCHv14 08/17] x86/mm: Reduce untagged_addr() overhead until the first LAM user

On Tue, Jan 17, 2023 at 10:34 AM Linus Torvalds
<[email protected]> wrote:
>
> On Tue, Jan 17, 2023 at 10:26 AM Nick Desaulniers
> <[email protected]> wrote:
> >
> > On Tue, Jan 17, 2023 at 9:29 AM Linus Torvalds
> > <[email protected]> wrote:
> > >
> > > Side note: that's not something new or unusual. It's been the case
> > > since I started testing clang - we have several code-paths where we
> > > use "unlikely()" to try to get very unlikely cases to be out-of-line,
> > > and clang just mostly ignores it, or treats it as a very weak hint. I
> > > think the only way to get clang to treat it as a *strong* hint is to
> > > use PGO.
> >
> > I'd be surprised if that were intentional or by design.
> >
> > Do you guys have a bug report we could look at?
>
> Heh. I actually sent you an example long ago. Let me go fish it out of
> my mail archives and quote some of it below so that you can find it in
> yours..
>
> Linus
>
> [ Time passes. Found this email to you and Bill Wendling from Feb 16,
> 2020, Message-ID
> CAHk-=wigVshsByCMjkUiZyQSR5N5zi2aAeQc+VJCzQV=nm8E7g@mail.gmail.com ]:
>
> Anyway, I'm looking at clang code generation, and comparing it with
> gcc on one of my "this has been optimized to hell and back" functions:
> __d_lookup_rcu().
>
> It looks like clang does frame pointers, and ignores our
> likely/unlikely annotations.
>
> So this code:
>
> if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
> int tlen;
> const char *tname;
> ......
>
> doesn't actually jump out of line, but instead generates the unlikely
> case as the fallthrough:
>
> testb $2, (%r12)
> je .LBB50_9
> ... unlikely code goes here...


Perhaps that was compiler version or config specific?

$ make LLVM=1 -j128 defconfig fs/dcache.o
$ llvm-objdump -d --no-show-raw-insn
--disassemble-symbols=__d_lookup_rcu fs/dcache.o
0000000000003210 <__d_lookup_rcu>:
3210: endbr64
3214: pushq %rbp
3215: pushq %r15
3217: pushq %r14
3219: pushq %r12
321b: pushq %rbx
321c: testb $0x2, (%rdi)
321f: jne 0x32d7 <__d_lookup_rcu+0xc7>
...
32d7: popq %rbx
32d8: popq %r12
32da: popq %r14
32dc: popq %r15
32de: popq %rbp
32df: jmp 0x3300 <__d_lookup_rcu_op_compare>

That looks like what you want, yeah?

Your original report was from nearly 3 years ago; could have fixed a
few instances of branch weights not getting propagated since then.

What's going on in this case in this thread? I know we don't support
hot/cold attributes on labels yet, but if static_branch_likely (or
friends) is being used, we assign the indirect branches a 0%
likeliness/branch-weight.

>
> and then the likely case ends up having unfortunate reloads inside the
> hot loop. Possibly because it has one fewer free registers than gcc
> because of the frame pointer.
>
> I didn't look into _why_ clang generates frame pointers but gcc
> doesn't. It may be just a compiler default, I think we don't end up
> explicitly asking either way.
>
> [ And then Bill replied with this ]
>
> It's not a no-op. We add branch probabilities to the IR, whether
> they're honored or not depends on the transformation. But they
> *should* be honored when available. I've seen in the past that instead
> of moving unlikely blocks out of the way (like gcc, which moves them
> below the function's "ret" instruction), LLVM will do something like
> this:
>
> <normal code>
> <jmp to loop conditional test>
> <unlikely code>
> <more likely code>
> <loop conditional test>
> <...>
>
> I.e. the loop is rotated and the unlikely code is first and the hotter
> code is closer together but between the unlikely and conditional test.
> Could this be what's going on? Otherwise, maybe clang decided that
> it's not beneficial to move the code out-of-line because the benefit
> was minimal? (I'm guessing here.)



--
Thanks,
~Nick Desaulniers

2023-01-17 22:33:54

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCHv14 08/17] x86/mm: Reduce untagged_addr() overhead until the first LAM user

On Tue, Jan 17, 2023 at 12:10 PM Linus Torvalds
<[email protected]> wrote:
>
> That said, clang still generates more register pressure than gcc,
> causing the function prologue and epilogue to be rather bigger
> (pushing and popping six registers, as opposed to gcc that only needs
> three)

.. and at least part of that is the same thing with the bad byte mask
generation (see that "clang *still* messes up" link for details).

Basically, the byte mask is computed by

mask = bytemask_from_count(tcount);

where we have

#define bytemask_from_count(cnt) (~(~0ul << (cnt)*8))

and clang tries very very hard to avoid that "multiply by 8", so
instead it keeps a shadow copy of that "(cnt)*8" value in the loop.

That is wrong for a couple of reasons:

(a) it adds register pressure for no good reason

(b) when you shift left by that value, only the low 6 bits of that
value matters

And guess how that "tcount" is updated? It's this:

tcount -= sizeof(unsigned long);

in the loop, and thus the update of that shadow value of "(cnt)*8" is done as

addl $-64, %ecx

inside that loop.

This is truly stupid and wasted work, because the low 6 bits of the
value - remember, the only part that matters - DOES NOT CHANGE when
you do that.

So clang has decided that it needs to

(a) avoid the "expensive" multiply-by-8 at the end by turning it into
a repeated "add $-64" inside the loop

(b) added register pressure and one extra instruction inside the loop

(c) not realized that that extra instruction doesn't actually *do*
anything, because it only affects the bits that don't actually matter
in the end.

which is all kind of silly, wouldn't you agree. Every single step
there was pointless.

But with my other simplifications, the fact that clang does these
extra things is no longer all that noticeable. It *used* to be a
horrible disaster because the extra register pressure ended up meaning
that you had spills and all kinds of nastiness. Now the function is
simple enough that even with the extra register pressure, there's no
need for spills.

.. until you look at the 32-bit version, which still needs spills. Gcc
does too, but clang just makes it worse by having the extra pointless
shadow variable.

If I cared about 32-bit, I might write up a bugzilla entry. As it is,
it's just "clang tries to be clever, and in the process is actually
being stupid".

Linus

2023-01-17 23:14:18

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCHv14 08/17] x86/mm: Reduce untagged_addr() overhead until the first LAM user

On Tue, Jan 17, 2023 at 11:17 AM Nick Desaulniers
<[email protected]> wrote:
>
> Perhaps that was compiler version or config specific?

Possible, but...

The clang code generation annoyed me enough that I actually ended up
rewriting the unlikely test to be outside the loop in commit
ae2a823643d7 ("dcache: move the DCACHE_OP_COMPARE case out of the
__d_lookup_rcu loop").

I think that then made clang no longer have the whole "rotate loop
with unlikely case in the middle" issue.

And then because clang *still* messed up by trying to be too clever (see

https://lore.kernel.org/all/CAHk-=wjyOB66pofW0mfzDN7SO8zS1EMRZuR-_2aHeO+7kuSrAg@mail.gmail.com/

for details), I also ended up doing commit c4e34dd99f2e ("x86:
simplify load_unaligned_zeropad() implementation").

The end result is that now the compiler almost *cannot* mess up any more.

So the reason clang now does a good job on __d_lookup_rcu() is largely
that I took away all the places where it did badly ;)

That said, clang still generates more register pressure than gcc,
causing the function prologue and epilogue to be rather bigger
(pushing and popping six registers, as opposed to gcc that only needs
three)

Gcc is also better able to schedule the prologue and epilogue together
with the work of the function, which clang seems to always do it as a
"push all" and "pop all" sequence.

That scheduling doesn't matter in that particular place (although it
does make the unlikely case of calling __d_lookup_rcu_op_compare
pointlessly push all regs only to then pop them), but I've seen a few
other cases where it ends up meaning that it always does that full
function prologue even when the *likely* case then returns early and
doesn't actually need any of that work because it didn't use any of
those registers.

But yeah, the RCU pathname lookup looks fine these days. And I don't
actually think it was due to clang changes ;)

Linus

2023-01-20 00:33:12

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCHv14 08/17] x86/mm: Reduce untagged_addr() overhead until the first LAM user

On Tue, Jan 17, 2023 at 04:02:06PM +0100, Peter Zijlstra wrote:
> On Tue, Jan 17, 2023 at 04:57:03PM +0300, Kirill A. Shutemov wrote:
> > On Tue, Jan 17, 2023 at 02:05:22PM +0100, Peter Zijlstra wrote:
> > > On Wed, Jan 11, 2023 at 03:37:27PM +0300, Kirill A. Shutemov wrote:
> > >
> > > > #define __untagged_addr(untag_mask, addr)
> > > > u64 __addr = (__force u64)(addr); \
> > > > - s64 sign = (s64)__addr >> 63; \
> > > > - __addr &= untag_mask | sign; \
> > > > + if (static_branch_likely(&tagged_addr_key)) { \
> > > > + s64 sign = (s64)__addr >> 63; \
> > > > + __addr &= untag_mask | sign; \
> > > > + } \
> > > > (__force __typeof__(addr))__addr; \
> > > > })
> > > >
> > > > #define untagged_addr(addr) __untagged_addr(current_untag_mask(), addr)
> > >
> > > Is the compiler clever enough to put the memop inside the branch?
> >
> > Hm. You mean current_untag_mask() inside static_branch_likely()?
> >
> > But it is preprocessor who does this, not compiler. So, yes, the memop is
> > inside the branch.
> >
> > Or I didn't understand your question.
>
> Nah, call it a pre-lunch dip, I overlooked the whole CPP angle -- d'0h.
>
> That said, I did just put it through a compiler to see wth it did and it
> is pretty gross:

I tried to replace static branch with alternative. It kinda works, but
required few hack. Thanks to Andrew Cooper for helping to untangle them.

I am not sure if it worth the effort. I don't have any evidence that it
helps. untagged_addr() overhead is rather small and hides in noise of
syscall cost.

I only made alternative for untagged_addr(), but not for
untagged_addr_remote(). _remote() case has very few users.

BTW, it would be nice to be able to apply alternative later, delaying it
until the first user of LAM, like I did with static_branch.
We don't have a way to do this right?

Any opinions? I am okay dropping the patch altogether.

diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index c44b56f7ffba..3f0c31044f02 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -75,6 +75,12 @@
# define DISABLE_CALL_DEPTH_TRACKING (1 << (X86_FEATURE_CALL_DEPTH & 31))
#endif

+#ifdef CONFIG_ADDRESS_MASKING
+# define DISABLE_LAM 0
+#else
+# define DISABLE_LAM (1 << (X86_FEATURE_LAM & 31))
+#endif
+
#ifdef CONFIG_INTEL_IOMMU_SVM
# define DISABLE_ENQCMD 0
#else
@@ -115,7 +121,7 @@
#define DISABLED_MASK10 0
#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \
DISABLE_CALL_DEPTH_TRACKING)
-#define DISABLED_MASK12 0
+#define DISABLED_MASK12 (DISABLE_LAM)
#define DISABLED_MASK13 0
#define DISABLED_MASK14 0
#define DISABLED_MASK15 0
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index f9f85d596581..57ccb91fcccf 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -9,6 +9,7 @@
#include <linux/kasan-checks.h>
#include <linux/mm_types.h>
#include <linux/string.h>
+#include <linux/mmap_lock.h>
#include <asm/asm.h>
#include <asm/page.h>
#include <asm/smap.h>
@@ -24,28 +25,48 @@ static inline bool pagefault_disabled(void);
#endif

#ifdef CONFIG_ADDRESS_MASKING
-DECLARE_STATIC_KEY_FALSE(tagged_addr_key);
+static inline unsigned long __untagged_addr(unsigned long addr)
+{
+ /*
+ * Magic with the 'sign' allows to untag userspace pointer without
+ * any branches while leaving kernel addresses intact.
+ */
+ long sign;
+
+ /*
+ * Refer tlbstate_untag_mask directly to avoid RIP-relative relocation
+ * in alternative instructions. The relocation gets wrong when gets
+ * copied to the target place.
+ */
+ asm (ALTERNATIVE("",
+ "sar $63, %[sign]\n\t" /* user_ptr ? 0 : -1UL */
+ "or %%gs:tlbstate_untag_mask, %[sign]\n\t"
+ "and %[sign], %[addr]\n\t", X86_FEATURE_LAM)
+ : [addr] "+r" (addr), [sign] "=r" (sign)
+ : "m" (tlbstate_untag_mask), "[sign]" (addr));
+
+ return addr;
+}

-/*
- * Mask out tag bits from the address.
- *
- * Magic with the 'sign' allows to untag userspace pointer without any branches
- * while leaving kernel addresses intact.
- */
-#define __untagged_addr(untag_mask, addr) ({ \
- u64 __addr = (__force u64)(addr); \
- if (static_branch_likely(&tagged_addr_key)) { \
- s64 sign = (s64)__addr >> 63; \
- __addr &= untag_mask | sign; \
- } \
- (__force __typeof__(addr))__addr; \
+#define untagged_addr(addr) ({ \
+ unsigned long __addr = (__force unsigned long)(addr); \
+ (__force __typeof__(addr))__untagged_addr(__addr); \
})

-#define untagged_addr(addr) __untagged_addr(current_untag_mask(), addr)
+static inline unsigned long __untagged_addr_remote(struct mm_struct *mm,
+ unsigned long addr)
+{
+ long sign = addr >> 63;
+
+ mmap_assert_locked(mm);
+ addr &= (mm)->context.untag_mask | sign;
+
+ return addr;
+}

#define untagged_addr_remote(mm, addr) ({ \
- mmap_assert_locked(mm); \
- __untagged_addr((mm)->context.untag_mask, addr); \
+ unsigned long __addr = (__force unsigned long)(addr); \
+ (__force __typeof__(addr))__untagged_addr_remote(mm, __addr); \
})

#else
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 0831d2be190f..e006725afdf1 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -745,9 +745,6 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)

#ifdef CONFIG_ADDRESS_MASKING

-DEFINE_STATIC_KEY_FALSE(tagged_addr_key);
-EXPORT_SYMBOL_GPL(tagged_addr_key);
-
#define LAM_U57_BITS 6

static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
@@ -787,8 +784,6 @@ static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);

mmap_write_unlock(mm);
-
- static_branch_enable(&tagged_addr_key);
return 0;
}
#endif
--
Kiryl Shutsemau / Kirill A. Shutemov