2023-03-03 21:10:06

by Mateusz Guzik

[permalink] [raw]

On 3/4/23, Mateusz Guzik <[email protected]> wrote:
> On 3/3/23, Linus Torvalds <[email protected]> wrote:
>> On Fri, Mar 3, 2023 at 12:39 PM Mateusz Guzik <[email protected]> wrote:
>>>
>>> I think there is a systemic problem which comes with the kzalloc API
>>
>> Well, it's not necessarily the API that is bad, but the implementation.
>>
>> We could easily make kzalloc() with a constant size just expand to
>> kmalloc+memset, and get the behavior you want.
>>
>> We already do magical things for "find the right slab bucket" part of
>> kmalloc too for constant sizes. It's changed over the years, but that
>> policy goes back a long long time. See
>>
>>
>> https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git/commit/?id=95203fe78007f9ab3aebb96606473ae18c00a5a8
>>
>> from the BK history tree.
>>
>> Exactly because some things are worth optimizing for when the size is
>> known at compile time.
>>
>> Maybe just extending kzalloc() similarly? Trivial and entirely untested
>> patch:
>>
>> --- a/include/linux/slab.h
>> +++ b/include/linux/slab.h
>> @@ -717,6 +717,12 @@ static inline void *kmem_cache_zalloc(struct
>> kmem_cache *k, gfp_t flags)
>> */
>> static inline __alloc_size(1) void *kzalloc(size_t size, gfp_t flags)
>> {
>> + if (__builtin_constant_p(size)) {
>> + void *ret = kmalloc(size, flags);
>> + if (ret)
>> + memset(ret, 0, size);
>> + return ret;
>> + }
>> return kmalloc(size, flags | __GFP_ZERO);
>> }
>>
>
> So I played with this and have a rather nasty summary. Bullet points:
> 1. patched kzalloc does not reduce memset calls during kernel build
> 2. patched kmem_cache_zalloc_ptr + 2 consumers converted *does* drop
> it significantly (36150671 -> 14414454)
> 3. ... inline memset generated by gcc sucks by resorting to rep stosq
> around 48 bytes
> 4. memsets not sorted out have sizes not known at compilation time and
> are not necessarily perf bugs on their own [read: would benefit from
> faster memset]
>
> Onto the meat:
>
> I patched the kernel with a slightly tweaked version of the above:
> diff --git a/include/linux/slab.h b/include/linux/slab.h
> index 45af70315a94..7abb5490690f 100644
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -717,6 +717,12 @@ static inline void *kmem_cache_zalloc(struct
> kmem_cache *k, gfp_t flags)
> */
> static inline __alloc_size(1) void *kzalloc(size_t size, gfp_t flags)
> {
> + if (__builtin_constant_p(size)) {
> + void *ret = kmalloc(size, flags);
> + if (likely(ret))
> + memset(ret, 0, size);
> + return ret;
> + }
> return kmalloc(size, flags | __GFP_ZERO);
> }
>
> and verified it indeed zeroes inline:
>
> void kztest(void)
> {
> void *ptr;
>
> ptr = kzalloc(32, GFP_KERNEL);
> if (unlikely(!ptr))
> return;
> memsettest_rec(ptr);
> }
>
> $ objdump --disassemble=kztest vmlinux
> [snip]
> call ffffffff8135e130 <kmalloc_trace>
> test %rax,%rax
> je ffffffff81447d5f <kztest+0x4f>
> movq $0x0,(%rax)
> mov %rax,%rdi
> movq $0x0,0x8(%rax)
> movq $0x0,0x10(%rax)
> movq $0x0,0x18(%rax)
> call ffffffff81454060 <memsettest_rec>
> [snip]
>
> This did *NOT* lead to reduction of memset calls when building the kernel.
>
> I verified a few cases by hand, it is all either kmem_cache_zalloc or
> explicitly added memsets with sizes not known at compilation time.
>
> Two most frequent callers:
> @[
> memset+5
> __alloc_file+40
> alloc_empty_file+73
> path_openat+77
> do_filp_open+182
> do_sys_openat2+159
> __x64_sys_openat+89
> do_syscall_64+93
> entry_SYSCALL_64_after_hwframe+114
> ]: 11028994
> @[
> memset+5
> security_file_alloc+45
> __alloc_file+89
> alloc_empty_file+73
> path_openat+77
> do_filp_open+182
> do_sys_openat2+159
> __x64_sys_openat+89
> do_syscall_64+93
> entry_SYSCALL_64_after_hwframe+114
> ]: 11028994
>
> My wip addition is:
>
> diff --git a/include/linux/slab.h b/include/linux/slab.h
> index 45af70315a94..12b5b02ef3d3 100644
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -710,6 +710,17 @@ static inline void *kmem_cache_zalloc(struct
> kmem_cache *k, gfp_t flags)
> return kmem_cache_alloc(k, flags | __GFP_ZERO);
> }
>
> +#define kmem_cache_zalloc_ptr(k, f, retp) ({ \
> + __typeof(retp) _retp = kmem_cache_alloc(k, f); \
> + bool _rv = false; \
> + retp = _retp; \
> + if (likely(_retp)) { \
> + memset(_retp, 0, sizeof(*_retp)); \
> + _rv = true; \
> + } \
> + _rv; \
> +})
> +
> diff --git a/security/security.c b/security/security.c
> index cf6cc576736f..0f769ede0e54 100644
> --- a/security/security.c
> +++ b/security/security.c
> @@ -600,8 +600,7 @@ static int lsm_file_alloc(struct file *file)
> return 0;
> }
>
> - file->f_security = kmem_cache_zalloc(lsm_file_cache, GFP_KERNEL);
> - if (file->f_security == NULL)
> + if (!kmem_cache_zalloc_ptr(lsm_file_cache, GFP_KERNEL,
> file->f_security))
> return -ENOMEM;
> return 0;
> }

This one is actually buggy -- f_security is a void * pointer and
sizeof(*file->f_security) returns just 1. The macro does not have any
safety belts against that -- it should probably check for void * at
compilation time and get a BUG_ON for runtime mismatch. Does not
affect the idea though.

Good news: gcc provides a lot of control as to how it inlines string
ops, most notably:
-mstringop-strategy=alg
Override the internal decision heuristic for the particular
algorithm to use for inlining string
operations. The allowed values for alg are:

rep_byte
rep_4byte
rep_8byte
Expand using i386 "rep" prefix of the specified size.

byte_loop
loop
unrolled_loop
Expand into an inline loop.

libcall
Always use a library call.

I'm going to play with it and send something more presentable.

> diff --git a/fs/file_table.c b/fs/file_table.c
> index 372653b92617..8e0dabf9530e 100644
> --- a/fs/file_table.c
> +++ b/fs/file_table.c
> @@ -136,8 +136,7 @@ static struct file *__alloc_file(int flags, const
> struct cred *cred)
> struct file *f;
> int error;
>
> - f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
> - if (unlikely(!f))
> + if (!kmem_cache_zalloc_ptr(filp_cachep, GFP_KERNEL, f))
> return ERR_PTR(-ENOMEM);
>
> f->f_cred = get_cred(cred);
>
> As mentioned above it cuts total calls in more than half.
>
> The problem is it rolls with rep stosq way too easily, partially
> defeating the point of inlining anything. clang does not have this
> problem.
>
> Take a look at __alloc_file:
> [snip]
> mov 0x19cab05(%rip),%rdi # ffffffff82df4318 <filp_cachep>
> call ffffffff813dd610 <kmem_cache_alloc>
> test %rax,%rax
> je ffffffff814298b7 <__alloc_file+0xc7>
> mov %rax,%r12
> mov $0x1d,%ecx
> xor %eax,%eax
> mov %r12,%rdi
> rep stos %rax,%es:(%rdi)
> [/snip]
>
> Here is a sample consumer which can't help but have a variable size --
> select, used by gmake:
> @[
> memset+5
> do_pselect.constprop.0+202
> __x64_sys_pselect6+101
> do_syscall_64+93
> entry_SYSCALL_64_after_hwframe+114
> ]: 13160
>
> In conclusion:
> 1. fixing up memsets is worthwhile regardless of what happens to its
> current consumers -- not all of them are necessarily doing something
> wrong
> 2. inlining memset can be a pessimization vs non-plain-erms memset as
> evidenced above. will have to figure out how to convince gcc to be
> less eager to use it.
>
> Sometimes I hate computers.
>
> --
> Mateusz Guzik <mjguzik gmail.com>
>

--
Mateusz Guzik <mjguzik gmail.com>

2023-03-04 20:35:08

On Sat, Mar 4, 2023 at 12:51 PM Yury Norov <[email protected]> wrote:
>
> > That particular code sequence is arguably broken to begin with.
> > setall() should really only be used as a mask, most definitely not as
> > some kind of "all possible cpus".
>
> Sorry, don't understand this.

See the example patch I sent out.

Literally just make the rule be "we play games with cpumasks in that
they have two different 'sizes', so just make sure the bits in the
bigger and faster size are always clear".

That simple rule just means that we can then use that bigger constant
size in all cases where "upper bits zero" just don't matter.

Which is basically all of them.

Your for_each_cpu_not() example is actually a great example: it should
damn well not exist at all. I hadn't even noticed how broken it was.
Exactly like the other broken case (that I *did* notice -
cpumask_complement), it has no actual valid users. It _literally_ only
exists as a pointless test-case.

So this is *literally* what I'm talking about: you are making up silly
cases that then act as "arguments" for making all the _real_ cases
slower.

Stop it.

Silly useless cases are just that - silly and useless. They should not
be arguments for the real cases then being optimized and simplified.

Updated patch to remove 'for_each_cpu_not()' attached.

It's still completely untested. Treat this very much as a "Let's make
the common cases faster, at least for !MAXSMP".

Linus

Attachments:

patch.diff (7.52 kB)

2023-03-04 21:03:35

by Linus Torvalds

[permalink] [raw]

Subject: Re: [PATCH v3 2/2] vfs: avoid duplicating creds in faccessat if possible

On Sat, Mar 4, 2023 at 1:01 PM Linus Torvalds
<[email protected]> wrote:
>
> Silly useless cases are just that - silly and useless. They should not
> be arguments for the real cases then being optimized and simplified.

There's a missing "not" above, that was hopefully obvious from the
context: "They should not be arguments for the real cases then NOT
being optimized and simplified"

Linus

2023-03-04 21:10:58

by Linus Torvalds

[permalink] [raw]

Subject: Re: [PATCH v3 2/2] vfs: avoid duplicating creds in faccessat if possible

On Sat, Mar 4, 2023 at 1:01 PM Linus Torvalds
<[email protected]> wrote:
>
> It's still completely untested. Treat this very much as a "Let's make
> the common cases faster, at least for !MAXSMP".

Ok, so I started "testing" it in the sense that I actually looked at
the code it generated, and went all "why didn't it make any
difference".

And that's because the patch had the

#ifdef CONFIG_CPUMASK_OFFSTACK

condition exactly the wrong way around.

So if somebody actually wants to play with that patch, you need to
change that to be

#ifndef CONFIG_CPUMASK_OFFSTACK

(and then you obviously need to have a kernel config that does *not*
have MAXSMP set).

That at least simplifies some of the code generation when I look at
it. Whether the end result _works_ or not, I still haven't checked.
That patch is still very much meant as a "look, something like this
should make our cpumask handling much more efficient"

Linus

2023-03-04 23:09:23

On Sun, Mar 5, 2023 at 1:44 AM Linus Torvalds
<[email protected]> wrote:
>
> On Sat, Mar 4, 2023 at 1:10 PM Linus Torvalds
> <[email protected]> wrote:
> >
> > Whether the end result _works_ or not, I still haven't checked.
>
> Well, this particular patch at least boots for me for my normal
> config. Not that I've run any extensive tests, but I'm writing this
> email while running this patch, so ..
>

Hi Linus,

can you share your "normal config", please?

This is what the latest Debian kernel from unstable has (discussed Kconfigs):

# egrep 'NR_CPUS|CPUMASK_OFFSTACK|MAXSMP' /boot/config-6.1.0-5-amd64
397:CONFIG_MAXSMP=y
398:CONFIG_NR_CPUS_RANGE_BEGIN=8192
399:CONFIG_NR_CPUS_RANGE_END=8192
400:CONFIG_NR_CPUS_DEFAULT=8192
401:CONFIG_NR_CPUS=8192
10214:CONFIG_CPUMASK_OFFSTACK=y
10215:# CONFIG_FORCE_NR_CPUS is not set

What are "sane" settings in your eyes?

Full Debian kernel-config for AMD64 is attached.

Thanks.

Best regards,
-Sedat-

Attachments:

config-6.1.0-5-amd64 (253.41 kB)

2023-03-05 17:23:47

by David Laight

[permalink] [raw]

Subject: RE: [PATCH v3 2/2] vfs: avoid duplicating creds in faccessat if possible

From: Linus Torvalds
> Sent: 04 March 2023 20:48
>
> On Sat, Mar 4, 2023 at 12:31 PM Mateusz Guzik <[email protected]> wrote:
> >
> > Good news: gcc provides a lot of control as to how it inlines string
> > ops, most notably:
> > -mstringop-strategy=alg
>
> Note that any static decision is always going to be crap somewhere.
> You can make it do the "optimal" thing for any particular machine, but
> I consider that to be just garbage.
>
> What I would actually like to see is the compiler always generate an
> out-of-line call for the "big enough to not just do inline trivially"
> case, but do so with the "rep stosb/movsb" calling convention.

I think you also want it to differentiate between requests that
are known to be a whole number of words and ones that might
be byte sized.

For the kmalloc+memzero case you know you can zero a whole
number of words - so all the checks memset has to do for
byte length/alignment can be removed.

The same is true for memcpy() calls used for structure copies.
The compiler knows that aligned full-word copies can be done.
So it shouldn't be calling a function that has to redo the tests.

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

2023-03-05 18:17:53

by Linus Torvalds

[permalink] [raw]

Subject: Re: [PATCH v3 2/2] vfs: avoid duplicating creds in faccessat if possible

On Sun, Mar 5, 2023 at 1:26 AM Sedat Dilek <[email protected]> wrote:
>
> can you share your "normal config", please?

Well, I just have CONFIG_NR_CPUS set to 64.

That happens to be the number of actual cores (well, threads) I have
on my desktop, but it's what I use for my laptop too (that has 8
cores).

Basically, I consider CONFIG_NR_CPUS=64 is likely the "sweet spot" for
code generation and still covering 99+% of all machines out there.

Now, MAXSMP is great for (a) coverage testing and for (b) being able
to handle pretty much *anything* out there, but it really was
originally meant for the SGI kind of hardware: not exactly
off-the-shelf.

So I use MAXSMP for compile testing (by virtue of "allmodconfig"), and
it's great for that. But unless you have more than several hundred
cpus in your machine, you should never use it.

There are a few main issues with MAXSMP:

- the simple ("common") embedded cpu masks end up being big (ie any
data structure that has a "cpumask_t" in it will be huge, just because
the static size of 'struct cpumask' is 8192 bits, ie 1kB)

- the fancy case of using a "cpumask_var_t" will use a pointer and a
dynamic allocation (which is then sized to be appropriate to the
*actual* number of CPU's, so that you don't have to allocate 8192 bits
for everything).

- the code generation ends up inevitably being about variable-sized
loops, because nobody wants to traverse those kinds of data structures

In contrast, if you use CONFIG_NR_CPUS=64, both the embedded and
"fancy" version will be just a single 64-bit word. No extra pointer
overhead, no indirection through said pointers, and no need for loops
(ok, there will still be loops for walking the bits in the word, but a
lot of them will actually be about using instructions like "bsf" etc).

So you end up with better code, smaller data structures, and less
pointer chasing.

So those two situations are generally the two "sane" configurations: a
good reasonable NR_CPUS that works for just about everybody, and then
another extreme config for the 1% (or - more likely - 0.01%)

Now, it's not like 64 is somehow magical. Picking something like
NR_CPUS of 192 is perfectly fine too - it will use three words for the
bitmap, it will still avoid the pointer indirection, it will have a
few small fixed-size loops. It's certainly not *wrong*. It will cover
bigger HEDT machines, but I feel like the HEDT people probably are
special enough that they could probably just use the MAXSMP case, or -
if they care - just build their own kernels.

So you *can* certainly pick other values. We used to have special UP
vs SMP kernel builds, and that clearly no longer makes sense. Nobody
cares about UP on x86-64.

But I do feel like MAXSMP is simply not a great config for 99.9% of
all people, and if you are willing to have two configs, then that "64
or MAXSMP" seems to be the sane split.

And with that split, there will be *very* few people who actually use MAXSMP.

Linus

2023-03-05 18:43:49

by Linus Torvalds

[permalink] [raw]

Subject: Re: [PATCH v3 2/2] vfs: avoid duplicating creds in faccessat if possible

On Sun, Mar 5, 2023 at 10:17 AM Linus Torvalds
<[email protected]> wrote:
>
> There are a few main issues with MAXSMP:

It's probably worth noting that most architectures don't even support
MAXSMP at all.

Only x86-64 does.

For example, ia64 and sparc64, which both did technically support a
lot of cores, just made "cpumask_t" huge, and had no support for the
whole "use a pointer to an indirect allocation".

That ends up meaning that you allocate those huge structures on the
stack or just make other structures enormous when they contain a CPU
mask, but it mostly works. It's a horrid, horrid model, though. But at
least ia64 had 64kB stacks anyway, and in the book of "bad engineering
decisions of Itanium", this is all just a footnote.

arm64 also has that "range 2 4096" for number of CPUs but defaults to
a much saner 256 cpus.

I suspect (and sincerely hope) that nobody actually tries to use an
arm64 build with that 4k cpu build. If/when arm64 actually does get up
to that "thousands of cores" situation, they'll hopefully enable the
MAXSMP kind of indirection and off-stack cpu mask arrays.

So MAXSMP and the whole CPUMASK_OFFSTACK option is an architecture
choice, and you don't have to do it the way x86-64 does it. But the
x86 choice is likely the best tested and thought out by far.

For example, POWERPC technically supports CPUMASK_OFFSTACK too, but
really only in theory. On powerpc, you have

config NR_CPUS
range 2 8192 if SMP
default "32" if PPC64

so while the configurable range is technically up to 8k CPUs, I doubt
people use that value very much. And we have

select CPUMASK_OFFSTACK if NR_CPUS >= 8192

so it only uses that OFFSTACK one if you pick exactly 8192 CPUs (which
presumably nobody does in real life outside of build testing - it's
not the default, and I think most of the POWER range tops out in the
192 core range, eg E980 with 16 sockets of 12 cores each).

So I suspect that x86-64 is the *only* one to actually use this
widely, and I think distros have been *much* too eager to do so.

The fact that most distros default to

CONFIG_MAXSMP=y
CONFIG_NR_CPUS=8192

seems pretty crazy, when I have a hard time finding anything with more
than 192 cores. I'm sure they exist. But do they _really_ run
unmodified vendor kernels?

Linus

2023-03-06 05:44:04

by Yury Norov

[permalink] [raw]

Subject: Re: [PATCH v3 2/2] vfs: avoid duplicating creds in faccessat if possible

On Sat, Mar 04, 2023 at 03:08:49PM -0800, Linus Torvalds wrote:
> On Sat, Mar 4, 2023 at 1:10 PM Linus Torvalds
> <[email protected]> wrote:
> >
> > Whether the end result _works_ or not, I still haven't checked.
>
> Well, this particular patch at least boots for me for my normal
> config. Not that I've run any extensive tests, but I'm writing this
> email while running this patch, so ..
>
> Linus

I didn't test it properly, but the approach looks good. Need some time
to think on implications of the new rule. At the first glance, there
should be no major impact on cpumask machinery.

It should be very well tested on arm and m68k because they implement
their own bitmap functions.

Please see comments inline.

Thanks,
Yury

[...]

> diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
> index 10c92bd9b807..bd9576e8d856 100644
> --- a/include/linux/cpumask.h
> +++ b/include/linux/cpumask.h
> @@ -50,8 +50,30 @@ static inline void set_nr_cpu_ids(unsigned int nr)
> #endif
> }
>
> -/* Deprecated. Always use nr_cpu_ids. */
> -#define nr_cpumask_bits nr_cpu_ids
> +/*
> + * The difference between nr_cpumask_bits and nr_cpu_ids is that
> + * 'nr_cpu_ids' is the actual number of CPU ids in the system, while
> + * nr_cpumask_bits is a "reasonable upper value" that is often more
> + * efficient because it can be a fixed constant.
> + *
> + * So when clearing or traversing a cpumask, use 'nr_cpumask_bits',
> + * but when checking exact limits (and when _setting_ bits), use the
> + * tighter exact limit of 'nr_cpu_ids'.
> + *
> + * NOTE! The code depends on any exyta bits in nr_cpumask_bits a always

s/exyta/extra ?
s/a always/as always ?

> + * being (a) allocated and (b) zero, so that the only effect of using
> + * 'nr_cpumask_bits' is that we might return a higher maximum CPU value
> + * (which is why we have that pattern of
> + *
> + * Returns >= nr_cpu_ids if no cpus set.
> + *
> + * for many of the functions - they can return that higher value).
> + */
> +#ifndef CONFIG_CPUMASK_OFFSTACK
> + #define nr_cpumask_bits ((unsigned int)NR_CPUS)
> +#else
> + #define nr_cpumask_bits nr_cpu_ids
> +#endif
>
> /*
> * The following particular system cpumasks and operations manage
> @@ -114,7 +136,7 @@ static __always_inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bit
> /* verify cpu argument to cpumask_* operators */
> static __always_inline unsigned int cpumask_check(unsigned int cpu)
> {
> - cpu_max_bits_warn(cpu, nr_cpumask_bits);
> + cpu_max_bits_warn(cpu, nr_cpu_ids);
> return cpu;
> }
>
> @@ -248,16 +270,6 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
> #define for_each_cpu(cpu, mask) \
> for_each_set_bit(cpu, cpumask_bits(mask), nr_cpumask_bits)
>
> -/**
> - * for_each_cpu_not - iterate over every cpu in a complemented mask
> - * @cpu: the (optionally unsigned) integer iterator
> - * @mask: the cpumask pointer
> - *
> - * After the loop, cpu is >= nr_cpu_ids.
> - */
> -#define for_each_cpu_not(cpu, mask) \
> - for_each_clear_bit(cpu, cpumask_bits(mask), nr_cpumask_bits)
> -

We can do it like:

for ((bit) = 0;
(bit) = find_next_zero_bit((addr), nr_cpumask_bits, (bit)),
(bit) < nr_cpu_ids;
(bit)++)

> #if NR_CPUS == 1
> static inline
> unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap)
> @@ -495,10 +507,14 @@ static __always_inline bool cpumask_test_and_clear_cpu(int cpu, struct cpumask *
> /**
> * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask
> * @dstp: the cpumask pointer
> + *
> + * Note: since we set bits, we should use the tighter 'bitmap_set()' with
> + * the eact number of bits, not 'bitmap_fill()' that will fill past the

s/eact/exact

> + * end.
> */
> static inline void cpumask_setall(struct cpumask *dstp)
> {
> - bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits);
> + bitmap_set(cpumask_bits(dstp), 0, nr_cpu_ids);
> }

It should be like:

+ bitmap_set(cpumask_bits(dstp), 0, nr_cpu_ids);
+ bitmap_clear(cpumask_bits(dstp), nr_cpu_ids, nr_cpumask_bits);

Because bitmap_set() will not zero memory beyond round_up(nr_cpu_ids, 64).