2021-01-10 17:44:19

by Alexey Gladkov

[permalink] [raw]
Subject: [RFC PATCH v2 2/8] Add a reference to ucounts for each user

Before this, only the owner of the user namespace had an entry in ucounts.
This entry addressed the user in the given user namespace.

Now we create such an entry in ucounts for all users in the user namespace.
Each user has only one entry for each user namespace.

This commit is in preparation for migrating rlimits to ucounts.

Signed-off-by: Alexey Gladkov <[email protected]>
---
include/linux/cred.h | 1 +
include/linux/user_namespace.h | 2 ++
kernel/cred.c | 17 +++++++++++++++--
kernel/ucount.c | 12 +++++++++++-
kernel/user_namespace.c | 1 +
5 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/include/linux/cred.h b/include/linux/cred.h
index 18639c069263..307744fcc387 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -144,6 +144,7 @@ struct cred {
#endif
struct user_struct *user; /* real user ID subscription */
struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
+ struct ucounts *ucounts;
struct group_info *group_info; /* supplementary groups for euid/fsgid */
/* RCU deletion */
union {
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 84fefa9247c4..483568a56f7f 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -102,6 +102,8 @@ bool setup_userns_sysctls(struct user_namespace *ns);
void retire_userns_sysctls(struct user_namespace *ns);
struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type);
void dec_ucount(struct ucounts *ucounts, enum ucount_type type);
+void put_ucounts(struct ucounts *ucounts);
+void set_cred_ucounts(const struct cred *cred, struct user_namespace *ns, kuid_t uid);

#ifdef CONFIG_USER_NS

diff --git a/kernel/cred.c b/kernel/cred.c
index 421b1149c651..d19e2e97092c 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -119,6 +119,7 @@ static void put_cred_rcu(struct rcu_head *rcu)
if (cred->group_info)
put_group_info(cred->group_info);
free_uid(cred->user);
+ put_ucounts(cred->ucounts);
put_user_ns(cred->user_ns);
kmem_cache_free(cred_jar, cred);
}
@@ -144,6 +145,9 @@ void __put_cred(struct cred *cred)
BUG_ON(cred == current->cred);
BUG_ON(cred == current->real_cred);

+ BUG_ON(cred->ucounts == NULL);
+ BUG_ON(cred->ucounts->ns != cred->user_ns);
+
if (cred->non_rcu)
put_cred_rcu(&cred->rcu);
else
@@ -271,6 +275,9 @@ struct cred *prepare_creds(void)
get_uid(new->user);
get_user_ns(new->user_ns);

+ new->ucounts = NULL;
+ set_cred_ucounts(new, new->user_ns, new->euid);
+
#ifdef CONFIG_KEYS
key_get(new->session_keyring);
key_get(new->process_keyring);
@@ -363,6 +370,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
ret = create_user_ns(new);
if (ret < 0)
goto error_put;
+ set_cred_ucounts(new, new->user_ns, new->euid);
}

#ifdef CONFIG_KEYS
@@ -485,8 +493,11 @@ int commit_creds(struct cred *new)
* in set_user().
*/
alter_cred_subscribers(new, 2);
- if (new->user != old->user)
- atomic_inc(&new->user->processes);
+ if (new->user != old->user || new->user_ns != old->user_ns) {
+ if (new->user != old->user)
+ atomic_inc(&new->user->processes);
+ set_cred_ucounts(new, new->user_ns, new->euid);
+ }
rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new);
if (new->user != old->user)
@@ -661,6 +672,7 @@ void __init cred_init(void)
/* allocate a slab in which we can store credentials */
cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
+ set_cred_ucounts(&init_cred, &init_user_ns, GLOBAL_ROOT_UID);
}

/**
@@ -704,6 +716,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
get_uid(new->user);
get_user_ns(new->user_ns);
get_group_info(new->group_info);
+ set_cred_ucounts(new, new->user_ns, new->euid);

#ifdef CONFIG_KEYS
new->session_keyring = NULL;
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 0f2c7c11df19..80a39073bcef 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -161,7 +161,7 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
return ucounts;
}

-static void put_ucounts(struct ucounts *ucounts)
+void put_ucounts(struct ucounts *ucounts)
{
unsigned long flags;

@@ -175,6 +175,16 @@ static void put_ucounts(struct ucounts *ucounts)
kfree(ucounts);
}

+void set_cred_ucounts(const struct cred *cred, struct user_namespace *ns, kuid_t uid)
+{
+ if (cred->ucounts) {
+ if (cred->ucounts->ns == ns && uid_eq(cred->ucounts->uid, uid))
+ return;
+ put_ucounts(cred->ucounts);
+ }
+ ((struct cred *) cred)->ucounts = get_ucounts(ns, uid);
+}
+
static inline bool atomic_inc_below(atomic_t *v, int u)
{
int c, old;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index af612945a4d0..4b8a4468d391 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -1280,6 +1280,7 @@ static int userns_install(struct nsset *nsset, struct ns_common *ns)

put_user_ns(cred->user_ns);
set_cred_user_ns(cred, get_user_ns(user_ns));
+ set_cred_ucounts(cred, user_ns, cred->euid);

return 0;
}
--
2.29.2


2021-01-13 06:20:00

by kernel test robot

[permalink] [raw]
Subject: 59ebc79722: kernel_BUG_at_kernel/cred.c


Greeting,

FYI, we noticed the following commit (built with gcc-9):

commit: 59ebc797229e679f2c87fc13f6859ba7c0f2bdc3 ("[RFC PATCH v2 2/8] Add a reference to ucounts for each user")
url: https://github.com/0day-ci/linux/commits/Alexey-Gladkov/Count-rlimits-in-each-user-namespace/20210111-014938
base: https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git 2ff90100ace886895e4fbb2850b8d5e49d931ed6

in testcase: trinity
version: trinity-i386
with following parameters:

runtime: 300s

test-description: Trinity is a linux system call fuzz tester.
test-url: http://codemonkey.org.uk/projects/trinity/


on test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 8G

caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):


+------------------------------------------+------------+------------+
| | e58c759c87 | 59ebc79722 |
+------------------------------------------+------------+------------+
| boot_successes | 10 | 0 |
| boot_failures | 0 | 12 |
| kernel_BUG_at_kernel/cred.c | 0 | 7 |
| invalid_opcode:#[##] | 0 | 7 |
| RIP:__put_cred | 0 | 7 |
| Kernel_panic-not_syncing:Fatal_exception | 0 | 7 |
| WARNING:at_kernel/ucount.c:#dec_ucount | 0 | 5 |
| RIP:dec_ucount | 0 | 5 |
+------------------------------------------+------------+------------+


If you fix the issue, kindly add following tag
Reported-by: kernel test robot <[email protected]>


[ 16.291000] kernel BUG at kernel/cred.c:148!
[ 16.292585] invalid opcode: 0000 [#1] SMP PTI
[ 16.295176] CPU: 0 PID: 581 Comm: trinity-c1 Not tainted 5.11.0-rc2-00426-g59ebc797229e #1
[ 16.300880] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
[ 16.304261] RIP: 0010:__put_cred (kbuild/src/consumer/kernel/cred.c:148 (discriminator 1))
[ 16.308047] Code: 00 00 4c 8d 87 a0 00 00 00 85 c0 74 08 4c 89 c7 e9 1d ff ff ff 48 c7 c6 20 c3 28 a9 4c 89 c7 e9 ce 79 04 00 0f 0b 0f 0b 0f 0b <0f> 0b 0f 0b 0f 1f 40 00 e9 5b 6b 2f 00 66 66 2e 0f 1f 84 00 00 00
All code
========
0: 00 00 add %al,(%rax)
2: 4c 8d 87 a0 00 00 00 lea 0xa0(%rdi),%r8
9: 85 c0 test %eax,%eax
b: 74 08 je 0x15
d: 4c 89 c7 mov %r8,%rdi
10: e9 1d ff ff ff jmpq 0xffffffffffffff32
15: 48 c7 c6 20 c3 28 a9 mov $0xffffffffa928c320,%rsi
1c: 4c 89 c7 mov %r8,%rdi
1f: e9 ce 79 04 00 jmpq 0x479f2
24: 0f 0b ud2
26: 0f 0b ud2
28: 0f 0b ud2
2a:* 0f 0b ud2 <-- trapping instruction
2c: 0f 0b ud2
2e: 0f 1f 40 00 nopl 0x0(%rax)
32: e9 5b 6b 2f 00 jmpq 0x2f6b92
37: 66 data16
38: 66 data16
39: 2e cs
3a: 0f .byte 0xf
3b: 1f (bad)
3c: 84 00 test %al,(%rax)
...

Code starting with the faulting instruction
===========================================
0: 0f 0b ud2
2: 0f 0b ud2
4: 0f 1f 40 00 nopl 0x0(%rax)
8: e9 5b 6b 2f 00 jmpq 0x2f6b68
d: 66 data16
e: 66 data16
f: 2e cs
10: 0f .byte 0xf
11: 1f (bad)
12: 84 00 test %al,(%rax)
...
[ 16.314607] RSP: 0018:ffffa9090080bee8 EFLAGS: 00010246
[ 16.316319] RAX: 0000000000000000 RBX: ffff97ecc5ba8d80 RCX: 000000000000fffe
[ 16.318408] RDX: ffff97ecc6316d80 RSI: 0000000000000000 RDI: ffff97ecc6316cc0
[ 16.320545] RBP: ffff97ecc6316cc0 R08: 00000000000000c0 R09: ffff97ecc6316cc0
[ 16.322689] R10: 0000000000000004 R11: 0000000000003433 R12: ffffffffffffffff
[ 16.326628] R13: ffff97ecc6316d60 R14: 0000000000000000 R15: ffff97ecc5be4380
[ 16.332744] FS: 0000000000000000(0000) GS:ffff97edf7c00000(0063) knlGS:000000000a305880
[ 16.335685] CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033
[ 16.337531] CR2: 00000000f7971de0 CR3: 0000000105a34000 CR4: 00000000000006f0
[ 16.339776] Call Trace:
[ 16.343257] keyctl_session_to_parent (kbuild/src/consumer/security/keys/keyctl.c:1711)
[ 16.344926] __do_fast_syscall_32 (kbuild/src/consumer/arch/x86/entry/common.c:78 kbuild/src/consumer/arch/x86/entry/common.c:137)
[ 16.346403] do_fast_syscall_32 (kbuild/src/consumer/arch/x86/entry/common.c:160)
[ 16.347724] entry_SYSENTER_compat_after_hwframe (kbuild/src/consumer/arch/x86/entry/entry_64_compat.S:141)
[ 16.352881] RIP: 0023:0xf7f71549
[ 16.354461] Code: Unable to access opcode bytes at RIP 0xf7f7151f.

Code starting with the faulting instruction
===========================================
[ 16.359740] RSP: 002b:00000000ffbc55dc EFLAGS: 00000206 ORIG_RAX: 0000000000000120
[ 16.362299] RAX: ffffffffffffffda RBX: 0000000000000012 RCX: 000000007818a343
[ 16.364587] RDX: 0000000002000000 RSI: 000000000000fffc RDI: 000000003e3e3e3e
[ 16.366789] RBP: 00000000fffffffd R08: 0000000000000000 R09: 0000000000000000
[ 16.369117] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
[ 16.372090] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
[ 16.377777] Modules linked in:
[ 16.378939] ---[ end trace 6eb09af71dd8bf1b ]---
[ 16.380446] RIP: 0010:__put_cred (kbuild/src/consumer/kernel/cred.c:148 (discriminator 1))
[ 16.381914] Code: 00 00 4c 8d 87 a0 00 00 00 85 c0 74 08 4c 89 c7 e9 1d ff ff ff 48 c7 c6 20 c3 28 a9 4c 89 c7 e9 ce 79 04 00 0f 0b 0f 0b 0f 0b <0f> 0b 0f 0b 0f 1f 40 00 e9 5b 6b 2f 00 66 66 2e 0f 1f 84 00 00 00
All code
========
0: 00 00 add %al,(%rax)
2: 4c 8d 87 a0 00 00 00 lea 0xa0(%rdi),%r8
9: 85 c0 test %eax,%eax
b: 74 08 je 0x15
d: 4c 89 c7 mov %r8,%rdi
10: e9 1d ff ff ff jmpq 0xffffffffffffff32
15: 48 c7 c6 20 c3 28 a9 mov $0xffffffffa928c320,%rsi
1c: 4c 89 c7 mov %r8,%rdi
1f: e9 ce 79 04 00 jmpq 0x479f2
24: 0f 0b ud2
26: 0f 0b ud2
28: 0f 0b ud2
2a:* 0f 0b ud2 <-- trapping instruction
2c: 0f 0b ud2
2e: 0f 1f 40 00 nopl 0x0(%rax)
32: e9 5b 6b 2f 00 jmpq 0x2f6b92
37: 66 data16
38: 66 data16
39: 2e cs
3a: 0f .byte 0xf
3b: 1f (bad)
3c: 84 00 test %al,(%rax)
...

Code starting with the faulting instruction
===========================================
0: 0f 0b ud2
2: 0f 0b ud2
4: 0f 1f 40 00 nopl 0x0(%rax)
8: e9 5b 6b 2f 00 jmpq 0x2f6b68
d: 66 data16
e: 66 data16
f: 2e cs
10: 0f .byte 0xf
11: 1f (bad)
12: 84 00 test %al,(%rax)


To reproduce:

# build kernel
cd linux
cp config-5.11.0-rc2-00426-g59ebc797229e .config
make HOSTCC=gcc-9 CC=gcc-9 ARCH=x86_64 olddefconfig prepare modules_prepare bzImage

git clone https://github.com/intel/lkp-tests.git
cd lkp-tests
bin/lkp qemu -k <bzImage> job-script # job-script is attached in this email



Thanks,
Oliver Sang


Attachments:
(No filename) (7.79 kB)
config-5.11.0-rc2-00426-g59ebc797229e (127.73 kB)
job-script (4.10 kB)
dmesg.xz (11.70 kB)
Download all attachments

2021-01-13 16:30:32

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [RFC PATCH v2 2/8] Add a reference to ucounts for each user


The subject is wrong. This should be:
[RFC PATCH v2 2/8] Add a reference to ucounts for each cred.

Further the explanation could use a little work. Something along the
lines of:

For RLIMIT_NPROC and some other rlimits the user_struct that holds the
global limit is kept alive for the lifetime of a process by keeping it
in struct cred. Add a ucounts reference to struct cred, so that
RLIMIT_NPROC can switch from using a per user limit to using a per user
per user namespace limit.

Nits about the code below.

Alexey Gladkov <[email protected]> writes:

> Before this, only the owner of the user namespace had an entry in ucounts.
> This entry addressed the user in the given user namespace.
>
> Now we create such an entry in ucounts for all users in the user namespace.
> Each user has only one entry for each user namespace.
>
> This commit is in preparation for migrating rlimits to ucounts.
>
> Signed-off-by: Alexey Gladkov <[email protected]>
> ---
> include/linux/cred.h | 1 +
> include/linux/user_namespace.h | 2 ++
> kernel/cred.c | 17 +++++++++++++++--
> kernel/ucount.c | 12 +++++++++++-
> kernel/user_namespace.c | 1 +
> 5 files changed, 30 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/cred.h b/include/linux/cred.h
> index 18639c069263..307744fcc387 100644
> --- a/include/linux/cred.h
> +++ b/include/linux/cred.h
> @@ -144,6 +144,7 @@ struct cred {
> #endif
> struct user_struct *user; /* real user ID subscription */
> struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
> + struct ucounts *ucounts;
> struct group_info *group_info; /* supplementary groups for euid/fsgid */
> /* RCU deletion */
> union {
> diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
> index 84fefa9247c4..483568a56f7f 100644
> --- a/include/linux/user_namespace.h
> +++ b/include/linux/user_namespace.h
> @@ -102,6 +102,8 @@ bool setup_userns_sysctls(struct user_namespace *ns);
> void retire_userns_sysctls(struct user_namespace *ns);
> struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type);
> void dec_ucount(struct ucounts *ucounts, enum ucount_type type);
> +void put_ucounts(struct ucounts *ucounts);
> +void set_cred_ucounts(const struct cred *cred, struct user_namespace *ns, kuid_t uid);
>
> #ifdef CONFIG_USER_NS
>
> diff --git a/kernel/cred.c b/kernel/cred.c
> index 421b1149c651..d19e2e97092c 100644
> --- a/kernel/cred.c
> +++ b/kernel/cred.c
> @@ -119,6 +119,7 @@ static void put_cred_rcu(struct rcu_head *rcu)
> if (cred->group_info)
> put_group_info(cred->group_info);
> free_uid(cred->user);
> + put_ucounts(cred->ucounts);
> put_user_ns(cred->user_ns);
> kmem_cache_free(cred_jar, cred);
> }
> @@ -144,6 +145,9 @@ void __put_cred(struct cred *cred)
> BUG_ON(cred == current->cred);
> BUG_ON(cred == current->real_cred);
>
> + BUG_ON(cred->ucounts == NULL);
> + BUG_ON(cred->ucounts->ns != cred->user_ns);
> +
> if (cred->non_rcu)
> put_cred_rcu(&cred->rcu);
> else
> @@ -271,6 +275,9 @@ struct cred *prepare_creds(void)
> get_uid(new->user);
> get_user_ns(new->user_ns);
>
> + new->ucounts = NULL;
> + set_cred_ucounts(new, new->user_ns, new->euid);
> +
This hunk should be:
atomic_inc(&new->count);

That means you get to skip the lookup by uid and user_ns which while it
should be cheap is completely unnecessary in this case.

> #ifdef CONFIG_KEYS
> key_get(new->session_keyring);
> key_get(new->process_keyring);
> @@ -363,6 +370,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
> ret = create_user_ns(new);
> if (ret < 0)
> goto error_put;
> + set_cred_ucounts(new, new->user_ns, new->euid);
> }
>
> #ifdef CONFIG_KEYS
> @@ -485,8 +493,11 @@ int commit_creds(struct cred *new)
> * in set_user().
> */
> alter_cred_subscribers(new, 2);
> - if (new->user != old->user)
> - atomic_inc(&new->user->processes);
> + if (new->user != old->user || new->user_ns != old->user_ns) {
> + if (new->user != old->user)
> + atomic_inc(&new->user->processes);
> + set_cred_ucounts(new, new->user_ns, new->euid);
> + }
> rcu_assign_pointer(task->real_cred, new);
> rcu_assign_pointer(task->cred, new);
> if (new->user != old->user)
> @@ -661,6 +672,7 @@ void __init cred_init(void)
> /* allocate a slab in which we can store credentials */
> cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0,
> SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
> + set_cred_ucounts(&init_cred, &init_user_ns, GLOBAL_ROOT_UID);
Unfortuantely this is needed here because this is the first cred
and there is no ucount reference to copy.
> }
>
> /**
> @@ -704,6 +716,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
> get_uid(new->user);
> get_user_ns(new->user_ns);
> get_group_info(new->group_info);
> + set_cred_ucounts(new, new->user_ns, new->euid);
This hunk should be:
atomic_inc(&new->count);

>
> #ifdef CONFIG_KEYS
> new->session_keyring = NULL;
> diff --git a/kernel/ucount.c b/kernel/ucount.c
> index 0f2c7c11df19..80a39073bcef 100644
> --- a/kernel/ucount.c
> +++ b/kernel/ucount.c
> @@ -161,7 +161,7 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
> return ucounts;
> }
>
> -static void put_ucounts(struct ucounts *ucounts)
> +void put_ucounts(struct ucounts *ucounts)
> {
> unsigned long flags;
>
> @@ -175,6 +175,16 @@ static void put_ucounts(struct ucounts *ucounts)
> kfree(ucounts);
> }
>
> +void set_cred_ucounts(const struct cred *cred, struct user_namespace *ns, kuid_t uid)
> +{
> + if (cred->ucounts) {
> + if (cred->ucounts->ns == ns && uid_eq(cred->ucounts->uid, uid))
> + return;
> + put_ucounts(cred->ucounts);
> + }
> + ((struct cred *) cred)->ucounts = get_ucounts(ns, uid);
> +}
> +

That can become:
void reset_cred_ucounts(struct cred *cred, struct user_namespace *ns, kuid_t uid)
{
struct ucounts *old = cred->ucounts;

if (old && old->ns && uid_eq(old->uid, uid))
return;

cred->ucounts = get_ucounts(ns, uid);
if (old)
put_ucounts(old);
}

Removing the const on struct cred will make any mistakes where you use
this with anything except a brand new cred show up at compile time.

Changing the tests around just makes it a little clearer what the code
is doing.

Changing the name emphasises that prepare_cred should not be using this
only commit_cred and friends where the ucounts may have changed.


> static inline bool atomic_inc_below(atomic_t *v, int u)
> {
> int c, old;
> diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
> index af612945a4d0..4b8a4468d391 100644
> --- a/kernel/user_namespace.c
> +++ b/kernel/user_namespace.c
> @@ -1280,6 +1280,7 @@ static int userns_install(struct nsset *nsset, struct ns_common *ns)
>
> put_user_ns(cred->user_ns);
> set_cred_user_ns(cred, get_user_ns(user_ns));
> + set_cred_ucounts(cred, user_ns, cred->euid);
>
> return 0;
> }