struct pid's count is an atomic_t field used as a refcount. Use
refcount_t for it which is basically atomic_t but does additional
checking to prevent use-after-free bugs.
For memory ordering, the only change is with the following:
- if ((atomic_read(&pid->count) == 1) ||
- atomic_dec_and_test(&pid->count)) {
+ if (refcount_dec_and_test(&pid->count)) {
kmem_cache_free(ns->pid_cachep, pid);
Here the change is from:
Fully ordered --> RELEASE + ACQUIRE (as per refcount-vs-atomic.rst)
This ACQUIRE should take care of making sure the free happens after the
refcount_dec_and_test().
The above hunk also removes atomic_read() since it is not needed for the
code to work and it is unclear how beneficial it is. The removal lets
refcount_dec_and_test() check for cases where get_pid() happened before
the object was freed.
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Signed-off-by: Joel Fernandes (Google) <[email protected]>
---
Changed to RFC to get any feedback on the memory ordering.
include/linux/pid.h | 5 +++--
kernel/pid.c | 7 +++----
2 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 14a9a39da9c7..8cb86d377ff5 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -3,6 +3,7 @@
#define _LINUX_PID_H
#include <linux/rculist.h>
+#include <linux/refcount.h>
enum pid_type
{
@@ -56,7 +57,7 @@ struct upid {
struct pid
{
- atomic_t count;
+ refcount_t count;
unsigned int level;
/* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX];
@@ -69,7 +70,7 @@ extern struct pid init_struct_pid;
static inline struct pid *get_pid(struct pid *pid)
{
if (pid)
- atomic_inc(&pid->count);
+ refcount_inc(&pid->count);
return pid;
}
diff --git a/kernel/pid.c b/kernel/pid.c
index 20881598bdfa..89c4849fab5d 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -37,7 +37,7 @@
#include <linux/init_task.h>
#include <linux/syscalls.h>
#include <linux/proc_ns.h>
-#include <linux/proc_fs.h>
+#include <linux/refcount.h>
#include <linux/sched/task.h>
#include <linux/idr.h>
@@ -106,8 +106,7 @@ void put_pid(struct pid *pid)
return;
ns = pid->numbers[pid->level].ns;
- if ((atomic_read(&pid->count) == 1) ||
- atomic_dec_and_test(&pid->count)) {
+ if (refcount_dec_and_test(&pid->count)) {
kmem_cache_free(ns->pid_cachep, pid);
put_pid_ns(ns);
}
@@ -210,7 +209,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
}
get_pid_ns(ns);
- atomic_set(&pid->count, 1);
+ refcount_set(&pid->count, 1);
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(&pid->tasks[type]);
--
2.22.0.410.gd8fdbe21b5-goog
On Mon, Jun 24, 2019 at 02:45:34PM -0400, Joel Fernandes (Google) wrote:
> struct pid's count is an atomic_t field used as a refcount. Use
> refcount_t for it which is basically atomic_t but does additional
> checking to prevent use-after-free bugs.
>
> For memory ordering, the only change is with the following:
> - if ((atomic_read(&pid->count) == 1) ||
> - atomic_dec_and_test(&pid->count)) {
> + if (refcount_dec_and_test(&pid->count)) {
> kmem_cache_free(ns->pid_cachep, pid);
>
> Here the change is from:
> Fully ordered --> RELEASE + ACQUIRE (as per refcount-vs-atomic.rst)
> This ACQUIRE should take care of making sure the free happens after the
> refcount_dec_and_test().
>
> The above hunk also removes atomic_read() since it is not needed for the
> code to work and it is unclear how beneficial it is. The removal lets
> refcount_dec_and_test() check for cases where get_pid() happened before
> the object was freed.
>
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Signed-off-by: Joel Fernandes (Google) <[email protected]>
>
> ---
> Changed to RFC to get any feedback on the memory ordering.
I had a question about refcount_inc().
As per Documentation/core-api/refcount-vs-atomic.rst , it says:
A control dependency (on success) for refcounters guarantees that
if a reference for an object was successfully obtained (reference
counter increment or addition happened, function returned true),
then further stores are ordered against this operation.
However, in refcount_inc() I don't see any memory barriers (in the case where
CONFIG_REFCOUNT_FULL=n). Is the documentation wrong?
get_pid() does a refcount_inc() but doesn't have any memory barriers either.
thanks,
- Joel
>
> include/linux/pid.h | 5 +++--
> kernel/pid.c | 7 +++----
> 2 files changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/pid.h b/include/linux/pid.h
> index 14a9a39da9c7..8cb86d377ff5 100644
> --- a/include/linux/pid.h
> +++ b/include/linux/pid.h
> @@ -3,6 +3,7 @@
> #define _LINUX_PID_H
>
> #include <linux/rculist.h>
> +#include <linux/refcount.h>
>
> enum pid_type
> {
> @@ -56,7 +57,7 @@ struct upid {
>
> struct pid
> {
> - atomic_t count;
> + refcount_t count;
> unsigned int level;
> /* lists of tasks that use this pid */
> struct hlist_head tasks[PIDTYPE_MAX];
> @@ -69,7 +70,7 @@ extern struct pid init_struct_pid;
> static inline struct pid *get_pid(struct pid *pid)
> {
> if (pid)
> - atomic_inc(&pid->count);
> + refcount_inc(&pid->count);
> return pid;
> }
>
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 20881598bdfa..89c4849fab5d 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -37,7 +37,7 @@
> #include <linux/init_task.h>
> #include <linux/syscalls.h>
> #include <linux/proc_ns.h>
> -#include <linux/proc_fs.h>
> +#include <linux/refcount.h>
> #include <linux/sched/task.h>
> #include <linux/idr.h>
>
> @@ -106,8 +106,7 @@ void put_pid(struct pid *pid)
> return;
>
> ns = pid->numbers[pid->level].ns;
> - if ((atomic_read(&pid->count) == 1) ||
> - atomic_dec_and_test(&pid->count)) {
> + if (refcount_dec_and_test(&pid->count)) {
> kmem_cache_free(ns->pid_cachep, pid);
> put_pid_ns(ns);
> }
> @@ -210,7 +209,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
> }
>
> get_pid_ns(ns);
> - atomic_set(&pid->count, 1);
> + refcount_set(&pid->count, 1);
> for (type = 0; type < PIDTYPE_MAX; ++type)
> INIT_HLIST_HEAD(&pid->tasks[type]);
>
> --
> 2.22.0.410.gd8fdbe21b5-goog
On Mon, Jun 24, 2019 at 8:52 PM Joel Fernandes <[email protected]> wrote:
> On Mon, Jun 24, 2019 at 02:45:34PM -0400, Joel Fernandes (Google) wrote:
> > struct pid's count is an atomic_t field used as a refcount. Use
> > refcount_t for it which is basically atomic_t but does additional
> > checking to prevent use-after-free bugs.
> >
> > For memory ordering, the only change is with the following:
> > - if ((atomic_read(&pid->count) == 1) ||
> > - atomic_dec_and_test(&pid->count)) {
> > + if (refcount_dec_and_test(&pid->count)) {
> > kmem_cache_free(ns->pid_cachep, pid);
> >
> > Here the change is from:
> > Fully ordered --> RELEASE + ACQUIRE (as per refcount-vs-atomic.rst)
> > This ACQUIRE should take care of making sure the free happens after the
> > refcount_dec_and_test().
> >
> > The above hunk also removes atomic_read() since it is not needed for the
> > code to work and it is unclear how beneficial it is. The removal lets
> > refcount_dec_and_test() check for cases where get_pid() happened before
> > the object was freed.
[...]
> I had a question about refcount_inc().
>
> As per Documentation/core-api/refcount-vs-atomic.rst , it says:
>
> A control dependency (on success) for refcounters guarantees that
> if a reference for an object was successfully obtained (reference
> counter increment or addition happened, function returned true),
> then further stores are ordered against this operation.
>
> However, in refcount_inc() I don't see any memory barriers (in the case where
> CONFIG_REFCOUNT_FULL=n). Is the documentation wrong?
That part of the documentation only talks about cases where you have a
control dependency on the return value of the refcount operation. But
refcount_inc() does not return a value, so this isn't relevant for
refcount_inc().
Also, AFAIU, the control dependency mentioned in the documentation has
to exist *in the caller* - it's just pointing out that if you write
code like the following, you have a control dependency between the
refcount operation and the write:
if (refcount_inc_not_zero(&obj->refcount)) {
WRITE_ONCE(obj->x, y);
}
For more information on the details of this stuff, try reading the
section "CONTROL DEPENDENCIES" of Documentation/memory-barriers.txt.
On Mon, Jun 24, 2019 at 3:10 PM Jann Horn <[email protected]> wrote:
>
> On Mon, Jun 24, 2019 at 8:52 PM Joel Fernandes <[email protected]> wrote:
> > On Mon, Jun 24, 2019 at 02:45:34PM -0400, Joel Fernandes (Google) wrote:
> > > struct pid's count is an atomic_t field used as a refcount. Use
> > > refcount_t for it which is basically atomic_t but does additional
> > > checking to prevent use-after-free bugs.
> > >
> > > For memory ordering, the only change is with the following:
> > > - if ((atomic_read(&pid->count) == 1) ||
> > > - atomic_dec_and_test(&pid->count)) {
> > > + if (refcount_dec_and_test(&pid->count)) {
> > > kmem_cache_free(ns->pid_cachep, pid);
> > >
> > > Here the change is from:
> > > Fully ordered --> RELEASE + ACQUIRE (as per refcount-vs-atomic.rst)
> > > This ACQUIRE should take care of making sure the free happens after the
> > > refcount_dec_and_test().
> > >
> > > The above hunk also removes atomic_read() since it is not needed for the
> > > code to work and it is unclear how beneficial it is. The removal lets
> > > refcount_dec_and_test() check for cases where get_pid() happened before
> > > the object was freed.
> [...]
> > I had a question about refcount_inc().
> >
> > As per Documentation/core-api/refcount-vs-atomic.rst , it says:
> >
> > A control dependency (on success) for refcounters guarantees that
> > if a reference for an object was successfully obtained (reference
> > counter increment or addition happened, function returned true),
> > then further stores are ordered against this operation.
> >
> > However, in refcount_inc() I don't see any memory barriers (in the case where
> > CONFIG_REFCOUNT_FULL=n). Is the documentation wrong?
>
> That part of the documentation only talks about cases where you have a
> control dependency on the return value of the refcount operation. But
> refcount_inc() does not return a value, so this isn't relevant for
> refcount_inc().
>
> Also, AFAIU, the control dependency mentioned in the documentation has
> to exist *in the caller* - it's just pointing out that if you write
> code like the following, you have a control dependency between the
> refcount operation and the write:
>
> if (refcount_inc_not_zero(&obj->refcount)) {
> WRITE_ONCE(obj->x, y);
> }
>
> For more information on the details of this stuff, try reading the
> section "CONTROL DEPENDENCIES" of Documentation/memory-barriers.txt.
Makes sense now, thank you Jann!
- Joel
On Mon, Jun 24, 2019 at 09:10:15PM +0200, Jann Horn wrote:
> That part of the documentation only talks about cases where you have a
> control dependency on the return value of the refcount operation. But
> refcount_inc() does not return a value, so this isn't relevant for
> refcount_inc().
>
> Also, AFAIU, the control dependency mentioned in the documentation has
> to exist *in the caller* - it's just pointing out that if you write
> code like the following, you have a control dependency between the
> refcount operation and the write:
>
> if (refcount_inc_not_zero(&obj->refcount)) {
> WRITE_ONCE(obj->x, y);
> }
>
> For more information on the details of this stuff, try reading the
> section "CONTROL DEPENDENCIES" of Documentation/memory-barriers.txt.
IIRC the argument went as follows:
- if you use refcount_inc(), you've already got a stable object and
have ACQUIRED it otherwise, typically through locking.
- if you use refcount_inc_not_zero(), you have a semi stable object
(RCU), but you still need to ensure any changes to the object happen
after acquiring a reference, and this is where the control dependency
comes in as Jann already explained.
Specifically, it would be bad to allow STOREs to happen before we know
the refcount isn't 0, as that would be a UaF.
Also see the comment in lib/refcount.c.
On Tue, Jun 25, 2019 at 09:34:07AM +0200, Peter Zijlstra wrote:
> On Mon, Jun 24, 2019 at 09:10:15PM +0200, Jann Horn wrote:
> > That part of the documentation only talks about cases where you have a
> > control dependency on the return value of the refcount operation. But
> > refcount_inc() does not return a value, so this isn't relevant for
> > refcount_inc().
> >
> > Also, AFAIU, the control dependency mentioned in the documentation has
> > to exist *in the caller* - it's just pointing out that if you write
> > code like the following, you have a control dependency between the
> > refcount operation and the write:
> >
> > if (refcount_inc_not_zero(&obj->refcount)) {
> > WRITE_ONCE(obj->x, y);
> > }
> >
> > For more information on the details of this stuff, try reading the
> > section "CONTROL DEPENDENCIES" of Documentation/memory-barriers.txt.
>
> IIRC the argument went as follows:
>
> - if you use refcount_inc(), you've already got a stable object and
> have ACQUIRED it otherwise, typically through locking.
>
> - if you use refcount_inc_not_zero(), you have a semi stable object
> (RCU), but you still need to ensure any changes to the object happen
> after acquiring a reference, and this is where the control dependency
> comes in as Jann already explained.
>
> Specifically, it would be bad to allow STOREs to happen before we know
> the refcount isn't 0, as that would be a UaF.
>
> Also see the comment in lib/refcount.c.
>
Thanks a lot for the explanations and the pointers to the comment in
lib/refcount.c . It makes it really clearly.
Also, does this patch look good to you? If so and if ok with you, could you
Ack it? The patch is not really "RFC" but I still tagged it as such since I
wanted to have this discussion.
Thanks!
- Joel
On Mon, Jun 24, 2019 at 02:45:34PM -0400, Joel Fernandes (Google) wrote:
> struct pid's count is an atomic_t field used as a refcount. Use
> refcount_t for it which is basically atomic_t but does additional
> checking to prevent use-after-free bugs.
>
> For memory ordering, the only change is with the following:
> - if ((atomic_read(&pid->count) == 1) ||
> - atomic_dec_and_test(&pid->count)) {
> + if (refcount_dec_and_test(&pid->count)) {
> kmem_cache_free(ns->pid_cachep, pid);
>
> Here the change is from:
> Fully ordered --> RELEASE + ACQUIRE (as per refcount-vs-atomic.rst)
> This ACQUIRE should take care of making sure the free happens after the
> refcount_dec_and_test().
>
> The above hunk also removes atomic_read() since it is not needed for the
> code to work and it is unclear how beneficial it is. The removal lets
> refcount_dec_and_test() check for cases where get_pid() happened before
> the object was freed.
>
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Signed-off-by: Joel Fernandes (Google) <[email protected]>
As always with these matters, it's quite possible that I'm missing
something subtle here; said this, ;-) the patch does look good to
me: FWIW,
Reviewed-by: Andrea Parri <[email protected]>
Thanks,
Andrea
>
> ---
> Changed to RFC to get any feedback on the memory ordering.
>
>
> include/linux/pid.h | 5 +++--
> kernel/pid.c | 7 +++----
> 2 files changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/pid.h b/include/linux/pid.h
> index 14a9a39da9c7..8cb86d377ff5 100644
> --- a/include/linux/pid.h
> +++ b/include/linux/pid.h
> @@ -3,6 +3,7 @@
> #define _LINUX_PID_H
>
> #include <linux/rculist.h>
> +#include <linux/refcount.h>
>
> enum pid_type
> {
> @@ -56,7 +57,7 @@ struct upid {
>
> struct pid
> {
> - atomic_t count;
> + refcount_t count;
> unsigned int level;
> /* lists of tasks that use this pid */
> struct hlist_head tasks[PIDTYPE_MAX];
> @@ -69,7 +70,7 @@ extern struct pid init_struct_pid;
> static inline struct pid *get_pid(struct pid *pid)
> {
> if (pid)
> - atomic_inc(&pid->count);
> + refcount_inc(&pid->count);
> return pid;
> }
>
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 20881598bdfa..89c4849fab5d 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -37,7 +37,7 @@
> #include <linux/init_task.h>
> #include <linux/syscalls.h>
> #include <linux/proc_ns.h>
> -#include <linux/proc_fs.h>
> +#include <linux/refcount.h>
> #include <linux/sched/task.h>
> #include <linux/idr.h>
>
> @@ -106,8 +106,7 @@ void put_pid(struct pid *pid)
> return;
>
> ns = pid->numbers[pid->level].ns;
> - if ((atomic_read(&pid->count) == 1) ||
> - atomic_dec_and_test(&pid->count)) {
> + if (refcount_dec_and_test(&pid->count)) {
> kmem_cache_free(ns->pid_cachep, pid);
> put_pid_ns(ns);
> }
> @@ -210,7 +209,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
> }
>
> get_pid_ns(ns);
> - atomic_set(&pid->count, 1);
> + refcount_set(&pid->count, 1);
> for (type = 0; type < PIDTYPE_MAX; ++type)
> INIT_HLIST_HEAD(&pid->tasks[type]);
>
> --
> 2.22.0.410.gd8fdbe21b5-goog