The pattern of wait_event(percpu_ref_is_zero()) has been used in several
kernel components, and this way actually has the following risk:
- percpu_ref_is_zero() can be returned just between
atomic_long_sub_and_test() and ref->data->release(ref)
- given the refcount is found as zero, percpu_ref_exit() could
be called, and the host data structure is freed
- then use-after-free is triggered in ->release() when the user host
data structure is freed after percpu_ref_exit() returns
Reported-by: Zhong Jinghua <[email protected]>
Fixes: 2b0d3d3e4fcf ("percpu_ref: reduce memory footprint of percpu_ref in fast path")
Signed-off-by: Ming Lei <[email protected]>
---
include/linux/percpu-refcount.h | 41 ++++++++++++++++++++++-----------
lib/percpu-refcount.c | 22 ++++++++++++++++++
2 files changed, 50 insertions(+), 13 deletions(-)
diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 006c6aae261e..6ef29ebffd58 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -55,6 +55,7 @@
#include <linux/rcupdate.h>
#include <linux/types.h>
#include <linux/gfp.h>
+#include <linux/sched.h>
struct percpu_ref;
typedef void (percpu_ref_func_t)(struct percpu_ref *);
@@ -104,6 +105,7 @@ struct percpu_ref_data {
bool force_atomic:1;
bool allow_reinit:1;
bool auto_exit:1;
+ bool being_release:1;
struct rcu_head rcu;
struct percpu_ref *ref;
};
@@ -137,6 +139,7 @@ void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
void percpu_ref_resurrect(struct percpu_ref *ref);
void percpu_ref_reinit(struct percpu_ref *ref);
bool percpu_ref_is_zero(struct percpu_ref *ref);
+wait_queue_head_t *percpu_ref_get_switch_waitq(void);
/**
* percpu_ref_kill - drop the initial ref
@@ -319,6 +322,29 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
return ret;
}
+/* Internal helper, please do not call it outside */
+static inline void __percpu_ref_put_many(struct percpu_ref *ref,
+ unsigned long nr)
+{
+ struct percpu_ref_data *data = ref->data;
+ struct percpu_ref copy = *ref;
+ bool release = false;
+
+ data->being_release = 1;
+ if (unlikely(atomic_long_sub_and_test(nr, &data->count))) {
+ data->release(ref);
+ release = true;
+ }
+ data->being_release = 0;
+
+ if (release) {
+ if (data->auto_exit)
+ percpu_ref_exit(©);
+ /* re-use switch waitq for ack the release done */
+ wake_up_all(percpu_ref_get_switch_waitq());
+ }
+}
+
/**
* percpu_ref_put_many - decrement a percpu refcount
* @ref: percpu_ref to put
@@ -337,19 +363,8 @@ static inline void percpu_ref_put_many(struct percpu_ref *ref, unsigned long nr)
if (__ref_is_percpu(ref, &percpu_count))
this_cpu_sub(*percpu_count, nr);
- else {
- struct percpu_ref_data *data = ref->data;
- struct percpu_ref copy = *ref;
- bool release = false;
-
- if (unlikely(atomic_long_sub_and_test(nr, &data->count))) {
- data->release(ref);
- release = true;
- }
-
- if (release && data->auto_exit)
- percpu_ref_exit(©);
- }
+ else
+ __percpu_ref_put_many(ref, nr);
rcu_read_unlock();
}
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index c0cadf92948f..fd50eda233ed 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -140,6 +140,22 @@ void percpu_ref_exit(struct percpu_ref *ref)
if (!data)
return;
+ /*
+ * We may reach here because wait_event(percpu_ref_is_zero())
+ * returns, and ->release() may not be completed or even started
+ * ye, then use-after-free is caused, so drain ->release() here
+ */
+ if (!data->auto_exit) {
+ /*
+ * Order reading the atomic count in percpu_ref_is_zero
+ * and reading data->being_release. The counter pair is
+ * the one implied in atomic_long_sub_and_test() called
+ * from __percpu_ref_put_many().
+ */
+ smp_rmb();
+ wait_event(percpu_ref_switch_waitq, !data->being_release);
+ }
+
spin_lock_irqsave(&percpu_ref_switch_lock, flags);
ref->percpu_count_ptr |= atomic_long_read(&ref->data->count) <<
__PERCPU_REF_FLAG_BITS;
@@ -480,3 +496,9 @@ void percpu_ref_resurrect(struct percpu_ref *ref)
spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_resurrect);
+
+wait_queue_head_t *percpu_ref_get_switch_waitq()
+{
+ return &percpu_ref_switch_waitq;
+}
+EXPORT_SYMBOL_GPL(percpu_ref_get_switch_waitq);
--
2.38.1
On Wed, Dec 14, 2022 at 04:16:51PM +0800, Hillf Danton wrote:
> On 14 Dec 2022 10:51:01 +0800 Ming Lei <[email protected]>
> > The pattern of wait_event(percpu_ref_is_zero()) has been used in several
>
> For example?
blk_mq_freeze_queue_wait() and target_wait_for_sess_cmds().
>
> > kernel components, and this way actually has the following risk:
> >
> > - percpu_ref_is_zero() can be returned just between
> > atomic_long_sub_and_test() and ref->data->release(ref)
> >
> > - given the refcount is found as zero, percpu_ref_exit() could
> > be called, and the host data structure is freed
> >
> > - then use-after-free is triggered in ->release() when the user host
> > data structure is freed after percpu_ref_exit() returns
>
> The race between exit and the release callback should be considered at the
> corresponding callsite, given the comment below, and closed for instance
> by synchronizing rcu.
>
> /**
> * percpu_ref_put_many - decrement a percpu refcount
> * @ref: percpu_ref to put
> * @nr: number of references to put
> *
> * Decrement the refcount, and if 0, call the release function (which was passed
> * to percpu_ref_init())
> *
> * This function is safe to call as long as @ref is between init and exit.
> */
Not sure if the above comment implies that the callsite should cover the
race.
But blk-mq can really avoid the trouble by using the existed call_rcu():
diff --git a/block/blk-core.c b/block/blk-core.c
index 3866b6c4cd88..9321767470dc 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -254,14 +254,15 @@ EXPORT_SYMBOL_GPL(blk_clear_pm_only);
static void blk_free_queue_rcu(struct rcu_head *rcu_head)
{
- kmem_cache_free(blk_requestq_cachep,
- container_of(rcu_head, struct request_queue, rcu_head));
+ struct request_queue *q = container_of(rcu_head,
+ struct request_queue, rcu_head);
+
+ percpu_ref_exit(&q->q_usage_counter);
+ kmem_cache_free(blk_requestq_cachep, q);
}
static void blk_free_queue(struct request_queue *q)
{
- percpu_ref_exit(&q->q_usage_counter);
-
if (q->poll_stat)
blk_stat_remove_callback(q, q->poll_cb);
blk_stat_free_callback(q->poll_cb);
Thanks,
Ming
Hello,
On Wed, Dec 14, 2022 at 09:30:08PM +0800, Ming Lei wrote:
> On Wed, Dec 14, 2022 at 04:16:51PM +0800, Hillf Danton wrote:
> > On 14 Dec 2022 10:51:01 +0800 Ming Lei <[email protected]>
> > > The pattern of wait_event(percpu_ref_is_zero()) has been used in several
> >
> > For example?
>
> blk_mq_freeze_queue_wait() and target_wait_for_sess_cmds().
>
> >
> > > kernel components, and this way actually has the following risk:
> > >
> > > - percpu_ref_is_zero() can be returned just between
> > > atomic_long_sub_and_test() and ref->data->release(ref)
> > >
> > > - given the refcount is found as zero, percpu_ref_exit() could
> > > be called, and the host data structure is freed
> > >
> > > - then use-after-free is triggered in ->release() when the user host
> > > data structure is freed after percpu_ref_exit() returns
> >
> > The race between exit and the release callback should be considered at the
> > corresponding callsite, given the comment below, and closed for instance
> > by synchronizing rcu.
> >
> > /**
> > * percpu_ref_put_many - decrement a percpu refcount
> > * @ref: percpu_ref to put
> > * @nr: number of references to put
> > *
> > * Decrement the refcount, and if 0, call the release function (which was passed
> > * to percpu_ref_init())
> > *
> > * This function is safe to call as long as @ref is between init and exit.
> > */
>
> Not sure if the above comment implies that the callsite should cover the
> race.
>
> But blk-mq can really avoid the trouble by using the existed call_rcu():
>
I struggle with the dependency on release(). release() itself should not
block, but a common pattern would be to through a call_rcu() in and
schedule additional work - see block/blk-cgroup.c, blkg_release().
I think the dependency really is the completion of release() and the
work scheduled on it's behalf rather than strictly starting the
release() callback. This series doesn't preclude that from happening.
/**
* percpu_ref_exit - undo percpu_ref_init()
* @ref: percpu_ref to exit
*
* This function exits @ref. The caller is responsible for ensuring that
* @ref is no longer in active use. The usual places to invoke this
* function from are the @ref->release() callback or in init failure path
* where percpu_ref_init() succeeded but other parts of the initialization
* of the embedding object failed.
*/
I think the percpu_ref_exit() comment explains the more common use case
approach to percpu refcounts. release() triggering percpu_ref_exit() is
the ideal case.
Thanks,
Dennis
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 3866b6c4cd88..9321767470dc 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -254,14 +254,15 @@ EXPORT_SYMBOL_GPL(blk_clear_pm_only);
>
> static void blk_free_queue_rcu(struct rcu_head *rcu_head)
> {
> - kmem_cache_free(blk_requestq_cachep,
> - container_of(rcu_head, struct request_queue, rcu_head));
> + struct request_queue *q = container_of(rcu_head,
> + struct request_queue, rcu_head);
> +
> + percpu_ref_exit(&q->q_usage_counter);
> + kmem_cache_free(blk_requestq_cachep, q);
> }
>
> static void blk_free_queue(struct request_queue *q)
> {
> - percpu_ref_exit(&q->q_usage_counter);
> -
> if (q->poll_stat)
> blk_stat_remove_callback(q, q->poll_cb);
> blk_stat_free_callback(q->poll_cb);
>
>
> Thanks,
> Ming
>
On Wed, Dec 14, 2022 at 08:07:28AM -0800, Dennis Zhou wrote:
> Hello,
>
> On Wed, Dec 14, 2022 at 09:30:08PM +0800, Ming Lei wrote:
> > On Wed, Dec 14, 2022 at 04:16:51PM +0800, Hillf Danton wrote:
> > > On 14 Dec 2022 10:51:01 +0800 Ming Lei <[email protected]>
> > > > The pattern of wait_event(percpu_ref_is_zero()) has been used in several
> > >
> > > For example?
> >
> > blk_mq_freeze_queue_wait() and target_wait_for_sess_cmds().
> >
> > >
> > > > kernel components, and this way actually has the following risk:
> > > >
> > > > - percpu_ref_is_zero() can be returned just between
> > > > atomic_long_sub_and_test() and ref->data->release(ref)
> > > >
> > > > - given the refcount is found as zero, percpu_ref_exit() could
> > > > be called, and the host data structure is freed
> > > >
> > > > - then use-after-free is triggered in ->release() when the user host
> > > > data structure is freed after percpu_ref_exit() returns
> > >
> > > The race between exit and the release callback should be considered at the
> > > corresponding callsite, given the comment below, and closed for instance
> > > by synchronizing rcu.
> > >
> > > /**
> > > * percpu_ref_put_many - decrement a percpu refcount
> > > * @ref: percpu_ref to put
> > > * @nr: number of references to put
> > > *
> > > * Decrement the refcount, and if 0, call the release function (which was passed
> > > * to percpu_ref_init())
> > > *
> > > * This function is safe to call as long as @ref is between init and exit.
> > > */
> >
> > Not sure if the above comment implies that the callsite should cover the
> > race.
> >
> > But blk-mq can really avoid the trouble by using the existed call_rcu():
> >
>
> I struggle with the dependency on release(). release() itself should not
> block, but a common pattern would be to through a call_rcu() in and
Yes, release() is called with rcu read lock, and I guess the trouble may
be originated from the fact release() may do nothing related with
actual data releasing.
> schedule additional work - see block/blk-cgroup.c, blkg_release().
I believe the pattern is user specific, and the motivation of using call_rcu
can't be just for avoiding such potential race between release() and
percpu_ref_exit().
>
> I think the dependency really is the completion of release() and the
> work scheduled on it's behalf rather than strictly starting the
> release() callback. This series doesn't preclude that from happening.
Yeah.
For any additional work or sort of thing scheduled in release(), only
the caller can guarantee they are drained before percpu_exit_ref(), so
I agree now it is better for caller to avoid the race.
>
> /**
> * percpu_ref_exit - undo percpu_ref_init()
> * @ref: percpu_ref to exit
> *
> * This function exits @ref. The caller is responsible for ensuring that
> * @ref is no longer in active use. The usual places to invoke this
> * function from are the @ref->release() callback or in init failure path
> * where percpu_ref_init() succeeded but other parts of the initialization
> * of the embedding object failed.
> */
>
> I think the percpu_ref_exit() comment explains the more common use case
> approach to percpu refcounts. release() triggering percpu_ref_exit() is
> the ideal case.
But most of callers don't use in this way actually.
Thanks,
Ming
On Wed, Dec 14, 2022 at 10:51:01AM +0800, Ming Lei wrote:
> The pattern of wait_event(percpu_ref_is_zero()) has been used in several
> kernel components, and this way actually has the following risk:
I'd much rather see those components updated to wait for notification from
->release rather than doing this or update percpu_ref_is_zero() to wait for
->release() to finish.
Thanks.
--
tejun
Hi Ming,
I love your patch! Yet something to improve:
[auto build test ERROR on dennis-percpu/for-next]
[also build test ERROR on linus/master v6.1 next-20221216]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Ming-Lei/lib-percpu-refcount-fix-use-after-free-by-late-release/20221214-105440
base: https://git.kernel.org/pub/scm/linux/kernel/git/dennis/percpu.git for-next
patch link: https://lore.kernel.org/r/20221214025101.1268437-4-ming.lei%40redhat.com
patch subject: [PATCH 3/3] lib/percpu-refcount: drain ->release() in perpcu_ref_exit()
config: s390-buildonly-randconfig-r006-20221215
compiler: clang version 16.0.0 (https://github.com/llvm/llvm-project 98b13979fb05f3ed288a900deb843e7b27589e58)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# install s390 cross compiling tool for clang build
# apt-get install binutils-s390x-linux-gnu
# https://github.com/intel-lab-lkp/linux/commit/29f8a1f1e11734bdaf885ad8e9bcd614c2087fc0
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Ming-Lei/lib-percpu-refcount-fix-use-after-free-by-late-release/20221214-105440
git checkout 29f8a1f1e11734bdaf885ad8e9bcd614c2087fc0
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=s390 olddefconfig
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=s390 SHELL=/bin/bash
If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <[email protected]>
All errors (new ones prefixed by >>):
>> lib/percpu-refcount.c:499:47: error: a function declaration without a prototype is deprecated in all versions of C [-Werror,-Wstrict-prototypes]
wait_queue_head_t *percpu_ref_get_switch_waitq()
^
void
1 error generated.
vim +499 lib/percpu-refcount.c
498
> 499 wait_queue_head_t *percpu_ref_get_switch_waitq()
--
0-DAY CI Kernel Test Service
https://01.org/lkp
Hi Ming,
I love your patch! Perhaps something to improve:
[auto build test WARNING on dennis-percpu/for-next]
[also build test WARNING on linus/master v6.1 next-20221216]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Ming-Lei/lib-percpu-refcount-fix-use-after-free-by-late-release/20221214-105440
base: https://git.kernel.org/pub/scm/linux/kernel/git/dennis/percpu.git for-next
patch link: https://lore.kernel.org/r/20221214025101.1268437-4-ming.lei%40redhat.com
patch subject: [PATCH 3/3] lib/percpu-refcount: drain ->release() in perpcu_ref_exit()
config: i386-randconfig-a001
compiler: gcc-11 (Debian 11.3.0-8) 11.3.0
reproduce (this is a W=1 build):
# https://github.com/intel-lab-lkp/linux/commit/29f8a1f1e11734bdaf885ad8e9bcd614c2087fc0
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Ming-Lei/lib-percpu-refcount-fix-use-after-free-by-late-release/20221214-105440
git checkout 29f8a1f1e11734bdaf885ad8e9bcd614c2087fc0
# save the config file
mkdir build_dir && cp config build_dir/.config
make W=1 O=build_dir ARCH=i386 olddefconfig
make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash
If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <[email protected]>
All warnings (new ones prefixed by >>):
lib/percpu-refcount.c: In function 'percpu_ref_get_switch_waitq':
>> lib/percpu-refcount.c:499:20: warning: old-style function definition [-Wold-style-definition]
499 | wait_queue_head_t *percpu_ref_get_switch_waitq()
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~
vim +499 lib/percpu-refcount.c
498
> 499 wait_queue_head_t *percpu_ref_get_switch_waitq()
--
0-DAY CI Kernel Test Service
https://01.org/lkp
Hi Ming,
I love your patch! Perhaps something to improve:
[auto build test WARNING on dennis-percpu/for-next]
[also build test WARNING on linus/master v6.1 next-20221216]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Ming-Lei/lib-percpu-refcount-fix-use-after-free-by-late-release/20221214-105440
base: https://git.kernel.org/pub/scm/linux/kernel/git/dennis/percpu.git for-next
patch link: https://lore.kernel.org/r/20221214025101.1268437-4-ming.lei%40redhat.com
patch subject: [PATCH 3/3] lib/percpu-refcount: drain ->release() in perpcu_ref_exit()
config: alpha-randconfig-s053-20221216
compiler: alpha-linux-gcc (GCC) 12.1.0
reproduce:
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# apt-get install sparse
# sparse version: v0.6.4-39-gce1a6720-dirty
# https://github.com/intel-lab-lkp/linux/commit/29f8a1f1e11734bdaf885ad8e9bcd614c2087fc0
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Ming-Lei/lib-percpu-refcount-fix-use-after-free-by-late-release/20221214-105440
git checkout 29f8a1f1e11734bdaf885ad8e9bcd614c2087fc0
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' O=build_dir ARCH=alpha olddefconfig
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' O=build_dir ARCH=alpha SHELL=/bin/bash
If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <[email protected]>
sparse warnings: (new ones prefixed by >>)
>> lib/percpu-refcount.c:499:48: sparse: sparse: non-ANSI function declaration of function 'percpu_ref_get_switch_waitq'
vim +/percpu_ref_get_switch_waitq +499 lib/percpu-refcount.c
498
> 499 wait_queue_head_t *percpu_ref_get_switch_waitq()
--
0-DAY CI Kernel Test Service
https://01.org/lkp