Received-SPF: pass (google.com: best guess record for domain of linux-kernel-owner@vger.kernel.org designates 209.132.180.67 as permitted sender) client-ip=209.132.180.67;
Date:   Thu, 25 Jan 2018 15:30:33 +0800
From:   Boqun Feng <boqun.feng@gmail.com>
To:     "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc:     lianglihao@huawei.com, guohanjun@huawei.com, heng.z@huawei.com,
        hb.chen@huawei.com, lihao.liang@gmail.com,
        linux-kernel@vger.kernel.org
Subject: Re: [PATCH RFC 01/16] prcu: Add PRCU implementation
Message-ID: <20180125073033.4rl7bun62newplb3@tardis>
References: <1516694381-20333-1-git-send-email-lianglihao@huawei.com>
 <1516694381-20333-2-git-send-email-lianglihao@huawei.com>
 <20180125061618.GU3741@linux.vnet.ibm.com>
MIME-Version: 1.0
Content-Type: multipart/signed; micalg=pgp-sha256;
        protocol="application/pgp-signature"; boundary="vaizlpdknh5hghcj"
Content-Disposition: inline
In-Reply-To: <20180125061618.GU3741@linux.vnet.ibm.com>
User-Agent: NeoMutt/20171215
Sender: linux-kernel-owner@vger.kernel.org
Precedence: bulk


--vaizlpdknh5hghcj
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
Content-Transfer-Encoding: quoted-printable

On Wed, Jan 24, 2018 at 10:16:18PM -0800, Paul E. McKenney wrote:
> On Tue, Jan 23, 2018 at 03:59:26PM +0800, lianglihao@huawei.com wrote:
> > From: Heng Zhang <heng.z@huawei.com>
> >=20
> > This RCU implementation (PRCU) is based on a fast consensus protocol
> > published in the following paper:
> >=20
> > Fast Consensus Using Bounded Staleness for Scalable Read-mostly Synchro=
nization.
> > Haibo Chen, Heng Zhang, Ran Liu, Binyu Zang, and Haibing Guan.
> > IEEE Transactions on Parallel and Distributed Systems (TPDS), 2016.
> > https://dl.acm.org/citation.cfm?id=3D3024114.3024143
> >=20
> > Signed-off-by: Heng Zhang <heng.z@huawei.com>
> > Signed-off-by: Lihao Liang <lianglihao@huawei.com>
>=20
> A few comments and questions interspersed.
>=20
> 							Thanx, Paul
>=20
> > ---
> >  include/linux/prcu.h |  37 +++++++++++++++
> >  kernel/rcu/Makefile  |   2 +-
> >  kernel/rcu/prcu.c    | 125 +++++++++++++++++++++++++++++++++++++++++++=
++++++++
> >  kernel/sched/core.c  |   2 +
> >  4 files changed, 165 insertions(+), 1 deletion(-)
> >  create mode 100644 include/linux/prcu.h
> >  create mode 100644 kernel/rcu/prcu.c
> >=20
> > diff --git a/include/linux/prcu.h b/include/linux/prcu.h
> > new file mode 100644
> > index 00000000..653b4633
> > --- /dev/null
> > +++ b/include/linux/prcu.h
> > @@ -0,0 +1,37 @@
> > +#ifndef __LINUX_PRCU_H
> > +#define __LINUX_PRCU_H
> > +
> > +#include <linux/atomic.h>
> > +#include <linux/mutex.h>
> > +#include <linux/wait.h>
> > +
> > +#define CONFIG_PRCU
> > +
> > +struct prcu_local_struct {
> > +	unsigned int locked;
> > +	unsigned int online;
> > +	unsigned long long version;
> > +};
> > +
> > +struct prcu_struct {
> > +	atomic64_t global_version;
> > +	atomic_t active_ctr;
> > +	struct mutex mtx;
> > +	wait_queue_head_t wait_q;
> > +};
> > +
> > +#ifdef CONFIG_PRCU
> > +void prcu_read_lock(void);
> > +void prcu_read_unlock(void);
> > +void synchronize_prcu(void);
> > +void prcu_note_context_switch(void);
> > +
> > +#else /* #ifdef CONFIG_PRCU */
> > +
> > +#define prcu_read_lock() do {} while (0)
> > +#define prcu_read_unlock() do {} while (0)
> > +#define synchronize_prcu() do {} while (0)
> > +#define prcu_note_context_switch() do {} while (0)
>=20
> If CONFIG_PRCU=3Dn and some code is built that uses PRCU, shouldn't you
> get a build error rather than an error-free but inoperative PRCU?
>=20
> Of course, Peter's question about purpose of the patch set applies
> here as well.
>=20
> > +
> > +#endif /* #ifdef CONFIG_PRCU */
> > +#endif /* __LINUX_PRCU_H */
> > diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
> > index 23803c7d..8791419c 100644
> > --- a/kernel/rcu/Makefile
> > +++ b/kernel/rcu/Makefile
> > @@ -2,7 +2,7 @@
> >  # and is generally not a function of system call inputs.
> >  KCOV_INSTRUMENT :=3D n
> >=20
> > -obj-y +=3D update.o sync.o
> > +obj-y +=3D update.o sync.o prcu.o
> >  obj-$(CONFIG_CLASSIC_SRCU) +=3D srcu.o
> >  obj-$(CONFIG_TREE_SRCU) +=3D srcutree.o
> >  obj-$(CONFIG_TINY_SRCU) +=3D srcutiny.o
> > diff --git a/kernel/rcu/prcu.c b/kernel/rcu/prcu.c
> > new file mode 100644
> > index 00000000..a00b9420
> > --- /dev/null
> > +++ b/kernel/rcu/prcu.c
> > @@ -0,0 +1,125 @@
> > +#include <linux/smp.h>
> > +#include <linux/prcu.h>
> > +#include <linux/percpu.h>
> > +#include <linux/compiler.h>
> > +#include <linux/sched.h>
> > +
> > +#include <asm/barrier.h>
> > +
> > +DEFINE_PER_CPU_SHARED_ALIGNED(struct prcu_local_struct, prcu_local);
> > +
> > +struct prcu_struct global_prcu =3D {
> > +	.global_version =3D ATOMIC64_INIT(0),
> > +	.active_ctr =3D ATOMIC_INIT(0),
> > +	.mtx =3D __MUTEX_INITIALIZER(global_prcu.mtx),
> > +	.wait_q =3D __WAIT_QUEUE_HEAD_INITIALIZER(global_prcu.wait_q)
> > +};
> > +struct prcu_struct *prcu =3D &global_prcu;
> > +
> > +static inline void prcu_report(struct prcu_local_struct *local)
> > +{
> > +	unsigned long long global_version;
> > +	unsigned long long local_version;
> > +
> > +	global_version =3D atomic64_read(&prcu->global_version);
> > +	local_version =3D local->version;
> > +	if (global_version > local_version)
> > +		cmpxchg(&local->version, local_version, global_version);
> > +}
> > +
> > +void prcu_read_lock(void)
> > +{
> > +	struct prcu_local_struct *local;
> > +
> > +	local =3D get_cpu_ptr(&prcu_local);
> > +	if (!local->online) {
> > +		WRITE_ONCE(local->online, 1);
> > +		smp_mb();
> > +	}
> > +
> > +	local->locked++;
> > +	put_cpu_ptr(&prcu_local);
> > +}
> > +EXPORT_SYMBOL(prcu_read_lock);
> > +
> > +void prcu_read_unlock(void)
> > +{
> > +	int locked;
> > +	struct prcu_local_struct *local;
> > +
> > +	barrier();
> > +	local =3D get_cpu_ptr(&prcu_local);
> > +	locked =3D local->locked;
> > +	if (locked) {
> > +		local->locked--;
> > +		if (locked =3D=3D 1)
> > +			prcu_report(local);
>=20
> Is ordering important here?  It looks to me that the compiler could
> rearrange some of the accesses within prcu_report() with the local->locked
> decrement.  There appears to be some potential for load and store tearing,
> though perhaps you have verified that your compiler avoids this on
> the architecture that you are using.
>=20
> > +		put_cpu_ptr(&prcu_local);
> > +	} else {
>=20
> Hmmm...  We get here if the RCU read-side critical section was preempted.
> If none of them are preempted, ->active_ctr remains zero.
>=20
> > +		put_cpu_ptr(&prcu_local);
> > +		if (!atomic_dec_return(&prcu->active_ctr))
> > +			wake_up(&prcu->wait_q);
> > +	}
> > +}
> > +EXPORT_SYMBOL(prcu_read_unlock);
> > +
> > +static void prcu_handler(void *info)
> > +{
> > +	struct prcu_local_struct *local;
> > +
> > +	local =3D this_cpu_ptr(&prcu_local);
> > +	if (!local->locked)

And I think an smp_mb() is needed here, because in the following case:

	CPU 0				CPU 1
	=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D		=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
	{X is initially 0}

	WRITE_ONCE(X, 1);

	prcu_read_unlock(void):
	  if (locked) {
	  				synchronize_prcu(void):
					  ...
					  <send IPI to CPU 0>
	    local->locked--;
	# switch to IPI
	  WRITE_ONCE(local->version,....)
	  				  <read CPU 0 version to be latest>
					  <return>

					r1 =3D READ_ONCE(X);

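r1 could be 0, which breaks RCU guarantees: CPU 0's read-side critical
section started before the grace period did, so its write to X must be
visible to CPU 1 once synchronize_prcu() has returned.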

> > +		WRITE_ONCE(local->version, atomic64_read(&prcu->global_version));
> > +}
> > +
> > +void synchronize_prcu(void)
> > +{
> > +	int cpu;
> > +	cpumask_t cpus;
> > +	unsigned long long version;
> > +	struct prcu_local_struct *local;
> > +
> > +	version =3D atomic64_add_return(1, &prcu->global_version);
> > +	mutex_lock(&prcu->mtx);
> > +
> > +	local =3D get_cpu_ptr(&prcu_local);
> > +	local->version =3D version;
> > +	put_cpu_ptr(&prcu_local);
> > +
> > +	cpumask_clear(&cpus);
> > +	for_each_possible_cpu(cpu) {
> > +		local =3D per_cpu_ptr(&prcu_local, cpu);
> > +		if (!READ_ONCE(local->online))
> > +			continue;
> > +		if (READ_ONCE(local->version) < version) {
>=20
> On 32-bit systems, given that ->version is long long, you might see
> load tearing.  And on some 32-bit systems, the cmpxchg() in prcu_hander()
> might not build.
>=20

/me is curious why an atomic64_t is used here for the global version.
I think maybe a 32-bit global version would still suffice.

Regards,
Boqun

> Or is the idea that only prcu_handler() updates ->version?  But in that
> case, you wouldn't need the READ_ONCE() above.  What am I missing here?
>=20
> > +			smp_call_function_single(cpu, prcu_handler, NULL, 0);
> > +			cpumask_set_cpu(cpu, &cpus);
> > +		}
> > +	}
> > +
> > +	for_each_cpu(cpu, &cpus) {
> > +		local =3D per_cpu_ptr(&prcu_local, cpu);
> > +		while (READ_ONCE(local->version) < version)
>=20
> This ->version read can also tear on some 32-bit systems, and this
> one most definitely can race with the prcu_handler() above.  Does the
> algorithm operate correctly in that case?  (It doesn't look that way
> to me, but I might be missing something.) Or are 32-bit systems excluded?
>=20
> > +			cpu_relax();
> > +	}
>=20
> I might be missing something, but I believe we need a memory barrier
> here on non-TSO systems.  Without that, couldn't we miss a preemption?
>=20
> > +
> > +	if (atomic_read(&prcu->active_ctr))
> > +		wait_event(prcu->wait_q, !atomic_read(&prcu->active_ctr));
> > +
> > +	mutex_unlock(&prcu->mtx);
> > +}
> > +EXPORT_SYMBOL(synchronize_prcu);
> > +
> > +void prcu_note_context_switch(void)
> > +{
> > +	struct prcu_local_struct *local;
> > +
> > +	local =3D get_cpu_ptr(&prcu_local);
> > +	if (local->locked) {
> > +		atomic_add(local->locked, &prcu->active_ctr);
> > +		local->locked =3D 0;
> > +	}
> > +	local->online =3D 0;
> > +	prcu_report(local);
> > +	put_cpu_ptr(&prcu_local);
> > +}
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 326d4f88..a308581b 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -15,6 +15,7 @@
> >  #include <linux/init_task.h>
> >  #include <linux/context_tracking.h>
> >  #include <linux/rcupdate_wait.h>
> > +#include <linux/prcu.h>
> >=20
> >  #include <linux/blkdev.h>
> >  #include <linux/kprobes.h>
> > @@ -3383,6 +3384,7 @@ static void __sched notrace __schedule(bool preem=
pt)
> >=20
> >  	local_irq_disable();
> >  	rcu_note_context_switch(preempt);
> > +	prcu_note_context_switch();
> >=20
> >  	/*
> >  	 * Make sure that signal_pending_state()->signal_pending() below
> > --=20
> > 2.14.1.729.g59c0ea183
> >=20
>=20

--vaizlpdknh5hghcj
Content-Type: application/pgp-signature; name="signature.asc"

-----BEGIN PGP SIGNATURE-----

iQEzBAABCAAdFiEEj5IosQTPz8XU1wRHSXnow7UH+rgFAlpph4YACgkQSXnow7UH
+rgLNggApGRdEu7tYB1wsnbaCWba77pJfR+iW6g2j1W1bKvW/PGJ/2r5R67c9zTG
fWmYOr5wLEFc2rQODWaANkyN/NVuXrZ5JwEzh28unx8Rpko3p4bp2KJ7J7Kmfe/K
+RmxW8CvdaxhuQKI+JhWV8u9rc0bjJz5CIASEWQwNphd/jP0cEAQsfy8boEgIlcC
xg5VGbPla06E79TexncB0fmqIYJJtTZQpZ5Gd5vj/ZruzJH4kIcwC/mWxf1TtdgB
oDuoSA9veOSDDKD390GzMJXbsF8M+8aZODkNolImjwCU1rro7mN6ZCBlyYsFYaU4
5Gm/imgWtTpViUf03E0e//f6Yxb9dQ==
=DrHH
-----END PGP SIGNATURE-----

--vaizlpdknh5hghcj--