From: Jan H. Schönherr <jschoenh@amazon.de>
To: Ingo Molnar, Peter Zijlstra
Cc: Jan H. Schönherr <jschoenh@amazon.de>, linux-kernel@vger.kernel.org
Subject: [RFC 41/60] cosched: Introduce locking for leader activities
Date: Fri, 7 Sep 2018 23:40:28 +0200
Message-Id: <20180907214047.26914-42-jschoenh@amazon.de>
In-Reply-To: <20180907214047.26914-1-jschoenh@amazon.de>
References: <20180907214047.26914-1-jschoenh@amazon.de>

With hierarchical runqueues and locks at each level, it is often
necessary to take multiple locks. Introduce the first of two locking
strategies, suitable for typical leader activities. To avoid deadlocks,
the general rule is that multiple locks have to be taken from bottom
to top.
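As an illustration of that rule (a sketch only, not part of the patch;
it assumes the parent_rq() hierarchy helper used by the code below and
interrupts already disabled, as required for runqueue locks):

	struct rq *rq = cpu_rq(cpu);	/* bottom of the hierarchy */
	struct rq *prq = parent_rq(rq);	/* next level up */

	raw_spin_lock(&rq->lock);	/* always lock the bottom first... */
	raw_spin_lock(&prq->lock);	/* ...then the level above it */
	/* ... operate on both levels ... */
	raw_spin_unlock(&prq->lock);
	raw_spin_unlock(&rq->lock);

Because every path acquires overlapping locks in this same bottom-to-top
order, no two paths can ever wait on each other's locks in a cycle.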
Leaders make scheduling decisions and perform the necessary maintenance
for their part of the runqueue hierarchy. Hence, they need to gather
locks for all runqueues they own, so that they can operate freely on
them. Provide two functions that do that: rq_lock_owned() and
rq_unlock_owned(). Typically, these walk from the already locked
per-CPU runqueue upwards, locking/unlocking runqueues as they go along,
and stopping when they would leave their area of responsibility. (A
usage sketch follows the patch below.)

Signed-off-by: Jan H. Schönherr <jschoenh@amazon.de>
---
 kernel/sched/cosched.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h   | 11 ++++++
 2 files changed, 105 insertions(+)

diff --git a/kernel/sched/cosched.c b/kernel/sched/cosched.c
index 1b442e20faad..df62ee6d0520 100644
--- a/kernel/sched/cosched.c
+++ b/kernel/sched/cosched.c
@@ -514,3 +514,97 @@ void cosched_offline_group(struct task_group *tg)
 	taskgroup_for_each_cfsrq(tg, cfs)
 		list_del_rcu(&cfs->sdrq.tg_siblings);
 }
+
+/*****************************************************************************
+ * Locking related functions
+ *****************************************************************************/
+
+/*
+ * Lock owned part of the runqueue hierarchy from the specified runqueue
+ * upwards.
+ *
+ * You may call rq_lock_owned() again in some nested code path. Currently, this
+ * is needed for put_prev_task(), which is sometimes called from within
+ * pick_next_task_fair(), and for throttle_cfs_rq(), which is sometimes called
+ * during enqueuing and dequeuing.
+ *
+ * When not called nested, returns the uppermost locked runqueue; used by
+ * pick_next_task_fair() to avoid going up the hierarchy again.
+ */
+struct rq *rq_lock_owned(struct rq *rq, struct rq_owner_flags *orf)
+{
+	int cpu = rq->sdrq_data.leader;
+	struct rq *ret = rq;
+
+	lockdep_assert_held(&rq->lock);
+
+	orf->nested = rq->sdrq_data.parent_locked;
+	if (orf->nested)
+		return NULL;
+
+	orf->cookie = lockdep_cookie();
+
+	WARN_ON_ONCE(!irqs_disabled());
+
+	/* Lowest level is already locked, begin with next level */
+	rq = parent_rq(rq);
+
+	while (rq) {
+		/*
+		 * FIXME: This avoids ascending the hierarchy, if upper
+		 * levels are not in use. Can we do this with leader==-1
+		 * instead?
+		 */
+		if (root_task_group.scheduled < rq->sdrq_data.level)
+			break;
+
+		/*
+		 * Leadership is always taken, never given; if we're not
+		 * already the leader, we won't be after taking the lock.
+		 */
+		if (cpu != READ_ONCE(rq->sdrq_data.leader))
+			break;
+
+		rq_lock(rq, &rq->sdrq_data.rf);
+
+		/* Did we race with a leadership change? */
+		if (cpu != READ_ONCE(rq->sdrq_data.leader)) {
+			rq_unlock(rq, &rq->sdrq_data.rf);
+			break;
+		}
+
+		/* Apply the cookie that's not stored with the data structure */
+		lockdep_repin_lock(&rq->lock, orf->cookie);
+
+		ret->sdrq_data.parent_locked = true;
+		update_rq_clock(rq);
+		ret = rq;
+
+		rq = parent_rq(rq);
+	}
+
+	return ret;
+}
+
+void rq_unlock_owned(struct rq *rq, struct rq_owner_flags *orf)
+{
+	bool parent_locked = rq->sdrq_data.parent_locked;
+
+	if (orf->nested)
+		return;
+
+	/* Lowest level must stay locked, begin with next level */
+	lockdep_assert_held(&rq->lock);
+	rq->sdrq_data.parent_locked = false;
+
+	while (parent_locked) {
+		rq = parent_rq(rq);
+		lockdep_assert_held(&rq->lock);
+
+		parent_locked = rq->sdrq_data.parent_locked;
+		rq->sdrq_data.parent_locked = false;
+
+		lockdep_unpin_lock(&rq->lock, orf->cookie);
+		rq_unlock(rq, &rq->sdrq_data.rf);
+	}
+}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0dfefa31704e..7dba8fdc48c7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -506,6 +506,13 @@ struct rq_flags {
 #endif
 };
 
+struct rq_owner_flags {
+#ifdef CONFIG_COSCHEDULING
+	bool nested;
+	struct pin_cookie cookie;
+#endif
+};
+
 #ifdef CONFIG_COSCHEDULING
 struct sdrq_data {
 	/*
@@ -1197,6 +1204,8 @@ void cosched_init_sdrq(struct task_group *tg, struct cfs_rq *cfs,
 			struct cfs_rq *sd_parent, struct cfs_rq *tg_parent);
 void cosched_online_group(struct task_group *tg);
 void cosched_offline_group(struct task_group *tg);
+struct rq *rq_lock_owned(struct rq *rq, struct rq_owner_flags *orf);
+void rq_unlock_owned(struct rq *rq, struct rq_owner_flags *orf);
 #else /* !CONFIG_COSCHEDULING */
 static inline void cosched_init_bottom(void) { }
 static inline void cosched_init_topology(void) { }
@@ -1206,6 +1215,8 @@ static inline void cosched_init_sdrq(struct task_group *tg, struct cfs_rq *cfs,
 					struct cfs_rq *tg_parent) { }
 static inline void cosched_online_group(struct task_group *tg) { }
 static inline void cosched_offline_group(struct task_group *tg) { }
+static inline struct rq *rq_lock_owned(struct rq *rq, struct rq_owner_flags *orf) { return rq; }
+static inline void rq_unlock_owned(struct rq *rq, struct rq_owner_flags *orf) { }
 #endif /* !CONFIG_COSCHEDULING */
 
 #ifdef CONFIG_SCHED_SMT
-- 
2.9.3.1.gcba166c.dirty
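
The usage sketch promised above (illustration only, not part of the
patch; leader_work() is a hypothetical caller standing in for a real
user such as pick_next_task_fair()):

	static void leader_work(struct rq *rq)
	{
		struct rq_owner_flags orf;
		struct rq *top;

		/* Caller already holds rq->lock with interrupts disabled. */
		top = rq_lock_owned(rq, &orf);
		/* top == NULL when called nested: the hierarchy is already locked */

		/* ... operate freely on the owned runqueues from rq up to top ... */

		rq_unlock_owned(rq, &orf);	/* rq->lock itself stays held */
	}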