When CONFIG_PARAVIRT_SPINLOCKS is on, osq_lock() will spin on both
its own node's and node->prev's cachelines while waiting for its
node->locked value to be set. That can cause cacheline contention with
another CPU modifying node->prev's cacheline. The only reason for
accessing node->prev's cacheline is to read the node->prev->cpu value
to pass to the vcpu_is_preempted() call. However, that value is
constant as long as node->prev doesn't change.

Minimize that contention by caching the node->prev and node->prev->cpu
values; update the cached values, and touch node->prev's cacheline,
only when node->prev changes.
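
For context, these are the relevant definitions at the top of
kernel/locking/osq_lock.c (quoted from my reading of the current
baseline, so double-check against the tree). Each node is a
cacheline-aligned per-CPU slot, which is why the vcpu_is_preempted()
argument is the only thing pulling the predecessor's cacheline into
the poll loop:

	struct optimistic_spin_node {
		struct optimistic_spin_node *next, *prev;
		int locked; /* 1 if lock acquired */
		int cpu; /* encoded CPU # + 1 value */
	};

	/*
	 * Each CPU's node sits in its own cacheline-aligned per-CPU slot,
	 * so the spinner's node and its predecessor's node live on
	 * different cachelines; the only remote-line read in the poll
	 * loop is node->prev->cpu via node_cpu().
	 */
	static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);

	static inline int node_cpu(struct optimistic_spin_node *node)
	{
		return node->cpu - 1;
	}
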
In an osq_lock/unlock microbenchmark that repeatedly calls
osq_lock/unlock in a loop with 96 locking threads running on a
2-socket 96-CPU machine, the locking rates before and after this
patch were:

  Before patch: 1701 kops/s
  After patch : 3052 kops/s
  % Change    : +79.4%

This is a significant performance boost for the OSQ lock.
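
For reference, below is a minimal sketch of the kind of in-kernel
stress loop such a microbenchmark might use. The module scaffolding
and the osq_stress_* names are hypothetical (this is not the actual
test harness), and since osq_lock()/osq_unlock() are, as far as I
know, not exported to modules, a real harness would likely have to be
built in-tree:

	/* Hypothetical osq_lock stress loop -- a sketch, not the harness used above. */
	#include <linux/atomic.h>
	#include <linux/delay.h>
	#include <linux/kthread.h>
	#include <linux/ktime.h>
	#include <linux/math64.h>
	#include <linux/module.h>
	#include <linux/osq_lock.h>
	#include <linux/printk.h>

	#define NR_STRESS_THREADS	96	/* one locker per CPU on the test box */

	static struct optimistic_spin_queue stress_lock;
	static struct task_struct *stress_threads[NR_STRESS_THREADS];
	static atomic64_t stress_ops;

	static int osq_stress_fn(void *arg)
	{
		while (!kthread_should_stop()) {
			/*
			 * osq_lock() can fail (need_resched() or a preempted
			 * vCPU), so only unlock and count on success.
			 */
			if (osq_lock(&stress_lock)) {
				osq_unlock(&stress_lock);
				atomic64_inc(&stress_ops);
			}
			cond_resched();
		}
		return 0;
	}

	static int __init osq_stress_init(void)
	{
		ktime_t start;
		s64 ms;
		int i;

		osq_lock_init(&stress_lock);
		atomic64_set(&stress_ops, 0);
		start = ktime_get();

		for (i = 0; i < NR_STRESS_THREADS; i++)
			stress_threads[i] = kthread_run(osq_stress_fn, NULL,
							"osq_stress/%d", i);

		msleep(10000);			/* let the lockers run for ~10s */

		for (i = 0; i < NR_STRESS_THREADS; i++)
			if (!IS_ERR_OR_NULL(stress_threads[i]))
				kthread_stop(stress_threads[i]);

		ms = ktime_ms_delta(ktime_get(), start);
		pr_info("osq_stress: %lld kops/s\n",
			div64_s64(atomic64_read(&stress_ops), ms ? ms : 1));
		return 0;
	}
	module_init(osq_stress_init);

	MODULE_LICENSE("GPL");
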
Signed-off-by: Waiman Long <[email protected]>
---
kernel/locking/osq_lock.c | 22 ++++++++++++++++++++--
1 file changed, 20 insertions(+), 2 deletions(-)
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index d5610ad52b92..6d7218f4b6e4 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -87,12 +87,29 @@ osq_wait_next(struct optimistic_spin_queue *lock,
return next;
}
+#ifndef vcpu_is_preempted
+#define prev_vcpu_is_preempted(n, p, c) false
+#else
+static inline bool prev_vcpu_is_preempted(struct optimistic_spin_node *node,
+ struct optimistic_spin_node **pprev,
+ int *ppvcpu)
+{
+ struct optimistic_spin_node *prev = READ_ONCE(node->prev);
+
+ if (prev != *pprev) {
+ *pprev = prev;
+ *ppvcpu = node_cpu(prev);
+ }
+ return vcpu_is_preempted(*ppvcpu);
+}
+#endif
+
bool osq_lock(struct optimistic_spin_queue *lock)
{
struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
struct optimistic_spin_node *prev, *next;
int curr = encode_cpu(smp_processor_id());
- int old;
+ int old, pvcpu;
node->locked = 0;
node->next = NULL;
@@ -110,6 +127,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
prev = decode_cpu(old);
node->prev = prev;
+ pvcpu = node_cpu(prev);
/*
* osq_lock() unqueue
@@ -141,7 +159,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
* polling, be careful.
*/
if (smp_cond_load_relaxed(&node->locked, VAL || need_resched() ||
- vcpu_is_preempted(node_cpu(node->prev))))
+ prev_vcpu_is_preempted(node, &prev, &pvcpu)))
return true;
/* unqueue */
--
2.39.3
Hi Waiman,
kernel test robot noticed the following build warnings:
[auto build test WARNING on tip/locking/core]
[also build test WARNING on tip/master arm-perf/for-next/perf tip/auto-latest linus/master v6.7-rc6 next-20231222]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Waiman-Long/locking-osq_lock-Minimize-access-to-node-prev-s-cacheline/20231222-165756
base: tip/locking/core
patch link: https://lore.kernel.org/r/20231221011953.1611615-1-longman%40redhat.com
patch subject: [PATCH] locking/osq_lock: Minimize access to node->prev's cacheline
config: arc-defconfig (https://download.01.org/0day-ci/archive/20231223/[email protected]/config)
compiler: arc-elf-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231223/[email protected]/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/
All warnings (new ones prefixed by >>):
kernel/locking/osq_lock.c: In function 'osq_lock':
>> kernel/locking/osq_lock.c:112:18: warning: variable 'pvcpu' set but not used [-Wunused-but-set-variable]
112 | int old, pvcpu;
| ^~~~~
vim +/pvcpu +112 kernel/locking/osq_lock.c
106
107 bool osq_lock(struct optimistic_spin_queue *lock)
108 {
109 struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
110 struct optimistic_spin_node *prev, *next;
111 int curr = encode_cpu(smp_processor_id());
> 112 int old, pvcpu;
113
114 node->locked = 0;
115 node->next = NULL;
116 node->cpu = curr;
117
118 /*
119 * We need both ACQUIRE (pairs with corresponding RELEASE in
120 * unlock() uncontended, or fastpath) and RELEASE (to publish
121 * the node fields we just initialised) semantics when updating
122 * the lock tail.
123 */
124 old = atomic_xchg(&lock->tail, curr);
125 if (old == OSQ_UNLOCKED_VAL)
126 return true;
127
128 prev = decode_cpu(old);
129 node->prev = prev;
130 pvcpu = node_cpu(prev);
131
132 /*
133 * osq_lock() unqueue
134 *
135 * node->prev = prev osq_wait_next()
136 * WMB MB
137 * prev->next = node next->prev = prev // unqueue-C
138 *
139 * Here 'node->prev' and 'next->prev' are the same variable and we need
140 * to ensure these stores happen in-order to avoid corrupting the list.
141 */
142 smp_wmb();
143
144 WRITE_ONCE(prev->next, node);
145
146 /*
147 * Normally @prev is untouchable after the above store; because at that
148 * moment unlock can proceed and wipe the node element from stack.
149 *
150 * However, since our nodes are static per-cpu storage, we're
151 * guaranteed their existence -- this allows us to apply
152 * cmpxchg in an attempt to undo our queueing.
153 */
154
155 /*
156 * Wait to acquire the lock or cancellation. Note that need_resched()
157 * will come with an IPI, which will wake smp_cond_load_relaxed() if it
158 * is implemented with a monitor-wait. vcpu_is_preempted() relies on
159 * polling, be careful.
160 */
161 if (smp_cond_load_relaxed(&node->locked, VAL || need_resched() ||
162 prev_vcpu_is_preempted(node, &prev, &pvcpu)))
163 return true;
164
165 /* unqueue */
166 /*
167 * Step - A -- stabilize @prev
168 *
169 * Undo our @prev->next assignment; this will make @prev's
170 * unlock()/unqueue() wait for a next pointer since @lock points to us
171 * (or later).
172 */
173
174 for (;;) {
175 /*
176 * cpu_relax() below implies a compiler barrier which would
177 * prevent this comparison being optimized away.
178 */
179 if (data_race(prev->next) == node &&
180 cmpxchg(&prev->next, node, NULL) == node)
181 break;
182
183 /*
184 * We can only fail the cmpxchg() racing against an unlock(),
185 * in which case we should observe @node->locked becoming
186 * true.
187 */
188 if (smp_load_acquire(&node->locked))
189 return true;
190
191 cpu_relax();
192
193 /*
194 * Or we race against a concurrent unqueue()'s step-B, in which
195 * case its step-C will write us a new @node->prev pointer.
196 */
197 prev = READ_ONCE(node->prev);
198 }
199
200 /*
201 * Step - B -- stabilize @next
202 *
203 * Similar to unlock(), wait for @node->next or move @lock from @node
204 * back to @prev.
205 */
206
207 next = osq_wait_next(lock, node, prev);
208 if (!next)
209 return false;
210
211 /*
212 * Step - C -- unlink
213 *
214 * @prev is stable because its still waiting for a new @prev->next
215 * pointer, @next is stable because our @node->next pointer is NULL and
216 * it will wait in Step-A.
217 */
218
219 WRITE_ONCE(next->prev, prev);
220 WRITE_ONCE(prev->next, next);
221
222 return false;
223 }
224
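
Regarding the -Wunused-but-set-variable warning above: on configs
without an arch-provided vcpu_is_preempted() (such as arc-defconfig),
the patch turns prev_vcpu_is_preempted(n, p, c) into a macro that
expands to false and discards its arguments, so 'pvcpu' is declared at
line 112 and assigned at line 130 but never read. One possible way to
silence it (an illustrative sketch only, not necessarily how the patch
would be respun) is to make the stub a static inline with the same
prototype, so that passing &prev and &pvcpu still counts as a use of
those variables:

	#ifndef vcpu_is_preempted
	/*
	 * Stub for configs without a real vcpu_is_preempted(). Keeping the
	 * same prototype as the caching version means the caller still
	 * passes &prev and &pvcpu, so those locals remain "used" and the
	 * W=1 warning goes away.
	 */
	static inline bool prev_vcpu_is_preempted(struct optimistic_spin_node *node,
						  struct optimistic_spin_node **pprev,
						  int *ppvcpu)
	{
		return false;
	}
	#else
	/* ... the caching implementation from the patch above, unchanged ... */
	#endif

Annotating pvcpu with __maybe_unused would be another option.
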
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki