2013-06-10 17:17:19

by Manfred Spraul

[permalink] [raw]
Subject: [PATCH 0/6] ipc/sem.c: performance improvements, FIFO

Hi Andrew,

I have cleaned up/improved my updates to sysv sem.
Could you replace my patches in -akpm with this series?

- 1: cacheline align output from ipc_rcu_alloc
- 2: cacheline align semaphore structures
- 3: separate-wait-for-zero-and-alter-tasks
- 4: Always-use-only-one-queue-for-alter-operations
- 5: Replace the global sem_otime with a distributed otime
- 6: Rename-try_atomic_semop-to-perform_atomic

The first 2 patches result in up to a ~30% performance improvement on a 2-core i3.

Patches 3 and 4 remove unnecessary loops in do_smart_update() and restore FIFO behavior.

I would expect the 5th patch to improve performance on multi-socket
systems, but I don't have a test setup.

Patch 6 is just a cleanup/function rename.

--
Manfred


2013-06-10 17:17:21

by Manfred Spraul

[permalink] [raw]
Subject: [PATCH 2/6] ipc/sem.c: cacheline align the semaphore structures

Now that each semaphore has its own spinlock and parallel operations
are possible, give each semaphore its own cacheline.

On an i3 laptop, this gives up to 28% better performance:

#semscale 10 | grep "interleave 2"
- before:
Cpus 1, interleave 2 delay 0: 36109234 in 10 secs
Cpus 2, interleave 2 delay 0: 55276317 in 10 secs
Cpus 3, interleave 2 delay 0: 62411025 in 10 secs
Cpus 4, interleave 2 delay 0: 81963928 in 10 secs

- after:
Cpus 1, interleave 2 delay 0: 35527306 in 10 secs
Cpus 2, interleave 2 delay 0: 70922909 in 10 secs <<< + 28%
Cpus 3, interleave 2 delay 0: 80518538 in 10 secs
Cpus 4, interleave 2 delay 0: 89115148 in 10 secs <<< + 8.7%

The i3 has 2 cores, with hyperthreading enabled.
Interleave 2 is used so that the full cores are used first.
HT partially hides the delay from cacheline thrashing, thus
the improvement is "only" 8.7% if 4 threads are running.
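
For reference, a minimal user-space sketch of the access pattern being
measured (this is not the semscale tool itself; the thread count, semaphore
numbers and the 5-second runtime are only illustrative): two threads hammer
adjacent semaphores of one array, which is exactly the case where two
struct sem entries would otherwise share a cacheline.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ipc.h>
#include <sys/sem.h>

static int semid;
static volatile int stop;

static void *worker(void *arg)
{
    long num = (long)arg;
    struct sembuf up   = { .sem_num = num, .sem_op =  1, .sem_flg = 0 };
    struct sembuf down = { .sem_num = num, .sem_op = -1, .sem_flg = 0 };
    unsigned long ops = 0;

    while (!stop) {
        semop(semid, &up, 1);   /* each thread touches only "its" struct sem */
        semop(semid, &down, 1);
        ops += 2;
    }
    return (void *)ops;
}

int main(void)
{
    pthread_t t[2];
    void *res;
    long i;

    semid = semget(IPC_PRIVATE, 2, IPC_CREAT | 0600);
    for (i = 0; i < 2; i++)
        pthread_create(&t[i], NULL, worker, (void *)i);
    sleep(5);
    stop = 1;
    for (i = 0; i < 2; i++) {
        pthread_join(t[i], &res);
        printf("thread %ld: %lu ops\n", i, (unsigned long)res);
    }
    semctl(semid, 0, IPC_RMID);
    return 0;
}

Build with gcc -pthread; with each struct sem on its own cacheline the two
per-thread counts should scale instead of fighting over one line.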

Andrew: Could you merge it into -akpm and then forward it towards Linus' tree?

Signed-off-by: Manfred Spraul <[email protected]>
---
ipc/sem.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ipc/sem.c b/ipc/sem.c
index 70480a3..1afbc57 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -96,7 +96,7 @@ struct sem {
int sempid; /* pid of last operation */
spinlock_t lock; /* spinlock for fine-grained semtimedop */
struct list_head sem_pending; /* pending single-sop operations */
-};
+} ____cacheline_aligned_in_smp;

/* One queue for each sleeping process in the system. */
struct sem_queue {
--
1.8.1.4

2013-06-10 17:17:31

by Manfred Spraul

[permalink] [raw]
Subject: [PATCH 6/6] ipc/sem.c: Rename try_atomic_semop() to perform_atomic_semop(), docu update

Cleanup: some minor points that I noticed while writing the
previous patches.

1) The name try_atomic_semop() is misleading: The function performs the
operation (if it is possible).

2) Some documentation updates.

No real code change: just a rename and documentation updates.

Signed-off-by: Manfred Spraul <[email protected]>
---
ipc/sem.c | 32 +++++++++++++++++++++-----------
1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/ipc/sem.c b/ipc/sem.c
index e6d21f6..f9d1c06 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -154,12 +154,15 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
#define SEMOPM_FAST 64 /* ~ 372 bytes on stack */

/*
- * linked list protection:
+ * Locking:
* sem_undo.id_next,
+ * sem_array.complex_count,
* sem_array.pending{_alter,_cont},
- * sem_array.sem_undo: sem_lock() for read/write
+ * sem_array.sem_undo: global sem_lock() for read/write
* sem_undo.proc_next: only "current" is allowed to read/write that field.
*
+ * sem_array.sem_base[i].pending_{const,alter}:
+ * global or semaphore sem_lock() for read/write
*/

#define sc_semmsl sem_ctls[0]
@@ -536,12 +539,19 @@ SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params);
}

-/*
- * Determine whether a sequence of semaphore operations would succeed
- * all at once. Return 0 if yes, 1 if need to sleep, else return error code.
+/** perform_atomic_semop - Perform (if possible) a semaphore operation
+ * @sma: semaphore array
+ * @sops: array with operations that should be checked
+ * @nsems: number of sops
+ * @un: undo array
+ * @pid: pid that did the change
+ *
+ * Returns 0 if the operation was possible.
+ * Returns 1 if the operation is impossible, the caller must sleep.
+ * Negative values are error codes.
*/

-static int try_atomic_semop (struct sem_array * sma, struct sembuf * sops,
+static int perform_atomic_semop(struct sem_array *sma, struct sembuf *sops,
int nsops, struct sem_undo *un, int pid)
{
int result, sem_op;
@@ -724,8 +734,8 @@ static int wake_const_ops(struct sem_array *sma, int semnum,
q = container_of(walk, struct sem_queue, list);
walk = walk->next;

- error = try_atomic_semop(sma, q->sops, q->nsops,
- q->undo, q->pid);
+ error = perform_atomic_semop(sma, q->sops, q->nsops,
+ q->undo, q->pid);

if (error <= 0) {
/* operation completed, remove from queue & wakeup */
@@ -836,7 +846,7 @@ again:
if (semnum != -1 && sma->sem_base[semnum].semval == 0)
break;

- error = try_atomic_semop(sma, q->sops, q->nsops,
+ error = perform_atomic_semop(sma, q->sops, q->nsops,
q->undo, q->pid);

/* Does q->sleeper still need to sleep? */
@@ -1680,7 +1690,6 @@ static int get_queue_result(struct sem_queue *q)
return error;
}

-
SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
unsigned, nsops, const struct timespec __user *, timeout)
{
@@ -1778,7 +1787,8 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
if (un && un->semid == -1)
goto out_unlock_free;

- error = try_atomic_semop (sma, sops, nsops, un, task_tgid_vnr(current));
+ error = perform_atomic_semop(sma, sops, nsops, un,
+ task_tgid_vnr(current));
if (error <= 0) {
if (alter && error == 0)
do_smart_update(sma, sops, nsops, 1, &tasks);
--
1.8.1.4

2013-06-10 17:17:30

by Manfred Spraul

[permalink] [raw]
Subject: [PATCH 5/6] ipc/sem.c: Replace shared sem_otime with per-semaphore value

sem_otime contains the time of the last semaphore operation
that completed successfully. Every operation updates this
value, thus access from multiple cpus can cause thrashing.

Therefore the patch replaces it with a per-semaphore
value. The per-array sem_otime is only calculated when required.

No performance improvement on a single-socket i3 - only important
for larger systems.
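
For completeness, a small sketch of how the aggregated value reaches user
space (the array size and the semaphore number below are arbitrary):
semctl(IPC_STAT) still reports a single sem_otime, which with this patch is
the maximum of the per-semaphore timestamps, as computed by get_semotime().

#include <stdio.h>
#include <sys/ipc.h>
#include <sys/sem.h>

/* glibc requires the caller to define union semun itself */
union semun {
    int val;
    struct semid_ds *buf;
    unsigned short *array;
};

int main(void)
{
    int semid = semget(IPC_PRIVATE, 4, IPC_CREAT | 0600);
    struct sembuf op = { .sem_num = 2, .sem_op = 1, .sem_flg = 0 };
    struct semid_ds ds;
    union semun arg = { .buf = &ds };

    semop(semid, &op, 1);            /* updates only sem_base[2].sem_otime */
    semctl(semid, 0, IPC_STAT, arg); /* kernel folds the per-sem times into one value */
    printf("sem_otime = %ld\n", (long)ds.sem_otime);

    semctl(semid, 0, IPC_RMID);
    return 0;
}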

Signed-off-by: Manfred Spraul <[email protected]>
---
include/linux/sem.h | 1 -
ipc/sem.c | 37 +++++++++++++++++++++++++++++++------
2 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/include/linux/sem.h b/include/linux/sem.h
index 55e17f6..976ce3a 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -12,7 +12,6 @@ struct task_struct;
struct sem_array {
struct kern_ipc_perm ____cacheline_aligned_in_smp
sem_perm; /* permissions .. see ipc.h */
- time_t sem_otime; /* last semop time */
time_t sem_ctime; /* last change time */
struct sem *sem_base; /* ptr to first semaphore in array */
struct list_head pending_alter; /* pending operations */
diff --git a/ipc/sem.c b/ipc/sem.c
index dcf99ef..e6d21f6 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -99,6 +99,7 @@ struct sem {
/* that alter the semaphore */
struct list_head pending_const; /* pending single-sop operations */
/* that do not alter the semaphore*/
+ time_t sem_otime; /* candidate for sem_otime */
} ____cacheline_aligned_in_smp;

/* One queue for each sleeping process in the system. */
@@ -909,8 +910,14 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop
}
}
}
- if (otime)
- sma->sem_otime = get_seconds();
+ if (otime) {
+ if (sops == NULL) {
+ sma->sem_base[0].sem_otime = get_seconds();
+ } else {
+ sma->sem_base[sops[0].sem_num].sem_otime =
+ get_seconds();
+ }
+ }
}


@@ -1056,6 +1063,21 @@ static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in,
}
}

+static time_t get_semotime(struct sem_array *sma)
+{
+ int i;
+ time_t res;
+
+ res = sma->sem_base[0].sem_otime;
+ for (i = 1; i < sma->sem_nsems; i++) {
+ time_t to = sma->sem_base[i].sem_otime;
+
+ if (to > res)
+ res = to;
+ }
+ return res;
+}
+
static int semctl_nolock(struct ipc_namespace *ns, int semid,
int cmd, int version, void __user *p)
{
@@ -1129,9 +1151,9 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid,
goto out_unlock;

kernel_to_ipc64_perm(&sma->sem_perm, &tbuf.sem_perm);
- tbuf.sem_otime = sma->sem_otime;
- tbuf.sem_ctime = sma->sem_ctime;
- tbuf.sem_nsems = sma->sem_nsems;
+ tbuf.sem_otime = get_semotime(sma);
+ tbuf.sem_ctime = sma->sem_ctime;
+ tbuf.sem_nsems = sma->sem_nsems;
rcu_read_unlock();
if (copy_semid_to_user(p, &tbuf, version))
return -EFAULT;
@@ -2019,6 +2041,9 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
{
struct user_namespace *user_ns = seq_user_ns(s);
struct sem_array *sma = it;
+ time_t sem_otime;
+
+ sem_otime = get_semotime(sma);

return seq_printf(s,
"%10d %10d %4o %10u %5u %5u %5u %5u %10lu %10lu\n",
@@ -2030,7 +2055,7 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
from_kgid_munged(user_ns, sma->sem_perm.gid),
from_kuid_munged(user_ns, sma->sem_perm.cuid),
from_kgid_munged(user_ns, sma->sem_perm.cgid),
- sma->sem_otime,
+ sem_otime,
sma->sem_ctime);
}
#endif
--
1.8.1.4

2013-06-10 17:17:28

by Manfred Spraul

[permalink] [raw]
Subject: [PATCH 4/6] ipc/sem.c: Always use only one queue for alter operations.

There are two places that can contain alter operations:
- the global queue: sma->pending_alter
- the per-semaphore queues: sma->sem_base[].pending_alter.

Since one of the queues must be processed first, this causes an odd
prioritization of the wakeups: right now, complex operations have
priority over simple ops.

The patch restores the behavior of Linux <= 3.0.9: the longest
waiting operation has the highest priority.

This is done by using only one queue:
- if there are complex ops, then sma->pending_alter is used.
- otherwise, the per-semaphore queues are used.

As a side effect, do_smart_update_queue() becomes much simpler:
No more goto logic.
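
For illustration only (not part of the patch), the user-space view of the
distinction that drives the queue selection: a single-sop semop() is a
"simple" operation, a multi-sop semop() is a "complex" one.

#include <sys/ipc.h>
#include <sys/sem.h>

int main(void)
{
    int semid = semget(IPC_PRIVATE, 2, IPC_CREAT | 0600);

    /* simple alter op: one sembuf; if it had to sleep it would wait on the
     * per-semaphore pending_alter queue (unless complex ops are pending) */
    struct sembuf simple = { .sem_num = 0, .sem_op = 1, .sem_flg = 0 };
    semop(semid, &simple, 1);

    /* complex alter op: several sembufs; if it had to sleep, it would raise
     * complex_count and wait on the global sma->pending_alter queue */
    struct sembuf complex_op[2] = {
        { .sem_num = 0, .sem_op = -1, .sem_flg = 0 },
        { .sem_num = 1, .sem_op =  1, .sem_flg = 0 },
    };
    semop(semid, complex_op, 2);

    semctl(semid, 0, IPC_RMID);
    return 0;
}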

Signed-off-by: Manfred Spraul <[email protected]>
---
ipc/sem.c | 128 ++++++++++++++++++++++++++++++++++++++++++--------------------
1 file changed, 88 insertions(+), 40 deletions(-)

diff --git a/ipc/sem.c b/ipc/sem.c
index e7f3d64..dcf99ef 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -192,6 +192,53 @@ void __init sem_init (void)
IPC_SEM_IDS, sysvipc_sem_proc_show);
}

+/**
+ * unmerge_queues - unmerge queues, if possible.
+ * @sma: semaphore array
+ *
+ * The function unmerges the wait queues if complex_count is 0.
+ * It must be called prior to dropping the global semaphore array lock.
+ */
+static void unmerge_queues(struct sem_array *sma)
+{
+ struct sem_queue *q, *tq;
+
+ /* complex operations still around? */
+ if (sma->complex_count)
+ return;
+ /*
+ * We will switch back to simple mode.
+ * Move all pending operation back into the per-semaphore
+ * queues.
+ */
+ list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
+ struct sem *curr;
+ curr = &sma->sem_base[q->sops[0].sem_num];
+
+ list_add_tail(&q->list, &curr->pending_alter);
+ }
+ INIT_LIST_HEAD(&sma->pending_alter);
+}
+
+/**
+ * merge_queues - Merge single semop queues into global queue
+ * @sma: semaphore array
+ *
+ * This function merges all per-semaphore queues into the global queue.
+ * It is necessary to achieve FIFO ordering for the pending single-sop
+ * operations when a multi-semop operation must sleep.
+ * Only the alter operations must be moved, the const operations can stay.
+ */
+static void merge_queues(struct sem_array *sma)
+{
+ int i;
+ for (i = 0; i < sma->sem_nsems; i++) {
+ struct sem *sem = sma->sem_base + i;
+
+ list_splice_init(&sem->pending_alter, &sma->pending_alter);
+ }
+}
+
/*
* If the request contains only one semaphore operation, and there are
* no complex transactions pending, lock only the semaphore involved.
@@ -262,6 +309,7 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
static inline void sem_unlock(struct sem_array *sma, int locknum)
{
if (locknum == -1) {
+ unmerge_queues(sma);
spin_unlock(&sma->sem_perm.lock);
} else {
struct sem *sem = sma->sem_base + locknum;
@@ -829,49 +877,38 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop
int otime, struct list_head *pt)
{
int i;
- int progress;

otime |= do_smart_wakeup_zero(sma, sops, nsops, pt);

- progress = 1;
-retry_global:
- if (sma->complex_count) {
- if (update_queue(sma, -1, pt)) {
- progress = 1;
- otime = 1;
- sops = NULL;
- }
- }
- if (!progress)
- goto done;
-
- if (!sops) {
- /* No semops; something special is going on. */
- for (i = 0; i < sma->sem_nsems; i++) {
- if (update_queue(sma, i, pt)) {
- otime = 1;
- progress = 1;
+ if (!list_empty(&sma->pending_alter)) {
+ /* semaphore array uses the global queue - just process it. */
+ otime |= update_queue(sma, -1, pt);
+ } else {
+ if (!sops) {
+ /*
+ * No sops, thus the modified semaphores are not
+ * known. Check all.
+ */
+ for (i = 0; i < sma->sem_nsems; i++)
+ otime |= update_queue(sma, i, pt);
+ } else {
+ /*
+ * Check the semaphores that were increased:
+ * - No complex ops, thus all sleeping ops are
+ * decrease.
+ * - if we decreased the value, then any sleeping
+ * semaphore ops wont be able to run: If the
+ * previous value was too small, then the new
+ * value will be too small, too.
+ */
+ for (i = 0; i < nsops; i++) {
+ if (sops[i].sem_op > 0) {
+ otime |= update_queue(sma,
+ sops[i].sem_num, pt);
+ }
}
}
- goto done_checkretry;
- }
-
- /* Check the semaphores that were modified. */
- for (i = 0; i < nsops; i++) {
- if (sops[i].sem_op > 0 ||
- (sops[i].sem_op < 0 &&
- sma->sem_base[sops[i].sem_num].semval == 0))
- if (update_queue(sma, sops[i].sem_num, pt)) {
- otime = 1;
- progress = 1;
- }
- }
-done_checkretry:
- if (progress) {
- progress = 0;
- goto retry_global;
}
-done:
if (otime)
sma->sem_otime = get_seconds();
}
@@ -1741,11 +1778,22 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
struct sem *curr;
curr = &sma->sem_base[sops->sem_num];

- if (alter)
- list_add_tail(&queue.list, &curr->pending_alter);
- else
+ if (alter) {
+ if (sma->complex_count) {
+ list_add_tail(&queue.list,
+ &sma->pending_alter);
+ } else {
+
+ list_add_tail(&queue.list,
+ &curr->pending_alter);
+ }
+ } else {
list_add_tail(&queue.list, &curr->pending_const);
+ }
} else {
+ if (!sma->complex_count)
+ merge_queues(sma);
+
if (alter)
list_add_tail(&queue.list, &sma->pending_alter);
else
--
1.8.1.4

2013-06-10 17:17:26

by Manfred Spraul

[permalink] [raw]
Subject: [PATCH 3/6] ipc/sem: separate wait-for-zero and alter tasks into separate queues

Introduce separate queues for operations that do not modify the
semaphore values.
Advantages:
- Simpler logic in check_restart().
- Faster update_queue(): Right now, all wait-for-zero operations
are always tested, even if the semaphore value is not 0.
- wait-for-zero operations again get priority, as in Linux <= 3.0.9.
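
As a user-space illustration of the split (a sketch, not part of the patch):
sem_op == 0 is a wait-for-zero operation and belongs on the new pending_const
lists, while any non-zero sem_op is an alter operation and belongs on
pending_alter.

#include <sys/ipc.h>
#include <sys/sem.h>

int main(void)
{
    int semid = semget(IPC_PRIVATE, 1, IPC_CREAT | 0600);

    /* alter operation (sem_op != 0): if it slept, it would wait on a
     * pending_alter list */
    struct sembuf inc = { .sem_num = 0, .sem_op = 1, .sem_flg = 0 };
    semop(semid, &inc, 1);

    /* wait-for-zero operation (sem_op == 0): if it slept, it would wait on a
     * pending_const list; IPC_NOWAIT keeps the sketch from blocking, so this
     * call returns -1 with errno == EAGAIN because the value is now 1 */
    struct sembuf waitzero = { .sem_num = 0, .sem_op = 0, .sem_flg = IPC_NOWAIT };
    semop(semid, &waitzero, 1);

    semctl(semid, 0, IPC_RMID);
    return 0;
}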

Signed-off-by: Manfred Spraul <[email protected]>
---
include/linux/sem.h | 5 +-
ipc/sem.c | 209 +++++++++++++++++++++++++++++++++++++---------------
2 files changed, 153 insertions(+), 61 deletions(-)

diff --git a/include/linux/sem.h b/include/linux/sem.h
index 53d4265..55e17f6 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -15,7 +15,10 @@ struct sem_array {
time_t sem_otime; /* last semop time */
time_t sem_ctime; /* last change time */
struct sem *sem_base; /* ptr to first semaphore in array */
- struct list_head sem_pending; /* pending operations to be processed */
+ struct list_head pending_alter; /* pending operations */
+ /* that alter the array */
+ struct list_head pending_const; /* pending complex operations */
+ /* that do not alter semvals */
struct list_head list_id; /* undo requests on this array */
int sem_nsems; /* no. of semaphores in array */
int complex_count; /* pending complex operations */
diff --git a/ipc/sem.c b/ipc/sem.c
index 1afbc57..e7f3d64 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -95,7 +95,10 @@ struct sem {
int semval; /* current value */
int sempid; /* pid of last operation */
spinlock_t lock; /* spinlock for fine-grained semtimedop */
- struct list_head sem_pending; /* pending single-sop operations */
+ struct list_head pending_alter; /* pending single-sop operations */
+ /* that alter the semaphore */
+ struct list_head pending_const; /* pending single-sop operations */
+ /* that do not alter the semaphore*/
} ____cacheline_aligned_in_smp;

/* One queue for each sleeping process in the system. */
@@ -152,7 +155,7 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
/*
* linked list protection:
* sem_undo.id_next,
- * sem_array.sem_pending{,last},
+ * sem_array.pending{_alter,_cont},
* sem_array.sem_undo: sem_lock() for read/write
* sem_undo.proc_next: only "current" is allowed to read/write that field.
*
@@ -337,7 +340,7 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
* Without the check/retry algorithm a lockless wakeup is possible:
* - queue.status is initialized to -EINTR before blocking.
* - wakeup is performed by
- * * unlinking the queue entry from sma->sem_pending
+ * * unlinking the queue entry from the pending list
* * setting queue.status to IN_WAKEUP
* This is the notification for the blocked thread that a
* result value is imminent.
@@ -418,12 +421,14 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
sma->sem_base = (struct sem *) &sma[1];

for (i = 0; i < nsems; i++) {
- INIT_LIST_HEAD(&sma->sem_base[i].sem_pending);
+ INIT_LIST_HEAD(&sma->sem_base[i].pending_alter);
+ INIT_LIST_HEAD(&sma->sem_base[i].pending_const);
spin_lock_init(&sma->sem_base[i].lock);
}

sma->complex_count = 0;
- INIT_LIST_HEAD(&sma->sem_pending);
+ INIT_LIST_HEAD(&sma->pending_alter);
+ INIT_LIST_HEAD(&sma->pending_const);
INIT_LIST_HEAD(&sma->list_id);
sma->sem_nsems = nsems;
sma->sem_ctime = get_seconds();
@@ -609,60 +614,130 @@ static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
* update_queue is O(N^2) when it restarts scanning the whole queue of
* waiting operations. Therefore this function checks if the restart is
* really necessary. It is called after a previously waiting operation
- * was completed.
+ * modified the array.
+ * Note that wait-for-zero operations are handled without restart.
*/
static int check_restart(struct sem_array *sma, struct sem_queue *q)
{
- struct sem *curr;
- struct sem_queue *h;
-
- /* if the operation didn't modify the array, then no restart */
- if (q->alter == 0)
- return 0;
-
- /* pending complex operations are too difficult to analyse */
- if (sma->complex_count)
+ /* pending complex alter operations are too difficult to analyse */
+ if (!list_empty(&sma->pending_alter))
return 1;

/* we were a sleeping complex operation. Too difficult */
if (q->nsops > 1)
return 1;

- curr = sma->sem_base + q->sops[0].sem_num;
+ /* It is impossible that someone waits for the new value:
+ * - complex operations always restart.
+ * - wait-for-zero are handled seperately.
+ * - q is a previously sleeping simple operation that
+ * altered the array. It must be a decrement, because
+ * simple increments never sleep.
+ * - If there are older (higher priority) decrements
+ * in the queue, then they have observed the original
+ * semval value and couldn't proceed. The operation
+ * decremented to value - thus they won't proceed either.
+ */
+ return 0;
+}

- /* No-one waits on this queue */
- if (list_empty(&curr->sem_pending))
- return 0;
+/**
+ * wake_const_ops(sma, semnum, pt) - Wake up non-alter tasks
+ * @sma: semaphore array.
+ * @semnum: semaphore that was modified.
+ * @pt: list head for the tasks that must be woken up.
+ *
+ * wake_const_ops must be called after a semaphore in a semaphore array
+ * was set to 0. If complex const operations are pending, wake_const_ops must
+ * be called with semnum = -1, as well as with the number of each modified
+ * semaphore.
+ * The tasks that must be woken up are added to @pt. The return code
+ * is stored in q->pid.
+ * The function returns 1 if at least one operation was completed successfully.
+ */
+static int wake_const_ops(struct sem_array *sma, int semnum,
+ struct list_head *pt)
+{
+ struct sem_queue *q;
+ struct list_head *walk;
+ struct list_head *pending_list;
+ int semop_completed = 0;
+
+ if (semnum == -1)
+ pending_list = &sma->pending_const;
+ else
+ pending_list = &sma->sem_base[semnum].pending_const;
+
+ walk = pending_list->next;
+ while (walk != pending_list) {
+ int error;
+
+ q = container_of(walk, struct sem_queue, list);
+ walk = walk->next;
+
+ error = try_atomic_semop(sma, q->sops, q->nsops,
+ q->undo, q->pid);
+
+ if (error <= 0) {
+ /* operation completed, remove from queue & wakeup */
+
+ unlink_queue(sma, q);
+
+ wake_up_sem_queue_prepare(pt, q, error);
+ if (error == 0)
+ semop_completed = 1;
+ }
+ }
+ return semop_completed;
+}

- /* the new semaphore value */
- if (curr->semval) {
- /* It is impossible that someone waits for the new value:
- * - q is a previously sleeping simple operation that
- * altered the array. It must be a decrement, because
- * simple increments never sleep.
- * - The value is not 0, thus wait-for-zero won't proceed.
- * - If there are older (higher priority) decrements
- * in the queue, then they have observed the original
- * semval value and couldn't proceed. The operation
- * decremented to value - thus they won't proceed either.
+/**
+ * do_smart_wakeup_zero(sma, sops, nsops, pt) - wakeup all wait for zero tasks
+ * @sma: semaphore array
+ * @sops: operations that were performed
+ * @nsops: number of operations
+ * @pt: list head of the tasks that must be woken up.
+ *
+ * do_smart_wakeup_zero() checks all required queue for wait-for-zero
+ * operations, based on the actual changes that were performed on the
+ * semaphore array.
+ * The function returns 1 if at least one operation was completed successfully.
+ */
+static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
+ int nsops, struct list_head *pt)
+{
+ int i;
+ int semop_completed = 0;
+ int got_zero = 0;
+
+ /* first: the per-semaphore queues, if known */
+ if (sops) {
+ for (i = 0; i < nsops; i++) {
+ int num = sops[i].sem_num;
+
+ if (sma->sem_base[num].semval == 0) {
+ got_zero = 1;
+ semop_completed |= wake_const_ops(sma, num, pt);
+ }
+ }
+ } else {
+ /*
+ * No sops means modified semaphores not known.
+ * Assume all were changed.
*/
- BUG_ON(q->sops[0].sem_op >= 0);
- return 0;
+ for (i = 0; i < sma->sem_nsems; i++) {
+ if (sma->sem_base[i].semval == 0)
+ semop_completed |= wake_const_ops(sma, i, pt);
+ }
}
/*
- * semval is 0. Check if there are wait-for-zero semops.
- * They must be the first entries in the per-semaphore queue
+ * If one of the modified semaphores got 0,
+ * then check the global queue, too.
*/
- h = list_first_entry(&curr->sem_pending, struct sem_queue, list);
- BUG_ON(h->nsops != 1);
- BUG_ON(h->sops[0].sem_num != q->sops[0].sem_num);
+ if (got_zero)
+ semop_completed |= wake_const_ops(sma, -1, pt);

- /* Yes, there is a wait-for-zero semop. Restart */
- if (h->sops[0].sem_op == 0)
- return 1;
-
- /* Again - no-one is waiting for the new value. */
- return 0;
+ return semop_completed;
}


@@ -678,6 +753,8 @@ static int check_restart(struct sem_array *sma, struct sem_queue *q)
* semaphore.
* The tasks that must be woken up are added to @pt. The return code
* is stored in q->pid.
+ * The function internally checks if const operations can now succeed.
+ *
* The function return 1 if at least one semop was completed successfully.
*/
static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt)
@@ -688,9 +765,9 @@ static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt)
int semop_completed = 0;

if (semnum == -1)
- pending_list = &sma->sem_pending;
+ pending_list = &sma->pending_alter;
else
- pending_list = &sma->sem_base[semnum].sem_pending;
+ pending_list = &sma->sem_base[semnum].pending_alter;

again:
walk = pending_list->next;
@@ -702,13 +779,12 @@ again:

/* If we are scanning the single sop, per-semaphore list of
* one semaphore and that semaphore is 0, then it is not
- * necessary to scan the "alter" entries: simple increments
+ * necessary to scan further: simple increments
* that affect only one entry succeed immediately and cannot
* be in the per semaphore pending queue, and decrements
* cannot be successful if the value is already 0.
*/
- if (semnum != -1 && sma->sem_base[semnum].semval == 0 &&
- q->alter)
+ if (semnum != -1 && sma->sem_base[semnum].semval == 0)
break;

error = try_atomic_semop(sma, q->sops, q->nsops,
@@ -724,6 +800,7 @@ again:
restart = 0;
} else {
semop_completed = 1;
+ do_smart_wakeup_zero(sma, q->sops, q->nsops, pt);
restart = check_restart(sma, q);
}

@@ -742,8 +819,8 @@ again:
* @otime: force setting otime
* @pt: list head of the tasks that must be woken up.
*
- * do_smart_update() does the required called to update_queue, based on the
- * actual changes that were performed on the semaphore array.
+ * do_smart_update() does the required calls to update_queue and wakeup_zero,
+ * based on the actual changes that were performed on the semaphore array.
* Note that the function does not do the actual wake-up: the caller is
* responsible for calling wake_up_sem_queue_do(@pt).
* It is safe to perform this call after dropping all locks.
@@ -754,6 +831,8 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop
int i;
int progress;

+ otime |= do_smart_wakeup_zero(sma, sops, nsops, pt);
+
progress = 1;
retry_global:
if (sma->complex_count) {
@@ -813,14 +892,14 @@ static int count_semncnt (struct sem_array * sma, ushort semnum)
struct sem_queue * q;

semncnt = 0;
- list_for_each_entry(q, &sma->sem_base[semnum].sem_pending, list) {
+ list_for_each_entry(q, &sma->sem_base[semnum].pending_alter, list) {
struct sembuf * sops = q->sops;
BUG_ON(sops->sem_num != semnum);
if ((sops->sem_op < 0) && !(sops->sem_flg & IPC_NOWAIT))
semncnt++;
}

- list_for_each_entry(q, &sma->sem_pending, list) {
+ list_for_each_entry(q, &sma->pending_alter, list) {
struct sembuf * sops = q->sops;
int nsops = q->nsops;
int i;
@@ -839,14 +918,14 @@ static int count_semzcnt (struct sem_array * sma, ushort semnum)
struct sem_queue * q;

semzcnt = 0;
- list_for_each_entry(q, &sma->sem_base[semnum].sem_pending, list) {
+ list_for_each_entry(q, &sma->sem_base[semnum].pending_const, list) {
struct sembuf * sops = q->sops;
BUG_ON(sops->sem_num != semnum);
if ((sops->sem_op == 0) && !(sops->sem_flg & IPC_NOWAIT))
semzcnt++;
}

- list_for_each_entry(q, &sma->sem_pending, list) {
+ list_for_each_entry(q, &sma->pending_const, list) {
struct sembuf * sops = q->sops;
int nsops = q->nsops;
int i;
@@ -884,13 +963,22 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)

/* Wake up all pending processes and let them fail with EIDRM. */
INIT_LIST_HEAD(&tasks);
- list_for_each_entry_safe(q, tq, &sma->sem_pending, list) {
+ list_for_each_entry_safe(q, tq, &sma->pending_const, list) {
+ unlink_queue(sma, q);
+ wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
+ }
+
+ list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
unlink_queue(sma, q);
wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
}
for (i = 0; i < sma->sem_nsems; i++) {
struct sem *sem = sma->sem_base + i;
- list_for_each_entry_safe(q, tq, &sem->sem_pending, list) {
+ list_for_each_entry_safe(q, tq, &sem->pending_const, list) {
+ unlink_queue(sma, q);
+ wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
+ }
+ list_for_each_entry_safe(q, tq, &sem->pending_alter, list) {
unlink_queue(sma, q);
wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
}
@@ -1654,14 +1742,15 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
curr = &sma->sem_base[sops->sem_num];

if (alter)
- list_add_tail(&queue.list, &curr->sem_pending);
+ list_add_tail(&queue.list, &curr->pending_alter);
else
- list_add(&queue.list, &curr->sem_pending);
+ list_add_tail(&queue.list, &curr->pending_const);
} else {
if (alter)
- list_add_tail(&queue.list, &sma->sem_pending);
+ list_add_tail(&queue.list, &sma->pending_alter);
else
- list_add(&queue.list, &sma->sem_pending);
+ list_add_tail(&queue.list, &sma->pending_const);
+
sma->complex_count++;
}

--
1.8.1.4

2013-06-10 17:18:39

by Manfred Spraul

[permalink] [raw]
Subject: [PATCH 1/6] ipc/util.c, ipc_rcu_alloc: cacheline align allocation

Enforce that ipc_rcu_alloc returns a cacheline aligned pointer on SMP.

Rationale:
The SysV sem code tries to move the main spinlock into a separate cacheline
(____cacheline_aligned_in_smp). This works only if ipc_rcu_alloc returns
cacheline-aligned pointers.
vmalloc and kmalloc return cacheline-aligned pointers, but the current
implementation of ipc_rcu_alloc breaks that alignment.
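
To illustrate the point, a stand-alone sketch (assuming a 64-byte cacheline;
the struct names are made up and only mimic the layout): the payload that
follows a small unpadded header starts right behind it and is therefore not
cacheline aligned, unless the header itself is padded to a full cacheline.

#include <stdio.h>
#include <stddef.h>

/* mimics the old struct ipc_rcu: small header, payload right behind it */
struct hdr_unpadded {
    void *rcu_head_placeholder[2];
    int refcount;
    char data[];
};

/* mimics the patched version: header padded out to a full cacheline */
struct hdr_padded {
    void *rcu_head_placeholder[2];
    int refcount;
} __attribute__((aligned(64)));

int main(void)
{
    printf("payload offset without padding: %zu\n",
           offsetof(struct hdr_unpadded, data));  /* 20 on x86-64 */
    printf("payload offset with padding:    %zu\n",
           sizeof(struct hdr_padded));            /* 64 */
    return 0;
}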

Andrew: Could you merge it into -akpm and then forward it towards Linus' tree?

Signed-off-by: Manfred Spraul <[email protected]>
---
ipc/util.c | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/ipc/util.c b/ipc/util.c
index 809ec5e..9623c8e 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -469,9 +469,7 @@ void ipc_free(void* ptr, int size)
struct ipc_rcu {
struct rcu_head rcu;
atomic_t refcount;
- /* "void *" makes sure alignment of following data is sane. */
- void *data[0];
-};
+} ____cacheline_aligned_in_smp;

/**
* ipc_rcu_alloc - allocate ipc and rcu space
@@ -489,12 +487,14 @@ void *ipc_rcu_alloc(int size)
if (unlikely(!out))
return NULL;
atomic_set(&out->refcount, 1);
- return out->data;
+ return out+1;
}

int ipc_rcu_getref(void *ptr)
{
- return atomic_inc_not_zero(&container_of(ptr, struct ipc_rcu, data)->refcount);
+ struct ipc_rcu *p = ((struct ipc_rcu*)ptr)-1;
+
+ return atomic_inc_not_zero(&p->refcount);
}

/**
@@ -508,7 +508,7 @@ static void ipc_schedule_free(struct rcu_head *head)

void ipc_rcu_putref(void *ptr)
{
- struct ipc_rcu *p = container_of(ptr, struct ipc_rcu, data);
+ struct ipc_rcu *p = ((struct ipc_rcu*)ptr)-1;

if (!atomic_dec_and_test(&p->refcount))
return;
--
1.8.1.4

2013-06-14 15:38:40

by Manfred Spraul

[permalink] [raw]
Subject: Re: [PATCH 0/6] ipc/sem.c: performance improvements, FIFO

Hi all,

On 06/10/2013 07:16 PM, Manfred Spraul wrote:
> Hi Andrew,
>
> I have cleaned up/improved my updates to sysv sem.
> Could you replace my patches in -akpm with this series?
>
> - 1: cacheline align output from ipc_rcu_alloc
> - 2: cacheline align semaphore structures
> - 3: seperate-wait-for-zero-and-alter-tasks
> - 4: Always-use-only-one-queue-for-alter-operations
> - 5: Replace the global sem_otime with a distributed otime
> - 6: Rename-try_atomic_semop-to-perform_atomic
Just to keep everyone updated:
I have updated my testapp:
https://github.com/manfred-colorfu/ipcscale/blob/master/sem-waitzero.cpp

Something like this gives a nice output:

# sem-waitzero -t 5 -m 0 | grep 'Cpus' | gawk '{printf("%f - %s\n",$7/$2,$0);}' | sort -n -r

The first number is the number of operations per cpu during 5 seconds.

Mike was kind enough to run it on a 32-core (4-socket) Intel system:
- master doesn't scale at all when multiple sockets are used:
interleave 4: (i.e.: use cpu 0, then 4, then 8 (2nd socket), then 12):
34,717586.000000 - Cpus 1, interleave 4 delay 0: 34717586 in 5 secs
24,507337.500000 - Cpus 2, interleave 4 delay 0: 49014675 in 5 secs
3,487540.000000 - Cpus 3, interleave 4 delay 0: 10462620 in 5 secs
2,708145.000000 - Cpus 4, interleave 4 delay 0: 10832580 in 5 secs
interleave 8: (i.e.: use cpu 0, then 8 (2nd socket)):
34,587329.000000 - Cpus 1, interleave 8 delay 0: 34587329 in 5 secs
7,746981.500000 - Cpus 2, interleave 8 delay 0: 15493963 in 5 secs

- with my patches applied, it scales linearly - but only sometimes
example for good scaling (18 threads in parallel - linear scaling):
33,928616.111111 - Cpus 18, interleave 8 delay 0: 610715090 in 5 secs
example for bad scaling:
5,829109.600000 - Cpus 5, interleave 8 delay 0: 29145548 in 5 secs

For me, it looks like a livelock somewhere:
Good example: all threads contribute the same amount to the final result:
> Result matrix:
> Thread 0: 33476433
> Thread 1: 33697100
> Thread 2: 33514249
> Thread 3: 33657413
> Thread 4: 33727959
> Thread 5: 33580684
> Thread 6: 33530294
> Thread 7: 33666761
> Thread 8: 33749836
> Thread 9: 32636493
> Thread 10: 33550620
> Thread 11: 33403314
> Thread 12: 33594457
> Thread 13: 33331920
> Thread 14: 33503588
> Thread 15: 33585348
> Cpus 16, interleave 8 delay 0: 536206469 in 5 secs
Bad example: one thread is as fast as it should be, others are slow:
> Result matrix:
> Thread 0: 31629540
> Thread 1: 5336968
> Thread 2: 6404314
> Thread 3: 9190595
> Thread 4: 9681006
> Thread 5: 9935421
> Thread 6: 9424324
> Cpus 7, interleave 8 delay 0: 81602168 in 5 secs

The results are not stable: the same test is sometimes fast, sometimes slow.
I have no idea where the livelock could be and I wasn't able to notice
anything on my i3 laptop.

Thus: Who has an idea?
What I can say is that the livelock can't be in do_smart_update(): The
function is never called.

--
Manfred

2013-06-14 19:06:06

by Mike Galbraith

[permalink] [raw]
Subject: Re: [PATCH 0/6] ipc/sem.c: performance improvements, FIFO

On Fri, 2013-06-14 at 17:38 +0200, Manfred Spraul wrote:
> Hi all,
>
> On 06/10/2013 07:16 PM, Manfred Spraul wrote:
> > Hi Andrew,
> >
> > I have cleaned up/improved my updates to sysv sem.
> > Could you replace my patches in -akpm with this series?
> >
> > - 1: cacheline align output from ipc_rcu_alloc
> > - 2: cacheline align semaphore structures
> > - 3: seperate-wait-for-zero-and-alter-tasks
> > - 4: Always-use-only-one-queue-for-alter-operations
> > - 5: Replace the global sem_otime with a distributed otime
> > - 6: Rename-try_atomic_semop-to-perform_atomic
> Just to keep everyone updated:
> I have updated my testapp:
> https://github.com/manfred-colorfu/ipcscale/blob/master/sem-waitzero.cpp
>
> Something like this gives a nice output:
>
> # sem-waitzero -t 5 -m 0 | grep 'Cpus' | gawk '{printf("%f -
> %s\n",$7/$2,$0);}' | sort -n -r
>
> The first number is the number of operations per cpu during 5 seconds.
>
> Mike was kind enough to run in on a 32-core (4-socket) Intel system:
> - master doesn't scale at all when multiple sockets are used:
> interleave 4: (i.e.: use cpu 0, then 4, then 8 (2nd socket), then 12):
> 34,717586.000000 - Cpus 1, interleave 4 delay 0: 34717586 in 5 secs
> 24,507337.500000 - Cpus 2, interleave 4 delay 0: 49014675 in 5 secs
> 3,487540.000000 - Cpus 3, interleave 4 delay 0: 10462620 in 5 secs
> 2,708145.000000 - Cpus 4, interleave 4 delay 0: 10832580 in 5 secs
> interleave 8: (i.e.: use cpu 0, then 8 (2nd socket):
> 34,587329.000000 - Cpus 1, interleave 8 delay 0: 34587329 in 5 secs
> 7,746981.500000 - Cpus 2, interleave 8 delay 0: 15493963 in 5 secs
>
> - with my patches applied, it scales linearly - but only sometimes
> example for good scaling (18 threads in parallel - linear scaling):
> 33,928616.111111 - Cpus 18, interleave 8 delay 0: 610715090 in
> 5 secs
> example for bad scaling:
> 5,829109.600000 - Cpus 5, interleave 8 delay 0: 29145548 in 5 secs
>
> For me, it looks like a livelock somewhere:
> Good example: all threads contribute the same amount to the final result:
> > Result matrix:
> > Thread 0: 33476433
> > Thread 1: 33697100
> > Thread 2: 33514249
> > Thread 3: 33657413
> > Thread 4: 33727959
> > Thread 5: 33580684
> > Thread 6: 33530294
> > Thread 7: 33666761
> > Thread 8: 33749836
> > Thread 9: 32636493
> > Thread 10: 33550620
> > Thread 11: 33403314
> > Thread 12: 33594457
> > Thread 13: 33331920
> > Thread 14: 33503588
> > Thread 15: 33585348
> > Cpus 16, interleave 8 delay 0: 536206469 in 5 secs
> Bad example: one thread is as fast as it should be, others are slow:
> > Result matrix:
> > Thread 0: 31629540
> > Thread 1: 5336968
> > Thread 2: 6404314
> > Thread 3: 9190595
> > Thread 4: 9681006
> > Thread 5: 9935421
> > Thread 6: 9424324
> > Cpus 7, interleave 8 delay 0: 81602168 in 5 secs
>
> The results are not stable: the same test is sometimes fast, sometimes slow.
> I have no idea where the livelock could be and I wasn't able to notice
> anything on my i3 laptop.
>
> Thus: Who has an idea?
> What I can say is that the livelock can't be in do_smart_update(): The
> function is never called.

64 core DL980, using all cores is stable at being horribly _unstable_,
much worse than the 32 core UV2000, but if using only 32 cores, it
becomes considerably more stable than the newer/faster UV box.

32 of 64 DL980 cores, without the -rt-killing goto-again loop removal I
showed you. Unstable, not wonderful throughput.

Result matrix:
Thread 0: 7253945
Thread 1: 9050395
Thread 2: 7708921
Thread 3: 7274316
Thread 4: 9815215
Thread 5: 9924773
Thread 6: 7743325
Thread 7: 8643970
Thread 8: 11268731
Thread 9: 9610031
Thread 10: 7540230
Thread 11: 8432077
Thread 12: 11071762
Thread 13: 10436946
Thread 14: 8051919
Thread 15: 7461884
Thread 16: 11706359
Thread 17: 10512449
Thread 18: 8225636
Thread 19: 7809035
Thread 20: 10465783
Thread 21: 10072878
Thread 22: 7632289
Thread 23: 6758903
Thread 24: 10763830
Thread 25: 8974703
Thread 26: 7054996
Thread 27: 7367430
Thread 28: 9816388
Thread 29: 9622796
Thread 30: 6500835
Thread 31: 7959901

# Events: 802K cycles
#
# Overhead Symbol
# ........ ..........................................
#
18.42% [k] SYSC_semtimedop
15.39% [k] sem_lock
10.26% [k] _raw_spin_lock
9.00% [k] perform_atomic_semop
7.89% [k] system_call
7.70% [k] ipc_obtain_object_check
6.95% [k] ipcperms
6.62% [k] copy_user_generic_string
4.16% [.] __semop
2.57% [.] worker_thread(void*)
2.30% [k] copy_from_user
1.75% [k] sem_unlock
1.25% [k] ipc_obtain_object

With the goto-again loop whacked, it's nearly stable, but not quite, and
throughput mostly looks like so...

Result matrix:
Thread 0: 24164305
Thread 1: 24224024
Thread 2: 24112445
Thread 3: 24076559
Thread 4: 24364901
Thread 5: 24249681
Thread 6: 24048409
Thread 7: 24267064
Thread 8: 24614799
Thread 9: 24330378
Thread 10: 24132766
Thread 11: 24158460
Thread 12: 24456538
Thread 13: 24300952
Thread 14: 24079298
Thread 15: 24100075
Thread 16: 24643074
Thread 17: 24369761
Thread 18: 24151657
Thread 19: 24143953
Thread 20: 24575677
Thread 21: 24169945
Thread 22: 24055378
Thread 23: 24016710
Thread 24: 24548028
Thread 25: 24290316
Thread 26: 24169379
Thread 27: 24119776
Thread 28: 24399737
Thread 29: 24256724
Thread 30: 23914777
Thread 31: 24215780

and profile like so.

# Events: 802K cycles
#
# Overhead Symbol
# ........ ...............................
#
17.38% [k] SYSC_semtimedop
13.26% [k] system_call
11.31% [k] copy_user_generic_string
7.62% [.] __semop
7.18% [k] _raw_spin_lock
5.66% [k] ipcperms
5.40% [k] sem_lock
4.65% [k] perform_atomic_semop
4.22% [k] ipc_obtain_object_check
4.08% [.] worker_thread(void*)
4.06% [k] copy_from_user
2.40% [k] ipc_obtain_object
1.98% [k] pid_vnr
1.45% [k] wake_up_sem_queue_do
1.39% [k] sys_semop
1.35% [k] sys_semtimedop
1.30% [k] sem_unlock
1.14% [k] security_ipc_permission

So that goto again loop is not only an -rt killer, it seems to be part
of the instability picture too.

Back to virgin source + your patch series

Using 64 cores, with or without the loop removed, it's uniformly unstable as
hell. With the goto-again loop removed, it improves some, but not much, so the
loop isn't the biggest deal, except to -rt, where it's utterly deadly.

Result matrix:
Thread 0: 997088
Thread 1: 1962065
Thread 2: 117899
Thread 3: 125918
Thread 4: 80233
Thread 5: 85001
Thread 6: 88413
Thread 7: 104424
Thread 8: 1549782
Thread 9: 2172206
Thread 10: 119314
Thread 11: 127109
Thread 12: 81179
Thread 13: 89026
Thread 14: 91497
Thread 15: 103410
Thread 16: 1661969
Thread 17: 2223131
Thread 18: 119739
Thread 19: 126294
Thread 20: 81172
Thread 21: 87850
Thread 22: 90621
Thread 23: 102964
Thread 24: 1641042
Thread 25: 2152851
Thread 26: 118818
Thread 27: 125801
Thread 28: 79316
Thread 29: 99029
Thread 30: 101513
Thread 31: 91206
Thread 32: 1825614
Thread 33: 2432801
Thread 34: 120599
Thread 35: 131854
Thread 36: 81346
Thread 37: 103464
Thread 38: 105223
Thread 39: 101554
Thread 40: 1980013
Thread 41: 2574055
Thread 42: 122887
Thread 43: 131096
Thread 44: 80521
Thread 45: 105162
Thread 46: 110329
Thread 47: 104078
Thread 48: 1925173
Thread 49: 2552441
Thread 50: 123806
Thread 51: 134857
Thread 52: 82148
Thread 53: 105312
Thread 54: 109728
Thread 55: 107766
Thread 56: 1999696
Thread 57: 2699455
Thread 58: 128375
Thread 59: 128289
Thread 60: 80071
Thread 61: 106968
Thread 62: 111768
Thread 63: 115243

# Events: 1M cycles
#
# Overhead Symbol
# ........ .......................................
#
30.73% [k] ipc_obtain_object_check
29.46% [k] sem_lock
25.12% [k] ipcperms
4.93% [k] SYSC_semtimedop
4.35% [k] perform_atomic_semop
2.83% [k] _raw_spin_lock
0.40% [k] system_call

ipc_obtain_object_check():

: * Call inside the RCU critical section.
: * The ipc object is *not* locked on exit.
: */
: struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id)
: {
: struct kern_ipc_perm *out = ipc_obtain_object(ids, id);
0.00 : ffffffff81256a2b: 48 89 c2 mov %rax,%rdx
:
: if (IS_ERR(out))
0.02 : ffffffff81256a2e: 77 20 ja ffffffff81256a50 <ipc_obtain_object_check+0x40>
: goto out;
:
: if (ipc_checkid(out, id))
0.00 : ffffffff81256a30: 8d 83 ff 7f 00 00 lea 0x7fff(%rbx),%eax
0.00 : ffffffff81256a36: 85 db test %ebx,%ebx
0.00 : ffffffff81256a38: 0f 48 d8 cmovs %eax,%ebx
0.02 : ffffffff81256a3b: c1 fb 0f sar $0xf,%ebx
0.00 : ffffffff81256a3e: 48 63 c3 movslq %ebx,%rax
0.00 : ffffffff81256a41: 48 3b 42 28 cmp 0x28(%rdx),%rax
99.84 : ffffffff81256a45: 48 c7 c0 d5 ff ff ff mov $0xffffffffffffffd5,%rax
0.00 : ffffffff81256a4c: 48 0f 45 d0 cmovne %rax,%rdx
: return ERR_PTR(-EIDRM);
: out:
: return out;
: }
0.03 : ffffffff81256a50: 48 83 c4 08 add $0x8,%rsp
0.00 : ffffffff81256a54: 48 89 d0 mov %rdx,%rax
0.02 : ffffffff81256a57: 5b pop %rbx
0.00 : ffffffff81256a58: c9 leaveq

sem_lock():

: static inline void spin_lock(spinlock_t *lock)
: {
: raw_spin_lock(&lock->rlock);
0.10 : ffffffff81258a7c: 4c 8d 6b 08 lea 0x8(%rbx),%r13
0.01 : ffffffff81258a80: 4c 89 ef mov %r13,%rdi
0.01 : ffffffff81258a83: e8 08 4f 35 00 callq ffffffff815ad990 <_raw_spin_lock>
:
: /*
: * If sma->complex_count was set while we were spinning,
: * we may need to look at things we did not lock here.
: */
: if (unlikely(sma->complex_count)) {
0.02 : ffffffff81258a88: 41 8b 44 24 7c mov 0x7c(%r12),%eax
6.18 : ffffffff81258a8d: 85 c0 test %eax,%eax
0.00 : ffffffff81258a8f: 75 29 jne ffffffff81258aba <sem_lock+0x7a>
: __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
: }
:
: static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
: {
: struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
0.00 : ffffffff81258a91: 41 0f b7 54 24 02 movzwl 0x2(%r12),%edx
84.33 : ffffffff81258a97: 41 0f b7 04 24 movzwl (%r12),%eax
: /*
: * Another process is holding the global lock on the
: * sem_array; we cannot enter our critical section,
: * but have to wait for the global lock to be released.
: */
: if (unlikely(spin_is_locked(&sma->sem_perm.lock))) {
0.42 : ffffffff81258a9c: 66 39 c2 cmp %ax,%dx
0.01 : ffffffff81258a9f: 75 76 jne ffffffff81258b17 <sem_lock+0xd7>
: spin_unlock(&sem->lock);
: spin_unlock_wait(&sma->sem_perm.lock);
: goto again;


ipcperms():

: static inline int audit_dummy_context(void)
: {
: void *p = current->audit_context;
0.01 : ffffffff81255f9e: 48 8b 82 d0 05 00 00 mov 0x5d0(%rdx),%rax
: return !p || *(int *)p;
0.01 : ffffffff81255fa5: 48 85 c0 test %rax,%rax
0.00 : ffffffff81255fa8: 74 06 je ffffffff81255fb0 <ipcperms+0x50>
0.00 : ffffffff81255faa: 8b 00 mov (%rax),%eax
0.00 : ffffffff81255fac: 85 c0 test %eax,%eax
0.00 : ffffffff81255fae: 74 60 je ffffffff81256010 <ipcperms+0xb0>
: int requested_mode, granted_mode;
:
: audit_ipc_obj(ipcp);
: requested_mode = (flag >> 6) | (flag >> 3) | flag;
: granted_mode = ipcp->mode;
: if (uid_eq(euid, ipcp->cuid) ||
0.02 : ffffffff81255fb0: 45 3b 6c 24 18 cmp 0x18(%r12),%r13d
: kuid_t euid = current_euid();
: int requested_mode, granted_mode;
:
: audit_ipc_obj(ipcp);
: requested_mode = (flag >> 6) | (flag >> 3) | flag;
: granted_mode = ipcp->mode;
99.18 : ffffffff81255fb5: 41 0f b7 5c 24 20 movzwl 0x20(%r12),%ebx
: if (uid_eq(euid, ipcp->cuid) ||
0.46 : ffffffff81255fbb: 74 07 je ffffffff81255fc4 <ipcperms+0x64>
0.00 : ffffffff81255fbd: 45 3b 6c 24 10 cmp 0x10(%r12),%r13d
0.00 : ffffffff81255fc2: 75 5c jne ffffffff81256020 <ipcperms+0xc0>
: uid_eq(euid, ipcp->uid))
: granted_mode >>= 6;
0.02 : ffffffff81255fc4: c1 fb 06 sar $0x6,%ebx
: else if (in_group_p(ipcp->cgid) || in_group_p(ipcp->gid))
: granted_mode >>= 3;
: /* is there some bit set in requested_mode but not in granted_mode? */
: if ((requested_mode & ~granted_mode & 0007) &&
0.00 : ffffffff81255fc7: 44 89 f0 mov %r14d,%eax
0.00 : ffffffff81255fca: 44 89 f2 mov %r14d,%edx
0.00 : ffffffff81255fcd: f7 d3 not %ebx
0.02 : ffffffff81255fcf: 66 c1 f8 06 sar $0x6,%ax
0.00 : ffffffff81255fd3: 66 c1 fa 03 sar $0x3,%dx
0.00 : ffffffff81255fd7: 09 d0 or %edx,%eax
0.02 : ffffffff81255fd9: 44 09 f0 or %r14d,%eax
0.00 : ffffffff81255fdc: 83 e0 07 and $0x7,%eax
0.00 : ffffffff81255fdf: 85 d8 test %ebx,%eax
0.00 : ffffffff81255fe1: 75 75 jne ffffffff81256058 <ipcperms+0xf8>
: !ns_capable(ns-

2013-06-15 05:27:22

by Manfred Spraul

[permalink] [raw]
Subject: Re: [PATCH 0/6] ipc/sem.c: performance improvements, FIFO

On 06/14/2013 09:05 PM, Mike Galbraith wrote:
> 32 of 64 cores DL980 without the -rt killing goto again loop removal I
> showed you. Unstable, not wonderful throughput.
Unfortunately the -rt approach is definitely unstable:
> @@ -285,9 +274,29 @@ static inline int sem_lock(struct sem_ar
> * but have to wait for the global lock to be released.
> */
> if (unlikely(spin_is_locked(&sma->sem_perm.lock))) {
> - spin_unlock(&sem->lock);
> - spin_unlock_wait(&sma->sem_perm.lock);
> - goto again;
> + spin_lock(&sma->sem_perm.lock);
> + if (sma->complex_count)
> + goto wait_array;
> +
> + /*
> + * Acquiring our sem->lock under the global lock
> + * forces new complex operations to wait for us
> + * to exit our critical section.
> + */
> + spin_lock(&sem->lock);
> + spin_unlock(&sma->sem_perm.lock);

Assume there is one op (semctl(), whatever) that acquires the global
lock - and a continuous stream of simple ops.
- spin_is_locked() returns true due to the semctl().
- then simple ops will switch to spin_lock(&sma->sem_perm.lock).
- since the spinlock is acquired, the next operation will get true from
spin_is_locked().

It will stay that way as long as there is at least one op
waiting for sma->sem_perm.lock.
With enough cpus, it will stay like this forever.

--
Manfred

2013-06-15 05:48:47

by Mike Galbraith

[permalink] [raw]
Subject: Re: [PATCH 0/6] ipc/sem.c: performance improvements, FIFO

On Sat, 2013-06-15 at 07:27 +0200, Manfred Spraul wrote:

> Assume there is one op (semctl(), whatever) that acquires the global
> lock - and a continuous stream of simple ops.
> - spin_is_locked() returns true due to the semctl().
> - then simple ops will switch to spin_lock(&sma->sem_perm.lock).
> - since the spinlock is acquired, the next operation will get true from
> spin_is_locked().
>
> It will stay that way around - as long as there is at least one op
> waiting for sma->sem_perm.lock.
> With enough cpus, it will stay like this forever.

Yup, pondered that yesterday, scratching my head over how to do better.
Hints highly welcome. Maybe if I figure out how to scratch dual lock
thingy properly for -rt, non-rt will start acting sane too, as that spot
seems to be itchy in both kernels.

-Mike

2013-06-15 07:30:15

by Mike Galbraith

[permalink] [raw]
Subject: Re: [PATCH 0/6] ipc/sem.c: performance improvements, FIFO

On Sat, 2013-06-15 at 07:48 +0200, Mike Galbraith wrote:
> On Sat, 2013-06-15 at 07:27 +0200, Manfred Spraul wrote:
>
> > Assume there is one op (semctl(), whatever) that acquires the global
> > lock - and a continuous stream of simple ops.
> > - spin_is_locked() returns true due to the semctl().
> > - then simple ops will switch to spin_lock(&sma->sem_perm.lock).
> > - since the spinlock is acquired, the next operation will get true from
> > spin_is_locked().
> >
> > It will stay that way around - as long as there is at least one op
> > waiting for sma->sem_perm.lock.
> > With enough cpus, it will stay like this forever.
>
> Yup, pondered that yesterday, scratching my head over how to do better.
> Hints highly welcome. Maybe if I figure out how to scratch dual lock
> thingy properly for -rt, non-rt will start acting sane too, as that spot
> seems to be itchy in both kernels.

Gee, just trying to flip back to a single semaphore lock mode if you had
to do the global wait thing fixed up -rt. 10 consecutive sem-waitzero 5
8 64 runs with the 3.8-rt9 kernel went like so, which is one hell of an
improvement.

Result matrix:
Thread 0: 20209311
Thread 1: 20255372
Thread 2: 20082611
...
Thread 61: 20162924
Thread 62: 20048995
Thread 63: 20142689

I must have screwed up something :)

static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
int nsops)
{
struct sem *sem;
int locknum;

if (nsops == 1 && !sma->complex_count) {
sem = sma->sem_base + sops->sem_num;

/*
* Another process is holding the global lock on the
* sem_array; we cannot enter our critical section,
* but have to wait for the global lock to be released.
*/
if (unlikely(spin_is_locked(&sma->sem_perm.lock))) {
spin_lock(&sma->sem_perm.lock);
if (sma->complex_count)
goto wait_array;

/*
* Acquiring our sem->lock under the global lock
* forces new complex operations to wait for us
* to exit our critical section.
*/
spin_lock(&sem->lock);
spin_unlock(&sma->sem_perm.lock);
} else {
/* Lock just the semaphore we are interested in. */
spin_lock(&sem->lock);

/*
* If sma->complex_count was set prior to acquisition,
* we must fall back to the global array lock.
*/
if (unlikely(sma->complex_count)) {
spin_unlock(&sem->lock);
goto lock_array;
}
}

locknum = sops->sem_num;
} else {
int i;
/*
* Lock the semaphore array, and wait for all of the
* individual semaphore locks to go away. The code
* above ensures no new single-lock holders will enter
* their critical section while the array lock is held.
*/
lock_array:
spin_lock(&sma->sem_perm.lock);
wait_array:
for (i = 0; i < sma->sem_nsems; i++) {
sem = sma->sem_base + i;
#ifdef CONFIG_PREEMPT_RT_BASE
if (spin_is_locked(&sem->lock))
#endif
spin_unlock_wait(&sem->lock);
}
locknum = -1;

if (nsops == 1 && !sma->complex_count) {
sem = sma->sem_base + sops->sem_num;
spin_lock(&sem->lock);
spin_unlock(&sma->sem_perm.lock);
locknum = sops->sem_num;
}
}
return locknum;
}

Not very pretty, but it works markedly better.

2013-06-15 08:36:30

by Mike Galbraith

[permalink] [raw]
Subject: Re: [PATCH 0/6] ipc/sem.c: performance improvements, FIFO

On Sat, 2013-06-15 at 09:30 +0200, Mike Galbraith wrote:

> Gee, just trying to flip back to a single semaphore lock mode if you had
> to do the global wait thing fixed up -rt.

But master wants a sharper rock tossed at it, it's still unstable.

Result matrix:
Thread 0: 11190947
Thread 1: 11743861
Thread 2: 10039223
Thread 3: 9776111
Thread 4: 9734815
Thread 5: 9941121
Thread 6: 11604118
Thread 7: 11716281
Thread 8: 9828587
Thread 9: 9589199
...

And that's one of the better samples. Probably -rt sleepy locks help
get us flipped back to single sem lock mode.

Hohum. On the bright side, -rt gets to scale better than mainline for a
little while, that's kinda different.

-Mike

2013-06-15 11:10:23

by Manfred Spraul

[permalink] [raw]
Subject: Re: [PATCH 0/6] ipc/sem.c: performance improvements, FIFO

On 06/14/2013 09:05 PM, Mike Galbraith wrote:
> # Events: 802K cycles
> #
> # Overhead Symbol
> # ........ ..........................................
> #
> 18.42% [k] SYSC_semtimedop
> 15.39% [k] sem_lock
> 10.26% [k] _raw_spin_lock
> 9.00% [k] perform_atomic_semop
> 7.89% [k] system_call
> 7.70% [k] ipc_obtain_object_check
> 6.95% [k] ipcperms
> 6.62% [k] copy_user_generic_string
> 4.16% [.] __semop
> 2.57% [.] worker_thread(void*)
> 2.30% [k] copy_from_user
> 1.75% [k] sem_unlock
> 1.25% [k] ipc_obtain_object
~280 million ops:
2.3% copy_from_user,
9% perform_atomic_semop.

> # Events: 802K cycles
> #
> # Overhead Symbol
> # ........ ...............................
> #
> 17.38% [k] SYSC_semtimedop
> 13.26% [k] system_call
> 11.31% [k] copy_user_generic_string
> 7.62% [.] __semop
> 7.18% [k] _raw_spin_lock
> 5.66% [k] ipcperms
> 5.40% [k] sem_lock
> 4.65% [k] perform_atomic_semop
> 4.22% [k] ipc_obtain_object_check
> 4.08% [.] worker_thread(void*)
> 4.06% [k] copy_from_user
> 2.40% [k] ipc_obtain_object
> 1.98% [k] pid_vnr
> 1.45% [k] wake_up_sem_queue_do
> 1.39% [k] sys_semop
> 1.35% [k] sys_semtimedop
> 1.30% [k] sem_unlock
> 1.14% [k] security_ipc_permission
~700 million ops:
4% copy_from_user -> as expected a bit more,
4.6% perform_atomic_semop -> less.

Thus: Could you send the oprofile output from perform_atomic_semop()?

Perhaps that gives us a hint.

My current guess:
sem_lock() somehow ends up in lock_array.
The lock_array path scans all struct sem entries -> their cachelines are
transferred from all cpus to the cpu that does the lock_array.
Then the next write by the "correct" cpu causes a transfer back when
setting sem->pid.

--
Manfred

2013-06-15 11:37:13

by Mike Galbraith

[permalink] [raw]
Subject: Re: [PATCH 0/6] ipc/sem.c: performance improvements, FIFO

On Sat, 2013-06-15 at 13:10 +0200, Manfred Spraul wrote:
> On 06/14/2013 09:05 PM, Mike Galbraith wrote:
> > # Events: 802K cycles
> > #
> > # Overhead Symbol
> > # ........ ..........................................
> > #
> > 18.42% [k] SYSC_semtimedop
> > 15.39% [k] sem_lock
> > 10.26% [k] _raw_spin_lock
> > 9.00% [k] perform_atomic_semop
> > 7.89% [k] system_call
> > 7.70% [k] ipc_obtain_object_check
> > 6.95% [k] ipcperms
> > 6.62% [k] copy_user_generic_string
> > 4.16% [.] __semop
> > 2.57% [.] worker_thread(void*)
> > 2.30% [k] copy_from_user
> > 1.75% [k] sem_unlock
> > 1.25% [k] ipc_obtain_object
> ~ 280 mio ops.
> 2.3% copy_from_user,
> 9% perform_atomic_semop.
>
> > # Events: 802K cycles
> > #
> > # Overhead Symbol
> > # ........ ...............................
> > #
> > 17.38% [k] SYSC_semtimedop
> > 13.26% [k] system_call
> > 11.31% [k] copy_user_generic_string
> > 7.62% [.] __semop
> > 7.18% [k] _raw_spin_lock
> > 5.66% [k] ipcperms
> > 5.40% [k] sem_lock
> > 4.65% [k] perform_atomic_semop
> > 4.22% [k] ipc_obtain_object_check
> > 4.08% [.] worker_thread(void*)
> > 4.06% [k] copy_from_user
> > 2.40% [k] ipc_obtain_object
> > 1.98% [k] pid_vnr
> > 1.45% [k] wake_up_sem_queue_do
> > 1.39% [k] sys_semop
> > 1.35% [k] sys_semtimedop
> > 1.30% [k] sem_unlock
> > 1.14% [k] security_ipc_permission
> ~ 700 mio ops.
> 4% copy_from_user -> as expected a bit more
> 4.6% perform_atomic_semop --> less.
>
> Thus: Could you send the oprofile output from perform_atomic_semop()?

Ok, newly profiled 32 core run.


Percent | Source code & Disassembly of vmlinux
------------------------------------------------
:
:
:
: Disassembly of section .text:
:
: ffffffff812584d0 <perform_atomic_semop>:
: * Negative values are error codes.
: */
:
: static int perform_atomic_semop(struct sem_array *sma, struct sembuf *sops,
: int nsops, struct sem_undo *un, int pid)
: {
3.70 : ffffffff812584d0: 55 push %rbp
0.00 : ffffffff812584d1: 48 89 e5 mov %rsp,%rbp
0.00 : ffffffff812584d4: 41 54 push %r12
3.40 : ffffffff812584d6: 53 push %rbx
0.00 : ffffffff812584d7: e8 64 dc 35 00 callq ffffffff815b6140 <mcount>
: int result, sem_op;
: struct sembuf *sop;
: struct sem * curr;
:
: for (sop = sops; sop < sops + nsops; sop++) {
0.00 : ffffffff812584dc: 48 63 d2 movslq %edx,%rdx
: * Negative values are error codes.
: */
:
: static int perform_atomic_semop(struct sem_array *sma, struct sembuf *sops,
: int nsops, struct sem_undo *un, int pid)
: {
0.00 : ffffffff812584df: 45 89 c4 mov %r8d,%r12d
3.62 : ffffffff812584e2: 48 89 cb mov %rcx,%rbx
: int result, sem_op;
: struct sembuf *sop;
: struct sem * curr;
:
: for (sop = sops; sop < sops + nsops; sop++) {
0.00 : ffffffff812584e5: 48 8d 14 52 lea (%rdx,%rdx,2),%rdx
0.00 : ffffffff812584e9: 49 89 f2 mov %rsi,%r10
0.00 : ffffffff812584ec: 4c 8d 04 56 lea (%rsi,%rdx,2),%r8
3.53 : ffffffff812584f0: 4c 39 c6 cmp %r8,%rsi
0.00 : ffffffff812584f3: 0f 83 17 01 00 00 jae ffffffff81258610 <perform_atomic_semop+0x140>
: curr = sma->sem_base + sop->sem_num;
0.00 : ffffffff812584f9: 0f b7 0e movzwl (%rsi),%ecx
: sem_op = sop->sem_op;
0.00 : ffffffff812584fc: 0f bf 56 02 movswl 0x2(%rsi),%edx
: int result, sem_op;
: struct sembuf *sop;
: struct sem * curr;
:
: for (sop = sops; sop < sops + nsops; sop++) {
: curr = sma->sem_base + sop->sem_num;
0.00 : ffffffff81258500: 49 89 c9 mov %rcx,%r9
3.75 : ffffffff81258503: 49 c1 e1 06 shl $0x6,%r9
0.00 : ffffffff81258507: 4c 03 4f 40 add 0x40(%rdi),%r9
: sem_op = sop->sem_op;
: result = curr->semval;
:
: if (!sem_op && result)
4.52 : ffffffff8125850b: 85 d2 test %edx,%edx
: struct sem * curr;
:
: for (sop = sops; sop < sops + nsops; sop++) {
: curr = sma->sem_base + sop->sem_num;
: sem_op = sop->sem_op;
: result = curr->semval;
0.00 : ffffffff8125850d: 41 8b 01 mov (%r9),%eax
:
: if (!sem_op && result)
18.66 : ffffffff81258510: 0f 84 e2 00 00 00 je ffffffff812585f8 <perform_atomic_semop+0x128>
: goto would_block;
:
: result += sem_op;
: if (result < 0)
3.52 : ffffffff81258516: 41 89 d3 mov %edx,%r11d
0.00 : ffffffff81258519: 41 01 c3 add %eax,%r11d
0.00 : ffffffff8125851c: 0f 88 de 00 00 00 js ffffffff81258600 <perform_atomic_semop+0x130>
: goto would_block;
: if (result > SEMVMX)
0.00 : ffffffff81258522: 41 81 fb ff 7f 00 00 cmp $0x7fff,%r11d
3.84 : ffffffff81258529: 49 89 f2 mov %rsi,%r10
0.00 : ffffffff8125852c: 0f 8f bb 00 00 00 jg ffffffff812585ed <perform_atomic_semop+0x11d>
0.00 : ffffffff81258532: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
: goto out_of_range;
: if (sop->sem_flg & SEM_UNDO) {
0.00 : ffffffff81258538: 41 f6 42 05 10 testb $0x10,0x5(%r10)
3.66 : ffffffff8125853d: 74 1a je ffffffff81258559 <perform_atomic_semop+0x89>
: int undo = un->semadj[sop->sem_num] - sem_op;
: /*
: * Exceeding the undo range is an error.
: */
: if (undo < (-SEMAEM - 1) || undo > SEMAEM)
0.00 : ffffffff8125853f: 48 8b 43 40 mov 0x40(%rbx),%rax
0.00 : ffffffff81258543: 0f bf 04 48 movswl (%rax,%rcx,2),%eax
0.00 : ffffffff81258547: 29 d0 sub %edx,%eax
0.00 : ffffffff81258549: 05 00 80 00 00 add $0x8000,%eax
0.00 : ffffffff8125854e: 3d ff ff 00 00 cmp $0xffff,%eax
0.00 : ffffffff81258553: 0f 87 94 00 00 00 ja ffffffff812585ed <perform_atomic_semop+0x11d>
: {
: int result, sem_op;
: struct sembuf *sop;
: struct sem * curr;
:
: for (sop = sops; sop < sops + nsops; sop++) {
3.70 : ffffffff81258559: 49 83 c2 06 add $0x6,%r10
: * Exceeding the undo range is an error.
: */
: if (undo < (-SEMAEM - 1) || undo > SEMAEM)
: goto out_of_range;
: }
: curr->semval = result;
0.01 : ffffffff8125855d: 45 89 19 mov %r11d,(%r9)
: {
: int result, sem_op;
: struct sembuf *sop;
: struct sem * curr;
:
: for (sop = sops; sop < sops + nsops; sop++) {
0.01 : ffffffff81258560: 4d 39 c2 cmp %r8,%r10
0.00 : ffffffff81258563: 0f 83 a7 00 00 00 jae ffffffff81258610 <perform_atomic_semop+0x140>
: curr = sma->sem_base + sop->sem_num;
0.00 : ffffffff81258569: 41 0f b7 0a movzwl (%r10),%ecx
: sem_op = sop->sem_op;
0.00 : ffffffff8125856d: 41 0f bf 52 02 movswl 0x2(%r10),%edx
: int result, sem_op;
: struct sembuf *sop;
: struct sem * curr;
:
: for (sop = sops; sop < sops + nsops; sop++) {
: curr = sma->sem_base + sop->sem_num;
0.00 : ffffffff81258572: 49 89 c9 mov %rcx,%r9
0.00 : ffffffff81258575: 49 c1 e1 06 shl $0x6,%r9
0.00 : ffffffff81258579: 4c 03 4f 40 add 0x40(%rdi),%r9
: sem_op = sop->sem_op;
: result = curr->semval;
:
: if (!sem_op && result)
0.00 : ffffffff8125857d: 85 d2 test %edx,%edx
: struct sem * curr;
:
: for (sop = sops; sop < sops + nsops; sop++) {
: curr = sma->sem_base + sop->sem_num;
: sem_op = sop->sem_op;
: result = curr->semval;
0.00 : ffffffff8125857f: 41 8b 01 mov (%r9),%eax
:
: if (!sem_op && result)
0.00 : ffffffff81258582: 75 54 jne ffffffff812585d8 <perform_atomic_semop+0x108>
0.00 : ffffffff81258584: 85 c0 test %eax,%eax
0.00 : ffffffff81258586: 74 50 je ffffffff812585d8 <perform_atomic_semop+0x108>
:
: out_of_range:
: result = -ERANGE;
: goto undo;
:
: would_block:
0.00 : ffffffff81258588: 4c 89 d0 mov %r10,%rax
: if (sop->sem_flg & IPC_NOWAIT)
0.00 : ffffffff8125858b: 0f bf 40 04 movswl 0x4(%rax),%eax
0.00 : ffffffff8125858f: 25 00 08 00 00 and $0x800,%eax
0.00 : ffffffff81258594: 83 f8 01 cmp $0x1,%eax
0.00 : ffffffff81258597: 45 19 c0 sbb %r8d,%r8d
0.00 : ffffffff8125859a: 41 83 e0 0c and $0xc,%r8d
0.00 : ffffffff8125859e: 41 83 e8 0b sub $0xb,%r8d
: result = -EAGAIN;
: else
: result = 1;
:
: undo:
: sop--;
0.00 : ffffffff812585a2: 49 8d 4a fa lea -0x6(%r10),%rcx
: while (sop >= sops) {
0.00 : ffffffff812585a6: 48 39 ce cmp %rcx,%rsi
0.00 : ffffffff812585a9: 77 1f ja ffffffff812585ca <perform_atomic_semop+0xfa>
0.00 : ffffffff812585ab: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
: sma->sem_base[sop->sem_num].semval -= sop->sem_op;
0.00 : ffffffff812585b0: 0f b7 01 movzwl (%rcx),%eax
0.00 : ffffffff812585b3: 0f bf 51 02 movswl 0x2(%rcx),%edx
: sop--;
0.00 : ffffffff812585b7: 48 83 e9 06 sub $0x6,%rcx
: result = 1;
:
: undo:
: sop--;
: while (sop >= sops) {
: sma->sem_base[sop->sem_num].semval -= sop->sem_op;
0.00 : ffffffff812585bb: 48 c1 e0 06 shl $0x6,%rax
0.00 : ffffffff812585bf: 48 03 47 40 add 0x40(%rdi),%rax
0.00 : ffffffff812585c3: 29 10 sub %edx,(%rax)
: else
: result = 1;
:
: undo:
: sop--;
: while (sop >= sops) {
0.00 : ffffffff812585c5: 48 39 ce cmp %rcx,%rsi
0.00 : ffffffff812585c8: 76 e6 jbe ffffffff812585b0 <perform_atomic_semop+0xe0>
: sma->sem_base[sop->sem_num].semval -= sop->sem_op;
: sop--;
: }
:
: return result;
: }
0.00 : ffffffff812585ca: 5b pop %rbx
0.00 : ffffffff812585cb: 44 89 c0 mov %r8d,%eax
0.00 : ffffffff812585ce: 41 5c pop %r12
0.00 : ffffffff812585d0: c9 leaveq
0.00 : ffffffff812585d1: c3 retq
0.00 : ffffffff812585d2: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
:
: if (!sem_op && result)
: goto would_block;
:
: result += sem_op;
: if (result < 0)
0.00 : ffffffff812585d8: 41 89 d3 mov %edx,%r11d
0.00 : ffffffff812585db: 41 01 c3 add %eax,%r11d
0.00 : ffffffff812585de: 78 a8 js ffffffff81258588 <perform_atomic_semop+0xb8>
: goto would_block;
: if (result > SEMVMX)
0.00 : ffffffff812585e0: 41 81 fb ff 7f 00 00 cmp $0x7fff,%r11d
0.00 : ffffffff812585e7: 0f 8e 4b ff ff ff jle ffffffff81258538 <perform_atomic_semop+0x68>
: if (sop->sem_flg & IPC_NOWAIT)
: result = -EAGAIN;
: else
: result = 1;
:
: undo:
0.00 : ffffffff812585ed: 41 b8 de ff ff ff mov $0xffffffde,%r8d
0.00 : ffffffff812585f3: eb ad jmp ffffffff812585a2 <perform_atomic_semop+0xd2>
0.00 : ffffffff812585f5: 0f 1f 00 nopl (%rax)
: for (sop = sops; sop < sops + nsops; sop++) {
: curr = sma->sem_base + sop->sem_num;
: sem_op = sop->sem_op;
: result = curr->semval;
:
: if (!sem_op && result)
3.56 : ffffffff812585f8: 85 c0 test %eax,%eax
0.00 : ffffffff812585fa: 0f 84 16 ff ff ff je ffffffff81258516 <perform_atomic_semop+0x46>
:
: out_of_range:
: result = -ERANGE;
: goto undo;
:
: would_block:
0.00 : ffffffff81258600: 48 89 f0 mov %rsi,%rax
0.00 : ffffffff81258603: 49 89 f2 mov %rsi,%r10
0.00 : ffffffff81258606: e9 80 ff ff ff jmpq ffffffff8125858b <perform_atomic_semop+0xbb>
0.00 : ffffffff8125860b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
: goto out_of_range;
: }
: curr->semval = result;
: }
:
: sop--;
3.58 : ffffffff81258610: 4d 8d 4a fa lea -0x6(%r10),%r9
: while (sop >= sops) {
0.00 : ffffffff81258614: 4c 39 ce cmp %r9,%rsi
0.00 : ffffffff81258617: 77 3b ja ffffffff81258654 <perform_atomic_semop+0x184>
0.00 : ffffffff81258619: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
: sma->sem_base[sop->sem_num].sempid = pid;
0.00 : ffffffff81258620: 41 0f b7 01 movzwl (%r9),%eax
3.51 : ffffffff81258624: 48 8b 57 40 mov 0x40(%rdi),%rdx
22.37 : ffffffff81258628: 48 c1 e0 06 shl $0x6,%rax
0.00 : ffffffff8125862c: 44 89 64 02 04 mov %r12d,0x4(%rdx,%rax,1)
: if (sop->sem_flg & SEM_UNDO)
3.79 : ffffffff81258631: 41 f6 41 05 10 testb $0x10,0x5(%r9)
0.00 : ffffffff81258636: 74 13 je ffffffff8125864b <perform_atomic_semop+0x17b>
: un->semadj[sop->sem_num] -= sop->sem_op;
0.00 : ffffffff81258638: 41 0f b7 01 movzwl (%r9),%eax
0.00 : ffffffff8125863c: 41 0f b7 51 02 movzwl 0x2(%r9),%edx
0.00 : ffffffff81258641: 48 01 c0 add %rax,%rax
0.00 : ffffffff81258644: 48 03 43 40 add 0x40(%rbx),%rax
0.00 : ffffffff81258648: 66 29 10 sub %dx,(%rax)
: sop--;
3.58 : ffffffff8125864b: 49 83 e9 06 sub $0x6,%r9
: }
: curr->semval = result;
: }
:
: sop--;
: while (sop >= sops) {
0.00 : ffffffff8125864f: 4c 39 ce cmp %r9,%rsi
0.00 : ffffffff81258652: 76 cc jbe ffffffff81258620 <perform_atomic_semop+0x150>
: sma->sem_base[sop->sem_num].semval -= sop->sem_op;
: sop--;
: }
:
: return result;
: }
0.00 : ffffffff81258654: 5b pop %rbx
: else
: result = 1;
:
: undo:
: sop--;
: while (sop >= sops) {
0.00 : ffffffff81258655: 45 31 c0 xor %r8d,%r8d
: sma->sem_base[sop->sem_num].semval -= sop->sem_op;
: sop--;
: }
:
: return result;
: }
3.67 : ffffffff81258658: 44 89 c0 mov %r8d,%eax
0.00 : ffffffff8125865b: 41 5c pop %r12
0.00 : ffffffff8125865d: c9 leaveq
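
For easier reading, here is the perform_atomic_semop() body reassembled
from the source lines interleaved in the annotation above (a convenience
copy for following the profile; the file in the tree is authoritative):

static int perform_atomic_semop(struct sem_array *sma, struct sembuf *sops,
				int nsops, struct sem_undo *un, int pid)
{
	int result, sem_op;
	struct sembuf *sop;
	struct sem *curr;

	for (sop = sops; sop < sops + nsops; sop++) {
		curr = sma->sem_base + sop->sem_num;
		sem_op = sop->sem_op;
		/* the 18.66% above lands on the branch right after this load */
		result = curr->semval;

		if (!sem_op && result)
			goto would_block;

		result += sem_op;
		if (result < 0)
			goto would_block;
		if (result > SEMVMX)
			goto out_of_range;
		if (sop->sem_flg & SEM_UNDO) {
			int undo = un->semadj[sop->sem_num] - sem_op;
			/* Exceeding the undo range is an error. */
			if (undo < (-SEMAEM - 1) || undo > SEMAEM)
				goto out_of_range;
		}
		curr->semval = result;
	}

	sop--;
	while (sop >= sops) {
		/* the 22.37% above lands right after the sma->sem_base load */
		sma->sem_base[sop->sem_num].sempid = pid;
		if (sop->sem_flg & SEM_UNDO)
			un->semadj[sop->sem_num] -= sop->sem_op;
		sop--;
	}

	return 0;

out_of_range:
	result = -ERANGE;
	goto undo;

would_block:
	if (sop->sem_flg & IPC_NOWAIT)
		result = -EAGAIN;
	else
		result = 1;

undo:
	sop--;
	while (sop >= sops) {
		sma->sem_base[sop->sem_num].semval -= sop->sem_op;
		sop--;
	}

	return result;
}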

2013-06-18 06:48:53

by Mike Galbraith

[permalink] [raw]
Subject: Re: [PATCH 0/6] ipc/sem.c: performance improvements, FIFO

On Sat, 2013-06-15 at 13:10 +0200, Manfred Spraul wrote:
> On 06/14/2013 09:05 PM, Mike Galbraith wrote:
> > # Events: 802K cycles
> > #
> > # Overhead Symbol
> > # ........ ..........................................
> > #
> > 18.42% [k] SYSC_semtimedop
> > 15.39% [k] sem_lock
> > 10.26% [k] _raw_spin_lock
> > 9.00% [k] perform_atomic_semop
> > 7.89% [k] system_call
> > 7.70% [k] ipc_obtain_object_check
> > 6.95% [k] ipcperms
> > 6.62% [k] copy_user_generic_string
> > 4.16% [.] __semop
> > 2.57% [.] worker_thread(void*)
> > 2.30% [k] copy_from_user
> > 1.75% [k] sem_unlock
> > 1.25% [k] ipc_obtain_object
> ~ 280 million ops.
> 2.3% copy_from_user,
> 9% perform_atomic_semop.
>
> > # Events: 802K cycles
> > #
> > # Overhead Symbol
> > # ........ ...............................
> > #
> > 17.38% [k] SYSC_semtimedop
> > 13.26% [k] system_call
> > 11.31% [k] copy_user_generic_string
> > 7.62% [.] __semop
> > 7.18% [k] _raw_spin_lock
> > 5.66% [k] ipcperms
> > 5.40% [k] sem_lock
> > 4.65% [k] perform_atomic_semop
> > 4.22% [k] ipc_obtain_object_check
> > 4.08% [.] worker_thread(void*)
> > 4.06% [k] copy_from_user
> > 2.40% [k] ipc_obtain_object
> > 1.98% [k] pid_vnr
> > 1.45% [k] wake_up_sem_queue_do
> > 1.39% [k] sys_semop
> > 1.35% [k] sys_semtimedop
> > 1.30% [k] sem_unlock
> > 1.14% [k] security_ipc_permission
> ~ 700 million ops.
> 4% copy_from_user -> as expected a bit more
> 4.6% perform_atomic_semop --> less.
>
> Thus: Could you send the oprofile output from perform_atomic_semop()?
>
> Perhaps that gives us a hint.
>
> My current guess:
> sem_lock() somehow ends up in lock_array.
> Lock_array scans all struct sems, so each of those cachelines is
> transferred from the other cpus to the cpu that does the lock_array.
> Then the next write by the "correct" cpu causes a transfer back when
> setting sem->pid.

Profiling sem_lock(), observe sma->complex_count.

: again:
: if (nsops == 1 && !sma->complex_count) {
0.00 : ffffffff81258a64: 75 5a jne ffffffff81258ac0 <sem_lock+0x80>
0.50 : ffffffff81258a66: 41 8b 44 24 7c mov 0x7c(%r12),%eax
23.04 : ffffffff81258a6b: 85 c0 test %eax,%eax
0.00 : ffffffff81258a6d: 75 51 jne ffffffff81258ac0 <sem_lock+0x80>
: struct sem *sem = sma->sem_base + sops->sem_num;
0.01 : ffffffff81258a6f: 41 0f b7 1e movzwl (%r14),%ebx
0.48 : ffffffff81258a73: 48 c1 e3 06 shl $0x6,%rbx
0.52 : ffffffff81258a77: 49 03 5c 24 40 add 0x40(%r12),%rbx
: raw_spin_lock_init(&(_lock)->rlock); \
: } while (0)
:
: static inline void spin_lock(spinlock_t *lock)
: {
: raw_spin_lock(&lock->rlock);
1.45 : ffffffff81258a7c: 4c 8d 6b 08 lea 0x8(%rbx),%r13
0.47 : ffffffff81258a80: 4c 89 ef mov %r13,%rdi
0.50 : ffffffff81258a83: e8 08 4f 35 00 callq ffffffff815ad990 <_raw_spin_lock>
:
: /*
: * If sma->complex_count was set while we were spinning,
: * we may need to look at things we did not lock here.
: */
: if (unlikely(sma->complex_count)) {
0.53 : ffffffff81258a88: 41 8b 44 24 7c mov 0x7c(%r12),%eax
34.33 : ffffffff81258a8d: 85 c0 test %eax,%eax
0.02 : ffffffff81258a8f: 75 29 jne ffffffff81258aba <sem_lock+0x7a>
: __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
: }
:

We're taking cache misses for _both_ reads, so I speculated that
somebody somewhere has to be banging on that structure. I tried
putting complex_count on a different cache line, and indeed, the
overhead disappeared and the box started scaling linearly and reliably
up to 32 cores. 64 cores is still unstable, but 32 became rock solid.
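
Concretely, "putting complex_count on a different cache line" means
something along these lines (illustrative sketch only; the change that
was actually tested and kept, moving the annotation to sem_base, is the
diff further below):

struct sem_array {
	struct kern_ipc_perm	sem_perm;	/* permissions .. see ipc.h */
	time_t			sem_ctime;	/* last change time */
	struct sem		*sem_base;	/* ptr to first semaphore in array */
	struct list_head	pending_alter;	/* pending operations that alter the array */
	struct list_head	pending_const;	/* pending complex operations */
	struct list_head	list_id;	/* undo requests on this array */
	int			sem_nsems;	/* no. of semaphores in array */
	/* keep the bouncing counter off the read-mostly line above */
	int complex_count ____cacheline_aligned_in_smp;
} ____cacheline_aligned_in_smp;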

Moving ____cacheline_aligned_in_smp upward one field at a time to find
out which field was causing the trouble, I ended up at sem_base, a
pointer that is never modified. That makes zero sense to me. Does
anybody have an idea why having sem_base and complex_count in the same
cache line would cause this?

---
include/linux/sem.h | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)

Index: linux-2.6/include/linux/sem.h
===================================================================
--- linux-2.6.orig/include/linux/sem.h
+++ linux-2.6/include/linux/sem.h
@@ -10,10 +10,10 @@ struct task_struct;

/* One sem_array data structure for each set of semaphores in the system. */
struct sem_array {
- struct kern_ipc_perm ____cacheline_aligned_in_smp
- sem_perm; /* permissions .. see ipc.h */
+ struct kern_ipc_perm sem_perm; /* permissions .. see ipc.h */
time_t sem_ctime; /* last change time */
- struct sem *sem_base; /* ptr to first semaphore in array */
+ struct sem ____cacheline_aligned_in_smp
+ *sem_base; /* ptr to first semaphore in array */
struct list_head pending_alter; /* pending operations */
/* that alter the array */
struct list_head pending_const; /* pending complex operations */
@@ -21,7 +21,7 @@ struct sem_array {
struct list_head list_id; /* undo requests on this array */
int sem_nsems; /* no. of semaphores in array */
int complex_count; /* pending complex operations */
-};
+} ____cacheline_aligned_in_smp;

#ifdef CONFIG_SYSVIPC

If I put the ____cacheline_aligned_in_smp anywhere below sem_base, the
overhead goes *poof*, gone. For a 64-core run, massive overhead
reappears, but at spin_is_locked(), and we thrash badly again with the
characteristic wildly uneven distribution; in total throughput we
still perform much better than without the move, though.

In -rt, even without moving complex_count, just using my livelock-free
version of sem_lock(), we magically scale to 64 cores with your
patches, returning a 650-fold throughput improvement for sem-waitzero.
I say "magically" because that patch should not have the effect it
does, yet it has the same effect on mainline up to 32 cores. Changing
the memory access patterns around a little has a wild and fully
repeatable impact on throughput.

Something is rotten in Denmark; any ideas out there as to what that
something might be? How the heck can moving that pointer have the
effect it does? How can my livelock fix for -rt have the same effect
on mainline as moving complex_count, and a much bigger effect for -rt,
to the point that we start scaling perfectly to 64 cores? AFAICT, it
should be a noop, or maybe cost a bit, but it's confused: it insists
that it's a performance patch, when it's really only a patch that
removes the darn livelockable loop.

-Mike

2013-06-18 07:14:22

by Mike Galbraith

[permalink] [raw]
Subject: Re: [PATCH 0/6] ipc/sem.c: performance improvements, FIFO

On Tue, 2013-06-18 at 08:48 +0200, Mike Galbraith wrote:
> On Sat, 2013-06-15 at 13:10 +0200, Manfred Spraul wrote:

P.S.

> > My current guess:
> > sem_lock() somehow ends up in lock_array.

Tracing shows that happens precisely one time, at the end of the benchmark.

-Mike

2013-06-19 12:57:57

by Mike Galbraith

[permalink] [raw]
Subject: Re: [PATCH 0/6] ipc/sem.c: performance improvements, FIFO

On Tue, 2013-06-18 at 09:14 +0200, Mike Galbraith wrote:
> On Tue, 2013-06-18 at 08:48 +0200, Mike Galbraith wrote:
> > On Sat, 2013-06-15 at 13:10 +0200, Manfred Spraul wrote:
>
> P.S.
>
> > > My current guess:
> > > sem_lock() somehow ends up in lock_array.
>
> Tracing shows that happens precisely one time, at end of benchmark.

FWIW, below is a profile of 3.8-rt scaling to 64 cores. No mucking
about with sem_array, only the livelock hack doing its thing...
whatever that is.

# Events: 1M cycles
#
# Overhead Symbol
# ........ .......................................
#
16.71% [k] sys_semtimedop
11.31% [k] system_call
11.23% [k] copy_user_generic_string
7.59% [.] __semop
4.88% [k] sem_lock
4.52% [k] rt_spin_lock
4.48% [k] rt_spin_unlock
3.75% [.] worker_thread(void*)
3.56% [k] perform_atomic_semop
3.15% [k] idr_find
3.15% [k] ipc_obtain_object_check
3.04% [k] pid_vnr
2.97% [k] migrate_enable
2.79% [k] migrate_disable
2.46% [k] ipcperms
2.01% [k] sysret_check
1.86% [k] pin_current_cpu
1.49% [k] unpin_current_cpu
1.13% [k] __rcu_read_lock
1.11% [k] __rcu_read_unlock
1.11% [k] ipc_obtain_object

Percent | Source code & Disassembly of vmlinux
------------------------------------------------
:
:
:
: Disassembly of section .text:
:
: ffffffff81273390 <sem_lock>:
: * checking each local lock once. This means that the local lock paths
: * cannot start their critical sections while the global lock is held.
: */
: static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
: int nsops)
: {
3.90 : ffffffff81273390: 41 55 push %r13
0.00 : ffffffff81273392: 49 89 f5 mov %rsi,%r13
0.00 : ffffffff81273395: 41 54 push %r12
0.00 : ffffffff81273397: 41 89 d4 mov %edx,%r12d
3.86 : ffffffff8127339a: 55 push %rbp
0.00 : ffffffff8127339b: 48 89 fd mov %rdi,%rbp
0.00 : ffffffff8127339e: 53 push %rbx
0.00 : ffffffff8127339f: 48 83 ec 08 sub $0x8,%rsp
: struct sem *sem;
: int locknum;
:
: if (nsops == 1 && !sma->complex_count) {
3.76 : ffffffff812733a3: 83 fa 01 cmp $0x1,%edx
0.00 : ffffffff812733a6: 75 0c jne ffffffff812733b4 <sem_lock+0x24>
0.00 : ffffffff812733a8: 44 8b 87 a4 00 00 00 mov 0xa4(%rdi),%r8d
8.92 : ffffffff812733af: 45 85 c0 test %r8d,%r8d
0.00 : ffffffff812733b2: 74 5c je ffffffff81273410 <sem_lock+0x80>
: * individual semaphore locks to go away. The code
: * above ensures no new single-lock holders will enter
: * their critical section while the array lock is held.
: */
: lock_array:
: spin_lock(&sma->sem_perm.lock);
0.00 : ffffffff812733b4: e8 57 4b e1 ff callq ffffffff81087f10 <migrate_disable>
0.00 : ffffffff812733b9: 48 89 ef mov %rbp,%rdi
0.00 : ffffffff812733bc: e8 1f d4 34 00 callq ffffffff815c07e0 <rt_spin_lock>
: wait_array:
: for (i = 0; i < sma->sem_nsems; i++) {
0.00 : ffffffff812733c1: 8b 8d a0 00 00 00 mov 0xa0(%rbp),%ecx
0.00 : ffffffff812733c7: 85 c9 test %ecx,%ecx
0.00 : ffffffff812733c9: 7e 2b jle ffffffff812733f6 <sem_lock+0x66>
0.00 : ffffffff812733cb: 31 db xor %ebx,%ebx
0.00 : ffffffff812733cd: 0f 1f 00 nopl (%rax)
: sem = sma->sem_base + i;
0.00 : ffffffff812733d0: 48 63 c3 movslq %ebx,%rax
0.00 : ffffffff812733d3: 48 c1 e0 07 shl $0x7,%rax
0.00 : ffffffff812733d7: 48 03 45 68 add 0x68(%rbp),%rax
: #ifdef CONFIG_PREEMPT_RT_BASE
: if (spin_is_locked(&sem->lock))
0.00 : ffffffff812733db: 48 83 78 20 00 cmpq $0x0,0x20(%rax)
0.00 : ffffffff812733e0: 74 09 je ffffffff812733eb <sem_lock+0x5b>
: #endif
: spin_unlock_wait(&sem->lock);
0.00 : ffffffff812733e2: 48 8d 78 08 lea 0x8(%rax),%rdi
0.00 : ffffffff812733e6: e8 b5 d4 34 00 callq ffffffff815c08a0 <rt_spin_unlock_wait>
: * their critical section while the array lock is held.
: */
: lock_array:
: spin_lock(&sma->sem_perm.lock);
: wait_array:
: for (i = 0; i < sma->sem_nsems; i++) {
0.00 : ffffffff812733eb: 83 c3 01 add $0x1,%ebx
0.00 : ffffffff812733ee: 39 9d a0 00 00 00 cmp %ebx,0xa0(%rbp)
0.00 : ffffffff812733f4: 7f da jg ffffffff812733d0 <sem_lock+0x40>
: #endif
: spin_unlock_wait(&sem->lock);
: }
: locknum = -1;
:
: if (nsops == 1 && !sma->complex_count) {
0.00 : ffffffff812733f6: 41 83 ec 01 sub $0x1,%r12d
0.00 : ffffffff812733fa: 74 51 je ffffffff8127344d <sem_lock+0xbd>
: spin_unlock(&sma->sem_perm.lock);
: locknum = sops->sem_num;
: }
: }
: return locknum;
: }
0.00 : ffffffff812733fc: 48 83 c4 08 add $0x8,%rsp
:
: if (nsops == 1 && !sma->complex_count) {
: sem = sma->sem_base + sops->sem_num;
: spin_lock(&sem->lock);
: spin_unlock(&sma->sem_perm.lock);
: locknum = sops->sem_num;
0.00 : ffffffff81273400: b8 ff ff ff ff mov $0xffffffff,%eax
: }
: }
: return locknum;
: }
0.00 : ffffffff81273405: 5b pop %rbx
0.00 : ffffffff81273406: 5d pop %rbp
0.00 : ffffffff81273407: 41 5c pop %r12
0.00 : ffffffff81273409: 41 5d pop %r13
0.00 : ffffffff8127340b: c3 retq
0.00 : ffffffff8127340c: 0f 1f 40 00 nopl 0x0(%rax)
: {
: struct sem *sem;
: int locknum;
:
: if (nsops == 1 && !sma->complex_count) {
: sem = sma->sem_base + sops->sem_num;
3.69 : ffffffff81273410: 0f b7 1e movzwl (%rsi),%ebx
0.01 : ffffffff81273413: 48 c1 e3 07 shl $0x7,%rbx
3.98 : ffffffff81273417: 48 03 5f 68 add 0x68(%rdi),%rbx
: /*
: * Another process is holding the global lock on the
: * sem_array; we cannot enter our critical section,
: * but have to wait for the global lock to be released.
: */
: if (unlikely(spin_is_locked(&sma->sem_perm.lock))) {
10.68 : ffffffff8127341b: 48 83 7f 18 00 cmpq $0x0,0x18(%rdi)
22.07 : ffffffff81273420: 75 60 jne ffffffff81273482 <sem_lock+0xf2>
: */
: spin_lock(&sem->lock);
: spin_unlock(&sma->sem_perm.lock);
: } else {
: /* Lock just the semaphore we are interested in. */
: spin_lock(&sem->lock);
0.01 : ffffffff81273422: 48 83 c3 08 add $0x8,%rbx
3.69 : ffffffff81273426: e8 e5 4a e1 ff callq ffffffff81087f10 <migrate_disable>
7.42 : ffffffff8127342b: 48 89 df mov %rbx,%rdi
0.00 : ffffffff8127342e: e8 ad d3 34 00 callq ffffffff815c07e0 <rt_spin_lock>
:
: /*
: * If sma->complex_count was set prior to acquisition,
: * we must fall back to the global array lock.
: */
: if (unlikely(sma->complex_count)) {
3.75 : ffffffff81273433: 8b b5 a4 00 00 00 mov 0xa4(%rbp),%esi
16.44 : ffffffff81273439: 85 f6 test %esi,%esi
0.00 : ffffffff8127343b: 75 68 jne ffffffff812734a5 <sem_lock+0x115>
:
: if (nsops == 1 && !sma->complex_count) {
: sem = sma->sem_base + sops->sem_num;
: spin_lock(&sem->lock);
: spin_unlock(&sma->sem_perm.lock);
: locknum = sops->sem_num;
0.00 : ffffffff8127343d: 41 0f b7 45 00 movzwl 0x0(%r13),%eax
: }
: }
: return locknum;
: }
0.00 : ffffffff81273442: 48 83 c4 08 add $0x8,%rsp
3.94 : ffffffff81273446: 5b pop %rbx
0.00 : ffffffff81273447: 5d pop %rbp
0.01 : ffffffff81273448: 41 5c pop %r12
3.86 : ffffffff8127344a: 41 5d pop %r13
0.00 : ffffffff8127344c: c3 retq
: #endif
: spin_unlock_wait(&sem->lock);
: }
: locknum = -1;
:
: if (nsops == 1 && !sma->complex_count) {
0.00 : ffffffff8127344d: 8b 95 a4 00 00 00 mov 0xa4(%rbp),%edx
0.00 : ffffffff81273453: 85 d2 test %edx,%edx
0.00 : ffffffff81273455: 75 a5 jne ffffffff812733fc <sem_lock+0x6c>
: sem = sma->sem_base + sops->sem_num;
0.00 : ffffffff81273457: 41 0f b7 5d 00 movzwl 0x0(%r13),%ebx
0.00 : ffffffff8127345c: 48 c1 e3 07 shl $0x7,%rbx
0.00 : ffffffff81273460: 48 03 5d 68 add 0x68(%rbp),%rbx
: spin_lock(&sem->lock);
0.00 : ffffffff81273464: e8 a7 4a e1 ff callq ffffffff81087f10 <migrate_disable>
0.00 : ffffffff81273469: 48 8d 7b 08 lea 0x8(%rbx),%rdi
0.00 : ffffffff8127346d: e8 6e d3 34 00 callq ffffffff815c07e0 <rt_spin_lock>
: spin_unlock(&sma->sem_perm.lock);
0.00 : ffffffff81273472: 48 89 ef mov %rbp,%rdi
0.00 : ffffffff81273475: e8 e6 d3 34 00 callq ffffffff815c0860 <rt_spin_unlock>
0.00 : ffffffff8127347a: e8 41 48 e1 ff callq ffffffff81087cc0 <migrate_enable>
0.00 : ffffffff8127347f: 90 nop
0.00 : ffffffff81273480: eb bb jmp ffffffff8127343d <sem_lock+0xad>
0.00 : ffffffff81273482: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
: * Another process is holding the global lock on the
: * sem_array; we cannot enter our critical section,
: * but have to wait for the global lock to be released.
: */
: if (unlikely(spin_is_locked(&sma->sem_perm.lock))) {
: spin_lock(&sma->sem_perm.lock);
0.00 : ffffffff81273488: e8 83 4a e1 ff callq ffffffff81087f10 <migrate_disable>
0.00 : ffffffff8127348d: 48 89 ef mov %rbp,%rdi
0.00 : ffffffff81273490: e8 4b d3 34 00 callq ffffffff815c07e0 <rt_spin_lock>
: if (sma->complex_count)
0.00 : ffffffff81273495: 8b bd a4 00 00 00 mov 0xa4(%rbp),%edi
0.00 : ffffffff8127349b: 85 ff test %edi,%edi
0.00 : ffffffff8127349d: 0f 85 1e ff ff ff jne ffffffff812733c1 <sem_lock+0x31>
0.00 : ffffffff812734a3: eb bf jmp ffffffff81273464 <sem_lock+0xd4>
: /*
: * If sma->complex_count was set prior to acquisition,
: * we must fall back to the global array lock.
: */
: if (unlikely(sma->complex_count)) {
: spin_unlock(&sem->lock);
0.00 : ffffffff812734a5: 48 89 df mov %rbx,%rdi
0.00 : ffffffff812734a8: e8 b3 d3 34 00 callq ffffffff815c0860 <rt_spin_unlock>
0.00 : ffffffff812734ad: 0f 1f 00 nopl (%rax)
0.00 : ffffffff812734b0: e8 0b 48 e1 ff callq ffffffff81087cc0 <migrate_enable>
: goto lock_array;
0.00 : ffffffff812734b5: e9 fa fe ff ff jmpq ffffffff812733b4 <sem_lock+0x24>
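
For readability, here is the -rt sem_lock() shown above, reassembled
from the source lines interleaved in the annotation (a convenience
copy of this -rt variant; the actual patch in the -rt tree is
authoritative):

static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
			   int nsops)
{
	struct sem *sem;
	int locknum, i;

	if (nsops == 1 && !sma->complex_count) {
		sem = sma->sem_base + sops->sem_num;

		/*
		 * Another process is holding the global lock on the
		 * sem_array; we cannot enter our critical section,
		 * but have to wait for the global lock to be released.
		 */
		if (unlikely(spin_is_locked(&sma->sem_perm.lock))) {
			spin_lock(&sma->sem_perm.lock);
			if (sma->complex_count)
				goto wait_array;
			spin_lock(&sem->lock);
			spin_unlock(&sma->sem_perm.lock);
		} else {
			/* Lock just the semaphore we are interested in. */
			spin_lock(&sem->lock);

			/*
			 * If sma->complex_count was set prior to
			 * acquisition, we must fall back to the global
			 * array lock.
			 */
			if (unlikely(sma->complex_count)) {
				spin_unlock(&sem->lock);
				goto lock_array;
			}
		}
		return sops->sem_num;
	}

lock_array:
	spin_lock(&sma->sem_perm.lock);
wait_array:
	for (i = 0; i < sma->sem_nsems; i++) {
		sem = sma->sem_base + i;
#ifdef CONFIG_PREEMPT_RT_BASE
		/* -rt tweak: skip the wait when the lock is not held */
		if (spin_is_locked(&sem->lock))
#endif
			spin_unlock_wait(&sem->lock);
	}
	locknum = -1;

	if (nsops == 1 && !sma->complex_count) {
		sem = sma->sem_base + sops->sem_num;
		spin_lock(&sem->lock);
		spin_unlock(&sma->sem_perm.lock);
		locknum = sops->sem_num;
	}
	return locknum;
}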