diff -urN -X dontdiff linux-2.5.43/include/linux/ipc.h linux-2.5.43-ipc/include/linux/ipc.h
--- linux-2.5.43/include/linux/ipc.h Tue Oct 15 20:26:43 2002
+++ linux-2.5.43-ipc/include/linux/ipc.h Wed Oct 16 09:48:28 2002
@@ -56,6 +56,7 @@
/* used by in-kernel data structures */
struct kern_ipc_perm
{
+ spinlock_t lock;
key_t key;
uid_t uid;
gid_t gid;
diff -urN -X dontdiff linux-2.5.43/ipc/shm.c linux-2.5.43-ipc/ipc/shm.c
--- linux-2.5.43/ipc/shm.c Tue Oct 15 20:28:22 2002
+++ linux-2.5.43-ipc/ipc/shm.c Wed Oct 16 09:48:28 2002
@@ -38,8 +38,6 @@
#define shm_lock(id) ((struct shmid_kernel*)ipc_lock(&shm_ids,id))
#define shm_unlock(id) ipc_unlock(&shm_ids,id)
-#define shm_lockall() ipc_lockall(&shm_ids)
-#define shm_unlockall() ipc_unlockall(&shm_ids)
#define shm_get(id) ((struct shmid_kernel*)ipc_get(&shm_ids,id))
#define shm_buildid(id, seq) \
ipc_buildid(&shm_ids, id, seq)
@@ -409,14 +407,12 @@
memset(&shm_info,0,sizeof(shm_info));
down(&shm_ids.sem);
- shm_lockall();
shm_info.used_ids = shm_ids.in_use;
shm_get_stat (&shm_info.shm_rss, &shm_info.shm_swp);
shm_info.shm_tot = shm_tot;
shm_info.swap_attempts = 0;
shm_info.swap_successes = 0;
err = shm_ids.max_id;
- shm_unlockall();
up(&shm_ids.sem);
if(copy_to_user (buf, &shm_info, sizeof(shm_info)))
return -EFAULT;
diff -urN -X dontdiff linux-2.5.43/ipc/util.c linux-2.5.43-ipc/ipc/util.c
--- linux-2.5.43/ipc/util.c Tue Oct 15 20:27:54 2002
+++ linux-2.5.43-ipc/ipc/util.c Wed Oct 16 09:48:28 2002
@@ -92,8 +92,10 @@
{
int id;
struct kern_ipc_perm* p;
+ int max_id = ids->max_id;
- for (id = 0; id <= ids->max_id; id++) {
+ read_barrier_depends();
+ for (id = 0; id <= max_id; id++) {
p = ids->entries[id].p;
if(p==NULL)
continue;
@@ -106,8 +108,8 @@
static int grow_ary(struct ipc_ids* ids, int newsize)
{
struct ipc_id* new;
- struct ipc_id* old;
int i;
+ struct rcu_ipc_array *arg = NULL;
if(newsize > IPCMNI)
newsize = IPCMNI;
@@ -121,14 +123,19 @@
for(i=ids->size;i<newsize;i++) {
new[i].p = NULL;
}
+ arg = (struct rcu_ipc_array *) kmalloc(sizeof(*arg), GFP_KERNEL);
+ if(arg == NULL)
+ return ids->size;
+ arg->entries = ids->entries;
+ arg->size = ids->size;
+
spin_lock(&ids->ary);
-
- old = ids->entries;
ids->entries = new;
- i = ids->size;
+ wmb();
ids->size = newsize;
spin_unlock(&ids->ary);
- ipc_free(old, sizeof(struct ipc_id)*i);
+
+ call_rcu(&arg->rh, ipc_free_callback, arg);
return ids->size;
}
@@ -166,7 +173,9 @@
if(ids->seq > ids->seq_max)
ids->seq = 0;
- spin_lock(&ids->ary);
+ new->lock = SPIN_LOCK_UNLOCKED;
+ rcu_read_lock();
+ spin_lock(&new->lock);
ids->entries[id].p = new;
return id;
}
@@ -188,6 +197,7 @@
int lid = id % SEQ_MULTIPLIER;
if(lid >= ids->size)
BUG();
+ rmb();
p = ids->entries[lid].p;
ids->entries[lid].p = NULL;
if(p==NULL)
@@ -239,7 +249,12 @@
else
kfree(ptr);
}
-
+static void ipc_free_callback(void * arg)
+{
+ struct rcu_ipc_array *a = (struct rcu_ipc_array *)arg;
+ ipc_free(a->entries, a->size);
+ kfree(arg);
+}
/**
* ipcperms - check IPC permissions
* @ipcp: IPC permission set
diff -urN -X dontdiff linux-2.5.43/ipc/util.h linux-2.5.43-ipc/ipc/util.h
--- linux-2.5.43/ipc/util.h Tue Oct 15 20:28:24 2002
+++ linux-2.5.43-ipc/ipc/util.h Wed Oct 16 09:48:28 2002
@@ -4,6 +4,7 @@
*
* ipc helper functions (c) 1999 Manfred Spraul <[email protected]>
*/
+#include <linux/rcupdate.h>
#define USHRT_MAX 0xffff
#define SEQ_MULTIPLIER (IPCMNI)
@@ -12,6 +13,12 @@
void msg_init (void);
void shm_init (void);
+struct rcu_ipc_array {
+ struct rcu_head rh;
+ struct ipc_id* entries;
+ int size;
+};
+
struct ipc_ids {
int size;
int in_use;
@@ -44,11 +51,7 @@
*/
void* ipc_alloc(int size);
void ipc_free(void* ptr, int size);
-
-extern inline void ipc_lockall(struct ipc_ids* ids)
-{
- spin_lock(&ids->ary);
-}
+void ipc_free_callback(void* arg);
extern inline struct kern_ipc_perm* ipc_get(struct ipc_ids* ids, int id)
{
@@ -56,32 +59,43 @@
int lid = id % SEQ_MULTIPLIER;
if(lid >= ids->size)
return NULL;
-
+ rmb();
out = ids->entries[lid].p;
return out;
}
-extern inline void ipc_unlockall(struct ipc_ids* ids)
-{
- spin_unlock(&ids->ary);
-}
extern inline struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id)
{
struct kern_ipc_perm* out;
int lid = id % SEQ_MULTIPLIER;
- if(lid >= ids->size)
- return NULL;
- spin_lock(&ids->ary);
+ rcu_read_lock();
+ if(lid >= ids->size) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ rmb();
out = ids->entries[lid].p;
- if(out==NULL)
- spin_unlock(&ids->ary);
+ if(out==NULL) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ spin_lock(&out->lock);
return out;
}
extern inline void ipc_unlock(struct ipc_ids* ids, int id)
{
- spin_unlock(&ids->ary);
+ int lid = id % SEQ_MULTIPLIER;
+ struct kern_ipc_perm* out;
+
+ if(lid >= ids->size)
+ return;
+ rmb();
+ out = ids->entries[lid].p;
+ if (out)
+ spin_unlock(&out->lock);
+ rcu_read_unlock();
}
extern inline int ipc_buildid(struct ipc_ids* ids, int id, int seq)
On Thu, 17 Oct 2002, mingming cao wrote:
> Hi Linus,
>
> This is the latest version of the ipc lock patch. It breaks down the
> three global IPC locks into one lock per IPC ID, and also addresses the
> cache line bouncing problem introduced in the original patch. The
> original post could be found at:
> http://marc.theaimsgroup.com/?l=linux-kernel&m=102980357802682&w=2
>
> The original patch breaks down the global IPC locks, but adds another
> layer of locking to protect the IPC ID array in case of resizing. Some
> concern was raised that the read/write lock may cause cache line
> bouncing.
>
> Since the write lock is only used when the array is dynamically resized,
> RCU seems a perfect fit for this situation. By doing so it could reduce
> the possible lock contention in some applications where the IPC
> resources are heavily used, without introducing cache line bouncing.
>
> Besides the RCU changes, it also removes the redundant ipc_lockall() and
> ipc_unlockall() as suggested by Hugh Dickins.
>
> The patch is against the 2.5.43 kernel. It requires Dipankar Sarma's
> read_barrier_depends RCU helper patch:
> http://marc.theaimsgroup.com/?l=linux-kernel&m=103479438017486&w=2
>
> We use the ipc lock patch on OracleApps and it gave us the best numbers.
> Please include.
This looks very good to me: I'm glad you found the RCU idea works out.
No need for performance numbers, this is now clearly the right way to
go. And read_barrier_depends is in 2.5.44, so no problem there.
I'm ignorant of RCU, and my mind goes mushy around memory barriers,
but I expect you've consulted the best there; and I'll be wanting to
refer to this implementation as a nice example of how to use RCU.
But please make a couple of small cleanups, unless you disagree.
Now delete spinlock_t ary and all references to it: only grow_ary
is using it, and it's already protected by sem, and we'd be in
trouble with concurrent allocations if it were not.
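I'd expect the tail of grow_ary then to reduce to something like this
(untested, just to show what I mean):

	arg->entries = ids->entries;
	arg->size = ids->size;

	ids->entries = new;
	wmb();
	ids->size = newsize;

	call_rcu(&arg->rh, ipc_free_callback, arg);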
And I'd be happier to see ipc_unlock without those conditionals i.e.
delete the "if(lid >= ids->size) return;" and the "if (out)" - they
seem to encourage calling ipc_unlock where ipc_lock did not succeed,
but that would be unsafe. If you found somewhere that's being done,
I think we need to fix that place, not work around it in ipc_unlock.
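In other words, something more like (again untested):

	extern inline void ipc_unlock(struct ipc_ids* ids, int id)
	{
		int lid = id % SEQ_MULTIPLIER;
		struct kern_ipc_perm* out;

		rmb();
		out = ids->entries[lid].p;
		spin_unlock(&out->lock);
		rcu_read_unlock();
	}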
Linus is away this week (so I've left him off, to avoid clogging up
/dev/null): perhaps Andrew could take your patch into his -mm tree
when you've made those changes (or persuaded us against)?
Hugh
On Sun, 20 Oct 2002, Hugh Dickins wrote:
>
> This looks very good to me: I'm glad you found the RCU idea works out.
> ...
> And I'd be happier to see ipc_unlock without those conditionals i.e.
> delete the "if(lid >= ids->size) return;" and the "if (out)" - they
> seem to encourage calling ipc_unlock where ipc_lock did not succeed,
> but that would be unsafe. If you found somewhere that's being done,
> I think we need to fix that place, not work around it in ipc_unlock.
Sorry, MingMing, it doesn't look so good to me now.
The "if(lid >= ids->size) return;" still looks unnecessary,
but I think I see why you have "if (out)" in ipc_unlock: because
of ipc_rmid, which has already nulled out entries[lid].p, yes?
A minor point is, wouldn't that skipping of spin_unlock leave you
with the wrong preempt count, on a CONFIG_PREEMPT y configuration?
But that's easily dealt with.
A much more serious point: we could certainly bring the ipc_rmid
and ipc_unlock much closer together; but however close we bring them
(unlock implicit within rmid), there will still be a race with one
cpu in ipc_lock spinning on out->lock, while we in ipc_rmid null
entries[lid].p and unlock and free the structure containing that lock.
I think you're going to have to extend RCU to freeing the entry
(though this is a much less exceptional case than growing the array,
so less clear to me that RCU is appropriate here), if you're to
avoid reverting to the earlier rwlock or embedded spinlock designs.
Perhaps there's a simpler solution - ask around - but I don't see it.
Hugh
Hugh Dickins wrote:
>
> I'm ignorant of RCU, and my mind goes mushy around memory barriers,
> but I expect you've consulted the best there; and I'll be wanting to
> refer to this implementation as a nice example of how to use RCU.
Yes, the RCU patch author Dipankar has already reviewed the memory
barriers in the ipclock patch.
> Now delete spinlock_t ary and all references to it: only grow_ary
> is using it, and it's already protected by sem, and we'd be in
> trouble with concurrent allocations if it were not.
>
Oh, right. grow_ary does not need spinlock_t ary anymore. :-)
Hugh Dickins wrote:
>
> The "if(lid >= ids->size) return;" still looks unnecessary,
> but I think I see why you have "if (out)" in ipc_unlock: because
> of ipc_rmid, which has already nulled out entries[lid].p, yes?
>
Thanks a lot for your comments. Yes. That's the consideration.
> A minor point is, wouldn't that skipping of spin_unlock leave you
> with the wrong preempt count, on a CONFIG_PREEMPT y configuration?
> But that's easily dealt with.
>
> A much more serious point: we could certainly bring the ipc_rmid
> and ipc_unlock much closer together; but however close we bring them
> (unlock implicit within rmid), there will still be a race with one
> cpu in ipc_lock spinning on out->lock, while we in ipc_rmid null
> entries[lid].p and unlock and free the structure containing that lock.
>
Thanks for pointing this out. This is an issue that has to be addressed.
A simple solution I could think of for this problem is to move the per_id
lock out of the kern_ipc_perm structure and put it in the ipc_id
structure. Actually I did it this way at first, then agreed with you
that moving the per_id lock into the kern_ipc_perm structure would help
reduce cacheline bouncing.
I think that having the per_id lock stay outside the structure it
protects would ease the job of ipc_rmid(), and would also avoid the wrong
preempt count problem caused by the additional check "if (out)" in
ipc_unlock() that you mentioned above.
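Roughly what I have in mind (untested sketch):

	struct ipc_id {
		spinlock_t lock;	/* per-id lock lives in the array slot */
		struct kern_ipc_perm* p;
	};

ipc_lock()/ipc_unlock() would then take ids->entries[lid].lock, which
ipc_rmid() never frees, instead of the lock inside *p.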
Does this solution look good to you? If so, I will update the patch for
2.5.44 soon.
Mingming
On Mon, 21 Oct 2002, mingming cao wrote:
> Hugh Dickins wrote:
> > A much more serious point: we could certainly bring the ipc_rmid
> > and ipc_unlock much closer together; but however close we bring them
> > (unlock implicit within rmid), there will still be a race with one
> > cpu in ipc_lock spinning on out->lock, while we in ipc_rmid null
> > entries[lid].p and unlock and free the structure containing that lock.
>
> A simple solution I could think of for this problem is to move the per_id
> lock out of the kern_ipc_perm structure and put it in the ipc_id
> structure. Actually I did it this way at first, then agreed with you
> that moving the per_id lock into the kern_ipc_perm structure would help
> reduce cacheline bouncing.
>
> I think that having the per_id lock stay outside the structure it
> protects would ease the job of ipc_rmid(), and would also avoid the wrong
> preempt count problem caused by the additional check "if (out)" in
> ipc_unlock() that you mentioned above.
>
> Does this solution look good to you? If so, I will update the patch for
> 2.5.44 soon.
Sorry, no, doesn't look good to me. I do agree that it's an easy
solution to this problem, but there's no point in having gone the RCU
route to avoid cacheline bouncing, if you now reintroduce it all here.
I believe, as I said, that you'll have to go further, applying RCU
to freeing the entries themselves. I had toyed with the idea of never
freeing entries once allocated, which is a similarly simple solution;
rejected that as in general too wasteful; and concluded that RCU is a
reasonable compromise - it amounts to lazy freeing, I think.
I'm happy to be overruled by someone who understands these cacheline
bounce issues better than we do, but nobody else has spoken up yet.
Hugh
On Mon, Oct 21, 2002 at 11:11:15AM -0700, mingming cao wrote:
> A simple solution I could think of for this problem is to move the per_id
> lock out of the kern_ipc_perm structure and put it in the ipc_id
> structure. Actually I did it this way at first, then agreed with you
> that moving the per_id lock into the kern_ipc_perm structure would help
> reduce cacheline bouncing.
>
> I think that having the per_id lock stay outside the structure it
> protects would ease the job of ipc_rmid(), and would also avoid the wrong
> preempt count problem caused by the additional check "if (out)" in
> ipc_unlock() that you mentioned above.
I took a quick look at the original ipc code and I don't understand
something - it seems to me the ipc_ids structs are protected by the semaphore
inside them for all operations, so why do we need the spinlock in the
first place? Am I missing something here?
Thanks
--
Dipankar Sarma <[email protected]> http://lse.sourceforge.net
Linux Technology Center, IBM Software Lab, Bangalore, India.
Hugh wrote:
>I had toyed with the idea of never
>freeing entries once allocated, which is a similarly simple solution;
>
Not possible, the structure size for ipc sem depends on the number of
semaphores in the semaphore set.
Probably the best approach is to add a "deleted" flag into the ipc_id
structure, and to check that flag after acquiring the spinlock. And
perform the actual free operations for the ipc element in an rcu callback.
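Untested, but roughly like this (with the flag sitting next to the lock in
kern_ipc_perm rather than in ipc_id, since kern_ipc_perm is what would now
only be freed from the rcu callback):

	extern inline struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id)
	{
		struct kern_ipc_perm* out;
		int lid = id % SEQ_MULTIPLIER;

		rcu_read_lock();
		if(lid >= ids->size) {
			rcu_read_unlock();
			return NULL;
		}
		rmb();
		out = ids->entries[lid].p;
		if(out == NULL) {
			rcu_read_unlock();
			return NULL;
		}
		spin_lock(&out->lock);
		if(out->deleted) {
			/* ipc_rmid got here first; the entry is only freed
			 * by the rcu callback, so the lock is still valid */
			spin_unlock(&out->lock);
			rcu_read_unlock();
			return NULL;
		}
		return out;
	}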
In which context do the rcu callbacks run? The semaphore sets are
allocated with vmalloc for large sets, and that function is only
permitted from process context, not from bh or irq context. According to
a comment in rcupdate.c, rcu_process_callbacks runs in a tasklet, i.e.
at bh context.
>I'm happy to be overruled by someone who understands these cacheline
>bounce issues better than we do, but nobody else has spoken up yet.
>
>
Are there any good documents about cacheline bouncing, or rules on how to
reduce it?
For example, should a spinlock and the data it protects be in the same
cacheline, or in different cachelines?
I guess that "same cacheline" means that only one cacheline is
transferred if a cpu acquires the spinlock and touches the data.
But OTOH a spinning cpu would probably force the cacheline into shared
state, and that'll slow down the data access for the cpu that owns the
spinlock.
--
Manfred
On Tue, 22 Oct 2002, Dipankar Sarma wrote:
>
> I took a quick look at the original ipc code and I don't understand
> something - it seems to me the ipc_ids structs are protected by the semaphore
> inside them for all operations, so why do we need the spinlock in the
> first place? Am I missing something here?
I made that mistake too at first; Mingming set me straight.
Many of the entry points down() the ipc_ids.sem semaphore, but the
most significant ones do not. ipc/sem.c is probably the best example
(if confusing, since it involves quite different meanings of semaphore):
sys_semop() is the frequent, fast entry point, and uses sem_lock() without down().
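Schematically (from ipc/sem.c, heavily abbreviated, so only to illustrate
the point):

	#define sem_lock(id)	((struct sem_array*)ipc_lock(&sem_ids,id))

	asmlinkage long sys_semop (int semid, struct sembuf *tsops, unsigned nsops)
	{
		struct sem_array *sma;
		/* ... copy the sops in, etc ... */
		sma = sem_lock(semid);	/* no down(&sem_ids.sem) on this path */
		/* ... */
	}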
Hugh
Dipankar Sarma wrote:
>
> I took a quick look at the original ipc code and I don't understand
> something - it seems to me the ipc_ids structs are protected by the semaphore
> inside them for all operations, so why do we need the spinlock in the
> first place? Am I missing something here?
The semaphore is used to protect the fields in the ipc_ids structure, while
the spinlock is used to protect the IPC ids. In the current implementation,
there is one spinlock for all IPC ids of the same type (i.e. for all
message queues). The patch is intended to break down that global
spinlock and have a lock per IPC id.
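For reference, the two live side by side in ipc_ids (from ipc/util.h,
roughly):

	struct ipc_ids {
		int size;
		int in_use;
		int max_id;
		unsigned short seq;
		unsigned short seq_max;
		struct semaphore sem;	/* taken by the slower create/remove/ctl paths */
		spinlock_t ary;		/* single lock guarding every id of this type */
		struct ipc_id* entries;
	};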
On Mon, 21 Oct 2002, Manfred Spraul wrote:
>
> Probably the best approach is to add a "deleted" flag into the ipc_id
> structure, and to check that flag after acquiring the spinlock. And
> perform the actual free operations for the ipc element in an rcu callback.
Yes, that's what I was proposing.
> In which context do the rcu callbacks run? The semaphore sets are
> allocated with vmalloc for large sets, and that function is only
> permitted from process context, not from bh or irq context. According to
> a comment in rcupdate.c, rcu_process_callbacks runs in a tasklet, i.e.
> at bh context.
Hah! Very good point. Seems we need to think again.
> For example, should a spinlock and the data it protects be in the same
> cacheline, or in different cachelines?
> I guess that "same cacheline" means that only one cacheline is
> transferred if a cpu acquires the spinlock and touches the data.
> But OTOH a spinning cpu would probably force the cacheline into shared
> state, and that'll slow down the data access for the cpu that owns the
> spinlock.
Well, yes, but that's not the issue as I understand it. There you're
thinking of contention on the spinlock, and its effect spreading to
the data protected by that lock. Whereas the more pernicious effect
of cacheline bouncing is when there is no particular contention on
a spinlock, but data is (mis)distributed in such a way that mods to
the same cacheline are likely to occur from different cpus at about
the same time.
In the original design, Mingming nicely split up the locks (greatly
reducing contention), but had them in an array (causing lots of bounce,
I believe): I'm resisting a return to that design.
Hugh
On Mon, Oct 21, 2002 at 12:41:58PM -0700, mingming cao wrote:
> > I took a quick look at the original ipc code and I don't understand
> > something - it seems to me the ipc_ids structs are protected by the semaphore
> > inside them for all operations, so why do we need the spinlock in the
> > first place? Am I missing something here?
>
> The semaphore is used to protect the fields in the ipc_ids structure, while
> the spinlock is used to protect the IPC ids. In the current implementation,
> there is one spinlock for all IPC ids of the same type (i.e. for all
> message queues). The patch is intended to break down that global
Well, if the semaphore is grabbed then the critical section is serialized,
including access to the IPC ids, so there would be no need to have
a separate spinlock for the IPC ids. Hugh pointed out the right reason:
semaphore IPCs use the spinlock for serialization in many paths,
not the semaphore in ipc_ids.
Hugh's point is right - you have two data structures to protect: the ipc_id
array and the kern_ipc_perm entries attached to it. I would use a single
spinlock in ipc_ids to serialize all the updates and use RCU for both
to implement safe lockfree lookups. For the second, you would probably
want to add an rcu_head to struct kern_ipc_perm and an RCU callback
to free it.
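Something along these lines (untested sketch; ipc_rcu_free is just a name
for illustration, and the kfree assumes a kmalloc'ed entry - the vmalloc'ed
sem arrays Manfred mentioned would need extra care):

	struct kern_ipc_perm
	{
		spinlock_t lock;
		struct rcu_head rcu;	/* defer freeing until readers are done */
		key_t key;
		uid_t uid;
		gid_t gid;
		/* ... */
	};

	static void ipc_rcu_free(void *arg)
	{
		kfree(arg);
	}

	/* and wherever the entry is freed after ipc_rmid: */
	call_rcu(&p->rcu, ipc_rcu_free, p);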
Thanks
--
Dipankar Sarma <[email protected]> http://lse.sourceforge.net
Linux Technology Center, IBM Software Lab, Bangalore, India.
Hugh Dickins wrote:
>
> In the original design, Mingming nicely split up the locks (greatly
> reducing contention), but had them in an array (causing lots of bounce,
> I believe):
I am not an expert on cacheline bouncing, so please correct me if I miss
something. I wonder if we could reduce the bounce even with the current
design (the spinlock is in the data it protects). We have to go through
that array anyway to get access to the data (and the spinlock).
> I'm resisting a return to that design.
OK, I will go back to the original design.