Good day Al and all
could you review couple patches that implement $subj
for vfs and tmpfs. In short the idea is that we can
protect operations taking semaphore related for set
of names. definitely, protection at vfs layer isn't
enough and filesystem will need to protect their own
structures by itself, but in some cases vfs patch is
enough. for example, tmpfs. during some loads one can
see quite high load in /tmp. being mounted as tmpfs
on big smp, we can get high contention on i_sem.
probably someone could try more-less real load?
I wrote simple program that spawn few processes, then
chdir to the given directory, then loops creating and
unlinking file. The test box is dual PIII-1GHz:
run 1: 2 processes create/unlink file on regular tmpfs
[root@bob root]# mount -t tmpfs none /test
[root@bob root]# (cd /test; time /root/crunl ./f 1000000 2)
#1998: 1000000 iterations, create/unlink ./f-0-1998
#1999: 1000000 iterations, create/unlink ./f-1-1999
#384: done
#384: done
wait for completion ... OK
real 0m36.224s
user 0m0.823s
sys 0m47.994s
run 2: 2 processes create/unlink file on tmpfs + pdirops
[root@bob root]# mount -t tmpfs -o pdirops none /test
[root@bob root]# (cd /test; time /root/crunl ./f 1000000 2)
#1992: 1000000 iterations, create/unlink ./f-0-1992
#1993: 1000000 iterations, create/unlink ./f-1-1993
#384: done
#384: done
wait for completion ... OK
real 0m15.108s
user 0m0.592s
sys 0m29.406s
run 3: 1 process creates/unlinks file on regular tmpfs
[root@bob root]# mount -t tmpfs none /test
[root@bob root]# (cd /test; time /root/crunl ./f 1000000 1)
#2004: 1000000 iterations, create/unlink ./f-0-2004
#384: done
wait for completion ... OK
real 0m11.950s
user 0m0.262s
sys 0m7.465s
run 4: 1 process creates/unlinks file on tmpf + pdirops
[root@bob root]# mount -t tmpfs -o pdirops none /test
[root@bob root]# (cd /test; time /root/crunl ./f 1000000 1)
#2009: 1000000 iterations, create/unlink ./f-0-2009
#384: done
wait for completion ... OK
real 0m8.047s
user 0m0.243s
sys 0m7.646s
2 processes creating/unlinking on regular tmpfs cause ~200K context switches:
procs memory swap io system cpu
r b w swpd free buff cache si so bi bo in cs us sy id
2 0 0 0 1005760 6616 10928 0 0 0 0 1007 215095 1 64 35
2 0 0 0 1005760 6616 10928 0 0 0 0 1007 213580 1 67 32
2 0 0 0 1005760 6616 10928 0 0 0 0 1007 214445 1 63 36
2 0 0 0 1005760 6616 10928 0 0 0 0 1007 216250 1 63 36
2 processes creating/unlinking on tmpfs + pdirops cause ~44 context switches:
procs memory swap io system cpu
r b w swpd free buff cache si so bi bo in cs us sy id
2 0 1 0 1001824 6632 10912 0 0 0 0 1008 41 2 98 0
2 0 2 0 1002144 6632 10912 0 0 0 0 1008 45 2 98 0
2 0 2 0 1001632 6632 10912 0 0 0 0 1007 47 2 98 0
the next benchmark is rename. two processes generate random name, create file,
generate new name, rename created file in new name and unlink:
run 5: regular tmpfs
[root@bob root]# mount -t tmpfs none /test
[root@bob root]# (cd /test; time /root/rndrename ./f 1000000 2)
#2036: 1000000 iterations
#2037: 1000000 iterations
wait for completion ... OK
real 1m22.381s
user 0m10.254s
sys 1m50.214s
run 6: tmpfs + pdirops
[root@bob root]# mount -t tmpfs -o pdirops none /test
[root@bob root]# (cd /test; time /root/rndrename ./f 1000000 2)
#2044: 1000000 iterations
#2045: 1000000 iterations
wait for completion ... OK
real 0m39.403s
user 0m9.411s
sys 1m8.626s
thanks, Alex
fs/inode.c | 1
fs/namei.c | 66 ++++++++++++++++++++++++++++++++++++++---------------
include/linux/fs.h | 11 ++++----
3 files changed, 54 insertions(+), 24 deletions(-)
Index: linux-2.6.10/fs/namei.c
===================================================================
--- linux-2.6.10.orig/fs/namei.c 2005-01-28 19:32:13.000000000 +0300
+++ linux-2.6.10/fs/namei.c 2005-02-19 20:40:05.763437304 +0300
@@ -104,6 +104,28 @@
* any extra contention...
*/
+static inline struct semaphore * lock_sem(struct inode *dir, struct qstr *name)
+{
+ if (IS_PDIROPS(dir)) {
+ struct super_block *sb;
+ /* name->hash expected to be already calculated */
+ sb = dir->i_sb;
+ BUG_ON(sb->s_pdirops_sems == NULL);
+ return sb->s_pdirops_sems + name->hash % sb->s_pdirops_size;
+ }
+ return &dir->i_sem;
+}
+
+static inline void lock_dir(struct inode *dir, struct qstr *name)
+{
+ down(lock_sem(dir, name));
+}
+
+static inline void unlock_dir(struct inode *dir, struct qstr *name)
+{
+ up(lock_sem(dir, name));
+}
+
/* In order to reduce some races, while at the same time doing additional
* checking and hopefully speeding things up, we copy filenames to the
* kernel data space before using them..
@@ -380,7 +402,7 @@
struct dentry * result;
struct inode *dir = parent->d_inode;
- down(&dir->i_sem);
+ lock_dir(dir, name);
/*
* First re-do the cached lookup just in case it was created
* while we waited for the directory semaphore..
@@ -406,7 +428,7 @@
else
result = dentry;
}
- up(&dir->i_sem);
+ unlock_dir(dir, name);
return result;
}
@@ -414,7 +436,7 @@
* Uhhuh! Nasty case: the cache was re-populated while
* we waited on the semaphore. Need to revalidate.
*/
- up(&dir->i_sem);
+ unlock_dir(dir, name);
if (result->d_op && result->d_op->d_revalidate) {
if (!result->d_op->d_revalidate(result, nd) && !d_invalidate(result)) {
dput(result);
@@ -1182,12 +1204,26 @@
/*
* p1 and p2 should be directories on the same fs.
*/
-struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
+struct dentry *lock_rename(struct dentry *p1, struct qstr *n1,
+ struct dentry *p2, struct qstr *n2)
{
struct dentry *p;
if (p1 == p2) {
- down(&p1->d_inode->i_sem);
+ if (IS_PDIROPS(p1->d_inode)) {
+ unsigned int h1, h2;
+ h1 = n1->hash % p1->d_inode->i_sb->s_pdirops_size;
+ h2 = n2->hash % p2->d_inode->i_sb->s_pdirops_size;
+ if (h1 < h2) {
+ lock_dir(p1->d_inode, n1);
+ lock_dir(p2->d_inode, n2);
+ } else if (h1 > h2) {
+ lock_dir(p2->d_inode, n2);
+ lock_dir(p1->d_inode, n1);
+ } else
+ lock_dir(p1->d_inode, n1);
+ } else
+ down(&p1->d_inode->i_sem);
return NULL;
}
@@ -1195,31 +1231,35 @@
for (p = p1; p->d_parent != p; p = p->d_parent) {
if (p->d_parent == p2) {
- down(&p2->d_inode->i_sem);
- down(&p1->d_inode->i_sem);
+ lock_dir(p2->d_inode, n2);
+ lock_dir(p1->d_inode, n1);
return p;
}
}
for (p = p2; p->d_parent != p; p = p->d_parent) {
if (p->d_parent == p1) {
- down(&p1->d_inode->i_sem);
- down(&p2->d_inode->i_sem);
+ lock_dir(p1->d_inode, n1);
+ lock_dir(p2->d_inode, n2);
return p;
}
}
- down(&p1->d_inode->i_sem);
- down(&p2->d_inode->i_sem);
+ lock_dir(p1->d_inode, n1);
+ lock_dir(p2->d_inode, n2);
return NULL;
}
-void unlock_rename(struct dentry *p1, struct dentry *p2)
+void unlock_rename(struct dentry *p1, struct qstr *n1,
+ struct dentry *p2, struct qstr *n2)
{
- up(&p1->d_inode->i_sem);
+ unlock_dir(p1->d_inode, n1);
if (p1 != p2) {
- up(&p2->d_inode->i_sem);
+ unlock_dir(p2->d_inode, n2);
up(&p1->d_inode->i_sb->s_vfs_rename_sem);
+ } else if (IS_PDIROPS(p1->d_inode) &&
+ n1->hash != n2->hash) {
+ unlock_dir(p2->d_inode, n2);
}
}
@@ -1386,13 +1426,13 @@
dir = nd->dentry;
nd->flags &= ~LOOKUP_PARENT;
- down(&dir->d_inode->i_sem);
+ lock_dir(dir->d_inode, &nd->last);
dentry = __lookup_hash(&nd->last, nd->dentry, nd);
do_last:
error = PTR_ERR(dentry);
if (IS_ERR(dentry)) {
- up(&dir->d_inode->i_sem);
+ unlock_dir(dir->d_inode, &nd->last);
goto exit;
}
@@ -1401,7 +1441,7 @@
if (!IS_POSIXACL(dir->d_inode))
mode &= ~current->fs->umask;
error = vfs_create(dir->d_inode, dentry, mode, nd);
- up(&dir->d_inode->i_sem);
+ unlock_dir(dir->d_inode, &nd->last);
dput(nd->dentry);
nd->dentry = dentry;
if (error)
@@ -1415,7 +1455,7 @@
/*
* It already exists.
*/
- up(&dir->d_inode->i_sem);
+ unlock_dir(dir->d_inode, &nd->last);
error = -EEXIST;
if (flag & O_EXCL)
@@ -1499,7 +1539,7 @@
goto exit;
}
dir = nd->dentry;
- down(&dir->d_inode->i_sem);
+ lock_dir(dir->d_inode, &nd->last);
dentry = __lookup_hash(&nd->last, nd->dentry, nd);
putname(nd->last.name);
goto do_last;
@@ -1517,7 +1557,7 @@
{
struct dentry *dentry;
- down(&nd->dentry->d_inode->i_sem);
+ lock_dir(nd->dentry->d_inode, &nd->last);
dentry = ERR_PTR(-EEXIST);
if (nd->last_type != LAST_NORM)
goto fail;
@@ -1602,7 +1642,7 @@
}
dput(dentry);
}
- up(&nd.dentry->d_inode->i_sem);
+ unlock_dir(nd.dentry->d_inode, &nd.last);
path_release(&nd);
out:
putname(tmp);
@@ -1656,7 +1696,7 @@
error = vfs_mkdir(nd.dentry->d_inode, dentry, mode);
dput(dentry);
}
- up(&nd.dentry->d_inode->i_sem);
+ unlock_dir(nd.dentry->d_inode, &nd.last);
path_release(&nd);
out:
putname(tmp);
@@ -1757,14 +1797,14 @@
error = -EBUSY;
goto exit1;
}
- down(&nd.dentry->d_inode->i_sem);
+ lock_dir(nd.dentry->d_inode, &nd.last);
dentry = lookup_hash(&nd.last, nd.dentry);
error = PTR_ERR(dentry);
if (!IS_ERR(dentry)) {
error = vfs_rmdir(nd.dentry->d_inode, dentry);
dput(dentry);
}
- up(&nd.dentry->d_inode->i_sem);
+ unlock_dir(nd.dentry->d_inode, &nd.last);
exit1:
path_release(&nd);
exit:
@@ -1826,7 +1866,7 @@
error = -EISDIR;
if (nd.last_type != LAST_NORM)
goto exit1;
- down(&nd.dentry->d_inode->i_sem);
+ lock_dir(nd.dentry->d_inode, &nd.last);
dentry = lookup_hash(&nd.last, nd.dentry);
error = PTR_ERR(dentry);
if (!IS_ERR(dentry)) {
@@ -1840,7 +1880,7 @@
exit2:
dput(dentry);
}
- up(&nd.dentry->d_inode->i_sem);
+ unlock_dir(nd.dentry->d_inode, &nd.last);
if (inode)
iput(inode); /* truncate the inode here */
exit1:
@@ -1902,7 +1942,7 @@
error = vfs_symlink(nd.dentry->d_inode, dentry, from, S_IALLUGO);
dput(dentry);
}
- up(&nd.dentry->d_inode->i_sem);
+ unlock_dir(nd.dentry->d_inode, &nd.last);
path_release(&nd);
out:
putname(to);
@@ -1986,7 +2026,7 @@
error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry);
dput(new_dentry);
}
- up(&nd.dentry->d_inode->i_sem);
+ unlock_dir(nd.dentry->d_inode, &nd.last);
out_release:
path_release(&nd);
out:
@@ -2174,7 +2214,7 @@
if (newnd.last_type != LAST_NORM)
goto exit2;
- trap = lock_rename(new_dir, old_dir);
+ trap = lock_rename(new_dir, &newnd.last, old_dir, &oldnd.last);
old_dentry = lookup_hash(&oldnd.last, old_dir);
error = PTR_ERR(old_dentry);
@@ -2212,7 +2252,7 @@
exit4:
dput(old_dentry);
exit3:
- unlock_rename(new_dir, old_dir);
+ unlock_rename(new_dir, &newnd.last, old_dir, &oldnd.last);
exit2:
path_release(&newnd);
exit1:
Index: linux-2.6.10/fs/super.c
===================================================================
--- linux-2.6.10.orig/fs/super.c 2005-01-28 19:32:13.000000000 +0300
+++ linux-2.6.10/fs/super.c 2005-02-19 18:11:50.000000000 +0300
@@ -172,6 +172,10 @@
down_write(&s->s_umount);
fs->kill_sb(s);
put_filesystem(fs);
+ if (s->s_flags & S_PDIROPS) {
+ BUG_ON(s->s_pdirops_sems == NULL);
+ kfree(s->s_pdirops_sems);
+ }
put_super(s);
}
}
@@ -791,6 +795,22 @@
EXPORT_SYMBOL(get_sb_single);
+static int init_pdirops_sems(struct super_block *sb)
+{
+ int i;
+
+ /* XXX: should be memsize dependent? -bzzz */
+ sb->s_pdirops_size = 1024;
+
+ sb->s_pdirops_sems = kmalloc(sizeof(struct semaphore) *
+ sb->s_pdirops_size, GFP_KERNEL);
+ if (sb->s_pdirops_sems == NULL)
+ return -ENOMEM;
+ for (i = 0; i < sb->s_pdirops_size; i++)
+ sema_init(&sb->s_pdirops_sems[i], 1);
+ return 0;
+}
+
struct vfsmount *
do_kern_mount(const char *fstype, int flags, const char *name, void *data)
{
@@ -827,6 +847,11 @@
error = security_sb_kern_mount(sb, secdata);
if (error)
goto out_sb;
+ if (sb->s_flags & S_PDIROPS) {
+ error = init_pdirops_sems(sb);
+ if (error)
+ goto out_sb;
+ }
mnt->mnt_sb = sb;
mnt->mnt_root = dget(sb->s_root);
mnt->mnt_mountpoint = sb->s_root;
Index: linux-2.6.10/include/linux/fs.h
===================================================================
--- linux-2.6.10.orig/include/linux/fs.h 2005-01-28 19:32:15.000000000 +0300
+++ linux-2.6.10/include/linux/fs.h 2005-02-19 18:11:50.000000000 +0300
@@ -150,6 +150,7 @@
#define S_DIRSYNC 64 /* Directory modifications are synchronous */
#define S_NOCMTIME 128 /* Do not update file c/mtime */
#define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */
+#define S_PDIROPS 512 /* Parallel directory operations */
/*
* Note that nosuid etc flags are inode-specific: setting some file-system
@@ -180,6 +181,7 @@
#define IS_NODIRATIME(inode) __IS_FLG(inode, MS_NODIRATIME)
#define IS_POSIXACL(inode) __IS_FLG(inode, MS_POSIXACL)
#define IS_ONE_SECOND(inode) __IS_FLG(inode, MS_ONE_SECOND)
+#define IS_PDIROPS(inode) __IS_FLG(inode, S_PDIROPS)
#define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD)
#define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME)
@@ -797,6 +799,8 @@
* even looking at it. You had been warned.
*/
struct semaphore s_vfs_rename_sem; /* Kludge */
+ struct semaphore *s_pdirops_sems;
+ int s_pdirops_size;
};
/*
Index: linux-2.6.10/include/linux/namei.h
===================================================================
--- linux-2.6.10.orig/include/linux/namei.h 2005-01-28 19:29:50.000000000 +0300
+++ linux-2.6.10/include/linux/namei.h 2005-02-19 19:59:19.291357592 +0300
@@ -69,8 +69,10 @@
extern int follow_down(struct vfsmount **, struct dentry **);
extern int follow_up(struct vfsmount **, struct dentry **);
-extern struct dentry *lock_rename(struct dentry *, struct dentry *);
-extern void unlock_rename(struct dentry *, struct dentry *);
+extern struct dentry *lock_rename(struct dentry *, struct qstr *,
+ struct dentry *, struct qstr *);
+extern void unlock_rename(struct dentry *, struct qstr *,
+ struct dentry *, struct qstr *);
static inline void nd_set_link(struct nameidata *nd, char *path)
{
Index: linux-2.6.10/mm/shmem.c
===================================================================
--- linux-2.6.10.orig/mm/shmem.c 2005-01-28 19:32:16.000000000 +0300
+++ linux-2.6.10/mm/shmem.c 2005-02-19 20:05:32.642599576 +0300
@@ -1849,7 +1849,7 @@
#endif
};
-static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes)
+static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes, struct super_block *sb)
{
char *this_char, *value, *rest;
@@ -1858,6 +1858,9 @@
continue;
if ((value = strchr(this_char,'=')) != NULL) {
*value++ = 0;
+ } else if (!strcmp(this_char,"pdirops")) {
+ sb->s_flags |= S_PDIROPS;
+ continue;
} else {
printk(KERN_ERR
"tmpfs: No value for mount option '%s'\n",
@@ -1928,7 +1931,7 @@
max_blocks = sbinfo->max_blocks;
max_inodes = sbinfo->max_inodes;
}
- if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks, &max_inodes))
+ if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks, &max_inodes, sb))
return -EINVAL;
/* Keep it simple: disallow limited <-> unlimited remount */
if ((max_blocks || max_inodes) == !sbinfo)
@@ -1978,7 +1981,7 @@
inodes = blocks;
if (shmem_parse_options(data, &mode,
- &uid, &gid, &blocks, &inodes))
+ &uid, &gid, &blocks, &inodes, sb))
return -EINVAL;
}
Alex Tomas wrote:
> +static inline struct semaphore * lock_sem(struct inode *dir, struct qstr *name)
> +{
> + if (IS_PDIROPS(dir)) {
> + struct super_block *sb;
> + /* name->hash expected to be already calculated */
> + sb = dir->i_sb;
> + BUG_ON(sb->s_pdirops_sems == NULL);
> + return sb->s_pdirops_sems + name->hash % sb->s_pdirops_size;
> + }
> + return &dir->i_sem;
> +}
> +
> +static inline void lock_dir(struct inode *dir, struct qstr *name)
> +{
> + down(lock_sem(dir, name));
> +}
> +
> @@ -1182,12 +1204,26 @@
> /*
> * p1 and p2 should be directories on the same fs.
> */
> -struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
> +struct dentry *lock_rename(struct dentry *p1, struct qstr *n1,
> + struct dentry *p2, struct qstr *n2)
> {
> struct dentry *p;
>
> if (p1 == p2) {
> - down(&p1->d_inode->i_sem);
> + if (IS_PDIROPS(p1->d_inode)) {
> + unsigned int h1, h2;
> + h1 = n1->hash % p1->d_inode->i_sb->s_pdirops_size;
> + h2 = n2->hash % p2->d_inode->i_sb->s_pdirops_size;
> + if (h1 < h2) {
> + lock_dir(p1->d_inode, n1);
> + lock_dir(p2->d_inode, n2);
> + } else if (h1 > h2) {
> + lock_dir(p2->d_inode, n2);
> + lock_dir(p1->d_inode, n1);
> + } else
> + lock_dir(p1->d_inode, n1);
> + } else
> + down(&p1->d_inode->i_sem);
> return NULL;
> }
>
> @@ -1195,31 +1231,35 @@
>
> for (p = p1; p->d_parent != p; p = p->d_parent) {
> if (p->d_parent == p2) {
> - down(&p2->d_inode->i_sem);
> - down(&p1->d_inode->i_sem);
> + lock_dir(p2->d_inode, n2);
> + lock_dir(p1->d_inode, n1);
> return p;
> }
> }
>
> for (p = p2; p->d_parent != p; p = p->d_parent) {
> if (p->d_parent == p1) {
> - down(&p1->d_inode->i_sem);
> - down(&p2->d_inode->i_sem);
> + lock_dir(p1->d_inode, n1);
> + lock_dir(p2->d_inode, n2);
> return p;
> }
> }
>
> - down(&p1->d_inode->i_sem);
> - down(&p2->d_inode->i_sem);
> + lock_dir(p1->d_inode, n1);
> + lock_dir(p2->d_inode, n2);
> return NULL;
> }
With luck you have s_pdirops_size (or 1024) different renames altering
concurrently one directory inode. Therefore you need a lock protecting
your filesystem data. This is basically the job done by i_sem. So in my
opinion you only move "The Problem" from the VFS to the lowlevel
filesystems. But then there is no need for i_sem or your s_pdirops_sems
anymore.
Regards,
Jan
>>>>> Jan Blunck (JB) writes:
JB> With luck you have s_pdirops_size (or 1024) different renames altering
JB> concurrently one directory inode. Therefore you need a lock protecting
JB> your filesystem data. This is basically the job done by i_sem. So in
JB> my opinion you only move "The Problem" from the VFS to the lowlevel
JB> filesystems. But then there is no need for i_sem or your
JB> s_pdirops_sems anymore.
1) i_sem protects dcache too
2) tmpfs has no "own" data, so we can use it this way (see 2nd patch)
3) I have pdirops patch for ext3, but it needs some cleaning ...
thanks, Alex