2004-06-23 17:10:51

by Trond Myklebust

[permalink] [raw]
Subject: [PATCH] Make POSIX locks compatible with the NPTL thread model

fs/locks.c | 46 ++++------------------------------------------
fs/nfs/direct.c | 4 ++--
fs/nfs/file.c | 2 +-
fs/nfs/nfs4proc.c | 12 ++++++------
fs/nfs/nfs4state.c | 18 +++++++++---------
fs/nfs/nfs4xdr.c | 8 ++++----
fs/nfs/read.c | 4 ++--
fs/nfs/write.c | 4 ++--
fs/open.c | 2 +-
include/linux/fs.h | 2 +-
include/linux/nfs_fs.h | 8 ++++----
include/linux/nfs_page.h | 2 +-
include/linux/nfs_xdr.h | 4 ++--
13 files changed, 39 insertions(+), 77 deletions(-)

diff -u --recursive --new-file --show-c-function linux-2.6.7-00-fix_locks/fs/locks.c linux-2.6.7-01-lock_owner_fixup/fs/locks.c
--- linux-2.6.7-00-fix_locks/fs/locks.c 2004-06-23 12:46:29.000000000 -0400
+++ linux-2.6.7-01-lock_owner_fixup/fs/locks.c 2004-06-22 16:48:19.000000000 -0400
@@ -317,7 +317,7 @@ static int flock_to_posix_lock(struct fi
if (l->l_len == 0)
fl->fl_end = OFFSET_MAX;

- fl->fl_owner = current->files;
+ fl->fl_owner = 0;
fl->fl_pid = current->tgid;
fl->fl_file = filp;
fl->fl_flags = FL_POSIX;
@@ -357,7 +357,7 @@ static int flock64_to_posix_lock(struct
if (l->l_len == 0)
fl->fl_end = OFFSET_MAX;

- fl->fl_owner = current->files;
+ fl->fl_owner = 0;
fl->fl_pid = current->tgid;
fl->fl_file = filp;
fl->fl_flags = FL_POSIX;
@@ -920,7 +920,6 @@ int posix_lock_file(struct file *filp, s
*/
int locks_mandatory_locked(struct inode *inode)
{
- fl_owner_t owner = current->files;
unsigned int pid = current->tgid;
struct file_lock *fl;

@@ -931,7 +930,7 @@ int locks_mandatory_locked(struct inode
for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
if (!IS_POSIX(fl))
continue;
- if (fl->fl_owner != owner)
+ if (fl->fl_owner != 0)
break;
if (fl->fl_pid != pid)
break;
@@ -961,7 +960,7 @@ int locks_mandatory_area(int read_write,
int error;

locks_init_lock(&fl);
- fl.fl_owner = current->files;
+ fl.fl_owner = 0;
fl.fl_pid = current->tgid;
fl.fl_file = filp;
fl.fl_flags = FL_POSIX | FL_ACCESS;
@@ -1985,18 +1984,6 @@ int lock_may_write(struct inode *inode,

EXPORT_SYMBOL(lock_may_write);

-static inline void __steal_locks(struct file *file, fl_owner_t from)
-{
- struct inode *inode = file->f_dentry->d_inode;
- struct file_lock *fl = inode->i_flock;
-
- while (fl) {
- if (fl->fl_file == file && fl->fl_owner == from)
- fl->fl_owner = current->files;
- fl = fl->fl_next;
- }
-}
-
/* When getting ready for executing a binary, we make sure that current
* has a files_struct on its own. Before dropping the old files_struct,
* we take over ownership of all locks for all file descriptors we own.
@@ -2005,31 +1992,6 @@ static inline void __steal_locks(struct
*/
void steal_locks(fl_owner_t from)
{
- struct files_struct *files = current->files;
- int i, j;
-
- if (from == files)
- return;
-
- lock_kernel();
- j = 0;
- for (;;) {
- unsigned long set;
- i = j * __NFDBITS;
- if (i >= files->max_fdset || i >= files->max_fds)
- break;
- set = files->open_fds->fds_bits[j++];
- while (set) {
- if (set & 1) {
- struct file *file = files->fd[i];
- if (file)
- __steal_locks(file, from);
- }
- i++;
- set >>= 1;
- }
- }
- unlock_kernel();
}
EXPORT_SYMBOL(steal_locks);

diff -u --recursive --new-file --show-c-function linux-2.6.7-00-fix_locks/fs/nfs/direct.c linux-2.6.7-01-lock_owner_fixup/fs/nfs/direct.c
--- linux-2.6.7-00-fix_locks/fs/nfs/direct.c 2004-06-16 14:58:26.000000000 -0400
+++ linux-2.6.7-01-lock_owner_fixup/fs/nfs/direct.c 2004-06-22 16:48:19.000000000 -0400
@@ -129,7 +129,7 @@ nfs_direct_read_seg(struct inode *inode,
.inode = inode,
.args = {
.fh = NFS_FH(inode),
- .lockowner = current->files,
+ .pid = current->tgid,
},
.res = {
.fattr = &rdata.fattr,
@@ -259,7 +259,7 @@ nfs_direct_write_seg(struct inode *inode
.inode = inode,
.args = {
.fh = NFS_FH(inode),
- .lockowner = current->files,
+ .pid = current->tgid,
},
.res = {
.fattr = &wdata.fattr,
diff -u --recursive --new-file --show-c-function linux-2.6.7-00-fix_locks/fs/nfs/file.c linux-2.6.7-01-lock_owner_fixup/fs/nfs/file.c
--- linux-2.6.7-00-fix_locks/fs/nfs/file.c 2004-06-16 14:58:08.000000000 -0400
+++ linux-2.6.7-01-lock_owner_fixup/fs/nfs/file.c 2004-06-22 16:48:19.000000000 -0400
@@ -340,7 +340,7 @@ nfs_lock(struct file *filp, int cmd, str
* Not sure whether that would be unique, though, or whether
* that would break in other places.
*/
- if (!fl->fl_owner || !(fl->fl_flags & FL_POSIX))
+ if (!(fl->fl_flags & FL_POSIX))
return -ENOLCK;

/*
diff -u --recursive --new-file --show-c-function linux-2.6.7-00-fix_locks/fs/nfs/nfs4proc.c linux-2.6.7-01-lock_owner_fixup/fs/nfs/nfs4proc.c
--- linux-2.6.7-00-fix_locks/fs/nfs/nfs4proc.c 2004-06-16 14:58:17.000000000 -0400
+++ linux-2.6.7-01-lock_owner_fixup/fs/nfs/nfs4proc.c 2004-06-22 16:48:19.000000000 -0400
@@ -1529,7 +1529,7 @@ nfs4_request_init(struct nfs_page *req,
state = (struct nfs4_state *)filp->private_data;
req->wb_state = state;
req->wb_cred = get_rpccred(state->owner->so_cred);
- req->wb_lockowner = current->files;
+ req->wb_pid = current->tgid;
}

static int
@@ -1653,7 +1653,7 @@ nfs4_request_compatible(struct nfs_page
state = (struct nfs4_state *)filp->private_data;
if (req->wb_state != state)
return 0;
- if (req->wb_lockowner != current->files)
+ if (req->wb_pid != current->tgid)
return 0;
cred = state->owner->so_cred;
if (req->wb_cred != cred)
@@ -1780,7 +1780,7 @@ nfs4_proc_getlk(struct nfs4_state *state

nlo.clientid = clp->cl_clientid;
down(&state->lock_sema);
- lsp = nfs4_find_lock_state(state, request->fl_owner);
+ lsp = nfs4_find_lock_state(state, request->fl_pid);
if (lsp)
nlo.id = lsp->ls_id;
else {
@@ -1839,7 +1839,7 @@ nfs4_proc_unlck(struct nfs4_state *state
int status = 0;

down(&state->lock_sema);
- lsp = nfs4_find_lock_state(state, request->fl_owner);
+ lsp = nfs4_find_lock_state(state, request->fl_pid);
if (!lsp)
goto out;
luargs.seqid = lsp->ls_seqid;
@@ -1886,7 +1886,7 @@ nfs4_proc_setlk(struct nfs4_state *state
int status;

down(&state->lock_sema);
- lsp = nfs4_find_lock_state(state, request->fl_owner);
+ lsp = nfs4_find_lock_state(state, request->fl_pid);
if (lsp == NULL) {
struct nfs4_state_owner *owner = state->owner;
struct nfs_open_to_lock otl = {
@@ -1895,7 +1895,7 @@ nfs4_proc_setlk(struct nfs4_state *state
},
};
status = -ENOMEM;
- lsp = nfs4_alloc_lock_state(state, request->fl_owner);
+ lsp = nfs4_alloc_lock_state(state, request->fl_pid);
if (!lsp)
goto out;
otl.lock_seqid = lsp->ls_seqid;
diff -u --recursive --new-file --show-c-function linux-2.6.7-00-fix_locks/fs/nfs/nfs4state.c linux-2.6.7-01-lock_owner_fixup/fs/nfs/nfs4state.c
--- linux-2.6.7-00-fix_locks/fs/nfs/nfs4state.c 2004-06-16 14:57:44.000000000 -0400
+++ linux-2.6.7-01-lock_owner_fixup/fs/nfs/nfs4state.c 2004-06-22 16:48:19.000000000 -0400
@@ -496,11 +496,11 @@ nfs4_close_state(struct nfs4_state *stat
* that is compatible with current->files
*/
static struct nfs4_lock_state *
-__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
+__nfs4_find_lock_state(struct nfs4_state *state, unsigned int pid)
{
struct nfs4_lock_state *pos;
list_for_each_entry(pos, &state->lock_states, ls_locks) {
- if (pos->ls_owner != fl_owner)
+ if (pos->ls_pid != pid)
continue;
atomic_inc(&pos->ls_count);
return pos;
@@ -509,11 +509,11 @@ __nfs4_find_lock_state(struct nfs4_state
}

struct nfs4_lock_state *
-nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
+nfs4_find_lock_state(struct nfs4_state *state, unsigned int pid)
{
struct nfs4_lock_state *lsp;
read_lock(&state->state_lock);
- lsp = __nfs4_find_lock_state(state, fl_owner);
+ lsp = __nfs4_find_lock_state(state, pid);
read_unlock(&state->state_lock);
return lsp;
}
@@ -525,7 +525,7 @@ nfs4_find_lock_state(struct nfs4_state *
* The caller must be holding state->lock_sema
*/
struct nfs4_lock_state *
-nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
+nfs4_alloc_lock_state(struct nfs4_state *state, unsigned int pid)
{
struct nfs4_lock_state *lsp;
struct nfs4_client *clp = state->owner->so_client;
@@ -537,7 +537,7 @@ nfs4_alloc_lock_state(struct nfs4_state
lsp->ls_id = -1;
memset(lsp->ls_stateid.data, 0, sizeof(lsp->ls_stateid.data));
atomic_set(&lsp->ls_count, 1);
- lsp->ls_owner = fl_owner;
+ lsp->ls_pid = pid;
lsp->ls_parent = state;
INIT_LIST_HEAD(&lsp->ls_locks);
spin_lock(&clp->cl_lock);
@@ -551,12 +551,12 @@ nfs4_alloc_lock_state(struct nfs4_state
* requests.
*/
void
-nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner)
+nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, unsigned int pid)
{
if (test_bit(LK_STATE_IN_USE, &state->flags)) {
struct nfs4_lock_state *lsp;

- lsp = nfs4_find_lock_state(state, fl_owner);
+ lsp = nfs4_find_lock_state(state, pid);
if (lsp) {
memcpy(dst, &lsp->ls_stateid, sizeof(*dst));
nfs4_put_lock_state(lsp);
@@ -628,7 +628,7 @@ nfs4_notify_unlck(struct inode *inode, s
for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
if (!(fl->fl_flags & FL_POSIX))
continue;
- if (fl->fl_owner != lsp->ls_owner)
+ if (fl->fl_pid != lsp->ls_pid)
continue;
/* Exit if we find at least one lock which is not consumed */
if (nfs4_check_unlock(fl,request) == 0)
diff -u --recursive --new-file --show-c-function linux-2.6.7-00-fix_locks/fs/nfs/nfs4xdr.c linux-2.6.7-01-lock_owner_fixup/fs/nfs/nfs4xdr.c
--- linux-2.6.7-00-fix_locks/fs/nfs/nfs4xdr.c 2004-06-16 14:58:28.000000000 -0400
+++ linux-2.6.7-01-lock_owner_fixup/fs/nfs/nfs4xdr.c 2004-06-22 16:48:19.000000000 -0400
@@ -887,7 +887,7 @@ static int encode_putrootfh(struct xdr_s
return 0;
}

-static void encode_stateid(struct xdr_stream *xdr, struct nfs4_state *state, fl_owner_t lockowner)
+static void encode_stateid(struct xdr_stream *xdr, struct nfs4_state *state, unsigned int pid)
{
extern nfs4_stateid zero_stateid;
nfs4_stateid stateid;
@@ -895,7 +895,7 @@ static void encode_stateid(struct xdr_st

RESERVE_SPACE(16);
if (state != NULL) {
- nfs4_copy_stateid(&stateid, state, lockowner);
+ nfs4_copy_stateid(&stateid, state, pid);
WRITEMEM(stateid.data, sizeof(stateid.data));
} else
WRITEMEM(zero_stateid.data, sizeof(zero_stateid.data));
@@ -908,7 +908,7 @@ static int encode_read(struct xdr_stream
RESERVE_SPACE(4);
WRITE32(OP_READ);

- encode_stateid(xdr, args->state, args->lockowner);
+ encode_stateid(xdr, args->state, args->pid);

RESERVE_SPACE(12);
WRITE64(args->offset);
@@ -1075,7 +1075,7 @@ static int encode_write(struct xdr_strea
RESERVE_SPACE(4);
WRITE32(OP_WRITE);

- encode_stateid(xdr, args->state, args->lockowner);
+ encode_stateid(xdr, args->state, args->pid);

RESERVE_SPACE(16);
WRITE64(args->offset);
diff -u --recursive --new-file --show-c-function linux-2.6.7-00-fix_locks/fs/nfs/read.c linux-2.6.7-01-lock_owner_fixup/fs/nfs/read.c
--- linux-2.6.7-00-fix_locks/fs/nfs/read.c 2004-06-16 14:57:41.000000000 -0400
+++ linux-2.6.7-01-lock_owner_fixup/fs/nfs/read.c 2004-06-22 16:48:19.000000000 -0400
@@ -108,7 +108,7 @@ nfs_readpage_sync(struct file *file, str
rdata->inode = inode;
INIT_LIST_HEAD(&rdata->pages);
rdata->args.fh = NFS_FH(inode);
- rdata->args.lockowner = current->files;
+ rdata->args.pid = current->tgid;
rdata->args.pages = &page;
rdata->args.pgbase = 0UL;
rdata->args.count = rsize;
@@ -225,7 +225,7 @@ static void nfs_read_rpcsetup(struct nfs
data->args.pgbase = req->wb_pgbase + offset;
data->args.pages = data->pagevec;
data->args.count = count;
- data->args.lockowner = req->wb_lockowner;
+ data->args.pid = req->wb_pid;
data->args.state = req->wb_state;

data->res.fattr = &data->fattr;
diff -u --recursive --new-file --show-c-function linux-2.6.7-00-fix_locks/fs/nfs/write.c linux-2.6.7-01-lock_owner_fixup/fs/nfs/write.c
--- linux-2.6.7-00-fix_locks/fs/nfs/write.c 2004-06-16 14:58:17.000000000 -0400
+++ linux-2.6.7-01-lock_owner_fixup/fs/nfs/write.c 2004-06-22 16:48:19.000000000 -0400
@@ -189,7 +189,7 @@ static int nfs_writepage_sync(struct fil
wdata->flags = how;
wdata->inode = inode;
wdata->args.fh = NFS_FH(inode);
- wdata->args.lockowner = current->files;
+ wdata->args.pid = current->tgid;
wdata->args.pages = &page;
wdata->args.stable = NFS_FILE_SYNC;
wdata->args.pgbase = offset;
@@ -868,7 +868,7 @@ static void nfs_write_rpcsetup(struct nf
data->args.pgbase = req->wb_pgbase + offset;
data->args.pages = data->pagevec;
data->args.count = count;
- data->args.lockowner = req->wb_lockowner;
+ data->args.pid = req->wb_pid;
data->args.state = req->wb_state;

data->res.fattr = &data->fattr;
diff -u --recursive --new-file --show-c-function linux-2.6.7-00-fix_locks/fs/open.c linux-2.6.7-01-lock_owner_fixup/fs/open.c
--- linux-2.6.7-00-fix_locks/fs/open.c 2004-06-16 14:57:46.000000000 -0400
+++ linux-2.6.7-01-lock_owner_fixup/fs/open.c 2004-06-22 16:48:19.000000000 -0400
@@ -1007,7 +1007,7 @@ int filp_close(struct file *filp, fl_own
}

dnotify_flush(filp, id);
- locks_remove_posix(filp, id);
+ locks_remove_posix(filp, 0);
fput(filp);
return retval;
}
diff -u --recursive --new-file --show-c-function linux-2.6.7-00-fix_locks/include/linux/fs.h linux-2.6.7-01-lock_owner_fixup/include/linux/fs.h
--- linux-2.6.7-00-fix_locks/include/linux/fs.h 2004-06-16 14:57:54.000000000 -0400
+++ linux-2.6.7-01-lock_owner_fixup/include/linux/fs.h 2004-06-22 16:48:19.000000000 -0400
@@ -625,7 +625,7 @@ struct file_lock {
struct file_lock *fl_next; /* singly linked list for this inode */
struct list_head fl_link; /* doubly linked list of all locks */
struct list_head fl_block; /* circular list of blocked processes */
- fl_owner_t fl_owner;
+ fl_owner_t fl_owner; /* 0 if lock owned by a local process */
unsigned int fl_pid;
wait_queue_head_t fl_wait;
struct file *fl_file;
diff -u --recursive --new-file --show-c-function linux-2.6.7-00-fix_locks/include/linux/nfs_fs.h linux-2.6.7-01-lock_owner_fixup/include/linux/nfs_fs.h
--- linux-2.6.7-00-fix_locks/include/linux/nfs_fs.h 2004-06-16 14:58:01.000000000 -0400
+++ linux-2.6.7-01-lock_owner_fixup/include/linux/nfs_fs.h 2004-06-22 16:48:19.000000000 -0400
@@ -595,7 +595,7 @@ struct nfs4_state_owner {

struct nfs4_lock_state {
struct list_head ls_locks; /* Other lock stateids */
- fl_owner_t ls_owner; /* POSIX lock owner */
+ unsigned int ls_pid; /* pid of owner process */
struct nfs4_state * ls_parent; /* Parent nfs4_state */
u32 ls_seqid;
u32 ls_id;
@@ -665,13 +665,13 @@ extern struct nfs4_state *nfs4_find_stat
extern void nfs4_increment_seqid(int status, struct nfs4_state_owner *sp);
extern int nfs4_handle_error(struct nfs_server *, int);
extern void nfs4_schedule_state_recovery(struct nfs4_client *);
-extern struct nfs4_lock_state *nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t);
-extern struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t);
+extern struct nfs4_lock_state *nfs4_find_lock_state(struct nfs4_state *state, unsigned int pid);
+extern struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, unsigned int pid);
extern void nfs4_put_lock_state(struct nfs4_lock_state *state);
extern void nfs4_increment_lock_seqid(int status, struct nfs4_lock_state *ls);
extern void nfs4_notify_setlk(struct inode *, struct file_lock *, struct nfs4_lock_state *);
extern void nfs4_notify_unlck(struct inode *, struct file_lock *, struct nfs4_lock_state *);
-extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
+extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, unsigned int pid);



diff -u --recursive --new-file --show-c-function linux-2.6.7-00-fix_locks/include/linux/nfs_page.h linux-2.6.7-01-lock_owner_fixup/include/linux/nfs_page.h
--- linux-2.6.7-00-fix_locks/include/linux/nfs_page.h 2004-06-16 14:57:47.000000000 -0400
+++ linux-2.6.7-01-lock_owner_fixup/include/linux/nfs_page.h 2004-06-22 16:48:19.000000000 -0400
@@ -30,7 +30,7 @@ struct nfs_page {
struct list_head wb_list, /* Defines state of page: */
*wb_list_head; /* read/write/commit */
struct file *wb_file;
- fl_owner_t wb_lockowner;
+ unsigned int wb_pid;
struct inode *wb_inode;
struct rpc_cred *wb_cred;
struct nfs4_state *wb_state;
diff -u --recursive --new-file --show-c-function linux-2.6.7-00-fix_locks/include/linux/nfs_xdr.h linux-2.6.7-01-lock_owner_fixup/include/linux/nfs_xdr.h
--- linux-2.6.7-00-fix_locks/include/linux/nfs_xdr.h 2004-06-16 14:58:18.000000000 -0400
+++ linux-2.6.7-01-lock_owner_fixup/include/linux/nfs_xdr.h 2004-06-22 16:48:19.000000000 -0400
@@ -235,7 +235,7 @@ struct nfs_lockres {

struct nfs_readargs {
struct nfs_fh * fh;
- fl_owner_t lockowner;
+ unsigned int pid;
struct nfs4_state * state;
__u64 offset;
__u32 count;
@@ -259,7 +259,7 @@ struct nfs_readres {

struct nfs_writeargs {
struct nfs_fh * fh;
- fl_owner_t lockowner;
+ unsigned int pid;
struct nfs4_state * state;
__u64 offset;
__u32 count;


Attachments:
linux-2.6.7-02-lock_owner_fixup.dif (17.34 kB)

2004-06-23 19:30:10

by Chris Wright

[permalink] [raw]
Subject: Re: [PATCH] Make POSIX locks compatible with the NPTL thread model

* Trond Myklebust ([email protected]) wrote:
> At some point in 2.5.x we introduced the NPTL thread model at the kernel
> level, and hence redefined the idea of a process: a process appears
> currently to be defined as one or more threads with the same tgid...
> However we failed to completely update the POSIX locking code to reflect
> that change: currently, the POSIX locking code defines the process to be
> a set of one or more threads with the same tgid and a shared file
> table...
>
> As a result we end up with abominations like the steal_locks() function
> that is required in order to move the locks from from one file table to
> another on exec etc.

Just ran some quick tests to verify this patch still works fine with
execve() after plain CLONE_FILES as well as full CLONE_THREAD. Passed
my tests. Nice to see the steal_locks bit go. However, without this
patch (only the prior one, I'm getting an oops).

thanks,
-chris
--
Linux Security Modules http://lsm.immunix.org http://lsm.bkbits.net

2004-06-23 19:50:18

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH] Make POSIX locks compatible with the NPTL thread model

P? on , 23/06/2004 klokka 15:29, skreiv Chris Wright:
> Just ran some quick tests to verify this patch still works fine with
> execve() after plain CLONE_FILES as well as full CLONE_THREAD. Passed
> my tests. Nice to see the steal_locks bit go. However, without this
> patch (only the prior one, I'm getting an oops).

Can you show us the Oops? I'm surprised that should be occurring...

Note that with both patches, we still have some problems in the locking
downcall interface to the filesystem. Currently there exists an
atomicity problem: after the call to ->lock() there is a window during
which the filesystem believes that a lock may have been taken/released
but the VFS has not yet registered this fact. In 2.4.x, this window did
not exist because the VFS held the BKL from beginning to end.

As far as NFS is concerned, this makes use of the VFS in order to
recover from server reboots (which is in fact the only reason why we
bother to have the VFS track the locking) buggy...

IMHO, NFS/CIFS/... should just call posix_lock_file() directly from
their ->lock() method. That way they will also be able to full take
responsibility for protecting the call.

Cheers,
Trond

2004-06-23 21:44:59

by Chris Wright

[permalink] [raw]
Subject: Re: [PATCH] Make POSIX locks compatible with the NPTL thread model

* Trond Myklebust ([email protected]) wrote:
> P? on , 23/06/2004 klokka 15:29, skreiv Chris Wright:
> > Just ran some quick tests to verify this patch still works fine with
> > execve() after plain CLONE_FILES as well as full CLONE_THREAD. Passed
> > my tests. Nice to see the steal_locks bit go. However, without this
> > patch (only the prior one, I'm getting an oops).
>
> Can you show us the Oops? I'm surprised that should be occurring...

Yes, it's a BUG in locks_remove_flock. The first patch changes
locks_remove_posix, so posix lock is missed on filp_close and the BUG
is hit. I believe the problem is that locks can have same fl_owner,
w/out having same tgid.

kernel BUG at fs/locks.c:1729!
invalid operand: 0000 [#1]
PREEMPT SMP
Modules linked in:
CPU: 0
EIP: 0060:[<c017b890>] Not tainted
EFLAGS: 00010246 (2.6.7)
EIP is at locks_remove_flock+0xf1/0x131
eax: d5c67224 ebx: d555cae8 ecx: d5194190 edx: 00000000
esi: d555ca40 edi: d520e45c ebp: d5346efc esp: d5346ee8
ds: 007b es: 007b ss: 0068
Process lock_clone_exec (pid: 1864, threadinfo=d5346000 task=d5194190)
Stack: 7fffffff 00000000 d520e45c 00000000 d7e438b4 d5346f20 c01635a8 d520e45c
00000000 d555ca40 d51c2544 d520e45c 00000000 d5866750 d5346f3c c0161c38
d520e45c d5866750 00000001 00000003 00000009 d5346f64 c012058f d520e45c
Call Trace:
[<c010529f>] show_stack+0x80/0x96
[<c0105436>] show_registers+0x15f/0x1ae
[<c01055d3>] die+0xb5/0x17a
[<c01059e9>] do_invalid_op+0xb5/0xb7
[<c0104f45>] error_code+0x2d/0x38
[<c01635a8>] __fput+0x34/0x142
[<c0161c38>] filp_close+0x57/0x81
[<c012058f>] put_files_struct+0x8c/0xff
[<c012152c>] do_exit+0x1fb/0x63a
[<c0121a2d>] do_group_exit+0x3d/0x177
[<c0104469>] sysenter_past_esp+0x52/0x71
Code: 0f 0b c1 06 04 7c 57 c0 e9 68 ff ff ff c7 44 24 04 02 00 00

> Note that with both patches, we still have some problems in the locking
> downcall interface to the filesystem. Currently there exists an
> atomicity problem: after the call to ->lock() there is a window during
> which the filesystem believes that a lock may have been taken/released
> but the VFS has not yet registered this fact. In 2.4.x, this window did
> not exist because the VFS held the BKL from beginning to end.
>
> As far as NFS is concerned, this makes use of the VFS in order to
> recover from server reboots (which is in fact the only reason why we
> bother to have the VFS track the locking) buggy...
>
> IMHO, NFS/CIFS/... should just call posix_lock_file() directly from
> their ->lock() method. That way they will also be able to full take
> responsibility for protecting the call.
>
> Cheers,
> Trond

--
Linux Security Modules http://lsm.immunix.org http://lsm.bkbits.net

2004-06-23 23:02:13

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH] Make POSIX locks compatible with the NPTL thread model

P? on , 23/06/2004 klokka 17:41, skreiv Chris Wright:

> Yes, it's a BUG in locks_remove_flock. The first patch changes
> locks_remove_posix, so posix lock is missed on filp_close and the BUG
> is hit. I believe the problem is that locks can have same fl_owner,
> w/out having same tgid.

Yep. I agree with that analysis. Blech...

That just goes to show how broken the posix locks "logic" is when
applied to CLONE_FILES: POSIX locks are not supposed to be inherited by
child processes, and so the current VFS code checks both lock->fl_pid &
lock->fl_owner in tests such as posix_locks_conflict(). Currently (as I
said) on filp_close() we remove all locks with the same lock->fl_owner
without checking the pid.

However upon the last fput(), then in practice locks_remove_flock()
expects all locks with the same *file descriptor* to have been cleaned
up.

So what do we do? There is no way that we can keep the current rules, as
they provide no consistent way to inform the underlying NFS or CIFS
filesystem when to test for lock->fl_pid, when to test for
lock->fl_owner, and when to test for lock->fl_file.

Cheers,
Trond

2004-06-24 06:10:36

by Chris Wright

[permalink] [raw]
Subject: Re: [PATCH] Make POSIX locks compatible with the NPTL thread model

* Trond Myklebust ([email protected]) wrote:
> So what do we do? There is no way that we can keep the current rules, as
> they provide no consistent way to inform the underlying NFS or CIFS
> filesystem when to test for lock->fl_pid, when to test for
> lock->fl_owner, and when to test for lock->fl_file.

No bright ideas here. Each avenue I tried tonight was a dead end.

cheers,
-chris
--
Linux Security Modules http://lsm.immunix.org http://lsm.bkbits.net

2004-06-24 11:54:19

by Andreas Gruenbacher

[permalink] [raw]
Subject: Re: [PATCH] Make POSIX locks compatible with the NPTL thread model

Hi Trond,

On Wednesday 23 June 2004 19:07, Trond Myklebust wrote:
> Hi,
>
> At some point in 2.5.x we introduced the NPTL thread model at the kernel
> level, and hence redefined the idea of a process: a process appears
> currently to be defined as one or more threads with the same tgid...
> However we failed to completely update the POSIX locking code to reflect
> that change: currently, the POSIX locking code defines the process to be
> a set of one or more threads with the same tgid and a shared file
> table...
>
> As a result we end up with abominations like the steal_locks() function
> that is required in order to move the locks from from one file table to
> another on exec etc.

Nice to see you working on this. I briefly looked into how to fix the same
thing but didn't find the time to finish it. Here is what I thought; maybe
it's useful to you.

There are local and remote locks, and both of them need a pid discriminator.
We have used the files_struct pointer so far which was either a struct
files_struct pointer or a struct nlm_host pointer. By using pointers we had a
host+pid "uniquifier". Now we could change the fields from:

struct file_lock {
[...]
fl_owner_t fl_owner;
unsigned int fl_pid;
[...]
};

to:

struct file_lock {
[...]
struct nlm_host *fl_host; /* NULL for local locks */
unsigned int fl_pid;
[...]
};

There would be no casting of other types into a fl_host here.

Cheers,
--
Andreas Gruenbacher <[email protected]>
SUSE Labs, SUSE LINUX AG

2004-06-24 21:09:13

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH] Make POSIX locks compatible with the NPTL thread model

P? to , 24/06/2004 klokka 07:56, skreiv Andreas Gruenbacher:

> There are local and remote locks, and both of them need a pid discriminator.
> We have used the files_struct pointer so far which was either a struct
> files_struct pointer or a struct nlm_host pointer. By using pointers we had a
> host+pid "uniquifier". Now we could change the fields from:
>
> struct file_lock {
> [...]
> fl_owner_t fl_owner;
> unsigned int fl_pid;
> [...]
> };
>
> to:
>
> struct file_lock {
> [...]
> struct nlm_host *fl_host; /* NULL for local locks */
> unsigned int fl_pid;
> [...]
> };
>
> There would be no casting of other types into a fl_host here.

This was what the second patch I sent in did. However there is a flaw:
see Chris's Oops which shows that CLONE_FILES can still screw us...

Seeing that Oops, I'm starting to believe that the only way we can
achieve both goals of making the locking rules robust *and*
self-consistent is to scrap the fl_pid field...

If we do that, we end up with a situation in which an fcntl(F_GETLK)
call can return bogus values in the "pid" field, if someone calls
clone(CLONE_FILES) (without CLONE_THREAD). Also if someone calls
clone(CLONE_THREAD) (omitting CLONE_FILES) then fcntl(F_UNLCK) will fail
to clear all locks with the same pid. Finally we will still be saddled
with steal_locks()...

HOWEVER

At least NFS, CIFS,... will be able to set up ->lock() methods that are
consistent with what the VFS is doing above them, so that close() will
indeed free the same locks on the remote system as on the local system.
Currently that is not the case...
Note that doing this will require some work. In particular the NFS Lock
Manager protocol (used by NFSv2/v3 but not NFSv4) assumes that a 32-bit
"pid" field labels the lockowner. We would somehow have to convert the
lock->fl_owner into such a field - trivial on 32-bit systems, but less
so on systems with 64-bit pointers.
On the NLM/lockd server side, we need to stuff those same 32-bit "pid'
fields into the fl_lock fields in such a way that the "pid" values from
different clients do not conflict.


Cheers,
Trond