Hi Andrew.
Currently, reading or writing file->f_pos is not atomic on 32-bit environments,
so two or more simultaneous accesses can corrupt the file->f_pos value.
There have been some past discussions about this issue, but it has not been fixed yet.
http://marc.info/?l=linux-kernel&m=120764199819899&w=2
http://marc.info/?l=linux-kernel&m=114490379102476&w=2
Protecting file->f_pos with a lock adds some overhead, but I think we should
fix this.
I used a seqlock to serialize access to file->f_pos. This approach is similar
to the inode->i_size synchronization (i_size_read()/i_size_write()).
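For reference, the existing i_size_read() helper follows roughly this pattern
(abridged; the in-tree version also has a CONFIG_PREEMPT branch and uses a
seqcount, with the write side serialized by other locking):

static inline loff_t i_size_read(const struct inode *inode)
{
#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
	loff_t i_size;
	unsigned int seq;

	do {
		seq = read_seqcount_begin(&inode->i_size_seqcount);
		i_size = inode->i_size;
	} while (read_seqcount_retry(&inode->i_size_seqcount, seq));
	return i_size;
#else
	return inode->i_size;
#endif
}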
Thanks.
Signed-off-by: Hisashi Hifumi <[email protected]>
diff -Nrup linux-2.6.27-rc9.org/fs/block_dev.c linux-2.6.27-rc9/fs/block_dev.c
--- linux-2.6.27-rc9.org/fs/block_dev.c 2008-10-07 11:44:42.000000000 +0900
+++ linux-2.6.27-rc9/fs/block_dev.c 2008-10-07 11:48:26.000000000 +0900
@@ -225,13 +225,11 @@ static loff_t block_llseek(struct file *
offset += size;
break;
case 1:
- offset += file->f_pos;
+ offset += file_pos_read(file);
}
retval = -EINVAL;
if (offset >= 0 && offset <= size) {
- if (offset != file->f_pos) {
- file->f_pos = offset;
- }
+ file_pos_update(file, offset);
retval = offset;
}
mutex_unlock(&bd_inode->i_mutex);
diff -Nrup linux-2.6.27-rc9.org/fs/file_table.c linux-2.6.27-rc9/fs/file_table.c
--- linux-2.6.27-rc9.org/fs/file_table.c 2008-10-07 11:44:42.000000000 +0900
+++ linux-2.6.27-rc9/fs/file_table.c 2008-10-07 11:48:26.000000000 +0900
@@ -121,6 +121,7 @@ struct file *get_empty_filp(void)
tsk = current;
INIT_LIST_HEAD(&f->f_u.fu_list);
atomic_long_set(&f->f_count, 1);
+ f_pos_ordered_init(f);
rwlock_init(&f->f_owner.lock);
f->f_uid = tsk->fsuid;
f->f_gid = tsk->fsgid;
diff -Nrup linux-2.6.27-rc9.org/fs/locks.c linux-2.6.27-rc9/fs/locks.c
--- linux-2.6.27-rc9.org/fs/locks.c 2008-10-07 11:44:42.000000000 +0900
+++ linux-2.6.27-rc9/fs/locks.c 2008-10-07 11:48:26.000000000 +0900
@@ -317,7 +317,7 @@ static int flock_to_posix_lock(struct fi
start = 0;
break;
case SEEK_CUR:
- start = filp->f_pos;
+ start = file_pos_read(filp);
break;
case SEEK_END:
start = i_size_read(filp->f_path.dentry->d_inode);
@@ -367,7 +367,7 @@ static int flock64_to_posix_lock(struct
start = 0;
break;
case SEEK_CUR:
- start = filp->f_pos;
+ start = file_pos_read(filp);
break;
case SEEK_END:
start = i_size_read(filp->f_path.dentry->d_inode);
diff -Nrup linux-2.6.27-rc9.org/fs/read_write.c linux-2.6.27-rc9/fs/read_write.c
--- linux-2.6.27-rc9.org/fs/read_write.c 2008-10-07 11:44:42.000000000 +0900
+++ linux-2.6.27-rc9/fs/read_write.c 2008-10-07 11:48:26.000000000 +0900
@@ -42,15 +42,13 @@ generic_file_llseek_unlocked(struct file
offset += inode->i_size;
break;
case SEEK_CUR:
- offset += file->f_pos;
+ offset += file_pos_read(file);
}
retval = -EINVAL;
if (offset>=0 && offset<=inode->i_sb->s_maxbytes) {
/* Special lock needed here? */
- if (offset != file->f_pos) {
- file->f_pos = offset;
+ if (file_pos_update(file, offset))
file->f_version = 0;
- }
retval = offset;
}
return retval;
@@ -83,14 +81,12 @@ loff_t default_llseek(struct file *file,
offset += i_size_read(file->f_path.dentry->d_inode);
break;
case SEEK_CUR:
- offset += file->f_pos;
+ offset += file_pos_read(file);
}
retval = -EINVAL;
if (offset >= 0) {
- if (offset != file->f_pos) {
- file->f_pos = offset;
+ if (file_pos_update(file, offset))
file->f_version = 0;
- }
retval = offset;
}
unlock_kernel();
@@ -324,16 +320,6 @@ ssize_t vfs_write(struct file *file, con
EXPORT_SYMBOL(vfs_write);
-static inline loff_t file_pos_read(struct file *file)
-{
- return file->f_pos;
-}
-
-static inline void file_pos_write(struct file *file, loff_t pos)
-{
- file->f_pos = pos;
-}
-
asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
{
struct file *file;
diff -Nrup linux-2.6.27-rc9.org/include/linux/fs.h linux-2.6.27-rc9/include/linux/fs.h
--- linux-2.6.27-rc9.org/include/linux/fs.h 2008-10-07 11:44:44.000000000 +0900
+++ linux-2.6.27-rc9/include/linux/fs.h 2008-10-07 11:48:26.000000000 +0900
@@ -805,6 +805,16 @@ static inline int ra_has_index(struct fi
#define FILE_MNT_WRITE_TAKEN 1
#define FILE_MNT_WRITE_RELEASED 2
+/*
+ * Use sequence lock to get consistent f_pos on 32-bit processors.
+ */
+#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
+#define __NEED_F_POS_ORDERED
+#define f_pos_ordered_init(file) seqlock_init(&file->f_pos_seqlock)
+#else
+#define f_pos_ordered_init(file) do { } while (0)
+#endif
+
struct file {
/*
* fu_list becomes invalid after file_free is called and queued via
@@ -822,6 +832,9 @@ struct file {
unsigned int f_flags;
mode_t f_mode;
loff_t f_pos;
+#ifdef __NEED_F_POS_ORDERED
+ seqlock_t f_pos_seqlock;
+#endif
struct fown_struct f_owner;
unsigned int f_uid, f_gid;
struct file_ra_state f_ra;
@@ -850,6 +863,73 @@ extern spinlock_t files_lock;
#define get_file(x) atomic_long_inc(&(x)->f_count)
#define file_count(x) atomic_long_read(&(x)->f_count)
+/*
+ * file_pos_read/write are made atomic on 32-bit by using a sequence lock.
+ */
+static inline loff_t file_pos_read(struct file *file)
+{
+#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
+ loff_t pos;
+ unsigned int seq;
+
+ do {
+ seq = read_seqbegin(&file->f_pos_seqlock);
+ pos = file->f_pos;
+ } while (read_seqretry(&file->f_pos_seqlock, seq));
+ return pos;
+#elif BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT)
+ loff_t pos;
+
+ preempt_disable();
+ pos = file->f_pos;
+ preempt_enable();
+ return pos;
+#else
+ return file->f_pos;
+#endif
+}
+
+static inline void file_pos_write(struct file *file, loff_t pos)
+{
+#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
+ write_seqlock(&file->f_pos_seqlock);
+ file->f_pos = pos;
+ write_sequnlock(&file->f_pos_seqlock);
+#elif BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT)
+ preempt_disable();
+ file->f_pos = pos;
+ preempt_enable();
+#else
+ file->f_pos = pos;
+#endif
+}
+
+static inline int file_pos_update(struct file *file, loff_t offset)
+{
+ int ret = 0;
+#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
+ write_seqlock(&file->f_pos_seqlock);
+ if (offset != file->f_pos) {
+ file->f_pos = offset;
+ ret = 1;
+ }
+ write_sequnlock(&file->f_pos_seqlock);
+#elif BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT)
+ preempt_disable();
+ if (offset != file->f_pos) {
+ file->f_pos = offset;
+ ret = 1;
+ }
+ preempt_enable();
+#else
+ if (offset != file->f_pos) {
+ file->f_pos = offset;
+ ret = 1;
+ }
+#endif
+ return ret;
+}
+
#ifdef CONFIG_DEBUG_WRITECOUNT
static inline void file_take_write(struct file *f)
{
Hisashi Hifumi <[email protected]> writes:
> Hi Andrew.
>
> Currently reading or writing file->f_pos is not atomic on 32bit environment,
> so two or more simultaneous access can corrupt file->f_pos value.
> There are some past discussions about this issue, but this is not fixed yet.
> http://marc.info/?l=linux-kernel&m=120764199819899&w=2
> http://marc.info/?l=linux-kernel&m=114490379102476&w=2
Have you benchmarked if cmpxchg is cheaper than the seqlock? It's not
clear to me a seqlock is really the right locking primitive for
this. Normally seqlocks should be used when reading is much more
frequent than writing, but it's doubtful that this is actually the
case for f_pos.
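For concreteness, a cmpxchg-based variant might look roughly like the sketch
below; cmpxchg64() stands in for whatever 64-bit compare-and-exchange helper
the architecture provides, and its availability on every 32-bit port is an
assumption, not a given:

static inline loff_t file_pos_read_cmpxchg(struct file *file)
{
	/* cmpxchg with old == new yields an atomic 64-bit read */
	return (loff_t)cmpxchg64((u64 *)&file->f_pos, 0, 0);
}

static inline void file_pos_write_cmpxchg(struct file *file, loff_t pos)
{
	u64 old;

	do {
		old = file->f_pos;	/* a torn read only causes a retry */
	} while (cmpxchg64((u64 *)&file->f_pos, old, (u64)pos) != old);
}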
-Andi
--
[email protected]
At 15:43 08/10/07, Andi Kleen wrote:
>Hisashi Hifumi <[email protected]> writes:
>
>> Hi Andrew.
>>
>> Currently reading or writing file->f_pos is not atomic on 32bit environment,
>> so two or more simultaneous access can corrupt file->f_pos value.
>> There are some past discussions about this issue, but this is not fixed yet.
>> http://marc.info/?l=linux-kernel&m=120764199819899&w=2
>> http://marc.info/?l=linux-kernel&m=114490379102476&w=2
>
>Have you benchmarked if cmpxchg is cheaper than the seqlock? It's not
>clear to me a seqlock is really the right locking primitive for
>this. Normally seqlocks should be used when reading is much more
>frequent than writing, but it's doubtful that this is actually the
>case for f_pos.
Maybe cmpxchg8b is good for i486 or later x86, but i386 or other architectures
that do not have a similar instruction need some locking primitive. I think a lazy
seqlock is one option for making file->f_pos access atomic.
> Maybe cmpxchg8b is good for i486 or later x86, but i386 or other architectures
> that do not have similar instruction needs some locking primitive. I think lazy
We have a cmpxchg emulation on 386. That works because only UP 386s are
supported, so it can be done in software.
> seqlock is one option for making file->f_pos access atomic.
The question is if it's the right option. At least all the common
operations on fds (read/write) are all writers, not readers.
-Andi
--
[email protected]
On Tuesday 07 October 2008 21:29, Andi Kleen wrote:
> > Maybe cmpxchg8b is good for i486 or later x86, but i386 or other
> > architectures that do not have similar instruction needs some locking
> > primitive. I think lazy
>
> We have a cmpxchg emulation on 386. That works because only UP 386s are
> supported, so it can be done in software.
>
> > seqlock is one option for making file->f_pos access atomic.
>
> The question is if it's the right option. At least all the common
> operations on fds (read/write) are all writers, not readers.
Common operations are read, do something, write. So seqlocks then cost
one atomic operation, a couple of memory barriers (all noops on x86),
and some predictable branches etc.
A cmpxchg-based approach would require two lock; cmpxchg8b operations on 32-bit.
Fairly heavy. Also I don't think we have generic accessors to do this, so I
think that is for another project.
Anyway, I think importantly this creates some usable accessors for the
f_pos problem. I think we actually need to touch a _lot_ of code to
cover all f_pos accesses in the kernel, but I guess this gets the ball
rolling.
So.. is everyone agreed that corrupting f_pos is a bad thing? (serious
question) If so, then we should get something like this merged sooner
rather than later.
On Wed, 8 Oct 2008 03:27:44 +1100 Nick Piggin <[email protected]> wrote:
> On Tuesday 07 October 2008 21:29, Andi Kleen wrote:
> > > Maybe cmpxchg8b is good for i486 or later x86, but i386 or other
> > > architectures that do not have similar instruction needs some locking
> > > primitive. I think lazy
> >
> > We have a cmpxchg emulation on 386. That works because only UP 386s are
> > supported, so it can be done in software.
> >
> > > seqlock is one option for making file->f_pos access atomic.
> >
> > The question is if it's the right option. At least all the common
> > operations on fds (read/write) are all writers, not readers.
>
> Common operations are read, do something, write. So seqlocks then cost
> one atomic operation, a couple of memory barriers (all noops on x86),
> and some predictable branches etc.
>
> cmpxchg based would require 2 lock ; cmpxchg8b on 32-bit. Fairly heavy.
> Also I don't think we have generic accessors to do this, so I think
> that is for another project.
>
> Anyway, I think importantly this creates some usable accessors for the
> f_pos problem. I think we actually need to touch a _lot_ of code to
> cover all f_pos accesses in the kernel, but I guess this gets the ball
> rolling.
Aneesh is proposing using seqlocks to make percpu_counter.count
atomic on 32-bit.
This patch uses seqlocks to make file.f_pos atomic on 32-bit.
I think we should come up with a common atomic 64-bit type. We already
partly have that: atomic64_t. But for reasons which I don't recall,
atomic64_t is only implemented on 64-bit architectures at present.
If we generalise atomic64_t to all architectures then we can use it in
both the above applications and surely in other places in the future.
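For illustration only, with a generic atomic64_t the f_pos accessors could
collapse to something like this (the f_pos_atomic field is hypothetical;
struct file has no such member today):

static inline loff_t file_pos_read(struct file *file)
{
	/* atomic64_read() would be atomic even on 32-bit once atomic64_t is generic */
	return (loff_t)atomic64_read(&file->f_pos_atomic);
}

static inline void file_pos_write(struct file *file, loff_t pos)
{
	atomic64_set(&file->f_pos_atomic, pos);
}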
> So.. is everyone agreed that corrupting f_pos is a bad thing? (serious
> question) If so, then we should get something like this merged sooner
> rather than later.
- two threads/processes sharing the same fd
- both appending the same fd
- both hit the small race window right around the time when the file
flips over a multiple of 4G.
It's pretty damn improbable, and I think we can afford to spend the
time to get this right in 2.6.29.
On Wed, Oct 08, 2008 at 03:27:44AM +1100, Nick Piggin wrote:
> So.. is everyone agreed that corrupting f_pos is a bad thing? (serious
> question) If so, then we should get something like this merged sooner
> rather than later.
I'm not convinced it's a bad thing. f_pos is going to get 'corrupted'
(ie not pointing where you think it's going to point) sooner or later
anyway if you have two threads accessing the same file without locking.
The fact that it can point to an offset that was accessed by neither
thread A nor thread B seems irrelevant to me.
Separately, if we do decide something along these lines is a good idea,
then file_pos_update() needs to be better. The current patch has:
if (offset>=0 && offset<=inode->i_sb->s_maxbytes) {
/* Special lock needed here? */
- if (offset != file->f_pos) {
- file->f_pos = offset;
+ if (file_pos_update(file, offset))
file->f_version = 0;
- }
retval = offset;
}
It seems to me that setting f_version to 0 should be atomic with
the change to f_pos. So something like this might work better:
+static inline void file_pos_update(struct file *file, loff_t offset, int v)
+{
+#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
+ write_seqlock(&file->f_pos_seqlock);
+ if (offset != file->f_pos) {
+ file->f_pos = offset;
+ if (v)
+ file->f_version = 0;
+ }
+ write_sequnlock(&file->f_pos_seqlock);
+#elif BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT)
+ preempt_disable();
+ if (offset != file->f_pos) {
+ file->f_pos = offset;
+ if (v)
+ file->f_version = 0;
+ }
+ preempt_enable();
+#else
+ if (offset != file->f_pos) {
+ file->f_pos = offset;
+ if (v)
+ file->f_version = 0;
+ }
+#endif
+}
Since 'v' will be known at compile time, the tests optimise away to nothing.
I also think the patch would be more favourably received if it were
structured as:
+#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
+static inline loff_t file_pos_read(struct file *file)
+{
...
+}
+
+static inline void file_pos_write(struct file *file, loff_t pos)
+{
...
+}
+
...
+#elif BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT)
...
+#else
...
+#endif
--
Matthew Wilcox Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."
On Tue, 2008-10-07 at 10:50 -0700, Andrew Morton wrote:
> On Wed, 8 Oct 2008 03:27:44 +1100 Nick Piggin <[email protected]> wrote:
> > So.. is everyone agreed that corrupting f_pos is a bad thing? (serious
> > question) If so, then we should get something like this merged sooner
> > rather than later.
>
> - two threads/processes sharing the same fd
>
> - both appending the same fd
>
> - both hit the small race window right around the time when the file
> flips over a multiple of 4G.
>
> It's pretty damn improbable, and I think we can afford to spend the
> time to get this right in 2.6.29.
The whole point is that such usage is outside the specification and thus
we don't strictly need to fix this.
So the question Nick is asking is, do we want to slow down the kernel
for a few broken user-space applications. Esp. since the race doesn't
affect anybody else except the broken users of the file descriptor.
IMHO not worth fixing..
On Wednesday 08 October 2008 04:50, Andrew Morton wrote:
> On Wed, 8 Oct 2008 03:27:44 +1100 Nick Piggin <[email protected]>
wrote:
> > On Tuesday 07 October 2008 21:29, Andi Kleen wrote:
> > > > Maybe cmpxchg8b is good for i486 or later x86, but i386 or other
> > > > architectures that do not have similar instruction needs some locking
> > > > primitive. I think lazy
> > >
> > > We have a cmpxchg emulation on 386. That works because only UP 386s are
> > > supported, so it can be done in software.
> > >
> > > > seqlock is one option for making file->f_pos access atomic.
> > >
> > > The question is if it's the right option. At least all the common
> > > operations on fds (read/write) are all writers, not readers.
> >
> > Common operations are read, do something, write. So seqlocks then cost
> > one atomic operation, a couple of memory barriers (all noops on x86),
> > and some predictable branches etc.
> >
> > cmpxchg based would require 2 lock ; cmpxchg8b on 32-bit. Fairly heavy.
> > Also I don't think we have generic accessors to do this, so I think
> > that is for another project.
> >
> > Anyway, I think importantly this creates some usable accessors for the
> > f_pos problem. I think we actually need to touch a _lot_ of code to
> > cover all f_pos accesses in the kernel, but I guess this gets the ball
> > rolling.
>
> Aneesh is proposing using using seqlocks to make percpu_counter.count
> atomic on 32-bit.
>
> This patch uses seqlocks to make file.f_pos atomic on 32-bit.
>
> I think we should come up with a common atomic 64-bit type. We already
> partly have that: atomic64_t. But for reasons which I don't recall,
> atomic64_t is 64-bit-only at present.
>
> If we generalise atomic64_t to all architectures then we can use it in
> both the above applications and surely in other places in the future.
seqlocks can't really make it a general 64-bit atomic type. Well, they
_could_, but then your actual type is much bigger than 64 bits, or you
map to a hash of seqlocks or something awful...
Anyway, my main point is that the bulk of the work will be in the changes
all over the kernel to use the accessors. How it works behind that is
obviously trivially changed by comparison (it could start off without
changing any code at all and just wrap the racy accessors).
> > So.. is everyone agreed that corrupting f_pos is a bad thing? (serious
> > question) If so, then we should get something like this merged sooner
> > rather than later.
>
> - two threads/processes sharing the same fd
>
> - both appending the same fd
>
> - both hit the small race window right around the time when the file
> flips over a multiple of 4G.
>
> It's pretty damn improbable, and I think we can afford to spend the
> time to get this right in 2.6.29.
My question is: what is "right"? Do we actually care about this and
intend to fix it? Because there have been people in the past who have
said no...
On Wednesday 08 October 2008 05:59, Peter Zijlstra wrote:
> On Tue, 2008-10-07 at 10:50 -0700, Andrew Morton wrote:
> > It's pretty damn improbable, and I think we can afford to spend the
> > time to get this right in 2.6.29.
>
> The whole point is that such usage is outside the specification and thus
> we don't strictly need to fix this.
>
> So the question Nick is asking is, do we want to slow down the kernel
> for a few broken user-space applications. Esp. since the race doesn't
> affect anybody else except the broken users of the file descriptor.
Right you are. That's the fundamental question. The actual details of
the fix and how likely the race is don't really matter until we
answer the first question (except to say that the "fix" is never going
to be free).
We've lasted this long with the current semantics. So the natural
reaction to anything that strengthens the semantics now is "why?". If
we do that then we can basically never return to the weaker semantics.
So there had better be a really good reason.
On Wed, Oct 08, 2008 at 01:35:44PM +1100, Nick Piggin wrote:
> Right you are. That's the fundamental question. The actual details of
> the fix and how likely the race is don't really matter until we
> answer the first question (except to say that the "fix" is never going
> to be free).
>
> We've lasted this long with the current semantics. So the natural
> reaction to anything that strengthens the semantics now is "why?". If
> we do that then we can basically never return to the weaker semantics.
> So there had better be a really good reason.
And it's worth saying that letter-of-the-standard arguments aren't
necessarily enough. Linux does not honour the POSIX guarantee that
writes are atomic (if they cross page boundaries, it's not certain).
This seems like even more of a corner case to me.
--
Matthew Wilcox Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."
At 11:35 08/10/08, Nick Piggin wrote:
>On Wednesday 08 October 2008 05:59, Peter Zijlstra wrote:
>>
>> The whole point is that such usage is outside the specification and thus
>> we don't strictly need to fix this.
>>
>> So the question Nick is asking is, do we want to slow down the kernel
>> for a few broken user-space applications. Esp. since the race doesn't
>> affect anybody else except the broken users of the file descriptor.
>
>Right you are. That's the fundamental question. The actual details of
>the fix and how likely the race is don't really matter until we
>answer the first question (except to say that the "fix" is never going
>to be free).
Simultaneous access by two or more writers can corrupt file content,
so this case needs some locking (flock or fcntl) to preserve synchronization
of the file content. This is the responsibility of the user-space application.
But the file->f_pos race can occur even if multiple threads just read
simultaneously. I think this is not the responsibility of the user-space application.
To avoid it currently, an application needs some locking to protect the file offset
even if it just reads a file. So I think the f_pos race should be fixed.
On Wednesday 08 October 2008 15:48, Hisashi Hifumi wrote:
> At 11:35 08/10/08, Nick Piggin wrote:
> >On Wednesday 08 October 2008 05:59, Peter Zijlstra wrote:
> >> The whole point is that such usage is outside the specification and thus
> >> we don't strictly need to fix this.
> >>
> >> So the question Nick is asking is, do we want to slow down the kernel
> >> for a few broken user-space applications. Esp. since the race doesn't
> >> affect anybody else except the broken users of the file descriptor.
> >
> >Right you are. That's the fundamental question. The actual details of
> >the fix and how likely the race is don't really matter until we
> >answer the first question (except to say that the "fix" is never going
> >to be free).
>
> Simultaneous access by two or more writer can corrupt file content,
> so this case needs some locks(flock or fcntl) to preserve synchronization
> of file content. This is responsibility of user-space application.
Right.
> But file->f_pos race issue can occur even if multiple threads just read
> simultaneously. I think this is not responsibility of user-space
> application. To avoid this currently, an application needs some locks to
> protect file offset even if it just read a file. So I think f_pos race
> should be fixed.
What would they possibly hope to be reading, though? IOW, a read(2) still
*writes* to the f_pos, which userspace is very much aware of, and in exactly
the same way as write(2), so userspace should have the same obligation
to protect it in both cases, I think. If you say they must protect file
content for writes, then it is valid to say that they must protect fd data
as well (i.e. the file offset).
On Wed, Oct 08, 2008 at 01:48:10PM +0900, Hisashi Hifumi wrote:
> Simultaneous access by two or more writer can corrupt file content,
> so this case needs some locks(flock or fcntl) to preserve synchronization
> of file content. This is responsibility of user-space application.
> But file->f_pos race issue can occur even if multiple threads just read
> simultaneously. I think this is not responsibility of user-space application.
> To avoid this currently, an application needs some locks to protect file offset
> even if it just read a file. So I think f_pos race should be fixed.
Why is this application not using pread() / pwrite() to use thread-local
file pointers?
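A userspace sketch of what that looks like: pread() takes an explicit offset,
so the shared file->f_pos is never read or written and there is nothing left
to race on.

#include <unistd.h>

static ssize_t read_at(int fd, void *buf, size_t len, off_t offset)
{
	return pread(fd, buf, len, offset);	/* f_pos is left untouched */
}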
--
Matthew Wilcox Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."
On Wed, 08 Oct 2008 13:48:10 +0900 Hisashi Hifumi <[email protected]> wrote:
>
> At 11:35 08/10/08, Nick Piggin wrote:
> >On Wednesday 08 October 2008 05:59, Peter Zijlstra wrote:
> >>
> >> The whole point is that such usage is outside the specification and thus
> >> we don't strictly need to fix this.
> >>
> >> So the question Nick is asking is, do we want to slow down the kernel
> >> for a few broken user-space applications. Esp. since the race doesn't
> >> affect anybody else except the broken users of the file descriptor.
> >
> >Right you are. That's the fundamental question. The actual details of
> >the fix and how likely the race is don't really matter until we
> >answer the first question (except to say that the "fix" is never going
> >to be free).
>
> Simultaneous access by two or more writer can corrupt file content,
> so this case needs some locks(flock or fcntl) to preserve synchronization
> of file content. This is responsibility of user-space application.
> But file->f_pos race issue can occur even if multiple threads just read
> simultaneously.
But how could that userspace code possibly be reliable even if f_pos
updates were atomic? The two threads/processes will still read
either the same data, or one will get "A" and the other will get "B" or
one will get "B" and the other will get "A".
And when that application gets fixed, so too does this f_pos problem?
> I think this is not responsibility of user-space application.
> To avoid this currently, an application needs some locks to protect file offset
> even if it just read a file. So I think f_pos race should be fixed.
On Wed, 2008-10-08 at 13:48 +0900, Hisashi Hifumi wrote:
> Simultaneous access by two or more writer can corrupt file content,
> so this case needs some locks(flock or fcntl) to preserve synchronization
> of file content. This is responsibility of user-space application.
> But file->f_pos race issue can occur even if multiple threads just read
> simultaneously. I think this is not responsibility of user-space application.
> To avoid this currently, an application needs some locks to protect file offset
> even if it just read a file. So I think f_pos race should be fixed.
Just to add to all those who already said you're wrong :-)
You're wrong: if two threads would like to read the same file, they
either dup() the fd or open() the file twice. There is absolutely no
valid reason to have two threads read from the same fd without
synchronising their access to it - never.
Peter Zijlstra wrote:
> On Wed, 2008-10-08 at 13:48 +0900, Hisashi Hifumi wrote:
>
>> Simultaneous access by two or more writer can corrupt file content,
>> so this case needs some locks(flock or fcntl) to preserve synchronization
>> of file content. This is responsibility of user-space application.
>> But file->f_pos race issue can occur even if multiple threads just read
>> simultaneously. I think this is not responsibility of user-space application.
>> To avoid this currently, an application needs some locks to protect file offset
>> even if it just read a file. So I think f_pos race should be fixed.
>
> Just to add to all those who already said you're wrong :-)
>
> You're wrong, if two threads would like to read the same file they
> either dup() the fd or open() the file twice. There is absolutely no
> valid reason to have two threads read from the same fd without
> synchronising their access to it - never.
>
About the dup() syscall, it won't help, since the old and new descriptors point to
the same "struct file", definitely sharing the file position, since the first Unixes.
To quote the fine manual:
After successful return of dup or dup2, the old and new descriptors may
be used interchangeably. They share locks, file position pointers and
flags; for example, if the file position is modified by using lseek on
one of the descriptors, the position is also changed for the other.
pread()/pwrite() are used by multi-threaded applications that want to share
a single "struct file". Or they must use some form of synchronization around
regular read()/write()/lseek() calls.
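A sketch of that second option, with the application serialising its own
lseek()+read() pairs on the shared descriptor:

#include <pthread.h>
#include <unistd.h>

static pthread_mutex_t fd_lock = PTHREAD_MUTEX_INITIALIZER;

static ssize_t locked_read_at(int fd, void *buf, size_t len, off_t offset)
{
	ssize_t n = -1;

	pthread_mutex_lock(&fd_lock);
	if (lseek(fd, offset, SEEK_SET) != (off_t)-1)
		n = read(fd, buf, len);
	pthread_mutex_unlock(&fd_lock);
	return n;
}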
There is no generic f_pos race, only buggy applications.
A far more interesting problem is the "tail -f logfile" problem that was raised
recently on lkml, when the file is NFS mounted, where the reader can get nul bytes...
(Subject : blocks of zeros (NULLs) in NFS files in kernels >= 2.6.20 )
On Wednesday 08 October 2008 19:32, Eric Dumazet wrote:
> A far more interesting problem is the "tail -f logfile" problem that raised
> recently in lkml, when file is NFS mounted, where reader can get nul
> bytes... (Subject : blocks of zeros (NULLs) in NFS files in kernels >=
> 2.6.20 )
Not so interesting outside NFS, AFAIKS? It's just a matter of what
semantics NFS chooses to implement... I guess stronger semantics
will tend to be more costly, all else being equal, probably the
reason why they do it that way.
That behaviour does seem a bit surprising, though.
On Wed, 2008-10-08 at 10:32 +0200, Eric Dumazet wrote:
> About dup() syscall, it wont help, since old and new descriptor points to
> the same "struct file", definitly sharing file position, since first Unixes.
>
> To quote the fine manual :
>
> After successful return of dup or dup2, the old and new descriptors may
> be used interchangeably. They share locks, file position pointers and
> flags; for example, if the file position is modified by using lseek on
> one of the descriptors, the position is also changed for the other.
Ah, ok. I'll try to remember for next time I write a multi-threaded user
app (which given the size of my kernel todo list won't be any time soon
I guess ;-)
> pread()/pwrite() are used my multi-threaded applications that want to share
> a single "struct file".
Yeah, pread/pwrite is good.
> Or they must use some form of synchronization around
> regular read()/write()/lseek() calls.
>
> There is no generic f_pos race, only buggy applications.
Agreed.
On Tue 2008-10-07 20:59:23, Peter Zijlstra wrote:
> On Tue, 2008-10-07 at 10:50 -0700, Andrew Morton wrote:
> > On Wed, 8 Oct 2008 03:27:44 +1100 Nick Piggin <[email protected]> wrote:
>
> > > So.. is everyone agreed that corrupting f_pos is a bad thing? (serious
> > > question) If so, then we should get something like this merged sooner
> > > rather than later.
> >
> > - two threads/processes sharing the same fd
> >
> > - both appending the same fd
> >
> > - both hit the small race window right around the time when the file
> > flips over a multiple of 4G.
> >
> > It's pretty damn improbable, and I think we can afford to spend the
> > time to get this right in 2.6.29.
>
> The whole point is that such usage is outside the specification and thus
> we don't strictly need to fix this.
>
> So the question Nick is asking is, do we want to slow down the kernel
> for a few broken user-space applications. Esp. since the race doesn't
> affect anybody else except the broken users of the file descriptor.
Why is it outside spec?
> IMHO not worth fixing..
I believe we even have an append-only flag, similar to immutable, which
is used for security...?
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
On Tue 2008-10-07 20:52:09, Matthew Wilcox wrote:
> On Wed, Oct 08, 2008 at 01:35:44PM +1100, Nick Piggin wrote:
> > Right you are. That's the fundamental question. The actual details of
> > the fix and how likely the race is don't really matter until we
> > answer the first question (except to say that the "fix" is never going
> > to be free).
> >
> > We've lasted this long with the current semantics. So the natural
> > reaction to anything that strengthens the semantics now is "why?". If
> > we do that then we can basically never return to the weaker semantics.
> > So there had better be a really good reason.
>
> And it's worth saying that letter-of-the-standard arguments aren't
> necessarily enough. Linux does not honour the POSIX guarantee that
> writes are atomic (if they cross page boundaries, it's not certain).
> This seems like even more of a corner case to me.
We have append-only files, and normal users should not be able to work
around that restriction.
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
On Thu, 09 Oct 2008 14:23:19 +0200, Pavel Machek said:
> We have append-only files, and normal users should not be able to work
> around that restriction.
Can you give an example of two *plausible* starting offsets, and 2 write
lengths, which when subjected to this race manage to override the append-only
attribute? I believe somebody already checked the code, and for append-only
we force the offset in the kernel for *each* write.
On Thu, Oct 09, 2008 at 02:23:19PM +0200, Pavel Machek wrote:
> On Tue 2008-10-07 20:52:09, Matthew Wilcox wrote:
> > And it's worth saying that letter-of-the-standard arguments aren't
> > necessarily enough. Linux does not honour the POSIX guarantee that
> > writes are atomic (if they cross page boundaries, it's not certain).
> > This seems like even more of a corner case to me.
>
> We have append-only files, and normal users should not be able to work
> around that restriction.
Is it possible to work around this restriction by exploiting this?
IS_APPEND() forces the user to have O_APPEND in their flags.
O_APPEND is only checked in generic_write_checks() where it sets '*pos'
to i_size.
For the majority of filesystems, generic_write_checks() is called from
__generic_file_aio_write_nolock. __generic_file_aio_write_nolock is
only called from generic_file_aio_write_nolock (which passes the address
of a kiocb->ki_pos) and generic_file_aio_write (same).
The filesystems that call generic_write_checks() directly are:
XFS (xfs_write): Passes the address of a local variable
OCFS2 (ocfs2_file_aio_write): Passes the address of a ki_pos
CIFS (cifs_user_write): Not sure.
NFS (nfs_file_direct_write): "Note that O_APPEND is not supported".
NTFS (ntfs_file_aio_write_nolock): Address of a local variable
FUSE (fuse_file_aio_write): Address of a local variable
FUSE (fuse_direct_write): Not sure.
So the only two that might be affected are CIFS and FUSE (O_DIRECT?!) as
far as I can tell. I'm having a hard time believing this is a security
problem.
--
Matthew Wilcox Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."
On Thu, 9 Oct 2008, Matthew Wilcox wrote:
> On Thu, Oct 09, 2008 at 02:23:19PM +0200, Pavel Machek wrote:
> > On Tue 2008-10-07 20:52:09, Matthew Wilcox wrote:
> > > And it's worth saying that letter-of-the-standard arguments aren't
> > > necessarily enough. Linux does not honour the POSIX guarantee that
> > > writes are atomic (if they cross page boundaries, it's not certain).
> > > This seems like even more of a corner case to me.
> >
> > We have append-only files, and normal users should not be able to work
> > around that restriction.
>
> Is it possible to work around this restriction by exploiting this?
>
> IS_APPEND() forces the user to have O_APPEND in their flags.
> O_APPEND is only checked in generic_write_checks() where it sets '*pos'
> to i_size.
>
> For the majority of filesystems, generic_write_checks() is called from
> __generic_file_aio_write_nolock. __generic_file_aio_write_nolock is
> only called from generic_file_aio_write_nolock (which passes the address
> of a kiocb->ki_pos) and generic_file_aio_write (same).
>
> The filesystems that call generic_write_checks() directly are:
> XFS (xfs_write): Passes the address of a local variable
> OCFS2 (ocfs2_file_aio_write): Passes the address of a ki_pos
> CIFS (cifs_user_write): Not sure.
> NFS (nfs_file_direct_write): "Note that O_APPEND is not supported".
> NTFS (ntfs_file_aio_write_nolock): Address of a local variable
> FUSE (fuse_file_aio_write): Address of a local variable
> FUSE (fuse_direct_write): Not sure.
>
> So the only two that might be affected are CIFS and FUSE (O_DIRECT?!) as
> far as I can tell. I'm having a hard time believing this is a security
> problem.
And even in those cases it's actually a local variable, since
sys_write() does:
loff_t pos = file_pos_read(file);
ret = vfs_write(file, buf, count, &pos);
file_pos_write(file, pos);
So there's no way to corrupt the starting position for an append mode
write, as that always comes from the file size.
Two append writes to the same file could corrupt f_pos, but that would
only matter for subsequent reads or non-append mode writes.
Miklos
On Thu, 9 Oct 2008, Matthew Wilcox wrote:
> On Thu, Oct 09, 2008 at 02:23:19PM +0200, Pavel Machek wrote:
> >
> > We have append-only files, and normal users should not be able to work
> > around that restriction.
>
> Is it possible to work around this restriction by exploiting this?
No, I don't think it is.
Because we had various nasty races, and various broken filesystems using
"f->f_pos" directly (and then pread/pwrite not working), we fixed things
many years ago, and nobody should use "f_pos" directly any more for any
regular file access.
Oh, you'll see a _lot_ of f_pos accesses if you grep for them in low-level
filesystems, but they should be for directory accesses, that are all under
i_mutex. And O_APPEND obviously doesn't enter into it anyway.
So for regular IO, all the filesystems should never touch f_pos directly
at all, they only ever touch a local "pos" that gets cached, and then at
the end of the write sys_write() will write it back with file_pos_write().
That function was done exactly so that we _could_ do locking if we cared.
Nobody ever did.
So even though filesystems get passed a _pointer_ to the position, it's
all actually a pointer to just a private per-thread, on-stack entry.
The reason for that is that we really used to have bugs where the
low-level filesystem assumed that "*pos" didn't change from under it while
the access was going on (reading it multiple times and comparing against
i_size etc), and exactly due to things like O_APPEND races against lseek.
So I think f_pos is fine. Yes, yes, if two threads or processes access the
same file pointer concurrently, that means that f_pos at the end may be
crazy, but that really is true regardless of whether you are able to hit
the *very* small race of updating the 32-bit lower/upper fields in some
mixed manner. No sane user program can possibly really care, since it
would already be getting random offsets.
(Yeah, yeah, I could see some really crazy code that can do retries with
optimistic locking in user space and could possibly see this as a bug, but
that really is totally insane code, and I doubt you could write such a
crazy thing to actually work).
Linus
Pavel Machek wrote:
> We have append-only files, and normal users should not be able to work
> around that restriction.
Well, apparently POSIX breaks this, and we're supposed to let pwrite()
write anywhere in an O_APPEND file. So any security-related argument
goes out the window.
J
On Wed, 08 Oct 2008 08:51:51 +0200, Peter Zijlstra <[email protected]> wrote:
> either dup() the fd or open() the file twice. There is absolutely no
> valid reason to have two threads read from the same fd without
> synchronising their access to it - never.
In case this is the final consensus, I think that a topic that is brought
to the list every few months and even generates (apparently unnecessary)
patches is a hint that there should be a comment (*) somewhere like
this:
(*) I don't know if what I wrote is 100% correct.
Signed-off-by: Diego Calleja García <[email protected]>
Index: 2.6/include/linux/fs.h
===================================================================
--- 2.6.orig/include/linux/fs.h 2008-10-09 00:06:50.000000000 +0200
+++ 2.6/include/linux/fs.h 2008-10-09 00:29:03.000000000 +0200
@@ -821,6 +821,18 @@
atomic_long_t f_count;
unsigned int f_flags;
mode_t f_mode;
+ /*
+	 * Linux does NOT guarantee atomic reads/writes of file->f_pos for
+	 * multithreaded apps running on 32-bit machines. There are several
+ * reasons for this behaviour:
+ * - Specifications don't say it must be implemented that way.
+ * - This behaviour is part of the Linux semantics.
+ * - Any application that does multithreaded access to file->f_pos
+ * should be doing its own locking: the processes should synchronize
+ * themselves when accessing a file descriptor. If an application
+ * doesn't do that, its file descriptor handling is buggy anyway and
+ * must be fixed to access file->f_pos properly.
+ */
loff_t f_pos;
struct fown_struct f_owner;
unsigned int f_uid, f_gid;
On Friday 10 October 2008 08:51, dcg wrote:
> El Wed, 08 Oct 2008 08:51:51 +0200, Peter Zijlstra <[email protected]>
escribió:
> > either dup() the fd or open() the file twice. There is absolutely no
> > valid reason to have two threads read from the same fd without
> > synchronising their access to it - never.
>
> In case this is the final consensus, I think that a topic that is brought
> to the list every few months and even generates (aparently not neccesary)
> patches is a hint that there should be somewhere a commentary (*) like
> this:
>
> (*) I don't know if what I wrote is 100% correct.
>
>
> Signed-off-by: Diego Calleja García <[email protected]>
>
> Index: 2.6/include/linux/fs.h
> ===================================================================
> --- 2.6.orig/include/linux/fs.h 2008-10-09 00:06:50.000000000 +0200
> +++ 2.6/include/linux/fs.h 2008-10-09 00:29:03.000000000 +0200
> @@ -821,6 +821,18 @@
> atomic_long_t f_count;
> unsigned int f_flags;
> mode_t f_mode;
> + /*
> + * Linux does NOT guarantee atomic reading/writing to file->f_pos in
> + * multithread apps running in 32 bit machines. There're several
> + * reasons for this behaviour:
Note that I don't think we'd want to explicitly guarantee that it is atomic
on 64-bit machines either. It does happen to be, but I don't think we want
anybody to rely on that...