2012-11-12 10:20:32

by Cyrill Gorcunov

[permalink] [raw]
Subject: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

This file handle will be used in /proc/pid/fdinfo/fd
output, which in turn will allow to restore a watch
target after checkpoint (thus it's provided for
CONFIG_CHECKPOINT_RESTORE only).

Signed-off-by: Cyrill Gorcunov <[email protected]>
CC: Pavel Emelyanov <[email protected]>
CC: Al Viro <[email protected]>
CC: Alexey Dobriyan <[email protected]>
CC: Andrew Morton <[email protected]>
CC: James Bottomley <[email protected]>
CC: "Aneesh Kumar K.V" <[email protected]>
CC: Alexey Dobriyan <[email protected]>
CC: Matthew Helsley <[email protected]>
CC: "J. Bruce Fields" <[email protected]>
CC: "Aneesh Kumar K.V" <[email protected]>
---
fs/notify/inotify/inotify.h | 8 +++++++
fs/notify/inotify/inotify_user.c | 42 ++++++++++++++++++++++++++++++++++-----
2 files changed, 45 insertions(+), 5 deletions(-)

Index: linux-2.6.git/fs/notify/inotify/inotify.h
===================================================================
--- linux-2.6.git.orig/fs/notify/inotify/inotify.h
+++ linux-2.6.git/fs/notify/inotify/inotify.h
@@ -1,6 +1,7 @@
#include <linux/fsnotify_backend.h>
#include <linux/inotify.h>
#include <linux/slab.h> /* struct kmem_cache */
+#include <linux/exportfs.h>

extern struct kmem_cache *event_priv_cachep;

@@ -9,9 +10,16 @@ struct inotify_event_private_data {
int wd;
};

+#if defined(CONFIG_PROC_FS) && defined(CONFIG_EXPORTFS) && defined(CONFIG_CHECKPOINT_RESTORE)
+# define INOTIFY_USE_FHANDLE
+#endif
+
struct inotify_inode_mark {
struct fsnotify_mark fsn_mark;
int wd;
+#ifdef INOTIFY_USE_FHANDLE
+ __u8 fhandle[sizeof(struct file_handle) + MAX_HANDLE_SZ];
+#endif
};

extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
Index: linux-2.6.git/fs/notify/inotify/inotify_user.c
===================================================================
--- linux-2.6.git.orig/fs/notify/inotify/inotify_user.c
+++ linux-2.6.git/fs/notify/inotify/inotify_user.c
@@ -566,6 +566,33 @@ static void inotify_free_mark(struct fsn
kmem_cache_free(inotify_inode_mark_cachep, i_mark);
}

+#ifdef INOTIFY_USE_FHANDLE
+static int inotify_encode_wd_fhandle(struct inotify_inode_mark *mark, struct dentry *dentry)
+{
+ struct file_handle *fhandle = (struct file_handle *)mark->fhandle;
+ int size, ret;
+
+ BUILD_BUG_ON(sizeof(mark->fhandle) <= sizeof(struct file_handle));
+
+ fhandle->handle_bytes = sizeof(mark->fhandle) - sizeof(struct file_handle);
+ size = fhandle->handle_bytes >> 2;
+
+ ret = exportfs_encode_fh(dentry, (struct fid *)fhandle->f_handle, &size, 0);
+ if ((ret == 255) || (ret == -ENOSPC))
+ return -EOVERFLOW;
+
+ fhandle->handle_type = ret;
+ fhandle->handle_bytes = size * sizeof(u32);
+
+ return 0;
+}
+# else
+static int inotify_encode_wd_fhandle(struct inotify_inode_mark *mark, struct dentry *dentry)
+{
+ return 0;
+}
+#endif
+
static int inotify_update_existing_watch(struct fsnotify_group *group,
struct inode *inode,
u32 arg)
@@ -621,10 +648,11 @@ static int inotify_update_existing_watch
}

static int inotify_new_watch(struct fsnotify_group *group,
- struct inode *inode,
+ struct dentry *dentry,
u32 arg)
{
struct inotify_inode_mark *tmp_i_mark;
+ struct inode *inode = dentry->d_inode;
__u32 mask;
int ret;
struct idr *idr = &group->inotify_data.idr;
@@ -647,6 +675,10 @@ static int inotify_new_watch(struct fsno
if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
goto out_err;

+ ret = inotify_encode_wd_fhandle(tmp_i_mark, dentry);
+ if (ret)
+ goto out_err;
+
ret = inotify_add_to_idr(idr, idr_lock, &group->inotify_data.last_wd,
tmp_i_mark);
if (ret)
@@ -673,16 +705,16 @@ out_err:
return ret;
}

-static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg)
+static int inotify_update_watch(struct fsnotify_group *group, struct dentry *dentry, u32 arg)
{
int ret = 0;

retry:
/* try to update and existing watch with the new arg */
- ret = inotify_update_existing_watch(group, inode, arg);
+ ret = inotify_update_existing_watch(group, dentry->d_inode, arg);
/* no mark present, try to add a new one */
if (ret == -ENOENT)
- ret = inotify_new_watch(group, inode, arg);
+ ret = inotify_new_watch(group, dentry, arg);
/*
* inotify_new_watch could race with another thread which did an
* inotify_new_watch between the update_existing and the add watch
@@ -785,7 +817,7 @@ SYSCALL_DEFINE3(inotify_add_watch, int,
group = f.file->private_data;

/* create/update an inode mark */
- ret = inotify_update_watch(group, inode, mask);
+ ret = inotify_update_watch(group, path.dentry, mask);
path_put(&path);
fput_and_out:
fdput(f);


2012-11-13 00:55:43

by Andrew Morton

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

On Mon, 12 Nov 2012 14:14:43 +0400
Cyrill Gorcunov <[email protected]> wrote:

> This file handle will be used in /proc/pid/fdinfo/fd
> output, which in turn will allow to restore a watch
> target after checkpoint (thus it's provided for
> CONFIG_CHECKPOINT_RESTORE only).

This changelog isn't very good.

What appears to be happening is that you're borrowing exportfs's
ability to encode file handles and this is being reused to transport
inotify fd's to userspace for c/r? Or something else - I didn't try
very hard. Please explain better?

> --- linux-2.6.git.orig/fs/notify/inotify/inotify.h
> +++ linux-2.6.git/fs/notify/inotify/inotify.h
> @@ -1,6 +1,7 @@
> #include <linux/fsnotify_backend.h>
> #include <linux/inotify.h>
> #include <linux/slab.h> /* struct kmem_cache */
> +#include <linux/exportfs.h>
>
> extern struct kmem_cache *event_priv_cachep;
>
> @@ -9,9 +10,16 @@ struct inotify_event_private_data {
> int wd;
> };
>
> +#if defined(CONFIG_PROC_FS) && defined(CONFIG_EXPORTFS) && defined(CONFIG_CHECKPOINT_RESTORE)
> +# define INOTIFY_USE_FHANDLE
> +#endif

Does this mean that c/r will fail to work properly if
CONFIG_EXPORTFS=n? If so, that should be expressed in Kconfig
dependencies?

> struct inotify_inode_mark {
> struct fsnotify_mark fsn_mark;
> int wd;
> +#ifdef INOTIFY_USE_FHANDLE
> + __u8 fhandle[sizeof(struct file_handle) + MAX_HANDLE_SZ];
> +#endif
> };

Whoa. This adds 128+8 bytes to the inotify_inode_mark. That's a lot of
bloat, and there can be a lot of inotify_inode_mark's in the system?

Also, what about alignment? That embedded `struct file_handle' will
have a 4-byte alignment and if there's anything in there which is an
8-byte quantity then some architectures (ia64?) might get upset about
the kernel-mode unaligned access. It appears that this won't happen in
the present code, but are we future-proof?

Why did you use a __u8, anyway? Could have done something like

struct {
struct file_handle fhandle;
u8 pad[MAX_HANDLE_SZ];
};

and got some additional type safety and less typecasting?

> extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
> Index: linux-2.6.git/fs/notify/inotify/inotify_user.c
> ===================================================================
> --- linux-2.6.git.orig/fs/notify/inotify/inotify_user.c
> +++ linux-2.6.git/fs/notify/inotify/inotify_user.c
> @@ -566,6 +566,33 @@ static void inotify_free_mark(struct fsn
> kmem_cache_free(inotify_inode_mark_cachep, i_mark);
> }
>
> +#ifdef INOTIFY_USE_FHANDLE
> +static int inotify_encode_wd_fhandle(struct inotify_inode_mark *mark, struct dentry *dentry)
> +{
> + struct file_handle *fhandle = (struct file_handle *)mark->fhandle;
> + int size, ret;
> +
> + BUILD_BUG_ON(sizeof(mark->fhandle) <= sizeof(struct file_handle));
> +
> + fhandle->handle_bytes = sizeof(mark->fhandle) - sizeof(struct file_handle);
> + size = fhandle->handle_bytes >> 2;
> +
> + ret = exportfs_encode_fh(dentry, (struct fid *)fhandle->f_handle, &size, 0);
> + if ((ret == 255) || (ret == -ENOSPC))

Sigh. ENOSPC means "your disk is full". If this error ever gets back
to userspace, our poor user will go and provision more disks and then
wonder why that didn't fix anything.

> + return -EOVERFLOW;

That doesn't seem very appropriate either, unsure.

> + fhandle->handle_type = ret;
> + fhandle->handle_bytes = size * sizeof(u32);
> +
> + return 0;
> +}
>
> ...
>

2012-11-13 07:21:05

by Cyrill Gorcunov

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

On Mon, Nov 12, 2012 at 04:55:40PM -0800, Andrew Morton wrote:
> On Mon, 12 Nov 2012 14:14:43 +0400
> Cyrill Gorcunov <[email protected]> wrote:
>
> > This file handle will be used in /proc/pid/fdinfo/fd
> > output, which in turn will allow to restore a watch
> > target after checkpoint (thus it's provided for
> > CONFIG_CHECKPOINT_RESTORE only).
>
> This changelog isn't very good.
>
> What appears to be happening is that you're borrowing exportfs's
> ability to encode file handles and this is being reused to transport
> inotify fd's to userspace for c/r? Or something else - I didn't try
> very hard. Please explain better?

Yes, we use fhandle mechanism to encode a watch target. This allows us
to remember fhandle on dump and open() it at restore time (ie when we do
restore a target we use open_by_handle_at syscall with fhandle).

I'll update the changelog and send it.

> > --- linux-2.6.git.orig/fs/notify/inotify/inotify.h
> > +++ linux-2.6.git/fs/notify/inotify/inotify.h
> > @@ -1,6 +1,7 @@
> > #include <linux/fsnotify_backend.h>
> > #include <linux/inotify.h>
> > #include <linux/slab.h> /* struct kmem_cache */
> > +#include <linux/exportfs.h>
> >
> > extern struct kmem_cache *event_priv_cachep;
> >
> > @@ -9,9 +10,16 @@ struct inotify_event_private_data {
> > int wd;
> > };
> >
> > +#if defined(CONFIG_PROC_FS) && defined(CONFIG_EXPORTFS) && defined(CONFIG_CHECKPOINT_RESTORE)
> > +# define INOTIFY_USE_FHANDLE
> > +#endif
>
> Does this mean that c/r will fail to work properly if
> CONFIG_EXPORTFS=n? If so, that should be expressed in Kconfig
> dependencies?

Well, strictly speaking -- yes. We need exportfs to be compiled in.
But note the c/r will fail iif the task we're dumping does use
inotify. if there is no inotify usage -- then nothing will fail
even if exportfs is not compiled in. Also in our tool itself we
provide the "check" command which does verify if all features
we need are provided by the kernel. I'll think about adding
this config entry to deps. Thanks!

> > struct inotify_inode_mark {
> > struct fsnotify_mark fsn_mark;
> > int wd;
> > +#ifdef INOTIFY_USE_FHANDLE
> > + __u8 fhandle[sizeof(struct file_handle) + MAX_HANDLE_SZ];
> > +#endif
> > };
>
> Whoa. This adds 128+8 bytes to the inotify_inode_mark. That's a lot of
> bloat, and there can be a lot of inotify_inode_mark's in the system?

Yes, that's why it's not turned on by default, only if is c/r turned on.
iirc I've been said that usually only about 40 bytes is used (in the tests
I met only 8 bytes). Letme re-check all things.

> Also, what about alignment? That embedded `struct file_handle' will
> have a 4-byte alignment and if there's anything in there which is an
> 8-byte quantity then some architectures (ia64?) might get upset about
> the kernel-mode unaligned access. It appears that this won't happen in
> the present code, but are we future-proof?
>
> Why did you use a __u8, anyway? Could have done something like
>
> struct {
> struct file_handle fhandle;
> u8 pad[MAX_HANDLE_SZ];
> };
>
> and got some additional type safety and less typecasting?

Good point! Agreed on all. I'll update, thanks!

> > +#ifdef INOTIFY_USE_FHANDLE
> > +static int inotify_encode_wd_fhandle(struct inotify_inode_mark *mark, struct dentry *dentry)
> > +{
> > + struct file_handle *fhandle = (struct file_handle *)mark->fhandle;
> > + int size, ret;
> > +
> > + BUILD_BUG_ON(sizeof(mark->fhandle) <= sizeof(struct file_handle));
> > +
> > + fhandle->handle_bytes = sizeof(mark->fhandle) - sizeof(struct file_handle);
> > + size = fhandle->handle_bytes >> 2;
> > +
> > + ret = exportfs_encode_fh(dentry, (struct fid *)fhandle->f_handle, &size, 0);
> > + if ((ret == 255) || (ret == -ENOSPC))
>
> Sigh. ENOSPC means "your disk is full". If this error ever gets back
> to userspace, our poor user will go and provision more disks and then
> wonder why that didn't fix anything.

Hmm, this errno is returned from exportfs_encode_fh. Letme think which one
is better here. I'll update. Thanks!

>
> > + return -EOVERFLOW;
>
> That doesn't seem very appropriate either, unsure.

Cyrill

2012-11-13 07:40:08

by Andrew Morton

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

On Tue, 13 Nov 2012 11:20:57 +0400 Cyrill Gorcunov <[email protected]> wrote:

> > > struct inotify_inode_mark {
> > > struct fsnotify_mark fsn_mark;
> > > int wd;
> > > +#ifdef INOTIFY_USE_FHANDLE
> > > + __u8 fhandle[sizeof(struct file_handle) + MAX_HANDLE_SZ];
> > > +#endif
> > > };
> >
> > Whoa. This adds 128+8 bytes to the inotify_inode_mark. That's a lot of
> > bloat, and there can be a lot of inotify_inode_mark's in the system?
>
> Yes, that's why it's not turned on by default, only if is c/r turned on.
> iirc I've been said that usually only about 40 bytes is used (in the tests
> I met only 8 bytes). Letme re-check all things.

The question is, how many `struct inotify_inode_mark's are instantiated
system-wide? Could be millions?

Dumb question: do we really need inotify_inode_mark.fhandle at all?
What prevents us from assembling this info on demand when ->show_fdinfo() is
called?

2012-11-13 08:00:40

by Cyrill Gorcunov

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

On Mon, Nov 12, 2012 at 11:40:01PM -0800, Andrew Morton wrote:
> On Tue, 13 Nov 2012 11:20:57 +0400 Cyrill Gorcunov <[email protected]> wrote:
> > >
> > > Whoa. This adds 128+8 bytes to the inotify_inode_mark. That's a lot of
> > > bloat, and there can be a lot of inotify_inode_mark's in the system?
> >
> > Yes, that's why it's not turned on by default, only if is c/r turned on.
> > iirc I've been said that usually only about 40 bytes is used (in the tests
> > I met only 8 bytes). Letme re-check all things.
>
> The question is, how many `struct inotify_inode_mark's are instantiated
> system-wide? Could be millions?

Well, hard to tell, to be fair. On my testing machine only apache has been
using inotify system as far as I remember, but for sure nothing except memory
limit the number of inotify. But I think if one running machine with millions
of inotify it's rather powerful machine with enough memory.

> Dumb question: do we really need inotify_inode_mark.fhandle at all?
> What prevents us from assembling this info on demand when ->show_fdinfo() is
> called?

exportfs requires the dentry to be passed as an argument while inotify works
with inodes instead and at moment of show-fdinfo the target dentry might be
already deleted but inode yet present as far as I remember.

2012-11-13 08:19:30

by David Rientjes

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

On Tue, 13 Nov 2012, Cyrill Gorcunov wrote:

> > The question is, how many `struct inotify_inode_mark's are instantiated
> > system-wide? Could be millions?
>
> Well, hard to tell, to be fair. On my testing machine only apache has been
> using inotify system as far as I remember, but for sure nothing except memory
> limit the number of inotify. But I think if one running machine with millions
> of inotify it's rather powerful machine with enough memory.
>

Seems easy to determine if you boot with slub_nomerge on the command line
and then read /sys/kernel/slab/inotify_inode_mark/objects. On my system,
that happens to be 210, but I'm sure you could come up with a realistic
synthetic workload to make it much higher.

2012-11-13 08:29:46

by Cyrill Gorcunov

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

On Tue, Nov 13, 2012 at 12:19:25AM -0800, David Rientjes wrote:
> On Tue, 13 Nov 2012, Cyrill Gorcunov wrote:
>
> > > The question is, how many `struct inotify_inode_mark's are instantiated
> > > system-wide? Could be millions?
> >
> > Well, hard to tell, to be fair. On my testing machine only apache has been
> > using inotify system as far as I remember, but for sure nothing except memory
> > limit the number of inotify. But I think if one running machine with millions
> > of inotify it's rather powerful machine with enough memory.
> >
>
> Seems easy to determine if you boot with slub_nomerge on the command line
> and then read /sys/kernel/slab/inotify_inode_mark/objects. On my system,
> that happens to be 210, but I'm sure you could come up with a realistic
> synthetic workload to make it much higher.

Which would give about 26K of additional memory if c/r get used here. Not a
big number i guess?

2012-11-13 14:40:44

by Cyrill Gorcunov

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

On Tue, Nov 13, 2012 at 12:37:23PM +0000, Tvrtko Ursulin wrote:
>> Which would give about 26K of additional memory if c/r get used here.
>> Not a big number i guess?
>
> I am pretty sure there are desktop file indexing packages which use
> inotify or fanotify which will put a mark on every single directory within
> users home.
>
> You probably need to test this with default installs of popular desktop
> environments and realistic home directories.

I'm about to shrink the handle down to 40/64 bytes as being proposed in one
of early review cycles (i'll do that with patch on top), which should minimize
the amount of memory needed (look, it's pretty clear that if the system
uses millions of inotify watchers each inotify mark will need the fhandle here
in c/r sake, i simply see no way at moment how to escape this completely,
but if the c/r is turned off, which is by default, no additional memory
needed).

p.s.: could you please don't use the html formatted messages

2012-11-13 15:02:49

by Tvrtko Ursulin

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

On Tuesday 13 November 2012 18:40:36 Cyrill Gorcunov wrote:
> On Tue, Nov 13, 2012 at 12:37:23PM +0000, Tvrtko Ursulin wrote:
> >> Which would give about 26K of additional memory if c/r get used here.
> >>
> >> Not a big number i guess?
> >
> > I am pretty sure there are desktop file indexing packages which use
> > inotify or fanotify which will put a mark on every single directory within
> > users home.
> >
> > You probably need to test this with default installs of popular desktop
> > environments and realistic home directories.
>
> I'm about to shrink the handle down to 40/64 bytes as being proposed in one
> of early review cycles (i'll do that with patch on top), which should
> minimize the amount of memory needed (look, it's pretty clear that if the
> system uses millions of inotify watchers each inotify mark will need the
> fhandle here in c/r sake, i simply see no way at moment how to escape this

Well I spotted uncertainty in this thread about how many of these structures
will typically be instantiated at runtime which is what I tried to add to this
discussion. I have 60k directories in my home for example...

If you don't want to use information I provided that is your choice. Because
you are still probably doubling this structure. Give or take - I haven't
actually bothered counting.

Perhaps there could be a different way, where you could use additional space
only when it is actually used at runtime. But as I said, I am not following
closely.

> completely, but if the c/r is turned off, which is by default, no
> additional memory needed).

By turned on and off you are not talking about runtime but about kernel
compile time, right? Do you envisage distributions turning this on, like they
do for most things?

> p.s.: could you please don't use the html formatted messages

I thought I sent a multi-part message which had a plain text part as well.
Granted this is not something I usually do, but on this occasion haven't
completed configuring my email client after a configuration loss.

Regards,

Tvrtko

2012-11-13 15:28:53

by Cyrill Gorcunov

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

On Tue, Nov 13, 2012 at 03:02:22PM +0000, Tvrtko Ursulin wrote:
> On Tuesday 13 November 2012 18:40:36 Cyrill Gorcunov wrote:
> > On Tue, Nov 13, 2012 at 12:37:23PM +0000, Tvrtko Ursulin wrote:
> > >> Which would give about 26K of additional memory if c/r get used here.
> > >>
> > >> Not a big number i guess?
> > >
> > > I am pretty sure there are desktop file indexing packages which use
> > > inotify or fanotify which will put a mark on every single directory within
> > > users home.
> > >
> > > You probably need to test this with default installs of popular desktop
> > > environments and realistic home directories.
> >
> > I'm about to shrink the handle down to 40/64 bytes as being proposed in one
> > of early review cycles (i'll do that with patch on top), which should
> > minimize the amount of memory needed (look, it's pretty clear that if the
> > system uses millions of inotify watchers each inotify mark will need the
> > fhandle here in c/r sake, i simply see no way at moment how to escape this
>
> Well I spotted uncertainty in this thread about how many of these structures
> will typically be instantiated at runtime which is what I tried to add to this
> discussion. I have 60k directories in my home for example...

And I appreciate it, really, thanks for info!

> If you don't want to use information I provided that is your choice. Because
> you are still probably doubling this structure. Give or take - I haven't
> actually bothered counting.
>
> Perhaps there could be a different way, where you could use additional space
> only when it is actually used at runtime. But as I said, I am not following
> closely.

Unfortunatelly at moment I see no way how to make it in on-demand fashion.

>
> > completely, but if the c/r is turned off, which is by default, no
> > additional memory needed).
>
> By turned on and off you are not talking about runtime but about kernel
> compile time, right? Do you envisage distributions turning this on, like they
> do for most things?

yeah, compile time only. i don't expect it to be turned on by default
sometime soon but sure distro people have own opinions.

Cyrill

2012-11-13 22:38:12

by Andrew Morton

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

On Tue, 13 Nov 2012 12:00:32 +0400
Cyrill Gorcunov <[email protected]> wrote:

> > Dumb question: do we really need inotify_inode_mark.fhandle at all?
> > What prevents us from assembling this info on demand when ->show_fdinfo() is
> > called?
>
> exportfs requires the dentry to be passed as an argument while inotify works
> with inodes instead and at moment of show-fdinfo the target dentry might be
> already deleted but inode yet present as far as I remember.

How can the c/r restore code reestablish the inode data if the dentry
isn't there any more?

2012-11-14 06:46:21

by Cyrill Gorcunov

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

On Tue, Nov 13, 2012 at 02:38:08PM -0800, Andrew Morton wrote:
> On Tue, 13 Nov 2012 12:00:32 +0400
> Cyrill Gorcunov <[email protected]> wrote:
>
> > > Dumb question: do we really need inotify_inode_mark.fhandle at all?
> > > What prevents us from assembling this info on demand when ->show_fdinfo() is
> > > called?
> >
> > exportfs requires the dentry to be passed as an argument while inotify works
> > with inodes instead and at moment of show-fdinfo the target dentry might be
> > already deleted but inode yet present as far as I remember.
>
> How can the c/r restore code reestablish the inode data if the dentry
> isn't there any more?

By "deleted" I meant deleted from dcache, thus when we call for
open_by_handle_at with fhandle, the kernel reconstruct the path
and we simply read the /proc/self/fd/ link, and then pass this
path to inotify_add_watch.

2012-11-14 09:21:09

by Tvrtko Ursulin

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

On Tuesday 13 November 2012 19:28:46 Cyrill Gorcunov wrote:
> On Tue, Nov 13, 2012 at 03:02:22PM +0000, Tvrtko Ursulin wrote:
> > Perhaps there could be a different way, where you could use additional
> > space only when it is actually used at runtime. But as I said, I am not
> > following closely.
>
> Unfortunatelly at moment I see no way how to make it in on-demand fashion.

You could not use a pointer and then allocate your buffers on the check point
operation, freeing on restore?

Regards,

Tvrtko

2012-11-14 09:38:57

by Cyrill Gorcunov

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

On Wed, Nov 14, 2012 at 09:20:51AM +0000, Tvrtko Ursulin wrote:
> On Tuesday 13 November 2012 19:28:46 Cyrill Gorcunov wrote:
> > On Tue, Nov 13, 2012 at 03:02:22PM +0000, Tvrtko Ursulin wrote:
> > > Perhaps there could be a different way, where you could use additional
> > > space only when it is actually used at runtime. But as I said, I am not
> > > following closely.
> >
> > Unfortunatelly at moment I see no way how to make it in on-demand fashion.
>
> You could not use a pointer and then allocate your buffers on the check point
> operation, freeing on restore?

The problem is not allocating the memory itself but rather the time when the
information needed (ie the dentry) is available. The only moment when we can
use dentry of the target file/directory is at inotify_new_watch, that's why
i need to compose fhandle that early. At any later point we simply have no
dentry to use.

Cyrill

2012-11-14 09:51:13

by Tvrtko Ursulin

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

On Wednesday 14 November 2012 13:38:49 Cyrill Gorcunov wrote:
> On Wed, Nov 14, 2012 at 09:20:51AM +0000, Tvrtko Ursulin wrote:
> > On Tuesday 13 November 2012 19:28:46 Cyrill Gorcunov wrote:
> > > On Tue, Nov 13, 2012 at 03:02:22PM +0000, Tvrtko Ursulin wrote:
> > > > Perhaps there could be a different way, where you could use additional
> > > > space only when it is actually used at runtime. But as I said, I am
> > > > not
> > > > following closely.
> > >
> > > Unfortunatelly at moment I see no way how to make it in on-demand
> > > fashion.
> >
> > You could not use a pointer and then allocate your buffers on the check
> > point operation, freeing on restore?
>
> The problem is not allocating the memory itself but rather the time when the
> information needed (ie the dentry) is available. The only moment when we
> can use dentry of the target file/directory is at inotify_new_watch, that's
> why i need to compose fhandle that early. At any later point we simply have
> no dentry to use.

But you do not fundamentally need the dentry to restore a watch, right?
Couldn't you restore, creating a new restore path if needed, using the inode
which is pinned anyway while the watch exists?

Regards,

Tvrtko

2012-11-14 09:58:20

by Cyrill Gorcunov

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

On Wed, Nov 14, 2012 at 09:50:55AM +0000, Tvrtko Ursulin wrote:
> > > You could not use a pointer and then allocate your buffers on the check
> > > point operation, freeing on restore?
> >
> > The problem is not allocating the memory itself but rather the time when the
> > information needed (ie the dentry) is available. The only moment when we
> > can use dentry of the target file/directory is at inotify_new_watch, that's
> > why i need to compose fhandle that early. At any later point we simply have
> > no dentry to use.
>
> But you do not fundamentally need the dentry to restore a watch, right?

dentry only needed to encode the file handle.

> Couldn't you restore, creating a new restore path if needed, using the inode
> which is pinned anyway while the watch exists?

plain inode is not enough as far as i can tell, iow i don't see the way
to restore path from inode solely. or there something i miss?

Cyrill

2012-11-14 10:09:10

by Tvrtko Ursulin

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

On Wednesday 14 November 2012 13:58:12 Cyrill Gorcunov wrote:
> On Wed, Nov 14, 2012 at 09:50:55AM +0000, Tvrtko Ursulin wrote:
> > > > You could not use a pointer and then allocate your buffers on the
> > > > check
> > > > point operation, freeing on restore?
> > >
> > > The problem is not allocating the memory itself but rather the time when
> > > the information needed (ie the dentry) is available. The only moment
> > > when we can use dentry of the target file/directory is at
> > > inotify_new_watch, that's why i need to compose fhandle that early. At
> > > any later point we simply have no dentry to use.
> >
> > But you do not fundamentally need the dentry to restore a watch, right?
>
> dentry only needed to encode the file handle.
>
> > Couldn't you restore, creating a new restore path if needed, using the
> > inode which is pinned anyway while the watch exists?
>
> plain inode is not enough as far as i can tell, iow i don't see the way
> to restore path from inode solely. or there something i miss?

I don't know, as I said I was not following this at all until now. Just
throwing in ideas.

I thought, since inotify does not use the path or dentry outside the system
call at all, perhaps you need a different entry point allowing you to restore
the watch using the inode or something. Assuming life time of objects and
stuff in C&R world would allow you that. Since you don't need the full path,
just something 64 bytes long, I assumed that could be the case.

Regards,

Tvrtko

2012-11-14 10:11:14

by Pavel Emelyanov

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

On 11/14/2012 10:46 AM, Cyrill Gorcunov wrote:
> On Tue, Nov 13, 2012 at 02:38:08PM -0800, Andrew Morton wrote:
>> On Tue, 13 Nov 2012 12:00:32 +0400
>> Cyrill Gorcunov <[email protected]> wrote:
>>
>>>> Dumb question: do we really need inotify_inode_mark.fhandle at all?
>>>> What prevents us from assembling this info on demand when ->show_fdinfo() is
>>>> called?
>>>
>>> exportfs requires the dentry to be passed as an argument while inotify works
>>> with inodes instead and at moment of show-fdinfo the target dentry might be
>>> already deleted but inode yet present as far as I remember.
>>
>> How can the c/r restore code reestablish the inode data if the dentry
>> isn't there any more?
>
> By "deleted" I meant deleted from dcache, thus when we call for
> open_by_handle_at with fhandle, the kernel reconstruct the path
> and we simply read the /proc/self/fd/ link, and then pass this
> path to inotify_add_watch.

No we don't do readlink as the path we'd see would be empty. Instead after
we called the open_by_handle_at, we pass the "/proc/self/fd/<fd>" _path_ itself
to inotify_add_watch. The path resolution code follows the link properly and
adds the target inode into the watch list.

> .
>

2012-11-14 10:13:51

by Cyrill Gorcunov

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

On Wed, Nov 14, 2012 at 02:10:50PM +0400, Pavel Emelyanov wrote:
> >>
> >> How can the c/r restore code reestablish the inode data if the dentry
> >> isn't there any more?
> >
> > By "deleted" I meant deleted from dcache, thus when we call for
> > open_by_handle_at with fhandle, the kernel reconstruct the path
> > and we simply read the /proc/self/fd/ link, and then pass this
> > path to inotify_add_watch.
>
> No we don't do readlink as the path we'd see would be empty. Instead after
> we called the open_by_handle_at, we pass the "/proc/self/fd/<fd>" _path_ itself
> to inotify_add_watch. The path resolution code follows the link properly and
> adds the target inode into the watch list.

Yeah, sorry for confusion.

2012-11-14 10:15:03

by Pavel Emelyanov

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

On 11/14/2012 02:08 PM, Tvrtko Ursulin wrote:
> On Wednesday 14 November 2012 13:58:12 Cyrill Gorcunov wrote:
>> On Wed, Nov 14, 2012 at 09:50:55AM +0000, Tvrtko Ursulin wrote:
>>>>> You could not use a pointer and then allocate your buffers on the
>>>>> check
>>>>> point operation, freeing on restore?
>>>>
>>>> The problem is not allocating the memory itself but rather the time when
>>>> the information needed (ie the dentry) is available. The only moment
>>>> when we can use dentry of the target file/directory is at
>>>> inotify_new_watch, that's why i need to compose fhandle that early. At
>>>> any later point we simply have no dentry to use.
>>>
>>> But you do not fundamentally need the dentry to restore a watch, right?
>>
>> dentry only needed to encode the file handle.
>>
>>> Couldn't you restore, creating a new restore path if needed, using the
>>> inode which is pinned anyway while the watch exists?
>>
>> plain inode is not enough as far as i can tell, iow i don't see the way
>> to restore path from inode solely. or there something i miss?
>
> I don't know, as I said I was not following this at all until now. Just
> throwing in ideas.
>
> I thought, since inotify does not use the path or dentry outside the system
> call at all, perhaps you need a different entry point allowing you to restore
> the watch using the inode or something. Assuming life time of objects and
> stuff in C&R world would allow you that. Since you don't need the full path,
> just something 64 bytes long, I assumed that could be the case.

Well, the kernel already has all the API we need but one -- it shows us _nothing_
about the inode being watched. And we'd appreciate any information about it. Even
the ino:dev pair would work. We propose to show the handle because we believe, that
such API is better that ino:dev. You can get the handle, call the open_by_handle_at
right at once and get much much more information about the inode with any other
API (e.g. calling fstat() will give you the ino:dev pair). Having just ino:dev
pair at hands is not that flexible.

> Regards,
>
> Tvrtko
>
> .
>

2012-11-14 10:38:37

by Tvrtko Ursulin

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

On Wednesday 14 November 2012 14:13:41 Pavel Emelyanov wrote:
> On 11/14/2012 02:08 PM, Tvrtko Ursulin wrote:
> > On Wednesday 14 November 2012 13:58:12 Cyrill Gorcunov wrote:
> >> On Wed, Nov 14, 2012 at 09:50:55AM +0000, Tvrtko Ursulin wrote:
> >>>>> You could not use a pointer and then allocate your buffers on the
> >>>>> check
> >>>>> point operation, freeing on restore?
> >>>>
> >>>> The problem is not allocating the memory itself but rather the time
> >>>> when
> >>>> the information needed (ie the dentry) is available. The only moment
> >>>> when we can use dentry of the target file/directory is at
> >>>> inotify_new_watch, that's why i need to compose fhandle that early. At
> >>>> any later point we simply have no dentry to use.
> >>>
> >>> But you do not fundamentally need the dentry to restore a watch, right?
> >>
> >> dentry only needed to encode the file handle.
> >>
> >>> Couldn't you restore, creating a new restore path if needed, using the
> >>> inode which is pinned anyway while the watch exists?
> >>
> >> plain inode is not enough as far as i can tell, iow i don't see the way
> >> to restore path from inode solely. or there something i miss?
> >
> > I don't know, as I said I was not following this at all until now. Just
> > throwing in ideas.
> >
> > I thought, since inotify does not use the path or dentry outside the
> > system
> > call at all, perhaps you need a different entry point allowing you to
> > restore the watch using the inode or something. Assuming life time of
> > objects and stuff in C&R world would allow you that. Since you don't need
> > the full path, just something 64 bytes long, I assumed that could be the
> > case.
>
> Well, the kernel already has all the API we need but one -- it shows us
> _nothing_ about the inode being watched. And we'd appreciate any
> information about it. Even the ino:dev pair would work. We propose to show
> the handle because we believe, that such API is better that ino:dev. You
> can get the handle, call the open_by_handle_at right at once and get much
> much more information about the inode with any other API (e.g. calling
> fstat() will give you the ino:dev pair). Having just ino:dev pair at hands
> is not that flexible.

How much space does a typical file system need to encode a handle? Am I right
that for must it is just a few bytes? (I just glanced at the code so I might
be wrong.) In which case, could the handle buffer be allocated dynamically
depending on the underlying filesystem? Perhaps adding a facility to query a
filesystem about its maximum handle buffer needs? Do you think the saving
would justify this extra work?

Regards,

Tvrtko

2012-11-14 10:47:09

by Pavel Emelyanov

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

On 11/14/2012 02:38 PM, Tvrtko Ursulin wrote:
> On Wednesday 14 November 2012 14:13:41 Pavel Emelyanov wrote:
>> On 11/14/2012 02:08 PM, Tvrtko Ursulin wrote:
>>> On Wednesday 14 November 2012 13:58:12 Cyrill Gorcunov wrote:
>>>> On Wed, Nov 14, 2012 at 09:50:55AM +0000, Tvrtko Ursulin wrote:
>>>>>>> You could not use a pointer and then allocate your buffers on the
>>>>>>> check
>>>>>>> point operation, freeing on restore?
>>>>>>
>>>>>> The problem is not allocating the memory itself but rather the time
>>>>>> when
>>>>>> the information needed (ie the dentry) is available. The only moment
>>>>>> when we can use dentry of the target file/directory is at
>>>>>> inotify_new_watch, that's why i need to compose fhandle that early. At
>>>>>> any later point we simply have no dentry to use.
>>>>>
>>>>> But you do not fundamentally need the dentry to restore a watch, right?
>>>>
>>>> dentry only needed to encode the file handle.
>>>>
>>>>> Couldn't you restore, creating a new restore path if needed, using the
>>>>> inode which is pinned anyway while the watch exists?
>>>>
>>>> plain inode is not enough as far as i can tell, iow i don't see the way
>>>> to restore path from inode solely. or there something i miss?
>>>
>>> I don't know, as I said I was not following this at all until now. Just
>>> throwing in ideas.
>>>
>>> I thought, since inotify does not use the path or dentry outside the
>>> system
>>> call at all, perhaps you need a different entry point allowing you to
>>> restore the watch using the inode or something. Assuming life time of
>>> objects and stuff in C&R world would allow you that. Since you don't need
>>> the full path, just something 64 bytes long, I assumed that could be the
>>> case.
>>
>> Well, the kernel already has all the API we need but one -- it shows us
>> _nothing_ about the inode being watched. And we'd appreciate any
>> information about it. Even the ino:dev pair would work. We propose to show
>> the handle because we believe, that such API is better that ino:dev. You
>> can get the handle, call the open_by_handle_at right at once and get much
>> much more information about the inode with any other API (e.g. calling
>> fstat() will give you the ino:dev pair). Having just ino:dev pair at hands
>> is not that flexible.
>
> How much space does a typical file system need to encode a handle? Am I right
> that for must it is just a few bytes? (I just glanced at the code so I might
> be wrong.) In which case, could the handle buffer be allocated dynamically
> depending on the underlying filesystem? Perhaps adding a facility to query a
> filesystem about its maximum handle buffer needs? Do you think the saving
> would justify this extra work?

Well, the MAX_HANDLE_SZ is taken from NFSv4 and is 128 bytes which is quite
big for inotify extension indeed. The good news is that this amount of bytes
seem to be required for the most descriptive fhandle -- with info about parent,
etc. We don't need such, we can live with shorter handle, people said that
40 bytes was enough for that.

However, your idea about determining the handle size dynamically seems promising.
As far as I can see from the code we can call for encode_fh with size equals zero
and filesystem would report back the amount of bytes it requires for a handle.

We can try going this route, what do you think?

> Regards,
>
> Tvrtko

Thanks,
Pavel

2012-11-14 10:54:24

by Tvrtko Ursulin

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

On Wednesday 14 November 2012 14:46:42 Pavel Emelyanov wrote:
> On 11/14/2012 02:38 PM, Tvrtko Ursulin wrote:
> > On Wednesday 14 November 2012 14:13:41 Pavel Emelyanov wrote:
> >> On 11/14/2012 02:08 PM, Tvrtko Ursulin wrote:
> >>> On Wednesday 14 November 2012 13:58:12 Cyrill Gorcunov wrote:
> >>>> On Wed, Nov 14, 2012 at 09:50:55AM +0000, Tvrtko Ursulin wrote:
> >>>>>>> You could not use a pointer and then allocate your buffers on the
> >>>>>>> check
> >>>>>>> point operation, freeing on restore?
> >>>>>>
> >>>>>> The problem is not allocating the memory itself but rather the time
> >>>>>> when
> >>>>>> the information needed (ie the dentry) is available. The only moment
> >>>>>> when we can use dentry of the target file/directory is at
> >>>>>> inotify_new_watch, that's why i need to compose fhandle that early.
> >>>>>> At
> >>>>>> any later point we simply have no dentry to use.
> >>>>>
> >>>>> But you do not fundamentally need the dentry to restore a watch,
> >>>>> right?
> >>>>
> >>>> dentry only needed to encode the file handle.
> >>>>
> >>>>> Couldn't you restore, creating a new restore path if needed, using the
> >>>>> inode which is pinned anyway while the watch exists?
> >>>>
> >>>> plain inode is not enough as far as i can tell, iow i don't see the way
> >>>> to restore path from inode solely. or there something i miss?
> >>>
> >>> I don't know, as I said I was not following this at all until now. Just
> >>> throwing in ideas.
> >>>
> >>> I thought, since inotify does not use the path or dentry outside the
> >>> system
> >>> call at all, perhaps you need a different entry point allowing you to
> >>> restore the watch using the inode or something. Assuming life time of
> >>> objects and stuff in C&R world would allow you that. Since you don't
> >>> need
> >>> the full path, just something 64 bytes long, I assumed that could be the
> >>> case.
> >>
> >> Well, the kernel already has all the API we need but one -- it shows us
> >> _nothing_ about the inode being watched. And we'd appreciate any
> >> information about it. Even the ino:dev pair would work. We propose to
> >> show
> >> the handle because we believe, that such API is better that ino:dev. You
> >> can get the handle, call the open_by_handle_at right at once and get much
> >> much more information about the inode with any other API (e.g. calling
> >> fstat() will give you the ino:dev pair). Having just ino:dev pair at
> >> hands
> >> is not that flexible.
> >
> > How much space does a typical file system need to encode a handle? Am I
> > right that for must it is just a few bytes? (I just glanced at the code
> > so I might be wrong.) In which case, could the handle buffer be allocated
> > dynamically depending on the underlying filesystem? Perhaps adding a
> > facility to query a filesystem about its maximum handle buffer needs? Do
> > you think the saving would justify this extra work?
>
> Well, the MAX_HANDLE_SZ is taken from NFSv4 and is 128 bytes which is quite
> big for inotify extension indeed. The good news is that this amount of bytes
> seem to be required for the most descriptive fhandle -- with info about
> parent, etc. We don't need such, we can live with shorter handle, people
> said that 40 bytes was enough for that.
>
> However, your idea about determining the handle size dynamically seems
> promising. As far as I can see from the code we can call for encode_fh with
> size equals zero and filesystem would report back the amount of bytes it
> requires for a handle.
>
> We can try going this route, what do you think?

Sounds much better since that would only add one pointer to the watch
structure in the normal case.

Also at checkpoint time it will use only a few bytes (compared to 64) for the
encode buffer for most filesystems. This part is probably not that important
but still a win.

Regards,

Tvrtko

2012-11-14 10:57:17

by Pavel Emelyanov

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

>>> How much space does a typical file system need to encode a handle? Am I
>>> right that for must it is just a few bytes? (I just glanced at the code
>>> so I might be wrong.) In which case, could the handle buffer be allocated
>>> dynamically depending on the underlying filesystem? Perhaps adding a
>>> facility to query a filesystem about its maximum handle buffer needs? Do
>>> you think the saving would justify this extra work?
>>
>> Well, the MAX_HANDLE_SZ is taken from NFSv4 and is 128 bytes which is quite
>> big for inotify extension indeed. The good news is that this amount of bytes
>> seem to be required for the most descriptive fhandle -- with info about
>> parent, etc. We don't need such, we can live with shorter handle, people
>> said that 40 bytes was enough for that.
>>
>> However, your idea about determining the handle size dynamically seems
>> promising. As far as I can see from the code we can call for encode_fh with
>> size equals zero and filesystem would report back the amount of bytes it
>> requires for a handle.
>>
>> We can try going this route, what do you think?
>
> Sounds much better since that would only add one pointer to the watch
> structure in the normal case.
>
> Also at checkpoint time it will use only a few bytes (compared to 64) for the
> encode buffer for most filesystems. This part is probably not that important
> but still a win.

No, the thing is -- we need to know the handle _before_ we start checkpoint.
More exactly -- at the time the inotify_add_watch is called. So the memory save
would be not that big.

> Regards,
>
> Tvrtko

Thanks,
Pavel

2012-11-14 11:03:42

by Cyrill Gorcunov

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

On Wed, Nov 14, 2012 at 02:56:00PM +0400, Pavel Emelyanov wrote:
> >>> How much space does a typical file system need to encode a handle? Am I
> >>> right that for must it is just a few bytes? (I just glanced at the code
> >>> so I might be wrong.) In which case, could the handle buffer be allocated
> >>> dynamically depending on the underlying filesystem? Perhaps adding a
> >>> facility to query a filesystem about its maximum handle buffer needs? Do
> >>> you think the saving would justify this extra work?
> >>
> >> Well, the MAX_HANDLE_SZ is taken from NFSv4 and is 128 bytes which is quite
> >> big for inotify extension indeed. The good news is that this amount of bytes
> >> seem to be required for the most descriptive fhandle -- with info about
> >> parent, etc. We don't need such, we can live with shorter handle, people
> >> said that 40 bytes was enough for that.
> >>
> >> However, your idea about determining the handle size dynamically seems
> >> promising. As far as I can see from the code we can call for encode_fh with
> >> size equals zero and filesystem would report back the amount of bytes it
> >> requires for a handle.
> >>
> >> We can try going this route, what do you think?
> >
> > Sounds much better since that would only add one pointer to the watch
> > structure in the normal case.
> >
> > Also at checkpoint time it will use only a few bytes (compared to 64) for the
> > encode buffer for most filesystems. This part is probably not that important
> > but still a win.
>
> No, the thing is -- we need to know the handle _before_ we start checkpoint.
> More exactly -- at the time the inotify_add_watch is called. So the memory save
> would be not that big.

for the worst case (NFSv4) it'll be 128 bytes + 8|4 byte pointer, but for
more common cases such as extXfs it'll be shrinked down to 8 byte handle +
8|4 byte pointer, which is a pretty good i think.

2012-11-14 11:13:18

by Tvrtko Ursulin

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

On Wednesday 14 November 2012 14:56:00 Pavel Emelyanov wrote:
> >>> How much space does a typical file system need to encode a handle? Am I
> >>> right that for must it is just a few bytes? (I just glanced at the code
> >>> so I might be wrong.) In which case, could the handle buffer be
> >>> allocated
> >>> dynamically depending on the underlying filesystem? Perhaps adding a
> >>> facility to query a filesystem about its maximum handle buffer needs? Do
> >>> you think the saving would justify this extra work?
> >>
> >> Well, the MAX_HANDLE_SZ is taken from NFSv4 and is 128 bytes which is
> >> quite
> >> big for inotify extension indeed. The good news is that this amount of
> >> bytes seem to be required for the most descriptive fhandle -- with info
> >> about parent, etc. We don't need such, we can live with shorter handle,
> >> people said that 40 bytes was enough for that.
> >>
> >> However, your idea about determining the handle size dynamically seems
> >> promising. As far as I can see from the code we can call for encode_fh
> >> with
> >> size equals zero and filesystem would report back the amount of bytes it
> >> requires for a handle.
> >>
> >> We can try going this route, what do you think?
> >
> > Sounds much better since that would only add one pointer to the watch
> > structure in the normal case.
> >
> > Also at checkpoint time it will use only a few bytes (compared to 64) for
> > the encode buffer for most filesystems. This part is probably not that
> > important but still a win.
>
> No, the thing is -- we need to know the handle _before_ we start checkpoint.
> More exactly -- at the time the inotify_add_watch is called. So the memory
> save would be not that big.

Ah yes, I forgot about that. But the saving is quite solid as Cyrill already
wrote.

It is still a bit unfortunate you have to have handles allocated all the time
just because C&R is compiled in. There is no way you could ask the filesystem
to create you one on demand. What would you need? Just the superblock and
inode, or more?

Regards,

Tvrtko

2012-11-14 12:45:56

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

On Wed, Nov 14, 2012 at 02:46:42PM +0400, Pavel Emelyanov wrote:
> Well, the MAX_HANDLE_SZ is taken from NFSv4 and is 128 bytes which is quite
> big for inotify extension indeed. The good news is that this amount of bytes
> seem to be required for the most descriptive fhandle

That 128-byte constant is just the protocol-defined maximum.

In practice my memory is that no existing filesystems require NFSv4 for
exports, so they all fit in the NFSv3 64-byte limit. (But I seem to
recall the NFSv2 32-byte limit being too small in some cases.)

> -- with info about parent, etc. We don't need such, we can live with
> shorter handle, people said that 40 bytes was enough for that.
>
> However, your idea about determining the handle size dynamically seems
> promising. As far as I can see from the code we can call for
> encode_fh with size equals zero and filesystem would report back the
> amount of bytes it requires for a handle.
>
> We can try going this route, what do you think?

I still don't understand why you need a dentry to get the filehandle.
The current api may ask for one, but it shouldn't really be necessary
(assuming you don't want parent directory information encoded in the
filehandle, which I hope you don't).

--b.

2012-11-14 13:03:54

by Cyrill Gorcunov

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

On Wed, Nov 14, 2012 at 07:45:49AM -0500, J. Bruce Fields wrote:
> >
> > We can try going this route, what do you think?
>
> I still don't understand why you need a dentry to get the filehandle.
> The current api may ask for one, but it shouldn't really be necessary
> (assuming you don't want parent directory information encoded in the
> filehandle, which I hope you don't).

As far as I know we don't need parent encoded. So Bruce, you think
to modify exportfs instead to work with inode directly?

Cyrill

2012-11-14 13:26:27

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

On Wed, Nov 14, 2012 at 05:03:47PM +0400, Cyrill Gorcunov wrote:
> On Wed, Nov 14, 2012 at 07:45:49AM -0500, J. Bruce Fields wrote:
> > >
> > > We can try going this route, what do you think?
> >
> > I still don't understand why you need a dentry to get the filehandle.
> > The current api may ask for one, but it shouldn't really be necessary
> > (assuming you don't want parent directory information encoded in the
> > filehandle, which I hope you don't).
>
> As far as I know we don't need parent encoded. So Bruce, you think
> to modify exportfs instead to work with inode directly?

Looks like the filesystem encode_fh method just takes inodes (with the
parent inode allowed to be NULL), so all you'd need would be a version
of exportfs_encode_fh that took an inode.

(Worst case, if that didn't work, you could fake up a dentry with
something like d_obtain_alias, but better not to if it's not necessary.)

--b.

2012-11-14 13:33:03

by Cyrill Gorcunov

[permalink] [raw]
Subject: Re: [patch 3/7] fs, notify: Add file handle entry into inotify_inode_mark

On Wed, Nov 14, 2012 at 08:26:22AM -0500, J. Bruce Fields wrote:
> On Wed, Nov 14, 2012 at 05:03:47PM +0400, Cyrill Gorcunov wrote:
> > On Wed, Nov 14, 2012 at 07:45:49AM -0500, J. Bruce Fields wrote:
> > > >
> > > > We can try going this route, what do you think?
> > >
> > > I still don't understand why you need a dentry to get the filehandle.
> > > The current api may ask for one, but it shouldn't really be necessary
> > > (assuming you don't want parent directory information encoded in the
> > > filehandle, which I hope you don't).
> >
> > As far as I know we don't need parent encoded. So Bruce, you think
> > to modify exportfs instead to work with inode directly?
>
> Looks like the filesystem encode_fh method just takes inodes (with the
> parent inode allowed to be NULL), so all you'd need would be a version
> of exportfs_encode_fh that took an inode.
>
> (Worst case, if that didn't work, you could fake up a dentry with
> something like d_obtain_alias, but better not to if it's not necessary.)

Yeah, I'll update the series, test it and show ther results. Thanks!

Cyrill