Hi All,
The following patchset is a result of previous discussions regarding
file system threshold notifications. It introduces support for file
system event notifications, sent through the generic netlink interface
whenever an fs-related event occurs. Included are also some shmem
and ext4 changes showing how the new interface might actually be used.
The very idea of using the generic netlink interface has been previously
suggested here: https://lkml.org/lkml/2011/8/18/169
The basic description of the new functionality can be found in
the first patch from this set - both in the commit message and
in the doc file.
Some very basic tests have been performed though still this is
a PoC version. Below though is a sample user space application
which subscribes to the new multicast group and listens for
potential fs-related events. The code has been based on libnl 3.4
and its test application for the generic netlink.
---
Beata Michalska (4):
fs: Add generic file system event notifications
ext4: Add helper function to mark group as corrupted
ext4: Add support for generic FS events
shmem: Add support for generic FS events
Documentation/filesystems/events.txt | 254 +++++++++++
fs/Makefile | 1 +
fs/events/Makefile | 6 +
fs/events/fs_event.c | 775 ++++++++++++++++++++++++++++++++++
fs/events/fs_event.h | 27 ++
fs/events/fs_event_netlink.c | 94 +++++
fs/ext4/balloc.c | 26 +-
fs/ext4/ext4.h | 10 +
fs/ext4/ialloc.c | 5 +-
fs/ext4/inode.c | 2 +-
fs/ext4/mballoc.c | 17 +-
fs/ext4/resize.c | 1 +
fs/ext4/super.c | 43 ++
fs/namespace.c | 1 +
include/linux/fs.h | 6 +-
include/linux/fs_event.h | 69 +++
include/uapi/linux/fs_event.h | 62 +++
include/uapi/linux/genetlink.h | 1 +
mm/shmem.c | 39 +-
net/netlink/genetlink.c | 7 +-
20 files changed, 1412 insertions(+), 34 deletions(-)
create mode 100644 Documentation/filesystems/events.txt
create mode 100644 fs/events/Makefile
create mode 100644 fs/events/fs_event.c
create mode 100644 fs/events/fs_event.h
create mode 100644 fs/events/fs_event_netlink.c
create mode 100644 include/linux/fs_event.h
create mode 100644 include/uapi/linux/fs_event.h
---
Sample application:
#include <netlink/cli/utils.h>
#include <fs_event.h>
#define ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0]))
#define LOG(args...) fprintf(stderr, args)
/*
 * Handler for FS_EVENT_TYPE_NEW_TRACE messages: logs the trace id and the
 * mount point carried in the message attributes. A missing trace id is
 * logged as -1 and a missing mount point as "unknown". Always returns 0.
 */
static int parse_info(struct nl_cache_ops *unused, struct genl_cmd *cmd,
		      struct genl_info *info, void *arg)
{
	struct nlattr *id_attr = info->attrs[FS_EVENT_ATR_FS_ID];
	struct nlattr *mnt_attr = info->attrs[FS_EVENT_ATR_MOUNT];
	/* Convert up front so the value handed to "%d" really is an int. */
	int trace_id = id_attr ? (int)nla_get_u32(id_attr) : -1;

	LOG("New trace %d:\n", trace_id);
	LOG("Mount point: %s\n",
	    mnt_attr ? nla_get_string(mnt_attr) : "unknown");
	return 0;
}
/*
 * Handler for FS_EVENT_TYPE_THRESH messages: logs the trace id, the backing
 * device major/minor numbers (only when present), the id of the process
 * that caused the event and the threshold data. Always returns 0.
 */
static int parse_thres(struct nl_cache_ops *unused, struct genl_cmd *cmd,
		       struct genl_info *info, void *arg)
{
	struct nlattr **attrs = info->attrs;
	struct nlattr *id_attr = attrs[FS_EVENT_ATR_FS_ID];
	struct nlattr *proc_attr = attrs[FS_EVENT_ATR_CAUSED_ID];
	struct nlattr *data_attr = attrs[FS_EVENT_ATR_DATA];
	/* Convert explicitly so every value matches its conversion specifier. */
	int trace_id = id_attr ? (int)nla_get_u32(id_attr) : -1;
	/* A missing process id is still printed as the all-ones value. */
	unsigned int proc = proc_attr ? nla_get_u32(proc_attr)
				      : (unsigned int)-1;
	unsigned long long thresh_data = data_attr ? nla_get_u64(data_attr) : 0;

	LOG("Threshold notification received for trace %d:\n", trace_id);
	if (attrs[FS_EVENT_ATR_DEV_MAJOR])
		LOG("Backing dev major: %u\n",
		    nla_get_u32(attrs[FS_EVENT_ATR_DEV_MAJOR]));
	if (attrs[FS_EVENT_ATR_DEV_MINOR])
		LOG("Backing dev minor: %u\n",
		    nla_get_u32(attrs[FS_EVENT_ATR_DEV_MINOR]));
	LOG("Proc: %u\n", proc);
	LOG("Threshold data: %llu\n", thresh_data);
	return 0;
}
/*
 * Handler for FS_EVENT_TYPE_WARN messages: logs the trace id, the backing
 * device major/minor numbers (only when present), the id of the process
 * that caused the event and the warning identifier. Always returns 0.
 */
static int parse_warning(struct nl_cache_ops *unused, struct genl_cmd *cmd,
			 struct genl_info *info, void *arg)
{
	struct nlattr **attrs = info->attrs;
	struct nlattr *id_attr = attrs[FS_EVENT_ATR_FS_ID];
	struct nlattr *proc_attr = attrs[FS_EVENT_ATR_CAUSED_ID];
	struct nlattr *warn_attr = attrs[FS_EVENT_ATR_ID];
	/* Convert explicitly so the value handed to "%d" really is an int. */
	int trace_id = id_attr ? (int)nla_get_u32(id_attr) : -1;
	/* Missing ids are still printed as the all-ones value. */
	unsigned int proc = proc_attr ? nla_get_u32(proc_attr)
				      : (unsigned int)-1;
	unsigned int warning = warn_attr ? nla_get_u32(warn_attr)
					 : (unsigned int)-1;

	LOG("Warning received for trace %d\n", trace_id);
	if (attrs[FS_EVENT_ATR_DEV_MAJOR])
		LOG("Backing dev major: %u\n",
		    nla_get_u32(attrs[FS_EVENT_ATR_DEV_MAJOR]));
	if (attrs[FS_EVENT_ATR_DEV_MINOR])
		LOG("Backing dev minor: %u\n",
		    nla_get_u32(attrs[FS_EVENT_ATR_DEV_MINOR]));
	LOG("Proc: %u\n", proc);
	LOG("Warning: %u\n", warning);
	return 0;
}
/*
 * Command table of the FS_EVENT family: pairs each handled message type
 * (c_id) with the parser called for it.
 * NOTE(review): the c_maxattr values are bare numbers - confirm that each one
 * covers the highest FS_EVENT_ATR_* index its parser reads (the attribute
 * enum lives in fs_event.h, which is not visible here).
 */
static struct genl_cmd cmd[] = {
{
.c_id = FS_EVENT_TYPE_NEW_TRACE,
.c_name = "info",
.c_maxattr = 2,
.c_msg_parser = parse_info,
}, {
.c_id = FS_EVENT_TYPE_THRESH,
.c_name = "thres",
.c_maxattr = 6,
.c_msg_parser = parse_thres,
}, {
.c_id = FS_EVENT_TYPE_WARN,
.c_name = "warn",
.c_maxattr = 5,
.c_msg_parser = parse_warning,
},
};
/*
 * Description of the FS_EVENT generic netlink family; main() passes it to
 * genl_register_family() and genl_ops_resolve(). It references the cmd[]
 * table and declares no user specific header (o_hdrsize is 0).
 */
static struct genl_ops ops = {
.o_id = GENL_ID_FS_EVENT,
.o_name = "FS_EVENT",
.o_hdrsize = 0,
.o_cmds = cmd,
.o_ncmds = ARRAY_SIZE(cmd),
};
/*
 * Message callback installed by main() for NL_CB_VALID: forwards the message
 * to genl_handle_msg() and returns whatever that call returns.
 */
int events_cb(struct nl_msg *msg, void *arg)
{
	int rc = genl_handle_msg(msg, arg);

	return rc;
}
/*
 * Subscribe to the FS_EVENT multicast group and log every received event
 * to stderr. The receive loop never terminates on its own; the clean-up
 * path is only reached when one of the set-up steps fails.
 *
 * Returns 1 if a set-up step failed.
 */
int main(int argc, char **argv)
{
	struct nl_sock *sock;
	int ret;

	(void)argc;
	(void)argv;

	sock = nl_cli_alloc_socket();
	nl_socket_set_local_port(sock, 0);
	nl_socket_disable_seq_check(sock);
	nl_socket_modify_cb(sock, NL_CB_VALID, NL_CB_CUSTOM, events_cb, NULL);
	nl_cli_connect(sock, NETLINK_GENERIC);

	ret = nl_socket_add_membership(sock, GENL_ID_FS_EVENT);
	if (ret) {
		LOG("Failed to add membership\n");
		goto leave;
	}

	ret = genl_register_family(&ops);
	if (ret) {
		LOG("Failed to register protocol family\n");
		goto leave;
	}

	/*
	 * Keep the call and the comparison apart so that ret holds the
	 * library's return code rather than the 0/1 result of "< 0".
	 */
	ret = genl_ops_resolve(sock, &ops);
	if (ret < 0) {
		LOG("Unable to resolve the family name\n");
		goto leave;
	}

	ret = genl_ctrl_resolve(sock, "FS_EVENT");
	if (ret < 0) {
		LOG("Failed to resolve the family name\n");
		goto leave;
	}

	while (1) {
		ret = nl_recvmsgs_default(sock);
		if (ret < 0)
			LOG("Unable to receive message: %s\n",
			    nl_geterror(ret));
	}

leave:
	nl_close(sock);
	nl_socket_free(sock);
	/* Report set-up failures to the caller through the exit status. */
	return ret ? 1 : 0;
}
--
1.7.9.5
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
Add ext4_mark_group_corrupted helper function to
simplify the code and to keep the logic in one place.
Signed-off-by: Beata Michalska <[email protected]>
---
fs/ext4/balloc.c | 15 +++------------
fs/ext4/ext4.h | 9 +++++++++
fs/ext4/ialloc.c | 5 +----
fs/ext4/mballoc.c | 11 ++---------
4 files changed, 15 insertions(+), 25 deletions(-)
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 83a6f49..e95b27a 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -193,10 +193,7 @@ static int ext4_init_block_bitmap(struct super_block *sb,
* essentially implementing a per-group read-only flag. */
if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
grp = ext4_get_group_info(sb, block_group);
- if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
- percpu_counter_sub(&sbi->s_freeclusters_counter,
- grp->bb_free);
- set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+ ext4_mark_group_corrupted(sbi, grp);
if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
int count;
count = ext4_free_inodes_count(sb, gdp);
@@ -379,20 +376,14 @@ static void ext4_validate_block_bitmap(struct super_block *sb,
ext4_unlock_group(sb, block_group);
ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
block_group, blk);
- if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
- percpu_counter_sub(&sbi->s_freeclusters_counter,
- grp->bb_free);
- set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+ ext4_mark_group_corrupted(sbi, grp);
return;
}
if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
desc, bh))) {
ext4_unlock_group(sb, block_group);
ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
- if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
- percpu_counter_sub(&sbi->s_freeclusters_counter,
- grp->bb_free);
- set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+ ext4_mark_group_corrupted(sbi, grp);
return;
}
set_buffer_verified(bh);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f63c3d5..163afe2 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2535,6 +2535,15 @@ static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
}
+/*
+ * Flag the block bitmap of @grp as corrupted and, the first time the flag
+ * is raised, take the group's free clusters (bb_free) out of the
+ * s_freeclusters_counter of @sbi.
+ *
+ * test_and_set_bit() folds the check and the update into a single atomic
+ * step, so two callers racing on the same group cannot both subtract
+ * bb_free from the counter.
+ * NOTE(review): this relies on EXT4_MB_GRP_BBITMAP_CORRUPT() testing
+ * EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT in bb_state - confirm in ext4.h.
+ */
+static inline
+void ext4_mark_group_corrupted(struct ext4_sb_info *sbi,
+			       struct ext4_group_info *grp)
+{
+	if (!test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
+			      &grp->bb_state))
+		percpu_counter_sub(&sbi->s_freeclusters_counter,
+				   grp->bb_free);
+}
+
/*
* Returns true if the filesystem is busy enough that attempts to
* access the block group locks has run into contention.
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index ac644c3..ebe0499 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -79,10 +79,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
ext4_error(sb, "Checksum bad for group %u", block_group);
grp = ext4_get_group_info(sb, block_group);
- if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
- percpu_counter_sub(&sbi->s_freeclusters_counter,
- grp->bb_free);
- set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+ ext4_mark_group_corrupted(sbi, grp);
if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
int count;
count = ext4_free_inodes_count(sb, gdp);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 8d1e602..24a4b6d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -760,10 +760,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
* corrupt and update bb_free using bitmap value
*/
grp->bb_free = free;
- if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
- percpu_counter_sub(&sbi->s_freeclusters_counter,
- grp->bb_free);
- set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+ ext4_mark_group_corrupted(sbi, grp);
}
mb_set_largest_free_order(sb, grp);
@@ -1448,12 +1445,8 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
"freeing already freed block "
"(bit %u); block bitmap corrupt.",
block);
- if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))
- percpu_counter_sub(&sbi->s_freeclusters_counter,
- e4b->bd_info->bb_free);
/* Mark the block group as corrupt. */
- set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
- &e4b->bd_info->bb_state);
+ ext4_mark_group_corrupted(sbi, e4b->bd_info);
mb_regenerate_buddy(e4b);
goto done;
}
--
1.7.9.5
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
Introduce configurable generic interface for file
system-wide event notifications to provide file
systems with a common way of reporting any potential
issues as they emerge.
The notifications are to be issued through generic
netlink interface, by a dedicated, for file system
events, multicast group. The file systems might as
well use this group to send their own custom messages.
The events have been split into four base categories:
information, warnings, errors and threshold notifications,
with some very basic event types like running out of space
or file system being remounted as read-only.
Threshold notifications have been included to allow
triggering an event whenever the amount of free space
drops below a certain level - or levels to be more precise
as two of them are being supported: the lower and the upper
range. The notifications work both ways: once the threshold
level has been reached, an event shall be generated whenever
the number of available blocks goes up again re-activating
the threshold.
The interface has been exposed through a vfs. Once mounted,
it serves as an entry point for the set-up where one can
register for particular file system events.
Signed-off-by: Beata Michalska <[email protected]>
---
Documentation/filesystems/events.txt | 254 +++++++++++
fs/Makefile | 1 +
fs/events/Makefile | 6 +
fs/events/fs_event.c | 775 ++++++++++++++++++++++++++++++++++
fs/events/fs_event.h | 27 ++
fs/events/fs_event_netlink.c | 94 +++++
fs/namespace.c | 1 +
include/linux/fs.h | 6 +-
include/linux/fs_event.h | 69 +++
include/uapi/linux/fs_event.h | 62 +++
include/uapi/linux/genetlink.h | 1 +
net/netlink/genetlink.c | 7 +-
12 files changed, 1301 insertions(+), 2 deletions(-)
create mode 100644 Documentation/filesystems/events.txt
create mode 100644 fs/events/Makefile
create mode 100644 fs/events/fs_event.c
create mode 100644 fs/events/fs_event.h
create mode 100644 fs/events/fs_event_netlink.c
create mode 100644 include/linux/fs_event.h
create mode 100644 include/uapi/linux/fs_event.h
diff --git a/Documentation/filesystems/events.txt b/Documentation/filesystems/events.txt
new file mode 100644
index 0000000..c85dd88
--- /dev/null
+++ b/Documentation/filesystems/events.txt
@@ -0,0 +1,254 @@
+
+ Generic file system event notification interface
+
+Document created 09 April 2015 by Beata Michalska <[email protected]>
+
+1. The reason behind:
+=====================
+
+There are many corner cases when things might get messy with the filesystems.
+And it is not always obvious what went wrong and when. Sometimes you might
+get some subtle hints that there is something going on - but by the time
+you realise it, it might be too late as you are already out-of-space
+or, for example, the filesystem has been remounted as read-only. The generic
+interface for the filesystem events fills the gap by providing a rather
+easy way of real-time notifications triggered whenever something interesting
+happens, allowing filesystems to report events in a common way, as they occur.
+
+2. How does it work:
+====================
+
+The interface itself has been exposed as fstrace-type Virtual File System,
+primarily to ease the process of setting up the configuration for the file
+system notifications. So for starters it needs to get mounted (obviously):
+
+ mount -t fstrace none /sys/fs/events
+
+This will unveil the single fstrace filesystem entry - the 'config' file,
+through which the notifications are being set up.
+
+Activating notifications for particular filesystem is as straightforward
+as writing into the 'config' file. Note that by default all events despite
+the actual filesystem type are being disregarded.
+
+Synopsis of config:
+------------------
+
+ MOUNT EVENT_TYPE [L1] [L2]
+
+ MOUNT : the filesystem's mount point
+ EVENT_TYPE : type of events to be enabled: info,warn,err,thr;
+ at least one type needs to be specified;
+ note the comma delimiter and lack of spaces between
+ those options
+ L1 : the threshold limit - lower range
+ L2 : the threshold limit - upper range
+ case enabling threshold notifications the lower level is
+ mandatory, whereas the upper one remains optional;
+ note though, that as those refer to the number of available
+ blocks, the lower level needs to be higher than the upper one
+
+A sample request could look like the following:
+
+ echo /sample/mount/point warn,err,thr 710000 500000 > /sys/fs/events/config
+
+Multiple requests might be specified provided they are separated with a semicolon.
+
+The configuration itself might be modified at any time. One can add/remove
+particular event types for a given filesystem, modify the threshold levels,
+and remove single or all entries from the 'config' file.
+
+ - Adding new event type:
+
+ $ echo MOUNT EVENT_TYPE > /sys/fs/events/config
+
+(Note that it is enough to provide the event type to be enabled without
+the already set ones.)
+
+ - Removing event type:
+
+ $ echo '!MOUNT EVENT_TYPE' > /sys/fs/events/config
+
+ - Updating threshold limits:
+
+ $ echo MOUNT thr L1 L2 > /sys/fs/events/config
+
+ - Removing single entry:
+
+ $ echo '!MOUNT' > /sys/fs/events/config
+
+ - Removing all entries:
+
+ $ echo > /sys/fs/events/config
+
+Reading the file will list all registered entries with their current set-up
+along with some additional info like the id of the entry (@see more on generic
+netlink section), the filesystem type and the backing device name if available.
+
+Final, though a very important note on the configuration: when and if the
+actual events are being triggered falls way beyond the scope of the generic
+filesystem events interface. It is up to a particular filesystem
+implementation which events are to be supported - if any at all. So if
+given filesystem does not support the event notifications, an attempt to
+enable those through 'config' file will fail.
+
+
+3. The generic netlink interface support:
+=========================================
+
+Whenever an event notification is triggered (by given filesystem) the current
+configuration is being validated to decide whether a userspace notification
+should be launched. If there has been no request (by means of a 'config' file
+entry) for given event, it will be silently disregarded. If, on the other
+hand, someone is 'watching' given filesystem for specific events, a generic
+netlink message will be sent.
+
+A dedicated multicast group has been provided solely for the purpose of
+notifying any potential listeners of file system events. So in order to
+receive such notifications, one should subscribe to this new multicast group.
+
+Each message type reflects the actual type of generated event (FS_EVENT_TYPE*)
+Currently there are two supported message formats.
+
+There is a common message format representing an event generated by
+a filesystem. The type of the event itself will be stored within
+the generic netlink message header as the command field. The message
+payload will provide more detailed info: the identifier of the filesystem
+trace (generated upon registering the trace), the backing device major and
+minor numbers, the event identifier and the id of the process whose action
+led to the event occurrence. In case of threshold notifications, the current
+number of available blocks will be included in the payload.
+
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | NETLINK MESSAGE HEADER |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | GENERIC NETLINK MESSAGE HEADER |
+ | (with event type as genlmsghdr cmd field) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Optional user specific message header |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | GENERIC MESSAGE PAYLOAD: |
+ +---------------------------------------------------------------+
+ | FS_EVENT_ATR_FS_ID (NLA_U32) |
+ +---------------------------------------------------------------+
+ | FS_EVENT_ATR_DEV_MAJOR (NLA_U32) (if available) |
+ +---------------------------------------------------------------+
+ | FS_EVENT_ATR_DEV_MINOR (NLA_U32) (if available) |
+ +---------------------------------------------------------------+
+ | FS_EVENT_ATR_ID (NLA_U32) |
+ +---------------------------------------------------------------+
+ | FS_EVENT_ATR_CAUSED_ID (NLA_U32) |
+ +---------------------------------------------------------------+
+ | FS_EVENT_ATR_DATA (NLA_U64) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+
+The second supported message format represents an event of a new trace being
+registered. It contains two attributes within the payload: the trace id and the
+mount point for which the trace has been registered. This message is of type
+FS_EVENT_TYPE_NEW_TRACE and is being sent regardless of the actual event types
+being watched whenever a new entry for the 'config' file is being created. This
+is supposed to ease parsing the messages by userspace applications and to help
+to identify the origin of the event. It also reduces the size of the payload
+as there is no need to send additional data such as mount point and the file
+system type for each possible event.
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | NETLINK MESSAGE HEADER |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | GENERIC NETLINK MESSAGE HEADER |
+ | (with event type as genlmsghdr cmd field) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Optional user specific message header |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | GENERIC MESSAGE PAYLOAD: |
+ + ------------------------------------------------------------- +
+ | FS_EVENT_ATR_FS_ID (NLA_U32) |
+ + ------------------------------------------------------------- +
+ | FS_EVENT_ATR_MOUNT (NLA_STRING) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+The above figures are based on:
+ http://www.linuxfoundation.org/collaborate/workgroups/networking/generic_netlink_howto#Message_Format
+
+
+
+4. API Reference:
+=================
+
+ 4.1 Generic file system event interface operations
+
+ #include <linux/fs_event.h>
+
+ struct fs_trace_operations {
+ int (*fs_trace_query)(struct super_block *, struct fs_trace_sdata *);
+ };
+
+ Each filesystem supporting the event notifications should register its
+ file system trace operations. This can be done through a new entry in
+ the super_block structure: the s_trace_ops. The fs_trace_query shall
+ be called whenever a new trace entry for given filesystem is being created
+ or when threshold notifications are being requested for the first time.
+ The filesystem should then specify which event types are being supported.
+ In case of threshold notifications the current number of available blocks
+ should be provided.
+
+ 4.2 Event notification:
+
+ #include <linux/fs_event.h>
+ void fs_event_notify(struct super_block *sb, unsigned int event_type,
+ unsigned int event_id);
+
+ Notify the generic FS event interface of an occurring event.
+ This shall be used by any file system that wishes to inform any potential
+ listeners/watchers of a particular event.
+ - sb: the filesystem's super block
+ - event_type: the type of an event (one of the FS_EVENT_*)
+ - event_id: an event identifier
+
+ 4.3 Threshold notifications:
+
+ #include <linux/fs_event.h>
+ void fs_event_alloc_space(struct super_block *sb, u64 ncount);
+ void fs_event_free_space(struct super_block *sb, u64 ncount);
+
+ Each filesystem supporting the threshold notifications should call
+ fs_event_alloc_space/fs_event_free_space respectively whenever the
+ amount of available blocks changes.
+ - sb: the filesystem's super block
+ - ncount: number of blocks being acquired/released
+
+ Note that to properly handle the threshold notifications the fs events
+ interface needs to be kept up to date by the filesystems. Each should
+ register fs_trace_operations to enable querying the basic trace data,
+ among which is the current number of the available blocks (fs_trace_query).
+
+ 4.4 Sending message through generic netlink interface
+
+ #include <linux/fs_event.h>
+ int fs_netlink_send_event(size_t size, unsigned int event_type,
+ int (*compose_msg)(struct sk_buff *skb, unsigned int event_id,
+ void *data),
+ unsigned int event_id, void *data);
+
+ Although the fs event interface is fully responsible for sending the messages
+ over the netlink, filesystems might use the FS_EVENT multicast group to send
+ their own custom messages.
+ - size: the size of the message payload
+ - event_type: the type of an event: stored as message header's command
+ - compose_msg: a custom callback handling composing the message payload
+ - event_id: the event identifier
+ - data: message custom data
+
+ Calling fs_netlink_send_event will result in a message being sent through
+ the FS_EVENT multicast group. Note that the body of the message should be
+ prepared (set up) by the caller - through the compose_msg callback. The message's
+ sk_buff will be allocated on behalf of the caller (thus the size parameter).
+ The compose_msg should only fill the payload with proper data.
+
+
diff --git a/fs/Makefile b/fs/Makefile
index a88ac48..798021d 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -126,3 +126,4 @@ obj-y += exofs/ # Multiple modules
obj-$(CONFIG_CEPH_FS) += ceph/
obj-$(CONFIG_PSTORE) += pstore/
obj-$(CONFIG_EFIVAR_FS) += efivarfs/
+obj-y += events/
diff --git a/fs/events/Makefile b/fs/events/Makefile
new file mode 100644
index 0000000..58d1454
--- /dev/null
+++ b/fs/events/Makefile
@@ -0,0 +1,6 @@
+#
+# Makefile for the Linux Generic File System Event Interface
+#
+
+obj-y := fs_event.o
+obj-$(CONFIG_NET) += fs_event_netlink.o
diff --git a/fs/events/fs_event.c b/fs/events/fs_event.c
new file mode 100644
index 0000000..8ebe371
--- /dev/null
+++ b/fs/events/fs_event.c
@@ -0,0 +1,775 @@
+/*
+ * Generic File System Events Interface
+ *
+ * Copyright(c) 2015 Samsung Electronics. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <linux/fs.h>
+#include <linux/hashtable.h>
+#include <linux/idr.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/parser.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <net/genetlink.h>
+#include "../mount.h"
+#include "fs_event.h"
+
+#define FS_HASHTB_BITS 8
+#define FS_HASHTB_SIZE (1 << FS_HASHTB_BITS)
+
+/**
+ * The FS event trace entries are being stored in a hashtable
+ * for fast entry look-up, and in a doubly-linked list
+ * to ease all the paths that need to go through all
+ * the entries.
+ */
+static DEFINE_HASHTABLE(fs_trace_hashtbl, FS_HASHTB_BITS);
+static LIST_HEAD(fs_trace_list);
+static DEFINE_SPINLOCK(fs_trace_lock);
+
+static struct kmem_cache *fs_trace_cachep __read_mostly;
+
+/*
+ * Each registered FS event trace is being marked with
+ * a unique identifier managed by IDR
+ */
+static struct idr fs_trace_idr;
+static DEFINE_SPINLOCK(fs_trace_idr_lock);
+
+/*
+ * Threshold notification state bits.
+ * Note the reverse as this refers to the number
+ * of available blocks.
+ */
+#define THRESH_LR_BELOW 0x0001 /* Falling below the lower range */
+#define THRESH_LR_BEYOND 0x0002
+#define THRESH_UR_BELOW 0x0004
+#define THRESH_UR_BEYOND 0x0008 /* Going beyond the upper range */
+
+#define THRESH_LR_ON (THRESH_LR_BELOW | THRESH_LR_BEYOND)
+#define THRESH_UR_ON (THRESH_UR_BELOW | THRESH_UR_BEYOND)
+
+#define FS_TRACE_ADD 0x100000
+
+/*
+ * A single FS event trace entry.
+ *  node        - link in fs_trace_list
+ *  hnode       - link in fs_trace_hashtbl
+ *  path        - path the trace refers to; fs_trace_sb() derives the super
+ *                block from its mount
+ *  data        - trace data, including the available_blks count used by the
+ *                threshold handling
+ *  mark        - id of the entry in fs_trace_idr, sent to listeners as
+ *                FS_EVENT_ATR_FS_ID
+ *  notify_mask - mask of event types tested before a notification is sent
+ *  thresh      - threshold limits (lrange, urange) and the THRESH_* state bits
+ *  lock        - per-entry lock held while the entry is examined or updated
+ */
+struct fs_trace_entry {
+ struct list_head node;
+ struct hlist_node hnode;
+ struct path path;
+ struct fs_trace_sdata data;
+ int mark;
+ unsigned int notify_mask;
+ struct fs_event_thresh {
+ u64 lrange;
+ u64 urange;
+ unsigned int state;
+ } thresh;
+ spinlock_t lock;
+};
+
+/*
+ * Event type tokens, each paired with the FS_EVENT_* value it stands for.
+ * The { 0, NULL } entry terminates the table.
+ * NOTE(review): presumably used to parse the EVENT_TYPE field written to the
+ * 'config' file - the parsing code is not visible here.
+ */
+static const match_table_t fs_etypes = {
+ { FS_EVENT_INFO, "info" },
+ { FS_EVENT_WARN, "warn" },
+ { FS_EVENT_THRESH, "thr" },
+ { FS_EVENT_ERR, "err" },
+ { 0, NULL },
+};
+
+/* Super block of the mount a trace entry refers to. */
+#define fs_trace_sb(en) ((en)->path.mnt->mnt_sb)
+
+/*
+ * Call the filesystem's fs_trace_query operation for @sb, or evaluate to
+ * -EINVAL when the filesystem provides no s_trace_ops or no fs_trace_query.
+ * Note that @sb is evaluated more than once.
+ */
+#define fs_trace_query_data(sb, arg) \
+ (((sb)->s_trace_ops && (sb)->s_trace_ops->fs_trace_query) ? \
+ (sb)->s_trace_ops->fs_trace_query((sb), arg) : -EINVAL)
+
+/*
+ * Convert an event type value into the value passed to
+ * fs_netlink_send_event() as the message's event type, using ffs().
+ */
+#define fs_event_type_cast(event_type) (ffs(event_type))
+
+/*
+ * Derive the hash key of a super block from its address: the low
+ * L1_CACHE_SHIFT bits are dropped and the result is masked down to
+ * the FS_HASHTB_SIZE range.
+ */
+static inline unsigned int fs_trace_hasfn(const struct super_block *sb)
+{
+	unsigned long addr = (unsigned long)sb;
+
+	addr >>= L1_CACHE_SHIFT;
+	return addr & (FS_HASHTB_SIZE - 1);
+}
+
+/*
+ * Look up the trace entry registered for @sb.
+ *
+ * Returns the entry, or ERR_PTR(-EINVAL) when no entry matches.
+ * The callers visible in this file take fs_trace_lock around the call.
+ */
+static struct fs_trace_entry *fs_find_trace_entry(struct super_block *sb)
+{
+	struct fs_trace_entry *cur;
+	unsigned long key;
+
+	/* Nothing registered at all - no need to walk the hash table. */
+	if (list_empty(&fs_trace_list))
+		return ERR_PTR(-EINVAL);
+
+	key = fs_trace_hasfn(sb);
+	hash_for_each_possible(fs_trace_hashtbl, cur, hnode, key) {
+		if (fs_trace_sb(cur) != sb)
+			continue;
+		return cur;
+	}
+
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Unlink @en from both the trace list and the hash table while holding
+ * the entry's own lock.
+ */
+static inline void fs_trace_entry_list_del(struct fs_trace_entry *en)
+{
+ spin_lock(&en->lock);
+ list_del(&en->node);
+ hash_del(&en->hnode);
+ spin_unlock(&en->lock);
+}
+
+/* Remove the id of @en (en->mark) from fs_trace_idr under fs_trace_idr_lock. */
+static inline void fs_trace_entry_idr_remove(struct fs_trace_entry *en)
+{
+ spin_lock(&fs_trace_idr_lock);
+ idr_remove(&fs_trace_idr, en->mark);
+ spin_unlock(&fs_trace_idr_lock);
+}
+
+/* Hand @en back to the fs_trace_cachep slab cache. */
+static inline void fs_trace_entry_free(struct fs_trace_entry *en)
+{
+ kmem_cache_free(fs_trace_cachep, en);
+}
+
+/*
+ * Tear down @en: unlink it from the list and the hash table, remove its
+ * id from the IDR and finally free it. @en must not be used afterwards.
+ */
+static inline void fs_destroy_trace_entry(struct fs_trace_entry *en)
+{
+ fs_trace_entry_list_del(en);
+ fs_trace_entry_idr_remove(en);
+ fs_trace_entry_free(en);
+}
+
+/*
+ * Destroy the trace entry registered for @sb, if there is one.
+ *
+ * Returns 0 when an entry was found and destroyed, -EINVAL otherwise.
+ * The look-up and the removal both happen under fs_trace_lock.
+ */
+static int fs_remove_trace_entry(struct super_block *sb)
+{
+	struct fs_trace_entry *found;
+	int err;
+
+	spin_lock(&fs_trace_lock);
+	found = fs_find_trace_entry(sb);
+	if (IS_ERR(found)) {
+		err = -EINVAL;
+	} else {
+		fs_destroy_trace_entry(found);
+		err = 0;
+	}
+	spin_unlock(&fs_trace_lock);
+
+	return err;
+}
+
+/*
+ * Destroy every registered trace entry. The walk uses the _safe list
+ * iterator because each visited entry is unlinked and freed on the way.
+ */
+static void fs_remove_all_traces(void)
+{
+	struct fs_trace_entry *cur;
+	struct fs_trace_entry *next;
+
+	spin_lock(&fs_trace_lock);
+	list_for_each_entry_safe(cur, next, &fs_trace_list, node) {
+		fs_destroy_trace_entry(cur);
+	}
+	spin_unlock(&fs_trace_lock);
+}
+
+/*
+ * Message composer for FS_EVENT_TYPE_NEW_TRACE: fills @skb with the trace
+ * id (FS_EVENT_ATR_FS_ID) and the mount point (FS_EVENT_ATR_MOUNT) of the
+ * trace entry passed in @data. @event_id is not used.
+ *
+ * Returns 0 on success, -ENOMEM when the path buffer cannot be allocated,
+ * or the error returned by nla_put_u32()/nla_put_string().
+ *
+ * NOTE(review): the buffer is allocated with GFP_KERNEL - confirm that this
+ * callback is never invoked with a spinlock held.
+ */
+static int fs_event_new_trace_create_msg(struct sk_buff *skb,
+		unsigned int event_id, void *data)
+{
+	struct fs_trace_entry *en = data;
+	char *path, *mount_dir;
+	int ret;
+
+	path = kzalloc(PATH_MAX, GFP_KERNEL);
+	if (!path)
+		return -ENOMEM;
+
+	mount_dir = d_path(&en->path, path, PATH_MAX - 1);
+	/* Still announce the trace when the path cannot be resolved. */
+	if (IS_ERR(mount_dir))
+		mount_dir = "unknown";
+
+	ret = nla_put_u32(skb, FS_EVENT_ATR_FS_ID, en->mark);
+	if (!ret)
+		ret = nla_put_string(skb, FS_EVENT_ATR_MOUNT, mount_dir);
+
+	/* mount_dir may point into path, so free only after the last use. */
+	kfree(path);
+	return ret;
+}
+
+/*
+ * Message composer for the common event format: fills @skb with the trace
+ * id, the backing device numbers (when there is a backing device), the
+ * event id and the id of the current process. For threshold events the
+ * number of available blocks is appended as FS_EVENT_ATR_DATA.
+ *
+ * @data is the trace entry the event is sent for.
+ *
+ * Returns 0 on success, -EINVAL when one of the u32 attributes cannot be
+ * added, or the nla_put_u64() result for the data attribute.
+ */
+static int fs_event_common_create_msg(struct sk_buff *skb,
+		unsigned int event_id, void *data)
+{
+	struct fs_trace_entry *en = data;
+	struct super_block *sb = fs_trace_sb(en);
+
+	if (nla_put_u32(skb, FS_EVENT_ATR_FS_ID, en->mark))
+		return -EINVAL;
+
+	/* Without a backing device there are no device numbers to report. */
+	if (sb->s_bdev && MAJOR(sb->s_dev)) {
+		if (nla_put_u32(skb, FS_EVENT_ATR_DEV_MAJOR,
+				MAJOR(sb->s_dev)) ||
+		    nla_put_u32(skb, FS_EVENT_ATR_DEV_MINOR,
+				MINOR(sb->s_dev)))
+			return -EINVAL;
+	}
+
+	if (nla_put_u32(skb, FS_EVENT_ATR_ID, event_id))
+		return -EINVAL;
+
+	/*
+	 * FS_EVENT_ATR_CAUSED_ID is described as NLA_U32 in
+	 * Documentation/filesystems/events.txt, so it is sent as a u32.
+	 */
+	if (nla_put_u32(skb, FS_EVENT_ATR_CAUSED_ID,
+			pid_nr(task_pid(current))))
+		return -EINVAL;
+
+	/*
+	 * NOTE(review): this bit test assumes that no info/warn/error event
+	 * id shares bits with the FS_THRESH_* ids - confirm in fs_event.h.
+	 */
+	if (event_id & (FS_THRESH_LR_REACHED | FS_THRESH_UR_REACHED))
+		return nla_put_u64(skb, FS_EVENT_ATR_DATA,
+				   en->data.available_blks);
+
+	return 0;
+}
+
+/*
+ * Send an FS_EVENT_TYPE_NEW_TRACE message for @en; the payload is composed
+ * by fs_event_new_trace_create_msg(). The result of the send is not checked.
+ */
+static void fs_event_new_trace(struct fs_trace_entry *en)
+{
+ fs_netlink_send_event(GENLMSG_DEFAULT_SIZE, FS_EVENT_TYPE_NEW_TRACE,
+ fs_event_new_trace_create_msg, 0, en);
+}
+
+/*
+ * Send an event of @event_type with id @event_id for the trace entry @en.
+ * The payload is composed by fs_event_common_create_msg(); the requested
+ * message size reserves room for four u32 and two u64 attributes.
+ * @event_type is converted with fs_event_type_cast() before being passed on.
+ */
+static void fs_event_send(struct fs_trace_entry *en,
+		unsigned int event_type, unsigned int event_id)
+{
+	const size_t u32_room = nla_total_size(sizeof(u32)) * 4;
+	const size_t u64_room = nla_total_size(sizeof(u64)) * 2;
+
+	fs_netlink_send_event(u32_room + u64_room,
+			fs_event_type_cast(event_type),
+			fs_event_common_create_msg, event_id, en);
+}
+
+/*
+ * Notify listeners of an event that occurred on the filesystem @sb.
+ *  sb         - the filesystem's super block
+ *  event_type - the type of the event, checked against the entry's notify_mask
+ *  event_id   - the event identifier
+ *
+ * Nothing is sent when no trace entry is registered for @sb or when the
+ * entry's notify_mask does not include @event_type.
+ */
+void fs_event_notify(struct super_block *sb, unsigned int event_type,
+ unsigned int event_id)
+{
+ struct fs_trace_entry *en;
+
+ spin_lock(&fs_trace_lock);
+ en = fs_find_trace_entry(sb);
+ if (IS_ERR(en)) {
+ spin_unlock(&fs_trace_lock);
+ return;
+ }
+
+ spin_lock(&en->lock);
+ /* Release the main lock - it's enough to keep the entry lock here */
+ spin_unlock(&fs_trace_lock);
+ if (en->notify_mask & event_type)
+ fs_event_send(en, event_type, event_id);
+ spin_unlock(&en->lock);
+}
+EXPORT_SYMBOL(fs_event_notify);
+
+/*
+ * Account for @ncount blocks being allocated on @sb and send a threshold
+ * notification when the number of available blocks crosses a limit.
+ *
+ * Nothing happens when no trace entry is registered for @sb or when the
+ * entry has not enabled FS_EVENT_THRESH. Otherwise available_blks is
+ * decreased (clamped at 0) and compared against the limits:
+ *  - above lrange: no notification
+ *  - at or below lrange but above urange: FS_THRESH_LR_REACHED is sent
+ *    unless THRESH_LR_BEYOND is already set
+ *  - at or below urange: FS_THRESH_UR_REACHED is sent unless
+ *    THRESH_UR_BEYOND is already set
+ * The *_BEYOND state bits keep the notification from being repeated.
+ *
+ * NOTE(review): the subtraction is stored in an s64; the type of
+ * available_blks is not visible here - confirm the clamp behaves as intended
+ * for very large values.
+ */
+void fs_event_alloc_space(struct super_block *sb, u64 ncount)
+{
+ struct fs_trace_entry *en;
+ s64 count;
+
+ spin_lock(&fs_trace_lock);
+ en = fs_find_trace_entry(sb);
+ if (IS_ERR(en)) {
+ spin_unlock(&fs_trace_lock);
+ return;
+ }
+
+ /* Take the entry lock before dropping the main one */
+ spin_lock(&en->lock);
+ spin_unlock(&fs_trace_lock);
+
+ if (!(en->notify_mask & FS_EVENT_THRESH))
+ goto leave;
+ /* We shouldn't drop below 0 here, unless there is a sync issue
+ somewhere (?) */
+ count = en->data.available_blks - ncount;
+ en->data.available_blks = count < 0 ? 0 : count;
+
+ if (en->data.available_blks > en->thresh.lrange)
+ /* Not 'even' close - leave */
+ goto leave;
+
+ if (en->data.available_blks > en->thresh.urange) {
+ /* Close enough - the lower range has been reached */
+ if (!(en->thresh.state & THRESH_LR_BEYOND)) {
+ /* Send notification */
+ fs_event_send(en, FS_EVENT_THRESH,
+ FS_THRESH_LR_REACHED);
+ en->thresh.state &= ~THRESH_LR_BELOW;
+ en->thresh.state |= THRESH_LR_BEYOND;
+ }
+ goto leave;
+ }
+ /* At or below the upper range limit */
+ if (!(en->thresh.state & THRESH_UR_BEYOND)) {
+ fs_event_send(en, FS_EVENT_THRESH, FS_THRESH_UR_REACHED);
+ en->thresh.state &= ~THRESH_UR_BELOW;
+ en->thresh.state |= THRESH_UR_BEYOND;
+ }
+
+leave:
+ spin_unlock(&en->lock);
+}
+EXPORT_SYMBOL(fs_event_alloc_space);
+
+/*
+ * Account for @ncount blocks being returned to the file system behind @sb and
+ * emit threshold notifications when a range boundary is crossed upwards.
+ *
+ * At most one notification is sent per call: the lower range is checked
+ * first, and the upper range is only considered when no lower-range
+ * notification has been sent.
+ */
+void fs_event_free_space(struct super_block *sb, u64 ncount)
+{
+	struct fs_trace_entry *en;
+
+	spin_lock(&fs_trace_lock);
+	en = fs_find_trace_entry(sb);
+	if (IS_ERR(en)) {
+		spin_unlock(&fs_trace_lock);
+		return;
+	}
+
+	/* Switch from the global lock to the per-entry lock */
+	spin_lock(&en->lock);
+	spin_unlock(&fs_trace_lock);
+
+	if (!(en->notify_mask & FS_EVENT_THRESH))
+		goto unlock;
+
+	en->data.available_blks += ncount;
+
+	if (en->data.available_blks > en->thresh.lrange &&
+	    !(en->thresh.state & THRESH_LR_BELOW) &&
+	    (en->thresh.state & THRESH_LR_BEYOND)) {
+		/* Back above the lower range */
+		fs_event_send(en, FS_EVENT_THRESH, FS_THRESH_LR_REACHED);
+		en->thresh.state &= ~THRESH_LR_BEYOND;
+		en->thresh.state |= THRESH_LR_BELOW;
+	} else if (en->data.available_blks > en->thresh.urange &&
+		   !(en->thresh.state & THRESH_UR_BELOW) &&
+		   (en->thresh.state & THRESH_UR_BEYOND)) {
+		/* Back above the upper range */
+		fs_event_send(en, FS_EVENT_THRESH, FS_THRESH_UR_REACHED);
+		en->thresh.state &= ~THRESH_UR_BEYOND;
+		en->thresh.state |= THRESH_UR_BELOW;
+	}
+
+unlock:
+	spin_unlock(&en->lock);
+}
+EXPORT_SYMBOL(fs_event_free_space);
+
+/*
+ * Called when @mnt is being cleaned up: if @mnt is the mount a trace entry
+ * was registered for, send FS_INFO_UMOUNT (when INFO events are enabled)
+ * and destroy the entry.
+ */
+void fs_event_mount_dropped(struct vfsmount *mnt)
+{
+	struct fs_trace_entry *en;
+
+	spin_lock(&fs_trace_lock);
+	en = fs_find_trace_entry(mnt->mnt_sb);
+	/*
+	 * Entries are looked up by super block, but a super block can be
+	 * mounted more than once. Only the mount recorded in the entry may
+	 * tear it down; dropping any other mount of the same super block
+	 * must neither report an unmount nor remove the trace.
+	 */
+	if (IS_ERR(en) || en->path.mnt != mnt)
+		goto out;
+
+	spin_lock(&en->lock);
+	/*
+	 * fs_event_send() expects the FS_EVENT_* mask flag, same as every
+	 * other caller, not the FS_EVENT_TYPE_* netlink value.
+	 */
+	if (en->notify_mask & FS_EVENT_INFO)
+		fs_event_send(en, FS_EVENT_INFO, FS_INFO_UMOUNT);
+	spin_unlock(&en->lock);
+	fs_destroy_trace_entry(en);
+out:
+	spin_unlock(&fs_trace_lock);
+}
+
+/*
+ * Create and register a trace entry for the mount referenced by @path.
+ *
+ * @thresh: threshold ranges to copy into the new entry
+ * @nmask:  requested notification mask; it is reduced to the events the
+ *          file system reports as supported
+ *
+ * Returns 0 on success, -ENOMEM if the entry cannot be allocated, the
+ * idr error if no id can be assigned, and -EINVAL if the file system
+ * cannot be queried or supports none of the requested events.
+ */
+static int fs_new_trace_entry(struct path *path, struct fs_event_thresh *thresh,
+	unsigned int nmask)
+{
+	struct fs_trace_entry *en;
+	struct super_block *sb;
+	struct mount *r_mnt;
+	int ret = -EINVAL;
+
+	en = kmem_cache_zalloc(fs_trace_cachep, GFP_KERNEL);
+	if (unlikely(!en))
+		return -ENOMEM;
+	/*
+	 * Note that no reference is being taken here for the path as it would
+	 * make the umount unnecessarily puzzling (due to an extra 'valid'
+	 * reference for the mnt).
+	 * This is *rather* safe as the notification on mount being dropped
+	 * will get called prior to releasing the super block - so right
+	 * in time to send the event and perform appropriate clean-up
+	 */
+	r_mnt = real_mount(path->mnt);
+	en->path.dentry = r_mnt->mnt.mnt_root;
+	en->path.mnt = &r_mnt->mnt;
+
+	sb = fs_trace_sb(en);
+	spin_lock_init(&en->lock);
+
+	/*
+	 * idr_preload() with GFP_KERNEL may sleep, so it has to run before
+	 * the spinlock is taken; the allocation under the lock must not
+	 * sleep and relies on the preloaded memory.
+	 */
+	idr_preload(GFP_KERNEL);
+	spin_lock(&fs_trace_idr_lock);
+	en->mark = idr_alloc_cyclic(&fs_trace_idr, en, 1, 0, GFP_NOWAIT);
+	spin_unlock(&fs_trace_idr_lock);
+	idr_preload_end();
+
+	if (en->mark < 0) {
+		ret = en->mark;
+		goto free_entry;
+	}
+	if (fs_trace_query_data(sb, &en->data))
+		goto release_mark;
+
+	nmask = en->data.events_cap_mask & nmask;
+	if (!nmask)
+		goto release_mark;
+	en->notify_mask = nmask;
+	/* Copy the ranges only; the state field starts out zeroed */
+	memcpy(&en->thresh, thresh, offsetof(struct fs_event_thresh, state));
+
+	spin_lock(&fs_trace_lock);
+	list_add(&en->node, &fs_trace_list);
+	hash_add(fs_trace_hashtbl, &en->hnode, fs_trace_hasfn(sb));
+	spin_unlock(&fs_trace_lock);
+
+	fs_event_new_trace(en);
+	return 0;
+
+release_mark:
+	/* Give the id back, otherwise the idr keeps pointing at freed memory */
+	spin_lock(&fs_trace_idr_lock);
+	idr_remove(&fs_trace_idr, en->mark);
+	spin_unlock(&fs_trace_idr_lock);
+free_entry:
+	kmem_cache_free(fs_trace_cachep, en);
+	return ret;
+}
+
+/*
+ * Apply a configuration change to an existing entry; the caller holds
+ * en->lock.
+ *
+ * With FS_TRACE_ADD set in @nmask the supported events are added to the
+ * entry's notify mask and, for threshold events, the ranges flagged in
+ * @thresh->state are taken over. Without it the events are removed and,
+ * for threshold events, the whole threshold configuration is wiped.
+ *
+ * Returns -EINVAL when none of the requested events is supported.
+ */
+static int fs_update_trace_entry_locked(struct fs_trace_entry *en,
+	struct fs_event_thresh *thresh,
+	unsigned int nmask)
+{
+	const bool adding = nmask & FS_TRACE_ADD;
+	bool thresh_req;
+
+	nmask &= en->data.events_cap_mask;
+	if (!nmask)
+		return -EINVAL;
+
+	thresh_req = nmask & FS_EVENT_THRESH;
+
+	if (!adding) {
+		if (thresh_req)
+			memset(&en->thresh, 0, sizeof(en->thresh));
+		en->notify_mask &= ~nmask;
+		return 0;
+	}
+
+	if (thresh_req) {
+		/* Refresh the cached data if thresholds were not tracked yet */
+		if (!(en->notify_mask & FS_EVENT_THRESH))
+			fs_trace_query_data(fs_trace_sb(en), &en->data);
+
+		if (thresh->state & THRESH_LR_ON) {
+			en->thresh.lrange = thresh->lrange;
+			en->thresh.state &= ~THRESH_LR_ON;
+		}
+		if (thresh->state & THRESH_UR_ON) {
+			en->thresh.urange = thresh->urange;
+			en->thresh.state &= ~THRESH_UR_ON;
+		}
+	}
+
+	en->notify_mask |= nmask;
+	return 0;
+}
+
+/*
+ * Update the trace entry of the super block mounted at @path, or create one
+ * when none exists and FS_TRACE_ADD is set in @nmask.
+ *
+ * Returns the result of the update/creation, or -EINVAL when asked to
+ * remove events from a file system that is not being traced.
+ */
+static int fs_update_trace_entry(struct path *path,
+	struct fs_event_thresh *thresh,
+	unsigned int nmask)
+{
+	struct fs_trace_entry *en;
+	int ret;
+
+	spin_lock(&fs_trace_lock);
+	en = fs_find_trace_entry(path->mnt->mnt_sb);
+	if (!IS_ERR(en)) {
+		/* Existing entry: hand over to its own lock and update it */
+		spin_lock(&en->lock);
+		spin_unlock(&fs_trace_lock);
+
+		ret = fs_update_trace_entry_locked(en, thresh, nmask);
+
+		spin_unlock(&en->lock);
+		return ret;
+	}
+	spin_unlock(&fs_trace_lock);
+
+	if (!(nmask & FS_TRACE_ADD))
+		return -EINVAL;
+
+	return fs_new_trace_entry(path, thresh, nmask);
+}
+
+/*
+ * Parse and apply one configuration request, already split into words.
+ *
+ * Layout handled below:
+ *   argv[0]  mount point, optionally prefixed with '!' to remove events
+ *   argv[1]  comma separated list of event types (matched via fs_etypes)
+ *   argv[2]  lower threshold range  (threshold events being added only)
+ *   argv[3]  upper threshold range  (optional)
+ *
+ * An empty request (argc == 0) removes all traces. A lone '!<mount point>'
+ * removes the trace entry of that mount.
+ *
+ * Returns 0 on success, the kstrtoull() error for malformed ranges and
+ * -EINVAL otherwise.
+ *
+ * NOTE: argc is decremented inside conditions and argv is advanced/modified
+ * in place (strsep), so statement order matters here.
+ */
+static int fs_parse_trace_request(int argc, char **argv)
+{
+ struct fs_event_thresh thresh = {0};
+ struct path path;
+ substring_t args[MAX_OPT_ARGS];
+ unsigned int nmask = FS_TRACE_ADD;
+ int token;
+ char *s;
+ int ret = -EINVAL;
+
+ if (!argc) {
+ fs_remove_all_traces();
+ return 0;
+ }
+
+ s = *(argv++);
+ if (*s == '!') {
+ /* Clear the trace entry */
+ nmask &= ~FS_TRACE_ADD;
+ ++s;
+ }
+
+ if (kern_path_mountpoint(AT_FDCWD, s, &path, LOOKUP_FOLLOW))
+ return -EINVAL;
+
+ /* From here on the path reference must be dropped via 'leave' */
+ if (!(--argc)) {
+ /* Mount point only: valid solely as a removal request */
+ if (!(nmask & FS_TRACE_ADD))
+ ret = fs_remove_trace_entry(path.mnt->mnt_sb);
+ goto leave;
+ }
+
+ /* Collect the requested event types; strsep() consumes *argv in place */
+ while ((s = strsep(argv, ",")) != NULL) {
+ if (!*s)
+ continue;
+ args[0].to = args[0].from = NULL;
+ token = match_token(s, fs_etypes, args);
+ nmask |= (token & FS_EVENTS_ALL);
+ }
+
+ /*
+ * Bail out if no event type was recognised, or if threshold events are
+ * being added but no range argument follows. Note that argc is only
+ * decremented here when the first condition is false.
+ */
+ if (!(nmask & (~FS_TRACE_ADD)) ||
+ (!(--argc) && (nmask & FS_EVENT_THRESH && nmask & FS_TRACE_ADD)))
+ goto leave;
+
+ if ((nmask & FS_EVENT_THRESH) && (nmask & FS_TRACE_ADD)) {
+ /*
+ * Get the threshold config data:
+ * lower range
+ * upper range
+ */
+ ret = kstrtoull(*(++argv), 10, &thresh.lrange);
+ if (ret)
+ goto leave;
+
+ thresh.state |= THRESH_LR_ON;
+
+ if ((--argc)) {
+ ret = kstrtoull(*(++argv), 10, &thresh.urange);
+ if (ret)
+ goto leave;
+ thresh.state |= THRESH_UR_ON;
+ }
+ /* The thresholds are based on number of available blocks */
+ if (thresh.lrange < thresh.urange) {
+ ret = -EINVAL;
+ goto leave;
+ }
+
+ }
+ ret = fs_update_trace_entry(&path, &thresh, nmask);
+leave:
+ path_put(&path);
+ return ret;
+}
+
+#define DEFAULT_BUF_SIZE PAGE_SIZE
+
+/*
+ * Write handler of the config file: the user buffer is a list of requests
+ * separated by ';', each of which is split into words and handed to
+ * fs_parse_trace_request().
+ *
+ * Returns the number of bytes consumed, -EFAULT if the user buffer cannot
+ * be read, -ENOMEM on allocation failure and -EINVAL if a request is
+ * rejected. Requests preceding a failing one stay applied.
+ *
+ * NOTE(review): input longer than DEFAULT_BUF_SIZE - 1 is processed in
+ * chunks, so a request straddling a chunk boundary is parsed as two
+ * separate requests.
+ */
+static ssize_t fs_trace_write(struct file *file, const char __user *buffer,
+	size_t count, loff_t *ppos)
+{
+	char **argv;
+	char *kern_buf, *next, *cfg;
+	size_t size, dcount = 0;
+	ssize_t ret;
+	int argc;
+
+	if (!count)
+		return 0;
+
+	kern_buf = kmalloc(DEFAULT_BUF_SIZE, GFP_KERNEL);
+	if (!kern_buf)
+		return -ENOMEM;
+
+	while (dcount < count) {
+		/* Leave room for the terminating NUL */
+		size = count - dcount;
+		if (size >= DEFAULT_BUF_SIZE)
+			size = DEFAULT_BUF_SIZE - 1;
+		if (copy_from_user(kern_buf, buffer + dcount, size)) {
+			ret = -EFAULT;
+			goto leave;
+		}
+
+		kern_buf[size] = '\0';
+
+		cfg = kern_buf;
+		do {
+			/* Cut out the next ';'-terminated request */
+			next = strchr(cfg, ';');
+			if (next)
+				*next = '\0';
+
+			argv = argv_split(GFP_KERNEL, cfg, &argc);
+			if (!argv) {
+				ret = -ENOMEM;
+				goto leave;
+			}
+
+			if (fs_parse_trace_request(argc, argv)) {
+				ret = -EINVAL;
+				argv_free(argv);
+				goto leave;
+			}
+
+			argv_free(argv);
+			if (next)
+				cfg = ++next;
+
+		} while (next);
+		dcount += size;
+	}
+	ret = dcount;
+leave:
+	kfree(kern_buf);
+	return ret;
+}
+
+/*
+ * seq_file ->start: take fs_trace_lock (released again in ->stop) and
+ * position the iterator at element *pos of fs_trace_list.
+ */
+static void *fs_trace_seq_start(struct seq_file *m, loff_t *pos)
+{
+	void *first;
+
+	spin_lock(&fs_trace_lock);
+	first = seq_list_start(&fs_trace_list, *pos);
+	return first;
+}
+
+/* seq_file ->next: advance to the element following @v in fs_trace_list. */
+static void *fs_trace_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	void *following = seq_list_next(v, &fs_trace_list, pos);
+
+	return following;
+}
+
+/* seq_file ->stop: drop the lock taken by fs_trace_seq_start(). */
+static void fs_trace_seq_stop(struct seq_file *m, void *v)
+{
+	spin_unlock(&fs_trace_lock);
+}
+
+/*
+ * seq_file ->show: print one trace entry as
+ *   <mark> <mount path> <fs type>[.<subtype>] <device> (<events> <lrange> <urange>)
+ */
+static int fs_trace_seq_show(struct seq_file *m, void *v)
+{
+	struct fs_trace_entry *en = list_entry(v, struct fs_trace_entry, node);
+	struct super_block *sb = fs_trace_sb(en);
+	const struct match_token *etype;
+	const char *devname;
+	unsigned int pending;
+
+	seq_printf(m, "%d ", en->mark);
+
+	seq_path(m, &en->path, "\t\n\\");
+	seq_putc(m, ' ');
+
+	/* File system type, with the subtype appended if there is one */
+	seq_escape(m, sb->s_type->name, " \t\n\\");
+	if (sb->s_subtype && sb->s_subtype[0]) {
+		seq_putc(m, '.');
+		seq_escape(m, sb->s_subtype, " \t\n\\");
+	}
+
+	/* Device name: let the file system print it if it knows how to */
+	seq_putc(m, ' ');
+	if (sb->s_op->show_devname) {
+		sb->s_op->show_devname(m, en->path.mnt->mnt_root);
+	} else {
+		devname = real_mount(en->path.mnt)->mnt_devname;
+		if (!devname)
+			devname = "none";
+		seq_escape(m, devname, " \t\n\\");
+	}
+	seq_puts(m, " (");
+
+	/* Comma separated names of all enabled event types */
+	pending = en->notify_mask;
+	for (etype = fs_etypes; etype->pattern; ++etype) {
+		if (!(etype->token & pending))
+			continue;
+		seq_puts(m, etype->pattern);
+		pending &= ~etype->token;
+		if (pending)
+			seq_putc(m, ',');
+	}
+	seq_printf(m, " %llu %llu", en->thresh.lrange,
+		   en->thresh.urange);
+	seq_puts(m, ")\n");
+	return 0;
+}
+
+/* Iterator over fs_trace_list backing reads of the config file */
+static const struct seq_operations fs_trace_seq_ops = {
+	.show	= fs_trace_seq_show,
+	.start	= fs_trace_seq_start,
+	.next	= fs_trace_seq_next,
+	.stop	= fs_trace_seq_stop,
+};
+
+/* ->open of the config file: attach the trace list iterator to @file. */
+static int fs_trace_open(struct inode *inode, struct file *file)
+{
+	int err = seq_open(file, &fs_trace_seq_ops);
+
+	return err;
+}
+
+/*
+ * File operations of the config file: reads go through the seq_file
+ * iterator, writes are parsed as configuration requests.
+ */
+static const struct file_operations fs_trace_fops = {
+	.owner		= THIS_MODULE,
+	.open		= fs_trace_open,
+	.read		= seq_read,
+	.write		= fs_trace_write,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+/*
+ * Set up the resources needed for tracing: the entry cache, the generic
+ * netlink family and the idr handing out entry ids.
+ *
+ * Returns 0 on success, -ENOMEM if the cache cannot be created, or the
+ * error reported by fs_event_netlink_register().
+ */
+static int fs_trace_init(void)
+{
+	int ret;
+
+	fs_trace_cachep = KMEM_CACHE(fs_trace_entry, 0);
+	if (!fs_trace_cachep)
+		return -ENOMEM;
+
+	ret = fs_event_netlink_register();
+	if (ret) {
+		kmem_cache_destroy(fs_trace_cachep);
+		return ret;
+	}
+
+	idr_init(&fs_trace_idr);
+	return 0;
+}
+
+/* VFS support */
+/*
+ * Populate the super block with a single "config" file and, if that
+ * worked, initialise the tracing infrastructure.
+ */
+static int fs_trace_fill_super(struct super_block *sb, void *data, int silent)
+{
+	static struct tree_descr desc[] = {
+		[2] = {
+			.name = "config",
+			.ops = &fs_trace_fops,
+			.mode = S_IWUSR | S_IRUGO,
+		},
+		{""},
+	};
+	int err;
+
+	err = simple_fill_super(sb, 0x7246332, desc);
+	if (err)
+		return err;
+
+	return fs_trace_init();
+}
+
+/* ->mount: all mounts share one super block, filled by fs_trace_fill_super() */
+static struct dentry *fs_trace_do_mount(struct file_system_type *fs_type,
+	int ntype, const char *dev_name, void *data)
+{
+	struct dentry *root;
+
+	root = mount_single(fs_type, ntype, data, fs_trace_fill_super);
+	return root;
+}
+
+/*
+ * ->kill_sb: undo fs_trace_init() - drop all trace entries, then release
+ * the idr, the netlink family and the entry cache - before the super block
+ * itself is torn down. The entries are removed first since they live in the
+ * cache destroyed further down.
+ */
+static void fs_trace_kill_super(struct super_block *sb)
+{
+ fs_remove_all_traces();
+ idr_destroy(&fs_trace_idr);
+ fs_event_netlink_unregister();
+ kmem_cache_destroy(fs_trace_cachep);
+ kill_litter_super(sb);
+}
+
+/* "events" kset, created below fs_kobj in fs_trace_vfs_init() */
+static struct kset *fs_trace_kset;
+/* Internal mount of the fstrace file system (see fs_trace_vfs_init()) */
+static struct vfsmount *fs_trace_mount;
+
+/* The "fstrace" pseudo file system exposing the config file */
+static struct file_system_type fs_trace_fstype = {
+ .name = "fstrace",
+ .mount = fs_trace_do_mount,
+ .kill_sb = fs_trace_kill_super,
+};
+
+/*
+ * Create the "events" kset, register the fstrace file system and mount it
+ * internally. Any failure unwinds what has been set up so far; no error is
+ * reported to the caller.
+ */
+static void __init fs_trace_vfs_init(void)
+{
+	fs_trace_kset = kset_create_and_add("events", NULL, fs_kobj);
+	if (!fs_trace_kset)
+		return;
+
+	if (register_filesystem(&fs_trace_fstype))
+		goto drop_kset;
+
+	fs_trace_mount = kern_mount(&fs_trace_fstype);
+	if (!IS_ERR(fs_trace_mount))
+		return;
+
+	unregister_filesystem(&fs_trace_fstype);
+drop_kset:
+	kset_unregister(fs_trace_kset);
+}
+
+/*
+ * Module entry point. Always reports success: a failed VFS set-up merely
+ * leaves the interface unavailable.
+ */
+static int __init fs_trace_events_init(void)
+{
+	fs_trace_vfs_init();
+	return 0;
+}
+module_init(fs_trace_events_init);
+
diff --git a/fs/events/fs_event.h b/fs/events/fs_event.h
new file mode 100644
index 0000000..4260ce5
--- /dev/null
+++ b/fs/events/fs_event.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright(c) 2015 Samsung Electronics. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef __GENERIC_FS_EVENTS_H
+#define __GENERIC_FS_EVENTS_H
+
+/*
+ * Registration of the FS_EVENT generic netlink family. Without CONFIG_NET
+ * registration fails with -ENOSYS and unregistration is a no-op.
+ */
+#ifdef CONFIG_NET
+int fs_event_netlink_register(void);
+void fs_event_netlink_unregister(void);
+#else /* CONFIG_NET */
+static inline int fs_event_netlink_register(void) { return -ENOSYS; }
+static inline void fs_event_netlink_unregister(void) {}
+#endif /* CONFIG_NET */
+
+#endif /* __GENERIC_FS_EVENTS_H */
diff --git a/fs/events/fs_event_netlink.c b/fs/events/fs_event_netlink.c
new file mode 100644
index 0000000..9c56e35
--- /dev/null
+++ b/fs/events/fs_event_netlink.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright(c) 2015 Samsung Electronics. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+/* The single multicast group all FS events are sent to (index 0) */
+static const struct genl_multicast_group fs_event_mcgroups[] = {
+ { .name = "event", },
+};
+
+/*
+ * The FS_EVENT generic netlink family. It uses the statically assigned id
+ * GENL_ID_FS_EVENT and defines no operations - it is only used for sending
+ * multicast notifications.
+ */
+static struct genl_family fs_event_family = {
+ .id = GENL_ID_FS_EVENT,
+ .hdrsize = 0,
+ .name = "FS_EVENT",
+ .version = 1,
+ .maxattr = FS_EVENT_ATR_MAX,
+ .mcgrps = fs_event_mcgroups,
+ .n_mcgrps = ARRAY_SIZE(fs_event_mcgroups),
+};
+
+/**
+ * fs_netlink_send_event - compose and multicast a single FS event message
+ * @size:        payload size the caller needs; room for one additional u64
+ *               attribute is added on top
+ * @event_type:  FS_EVENT_TYPE_* value used as the generic netlink command
+ * @compose_msg: callback adding the attributes to the message; a non-zero
+ *               return value aborts the send and is passed on to the caller
+ * @event_id:    passed through to @compose_msg
+ * @data:        passed through to @compose_msg
+ *
+ * Returns 0 on success, -EINVAL for a zero @size or missing callback,
+ * -ENOMEM if the message cannot be allocated, -EMSGSIZE if the netlink
+ * header does not fit, the error returned by @compose_msg, or the result
+ * of genlmsg_multicast() (e.g. -ESRCH when nobody is listening).
+ */
+int fs_netlink_send_event(size_t size, unsigned int event_type,
+	int (*compose_msg)(struct sk_buff *skb,
+			   unsigned int event_id, void *data),
+	unsigned int event_id, void *data)
+{
+	static atomic_t seq;
+	struct sk_buff *skb;
+	void *msg_head;
+	int ret;
+
+	if (!size || !compose_msg)
+		return -EINVAL;
+
+	size += nla_total_size(sizeof(u64));
+	/*
+	 * Events are sent with spinlocks held (see fs_event_notify() and
+	 * friends), so the allocation must not sleep.
+	 */
+	skb = genlmsg_new(size, GFP_NOWAIT);
+	if (!skb) {
+		pr_err("Failed to allocate new FS generic netlink message\n");
+		return -ENOMEM;
+	}
+
+	msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
+			       &fs_event_family, 0, event_type);
+	if (!msg_head) {
+		/* Do not report success for a message that was never sent */
+		ret = -EMSGSIZE;
+		goto cleanup;
+	}
+
+	ret = compose_msg(skb, event_id, data);
+	if (ret) {
+		genlmsg_cancel(skb, msg_head);
+		goto cleanup;
+	}
+
+	genlmsg_end(skb, msg_head);
+	/*
+	 * The skb is consumed by the multicast whether it succeeds or not,
+	 * so it must not be freed again here.
+	 */
+	return genlmsg_multicast(&fs_event_family, skb, 0, 0, GFP_NOWAIT);
+
+cleanup:
+	nlmsg_free(skb);
+	return ret;
+}
+EXPORT_SYMBOL(fs_netlink_send_event);
+
+/*
+ * Register the FS_EVENT generic netlink family. Returns 0 on success or
+ * the error from genl_register_family(), which is also logged.
+ */
+int fs_event_netlink_register(void)
+{
+	int err = genl_register_family(&fs_event_family);
+
+	if (err)
+		pr_err("Failed to register FS netlink interface\n");
+
+	return err;
+}
+
+/* Unregister the FS_EVENT generic netlink family; the result is ignored. */
+void fs_event_netlink_unregister(void)
+{
+	genl_unregister_family(&fs_event_family);
+}
diff --git a/fs/namespace.c b/fs/namespace.c
index 82ef140..ec6e2ef 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1031,6 +1031,7 @@ static void cleanup_mnt(struct mount *mnt)
if (unlikely(mnt->mnt_pins.first))
mnt_pin_kill(mnt);
fsnotify_vfsmount_delete(&mnt->mnt);
+ fs_event_mount_dropped(&mnt->mnt);
dput(mnt->mnt.mnt_root);
deactivate_super(mnt->mnt.mnt_sb);
mnt_free_id(mnt);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b4d71b5..bb529af 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -263,6 +263,10 @@ struct iattr {
* Includes for diskquotas.
*/
#include <linux/quota.h>
+/*
+ * Include for Generic File System Events Interface
+ */
+#include <linux/fs_event.h>
/*
* Maximum number of layers of fs stack. Needs to be limited to
@@ -1233,6 +1237,7 @@ struct super_block {
const struct dquot_operations *dq_op;
const struct quotactl_ops *s_qcop;
const struct export_operations *s_export_op;
+ const struct fs_trace_operations *s_trace_ops;
unsigned long s_flags;
unsigned long s_magic;
struct dentry *s_root;
@@ -1253,7 +1258,6 @@ struct super_block {
struct hlist_node s_instances;
unsigned int s_quota_types; /* Bitmask of supported quota types */
struct quota_info s_dquot; /* Diskquota specific options */
-
struct sb_writers s_writers;
char s_id[32]; /* Informational name */
diff --git a/include/linux/fs_event.h b/include/linux/fs_event.h
new file mode 100644
index 0000000..1e128d8
--- /dev/null
+++ b/include/linux/fs_event.h
@@ -0,0 +1,69 @@
+/*
+ * Generic File System Events Interface
+ *
+ * Copyright(c) 2015 Samsung Electronics. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#ifndef _LINUX_GENERIC_FS_EVENTS_
+#define _LINUX_GENERIC_FS_EVENTS_
+#include <net/netlink.h>
+#include <uapi/linux/fs_event.h>
+
+struct super_block;
+struct vfsmount;
+
+/*
+ * These event flags match the event types sent through the netlink
+ * interface, so keep both in sync when making any modifications.
+ */
+#define FS_EVENT_INFO	0x001
+#define FS_EVENT_WARN	0x002
+#define FS_EVENT_ERR	0x004
+#define FS_EVENT_THRESH	0x008
+
+#define FS_EVENTS_ALL \
+	(FS_EVENT_INFO | FS_EVENT_WARN | FS_EVENT_THRESH | FS_EVENT_ERR)
+
+/* Data a file system reports about itself when queried */
+struct fs_trace_sdata {
+	/* Supported notification types */
+	unsigned int events_cap_mask;
+	/* Number of available/reachable blocks */
+	u64 available_blks;
+};
+
+/* Callbacks a file system provides through super_block->s_trace_ops */
+struct fs_trace_operations {
+	/* Fill in the fs_trace_sdata for the super block; 0 on success */
+	int (*fs_trace_query)(struct super_block *, struct fs_trace_sdata *);
+};
+
+
+/* Report an event of the given FS_EVENT_* type and event id */
+void fs_event_notify(struct super_block *sb, unsigned int event_type,
+	unsigned int event_id);
+/* Account for @ncount blocks being allocated / freed (threshold tracking) */
+void fs_event_alloc_space(struct super_block *sb, u64 ncount);
+void fs_event_free_space(struct super_block *sb, u64 ncount);
+/* Notify that @mnt is going away */
+void fs_event_mount_dropped(struct vfsmount *mnt);
+
+#ifdef CONFIG_NET
+int fs_netlink_send_event(size_t size, unsigned int event_type,
+	int (*compose_msg)(struct sk_buff *skb, unsigned int event_id,
+			   void *data),
+	unsigned int event_id, void *data);
+#else /* CONFIG_NET */
+static inline
+int fs_netlink_send_event(size_t size, unsigned int event_type,
+	int (*compose_msg)(struct sk_buff *skb, unsigned int event_id,
+			   void *data),
+	unsigned int event_id, void *data)
+{
+	return -ENOSYS;
+}
+#endif /* CONFIG_NET */
+
+#endif /* _LINUX_GENERIC_FS_EVENTS_ */
+
diff --git a/include/uapi/linux/fs_event.h b/include/uapi/linux/fs_event.h
new file mode 100644
index 0000000..dd79953
--- /dev/null
+++ b/include/uapi/linux/fs_event.h
@@ -0,0 +1,62 @@
+/*
+ * Generic netlink support for Generic File System Events Interface
+ *
+ * Copyright(c) 2015 Samsung Electronics. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#ifndef _UAPI_LINUX_GENERIC_FS_EVENTS_
+#define _UAPI_LINUX_GENERIC_FS_EVENTS_
+/*
+ * Generic FS event types
+ */
+enum {
+ FS_EVENT_TYPE_NONE,
+ FS_EVENT_TYPE_INFO,
+ FS_EVENT_TYPE_WARN,
+ FS_EVENT_TYPE_ERR,
+ FS_EVENT_TYPE_THRESH,
+ FS_EVENT_TYPE_NEW_TRACE,
+ __FS_EVENT_TYPE_MAX,
+};
+#define FS_EVENT_TYPE_MAX (__FS_EVENT_TYPE_MAX - 1)
+/*
+ * Generic netlink attribute types
+ */
+enum {
+ FS_EVENT_ATR_NONE,
+ FS_EVENT_ATR_FS_ID, /* An identifier of traced fs */
+ FS_EVENT_ATR_MOUNT, /* Mount point directory name */
+ FS_EVENT_ATR_DEV_MAJOR,
+ FS_EVENT_ATR_DEV_MINOR,
+ FS_EVENT_ATR_ID,
+ FS_EVENT_ATR_CAUSED_ID,
+ FS_EVENT_ATR_DATA,
+ __FS_EVENT_ATR_MAX,
+};
+#define FS_EVENT_ATR_MAX (__FS_EVENT_ATR_MAX - 1)
+
+/*
+ * Supported set of FS events ids
+ *
+ * NOTE(review): FS_WANR_ENOSPC_META, FS_ERR_RO_REMOUT and FS_ERR_ITERNAL look
+ * like misspellings of WARN, REMOUNT and INTERNAL. As this is a user-visible
+ * header, the names should be corrected (together with their users) before
+ * they become ABI.
+ */
+#define FS_INFO_UMOUNT 0x00000001 /* File system unmounted */
+#define FS_WARN_UNKNOWN 0x00000004 /* Unknown warning */
+#define FS_WARN_ENOSPC 0x00000008 /* No space left to reserve data blks */
+#define FS_WANR_ENOSPC_META 0x00000010 /* No space left for metadata */
+#define FS_THRESH_LR_REACHED 0x00000020 /* The lower range of threshold has been reached */
+#define FS_THRESH_UR_REACHED 0x00000040 /* The upper range of threshold has been reached */
+#define FS_ERR_UNKNOWN 0x00000080 /* Unknown error */
+#define FS_ERR_RO_REMOUT 0x00000100 /* The file system has been remounted as read-only */
+#define FS_ERR_ITERNAL 0x00000200 /* File system's internal error */
+
+#endif /* _UAPI_LINUX_GENERIC_FS_EVENTS_ */
+
diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h
index c3363ba..6464129 100644
--- a/include/uapi/linux/genetlink.h
+++ b/include/uapi/linux/genetlink.h
@@ -29,6 +29,7 @@ struct genlmsghdr {
#define GENL_ID_CTRL NLMSG_MIN_TYPE
#define GENL_ID_VFS_DQUOT (NLMSG_MIN_TYPE + 1)
#define GENL_ID_PMCRAID (NLMSG_MIN_TYPE + 2)
+#define GENL_ID_FS_EVENT (NLMSG_MIN_TYPE + 3)
/**************************************************************************
* Controller
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 2ed5f96..e8e0bd68 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -82,7 +82,8 @@ static struct list_head family_ht[GENL_FAM_TAB_SIZE];
*/
static unsigned long mc_group_start = 0x3 | BIT(GENL_ID_CTRL) |
BIT(GENL_ID_VFS_DQUOT) |
- BIT(GENL_ID_PMCRAID);
+ BIT(GENL_ID_PMCRAID) |
+ BIT(GENL_ID_FS_EVENT);
static unsigned long *mc_groups = &mc_group_start;
static unsigned long mc_groups_longs = 1;
@@ -146,6 +147,7 @@ static u16 genl_generate_id(void)
for (i = 0; i <= GENL_MAX_ID - GENL_MIN_ID; i++) {
if (id_gen_idx != GENL_ID_VFS_DQUOT &&
id_gen_idx != GENL_ID_PMCRAID &&
+ id_gen_idx != GENL_ID_FS_EVENT &&
!genl_family_find_byid(id_gen_idx))
return id_gen_idx;
if (++id_gen_idx > GENL_MAX_ID)
@@ -249,6 +251,9 @@ static int genl_validate_assign_mc_groups(struct genl_family *family)
} else if (family->id == GENL_ID_PMCRAID) {
first_id = GENL_ID_PMCRAID;
BUG_ON(n_groups != 1);
+ } else if (family->id == GENL_ID_FS_EVENT) {
+ first_id = GENL_ID_FS_EVENT;
+ BUG_ON(n_groups != 1);
} else {
groups_allocated = true;
err = genl_allocate_reserve_groups(n_groups, &first_id);
--
1.7.9.5
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
Add support for generic FS events including threshold
notifications, ENOSPC and remount as read-only warnings,
along with generic internal warnings/errors.
Signed-off-by: Beata Michalska <[email protected]>
---
fs/ext4/balloc.c | 11 +++++++++--
fs/ext4/ext4.h | 1 +
fs/ext4/inode.c | 2 +-
fs/ext4/mballoc.c | 6 +++++-
fs/ext4/resize.c | 1 +
fs/ext4/super.c | 43 +++++++++++++++++++++++++++++++++++++++++++
6 files changed, 60 insertions(+), 4 deletions(-)
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e95b27a..49d2ace 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -569,6 +569,7 @@ int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
{
if (ext4_has_free_clusters(sbi, nclusters, flags)) {
percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters);
+ fs_event_alloc_space(sbi->s_sb, EXT4_C2B(sbi, nclusters));
return 0;
} else
return -ENOSPC;
@@ -590,9 +591,10 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) ||
(*retries)++ > 3 ||
- !EXT4_SB(sb)->s_journal)
+ !EXT4_SB(sb)->s_journal) {
+ fs_event_notify(sb, FS_EVENT_WARN, FS_WARN_ENOSPC);
return 0;
-
+ }
jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
@@ -637,6 +639,11 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
dquot_alloc_block_nofail(inode,
EXT4_C2B(EXT4_SB(inode->i_sb), ar.len));
}
+
+ if (*errp == -ENOSPC)
+ fs_event_notify(inode->i_sb, FS_EVENT_WARN,
+ FS_WANR_ENOSPC_META);
+
return ret;
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 163afe2..7d75ff9 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2542,6 +2542,7 @@ void ext4_mark_group_corrupted(struct ext4_sb_info *sbi,
if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
percpu_counter_sub(&sbi->s_freeclusters_counter, grp->bb_free);
set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+ fs_event_alloc_space(sbi->s_sb, EXT4_C2B(sbi, grp->bb_free));
}
/*
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5cb9a21..2a7af0f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1238,7 +1238,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free);
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-
+ fs_event_free_space(sbi->s_sb, to_free);
dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 24a4b6d..e6cbbd6 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4511,6 +4511,9 @@ out:
kmem_cache_free(ext4_ac_cachep, ac);
if (inquota && ar->len < inquota)
dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
+ if (reserv_clstrs && ar->len < reserv_clstrs)
+ fs_event_free_space(sbi->s_sb,
+ EXT4_C2B(sbi, reserv_clstrs - ar->len));
if (!ar->len) {
if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
/* release all the reserved blocks if non delalloc */
@@ -4848,7 +4851,7 @@ do_more:
if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
-
+ fs_event_free_space(sb, EXT4_C2B(sbi, count_clusters));
ext4_mb_unload_buddy(&e4b);
/* We dirtied the bitmap block */
@@ -4982,6 +4985,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
ext4_unlock_group(sb, block_group);
percpu_counter_add(&sbi->s_freeclusters_counter,
EXT4_NUM_B2C(sbi, blocks_freed));
+ fs_event_free_space(sb, blocks_freed);
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 8a8ec62..dbf08d6 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1378,6 +1378,7 @@ static void ext4_update_super(struct super_block *sb,
EXT4_NUM_B2C(sbi, free_blocks));
percpu_counter_add(&sbi->s_freeinodes_counter,
EXT4_INODES_PER_GROUP(sb) * flex_gd->count);
+ fs_event_free_space(sb, free_blocks - reserved_blocks);
ext4_debug("free blocks count %llu",
percpu_counter_read(&sbi->s_freeclusters_counter));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e061e66..52091da 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -398,6 +398,7 @@ static void ext4_handle_error(struct super_block *sb)
if (test_opt(sb, ERRORS_PANIC))
panic("EXT4-fs (device %s): panic forced after error\n",
sb->s_id);
+ fs_event_notify(sb, FS_EVENT_ERR, FS_ERR_UNKNOWN);
}
#define ext4_error_ratelimit(sb) \
@@ -585,6 +586,8 @@ void __ext4_abort(struct super_block *sb, const char *function,
if (EXT4_SB(sb)->s_journal)
jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
save_error_info(sb, function, line);
+ fs_event_notify(sb, FS_EVENT_ERR, FS_ERR_RO_REMOUT);
+
}
if (test_opt(sb, ERRORS_PANIC))
panic("EXT4-fs panic from previous error\n");
@@ -612,6 +615,8 @@ void __ext4_warning(struct super_block *sb, const char *function,
struct va_format vaf;
va_list args;
+ fs_event_notify(sb, FS_EVENT_WARN, FS_WARN_UNKNOWN);
+
if (!___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
"EXT4-fs warning"))
return;
@@ -1083,6 +1088,13 @@ static const struct quotactl_ops ext4_qctl_operations = {
};
#endif
+/* Defined further down; needed here for the operations table */
+static int ext4_trace_query(struct super_block *sb,
+ struct fs_trace_sdata *data);
+
+/* Hooks for the generic FS events interface (assigned to sb->s_trace_ops) */
+static const struct fs_trace_operations ext4_trace_ops = {
+ .fs_trace_query = ext4_trace_query,
+};
+
static const struct super_operations ext4_sops = {
.alloc_inode = ext4_alloc_inode,
.destroy_inode = ext4_destroy_inode,
@@ -3398,11 +3410,20 @@ static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count)
{
ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >>
sbi->s_cluster_bits;
+ ext4_fsblk_t current_resv;
if (count >= clusters)
return -EINVAL;
+ current_resv = atomic64_read(&sbi->s_resv_clusters);
atomic64_set(&sbi->s_resv_clusters, count);
+
+ if (count > current_resv)
+ fs_event_alloc_space(sbi->s_sb,
+ EXT4_C2B(sbi, count - current_resv));
+ else
+ fs_event_free_space(sbi->s_sb,
+ EXT4_C2B(sbi, current_resv - count));
return 0;
}
@@ -3966,6 +3987,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_qcop = &ext4_qctl_operations;
sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
#endif
+ sb->s_trace_ops = &ext4_trace_ops;
+
memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
@@ -5438,6 +5461,26 @@ out:
#endif
+/*
+ * fs_trace_query hook: report the supported event types and the number of
+ * blocks currently available, i.e. free clusters minus dirty (delayed
+ * allocation) clusters, converted to blocks, minus the reserved blocks.
+ * Always returns 0.
+ */
+static int ext4_trace_query(struct super_block *sb, struct fs_trace_sdata *data)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	ext4_fsblk_t rsv_blocks;
+	s64 free_clusters, dirty_clusters;
+
+	free_clusters =
+		percpu_counter_sum_positive(&sbi->s_freeclusters_counter);
+	dirty_clusters =
+		percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
+	/*
+	 * The two counters are summed independently, so guard against the
+	 * difference going negative and wrapping in the unsigned result.
+	 */
+	if (free_clusters > dirty_clusters)
+		data->available_blks =
+			EXT4_C2B(sbi, free_clusters - dirty_clusters);
+	else
+		data->available_blks = 0;
+
+	rsv_blocks = ext4_r_blocks_count(es) +
+		     EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
+	if (data->available_blks < rsv_blocks)
+		data->available_blks = 0;
+	else
+		data->available_blks -= rsv_blocks;
+	data->events_cap_mask = FS_EVENTS_ALL;
+	return 0;
+}
+
static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
const char *dev_name, void *data)
{
--
1.7.9.5
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
Add support for the generic FS events interface
covering threshold notifications and the ENOSPC
warning.
Signed-off-by: Beata Michalska <[email protected]>
---
mm/shmem.c | 39 ++++++++++++++++++++++++++++++++++++---
1 file changed, 36 insertions(+), 3 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c
index cf2d0ca..bb261ac 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -201,6 +201,7 @@ static int shmem_reserve_inode(struct super_block *sb)
spin_lock(&sbinfo->stat_lock);
if (!sbinfo->free_inodes) {
spin_unlock(&sbinfo->stat_lock);
+ fs_event_notify(sb, FS_EVENT_WARN, FS_WARN_ENOSPC);
return -ENOSPC;
}
sbinfo->free_inodes--;
@@ -239,8 +240,10 @@ static void shmem_recalc_inode(struct inode *inode)
freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
if (freed > 0) {
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- if (sbinfo->max_blocks)
+ if (sbinfo->max_blocks) {
percpu_counter_add(&sbinfo->used_blocks, -freed);
+ fs_event_free_space(inode->i_sb, freed);
+ }
info->alloced -= freed;
inode->i_blocks -= freed * BLOCKS_PER_PAGE;
shmem_unacct_blocks(info->flags, freed);
@@ -1164,6 +1167,7 @@ repeat:
goto unacct;
}
percpu_counter_inc(&sbinfo->used_blocks);
+ fs_event_alloc_space(inode->i_sb, 1);
}
page = shmem_alloc_page(gfp, info, index);
@@ -1245,8 +1249,10 @@ trunc:
spin_unlock(&info->lock);
decused:
sbinfo = SHMEM_SB(inode->i_sb);
- if (sbinfo->max_blocks)
+ if (sbinfo->max_blocks) {
percpu_counter_add(&sbinfo->used_blocks, -1);
+ fs_event_free_space(inode->i_sb, 1);
+ }
unacct:
shmem_unacct_blocks(info->flags, 1);
failed:
@@ -1258,12 +1264,17 @@ unlock:
unlock_page(page);
page_cache_release(page);
}
- if (error == -ENOSPC && !once++) {
+ if (error == -ENOSPC) {
+ if (!once++) {
info = SHMEM_I(inode);
spin_lock(&info->lock);
shmem_recalc_inode(inode);
spin_unlock(&info->lock);
goto repeat;
+ } else {
+ fs_event_notify(inode->i_sb, FS_EVENT_WARN,
+ FS_WARN_ENOSPC);
+ }
}
if (error == -EEXIST) /* from above or from radix_tree_insert */
goto repeat;
@@ -2729,12 +2740,33 @@ static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
return 1;
}
+static int shmem_trace_query(struct super_block *sb,
+ struct fs_trace_sdata *data)
+{
+ struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+
+ if (!sb || !data)
+ return -EINVAL;
+
+ data->events_cap_mask = FS_EVENT_WARN;
+ if (sbinfo->max_blocks) {
+ data->available_blks = sbinfo->max_blocks -
+ percpu_counter_sum(&sbinfo->used_blocks);
+ data->events_cap_mask |= FS_EVENT_THRESH;
+ }
+ return 0;
+}
+
static const struct export_operations shmem_export_ops = {
.get_parent = shmem_get_parent,
.encode_fh = shmem_encode_fh,
.fh_to_dentry = shmem_fh_to_dentry,
};
+static const struct fs_trace_operations shmem_trace_ops = {
+ .fs_trace_query = shmem_trace_query,
+};
+
static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
bool remount)
{
@@ -3020,6 +3052,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
sb->s_flags |= MS_NOUSER;
}
sb->s_export_op = &shmem_export_ops;
+ sb->s_trace_ops = &shmem_trace_ops;
sb->s_flags |= MS_NOSEC;
#else
sb->s_flags |= MS_NOUSER;
--
1.7.9.5
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
On Wed, Apr 15, 2015 at 09:15:46AM +0200, Beata Michalska wrote:
> Add support for generic FS events including threshold
> notifications, ENOSPC and remount as read-only warnings,
> along with generic internal warnings/errors.
>
> Signed-off-by: Beata Michalska <[email protected]>
> ---
> fs/ext4/balloc.c | 11 +++++++++--
> fs/ext4/ext4.h | 1 +
> fs/ext4/inode.c | 2 +-
> fs/ext4/mballoc.c | 6 +++++-
> fs/ext4/resize.c | 1 +
> fs/ext4/super.c | 43 +++++++++++++++++++++++++++++++++++++++++++
> 6 files changed, 60 insertions(+), 4 deletions(-)
>
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index e95b27a..49d2ace 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -569,6 +569,7 @@ int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
> {
> if (ext4_has_free_clusters(sbi, nclusters, flags)) {
> percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters);
> + fs_event_alloc_space(sbi->s_sb, EXT4_C2B(sbi, nclusters));
> return 0;
> } else
> return -ENOSPC;
> @@ -590,9 +591,10 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
> {
> if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) ||
> (*retries)++ > 3 ||
> - !EXT4_SB(sb)->s_journal)
> + !EXT4_SB(sb)->s_journal) {
> + fs_event_notify(sb, FS_EVENT_WARN, FS_WARN_ENOSPC);
> return 0;
> -
> + }
> jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
>
> return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
> @@ -637,6 +639,11 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
> dquot_alloc_block_nofail(inode,
> EXT4_C2B(EXT4_SB(inode->i_sb), ar.len));
> }
> +
> + if (*errp == -ENOSPC)
> + fs_event_notify(inode->i_sb, FS_EVENT_WARN,
> + FS_WANR_ENOSPC_META);
> +
> return ret;
> }
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 163afe2..7d75ff9 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -2542,6 +2542,7 @@ void ext4_mark_group_corrupted(struct ext4_sb_info *sbi,
> if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
> percpu_counter_sub(&sbi->s_freeclusters_counter, grp->bb_free);
> set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
> + fs_event_alloc_space(sbi->s_sb, EXT4_C2B(sbi, grp->bb_free));
While we're adding fs netlink notifications, could we add a message that means
"This FS is corrupt, go run fsck"? A monitoring app could possibly figure
this out by a sudden drop in free space accompanied by EIO errors hitting
userland apps, but we might as well be explicit about the flaming death. :)
--D
> }
>
> /*
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 5cb9a21..2a7af0f 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1238,7 +1238,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
> percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free);
>
> spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
> -
> + fs_event_free_space(sbi->s_sb, to_free);
> dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
> }
>
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 24a4b6d..e6cbbd6 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -4511,6 +4511,9 @@ out:
> kmem_cache_free(ext4_ac_cachep, ac);
> if (inquota && ar->len < inquota)
> dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
> + if (reserv_clstrs && ar->len < reserv_clstrs)
> + fs_event_free_space(sbi->s_sb,
> + EXT4_C2B(sbi, reserv_clstrs - ar->len));
> if (!ar->len) {
> if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
> /* release all the reserved blocks if non delalloc */
> @@ -4848,7 +4851,7 @@ do_more:
> if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
> dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
> percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
> -
> + fs_event_free_space(sb, EXT4_C2B(sbi, count_clusters));
> ext4_mb_unload_buddy(&e4b);
>
> /* We dirtied the bitmap block */
> @@ -4982,6 +4985,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
> ext4_unlock_group(sb, block_group);
> percpu_counter_add(&sbi->s_freeclusters_counter,
> EXT4_NUM_B2C(sbi, blocks_freed));
> + fs_event_free_space(sb, blocks_freed);
>
> if (sbi->s_log_groups_per_flex) {
> ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
> diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
> index 8a8ec62..dbf08d6 100644
> --- a/fs/ext4/resize.c
> +++ b/fs/ext4/resize.c
> @@ -1378,6 +1378,7 @@ static void ext4_update_super(struct super_block *sb,
> EXT4_NUM_B2C(sbi, free_blocks));
> percpu_counter_add(&sbi->s_freeinodes_counter,
> EXT4_INODES_PER_GROUP(sb) * flex_gd->count);
> + fs_event_free_space(sb, free_blocks - reserved_blocks);
>
> ext4_debug("free blocks count %llu",
> percpu_counter_read(&sbi->s_freeclusters_counter));
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index e061e66..52091da 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -398,6 +398,7 @@ static void ext4_handle_error(struct super_block *sb)
> if (test_opt(sb, ERRORS_PANIC))
> panic("EXT4-fs (device %s): panic forced after error\n",
> sb->s_id);
> + fs_event_notify(sb, FS_EVENT_ERR, FS_ERR_UNKNOWN);
> }
>
> #define ext4_error_ratelimit(sb) \
> @@ -585,6 +586,8 @@ void __ext4_abort(struct super_block *sb, const char *function,
> if (EXT4_SB(sb)->s_journal)
> jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
> save_error_info(sb, function, line);
> + fs_event_notify(sb, FS_EVENT_ERR, FS_ERR_RO_REMOUT);
> +
> }
> if (test_opt(sb, ERRORS_PANIC))
> panic("EXT4-fs panic from previous error\n");
> @@ -612,6 +615,8 @@ void __ext4_warning(struct super_block *sb, const char *function,
> struct va_format vaf;
> va_list args;
>
> + fs_event_notify(sb, FS_EVENT_WARN, FS_WARN_UNKNOWN);
> +
> if (!___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
> "EXT4-fs warning"))
> return;
> @@ -1083,6 +1088,13 @@ static const struct quotactl_ops ext4_qctl_operations = {
> };
> #endif
>
> +static int ext4_trace_query(struct super_block *sb,
> + struct fs_trace_sdata *data);
> +
> +static const struct fs_trace_operations ext4_trace_ops = {
> + .fs_trace_query = ext4_trace_query,
> +};
> +
> static const struct super_operations ext4_sops = {
> .alloc_inode = ext4_alloc_inode,
> .destroy_inode = ext4_destroy_inode,
> @@ -3398,11 +3410,20 @@ static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count)
> {
> ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >>
> sbi->s_cluster_bits;
> + ext4_fsblk_t current_resv;
>
> if (count >= clusters)
> return -EINVAL;
>
> + current_resv = atomic64_read(&sbi->s_resv_clusters);
> atomic64_set(&sbi->s_resv_clusters, count);
> +
> + if (count > current_resv)
> + fs_event_alloc_space(sbi->s_sb,
> + EXT4_C2B(sbi, count - current_resv));
> + else
> + fs_event_free_space(sbi->s_sb,
> + EXT4_C2B(sbi, current_resv - count));
> return 0;
> }
>
> @@ -3966,6 +3987,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> sb->s_qcop = &ext4_qctl_operations;
> sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
> #endif
> + sb->s_trace_ops = &ext4_trace_ops;
> +
> memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
>
> INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
> @@ -5438,6 +5461,26 @@ out:
>
> #endif
>
> +static int ext4_trace_query(struct super_block *sb, struct fs_trace_sdata *data)
> +{
> + struct ext4_sb_info *sbi = EXT4_SB(sb);
> + struct ext4_super_block *es = sbi->s_es;
> + ext4_fsblk_t rsv_blocks;
> +
> + data->available_blks =
> + percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
> + percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
> + data->available_blks = EXT4_C2B(sbi, data->available_blks);
> + rsv_blocks = ext4_r_blocks_count(es) +
> + EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
> + if (data->available_blks < rsv_blocks)
> + data->available_blks = 0;
> + else
> + data->available_blks -= rsv_blocks;
> + data->events_cap_mask = FS_EVENTS_ALL;
> + return 0;
> +}
> +
> static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
> const char *dev_name, void *data)
> {
> --
> 1.7.9.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
On Wed, Apr 15, 2015 at 09:15:44AM +0200, Beata Michalska wrote:
> Introduce configurable generic interface for file
> system-wide event notifications to provide file
> systems with a common way of reporting any potential
> issues as they emerge.
>
> The notifications are to be issued through generic
> netlink interface, by a dedicated, for file system
> events, multicast group. The file systems might as
> well use this group to send their own custom messages.
>
> The events have been split into four base categories:
> information, warnings, errors and threshold notifications,
> with some very basic event types like running out of space
> or file system being remounted as read-only.
>
> Threshold notifications have been included to allow
> triggering an event whenever the amount of free space
> drops below a certain level - or levels to be more precise
> as two of them are being supported: the lower and the upper
> range. The notifications work both ways: once the threshold
> level has been reached, an event shall be generated whenever
> the number of available blocks goes up again re-activating
> the threshold.
>
> The interface has been exposed through a vfs. Once mounted,
> it serves as an entry point for the set-up where one can
> register for particular file system events.
>
> Signed-off-by: Beata Michalska <[email protected]>
> ---
> Documentation/filesystems/events.txt | 254 +++++++++++
> fs/Makefile | 1 +
> fs/events/Makefile | 6 +
> fs/events/fs_event.c | 775 ++++++++++++++++++++++++++++++++++
> fs/events/fs_event.h | 27 ++
> fs/events/fs_event_netlink.c | 94 +++++
> fs/namespace.c | 1 +
> include/linux/fs.h | 6 +-
> include/linux/fs_event.h | 69 +++
> include/uapi/linux/fs_event.h | 62 +++
> include/uapi/linux/genetlink.h | 1 +
> net/netlink/genetlink.c | 7 +-
> 12 files changed, 1301 insertions(+), 2 deletions(-)
> create mode 100644 Documentation/filesystems/events.txt
> create mode 100644 fs/events/Makefile
> create mode 100644 fs/events/fs_event.c
> create mode 100644 fs/events/fs_event.h
> create mode 100644 fs/events/fs_event_netlink.c
> create mode 100644 include/linux/fs_event.h
> create mode 100644 include/uapi/linux/fs_event.h
>
> diff --git a/Documentation/filesystems/events.txt b/Documentation/filesystems/events.txt
> new file mode 100644
> index 0000000..c85dd88
> --- /dev/null
> +++ b/Documentation/filesystems/events.txt
> @@ -0,0 +1,254 @@
> +
> + Generic file system event notification interface
> +
> +Document created 09 April 2015 by Beata Michalska <[email protected]>
> +
> +1. The reason behind:
> +=====================
> +
> +There are many corner cases when things might get messy with the filesystems.
> +And it is not always obvious what and when went wrong. Sometimes you might
> +get some subtle hints that there is something going on - but by the time
> +you realise it, it might be too late as you are already out-of-space
> +or the filesystem has been remounted as read-only (for example). The generic
> +interface for the filesystem events fills the gap by providing a rather
> +easy way of real-time notifications triggered whenever something interesting
> +happens, allowing filesystems to report events in a common way, as they occur.
> +
> +2. How does it work:
> +====================
> +
> +The interface itself has been exposed as fstrace-type Virtual File System,
> +primarily to ease the process of setting up the configuration for the file
> +system notifications. So for starters it needs to get mounted (obviously):
> +
> + mount -t fstrace none /sys/fs/events
> +
> +This will unveil the single fstrace filesystem entry - the 'config' file,
> +through which the notifications are set up.
> +
> +Activating notifications for particular filesystem is as straightforward
> +as writing into the 'config' file. Note that by default all events despite
> +the actual filesystem type are being disregarded.
> +
> +Synopsis of config:
> +------------------
> +
> + MOUNT EVENT_TYPE [L1] [L2]
> +
> + MOUNT : the filesystem's mount point
> + EVENT_TYPE : type of events to be enabled: info,warn,err,thr;
> + at least one type needs to be specified;
> + note the comma delimiter and lack of spaces between
> + those options
> + L1 : the threshold limit - lower range
> + L2 : the threshold limit - upper range
> + when enabling threshold notifications the lower level is
> + mandatory, whereas the upper one remains optional;
> + note though, that as those refer to the number of available
> + blocks, the lower level needs to be higher than the upper one
> +
> +Sample request could look like the following:
> +
> + echo /sample/mount/point warn,err,thr 710000 500000 > /sys/fs/events/config
> +
> +Multiple requests might be specified provided they are separated with a semicolon.
> +
> +The configuration itself might be modified at any time. One can add/remove
> +particular event types for a given filesystem, modify the threshold levels,
> +and remove single or all entries from the 'config' file.
> +
> + - Adding new event type:
> +
> + $ echo MOUNT EVENT_TYPE > /sys/fs/events/config
> +
> +(Note that it is enough to provide the event type to be enabled without
> +the already set ones.)
> +
> + - Removing event type:
> +
> + $ echo '!MOUNT EVENT_TYPE' > /sys/fs/events/config
> +
> + - Updating threshold limits:
> +
> + $ echo MOUNT thr L1 L2 > /sys/fs/events/config
> +
> + - Removing single entry:
> +
> + $ echo '!MOUNT' > /sys/fs/events/config
> +
> + - Removing all entries:
> +
> + $ echo > /sys/fs/events/config
> +
> +Reading the file will list all registered entries with their current set-up
> +along with some additional info like the id of the entry (@see more on generic
> +netlink section), the filesystem type and the backing device name if available.
> +
> +Final, though a very important note on the configuration: when and if the
> +actual events are being triggered falls way beyond the scope of the generic
> +filesystem events interface. It is up to a particular filesystem
> +implementation which events are to be supported - if any at all. So if
> +given filesystem does not support the event notifications, an attempt to
> +enable those through 'config' file will fail.
> +
> +
> +3. The generic netlink interface support:
> +=========================================
> +
> +Whenever an event notification is triggered (by a given filesystem) the current
> +configuration is validated to decide whether a userspace notification
> +should be launched. If there has been no request (by means of a 'config' file
> +entry) for the given event, it will be silently disregarded. If, on the other
> +hand, someone is 'watching' the given filesystem for specific events, a generic
> +netlink message will be sent.
> +
> +A dedicated multicast group has been provided solely for the purpose of
> +notifying any potential listeners of file system events. So in order to
> +receive such notifications, one should subscribe to this new multicast group.
> +
> +Each message type reflects the actual type of generated event (FS_EVENT_TYPE*)
> +Currently there are two supported message formats.
> +
> +There is a common message format representing an event generated by
> +a filesystem. The type of the event itself will be stored within
> +the generic netlink message header as the command field. The message
> +payload will provide more detailed info: the identifier of the filesystem
> +trace (generated upon registering the trace), the backing device major and
> +minor numbers, the event identifier and the id of the process whose action
> +led to the event occurrence. In case of threshold notifications, the current
> +number of available blocks will be included in the payload.
> +
> +
> + 0 1 2 3
> + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
> + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + | NETLINK MESSAGE HEADER |
> + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + | GENERIC NETLINK MESSAGE HEADER |
> + | (with event type as genlmsghdr cmd field) |
> + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + | Optional user specific message header |
> + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + | GENERIC MESSAGE PAYLOAD: |
> + +---------------------------------------------------------------+
> + | FS_EVENT_ATR_FS_ID (NLA_U32) |
> + +---------------------------------------------------------------+
> + | FS_EVENT_ATR_DEV_MAJOR (NLA_U32) (if available) |
> + +---------------------------------------------------------------+
> + | FS_EVENT_ATR_DEV_MINOR (NLA_U32) (if available) |
> + +---------------------------------------------------------------+
> + | FS_EVENT_ATR_ID (NLA_U32) |
> + +---------------------------------------------------------------+
> + | FS_EVENT_ATR_CAUSED_ID (NLA_U32) |
> + +---------------------------------------------------------------+
> + | FS_EVENT_ATR_DATA (NLA_U64) |
> + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> +
> +
> +The second supported message format represents an event of a new trace being
> +registered. It contains two attributes within the payload: the trace id and the
> +mount point for which the trace has been registered. This message is of type
> +FS_EVENT_TYPE_NEW_TRACE and is sent regardless of the actual event types
> +being watched whenever a new entry for the 'config' file is created. This
> +is supposed to ease parsing the messages by userspace applications and to help
> +to identify the origin of the event. It also reduces the size of the payload
> +as there is no need to send additional data such as mount point and the file
> +system type for each possible event.
> +
> + 0 1 2 3
> + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
> + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + | NETLINK MESSAGE HEADER |
> + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + | GENERIC NETLINK MESSAGE HEADER |
> + | (with event type as genlmsghdr cmd field) |
> + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + | Optional user specific message header |
> + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + | GENERIC MESSAGE PAYLOAD: |
> + + ------------------------------------------------------------- +
> + | FS_EVENT_ATR_FS_ID (NLA_U32) |
> + + ------------------------------------------------------------- +
> + | FS_EVENT_ATR_MOUNT (NLA_STRING) |
> + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> +
> +The above figures are based on:
> + http://www.linuxfoundation.org/collaborate/workgroups/networking/generic_netlink_howto#Message_Format
> +
> +
> +
> +4. API Reference:
> +=================
> +
> + 4.1 Generic file system event interface operations
> +
> + #include <linux/fs_event.h>
> +
> + struct fs_trace_operations {
> + int (*fs_trace_query)(struct super_block *, struct fs_trace_sdata *);
> + };
> +
> + Each filesystem supporting the event notifications should register its
> + file system trace operations. This can be done through new entry in
> + the super_block structure: the s_trace_ops. The fs_trace_query shall
> + be called whenever new trace entry for given filesystem is being created
> + or when threshold notifications are being requested for the first time.
> + The filesystem should specify then, which event types are being supported.
> + In case of threshold notifications the current number of available blocks
> + should be provided.
> +
> + 4.2 Event notification:
> +
> + #include <linux/fs_event.h>
> + void fs_event_notify(struct super_block *sb, unsigned int event_type,
> + unsigned int event_id);
> +
> + Notify the generic FS event interface of an occurring event.
> + This shall be used by any file system that wishes to inform any potential
> + listeners/watchers of a particular event.
> + - sb: the filesystem's super block
> + - event_type: the type of an event (one of the FS_EVENT_*)
> + - event_id: an event identifier
> +
> + 4.3 Threshold notifications:
> +
> + #include <linux/fs_event.h>
> + void fs_event_alloc_space(struct super_block *sb, u64 ncount);
> + void fs_event_free_space(struct super_block *sb, u64 ncount);
> +
> + Each filesystem supporting the threshold notifications should call
> + fs_event_alloc_space/fs_event_free_space respectively whenever the
> + amount of available blocks changes.
> + - sb: the filesystem's super block
> + - ncount: number of blocks being acquired/released
> +
> + Note that to properly handle the threshold notifications the fs events
> + interface needs to be kept up to date by the filesystems. Each should
> + register fs_trace_operations to enable querying the basic trace data,
> + among which, is the current number of the available blocks (fs_trace_query).
> +
> + 4.4 Sending message through generic netlink interface
> +
> + #include <linux/fs_event.h>
> + int fs_netlink_send_event(size_t size, unsigned int event_type,
> + int (*compose_msg)(struct sk_buff *skb, unsigned int event_id,
> + void *data),
> + unsigned int event_id, void *data);
> +
> + Although the fs event interface is fully responsible for sending the messages
> + over the netlink, filesystems might use the FS_EVENT multicast group to send
> + their own custom messages.
> + - size: the size of the message payload
> + - event_type: the type of an event: stored as message header's command
> + - compose_msg: a custom callback handling composing the message payload
> + - event_id: the event identifier
> + - data: message custom data
> +
> + Calling fs_netlink_send_event will result in a message being sent through
> + the FS_EVENT multicast group. Note that the body of the message should be
> + prepared (set up) by the caller - through the compose_msg callback. The message's
> + sk_buff will be allocated on behalf of the caller (thus the size parameter).
> + The compose_msg should only fill the payload with proper data.
> +
> +
> diff --git a/fs/Makefile b/fs/Makefile
> index a88ac48..798021d 100644
> --- a/fs/Makefile
> +++ b/fs/Makefile
> @@ -126,3 +126,4 @@ obj-y += exofs/ # Multiple modules
> obj-$(CONFIG_CEPH_FS) += ceph/
> obj-$(CONFIG_PSTORE) += pstore/
> obj-$(CONFIG_EFIVAR_FS) += efivarfs/
> +obj-y += events/
> diff --git a/fs/events/Makefile b/fs/events/Makefile
> new file mode 100644
> index 0000000..58d1454
> --- /dev/null
> +++ b/fs/events/Makefile
> @@ -0,0 +1,6 @@
> +#
> +# Makefile for the Linux Generic File System Event Interface
> +#
> +
> +obj-y := fs_event.o
> +obj-$(CONFIG_NET) += fs_event_netlink.o
> diff --git a/fs/events/fs_event.c b/fs/events/fs_event.c
> new file mode 100644
> index 0000000..8ebe371
> --- /dev/null
> +++ b/fs/events/fs_event.c
> @@ -0,0 +1,775 @@
> +/*
> + * Generic File System Events Interface
> + *
> + * Copyright(c) 2015 Samsung Electronics. All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms of the GNU General Public License version 2.
> + *
> + * The full GNU General Public License is included in this distribution in the
> + * file called COPYING.
> + *
> + * This program is distributed in the hope that it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
> + * more details.
> + */
> +#include <linux/fs.h>
> +#include <linux/hashtable.h>
> +#include <linux/idr.h>
> +#include <linux/module.h>
> +#include <linux/mount.h>
> +#include <linux/namei.h>
> +#include <linux/parser.h>
> +#include <linux/seq_file.h>
> +#include <linux/slab.h>
> +#include <net/genetlink.h>
> +#include "../mount.h"
> +#include "fs_event.h"
> +
> +#define FS_HASHTB_BITS 8
> +#define FS_HASHTB_SIZE (1 << FS_HASHTB_BITS)
> +
> +/**
> + * The FS event trace entries are being stored in a hashtable
> + * for fast entry look-up, and in a doubly-linked list
> + * to ease all the paths that need to go through all
> + * the entries.
> + */
> +static DEFINE_HASHTABLE(fs_trace_hashtbl, FS_HASHTB_BITS);
> +static LIST_HEAD(fs_trace_list);
> +static DEFINE_SPINLOCK(fs_trace_lock);
> +
> +static struct kmem_cache *fs_trace_cachep __read_mostly;
> +
> +/*
> + * Each registered FS event trace is being marked with
> + * a unique identifier managed by IDR
> + */
> +static struct idr fs_trace_idr;
> +static DEFINE_SPINLOCK(fs_trace_idr_lock);
> +
> +/*
> + * Threshold notification state bits.
> + * Note the reverse as this refers to the number
> + * of available blocks.
> + */
> +#define THRESH_LR_BELOW 0x0001 /* Falling below the lower range */
> +#define THRESH_LR_BEYOND 0x0002
> +#define THRESH_UR_BELOW 0x0004
> +#define THRESH_UR_BEYOND 0x0008 /* Going beyond the upper range */
> +
> +#define THRESH_LR_ON (THRESH_LR_BELOW | THRESH_LR_BEYOND)
> +#define THRESH_UR_ON (THRESH_UR_BELOW | THRESH_UR_BEYOND)
> +
> +#define FS_TRACE_ADD 0x100000
> +
> +struct fs_trace_entry {
> + struct list_head node;
> + struct hlist_node hnode;
> + struct path path;
> + struct fs_trace_sdata data;
> + int mark;
> + unsigned int notify_mask;
> + struct fs_event_thresh {
> + u64 lrange;
> + u64 urange;
> + unsigned int state;
> + } thresh;
> + spinlock_t lock;
> +};
> +
> +static const match_table_t fs_etypes = {
> + { FS_EVENT_INFO, "info" },
> + { FS_EVENT_WARN, "warn" },
> + { FS_EVENT_THRESH, "thr" },
> + { FS_EVENT_ERR, "err" },
> + { 0, NULL },
> +};
> +
> +#define fs_trace_sb(en) ((en)->path.mnt->mnt_sb)
> +
> +#define fs_trace_query_data(sb, arg) \
> + (((sb)->s_trace_ops && (sb)->s_trace_ops->fs_trace_query) ? \
> + (sb)->s_trace_ops->fs_trace_query((sb), arg) : -EINVAL)
> +
> +#define fs_event_type_cast(event_type) (ffs(event_type))
> +
> +static inline unsigned int fs_trace_hasfn(const struct super_block *sb)
> +{
> + return ((unsigned long)sb >> L1_CACHE_SHIFT) & (FS_HASHTB_SIZE - 1);
> +}
> +
> +static struct fs_trace_entry *fs_find_trace_entry(struct super_block *sb)
> +{
> + struct fs_trace_entry *en;
> + unsigned long hash;
> +
> + if (list_empty(&fs_trace_list))
> + return ERR_PTR(-EINVAL);
> + hash = fs_trace_hasfn(sb);
> + hash_for_each_possible(fs_trace_hashtbl, en, hnode, hash)
> + if (fs_trace_sb(en) == sb)
> + return en;
> + return ERR_PTR(-EINVAL);
> +}
> +
> +static inline void fs_trace_entry_list_del(struct fs_trace_entry *en)
> +{
> + spin_lock(&en->lock);
> + list_del(&en->node);
> + hash_del(&en->hnode);
> + spin_unlock(&en->lock);
> +}
> +
> +static inline void fs_trace_entry_idr_remove(struct fs_trace_entry *en)
> +{
> + spin_lock(&fs_trace_idr_lock);
> + idr_remove(&fs_trace_idr, en->mark);
> + spin_unlock(&fs_trace_idr_lock);
> +}
> +
> +static inline void fs_trace_entry_free(struct fs_trace_entry *en)
> +{
> + kmem_cache_free(fs_trace_cachep, en);
> +}
> +
> +static inline void fs_destroy_trace_entry(struct fs_trace_entry *en)
> +{
> + fs_trace_entry_list_del(en);
> + fs_trace_entry_idr_remove(en);
> + fs_trace_entry_free(en);
> +}
> +
> +static int fs_remove_trace_entry(struct super_block *sb)
> +{
> + struct fs_trace_entry *en;
> + int ret = -EINVAL;
> +
> + spin_lock(&fs_trace_lock);
> + en = fs_find_trace_entry(sb);
> + if (!IS_ERR(en)) {
> + fs_destroy_trace_entry(en);
> + ret = 0;
> + }
> + spin_unlock(&fs_trace_lock);
> + return ret;
> +}
> +
> +static void fs_remove_all_traces(void)
> +{
> + struct fs_trace_entry *en, *guard;
> +
> + spin_lock(&fs_trace_lock);
> + list_for_each_entry_safe(en, guard, &fs_trace_list, node)
> + fs_destroy_trace_entry(en);
> + spin_unlock(&fs_trace_lock);
> +}
> +
> +static int fs_event_new_trace_create_msg(struct sk_buff *skb,
> + unsigned int event_id, void *data)
> +{
> + struct fs_trace_entry *en = (struct fs_trace_entry *)data;
> + char *path, *mount_dir;
> + int ret;
> +
> + path = kzalloc(PATH_MAX, GFP_KERNEL);
> + if (!path)
> + return -EINVAL;
> + mount_dir = d_path(&en->path, path, PATH_MAX - 1);
> + if (IS_ERR(mount_dir))
> + mount_dir = "unknown";
> +
> + ret = nla_put_u32(skb, FS_EVENT_ATR_FS_ID, en->mark);
> + if (ret)
> + goto leave;
> + ret = nla_put_string(skb, FS_EVENT_ATR_MOUNT, mount_dir);
> +
> +leave:
> + kfree(path);
> + return ret;
> +}
> +
> +static int fs_event_common_create_msg(struct sk_buff *skb,
> + unsigned int event_id, void *data)
> +{
> + struct fs_trace_entry *en = (struct fs_trace_entry *)data;
> + struct super_block *sb = fs_trace_sb(en);
> +
> + if (nla_put_u32(skb, FS_EVENT_ATR_FS_ID, en->mark))
> + return -EINVAL;
> +
> + /* In case there is no backing dev, so skip the followng */
> + if (sb->s_bdev && MAJOR(sb->s_dev))
> + if (nla_put_u32(skb, FS_EVENT_ATR_DEV_MAJOR, MAJOR(sb->s_dev))
> + || nla_put_u32(skb, FS_EVENT_ATR_DEV_MINOR, MINOR(sb->s_dev)))
> + return -EINVAL;
> +
> + if (nla_put_u32(skb, FS_EVENT_ATR_ID, event_id))
> + return -EINVAL;
> + if (nla_put_u64(skb, FS_EVENT_ATR_CAUSED_ID, pid_nr(task_pid(current))))
> + return -EINVAL;
> +
> + if (event_id & (FS_THRESH_LR_REACHED | FS_THRESH_UR_REACHED))
> + return nla_put_u64(skb, FS_EVENT_ATR_DATA,
> + en->data.available_blks);
> +
> + return 0;
> +}
> +
> +static void fs_event_new_trace(struct fs_trace_entry *en)
> +{
> + fs_netlink_send_event(GENLMSG_DEFAULT_SIZE, FS_EVENT_TYPE_NEW_TRACE,
> + fs_event_new_trace_create_msg, 0, en);
> +}
> +
> +static void fs_event_send(struct fs_trace_entry *en,
> + unsigned int event_type, unsigned int event_id)
> +{
> + size_t size = nla_total_size(sizeof(u32)) * 4 +
> + nla_total_size(sizeof(u64)) * 2;
> +
> + fs_netlink_send_event(size, fs_event_type_cast(event_type),
> + fs_event_common_create_msg, event_id, en);
> +}
> +
> +void fs_event_notify(struct super_block *sb, unsigned int event_type,
> + unsigned int event_id)
> +{
> + struct fs_trace_entry *en;
> +
> + spin_lock(&fs_trace_lock);
> + en = fs_find_trace_entry(sb);
> + if (IS_ERR(en)) {
> + spin_unlock(&fs_trace_lock);
> + return;
> + }
> +
> + spin_lock(&en->lock);
> + /* Relase the main lock - it's enough to keep the entry lock here */
> + spin_unlock(&fs_trace_lock);
> + if (en->notify_mask & event_type)
> + fs_event_send(en, event_type, event_id);
> + spin_unlock(&en->lock);
> +}
> +EXPORT_SYMBOL(fs_event_notify);
> +
> +void fs_event_alloc_space(struct super_block *sb, u64 ncount)
> +{
> + struct fs_trace_entry *en;
> + s64 count;
> +
> + spin_lock(&fs_trace_lock);
> + en = fs_find_trace_entry(sb);
> + if (IS_ERR(en)) {
> + spin_unlock(&fs_trace_lock);
> + return;
> + }
> +
> + spin_lock(&en->lock);
> + spin_unlock(&fs_trace_lock);
> +
> + if (!(en->notify_mask & FS_EVENT_THRESH))
> + goto leave;
> + /* we shouldn't drop below 0 here, unless there is a sync issue
> + somewhere (?) */
> + count = en->data.available_blks - ncount;
> + en->data.available_blks = count < 0 ? 0 : count;
> +
> + if (en->data.available_blks > en->thresh.lrange)
> + /* Not 'even' close - leave */
> + goto leave;
> +
> + if (en->data.available_blks > en->thresh.urange) {
> + /* Close enough - the lower range has been reached */
> + if (!(en->thresh.state & THRESH_LR_BEYOND)) {
> + /* Send notificaton */
> + fs_event_send(en, FS_EVENT_THRESH,
> + FS_THRESH_LR_REACHED);
> + en->thresh.state &= ~THRESH_LR_BELOW;
> + en->thresh.state |= THRESH_LR_BEYOND;
> + }
> + goto leave;
> + }
> + if (!(en->thresh.state & THRESH_UR_BEYOND)) {
> + fs_event_send(en, FS_EVENT_THRESH, FS_THRESH_UR_REACHED);
> + en->thresh.state &= ~THRESH_UR_BELOW;
> + en->thresh.state |= THRESH_UR_BEYOND;
> + }
> +
> +leave:
> + spin_unlock(&en->lock);
> +}
> +EXPORT_SYMBOL(fs_event_alloc_space);
> +
> +void fs_event_free_space(struct super_block *sb, u64 ncount)
> +{
> + struct fs_trace_entry *en;
> +
> + spin_lock(&fs_trace_lock);
> + en = fs_find_trace_entry(sb);
> + if (IS_ERR(en)) {
> + spin_unlock(&fs_trace_lock);
> + return;
> + }
> +
> + spin_lock(&en->lock);
> + spin_unlock(&fs_trace_lock);
> +
> + if (!(en->notify_mask & FS_EVENT_THRESH))
> + goto leave;
> +
> + en->data.available_blks += ncount;
> +
> + if (en->data.available_blks > en->thresh.lrange) {
> + if (!(en->thresh.state & THRESH_LR_BELOW)
> + && en->thresh.state & THRESH_LR_BEYOND) {
> + /* Send notificaton */
> + fs_event_send(en, FS_EVENT_THRESH,
> + FS_THRESH_LR_REACHED);
> + en->thresh.state &= ~THRESH_LR_BEYOND;
> + en->thresh.state |= THRESH_LR_BELOW;
> + goto leave;
> + }
> + }
> + if (en->data.available_blks > en->thresh.urange) {
> + if (!(en->thresh.state & THRESH_UR_BELOW)
> + && en->thresh.state & THRESH_UR_BEYOND) {
> + /* Notify */
> + fs_event_send(en, FS_EVENT_THRESH,
> + FS_THRESH_UR_REACHED);
> + en->thresh.state &= ~THRESH_UR_BEYOND;
> + en->thresh.state |= THRESH_UR_BELOW;
> + }
> + }
> +leave:
> + spin_unlock(&en->lock);
> +}
> +EXPORT_SYMBOL(fs_event_free_space);
> +
> +void fs_event_mount_dropped(struct vfsmount *mnt)
> +{
> + struct fs_trace_entry *en;
> +
> + spin_lock(&fs_trace_lock);
> + en = fs_find_trace_entry(mnt->mnt_sb);
> + if (!IS_ERR(en)) {
> + spin_lock(&en->lock);
> + if (en->notify_mask & FS_EVENT_INFO)
> + fs_event_send(en, FS_EVENT_TYPE_INFO, FS_INFO_UMOUNT);
> + spin_unlock(&en->lock);
> + fs_destroy_trace_entry(en);
> + }
> + spin_unlock(&fs_trace_lock);
> +}
> +
> +static int fs_new_trace_entry(struct path *path, struct fs_event_thresh *thresh,
> + unsigned int nmask)
> +{
> + struct fs_trace_entry *en;
> + struct super_block *sb;
> + struct mount *r_mnt;
> +
> + en = kmem_cache_zalloc(fs_trace_cachep, GFP_KERNEL);
> + if (unlikely(!en))
> + return -ENOMEM;
> + /*
> + * Note that no reference is being taken here for the path as it would
> + * make the umount unnecessarily puzzling (due to an extra 'valid'
> + * reference for the mnt).
> + * This is *rather* safe as the notification on mount being dropped
> + * will get called prior to releasing the super block - so right
> + * in time to send the event and perform appropraite clean-up
> + */
> + r_mnt = real_mount(path->mnt);
> + en->path.dentry = r_mnt->mnt.mnt_root;
> + en->path.mnt = &r_mnt->mnt;
> +
> + sb = fs_trace_sb(en);
> + spin_lock_init(&en->lock);
> +
> + spin_lock(&fs_trace_idr_lock);
> + idr_preload(GFP_KERNEL);
> + en->mark = idr_alloc_cyclic(&fs_trace_idr, en, 1, 0, GFP_KERNEL);
> + idr_preload_end();
> + spin_unlock(&fs_trace_idr_lock);
> +
> + if (en->mark < 0)
> + goto leave;
> + if (fs_trace_query_data(sb, &en->data))
> + goto leave;
> +
> + nmask = en->data.events_cap_mask & nmask;
> + if (!nmask)
> + goto leave;
> + en->notify_mask = nmask;
> + memcpy(&en->thresh, thresh, offsetof(struct fs_event_thresh, state));
> +
> + spin_lock(&fs_trace_lock);
> + list_add(&en->node, &fs_trace_list);
> + hash_add(fs_trace_hashtbl, &en->hnode, fs_trace_hasfn(sb));
> + spin_unlock(&fs_trace_lock);
> +
> + fs_event_new_trace(en);
> + return 0;
> +leave:
> + kmem_cache_free(fs_trace_cachep, en);
> + return -EINVAL;
> +}
> +
> +static int fs_update_trace_entry_locked(struct fs_trace_entry *en,
> + struct fs_event_thresh *thresh,
> + unsigned int nmask)
> +{
> + int extend = nmask & FS_TRACE_ADD;
> +
> + nmask &= en->data.events_cap_mask;
> + if (!nmask)
> + return -EINVAL;
> +
> + if (nmask & FS_EVENT_THRESH) {
> + if (extend) {
> + /* Get the current state */
> + if (!(en->notify_mask & FS_EVENT_THRESH))
> + fs_trace_query_data(fs_trace_sb(en),
> + &en->data);
> + if (thresh->state & THRESH_LR_ON) {
> + en->thresh.lrange = thresh->lrange;
> + en->thresh.state &= ~THRESH_LR_ON;
> + }
> + if (thresh->state & THRESH_UR_ON) {
> + en->thresh.urange = thresh->urange;
> + en->thresh.state &= ~THRESH_UR_ON;
> + }
> + } else {
> + memset(&en->thresh, 0, sizeof(en->thresh));
> + }
> + }
> +
> + if (extend)
> + en->notify_mask |= nmask;
> + else
> + en->notify_mask &= ~nmask;
> + return 0;
> +}
> +
> +static int fs_update_trace_entry(struct path *path,
> + struct fs_event_thresh *thresh,
> + unsigned int nmask)
> +{
> + struct fs_trace_entry *en;
> + int ret;
> +
> + spin_lock(&fs_trace_lock);
> + en = fs_find_trace_entry(path->mnt->mnt_sb);
> + if (IS_ERR(en)) {
> + spin_unlock(&fs_trace_lock);
> + return (nmask & FS_TRACE_ADD)
> + ? fs_new_trace_entry(path, thresh, nmask)
> + : -EINVAL;
> + }
> + spin_lock(&en->lock);
> + spin_unlock(&fs_trace_lock);
> +
> + ret = fs_update_trace_entry_locked(en, thresh, nmask);
> +
> + spin_unlock(&en->lock);
> + return ret;
> +}
> +
> +static int fs_parse_trace_request(int argc, char **argv)
> +{
> + struct fs_event_thresh thresh = {0};
> + struct path path;
> + substring_t args[MAX_OPT_ARGS];
> + unsigned int nmask = FS_TRACE_ADD;
> + int token;
> + char *s;
> + int ret = -EINVAL;
> +
> + if (!argc) {
> + fs_remove_all_traces();
> + return 0;
> + }
> +
> + s = *(argv++);
> + if (*s == '!') {
> + /* Clear the trace entry */
> + nmask &= ~FS_TRACE_ADD;
> + ++s;
> + }
> +
> + if (kern_path_mountpoint(AT_FDCWD, s, &path, LOOKUP_FOLLOW))
> + return -EINVAL;
> +
> + if (!(--argc)) {
> + if (!(nmask & FS_TRACE_ADD))
> + ret = fs_remove_trace_entry(path.mnt->mnt_sb);
> + goto leave;
> + }
> +
> + while ((s = strsep(argv, ",")) != NULL) {
> + if (!*s)
> + continue;
> + args[0].to = args[0].from = NULL;
> + token = match_token(s, fs_etypes, args);
> + nmask |= (token & FS_EVENTS_ALL);
> + }
> +
> + if (!(nmask & (~FS_TRACE_ADD)) ||
> + (!(--argc) && (nmask & FS_EVENT_THRESH && nmask & FS_TRACE_ADD)))
> + goto leave;
> +
> + if ((nmask & FS_EVENT_THRESH) && (nmask & FS_TRACE_ADD)) {
> + /*
> + * Get the threshold config data:
> + * lower range
> + * upper range
> + */
> + ret = kstrtoull(*(++argv), 10, &thresh.lrange);
> + if (ret)
> + goto leave;
> +
> + thresh.state |= THRESH_LR_ON;
> +
> + if ((--argc)) {
> + ret = kstrtoull(*(++argv), 10, &thresh.urange);
> + if (ret)
> + goto leave;
> + thresh.state |= THRESH_UR_ON;
> + }
> + /* The thresholds are based on number of available blocks */
> + if (thresh.lrange < thresh.urange) {
> + ret = -EINVAL;
> + goto leave;
> + }
> +
> + }
> + ret = fs_update_trace_entry(&path, &thresh, nmask);
> +leave:
> + path_put(&path);
> + return ret;
> +}
> +
> +#define DEFAULT_BUF_SIZE PAGE_SIZE
> +
> +static ssize_t fs_trace_write(struct file *file, const char __user *buffer,
> + size_t count, loff_t *ppos)
> +{
> + char **argv;
> + char *kern_buf, *next, *cfg;
> + size_t size, dcount = 0;
> + int argc;
> +
> + if (!count)
> + return 0;
> +
> + kern_buf = kmalloc(DEFAULT_BUF_SIZE, GFP_KERNEL);
> + if (!kern_buf)
> + return -ENOMEM;
> +
> + while (dcount < count) {
> +
> + size = count - dcount;
> + if (size >= DEFAULT_BUF_SIZE)
> + size = DEFAULT_BUF_SIZE - 1;
> + if (copy_from_user(kern_buf, buffer + dcount, size)) {
> + dcount = -EINVAL;
> + goto leave;
> + }
> +
> + kern_buf[size] = '\0';
> +
> + next = cfg = kern_buf;
> +
> + do {
> + next = strchr(cfg, ';');
> + if (next)
> + *next = '\0';
> +
> + argv = argv_split(GFP_KERNEL, cfg, &argc);
> + if (!argv) {
> + dcount = -ENOMEM;
> + goto leave;
> + }
> +
> + if (fs_parse_trace_request(argc, argv)) {
> + dcount = -EINVAL;
> + argv_free(argv);
> + goto leave;
> + }
> +
> + argv_free(argv);
> + if (next)
> + cfg = ++next;
> +
> + } while (next);
> + dcount += size;
> + }
> +leave:
> + kfree(kern_buf);
> + return dcount;
> +}
> +
> +static void *fs_trace_seq_start(struct seq_file *m, loff_t *pos)
> +{
> + spin_lock(&fs_trace_lock);
> + return seq_list_start(&fs_trace_list, *pos);
> +}
> +
> +static void *fs_trace_seq_next(struct seq_file *m, void *v, loff_t *pos)
> +{
> + return seq_list_next(v, &fs_trace_list, pos);
> +}
> +
> +static void fs_trace_seq_stop(struct seq_file *m, void *v)
> +{
> + spin_unlock(&fs_trace_lock);
> +}
> +
> +static int fs_trace_seq_show(struct seq_file *m, void *v)
> +{
> + struct fs_trace_entry *en;
> + struct super_block *sb;
> + struct mount *r_mnt;
> + const struct match_token *match;
> + unsigned int nmask;
> +
> + en = list_entry(v, struct fs_trace_entry, node);
> + sb = fs_trace_sb(en);
> +
> + seq_printf(m, "%d ", en->mark);
> +
> + seq_path(m, &en->path, "\t\n\\");
> + seq_putc(m, ' ');
> +
> + seq_escape(m, sb->s_type->name, " \t\n\\");
> + if (sb->s_subtype && sb->s_subtype[0]) {
> + seq_putc(m, '.');
> + seq_escape(m, sb->s_subtype, " \t\n\\");
> + }
> +
> + seq_putc(m, ' ');
> + if (sb->s_op->show_devname) {
> + sb->s_op->show_devname(m, en->path.mnt->mnt_root);
> + } else {
> + r_mnt = real_mount(en->path.mnt);
> + seq_escape(m, r_mnt->mnt_devname ? r_mnt->mnt_devname : "none",
> + " \t\n\\");
> + }
> + seq_puts(m, " (");
> +
> + nmask = en->notify_mask;
> + for (match = fs_etypes; match->pattern; ++match) {
> + if (match->token & nmask) {
> + seq_puts(m, match->pattern);
> + nmask &= ~match->token;
> + if (nmask)
> + seq_putc(m, ',');
> + }
> + }
> + seq_printf(m, " %llu %llu", en->thresh.lrange,
> + en->thresh.urange);
> + seq_puts(m, ")\n");
> + return 0;
> +}
> +
> +static const struct seq_operations fs_trace_seq_ops = {
> + .start = fs_trace_seq_start,
> + .next = fs_trace_seq_next,
> + .stop = fs_trace_seq_stop,
> + .show = fs_trace_seq_show,
> +};
> +
> +static int fs_trace_open(struct inode *inode, struct file *file)
> +{
> + return seq_open(file, &fs_trace_seq_ops);
> +}
> +
> +static const struct file_operations fs_trace_fops = {
> + .owner = THIS_MODULE,
> + .open = fs_trace_open,
> + .write = fs_trace_write,
> + .read = seq_read,
> + .llseek = seq_lseek,
> + .release = seq_release,
> +};
> +
> +static int fs_trace_init(void)
> +{
> + fs_trace_cachep = KMEM_CACHE(fs_trace_entry, 0);
> + if (!fs_trace_cachep)
> + return -EINVAL;
> + if (!fs_event_netlink_register()) {
> + idr_init(&fs_trace_idr);
> + return 0;
> + }
> + kmem_cache_destroy(fs_trace_cachep);
> + return -EINVAL;
> +}
> +
> +/* VFS support */
> +static int fs_trace_fill_super(struct super_block *sb, void *data, int silent)
> +{
> + int ret;
> + static struct tree_descr desc[] = {
> + [2] = {
> + .name = "config",
> + .ops = &fs_trace_fops,
> + .mode = S_IWUSR | S_IRUGO,
> + },
> + {""},
> + };
> +
> + ret = simple_fill_super(sb, 0x7246332, desc);
> + return !ret ? fs_trace_init() : ret;
> +}
> +
> +static struct dentry *fs_trace_do_mount(struct file_system_type *fs_type,
> + int ntype, const char *dev_name, void *data)
> +{
> + return mount_single(fs_type, ntype, data, fs_trace_fill_super);
> +}
> +
> +static void fs_trace_kill_super(struct super_block *sb)
> +{
> + fs_remove_all_traces();
> + idr_destroy(&fs_trace_idr);
> + fs_event_netlink_unregister();
> + kmem_cache_destroy(fs_trace_cachep);
> + kill_litter_super(sb);
> +}
> +
> +static struct kset *fs_trace_kset;
> +static struct vfsmount *fs_trace_mount;
> +
> +static struct file_system_type fs_trace_fstype = {
> + .name = "fstrace",
> + .mount = fs_trace_do_mount,
> + .kill_sb = fs_trace_kill_super,
> +};
> +
> +static void __init fs_trace_vfs_init(void)
> +{
> + fs_trace_kset = kset_create_and_add("events", NULL, fs_kobj);
> +
> + if (!fs_trace_kset)
> + return;
> +
> + if (!register_filesystem(&fs_trace_fstype)) {
> + fs_trace_mount = kern_mount(&fs_trace_fstype);
> + if (!IS_ERR(fs_trace_mount))
> + return;
> +
> + unregister_filesystem(&fs_trace_fstype);
> + }
> + kset_unregister(fs_trace_kset);
> +}
> +
> +static int __init fs_trace_events_init(void)
> +{
> + fs_trace_vfs_init();
> + return 0;
> +};
> +module_init(fs_trace_events_init);
> +
> diff --git a/fs/events/fs_event.h b/fs/events/fs_event.h
> new file mode 100644
> index 0000000..4260ce5
> --- /dev/null
> +++ b/fs/events/fs_event.h
> @@ -0,0 +1,27 @@
> +/*
> + * Copyright(c) 2015 Samsung Electronics. All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms of the GNU General Public License version 2.
> + *
> + * The full GNU General Public License is included in this distribution in the
> + * file called COPYING.
> + *
> + * This program is distributed in the hope that it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
> + * more details.
> + */
> +
> +#ifndef __GENERIC_FS_EVENTS_H
> +#define __GENERIC_FS_EVENTS_H
> +
> +#ifdef CONFIG_NET
> +int fs_event_netlink_register(void);
> +void fs_event_netlink_unregister(void);
> +#else /* CONFIG_NET */
> +static inline int fs_event_netlink_register(void) { return -ENOSYS; }
> +static inline void fs_event_netlink_unregister(void) {};
> +#endif /* CONFIG_NET */
> +
> +#endif /* __GENERIC_FS_EVENTS_H */
> diff --git a/fs/events/fs_event_netlink.c b/fs/events/fs_event_netlink.c
> new file mode 100644
> index 0000000..9c56e35
> --- /dev/null
> +++ b/fs/events/fs_event_netlink.c
> @@ -0,0 +1,94 @@
> +/*
> + * Copyright(c) 2015 Samsung Electronics. All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms of the GNU General Public License version 2.
> + *
> + * The full GNU General Public License is included in this distribution in the
> + * file called COPYING.
> + *
> + * This program is distributed in the hope that it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
> + * more details.
> + */
> +#include <linux/fs.h>
> +#include <linux/init.h>
> +#include <linux/kernel.h>
> +#include <linux/sched.h>
> +#include <linux/slab.h>
> +#include <net/netlink.h>
> +#include <net/genetlink.h>
> +
> +static const struct genl_multicast_group fs_event_mcgroups[] = {
> + { .name = "event", },
> +};
> +
> +static struct genl_family fs_event_family = {
> + .id = GENL_ID_FS_EVENT,
> + .hdrsize = 0,
> + .name = "FS_EVENT",
> + .version = 1,
> + .maxattr = FS_EVENT_ATR_MAX,
> + .mcgrps = fs_event_mcgroups,
> + .n_mcgrps = ARRAY_SIZE(fs_event_mcgroups),
> +};
> +
> +int fs_netlink_send_event(size_t size, unsigned int event_type,
> + int (*compose_msg)(struct sk_buff *skb,
> + unsigned int event_id, void *data),
> + unsigned int event_id, void *data)
> +{
> + static atomic_t seq;
> + struct sk_buff *skb;
> + void *msg_head;
> + int ret = 0;
> +
> + if (!size || !compose_msg)
> + return -EINVAL;
> +
> + size += nla_total_size(sizeof(u64));
> + skb = genlmsg_new(size, GFP_NOFS);
> +
> + if (!skb) {
> + pr_err("Failed to allocate new FS generic netlink message\n");
> + return -ENOMEM;
> + }
> +
> + msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
> + &fs_event_family, 0, event_type);
> + if (!msg_head)
> + goto cleanup;
> +
> + ret = compose_msg(skb, event_id, data);
> + if (ret) {
> + genlmsg_cancel(skb, msg_head);
> + goto cleanup;
> + }
> +
> + genlmsg_end(skb, msg_head);
> + ret = genlmsg_multicast(&fs_event_family, skb, 0, 0, GFP_NOWAIT);
> + if (ret && ret != -ENOBUFS && ret != -ESRCH)
> + goto cleanup;
> +
> + return ret;
> +cleanup:
> + nlmsg_free(skb);
> + return ret;
> +}
> +EXPORT_SYMBOL(fs_netlink_send_event);
> +
> +int fs_event_netlink_register(void)
> +{
> + int ret;
> +
> + ret = genl_register_family(&fs_event_family);
> + if (ret)
> + pr_err("Failed to register FS netlink interface\n");
> + return ret;
> +}
> +
> +void fs_event_netlink_unregister(void)
> +{
> + genl_unregister_family(&fs_event_family);
> +}
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 82ef140..ec6e2ef 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -1031,6 +1031,7 @@ static void cleanup_mnt(struct mount *mnt)
> if (unlikely(mnt->mnt_pins.first))
> mnt_pin_kill(mnt);
> fsnotify_vfsmount_delete(&mnt->mnt);
> + fs_event_mount_dropped(&mnt->mnt);
> dput(mnt->mnt.mnt_root);
> deactivate_super(mnt->mnt.mnt_sb);
> mnt_free_id(mnt);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index b4d71b5..bb529af 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -263,6 +263,10 @@ struct iattr {
> * Includes for diskquotas.
> */
> #include <linux/quota.h>
> +/*
> + * Include for Generic File System Events Interface
> + */
> +#include <linux/fs_event.h>
>
> /*
> * Maximum number of layers of fs stack. Needs to be limited to
> @@ -1233,6 +1237,7 @@ struct super_block {
> const struct dquot_operations *dq_op;
> const struct quotactl_ops *s_qcop;
> const struct export_operations *s_export_op;
> + const struct fs_trace_operations *s_trace_ops;
> unsigned long s_flags;
> unsigned long s_magic;
> struct dentry *s_root;
> @@ -1253,7 +1258,6 @@ struct super_block {
> struct hlist_node s_instances;
> unsigned int s_quota_types; /* Bitmask of supported quota types */
> struct quota_info s_dquot; /* Diskquota specific options */
> -
> struct sb_writers s_writers;
>
> char s_id[32]; /* Informational name */
> diff --git a/include/linux/fs_event.h b/include/linux/fs_event.h
> new file mode 100644
> index 0000000..1e128d8
> --- /dev/null
> +++ b/include/linux/fs_event.h
> @@ -0,0 +1,69 @@
> +/*
> + * Generic File System Events Interface
> + *
> + * Copyright(c) 2015 Samsung Electronics. All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms of the GNU General Public License version 2.
> + *
> + * The full GNU General Public License is included in this distribution in the
> + * file called COPYING.
> + *
> + * This program is distributed in the hope that it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
> + * more details.
> + */
> +#ifndef _LINUX_GENERIC_FS_EVETS_
> +#define _LINUX_GENERIC_FS_EVETS_
> +#include <net/netlink.h>
> +#include <uapi/linux/fs_event.h>
> +
> +/*
> + * These event flags match the event types sent through the netlink interface,
> + * so take care when making any modifications.
> + */
> +#define FS_EVENT_INFO 0x001
> +#define FS_EVENT_WARN 0x002
> +#define FS_EVENT_ERR 0x004
> +#define FS_EVENT_THRESH 0x008
> +
> +#define FS_EVENTS_ALL \
> + (FS_EVENT_INFO | FS_EVENT_WARN | FS_EVENT_THRESH | FS_EVENT_ERR)
> +
> +struct fs_trace_sdata {
> + /* Supported notification types */
> + unsigned int events_cap_mask;
> + /* Number of available/reachable blocks */
> + u64 available_blks;
> +};
> +
> +struct fs_trace_operations {
> + int (*fs_trace_query)(struct super_block *, struct fs_trace_sdata *);
> +};
> +
> +
> +void fs_event_notify(struct super_block *sb, unsigned int event_type,
> + unsigned int event_id);
> +void fs_event_alloc_space(struct super_block *sb, u64 ncount);
> +void fs_event_free_space(struct super_block *sb, u64 ncount);
> +void fs_event_mount_dropped(struct vfsmount *mnt);
> +
> +#ifdef CONFIG_NET
> +int fs_netlink_send_event(size_t size, unsigned int event_type,
> + int (*compose_msg)(struct sk_buff *skb, unsigned int event_id,
> + void *data),
> + unsigned int event_id, void *data);
> +#else /* CONFIG_NET */
> +static inline
> +int fs_netlink_send_event(size_t size, unsigned int event_type,
> + int (*compose_msg)(struct sk_buff *skb, unsigned int event_id,
> + void *data),
> + unsigned int event_idid, void *data)
> +{
> + return -ENOSYS;
> +}
> +#endif /* CONFIG_NET */
> +
> +#endif /* _LINUX_GENERIC_FS_EVENTS_ */
> +
> diff --git a/include/uapi/linux/fs_event.h b/include/uapi/linux/fs_event.h
> new file mode 100644
> index 0000000..dd79953
> --- /dev/null
> +++ b/include/uapi/linux/fs_event.h
> @@ -0,0 +1,62 @@
> +/*
> + * Generic netlink support for Generic File System Events Interface
> + *
> + * Copyright(c) 2015 Samsung Electronics. All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms of the GNU General Public License version 2.
> + *
> + * The full GNU General Public License is included in this distribution in the
> + * file called COPYING.
> + *
> + * This program is distributed in the hope that it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
> + * more details.
> + */
> +#ifndef _UAPI_LINUX_GENERIC_FS_EVENTS_
> +#define _UAPI_LINUX_GENERIC_FS_EVENTS_
> +/*
> + * Generic FS event types
> + */
> +enum {
> + FS_EVENT_TYPE_NONE,
> + FS_EVENT_TYPE_INFO,
> + FS_EVENT_TYPE_WARN,
> + FS_EVENT_TYPE_ERR,
> + FS_EVENT_TYPE_THRESH,
> + FS_EVENT_TYPE_NEW_TRACE,
> + __FS_EVENT_TYPE_MAX,
> +};
> +#define FS_EVENT_TYPE_MAX (__FS_EVENT_TYPE_MAX - 1)
> +/*
> + * Generic netlink attribute types
> + */
> +enum {
> + FS_EVENT_ATR_NONE,
> + FS_EVENT_ATR_FS_ID, /* An identifier of traced fs */
> + FS_EVENT_ATR_MOUNT, /* Mount point directory name */
> + FS_EVENT_ATR_DEV_MAJOR,
> + FS_EVENT_ATR_DEV_MINOR,
> + FS_EVENT_ATR_ID,
> + FS_EVENT_ATR_CAUSED_ID,
> + FS_EVENT_ATR_DATA,
> + __FS_EVENT_ATR_MAX,
> +};
> +#define FS_EVENT_ATR_MAX (__FS_EVENT_ATR_MAX - 1)
FS_EVENT_ATTR_ ? Most of the time, the kernel seems to use "attr" as shorthand
for "attribute".
> +
> +/*
> + * Supported set of FS events ids
> + */
> +#define FS_INFO_UMOUNT 0x00000001 /* File system unmounted */
> +#define FS_WARN_UNKNOWN 0x00000004 /* Unknown warning */
> +#define FS_WARN_ENOSPC 0x00000008 /* No space left to reserve data blks */
> +#define FS_WANR_ENOSPC_META 0x00000010 /* No space left for metadata */
Why WANR, as opposed to WARN?
> +#define FS_THRESH_LR_REACHED 0x00000020 /* The lower range of threshold has been reached */
> +#define FS_THRESH_UR_REACHED 0x00000040 /* The upper range of threshold has been reached */
> +#define FS_ERR_UNKNOWN 0x00000080 /* Unknown error */
> +#define FS_ERR_RO_REMOUT 0x00000100 /* The file system has been remounted as red-only */
_REMOUNT... read-only...
> +#define FS_ERR_ITERNAL 0x00000200 /* File system's internal error */
_INTERNAL...
What does FS_ERR_ITERNAL mean? "programming error"?
How about a separate FS_ERR_CORRUPTED to mean "go run fsck"?
Hmm, these are bit flags... it doesn't make sense that I can send things like
FS_INFO_UMOUNT | FS_ERR_RO_REMOUT.
> +
> +#endif /* _UAPI_LINUX_GENERIC_FS_EVENTS_ */
> +
> diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h
> index c3363ba..6464129 100644
> --- a/include/uapi/linux/genetlink.h
> +++ b/include/uapi/linux/genetlink.h
> @@ -29,6 +29,7 @@ struct genlmsghdr {
> #define GENL_ID_CTRL NLMSG_MIN_TYPE
> #define GENL_ID_VFS_DQUOT (NLMSG_MIN_TYPE + 1)
> #define GENL_ID_PMCRAID (NLMSG_MIN_TYPE + 2)
> +#define GENL_ID_FS_EVENT (NLMSG_MIN_TYPE + 3)
>
> /**************************************************************************
> * Controller
> diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
> index 2ed5f96..e8e0bd68 100644
> --- a/net/netlink/genetlink.c
> +++ b/net/netlink/genetlink.c
> @@ -82,7 +82,8 @@ static struct list_head family_ht[GENL_FAM_TAB_SIZE];
> */
> static unsigned long mc_group_start = 0x3 | BIT(GENL_ID_CTRL) |
> BIT(GENL_ID_VFS_DQUOT) |
> - BIT(GENL_ID_PMCRAID);
> + BIT(GENL_ID_PMCRAID) |
> + BIT(GENL_ID_FS_EVENT);
> static unsigned long *mc_groups = &mc_group_start;
> static unsigned long mc_groups_longs = 1;
>
> @@ -146,6 +147,7 @@ static u16 genl_generate_id(void)
> for (i = 0; i <= GENL_MAX_ID - GENL_MIN_ID; i++) {
> if (id_gen_idx != GENL_ID_VFS_DQUOT &&
> id_gen_idx != GENL_ID_PMCRAID &&
> + id_gen_idx != GENL_ID_FS_EVENT &&
> !genl_family_find_byid(id_gen_idx))
> return id_gen_idx;
> if (++id_gen_idx > GENL_MAX_ID)
> @@ -249,6 +251,9 @@ static int genl_validate_assign_mc_groups(struct genl_family *family)
> } else if (family->id == GENL_ID_PMCRAID) {
> first_id = GENL_ID_PMCRAID;
> BUG_ON(n_groups != 1);
> + } else if (family->id == GENL_ID_FS_EVENT) {
> + first_id = GENL_ID_FS_EVENT;
> + BUG_ON(n_groups != 1);
> } else {
> groups_allocated = true;
> err = genl_allocate_reserve_groups(n_groups, &first_id);
> --
> 1.7.9.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
On 4/15/15 2:15 AM, Beata Michalska wrote:
> Introduce configurable generic interface for file
> system-wide event notifications to provide file
> systems with a common way of reporting any potential
> issues as they emerge.
>
> The notifications are to be issued through generic
> netlink interface, by a dedicated, for file system
> events, multicast group. The file systems might as
> well use this group to send their own custom messages.
...
> + 4.3 Threshold notifications:
> +
> + #include <linux/fs_event.h>
> + void fs_event_alloc_space(struct super_block *sb, u64 ncount);
> + void fs_event_free_space(struct super_block *sb, u64 ncount);
> +
> + Each filesystem supporting the threshold notifications should call
> + fs_event_alloc_space/fs_event_free_space respectively whenever the
> + amount of available blocks changes.
> + - sb: the filesystem's super block
> + - ncount: number of blocks being acquired/released
so:
> +void fs_event_alloc_space(struct super_block *sb, u64 ncount)
> +{
> + struct fs_trace_entry *en;
> + s64 count;
> +
> + spin_lock(&fs_trace_lock);
Every allocation/free for every supported filesystem system-wide will be
serialized on this global spinlock? That sounds like a non-starter...
-Eric
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
On 04/15/2015 09:18 PM, Darrick J. Wong wrote:
> On Wed, Apr 15, 2015 at 09:15:46AM +0200, Beata Michalska wrote:
>> Add support for generic FS events including threshold
>> notifications, ENOSPC and remount as read-only warnings,
>> along with generic internal warnings/errors.
>>
>> Signed-off-by: Beata Michalska <[email protected]>
>> ---
>> fs/ext4/balloc.c | 11 +++++++++--
>> fs/ext4/ext4.h | 1 +
>> fs/ext4/inode.c | 2 +-
>> fs/ext4/mballoc.c | 6 +++++-
>> fs/ext4/resize.c | 1 +
>> fs/ext4/super.c | 43 +++++++++++++++++++++++++++++++++++++++++++
>> 6 files changed, 60 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
>> index e95b27a..49d2ace 100644
>> --- a/fs/ext4/balloc.c
>> +++ b/fs/ext4/balloc.c
>> @@ -569,6 +569,7 @@ int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
>> {
>> if (ext4_has_free_clusters(sbi, nclusters, flags)) {
>> percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters);
>> + fs_event_alloc_space(sbi->s_sb, EXT4_C2B(sbi, nclusters));
>> return 0;
>> } else
>> return -ENOSPC;
>> @@ -590,9 +591,10 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
>> {
>> if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) ||
>> (*retries)++ > 3 ||
>> - !EXT4_SB(sb)->s_journal)
>> + !EXT4_SB(sb)->s_journal) {
>> + fs_event_notify(sb, FS_EVENT_WARN, FS_WARN_ENOSPC);
>> return 0;
>> -
>> + }
>> jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
>>
>> return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
>> @@ -637,6 +639,11 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
>> dquot_alloc_block_nofail(inode,
>> EXT4_C2B(EXT4_SB(inode->i_sb), ar.len));
>> }
>> +
>> + if (*errp == -ENOSPC)
>> + fs_event_notify(inode->i_sb, FS_EVENT_WARN,
>> + FS_WANR_ENOSPC_META);
>> +
>> return ret;
>> }
>>
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 163afe2..7d75ff9 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -2542,6 +2542,7 @@ void ext4_mark_group_corrupted(struct ext4_sb_info *sbi,
>> if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
>> percpu_counter_sub(&sbi->s_freeclusters_counter, grp->bb_free);
>> set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
>> + fs_event_alloc_space(sbi->s_sb, EXT4_C2B(sbi, grp->bb_free));
>
> While we're adding fs netlink notifications, could we add a message that means
> "This FS is corrupt, go run fsck"? A monitoring app could possibly figure
> this out by a sudden drop in free space accompanied by EIO errors hitting
> userland apps, but we might as well be explicit about the flaming death. :)
>
> --D
>
The notifications sent through this interface can be extended to whatever is needed.
There are very few basic event codes - among them are FS_ERR_UNKNOWN and FS_ERR_ITERNAL.
So one can assume that whenever one of those is being triggered - something wrong is going on.
So at this point running fsck would be a good idea. If this is not enough, new event
codes might be introduced. Note that it is also possible for the file systems
to send their own messages placing within the payload whatever they like.
This is an early version, so it can definitely be adjusted.
BR
Beata
>> }
>>
>> /*
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 5cb9a21..2a7af0f 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -1238,7 +1238,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
>> percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free);
>>
>> spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
>> -
>> + fs_event_free_space(sbi->s_sb, to_free);
>> dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
>> }
>>
>> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
>> index 24a4b6d..e6cbbd6 100644
>> --- a/fs/ext4/mballoc.c
>> +++ b/fs/ext4/mballoc.c
>> @@ -4511,6 +4511,9 @@ out:
>> kmem_cache_free(ext4_ac_cachep, ac);
>> if (inquota && ar->len < inquota)
>> dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
>> + if (reserv_clstrs && ar->len < reserv_clstrs)
>> + fs_event_free_space(sbi->s_sb,
>> + EXT4_C2B(sbi, reserv_clstrs - ar->len));
>> if (!ar->len) {
>> if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
>> /* release all the reserved blocks if non delalloc */
>> @@ -4848,7 +4851,7 @@ do_more:
>> if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
>> dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
>> percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
>> -
>> + fs_event_free_space(sb, EXT4_C2B(sbi, count_clusters));
>> ext4_mb_unload_buddy(&e4b);
>>
>> /* We dirtied the bitmap block */
>> @@ -4982,6 +4985,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
>> ext4_unlock_group(sb, block_group);
>> percpu_counter_add(&sbi->s_freeclusters_counter,
>> EXT4_NUM_B2C(sbi, blocks_freed));
>> + fs_event_free_space(sb, blocks_freed);
>>
>> if (sbi->s_log_groups_per_flex) {
>> ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
>> diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
>> index 8a8ec62..dbf08d6 100644
>> --- a/fs/ext4/resize.c
>> +++ b/fs/ext4/resize.c
>> @@ -1378,6 +1378,7 @@ static void ext4_update_super(struct super_block *sb,
>> EXT4_NUM_B2C(sbi, free_blocks));
>> percpu_counter_add(&sbi->s_freeinodes_counter,
>> EXT4_INODES_PER_GROUP(sb) * flex_gd->count);
>> + fs_event_free_space(sb, free_blocks - reserved_blocks);
>>
>> ext4_debug("free blocks count %llu",
>> percpu_counter_read(&sbi->s_freeclusters_counter));
>> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
>> index e061e66..52091da 100644
>> --- a/fs/ext4/super.c
>> +++ b/fs/ext4/super.c
>> @@ -398,6 +398,7 @@ static void ext4_handle_error(struct super_block *sb)
>> if (test_opt(sb, ERRORS_PANIC))
>> panic("EXT4-fs (device %s): panic forced after error\n",
>> sb->s_id);
>> + fs_event_notify(sb, FS_EVENT_ERR, FS_ERR_UNKNOWN);
>> }
>>
>> #define ext4_error_ratelimit(sb) \
>> @@ -585,6 +586,8 @@ void __ext4_abort(struct super_block *sb, const char *function,
>> if (EXT4_SB(sb)->s_journal)
>> jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
>> save_error_info(sb, function, line);
>> + fs_event_notify(sb, FS_EVENT_ERR, FS_ERR_RO_REMOUT);
>> +
>> }
>> if (test_opt(sb, ERRORS_PANIC))
>> panic("EXT4-fs panic from previous error\n");
>> @@ -612,6 +615,8 @@ void __ext4_warning(struct super_block *sb, const char *function,
>> struct va_format vaf;
>> va_list args;
>>
>> + fs_event_notify(sb, FS_EVENT_WARN, FS_WARN_UNKNOWN);
>> +
>> if (!___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
>> "EXT4-fs warning"))
>> return;
>> @@ -1083,6 +1088,13 @@ static const struct quotactl_ops ext4_qctl_operations = {
>> };
>> #endif
>>
>> +static int ext4_trace_query(struct super_block *sb,
>> + struct fs_trace_sdata *data);
>> +
>> +static const struct fs_trace_operations ext4_trace_ops = {
>> + .fs_trace_query = ext4_trace_query,
>> +};
>> +
>> static const struct super_operations ext4_sops = {
>> .alloc_inode = ext4_alloc_inode,
>> .destroy_inode = ext4_destroy_inode,
>> @@ -3398,11 +3410,20 @@ static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count)
>> {
>> ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >>
>> sbi->s_cluster_bits;
>> + ext4_fsblk_t current_resv;
>>
>> if (count >= clusters)
>> return -EINVAL;
>>
>> + current_resv = atomic64_read(&sbi->s_resv_clusters);
>> atomic64_set(&sbi->s_resv_clusters, count);
>> +
>> + if (count > current_resv)
>> + fs_event_alloc_space(sbi->s_sb,
>> + EXT4_C2B(sbi, count - current_resv));
>> + else
>> + fs_event_free_space(sbi->s_sb,
>> + EXT4_C2B(sbi, current_resv - count));
>> return 0;
>> }
>>
>> @@ -3966,6 +3987,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>> sb->s_qcop = &ext4_qctl_operations;
>> sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
>> #endif
>> + sb->s_trace_ops = &ext4_trace_ops;
>> +
>> memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
>>
>> INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
>> @@ -5438,6 +5461,26 @@ out:
>>
>> #endif
>>
>> +static int ext4_trace_query(struct super_block *sb, struct fs_trace_sdata *data)
>> +{
>> + struct ext4_sb_info *sbi = EXT4_SB(sb);
>> + struct ext4_super_block *es = sbi->s_es;
>> + ext4_fsblk_t rsv_blocks;
>> +
>> + data->available_blks =
>> + percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
>> + percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
>> + data->available_blks = EXT4_C2B(sbi, data->available_blks);
>> + rsv_blocks = ext4_r_blocks_count(es) +
>> + EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
>> + if (data->available_blks < rsv_blocks)
>> + data->available_blks = 0;
>> + else
>> + data->available_blks -= rsv_blocks;
>> + data->events_cap_mask = FS_EVENTS_ALL;
>> + return 0;
>> +}
>> +
>> static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
>> const char *dev_name, void *data)
>> {
>> --
>> 1.7.9.5
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
On 04/15/2015 09:25 PM, Darrick J. Wong wrote:
> On Wed, Apr 15, 2015 at 09:15:44AM +0200, Beata Michalska wrote:
>> Introduce configurable generic interface for file
>> system-wide event notifications to provide file
>> systems with a common way of reporting any potential
>> issues as they emerge.
>>
>> The notifications are to be issued through generic
>> netlink interface, by a dedicated, for file system
>> events, multicast group. The file systems might as
>> well use this group to send their own custom messages.
>>
>> The events have been split into four base categories:
>> information, warnings, errors and threshold notifications,
>> with some very basic event types like running out of space
>> or file system being remounted as read-only.
>>
>> Threshold notifications have been included to allow
>> triggering an event whenever the amount of free space
>> drops below a certain level - or levels to be more precise
>> as two of them are being supported: the lower and the upper
>> range. The notifications work both ways: once the threshold
>> level has been reached, an event shall be generated whenever
>> the number of available blocks goes up again re-activating
>> the threshold.
>>
>> The interface has been exposed through a vfs. Once mounted,
>> it serves as an entry point for the set-up where one can
>> register for particular file system events.
>>
>> Signed-off-by: Beata Michalska <[email protected]>
>> ---
>> Documentation/filesystems/events.txt | 254 +++++++++++
>> fs/Makefile | 1 +
>> fs/events/Makefile | 6 +
>> fs/events/fs_event.c | 775 ++++++++++++++++++++++++++++++++++
>> fs/events/fs_event.h | 27 ++
>> fs/events/fs_event_netlink.c | 94 +++++
>> fs/namespace.c | 1 +
>> include/linux/fs.h | 6 +-
>> include/linux/fs_event.h | 69 +++
>> include/uapi/linux/fs_event.h | 62 +++
>> include/uapi/linux/genetlink.h | 1 +
>> net/netlink/genetlink.c | 7 +-
>> 12 files changed, 1301 insertions(+), 2 deletions(-)
>> create mode 100644 Documentation/filesystems/events.txt
>> create mode 100644 fs/events/Makefile
>> create mode 100644 fs/events/fs_event.c
>> create mode 100644 fs/events/fs_event.h
>> create mode 100644 fs/events/fs_event_netlink.c
>> create mode 100644 include/linux/fs_event.h
>> create mode 100644 include/uapi/linux/fs_event.h
>>
>> diff --git a/Documentation/filesystems/events.txt b/Documentation/filesystems/events.txt
>> new file mode 100644
>> index 0000000..c85dd88
>> --- /dev/null
>> +++ b/Documentation/filesystems/events.txt
>> @@ -0,0 +1,254 @@
>> +
>> + Generic file system event notification interface
>> +
>> +Document created 09 April 2015 by Beata Michalska <[email protected]>
>> +
>> +1. The reason behind:
>> +=====================
>> +
>> +There are many corner cases when things might get messy with the filesystems.
>> +And it is not always obvious what and when went wrong. Sometimes you might
>> +get some subtle hints that there is something going on - but by the time
>> +you realise it, it might be too late as you are already out-of-space
>> +or the filesystem has been remounted as read-only (for instance). The generic
>> +interface for the filesystem events fills the gap by providing a rather
>> +easy way of real-time notifications triggered whenever something interesting
>> +happens, allowing filesystems to report events in a common way, as they occur.
>> +
>> +2. How does it work:
>> +====================
>> +
>> +The interface itself has been exposed as fstrace-type Virtual File System,
>> +primarily to ease the process of setting up the configuration for the file
>> +system notifications. So for starters it needs to get mounted (obviously):
>> +
>> + mount -t fstrace none /sys/fs/events
>> +
>> +This will unveil the single fstrace filesystem entry - the 'config' file,
>> +through which the notifications are being set up.
>> +
>> +Activating notifications for particular filesystem is as straightforward
>> +as writing into the 'config' file. Note that by default all events despite
>> +the actual filesystem type are being disregarded.
>> +
>> +Synopsis of config:
>> +------------------
>> +
>> + MOUNT EVENT_TYPE [L1] [L2]
>> +
>> + MOUNT : the filesystem's mount point
>> + EVENT_TYPE : type of events to be enabled: info,warn,err,thr;
>> + at least one type needs to be specified;
>> + note the comma delimiter and lack of spaces between
>> + those options
>> + L1 : the threshold limit - lower range
>> + L2 : the threshold limit - upper range
>> + when enabling threshold notifications, the lower level is
>> + mandatory, whereas the upper one remains optional;
>> + note though, that as those refer to the number of available
>> + blocks, the lower level needs to be higher than the upper one
>> +
>> +A sample request could look like the following:
>> +
>> + echo /sample/mount/point warn,err,thr 710000 500000 > /sys/fs/events/config
>> +
>> +Multiple requests might be specified provided they are separated with a semicolon.
>> +
>> +The configuration itself might be modified at any time. One can add/remove
>> +particular event types for a given filesystem, modify the threshold levels,
>> +and remove single or all entries from the 'config' file.
>> +
>> + - Adding new event type:
>> +
>> + $ echo MOUNT EVENT_TYPE > /sys/fs/events/config
>> +
>> +(Note that it is enough to provide the event type to be enabled without
>> +the already set ones.)
>> +
>> + - Removing event type:
>> +
>> + $ echo '!MOUNT EVENT_TYPE' > /sys/fs/events/config
>> +
>> + - Updating threshold limits:
>> +
>> + $ echo MOUNT thr L1 L2 > /sys/fs/events/config
>> +
>> + - Removing single entry:
>> +
>> + $ echo '!MOUNT' > /sys/fs/events/config
>> +
>> + - Removing all entries:
>> +
>> + $ echo > /sys/fs/events/config
>> +
>> +Reading the file will list all registered entries with their current set-up
>> +along with some additional info like the id of the entry (@see more on generic
>> +netlink section), the filesystem type and the backing device name if available.
>> +
>> +Final, though a very important note on the configuration: when and if the
>> +actual events are being triggered falls way beyond the scope of the generic
>> +filesystem events interface. It is up to a particular filesystem
>> +implementation which events are to be supported - if any at all. So if
>> +given filesystem does not support the event notifications, an attempt to
>> +enable those through 'config' file will fail.
>> +
>> +
>> +3. The generic netlink interface support:
>> +=========================================
>> +
>> +Whenever an event notification is triggered (by given filesystem) the current
>> +configuration is being validated to decide whether a userspace notification
>> +should be launched. If there has been no request (by means of a 'config' file
>> +entry) for given event, one will be silently disregarded. If, on the other
>> +hand, someone is 'watching' given filesystem for specific events, a generic
>> +netlink message will be sent.
>> +
>> +A dedicated multicast group has been provided solely for the purpose of
>> +notifying any potential listeners of file system events. So in order to
>> +receive such notifications, one should subscribe to this new multicast group.
>> +
>> +Each message type reflects the actual type of generated event (FS_EVENT_TYPE*)
>> +Currently there are two supported message formats.
>> +
>> +There is a common message format representing an event generated by
>> +a filesystem. The type of the event itself will be stored within
>> +the generic netlink message header as the command field. The message
>> +payload will provide more detailed info: the identifier of the filesystem
>> +trace (generated upon registering the trace), the backing device major and
>> +minor numbers, the event identifier and the id of the process whose action
>> +led to the event occurrence. In case of threshold notifications, the current
>> +number of available blocks will be included in the payload.
>> +
>> +
>> + 0 1 2 3
>> + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
>> + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>> + | NETLINK MESSAGE HEADER |
>> + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>> + | GENERIC NETLINK MESSAGE HEADER |
>> + | (with event type as genlmsghdr cmd field) |
>> + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>> + | Optional user specific message header |
>> + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>> + | GENERIC MESSAGE PAYLOAD: |
>> + +---------------------------------------------------------------+
>> + | FS_EVENT_ATR_FS_ID (NLA_U32) |
>> + +---------------------------------------------------------------+
>> + | FS_EVENT_ATR_DEV_MAJOR (NLA_U32) (if available) |
>> + +---------------------------------------------------------------+
>> + | FS_EVENT_ATR_DEV_MINOR (NLA_U32) (if available) |
>> + +---------------------------------------------------------------+
>> + | FS_EVENT_ATR_ID (NLA_U32) |
>> + +---------------------------------------------------------------+
>> + | FS_EVENT_ATR_CAUSED_ID (NLA_U32) |
>> + +---------------------------------------------------------------+
>> + | FS_EVENT_ATR_DATA (NLA_U64) |
>> + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>> +
>> +
>> +The second supported message format represents an event of a new trace being
>> +registered. It contains two attributes within the payload: the trace id and the
>> +mount point for which the trace has been registered. This message is of type
>> +FS_EVENT_TYPE_NEW_TRACE and is being sent regardless of the actual event types
>> +being watched whenever a new entry for the 'config' file is being created. This
>> +is supposed to ease parsing the messages by userspace applications and to help
>> +to identify the origin of the event. It also reduces the size of the payload
>> +as there is no need to send additional data such as mount point and the file
>> +system type for each possible event.
>> +
>> + 0 1 2 3
>> + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
>> + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>> + | NETLINK MESSAGE HEADER |
>> + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>> + | GENERIC NETLINK MESSAGE HEADER |
>> + | (with event type as genlmsghdr cmd field) |
>> + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>> + | Optional user specific message header |
>> + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>> + | GENERIC MESSAGE PAYLOAD: |
>> + + ------------------------------------------------------------- +
>> + | FS_EVENT_ATR_FS_ID (NLA_U32) |
>> + + ------------------------------------------------------------- +
>> + | FS_EVENT_ATR_MOUNT (NLA_STRING) |
>> + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>> +
>> +The above figures are based on:
>> + http://www.linuxfoundation.org/collaborate/workgroups/networking/generic_netlink_howto#Message_Format
>> +
>> +
>> +
>> +4. API Reference:
>> +=================
>> +
>> + 4.1 Generic file system event interface operations
>> +
>> + #include <linux/fs_event.h>
>> +
>> + struct fs_trace_operations {
>> + int (*fs_trace_query)(struct super_block *, struct fs_trace_sdata *);
>> + };
>> +
>> + Each filesystem supporting the event notifications should register its
>> + file system trace operations. This can be done through new entry in
>> + the super_block structure: the s_trace_ops. The fs_trace_query shall
>> + be called whenever new trace entry for given filesystem is being created
>> + or when threshold notifications are being requested for the first time.
>> + The filesystem should specify then, which event types are being supported.
>> + In case of threshold notifications the current number of available blocks
>> + should be provided.
>> +
>> + 4.2 Event notification:
>> +
>> + #include <linux/fs_event.h>
>> + void fs_event_notify(struct super_block *sb, unsigned int event_type,
>> + unsigned int event_id);
>> +
>> + Notify the generic FS event interface of an occurring event.
>> + This shall be used by any file system that wishes to inform any potential
>> + listeners/watchers of a particular event.
>> + - sb: the filesystem's super block
>> + - event_type: the type of an event (one of the FS_EVENT_*)
>> + - event_id: an event identifier
>> +
>> + 4.3 Threshold notifications:
>> +
>> + #include <linux/fs_event.h>
>> + void fs_event_alloc_space(struct super_block *sb, u64 ncount);
>> + void fs_event_free_space(struct super_block *sb, u64 ncount);
>> +
>> + Each filesystem supporting the threshold notifications should call
>> + fs_event_alloc_space/fs_event_free_space respectively whenever the
>> + amount of available blocks changes.
>> + - sb: the filesystem's super block
>> + - ncount: number of blocks being acquired/released
>> +
>> + Note that to properly handle the threshold notifications the fs events
>> + interface needs to be kept up to date by the filesystems. Each should
>> + register fs_trace_operations to enable querying the basic trace data,
>> + among which, is the current number of the available blocks (fs_trace_query).
>> +
>> + 4.4 Sending message through generic netlink interface
>> +
>> + #include <linux/fs_event.h>
>> + int fs_netlink_send_event(size_t size, unsigned int event_type,
>> + int (*compose_msg)(struct sk_buff *skb, unsigned int event_id,
>> + void *data),
>> + unsigned int event_id, void *data);
>> +
>> + Although the fs event interface is fully responsible for sending the messages
>> + over the netlink, filesystems might use the FS_EVENT multicast group to send
>> + their own custom messages.
>> + - size: the size of the message payload
>> + - event_type: the type of an event: stored as message header's command
>> + - compose_msg: a custom callback handling composing the message payload
>> + - event_id: the event identifier
>> + - data: message custom data
>> +
>> + Calling fs_netlink_send_event will result in a message being sent through
>> + the FS_EVENT multicast group. Note that the body of the message should be
>> + prepared (set up) by the caller - through compose_msg callback. The message's
>> + sk_buff will be allocated on behalf of the caller (thus the size parameter).
>> + The compose_msg should only fill the payload with proper data.
>> +
>> +
>> diff --git a/fs/Makefile b/fs/Makefile
>> index a88ac48..798021d 100644
>> --- a/fs/Makefile
>> +++ b/fs/Makefile
>> @@ -126,3 +126,4 @@ obj-y += exofs/ # Multiple modules
>> obj-$(CONFIG_CEPH_FS) += ceph/
>> obj-$(CONFIG_PSTORE) += pstore/
>> obj-$(CONFIG_EFIVAR_FS) += efivarfs/
>> +obj-y += events/
>> diff --git a/fs/events/Makefile b/fs/events/Makefile
>> new file mode 100644
>> index 0000000..58d1454
>> --- /dev/null
>> +++ b/fs/events/Makefile
>> @@ -0,0 +1,6 @@
>> +#
>> +# Makefile for the Linux Generic File System Event Interface
>> +#
>> +
>> +obj-y := fs_event.o
>> +obj-$(CONFIG_NET) += fs_event_netlink.o
>> diff --git a/fs/events/fs_event.c b/fs/events/fs_event.c
>> new file mode 100644
>> index 0000000..8ebe371
>> --- /dev/null
>> +++ b/fs/events/fs_event.c
>> @@ -0,0 +1,775 @@
>> +/*
>> + * Generic File System Events Interface
>> + *
>> + * Copyright(c) 2015 Samsung Electronics. All rights reserved.
>> + *
>> + * This program is free software; you can redistribute it and/or modify it
>> + * under the terms of the GNU General Public License version 2.
>> + *
>> + * The full GNU General Public License is included in this distribution in the
>> + * file called COPYING.
>> + *
>> + * This program is distributed in the hope that it will be useful, but WITHOUT
>> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
>> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
>> + * more details.
>> + */
>> +#include <linux/fs.h>
>> +#include <linux/hashtable.h>
>> +#include <linux/idr.h>
>> +#include <linux/module.h>
>> +#include <linux/mount.h>
>> +#include <linux/namei.h>
>> +#include <linux/parser.h>
>> +#include <linux/seq_file.h>
>> +#include <linux/slab.h>
>> +#include <net/genetlink.h>
>> +#include "../mount.h"
>> +#include "fs_event.h"
>> +
>> +#define FS_HASHTB_BITS 8
>> +#define FS_HASHTB_SIZE (1 << FS_HASHTB_BITS)
>> +
>> +/**
>> + * The FS event trace entries are being stored in a hashtable
>> + * for fast entry look-up, and in a doubly-linked list
>> + * to ease all the paths that need to go through all
>> + * the entries.
>> + */
>> +static DEFINE_HASHTABLE(fs_trace_hashtbl, FS_HASHTB_BITS);
>> +static LIST_HEAD(fs_trace_list);
>> +static DEFINE_SPINLOCK(fs_trace_lock);
>> +
>> +static struct kmem_cache *fs_trace_cachep __read_mostly;
>> +
>> +/*
>> + * Each registered FS event trace is being marked with
>> + * a unique identifier managed by IDR
>> + */
>> +static struct idr fs_trace_idr;
>> +static DEFINE_SPINLOCK(fs_trace_idr_lock);
>> +
>> +/*
>> + * Threshold notification state bits.
>> + * Note the reverse as this refers to the number
>> + * of available blocks.
>> + */
>> +#define THRESH_LR_BELOW 0x0001 /* Falling below the lower range */
>> +#define THRESH_LR_BEYOND 0x0002
>> +#define THRESH_UR_BELOW 0x0004
>> +#define THRESH_UR_BEYOND 0x0008 /* Going beyond the upper range */
>> +
>> +#define THRESH_LR_ON (THRESH_LR_BELOW | THRESH_LR_BEYOND)
>> +#define THRESH_UR_ON (THRESH_UR_BELOW | THRESH_UR_BEYOND)
>> +
>> +#define FS_TRACE_ADD 0x100000
>> +
>> +struct fs_trace_entry {
>> + struct list_head node;
>> + struct hlist_node hnode;
>> + struct path path;
>> + struct fs_trace_sdata data;
>> + int mark;
>> + unsigned int notify_mask;
>> + struct fs_event_thresh {
>> + u64 lrange;
>> + u64 urange;
>> + unsigned int state;
>> + } thresh;
>> + spinlock_t lock;
>> +};
>> +
>> +static const match_table_t fs_etypes = {
>> + { FS_EVENT_INFO, "info" },
>> + { FS_EVENT_WARN, "warn" },
>> + { FS_EVENT_THRESH, "thr" },
>> + { FS_EVENT_ERR, "err" },
>> + { 0, NULL },
>> +};
>> +
>> +#define fs_trace_sb(en) ((en)->path.mnt->mnt_sb)
>> +
>> +#define fs_trace_query_data(sb, arg) \
>> + (((sb)->s_trace_ops && (sb)->s_trace_ops->fs_trace_query) ? \
>> + (sb)->s_trace_ops->fs_trace_query((sb), arg) : -EINVAL)
>> +
>> +#define fs_event_type_cast(event_type) (ffs(event_type))
>> +
>> +static inline unsigned int fs_trace_hasfn(const struct super_block *sb)
>> +{
>> + return ((unsigned long)sb >> L1_CACHE_SHIFT) & (FS_HASHTB_SIZE - 1);
>> +}
>> +
>> +static struct fs_trace_entry *fs_find_trace_entry(struct super_block *sb)
>> +{
>> + struct fs_trace_entry *en;
>> + unsigned long hash;
>> +
>> + if (list_empty(&fs_trace_list))
>> + return ERR_PTR(-EINVAL);
>> + hash = fs_trace_hasfn(sb);
>> + hash_for_each_possible(fs_trace_hashtbl, en, hnode, hash)
>> + if (fs_trace_sb(en) == sb)
>> + return en;
>> + return ERR_PTR(-EINVAL);
>> +}
>> +
>> +static inline void fs_trace_entry_list_del(struct fs_trace_entry *en)
>> +{
>> + spin_lock(&en->lock);
>> + list_del(&en->node);
>> + hash_del(&en->hnode);
>> + spin_unlock(&en->lock);
>> +}
>> +
>> +static inline void fs_trace_entry_idr_remove(struct fs_trace_entry *en)
>> +{
>> + spin_lock(&fs_trace_idr_lock);
>> + idr_remove(&fs_trace_idr, en->mark);
>> + spin_unlock(&fs_trace_idr_lock);
>> +}
>> +
>> +static inline void fs_trace_entry_free(struct fs_trace_entry *en)
>> +{
>> + kmem_cache_free(fs_trace_cachep, en);
>> +}
>> +
>> +static inline void fs_destroy_trace_entry(struct fs_trace_entry *en)
>> +{
>> + fs_trace_entry_list_del(en);
>> + fs_trace_entry_idr_remove(en);
>> + fs_trace_entry_free(en);
>> +}
>> +
>> +static int fs_remove_trace_entry(struct super_block *sb)
>> +{
>> + struct fs_trace_entry *en;
>> + int ret = -EINVAL;
>> +
>> + spin_lock(&fs_trace_lock);
>> + en = fs_find_trace_entry(sb);
>> + if (!IS_ERR(en)) {
>> + fs_destroy_trace_entry(en);
>> + ret = 0;
>> + }
>> + spin_unlock(&fs_trace_lock);
>> + return ret;
>> +}
>> +
>> +static void fs_remove_all_traces(void)
>> +{
>> + struct fs_trace_entry *en, *guard;
>> +
>> + spin_lock(&fs_trace_lock);
>> + list_for_each_entry_safe(en, guard, &fs_trace_list, node)
>> + fs_destroy_trace_entry(en);
>> + spin_unlock(&fs_trace_lock);
>> +}
>> +
>> +static int fs_event_new_trace_create_msg(struct sk_buff *skb,
>> + unsigned int event_id, void *data)
>> +{
>> + struct fs_trace_entry *en = (struct fs_trace_entry *)data;
>> + char *path, *mount_dir;
>> + int ret;
>> +
>> + path = kzalloc(PATH_MAX, GFP_KERNEL);
>> + if (!path)
>> + return -EINVAL;
>> + mount_dir = d_path(&en->path, path, PATH_MAX - 1);
>> + if (IS_ERR(mount_dir))
>> + mount_dir = "unknown";
>> +
>> + ret = nla_put_u32(skb, FS_EVENT_ATR_FS_ID, en->mark);
>> + if (ret)
>> + goto leave;
>> + ret = nla_put_string(skb, FS_EVENT_ATR_MOUNT, mount_dir);
>> +
>> +leave:
>> + kfree(path);
>> + return ret;
>> +}
>> +
>> +static int fs_event_common_create_msg(struct sk_buff *skb,
>> + unsigned int event_id, void *data)
>> +{
>> + struct fs_trace_entry *en = (struct fs_trace_entry *)data;
>> + struct super_block *sb = fs_trace_sb(en);
>> +
>> + if (nla_put_u32(skb, FS_EVENT_ATR_FS_ID, en->mark))
>> + return -EINVAL;
>> +
>> + /* In case there is no backing dev, so skip the followng */
>> + if (sb->s_bdev && MAJOR(sb->s_dev))
>> + if (nla_put_u32(skb, FS_EVENT_ATR_DEV_MAJOR, MAJOR(sb->s_dev))
>> + || nla_put_u32(skb, FS_EVENT_ATR_DEV_MINOR, MINOR(sb->s_dev)))
>> + return -EINVAL;
>> +
>> + if (nla_put_u32(skb, FS_EVENT_ATR_ID, event_id))
>> + return -EINVAL;
>> + if (nla_put_u64(skb, FS_EVENT_ATR_CAUSED_ID, pid_nr(task_pid(current))))
>> + return -EINVAL;
>> +
>> + if (event_id & (FS_THRESH_LR_REACHED | FS_THRESH_UR_REACHED))
>> + return nla_put_u64(skb, FS_EVENT_ATR_DATA,
>> + en->data.available_blks);
>> +
>> + return 0;
>> +}
>> +
>> +static void fs_event_new_trace(struct fs_trace_entry *en)
>> +{
>> + fs_netlink_send_event(GENLMSG_DEFAULT_SIZE, FS_EVENT_TYPE_NEW_TRACE,
>> + fs_event_new_trace_create_msg, 0, en);
>> +}
>> +
>> +static void fs_event_send(struct fs_trace_entry *en,
>> + unsigned int event_type, unsigned int event_id)
>> +{
>> + size_t size = nla_total_size(sizeof(u32)) * 4 +
>> + nla_total_size(sizeof(u64)) * 2;
>> +
>> + fs_netlink_send_event(size, fs_event_type_cast(event_type),
>> + fs_event_common_create_msg, event_id, en);
>> +}
>> +
>> +void fs_event_notify(struct super_block *sb, unsigned int event_type,
>> + unsigned int event_id)
>> +{
>> + struct fs_trace_entry *en;
>> +
>> + spin_lock(&fs_trace_lock);
>> + en = fs_find_trace_entry(sb);
>> + if (IS_ERR(en)) {
>> + spin_unlock(&fs_trace_lock);
>> + return;
>> + }
>> +
>> + spin_lock(&en->lock);
>> + /* Relase the main lock - it's enough to keep the entry lock here */
>> + spin_unlock(&fs_trace_lock);
>> + if (en->notify_mask & event_type)
>> + fs_event_send(en, event_type, event_id);
>> + spin_unlock(&en->lock);
>> +}
>> +EXPORT_SYMBOL(fs_event_notify);
>> +
>> +void fs_event_alloc_space(struct super_block *sb, u64 ncount)
>> +{
>> + struct fs_trace_entry *en;
>> + s64 count;
>> +
>> + spin_lock(&fs_trace_lock);
>> + en = fs_find_trace_entry(sb);
>> + if (IS_ERR(en)) {
>> + spin_unlock(&fs_trace_lock);
>> + return;
>> + }
>> +
>> + spin_lock(&en->lock);
>> + spin_unlock(&fs_trace_lock);
>> +
>> + if (!(en->notify_mask & FS_EVENT_THRESH))
>> + goto leave;
>> + /* we shouldn't drop below 0 here, unless there is a sync issue
>> + somewhere (?) */
>> + count = en->data.available_blks - ncount;
>> + en->data.available_blks = count < 0 ? 0 : count;
>> +
>> + if (en->data.available_blks > en->thresh.lrange)
>> + /* Not 'even' close - leave */
>> + goto leave;
>> +
>> + if (en->data.available_blks > en->thresh.urange) {
>> + /* Close enough - the lower range has been reached */
>> + if (!(en->thresh.state & THRESH_LR_BEYOND)) {
>> + /* Send notificaton */
>> + fs_event_send(en, FS_EVENT_THRESH,
>> + FS_THRESH_LR_REACHED);
>> + en->thresh.state &= ~THRESH_LR_BELOW;
>> + en->thresh.state |= THRESH_LR_BEYOND;
>> + }
>> + goto leave;
>> + }
>> + if (!(en->thresh.state & THRESH_UR_BEYOND)) {
>> + fs_event_send(en, FS_EVENT_THRESH, FS_THRESH_UR_REACHED);
>> + en->thresh.state &= ~THRESH_UR_BELOW;
>> + en->thresh.state |= THRESH_UR_BEYOND;
>> + }
>> +
>> +leave:
>> + spin_unlock(&en->lock);
>> +}
>> +EXPORT_SYMBOL(fs_event_alloc_space);
>> +
>> +void fs_event_free_space(struct super_block *sb, u64 ncount)
>> +{
>> + struct fs_trace_entry *en;
>> +
>> + spin_lock(&fs_trace_lock);
>> + en = fs_find_trace_entry(sb);
>> + if (IS_ERR(en)) {
>> + spin_unlock(&fs_trace_lock);
>> + return;
>> + }
>> +
>> + spin_lock(&en->lock);
>> + spin_unlock(&fs_trace_lock);
>> +
>> + if (!(en->notify_mask & FS_EVENT_THRESH))
>> + goto leave;
>> +
>> + en->data.available_blks += ncount;
>> +
>> + if (en->data.available_blks > en->thresh.lrange) {
>> + if (!(en->thresh.state & THRESH_LR_BELOW)
>> + && en->thresh.state & THRESH_LR_BEYOND) {
>> + /* Send notificaton */
>> + fs_event_send(en, FS_EVENT_THRESH,
>> + FS_THRESH_LR_REACHED);
>> + en->thresh.state &= ~THRESH_LR_BEYOND;
>> + en->thresh.state |= THRESH_LR_BELOW;
>> + goto leave;
>> + }
>> + }
>> + if (en->data.available_blks > en->thresh.urange) {
>> + if (!(en->thresh.state & THRESH_UR_BELOW)
>> + && en->thresh.state & THRESH_UR_BEYOND) {
>> + /* Notify */
>> + fs_event_send(en, FS_EVENT_THRESH,
>> + FS_THRESH_UR_REACHED);
>> + en->thresh.state &= ~THRESH_UR_BEYOND;
>> + en->thresh.state |= THRESH_UR_BELOW;
>> + }
>> + }
>> +leave:
>> + spin_unlock(&en->lock);
>> +}
>> +EXPORT_SYMBOL(fs_event_free_space);
>> +
>> +void fs_event_mount_dropped(struct vfsmount *mnt)
>> +{
>> + struct fs_trace_entry *en;
>> +
>> + spin_lock(&fs_trace_lock);
>> + en = fs_find_trace_entry(mnt->mnt_sb);
>> + if (!IS_ERR(en)) {
>> + spin_lock(&en->lock);
>> + if (en->notify_mask & FS_EVENT_INFO)
>> + fs_event_send(en, FS_EVENT_TYPE_INFO, FS_INFO_UMOUNT);
>> + spin_unlock(&en->lock);
>> + fs_destroy_trace_entry(en);
>> + }
>> + spin_unlock(&fs_trace_lock);
>> +}
>> +
>> +static int fs_new_trace_entry(struct path *path, struct fs_event_thresh *thresh,
>> + unsigned int nmask)
>> +{
>> + struct fs_trace_entry *en;
>> + struct super_block *sb;
>> + struct mount *r_mnt;
>> +
>> + en = kmem_cache_zalloc(fs_trace_cachep, GFP_KERNEL);
>> + if (unlikely(!en))
>> + return -ENOMEM;
>> + /*
>> + * Note that no reference is being taken here for the path as it would
>> + * make the umount unnecessarily puzzling (due to an extra 'valid'
>> + * reference for the mnt).
>> + * This is *rather* safe as the notification on mount being dropped
>> + * will get called prior to releasing the super block - so right
>> + * in time to send the event and perform appropraite clean-up
>> + */
>> + r_mnt = real_mount(path->mnt);
>> + en->path.dentry = r_mnt->mnt.mnt_root;
>> + en->path.mnt = &r_mnt->mnt;
>> +
>> + sb = fs_trace_sb(en);
>> + spin_lock_init(&en->lock);
>> +
>> + spin_lock(&fs_trace_idr_lock);
>> + idr_preload(GFP_KERNEL);
>> + en->mark = idr_alloc_cyclic(&fs_trace_idr, en, 1, 0, GFP_KERNEL);
>> + idr_preload_end();
>> + spin_unlock(&fs_trace_idr_lock);
>> +
>> + if (en->mark < 0)
>> + goto leave;
>> + if (fs_trace_query_data(sb, &en->data))
>> + goto leave;
>> +
>> + nmask = en->data.events_cap_mask & nmask;
>> + if (!nmask)
>> + goto leave;
>> + en->notify_mask = nmask;
>> + memcpy(&en->thresh, thresh, offsetof(struct fs_event_thresh, state));
>> +
>> + spin_lock(&fs_trace_lock);
>> + list_add(&en->node, &fs_trace_list);
>> + hash_add(fs_trace_hashtbl, &en->hnode, fs_trace_hasfn(sb));
>> + spin_unlock(&fs_trace_lock);
>> +
>> + fs_event_new_trace(en);
>> + return 0;
>> +leave:
>> + kmem_cache_free(fs_trace_cachep, en);
>> + return -EINVAL;
>> +}
>> +
>> +static int fs_update_trace_entry_locked(struct fs_trace_entry *en,
>> + struct fs_event_thresh *thresh,
>> + unsigned int nmask)
>> +{
>> + int extend = nmask & FS_TRACE_ADD;
>> +
>> + nmask &= en->data.events_cap_mask;
>> + if (!nmask)
>> + return -EINVAL;
>> +
>> + if (nmask & FS_EVENT_THRESH) {
>> + if (extend) {
>> + /* Get the current state */
>> + if (!(en->notify_mask & FS_EVENT_THRESH))
>> + fs_trace_query_data(fs_trace_sb(en),
>> + &en->data);
>> + if (thresh->state & THRESH_LR_ON) {
>> + en->thresh.lrange = thresh->lrange;
>> + en->thresh.state &= ~THRESH_LR_ON;
>> + }
>> + if (thresh->state & THRESH_UR_ON) {
>> + en->thresh.urange = thresh->urange;
>> + en->thresh.state &= ~THRESH_UR_ON;
>> + }
>> + } else {
>> + memset(&en->thresh, 0, sizeof(en->thresh));
>> + }
>> + }
>> +
>> + if (extend)
>> + en->notify_mask |= nmask;
>> + else
>> + en->notify_mask &= ~nmask;
>> + return 0;
>> +}
>> +
>> +static int fs_update_trace_entry(struct path *path,
>> + struct fs_event_thresh *thresh,
>> + unsigned int nmask)
>> +{
>> + struct fs_trace_entry *en;
>> + int ret;
>> +
>> + spin_lock(&fs_trace_lock);
>> + en = fs_find_trace_entry(path->mnt->mnt_sb);
>> + if (IS_ERR(en)) {
>> + spin_unlock(&fs_trace_lock);
>> + return (nmask & FS_TRACE_ADD)
>> + ? fs_new_trace_entry(path, thresh, nmask)
>> + : -EINVAL;
>> + }
>> + spin_lock(&en->lock);
>> + spin_unlock(&fs_trace_lock);
>> +
>> + ret = fs_update_trace_entry_locked(en, thresh, nmask);
>> +
>> + spin_unlock(&en->lock);
>> + return ret;
>> +}
>> +
>> +static int fs_parse_trace_request(int argc, char **argv)
>> +{
>> + struct fs_event_thresh thresh = {0};
>> + struct path path;
>> + substring_t args[MAX_OPT_ARGS];
>> + unsigned int nmask = FS_TRACE_ADD;
>> + int token;
>> + char *s;
>> + int ret = -EINVAL;
>> +
>> + if (!argc) {
>> + fs_remove_all_traces();
>> + return 0;
>> + }
>> +
>> + s = *(argv++);
>> + if (*s == '!') {
>> + /* Clear the trace entry */
>> + nmask &= ~FS_TRACE_ADD;
>> + ++s;
>> + }
>> +
>> + if (kern_path_mountpoint(AT_FDCWD, s, &path, LOOKUP_FOLLOW))
>> + return -EINVAL;
>> +
>> + if (!(--argc)) {
>> + if (!(nmask & FS_TRACE_ADD))
>> + ret = fs_remove_trace_entry(path.mnt->mnt_sb);
>> + goto leave;
>> + }
>> +
>> + while ((s = strsep(argv, ",")) != NULL) {
>> + if (!*s)
>> + continue;
>> + args[0].to = args[0].from = NULL;
>> + token = match_token(s, fs_etypes, args);
>> + nmask |= (token & FS_EVENTS_ALL);
>> + }
>> +
>> + if (!(nmask & (~FS_TRACE_ADD)) ||
>> + (!(--argc) && (nmask & FS_EVENT_THRESH && nmask & FS_TRACE_ADD)))
>> + goto leave;
>> +
>> + if ((nmask & FS_EVENT_THRESH) && (nmask & FS_TRACE_ADD)) {
>> + /*
>> + * Get the threshold config data:
>> + * lower range
>> + * upper range
>> + */
>> + ret = kstrtoull(*(++argv), 10, &thresh.lrange);
>> + if (ret)
>> + goto leave;
>> +
>> + thresh.state |= THRESH_LR_ON;
>> +
>> + if ((--argc)) {
>> + ret = kstrtoull(*(++argv), 10, &thresh.urange);
>> + if (ret)
>> + goto leave;
>> + thresh.state |= THRESH_UR_ON;
>> + }
>> + /* The thresholds are based on number of available blocks */
>> + if (thresh.lrange < thresh.urange) {
>> + ret = -EINVAL;
>> + goto leave;
>> + }
>> +
>> + }
>> + ret = fs_update_trace_entry(&path, &thresh, nmask);
>> +leave:
>> + path_put(&path);
>> + return ret;
>> +}
>> +
>> +#define DEFAULT_BUF_SIZE PAGE_SIZE
>> +
>> +static ssize_t fs_trace_write(struct file *file, const char __user *buffer,
>> + size_t count, loff_t *ppos)
>> +{
>> + char **argv;
>> + char *kern_buf, *next, *cfg;
>> + size_t size, dcount = 0;
>> + int argc;
>> +
>> + if (!count)
>> + return 0;
>> +
>> + kern_buf = kmalloc(DEFAULT_BUF_SIZE, GFP_KERNEL);
>> + if (!kern_buf)
>> + return -ENOMEM;
>> +
>> + while (dcount < count) {
>> +
>> + size = count - dcount;
>> + if (size >= DEFAULT_BUF_SIZE)
>> + size = DEFAULT_BUF_SIZE - 1;
>> + if (copy_from_user(kern_buf, buffer + dcount, size)) {
>> + dcount = -EINVAL;
>> + goto leave;
>> + }
>> +
>> + kern_buf[size] = '\0';
>> +
>> + next = cfg = kern_buf;
>> +
>> + do {
>> + next = strchr(cfg, ';');
>> + if (next)
>> + *next = '\0';
>> +
>> + argv = argv_split(GFP_KERNEL, cfg, &argc);
>> + if (!argv) {
>> + dcount = -ENOMEM;
>> + goto leave;
>> + }
>> +
>> + if (fs_parse_trace_request(argc, argv)) {
>> + dcount = -EINVAL;
>> + argv_free(argv);
>> + goto leave;
>> + }
>> +
>> + argv_free(argv);
>> + if (next)
>> + cfg = ++next;
>> +
>> + } while (next);
>> + dcount += size;
>> + }
>> +leave:
>> + kfree(kern_buf);
>> + return dcount;
>> +}
>> +
>> +static void *fs_trace_seq_start(struct seq_file *m, loff_t *pos)
>> +{
>> + spin_lock(&fs_trace_lock);
>> + return seq_list_start(&fs_trace_list, *pos);
>> +}
>> +
>> +static void *fs_trace_seq_next(struct seq_file *m, void *v, loff_t *pos)
>> +{
>> + return seq_list_next(v, &fs_trace_list, pos);
>> +}
>> +
>> +static void fs_trace_seq_stop(struct seq_file *m, void *v)
>> +{
>> + spin_unlock(&fs_trace_lock);
>> +}
>> +
>> +static int fs_trace_seq_show(struct seq_file *m, void *v)
>> +{
>> + struct fs_trace_entry *en;
>> + struct super_block *sb;
>> + struct mount *r_mnt;
>> + const struct match_token *match;
>> + unsigned int nmask;
>> +
>> + en = list_entry(v, struct fs_trace_entry, node);
>> + sb = fs_trace_sb(en);
>> +
>> + seq_printf(m, "%d ", en->mark);
>> +
>> + seq_path(m, &en->path, "\t\n\\");
>> + seq_putc(m, ' ');
>> +
>> + seq_escape(m, sb->s_type->name, " \t\n\\");
>> + if (sb->s_subtype && sb->s_subtype[0]) {
>> + seq_putc(m, '.');
>> + seq_escape(m, sb->s_subtype, " \t\n\\");
>> + }
>> +
>> + seq_putc(m, ' ');
>> + if (sb->s_op->show_devname) {
>> + sb->s_op->show_devname(m, en->path.mnt->mnt_root);
>> + } else {
>> + r_mnt = real_mount(en->path.mnt);
>> + seq_escape(m, r_mnt->mnt_devname ? r_mnt->mnt_devname : "none",
>> + " \t\n\\");
>> + }
>> + seq_puts(m, " (");
>> +
>> + nmask = en->notify_mask;
>> + for (match = fs_etypes; match->pattern; ++match) {
>> + if (match->token & nmask) {
>> + seq_puts(m, match->pattern);
>> + nmask &= ~match->token;
>> + if (nmask)
>> + seq_putc(m, ',');
>> + }
>> + }
>> + seq_printf(m, " %llu %llu", en->thresh.lrange,
>> + en->thresh.urange);
>> + seq_puts(m, ")\n");
>> + return 0;
>> +}
>> +
>> +static const struct seq_operations fs_trace_seq_ops = {
>> + .start = fs_trace_seq_start,
>> + .next = fs_trace_seq_next,
>> + .stop = fs_trace_seq_stop,
>> + .show = fs_trace_seq_show,
>> +};
>> +
>> +static int fs_trace_open(struct inode *inode, struct file *file)
>> +{
>> + return seq_open(file, &fs_trace_seq_ops);
>> +}
>> +
>> +static const struct file_operations fs_trace_fops = {
>> + .owner = THIS_MODULE,
>> + .open = fs_trace_open,
>> + .write = fs_trace_write,
>> + .read = seq_read,
>> + .llseek = seq_lseek,
>> + .release = seq_release,
>> +};
>> +
>> +static int fs_trace_init(void)
>> +{
>> + fs_trace_cachep = KMEM_CACHE(fs_trace_entry, 0);
>> + if (!fs_trace_cachep)
>> + return -EINVAL;
>> + if (!fs_event_netlink_register()) {
>> + idr_init(&fs_trace_idr);
>> + return 0;
>> + }
>> + kmem_cache_destroy(fs_trace_cachep);
>> + return -EINVAL;
>> +}
>> +
>> +/* VFS support */
>> +static int fs_trace_fill_super(struct super_block *sb, void *data, int silent)
>> +{
>> + int ret;
>> + static struct tree_descr desc[] = {
>> + [2] = {
>> + .name = "config",
>> + .ops = &fs_trace_fops,
>> + .mode = S_IWUSR | S_IRUGO,
>> + },
>> + {""},
>> + };
>> +
>> + ret = simple_fill_super(sb, 0x7246332, desc);
>> + return !ret ? fs_trace_init() : ret;
>> +}
>> +
>> +static struct dentry *fs_trace_do_mount(struct file_system_type *fs_type,
>> + int ntype, const char *dev_name, void *data)
>> +{
>> + return mount_single(fs_type, ntype, data, fs_trace_fill_super);
>> +}
>> +
>> +static void fs_trace_kill_super(struct super_block *sb)
>> +{
>> + fs_remove_all_traces();
>> + idr_destroy(&fs_trace_idr);
>> + fs_event_netlink_unregister();
>> + kmem_cache_destroy(fs_trace_cachep);
>> + kill_litter_super(sb);
>> +}
>> +
>> +static struct kset *fs_trace_kset;
>> +static struct vfsmount *fs_trace_mount;
>> +
>> +static struct file_system_type fs_trace_fstype = {
>> + .name = "fstrace",
>> + .mount = fs_trace_do_mount,
>> + .kill_sb = fs_trace_kill_super,
>> +};
>> +
>> +static void __init fs_trace_vfs_init(void)
>> +{
>> + fs_trace_kset = kset_create_and_add("events", NULL, fs_kobj);
>> +
>> + if (!fs_trace_kset)
>> + return;
>> +
>> + if (!register_filesystem(&fs_trace_fstype)) {
>> + fs_trace_mount = kern_mount(&fs_trace_fstype);
>> + if (!IS_ERR(fs_trace_mount))
>> + return;
>> +
>> + unregister_filesystem(&fs_trace_fstype);
>> + }
>> + kset_unregister(fs_trace_kset);
>> +}
>> +
>> +static int __init fs_trace_events_init(void)
>> +{
>> + fs_trace_vfs_init();
>> + return 0;
>> +};
>> +module_init(fs_trace_events_init);
>> +
>> diff --git a/fs/events/fs_event.h b/fs/events/fs_event.h
>> new file mode 100644
>> index 0000000..4260ce5
>> --- /dev/null
>> +++ b/fs/events/fs_event.h
>> @@ -0,0 +1,27 @@
>> +/*
>> + * Copyright(c) 2015 Samsung Electronics. All rights reserved.
>> + *
>> + * This program is free software; you can redistribute it and/or modify it
>> + * under the terms of the GNU General Public License version 2.
>> + *
>> + * The full GNU General Public License is included in this distribution in the
>> + * file called COPYING.
>> + *
>> + * This program is distributed in the hope that it will be useful, but WITHOUT
>> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
>> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
>> + * more details.
>> + */
>> +
>> +#ifndef __GENERIC_FS_EVENTS_H
>> +#define __GENERIC_FS_EVENTS_H
>> +
>> +#ifdef CONFIG_NET
>> +int fs_event_netlink_register(void);
>> +void fs_event_netlink_unregister(void);
>> +#else /* CONFIG_NET */
>> +static inline int fs_event_netlink_register(void) { return -ENOSYS; }
>> +static inline void fs_event_netlink_unregister(void) {};
>> +#endif /* CONFIG_NET */
>> +
>> +#endif /* __GENERIC_FS_EVENTS_H */
>> diff --git a/fs/events/fs_event_netlink.c b/fs/events/fs_event_netlink.c
>> new file mode 100644
>> index 0000000..9c56e35
>> --- /dev/null
>> +++ b/fs/events/fs_event_netlink.c
>> @@ -0,0 +1,94 @@
>> +/*
>> + * Copyright(c) 2015 Samsung Electronics. All rights reserved.
>> + *
>> + * This program is free software; you can redistribute it and/or modify it
>> + * under the terms of the GNU General Public License version 2.
>> + *
>> + * The full GNU General Public License is included in this distribution in the
>> + * file called COPYING.
>> + *
>> + * This program is distributed in the hope that it will be useful, but WITHOUT
>> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
>> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
>> + * more details.
>> + */
>> +#include <linux/fs.h>
>> +#include <linux/init.h>
>> +#include <linux/kernel.h>
>> +#include <linux/sched.h>
>> +#include <linux/slab.h>
>> +#include <net/netlink.h>
>> +#include <net/genetlink.h>
>> +
>> +static const struct genl_multicast_group fs_event_mcgroups[] = {
>> + { .name = "event", },
>> +};
>> +
>> +static struct genl_family fs_event_family = {
>> + .id = GENL_ID_FS_EVENT,
>> + .hdrsize = 0,
>> + .name = "FS_EVENT",
>> + .version = 1,
>> + .maxattr = FS_EVENT_ATR_MAX,
>> + .mcgrps = fs_event_mcgroups,
>> + .n_mcgrps = ARRAY_SIZE(fs_event_mcgroups),
>> +};
>> +
>> +int fs_netlink_send_event(size_t size, unsigned int event_type,
>> + int (*compose_msg)(struct sk_buff *skb,
>> + unsigned int event_id, void *data),
>> + unsigned int event_id, void *data)
>> +{
>> + static atomic_t seq;
>> + struct sk_buff *skb;
>> + void *msg_head;
>> + int ret = 0;
>> +
>> + if (!size || !compose_msg)
>> + return -EINVAL;
>> +
>> + size += nla_total_size(sizeof(u64));
>> + skb = genlmsg_new(size, GFP_NOFS);
>> +
>> + if (!skb) {
>> + pr_err("Failed to allocate new FS generic netlink message\n");
>> + return -ENOMEM;
>> + }
>> +
>> + msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
>> + &fs_event_family, 0, event_type);
>> + if (!msg_head)
>> + goto cleanup;
>> +
>> + ret = compose_msg(skb, event_id, data);
>> + if (ret) {
>> + genlmsg_cancel(skb, msg_head);
>> + goto cleanup;
>> + }
>> +
>> + genlmsg_end(skb, msg_head);
>> + ret = genlmsg_multicast(&fs_event_family, skb, 0, 0, GFP_NOWAIT);
>> + if (ret && ret != -ENOBUFS && ret != -ESRCH)
>> + goto cleanup;
>> +
>> + return ret;
>> +cleanup:
>> + nlmsg_free(skb);
>> + return ret;
>> +}
>> +EXPORT_SYMBOL(fs_netlink_send_event);
>> +
>> +int fs_event_netlink_register(void)
>> +{
>> + int ret;
>> +
>> + ret = genl_register_family(&fs_event_family);
>> + if (ret)
>> + pr_err("Failed to register FS netlink interface\n");
>> + return ret;
>> +}
>> +
>> +void fs_event_netlink_unregister(void)
>> +{
>> + genl_unregister_family(&fs_event_family);
>> +}
>> diff --git a/fs/namespace.c b/fs/namespace.c
>> index 82ef140..ec6e2ef 100644
>> --- a/fs/namespace.c
>> +++ b/fs/namespace.c
>> @@ -1031,6 +1031,7 @@ static void cleanup_mnt(struct mount *mnt)
>> if (unlikely(mnt->mnt_pins.first))
>> mnt_pin_kill(mnt);
>> fsnotify_vfsmount_delete(&mnt->mnt);
>> + fs_event_mount_dropped(&mnt->mnt);
>> dput(mnt->mnt.mnt_root);
>> deactivate_super(mnt->mnt.mnt_sb);
>> mnt_free_id(mnt);
>> diff --git a/include/linux/fs.h b/include/linux/fs.h
>> index b4d71b5..bb529af 100644
>> --- a/include/linux/fs.h
>> +++ b/include/linux/fs.h
>> @@ -263,6 +263,10 @@ struct iattr {
>> * Includes for diskquotas.
>> */
>> #include <linux/quota.h>
>> +/*
>> + * Include for Generic File System Events Interface
>> + */
>> +#include <linux/fs_event.h>
>>
>> /*
>> * Maximum number of layers of fs stack. Needs to be limited to
>> @@ -1233,6 +1237,7 @@ struct super_block {
>> const struct dquot_operations *dq_op;
>> const struct quotactl_ops *s_qcop;
>> const struct export_operations *s_export_op;
>> + const struct fs_trace_operations *s_trace_ops;
>> unsigned long s_flags;
>> unsigned long s_magic;
>> struct dentry *s_root;
>> @@ -1253,7 +1258,6 @@ struct super_block {
>> struct hlist_node s_instances;
>> unsigned int s_quota_types; /* Bitmask of supported quota types */
>> struct quota_info s_dquot; /* Diskquota specific options */
>> -
>> struct sb_writers s_writers;
>>
>> char s_id[32]; /* Informational name */
>> diff --git a/include/linux/fs_event.h b/include/linux/fs_event.h
>> new file mode 100644
>> index 0000000..1e128d8
>> --- /dev/null
>> +++ b/include/linux/fs_event.h
>> @@ -0,0 +1,69 @@
>> +/*
>> + * Generic File System Events Interface
>> + *
>> + * Copyright(c) 2015 Samsung Electronics. All rights reserved.
>> + *
>> + * This program is free software; you can redistribute it and/or modify it
>> + * under the terms of the GNU General Public License version 2.
>> + *
>> + * The full GNU General Public License is included in this distribution in the
>> + * file called COPYING.
>> + *
>> + * This program is distributed in the hope that it will be useful, but WITHOUT
>> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
>> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
>> + * more details.
>> + */
>> +#ifndef _LINUX_GENERIC_FS_EVETS_
>> +#define _LINUX_GENERIC_FS_EVETS_
>> +#include <net/netlink.h>
>> +#include <uapi/linux/fs_event.h>
>> +
>> +/*
>> + * Those event flags match the event types send though the netlink interface
>> + * so mind in case making any modifications.
>> + */
>> +#define FS_EVENT_INFO 0x001
>> +#define FS_EVENT_WARN 0x002
>> +#define FS_EVENT_ERR 0x004
>> +#define FS_EVENT_THRESH 0x008
>> +
>> +#define FS_EVENTS_ALL \
>> + (FS_EVENT_INFO | FS_EVENT_WARN | FS_EVENT_THRESH | FS_EVENT_ERR)
>> +
>> +struct fs_trace_sdata {
>> + /* Supported notification types */
>> + unsigned int events_cap_mask;
>> + /* Number of available/reachable blocks */
>> + u64 available_blks;
>> +};
>> +
>> +struct fs_trace_operations {
>> + int (*fs_trace_query)(struct super_block *, struct fs_trace_sdata *);
>> +};
>> +
>> +
>> +void fs_event_notify(struct super_block *sb, unsigned int event_type,
>> + unsigned int event_id);
>> +void fs_event_alloc_space(struct super_block *sb, u64 ncount);
>> +void fs_event_free_space(struct super_block *sb, u64 ncount);
>> +void fs_event_mount_dropped(struct vfsmount *mnt);
>> +
>> +#ifdef CONFIG_NET
>> +int fs_netlink_send_event(size_t size, unsigned int event_type,
>> + int (*compose_msg)(struct sk_buff *skb, unsigned int event_id,
>> + void *data),
>> + unsigned int event_id, void *data);
>> +#else /* CONFIG_NET */
>> +static inline
>> +int fs_netlink_send_event(size_t size, unsigned int event_type,
>> + int (*compose_msg)(struct sk_buff *skb, unsigned int event_id,
>> + void *data),
>> + unsigned int event_idid, void *data)
>> +{
>> + return -ENOSYS;
>> +}
>> +#endif /* CONFIG_NET */
>> +
>> +#endif /* _LINUX_GENERIC_FS_EVENTS_ */
>> +
>> diff --git a/include/uapi/linux/fs_event.h b/include/uapi/linux/fs_event.h
>> new file mode 100644
>> index 0000000..dd79953
>> --- /dev/null
>> +++ b/include/uapi/linux/fs_event.h
>> @@ -0,0 +1,62 @@
>> +/*
>> + * Generic netlink support for Generic File System Events Interface
>> + *
>> + * Copyright(c) 2015 Samsung Electronics. All rights reserved.
>> + *
>> + * This program is free software; you can redistribute it and/or modify it
>> + * under the terms of the GNU General Public License version 2.
>> + *
>> + * The full GNU General Public License is included in this distribution in the
>> + * file called COPYING.
>> + *
>> + * This program is distributed in the hope that it will be useful, but WITHOUT
>> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
>> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
>> + * more details.
>> + */
>> +#ifndef _UAPI_LINUX_GENERIC_FS_EVENTS_
>> +#define _UAPI_LINUX_GENERIC_FS_EVENTS_
>> +/*
>> + * Generic FS event types
>> + */
>> +enum {
>> + FS_EVENT_TYPE_NONE,
>> + FS_EVENT_TYPE_INFO,
>> + FS_EVENT_TYPE_WARN,
>> + FS_EVENT_TYPE_ERR,
>> + FS_EVENT_TYPE_THRESH,
>> + FS_EVENT_TYPE_NEW_TRACE,
>> + __FS_EVENT_TYPE_MAX,
>> +};
>> +#define FS_EVENT_TYPE_MAX (__FS_EVENT_TYPE_MAX - 1)
>> +/*
>> + * Generic netlink attribute types
>> + */
>> +enum {
>> + FS_EVENT_ATR_NONE,
>> + FS_EVENT_ATR_FS_ID, /* An identifier of traced fs */
>> + FS_EVENT_ATR_MOUNT, /* Mount point directory name */
>> + FS_EVENT_ATR_DEV_MAJOR,
>> + FS_EVENT_ATR_DEV_MINOR,
>> + FS_EVENT_ATR_ID,
>> + FS_EVENT_ATR_CAUSED_ID,
>> + FS_EVENT_ATR_DATA,
>> + __FS_EVENT_ATR_MAX,
>> +};
>> +#define FS_EVENT_ATR_MAX (__FS_EVENT_ATR_MAX - 1)
>
> FS_EVENT_ATTR_ ? Most of the time, the kernel seems to use "attr" as shorthand
> for "attribute".
That actually varies. For netlink message attributes it is something more like *_NL_A_*.
Nevertheless, I'm totally fine with possibly renaming those.
>
>> +
>> +/*
>> + * Supported set of FS events ids
>> + */
>> +#define FS_INFO_UMOUNT 0x00000001 /* File system unmounted */
>> +#define FS_WARN_UNKNOWN 0x00000004 /* Unknown warning */
>> +#define FS_WARN_ENOSPC 0x00000008 /* No space left to reserve data blks */
>> +#define FS_WANR_ENOSPC_META 0x00000010 /* No space left for metadata */
>
> Why WANR, as opposed to WARN?
That's me misspelling.
>
>> +#define FS_THRESH_LR_REACHED 0x00000020 /* The lower range of threshold has been reached */
>> +#define FS_THRESH_UR_REACHED 0x00000040 /* The upper range of threshold has been reached */
>> +#define FS_ERR_UNKNOWN 0x00000080 /* Unknown error */
>> +#define FS_ERR_RO_REMOUT 0x00000100 /* The file system has been remounted as red-only */
>
> _REMOUNT... read-only...
>
>> +#define FS_ERR_ITERNAL 0x00000200 /* File system's internal error */
>
> _INTERNAL...
>
> What does FS_ERR_ITERNAL mean? "programming error"?
>
FS_ERR_ITERNAL is supposed to mean something that cannot be easily translated
into a generic event code - so something that is specific to a given file system type.
> How about a separate FS_ERR_CORRUPTED to mean "go run fsck"?
Sounds like a good idea.
>
> Hmm, these are bit flags... it doesn't make sense that I can send things like
> FS_INFO_UMOUNT | FS_ERR_RO_REMOUT.
>
You can but you shouldn't. Possibly some sanity checks could be added
for such cases. I was thinking of possibly merging events for the same
file system and sending them in one go - so a single message could contain
multiple events. Though this requires some more thought.
BR
Beata
>> +
>> +#endif /* _UAPI_LINUX_GENERIC_FS_EVENTS_ */
>> +
>> diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h
>> index c3363ba..6464129 100644
>> --- a/include/uapi/linux/genetlink.h
>> +++ b/include/uapi/linux/genetlink.h
>> @@ -29,6 +29,7 @@ struct genlmsghdr {
>> #define GENL_ID_CTRL NLMSG_MIN_TYPE
>> #define GENL_ID_VFS_DQUOT (NLMSG_MIN_TYPE + 1)
>> #define GENL_ID_PMCRAID (NLMSG_MIN_TYPE + 2)
>> +#define GENL_ID_FS_EVENT (NLMSG_MIN_TYPE + 3)
>>
>> /**************************************************************************
>> * Controller
>> diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
>> index 2ed5f96..e8e0bd68 100644
>> --- a/net/netlink/genetlink.c
>> +++ b/net/netlink/genetlink.c
>> @@ -82,7 +82,8 @@ static struct list_head family_ht[GENL_FAM_TAB_SIZE];
>> */
>> static unsigned long mc_group_start = 0x3 | BIT(GENL_ID_CTRL) |
>> BIT(GENL_ID_VFS_DQUOT) |
>> - BIT(GENL_ID_PMCRAID);
>> + BIT(GENL_ID_PMCRAID) |
>> + BIT(GENL_ID_FS_EVENT);
>> static unsigned long *mc_groups = &mc_group_start;
>> static unsigned long mc_groups_longs = 1;
>>
>> @@ -146,6 +147,7 @@ static u16 genl_generate_id(void)
>> for (i = 0; i <= GENL_MAX_ID - GENL_MIN_ID; i++) {
>> if (id_gen_idx != GENL_ID_VFS_DQUOT &&
>> id_gen_idx != GENL_ID_PMCRAID &&
>> + id_gen_idx != GENL_ID_FS_EVENT &&
>> !genl_family_find_byid(id_gen_idx))
>> return id_gen_idx;
>> if (++id_gen_idx > GENL_MAX_ID)
>> @@ -249,6 +251,9 @@ static int genl_validate_assign_mc_groups(struct genl_family *family)
>> } else if (family->id == GENL_ID_PMCRAID) {
>> first_id = GENL_ID_PMCRAID;
>> BUG_ON(n_groups != 1);
>> + } else if (family->id == GENL_ID_FS_EVENT) {
>> + first_id = GENL_ID_FS_EVENT;
>> + BUG_ON(n_groups != 1);
>> } else {
>> groups_allocated = true;
>> err = genl_allocate_reserve_groups(n_groups, &first_id);
>> --
>> 1.7.9.5
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
On 04/16/2015 05:46 AM, Eric Sandeen wrote:
> On 4/15/15 2:15 AM, Beata Michalska wrote:
>> Introduce configurable generic interface for file
>> system-wide event notifications to provide file
>> systems with a common way of reporting any potential
>> issues as they emerge.
>>
>> The notifications are to be issued through generic
>> netlink interface, by a dedicated, for file system
>> events, multicast group. The file systems might as
>> well use this group to send their own custom messages.
>
> ...
>
>> + 4.3 Threshold notifications:
>> +
>> + #include <linux/fs_event.h>
>> + void fs_event_alloc_space(struct super_block *sb, u64 ncount);
>> + void fs_event_free_space(struct super_block *sb, u64 ncount);
>> +
>> + Each filesystme supporting the treshold notifiactions should call
>> + fs_event_alloc_space/fs_event_free_space repsectively whenever the
>> + ammount of availbale blocks changes.
>> + - sb: the filesystem's super block
>> + - ncount: number of blocks being acquired/released
>
> so:
>
>> +void fs_event_alloc_space(struct super_block *sb, u64 ncount)
>> +{
>> + struct fs_trace_entry *en;
>> + s64 count;
>> +
>> + spin_lock(&fs_trace_lock);
>
> Every allocation/free for every supported filesystem system-wide will be
> serialized on this global spinlock? That sounds like a non-starter...
>
> -Eric
>
I guess there is plenty of room for improvement as this is an early version.
I do agree that this might be a performance bottleneck even though I've tried
to keep this to a minimum - it's being taken only for the hashtable look-up. But still...
I was considering placing the trace object within the super_block to skip
this look-up part but I'd like to gather more comments, especially on the concept
itself.
BR
Beata
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
On Thu, 16 Apr 2015, Beata Michalska wrote:
> On 04/16/2015 05:46 AM, Eric Sandeen wrote:
> > On 4/15/15 2:15 AM, Beata Michalska wrote:
> >> Introduce configurable generic interface for file
> >> system-wide event notifications to provide file
> >> systems with a common way of reporting any potential
> >> issues as they emerge.
> >>
> >> The notifications are to be issued through generic
> >> netlink interface, by a dedicated, for file system
> >> events, multicast group. The file systems might as
> >> well use this group to send their own custom messages.
> >
> > ...
> >
> >> + 4.3 Threshold notifications:
> >> +
> >> + #include <linux/fs_event.h>
> >> + void fs_event_alloc_space(struct super_block *sb, u64 ncount);
> >> + void fs_event_free_space(struct super_block *sb, u64 ncount);
> >> +
> >> + Each filesystme supporting the treshold notifiactions should call
> >> + fs_event_alloc_space/fs_event_free_space repsectively whenever the
> >> + ammount of availbale blocks changes.
> >> + - sb: the filesystem's super block
> >> + - ncount: number of blocks being acquired/released
> >
> > so:
> >
> >> +void fs_event_alloc_space(struct super_block *sb, u64 ncount)
> >> +{
> >> + struct fs_trace_entry *en;
> >> + s64 count;
> >> +
> >> + spin_lock(&fs_trace_lock);
> >
> > Every allocation/free for every supported filesystem system-wide will be
> > serialized on this global spinlock? That sounds like a non-starter...
> >
> > -Eric
> >
> I guess there is a plenty room for improvements as this is an early version.
> I do agree that this might be a performance bottleneck event though I've tried
> to keep this to minimum - it's being taken only for hashtable look-up. But still...
> I was considering placing the trace object within the super_block to skip
> this look-up part but I'd like to gather more comments, especially on the concept
> itself.
Sorry, I have no opinion on the netlink fs notifications concept
itself, not my area of expertise at all.
No doubt you Cc'ed me for tmpfs: I am very glad you're now trying the
generic filesystem route, and yes, I'd be happy to have the support
in tmpfs, thank you - if it is generally agreed to be suitable for
filesystems; but wouldn't want this as a special for tmpfs.
However, I must echo Eric's point: please take a look at 7e496299d4d2
"tmpfs: make tmpfs scalable with percpu_counter for used blocks":
Tim would be unhappy if you added overhead back into that path.
(And please Cc [email protected] next time you post these.)
Hugh
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
On 15.04.2015 09:15, Beata Michalska wrote:
> Introduce configurable generic interface for file
> system-wide event notifications to provide file
> systems with a common way of reporting any potential
> issues as they emerge.
>
> The notifications are to be issued through generic
> netlink interface, by a dedicated, for file system
> events, multicast group. The file systems might as
> well use this group to send their own custom messages.
>
> The events have been split into four base categories:
> information, warnings, errors and threshold notifications,
> with some very basic event types like running out of space
> or file system being remounted as read-only.
>
> Threshold notifications have been included to allow
> triggering an event whenever the amount of free space
> drops below a certain level - or levels to be more precise
> as two of them are being supported: the lower and the upper
> range. The notifications work both ways: once the threshold
> level has been reached, an event shall be generated whenever
> the number of available blocks goes up again re-activating
> the threshold.
>
> The interface has been exposed through a vfs. Once mounted,
> it serves as an entry point for the set-up where one can
> register for particular file system events.
Having a framework for notification for file systems is a great idea.
Your solution covers an important part of the possible application scope.
Before moving forward I suggest we should analyze if this scope should
be enlarged.
Many filesystems are remote (e.g. CIFS/Samba) or distributed over many
network nodes (e.g. Lustre). How should file system notification work here?
How will fuse file systems be served?
The current point of reference is a single mount point.
Every time I insert a USB stick, several file systems may be automounted.
I would like to receive events for these automounted file systems.
A similar case arises when starting new virtual machines. How will I
receive events on the host system for the file systems of the virtual
machines?
In your implementation events are received via Netlink.
Using Netlink for marking mounts for notification would create a much
more homogeneous interface. So why should we use a virtual file system here?
Best regards
Heinrich Schuchardt
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
Hello,
On Wed 15-04-15 09:15:43, Beata Michalska wrote:
> The following patchset is a result of previous discussions regarding
> file system threshold notifiactions. It introduces support for file
> system event notifications, sent through generic netlinik interface
> whenever an fs-related event occurs. Included are also some shmem
> and ext4 changes showing how the new interface might actually be used.
>
> The vary idea of using the generic netlink interface has been previoulsy
> suggested here: https://lkml.org/lkml/2011/8/18/169
>
> The basic description of the new functionality can be found in
> the first patch from this set - both in the commit message and
> in the doc file.
>
> Some very basic tests have been performed though still this is
> a PoC version. Below though is a sample user space application
> which subscribes to the new multicast group and listens for
> potential fs-related events. The code has been based on libnl 3.4
> and its test application for the generic netlink.
Thanks for the patches! As a general note for the next posting, please CC
also [email protected] (since this has implications for other
filesystems as well, specifically I know about XFS guys thinking about some
notification system as well) and [email protected] (since this is a
new kernel interface to userspace).
Honza
>
> ---
>
> Beata Michalska (4):
> fs: Add generic file system event notifications
> ext4: Add helper function to mark group as corrupted
> ext4: Add support for generic FS events
> shmem: Add support for generic FS events
>
> Documentation/filesystems/events.txt | 254 +++++++++++
> fs/Makefile | 1 +
> fs/events/Makefile | 6 +
> fs/events/fs_event.c | 775 ++++++++++++++++++++++++++++++++++
> fs/events/fs_event.h | 27 ++
> fs/events/fs_event_netlink.c | 94 +++++
> fs/ext4/balloc.c | 26 +-
> fs/ext4/ext4.h | 10 +
> fs/ext4/ialloc.c | 5 +-
> fs/ext4/inode.c | 2 +-
> fs/ext4/mballoc.c | 17 +-
> fs/ext4/resize.c | 1 +
> fs/ext4/super.c | 43 ++
> fs/namespace.c | 1 +
> include/linux/fs.h | 6 +-
> include/linux/fs_event.h | 69 +++
> include/uapi/linux/fs_event.h | 62 +++
> include/uapi/linux/genetlink.h | 1 +
> mm/shmem.c | 39 +-
> net/netlink/genetlink.c | 7 +-
> 20 files changed, 1412 insertions(+), 34 deletions(-)
> create mode 100644 Documentation/filesystems/events.txt
> create mode 100644 fs/events/Makefile
> create mode 100644 fs/events/fs_event.c
> create mode 100644 fs/events/fs_event.h
> create mode 100644 fs/events/fs_event_netlink.c
> create mode 100644 include/linux/fs_event.h
> create mode 100644 include/uapi/linux/fs_event.h
>
> ---
> Sample application:
>
> #include <netlink/cli/utils.h>
> #include <fs_event.h>
>
> #define ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0]))
> #define LOG(args...) fprintf(stderr, args)
>
> static int parse_info(struct nl_cache_ops *unused, struct genl_cmd *cmd,
> struct genl_info *info, void *arg)
> {
> LOG("New trace %d:\n",
> info->attrs[FS_EVENT_ATR_FS_ID]
> ? nla_get_u32(info->attrs[FS_EVENT_ATR_FS_ID])
> : -1);
> LOG("Mout point: %s\n", info->attrs[FS_EVENT_ATR_MOUNT]
> ? nla_get_string(info->attrs[FS_EVENT_ATR_MOUNT])
> : "unknown");
> return 0;
> }
>
> static int parse_thres(struct nl_cache_ops *unused, struct genl_cmd *cmd,
> struct genl_info *info, void *arg)
> {
>
> LOG("Threshold notification received for trace %d:\n",
> info->attrs[FS_EVENT_ATR_FS_ID]
> ? nla_get_u32(info->attrs[FS_EVENT_ATR_FS_ID])
> : -1);
>
> if (info->attrs[FS_EVENT_ATR_DEV_MAJOR])
> LOG("Backing dev major: %u\n",
> nla_get_u32(info->attrs[FS_EVENT_ATR_DEV_MAJOR]));
> if (info->attrs[FS_EVENT_ATR_DEV_MINOR])
> LOG("Backing dev minor: %u\n",
> nla_get_u32(info->attrs[FS_EVENT_ATR_DEV_MINOR]));
> LOG("Proc: %u\n", info->attrs[FS_EVENT_ATR_CAUSED_ID] ?
> nla_get_u32(info->attrs[FS_EVENT_ATR_CAUSED_ID]) : -1);
> LOG("Threshold data: %llu\n", info->attrs[FS_EVENT_ATR_DATA]
> ? nla_get_u64(info->attrs[FS_EVENT_ATR_DATA])
> : 0);
>
> return 0;
> }
>
> static int parse_warning(struct nl_cache_ops *unused, struct genl_cmd *cmd,
> struct genl_info *info, void *arg)
> {
>
> LOG("Warning recieved for trace %d\n", info->attrs[FS_EVENT_ATR_FS_ID] ?
> nla_get_u32(info->attrs[FS_EVENT_ATR_FS_ID]) : -1);
> if (info->attrs[FS_EVENT_ATR_DEV_MAJOR])
> LOG("Backing dev major: %u\n",
> nla_get_u32(info->attrs[FS_EVENT_ATR_DEV_MAJOR]));
> if (info->attrs[FS_EVENT_ATR_DEV_MINOR])
> LOG("Backing dev minor: %u\n",
> nla_get_u32(info->attrs[FS_EVENT_ATR_DEV_MINOR]));
> LOG("Proc: %u\n", info->attrs[FS_EVENT_ATR_CAUSED_ID] ?
> nla_get_u32(info->attrs[FS_EVENT_ATR_CAUSED_ID]) : -1);
> LOG("Warning: %u\n", info->attrs[FS_EVENT_ATR_ID] ?
> nla_get_u32(info->attrs[FS_EVENT_ATR_ID]) : -1);
>
> return 0;
> }
>
> static struct genl_cmd cmd[] = {
> {
> .c_id = FS_EVENT_TYPE_NEW_TRACE,
> .c_name = "info",
> .c_maxattr = 2,
> .c_msg_parser = parse_info,
> }, {
> .c_id = FS_EVENT_TYPE_THRESH,
> .c_name = "thres",
> .c_maxattr = 6,
> .c_msg_parser = parse_thres,
> }, {
> .c_id = FS_EVENT_TYPE_WARN,
> .c_name = "warn",
> .c_maxattr = 5,
> .c_msg_parser = parse_warning,
> },
> };
>
> static struct genl_ops ops = {
> .o_id = GENL_ID_FS_EVENT,
> .o_name = "FS_EVENT",
> .o_hdrsize = 0,
> .o_cmds = cmd,
> .o_ncmds = ARRAY_SIZE(cmd),
> };
>
>
> int events_cb(struct nl_msg *msg, void *arg)
> {
> return genl_handle_msg(msg, arg);
> }
>
> int main(int argc, char **argv)
> {
> struct nl_sock *sock;
> int ret;
>
> sock = nl_cli_alloc_socket();
> nl_socket_set_local_port(sock, 0);
> nl_socket_disable_seq_check(sock);
>
> nl_socket_modify_cb(sock, NL_CB_VALID, NL_CB_CUSTOM, events_cb, NULL);
>
> nl_cli_connect(sock, NETLINK_GENERIC);
>
> if ((ret = nl_socket_add_membership(sock, GENL_ID_FS_EVENT))) {
> LOG("Failed to add membership\n");
> goto leave;
> }
>
> if((ret = genl_register_family(&ops))) {
> LOG("Failed to register protocol family\n");
> goto leave;
> }
>
> if ((ret = genl_ops_resolve(sock, &ops) < 0)) {
> LOG("Unable to resolve the family name\n");
> goto leave;
> }
>
> if (genl_ctrl_resolve(sock, "FS_EVENT") < 0) {
> LOG("Failed to resolve the family name\n");
> goto leave;
> }
>
> while (1) {
> if ((ret = nl_recvmsgs_default(sock)) < 0)
> LOG("Unable to receive message: %s\n", nl_geterror(ret));
> }
>
> leave:
> nl_close(sock);
> nl_socket_free(sock);
> return 0;
> }
>
> --
> 1.7.9.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Thu 16-04-15 10:22:45, Beata Michalska wrote:
> On 04/15/2015 09:25 PM, Darrick J. Wong wrote:
> > On Wed, Apr 15, 2015 at 09:15:44AM +0200, Beata Michalska wrote:
> >
> >> +#define FS_THRESH_LR_REACHED 0x00000020 /* The lower range of threshold has been reached */
> >> +#define FS_THRESH_UR_REACHED 0x00000040 /* The upper range of threshold has been reached */
> >> +#define FS_ERR_UNKNOWN 0x00000080 /* Unknown error */
> >> +#define FS_ERR_RO_REMOUT 0x00000100 /* The file system has been remounted as red-only */
> >
> > _REMOUNT... read-only...
> >
> >> +#define FS_ERR_ITERNAL 0x00000200 /* File system's internal error */
> >
> > _INTERNAL...
> >
> > What does FS_ERR_ITERNAL mean? "programming error"?
> >
> FS_ERR_ITERNAL is supposed to mean smth than can not be easily translated
> into generic event code - so smth that is specific for given file system type.
>
>
> > How about a separate FS_ERR_CORRUPTED to mean "go run fsck"?
>
> Sounds like a good idea.
>
> >
> > Hmm, these are bit flags... it doesn't make sense that I can send things like
> > FS_INFO_UMOUNT | FS_ERR_RO_REMOUT.
> >
>
> You can but you shouldn't. Possibly some sanity checks could be added
> for such cases. I was thinking of possibly merging events for the same
> file system and sending them in one go - so a single message could contain
> multiple events. Though this requires some more thoughts.
Well, I don't think merging events makes much sense. I don't expect so
many messages going over this interface that merging would be necessary
to get good performance. And when you merge events, you lose information
about the order - like was it below_limit_info and then above_limit_warn or
the other way around? Also events might carry other data with them in which
case merging is impossible anyway.
So I'd vote for just not allowing merging and making message type a simple
enum.
Honza
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
Hi,
On 04/17/2015 10:17 AM, Jan Kara wrote:
> Hello,
>
> On Wed 15-04-15 09:15:43, Beata Michalska wrote:
>> The following patchset is a result of previous discussions regarding
>> file system threshold notifiactions. It introduces support for file
>> system event notifications, sent through generic netlinik interface
>> whenever an fs-related event occurs. Included are also some shmem
>> and ext4 changes showing how the new interface might actually be used.
>>
>> The vary idea of using the generic netlink interface has been previoulsy
>> suggested here: https://lkml.org/lkml/2011/8/18/169
>>
>> The basic description of the new functionality can be found in
>> the first patch from this set - both in the commit message and
>> in the doc file.
>>
>> Some very basic tests have been performed though still this is
>> a PoC version. Below though is a sample user space application
>> which subscribes to the new multicast group and listens for
>> potential fs-related events. The code has been based on libnl 3.4
>> and its test application for the generic netlink.
> Thanks for the patches! As a general note for the next posting, please CC
> also [email protected] (since this has implications for other
> filesystems as well, specifically I know about XFS guys thinking about some
> notification system as well) and [email protected] (since this is a
> new kernel interface to userspace).
>
> Honza
My bad - with the CC list. Apologies for that.
BR
Beata
>>
>> ---
>>
>> Beata Michalska (4):
>> fs: Add generic file system event notifications
>> ext4: Add helper function to mark group as corrupted
>> ext4: Add support for generic FS events
>> shmem: Add support for generic FS events
>>
>> Documentation/filesystems/events.txt | 254 +++++++++++
>> fs/Makefile | 1 +
>> fs/events/Makefile | 6 +
>> fs/events/fs_event.c | 775 ++++++++++++++++++++++++++++++++++
>> fs/events/fs_event.h | 27 ++
>> fs/events/fs_event_netlink.c | 94 +++++
>> fs/ext4/balloc.c | 26 +-
>> fs/ext4/ext4.h | 10 +
>> fs/ext4/ialloc.c | 5 +-
>> fs/ext4/inode.c | 2 +-
>> fs/ext4/mballoc.c | 17 +-
>> fs/ext4/resize.c | 1 +
>> fs/ext4/super.c | 43 ++
>> fs/namespace.c | 1 +
>> include/linux/fs.h | 6 +-
>> include/linux/fs_event.h | 69 +++
>> include/uapi/linux/fs_event.h | 62 +++
>> include/uapi/linux/genetlink.h | 1 +
>> mm/shmem.c | 39 +-
>> net/netlink/genetlink.c | 7 +-
>> 20 files changed, 1412 insertions(+), 34 deletions(-)
>> create mode 100644 Documentation/filesystems/events.txt
>> create mode 100644 fs/events/Makefile
>> create mode 100644 fs/events/fs_event.c
>> create mode 100644 fs/events/fs_event.h
>> create mode 100644 fs/events/fs_event_netlink.c
>> create mode 100644 include/linux/fs_event.h
>> create mode 100644 include/uapi/linux/fs_event.h
>>
>> ---
>> Sample application:
>>
>> #include <netlink/cli/utils.h>
>> #include <fs_event.h>
>>
>> #define ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0]))
>> #define LOG(args...) fprintf(stderr, args)
>>
>> static int parse_info(struct nl_cache_ops *unused, struct genl_cmd *cmd,
>> struct genl_info *info, void *arg)
>> {
>> LOG("New trace %d:\n",
>> info->attrs[FS_EVENT_ATR_FS_ID]
>> ? nla_get_u32(info->attrs[FS_EVENT_ATR_FS_ID])
>> : -1);
>> LOG("Mout point: %s\n", info->attrs[FS_EVENT_ATR_MOUNT]
>> ? nla_get_string(info->attrs[FS_EVENT_ATR_MOUNT])
>> : "unknown");
>> return 0;
>> }
>>
>> static int parse_thres(struct nl_cache_ops *unused, struct genl_cmd *cmd,
>> struct genl_info *info, void *arg)
>> {
>>
>> LOG("Threshold notification received for trace %d:\n",
>> info->attrs[FS_EVENT_ATR_FS_ID]
>> ? nla_get_u32(info->attrs[FS_EVENT_ATR_FS_ID])
>> : -1);
>>
>> if (info->attrs[FS_EVENT_ATR_DEV_MAJOR])
>> LOG("Backing dev major: %u\n",
>> nla_get_u32(info->attrs[FS_EVENT_ATR_DEV_MAJOR]));
>> if (info->attrs[FS_EVENT_ATR_DEV_MINOR])
>> LOG("Backing dev minor: %u\n",
>> nla_get_u32(info->attrs[FS_EVENT_ATR_DEV_MINOR]));
>> LOG("Proc: %u\n", info->attrs[FS_EVENT_ATR_CAUSED_ID] ?
>> nla_get_u32(info->attrs[FS_EVENT_ATR_CAUSED_ID]) : -1);
>> LOG("Threshold data: %llu\n", info->attrs[FS_EVENT_ATR_DATA]
>> ? nla_get_u64(info->attrs[FS_EVENT_ATR_DATA])
>> : 0);
>>
>> return 0;
>> }
>>
>> static int parse_warning(struct nl_cache_ops *unused, struct genl_cmd *cmd,
>> struct genl_info *info, void *arg)
>> {
>>
>> LOG("Warning recieved for trace %d\n", info->attrs[FS_EVENT_ATR_FS_ID] ?
>> nla_get_u32(info->attrs[FS_EVENT_ATR_FS_ID]) : -1);
>> if (info->attrs[FS_EVENT_ATR_DEV_MAJOR])
>> LOG("Backing dev major: %u\n",
>> nla_get_u32(info->attrs[FS_EVENT_ATR_DEV_MAJOR]));
>> if (info->attrs[FS_EVENT_ATR_DEV_MINOR])
>> LOG("Backing dev minor: %u\n",
>> nla_get_u32(info->attrs[FS_EVENT_ATR_DEV_MINOR]));
>> LOG("Proc: %u\n", info->attrs[FS_EVENT_ATR_CAUSED_ID] ?
>> nla_get_u32(info->attrs[FS_EVENT_ATR_CAUSED_ID]) : -1);
>> LOG("Warning: %u\n", info->attrs[FS_EVENT_ATR_ID] ?
>> nla_get_u32(info->attrs[FS_EVENT_ATR_ID]) : -1);
>>
>> return 0;
>> }
>>
>> static struct genl_cmd cmd[] = {
>> {
>> .c_id = FS_EVENT_TYPE_NEW_TRACE,
>> .c_name = "info",
>> .c_maxattr = 2,
>> .c_msg_parser = parse_info,
>> }, {
>> .c_id = FS_EVENT_TYPE_THRESH,
>> .c_name = "thres",
>> .c_maxattr = 6,
>> .c_msg_parser = parse_thres,
>> }, {
>> .c_id = FS_EVENT_TYPE_WARN,
>> .c_name = "warn",
>> .c_maxattr = 5,
>> .c_msg_parser = parse_warning,
>> },
>> };
>>
>> static struct genl_ops ops = {
>> .o_id = GENL_ID_FS_EVENT,
>> .o_name = "FS_EVENT",
>> .o_hdrsize = 0,
>> .o_cmds = cmd,
>> .o_ncmds = ARRAY_SIZE(cmd),
>> };
>>
>>
>> int events_cb(struct nl_msg *msg, void *arg)
>> {
>> return genl_handle_msg(msg, arg);
>> }
>>
>> int main(int argc, char **argv)
>> {
>> struct nl_sock *sock;
>> int ret;
>>
>> sock = nl_cli_alloc_socket();
>> nl_socket_set_local_port(sock, 0);
>> nl_socket_disable_seq_check(sock);
>>
>> nl_socket_modify_cb(sock, NL_CB_VALID, NL_CB_CUSTOM, events_cb, NULL);
>>
>> nl_cli_connect(sock, NETLINK_GENERIC);
>>
>> if ((ret = nl_socket_add_membership(sock, GENL_ID_FS_EVENT))) {
>> LOG("Failed to add membership\n");
>> goto leave;
>> }
>>
>> if((ret = genl_register_family(&ops))) {
>> LOG("Failed to register protocol family\n");
>> goto leave;
>> }
>>
>> if ((ret = genl_ops_resolve(sock, &ops) < 0)) {
>> LOG("Unable to resolve the family name\n");
>> goto leave;
>> }
>>
>> if (genl_ctrl_resolve(sock, "FS_EVENT") < 0) {
>> LOG("Failed to resolve the family name\n");
>> goto leave;
>> }
>>
>> while (1) {
>> if ((ret = nl_recvmsgs_default(sock)) < 0)
>> LOG("Unable to receive message: %s\n", nl_geterror(ret));
>> }
>>
>> leave:
>> nl_close(sock);
>> nl_socket_free(sock);
>> return 0;
>> }
>>
>> --
>> 1.7.9.5
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
Hi,
On 04/16/2015 10:10 PM, Hugh Dickins wrote:
> On Thu, 16 Apr 2015, Beata Michalska wrote:
>> On 04/16/2015 05:46 AM, Eric Sandeen wrote:
>>> On 4/15/15 2:15 AM, Beata Michalska wrote:
>>>> Introduce configurable generic interface for file
>>>> system-wide event notifications to provide file
>>>> systems with a common way of reporting any potential
>>>> issues as they emerge.
>>>>
>>>> The notifications are to be issued through generic
>>>> netlink interface, by a dedicated, for file system
>>>> events, multicast group. The file systems might as
>>>> well use this group to send their own custom messages.
>>>
>>> ...
>>>
>>>> + 4.3 Threshold notifications:
>>>> +
>>>> + #include <linux/fs_event.h>
>>>> + void fs_event_alloc_space(struct super_block *sb, u64 ncount);
>>>> + void fs_event_free_space(struct super_block *sb, u64 ncount);
>>>> +
>>>> + Each filesystme supporting the treshold notifiactions should call
>>>> + fs_event_alloc_space/fs_event_free_space repsectively whenever the
>>>> + ammount of availbale blocks changes.
>>>> + - sb: the filesystem's super block
>>>> + - ncount: number of blocks being acquired/released
>>>
>>> so:
>>>
>>>> +void fs_event_alloc_space(struct super_block *sb, u64 ncount)
>>>> +{
>>>> + struct fs_trace_entry *en;
>>>> + s64 count;
>>>> +
>>>> + spin_lock(&fs_trace_lock);
>>>
>>> Every allocation/free for every supported filesystem system-wide will be
>>> serialized on this global spinlock? That sounds like a non-starter...
>>>
>>> -Eric
>>>
>> I guess there is a plenty room for improvements as this is an early version.
>> I do agree that this might be a performance bottleneck event though I've tried
>> to keep this to minimum - it's being taken only for hashtable look-up. But still...
>> I was considering placing the trace object within the super_block to skip
>> this look-up part but I'd like to gather more comments, especially on the concept
>> itself.
>
> Sorry, I have no opinion on the netlink fs notifications concept
> itself, not my area of expertise at all.
>
> No doubt you Cc'ed me for tmpfs: I am very glad you're now trying the
> generic filesystem route, and yes, I'd be happy to have the support
> in tmpfs, thank you - if it is generally agreed to be suitable for
> filesystems; but wouldn't want this as a special for tmpfs.
>
> However, I must echo Eric's point: please take a look at 7e496299d4d2
> "tmpfs: make tmpfs scalable with percpu_counter for used blocks":
> Tim would be unhappy if you added overhead back into that path.
>
> (And please Cc [email protected] next time you post these.)
>
> Hugh
>
Well, the concept of using the netlink interface here is just a part of the overall
idea - so any comments are really welcome here. The more of them, the better the
solution that can be worked out, I believe.
As for the possible overhead: this is the last thing I would want, so I'll
definitely do my best not to introduce any. I will definitely rework this.
Thanks for Your comments,
BR
Beata
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
Hi,
On 04/16/2015 11:56 PM, Heinrich Schuchardt wrote:
> On 15.04.2015 09:15, Beata Michalska wrote:
>> Introduce configurable generic interface for file
>> system-wide event notifications to provide file
>> systems with a common way of reporting any potential
>> issues as they emerge.
>>
>> The notifications are to be issued through generic
>> netlink interface, by a dedicated, for file system
>> events, multicast group. The file systems might as
>> well use this group to send their own custom messages.
>>
>> The events have been split into four base categories:
>> information, warnings, errors and threshold notifications,
>> with some very basic event types like running out of space
>> or file system being remounted as read-only.
>>
>> Threshold notifications have been included to allow
>> triggering an event whenever the amount of free space
>> drops below a certain level - or levels to be more precise
>> as two of them are being supported: the lower and the upper
>> range. The notifications work both ways: once the threshold
>> level has been reached, an event shall be generated whenever
>> the number of available blocks goes up again re-activating
>> the threshold.
>>
>> The interface has been exposed through a vfs. Once mounted,
>> it serves as an entry point for the set-up where one can
>> register for particular file system events.
>
> Having a framework for notification for file systems is a great idea.
> Your solution covers an important part of the possible application scope.
>
> Before moving forward I suggest we should analyze if this scope should
> be enlarged.
>
> Many filesystems are remote (e.g. CIFS/Samba) or distributed over many
> network nodes (e.g. Lustre). How should file system notification work here?
>
> How will fuse file systems be served?
>
> The current point of reference is a single mount point.
> Every time I insert an USB stick several file system may be automounted.
> I would like to receive events for these automounted file systems.
>
> A similar case arises when starting new virtual machines. How will I
> receive events on the host system for the file systems of the virtual
> machines?
> In your implementation events are received via Netlink.
> Using Netlink for marking mounts for notification would create a much
> more homogenous interface. So why should we use a virtual file system here?
>
> Best regards
>
> Heinrich Schuchardt
>
>
I'd be more than happy to extend the scope of suggested changes.
I hope I'll be able to collect more comments - in this way there
is a chance we might get here smth that is really useful, for everyone.
I've tried to make the interface rather flexible, so that new cases
can be easily added - so the notification whenever a file system
is being mounted is definitely doable.
The vfs here merely serves the purpose to configure which type of events
and for which filesystems are to be issued. Having this done through
netlink is also an option, though it needs some more thoughts. The way
notifications are being sent might be extended: so there could be more
than one option for this. We might also want to consider if we want to
have this widely available - everything for everyone. (?)
As for the rest, I must admit I'm not really an fs person, so I assume
there will be more comments and questions like yours. This is also why
any comments/hints/remarks/doubts/issues etc would be more than just
welcome. I'll try to answer them all, though this will require some
time on my side, thus apologies if I have some delays.
I'll get back to this asap.
BR
Beata
On Wed 15-04-15 09:15:44, Beata Michalska wrote:
> Introduce configurable generic interface for file
> system-wide event notifications to provide file
> systems with a common way of reporting any potential
> issues as they emerge.
>
> The notifications are to be issued through generic
> netlink interface, by a dedicated, for file system
> events, multicast group. The file systems might as
> well use this group to send their own custom messages.
>
> The events have been split into four base categories:
> information, warnings, errors and threshold notifications,
> with some very basic event types like running out of space
> or file system being remounted as read-only.
>
> Threshold notifications have been included to allow
> triggering an event whenever the amount of free space
> drops below a certain level - or levels to be more precise
> as two of them are being supported: the lower and the upper
> range. The notifications work both ways: once the threshold
> level has been reached, an event shall be generated whenever
> the number of available blocks goes up again re-activating
> the threshold.
>
> The interface has been exposed through a vfs. Once mounted,
> it serves as an entry point for the set-up where one can
> register for particular file system events.
>
> Signed-off-by: Beata Michalska <[email protected]>
Thanks for the patches! Some comments are below.
> ---
> Documentation/filesystems/events.txt | 254 +++++++++++
> fs/Makefile | 1 +
> fs/events/Makefile | 6 +
> fs/events/fs_event.c | 775 ++++++++++++++++++++++++++++++++++
> fs/events/fs_event.h | 27 ++
> fs/events/fs_event_netlink.c | 94 +++++
> fs/namespace.c | 1 +
> include/linux/fs.h | 6 +-
> include/linux/fs_event.h | 69 +++
> include/uapi/linux/fs_event.h | 62 +++
> include/uapi/linux/genetlink.h | 1 +
> net/netlink/genetlink.c | 7 +-
> 12 files changed, 1301 insertions(+), 2 deletions(-)
> create mode 100644 Documentation/filesystems/events.txt
> create mode 100644 fs/events/Makefile
> create mode 100644 fs/events/fs_event.c
> create mode 100644 fs/events/fs_event.h
> create mode 100644 fs/events/fs_event_netlink.c
> create mode 100644 include/linux/fs_event.h
> create mode 100644 include/uapi/linux/fs_event.h
>
> diff --git a/Documentation/filesystems/events.txt b/Documentation/filesystems/events.txt
> new file mode 100644
> index 0000000..c85dd88
> --- /dev/null
> +++ b/Documentation/filesystems/events.txt
> @@ -0,0 +1,254 @@
> +
> + Generic file system event notification interface
> +
> +Document created 09 April 2015 by Beata Michalska <[email protected]>
> +
> +1. The reason behind:
> +=====================
> +
> +There are many corner cases when things might get messy with the filesystems.
> +And it is not always obvious what and when went wrong. Sometimes you might
> +get some subtle hints that there is something going on - but by the time
> +you realise it, it might be too late as you are already out-of-space
> +or the filesystem has been remounted as read-only (i.e.). The generic
> +interface for the filesystem events fills the gap by providing a rather
> +easy way of real-time notifications triggered whenever something intreseting
> +happens, allowing filesystems to report events in a common way, as they occur.
> +
> +2. How does it work:
> +====================
> +
> +The interface itself has been exposed as fstrace-type Virtual File System,
> +primarily to ease the process of setting up the configuration for the file
> +system notifications. So for starters it needs to get mounted (obviously):
> +
> + mount -t fstrace none /sys/fs/events
> +
> +This will unveil the single fstrace filesystem entry - the 'config' file,
> +through which the notification are being set-up.
> +
> +Activating notifications for particular filesystem is as straightforward
> +as writing into the 'config' file. Note that by default all events despite
> +the actual filesystem type are being disregarded.
Is there a reason to have a special filesystem for this? Do you expect
extending it by (many) more files? Why not just create a file in sysfs or
something like that?
> +Synopsis of config:
> +------------------
> +
> + MOUNT EVENT_TYPE [L1] [L2]
> +
> + MOUNT : the filesystem's mount point
I'm not quite decided but is mountpoint really the right thing to pass
via the interface? They aren't unique (filesystem can be mounted in
multiple places) and more importantly can change over time. So won't it be
better to pass major:minor over the interface? These are stable, unique to
the filesystem, and userspace can easily get them by calling stat(2) on the
desired path (or directly from /proc/self/mountinfo). That could be also
used as an fs identifier instead of assigned ID (and thus we won't need
those events about creation of new trace which look somewhat strange to
me).
OTOH using major:minor may have issues in container world where processes
could watch events from filesystems inaccessible to the container if they
guess the device number. So maybe we could use 'path' when creating new
trace but I'd still like to use the device number internally and for all
outgoing communication because of above mentioned problems with
mountpoints.
> + EVENT_TYPE : type of events to be enabled: info,warn,err,thr;
> + at least one type needs to be specified;
> + note the comma delimiter and lack of spaces between
> + those options
> + L1 : the threshold limit - lower range
> + L2 : the threshold limit - upper range
> + case enabling threshold notifications the lower level is
> + mandatory, whereas the upper one remains optional;
> + note though, that as those refer to the number of available
> + blocks, the lower level needs to be higher than the upper one
> +
> +Sample request could look like the follwoing:
> +
> + echo /sample/mount/point warn,err,thr 710000 500000 > /sys/fs/events/config
> +
> +Multiple request might be specified provided they are separated with semicolon.
Is this necessary? It somewhat complicates syntax and parsing in kernel
and I don't see a need for that. I'd prefer to keep the interface as simple
as possible.
Also I think that we should make it clear that each event type has
different set of arguments. For threshold events they'll be L1 & L2, for
other events there may be no arguments, for other events maybe something
else...
...
> +static const match_table_t fs_etypes = {
> + { FS_EVENT_INFO, "info" },
> + { FS_EVENT_WARN, "warn" },
> + { FS_EVENT_THRESH, "thr" },
> + { FS_EVENT_ERR, "err" },
> + { 0, NULL },
> +};
Why are there these generic message types? Threshold messages make good
sense to me. But not so much the rest. If they don't have a clear meaning,
it will be a mess. So I also agree with a message like - "filesystem has
trouble, you should probably unmount and run fsck" - that's fine. But
generic "info" or "warning" doesn't really carry any meaning on its own and
thus seems pretty useless to me. To explain a bit more, AFAIU this
shouldn't be a generic logging interface where something like severity
makes sense but rather a relatively specific interface notifying about
events in filesystem userspace should know about so I expect relatively low
number of types of events, not tens or even hundreds...
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
On Thu 16-04-15 23:56:11, Heinrich Schuchardt wrote:
> On 15.04.2015 09:15, Beata Michalska wrote:
> > Introduce configurable generic interface for file
> > system-wide event notifications to provide file
> > systems with a common way of reporting any potential
> > issues as they emerge.
> >
> > The notifications are to be issued through generic
> > netlink interface, by a dedicated, for file system
> > events, multicast group. The file systems might as
> > well use this group to send their own custom messages.
> >
> > The events have been split into four base categories:
> > information, warnings, errors and threshold notifications,
> > with some very basic event types like running out of space
> > or file system being remounted as read-only.
> >
> > Threshold notifications have been included to allow
> > triggering an event whenever the amount of free space
> > drops below a certain level - or levels to be more precise
> > as two of them are being supported: the lower and the upper
> > range. The notifications work both ways: once the threshold
> > level has been reached, an event shall be generated whenever
> > the number of available blocks goes up again re-activating
> > the threshold.
> >
> > The interface has been exposed through a vfs. Once mounted,
> > it serves as an entry point for the set-up where one can
> > register for particular file system events.
>
> Having a framework for notification for file systems is a great idea.
> Your solution covers an important part of the possible application scope.
>
> Before moving forward I suggest we should analyze if this scope should
> be enlarged.
>
> Many filesystems are remote (e.g. CIFS/Samba) or distributed over many
> network nodes (e.g. Lustre). How should file system notification work here?
IMO server <-> client notification is fully within the responsibility of
a particular protocol. The client can then translate the notification via
this interface just fine. So IMHO there's nothing to do in this regard.
> How will fuse file systems be served?
A similar answer as previously. It's the responsibility of each filesystem to
provide the notification. You would need some way for userspace to notify
the FUSE in kernel which can then relay the information via this interface.
So doable but I don't think we have to do it now...
> The current point of reference is a single mount point.
> Every time I insert an USB stick several file system may be automounted.
> I would like to receive events for these automounted file systems.
So you'll receive udev / DBus events for the mounts, you can catch these
in a userspace daemon and add appropriate rules to receive events (you
could even make it part of the mounting procedure of your desktop). I don't
think we should magically insert new rules for mounted filesystems since
that's a decision that belongs to userspace.
> A similar case arises when starting new virtual machines. How will I
> receive events on the host system for the file systems of the virtual
> machines?
IMHO that belongs in userspace and is out of scope for this proposal.
> In your implementation events are received via Netlink.
> Using Netlink for marking mounts for notification would create a much
> more homogenous interface. So why should we use a virtual file system here?
Hum, that's an interesting idea. Yes, e.g. networking uses netlink to
configure e.g. routing in kernel and in case of this interface, it really
might make the interface nicer.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
On 04/17/2015 01:31 PM, Jan Kara wrote:
> On Wed 15-04-15 09:15:44, Beata Michalska wrote:
>> Introduce configurable generic interface for file
>> system-wide event notifications to provide file
>> systems with a common way of reporting any potential
>> issues as they emerge.
>>
>> The notifications are to be issued through generic
>> netlink interface, by a dedicated, for file system
>> events, multicast group. The file systems might as
>> well use this group to send their own custom messages.
>>
>> The events have been split into four base categories:
>> information, warnings, errors and threshold notifications,
>> with some very basic event types like running out of space
>> or file system being remounted as read-only.
>>
>> Threshold notifications have been included to allow
>> triggering an event whenever the amount of free space
>> drops below a certain level - or levels to be more precise
>> as two of them are being supported: the lower and the upper
>> range. The notifications work both ways: once the threshold
>> level has been reached, an event shall be generated whenever
>> the number of available blocks goes up again re-activating
>> the threshold.
>>
>> The interface has been exposed through a vfs. Once mounted,
>> it serves as an entry point for the set-up where one can
>> register for particular file system events.
>>
>> Signed-off-by: Beata Michalska <[email protected]>
> Thanks for the patches! Some comments are below.
>
>> ---
>> Documentation/filesystems/events.txt | 254 +++++++++++
>> fs/Makefile | 1 +
>> fs/events/Makefile | 6 +
>> fs/events/fs_event.c | 775 ++++++++++++++++++++++++++++++++++
>> fs/events/fs_event.h | 27 ++
>> fs/events/fs_event_netlink.c | 94 +++++
>> fs/namespace.c | 1 +
>> include/linux/fs.h | 6 +-
>> include/linux/fs_event.h | 69 +++
>> include/uapi/linux/fs_event.h | 62 +++
>> include/uapi/linux/genetlink.h | 1 +
>> net/netlink/genetlink.c | 7 +-
>> 12 files changed, 1301 insertions(+), 2 deletions(-)
>> create mode 100644 Documentation/filesystems/events.txt
>> create mode 100644 fs/events/Makefile
>> create mode 100644 fs/events/fs_event.c
>> create mode 100644 fs/events/fs_event.h
>> create mode 100644 fs/events/fs_event_netlink.c
>> create mode 100644 include/linux/fs_event.h
>> create mode 100644 include/uapi/linux/fs_event.h
>>
>> diff --git a/Documentation/filesystems/events.txt b/Documentation/filesystems/events.txt
>> new file mode 100644
>> index 0000000..c85dd88
>> --- /dev/null
>> +++ b/Documentation/filesystems/events.txt
>> @@ -0,0 +1,254 @@
>> +
>> + Generic file system event notification interface
>> +
>> +Document created 09 April 2015 by Beata Michalska <[email protected]>
>> +
>> +1. The reason behind:
>> +=====================
>> +
>> +There are many corner cases when things might get messy with the filesystems.
>> +And it is not always obvious what and when went wrong. Sometimes you might
>> +get some subtle hints that there is something going on - but by the time
>> +you realise it, it might be too late as you are already out-of-space
>> +or the filesystem has been remounted as read-only (i.e.). The generic
>> +interface for the filesystem events fills the gap by providing a rather
>> +easy way of real-time notifications triggered whenever something interesting
>> +happens, allowing filesystems to report events in a common way, as they occur.
>> +
>> +2. How does it work:
>> +====================
>> +
>> +The interface itself has been exposed as fstrace-type Virtual File System,
>> +primarily to ease the process of setting up the configuration for the file
>> +system notifications. So for starters it needs to get mounted (obviously):
>> +
>> + mount -t fstrace none /sys/fs/events
>> +
>> +This will unveil the single fstrace filesystem entry - the 'config' file,
>> +through which the notification are being set-up.
>> +
>> +Activating notifications for particular filesystem is as straightforward
>> +as writing into the 'config' file. Note that by default all events despite
>> +the actual filesystem type are being disregarded.
> Is there a reason to have a special filesystem for this? Do you expect
> extending it by (many) more files? Why not just creating a file in sysfs or
> something like that?
No particular reason here - just for possible future extension if needed.
I'm totally fine with having a single sysfs entry.
>
>> +Synopsis of config:
>> +------------------
>> +
>> + MOUNT EVENT_TYPE [L1] [L2]
>> +
>> + MOUNT : the filesystem's mount point
> I'm not quite decided but is mountpoint really the right thing to pass
> via the interface? They aren't unique (filesystem can be mounted in
> multiple places) and more importantly can change over time. So won't it be
> better to pass major:minor over the interface? These are stable, unique to
> the filesystem, and userspace can easily get them by calling stat(2) on the
> desired path (or directly from /proc/self/mountinfo). That could be also
> used as an fs identifier instead of assigned ID (and thus we won't need
> those events about creation of new trace which look somewhat strange to
> me).
>
Even if a given filesystem is being mounted in many places this will come
down to single super_block - the interface will add trace for the first mount
point. This is just to ease the usage: internally a particular trace is
associated with a super_block.
> OTOH using major:minor may have issues in container world where processes
> could watch events from filesystems inaccessible to the container if they
> guess the device number. So maybe we could use 'path' when creating new
> trace but I'd still like to use the device number internally and for all
> outgoing communication because of above mentioned problems with
> mountpoints.
Alright then, so dropping the idea of announcing new trace (with assigned id)
and switching to using the major:minor numbers. Sounds OK to me.
>
>> + EVENT_TYPE : type of events to be enabled: info,warn,err,thr;
>> + at least one type needs to be specified;
>> + note the comma delimiter and lack of spaces between
>> + those options
>> + L1 : the threshold limit - lower range
>> + L2 : the threshold limit - upper range
>> + case enabling threshold notifications the lower level is
>> + mandatory, whereas the upper one remains optional;
>> + note though, that as those refer to the number of available
>> + blocks, the lower level needs to be higher than the upper one
>> +
>> +Sample request could look like the following:
>> +
>> + echo /sample/mount/point warn,err,thr 710000 500000 > /sys/fs/events/config
>> +
>> +Multiple request might be specified provided they are separated with semicolon.
> Is this necessary? It somewhat complicates syntax and parsing in kernel
> and I don't see a need for that. I'd prefer to keep the interface as simple
> as possible.
>
This is not necessary, but it could ease usage - e.g. through scripting: to specify
multiple traces and register them in one go.
> Also I think that we should make it clear that each event type has
> different set of arguments. For threshold events they'll be L1 & L2, for
> other events there may be no arguments, for other events maybe something
> else...
>
Currently only the threshold events use arguments - I am not sure what arguments
could be used for the remaining notifications, but any suggestions are welcome.
> ...
>> +static const match_table_t fs_etypes = {
>> + { FS_EVENT_INFO, "info" },
>> + { FS_EVENT_WARN, "warn" },
>> + { FS_EVENT_THRESH, "thr" },
>> + { FS_EVENT_ERR, "err" },
>> + { 0, NULL },
>> +};
> Why are there these generic message types? Threshold messages make good
> sense to me. But not so much the rest. If they don't have a clear meaning,
> it will be a mess. So I also agree with a message like - "filesystem has
> trouble, you should probably unmount and run fsck" - that's fine. But
> generic "info" or "warning" doesn't really carry any meaning on its own and
> thus seems pretty useless to me. To explain a bit more, AFAIU this
> shouldn't be a generic logging interface where something like severity
> makes sense but rather a relatively specific interface notifying about
> events in filesystem userspace should know about so I expect relatively low
> number of types of events, not tens or even hundreds...
>
> Honza
Getting rid of those would simplify the configuration part, indeed.
So we would be left with 'generic' and threshold events.
I guess I've overdone this part.
Thanks for Your comments so far.
BR
Beata
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
On 04/17/2015 03:04 PM, Beata Michalska wrote:
> On 04/17/2015 01:31 PM, Jan Kara wrote:
>> On Wed 15-04-15 09:15:44, Beata Michalska wrote:
>>> Introduce configurable generic interface for file
>>> system-wide event notifications to provide file
>>> systems with a common way of reporting any potential
>>> issues as they emerge.
>>>
>>> The notifications are to be issued through generic
>>> netlink interface, by a dedicated, for file system
>>> events, multicast group. The file systems might as
>>> well use this group to send their own custom messages.
>>>
>>> The events have been split into four base categories:
>>> information, warnings, errors and threshold notifications,
>>> with some very basic event types like running out of space
>>> or file system being remounted as read-only.
>>>
>>> Threshold notifications have been included to allow
>>> triggering an event whenever the amount of free space
>>> drops below a certain level - or levels to be more precise
>>> as two of them are being supported: the lower and the upper
>>> range. The notifications work both ways: once the threshold
>>> level has been reached, an event shall be generated whenever
>>> the number of available blocks goes up again re-activating
>>> the threshold.
>>>
>>> The interface has been exposed through a vfs. Once mounted,
>>> it serves as an entry point for the set-up where one can
>>> register for particular file system events.
>>>
>>> Signed-off-by: Beata Michalska <[email protected]>
>> Thanks for the patches! Some comments are below.
>>
>>> ---
>>> Documentation/filesystems/events.txt | 254 +++++++++++
>>> fs/Makefile | 1 +
>>> fs/events/Makefile | 6 +
>>> fs/events/fs_event.c | 775 ++++++++++++++++++++++++++++++++++
>>> fs/events/fs_event.h | 27 ++
>>> fs/events/fs_event_netlink.c | 94 +++++
>>> fs/namespace.c | 1 +
>>> include/linux/fs.h | 6 +-
>>> include/linux/fs_event.h | 69 +++
>>> include/uapi/linux/fs_event.h | 62 +++
>>> include/uapi/linux/genetlink.h | 1 +
>>> net/netlink/genetlink.c | 7 +-
>>> 12 files changed, 1301 insertions(+), 2 deletions(-)
>>> create mode 100644 Documentation/filesystems/events.txt
>>> create mode 100644 fs/events/Makefile
>>> create mode 100644 fs/events/fs_event.c
>>> create mode 100644 fs/events/fs_event.h
>>> create mode 100644 fs/events/fs_event_netlink.c
>>> create mode 100644 include/linux/fs_event.h
>>> create mode 100644 include/uapi/linux/fs_event.h
>>>
>>> diff --git a/Documentation/filesystems/events.txt b/Documentation/filesystems/events.txt
>>> new file mode 100644
>>> index 0000000..c85dd88
>>> --- /dev/null
>>> +++ b/Documentation/filesystems/events.txt
>>> @@ -0,0 +1,254 @@
>>> +
>>> + Generic file system event notification interface
>>> +
>>> +Document created 09 April 2015 by Beata Michalska <[email protected]>
>>> +
>>> +1. The reason behind:
>>> +=====================
>>> +
>>> +There are many corner cases when things might get messy with the filesystems.
>>> +And it is not always obvious what and when went wrong. Sometimes you might
>>> +get some subtle hints that there is something going on - but by the time
>>> +you realise it, it might be too late as you are already out-of-space
>>> +or the filesystem has been remounted as read-only (i.e.). The generic
>>> +interface for the filesystem events fills the gap by providing a rather
>>> +easy way of real-time notifications triggered whenever something interesting
>>> +happens, allowing filesystems to report events in a common way, as they occur.
>>> +
>>> +2. How does it work:
>>> +====================
>>> +
>>> +The interface itself has been exposed as fstrace-type Virtual File System,
>>> +primarily to ease the process of setting up the configuration for the file
>>> +system notifications. So for starters it needs to get mounted (obviously):
>>> +
>>> + mount -t fstrace none /sys/fs/events
>>> +
>>> +This will unveil the single fstrace filesystem entry - the 'config' file,
>>> +through which the notification are being set-up.
>>> +
>>> +Activating notifications for particular filesystem is as straightforward
>>> +as writing into the 'config' file. Note that by default all events despite
>>> +the actual filesystem type are being disregarded.
>> Is there a reason to have a special filesystem for this? Do you expect
>> extending it by (many) more files? Why not just creating a file in sysfs or
>> something like that?
>
> No particular reason here - just for possible future extension if needed.
> I'm totally fine with having a single sysfs entry.
>
On the other hand .... sysfs entries are mostly single-valued or are sets
of values of a single type, so not sure if we would fit in here -
with the current configuration for the interface.
>>
>>> +Synopsis of config:
>>> +------------------
>>> +
>>> + MOUNT EVENT_TYPE [L1] [L2]
>>> +
>>> + MOUNT : the filesystem's mount point
>> I'm not quite decided but is mountpoint really the right thing to pass
>> via the interface? They aren't unique (filesystem can be mounted in
>> multiple places) and more importantly can change over time. So won't it be
>> better to pass major:minor over the interface? These are stable, unique to
>> the filesystem, and userspace can easily get them by calling stat(2) on the
>> desired path (or directly from /proc/self/mountinfo). That could be also
>> used as an fs identifier instead of assigned ID (and thus we won't need
>> those events about creation of new trace which look somewhat strange to
>> me).
>>
> Even if a given filesystem is being mounted in many places this will come
> down to single super_block - the interface will add trace for the first mount
> point. This is just to ease the usage: internally a particular trace is
> associated with a super_block.
>
>> OTOH using major:minor may have issues in container world where processes
>> could watch events from filesystems inaccessible to the container if they
>> guess the device number. So maybe we could use 'path' when creating new
>> trace but I'd still like to use the device number internally and for all
>> outgoing communication because of above mentioned problems with
>> mountpoints.
>
> Alright then, so dropping the idea of announcing new trace (with assigned id)
> and switching to using the major:minor numbers. Sounds OK to me.
>
>>
>>> + EVENT_TYPE : type of events to be enabled: info,warn,err,thr;
>>> + at least one type needs to be specified;
>>> + note the comma delimiter and lack of spaces between
>>> + those options
>>> + L1 : the threshold limit - lower range
>>> + L2 : the threshold limit - upper range
>>> + case enabling threshold notifications the lower level is
>>> + mandatory, whereas the upper one remains optional;
>>> + note though, that as those refer to the number of available
>>> + blocks, the lower level needs to be higher than the upper one
>>> +
>>> +Sample request could look like the following:
>>> +
>>> + echo /sample/mount/point warn,err,thr 710000 500000 > /sys/fs/events/config
>>> +
>>> +Multiple request might be specified provided they are separated with semicolon.
>> Is this necessary? It somewhat complicates syntax and parsing in kernel
>> and I don't see a need for that. I'd prefer to keep the interface as simple
>> as possible.
>>
>
> This is not necessary but could ease the usage - i.e. through scripting: to specify
> multiple traces and register them in one go.
>
>> Also I think that we should make it clear that each event type has
>> different set of arguments. For threshold events they'll be L1 & L2, for
>> other events there may be no arguments, for other events maybe something
>> else...
>>
>
> Currently only the threshold events use arguments - not sure what arguments
> could be used for the remaining notifications. But any suggestions are welcomed.
>
>> ...
>>> +static const match_table_t fs_etypes = {
>>> + { FS_EVENT_INFO, "info" },
>>> + { FS_EVENT_WARN, "warn" },
>>> + { FS_EVENT_THRESH, "thr" },
>>> + { FS_EVENT_ERR, "err" },
>>> + { 0, NULL },
>>> +};
>> Why are there these generic message types? Threshold messages make good
>> sense to me. But not so much the rest. If they don't have a clear meaning,
>> it will be a mess. So I also agree with a message like - "filesystem has
>> trouble, you should probably unmount and run fsck" - that's fine. But
>> generic "info" or "warning" doesn't really carry any meaning on its own and
>> thus seems pretty useless to me. To explain a bit more, AFAIU this
>> shouldn't be a generic logging interface where something like severity
>> makes sense but rather a relatively specific interface notifying about
>> events in filesystem userspace should know about so I expect relatively low
>> number of types of events, not tens or even hundreds...
>>
>> Honza
>
> Getting rid of those would simplify the configuration part, indeed.
> So we would be left with 'generic' and threshold events.
> I guess I've overdone this part.
>
> Thanks for Your comments so far.
>
> BR
> Beata
>
>
>
>
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
On Fri 17-04-15 15:04:37, Beata Michalska wrote:
> On 04/17/2015 01:31 PM, Jan Kara wrote:
> > On Wed 15-04-15 09:15:44, Beata Michalska wrote:
> > Also I think that we should make it clear that each event type has
> > different set of arguments. For threshold events they'll be L1 & L2, for
> > other events there may be no arguments, for other events maybe something
> > else...
> >
>
> Currently only the threshold events use arguments - not sure what arguments
> could be used for the remaining notifications. But any suggestions are welcomed.
Me neither, but someone will surely find something in the future ;)
> > ...
> >> +static const match_table_t fs_etypes = {
> >> + { FS_EVENT_INFO, "info" },
> >> + { FS_EVENT_WARN, "warn" },
> >> + { FS_EVENT_THRESH, "thr" },
> >> + { FS_EVENT_ERR, "err" },
> >> + { 0, NULL },
> >> +};
> > Why are there these generic message types? Threshold messages make good
> > sense to me. But not so much the rest. If they don't have a clear meaning,
> > it will be a mess. So I also agree with a message like - "filesystem has
> > trouble, you should probably unmount and run fsck" - that's fine. But
> > generic "info" or "warning" doesn't really carry any meaning on its own and
> > thus seems pretty useless to me. To explain a bit more, AFAIU this
> > shouldn't be a generic logging interface where something like severity
> > makes sense but rather a relatively specific interface notifying about
> > events in filesystem userspace should know about so I expect relatively low
> > number of types of events, not tens or even hundreds...
> >
>
> Getting rid of those would simplify the configuration part, indeed.
> So we would be left with 'generic' and threshold events.
> I guess I've overdone this part.
Well, I would avoid defining anything that's not really used. So
currently you can define threshold events and we start with just those.
When someone hooks up filesystem error paths to send notification, we can
create event type for telling "filesystem corrupted". And so on... We just
have to be careful to document that new event types can be added and
userspace has to ignore events it does not understand.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On 2015-04-17 09:04, Beata Michalska wrote:
> On 04/17/2015 01:31 PM, Jan Kara wrote:
>> On Wed 15-04-15 09:15:44, Beata Michalska wrote:
>> ...
>>> +static const match_table_t fs_etypes = {
>>> + { FS_EVENT_INFO, "info" },
>>> + { FS_EVENT_WARN, "warn" },
>>> + { FS_EVENT_THRESH, "thr" },
>>> + { FS_EVENT_ERR, "err" },
>>> + { 0, NULL },
>>> +};
>> Why are there these generic message types? Threshold messages make good
>> sense to me. But not so much the rest. If they don't have a clear meaning,
>> it will be a mess. So I also agree with a message like - "filesystem has
>> trouble, you should probably unmount and run fsck" - that's fine. But
>> generic "info" or "warning" doesn't really carry any meaning on its own and
>> thus seems pretty useless to me. To explain a bit more, AFAIU this
>> shouldn't be a generic logging interface where something like severity
>> makes sense but rather a relatively specific interface notifying about
>> events in filesystem userspace should know about so I expect relatively low
>> number of types of events, not tens or even hundreds...
>>
>> Honza
>
> Getting rid of those would simplify the configuration part, indeed.
> So we would be left with 'generic' and threshold events.
> I guess I've overdone this part.
For some filesystems, it may make sense to differentiate between a
generic warning and an error. For BTRFS and ZFS for example, if there
is a csum error on a block, this will get automatically corrected in
many configurations, and won't require anything like fsck to be run, but
monitoring applications will still probably want to be notified.
On Fri 17-04-15 09:23:35, Austin S Hemmelgarn wrote:
> On 2015-04-17 09:04, Beata Michalska wrote:
> >On 04/17/2015 01:31 PM, Jan Kara wrote:
> >>On Wed 15-04-15 09:15:44, Beata Michalska wrote:
> >>...
> >>>+static const match_table_t fs_etypes = {
> >>>+ { FS_EVENT_INFO, "info" },
> >>>+ { FS_EVENT_WARN, "warn" },
> >>>+ { FS_EVENT_THRESH, "thr" },
> >>>+ { FS_EVENT_ERR, "err" },
> >>>+ { 0, NULL },
> >>>+};
> >> Why are there these generic message types? Threshold messages make good
> >>sense to me. But not so much the rest. If they don't have a clear meaning,
> >>it will be a mess. So I also agree with a message like - "filesystem has
> >>trouble, you should probably unmount and run fsck" - that's fine. But
> >>generic "info" or "warning" doesn't really carry any meaning on its own and
> >>thus seems pretty useless to me. To explain a bit more, AFAIU this
> >>shouldn't be a generic logging interface where something like severity
> >>makes sense but rather a relatively specific interface notifying about
> >>events in filesystem userspace should know about so I expect relatively low
> >>number of types of events, not tens or even hundreds...
> >>
> >> Honza
> >
> >Getting rid of those would simplify the configuration part, indeed.
> >So we would be left with 'generic' and threshold events.
> >I guess I've overdone this part.
>
> For some filesystems, it may make sense to differentiate between a
> generic warning and an error. For BTRFS and ZFS for example, if
> there is a csum error on a block, this will get automatically
> corrected in many configurations, and won't require anything like
> fsck to be run, but monitoring applications will still probably want
> to be notified.
Sure, but in that case just create an event CORRECTED_CHECKSUM_ERROR and
use that. Then userspace knows what it should do with the event. No need to
hide it behind warning / error category.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On 17/04/2015 14:23, Austin S Hemmelgarn wrote:
> On 2015-04-17 09:04, Beata Michalska wrote:
>> On 04/17/2015 01:31 PM, Jan Kara wrote:
>>> On Wed 15-04-15 09:15:44, Beata Michalska wrote:
>>> ...
>>>> +static const match_table_t fs_etypes = {
>>>> + { FS_EVENT_INFO, "info" },
>>>> + { FS_EVENT_WARN, "warn" },
>>>> + { FS_EVENT_THRESH, "thr" },
>>>> + { FS_EVENT_ERR, "err" },
>>>> + { 0, NULL },
>>>> +};
>>> Why are there these generic message types? Threshold messages
>>> make good
>>> sense to me. But not so much the rest. If they don't have a clear
>>> meaning,
>>> it will be a mess. So I also agree with a message like - "filesystem
>>> has
>>> trouble, you should probably unmount and run fsck" - that's fine. But
>>> generic "info" or "warning" doesn't really carry any meaning on its
>>> own and
>>> thus seems pretty useless to me. To explain a bit more, AFAIU this
>>> shouldn't be a generic logging interface where something like severity
>>> makes sense but rather a relatively specific interface notifying about
>>> events in filesystem userspace should know about so I expect
>>> relatively low
>>> number of types of events, not tens or even hundreds...
>>>
>>> Honza
>>
>> Getting rid of those would simplify the configuration part, indeed.
>> So we would be left with 'generic' and threshold events.
>> I guess I've overdone this part.
>
> For some filesystems, it may make sense to differentiate between a
> generic warning and an error. For BTRFS and ZFS for example, if there
> is a csum error on a block, this will get automatically corrected in
> many configurations, and won't require anything like fsck to be run,
> but monitoring applications will still probably want to be notified.
Another key differentiation IMHO is between transient errors (like
server is unavailable in a distributed filesystem) that will block the
filesystem but might clear on their own, vs. permanent errors like
unreadable drives that definitely will not clear until the administrator
takes some action. It's usually a reasonable approximation to call
transient issues warnings, and permanent issues errors.
John
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
On Fri 17-04-15 15:51:14, John Spray wrote:
> On 17/04/2015 14:23, Austin S Hemmelgarn wrote:
> >On 2015-04-17 09:04, Beata Michalska wrote:
> >>On 04/17/2015 01:31 PM, Jan Kara wrote:
> >>>On Wed 15-04-15 09:15:44, Beata Michalska wrote:
> >>>...
> >>>>+static const match_table_t fs_etypes = {
> >>>>+ { FS_EVENT_INFO, "info" },
> >>>>+ { FS_EVENT_WARN, "warn" },
> >>>>+ { FS_EVENT_THRESH, "thr" },
> >>>>+ { FS_EVENT_ERR, "err" },
> >>>>+ { 0, NULL },
> >>>>+};
> >>> Why are there these generic message types? Threshold
> >>>messages make good
> >>>sense to me. But not so much the rest. If they don't have a
> >>>clear meaning,
> >>>it will be a mess. So I also agree with a message like -
> >>>"filesystem has
> >>>trouble, you should probably unmount and run fsck" - that's fine. But
> >>>generic "info" or "warning" doesn't really carry any meaning
> >>>on its own and
> >>>thus seems pretty useless to me. To explain a bit more, AFAIU this
> >>>shouldn't be a generic logging interface where something like severity
> >>>makes sense but rather a relatively specific interface notifying about
> >>>events in filesystem userspace should know about so I expect
> >>>relatively low
> >>>number of types of events, not tens or even hundreds...
> >>>
> >>> Honza
> >>
> >>Getting rid of those would simplify the configuration part, indeed.
> >>So we would be left with 'generic' and threshold events.
> >>I guess I've overdone this part.
> >
> >For some filesystems, it may make sense to differentiate between a
> >generic warning and an error. For BTRFS and ZFS for example, if
> >there is a csum error on a block, this will get automatically
> >corrected in many configurations, and won't require anything like
> >fsck to be run, but monitoring applications will still probably
> >want to be notified.
>
> Another key differentiation IMHO is between transient errors (like
> server is unavailable in a distributed filesystem) that will block
> the filesystem but might clear on their own, vs. permanent errors
> like unreadable drives that definitely will not clear until the
> administrator takes some action. It's usually a reasonable
> approximation to call transient issues warnings, and permanent
> issues errors.
So you can have events like FS_UNAVAILABLE and FS_AVAILABLE but what use
would this have? I wouldn't like the interface to be a dumping ground for
random crap - we have dmesg for that :).
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
On 17/04/2015 16:43, Jan Kara wrote:
> On Fri 17-04-15 15:51:14, John Spray wrote:
>> On 17/04/2015 14:23, Austin S Hemmelgarn wrote:
>>
>>> For some filesystems, it may make sense to differentiate between a
>>> generic warning and an error. For BTRFS and ZFS for example, if
>>> there is a csum error on a block, this will get automatically
>>> corrected in many configurations, and won't require anything like
>>> fsck to be run, but monitoring applications will still probably
>>> want to be notified.
>> Another key differentiation IMHO is between transient errors (like
>> server is unavailable in a distributed filesystem) that will block
>> the filesystem but might clear on their own, vs. permanent errors
>> like unreadable drives that definitely will not clear until the
>> administrator takes some action. It's usually a reasonable
>> approximation to call transient issues warnings, and permanent
>> issues errors.
> So you can have events like FS_UNAVAILABLE and FS_AVAILABLE but what use
> would this have? I wouldn't like the interface to be dumping ground for
> random crap - we have dmesg for that :).
In that case I'm confused -- why would ENOSPC be an appropriate use of
this interface if the mount being entirely blocked would be
inappropriate? Isn't being unable to service any I/O a more fundamental
and severe thing than being up and healthy but full?
Were you intending the interface to be exclusively for data integrity
issues like checksum failures, rather than more general events about a
mount that userspace would probably like to know about?
John
On Fri 17-04-15 17:08:10, John Spray wrote:
>
> On 17/04/2015 16:43, Jan Kara wrote:
> >On Fri 17-04-15 15:51:14, John Spray wrote:
> >>On 17/04/2015 14:23, Austin S Hemmelgarn wrote:
> >>
> >>>For some filesystems, it may make sense to differentiate between a
> >>>generic warning and an error. For BTRFS and ZFS for example, if
> >>>there is a csum error on a block, this will get automatically
> >>>corrected in many configurations, and won't require anything like
> >>>fsck to be run, but monitoring applications will still probably
> >>>want to be notified.
> >>Another key differentiation IMHO is between transient errors (like
> >>server is unavailable in a distributed filesystem) that will block
> >>the filesystem but might clear on their own, vs. permanent errors
> >>like unreadable drives that definitely will not clear until the
> >>administrator takes some action. It's usually a reasonable
> >>approximation to call transient issues warnings, and permanent
> >>issues errors.
> > So you can have events like FS_UNAVAILABLE and FS_AVAILABLE but what use
> >would this have? I wouldn't like the interface to be dumping ground for
> >random crap - we have dmesg for that :).
> In that case I'm confused -- why would ENOSPC be an appropriate use
> of this interface if the mount being entirely blocked would be
> inappropriate? Isn't being unable to service any I/O a more
> fundamental and severe thing than being up and healthy but full?
>
> Were you intending the interface to be exclusively for data
> integrity issues like checksum failures, rather than more general
> events about a mount that userspace would probably like to know
> about?
Well, I'm not saying we cannot have those events for fs availability /
unavailability. I'm just saying I'd like to see some use for that first.
I don't want events to be added just because it's possible...
For ENOSPC we have thin-provisioned storage and the userspace daemon
shuffling real storage underneath. So there I know the usecase.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On 04/17/2015 06:08 PM, John Spray wrote:
>
> On 17/04/2015 16:43, Jan Kara wrote:
>> On Fri 17-04-15 15:51:14, John Spray wrote:
>>> On 17/04/2015 14:23, Austin S Hemmelgarn wrote:
>>>
>>>> For some filesystems, it may make sense to differentiate between a
>>>> generic warning and an error. For BTRFS and ZFS for example, if
>>>> there is a csum error on a block, this will get automatically
>>>> corrected in many configurations, and won't require anything like
>>>> fsck to be run, but monitoring applications will still probably
>>>> want to be notified.
>>> Another key differentiation IMHO is between transient errors (like
>>> server is unavailable in a distributed filesystem) that will block
>>> the filesystem but might clear on their own, vs. permanent errors
>>> like unreadable drives that definitely will not clear until the
>>> administrator takes some action. It's usually a reasonable
>>> approximation to call transient issues warnings, and permanent
>>> issues errors.
>> So you can have events like FS_UNAVAILABLE and FS_AVAILABLE but what use
>> would this have? I wouldn't like the interface to be dumping ground for
>> random crap - we have dmesg for that :).
> In that case I'm confused -- why would ENOSPC be an appropriate use of this interface if the mount being entirely blocked would be inappropriate? Isn't being unable to service any I/O a more fundamental and severe thing than being up and healthy but full?
>
> Were you intending the interface to be exclusively for data integrity issues like checksum failures, rather than more general events about a mount that userspace would probably like to know about?
>
> John
>
I think we should support both and leave the decision on what
is to be reported to the particular file systems - within
reason, of course. The interface should simply hand the events over
to user space, acting as a go-between. I would, though, avoid
any filesystem-specific events (when it comes to specifying those),
keeping it as generic as possible.
BR
Beata
On 2015-04-17 12:22, Jan Kara wrote:
> On Fri 17-04-15 17:08:10, John Spray wrote:
>>
>> On 17/04/2015 16:43, Jan Kara wrote:
>>> On Fri 17-04-15 15:51:14, John Spray wrote:
>>>> On 17/04/2015 14:23, Austin S Hemmelgarn wrote:
>>>>
>>>>> For some filesystems, it may make sense to differentiate between a
>>>>> generic warning and an error. For BTRFS and ZFS for example, if
>>>>> there is a csum error on a block, this will get automatically
>>>>> corrected in many configurations, and won't require anything like
>>>>> fsck to be run, but monitoring applications will still probably
>>>>> want to be notified.
>>>> Another key differentiation IMHO is between transient errors (like
>>>> server is unavailable in a distributed filesystem) that will block
>>>> the filesystem but might clear on their own, vs. permanent errors
>>>> like unreadable drives that definitely will not clear until the
>>>> administrator takes some action. It's usually a reasonable
>>>> approximation to call transient issues warnings, and permanent
>>>> issues errors.
>>> So you can have events like FS_UNAVAILABLE and FS_AVAILABLE but what use
>>> would this have? I wouldn't like the interface to be dumping ground for
>>> random crap - we have dmesg for that :).
>> In that case I'm confused -- why would ENOSPC be an appropriate use
>> of this interface if the mount being entirely blocked would be
>> inappropriate? Isn't being unable to service any I/O a more
>> fundamental and severe thing than being up and healthy but full?
>>
>> Were you intending the interface to be exclusively for data
>> integrity issues like checksum failures, rather than more general
>> events about a mount that userspace would probably like to know
>> about?
> Well, I'm not saying we cannot have those events for fs availability /
> inavailability. I'm just saying I'd like to see some use for that first.
> I don't want events to be added just because it's possible...
>
> For ENOSPC we have thin provisioned storage and the userspace deamon
> shuffling real storage underneath. So there I know the usecase.
>
> Honza
>
The use-case that immediately comes to mind for me would be diskless
nodes with root-on-nfs needing to know if they can actually access the
root filesystem.
On Fri 17-04-15 12:29:07, Austin S Hemmelgarn wrote:
> On 2015-04-17 12:22, Jan Kara wrote:
> >On Fri 17-04-15 17:08:10, John Spray wrote:
> >>
> >>On 17/04/2015 16:43, Jan Kara wrote:
> >>>On Fri 17-04-15 15:51:14, John Spray wrote:
> >>>>On 17/04/2015 14:23, Austin S Hemmelgarn wrote:
> >>>>
> >>>>>For some filesystems, it may make sense to differentiate between a
> >>>>>generic warning and an error. For BTRFS and ZFS for example, if
> >>>>>there is a csum error on a block, this will get automatically
> >>>>>corrected in many configurations, and won't require anything like
> >>>>>fsck to be run, but monitoring applications will still probably
> >>>>>want to be notified.
> >>>>Another key differentiation IMHO is between transient errors (like
> >>>>server is unavailable in a distributed filesystem) that will block
> >>>>the filesystem but might clear on their own, vs. permanent errors
> >>>>like unreadable drives that definitely will not clear until the
> >>>>administrator takes some action. It's usually a reasonable
> >>>>approximation to call transient issues warnings, and permanent
> >>>>issues errors.
> >>> So you can have events like FS_UNAVAILABLE and FS_AVAILABLE but what use
> >>>would this have? I wouldn't like the interface to be dumping ground for
> >>>random crap - we have dmesg for that :).
> >>In that case I'm confused -- why would ENOSPC be an appropriate use
> >>of this interface if the mount being entirely blocked would be
> >>inappropriate? Isn't being unable to service any I/O a more
> >>fundamental and severe thing than being up and healthy but full?
> >>
> >>Were you intending the interface to be exclusively for data
> >>integrity issues like checksum failures, rather than more general
> >>events about a mount that userspace would probably like to know
> >>about?
> > Well, I'm not saying we cannot have those events for fs availability /
> >inavailability. I'm just saying I'd like to see some use for that first.
> >I don't want events to be added just because it's possible...
> >
> >For ENOSPC we have thin provisioned storage and the userspace deamon
> >shuffling real storage underneath. So there I know the usecase.
> >
> The use-case that immediately comes to mind for me would be diskless
> nodes with root-on-nfs needing to know if they can actually access
> the root filesystem.
Well, most apps will access the root file system regardless of what we
send over netlink... So I don't see netlink events improving the situation
there too much. You could try to use it for something like failover but
even there I'm not too convinced - just doing some IO, waiting for a timeout,
and failing over if the IO doesn't complete works just fine for that these
days. That's why I was asking: I didn't see a convincing use case
myself...
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On 17/04/2015 17:22, Jan Kara wrote:
> On Fri 17-04-15 17:08:10, John Spray wrote:
>> On 17/04/2015 16:43, Jan Kara wrote:
>> In that case I'm confused -- why would ENOSPC be an appropriate use
>> of this interface if the mount being entirely blocked would be
>> inappropriate? Isn't being unable to service any I/O a more
>> fundamental and severe thing than being up and healthy but full?
>>
>> Were you intending the interface to be exclusively for data
>> integrity issues like checksum failures, rather than more general
>> events about a mount that userspace would probably like to know
>> about?
> Well, I'm not saying we cannot have those events for fs availability /
> inavailability. I'm just saying I'd like to see some use for that first.
> I don't want events to be added just because it's possible...
>
> For ENOSPC we have thin provisioned storage and the userspace deamon
> shuffling real storage underneath. So there I know the usecase.
>
Ah, OK. So I can think of a couple of use cases:
* a cluster scheduling service (think MPI jobs or docker containers)
might check for events like this. If it can see the cluster filesystem
is unavailable, then it can avoid scheduling the job, so that the
(multi-node) application does not get hung on one node with a bad
mount. If it sees a mount go bad (unavailable, or client evicted)
partway through a job, then it can kill -9 the process that was relying
on the bad mount, and go run it somewhere else.
* Boring but practical case: a nagios health check for checking if
mounts are OK.
We don't have to invent these event types now of course, but something
to bear in mind. Hopefully if/when any of the distributed filesystems
(Lustre/Ceph/etc) choose to implement this, we can look at making the
event types common at that time though.
BTW in any case an interface for filesystem events to userspace will be
a useful addition, thank you!
Cheers,
John
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
On Apr 17, 2015, at 11:37 AM, John Spray <[email protected]> wrote:
> On 17/04/2015 17:22, Jan Kara wrote:
>> On Fri 17-04-15 17:08:10, John Spray wrote:
>>> On 17/04/2015 16:43, Jan Kara wrote:
>>> In that case I'm confused -- why would ENOSPC be an appropriate use
>>> of this interface if the mount being entirely blocked would be
>>> inappropriate? Isn't being unable to service any I/O a more
>>> fundamental and severe thing than being up and healthy but full?
>>>
>>> Were you intending the interface to be exclusively for data
>>> integrity issues like checksum failures, rather than more general
>>> events about a mount that userspace would probably like to know
>>> about?
>> Well, I'm not saying we cannot have those events for fs availability /
>> inavailability. I'm just saying I'd like to see some use for that first.
>> I don't want events to be added just because it's possible...
>>
>> For ENOSPC we have thin provisioned storage and the userspace deamon
>> shuffling real storage underneath. So there I know the usecase.
>>
>
> Ah, OK. So I can think of a couple of use cases:
> * a cluster scheduling service (think MPI jobs or docker containers) might check for events like this. If it can see the cluster filesystem is unavailable, then it can avoid scheduling the job, so that the (multi-node) application does not get hung on one node with a bad mount. If it sees a mount go bad (unavailable, or client evicted) partway through a job, then it can kill -9 the process that was relying on the bad mount, and go run it somewhere else.
> * Boring but practical case: a nagios health check for checking if mounts are OK.
John,
thanks for chiming in, as I was just about to write the same. Some users
were just asking yesterday at the Lustre User Group meeting about adding
an interface to notify job schedulers for your #1 point, and I'd much
rather use a generic interface than inventing our own for Lustre.
Cheers, Andreas
> We don't have to invent these event types now of course, but something to bear in mind. Hopefully if/when any of the distributed filesystems (Lustre/Ceph/etc) choose to implement this, we can look at making the event types common at that time though.
>
> BTW in any case an interface for filesystem events to userspace will be a useful addition, thank you!
>
> Cheers,
> John
Cheers, Andreas
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
On Apr 17, 2015, at 5:31 AM, Jan Kara <[email protected]> wrote:
> On Wed 15-04-15 09:15:44, Beata Michalska wrote:
>> Introduce configurable generic interface for file
>> system-wide event notifications to provide file
>> systems with a common way of reporting any potential
>> issues as they emerge.
>>
>> The notifications are to be issued through generic
>> netlink interface, by a dedicated, for file system
>> events, multicast group. The file systems might as
>> well use this group to send their own custom messages.
>>
>> The events have been split into four base categories:
>> information, warnings, errors and threshold notifications,
>> with some very basic event types like running out of space
>> or file system being remounted as read-only.
>>
>> Threshold notifications have been included to allow
>> triggering an event whenever the amount of free space
>> drops below a certain level - or levels to be more precise
>> as two of them are being supported: the lower and the upper
>> range. The notifications work both ways: once the threshold
>> level has been reached, an event shall be generated whenever
>> the number of available blocks goes up again re-activating
>> the threshold.
>>
>> The interface has been exposed through a vfs. Once mounted,
>> it serves as an entry point for the set-up where one can
>> register for particular file system events.
>>
>> Signed-off-by: Beata Michalska <[email protected]>
> Thanks for the patches! Some comments are below.
>
>> ---
>> Documentation/filesystems/events.txt | 254 +++++++++++
>> fs/Makefile | 1 +
>> fs/events/Makefile | 6 +
>> fs/events/fs_event.c | 775 ++++++++++++++++++++++++++++++++++
>> fs/events/fs_event.h | 27 ++
>> fs/events/fs_event_netlink.c | 94 +++++
>> fs/namespace.c | 1 +
>> include/linux/fs.h | 6 +-
>> include/linux/fs_event.h | 69 +++
>> include/uapi/linux/fs_event.h | 62 +++
>> include/uapi/linux/genetlink.h | 1 +
>> net/netlink/genetlink.c | 7 +-
>> 12 files changed, 1301 insertions(+), 2 deletions(-)
>> create mode 100644 Documentation/filesystems/events.txt
>> create mode 100644 fs/events/Makefile
>> create mode 100644 fs/events/fs_event.c
>> create mode 100644 fs/events/fs_event.h
>> create mode 100644 fs/events/fs_event_netlink.c
>> create mode 100644 include/linux/fs_event.h
>> create mode 100644 include/uapi/linux/fs_event.h
>>
>> diff --git a/Documentation/filesystems/events.txt b/Documentation/filesystems/events.txt
>> new file mode 100644
>> index 0000000..c85dd88
>> --- /dev/null
>> +++ b/Documentation/filesystems/events.txt
>> @@ -0,0 +1,254 @@
>> +
>> + Generic file system event notification interface
>> +
>> +Document created 09 April 2015 by Beata Michalska <[email protected]>
>> +
>> +1. The reason behind:
>> +=====================
>> +
>> +There are many corner cases when things might get messy with the filesystems.
>> +And it is not always obvious what and when went wrong. Sometimes you might
>> +get some subtle hints that there is something going on - but by the time
>> +you realise it, it might be too late as you are already out-of-space
>> +or the filesystem has been remounted as read-only (i.e.). The generic
>> +interface for the filesystem events fills the gap by providing a rather
>> +easy way of real-time notifications triggered whenever something intreseting
>> +happens, allowing filesystems to report events in a common way, as they occur.
>> +
>> +2. How does it work:
>> +====================
>> +
>> +The interface itself has been exposed as fstrace-type Virtual File System,
>> +primarily to ease the process of setting up the configuration for the file
>> +system notifications. So for starters it needs to get mounted (obviously):
>> +
>> + mount -t fstrace none /sys/fs/events
>> +
>> +This will unveil the single fstrace filesystem entry - the 'config' file,
>> +through which the notification are being set-up.
>> +
>> +Activating notifications for particular filesystem is as straightforward
>> +as writing into the 'config' file. Note that by default all events despite
>> +the actual filesystem type are being disregarded.
> Is there a reason to have a special filesystem for this? Do you expect
> extending it by (many) more files? Why not just creating a file in sysfs or
> something like that?
>
>> +Synopsis of config:
>> +------------------
>> +
>> + MOUNT EVENT_TYPE [L1] [L2]
>> +
>> + MOUNT : the filesystem's mount point
> I'm not quite decided but is mountpoint really the right thing to pass
> via the interface? They aren't unique (filesystem can be mounted in
> multiple places) and more importantly can change over time. So won't it be
> better to pass major:minor over the interface? These are stable, unique to
> the filesystem, and userspace can easily get them by calling stat(2) on the
> desired path (or directly from /proc/self/mountinfo). That could be also
> used as an fs identifier instead of assigned ID (and thus we won't need
> those events about creation of new trace which look somewhat strange to
> me).
>
> OTOH using major:minor may have issues in container world where processes
> could watch events from filesystems inaccessible to the container if they
> guess the device number. So maybe we could use 'path' when creating new
> trace but I'd still like to use the device number internally and for all
> outgoing communication because of above mentioned problems with
> mountpoints.
Please don't make major:minor part of the interface. That doesn't make
sense for network filesystems. Using the mountpoint to set this up is
fine, and really what is expected by userspace tools to monitor a specific
mountpoint. We could use sb->s_id to identify the events.
>> + EVENT_TYPE : type of events to be enabled: info,warn,err,thr;
>> + at least one type needs to be specified;
>> + note the comma delimiter and lack of spaces between
>> + those options
>> + L1 : the threshold limit - lower range
>> + L2 : the threshold limit - upper range
>> + case enabling threshold notifications the lower level is
>> + mandatory, whereas the upper one remains optional;
>> + note though, that as those refer to the number of available
>> + blocks, the lower level needs to be higher than the upper one
>> +
>> +Sample request could look like the follwoing:
>> +
>> + echo /sample/mount/point warn,err,thr 710000 500000 > /sys/fs/events/config
>> +
>> +Multiple request might be specified provided they are separated with semicolon.
> Is this necessary? It somewhat complicates syntax and parsing in kernel
> and I don't see a need for that. I'd prefer to keep the interface as simple
> as possible.
>
> Also I think that we should make it clear that each event type has
> different set of arguments. For threshold events they'll be L1 & L2, for
> other events there may be no arguments, for other events maybe something
> else...
>
> ...
>> +static const match_table_t fs_etypes = {
>> + { FS_EVENT_INFO, "info" },
>> + { FS_EVENT_WARN, "warn" },
>> + { FS_EVENT_THRESH, "thr" },
>> + { FS_EVENT_ERR, "err" },
>> + { 0, NULL },
>> +};
> Why are there these generic message types? Threshold messages make good
> sense to me. But not so much the rest. If they don't have a clear meaning,
> it will be a mess. So I also agree with a message like - "filesystem has
> trouble, you should probably unmount and run fsck" - that's fine. But
> generic "info" or "warning" doesn't really carry any meaning on its own and
> thus seems pretty useless to me. To explain a bit more, AFAIU this
> shouldn't be a generic logging interface where something like severity
> makes sense but rather a relatively specific interface notifying about
> events in filesystem userspace should know about so I expect relatively low
> number of types of events, not tens or even hundreds...
>
> Honza
> --
> Jan Kara <[email protected]>
> SUSE Labs, CR
Cheers, Andreas
Hi,
On 04/18/2015 12:44 AM, Andreas Dilger wrote:
> On Apr 17, 2015, at 5:31 AM, Jan Kara <[email protected]> wrote:
>> On Wed 15-04-15 09:15:44, Beata Michalska wrote:
>>> Introduce configurable generic interface for file
>>> system-wide event notifications to provide file
>>> systems with a common way of reporting any potential
>>> issues as they emerge.
>>>
>>> The notifications are to be issued through generic
>>> netlink interface, by a dedicated, for file system
>>> events, multicast group. The file systems might as
>>> well use this group to send their own custom messages.
>>>
>>> The events have been split into four base categories:
>>> information, warnings, errors and threshold notifications,
>>> with some very basic event types like running out of space
>>> or file system being remounted as read-only.
>>>
>>> Threshold notifications have been included to allow
>>> triggering an event whenever the amount of free space
>>> drops below a certain level - or levels to be more precise
>>> as two of them are being supported: the lower and the upper
>>> range. The notifications work both ways: once the threshold
>>> level has been reached, an event shall be generated whenever
>>> the number of available blocks goes up again re-activating
>>> the threshold.
>>>
>>> The interface has been exposed through a vfs. Once mounted,
>>> it serves as an entry point for the set-up where one can
>>> register for particular file system events.
>>>
>>> Signed-off-by: Beata Michalska <[email protected]>
>> Thanks for the patches! Some comments are below.
>>
>>> ---
>>> Documentation/filesystems/events.txt | 254 +++++++++++
>>> fs/Makefile | 1 +
>>> fs/events/Makefile | 6 +
>>> fs/events/fs_event.c | 775 ++++++++++++++++++++++++++++++++++
>>> fs/events/fs_event.h | 27 ++
>>> fs/events/fs_event_netlink.c | 94 +++++
>>> fs/namespace.c | 1 +
>>> include/linux/fs.h | 6 +-
>>> include/linux/fs_event.h | 69 +++
>>> include/uapi/linux/fs_event.h | 62 +++
>>> include/uapi/linux/genetlink.h | 1 +
>>> net/netlink/genetlink.c | 7 +-
>>> 12 files changed, 1301 insertions(+), 2 deletions(-)
>>> create mode 100644 Documentation/filesystems/events.txt
>>> create mode 100644 fs/events/Makefile
>>> create mode 100644 fs/events/fs_event.c
>>> create mode 100644 fs/events/fs_event.h
>>> create mode 100644 fs/events/fs_event_netlink.c
>>> create mode 100644 include/linux/fs_event.h
>>> create mode 100644 include/uapi/linux/fs_event.h
>>>
>>> diff --git a/Documentation/filesystems/events.txt b/Documentation/filesystems/events.txt
>>> new file mode 100644
>>> index 0000000..c85dd88
>>> --- /dev/null
>>> +++ b/Documentation/filesystems/events.txt
>>> @@ -0,0 +1,254 @@
>>> +
>>> + Generic file system event notification interface
>>> +
>>> +Document created 09 April 2015 by Beata Michalska <[email protected]>
>>> +
>>> +1. The reason behind:
>>> +=====================
>>> +
>>> +There are many corner cases when things might get messy with the filesystems.
>>> +And it is not always obvious what and when went wrong. Sometimes you might
>>> +get some subtle hints that there is something going on - but by the time
>>> +you realise it, it might be too late as you are already out-of-space
>>> +or the filesystem has been remounted as read-only (i.e.). The generic
>>> +interface for the filesystem events fills the gap by providing a rather
>>> +easy way of real-time notifications triggered whenever something intreseting
>>> +happens, allowing filesystems to report events in a common way, as they occur.
>>> +
>>> +2. How does it work:
>>> +====================
>>> +
>>> +The interface itself has been exposed as fstrace-type Virtual File System,
>>> +primarily to ease the process of setting up the configuration for the file
>>> +system notifications. So for starters it needs to get mounted (obviously):
>>> +
>>> + mount -t fstrace none /sys/fs/events
>>> +
>>> +This will unveil the single fstrace filesystem entry - the 'config' file,
>>> +through which the notification are being set-up.
>>> +
>>> +Activating notifications for particular filesystem is as straightforward
>>> +as writing into the 'config' file. Note that by default all events despite
>>> +the actual filesystem type are being disregarded.
>> Is there a reason to have a special filesystem for this? Do you expect
>> extending it by (many) more files? Why not just creating a file in sysfs or
>> something like that?
>>
>>> +Synopsis of config:
>>> +------------------
>>> +
>>> + MOUNT EVENT_TYPE [L1] [L2]
>>> +
>>> + MOUNT : the filesystem's mount point
>> I'm not quite decided but is mountpoint really the right thing to pass
>> via the interface? They aren't unique (filesystem can be mounted in
>> multiple places) and more importantly can change over time. So won't it be
>> better to pass major:minor over the interface? These are stable, unique to
>> the filesystem, and userspace can easily get them by calling stat(2) on the
>> desired path (or directly from /proc/self/mountinfo). That could be also
>> used as an fs identifier instead of assigned ID (and thus we won't need
>> those events about creation of new trace which look somewhat strange to
>> me).
>>
>> OTOH using major:minor may have issues in container world where processes
>> could watch events from filesystems inaccessible to the container if they
>> guess the device number. So maybe we could use 'path' when creating new
>> trace but I'd still like to use the device number internally and for all
>> outgoing communication because of above mentioned problems with
>> mountpoints.
>
> Please don't make major:minor part of the interface. That doesn't make
> sense for network filesystems. Using the mountpoint to set this up is
> fine, and really what is expected by userspace tools to monitor a specific
> mountpoint. We could use sb->s_id to identify the events.
I'm afraid that using the sb->s_id might not be sufficient here.
The sb->s_id is based either on the backing dev name or the fs type,
so it will not distinguish between nodev filesystems (e.g. tmpfs).
It would be better (and more efficient) to have something here that would enable
user space to explicitly determine which fs has triggered the notification,
which is why I've used the ids generated for each trace.
>
>>> + EVENT_TYPE : type of events to be enabled: info,warn,err,thr;
>>> + at least one type needs to be specified;
>>> + note the comma delimiter and lack of spaces between
>>> + those options
>>> + L1 : the threshold limit - lower range
>>> + L2 : the threshold limit - upper range
>>> + case enabling threshold notifications the lower level is
>>> + mandatory, whereas the upper one remains optional;
>>> + note though, that as those refer to the number of available
>>> + blocks, the lower level needs to be higher than the upper one
>>> +
>>> +Sample request could look like the follwoing:
>>> +
>>> + echo /sample/mount/point warn,err,thr 710000 500000 > /sys/fs/events/config
>>> +
>>> +Multiple request might be specified provided they are separated with semicolon.
>> Is this necessary? It somewhat complicates syntax and parsing in kernel
>> and I don't see a need for that. I'd prefer to keep the interface as simple
>> as possible.
>>
>> Also I think that we should make it clear that each event type has
>> different set of arguments. For threshold events they'll be L1 & L2, for
>> other events there may be no arguments, for other events maybe something
>> else...
>>
>> ...
>>> +static const match_table_t fs_etypes = {
>>> + { FS_EVENT_INFO, "info" },
>>> + { FS_EVENT_WARN, "warn" },
>>> + { FS_EVENT_THRESH, "thr" },
>>> + { FS_EVENT_ERR, "err" },
>>> + { 0, NULL },
>>> +};
>> Why are there these generic message types? Threshold messages make good
>> sense to me. But not so much the rest. If they don't have a clear meaning,
>> it will be a mess. So I also agree with a message like - "filesystem has
>> trouble, you should probably unmount and run fsck" - that's fine. But
>> generic "info" or "warning" doesn't really carry any meaning on its own and
>> thus seems pretty useless to me. To explain a bit more, AFAIU this
>> shouldn't be a generic logging interface where something like severity
>> makes sense but rather a relatively specific interface notifying about
>> events in filesystem userspace should know about so I expect relatively low
>> number of types of events, not tens or even hundreds...
>>
>> Honza
>> --
>> Jan Kara <[email protected]>
>> SUSE Labs, CR
>
>
> Cheers, Andreas
>
>
BR
Beata
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
On Fri 17-04-15 16:44:16, Andreas Dilger wrote:
> On Apr 17, 2015, at 5:31 AM, Jan Kara <[email protected]> wrote:
> > On Wed 15-04-15 09:15:44, Beata Michalska wrote:
> >> Introduce configurable generic interface for file
> >> system-wide event notifications to provide file
> >> systems with a common way of reporting any potential
> >> issues as they emerge.
> >>
> >> The notifications are to be issued through generic
> >> netlink interface, by a dedicated, for file system
> >> events, multicast group. The file systems might as
> >> well use this group to send their own custom messages.
> >>
> >> The events have been split into four base categories:
> >> information, warnings, errors and threshold notifications,
> >> with some very basic event types like running out of space
> >> or file system being remounted as read-only.
> >>
> >> Threshold notifications have been included to allow
> >> triggering an event whenever the amount of free space
> >> drops below a certain level - or levels to be more precise
> >> as two of them are being supported: the lower and the upper
> >> range. The notifications work both ways: once the threshold
> >> level has been reached, an event shall be generated whenever
> >> the number of available blocks goes up again re-activating
> >> the threshold.
> >>
> >> The interface has been exposed through a vfs. Once mounted,
> >> it serves as an entry point for the set-up where one can
> >> register for particular file system events.
> >>
> >> Signed-off-by: Beata Michalska <[email protected]>
> > Thanks for the patches! Some comments are below.
> >
> >> ---
> >> Documentation/filesystems/events.txt | 254 +++++++++++
> >> fs/Makefile | 1 +
> >> fs/events/Makefile | 6 +
> >> fs/events/fs_event.c | 775 ++++++++++++++++++++++++++++++++++
> >> fs/events/fs_event.h | 27 ++
> >> fs/events/fs_event_netlink.c | 94 +++++
> >> fs/namespace.c | 1 +
> >> include/linux/fs.h | 6 +-
> >> include/linux/fs_event.h | 69 +++
> >> include/uapi/linux/fs_event.h | 62 +++
> >> include/uapi/linux/genetlink.h | 1 +
> >> net/netlink/genetlink.c | 7 +-
> >> 12 files changed, 1301 insertions(+), 2 deletions(-)
> >> create mode 100644 Documentation/filesystems/events.txt
> >> create mode 100644 fs/events/Makefile
> >> create mode 100644 fs/events/fs_event.c
> >> create mode 100644 fs/events/fs_event.h
> >> create mode 100644 fs/events/fs_event_netlink.c
> >> create mode 100644 include/linux/fs_event.h
> >> create mode 100644 include/uapi/linux/fs_event.h
> >>
> >> diff --git a/Documentation/filesystems/events.txt b/Documentation/filesystems/events.txt
> >> new file mode 100644
> >> index 0000000..c85dd88
> >> --- /dev/null
> >> +++ b/Documentation/filesystems/events.txt
> >> @@ -0,0 +1,254 @@
> >> +
> >> + Generic file system event notification interface
> >> +
> >> +Document created 09 April 2015 by Beata Michalska <[email protected]>
> >> +
> >> +1. The reason behind:
> >> +=====================
> >> +
> >> +There are many corner cases when things might get messy with the filesystems.
> >> +And it is not always obvious what and when went wrong. Sometimes you might
> >> +get some subtle hints that there is something going on - but by the time
> >> +you realise it, it might be too late as you are already out-of-space
> >> +or the filesystem has been remounted as read-only (i.e.). The generic
> >> +interface for the filesystem events fills the gap by providing a rather
> >> +easy way of real-time notifications triggered whenever something intreseting
> >> +happens, allowing filesystems to report events in a common way, as they occur.
> >> +
> >> +2. How does it work:
> >> +====================
> >> +
> >> +The interface itself has been exposed as fstrace-type Virtual File System,
> >> +primarily to ease the process of setting up the configuration for the file
> >> +system notifications. So for starters it needs to get mounted (obviously):
> >> +
> >> + mount -t fstrace none /sys/fs/events
> >> +
> >> +This will unveil the single fstrace filesystem entry - the 'config' file,
> >> +through which the notification are being set-up.
> >> +
> >> +Activating notifications for particular filesystem is as straightforward
> >> +as writing into the 'config' file. Note that by default all events despite
> >> +the actual filesystem type are being disregarded.
> > Is there a reason to have a special filesystem for this? Do you expect
> > extending it by (many) more files? Why not just creating a file in sysfs or
> > something like that?
> >
> >> +Synopsis of config:
> >> +------------------
> >> +
> >> + MOUNT EVENT_TYPE [L1] [L2]
> >> +
> >> + MOUNT : the filesystem's mount point
> > I'm not quite decided but is mountpoint really the right thing to pass
> > via the interface? They aren't unique (filesystem can be mounted in
> > multiple places) and more importantly can change over time. So won't it be
> > better to pass major:minor over the interface? These are stable, unique to
> > the filesystem, and userspace can easily get them by calling stat(2) on the
> > desired path (or directly from /proc/self/mountinfo). That could be also
> > used as an fs identifier instead of assigned ID (and thus we won't need
> > those events about creation of new trace which look somewhat strange to
> > me).
> >
> > OTOH using major:minor may have issues in container world where processes
> > could watch events from filesystems inaccessible to the container if they
> > guess the device number. So maybe we could use 'path' when creating new
> > trace but I'd still like to use the device number internally and for all
> > outgoing communication because of above mentioned problems with
> > mountpoints.
>
> Please don't make major:minor part of the interface. That doesn't make
> sense for network filesystems. Using the mountpoint to set this up is
> fine, and really what is expected by userspace tools to monitor a specific
> mountpoint. We could use sb->s_id to identify the events.
So for setup I agree that mountpoint is probably the easiest. For
reporting back from the kernel, sb->s_id isn't enough because, as Beata noted,
it isn't unique. You are right that for network filesystems (or
in-memory filesystems for that matter) a device number doesn't make any
particular sense, but each fs (even e.g. procfs) is assigned a "virtual"
device number which uniquely identifies that filesystem. You can see that
device number in /proc/self/mountinfo and you will also see it in st_dev
from stat(2). So using that is IMHO better than devising our own unique number.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR