2023-01-13 16:25:26

by Alexander Larsson

Subject: [PATCH v2 2/6] composefs: Add on-disk layout

This commit adds the on-disk layout header file of composefs.

Signed-off-by: Alexander Larsson <[email protected]>
Co-developed-by: Giuseppe Scrivano <[email protected]>
Signed-off-by: Giuseppe Scrivano <[email protected]>
---
fs/composefs/cfs.h | 203 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 203 insertions(+)
create mode 100644 fs/composefs/cfs.h

diff --git a/fs/composefs/cfs.h b/fs/composefs/cfs.h
new file mode 100644
index 000000000000..658df728e366
--- /dev/null
+++ b/fs/composefs/cfs.h
@@ -0,0 +1,203 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * composefs
+ *
+ * Copyright (C) 2021 Giuseppe Scrivano
+ * Copyright (C) 2022 Alexander Larsson
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef _CFS_H
+#define _CFS_H
+
+#include <asm/byteorder.h>
+#include <crypto/sha2.h>
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <linux/types.h>
+
+#define CFS_VERSION 1
+
+#define CFS_MAGIC 0xc078629aU
+
+#define CFS_MAX_DIR_CHUNK_SIZE 4096
+#define CFS_MAX_XATTRS_SIZE 4096
+
+static inline int cfs_digest_from_payload(const char *payload, size_t payload_len,
+ u8 digest_out[SHA256_DIGEST_SIZE])
+{
+ const char *p, *end;
+ u8 last_digit = 0;
+ int digit = 0;
+ size_t n_nibbles = 0;
+
+ /* This handles payloads (i.e. path names) that are "essentially" a
+ * digest as the digest (if the DIGEST_FROM_PAYLOAD flag is set). The
+ * "essential" part means that we ignore hierarchical structure as well
+ * as any extension. So, for example "ef/deadbeef.file" would match the
+ * (too short) digest "efdeadbeef".
+ *
+ * This allows images to avoid storing both the digest and the pathname,
+ * yet work with pre-existing object store formats of various kinds.
+ */
+
+ end = payload + payload_len;
+ for (p = payload; p != end; p++) {
+ /* Skip subdir structure */
+ if (*p == '/')
+ continue;
+
+ /* Break at (and ignore) extension */
+ if (*p == '.')
+ break;
+
+ if (n_nibbles == SHA256_DIGEST_SIZE * 2)
+ return -EINVAL; /* Too long */
+
+ digit = hex_to_bin(*p);
+ if (digit == -1)
+ return -EINVAL; /* Not hex digit */
+
+ n_nibbles++;
+ if ((n_nibbles % 2) == 0)
+ digest_out[n_nibbles / 2 - 1] = (last_digit << 4) | digit;
+ last_digit = digit;
+ }
+
+ if (n_nibbles != SHA256_DIGEST_SIZE * 2)
+ return -EINVAL; /* Too short */
+
+ return 0;
+}
+
+struct cfs_vdata_s {
+ u64 off;
+ u32 len;
+} __packed;
+
+struct cfs_header_s {
+ u8 version;
+ u8 unused1;
+ u16 unused2;
+
+ u32 magic;
+ u64 data_offset;
+ u64 root_inode;
+
+ u64 unused3[2];
+} __packed;
+
+enum cfs_inode_flags {
+ CFS_INODE_FLAGS_NONE = 0,
+ CFS_INODE_FLAGS_PAYLOAD = 1 << 0,
+ CFS_INODE_FLAGS_MODE = 1 << 1,
+ CFS_INODE_FLAGS_NLINK = 1 << 2,
+ CFS_INODE_FLAGS_UIDGID = 1 << 3,
+ CFS_INODE_FLAGS_RDEV = 1 << 4,
+ CFS_INODE_FLAGS_TIMES = 1 << 5,
+ CFS_INODE_FLAGS_TIMES_NSEC = 1 << 6,
+ CFS_INODE_FLAGS_LOW_SIZE = 1 << 7, /* Low 32bit of st_size */
+ CFS_INODE_FLAGS_HIGH_SIZE = 1 << 8, /* High 32bit of st_size */
+ CFS_INODE_FLAGS_XATTRS = 1 << 9,
+ CFS_INODE_FLAGS_DIGEST = 1 << 10, /* fs-verity sha256 digest */
+ CFS_INODE_FLAGS_DIGEST_FROM_PAYLOAD = 1 << 11, /* Compute digest from payload */
+};
+
+#define CFS_INODE_FLAG_CHECK(_flag, _name) \
+ (((_flag) & (CFS_INODE_FLAGS_##_name)) != 0)
+#define CFS_INODE_FLAG_CHECK_SIZE(_flag, _name, _size) \
+ (CFS_INODE_FLAG_CHECK(_flag, _name) ? (_size) : 0)
+
+#define CFS_INODE_DEFAULT_MODE 0100644
+#define CFS_INODE_DEFAULT_NLINK 1
+#define CFS_INODE_DEFAULT_NLINK_DIR 2
+#define CFS_INODE_DEFAULT_UIDGID 0
+#define CFS_INODE_DEFAULT_RDEV 0
+#define CFS_INODE_DEFAULT_TIMES 0
+
+struct cfs_inode_s {
+ u32 flags;
+
+ /* Optional data: (selected by flags) */
+
+ /* This is the size of the type specific data that comes directly after
+ * the inode in the file. Of this type:
+ *
+ * directory: cfs_dir_s
+ * regular file: the backing filename
+ * symlink: the target link
+ *
+ * Canonically payload_length is 0 for empty dir/file/symlink.
+ */
+ u32 payload_length;
+
+ u32 st_mode; /* File type and mode. */
+ u32 st_nlink; /* Number of hard links, only for regular files. */
+ u32 st_uid; /* User ID of owner. */
+ u32 st_gid; /* Group ID of owner. */
+ u32 st_rdev; /* Device ID (if special file). */
+ u64 st_size; /* Size of file, only used for regular files */
+
+ struct cfs_vdata_s xattrs; /* ref to variable data */
+
+ u8 digest[SHA256_DIGEST_SIZE]; /* fs-verity digest */
+
+ struct timespec64 st_mtim; /* Time of last modification. */
+ struct timespec64 st_ctim; /* Time of last status change. */
+};
+
+static inline u32 cfs_inode_encoded_size(u32 flags)
+{
+ return sizeof(u32) /* flags */ +
+ CFS_INODE_FLAG_CHECK_SIZE(flags, PAYLOAD, sizeof(u32)) +
+ CFS_INODE_FLAG_CHECK_SIZE(flags, MODE, sizeof(u32)) +
+ CFS_INODE_FLAG_CHECK_SIZE(flags, NLINK, sizeof(u32)) +
+ CFS_INODE_FLAG_CHECK_SIZE(flags, UIDGID, sizeof(u32) + sizeof(u32)) +
+ CFS_INODE_FLAG_CHECK_SIZE(flags, RDEV, sizeof(u32)) +
+ CFS_INODE_FLAG_CHECK_SIZE(flags, TIMES, sizeof(u64) * 2) +
+ CFS_INODE_FLAG_CHECK_SIZE(flags, TIMES_NSEC, sizeof(u32) * 2) +
+ CFS_INODE_FLAG_CHECK_SIZE(flags, LOW_SIZE, sizeof(u32)) +
+ CFS_INODE_FLAG_CHECK_SIZE(flags, HIGH_SIZE, sizeof(u32)) +
+ CFS_INODE_FLAG_CHECK_SIZE(flags, XATTRS, sizeof(u64) + sizeof(u32)) +
+ CFS_INODE_FLAG_CHECK_SIZE(flags, DIGEST, SHA256_DIGEST_SIZE);
+}
+
+struct cfs_dentry_s {
+ /* Index of struct cfs_inode_s */
+ u64 inode_index;
+ u8 d_type;
+ u8 name_len;
+ u16 name_offset;
+} __packed;
+
+struct cfs_dir_chunk_s {
+ u16 n_dentries;
+ u16 chunk_size;
+ u64 chunk_offset;
+} __packed;
+
+struct cfs_dir_s {
+ u32 n_chunks;
+ struct cfs_dir_chunk_s chunks[];
+} __packed;
+
+#define cfs_dir_size(_n_chunks) \
+ (sizeof(struct cfs_dir_s) + (_n_chunks) * sizeof(struct cfs_dir_chunk_s))
+
+/* xattr representation. */
+struct cfs_xattr_element_s {
+ u16 key_length;
+ u16 value_length;
+} __packed;
+
+struct cfs_xattr_header_s {
+ u16 n_attr;
+ struct cfs_xattr_element_s attr[0];
+} __packed;
+
+#define cfs_xattr_header_size(_n_element) \
+ (sizeof(struct cfs_xattr_header_s) + \
+ (_n_element) * sizeof(struct cfs_xattr_element_s))
+
+#endif
--
2.39.0


2023-01-16 01:53:54

by Dave Chinner

Subject: Re: [PATCH v2 2/6] composefs: Add on-disk layout

On Fri, Jan 13, 2023 at 04:33:55PM +0100, Alexander Larsson wrote:
> This commit adds the on-disk layout header file of composefs.

This isn't really a useful commit message.

Perhaps it should actually explain what the overall goals of the
on-disk format are - space usage, complexity trade-offs, potential
issues with validation of variable payload sections, etc.

>
> Signed-off-by: Alexander Larsson <[email protected]>
> Co-developed-by: Giuseppe Scrivano <[email protected]>
> Signed-off-by: Giuseppe Scrivano <[email protected]>
> ---
> fs/composefs/cfs.h | 203 +++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 203 insertions(+)
> create mode 100644 fs/composefs/cfs.h
>
> diff --git a/fs/composefs/cfs.h b/fs/composefs/cfs.h
> new file mode 100644
> index 000000000000..658df728e366
> --- /dev/null
> +++ b/fs/composefs/cfs.h
> @@ -0,0 +1,203 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * composefs
> + *
> + * Copyright (C) 2021 Giuseppe Scrivano
> + * Copyright (C) 2022 Alexander Larsson
> + *
> + * This file is released under the GPL.
> + */
> +
> +#ifndef _CFS_H
> +#define _CFS_H
> +
> +#include <asm/byteorder.h>
> +#include <crypto/sha2.h>
> +#include <linux/fs.h>
> +#include <linux/stat.h>
> +#include <linux/types.h>
> +
> +#define CFS_VERSION 1

This should start with a description of the on-disk format for the
version 1 format.


> +
> +#define CFS_MAGIC 0xc078629aU
> +
> +#define CFS_MAX_DIR_CHUNK_SIZE 4096
> +#define CFS_MAX_XATTRS_SIZE 4096

How do we store 64kB xattrs in this format if the max attr size is
4096 bytes? Or is that the maximum total xattr storage?

A comment telling us what these limits are would be nice.

> +
> +static inline int cfs_digest_from_payload(const char *payload, size_t payload_len,
> + u8 digest_out[SHA256_DIGEST_SIZE])
> +{
> + const char *p, *end;
> + u8 last_digit = 0;
> + int digit = 0;
> + size_t n_nibbles = 0;
> +
> + /* This handles payloads (i.e. path names) that are "essentially" a
> + * digest as the digest (if the DIGEST_FROM_PAYLOAD flag is set). The
> + * "essential" part means that we ignore hierarchical structure as well
> + * as any extension. So, for example "ef/deadbeef.file" would match the
> + * (too short) digest "efdeadbeef".
> + *
> + * This allows images to avoid storing both the digest and the pathname,
> + * yet work with pre-existing object store formats of various kinds.
> + */
> +
> + end = payload + payload_len;
> + for (p = payload; p != end; p++) {
> + /* Skip subdir structure */
> + if (*p == '/')
> + continue;
> +
> + /* Break at (and ignore) extension */
> + if (*p == '.')
> + break;
> +
> + if (n_nibbles == SHA256_DIGEST_SIZE * 2)
> + return -EINVAL; /* Too long */
> +
> + digit = hex_to_bin(*p);
> + if (digit == -1)
> + return -EINVAL; /* Not hex digit */
> +
> + n_nibbles++;
> + if ((n_nibbles % 2) == 0)
> + digest_out[n_nibbles / 2 - 1] = (last_digit << 4) | digit;
> + last_digit = digit;
> + }
> +
> + if (n_nibbles != SHA256_DIGEST_SIZE * 2)
> + return -EINVAL; /* Too short */
> +
> + return 0;
> +}

Too big to be a inline function.

> +
> +struct cfs_vdata_s {

Drop the "_s" suffix to indicate the type is a structure - that's
waht "struct" tells us.

> + u64 off;
> + u32 len;

If these are on-disk format structures, why aren't the defined as
using the specific endian they are encoded in? i.e. __le64, __le32,
etc? Otherwise a file built on a big endian machine won't be
readable on a little endian machine (and vice versa).
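
i.e. a rough sketch of what I mean (the "_disk" struct and the decode
helper are made-up names for illustration, not something in this
patchset):

        /* on-disk copy, byte order explicit */
        struct cfs_vdata_disk {
                __le64 off;
                __le32 len;
        } __packed;

        /* decode into the native-endian in-memory structure */
        static void cfs_vdata_from_disk(struct cfs_vdata_s *dst,
                                        const struct cfs_vdata_disk *src)
        {
                dst->off = le64_to_cpu(src->off);
                dst->len = le32_to_cpu(src->len);
        }

That way the on-disk structure documents its own byte order and
sparse can verify the endian conversions are actually being done.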

> +} __packed;
> +
> +struct cfs_header_s {
> + u8 version;
> + u8 unused1;
> + u16 unused2;

Why are you hyper-optimising these structures for minimal space
usage? This is 2023 - we can use a __le32 for the version number,
the magic number and then leave....
> +
> + u32 magic;
> + u64 data_offset;
> + u64 root_inode;
> +
> + u64 unused3[2];

a whole heap of space to round it up to at least a CPU cacheline
size using something like "__le64 unused[15]".

That way we don't need packed structures nor do we care about having
weird little holes in the structures to fill....
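
i.e. a sketch of the sort of layout I mean (illustrative only, the
field order and padding here are not a proposal):

        struct cfs_header {
                __le32 magic;
                __le32 version;
                __le64 data_offset;
                __le64 root_inode;

                __le64 unused[15];      /* reserved for future use */
        };

Naturally aligned fields, no holes, no need for __packed, and plenty
of reserved space for future format extensions.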

> +} __packed;
> +
> +enum cfs_inode_flags {
> + CFS_INODE_FLAGS_NONE = 0,
> + CFS_INODE_FLAGS_PAYLOAD = 1 << 0,
> + CFS_INODE_FLAGS_MODE = 1 << 1,
> + CFS_INODE_FLAGS_NLINK = 1 << 2,
> + CFS_INODE_FLAGS_UIDGID = 1 << 3,
> + CFS_INODE_FLAGS_RDEV = 1 << 4,
> + CFS_INODE_FLAGS_TIMES = 1 << 5,
> + CFS_INODE_FLAGS_TIMES_NSEC = 1 << 6,
> + CFS_INODE_FLAGS_LOW_SIZE = 1 << 7, /* Low 32bit of st_size */
> + CFS_INODE_FLAGS_HIGH_SIZE = 1 << 8, /* High 32bit of st_size */

Why do we need to complicate things by splitting the inode size
like this?

> + CFS_INODE_FLAGS_XATTRS = 1 << 9,
> + CFS_INODE_FLAGS_DIGEST = 1 << 10, /* fs-verity sha256 digest */
> + CFS_INODE_FLAGS_DIGEST_FROM_PAYLOAD = 1 << 11, /* Compute digest from payload */
> +};
> +
> +#define CFS_INODE_FLAG_CHECK(_flag, _name) \
> + (((_flag) & (CFS_INODE_FLAGS_##_name)) != 0)

Check what about a flag? If this is a "check that a feature is set",
then open coding it is better, but if you must do it like this, then
please use static inline functions like:

        if (cfs_inode_has_xattrs(inode->flags)) {
                .....
        }

> +#define CFS_INODE_FLAG_CHECK_SIZE(_flag, _name, _size) \
> + (CFS_INODE_FLAG_CHECK(_flag, _name) ? (_size) : 0)

This doesn't seem particularly useful, because you've still got to
test if the return value is valid. i.e.

        size = CFS_INODE_FLAG_CHECK_SIZE(inode->flags, XATTRS, 32);
        if (size == 32) {
                /* got xattrs, decode! */
        }

vs
        if (cfs_inode_has_xattrs(inode->flags)) {
                /* decode! */
        }
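
i.e. where a named predicate is actually wanted, a trivial helper
like this (sketch only) reads far better at the call sites than the
macro does:

        static inline bool cfs_inode_has_xattrs(u32 flags)
        {
                return (flags & CFS_INODE_FLAGS_XATTRS) != 0;
        }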



> +
> +#define CFS_INODE_DEFAULT_MODE 0100644
> +#define CFS_INODE_DEFAULT_NLINK 1
> +#define CFS_INODE_DEFAULT_NLINK_DIR 2
> +#define CFS_INODE_DEFAULT_UIDGID 0
> +#define CFS_INODE_DEFAULT_RDEV 0
> +#define CFS_INODE_DEFAULT_TIMES 0

Where do these get used? Are they on disk defaults or something
else? (comment, please!)

> +struct cfs_inode_s {
> + u32 flags;
> +
> + /* Optional data: (selected by flags) */

WHy would you make them optional given that all the fields are still
defined in the structure?

It's much simpler just to decode the entire structure into memory
than to have to check each flag value to determine if a field needs
to be decoded...

> + /* This is the size of the type specific data that comes directly after
> + * the inode in the file. Of this type:
> + *
> + * directory: cfs_dir_s
> + * regular file: the backing filename
> + * symlink: the target link
> + *
> + * Canonically payload_length is 0 for empty dir/file/symlink.
> + */
> + u32 payload_length;

How do you have an empty symlink?

> + u32 st_mode; /* File type and mode. */
> + u32 st_nlink; /* Number of hard links, only for regular files. */
> + u32 st_uid; /* User ID of owner. */
> + u32 st_gid; /* Group ID of owner. */
> + u32 st_rdev; /* Device ID (if special file). */
> + u64 st_size; /* Size of file, only used for regular files */
> +
> + struct cfs_vdata_s xattrs; /* ref to variable data */

This is in the payload that follows the inode? Is it included in
the payload_length above?

If not, where is this stuff located, how do we validate it points to
the correct place in the on-disk format file, the xattrs belong to
this specific inode, etc? I think that's kinda important to
describe, because xattrs often contain important security
information...


> +
> + u8 digest[SHA256_DIGEST_SIZE]; /* fs-verity digest */

Why would you have this in the on-disk structure, then also have
"digest from payload" that allows the digest to be in the payload
section of the inode data?

> +
> + struct timespec64 st_mtim; /* Time of last modification. */
> + struct timespec64 st_ctim; /* Time of last status change. */
> +};

This really feels like an in-memory format inode, not an on-disk
format inode, because this:

> +
> +static inline u32 cfs_inode_encoded_size(u32 flags)
> +{
> + return sizeof(u32) /* flags */ +
> + CFS_INODE_FLAG_CHECK_SIZE(flags, PAYLOAD, sizeof(u32)) +
> + CFS_INODE_FLAG_CHECK_SIZE(flags, MODE, sizeof(u32)) +
> + CFS_INODE_FLAG_CHECK_SIZE(flags, NLINK, sizeof(u32)) +
> + CFS_INODE_FLAG_CHECK_SIZE(flags, UIDGID, sizeof(u32) + sizeof(u32)) +
> + CFS_INODE_FLAG_CHECK_SIZE(flags, RDEV, sizeof(u32)) +
> + CFS_INODE_FLAG_CHECK_SIZE(flags, TIMES, sizeof(u64) * 2) +
> + CFS_INODE_FLAG_CHECK_SIZE(flags, TIMES_NSEC, sizeof(u32) * 2) +
> + CFS_INODE_FLAG_CHECK_SIZE(flags, LOW_SIZE, sizeof(u32)) +
> + CFS_INODE_FLAG_CHECK_SIZE(flags, HIGH_SIZE, sizeof(u32)) +
> + CFS_INODE_FLAG_CHECK_SIZE(flags, XATTRS, sizeof(u64) + sizeof(u32)) +
> + CFS_INODE_FLAG_CHECK_SIZE(flags, DIGEST, SHA256_DIGEST_SIZE);
> +}

looks like the on-disk format is an encoded format hyper-optimised
for minimal storage space usage?

Without comments to explain it, I'm not exactly sure what is stored
in the on-disk format inodes, nor the layout of the variable
payload section or how payload sections are defined and verified.

Seems overly complex to me - it's far simpler just to have a fixed
inode structure and just decode it directly into the in-memory
structure when it is read....

> +struct cfs_dentry_s {
> + /* Index of struct cfs_inode_s */

Not a useful (or correct!) comment :/

Also, the typical term for this on disk structure in a filesystem is
a "dirent", and this is also what readdir() returns to userspace.
dentry is typically used internally in the kernel to refer to the
VFS cache layer objects, not the filesystem dirents the VFS layers
look up to populate its dentry cache.

> + u64 inode_index;
> + u8 d_type;
> + u8 name_len;
> + u16 name_offset;

What's this name_offset refer to?

> +} __packed;
> +
> +struct cfs_dir_chunk_s {
> + u16 n_dentries;
> + u16 chunk_size;
> + u64 chunk_offset;

What's this chunk offset refer to?

> +} __packed;
> +
> +struct cfs_dir_s {
> + u32 n_chunks;
> + struct cfs_dir_chunk_s chunks[];
> +} __packed;

So directory data is packed in discrete chunks? Given that this is a
static directory format, and the size of the directory is known at
image creation time, why does the storage need to be chunked?

> +
> +#define cfs_dir_size(_n_chunks) \
> + (sizeof(struct cfs_dir_s) + (_n_chunks) * sizeof(struct cfs_dir_chunk_s))

static inline, at least.

Also, this appears to be the size of the encoded directory
header, not the size of the directory itself. cfs_dir_header_size(),
perhaps, to match the cfs_xattr_header_size() function that does the
same thing?

Cheers,

Dave.
--
Dave Chinner
[email protected]

2023-01-16 11:10:37

by Alexander Larsson

Subject: Re: [PATCH v2 2/6] composefs: Add on-disk layout

On Mon, 2023-01-16 at 12:29 +1100, Dave Chinner wrote:
> On Fri, Jan 13, 2023 at 04:33:55PM +0100, Alexander Larsson wrote:
> > This commit adds the on-disk layout header file of composefs.
>
> This isn't really a useful commit message.
>
> Perhaps it should actually explain what the overall goals of the
> on-disk format are - space usage, complexity trade-offs, potential
> issues with validation of variable payload sections, etc.
>

I agree, will flesh it out. But, as for below discussions, one of the
overall goals is to keep the on-disk file size low.

> > Signed-off-by: Alexander Larsson <[email protected]>
> > Co-developed-by: Giuseppe Scrivano <[email protected]>
> > Signed-off-by: Giuseppe Scrivano <[email protected]>
> > ---
> >  fs/composefs/cfs.h | 203
> > +++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 203 insertions(+)
> >  create mode 100644 fs/composefs/cfs.h
> >
> > diff --git a/fs/composefs/cfs.h b/fs/composefs/cfs.h
> > new file mode 100644
> > index 000000000000..658df728e366
> > --- /dev/null
> > +++ b/fs/composefs/cfs.h
> > @@ -0,0 +1,203 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +/*
> > + * composefs
> > + *
> > + * Copyright (C) 2021 Giuseppe Scrivano
> > + * Copyright (C) 2022 Alexander Larsson
> > + *
> > + * This file is released under the GPL.
> > + */
> > +
> > +#ifndef _CFS_H
> > +#define _CFS_H
> > +
> > +#include <asm/byteorder.h>
> > +#include <crypto/sha2.h>
> > +#include <linux/fs.h>
> > +#include <linux/stat.h>
> > +#include <linux/types.h>
> > +
> > +#define CFS_VERSION 1
>
> This should start with a description of the on-disk format for the
> version 1 format.

There are some format descriptions in the later document patch. What is
the general approach here, do we document in the header, or in separate
doc file? For example, I don't see much of format descriptions in the
xfs headers. I mean, I should probably add *some* info here for easier
reading of the stuff below, but I don't feel like headers are a great
place for docs.

> > +
> > +#define CFS_MAGIC 0xc078629aU
> > +
> > +#define CFS_MAX_DIR_CHUNK_SIZE 4096
> > +#define CFS_MAX_XATTRS_SIZE 4096
>
> How do we store 64kB xattrs in this format if the max attr size is
> 4096 bytes? Or is that the maximum total xattr storage?

This is a current limitation of the composefs file format. I am aware
that the kernel maximum size is 64k, but I'm not sure what use this
would have in a read-only filesystem image in practice. I could extend
this limit with some added complexity, but would it be worth the
increase in complexity?

> A comment telling us what these limits are would be nice.
>

Sure.

> > +
> > +static inline int cfs_digest_from_payload(const char *payload,
> > size_t payload_len,
> > +                                         u8
> > digest_out[SHA256_DIGEST_SIZE])
> > +{
> > +       const char *p, *end;
> > +       u8 last_digit = 0;
> > +       int digit = 0;
> > +       size_t n_nibbles = 0;
> > +
> > +       /* This handles payloads (i.e. path names) that are
> > "essentially" a
> > +        * digest as the digest (if the DIGEST_FROM_PAYLOAD flag is
> > set). The
> > +        * "essential" part means that we ignore hierarchical
> > structure as well
> > +        * as any extension. So, for example "ef/deadbeef.file"
> > would match the
> > +        * (too short) digest "efdeadbeef".
> > +        *
> > +        * This allows images to avoid storing both the digest and
> > the pathname,
> > +        * yet work with pre-existing object store formats of
> > various kinds.
> > +        */
> > +
> > +       end = payload + payload_len;
> > +       for (p = payload; p != end; p++) {
> > +               /* Skip subdir structure */
> > +               if (*p == '/')
> > +                       continue;
> > +
> > +               /* Break at (and ignore) extension */
> > +               if (*p == '.')
> > +                       break;
> > +
> > +               if (n_nibbles == SHA256_DIGEST_SIZE * 2)
> > +                       return -EINVAL; /* Too long */
> > +
> > +               digit = hex_to_bin(*p);
> > +               if (digit == -1)
> > +                       return -EINVAL; /* Not hex digit */
> > +
> > +               n_nibbles++;
> > +               if ((n_nibbles % 2) == 0)
> > +                       digest_out[n_nibbles / 2 - 1] = (last_digit
> > << 4) | digit;
> > +               last_digit = digit;
> > +       }
> > +
> > +       if (n_nibbles != SHA256_DIGEST_SIZE * 2)
> > +               return -EINVAL; /* Too short */
> > +
> > +       return 0;
> > +}
>
> Too big to be a inline function.
>

Yeah, I'm aware of this. I mainly put it in the header as the
implementation of it is sort of part of the on-disk format. But, I can
move it to a .c file instead.


> > +
> > +struct cfs_vdata_s {
>
> Drop the "_s" suffix to indicate the type is a structure - that's
> what "struct" tells us.

Sure.

> > +       u64 off;
> > +       u32 len;
>
> If these are on-disk format structures, why aren't the defined as
> using the specific endian they are encoded in? i.e. __le64, __le32,
> etc? Otherwise a file built on a big endian machine won't be
> readable on a little endian machine (and vice versa).

On disk all fields are little endian. However, when we read them from
disk we convert them using e.g. le32_to_cpu(), and then we use the same
structure in memory, with native endian. So, it seems wrong to mark
them as little endian.

>
> > +} __packed;
> > +
> > +struct cfs_header_s {
> > +       u8 version;
> > +       u8 unused1;
> > +       u16 unused2;
>
> Why are you hyper-optimising these structures for minimal space
> usage? This is 2023 - we can use a __le32 for the version number,
> the magic number and then leave....
>
> > +
> > +       u32 magic;
> > +       u64 data_offset;
> > +       u64 root_inode;
> > +
> > +       u64 unused3[2];
>
> a whole heap of space to round it up to at least a CPU cacheline
> size using something like "__le64 unused[15]".
>
> That way we don't need packed structures nor do we care about having
> weird little holes in the structures to fill....

Sure.

> > +} __packed;
> > +
> > +enum cfs_inode_flags {
> > +       CFS_INODE_FLAGS_NONE = 0,
> > +       CFS_INODE_FLAGS_PAYLOAD = 1 << 0,
> > +       CFS_INODE_FLAGS_MODE = 1 << 1,
> > +       CFS_INODE_FLAGS_NLINK = 1 << 2,
> > +       CFS_INODE_FLAGS_UIDGID = 1 << 3,
> > +       CFS_INODE_FLAGS_RDEV = 1 << 4,
> > +       CFS_INODE_FLAGS_TIMES = 1 << 5,
> > +       CFS_INODE_FLAGS_TIMES_NSEC = 1 << 6,
> > +       CFS_INODE_FLAGS_LOW_SIZE = 1 << 7, /* Low 32bit of st_size
> > */
> > +       CFS_INODE_FLAGS_HIGH_SIZE = 1 << 8, /* High 32bit of
> > st_size */
>
> Why do we need to complicate things by splitting the inode size
> like this?
>

The goal is to minimize the image size for a typical rootfs or
container image. Almost zero files in any such images are > 4GB. 

Also, we don't just "not decode" the items with the flag not set, they
are not even stored on disk.

> > +       CFS_INODE_FLAGS_XATTRS = 1 << 9,
> > +       CFS_INODE_FLAGS_DIGEST = 1 << 10, /* fs-verity sha256
> > digest */
> > +       CFS_INODE_FLAGS_DIGEST_FROM_PAYLOAD = 1 << 11, /* Compute
> > digest from payload */
> > +};
> > +
> > +#define CFS_INODE_FLAG_CHECK(_flag,
> > _name)                                     \
> > +       (((_flag) & (CFS_INODE_FLAGS_##_name)) != 0)
>
> Check what about a flag? If this is a "check that a feature is set",
> then open coding it is better, but if you must do it like this, then
> please use static inline functions like:
>
>         if (cfs_inode_has_xattrs(inode->flags)) {
>                 .....
>         }
>

The check is if the flag is set, so maybe CFS_INODE_FLAG_IS_SET is a
better name. This is used only when decoding the on-disk version of the
inode to the in memory one, which is a bunch of:

if (CFS_INODE_FLAG_CHECK(ino->flags, THE_FIELD))
        ino->the_field = cfs_read_u32(&data);
else
        ino->the_field = THE_FIELD_DEFAULT;

I can easily open-code these checks, although I'm not sure it makes a
great difference either way.

> > +#define CFS_INODE_FLAG_CHECK_SIZE(_flag, _name,
> > _size)                         \
> > +       (CFS_INODE_FLAG_CHECK(_flag, _name) ? (_size) : 0)
>
> This doesn't seem particularly useful, because you've still got to
> test if the return value is valid. i.e.
>
>         size = CFS_INODE_FLAG_CHECK_SIZE(inode->flags, XATTRS, 32);
>         if (size == 32) {
>                 /* got xattrs, decode! */
>         }
>
> vs
>         if (cfs_inode_has_xattrs(inode->flags)) {
>                 /* decode! */
>         }

This macro is only used by the cfs_inode_encoded_size() function that
computes the size of the on-disk format of an inode, given its flags:

static inline u32 cfs_inode_encoded_size(u32 flags)
{
        return sizeof(u32) /* flags */ +
               CFS_INODE_FLAG_CHECK_SIZE(flags, PAYLOAD, sizeof(u32)) +
               CFS_INODE_FLAG_CHECK_SIZE(flags, MODE, sizeof(u32)) +
               CFS_INODE_FLAG_CHECK_SIZE(flags, NLINK, sizeof(u32)) +
               ...

It is only useful in the sense that it makes this function easy to
read/write. I should maybe move the definition of the macro to that
function.

>
> > +
> > +#define CFS_INODE_DEFAULT_MODE 0100644
> > +#define CFS_INODE_DEFAULT_NLINK 1
> > +#define CFS_INODE_DEFAULT_NLINK_DIR 2
> > +#define CFS_INODE_DEFAULT_UIDGID 0
> > +#define CFS_INODE_DEFAULT_RDEV 0
> > +#define CFS_INODE_DEFAULT_TIMES 0
>
> Where do these get used? Are they on disk defaults or something
> else? (comment, please!)

They are the defaults that are used when inode fields on disk are
missing. I'll add some comments.

> > +struct cfs_inode_s {
> > +       u32 flags;
> > +
> > +       /* Optional data: (selected by flags) */
>
> WHy would you make them optional given that all the fields are still
> defined in the structure?
>
> It's much simpler just to decode the entire structure into memory
> than to have to check each flag value to determine if a field needs
> to be decoded...
>

I guess I need to clarify these comments a bit, but they are optional
on-disk, and decoded and extended with the above defaults by
cfs_get_ino_index() when read into memory. So, they are not optional in
memory.


> > +       /* This is the size of the type specific data that comes
> > directly after
> > +        * the inode in the file. Of this type:
> > +        *
> > +        * directory: cfs_dir_s
> > +        * regular file: the backing filename
> > +        * symlink: the target link
> > +        *
> > +        * Canonically payload_length is 0 for empty
> > dir/file/symlink.
> > +        */
> > +       u32 payload_length;
>
> How do you have an empty symlink?

In terms of the file format, empty would mean a zero length target
string. But you're right that this isn't allowed. I'll change this
comment.

> > +       u32 st_mode; /* File type and mode.  */
> > +       u32 st_nlink; /* Number of hard links, only for regular
> > files.  */
> > +       u32 st_uid; /* User ID of owner.  */
> > +       u32 st_gid; /* Group ID of owner.  */
> > +       u32 st_rdev; /* Device ID (if special file).  */
> > +       u64 st_size; /* Size of file, only used for regular files
> > */
> > +
> > +       struct cfs_vdata_s xattrs; /* ref to variable data */
>
> This is in the payload that follows the inode?  Is it included in
> the payload_length above?
>
> If not, where is this stuff located, how do we validate it points to
> the correct place in the on-disk format file, the xattrs belong to
> this specific inode, etc? I think that's kinda important to
> describe, because xattrs often contain important security
> information...

No, all inodes are packed into the initial part of the file, each
containing a flags set, a variable size (from flags) chunk of fixed
size elements and a variable size payload. The payload is either the
target symlink for symlinks, or the path of the backing file for
regular files. Other data, such as xattrs and dirents are stored in a
separate part of the file and the offsets for those in the inode refer
to offsets into that area.
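
Roughly, as a sketch of the overall layout (the section names here
are just descriptive, this isn't lifted from the docs patch):

        offset 0:  struct cfs_header_s (the superblock)
                   packed inode section:
                     [flags][fields selected by flags][payload]
                     ... one such record per inode ...
                   variable data section:
                     dir chunks and xattr blobs, referenced from the
                     inodes by (offset, length) pairs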

>
> > +
> > +       u8 digest[SHA256_DIGEST_SIZE]; /* fs-verity digest */
>
> Why would you have this in the on-disk structure, then also have
> "digest from payload" that allows the digest to be in the payload
> section of the inode data?

The payload is normally the path to the backing file, and then you need
to store the verity digest separately. This is what would be needed
when using this with ostree for instance, because we have an existing
backing file repo format we can't change. However, if your backing
store files are stored by their fs-verity digest already (which is the
default for mkcomposefs), then we can set this flag and avoid storing
the digest unnecessarily.

> > +
> > +       struct timespec64 st_mtim; /* Time of last modification. 
> > */
> > +       struct timespec64 st_ctim; /* Time of last status change. 
> > */
> > +};
>
> This really feels like an in-memory format inode, not an on-disk
> format inode, because this:
>
> > +
> > +static inline u32 cfs_inode_encoded_size(u32 flags)
> > +{
> > +       return sizeof(u32) /* flags */ +
> > +              CFS_INODE_FLAG_CHECK_SIZE(flags, PAYLOAD,
> > sizeof(u32)) +
> > +              CFS_INODE_FLAG_CHECK_SIZE(flags, MODE, sizeof(u32))
> > +
> > +              CFS_INODE_FLAG_CHECK_SIZE(flags, NLINK, sizeof(u32))
> > +
> > +              CFS_INODE_FLAG_CHECK_SIZE(flags, UIDGID, sizeof(u32)
> > + sizeof(u32)) +
> > +              CFS_INODE_FLAG_CHECK_SIZE(flags, RDEV, sizeof(u32))
> > +
> > +              CFS_INODE_FLAG_CHECK_SIZE(flags, TIMES, sizeof(u64)
> > * 2) +
> > +              CFS_INODE_FLAG_CHECK_SIZE(flags, TIMES_NSEC,
> > sizeof(u32) * 2) +
> > +              CFS_INODE_FLAG_CHECK_SIZE(flags, LOW_SIZE,
> > sizeof(u32)) +
> > +              CFS_INODE_FLAG_CHECK_SIZE(flags, HIGH_SIZE,
> > sizeof(u32)) +
> > +              CFS_INODE_FLAG_CHECK_SIZE(flags, XATTRS, sizeof(u64)
> > + sizeof(u32)) +
> > +              CFS_INODE_FLAG_CHECK_SIZE(flags, DIGEST,
> > SHA256_DIGEST_SIZE);
> > +}
>
> looks like the on-disk format is an encoded format hyper-optimised
> for minimal storage space usage?

Yes.


> Without comments to explain it, I'm not exactly sure what is stored
> in the on-disk format inodes, nor the layout of the variable
> payload section or how payload sections are defined and verified.
>
> Seems overly complex to me - it's far simpler just to have a fixed
> inode structure and just decode it directly into the in-memory
> structure when it is read....

We have a not-fixed-size on disk inode structure (for size reasons)
which we decode directly into the above in-memory structure when read.
So I don't think we're that far from what you expect. However, yes,
this could easily be explained better.

>
> > +struct cfs_dentry_s {
> > +       /* Index of struct cfs_inode_s */
>
> Not a useful (or correct!) comment :/

It's not really incorrect, but I agree it's not necessarily a great
comment. At this specific offset in the inode section we can decode the
cfs_inode_s that this inode_index refers to, and this offset is also the
inode number of the inode.

> Also, the typical term for this on disk structure in a filesystem is
> a "dirent", and this is also what readdir() returns to userspace.
> dentry is typically used internally in the kernel to refer to the
> VFS cache layer objects, not the filesystem dirents the VFS layers
> look up to populate its dentry cache.
>

Yeah, i'll rename it.

> > +       u64 inode_index;
> > +       u8 d_type;
> > +       u8 name_len;
> > +       u16 name_offset;
>
> What's this name_offset refer to?

Dirents are stored in chunks, each chunk < 4k. Each chunk is a list of
these dirents, followed by the strings for the names; name_offset is
the offset from the start of the chunk to the name.
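
i.e. within one chunk the name lookup is just (sketch, the helper name
is made up):

        static inline const char *cfs_dirent_name(const void *chunk,
                                                  const struct cfs_dentry_s *de)
        {
                return (const char *)chunk + de->name_offset;
        }

so names are never copied around, we just point into the mapped chunk
and compare de->name_len bytes.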

> > +} __packed;
> > +
> > +struct cfs_dir_chunk_s {
> > +       u16 n_dentries;
> > +       u16 chunk_size;
> > +       u64 chunk_offset;
>
> What's this chunk offset refer to?
>

This is the offset in the "variable data" section of the image. This
section follows the packed inode data section. Again, better comments
needed.

> > +} __packed;
> > +
> > +struct cfs_dir_s {
> > +       u32 n_chunks;
> > +       struct cfs_dir_chunk_s chunks[];
> > +} __packed;
>
> So directory data is packed in discrete chunks? Given that this is a
> static directory format, and the size of the directory is known at
> image creation time, why does the storage need to be chunked?

We chunk the data such that each chunk fits inside a single page in the
image file. I did this to make accessing image data directly from the
page cache easier. We can just kmap_local_page() each chunk and treat
it as a non-split continuous dirent array, then move on to the next
chunk in the next page. If we had dirent data spanning multiple pages
then we would either need to map the pages consecutively (which seems
hard/costly) or have complex in-kernel code to handle the case where a
dirent straddles two pages.
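
Roughly, a sketch of the directory reading loop this enables
(cfs_map_chunk() and emit_one_dirent() are made-up helpers here;
cfs_map_chunk() would be kmap_local_page() on the chunk's backing
page plus the chunk's offset within it, error handling elided):

        size_t i, j;

        for (i = 0; i < dir->n_chunks; i++) {
                struct cfs_dir_chunk_s *chunk = &dir->chunks[i];
                struct cfs_dentry_s *dentries = cfs_map_chunk(inode, chunk);

                /* chunk_size <= PAGE_SIZE, so neither the dentry array
                 * nor the name strings straddle a page boundary */
                for (j = 0; j < chunk->n_dentries; j++)
                        emit_one_dirent(&dentries[j]);

                kunmap_local(dentries);
        }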

> > +
> > +#define
> > cfs_dir_size(_n_chunks)                                            
> >     \
> > +       (sizeof(struct cfs_dir_s) + (_n_chunks) * sizeof(struct
> > cfs_dir_chunk_s))
>
> static inline, at least.
>
> Also, this appears to be the size of the encoded directory
> header, not the size of the directory itself. cfs_dir_header_size(),
> perhaps, to match the cfs_xattr_header_size() function that does the
> same thing?

Yeah, that makes sense.

--
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
 Alexander Larsson                                            Red Hat, Inc
 [email protected]                                  [email protected]
He's a suicidal hunchbacked cyborg who knows the secret of the alien
invasion. She's a time-travelling out-of-work snake charmer with a song
in her heart and a spring in her step. They fight crime!

2023-01-16 23:22:18

by Dave Chinner

Subject: Re: [PATCH v2 2/6] composefs: Add on-disk layout

On Mon, Jan 16, 2023 at 12:00:03PM +0100, Alexander Larsson wrote:
> On Mon, 2023-01-16 at 12:29 +1100, Dave Chinner wrote:
> > On Fri, Jan 13, 2023 at 04:33:55PM +0100, Alexander Larsson wrote:
> > > This commit adds the on-disk layout header file of composefs.
> >
> > This isn't really a useful commit message.
> >
> > Perhaps it should actually explain what the overall goals of the
> > on-disk format are - space usage, complexity trade-offs, potential
> > issues with validation of variable payload sections, etc.
> >
>
> I agree, will flesh it out. But, as for below discussions, one of the
> overall goals is to keep the on-disk file size low.
>
> > > Signed-off-by: Alexander Larsson <[email protected]>
> > > Co-developed-by: Giuseppe Scrivano <[email protected]>
> > > Signed-off-by: Giuseppe Scrivano <[email protected]>
> > > ---
> > >  fs/composefs/cfs.h | 203
> > > +++++++++++++++++++++++++++++++++++++++++++++
> > >  1 file changed, 203 insertions(+)
> > >  create mode 100644 fs/composefs/cfs.h
> > >
> > > diff --git a/fs/composefs/cfs.h b/fs/composefs/cfs.h
> > > new file mode 100644
> > > index 000000000000..658df728e366
> > > --- /dev/null
> > > +++ b/fs/composefs/cfs.h
> > > @@ -0,0 +1,203 @@
> > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > +/*
> > > + * composefs
> > > + *
> > > + * Copyright (C) 2021 Giuseppe Scrivano
> > > + * Copyright (C) 2022 Alexander Larsson
> > > + *
> > > + * This file is released under the GPL.
> > > + */
> > > +
> > > +#ifndef _CFS_H
> > > +#define _CFS_H
> > > +
> > > +#include <asm/byteorder.h>
> > > +#include <crypto/sha2.h>
> > > +#include <linux/fs.h>
> > > +#include <linux/stat.h>
> > > +#include <linux/types.h>
> > > +
> > > +#define CFS_VERSION 1
> >
> > This should start with a description of the on-disk format for the
> > version 1 format.
>
> There are some format descriptions in the later document patch. What is
> the general approach here, do we document in the header, or in separate
> doc file? For example, I don't see much of format descriptions in the
> xfs headers. I mean, I should probably add *some* info here for easier
> reading of the stuff below, but I don't feel like headers are a great
> place for docs.

it's fine to describe the format in the docs, but when reading the
code there needs to at least an overview of the structure the code
is implementing so that the code makes some sense without having to
go find the external place the format is documented.

> > > +
> > > +#define CFS_MAGIC 0xc078629aU
> > > +
> > > +#define CFS_MAX_DIR_CHUNK_SIZE 4096
> > > +#define CFS_MAX_XATTRS_SIZE 4096
> >
> > How do we store 64kB xattrs in this format if the max attr size is
> > 4096 bytes? Or is that the maximum total xattr storage?
>
> This is a current limitation of the composefs file format.

Yes, but is that 4kB limit the maximum size of a single xattr, or is
it the total xattr storage space for an inode?

> I am aware
> that the kernel maximum size is 64k,

For a single xattr, yes. Hence my question....

> > > +static inline int cfs_digest_from_payload(const char *payload,
> > > size_t payload_len,
> > > +                                         u8
> > > digest_out[SHA256_DIGEST_SIZE])
.....
> > Too big to be a inline function.
>
> Yeah, I'm aware of this. I mainly put it in the header as the
> implementation of it is sort of part of the on-disk format. But, I can
> move it to a .c file instead.

Please do - it's really part of the reader implementation, not the
structure definition.

> > > +struct cfs_vdata_s {
> >
> > Drop the "_s" suffix to indicate the type is a structure - that's
> > what "struct" tells us.
>
> Sure.
>
> > > +       u64 off;
> > > +       u32 len;
> >
> > If these are on-disk format structures, why aren't the defined as
> > using the specific endian they are encoded in? i.e. __le64, __le32,
> > etc? Otherwise a file built on a big endian machine won't be
> > readable on a little endian machine (and vice versa).
>
> On disk all fields are little endian. However, when we read them from
> disk we convert them using e.g. le32_to_cpu(), and then we use the same
> structure in memory, with native endian. So, it seems wrong to mark
> them as little endian.

Then these structures do not define "on-disk format". Looking a bit
further through the patchset, these are largely intermediate
structures that are read once to instantiate objects in memory, then
never used again. The cfs_inode_s is a good example of this - I'll
come back to that.

>
> >
> > > +} __packed;
> > > +
> > > +struct cfs_header_s {
> > > +       u8 version;
> > > +       u8 unused1;
> > > +       u16 unused2;
> >
> > Why are you hyper-optimising these structures for minimal space
> > usage? This is 2023 - we can use a __le32 for the version number,
> > the magic number and then leave....
> >
> > > +
> > > +       u32 magic;
> > > +       u64 data_offset;
> > > +       u64 root_inode;
> > > +
> > > +       u64 unused3[2];
> >
> > a whole heap of space to round it up to at least a CPU cacheline
> > size using something like "__le64 unused[15]".
> >
> > That way we don't need packed structures nor do we care about having
> > weird little holes in the structures to fill....
>
> Sure.

FWIW, now I see how this is used, this header kinda defines what
we'd call the superblock in the on-disk format of a filesystem. It's
at a fixed location in the image file, so there should be a #define
somewhere in this file to document its fixed location.

Also, if this is the in-memory representation of the structure and
not the actual on-disk format, why does it even need padding,
packing or even store the magic number?

i.e. this information could simply be stored in a few fields in the cfs
superblock structure that wraps the vfs superblock, and the
superblock read function could decode straight into those fields...


> > > +} __packed;
> > > +
> > > +enum cfs_inode_flags {
> > > +       CFS_INODE_FLAGS_NONE = 0,
> > > +       CFS_INODE_FLAGS_PAYLOAD = 1 << 0,
> > > +       CFS_INODE_FLAGS_MODE = 1 << 1,
> > > +       CFS_INODE_FLAGS_NLINK = 1 << 2,
> > > +       CFS_INODE_FLAGS_UIDGID = 1 << 3,
> > > +       CFS_INODE_FLAGS_RDEV = 1 << 4,
> > > +       CFS_INODE_FLAGS_TIMES = 1 << 5,
> > > +       CFS_INODE_FLAGS_TIMES_NSEC = 1 << 6,
> > > +       CFS_INODE_FLAGS_LOW_SIZE = 1 << 7, /* Low 32bit of st_size
> > > */
> > > +       CFS_INODE_FLAGS_HIGH_SIZE = 1 << 8, /* High 32bit of
> > > st_size */
> >
> > Why do we need to complicate things by splitting the inode size
> > like this?
> >
>
> The goal is to minimize the image size for a typical rootfs or
> container image. Almost zero files in any such images are > 4GB.

Sure, but how much space does this typically save, versus how much
complexity it adds to runtime decoding of inodes?

I mean, in a dense container system the critical resources that need
to be saved is runtime memory and CPU overhead of operations, not
the storage space. Saving a 30-40 bytes of storage space per inode
means a typical image might ber a few MB smaller, but given the
image file is not storing data we're only talking about images the
use maybe 500 bytes of data per inode. Storage space for images
is not a limiting factor, nor is network transmission (because
compression), so it comes back to runtime CPU and memory usage.

The inodes are decoded out of the page cache, so the memory for the
raw inode information is volatile and reclaimed when needed.
Similarly, the VFS inode built from this information is reclaimable
when not in use, too. So the only real overhead for runtime is the
decoding time to find the inode in the image file and then decode
it.

Given that the decoding of the inode is all branches and not
straight-line code, it cannot be well optimised and the CPU branch
predictor is not going to get it right every time. Straight line
code that decodes every field whether it is zero or not is going to
be faster.

Further, with a fixed size inode in the image file, the inode table
can be entirely fixed size, getting rid of the whole unaligned data
retrieval problem that code currently has (yes, all that
"le32_to_cpu(__get_unaligned(__le32, data)" code) because we can
ensure that all the inode fields are aligned in the data pages. This
will significantly speed up decoding into the in-memory inode
structures.
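
i.e. something along the lines of this sketch (the exact field order
doesn't matter, the point is a fixed size record with every field at
a naturally aligned offset):

        struct cfs_disk_inode {
                __le64  st_size;
                __le64  st_mtim_sec;
                __le64  st_ctim_sec;
                __le64  xattrs_off;
                __le32  xattrs_len;
                __le32  st_mtim_nsec;
                __le32  st_ctim_nsec;
                __le32  st_mode;
                __le32  st_nlink;
                __le32  st_uid;
                __le32  st_gid;
                __le32  st_rdev;
                __le32  payload_length;
                __le32  flags;
                u8      digest[SHA256_DIGEST_SIZE];
        };

Decoding that is just a straight run of leXX_to_cpu() calls with no
conditional branches at all.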

And to take it another step, the entire struct cfs_inode_s structure
could go away - it is entirely a temporary structure used to shuffle
data from the on-disk encoded format to the the initialisation of
the VFS inode. The on-disk inode data could be decoded directly into
the VFS inode after it has been instantiated, rather than decoding
the inode from the backing file and the instantiating the in-memory
inode.

i.e. instead of:

cfs_lookup()
    cfs_dir_lookup(&index)
    cfs_get_ino_index(index, &inode_s)
        cfs_get_inode_data_max(index, &data)
        inode_s->st_.... = cfs_read_....(&data);
        inode_s->st_.... = cfs_read_....(&data);
        inode_s->st_.... = cfs_read_....(&data);
        inode_s->st_.... = cfs_read_....(&data);
    cfs_make_inode(inode_s, &vfs_inode)
        inode = new_inode(sb)
        inode->i_... = inode_s->st_....;
        inode->i_... = inode_s->st_....;
        inode->i_... = inode_s->st_....;
        inode->i_... = inode_s->st_....;

You could collapse this straight down to:

cfs_lookup()
    cfs_dir_lookup(&index)
    cfs_make_inode(index, &vfs_inode)
        inode = new_inode(sb)
        cfs_get_inode_data_max(index, &data)
        inode->i_... = cfs_read_....(&data);
        inode->i_... = cfs_read_....(&data);
        inode->i_... = cfs_read_....(&data);
        inode->i_... = cfs_read_....(&data);

This removes an intermediate layer from the inode instantiation
fast path completely. And if the inode table is fixed size and
always well aligned, then the cfs_make_inode() code that sets up the
VFS inode is almost unchanged from what it is now. There are no new
branches, the file image format is greatly simplified, and the
runtime overhead of instantiating inodes is significantly reduced.

Similar things can be done with the rest of the "descriptor"
abstraction - the intermediate in-memory structures can be placed
directly in the cfs_inode structure that wraps the VFS inode, and
the initialisation of them can call the decoding code directly
instead of using intermediate structures as is currently done.

This will remove a chunk of code from the implementation and make it
run faster....

> Also, we don't just "not decode" the items with the flag not set, they
> are not even stored on disk.

Yup, and I think that is a mistake - premature optimisation and all
that...

>
> > > +       CFS_INODE_FLAGS_XATTRS = 1 << 9,
> > > +       CFS_INODE_FLAGS_DIGEST = 1 << 10, /* fs-verity sha256
> > > digest */
> > > +       CFS_INODE_FLAGS_DIGEST_FROM_PAYLOAD = 1 << 11, /* Compute
> > > digest from payload */
> > > +};
> > > +
> > > +#define CFS_INODE_FLAG_CHECK(_flag,
> > > _name)                                     \
> > > +       (((_flag) & (CFS_INODE_FLAGS_##_name)) != 0)
> >
> > Check what about a flag? If this is a "check that a feature is set",
> > then open coding it is better, but if you must do it like this, then
> > please use static inline functions like:
> >
> >         if (cfs_inode_has_xattrs(inode->flags)) {
> >                 .....
> >         }
> >
>
> The check is if the flag is set, so maybe CFS_INODE_FLAG_IS_SET is a
> better name. This is used only when decoding the on-disk version of the
> inode to the in memory one, which is a bunch of:
>
> if (CFS_INODE_FLAG_CHECK(ino->flags, THE_FIELD))
>         ino->the_field = cfs_read_u32(&data);
> else
>         ino->the_field = THE_FIELD_DEFAULT;
>
> I can easily open-code these checks, although I'm not sure it makes a
> great difference either way.

If they are used only once, then it should be open coded. But I
think the whole "optional inode fields" stuff should just go away
entirely at this point...

> > > +#define CFS_INODE_DEFAULT_MODE 0100644
> > > +#define CFS_INODE_DEFAULT_NLINK 1
> > > +#define CFS_INODE_DEFAULT_NLINK_DIR 2
> > > +#define CFS_INODE_DEFAULT_UIDGID 0
> > > +#define CFS_INODE_DEFAULT_RDEV 0
> > > +#define CFS_INODE_DEFAULT_TIMES 0
> >
> > Where do these get used? Are they on disk defaults or something
> > else? (comment, please!)
>
> They are the defaults that are used when inode fields on disk are
> missing. I'll add some comments.

They go away entirely with fixed size on-disk inodes.

> > > +       u32 st_mode; /* File type and mode.  */
> > > +       u32 st_nlink; /* Number of hard links, only for regular
> > > files.  */
> > > +       u32 st_uid; /* User ID of owner.  */
> > > +       u32 st_gid; /* Group ID of owner.  */
> > > +       u32 st_rdev; /* Device ID (if special file).  */
> > > +       u64 st_size; /* Size of file, only used for regular files
> > > */
> > > +
> > > +       struct cfs_vdata_s xattrs; /* ref to variable data */
> >
> > This is in the payload that follows the inode?  Is it included in
> > the payload_length above?
> >
> > If not, where is this stuff located, how do we validate it points to
> > the correct place in the on-disk format file, the xattrs belong to
> > this specific inode, etc? I think that's kinda important to
> > describe, because xattrs often contain important security
> > information...
>
> No, all inodes are packed into the initial part of the file, each
> containing a flags set, a variable size (from flags) chunk of fixed
> size elements and a variable size payload. The payload is either the
> target symlink for symlinks, or the path of the backing file for
> regular files.

Ok, I think you need to stop calling that a "payload", then. It's
the path name to the backing file. The backing file is only relevant
for S_IFREG and S_IFLNK types - directories don't need path names
as they only contain pointers to other inodes in the image file.
Types like S_IFIFO, S_IFBLK, etc should not have backing files,
either - they should just be instantiated as the correct type in the
VFS inode and not require any backing file interactions at all...

Hence I think this "payload" should be called something like
"backing path" or something similar.

> Other data, such as xattrs and dirents are stored in a
> separate part of the file and the offsets for those in the inode refer
> to offsets into that area.

So "variable data" and "payload" are different sections in the file
format? You haven't defined what these names mean anywhere in this
file (see my original comment about describing the format in the
code), so it's hard to understand the difference without any actual
reference....

> > Why would you have this in the on-disk structure, then also have
> > "digest from payload" that allows the digest to be in the payload
> > section of the inode data?
>
> The payload is normally the path to the backing file, and then you need
> to store the verity digest separately. This is what would be needed
> when using this with ostree for instance, because we have an existing
> backing file repo format we can't change.

*nod*

> However, if your backing
> store files are stored by their fs-verity digest already (which is the
> default for mkcomposefs), then we can set this flag and avoid storing
> the digest unnecessarily.

So if you name your files according to the fsverity digest using
FS_VERITY_HASH_ALG_SHA256, you have to either completely rebuild
the repository if you want to change to FS_VERITY_HASH_ALG_SHA512
or you have to move to storing the fsverity digest in the image
files anyway?

Seems much better to me to require the repo to use an independent
content index rather than try to use the same hash to index the
contents and detect tampering at the same time. This, once again,
seems like an attempt to minimise file image size at the expense of
everything else and I'm very much not convinced this is the right
tradeoff to be making for modern computing technologies.

.....

> > > +struct cfs_dir_s {
> > > +       u32 n_chunks;
> > > +       struct cfs_dir_chunk_s chunks[];
> > > +} __packed;
> >
> > So directory data is packed in discrete chunks? Given that this is a
> > static directory format, and the size of the directory is known at
> > image creation time, why does the storage need to be chunked?
>
> We chunk the data such that each chunk fits inside a single page in the
> image file. I did this to make accessing image data directly from the
> page cache easier.

Hmmmm. So you defined a -block size- that matched the x86-64 -page
size- to avoid page cache issues. Now, what about ARM or POWER
which have 64kB page sizes?

IOWs, "page size" is not the same on all machines, whilst the
on-disk format for a filesystem image needs to be the same on all
machines. Hence it appears that this:

> > > +#define CFS_MAX_DIR_CHUNK_SIZE 4096

should actually be defined in terms of the block size for the
filesystem image, and this size of these dir chunks should be
recorded in the superblock of the filesystem image. That way it
is clear that the image has a specific chunk size, and it also paves
the way for supporting more efficient directory structures using
larger-than-page size chunks in future.
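
i.e. a hypothetical dir_chunk_size superblock field (not in the
current format) would let the reader do something like this sketch at
mount time instead of baking 4096 into the format forever:

        u32 chunk_size = le32_to_cpu(hdr->dir_chunk_size);

        /* only accept chunk sizes this implementation can map */
        if (!is_power_of_2(chunk_size) || chunk_size > PAGE_SIZE)
                return -EINVAL;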

> We can just kmap_local_page() each chunk and treat
> it as a non-split continuous dirent array, then move on to the next
> chunk in the next page.

OK.

> If we had dirent data spanning multiple pages
> then we would either need to map the pages consecutively (which seems
> hard/costly) or have complex in-kernel code to handle the case where a
> dirent straddles two pages.

Actually pretty easy - we do this with XFS for multi-page directory
buffers. We just use vm_map_ram() on a page array at the moment,
but in the near future there will be other options based on
multipage folios.

That is, the page cache now stores folios rather than pages, and is
capable of using contiguous multi-page folios in the cache. As a
result, multipage folios could be used to cache multi-page
structures in the page cache and efficiently map them as a whole.

That mapping code isn't there yet - kmap_local_folio() only maps the
page within the folio at the offset given - but the foundation is
there for supporting this functionality natively....

I certainly wouldn't be designing a new filesystem these days that
has its on-disk format constrained by the x86-64 4kB page size...

Cheers,

Dave.
--
Dave Chinner
[email protected]

2023-01-17 12:32:43

by Alexander Larsson

Subject: Re: [PATCH v2 2/6] composefs: Add on-disk layout

On Tue, 2023-01-17 at 10:06 +1100, Dave Chinner wrote:
> On Mon, Jan 16, 2023 at 12:00:03PM +0100, Alexander Larsson wrote:
> > On Mon, 2023-01-16 at 12:29 +1100, Dave Chinner wrote:
> > > On Fri, Jan 13, 2023 at 04:33:55PM +0100, Alexander Larsson
> > > wrote:
> > > > This commit adds the on-disk layout header file of composefs.
> > >
> > > This isn't really a useful commit message.
> > >
> > > Perhaps it should actually explain what the overall goals of the
> > > on-disk format are - space usage, complexity trade-offs,
> > > potential
> > > issues with validation of variable payload sections, etc.
> > >
> >
> > I agree, will flesh it out. But, as for below discussions, one of
> > the
> > overall goals is to keep the on-disk file size low.
> >
> > > > Signed-off-by: Alexander Larsson <[email protected]>
> > > > Co-developed-by: Giuseppe Scrivano <[email protected]>
> > > > Signed-off-by: Giuseppe Scrivano <[email protected]>
> > > > ---
> > > >  fs/composefs/cfs.h | 203
> > > > +++++++++++++++++++++++++++++++++++++++++++++
> > > >  1 file changed, 203 insertions(+)
> > > >  create mode 100644 fs/composefs/cfs.h
> > > >
> > > > diff --git a/fs/composefs/cfs.h b/fs/composefs/cfs.h
> > > > new file mode 100644
> > > > index 000000000000..658df728e366
> > > > --- /dev/null
> > > > +++ b/fs/composefs/cfs.h
> > > > @@ -0,0 +1,203 @@
> > > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > > +/*
> > > > + * composefs
> > > > + *
> > > > + * Copyright (C) 2021 Giuseppe Scrivano
> > > > + * Copyright (C) 2022 Alexander Larsson
> > > > + *
> > > > + * This file is released under the GPL.
> > > > + */
> > > > +
> > > > +#ifndef _CFS_H
> > > > +#define _CFS_H
> > > > +
> > > > +#include <asm/byteorder.h>
> > > > +#include <crypto/sha2.h>
> > > > +#include <linux/fs.h>
> > > > +#include <linux/stat.h>
> > > > +#include <linux/types.h>
> > > > +
> > > > +#define CFS_VERSION 1
> > >
> > > This should start with a description of the on-disk format for
> > > the
> > > version 1 format.
> >
> > There are some format descriptions in the later document patch.
> > What is
> > the general approach here, do we document in the header, or in
> > separate
> > doc file? For example, I don't see much of format descriptions in
> > the
> > xfs headers. I mean, I should probably add *some* info here for
> > easier
> > reading of the stuff below, but I don't feel like headers are a
> > great
> > place for docs.
>
> it's fine to describe the format in the docs, but when reading the
> code there needs to be at least an overview of the structure the code
> is implementing so that the code makes some sense without having to
> go find the external place the format is documented.

Yeah, I'll try to make the format in the next series overall more
commented.

> > >
> > > > +#define CFS_MAGIC 0xc078629aU
> > > > +
> > > > +#define CFS_MAX_DIR_CHUNK_SIZE 4096
> > > > +#define CFS_MAX_XATTRS_SIZE 4096
> > >
> > > How do we store 64kB xattrs in this format if the max attr size
> > > is
> > > 4096 bytes? Or is that the maximum total xattr storage?
> >
> > This is a current limitation of the composefs file format.
>
> Yes, but is that 4kB limit the maximum size of a single xattr, or is
> it the total xattr storage space for an inode?

Currently it is actually the total xattrs storage. I've never seen any
container images or rootfs images in general use a large amount of
xattrs. However, given the below discussion on multi-page mappings
maybe its possible to easily drop this limit.

> > I am aware
> > that the kernel maximum size is 64k,
>
> For a single xattr, yes. Hence my question....
>
> > > > +static inline int cfs_digest_from_payload(const char *payload,
> > > > size_t payload_len,
> > > > +                                         u8
> > > > digest_out[SHA256_DIGEST_SIZE])
> .....
> > > Too big to be a inline function.
> >
> > Yeah, I'm aware of this. I mainly put it in the header as the
> > implementation of it is sort of part of the on-disk format. But, I
> > can
> > move it to a .c file instead.
>
> Please do - it's really part of the reader implementation, not the
> structure definition.
>
> > > > +struct cfs_vdata_s {
> > >
> > > Drop the "_s" suffix to indicate the type is a structure - that's
> > > what "struct" tells us.
> >
> > Sure.
> >
> > > > +       u64 off;
> > > > +       u32 len;
> > >
> > > If these are on-disk format structures, why aren't the defined as
> > > using the specific endian they are encoded in? i.e. __le64,
> > > __le32,
> > > etc? Otherwise a file built on a big endian machine won't be
> > > readable on a little endian machine (and vice versa).
> >
> > On disk all fields are little endian. However, when we read them
> > from
> > disk we convert them using e.g. le32_to_cpu(), and then we use the
> > same
> > structure in memory, with native endian. So, it seems wrong to mark
> > them as little endian.
>
> Then these structures do not define "on-disk format". Looking a bit
> further through the patchset, these are largely intermediate
> structures that are read once to instantiate objects in memory, then
> never used again. The cfs_inode_s is a good example of this - I'll
> come back to that.

The header/superblock is actually just read from the fs as-is, as are
most of the other structures. Only the inode data is packed.

> > > > +} __packed;
> > > > +
> > > > +struct cfs_header_s {
> > > > +       u8 version;
> > > > +       u8 unused1;
> > > > +       u16 unused2;
> > >
> > > Why are you hyper-optimising these structures for minimal space
> > > usage? This is 2023 - we can use a __le32 for the version number,
> > > the magic number and then leave....
> > >
> > > > +
> > > > +       u32 magic;
> > > > +       u64 data_offset;
> > > > +       u64 root_inode;
> > > > +
> > > > +       u64 unused3[2];
> > >
> > > a whole heap of space to round it up to at least a CPU cacheline
> > > size using something like "__le64 unused[15]".
> > >
> > > That way we don't need packed structures nor do we care about
> > > having
> > > weird little holes in the structures to fill....
> >
> > Sure.
>
> FWIW, now I see how this is used, this header kinda defines what
> we'd call the superblock in the on-disk format of a filesystem. It's
> at a fixed location in the image file, so there should be a #define
> somewhere in this file to document its fixed location.

It is at offset zero. I don't really think that needs a define, does
it? Maybe a comment though.

> Also, if this is the in-memory representation of the structure and
> not the actual on-disk format, why does it even need padding,
> packing or even store the magic number?

In this case it is the on-disk format though.

> i.e. this information could simply be stored in a few fields in the
> cfs
> superblock structure that wraps the vfs superblock, and the
> superblock read function could decode straight into those fields...

We just read this header from disk, validate the magic, convert the
fields to native endian, and then copy the few used fields (data_offset
and root_inode) into the vfs superblock structure.
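
Roughly, that path looks something like this (a simplified sketch only,
not the exact code in the patch; cfs_fill_super_header() and the
cfs_sb_info fields are made-up names for illustration):

    static int cfs_fill_super_header(struct cfs_sb_info *sbi,
                                     struct cfs_header_s *hdr)
    {
            /* hdr was read from offset 0 of the image file */
            if (le32_to_cpu(hdr->magic) != CFS_MAGIC)
                    return -EINVAL;
            if (hdr->version != CFS_VERSION)
                    return -EINVAL;

            /* Convert the only fields we actually use to native endian */
            sbi->data_offset = le64_to_cpu(hdr->data_offset);
            sbi->root_inode = le64_to_cpu(hdr->root_inode);
            return 0;
    }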


> > > > +} __packed;
> > > > +
> > > > +enum cfs_inode_flags {
> > > > +       CFS_INODE_FLAGS_NONE = 0,
> > > > +       CFS_INODE_FLAGS_PAYLOAD = 1 << 0,
> > > > +       CFS_INODE_FLAGS_MODE = 1 << 1,
> > > > +       CFS_INODE_FLAGS_NLINK = 1 << 2,
> > > > +       CFS_INODE_FLAGS_UIDGID = 1 << 3,
> > > > +       CFS_INODE_FLAGS_RDEV = 1 << 4,
> > > > +       CFS_INODE_FLAGS_TIMES = 1 << 5,
> > > > +       CFS_INODE_FLAGS_TIMES_NSEC = 1 << 6,
> > > > +       CFS_INODE_FLAGS_LOW_SIZE = 1 << 7, /* Low 32bit of
> > > > st_size
> > > > */
> > > > +       CFS_INODE_FLAGS_HIGH_SIZE = 1 << 8, /* High 32bit of
> > > > st_size */
> > >
> > > Why do we need to complicate things by splitting the inode size
> > > like this?
> > >
> >
> > The goal is to minimize the image size for a typical rootfs or
> > container image. Almost zero files in any such images are > 4GB. 
>
> Sure, but how much space does this typically save, versus how much
> complexity it adds to runtime decoding of inodes?
>
> I mean, in a dense container system the critical resources that need
> to be saved is runtime memory and CPU overhead of operations, not
> the storage space. Saving 30-40 bytes of storage space per inode
> means a typical image might be a few MB smaller, but given the
> image file is not storing data we're only talking about images that
> use maybe 500 bytes of data per inode. Storage space for images
> is not a limiting factor, nor is network transmission (because
> compression), so it comes back to runtime CPU and memory usage.

Here are some example sizes of composefs images with the current packed
inodes:

6.2M cs9-developer-rootfs.composefs
2.1M cs9-minimal-rootfs.composefs
1.2M fedora-37-container.composefs
433K ubuntu-22.04-container.composefs

If we set all the flags for the inodes (i.e. fixed size inodes) we get:

8.8M cs9-developer-rootfs.composefs
3.0M cs9-minimal-rootfs.composefs
1.6M fedora-37-container.composefs
625K ubuntu-22.04-container.composefs

So, images are about 40% larger with fixed size inodes.

> The inodes are decoded out of the page cache, so the memory for the
> raw inode information is volatile and reclaimed when needed.
> Similarly, the VFS inode built from this information is reclaimable
> when not in use, too. So the only real overhead for runtime is the
> decoding time to find the inode in the image file and then decode
> it.

I disagree with this characterization. It is true that page cache is
volatile, but if you can fit 40% less inode data in the page cache then
there is additional overhead where you need to read this from disk. So,
decoding time is not the only thing that affects overhead.

Additionally, just by being larger and less dense, more data has to be
read from disk, which itself is slower.

> Given that the decoding of the inode is -all branches- and not
> straight-line code, it cannot be well optimised and the CPU branch
> predictor is not going to get it right every time. Straight line
> code that decodes every field whether it is zero or not is going to
> be faster.
>
> Further, with a fixed size inode in the image file, the inode table
> can be entirely fixed size, getting rid of the whole unaligned data
> retrieval problem that code currently has (yes, all that
> "le32_to_cpu(__get_unaligned(__le32, data)" code) because we can
> ensure that all the inode fields are aligned in the data pages. This
> will significantly speed up decoding into the in-memory inode
> structures.

I agree it could be faster. But is inode decode actually the limiting
factor, compared to things like disk i/o or better use of page cache?

> And to take it another step, the entire struct cfs_inode_s structure
> could go away - it is entirely a temporary structure used to shuffle
> data from the on-disk encoded format to the initialisation of
> the VFS inode. The on-disk inode data could be decoded directly into
> the VFS inode after it has been instantiated, rather than decoding
> the inode from the backing file and then instantiating the in-memory
> inode.
>
> i.e. instead of:
>
> cfs_lookup()
>         cfs_dir_lookup(&index)
>         cfs_get_ino_index(index, &inode_s)
>                 cfs_get_inode_data_max(index, &data)
>                 inode_s->st_.... = cfs_read_....(&data);
>                 inode_s->st_.... = cfs_read_....(&data);
>                 inode_s->st_.... = cfs_read_....(&data);
>                 inode_s->st_.... = cfs_read_....(&data);
>         cfs_make_inode(inode_s, &vfs_inode)
>                 inode = new_inode(sb)
>                 inode->i_... = inode_s->st_....;
>                 inode->i_... = inode_s->st_....;
>                 inode->i_... = inode_s->st_....;
>                 inode->i_... = inode_s->st_....;
>
> You could collapse this straight down to:
>
> cfs_lookup()
>         cfs_dir_lookup(&index)
>         cfs_make_inode(index, &vfs_inode)
>                 inode = new_inode(sb)
>                 cfs_get_inode_data_max(index, &data)
>                 inode->i_... = cfs_read_....(&data);
>                 inode->i_... = cfs_read_....(&data);
>                 inode->i_... = cfs_read_....(&data);
>                 inode->i_... = cfs_read_....(&data);
>
> This removes an intermediately layer from the inode instantiation
> fast path completely. And if the inode table is fixed size and
> always well aligned, then the cfs_make_inode() code that sets up the
> VFS inode is almost unchanged from what it is now. There are no new
> branches, the file image format is greatly simplified, and the
> runtime overhead of instantiating inodes is significantly reduced.

I'm not sure the performance win is clear compared to the extra size,
as inodes are generally only decoded once and then kept around in
memory for most of their lifetime. However, I agree that there are
clear advantages in
simplifying the format. That makes it easier to maintain and
understand. I'll give this some thought.
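
To make sure I understand the suggestion, the collapsed path with fixed
size, always-present fields would be roughly (a sketch only; apart from
the cfs_read_u32()/cfs_read_u64() helpers and the VFS calls, the names
here are illustrative):

    static struct inode *cfs_make_inode(struct super_block *sb, u64 index)
    {
            struct inode *inode = new_inode(sb);
            u8 *data;

            if (!inode)
                    return ERR_PTR(-ENOMEM);

            cfs_get_inode_data_max(index, &data);

            /* Fixed-size inodes: every field is present, so decode
             * unconditionally - no flag checks, no defaults. */
            inode->i_mode = cfs_read_u32(&data);
            set_nlink(inode, cfs_read_u32(&data));
            i_uid_write(inode, cfs_read_u32(&data));
            i_gid_write(inode, cfs_read_u32(&data));
            inode->i_rdev = cfs_read_u32(&data);
            inode->i_size = cfs_read_u64(&data);

            return inode;
    }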

> Similar things can be done with the rest of the "descriptor"
> abstraction - the intermediate in-memory structures can be placed
> directly in the cfs_inode structure that wraps the VFS inode, and
> the initialisation of them can call the decoding code directly
> instead of using intermediate structures as is currently done.
>
> This will remove a chunk of code from the implementation and make it
> run faster....
>
> > Also, we don't just "not decode" the items with the flag not set,
> > they
> > are not even stored on disk.
>
> Yup, and I think that is a mistake - premature optimisation and all
> that...
>
> >
> > > > +       CFS_INODE_FLAGS_XATTRS = 1 << 9,
> > > > +       CFS_INODE_FLAGS_DIGEST = 1 << 10, /* fs-verity sha256
> > > > digest */
> > > > +       CFS_INODE_FLAGS_DIGEST_FROM_PAYLOAD = 1 << 11, /*
> > > > Compute
> > > > digest from payload */
> > > > +};
> > > > +
> > > > +#define CFS_INODE_FLAG_CHECK(_flag,
> > > > _name)                                     \
> > > > +       (((_flag) & (CFS_INODE_FLAGS_##_name)) != 0)
> > >
> > > Check what about a flag? If this is a "check that a feature is
> > > set",
> > > then open coding it is better, but if you must do it like this, then
> > > please use static inline functions like:
> > >
> > >         if (cfs_inode_has_xattrs(inode->flags)) {
> > >                 .....
> > >         }
> > >
> >
> > The check is if the flag is set, so maybe CFS_INODE_FLAG_IS_SET is
> > a
> > better name. This is used only when decoding the on-disk version of
> > the
> > inode to the in memory one, which is a bunch of:
> >
> >         if (CFS_INODE_FLAG_CHECK(ino->flags, THE_FIELD))
> >                 ino->the_field = cfs_read_u32(&data);
> >         else
> >                 ino->the_field = THE_FIELD_DEFAULT;
> >
> > I can easily open-code these checks, although I'm not sure it makes
> > a
> > great difference either way.
>
> If they are used only once, then it should be open coded. But I
> think the whole "optional inode fields" stuff should just go away
> entirely at this point...
>
> > > > +#define CFS_INODE_DEFAULT_MODE 0100644
> > > > +#define CFS_INODE_DEFAULT_NLINK 1
> > > > +#define CFS_INODE_DEFAULT_NLINK_DIR 2
> > > > +#define CFS_INODE_DEFAULT_UIDGID 0
> > > > +#define CFS_INODE_DEFAULT_RDEV 0
> > > > +#define CFS_INODE_DEFAULT_TIMES 0
> > >
> > > Where do these get used? Are they on disk defaults or something
> > > else? (comment, please!)
> >
> > They are the defaults that are used when inode fields on disk are
> > missing. I'll add some comments.
>
> They go away entirely with fixed size on-disk inodes.
>
> > > > +       u32 st_mode; /* File type and mode.  */
> > > > +       u32 st_nlink; /* Number of hard links, only for regular
> > > > files.  */
> > > > +       u32 st_uid; /* User ID of owner.  */
> > > > +       u32 st_gid; /* Group ID of owner.  */
> > > > +       u32 st_rdev; /* Device ID (if special file).  */
> > > > +       u64 st_size; /* Size of file, only used for regular
> > > > files
> > > > */
> > > > +
> > > > +       struct cfs_vdata_s xattrs; /* ref to variable data */
> > >
> > > This is in the payload that follows the inode?  Is it included in
> > > the payload_length above?
> > >
> > > If not, where is this stuff located, how do we validate it points
> > > to
> > > the correct place in the on-disk format file, the xattrs belong
> > > to
> > > this specific inode, etc? I think that's kinda important to
> > > describe, because xattrs often contain important security
> > > information...
> >
> > No, all inodes are packed into the initial part of the file, each
> > containing a flags set, a variable size (from flags) chunk of fixed
> > size elements and a variable size payload. The payload is either
> > the
> > target symlink for symlinks, or the path of the backing file for
> > regular files.
>
> Ok, I think you need to stop calling that a "payload", then. It's
> the path name to the backing file. The backing file is only relevant
> for S_IFREG and S_IFLINK types - directories don't need path names
> as they only contain pointers to other inodes in the image file.
> Types like S_IFIFO, S_IFBLK, etc should not have backing files,
> either - they should just be instantiated as the correct type in the
> VFS inode and not require any backing file interactions at all...
>
> Hence I think this "payload" should be called something like
> "backing path" or something similar.

Yeah, that may be better.

>
> .....
>
> > > > +struct cfs_dir_s {
> > > > +       u32 n_chunks;
> > > > +       struct cfs_dir_chunk_s chunks[];
> > > > +} __packed;
> > >
> > > So directory data is packed in discrete chunks? Given that this
> > > is a
> > > static directory format, and the size of the directory is known
> > > at
> > > image creation time, why does the storage need to be chunked?
> >
> > We chunk the data such that each chunk fits inside a single page in
> > the
> > image file. I did this to make accessing image data directly from
> > the
> > page cache easier.
>
> Hmmmm. So you defined a -block size- that matched the x86-64 -page
> size- to avoid page cache issues.  Now, what about ARM or POWER
> which has 64kB page sizes?
>
> IOWs, "page size" is not the same on all machines, whilst the
> on-disk format for a filesystem image needs to be the same on all
> machines. Hence it appears that this:
>
> > > > +#define CFS_MAX_DIR_CHUNK_SIZE 4096
>
> should actually be defined in terms of the block size for the
> filesystem image, and the size of these dir chunks should be
> recorded in the superblock of the filesystem image. That way it
> is clear that the image has a specific chunk size, and it also paves
> the way for supporting more efficient directory structures using
> larger-than-page size chunks in future.

Yes, it's true that assuming a (min) 4k page size is wasteful on some
arches, but it would be hard to read a filesystem created for 64k pages
on a 4k page machine, which is not ideal. However, wrt your comment on
multi-page mappings, maybe we can just totally drop these limits. I'll
have a look at that.

> > If we had dirent data spanning multiple pages
> > then we would either need to map the pages consecutively (which
> > seems
> > hard/costly) or have complex in-kernel code to handle the case
> > where a
> > dirent straddles two pages.
>
> Actually pretty easy - we do this with XFS for multi-page directory
> buffers. We just use vm_map_ram() on a page array at the moment,
> but in the near future there will be other options based on
> multipage folios.
>
> That is, the page cache now stores folios rather than pages, and is
> capable of using contiguous multi-page folios in the cache. As a
> result, multipage folios could be used to cache multi-page
> structures in the page cache and efficiently map them as a whole.
>
> That mapping code isn't there yet - kmap_local_folio() only maps the
> page within the folio at the offset given - but the foundation is
> there for supporting this functionality natively....
>
> I certainly wouldn't be designing a new filesystem these days that
> has its on-disk format constrained by the x86-64 4kB page size...

Yes, I agree. I'm gonna look at using multi-page mapping for both
dirents and xattr data, which should completely drop these limits, as
well as get rid of the dirent chunking.

--
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
 Alexander Larsson                            Red Hat, Inc
 [email protected]            [email protected]
He's a witless playboy firefighter fleeing from a secret government
programme. She's a ditzy streetsmart socialite from the wrong side of the
tracks. They fight crime!

2023-01-18 03:17:11

by Dave Chinner

[permalink] [raw]
Subject: Re: [PATCH v2 2/6] composefs: Add on-disk layout

On Tue, Jan 17, 2023 at 01:11:33PM +0100, Alexander Larsson wrote:
> On Tue, 2023-01-17 at 10:06 +1100, Dave Chinner wrote:
> > On Mon, Jan 16, 2023 at 12:00:03PM +0100, Alexander Larsson wrote:
> > > On Mon, 2023-01-16 at 12:29 +1100, Dave Chinner wrote:
> > > > On Fri, Jan 13, 2023 at 04:33:55PM +0100, Alexander Larsson
> > > > wrote:
> > > > > +} __packed;
> > > > > +
> > > > > +struct cfs_header_s {
> > > > > +       u8 version;
> > > > > +       u8 unused1;
> > > > > +       u16 unused2;
> > > >
> > > > Why are you hyper-optimising these structures for minimal space
> > > > usage? This is 2023 - we can use a __le32 for the version number,
> > > > the magic number and then leave....
> > > >
> > > > > +
> > > > > +       u32 magic;
> > > > > +       u64 data_offset;
> > > > > +       u64 root_inode;
> > > > > +
> > > > > +       u64 unused3[2];
> > > >
> > > > a whole heap of space to round it up to at least a CPU cacheline
> > > > size using something like "__le64 unused[15]".
> > > >
> > > > That way we don't need packed structures nor do we care about
> > > > having
> > > > weird little holes in the structures to fill....
> > >
> > > Sure.
> >
> > FWIW, now I see how this is used, this header kinda defines what
> > we'd call the superblock in the on-disk format of a filesystem. It's
> > at a fixed location in the image file, so there should be a #define
> > somewhere in this file to document its fixed location.
>
> It is at offset zero. I don't really think that needs a define, does
> it? Maybe a comment though.

Having the code use magic numbers for accessing fixed structures
(e.g. the hard coded 0 in the superblock read function)
is generally considered bad form.

If someone needs to understand how an image file is laid out, where
do they look to find where structures are physically located? Should
it be defined in a header file that is easy to find, or should they
have to read all the code to find where the magic number is embedded
in the code that defines the location of critical structures?
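
i.e. something as simple as this in cfs.h would do (the name is
illustrative, not taken from the patch):

    /*
     * The header (superblock) is always stored at the start of the
     * image file.
     */
    #define CFS_HEADER_OFFSET      0

and then the superblock read code uses CFS_HEADER_OFFSET instead of a
bare 0, so the physical layout is discoverable from the header file.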


> > Also, if this is the in-memory representation of the structure and
> > not the actual on-disk format, why does it even need padding,
> > packing or even store the magic number?
>
> In this case it is the on-disk format though.

Yeah, that wasn't obvious at first glance.

> > > > > +} __packed;
> > > > > +
> > > > > +enum cfs_inode_flags {
> > > > > +       CFS_INODE_FLAGS_NONE = 0,
> > > > > +       CFS_INODE_FLAGS_PAYLOAD = 1 << 0,
> > > > > +       CFS_INODE_FLAGS_MODE = 1 << 1,
> > > > > +       CFS_INODE_FLAGS_NLINK = 1 << 2,
> > > > > +       CFS_INODE_FLAGS_UIDGID = 1 << 3,
> > > > > +       CFS_INODE_FLAGS_RDEV = 1 << 4,
> > > > > +       CFS_INODE_FLAGS_TIMES = 1 << 5,
> > > > > +       CFS_INODE_FLAGS_TIMES_NSEC = 1 << 6,
> > > > > +       CFS_INODE_FLAGS_LOW_SIZE = 1 << 7, /* Low 32bit of
> > > > > st_size
> > > > > */
> > > > > +       CFS_INODE_FLAGS_HIGH_SIZE = 1 << 8, /* High 32bit of
> > > > > st_size */
> > > >
> > > > Why do we need to complicate things by splitting the inode size
> > > > like this?
> > > >
> > >
> > > The goal is to minimize the image size for a typical rootfs or
> > > container image. Almost zero files in any such images are > 4GB.
> >
> > Sure, but how much space does this typically save, versus how much
> > complexity it adds to runtime decoding of inodes?
> >
> > I mean, in a dense container system the critical resources that need
> > to be saved is runtime memory and CPU overhead of operations, not
> > the storage space. Saving 30-40 bytes of storage space per inode
> > means a typical image might be a few MB smaller, but given the
> > image file is not storing data we're only talking about images that
> > use maybe 500 bytes of data per inode. Storage space for images
> > is not a limiting factor, nor is network transmission (because
> > compression), so it comes back to runtime CPU and memory usage.
>
> Here are some example sizes of composefs images with the current packed
> inodes:
>
> 6.2M cs9-developer-rootfs.composefs
> 2.1M cs9-minimal-rootfs.composefs
> 1.2M fedora-37-container.composefs
> 433K ubuntu-22.04-container.composefs
>
> If we set all the flags for the inodes (i.e. fixed size inodes) we get:
>
> 8.8M cs9-developer-rootfs.composefs
> 3.0M cs9-minimal-rootfs.composefs
> 1.6M fedora-37-container.composefs
> 625K ubuntu-22.04-container.composefs
>
> So, images are about 40% larger with fixed size inodes.

40% sounds like a lot, but considering the overall size of the image
files I'd say we just don't care about a few hundred KB to a couple of
MB of extra space usage. Indeed, we'll use much more than 40%
extra space on XFS internally via speculative EOF preallocation when
writing those files to disk....

Also, I don't think that this is an issue for shipping them across
the network or archiving the images for the long term: compression
should remove most of the extra zeros.

Hence I'm still not convinced that the complexity of conditional
field storage is worth the decrease in image file size...

> > The inodes are decoded out of the page cache, so the memory for the
> > raw inode information is volatile and reclaimed when needed.
> > Similarly, the VFS inode built from this information is reclaimable
> > when not in use, too. So the only real overhead for runtime is the
> > decoding time to find the inode in the image file and then decode
> > it.
>
> I disagree with this characterization. It is true that page cache is
> volatile, but if you can fit 40% less inode data in the page cache then
> there is additional overhead where you need to read this from disk. So,
> decoding time is not the only thing that affects overhead.

True, but the page cache is a secondary cache for inodes - if you
are relying on secondary caches for performance then you've already
lost because it means the primary cache is not functioning
effectively for your production workload.

> Additionally, just by being larger and less dense, more data has to be
> read from disk, which itself is slower.

That's a surprisingly common fallacy.

e.g. we can do a 64kB read IO for only 5% more time and CPU cost
than a 4kB read IO. This means we can pull 16x as much information
into the cache for almost no extra cost. This has been true since
spinning disks were invented more than 4 decades ago, but it's still
true with modern SSDs (for different reasons).

A 64kB IO is going to allow more inodes to be brought into the cache
for effectively the same IO cost, yet it provides a 16x improvement
in subsequent cache hit probability compared to doing 4kB IO. In
comparison, saving 40% in object size only improves the cache hit
probability for the same IO by ~1.5x....

Hence I don't consider object density a primary issue for secondary
IO caches; what matters is how many objects you can bring
into cache per IO, and how likely a primary level cache miss for
those objects will be in the near future before memory reclaim
removes them from the cache again.

As an example of this, the XFS inode allocation layout and caching
architecture is from the early 1990s, and it is a direct embodiment
of the above principle. We move inodes in and out of the
secondary cache in clusters of 32 inodes (16KB IOs) because it is
much more CPU and IO efficient than doing it in 4kB IOs...

> > Given that the decoding of the inode is -all branches- and not
> > straight-line code, it cannot be well optimised and the CPU branch
> > predictor is not going to get it right every time. Straight line
> > code that decodes every field whether it is zero or not is going to
> > be faster.
> >
> > Further, with a fixed size inode in the image file, the inode table
> > can be entirely fixed size, getting rid of the whole unaligned data
> > retrieval problem that code currently has (yes, all that
> > "le32_to_cpu(__get_unaligned(__le32, data)" code) because we can
> > ensure that all the inode fields are aligned in the data pages. This
> > will significantly speed up decoding into the in-memory inode
> > structures.
>
> I agree it could be faster. But is inode decode actually the limiting
> factor, compared to things like disk i/o or better use of page cache?

The limiting factor in filesystem lookup paths tends to be CPU usage.
It's spread across many parts of the kernel, but every bit we can
save makes a difference. Especially on a large server running
thousands of containers - the less CPU we use doing inode lookup and
instantiation, the more CPU there is for the user workloads. We are
rarely IO limited on machines like this, and as SSDs get even faster
in the near future, that's going to be even less of a problem than
it is now.

> > > > > +struct cfs_dir_s {
> > > > > +       u32 n_chunks;
> > > > > +       struct cfs_dir_chunk_s chunks[];
> > > > > +} __packed;
> > > >
> > > > So directory data is packed in discrete chunks? Given that this
> > > > is a
> > > > static directory format, and the size of the directory is known
> > > > at
> > > > image creation time, why does the storage need to be chunked?
> > >
> > > We chunk the data such that each chunk fits inside a single page in
> > > the
> > > image file. I did this to make accessing image data directly from
> > > the
> > > page cache easier.
> >
> > Hmmmm. So you defined a -block size- that matched the x86-64 -page
> > size- to avoid page cache issues. Now, what about ARM or POWER
> > which has 64kB page sizes?
> >
> > IOWs, "page size" is not the same on all machines, whilst the
> > on-disk format for a filesystem image needs to be the same on all
> > machines. Hence it appears that this:
> >
> > > > > +#define CFS_MAX_DIR_CHUNK_SIZE 4096
> >
> > should actually be defined in terms of the block size for the
> > filesystem image, and the size of these dir chunks should be
> > recorded in the superblock of the filesystem image. That way it
> > is clear that the image has a specific chunk size, and it also paves
> > the way for supporting more efficient directory structures using
> > larger-than-page size chunks in future.
>
> Yes, it's true that assuming a (min) 4k page size is wasteful on some
> arches, but it would be hard to read a filesystem created for 64k pages
> on a 4k page machine, which is not ideal. However, wrt your comment on
> multi-page mappings, maybe we can just totally drop these limits. I'll
> have a look at that.

It's not actually that hard - just read in all the pages into the
page cache, look them up, map them, do the operation, unmap them.

After all, you already have a cfs_buf that you could store a page
array in, and then you have an object that you can use for single
pages (on a 64kB machine) or 16 pages (4kB page machine) without the
code that is walking the buffers caring about the underlying page
size. This is exactly what we do with the struct xfs_buf. :)
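
i.e. a rough sketch of what that could look like (the cfs_buf fields
and cfs_buf_map() here are illustrative, not the code in the patch):

    struct cfs_buf {
            struct page     **pages;        /* backing page cache pages */
            unsigned int    nr_pages;       /* chunk size / PAGE_SIZE */
            void            *addr;          /* contiguous mapping of the chunk */
    };

    /*
     * Map all the pages backing one on-disk chunk as a virtually
     * contiguous range, regardless of the machine's page size.
     */
    static int cfs_buf_map(struct cfs_buf *buf)
    {
            buf->addr = vm_map_ram(buf->pages, buf->nr_pages, NUMA_NO_NODE);
            return buf->addr ? 0 : -ENOMEM;
    }

The code walking directory entries then just sees one flat buffer and
never has to care whether it came from one 64kB page or sixteen 4kB
pages.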

>
> > > If we had dirent data spanning multiple pages
> > > then we would either need to map the pages consecutively (which
> > > seems
> > > hard/costly) or have complex in-kernel code to handle the case
> > > where a
> > > dirent straddles two pages.
> >
> > Actually pretty easy - we do this with XFS for multi-page directory
> > buffers. We just use vm_map_ram() on a page array at the moment,
> > but in the near future there will be other options based on
> > multipage folios.
> >
> > That is, the page cache now stores folios rather than pages, and is
> > capable of using contiguous multi-page folios in the cache. As a
> > result, multipage folios could be used to cache multi-page
> > structures in the page cache and efficiently map them as a whole.
> >
> > That mapping code isn't there yet - kmap_local_folio() only maps the
> > page within the folio at the offset given - but the foundation is
> > there for supporting this functionality natively....
> >
> > I certainly wouldn't be designing a new filesystem these days that
> > has its on-disk format constrained by the x86-64 4kB page size...
>
> Yes, I agree. I'm gonna look at using multi-page mapping for both
> dirents and xattr data, which should completely drop these limits, as
> well as get rid of the dirent chunking.

That will be interesting to see :)

Cheers,

Dave.
--
Dave Chinner
[email protected]