2021-03-18 13:35:07

by Shreeya Patel

[permalink] [raw]
Subject: [PATCH v2 0/4] Make UTF-8 encoding loadable

utf8data.h_shipped has a large database table which is an auto-generated
decodification trie for the unicode normalization functions and it is not
necessary to carry this large table in the kernel.
Goal is to make UTF-8 encoding loadable by converting it into a module
and adding a layer between the filesystems and the utf8 module which will
load the module whenever any filesystem that needs unicode is mounted.
Unicode is the subsystem and utf8 is a charachter encoding for the
subsystem, hence first two patches in the series are renaming functions
and file name to unicode for better understanding the difference between
UTF-8 module and unicode layer.
3rd patch resolves the warning reported by kernel test robot.
Last patch in the series adds the layer and utf8 module.

---
Changes in v2
- Remove the duplicate file from the last patch.
- Make the wrapper functions inline.
- Remove msleep and use try_module_get() and module_put()
for ensuring that module is loaded correctly and also
doesn't get unloaded while in use.
- Resolve the warning reported by kernel test robot.
- Resolve all the checkpatch.pl warnings.

Shreeya Patel (4):
fs: unicode: Rename function names from utf8 to unicode
fs: unicode: Rename utf8-core file to unicode-core
fs: unicode: Use strscpy() instead of strncpy()
fs: unicode: Add utf8 module and a unicode layer

fs/ext4/hash.c | 2 +-
fs/ext4/namei.c | 12 ++--
fs/ext4/super.c | 6 +-
fs/f2fs/dir.c | 12 ++--
fs/f2fs/super.c | 6 +-
fs/libfs.c | 6 +-
fs/unicode/Kconfig | 11 +++-
fs/unicode/Makefile | 5 +-
fs/unicode/unicode-core.c | 60 ++++++++++++++++++++
fs/unicode/utf8-selftest.c | 8 +--
fs/unicode/{utf8-core.c => utf8mod.c} | 80 +++++++++++++++------------
include/linux/unicode.h | 77 ++++++++++++++++++++------
12 files changed, 206 insertions(+), 79 deletions(-)
create mode 100644 fs/unicode/unicode-core.c
rename fs/unicode/{utf8-core.c => utf8mod.c} (68%)

--
2.30.1


2021-03-18 13:35:37

by Shreeya Patel

[permalink] [raw]
Subject: [PATCH v2 1/4] fs: unicode: Rename function names from utf8 to unicode

Rename the function names from utf8 to unicode for taking the first step
towards the transformation of utf8-core file into the unicode subsystem
layer file.

Signed-off-by: Shreeya Patel <[email protected]>
---
fs/ext4/hash.c | 2 +-
fs/ext4/namei.c | 12 ++++----
fs/ext4/super.c | 6 ++--
fs/f2fs/dir.c | 12 ++++----
fs/f2fs/super.c | 6 ++--
fs/libfs.c | 6 ++--
fs/unicode/utf8-core.c | 57 +++++++++++++++++++-------------------
fs/unicode/utf8-selftest.c | 8 +++---
include/linux/unicode.h | 32 ++++++++++-----------
9 files changed, 70 insertions(+), 71 deletions(-)

diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index a92eb79de0cc..8890a76abe86 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -285,7 +285,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
if (!buff)
return -ENOMEM;

- dlen = utf8_casefold(um, &qstr, buff, PATH_MAX);
+ dlen = unicode_casefold(um, &qstr, buff, PATH_MAX);
if (dlen < 0) {
kfree(buff);
goto opaque_seq;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 686bf982c84e..dde5ce795416 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1290,9 +1290,9 @@ int ext4_ci_compare(const struct inode *parent, const struct qstr *name,
int ret;

if (quick)
- ret = utf8_strncasecmp_folded(um, name, entry);
+ ret = unicode_strncasecmp_folded(um, name, entry);
else
- ret = utf8_strncasecmp(um, name, entry);
+ ret = unicode_strncasecmp(um, name, entry);

if (ret < 0) {
/* Handle invalid character sequence as either an error
@@ -1324,9 +1324,9 @@ void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
if (!cf_name->name)
return;

- len = utf8_casefold(dir->i_sb->s_encoding,
- iname, cf_name->name,
- EXT4_NAME_LEN);
+ len = unicode_casefold(dir->i_sb->s_encoding,
+ iname, cf_name->name,
+ EXT4_NAME_LEN);
if (len <= 0) {
kfree(cf_name->name);
cf_name->name = NULL;
@@ -2201,7 +2201,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,

#ifdef CONFIG_UNICODE
if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) &&
- sb->s_encoding && utf8_validate(sb->s_encoding, &dentry->d_name))
+ sb->s_encoding && unicode_validate(sb->s_encoding, &dentry->d_name))
return -EINVAL;
#endif

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ad34a37278cd..2fb845752c90 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1259,7 +1259,7 @@ static void ext4_put_super(struct super_block *sb)
fs_put_dax(sbi->s_daxdev);
fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
#ifdef CONFIG_UNICODE
- utf8_unload(sb->s_encoding);
+ unicode_unload(sb->s_encoding);
#endif
kfree(sbi);
}
@@ -4304,7 +4304,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}

- encoding = utf8_load(encoding_info->version);
+ encoding = unicode_load(encoding_info->version);
if (IS_ERR(encoding)) {
ext4_msg(sb, KERN_ERR,
"can't mount with superblock charset: %s-%s "
@@ -5165,7 +5165,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
crypto_free_shash(sbi->s_chksum_driver);

#ifdef CONFIG_UNICODE
- utf8_unload(sb->s_encoding);
+ unicode_unload(sb->s_encoding);
#endif

#ifdef CONFIG_QUOTA
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index e6270a867be1..f160f9dd667d 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -84,10 +84,10 @@ int f2fs_init_casefolded_name(const struct inode *dir,
GFP_NOFS);
if (!fname->cf_name.name)
return -ENOMEM;
- fname->cf_name.len = utf8_casefold(sb->s_encoding,
- fname->usr_fname,
- fname->cf_name.name,
- F2FS_NAME_LEN);
+ fname->cf_name.len = unicode_casefold(sb->s_encoding,
+ fname->usr_fname,
+ fname->cf_name.name,
+ F2FS_NAME_LEN);
if ((int)fname->cf_name.len <= 0) {
kfree(fname->cf_name.name);
fname->cf_name.name = NULL;
@@ -237,7 +237,7 @@ static int f2fs_match_ci_name(const struct inode *dir, const struct qstr *name,
entry.len = decrypted_name.len;
}

- res = utf8_strncasecmp_folded(um, name, &entry);
+ res = unicode_strncasecmp_folded(um, name, &entry);
/*
* In strict mode, ignore invalid names. In non-strict mode,
* fall back to treating them as opaque byte sequences.
@@ -246,7 +246,7 @@ static int f2fs_match_ci_name(const struct inode *dir, const struct qstr *name,
res = name->len == entry.len &&
memcmp(name->name, entry.name, name->len) == 0;
} else {
- /* utf8_strncasecmp_folded returns 0 on match */
+ /* unicode_strncasecmp_folded returns 0 on match */
res = (res == 0);
}
out:
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 7069793752f1..b4a92e763e27 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1430,7 +1430,7 @@ static void f2fs_put_super(struct super_block *sb)
for (i = 0; i < NR_PAGE_TYPE; i++)
kvfree(sbi->write_io[i]);
#ifdef CONFIG_UNICODE
- utf8_unload(sb->s_encoding);
+ unicode_unload(sb->s_encoding);
#endif
kfree(sbi);
}
@@ -3560,7 +3560,7 @@ static int f2fs_setup_casefold(struct f2fs_sb_info *sbi)
return -EINVAL;
}

- encoding = utf8_load(encoding_info->version);
+ encoding = unicode_load(encoding_info->version);
if (IS_ERR(encoding)) {
f2fs_err(sbi,
"can't mount with superblock charset: %s-%s "
@@ -4073,7 +4073,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
kvfree(sbi->write_io[i]);

#ifdef CONFIG_UNICODE
- utf8_unload(sb->s_encoding);
+ unicode_unload(sb->s_encoding);
sb->s_encoding = NULL;
#endif
free_options:
diff --git a/fs/libfs.c b/fs/libfs.c
index e2de5401abca..766556165bb5 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1404,7 +1404,7 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
* If the dentry name is stored in-line, then it may be concurrently
* modified by a rename. If this happens, the VFS will eventually retry
* the lookup, so it doesn't matter what ->d_compare() returns.
- * However, it's unsafe to call utf8_strncasecmp() with an unstable
+ * However, it's unsafe to call unicode_strncasecmp() with an unstable
* string. Therefore, we have to copy the name into a temporary buffer.
*/
if (len <= DNAME_INLINE_LEN - 1) {
@@ -1414,7 +1414,7 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
/* prevent compiler from optimizing out the temporary buffer */
barrier();
}
- ret = utf8_strncasecmp(um, name, &qstr);
+ ret = unicode_strncasecmp(um, name, &qstr);
if (ret >= 0)
return ret;

@@ -1443,7 +1443,7 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
if (!dir || !needs_casefold(dir))
return 0;

- ret = utf8_casefold_hash(um, dentry, str);
+ ret = unicode_casefold_hash(um, dentry, str);
if (ret < 0 && sb_has_strict_encoding(sb))
return -EINVAL;
return 0;
diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
index dc25823bfed9..d5f09e022ac5 100644
--- a/fs/unicode/utf8-core.c
+++ b/fs/unicode/utf8-core.c
@@ -10,7 +10,7 @@

#include "utf8n.h"

-int utf8_validate(const struct unicode_map *um, const struct qstr *str)
+int unicode_validate(const struct unicode_map *um, const struct qstr *str)
{
const struct utf8data *data = utf8nfdi(um->version);

@@ -18,10 +18,10 @@ int utf8_validate(const struct unicode_map *um, const struct qstr *str)
return -1;
return 0;
}
-EXPORT_SYMBOL(utf8_validate);
+EXPORT_SYMBOL(unicode_validate);

-int utf8_strncmp(const struct unicode_map *um,
- const struct qstr *s1, const struct qstr *s2)
+int unicode_strncmp(const struct unicode_map *um,
+ const struct qstr *s1, const struct qstr *s2)
{
const struct utf8data *data = utf8nfdi(um->version);
struct utf8cursor cur1, cur2;
@@ -45,10 +45,10 @@ int utf8_strncmp(const struct unicode_map *um,

return 0;
}
-EXPORT_SYMBOL(utf8_strncmp);
+EXPORT_SYMBOL(unicode_strncmp);

-int utf8_strncasecmp(const struct unicode_map *um,
- const struct qstr *s1, const struct qstr *s2)
+int unicode_strncasecmp(const struct unicode_map *um,
+ const struct qstr *s1, const struct qstr *s2)
{
const struct utf8data *data = utf8nfdicf(um->version);
struct utf8cursor cur1, cur2;
@@ -72,14 +72,14 @@ int utf8_strncasecmp(const struct unicode_map *um,

return 0;
}
-EXPORT_SYMBOL(utf8_strncasecmp);
+EXPORT_SYMBOL(unicode_strncasecmp);

/* String cf is expected to be a valid UTF-8 casefolded
* string.
*/
-int utf8_strncasecmp_folded(const struct unicode_map *um,
- const struct qstr *cf,
- const struct qstr *s1)
+int unicode_strncasecmp_folded(const struct unicode_map *um,
+ const struct qstr *cf,
+ const struct qstr *s1)
{
const struct utf8data *data = utf8nfdicf(um->version);
struct utf8cursor cur1;
@@ -100,10 +100,10 @@ int utf8_strncasecmp_folded(const struct unicode_map *um,

return 0;
}
-EXPORT_SYMBOL(utf8_strncasecmp_folded);
+EXPORT_SYMBOL(unicode_strncasecmp_folded);

-int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
- unsigned char *dest, size_t dlen)
+int unicode_casefold(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen)
{
const struct utf8data *data = utf8nfdicf(um->version);
struct utf8cursor cur;
@@ -123,10 +123,10 @@ int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
}
return -EINVAL;
}
-EXPORT_SYMBOL(utf8_casefold);
+EXPORT_SYMBOL(unicode_casefold);

-int utf8_casefold_hash(const struct unicode_map *um, const void *salt,
- struct qstr *str)
+int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
+ struct qstr *str)
{
const struct utf8data *data = utf8nfdicf(um->version);
struct utf8cursor cur;
@@ -144,10 +144,10 @@ int utf8_casefold_hash(const struct unicode_map *um, const void *salt,
str->hash = end_name_hash(hash);
return 0;
}
-EXPORT_SYMBOL(utf8_casefold_hash);
+EXPORT_SYMBOL(unicode_casefold_hash);

-int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
- unsigned char *dest, size_t dlen)
+int unicode_normalize(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen)
{
const struct utf8data *data = utf8nfdi(um->version);
struct utf8cursor cur;
@@ -167,11 +167,10 @@ int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
}
return -EINVAL;
}
+EXPORT_SYMBOL(unicode_normalize);

-EXPORT_SYMBOL(utf8_normalize);
-
-static int utf8_parse_version(const char *version, unsigned int *maj,
- unsigned int *min, unsigned int *rev)
+static int unicode_parse_version(const char *version, unsigned int *maj,
+ unsigned int *min, unsigned int *rev)
{
substring_t args[3];
char version_string[12];
@@ -192,7 +191,7 @@ static int utf8_parse_version(const char *version, unsigned int *maj,
return 0;
}

-struct unicode_map *utf8_load(const char *version)
+struct unicode_map *unicode_load(const char *version)
{
struct unicode_map *um = NULL;
int unicode_version;
@@ -200,7 +199,7 @@ struct unicode_map *utf8_load(const char *version)
if (version) {
unsigned int maj, min, rev;

- if (utf8_parse_version(version, &maj, &min, &rev) < 0)
+ if (unicode_parse_version(version, &maj, &min, &rev) < 0)
return ERR_PTR(-EINVAL);

if (!utf8version_is_supported(maj, min, rev))
@@ -225,12 +224,12 @@ struct unicode_map *utf8_load(const char *version)

return um;
}
-EXPORT_SYMBOL(utf8_load);
+EXPORT_SYMBOL(unicode_load);

-void utf8_unload(struct unicode_map *um)
+void unicode_unload(struct unicode_map *um)
{
kfree(um);
}
-EXPORT_SYMBOL(utf8_unload);
+EXPORT_SYMBOL(unicode_unload);

MODULE_LICENSE("GPL v2");
diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/utf8-selftest.c
index 6fe8af7edccb..796c1ed922ea 100644
--- a/fs/unicode/utf8-selftest.c
+++ b/fs/unicode/utf8-selftest.c
@@ -235,7 +235,7 @@ static void check_utf8_nfdicf(void)
static void check_utf8_comparisons(void)
{
int i;
- struct unicode_map *table = utf8_load("12.1.0");
+ struct unicode_map *table = unicode_load("12.1.0");

if (IS_ERR(table)) {
pr_err("%s: Unable to load utf8 %d.%d.%d. Skipping.\n",
@@ -249,7 +249,7 @@ static void check_utf8_comparisons(void)
const struct qstr s2 = {.name = nfdi_test_data[i].dec,
.len = sizeof(nfdi_test_data[i].dec)};

- test_f(!utf8_strncmp(table, &s1, &s2),
+ test_f(!unicode_strncmp(table, &s1, &s2),
"%s %s comparison mismatch\n", s1.name, s2.name);
}

@@ -259,11 +259,11 @@ static void check_utf8_comparisons(void)
const struct qstr s2 = {.name = nfdicf_test_data[i].ncf,
.len = sizeof(nfdicf_test_data[i].ncf)};

- test_f(!utf8_strncasecmp(table, &s1, &s2),
+ test_f(!unicode_strncasecmp(table, &s1, &s2),
"%s %s comparison mismatch\n", s1.name, s2.name);
}

- utf8_unload(table);
+ unicode_unload(table);
}

static void check_supported_versions(void)
diff --git a/include/linux/unicode.h b/include/linux/unicode.h
index 74484d44c755..de23f9ee720b 100644
--- a/include/linux/unicode.h
+++ b/include/linux/unicode.h
@@ -10,27 +10,27 @@ struct unicode_map {
int version;
};

-int utf8_validate(const struct unicode_map *um, const struct qstr *str);
+int unicode_validate(const struct unicode_map *um, const struct qstr *str);

-int utf8_strncmp(const struct unicode_map *um,
- const struct qstr *s1, const struct qstr *s2);
+int unicode_strncmp(const struct unicode_map *um,
+ const struct qstr *s1, const struct qstr *s2);

-int utf8_strncasecmp(const struct unicode_map *um,
- const struct qstr *s1, const struct qstr *s2);
-int utf8_strncasecmp_folded(const struct unicode_map *um,
- const struct qstr *cf,
- const struct qstr *s1);
+int unicode_strncasecmp(const struct unicode_map *um,
+ const struct qstr *s1, const struct qstr *s2);
+int unicode_strncasecmp_folded(const struct unicode_map *um,
+ const struct qstr *cf,
+ const struct qstr *s1);

-int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
- unsigned char *dest, size_t dlen);
+int unicode_normalize(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen);

-int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
- unsigned char *dest, size_t dlen);
+int unicode_casefold(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen);

-int utf8_casefold_hash(const struct unicode_map *um, const void *salt,
- struct qstr *str);
+int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
+ struct qstr *str);

-struct unicode_map *utf8_load(const char *version);
-void utf8_unload(struct unicode_map *um);
+struct unicode_map *unicode_load(const char *version);
+void unicode_unload(struct unicode_map *um);

#endif /* _LINUX_UNICODE_H */
--
2.30.1

2021-03-18 13:36:06

by Shreeya Patel

[permalink] [raw]
Subject: [PATCH v2 2/4] fs: unicode: Rename utf8-core file to unicode-core

Rename the file name from utf8-core to unicode-core for transformation of
utf8-core file into the unicode subsystem layer file and also for better
understanding.

Signed-off-by: Shreeya Patel <[email protected]>
---
fs/unicode/Makefile | 2 +-
fs/unicode/{utf8-core.c => unicode-core.c} | 0
2 files changed, 1 insertion(+), 1 deletion(-)
rename fs/unicode/{utf8-core.c => unicode-core.c} (100%)

diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
index b88aecc86550..fbf9a629ed0d 100644
--- a/fs/unicode/Makefile
+++ b/fs/unicode/Makefile
@@ -3,7 +3,7 @@
obj-$(CONFIG_UNICODE) += unicode.o
obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o

-unicode-y := utf8-norm.o utf8-core.o
+unicode-y := utf8-norm.o unicode-core.o

$(obj)/utf8-norm.o: $(obj)/utf8data.h

diff --git a/fs/unicode/utf8-core.c b/fs/unicode/unicode-core.c
similarity index 100%
rename from fs/unicode/utf8-core.c
rename to fs/unicode/unicode-core.c
--
2.30.1

2021-03-18 13:36:17

by Shreeya Patel

[permalink] [raw]
Subject: [PATCH v2 3/4] fs: unicode: Use strscpy() instead of strncpy()

Following warning was reported by Kernel Test Robot.

In function 'utf8_parse_version',
inlined from 'utf8_load' at fs/unicode/utf8mod.c:195:7:
>> fs/unicode/utf8mod.c:175:2: warning: 'strncpy' specified bound 12 equals
destination size [-Wstringop-truncation]
175 | strncpy(version_string, version, sizeof(version_string));
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The -Wstringop-truncation warning highlights the unintended
uses of the strncpy function that truncate the terminating NULL
character from the source string.
Unlike strncpy(), strscpy() always null-terminates the destination string,
hence use strscpy() instead of strncpy().

Signed-off-by: Shreeya Patel <[email protected]>
Reported-by: kernel test robot <[email protected]>
---
Changes in v2
- Resolve warning of -Wstringop-truncation reported by
kernel test robot.

fs/unicode/unicode-core.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/unicode/unicode-core.c b/fs/unicode/unicode-core.c
index d5f09e022ac5..287a8a48836c 100644
--- a/fs/unicode/unicode-core.c
+++ b/fs/unicode/unicode-core.c
@@ -179,7 +179,7 @@ static int unicode_parse_version(const char *version, unsigned int *maj,
{0, NULL}
};

- strncpy(version_string, version, sizeof(version_string));
+ strscpy(version_string, version, sizeof(version_string));

if (match_token(version_string, token, args) != 1)
return -EINVAL;
--
2.30.1

2021-03-18 13:36:30

by Shreeya Patel

[permalink] [raw]
Subject: [PATCH v2 4/4] fs: unicode: Add utf8 module and a unicode layer

utf8data.h_shipped has a large database table which is an auto-generated
decodification trie for the unicode normalization functions.
It is not necessary to carry this large table in the kernel hence make
UTF-8 encoding loadable by converting it into a module.
Also, modify the file called unicode-core which will act as a layer for
unicode subsystem. It will load the UTF-8 module and access it's functions
whenever any filesystem that needs unicode is mounted.

Signed-off-by: Shreeya Patel <[email protected]>
---

Changes in v2
- Remove the duplicate file utf8-core.c
- Make the wrapper functions inline.
- Remove msleep and use try_module_get() and module_put()
for ensuring that module is loaded correctly and also
doesn't get unloaded while in use.

fs/unicode/Kconfig | 11 +-
fs/unicode/Makefile | 5 +-
fs/unicode/unicode-core.c | 229 +++++------------------------------
fs/unicode/utf8mod.c | 246 ++++++++++++++++++++++++++++++++++++++
include/linux/unicode.h | 73 ++++++++---
5 files changed, 346 insertions(+), 218 deletions(-)
create mode 100644 fs/unicode/utf8mod.c

diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig
index 2c27b9a5cd6c..2961b0206b4d 100644
--- a/fs/unicode/Kconfig
+++ b/fs/unicode/Kconfig
@@ -8,7 +8,16 @@ config UNICODE
Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding
support.

+# UTF-8 encoding can be compiled as a module using UNICODE_UTF8 option.
+# Having UTF-8 encoding as a module will avoid carrying large
+# database table present in utf8data.h_shipped into the kernel
+# by being able to load it only when it is required by the filesystem.
+config UNICODE_UTF8
+ tristate "UTF-8 module"
+ depends on UNICODE
+ default m
+
config UNICODE_NORMALIZATION_SELFTEST
tristate "Test UTF-8 normalization support"
- depends on UNICODE
+ depends on UNICODE_UTF8
default n
diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
index fbf9a629ed0d..9dbb04194b32 100644
--- a/fs/unicode/Makefile
+++ b/fs/unicode/Makefile
@@ -1,11 +1,14 @@
# SPDX-License-Identifier: GPL-2.0

obj-$(CONFIG_UNICODE) += unicode.o
+obj-$(CONFIG_UNICODE_UTF8) += utf8.o
obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o

-unicode-y := utf8-norm.o unicode-core.o
+unicode-y := unicode-core.o
+utf8-y := utf8mod.o utf8-norm.o

$(obj)/utf8-norm.o: $(obj)/utf8data.h
+$(obj)/utf8mod.o: $(obj)/utf8-norm.o

# In the normal build, the checked-in utf8data.h is just shipped.
#
diff --git a/fs/unicode/unicode-core.c b/fs/unicode/unicode-core.c
index 287a8a48836c..945984a3fe9e 100644
--- a/fs/unicode/unicode-core.c
+++ b/fs/unicode/unicode-core.c
@@ -1,235 +1,60 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/module.h>
#include <linux/kernel.h>
-#include <linux/string.h>
#include <linux/slab.h>
-#include <linux/parser.h>
#include <linux/errno.h>
#include <linux/unicode.h>
-#include <linux/stringhash.h>

-#include "utf8n.h"

-int unicode_validate(const struct unicode_map *um, const struct qstr *str)
-{
- const struct utf8data *data = utf8nfdi(um->version);
-
- if (utf8nlen(data, str->name, str->len) < 0)
- return -1;
- return 0;
-}
-EXPORT_SYMBOL(unicode_validate);
-
-int unicode_strncmp(const struct unicode_map *um,
- const struct qstr *s1, const struct qstr *s2)
-{
- const struct utf8data *data = utf8nfdi(um->version);
- struct utf8cursor cur1, cur2;
- int c1, c2;
-
- if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
- return -EINVAL;
-
- if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
- return -EINVAL;
-
- do {
- c1 = utf8byte(&cur1);
- c2 = utf8byte(&cur2);
-
- if (c1 < 0 || c2 < 0)
- return -EINVAL;
- if (c1 != c2)
- return 1;
- } while (c1);
-
- return 0;
-}
-EXPORT_SYMBOL(unicode_strncmp);
-
-int unicode_strncasecmp(const struct unicode_map *um,
- const struct qstr *s1, const struct qstr *s2)
-{
- const struct utf8data *data = utf8nfdicf(um->version);
- struct utf8cursor cur1, cur2;
- int c1, c2;
+struct unicode_ops *utf8_ops;
+EXPORT_SYMBOL(utf8_ops);

- if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
- return -EINVAL;
-
- if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
- return -EINVAL;
-
- do {
- c1 = utf8byte(&cur1);
- c2 = utf8byte(&cur2);
-
- if (c1 < 0 || c2 < 0)
- return -EINVAL;
- if (c1 != c2)
- return 1;
- } while (c1);
-
- return 0;
-}
-EXPORT_SYMBOL(unicode_strncasecmp);
-
-/* String cf is expected to be a valid UTF-8 casefolded
- * string.
- */
-int unicode_strncasecmp_folded(const struct unicode_map *um,
- const struct qstr *cf,
- const struct qstr *s1)
+static int unicode_load_module(void)
{
- const struct utf8data *data = utf8nfdicf(um->version);
- struct utf8cursor cur1;
- int c1, c2;
- int i = 0;
-
- if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
- return -EINVAL;
-
- do {
- c1 = utf8byte(&cur1);
- c2 = cf->name[i++];
- if (c1 < 0)
- return -EINVAL;
- if (c1 != c2)
- return 1;
- } while (c1);
-
- return 0;
-}
-EXPORT_SYMBOL(unicode_strncasecmp_folded);
-
-int unicode_casefold(const struct unicode_map *um, const struct qstr *str,
- unsigned char *dest, size_t dlen)
-{
- const struct utf8data *data = utf8nfdicf(um->version);
- struct utf8cursor cur;
- size_t nlen = 0;
-
- if (utf8ncursor(&cur, data, str->name, str->len) < 0)
- return -EINVAL;
+ int ret = request_module("utf8");

- for (nlen = 0; nlen < dlen; nlen++) {
- int c = utf8byte(&cur);
-
- dest[nlen] = c;
- if (!c)
- return nlen;
- if (c == -1)
- break;
+ if (ret) {
+ pr_err("Failed to load UTF-8 module\n");
+ return ret;
}
- return -EINVAL;
-}
-EXPORT_SYMBOL(unicode_casefold);
-
-int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
- struct qstr *str)
-{
- const struct utf8data *data = utf8nfdicf(um->version);
- struct utf8cursor cur;
- int c;
- unsigned long hash = init_name_hash(salt);

- if (utf8ncursor(&cur, data, str->name, str->len) < 0)
- return -EINVAL;
-
- while ((c = utf8byte(&cur))) {
- if (c < 0)
- return -EINVAL;
- hash = partial_name_hash((unsigned char)c, hash);
- }
- str->hash = end_name_hash(hash);
return 0;
}
-EXPORT_SYMBOL(unicode_casefold_hash);

-int unicode_normalize(const struct unicode_map *um, const struct qstr *str,
- unsigned char *dest, size_t dlen)
+struct unicode_map *unicode_load(const char *version)
{
- const struct utf8data *data = utf8nfdi(um->version);
- struct utf8cursor cur;
- ssize_t nlen = 0;
+ int ret = unicode_load_module();

- if (utf8ncursor(&cur, data, str->name, str->len) < 0)
- return -EINVAL;
+ if (ret)
+ return ERR_PTR(ret);

- for (nlen = 0; nlen < dlen; nlen++) {
- int c = utf8byte(&cur);
+ if (utf8_ops && !try_module_get(utf8_ops->owner))
+ return ERR_PTR(-ENODEV);

- dest[nlen] = c;
- if (!c)
- return nlen;
- if (c == -1)
- break;
- }
- return -EINVAL;
+ else
+ return utf8_ops->load(version);
}
-EXPORT_SYMBOL(unicode_normalize);
+EXPORT_SYMBOL(unicode_load);

-static int unicode_parse_version(const char *version, unsigned int *maj,
- unsigned int *min, unsigned int *rev)
+void unicode_unload(struct unicode_map *um)
{
- substring_t args[3];
- char version_string[12];
- static const struct match_token token[] = {
- {1, "%d.%d.%d"},
- {0, NULL}
- };
-
- strscpy(version_string, version, sizeof(version_string));
-
- if (match_token(version_string, token, args) != 1)
- return -EINVAL;
+ if (utf8_ops)
+ module_put(utf8_ops->owner);

- if (match_int(&args[0], maj) || match_int(&args[1], min) ||
- match_int(&args[2], rev))
- return -EINVAL;
-
- return 0;
+ kfree(um);
}
+EXPORT_SYMBOL(unicode_unload);

-struct unicode_map *unicode_load(const char *version)
+void unicode_register(struct unicode_ops *ops)
{
- struct unicode_map *um = NULL;
- int unicode_version;
-
- if (version) {
- unsigned int maj, min, rev;
-
- if (unicode_parse_version(version, &maj, &min, &rev) < 0)
- return ERR_PTR(-EINVAL);
-
- if (!utf8version_is_supported(maj, min, rev))
- return ERR_PTR(-EINVAL);
-
- unicode_version = UNICODE_AGE(maj, min, rev);
- } else {
- unicode_version = utf8version_latest();
- printk(KERN_WARNING"UTF-8 version not specified. "
- "Assuming latest supported version (%d.%d.%d).",
- (unicode_version >> 16) & 0xff,
- (unicode_version >> 8) & 0xff,
- (unicode_version & 0xff));
- }
-
- um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
- if (!um)
- return ERR_PTR(-ENOMEM);
-
- um->charset = "UTF-8";
- um->version = unicode_version;
-
- return um;
+ utf8_ops = ops;
}
-EXPORT_SYMBOL(unicode_load);
+EXPORT_SYMBOL(unicode_register);

-void unicode_unload(struct unicode_map *um)
+void unicode_unregister(void)
{
- kfree(um);
+ utf8_ops = NULL;
}
-EXPORT_SYMBOL(unicode_unload);
+EXPORT_SYMBOL(unicode_unregister);

MODULE_LICENSE("GPL v2");
diff --git a/fs/unicode/utf8mod.c b/fs/unicode/utf8mod.c
new file mode 100644
index 000000000000..9981960da863
--- /dev/null
+++ b/fs/unicode/utf8mod.c
@@ -0,0 +1,246 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/parser.h>
+#include <linux/errno.h>
+#include <linux/unicode.h>
+#include <linux/stringhash.h>
+
+#include "utf8n.h"
+
+static int utf8_validate(const struct unicode_map *um, const struct qstr *str)
+{
+ const struct utf8data *data = utf8nfdi(um->version);
+
+ if (utf8nlen(data, str->name, str->len) < 0)
+ return -1;
+ return 0;
+}
+
+static int utf8_strncmp(const struct unicode_map *um,
+ const struct qstr *s1, const struct qstr *s2)
+{
+ const struct utf8data *data = utf8nfdi(um->version);
+ struct utf8cursor cur1, cur2;
+ int c1, c2;
+
+ if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
+ return -EINVAL;
+
+ if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
+ return -EINVAL;
+
+ do {
+ c1 = utf8byte(&cur1);
+ c2 = utf8byte(&cur2);
+
+ if (c1 < 0 || c2 < 0)
+ return -EINVAL;
+ if (c1 != c2)
+ return 1;
+ } while (c1);
+
+ return 0;
+}
+
+static int utf8_strncasecmp(const struct unicode_map *um,
+ const struct qstr *s1, const struct qstr *s2)
+{
+ const struct utf8data *data = utf8nfdicf(um->version);
+ struct utf8cursor cur1, cur2;
+ int c1, c2;
+
+ if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
+ return -EINVAL;
+
+ if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
+ return -EINVAL;
+
+ do {
+ c1 = utf8byte(&cur1);
+ c2 = utf8byte(&cur2);
+
+ if (c1 < 0 || c2 < 0)
+ return -EINVAL;
+ if (c1 != c2)
+ return 1;
+ } while (c1);
+
+ return 0;
+}
+
+/* String cf is expected to be a valid UTF-8 casefolded
+ * string.
+ */
+static int utf8_strncasecmp_folded(const struct unicode_map *um,
+ const struct qstr *cf,
+ const struct qstr *s1)
+{
+ const struct utf8data *data = utf8nfdicf(um->version);
+ struct utf8cursor cur1;
+ int c1, c2;
+ int i = 0;
+
+ if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
+ return -EINVAL;
+
+ do {
+ c1 = utf8byte(&cur1);
+ c2 = cf->name[i++];
+ if (c1 < 0)
+ return -EINVAL;
+ if (c1 != c2)
+ return 1;
+ } while (c1);
+
+ return 0;
+}
+
+static int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen)
+{
+ const struct utf8data *data = utf8nfdicf(um->version);
+ struct utf8cursor cur;
+ size_t nlen = 0;
+
+ if (utf8ncursor(&cur, data, str->name, str->len) < 0)
+ return -EINVAL;
+
+ for (nlen = 0; nlen < dlen; nlen++) {
+ int c = utf8byte(&cur);
+
+ dest[nlen] = c;
+ if (!c)
+ return nlen;
+ if (c == -1)
+ break;
+ }
+ return -EINVAL;
+}
+
+static int utf8_casefold_hash(const struct unicode_map *um, const void *salt,
+ struct qstr *str)
+{
+ const struct utf8data *data = utf8nfdicf(um->version);
+ struct utf8cursor cur;
+ int c;
+ unsigned long hash = init_name_hash(salt);
+
+ if (utf8ncursor(&cur, data, str->name, str->len) < 0)
+ return -EINVAL;
+
+ while ((c = utf8byte(&cur))) {
+ if (c < 0)
+ return -EINVAL;
+ hash = partial_name_hash((unsigned char)c, hash);
+ }
+ str->hash = end_name_hash(hash);
+ return 0;
+}
+
+static int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen)
+{
+ const struct utf8data *data = utf8nfdi(um->version);
+ struct utf8cursor cur;
+ ssize_t nlen = 0;
+
+ if (utf8ncursor(&cur, data, str->name, str->len) < 0)
+ return -EINVAL;
+
+ for (nlen = 0; nlen < dlen; nlen++) {
+ int c = utf8byte(&cur);
+
+ dest[nlen] = c;
+ if (!c)
+ return nlen;
+ if (c == -1)
+ break;
+ }
+ return -EINVAL;
+}
+
+static int utf8_parse_version(const char *version, unsigned int *maj,
+ unsigned int *min, unsigned int *rev)
+{
+ substring_t args[3];
+ char version_string[12];
+ static const struct match_token token[] = {
+ {1, "%d.%d.%d"},
+ {0, NULL}
+ };
+
+ strscpy(version_string, version, sizeof(version_string));
+
+ if (match_token(version_string, token, args) != 1)
+ return -EINVAL;
+
+ if (match_int(&args[0], maj) || match_int(&args[1], min) ||
+ match_int(&args[2], rev))
+ return -EINVAL;
+
+ return 0;
+}
+
+static struct unicode_map *utf8_load(const char *version)
+{
+ struct unicode_map *um = NULL;
+ int unicode_version;
+
+ if (version) {
+ unsigned int maj, min, rev;
+
+ if (utf8_parse_version(version, &maj, &min, &rev) < 0)
+ return ERR_PTR(-EINVAL);
+
+ if (!utf8version_is_supported(maj, min, rev))
+ return ERR_PTR(-EINVAL);
+
+ unicode_version = UNICODE_AGE(maj, min, rev);
+ } else {
+ unicode_version = utf8version_latest();
+ pr_warn("UTF-8 version not specified. Assuming latest supported version (%d.%d.%d).",
+ (unicode_version >> 16) & 0xff,
+ (unicode_version >> 8) & 0xff,
+ (unicode_version & 0xfe));
+ }
+
+ um = kzalloc(sizeof(*um), GFP_KERNEL);
+ if (!um)
+ return ERR_PTR(-ENOMEM);
+
+ um->charset = "UTF-8";
+ um->version = unicode_version;
+
+ return um;
+}
+
+static struct unicode_ops ops = {
+ .owner = THIS_MODULE,
+ .validate = utf8_validate,
+ .strncmp = utf8_strncmp,
+ .strncasecmp = utf8_strncasecmp,
+ .strncasecmp_folded = utf8_strncasecmp_folded,
+ .casefold = utf8_casefold,
+ .casefold_hash = utf8_casefold_hash,
+ .normalize = utf8_normalize,
+ .load = utf8_load,
+};
+
+static int __init utf8_init(void)
+{
+ unicode_register(&ops);
+ return 0;
+}
+
+static void __exit utf8_exit(void)
+{
+ unicode_unregister();
+}
+
+module_init(utf8_init);
+module_exit(utf8_exit);
+
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/unicode.h b/include/linux/unicode.h
index de23f9ee720b..5c9ebd49bfc4 100644
--- a/include/linux/unicode.h
+++ b/include/linux/unicode.h
@@ -10,27 +10,72 @@ struct unicode_map {
int version;
};

-int unicode_validate(const struct unicode_map *um, const struct qstr *str);
+struct unicode_ops {
+ struct module *owner;
+ int (*validate)(const struct unicode_map *um, const struct qstr *str);
+ int (*strncmp)(const struct unicode_map *um, const struct qstr *s1,
+ const struct qstr *s2);
+ int (*strncasecmp)(const struct unicode_map *um, const struct qstr *s1,
+ const struct qstr *s2);
+ int (*strncasecmp_folded)(const struct unicode_map *um, const struct qstr *cf,
+ const struct qstr *s1);
+ int (*normalize)(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen);
+ int (*casefold)(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen);
+ int (*casefold_hash)(const struct unicode_map *um, const void *salt,
+ struct qstr *str);
+ struct unicode_map* (*load)(const char *version);
+};
+
+extern struct unicode_ops *utf8_ops;
+
+static inline int unicode_validate(const struct unicode_map *um, const struct qstr *str)
+{
+ return utf8_ops->validate(um, str);
+}

-int unicode_strncmp(const struct unicode_map *um,
- const struct qstr *s1, const struct qstr *s2);
+static inline int unicode_strncmp(const struct unicode_map *um,
+ const struct qstr *s1, const struct qstr *s2)
+{
+ return utf8_ops->strncmp(um, s1, s2);
+}

-int unicode_strncasecmp(const struct unicode_map *um,
- const struct qstr *s1, const struct qstr *s2);
-int unicode_strncasecmp_folded(const struct unicode_map *um,
- const struct qstr *cf,
- const struct qstr *s1);
+static inline int unicode_strncasecmp(const struct unicode_map *um,
+ const struct qstr *s1, const struct qstr *s2)
+{
+ return utf8_ops->strncasecmp(um, s1, s2);
+}

-int unicode_normalize(const struct unicode_map *um, const struct qstr *str,
- unsigned char *dest, size_t dlen);
+static inline int unicode_strncasecmp_folded(const struct unicode_map *um,
+ const struct qstr *cf,
+ const struct qstr *s1)
+{
+ return utf8_ops->strncasecmp_folded(um, cf, s1);
+}

-int unicode_casefold(const struct unicode_map *um, const struct qstr *str,
- unsigned char *dest, size_t dlen);
+static inline int unicode_normalize(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen)
+{
+ return utf8_ops->normalize(um, str, dest, dlen);
+}

-int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
- struct qstr *str);
+static inline int unicode_casefold(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen)
+{
+ return utf8_ops->casefold(um, str, dest, dlen);
+}
+
+static inline int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
+ struct qstr *str)
+{
+ return utf8_ops->casefold_hash(um, salt, str);
+}

struct unicode_map *unicode_load(const char *version);
void unicode_unload(struct unicode_map *um);

+void unicode_register(struct unicode_ops *ops);
+void unicode_unregister(void);
+
#endif /* _LINUX_UNICODE_H */
--
2.30.1

2021-03-18 14:19:46

by Shreeya Patel

[permalink] [raw]
Subject: Re: [PATCH v2 3/4] fs: unicode: Use strscpy() instead of strncpy()


On 18/03/21 7:03 pm, Shreeya Patel wrote:
> Following warning was reported by Kernel Test Robot.
>
> In function 'utf8_parse_version',
> inlined from 'utf8_load' at fs/unicode/utf8mod.c:195:7:
>>> fs/unicode/utf8mod.c:175:2: warning: 'strncpy' specified bound 12 equals
> destination size [-Wstringop-truncation]
> 175 | strncpy(version_string, version, sizeof(version_string));
> | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>
> The -Wstringop-truncation warning highlights the unintended
> uses of the strncpy function that truncate the terminating NULL
> character from the source string.
> Unlike strncpy(), strscpy() always null-terminates the destination string,
> hence use strscpy() instead of strncpy().


Not sure if strscpy is preferable. Just found this article
https://lwn.net/Articles/659214/
Should I go for memcpy instead?


>
> Signed-off-by: Shreeya Patel <[email protected]>
> Reported-by: kernel test robot <[email protected]>
> ---
> Changes in v2
> - Resolve warning of -Wstringop-truncation reported by
> kernel test robot.
>
> fs/unicode/unicode-core.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/fs/unicode/unicode-core.c b/fs/unicode/unicode-core.c
> index d5f09e022ac5..287a8a48836c 100644
> --- a/fs/unicode/unicode-core.c
> +++ b/fs/unicode/unicode-core.c
> @@ -179,7 +179,7 @@ static int unicode_parse_version(const char *version, unsigned int *maj,
> {0, NULL}
> };
>
> - strncpy(version_string, version, sizeof(version_string));
> + strscpy(version_string, version, sizeof(version_string));
>
> if (match_token(version_string, token, args) != 1)
> return -EINVAL;

2021-03-18 15:43:23

by David Laight

[permalink] [raw]
Subject: RE: [PATCH v2 3/4] fs: unicode: Use strscpy() instead of strncpy()

From: Shreeya Patel
> Sent: 18 March 2021 14:13
>
> On 18/03/21 7:03 pm, Shreeya Patel wrote:
> > Following warning was reported by Kernel Test Robot.
> >
> > In function 'utf8_parse_version',
> > inlined from 'utf8_load' at fs/unicode/utf8mod.c:195:7:
> >>> fs/unicode/utf8mod.c:175:2: warning: 'strncpy' specified bound 12 equals
> > destination size [-Wstringop-truncation]
> > 175 | strncpy(version_string, version, sizeof(version_string));
> > | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> >
> > The -Wstringop-truncation warning highlights the unintended
> > uses of the strncpy function that truncate the terminating NULL
> > character from the source string.
> > Unlike strncpy(), strscpy() always null-terminates the destination string,
> > hence use strscpy() instead of strncpy().
>
>
> Not sure if strscpy is preferable. Just found this article
> https://lwn.net/Articles/659214/
> Should I go for memcpy instead?

Which length would you give memcpy() ?
The compiler will moan if you try to read beyond the end of the
input string.

strscpy() is about the best of a bad lot.

I think (I'm not sure!) that a good string copy function should
return the number of bytes copies or the buffer length is truncated.
Then you can do repeated:
off += xxxcpy(buf + off, buflen - off, xxxxx);
without any danger of writing beyond the buffer end, always
getting a '\0' terminated string, and being able to detect overflow
right at the end.

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

2021-03-18 20:01:05

by Gabriel Krisman Bertazi

[permalink] [raw]
Subject: Re: [PATCH v2 4/4] fs: unicode: Add utf8 module and a unicode layer


Shreeya,


> utf8data.h_shipped has a large database table which is an auto-generated
> decodification trie for the unicode normalization functions.
> It is not necessary to carry this large table in the kernel hence make
>

Thanks for the v2. This is looking much better! I have a few comments
inline.

I also have a question about how this patchset was tested. Did you run
the normalization test (UNICODE_NORMALIZATION_SELFTEST)? What about
actual filesystems?

Minor nit:

"It is not necessary to load this large table in the kernel in the
kernel if no file system is using it"

> UTF-8 encoding loadable by converting it into a module.
> Also, modify the file called unicode-core which will act as a layer for
> unicode subsystem. It will load the UTF-8 module and access it's functions
> whenever any filesystem that needs unicode is mounted.
>
> Signed-off-by: Shreeya Patel <[email protected]>
> ---
>
> Changes in v2
> - Remove the duplicate file utf8-core.c
> - Make the wrapper functions inline.
> - Remove msleep and use try_module_get() and module_put()
> for ensuring that module is loaded correctly and also
> doesn't get unloaded while in use.

> fs/unicode/Kconfig | 11 +-
> fs/unicode/Makefile | 5 +-
> fs/unicode/unicode-core.c | 229 +++++------------------------------
> fs/unicode/utf8mod.c | 246 ++++++++++++++++++++++++++++++++++++++
> include/linux/unicode.h | 73 ++++++++---
> 5 files changed, 346 insertions(+), 218 deletions(-)
> create mode 100644 fs/unicode/utf8mod.c
>
> diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig
> index 2c27b9a5cd6c..2961b0206b4d 100644
> --- a/fs/unicode/Kconfig
> +++ b/fs/unicode/Kconfig
> @@ -8,7 +8,16 @@ config UNICODE
> Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding
> support.
>
> +# UTF-8 encoding can be compiled as a module using UNICODE_UTF8 option.
> +# Having UTF-8 encoding as a module will avoid carrying large
> +# database table present in utf8data.h_shipped into the kernel
> +# by being able to load it only when it is required by the filesystem.
> +config UNICODE_UTF8
> + tristate "UTF-8 module"
> + depends on UNICODE
> + default m
> +
> config UNICODE_NORMALIZATION_SELFTEST
> tristate "Test UTF-8 normalization support"
> - depends on UNICODE
> + depends on UNICODE_UTF8
> default n
> diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
> index fbf9a629ed0d..9dbb04194b32 100644
> --- a/fs/unicode/Makefile
> +++ b/fs/unicode/Makefile
> @@ -1,11 +1,14 @@
> # SPDX-License-Identifier: GPL-2.0
>
> obj-$(CONFIG_UNICODE) += unicode.o
> +obj-$(CONFIG_UNICODE_UTF8) += utf8.o
> obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o
>
> -unicode-y := utf8-norm.o unicode-core.o
> +unicode-y := unicode-core.o
> +utf8-y := utf8mod.o utf8-norm.o
>
> $(obj)/utf8-norm.o: $(obj)/utf8data.h
> +$(obj)/utf8mod.o: $(obj)/utf8-norm.o
>
> # In the normal build, the checked-in utf8data.h is just shipped.
> #
> diff --git a/fs/unicode/unicode-core.c b/fs/unicode/unicode-core.c
> index 287a8a48836c..945984a3fe9e 100644
> --- a/fs/unicode/unicode-core.c
> +++ b/fs/unicode/unicode-core.c
> @@ -1,235 +1,60 @@
> /* SPDX-License-Identifier: GPL-2.0 */
> #include <linux/module.h>
> #include <linux/kernel.h>
> -#include <linux/string.h>
> #include <linux/slab.h>
> -#include <linux/parser.h>
> #include <linux/errno.h>
> #include <linux/unicode.h>
> -#include <linux/stringhash.h>
>
> -#include "utf8n.h"
>
> -int unicode_validate(const struct unicode_map *um, const struct qstr *str)
> -{
> - const struct utf8data *data = utf8nfdi(um->version);
> -
> - if (utf8nlen(data, str->name, str->len) < 0)
> - return -1;
> - return 0;
> -}
> -EXPORT_SYMBOL(unicode_validate);
> -
> -int unicode_strncmp(const struct unicode_map *um,
> - const struct qstr *s1, const struct qstr *s2)
> -{
> - const struct utf8data *data = utf8nfdi(um->version);
> - struct utf8cursor cur1, cur2;
> - int c1, c2;
> -
> - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
> - return -EINVAL;
> -
> - if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
> - return -EINVAL;
> -
> - do {
> - c1 = utf8byte(&cur1);
> - c2 = utf8byte(&cur2);
> -
> - if (c1 < 0 || c2 < 0)
> - return -EINVAL;
> - if (c1 != c2)
> - return 1;
> - } while (c1);
> -
> - return 0;
> -}
> -EXPORT_SYMBOL(unicode_strncmp);
> -
> -int unicode_strncasecmp(const struct unicode_map *um,
> - const struct qstr *s1, const struct qstr *s2)
> -{
> - const struct utf8data *data = utf8nfdicf(um->version);
> - struct utf8cursor cur1, cur2;
> - int c1, c2;
> +struct unicode_ops *utf8_ops;
> +EXPORT_SYMBOL(utf8_ops);
>
> - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
> - return -EINVAL;
> -
> - if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
> - return -EINVAL;
> -
> - do {
> - c1 = utf8byte(&cur1);
> - c2 = utf8byte(&cur2);
> -
> - if (c1 < 0 || c2 < 0)
> - return -EINVAL;
> - if (c1 != c2)
> - return 1;
> - } while (c1);
> -
> - return 0;
> -}
> -EXPORT_SYMBOL(unicode_strncasecmp);
> -
> -/* String cf is expected to be a valid UTF-8 casefolded
> - * string.
> - */
> -int unicode_strncasecmp_folded(const struct unicode_map *um,
> - const struct qstr *cf,
> - const struct qstr *s1)
> +static int unicode_load_module(void)
> {
> - const struct utf8data *data = utf8nfdicf(um->version);
> - struct utf8cursor cur1;
> - int c1, c2;
> - int i = 0;
> -
> - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
> - return -EINVAL;
> -
> - do {
> - c1 = utf8byte(&cur1);
> - c2 = cf->name[i++];
> - if (c1 < 0)
> - return -EINVAL;
> - if (c1 != c2)
> - return 1;
> - } while (c1);
> -
> - return 0;
> -}
> -EXPORT_SYMBOL(unicode_strncasecmp_folded);
> -
> -int unicode_casefold(const struct unicode_map *um, const struct qstr *str,
> - unsigned char *dest, size_t dlen)
> -{
> - const struct utf8data *data = utf8nfdicf(um->version);
> - struct utf8cursor cur;
> - size_t nlen = 0;
> -
> - if (utf8ncursor(&cur, data, str->name, str->len) < 0)
> - return -EINVAL;
> + int ret = request_module("utf8");
>
> - for (nlen = 0; nlen < dlen; nlen++) {
> - int c = utf8byte(&cur);
> -
> - dest[nlen] = c;
> - if (!c)
> - return nlen;
> - if (c == -1)
> - break;
> + if (ret) {
> + pr_err("Failed to load UTF-8 module\n");
> + return ret;
> }
> - return -EINVAL;
> -}
> -EXPORT_SYMBOL(unicode_casefold);
> -
> -int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
> - struct qstr *str)
> -{
> - const struct utf8data *data = utf8nfdicf(um->version);
> - struct utf8cursor cur;
> - int c;
> - unsigned long hash = init_name_hash(salt);
>
> - if (utf8ncursor(&cur, data, str->name, str->len) < 0)
> - return -EINVAL;
> -
> - while ((c = utf8byte(&cur))) {
> - if (c < 0)
> - return -EINVAL;
> - hash = partial_name_hash((unsigned char)c, hash);
> - }
> - str->hash = end_name_hash(hash);
> return 0;
> }
> -EXPORT_SYMBOL(unicode_casefold_hash);
>
> -int unicode_normalize(const struct unicode_map *um, const struct qstr *str,
> - unsigned char *dest, size_t dlen)
> +struct unicode_map *unicode_load(const char *version)
> {
> - const struct utf8data *data = utf8nfdi(um->version);
> - struct utf8cursor cur;
> - ssize_t nlen = 0;
> + int ret = unicode_load_module();
>
> - if (utf8ncursor(&cur, data, str->name, str->len) < 0)
> - return -EINVAL;
> + if (ret)
> + return ERR_PTR(ret);
>
> - for (nlen = 0; nlen < dlen; nlen++) {
> - int c = utf8byte(&cur);
> + if (utf8_ops && !try_module_get(utf8_ops->owner))
> + return ERR_PTR(-ENODEV);
>
> - dest[nlen] = c;
> - if (!c)
> - return nlen;
> - if (c == -1)
> - break;
> - }
> - return -EINVAL;
> + else
> + return utf8_ops->load(version);

This patch is a bit hard to review with the way git separated the
hunks. I think you should be able to regenerate it in a simpler form
by tinkering with git-format-patch parameters.

Here, it seems that if utf8_ops is NULL (the first condition of the if
leg), the else leg dereferences a NULL pointer by calling
utf8_ops->load(), which can't be right.

Maybe, the if leg should be:

if (!utf8_ops || !try_module_get(utf8_ops->owner)
return ERR_PTR(-ENODEV)

But this is still racy, since you are not protecting utf8_ops before
acquiring the reference. If you race with module removal here, a
NULL ptr dereference can still occur. See below.

> }
> -EXPORT_SYMBOL(unicode_normalize);
> +EXPORT_SYMBOL(unicode_load);
>
> -static int unicode_parse_version(const char *version, unsigned int *maj,
> - unsigned int *min, unsigned int *rev)
> +void unicode_unload(struct unicode_map *um)
> {
> - substring_t args[3];
> - char version_string[12];
> - static const struct match_token token[] = {
> - {1, "%d.%d.%d"},
> - {0, NULL}
> - };
> -
> - strscpy(version_string, version, sizeof(version_string));
> -
> - if (match_token(version_string, token, args) != 1)
> - return -EINVAL;
> + if (utf8_ops)
> + module_put(utf8_ops->owner);
>

How can we have a unicode_map to free if utf8_ops is NULL? that seems
to be an invalid use of API, which suggests a bug elsewhere
in the kernel. maybe this should read like this:

void unicode_unload(struct unicode_map *um)
{
if (WARN_ON(!utf8_ops))
return;

module_put(utf8_ops->owner);
kfree(um);
}

> - if (match_int(&args[0], maj) || match_int(&args[1], min) ||
> - match_int(&args[2], rev))
> - return -EINVAL;
> -
> - return 0;
> + kfree(um);
> }
> +EXPORT_SYMBOL(unicode_unload);
>
> -struct unicode_map *unicode_load(const char *version)
> +void unicode_register(struct unicode_ops *ops)
> {
> - struct unicode_map *um = NULL;
> - int unicode_version;
> -
> - if (version) {
> - unsigned int maj, min, rev;
> -
> - if (unicode_parse_version(version, &maj, &min, &rev) < 0)
> - return ERR_PTR(-EINVAL);
> -
> - if (!utf8version_is_supported(maj, min, rev))
> - return ERR_PTR(-EINVAL);
> -
> - unicode_version = UNICODE_AGE(maj, min, rev);
> - } else {
> - unicode_version = utf8version_latest();
> - printk(KERN_WARNING"UTF-8 version not specified. "
> - "Assuming latest supported version (%d.%d.%d).",
> - (unicode_version >> 16) & 0xff,
> - (unicode_version >> 8) & 0xff,
> - (unicode_version & 0xff));
> - }
> -
> - um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
> - if (!um)
> - return ERR_PTR(-ENOMEM);
> -
> - um->charset = "UTF-8";
> - um->version = unicode_version;
> -
> - return um;
> + utf8_ops = ops;

While we guarantee that utf8_ops isn't modified when a unicode_map is
in-use by acquiring the module reference, registering/unregistering the
module should be protected, to avoid that race I mentioned above.

> }
> -EXPORT_SYMBOL(unicode_load);
> +EXPORT_SYMBOL(unicode_register);
>
> -void unicode_unload(struct unicode_map *um)
> +void unicode_unregister(void)
> {
> - kfree(um);
> + utf8_ops = NULL;
> }
> -EXPORT_SYMBOL(unicode_unload);
> +EXPORT_SYMBOL(unicode_unregister);
>
> MODULE_LICENSE("GPL v2");
> diff --git a/fs/unicode/utf8mod.c b/fs/unicode/utf8mod.c
> new file mode 100644
> index 000000000000..9981960da863
> --- /dev/null
> +++ b/fs/unicode/utf8mod.c

This is a bit nitpicky, but can you follow the naming scheme and call
this file unicode-utf8.c or maybe utf8-mod.c ?

--
Gabriel Krisman Bertazi

2021-03-18 21:05:03

by Eric Biggers

[permalink] [raw]
Subject: Re: [PATCH v2 3/4] fs: unicode: Use strscpy() instead of strncpy()

On Thu, Mar 18, 2021 at 07:03:04PM +0530, Shreeya Patel wrote:
> Following warning was reported by Kernel Test Robot.
>
> In function 'utf8_parse_version',
> inlined from 'utf8_load' at fs/unicode/utf8mod.c:195:7:
> >> fs/unicode/utf8mod.c:175:2: warning: 'strncpy' specified bound 12 equals
> destination size [-Wstringop-truncation]
> 175 | strncpy(version_string, version, sizeof(version_string));
> | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>
> The -Wstringop-truncation warning highlights the unintended
> uses of the strncpy function that truncate the terminating NULL
> character from the source string.
> Unlike strncpy(), strscpy() always null-terminates the destination string,
> hence use strscpy() instead of strncpy().
>
> Signed-off-by: Shreeya Patel <[email protected]>
> Reported-by: kernel test robot <[email protected]>
> ---
> Changes in v2
> - Resolve warning of -Wstringop-truncation reported by
> kernel test robot.
>
> fs/unicode/unicode-core.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/fs/unicode/unicode-core.c b/fs/unicode/unicode-core.c
> index d5f09e022ac5..287a8a48836c 100644
> --- a/fs/unicode/unicode-core.c
> +++ b/fs/unicode/unicode-core.c
> @@ -179,7 +179,7 @@ static int unicode_parse_version(const char *version, unsigned int *maj,
> {0, NULL}
> };
>
> - strncpy(version_string, version, sizeof(version_string));
> + strscpy(version_string, version, sizeof(version_string));
>

Shouldn't unicode_parse_version() return an error if the string gets truncated
here? I.e. check if strscpy() returns < 0.

Also, this is a "fix" (though one that doesn't currently matter, since 'version'
is currently always shorter than sizeof(version_string)), so it should go first
in the series and have a Fixes tag.

- Eric

2021-03-18 21:09:26

by Eric Biggers

[permalink] [raw]
Subject: Re: [PATCH v2 4/4] fs: unicode: Add utf8 module and a unicode layer

On Thu, Mar 18, 2021 at 07:03:05PM +0530, Shreeya Patel wrote:
> +struct unicode_ops {
> + struct module *owner;
> + int (*validate)(const struct unicode_map *um, const struct qstr *str);
> + int (*strncmp)(const struct unicode_map *um, const struct qstr *s1,
> + const struct qstr *s2);
> + int (*strncasecmp)(const struct unicode_map *um, const struct qstr *s1,
> + const struct qstr *s2);
> + int (*strncasecmp_folded)(const struct unicode_map *um, const struct qstr *cf,
> + const struct qstr *s1);
> + int (*normalize)(const struct unicode_map *um, const struct qstr *str,
> + unsigned char *dest, size_t dlen);
> + int (*casefold)(const struct unicode_map *um, const struct qstr *str,
> + unsigned char *dest, size_t dlen);
> + int (*casefold_hash)(const struct unicode_map *um, const void *salt,
> + struct qstr *str);
> + struct unicode_map* (*load)(const char *version);
> +};

Indirect calls are expensive these days, especially due to the Spectre
mitigations. Would it make sense to use static calls
(include/linux/static_call.h) instead for this?

- Eric

2021-03-19 10:28:47

by Shreeya Patel

[permalink] [raw]
Subject: Re: [PATCH v2 4/4] fs: unicode: Add utf8 module and a unicode layer


On 19/03/21 1:27 am, Gabriel Krisman Bertazi wrote:
> Shreeya,
>
Hi Gabriel,

Thanks for your detailed review. Please find my comments inline.
>> utf8data.h_shipped has a large database table which is an auto-generated
>> decodification trie for the unicode normalization functions.
>> It is not necessary to carry this large table in the kernel hence make
>>
> Thanks for the v2. This is looking much better! I have a few comments
> inline.
>
> I also have a question about how this patchset was tested. Did you run
> the normalization test (UNICODE_NORMALIZATION_SELFTEST)? What about
> actual filesystems?


Yes I did run the normalization test by loading the utf8-selftest.ko module.
modprobe utf8-selftest


For actual filesystem check I followed the blog post written by you.
https://www.collabora.com/news-and-blog/blog/2020/08/27/using-the-linux-kernel-case-insensitive-feature-in-ext4/
So the basic steps that were followed are as follows :-
1. mkfs -t ext4 -O casefold image.img
2. mount image.img /mnt ( This step would load the module )
3. cd /mnt
4. mkdir CI_dir
5. touch CI_dir/hello_world ( this step would call the inline functions )
6. ls CI_dir/HELLO_WORLD (HELLO_WORLD file should be found if our
case-insensitive fs is working as expected.)


>
> Minor nit:
>
> "It is not necessary to load this large table in the kernel in the
> kernel if no file system is using it"
>
>> UTF-8 encoding loadable by converting it into a module.
>> Also, modify the file called unicode-core which will act as a layer for
>> unicode subsystem. It will load the UTF-8 module and access it's functions
>> whenever any filesystem that needs unicode is mounted.
>>
>> Signed-off-by: Shreeya Patel <[email protected]>
>> ---
>>
>> Changes in v2
>> - Remove the duplicate file utf8-core.c
>> - Make the wrapper functions inline.
>> - Remove msleep and use try_module_get() and module_put()
>> for ensuring that module is loaded correctly and also
>> doesn't get unloaded while in use.
>> fs/unicode/Kconfig | 11 +-
>> fs/unicode/Makefile | 5 +-
>> fs/unicode/unicode-core.c | 229 +++++------------------------------
>> fs/unicode/utf8mod.c | 246 ++++++++++++++++++++++++++++++++++++++
>> include/linux/unicode.h | 73 ++++++++---
>> 5 files changed, 346 insertions(+), 218 deletions(-)
>> create mode 100644 fs/unicode/utf8mod.c
>>
>> diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig
>> index 2c27b9a5cd6c..2961b0206b4d 100644
>> --- a/fs/unicode/Kconfig
>> +++ b/fs/unicode/Kconfig
>> @@ -8,7 +8,16 @@ config UNICODE
>> Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding
>> support.
>>
>> +# UTF-8 encoding can be compiled as a module using UNICODE_UTF8 option.
>> +# Having UTF-8 encoding as a module will avoid carrying large
>> +# database table present in utf8data.h_shipped into the kernel
>> +# by being able to load it only when it is required by the filesystem.
>> +config UNICODE_UTF8
>> + tristate "UTF-8 module"
>> + depends on UNICODE
>> + default m
>> +
>> config UNICODE_NORMALIZATION_SELFTEST
>> tristate "Test UTF-8 normalization support"
>> - depends on UNICODE
>> + depends on UNICODE_UTF8
>> default n
>> diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
>> index fbf9a629ed0d..9dbb04194b32 100644
>> --- a/fs/unicode/Makefile
>> +++ b/fs/unicode/Makefile
>> @@ -1,11 +1,14 @@
>> # SPDX-License-Identifier: GPL-2.0
>>
>> obj-$(CONFIG_UNICODE) += unicode.o
>> +obj-$(CONFIG_UNICODE_UTF8) += utf8.o
>> obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o
>>
>> -unicode-y := utf8-norm.o unicode-core.o
>> +unicode-y := unicode-core.o
>> +utf8-y := utf8mod.o utf8-norm.o
>>
>> $(obj)/utf8-norm.o: $(obj)/utf8data.h
>> +$(obj)/utf8mod.o: $(obj)/utf8-norm.o
>>
>> # In the normal build, the checked-in utf8data.h is just shipped.
>> #
>> diff --git a/fs/unicode/unicode-core.c b/fs/unicode/unicode-core.c
>> index 287a8a48836c..945984a3fe9e 100644
>> --- a/fs/unicode/unicode-core.c
>> +++ b/fs/unicode/unicode-core.c
>> @@ -1,235 +1,60 @@
>> /* SPDX-License-Identifier: GPL-2.0 */
>> #include <linux/module.h>
>> #include <linux/kernel.h>
>> -#include <linux/string.h>
>> #include <linux/slab.h>
>> -#include <linux/parser.h>
>> #include <linux/errno.h>
>> #include <linux/unicode.h>
>> -#include <linux/stringhash.h>
>>
>> -#include "utf8n.h"
>>
>> -int unicode_validate(const struct unicode_map *um, const struct qstr *str)
>> -{
>> - const struct utf8data *data = utf8nfdi(um->version);
>> -
>> - if (utf8nlen(data, str->name, str->len) < 0)
>> - return -1;
>> - return 0;
>> -}
>> -EXPORT_SYMBOL(unicode_validate);
>> -
>> -int unicode_strncmp(const struct unicode_map *um,
>> - const struct qstr *s1, const struct qstr *s2)
>> -{
>> - const struct utf8data *data = utf8nfdi(um->version);
>> - struct utf8cursor cur1, cur2;
>> - int c1, c2;
>> -
>> - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
>> - return -EINVAL;
>> -
>> - if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
>> - return -EINVAL;
>> -
>> - do {
>> - c1 = utf8byte(&cur1);
>> - c2 = utf8byte(&cur2);
>> -
>> - if (c1 < 0 || c2 < 0)
>> - return -EINVAL;
>> - if (c1 != c2)
>> - return 1;
>> - } while (c1);
>> -
>> - return 0;
>> -}
>> -EXPORT_SYMBOL(unicode_strncmp);
>> -
>> -int unicode_strncasecmp(const struct unicode_map *um,
>> - const struct qstr *s1, const struct qstr *s2)
>> -{
>> - const struct utf8data *data = utf8nfdicf(um->version);
>> - struct utf8cursor cur1, cur2;
>> - int c1, c2;
>> +struct unicode_ops *utf8_ops;
>> +EXPORT_SYMBOL(utf8_ops);
>>
>> - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
>> - return -EINVAL;
>> -
>> - if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
>> - return -EINVAL;
>> -
>> - do {
>> - c1 = utf8byte(&cur1);
>> - c2 = utf8byte(&cur2);
>> -
>> - if (c1 < 0 || c2 < 0)
>> - return -EINVAL;
>> - if (c1 != c2)
>> - return 1;
>> - } while (c1);
>> -
>> - return 0;
>> -}
>> -EXPORT_SYMBOL(unicode_strncasecmp);
>> -
>> -/* String cf is expected to be a valid UTF-8 casefolded
>> - * string.
>> - */
>> -int unicode_strncasecmp_folded(const struct unicode_map *um,
>> - const struct qstr *cf,
>> - const struct qstr *s1)
>> +static int unicode_load_module(void)
>> {
>> - const struct utf8data *data = utf8nfdicf(um->version);
>> - struct utf8cursor cur1;
>> - int c1, c2;
>> - int i = 0;
>> -
>> - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
>> - return -EINVAL;
>> -
>> - do {
>> - c1 = utf8byte(&cur1);
>> - c2 = cf->name[i++];
>> - if (c1 < 0)
>> - return -EINVAL;
>> - if (c1 != c2)
>> - return 1;
>> - } while (c1);
>> -
>> - return 0;
>> -}
>> -EXPORT_SYMBOL(unicode_strncasecmp_folded);
>> -
>> -int unicode_casefold(const struct unicode_map *um, const struct qstr *str,
>> - unsigned char *dest, size_t dlen)
>> -{
>> - const struct utf8data *data = utf8nfdicf(um->version);
>> - struct utf8cursor cur;
>> - size_t nlen = 0;
>> -
>> - if (utf8ncursor(&cur, data, str->name, str->len) < 0)
>> - return -EINVAL;
>> + int ret = request_module("utf8");
>>
>> - for (nlen = 0; nlen < dlen; nlen++) {
>> - int c = utf8byte(&cur);
>> -
>> - dest[nlen] = c;
>> - if (!c)
>> - return nlen;
>> - if (c == -1)
>> - break;
>> + if (ret) {
>> + pr_err("Failed to load UTF-8 module\n");
>> + return ret;
>> }
>> - return -EINVAL;
>> -}
>> -EXPORT_SYMBOL(unicode_casefold);
>> -
>> -int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
>> - struct qstr *str)
>> -{
>> - const struct utf8data *data = utf8nfdicf(um->version);
>> - struct utf8cursor cur;
>> - int c;
>> - unsigned long hash = init_name_hash(salt);
>>
>> - if (utf8ncursor(&cur, data, str->name, str->len) < 0)
>> - return -EINVAL;
>> -
>> - while ((c = utf8byte(&cur))) {
>> - if (c < 0)
>> - return -EINVAL;
>> - hash = partial_name_hash((unsigned char)c, hash);
>> - }
>> - str->hash = end_name_hash(hash);
>> return 0;
>> }
>> -EXPORT_SYMBOL(unicode_casefold_hash);
>>
>> -int unicode_normalize(const struct unicode_map *um, const struct qstr *str,
>> - unsigned char *dest, size_t dlen)
>> +struct unicode_map *unicode_load(const char *version)
>> {
>> - const struct utf8data *data = utf8nfdi(um->version);
>> - struct utf8cursor cur;
>> - ssize_t nlen = 0;
>> + int ret = unicode_load_module();
>>
>> - if (utf8ncursor(&cur, data, str->name, str->len) < 0)
>> - return -EINVAL;
>> + if (ret)
>> + return ERR_PTR(ret);
>>
>> - for (nlen = 0; nlen < dlen; nlen++) {
>> - int c = utf8byte(&cur);
>> + if (utf8_ops && !try_module_get(utf8_ops->owner))
>> + return ERR_PTR(-ENODEV);
>>
>> - dest[nlen] = c;
>> - if (!c)
>> - return nlen;
>> - if (c == -1)
>> - break;
>> - }
>> - return -EINVAL;
>> + else
>> + return utf8_ops->load(version);
> This patch is a bit hard to review with the way git separated the
> hunks. I think you should be able to regenerate it in a simpler form
> by tinkering with git-format-patch parameters.


I agree, I will make sure to format it nicely for v3.

>
> Here, it seems that if utf8_ops is NULL (the first condition of the if
> leg), the else leg dereferences a NULL pointer by calling
> utf8_ops->load(), which can't be right.


Ah, right.

> Maybe, the if leg should be:
>
> if (!utf8_ops || !try_module_get(utf8_ops->owner)
> return ERR_PTR(-ENODEV)
>
> But this is still racy, since you are not protecting utf8_ops before
> acquiring the reference. If you race with module removal here, a
> NULL ptr dereference can still occur. See below.


If module is removed before reaching this step, then unicode_unregister
function would make utf8_ops NULL. So the first condition of if will be true
and it will return error so how can we have a NULL ptr dereference then?
>> }
>> -EXPORT_SYMBOL(unicode_normalize);
>> +EXPORT_SYMBOL(unicode_load);
>>
>> -static int unicode_parse_version(const char *version, unsigned int *maj,
>> - unsigned int *min, unsigned int *rev)
>> +void unicode_unload(struct unicode_map *um)
>> {
>> - substring_t args[3];
>> - char version_string[12];
>> - static const struct match_token token[] = {
>> - {1, "%d.%d.%d"},
>> - {0, NULL}
>> - };
>> -
>> - strscpy(version_string, version, sizeof(version_string));
>> -
>> - if (match_token(version_string, token, args) != 1)
>> - return -EINVAL;
>> + if (utf8_ops)
>> + module_put(utf8_ops->owner);
>>
> How can we have a unicode_map to free if utf8_ops is NULL? that seems
> to be an invalid use of API, which suggests a bug elsewhere
> in the kernel. maybe this should read like this:
>
> void unicode_unload(struct unicode_map *um)
> {
> if (WARN_ON(!utf8_ops))
> return;
>
> module_put(utf8_ops->owner);
> kfree(um);
> }


The reason for adding the check if(utf8_ops) is that some of the filesystem
calls the unicode_unload function even before calling the unicode_load
function.
if we try to decrement the reference without even having the reference.
( i.e. not loading the module )
it would result in kernel panic.
fs/ext4/super.c
fs/f2fs/super.c
Both the above files call the unicode_unload function if CONFIG_UNICODE
is enabled.
Not sure if this is an odd behavior or expected.


>> - if (match_int(&args[0], maj) || match_int(&args[1], min) ||
>> - match_int(&args[2], rev))
>> - return -EINVAL;
>> -
>> - return 0;
>> + kfree(um);
>> }
>> +EXPORT_SYMBOL(unicode_unload);
>>
>> -struct unicode_map *unicode_load(const char *version)
>> +void unicode_register(struct unicode_ops *ops)
>> {
>> - struct unicode_map *um = NULL;
>> - int unicode_version;
>> -
>> - if (version) {
>> - unsigned int maj, min, rev;
>> -
>> - if (unicode_parse_version(version, &maj, &min, &rev) < 0)
>> - return ERR_PTR(-EINVAL);
>> -
>> - if (!utf8version_is_supported(maj, min, rev))
>> - return ERR_PTR(-EINVAL);
>> -
>> - unicode_version = UNICODE_AGE(maj, min, rev);
>> - } else {
>> - unicode_version = utf8version_latest();
>> - printk(KERN_WARNING"UTF-8 version not specified. "
>> - "Assuming latest supported version (%d.%d.%d).",
>> - (unicode_version >> 16) & 0xff,
>> - (unicode_version >> 8) & 0xff,
>> - (unicode_version & 0xff));
>> - }
>> -
>> - um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
>> - if (!um)
>> - return ERR_PTR(-ENOMEM);
>> -
>> - um->charset = "UTF-8";
>> - um->version = unicode_version;
>> -
>> - return um;
>> + utf8_ops = ops;
> While we guarantee that utf8_ops isn't modified when a unicode_map is
> in-use by acquiring the module reference, registering/unregistering the
> module should be protected, to avoid that race I mentioned above.
>
>> }
>> -EXPORT_SYMBOL(unicode_load);
>> +EXPORT_SYMBOL(unicode_register);
>>
>> -void unicode_unload(struct unicode_map *um)
>> +void unicode_unregister(void)
>> {
>> - kfree(um);
>> + utf8_ops = NULL;
>> }
>> -EXPORT_SYMBOL(unicode_unload);
>> +EXPORT_SYMBOL(unicode_unregister);
>>
>> MODULE_LICENSE("GPL v2");
>> diff --git a/fs/unicode/utf8mod.c b/fs/unicode/utf8mod.c
>> new file mode 100644
>> index 000000000000..9981960da863
>> --- /dev/null
>> +++ b/fs/unicode/utf8mod.c
> This is a bit nitpicky, but can you follow the naming scheme and call
> this file unicode-utf8.c or maybe utf8-mod.c ?


Sure, will name it as uncide-utf8.c

>

2021-03-19 10:33:59

by Shreeya Patel

[permalink] [raw]
Subject: Re: [PATCH v2 3/4] fs: unicode: Use strscpy() instead of strncpy()


On 19/03/21 2:33 am, Eric Biggers wrote:
> On Thu, Mar 18, 2021 at 07:03:04PM +0530, Shreeya Patel wrote:
>> Following warning was reported by Kernel Test Robot.
>>
>> In function 'utf8_parse_version',
>> inlined from 'utf8_load' at fs/unicode/utf8mod.c:195:7:
>>>> fs/unicode/utf8mod.c:175:2: warning: 'strncpy' specified bound 12 equals
>> destination size [-Wstringop-truncation]
>> 175 | strncpy(version_string, version, sizeof(version_string));
>> | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>>
>> The -Wstringop-truncation warning highlights the unintended
>> uses of the strncpy function that truncate the terminating NULL
>> character from the source string.
>> Unlike strncpy(), strscpy() always null-terminates the destination string,
>> hence use strscpy() instead of strncpy().
>>
>> Signed-off-by: Shreeya Patel <[email protected]>
>> Reported-by: kernel test robot <[email protected]>
>> ---
>> Changes in v2
>> - Resolve warning of -Wstringop-truncation reported by
>> kernel test robot.
>>
>> fs/unicode/unicode-core.c | 2 +-
>> 1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/fs/unicode/unicode-core.c b/fs/unicode/unicode-core.c
>> index d5f09e022ac5..287a8a48836c 100644
>> --- a/fs/unicode/unicode-core.c
>> +++ b/fs/unicode/unicode-core.c
>> @@ -179,7 +179,7 @@ static int unicode_parse_version(const char *version, unsigned int *maj,
>> {0, NULL}
>> };
>>
>> - strncpy(version_string, version, sizeof(version_string));
>> + strscpy(version_string, version, sizeof(version_string));
>>
> Shouldn't unicode_parse_version() return an error if the string gets truncated
> here? I.e. check if strscpy() returns < 0.
>
> Also, this is a "fix" (though one that doesn't currently matter, since 'version'
> is currently always shorter than sizeof(version_string)), so it should go first
> in the series and have a Fixes tag.


Thanks Eric, will send v3 for it.

>
> - Eric

2021-03-19 13:36:23

by Gabriel Krisman Bertazi

[permalink] [raw]
Subject: Re: [PATCH v2 4/4] fs: unicode: Add utf8 module and a unicode layer

Shreeya Patel <[email protected]> writes:

> On 19/03/21 1:27 am, Gabriel Krisman Bertazi wrote:

>> Maybe, the if leg should be:
>>
>> if (!utf8_ops || !try_module_get(utf8_ops->owner)
>> return ERR_PTR(-ENODEV)
>>
>> But this is still racy, since you are not protecting utf8_ops before
>> acquiring the reference. If you race with module removal here, a
>> NULL ptr dereference can still occur. See below.
>
>
> If module is removed before reaching this step, then unicode_unregister
> function would make utf8_ops NULL. So the first condition of if will be true
> and it will return error so how can we have a NULL ptr dereference
> then?

Hi Shreeya,

As we discussed offline, it can happen if the module is deregistered
after checking utf8_ops and before doing the try_module_get.

>>> }
>>> -EXPORT_SYMBOL(unicode_normalize);
>>> +EXPORT_SYMBOL(unicode_load);
>>> -static int unicode_parse_version(const char *version, unsigned int
>>> *maj,
>>> - unsigned int *min, unsigned int *rev)
>>> +void unicode_unload(struct unicode_map *um)
>>> {
>>> - substring_t args[3];
>>> - char version_string[12];
>>> - static const struct match_token token[] = {
>>> - {1, "%d.%d.%d"},
>>> - {0, NULL}
>>> - };
>>> -
>>> - strscpy(version_string, version, sizeof(version_string));
>>> -
>>> - if (match_token(version_string, token, args) != 1)
>>> - return -EINVAL;
>>> + if (utf8_ops)
>>> + module_put(utf8_ops->owner);
>>>
>> How can we have a unicode_map to free if utf8_ops is NULL? that seems
>> to be an invalid use of API, which suggests a bug elsewhere
>> in the kernel. maybe this should read like this:
>>
>> void unicode_unload(struct unicode_map *um)
>> {
>> if (WARN_ON(!utf8_ops))
>> return;
>>
>> module_put(utf8_ops->owner);
>> kfree(um);
>> }
>
>
> The reason for adding the check if(utf8_ops) is that some of the filesystem
> calls the unicode_unload function even before calling the unicode_load
> function.
> if we try to decrement the reference without even having the
> reference. ( i.e. not loading the module )
> it would result in kernel panic.
> fs/ext4/super.c
> fs/f2fs/super.c
> Both the above files call the unicode_unload function if CONFIG_UNICODE
> is enabled.
> Not sure if this is an odd behavior or expected.

Those seem to be error paths, where the mount fails before we get a
chance to load the unicode map. I suggest we fix the callers to avoid
calling the unicode API unnecessarily.

--
Gabriel Krisman Bertazi

2021-03-24 07:23:44

by Shreeya Patel

[permalink] [raw]
Subject: Re: [PATCH v2 4/4] fs: unicode: Add utf8 module and a unicode layer


On 19/03/21 2:35 am, Eric Biggers wrote:
> On Thu, Mar 18, 2021 at 07:03:05PM +0530, Shreeya Patel wrote:
>> +struct unicode_ops {
>> + struct module *owner;
>> + int (*validate)(const struct unicode_map *um, const struct qstr *str);
>> + int (*strncmp)(const struct unicode_map *um, const struct qstr *s1,
>> + const struct qstr *s2);
>> + int (*strncasecmp)(const struct unicode_map *um, const struct qstr *s1,
>> + const struct qstr *s2);
>> + int (*strncasecmp_folded)(const struct unicode_map *um, const struct qstr *cf,
>> + const struct qstr *s1);
>> + int (*normalize)(const struct unicode_map *um, const struct qstr *str,
>> + unsigned char *dest, size_t dlen);
>> + int (*casefold)(const struct unicode_map *um, const struct qstr *str,
>> + unsigned char *dest, size_t dlen);
>> + int (*casefold_hash)(const struct unicode_map *um, const void *salt,
>> + struct qstr *str);
>> + struct unicode_map* (*load)(const char *version);
>> +};
> Indirect calls are expensive these days, especially due to the Spectre
> mitigations. Would it make sense to use static calls
> (include/linux/static_call.h) instead for this?

Hi Eric,

I just sent a v3 with static_call implementation.


>
> - Eric
>