LinuxLists.cc - [PATCH 0/3] Make UTF-8 encoding loadable

2021-03-13 23:14:35

Subject: [PATCH 0/3] Make UTF-8 encoding loadable

utf8data.h_shipped has a large database table which is an auto-generated
decodification trie for the unicode normalization functions and it is not
necessary to carry this large table in the kernel.
The goal is to make the UTF-8 encoding loadable by converting it into a module
and adding a layer between the filesystems and the utf8 module which will
load the module whenever any filesystem that needs unicode is mounted.
Unicode is the subsystem and utf8 is a character encoding for the
subsystem, hence the first two patches in the series rename the functions
and the file name to unicode, to make clearer the difference between the
UTF-8 module and the unicode layer.
Last patch in the series adds the layer and utf8 module.

Shreeya Patel (3):
fs: unicode: Rename function names from utf8 to unicode
fs: unicode: Rename utf8-core file to unicode-core
fs: unicode: Add utf8 module and a unicode layer

fs/ext4/hash.c | 2 +-
fs/ext4/namei.c | 12 +-
fs/ext4/super.c | 6 +-
fs/f2fs/dir.c | 12 +-
fs/f2fs/super.c | 6 +-
fs/libfs.c | 6 +-
fs/unicode/Kconfig | 7 +-
fs/unicode/Makefile | 5 +-
fs/unicode/unicode-core.c | 112 +++++++++++++++++
fs/unicode/utf8-core.c | 248 ++++++++++---------------------------
fs/unicode/utf8-selftest.c | 8 +-
fs/unicode/utf8mod.c | 246 ++++++++++++++++++++++++++++++++++++
include/linux/unicode.h | 52 +++++---
13 files changed, 492 insertions(+), 230 deletions(-)
create mode 100644 fs/unicode/unicode-core.c
create mode 100644 fs/unicode/utf8mod.c

--
2.30.1

2021-03-13 23:16:39

by Shreeya Patel

[permalink] [raw]

Subject: [PATCH 1/3] fs: unicode: Rename function names from utf8 to unicode

Rename the functions from utf8 to unicode as a first step
towards transforming the utf8-core file into the unicode subsystem
layer file.

Signed-off-by: Shreeya Patel <[email protected]>
---
fs/ext4/hash.c | 2 +-
fs/ext4/namei.c | 12 ++++----
fs/ext4/super.c | 6 ++--
fs/f2fs/dir.c | 12 ++++----
fs/f2fs/super.c | 6 ++--
fs/libfs.c | 6 ++--
fs/unicode/utf8-core.c | 57 +++++++++++++++++++-------------------
fs/unicode/utf8-selftest.c | 8 +++---
include/linux/unicode.h | 32 ++++++++++-----------
9 files changed, 70 insertions(+), 71 deletions(-)

diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index a92eb79de0cc..8890a76abe86 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -285,7 +285,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
if (!buff)
return -ENOMEM;

- dlen = utf8_casefold(um, &qstr, buff, PATH_MAX);
+ dlen = unicode_casefold(um, &qstr, buff, PATH_MAX);
if (dlen < 0) {
kfree(buff);
goto opaque_seq;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 686bf982c84e..dde5ce795416 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1290,9 +1290,9 @@ int ext4_ci_compare(const struct inode *parent, const struct qstr *name,
int ret;

if (quick)
- ret = utf8_strncasecmp_folded(um, name, entry);
+ ret = unicode_strncasecmp_folded(um, name, entry);
else
- ret = utf8_strncasecmp(um, name, entry);
+ ret = unicode_strncasecmp(um, name, entry);

if (ret < 0) {
/* Handle invalid character sequence as either an error
@@ -1324,9 +1324,9 @@ void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
if (!cf_name->name)
return;

- len = utf8_casefold(dir->i_sb->s_encoding,
- iname, cf_name->name,
- EXT4_NAME_LEN);
+ len = unicode_casefold(dir->i_sb->s_encoding,
+ iname, cf_name->name,
+ EXT4_NAME_LEN);
if (len <= 0) {
kfree(cf_name->name);
cf_name->name = NULL;
@@ -2201,7 +2201,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,

#ifdef CONFIG_UNICODE
if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) &&
- sb->s_encoding && utf8_validate(sb->s_encoding, &dentry->d_name))
+ sb->s_encoding && unicode_validate(sb->s_encoding, &dentry->d_name))
return -EINVAL;
#endif

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ad34a37278cd..2fb845752c90 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1259,7 +1259,7 @@ static void ext4_put_super(struct super_block *sb)
fs_put_dax(sbi->s_daxdev);
fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
#ifdef CONFIG_UNICODE
- utf8_unload(sb->s_encoding);
+ unicode_unload(sb->s_encoding);
#endif
kfree(sbi);
}
@@ -4304,7 +4304,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}

- encoding = utf8_load(encoding_info->version);
+ encoding = unicode_load(encoding_info->version);
if (IS_ERR(encoding)) {
ext4_msg(sb, KERN_ERR,
"can't mount with superblock charset: %s-%s "
@@ -5165,7 +5165,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
crypto_free_shash(sbi->s_chksum_driver);

#ifdef CONFIG_UNICODE
- utf8_unload(sb->s_encoding);
+ unicode_unload(sb->s_encoding);
#endif

#ifdef CONFIG_QUOTA
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index e6270a867be1..f160f9dd667d 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -84,10 +84,10 @@ int f2fs_init_casefolded_name(const struct inode *dir,
GFP_NOFS);
if (!fname->cf_name.name)
return -ENOMEM;
- fname->cf_name.len = utf8_casefold(sb->s_encoding,
- fname->usr_fname,
- fname->cf_name.name,
- F2FS_NAME_LEN);
+ fname->cf_name.len = unicode_casefold(sb->s_encoding,
+ fname->usr_fname,
+ fname->cf_name.name,
+ F2FS_NAME_LEN);
if ((int)fname->cf_name.len <= 0) {
kfree(fname->cf_name.name);
fname->cf_name.name = NULL;
@@ -237,7 +237,7 @@ static int f2fs_match_ci_name(const struct inode *dir, const struct qstr *name,
entry.len = decrypted_name.len;
}

- res = utf8_strncasecmp_folded(um, name, &entry);
+ res = unicode_strncasecmp_folded(um, name, &entry);
/*
* In strict mode, ignore invalid names. In non-strict mode,
* fall back to treating them as opaque byte sequences.
@@ -246,7 +246,7 @@ static int f2fs_match_ci_name(const struct inode *dir, const struct qstr *name,
res = name->len == entry.len &&
memcmp(name->name, entry.name, name->len) == 0;
} else {
- /* utf8_strncasecmp_folded returns 0 on match */
+ /* unicode_strncasecmp_folded returns 0 on match */
res = (res == 0);
}
out:
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 7069793752f1..b4a92e763e27 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1430,7 +1430,7 @@ static void f2fs_put_super(struct super_block *sb)
for (i = 0; i < NR_PAGE_TYPE; i++)
kvfree(sbi->write_io[i]);
#ifdef CONFIG_UNICODE
- utf8_unload(sb->s_encoding);
+ unicode_unload(sb->s_encoding);
#endif
kfree(sbi);
}
@@ -3560,7 +3560,7 @@ static int f2fs_setup_casefold(struct f2fs_sb_info *sbi)
return -EINVAL;
}

- encoding = utf8_load(encoding_info->version);
+ encoding = unicode_load(encoding_info->version);
if (IS_ERR(encoding)) {
f2fs_err(sbi,
"can't mount with superblock charset: %s-%s "
@@ -4073,7 +4073,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
kvfree(sbi->write_io[i]);

#ifdef CONFIG_UNICODE
- utf8_unload(sb->s_encoding);
+ unicode_unload(sb->s_encoding);
sb->s_encoding = NULL;
#endif
free_options:
diff --git a/fs/libfs.c b/fs/libfs.c
index e2de5401abca..766556165bb5 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1404,7 +1404,7 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
* If the dentry name is stored in-line, then it may be concurrently
* modified by a rename. If this happens, the VFS will eventually retry
* the lookup, so it doesn't matter what ->d_compare() returns.
- * However, it's unsafe to call utf8_strncasecmp() with an unstable
+ * However, it's unsafe to call unicode_strncasecmp() with an unstable
* string. Therefore, we have to copy the name into a temporary buffer.
*/
if (len <= DNAME_INLINE_LEN - 1) {
@@ -1414,7 +1414,7 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
/* prevent compiler from optimizing out the temporary buffer */
barrier();
}
- ret = utf8_strncasecmp(um, name, &qstr);
+ ret = unicode_strncasecmp(um, name, &qstr);
if (ret >= 0)
return ret;

@@ -1443,7 +1443,7 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
if (!dir || !needs_casefold(dir))
return 0;

- ret = utf8_casefold_hash(um, dentry, str);
+ ret = unicode_casefold_hash(um, dentry, str);
if (ret < 0 && sb_has_strict_encoding(sb))
return -EINVAL;
return 0;
diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
index dc25823bfed9..d5f09e022ac5 100644
--- a/fs/unicode/utf8-core.c
+++ b/fs/unicode/utf8-core.c
@@ -10,7 +10,7 @@

#include "utf8n.h"

-int utf8_validate(const struct unicode_map *um, const struct qstr *str)
+int unicode_validate(const struct unicode_map *um, const struct qstr *str)
{
const struct utf8data *data = utf8nfdi(um->version);

@@ -18,10 +18,10 @@ int utf8_validate(const struct unicode_map *um, const struct qstr *str)
return -1;
return 0;
}
-EXPORT_SYMBOL(utf8_validate);
+EXPORT_SYMBOL(unicode_validate);

-int utf8_strncmp(const struct unicode_map *um,
- const struct qstr *s1, const struct qstr *s2)
+int unicode_strncmp(const struct unicode_map *um,
+ const struct qstr *s1, const struct qstr *s2)
{
const struct utf8data *data = utf8nfdi(um->version);
struct utf8cursor cur1, cur2;
@@ -45,10 +45,10 @@ int utf8_strncmp(const struct unicode_map *um,

return 0;
}
-EXPORT_SYMBOL(utf8_strncmp);
+EXPORT_SYMBOL(unicode_strncmp);

-int utf8_strncasecmp(const struct unicode_map *um,
- const struct qstr *s1, const struct qstr *s2)
+int unicode_strncasecmp(const struct unicode_map *um,
+ const struct qstr *s1, const struct qstr *s2)
{
const struct utf8data *data = utf8nfdicf(um->version);
struct utf8cursor cur1, cur2;
@@ -72,14 +72,14 @@ int utf8_strncasecmp(const struct unicode_map *um,

return 0;
}
-EXPORT_SYMBOL(utf8_strncasecmp);
+EXPORT_SYMBOL(unicode_strncasecmp);

/* String cf is expected to be a valid UTF-8 casefolded
* string.
*/
-int utf8_strncasecmp_folded(const struct unicode_map *um,
- const struct qstr *cf,
- const struct qstr *s1)
+int unicode_strncasecmp_folded(const struct unicode_map *um,
+ const struct qstr *cf,
+ const struct qstr *s1)
{
const struct utf8data *data = utf8nfdicf(um->version);
struct utf8cursor cur1;
@@ -100,10 +100,10 @@ int utf8_strncasecmp_folded(const struct unicode_map *um,

return 0;
}
-EXPORT_SYMBOL(utf8_strncasecmp_folded);
+EXPORT_SYMBOL(unicode_strncasecmp_folded);

-int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
- unsigned char *dest, size_t dlen)
+int unicode_casefold(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen)
{
const struct utf8data *data = utf8nfdicf(um->version);
struct utf8cursor cur;
@@ -123,10 +123,10 @@ int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
}
return -EINVAL;
}
-EXPORT_SYMBOL(utf8_casefold);
+EXPORT_SYMBOL(unicode_casefold);

-int utf8_casefold_hash(const struct unicode_map *um, const void *salt,
- struct qstr *str)
+int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
+ struct qstr *str)
{
const struct utf8data *data = utf8nfdicf(um->version);
struct utf8cursor cur;
@@ -144,10 +144,10 @@ int utf8_casefold_hash(const struct unicode_map *um, const void *salt,
str->hash = end_name_hash(hash);
return 0;
}
-EXPORT_SYMBOL(utf8_casefold_hash);
+EXPORT_SYMBOL(unicode_casefold_hash);

-int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
- unsigned char *dest, size_t dlen)
+int unicode_normalize(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen)
{
const struct utf8data *data = utf8nfdi(um->version);
struct utf8cursor cur;
@@ -167,11 +167,10 @@ int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
}
return -EINVAL;
}
+EXPORT_SYMBOL(unicode_normalize);

-EXPORT_SYMBOL(utf8_normalize);
-
-static int utf8_parse_version(const char *version, unsigned int *maj,
- unsigned int *min, unsigned int *rev)
+static int unicode_parse_version(const char *version, unsigned int *maj,
+ unsigned int *min, unsigned int *rev)
{
substring_t args[3];
char version_string[12];
@@ -192,7 +191,7 @@ static int utf8_parse_version(const char *version, unsigned int *maj,
return 0;
}

-struct unicode_map *utf8_load(const char *version)
+struct unicode_map *unicode_load(const char *version)
{
struct unicode_map *um = NULL;
int unicode_version;
@@ -200,7 +199,7 @@ struct unicode_map *utf8_load(const char *version)
if (version) {
unsigned int maj, min, rev;

- if (utf8_parse_version(version, &maj, &min, &rev) < 0)
+ if (unicode_parse_version(version, &maj, &min, &rev) < 0)
return ERR_PTR(-EINVAL);

if (!utf8version_is_supported(maj, min, rev))
@@ -225,12 +224,12 @@ struct unicode_map *utf8_load(const char *version)

return um;
}
-EXPORT_SYMBOL(utf8_load);
+EXPORT_SYMBOL(unicode_load);

-void utf8_unload(struct unicode_map *um)
+void unicode_unload(struct unicode_map *um)
{
kfree(um);
}
-EXPORT_SYMBOL(utf8_unload);
+EXPORT_SYMBOL(unicode_unload);

MODULE_LICENSE("GPL v2");
diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/utf8-selftest.c
index 6fe8af7edccb..796c1ed922ea 100644
--- a/fs/unicode/utf8-selftest.c
+++ b/fs/unicode/utf8-selftest.c
@@ -235,7 +235,7 @@ static void check_utf8_nfdicf(void)
static void check_utf8_comparisons(void)
{
int i;
- struct unicode_map *table = utf8_load("12.1.0");
+ struct unicode_map *table = unicode_load("12.1.0");

if (IS_ERR(table)) {
pr_err("%s: Unable to load utf8 %d.%d.%d. Skipping.\n",
@@ -249,7 +249,7 @@ static void check_utf8_comparisons(void)
const struct qstr s2 = {.name = nfdi_test_data[i].dec,
.len = sizeof(nfdi_test_data[i].dec)};

- test_f(!utf8_strncmp(table, &s1, &s2),
+ test_f(!unicode_strncmp(table, &s1, &s2),
"%s %s comparison mismatch\n", s1.name, s2.name);
}

@@ -259,11 +259,11 @@ static void check_utf8_comparisons(void)
const struct qstr s2 = {.name = nfdicf_test_data[i].ncf,
.len = sizeof(nfdicf_test_data[i].ncf)};

- test_f(!utf8_strncasecmp(table, &s1, &s2),
+ test_f(!unicode_strncasecmp(table, &s1, &s2),
"%s %s comparison mismatch\n", s1.name, s2.name);
}

- utf8_unload(table);
+ unicode_unload(table);
}

static void check_supported_versions(void)
diff --git a/include/linux/unicode.h b/include/linux/unicode.h
index 74484d44c755..de23f9ee720b 100644
--- a/include/linux/unicode.h
+++ b/include/linux/unicode.h
@@ -10,27 +10,27 @@ struct unicode_map {
int version;
};

-int utf8_validate(const struct unicode_map *um, const struct qstr *str);
+int unicode_validate(const struct unicode_map *um, const struct qstr *str);

-int utf8_strncmp(const struct unicode_map *um,
- const struct qstr *s1, const struct qstr *s2);
+int unicode_strncmp(const struct unicode_map *um,
+ const struct qstr *s1, const struct qstr *s2);

-int utf8_strncasecmp(const struct unicode_map *um,
- const struct qstr *s1, const struct qstr *s2);
-int utf8_strncasecmp_folded(const struct unicode_map *um,
- const struct qstr *cf,
- const struct qstr *s1);
+int unicode_strncasecmp(const struct unicode_map *um,
+ const struct qstr *s1, const struct qstr *s2);
+int unicode_strncasecmp_folded(const struct unicode_map *um,
+ const struct qstr *cf,
+ const struct qstr *s1);

-int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
- unsigned char *dest, size_t dlen);
+int unicode_normalize(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen);

-int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
- unsigned char *dest, size_t dlen);
+int unicode_casefold(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen);

-int utf8_casefold_hash(const struct unicode_map *um, const void *salt,
- struct qstr *str);
+int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
+ struct qstr *str);

-struct unicode_map *utf8_load(const char *version);
-void utf8_unload(struct unicode_map *um);
+struct unicode_map *unicode_load(const char *version);
+void unicode_unload(struct unicode_map *um);

#endif /* _LINUX_UNICODE_H */
--
2.30.1

2021-03-13 23:18:35

by Shreeya Patel

[permalink] [raw]

Subject: [PATCH 2/3] fs: unicode: Rename utf8-core file to unicode-core

Rename the file from utf8-core to unicode-core as part of transforming
the utf8-core file into the unicode subsystem layer file, and to make
its role easier to understand.

Signed-off-by: Shreeya Patel <[email protected]>
---
fs/unicode/Makefile | 2 +-
fs/unicode/{utf8-core.c => unicode-core.c} | 0
2 files changed, 1 insertion(+), 1 deletion(-)
rename fs/unicode/{utf8-core.c => unicode-core.c} (100%)

diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
index b88aecc86550..fbf9a629ed0d 100644
--- a/fs/unicode/Makefile
+++ b/fs/unicode/Makefile
@@ -3,7 +3,7 @@
obj-$(CONFIG_UNICODE) += unicode.o
obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o

-unicode-y := utf8-norm.o utf8-core.o
+unicode-y := utf8-norm.o unicode-core.o

$(obj)/utf8-norm.o: $(obj)/utf8data.h

diff --git a/fs/unicode/utf8-core.c b/fs/unicode/unicode-core.c
similarity index 100%
rename from fs/unicode/utf8-core.c
rename to fs/unicode/unicode-core.c
--
2.30.1

2021-03-13 23:23:49

by Shreeya Patel

[permalink] [raw]

Subject: [PATCH 3/3] fs: unicode: Add utf8 module and a unicode layer

utf8data.h_shipped has a large database table which is an auto-generated
decodification trie for the unicode normalization functions.
It is not necessary to carry this large table in the kernel, so make
the UTF-8 encoding loadable by converting it into a module.
Also, modify the file called unicode-core which will act as a layer for
unicode subsystem. It will load the UTF-8 module and access its functions
whenever any filesystem that needs unicode is mounted.

Signed-off-by: Shreeya Patel <[email protected]>
---
fs/unicode/Kconfig | 7 +-
fs/unicode/Makefile | 5 +-
fs/unicode/unicode-core.c | 201 ++++++-------------------------
fs/unicode/utf8-core.c | 112 +++++++++++++++++
fs/unicode/utf8mod.c | 246 ++++++++++++++++++++++++++++++++++++++
include/linux/unicode.h | 20 ++++
6 files changed, 427 insertions(+), 164 deletions(-)
create mode 100644 fs/unicode/utf8-core.c
create mode 100644 fs/unicode/utf8mod.c

diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig
index 2c27b9a5cd6c..33a27deef729 100644
--- a/fs/unicode/Kconfig
+++ b/fs/unicode/Kconfig
@@ -8,7 +8,12 @@ config UNICODE
Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding
support.

+config UNICODE_UTF8
+ tristate "UTF-8 module"
+ depends on UNICODE
+ default m
+
config UNICODE_NORMALIZATION_SELFTEST
tristate "Test UTF-8 normalization support"
- depends on UNICODE
+ depends on UNICODE_UTF8
default n
diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
index fbf9a629ed0d..9dbb04194b32 100644
--- a/fs/unicode/Makefile
+++ b/fs/unicode/Makefile
@@ -1,11 +1,14 @@
# SPDX-License-Identifier: GPL-2.0

obj-$(CONFIG_UNICODE) += unicode.o
+obj-$(CONFIG_UNICODE_UTF8) += utf8.o
obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o

-unicode-y := utf8-norm.o unicode-core.o
+unicode-y := unicode-core.o
+utf8-y := utf8mod.o utf8-norm.o

$(obj)/utf8-norm.o: $(obj)/utf8data.h
+$(obj)/utf8mod.o: $(obj)/utf8-norm.o

# In the normal build, the checked-in utf8data.h is just shipped.
#
diff --git a/fs/unicode/unicode-core.c b/fs/unicode/unicode-core.c
index d5f09e022ac5..b832341f1e7b 100644
--- a/fs/unicode/unicode-core.c
+++ b/fs/unicode/unicode-core.c
@@ -7,70 +7,29 @@
#include <linux/errno.h>
#include <linux/unicode.h>
#include <linux/stringhash.h>
+#include <linux/delay.h>

-#include "utf8n.h"
+struct unicode_ops *utf8_ops;
+
+static int unicode_load_module(void);

int unicode_validate(const struct unicode_map *um, const struct qstr *str)
{
- const struct utf8data *data = utf8nfdi(um->version);
-
- if (utf8nlen(data, str->name, str->len) < 0)
- return -1;
- return 0;
+ return utf8_ops->validate(um, str);
}
EXPORT_SYMBOL(unicode_validate);

int unicode_strncmp(const struct unicode_map *um,
const struct qstr *s1, const struct qstr *s2)
{
- const struct utf8data *data = utf8nfdi(um->version);
- struct utf8cursor cur1, cur2;
- int c1, c2;
-
- if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
- return -EINVAL;
-
- if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
- return -EINVAL;
-
- do {
- c1 = utf8byte(&cur1);
- c2 = utf8byte(&cur2);
-
- if (c1 < 0 || c2 < 0)
- return -EINVAL;
- if (c1 != c2)
- return 1;
- } while (c1);
-
- return 0;
+ return utf8_ops->strncmp(um, s1, s2);
}
EXPORT_SYMBOL(unicode_strncmp);

int unicode_strncasecmp(const struct unicode_map *um,
const struct qstr *s1, const struct qstr *s2)
{
- const struct utf8data *data = utf8nfdicf(um->version);
- struct utf8cursor cur1, cur2;
- int c1, c2;
-
- if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
- return -EINVAL;
-
- if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
- return -EINVAL;
-
- do {
- c1 = utf8byte(&cur1);
- c2 = utf8byte(&cur2);
-
- if (c1 < 0 || c2 < 0)
- return -EINVAL;
- if (c1 != c2)
- return 1;
- } while (c1);
-
- return 0;
+ return utf8_ops->strncasecmp(um, s1, s2);
}
EXPORT_SYMBOL(unicode_strncasecmp);

@@ -81,155 +40,73 @@ int unicode_strncasecmp_folded(const struct unicode_map *um,
const struct qstr *cf,
const struct qstr *s1)
{
- const struct utf8data *data = utf8nfdicf(um->version);
- struct utf8cursor cur1;
- int c1, c2;
- int i = 0;
-
- if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
- return -EINVAL;
-
- do {
- c1 = utf8byte(&cur1);
- c2 = cf->name[i++];
- if (c1 < 0)
- return -EINVAL;
- if (c1 != c2)
- return 1;
- } while (c1);
-
- return 0;
+ return utf8_ops->strncasecmp_folded(um, cf, s1);
}
EXPORT_SYMBOL(unicode_strncasecmp_folded);

int unicode_casefold(const struct unicode_map *um, const struct qstr *str,
unsigned char *dest, size_t dlen)
{
- const struct utf8data *data = utf8nfdicf(um->version);
- struct utf8cursor cur;
- size_t nlen = 0;
-
- if (utf8ncursor(&cur, data, str->name, str->len) < 0)
- return -EINVAL;
-
- for (nlen = 0; nlen < dlen; nlen++) {
- int c = utf8byte(&cur);
-
- dest[nlen] = c;
- if (!c)
- return nlen;
- if (c == -1)
- break;
- }
- return -EINVAL;
+ return utf8_ops->casefold(um, str, dest, dlen);
}
EXPORT_SYMBOL(unicode_casefold);

int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
struct qstr *str)
{
- const struct utf8data *data = utf8nfdicf(um->version);
- struct utf8cursor cur;
- int c;
- unsigned long hash = init_name_hash(salt);
-
- if (utf8ncursor(&cur, data, str->name, str->len) < 0)
- return -EINVAL;
-
- while ((c = utf8byte(&cur))) {
- if (c < 0)
- return -EINVAL;
- hash = partial_name_hash((unsigned char)c, hash);
- }
- str->hash = end_name_hash(hash);
- return 0;
+ return utf8_ops->casefold_hash(um, salt, str);
}
EXPORT_SYMBOL(unicode_casefold_hash);

int unicode_normalize(const struct unicode_map *um, const struct qstr *str,
unsigned char *dest, size_t dlen)
{
- const struct utf8data *data = utf8nfdi(um->version);
- struct utf8cursor cur;
- ssize_t nlen = 0;
+ return utf8_ops->normalize(um, str, dest, dlen);
+}
+EXPORT_SYMBOL(unicode_normalize);

- if (utf8ncursor(&cur, data, str->name, str->len) < 0)
- return -EINVAL;
+struct unicode_map *unicode_load(const char *version)
+{
+ int ret = unicode_load_module();

- for (nlen = 0; nlen < dlen; nlen++) {
- int c = utf8byte(&cur);
+ if (ret)
+ return ERR_PTR(ret);

- dest[nlen] = c;
- if (!c)
- return nlen;
- if (c == -1)
- break;
- }
- return -EINVAL;
+ else
+ return utf8_ops->load(version);
}
-EXPORT_SYMBOL(unicode_normalize);
+EXPORT_SYMBOL(unicode_load);

-static int unicode_parse_version(const char *version, unsigned int *maj,
- unsigned int *min, unsigned int *rev)
+void unicode_unload(struct unicode_map *um)
{
- substring_t args[3];
- char version_string[12];
- static const struct match_token token[] = {
- {1, "%d.%d.%d"},
- {0, NULL}
- };
+ kfree(um);
+}
+EXPORT_SYMBOL(unicode_unload);

- strncpy(version_string, version, sizeof(version_string));
+static int unicode_load_module(void)
+{
+ int ret = request_module("utf8");

- if (match_token(version_string, token, args) != 1)
- return -EINVAL;
+ msleep(100);

- if (match_int(&args[0], maj) || match_int(&args[1], min) ||
- match_int(&args[2], rev))
- return -EINVAL;
+ if (ret) {
+ pr_err("Failed to load UTF-8 module\n");
+ return ret;
+ }

return 0;
}

-struct unicode_map *unicode_load(const char *version)
+void unicode_register(struct unicode_ops *ops)
{
- struct unicode_map *um = NULL;
- int unicode_version;
-
- if (version) {
- unsigned int maj, min, rev;
-
- if (unicode_parse_version(version, &maj, &min, &rev) < 0)
- return ERR_PTR(-EINVAL);
-
- if (!utf8version_is_supported(maj, min, rev))
- return ERR_PTR(-EINVAL);
-
- unicode_version = UNICODE_AGE(maj, min, rev);
- } else {
- unicode_version = utf8version_latest();
- printk(KERN_WARNING"UTF-8 version not specified. "
- "Assuming latest supported version (%d.%d.%d).",
- (unicode_version >> 16) & 0xff,
- (unicode_version >> 8) & 0xff,
- (unicode_version & 0xff));
- }
-
- um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
- if (!um)
- return ERR_PTR(-ENOMEM);
-
- um->charset = "UTF-8";
- um->version = unicode_version;
-
- return um;
+ utf8_ops = ops;
}
-EXPORT_SYMBOL(unicode_load);
+EXPORT_SYMBOL(unicode_register);

-void unicode_unload(struct unicode_map *um)
+void unicode_unregister(void)
{
- kfree(um);
+ utf8_ops = NULL;
}
-EXPORT_SYMBOL(unicode_unload);
+EXPORT_SYMBOL(unicode_unregister);

MODULE_LICENSE("GPL v2");
diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
new file mode 100644
index 000000000000..009faa68330c
--- /dev/null
+++ b/fs/unicode/utf8-core.c
@@ -0,0 +1,112 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/parser.h>
+#include <linux/errno.h>
+#include <linux/unicode.h>
+#include <linux/stringhash.h>
+#include <linux/delay.h>
+
+struct unicode_ops *utf8_ops;
+
+static int unicode_load_module(void);
+
+int unicode_validate(const struct unicode_map *um, const struct qstr *str)
+{
+ return utf8_ops->validate(um, str);
+}
+EXPORT_SYMBOL(unicode_validate);
+
+int unicode_strncmp(const struct unicode_map *um,
+ const struct qstr *s1, const struct qstr *s2)
+{
+ return utf8_ops->strncmp(um, s1, s2);
+}
+EXPORT_SYMBOL(unicode_strncmp);
+
+int unicode_strncasecmp(const struct unicode_map *um,
+ const struct qstr *s1, const struct qstr *s2)
+{
+ return utf8_ops->strncasecmp(um, s1, s2);
+}
+EXPORT_SYMBOL(unicode_strncasecmp);
+
+/* String cf is expected to be a valid UTF-8 casefolded
+ * string.
+ */
+int unicode_strncasecmp_folded(const struct unicode_map *um,
+ const struct qstr *cf,
+ const struct qstr *s1)
+{
+ return utf8_ops->strncasecmp_folded(um, cf, s1);
+}
+EXPORT_SYMBOL(unicode_strncasecmp_folded);
+
+int unicode_casefold(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen)
+{
+ return utf8_ops->casefold(um, str, dest, dlen);
+}
+EXPORT_SYMBOL(unicode_casefold);
+
+int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
+ struct qstr *str)
+{
+ return utf8_ops->casefold_hash(um, salt, str);
+}
+EXPORT_SYMBOL(unicode_casefold_hash);
+
+int unicode_normalize(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen)
+{
+ return utf8_ops->normalize(um, str, dest, dlen);
+}
+EXPORT_SYMBOL(unicode_normalize);
+
+struct unicode_map *unicode_load(const char *version)
+{
+ int ret = unicode_load_module();
+
+ if (ret)
+ return ERR_PTR(ret);
+
+ else
+ return utf8_ops->load(version);
+}
+EXPORT_SYMBOL(unicode_load);
+
+void unicode_unload(struct unicode_map *um)
+{
+ kfree(um);
+}
+EXPORT_SYMBOL(unicode_unload);
+
+void unicode_register(struct unicode_ops *ops)
+{
+ utf8_ops = ops;
+}
+EXPORT_SYMBOL(unicode_register);
+
+void unicode_unregister(void)
+{
+ utf8_ops = NULL;
+}
+EXPORT_SYMBOL(unicode_unregister);
+
+static int unicode_load_module(void)
+{
+ int ret = request_module("utf8");
+
+ msleep(100);
+
+ if (ret) {
+ pr_err("Failed to load UTF-8 module\n");
+ return ret;
+ }
+
+ return 0;
+}
+
+MODULE_LICENSE("GPL v2");
diff --git a/fs/unicode/utf8mod.c b/fs/unicode/utf8mod.c
new file mode 100644
index 000000000000..8eaeeb27255c
--- /dev/null
+++ b/fs/unicode/utf8mod.c
@@ -0,0 +1,246 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/parser.h>
+#include <linux/errno.h>
+#include <linux/unicode.h>
+#include <linux/stringhash.h>
+
+#include "utf8n.h"
+
+static int utf8_validate(const struct unicode_map *um, const struct qstr *str)
+{
+ const struct utf8data *data = utf8nfdi(um->version);
+
+ if (utf8nlen(data, str->name, str->len) < 0)
+ return -1;
+ return 0;
+}
+
+static int utf8_strncmp(const struct unicode_map *um,
+ const struct qstr *s1, const struct qstr *s2)
+{
+ const struct utf8data *data = utf8nfdi(um->version);
+ struct utf8cursor cur1, cur2;
+ int c1, c2;
+
+ if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
+ return -EINVAL;
+
+ if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
+ return -EINVAL;
+
+ do {
+ c1 = utf8byte(&cur1);
+ c2 = utf8byte(&cur2);
+
+ if (c1 < 0 || c2 < 0)
+ return -EINVAL;
+ if (c1 != c2)
+ return 1;
+ } while (c1);
+
+ return 0;
+}
+
+static int utf8_strncasecmp(const struct unicode_map *um,
+ const struct qstr *s1, const struct qstr *s2)
+{
+ const struct utf8data *data = utf8nfdicf(um->version);
+ struct utf8cursor cur1, cur2;
+ int c1, c2;
+
+ if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
+ return -EINVAL;
+
+ if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
+ return -EINVAL;
+
+ do {
+ c1 = utf8byte(&cur1);
+ c2 = utf8byte(&cur2);
+
+ if (c1 < 0 || c2 < 0)
+ return -EINVAL;
+ if (c1 != c2)
+ return 1;
+ } while (c1);
+
+ return 0;
+}
+
+/* String cf is expected to be a valid UTF-8 casefolded
+ * string.
+ */
+static int utf8_strncasecmp_folded(const struct unicode_map *um,
+ const struct qstr *cf,
+ const struct qstr *s1)
+{
+ const struct utf8data *data = utf8nfdicf(um->version);
+ struct utf8cursor cur1;
+ int c1, c2;
+ int i = 0;
+
+ if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
+ return -EINVAL;
+
+ do {
+ c1 = utf8byte(&cur1);
+ c2 = cf->name[i++];
+ if (c1 < 0)
+ return -EINVAL;
+ if (c1 != c2)
+ return 1;
+ } while (c1);
+
+ return 0;
+}
+
+static int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen)
+{
+ const struct utf8data *data = utf8nfdicf(um->version);
+ struct utf8cursor cur;
+ size_t nlen = 0;
+
+ if (utf8ncursor(&cur, data, str->name, str->len) < 0)
+ return -EINVAL;
+
+ for (nlen = 0; nlen < dlen; nlen++) {
+ int c = utf8byte(&cur);
+
+ dest[nlen] = c;
+ if (!c)
+ return nlen;
+ if (c == -1)
+ break;
+ }
+ return -EINVAL;
+}
+
+static int utf8_casefold_hash(const struct unicode_map *um, const void *salt,
+ struct qstr *str)
+{
+ const struct utf8data *data = utf8nfdicf(um->version);
+ struct utf8cursor cur;
+ int c;
+ unsigned long hash = init_name_hash(salt);
+
+ if (utf8ncursor(&cur, data, str->name, str->len) < 0)
+ return -EINVAL;
+
+ while ((c = utf8byte(&cur))) {
+ if (c < 0)
+ return -EINVAL;
+ hash = partial_name_hash((unsigned char)c, hash);
+ }
+ str->hash = end_name_hash(hash);
+ return 0;
+}
+
+static int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen)
+{
+ const struct utf8data *data = utf8nfdi(um->version);
+ struct utf8cursor cur;
+ ssize_t nlen = 0;
+
+ if (utf8ncursor(&cur, data, str->name, str->len) < 0)
+ return -EINVAL;
+
+ for (nlen = 0; nlen < dlen; nlen++) {
+ int c = utf8byte(&cur);
+
+ dest[nlen] = c;
+ if (!c)
+ return nlen;
+ if (c == -1)
+ break;
+ }
+ return -EINVAL;
+}
+
+static int utf8_parse_version(const char *version, unsigned int *maj,
+ unsigned int *min, unsigned int *rev)
+{
+ substring_t args[3];
+ char version_string[12];
+ static const struct match_token token[] = {
+ {1, "%d.%d.%d"},
+ {0, NULL}
+ };
+
+ strncpy(version_string, version, sizeof(version_string));
+
+ if (match_token(version_string, token, args) != 1)
+ return -EINVAL;
+
+ if (match_int(&args[0], maj) || match_int(&args[1], min) ||
+ match_int(&args[2], rev))
+ return -EINVAL;
+
+ return 0;
+}
+
+static struct unicode_map *utf8_load(const char *version)
+{
+ struct unicode_map *um = NULL;
+ int unicode_version;
+
+ if (version) {
+ unsigned int maj, min, rev;
+
+ if (utf8_parse_version(version, &maj, &min, &rev) < 0)
+ return ERR_PTR(-EINVAL);
+
+ if (!utf8version_is_supported(maj, min, rev))
+ return ERR_PTR(-EINVAL);
+
+ unicode_version = UNICODE_AGE(maj, min, rev);
+ } else {
+ unicode_version = utf8version_latest();
+ printk(KERN_WARNING"UTF-8 version not specified. "
+ "Assuming latest supported version (%d.%d.%d).",
+ (unicode_version >> 16) & 0xff,
+ (unicode_version >> 8) & 0xff,
+ (unicode_version & 0xff));
+ }
+
+ um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
+ if (!um)
+ return ERR_PTR(-ENOMEM);
+
+ um->charset = "UTF-8";
+ um->version = unicode_version;
+
+ return um;
+}
+
+static struct unicode_ops ops = {
+ .validate = utf8_validate,
+ .strncmp = utf8_strncmp,
+ .strncasecmp = utf8_strncasecmp,
+ .strncasecmp_folded = utf8_strncasecmp_folded,
+ .casefold = utf8_casefold,
+ .casefold_hash = utf8_casefold_hash,
+ .normalize = utf8_normalize,
+ .load = utf8_load,
+};
+
+static int __init utf8_init(void)
+{
+ unicode_register(&ops);
+ return 0;
+}
+
+static void __exit utf8_exit(void)
+{
+ unicode_unregister();
+}
+
+module_init(utf8_init);
+module_exit(utf8_exit);
+
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/unicode.h b/include/linux/unicode.h
index de23f9ee720b..b0d59069e438 100644
--- a/include/linux/unicode.h
+++ b/include/linux/unicode.h
@@ -10,6 +10,23 @@ struct unicode_map {
int version;
};

+/*
+ * Operations supplied by an encoding module and installed with
+ * unicode_register(). Each member takes the same arguments as the
+ * unicode_*() function of the same name declared in this header.
+ */
+struct unicode_ops {
+	int (*validate)(const struct unicode_map *um, const struct qstr *str);
+	int (*strncmp)(const struct unicode_map *um, const struct qstr *s1,
+		       const struct qstr *s2);
+	int (*strncasecmp)(const struct unicode_map *um, const struct qstr *s1,
+			   const struct qstr *s2);
+	int (*strncasecmp_folded)(const struct unicode_map *um,
+				  const struct qstr *cf,
+				  const struct qstr *s1);
+	int (*normalize)(const struct unicode_map *um, const struct qstr *str,
+			 unsigned char *dest, size_t dlen);
+	int (*casefold)(const struct unicode_map *um, const struct qstr *str,
+			unsigned char *dest, size_t dlen);
+	int (*casefold_hash)(const struct unicode_map *um, const void *salt,
+			     struct qstr *str);
+	/* Returns a new map, or an ERR_PTR() value on failure. */
+	struct unicode_map *(*load)(const char *version);
+};
+
int unicode_validate(const struct unicode_map *um, const struct qstr *str);

int unicode_strncmp(const struct unicode_map *um,
@@ -33,4 +50,7 @@ int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
struct unicode_map *unicode_load(const char *version);
void unicode_unload(struct unicode_map *um);

+void unicode_register(struct unicode_ops *ops);
+void unicode_unregister(void);
+
#endif /* _LINUX_UNICODE_H */
--
2.30.1

2021-03-13 23:25:45

by Shreeya Patel

[permalink] [raw]

Subject: [PATCH 3/3] fs: unicode: Make UTF-8 encoding loadable

2021-03-14 01:36:42

by kernel test robot

[permalink] [raw]

Subject: Re: [PATCH 3/3] fs: unicode: Make UTF-8 encoding loadable

Hi Shreeya,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on ext4/dev]
[also build test WARNING on f2fs/dev-test linux/master linus/master v5.12-rc2 next-20210312]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url: https://github.com/0day-ci/linux/commits/Shreeya-Patel/Make-UTF-8-encoding-loadable/20210314-071604
base: https://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git dev
config: riscv-randconfig-r022-20210314 (attached as .config)
compiler: riscv64-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/0day-ci/linux/commit/85f4765787c386a4b949afaf9721046c0e85955a
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Shreeya-Patel/Make-UTF-8-encoding-loadable/20210314-071604
git checkout 85f4765787c386a4b949afaf9721046c0e85955a
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=riscv

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>

All warnings (new ones prefixed by >>):

In function 'utf8_parse_version',
inlined from 'utf8_load' at fs/unicode/utf8mod.c:195:7:
>> fs/unicode/utf8mod.c:175:2: warning: 'strncpy' specified bound 12 equals destination size [-Wstringop-truncation]
175 | strncpy(version_string, version, sizeof(version_string));
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

vim +/strncpy +175 fs/unicode/utf8mod.c

164
165 static int utf8_parse_version(const char *version, unsigned int *maj,
166 unsigned int *min, unsigned int *rev)
167 {
168 substring_t args[3];
169 char version_string[12];
170 static const struct match_token token[] = {
171 {1, "%d.%d.%d"},
172 {0, NULL}
173 };
174
> 175 strncpy(version_string, version, sizeof(version_string));
176
177 if (match_token(version_string, token, args) != 1)
178 return -EINVAL;
179
180 if (match_int(&args[0], maj) || match_int(&args[1], min) ||
181 match_int(&args[2], rev))
182 return -EINVAL;
183
184 return 0;
185 }
186

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]

Attachments:

(No filename) (2.64 kB)
.config.gz (24.62 kB)
Download all attachments

2021-03-14 02:05:00

by Gabriel Krisman Bertazi

[permalink] [raw]

Subject: Re: [PATCH 3/3] fs: unicode: Add utf8 module and a unicode layer

Shreeya Patel <[email protected]> writes:

> utf8data.h_shipped has a large database table which is an auto-generated
> decodification trie for the unicode normalization functions.
> It is not necessary to carry this large table in the kernel hence make
> UTF-8 encoding loadable by converting it into a module.
> Also, modify the file called unicode-core which will act as a layer for
> unicode subsystem. It will load the UTF-8 module and access its functions
> whenever any filesystem that needs unicode is mounted.
>
> Signed-off-by: Shreeya Patel <[email protected]>

Hi Shreeya,

> ---
> fs/unicode/Kconfig | 7 +-
> fs/unicode/Makefile | 5 +-
> fs/unicode/unicode-core.c | 201 ++++++-------------------------
> fs/unicode/utf8-core.c | 112 +++++++++++++++++
> fs/unicode/utf8mod.c | 246 ++++++++++++++++++++++++++++++++++++++
> include/linux/unicode.h | 20 ++++
> 6 files changed, 427 insertions(+), 164 deletions(-)
> create mode 100644 fs/unicode/utf8-core.c
> create mode 100644 fs/unicode/utf8mod.c
>
> diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig
> index 2c27b9a5cd6c..33a27deef729 100644
> --- a/fs/unicode/Kconfig
> +++ b/fs/unicode/Kconfig
> @@ -8,7 +8,12 @@ config UNICODE
> Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding
> support.
>
> +config UNICODE_UTF8
> + tristate "UTF-8 module"
> + depends on UNICODE
> + default m
> +
> config UNICODE_NORMALIZATION_SELFTEST
> tristate "Test UTF-8 normalization support"
> - depends on UNICODE
> + depends on UNICODE_UTF8
> default n
> diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
> index fbf9a629ed0d..9dbb04194b32 100644
> --- a/fs/unicode/Makefile
> +++ b/fs/unicode/Makefile
> @@ -1,11 +1,14 @@
> # SPDX-License-Identifier: GPL-2.0
>
> obj-$(CONFIG_UNICODE) += unicode.o
> +obj-$(CONFIG_UNICODE_UTF8) += utf8.o
> obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o
>
> -unicode-y := utf8-norm.o unicode-core.o
> +unicode-y := unicode-core.o
> +utf8-y := utf8mod.o utf8-norm.o
>
> $(obj)/utf8-norm.o: $(obj)/utf8data.h
> +$(obj)/utf8mod.o: $(obj)/utf8-norm.o
>
> # In the normal build, the checked-in utf8data.h is just shipped.
> #
> diff --git a/fs/unicode/unicode-core.c b/fs/unicode/unicode-core.c
> index d5f09e022ac5..b832341f1e7b 100644
> --- a/fs/unicode/unicode-core.c
> +++ b/fs/unicode/unicode-core.c
> @@ -7,70 +7,29 @@
> #include <linux/errno.h>
> #include <linux/unicode.h>
> #include <linux/stringhash.h>
> +#include <linux/delay.h>
>
> -#include "utf8n.h"
> +struct unicode_ops *utf8_ops;
> +
> +static int unicode_load_module(void);

This is unnecessary
>
> int unicode_validate(const struct unicode_map *um, const struct qstr *str)
> {
> - const struct utf8data *data = utf8nfdi(um->version);
> -
> - if (utf8nlen(data, str->name, str->len) < 0)
> - return -1;
> - return 0;
> + return utf8_ops->validate(um, str);
> }
> EXPORT_SYMBOL(unicode_validate);
>
> int unicode_strncmp(const struct unicode_map *um,
> const struct qstr *s1, const struct qstr *s2)
> {
> - const struct utf8data *data = utf8nfdi(um->version);
> - struct utf8cursor cur1, cur2;
> - int c1, c2;
> -
> - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
> - return -EINVAL;
> -
> - if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
> - return -EINVAL;
> -
> - do {
> - c1 = utf8byte(&cur1);
> - c2 = utf8byte(&cur2);
> -
> - if (c1 < 0 || c2 < 0)
> - return -EINVAL;
> - if (c1 != c2)
> - return 1;
> - } while (c1);
> -
> - return 0;
> + return utf8_ops->strncmp(um, s1, s2);
> }

I think these should go in a header file and be inlined.

> EXPORT_SYMBOL(unicode_strncmp);
>
> int unicode_strncasecmp(const struct unicode_map *um,
> const struct qstr *s1, const struct qstr *s2)
> {
> - const struct utf8data *data = utf8nfdicf(um->version);
> - struct utf8cursor cur1, cur2;
> - int c1, c2;
> -
> - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
> - return -EINVAL;
> -
> - if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
> - return -EINVAL;
> -
> - do {
> - c1 = utf8byte(&cur1);
> - c2 = utf8byte(&cur2);
> -
> - if (c1 < 0 || c2 < 0)
> - return -EINVAL;
> - if (c1 != c2)
> - return 1;
> - } while (c1);
> -
> - return 0;
> + return utf8_ops->strncasecmp(um, s1, s2);
> }
> EXPORT_SYMBOL(unicode_strncasecmp);
>
> @@ -81,155 +40,73 @@ int unicode_strncasecmp_folded(const struct unicode_map *um,
> const struct qstr *cf,
> const struct qstr *s1)
> {
> - const struct utf8data *data = utf8nfdicf(um->version);
> - struct utf8cursor cur1;
> - int c1, c2;
> - int i = 0;
> -
> - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
> - return -EINVAL;
> -
> - do {
> - c1 = utf8byte(&cur1);
> - c2 = cf->name[i++];
> - if (c1 < 0)
> - return -EINVAL;
> - if (c1 != c2)
> - return 1;
> - } while (c1);
> -
> - return 0;
> + return utf8_ops->strncasecmp_folded(um, cf, s1);
> }
> EXPORT_SYMBOL(unicode_strncasecmp_folded);
>
> int unicode_casefold(const struct unicode_map *um, const struct qstr *str,
> unsigned char *dest, size_t dlen)
> {
> - const struct utf8data *data = utf8nfdicf(um->version);
> - struct utf8cursor cur;
> - size_t nlen = 0;
> -
> - if (utf8ncursor(&cur, data, str->name, str->len) < 0)
> - return -EINVAL;
> -
> - for (nlen = 0; nlen < dlen; nlen++) {
> - int c = utf8byte(&cur);
> -
> - dest[nlen] = c;
> - if (!c)
> - return nlen;
> - if (c == -1)
> - break;
> - }
> - return -EINVAL;
> + return utf8_ops->casefold(um, str, dest, dlen);
> }
> EXPORT_SYMBOL(unicode_casefold);
>
> int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
> struct qstr *str)
> {
> - const struct utf8data *data = utf8nfdicf(um->version);
> - struct utf8cursor cur;
> - int c;
> - unsigned long hash = init_name_hash(salt);
> -
> - if (utf8ncursor(&cur, data, str->name, str->len) < 0)
> - return -EINVAL;
> -
> - while ((c = utf8byte(&cur))) {
> - if (c < 0)
> - return -EINVAL;
> - hash = partial_name_hash((unsigned char)c, hash);
> - }
> - str->hash = end_name_hash(hash);
> - return 0;
> + return utf8_ops->casefold_hash(um, salt, str);
> }
> EXPORT_SYMBOL(unicode_casefold_hash);
>
> int unicode_normalize(const struct unicode_map *um, const struct qstr *str,
> unsigned char *dest, size_t dlen)
> {
> - const struct utf8data *data = utf8nfdi(um->version);
> - struct utf8cursor cur;
> - ssize_t nlen = 0;
> + return utf8_ops->normalize(um, str, dest, dlen);
> +}
> +EXPORT_SYMBOL(unicode_normalize);
>
> - if (utf8ncursor(&cur, data, str->name, str->len) < 0)
> - return -EINVAL;
> +struct unicode_map *unicode_load(const char *version)
> +{
> + int ret = unicode_load_module();
>
> - for (nlen = 0; nlen < dlen; nlen++) {
> - int c = utf8byte(&cur);
> + if (ret)
> + return ERR_PTR(ret);
>
> - dest[nlen] = c;
> - if (!c)
> - return nlen;
> - if (c == -1)
> - break;
> - }
> - return -EINVAL;
> + else
> + return utf8_ops->load(version);
> }
> -EXPORT_SYMBOL(unicode_normalize);
> +EXPORT_SYMBOL(unicode_load);
>
> -static int unicode_parse_version(const char *version, unsigned int *maj,
> - unsigned int *min, unsigned int *rev)
> +void unicode_unload(struct unicode_map *um)
> {
> - substring_t args[3];
> - char version_string[12];
> - static const struct match_token token[] = {
> - {1, "%d.%d.%d"},
> - {0, NULL}
> - };
> + kfree(um);
> +}
> +EXPORT_SYMBOL(unicode_unload);
>
> - strncpy(version_string, version, sizeof(version_string));
> +static int unicode_load_module(void)
> +{
> + int ret = request_module("utf8");
>
> - if (match_token(version_string, token, args) != 1)
> - return -EINVAL;
> + msleep(100);

I think I misunderstood when you mentioned you did this msleep. It was
ok to debug the issue you were observing, but it is not a solution.
Setting an arbitrary amount of time will either waste time, or you can
still fail if things take longer than expected. There are mechanisms to
load and wait on a module. See how fs/nls/nls_base.c does exactly this.

> - if (match_int(&args[0], maj) || match_int(&args[1], min) ||
> - match_int(&args[2], rev))
> - return -EINVAL;
> + if (ret) {
> + pr_err("Failed to load UTF-8 module\n");
> + return ret;
> + }
>
> return 0;
> }
>
> -struct unicode_map *unicode_load(const char *version)
> +void unicode_register(struct unicode_ops *ops)
> {
> - struct unicode_map *um = NULL;
> - int unicode_version;
> -
> - if (version) {
> - unsigned int maj, min, rev;
> -
> - if (unicode_parse_version(version, &maj, &min, &rev) < 0)
> - return ERR_PTR(-EINVAL);
> -
> - if (!utf8version_is_supported(maj, min, rev))
> - return ERR_PTR(-EINVAL);
> -
> - unicode_version = UNICODE_AGE(maj, min, rev);
> - } else {
> - unicode_version = utf8version_latest();
> - printk(KERN_WARNING"UTF-8 version not specified. "
> - "Assuming latest supported version (%d.%d.%d).",
> - (unicode_version >> 16) & 0xff,
> - (unicode_version >> 8) & 0xff,
> - (unicode_version & 0xff));
> - }
> -
> - um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
> - if (!um)
> - return ERR_PTR(-ENOMEM);
> -
> - um->charset = "UTF-8";
> - um->version = unicode_version;
> -
> - return um;
> + utf8_ops = ops;
> }
> -EXPORT_SYMBOL(unicode_load);
> +EXPORT_SYMBOL(unicode_register);
>
> -void unicode_unload(struct unicode_map *um)
> +void unicode_unregister(void)
> {
> - kfree(um);
> + utf8_ops = NULL;
> }
> -EXPORT_SYMBOL(unicode_unload);
> +EXPORT_SYMBOL(unicode_unregister);
>
> MODULE_LICENSE("GPL v2");
> diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
> new file mode 100644
> index 000000000000..009faa68330c
> --- /dev/null
> +++ b/fs/unicode/utf8-core.c
> @@ -0,0 +1,112 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#include <linux/module.h>
> +#include <linux/kernel.h>
> +#include <linux/string.h>
> +#include <linux/slab.h>
> +#include <linux/parser.h>
> +#include <linux/errno.h>
> +#include <linux/unicode.h>
> +#include <linux/stringhash.h>
> +#include <linux/delay.h>
> +
> +struct unicode_ops *utf8_ops;
> +
> +static int unicode_load_module(void);
> +
> +int unicode_validate(const struct unicode_map *um, const struct qstr *str)
> +{
> + return utf8_ops->validate(um, str);
> +}
> +EXPORT_SYMBOL(unicode_validate);
> +
> +int unicode_strncmp(const struct unicode_map *um,
> + const struct qstr *s1, const struct qstr *s2)
> +{
> + return utf8_ops->strncmp(um, s1, s2);
> +}
> +EXPORT_SYMBOL(unicode_strncmp);

I'm confused now. Isn't this redefining unicode_strncmp? It was
defined in unicode-core.c in the hunk above and now it is redefined in
utf8-core.c. There is something odd here.

> +
> +int unicode_strncasecmp(const struct unicode_map *um,
> + const struct qstr *s1, const struct qstr *s2)
> +{
> + return utf8_ops->strncasecmp(um, s1, s2);
> +}
> +EXPORT_SYMBOL(unicode_strncasecmp);
> +
> +/* String cf is expected to be a valid UTF-8 casefolded
> + * string.
> + */
> +int unicode_strncasecmp_folded(const struct unicode_map *um,
> + const struct qstr *cf,
> + const struct qstr *s1)
> +{
> + return utf8_ops->strncasecmp_folded(um, cf, s1);
> +}
> +EXPORT_SYMBOL(unicode_strncasecmp_folded);
> +
> +int unicode_casefold(const struct unicode_map *um, const struct qstr *str,
> + unsigned char *dest, size_t dlen)
> +{
> + return utf8_ops->casefold(um, str, dest, dlen);
> +}
> +EXPORT_SYMBOL(unicode_casefold);
> +
> +int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
> + struct qstr *str)
> +{
> + return utf8_ops->casefold_hash(um, salt, str);
> +}
> +EXPORT_SYMBOL(unicode_casefold_hash);
> +
> +int unicode_normalize(const struct unicode_map *um, const struct qstr *str,
> + unsigned char *dest, size_t dlen)
> +{
> + return utf8_ops->normalize(um, str, dest, dlen);
> +}
> +EXPORT_SYMBOL(unicode_normalize);
> +
> +struct unicode_map *unicode_load(const char *version)
> +{
> + int ret = unicode_load_module();
> +
> + if (ret)
> + return ERR_PTR(ret);
> +
> + else
> + return utf8_ops->load(version);
> +}
> +EXPORT_SYMBOL(unicode_load);
> +
> +void unicode_unload(struct unicode_map *um)
> +{
> + kfree(um);
> +}
> +EXPORT_SYMBOL(unicode_unload);
> +
> +void unicode_register(struct unicode_ops *ops)
> +{
> + utf8_ops = ops;
> +}
> +EXPORT_SYMBOL(unicode_register);
> +
> +void unicode_unregister(void)
> +{
> + utf8_ops = NULL;
> +}
> +EXPORT_SYMBOL(unicode_unregister);
> +
> +static int unicode_load_module(void)
> +{
> + int ret = request_module("utf8");
> +
> + msleep(100);
> +
> + if (ret) {
> + pr_err("Failed to load UTF-8 module\n");
> + return ret;
> + }
> +
> + return 0;
> +}
> +
> +MODULE_LICENSE("GPL v2");
> diff --git a/fs/unicode/utf8mod.c b/fs/unicode/utf8mod.c
> new file mode 100644
> index 000000000000..8eaeeb27255c
> --- /dev/null
> +++ b/fs/unicode/utf8mod.c
> @@ -0,0 +1,246 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#include <linux/module.h>
> +#include <linux/kernel.h>
> +#include <linux/string.h>
> +#include <linux/slab.h>
> +#include <linux/parser.h>
> +#include <linux/errno.h>
> +#include <linux/unicode.h>
> +#include <linux/stringhash.h>
> +
> +#include "utf8n.h"
> +
> +static int utf8_validate(const struct unicode_map *um, const struct qstr *str)
> +{
> + const struct utf8data *data = utf8nfdi(um->version);
> +
> + if (utf8nlen(data, str->name, str->len) < 0)
> + return -1;
> + return 0;
> +}
> +
> +static int utf8_strncmp(const struct unicode_map *um,
> + const struct qstr *s1, const struct qstr *s2)
> +{
> + const struct utf8data *data = utf8nfdi(um->version);
> + struct utf8cursor cur1, cur2;
> + int c1, c2;
> +
> + if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
> + return -EINVAL;
> +
> + if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
> + return -EINVAL;
> +
> + do {
> + c1 = utf8byte(&cur1);
> + c2 = utf8byte(&cur2);
> +
> + if (c1 < 0 || c2 < 0)
> + return -EINVAL;
> + if (c1 != c2)
> + return 1;
> + } while (c1);
> +
> + return 0;
> +}
> +
> +static int utf8_strncasecmp(const struct unicode_map *um,
> + const struct qstr *s1, const struct qstr *s2)
> +{
> + const struct utf8data *data = utf8nfdicf(um->version);
> + struct utf8cursor cur1, cur2;
> + int c1, c2;
> +
> + if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
> + return -EINVAL;
> +
> + if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
> + return -EINVAL;
> +
> + do {
> + c1 = utf8byte(&cur1);
> + c2 = utf8byte(&cur2);
> +
> + if (c1 < 0 || c2 < 0)
> + return -EINVAL;
> + if (c1 != c2)
> + return 1;
> + } while (c1);
> +
> + return 0;
> +}
> +
> +/* String cf is expected to be a valid UTF-8 casefolded
> + * string.
> + */
> +static int utf8_strncasecmp_folded(const struct unicode_map *um,
> + const struct qstr *cf,
> + const struct qstr *s1)
> +{
> + const struct utf8data *data = utf8nfdicf(um->version);
> + struct utf8cursor cur1;
> + int c1, c2;
> + int i = 0;
> +
> + if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
> + return -EINVAL;
> +
> + do {
> + c1 = utf8byte(&cur1);
> + c2 = cf->name[i++];
> + if (c1 < 0)
> + return -EINVAL;
> + if (c1 != c2)
> + return 1;
> + } while (c1);
> +
> + return 0;
> +}
> +
> +static int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
> + unsigned char *dest, size_t dlen)
> +{
> + const struct utf8data *data = utf8nfdicf(um->version);
> + struct utf8cursor cur;
> + size_t nlen = 0;
> +
> + if (utf8ncursor(&cur, data, str->name, str->len) < 0)
> + return -EINVAL;
> +
> + for (nlen = 0; nlen < dlen; nlen++) {
> + int c = utf8byte(&cur);
> +
> + dest[nlen] = c;
> + if (!c)
> + return nlen;
> + if (c == -1)
> + break;
> + }
> + return -EINVAL;
> +}
> +
> +static int utf8_casefold_hash(const struct unicode_map *um, const void *salt,
> + struct qstr *str)
> +{
> + const struct utf8data *data = utf8nfdicf(um->version);
> + struct utf8cursor cur;
> + int c;
> + unsigned long hash = init_name_hash(salt);
> +
> + if (utf8ncursor(&cur, data, str->name, str->len) < 0)
> + return -EINVAL;
> +
> + while ((c = utf8byte(&cur))) {
> + if (c < 0)
> + return -EINVAL;
> + hash = partial_name_hash((unsigned char)c, hash);
> + }
> + str->hash = end_name_hash(hash);
> + return 0;
> +}
> +
> +static int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
> + unsigned char *dest, size_t dlen)
> +{
> + const struct utf8data *data = utf8nfdi(um->version);
> + struct utf8cursor cur;
> + ssize_t nlen = 0;
> +
> + if (utf8ncursor(&cur, data, str->name, str->len) < 0)
> + return -EINVAL;
> +
> + for (nlen = 0; nlen < dlen; nlen++) {
> + int c = utf8byte(&cur);
> +
> + dest[nlen] = c;
> + if (!c)
> + return nlen;
> + if (c == -1)
> + break;
> + }
> + return -EINVAL;
> +}
> +
> +static int utf8_parse_version(const char *version, unsigned int *maj,
> + unsigned int *min, unsigned int *rev)
> +{
> + substring_t args[3];
> + char version_string[12];
> + static const struct match_token token[] = {
> + {1, "%d.%d.%d"},
> + {0, NULL}
> + };
> +
> + strncpy(version_string, version, sizeof(version_string));
> +
> + if (match_token(version_string, token, args) != 1)
> + return -EINVAL;
> +
> + if (match_int(&args[0], maj) || match_int(&args[1], min) ||
> + match_int(&args[2], rev))
> + return -EINVAL;
> +
> + return 0;
> +}
> +
> +static struct unicode_map *utf8_load(const char *version)
> +{
> + struct unicode_map *um = NULL;
> + int unicode_version;
> +
> + if (version) {
> + unsigned int maj, min, rev;
> +
> + if (utf8_parse_version(version, &maj, &min, &rev) < 0)
> + return ERR_PTR(-EINVAL);
> +
> + if (!utf8version_is_supported(maj, min, rev))
> + return ERR_PTR(-EINVAL);
> +
> + unicode_version = UNICODE_AGE(maj, min, rev);
> + } else {
> + unicode_version = utf8version_latest();
> + printk(KERN_WARNING"UTF-8 version not specified. "
> + "Assuming latest supported version (%d.%d.%d).",
> + (unicode_version >> 16) & 0xff,
> + (unicode_version >> 8) & 0xff,
> + (unicode_version & 0xff));
> + }
> +
> + um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
> + if (!um)
> + return ERR_PTR(-ENOMEM);
> +
> + um->charset = "UTF-8";
> + um->version = unicode_version;
> +
> + return um;
> +}
> +
> +static struct unicode_ops ops = {
> + .validate = utf8_validate,
> + .strncmp = utf8_strncmp,
> + .strncasecmp = utf8_strncasecmp,
> + .strncasecmp_folded = utf8_strncasecmp_folded,
> + .casefold = utf8_casefold,
> + .casefold_hash = utf8_casefold_hash,
> + .normalize = utf8_normalize,
> + .load = utf8_load,
> +};
> +
> +static int __init utf8_init(void)
> +{
> + unicode_register(&ops);
> + return 0;
> +}
> +
> +static void __exit utf8_exit(void)
> +{
> + unicode_unregister();
> +}
> +
> +module_init(utf8_init);
> +module_exit(utf8_exit);
> +
> +MODULE_LICENSE("GPL v2");
> diff --git a/include/linux/unicode.h b/include/linux/unicode.h
> index de23f9ee720b..b0d59069e438 100644
> --- a/include/linux/unicode.h
> +++ b/include/linux/unicode.h
> @@ -10,6 +10,23 @@ struct unicode_map {
> int version;
> };
>
> +struct unicode_ops {
> + int (*validate)(const struct unicode_map *um, const struct qstr *str);
> + int (*strncmp)(const struct unicode_map *um, const struct qstr *s1,
> + const struct qstr *s2);
> + int (*strncasecmp)(const struct unicode_map *um, const struct qstr *s1,
> + const struct qstr *s2);
> + int (*strncasecmp_folded)(const struct unicode_map *um, const struct qstr *cf,
> + const struct qstr *s1);
> + int (*normalize)(const struct unicode_map *um, const struct qstr *str,
> + unsigned char *dest, size_t dlen);
> + int (*casefold)(const struct unicode_map *um, const struct qstr *str,
> + unsigned char *dest, size_t dlen);
> + int (*casefold_hash)(const struct unicode_map *um, const void *salt,
> + struct qstr *str);
> + struct unicode_map* (*load)(const char *version);
> +};

Also, make sure you run checkpatch.pl on the patch series before
submitting.

> +
> int unicode_validate(const struct unicode_map *um, const struct qstr *str);
>
> int unicode_strncmp(const struct unicode_map *um,
> @@ -33,4 +50,7 @@ int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
> struct unicode_map *unicode_load(const char *version);
> void unicode_unload(struct unicode_map *um);
>
> +void unicode_register(struct unicode_ops *ops);
> +void unicode_unregister(void);
> +
> #endif /* _LINUX_UNICODE_H */

--
Gabriel Krisman Bertazi

2021-03-14 14:51:31

by Shreeya Patel

[permalink] [raw]

Subject: Re: [PATCH 3/3] fs: unicode: Add utf8 module and a unicode layer

On 14/03/21 7:19 am, Gabriel Krisman Bertazi wrote:
> Shreeya Patel <[email protected]> writes:
>
>> utf8data.h_shipped has a large database table which is an auto-generated
>> decodification trie for the unicode normalization functions.
>> It is not necessary to carry this large table in the kernel hence make
>> UTF-8 encoding loadable by converting it into a module.
>> Also, modify the file called unicode-core which will act as a layer for
>> unicode subsystem. It will load the UTF-8 module and access its functions
>> whenever any filesystem that needs unicode is mounted.
>>
>> Signed-off-by: Shreeya Patel <[email protected]>
> Hi Shreeya,
Hi Gabriel,
>
>> ---
>> fs/unicode/Kconfig | 7 +-
>> fs/unicode/Makefile | 5 +-
>> fs/unicode/unicode-core.c | 201 ++++++-------------------------
>> fs/unicode/utf8-core.c | 112 +++++++++++++++++
>> fs/unicode/utf8mod.c | 246 ++++++++++++++++++++++++++++++++++++++
>> include/linux/unicode.h | 20 ++++
>> 6 files changed, 427 insertions(+), 164 deletions(-)
>> create mode 100644 fs/unicode/utf8-core.c
>> create mode 100644 fs/unicode/utf8mod.c
>>
>> diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig
>> index 2c27b9a5cd6c..33a27deef729 100644
>> --- a/fs/unicode/Kconfig
>> +++ b/fs/unicode/Kconfig
>> @@ -8,7 +8,12 @@ config UNICODE
>> Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding
>> support.
>>
>> +config UNICODE_UTF8
>> + tristate "UTF-8 module"
>> + depends on UNICODE
>> + default m
>> +
>> config UNICODE_NORMALIZATION_SELFTEST
>> tristate "Test UTF-8 normalization support"
>> - depends on UNICODE
>> + depends on UNICODE_UTF8
>> default n
>> diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
>> index fbf9a629ed0d..9dbb04194b32 100644
>> --- a/fs/unicode/Makefile
>> +++ b/fs/unicode/Makefile
>> @@ -1,11 +1,14 @@
>> # SPDX-License-Identifier: GPL-2.0
>>
>> obj-$(CONFIG_UNICODE) += unicode.o
>> +obj-$(CONFIG_UNICODE_UTF8) += utf8.o
>> obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o
>>
>> -unicode-y := utf8-norm.o unicode-core.o
>> +unicode-y := unicode-core.o
>> +utf8-y := utf8mod.o utf8-norm.o
>>
>> $(obj)/utf8-norm.o: $(obj)/utf8data.h
>> +$(obj)/utf8mod.o: $(obj)/utf8-norm.o
>>
>> # In the normal build, the checked-in utf8data.h is just shipped.
>> #
>> diff --git a/fs/unicode/unicode-core.c b/fs/unicode/unicode-core.c
>> index d5f09e022ac5..b832341f1e7b 100644
>> --- a/fs/unicode/unicode-core.c
>> +++ b/fs/unicode/unicode-core.c
>> @@ -7,70 +7,29 @@
>> #include <linux/errno.h>
>> #include <linux/unicode.h>
>> #include <linux/stringhash.h>
>> +#include <linux/delay.h>
>>
>> -#include "utf8n.h"
>> +struct unicode_ops *utf8_ops;
>> +
>> +static int unicode_load_module(void);
> This is unnecessary
>>
>> int unicode_validate(const struct unicode_map *um, const struct qstr *str)
>> {
>> - const struct utf8data *data = utf8nfdi(um->version);
>> -
>> - if (utf8nlen(data, str->name, str->len) < 0)
>> - return -1;
>> - return 0;
>> + return utf8_ops->validate(um, str);
>> }
>> EXPORT_SYMBOL(unicode_validate);
>>
>> int unicode_strncmp(const struct unicode_map *um,
>> const struct qstr *s1, const struct qstr *s2)
>> {
>> - const struct utf8data *data = utf8nfdi(um->version);
>> - struct utf8cursor cur1, cur2;
>> - int c1, c2;
>> -
>> - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
>> - return -EINVAL;
>> -
>> - if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
>> - return -EINVAL;
>> -
>> - do {
>> - c1 = utf8byte(&cur1);
>> - c2 = utf8byte(&cur2);
>> -
>> - if (c1 < 0 || c2 < 0)
>> - return -EINVAL;
>> - if (c1 != c2)
>> - return 1;
>> - } while (c1);
>> -
>> - return 0;
>> + return utf8_ops->strncmp(um, s1, s2);
>> }
> I think these should go in a header file and be inlined.
>
>> EXPORT_SYMBOL(unicode_strncmp);
>>
>> int unicode_strncasecmp(const struct unicode_map *um,
>> const struct qstr *s1, const struct qstr *s2)
>> {
>> - const struct utf8data *data = utf8nfdicf(um->version);
>> - struct utf8cursor cur1, cur2;
>> - int c1, c2;
>> -
>> - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
>> - return -EINVAL;
>> -
>> - if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
>> - return -EINVAL;
>> -
>> - do {
>> - c1 = utf8byte(&cur1);
>> - c2 = utf8byte(&cur2);
>> -
>> - if (c1 < 0 || c2 < 0)
>> - return -EINVAL;
>> - if (c1 != c2)
>> - return 1;
>> - } while (c1);
>> -
>> - return 0;
>> + return utf8_ops->strncasecmp(um, s1, s2);
>> }
>> EXPORT_SYMBOL(unicode_strncasecmp);
>>
>> @@ -81,155 +40,73 @@ int unicode_strncasecmp_folded(const struct unicode_map *um,
>> const struct qstr *cf,
>> const struct qstr *s1)
>> {
>> - const struct utf8data *data = utf8nfdicf(um->version);
>> - struct utf8cursor cur1;
>> - int c1, c2;
>> - int i = 0;
>> -
>> - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
>> - return -EINVAL;
>> -
>> - do {
>> - c1 = utf8byte(&cur1);
>> - c2 = cf->name[i++];
>> - if (c1 < 0)
>> - return -EINVAL;
>> - if (c1 != c2)
>> - return 1;
>> - } while (c1);
>> -
>> - return 0;
>> + return utf8_ops->strncasecmp_folded(um, cf, s1);
>> }
>> EXPORT_SYMBOL(unicode_strncasecmp_folded);
>>
>> int unicode_casefold(const struct unicode_map *um, const struct qstr *str,
>> unsigned char *dest, size_t dlen)
>> {
>> - const struct utf8data *data = utf8nfdicf(um->version);
>> - struct utf8cursor cur;
>> - size_t nlen = 0;
>> -
>> - if (utf8ncursor(&cur, data, str->name, str->len) < 0)
>> - return -EINVAL;
>> -
>> - for (nlen = 0; nlen < dlen; nlen++) {
>> - int c = utf8byte(&cur);
>> -
>> - dest[nlen] = c;
>> - if (!c)
>> - return nlen;
>> - if (c == -1)
>> - break;
>> - }
>> - return -EINVAL;
>> + return utf8_ops->casefold(um, str, dest, dlen);
>> }
>> EXPORT_SYMBOL(unicode_casefold);
>>
>> int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
>> struct qstr *str)
>> {
>> - const struct utf8data *data = utf8nfdicf(um->version);
>> - struct utf8cursor cur;
>> - int c;
>> - unsigned long hash = init_name_hash(salt);
>> -
>> - if (utf8ncursor(&cur, data, str->name, str->len) < 0)
>> - return -EINVAL;
>> -
>> - while ((c = utf8byte(&cur))) {
>> - if (c < 0)
>> - return -EINVAL;
>> - hash = partial_name_hash((unsigned char)c, hash);
>> - }
>> - str->hash = end_name_hash(hash);
>> - return 0;
>> + return utf8_ops->casefold_hash(um, salt, str);
>> }
>> EXPORT_SYMBOL(unicode_casefold_hash);
>>
>> int unicode_normalize(const struct unicode_map *um, const struct qstr *str,
>> unsigned char *dest, size_t dlen)
>> {
>> - const struct utf8data *data = utf8nfdi(um->version);
>> - struct utf8cursor cur;
>> - ssize_t nlen = 0;
>> + return utf8_ops->normalize(um, str, dest, dlen);
>> +}
>> +EXPORT_SYMBOL(unicode_normalize);
>>
>> - if (utf8ncursor(&cur, data, str->name, str->len) < 0)
>> - return -EINVAL;
>> +struct unicode_map *unicode_load(const char *version)
>> +{
>> + int ret = unicode_load_module();
>>
>> - for (nlen = 0; nlen < dlen; nlen++) {
>> - int c = utf8byte(&cur);
>> + if (ret)
>> + return ERR_PTR(ret);
>>
>> - dest[nlen] = c;
>> - if (!c)
>> - return nlen;
>> - if (c == -1)
>> - break;
>> - }
>> - return -EINVAL;
>> + else
>> + return utf8_ops->load(version);
>> }
>> -EXPORT_SYMBOL(unicode_normalize);
>> +EXPORT_SYMBOL(unicode_load);
>>
>> -static int unicode_parse_version(const char *version, unsigned int *maj,
>> - unsigned int *min, unsigned int *rev)
>> +void unicode_unload(struct unicode_map *um)
>> {
>> - substring_t args[3];
>> - char version_string[12];
>> - static const struct match_token token[] = {
>> - {1, "%d.%d.%d"},
>> - {0, NULL}
>> - };
>> + kfree(um);
>> +}
>> +EXPORT_SYMBOL(unicode_unload);
>>
>> - strncpy(version_string, version, sizeof(version_string));
>> +static int unicode_load_module(void)
>> +{
>> + int ret = request_module("utf8");
>>
>> - if (match_token(version_string, token, args) != 1)
>> - return -EINVAL;
>> + msleep(100);
> I think I misunderstood when you mentioned you did this msleep. It was
> ok to debug the issue you were observing, but it is not a solution.
> Setting an arbitrary amount of time will either waste time, or you can
> still fail if things take longer than expected. There are mechanisms to
> load and wait on a module. See how fs/nls/nls_base.c does exactly this.
>
>> - if (match_int(&args[0], maj) || match_int(&args[1], min) ||
>> - match_int(&args[2], rev))
>> - return -EINVAL;
>> + if (ret) {
>> + pr_err("Failed to load UTF-8 module\n");
>> + return ret;
>> + }
>>
>> return 0;
>> }
>>
>> -struct unicode_map *unicode_load(const char *version)
>> +void unicode_register(struct unicode_ops *ops)
>> {
>> - struct unicode_map *um = NULL;
>> - int unicode_version;
>> -
>> - if (version) {
>> - unsigned int maj, min, rev;
>> -
>> - if (unicode_parse_version(version, &maj, &min, &rev) < 0)
>> - return ERR_PTR(-EINVAL);
>> -
>> - if (!utf8version_is_supported(maj, min, rev))
>> - return ERR_PTR(-EINVAL);
>> -
>> - unicode_version = UNICODE_AGE(maj, min, rev);
>> - } else {
>> - unicode_version = utf8version_latest();
>> - printk(KERN_WARNING"UTF-8 version not specified. "
>> - "Assuming latest supported version (%d.%d.%d).",
>> - (unicode_version >> 16) & 0xff,
>> - (unicode_version >> 8) & 0xff,
>> - (unicode_version & 0xff));
>> - }
>> -
>> - um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
>> - if (!um)
>> - return ERR_PTR(-ENOMEM);
>> -
>> - um->charset = "UTF-8";
>> - um->version = unicode_version;
>> -
>> - return um;
>> + utf8_ops = ops;
>> }
>> -EXPORT_SYMBOL(unicode_load);
>> +EXPORT_SYMBOL(unicode_register);
>>
>> -void unicode_unload(struct unicode_map *um)
>> +void unicode_unregister(void)
>> {
>> - kfree(um);
>> + utf8_ops = NULL;
>> }
>> -EXPORT_SYMBOL(unicode_unload);
>> +EXPORT_SYMBOL(unicode_unregister);
>>
>> MODULE_LICENSE("GPL v2");
>> diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
>> new file mode 100644
>> index 000000000000..009faa68330c
>> --- /dev/null
>> +++ b/fs/unicode/utf8-core.c
>> @@ -0,0 +1,112 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +#include <linux/module.h>
>> +#include <linux/kernel.h>
>> +#include <linux/string.h>
>> +#include <linux/slab.h>
>> +#include <linux/parser.h>
>> +#include <linux/errno.h>
>> +#include <linux/unicode.h>
>> +#include <linux/stringhash.h>
>> +#include <linux/delay.h>
>> +
>> +struct unicode_ops *utf8_ops;
>> +
>> +static int unicode_load_module(void);
>> +
>> +int unicode_validate(const struct unicode_map *um, const struct qstr *str)
>> +{
>> + return utf8_ops->validate(um, str);
>> +}
>> +EXPORT_SYMBOL(unicode_validate);
>> +
>> +int unicode_strncmp(const struct unicode_map *um,
>> + const struct qstr *s1, const struct qstr *s2)
>> +{
>> + return utf8_ops->strncmp(um, s1, s2);
>> +}
>> +EXPORT_SYMBOL(unicode_strncmp);
> I'm confused now. Isn't this redefining unicode_strncmp? It was
> defined in unicode-core.c in the hunk above and now it is redefined in
> utf8-core.c. There is something odd here.
Sorry, I think I messed up the patches while using git send-email, and
that is why you might see
two copies of the last patch. Let me resend the series and then it might
make sense. One question,
though: why would unicode_strncmp go into the header file?
>
>> +
>> +int unicode_strncasecmp(const struct unicode_map *um,
>> + const struct qstr *s1, const struct qstr *s2)
>> +{
>> + return utf8_ops->strncasecmp(um, s1, s2);
>> +}
>> +EXPORT_SYMBOL(unicode_strncasecmp);
>> +
>> +/* String cf is expected to be a valid UTF-8 casefolded
>> + * string.
>> + */
>> +int unicode_strncasecmp_folded(const struct unicode_map *um,
>> + const struct qstr *cf,
>> + const struct qstr *s1)
>> +{
>> + return utf8_ops->strncasecmp_folded(um, cf, s1);
>> +}
>> +EXPORT_SYMBOL(unicode_strncasecmp_folded);
>> +
>> +int unicode_casefold(const struct unicode_map *um, const struct qstr *str,
>> + unsigned char *dest, size_t dlen)
>> +{
>> + return utf8_ops->casefold(um, str, dest, dlen);
>> +}
>> +EXPORT_SYMBOL(unicode_casefold);
>> +
>> +int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
>> + struct qstr *str)
>> +{
>> + return utf8_ops->casefold_hash(um, salt, str);
>> +}
>> +EXPORT_SYMBOL(unicode_casefold_hash);
>> +
>> +int unicode_normalize(const struct unicode_map *um, const struct qstr *str,
>> + unsigned char *dest, size_t dlen)
>> +{
>> + return utf8_ops->normalize(um, str, dest, dlen);
>> +}
>> +EXPORT_SYMBOL(unicode_normalize);
>> +
>> +struct unicode_map *unicode_load(const char *version)
>> +{
>> + int ret = unicode_load_module();
>> +
>> + if (ret)
>> + return ERR_PTR(ret);
>> +
>> + else
>> + return utf8_ops->load(version);
>> +}
>> +EXPORT_SYMBOL(unicode_load);
>> +
>> +void unicode_unload(struct unicode_map *um)
>> +{
>> + kfree(um);
>> +}
>> +EXPORT_SYMBOL(unicode_unload);
>> +
>> +void unicode_register(struct unicode_ops *ops)
>> +{
>> + utf8_ops = ops;
>> +}
>> +EXPORT_SYMBOL(unicode_register);
>> +
>> +void unicode_unregister(void)
>> +{
>> + utf8_ops = NULL;
>> +}
>> +EXPORT_SYMBOL(unicode_unregister);
>> +
>> +static int unicode_load_module(void)
>> +{
>> + int ret = request_module("utf8");
>> +
>> + msleep(100);
>> +
>> + if (ret) {
>> + pr_err("Failed to load UTF-8 module\n");
>> + return ret;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +MODULE_LICENSE("GPL v2");
>> diff --git a/fs/unicode/utf8mod.c b/fs/unicode/utf8mod.c
>> new file mode 100644
>> index 000000000000..8eaeeb27255c
>> --- /dev/null
>> +++ b/fs/unicode/utf8mod.c
>> @@ -0,0 +1,246 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +#include <linux/module.h>
>> +#include <linux/kernel.h>
>> +#include <linux/string.h>
>> +#include <linux/slab.h>
>> +#include <linux/parser.h>
>> +#include <linux/errno.h>
>> +#include <linux/unicode.h>
>> +#include <linux/stringhash.h>
>> +
>> +#include "utf8n.h"
>> +
>> +static int utf8_validate(const struct unicode_map *um, const struct qstr *str)
>> +{
>> + const struct utf8data *data = utf8nfdi(um->version);
>> +
>> + if (utf8nlen(data, str->name, str->len) < 0)
>> + return -1;
>> + return 0;
>> +}
>> +
>> +static int utf8_strncmp(const struct unicode_map *um,
>> + const struct qstr *s1, const struct qstr *s2)
>> +{
>> + const struct utf8data *data = utf8nfdi(um->version);
>> + struct utf8cursor cur1, cur2;
>> + int c1, c2;
>> +
>> + if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
>> + return -EINVAL;
>> +
>> + if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
>> + return -EINVAL;
>> +
>> + do {
>> + c1 = utf8byte(&cur1);
>> + c2 = utf8byte(&cur2);
>> +
>> + if (c1 < 0 || c2 < 0)
>> + return -EINVAL;
>> + if (c1 != c2)
>> + return 1;
>> + } while (c1);
>> +
>> + return 0;
>> +}
>> +
>> +static int utf8_strncasecmp(const struct unicode_map *um,
>> + const struct qstr *s1, const struct qstr *s2)
>> +{
>> + const struct utf8data *data = utf8nfdicf(um->version);
>> + struct utf8cursor cur1, cur2;
>> + int c1, c2;
>> +
>> + if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
>> + return -EINVAL;
>> +
>> + if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
>> + return -EINVAL;
>> +
>> + do {
>> + c1 = utf8byte(&cur1);
>> + c2 = utf8byte(&cur2);
>> +
>> + if (c1 < 0 || c2 < 0)
>> + return -EINVAL;
>> + if (c1 != c2)
>> + return 1;
>> + } while (c1);
>> +
>> + return 0;
>> +}
>> +
>> +/* String cf is expected to be a valid UTF-8 casefolded
>> + * string.
>> + */
>> +static int utf8_strncasecmp_folded(const struct unicode_map *um,
>> + const struct qstr *cf,
>> + const struct qstr *s1)
>> +{
>> + const struct utf8data *data = utf8nfdicf(um->version);
>> + struct utf8cursor cur1;
>> + int c1, c2;
>> + int i = 0;
>> +
>> + if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
>> + return -EINVAL;
>> +
>> + do {
>> + c1 = utf8byte(&cur1);
>> + c2 = cf->name[i++];
>> + if (c1 < 0)
>> + return -EINVAL;
>> + if (c1 != c2)
>> + return 1;
>> + } while (c1);
>> +
>> + return 0;
>> +}
>> +
>> +static int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
>> + unsigned char *dest, size_t dlen)
>> +{
>> + const struct utf8data *data = utf8nfdicf(um->version);
>> + struct utf8cursor cur;
>> + size_t nlen = 0;
>> +
>> + if (utf8ncursor(&cur, data, str->name, str->len) < 0)
>> + return -EINVAL;
>> +
>> + for (nlen = 0; nlen < dlen; nlen++) {
>> + int c = utf8byte(&cur);
>> +
>> + dest[nlen] = c;
>> + if (!c)
>> + return nlen;
>> + if (c == -1)
>> + break;
>> + }
>> + return -EINVAL;
>> +}
>> +
>> +static int utf8_casefold_hash(const struct unicode_map *um, const void *salt,
>> + struct qstr *str)
>> +{
>> + const struct utf8data *data = utf8nfdicf(um->version);
>> + struct utf8cursor cur;
>> + int c;
>> + unsigned long hash = init_name_hash(salt);
>> +
>> + if (utf8ncursor(&cur, data, str->name, str->len) < 0)
>> + return -EINVAL;
>> +
>> + while ((c = utf8byte(&cur))) {
>> + if (c < 0)
>> + return -EINVAL;
>> + hash = partial_name_hash((unsigned char)c, hash);
>> + }
>> + str->hash = end_name_hash(hash);
>> + return 0;
>> +}
>> +
>> +static int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
>> + unsigned char *dest, size_t dlen)
>> +{
>> + const struct utf8data *data = utf8nfdi(um->version);
>> + struct utf8cursor cur;
>> + ssize_t nlen = 0;
>> +
>> + if (utf8ncursor(&cur, data, str->name, str->len) < 0)
>> + return -EINVAL;
>> +
>> + for (nlen = 0; nlen < dlen; nlen++) {
>> + int c = utf8byte(&cur);
>> +
>> + dest[nlen] = c;
>> + if (!c)
>> + return nlen;
>> + if (c == -1)
>> + break;
>> + }
>> + return -EINVAL;
>> +}
>> +
>> +static int utf8_parse_version(const char *version, unsigned int *maj,
>> + unsigned int *min, unsigned int *rev)
>> +{
>> + substring_t args[3];
>> + char version_string[12];
>> + static const struct match_token token[] = {
>> + {1, "%d.%d.%d"},
>> + {0, NULL}
>> + };
>> +
>> + strncpy(version_string, version, sizeof(version_string));
>> +
>> + if (match_token(version_string, token, args) != 1)
>> + return -EINVAL;
>> +
>> + if (match_int(&args[0], maj) || match_int(&args[1], min) ||
>> + match_int(&args[2], rev))
>> + return -EINVAL;
>> +
>> + return 0;
>> +}
>> +
>> +static struct unicode_map *utf8_load(const char *version)
>> +{
>> + struct unicode_map *um = NULL;
>> + int unicode_version;
>> +
>> + if (version) {
>> + unsigned int maj, min, rev;
>> +
>> + if (utf8_parse_version(version, &maj, &min, &rev) < 0)
>> + return ERR_PTR(-EINVAL);
>> +
>> + if (!utf8version_is_supported(maj, min, rev))
>> + return ERR_PTR(-EINVAL);
>> +
>> + unicode_version = UNICODE_AGE(maj, min, rev);
>> + } else {
>> + unicode_version = utf8version_latest();
>> + printk(KERN_WARNING"UTF-8 version not specified. "
>> + "Assuming latest supported version (%d.%d.%d).",
>> + (unicode_version >> 16) & 0xff,
>> + (unicode_version >> 8) & 0xff,
>> + (unicode_version & 0xff));
>> + }
>> +
>> + um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
>> + if (!um)
>> + return ERR_PTR(-ENOMEM);
>> +
>> + um->charset = "UTF-8";
>> + um->version = unicode_version;
>> +
>> + return um;
>> +}
>> +
>> +static struct unicode_ops ops = {
>> + .validate = utf8_validate,
>> + .strncmp = utf8_strncmp,
>> + .strncasecmp = utf8_strncasecmp,
>> + .strncasecmp_folded = utf8_strncasecmp_folded,
>> + .casefold = utf8_casefold,
>> + .casefold_hash = utf8_casefold_hash,
>> + .normalize = utf8_normalize,
>> + .load = utf8_load,
>> +};
>> +
>> +static int __init utf8_init(void)
>> +{
>> + unicode_register(&ops);
>> + return 0;
>> +}
>> +
>> +static void __exit utf8_exit(void)
>> +{
>> + unicode_unregister();
>> +}
>> +
>> +module_init(utf8_init);
>> +module_exit(utf8_exit);
>> +
>> +MODULE_LICENSE("GPL v2");
>> diff --git a/include/linux/unicode.h b/include/linux/unicode.h
>> index de23f9ee720b..b0d59069e438 100644
>> --- a/include/linux/unicode.h
>> +++ b/include/linux/unicode.h
>> @@ -10,6 +10,23 @@ struct unicode_map {
>> int version;
>> };
>>
>> +struct unicode_ops {
>> + int (*validate)(const struct unicode_map *um, const struct qstr *str);
>> + int (*strncmp)(const struct unicode_map *um, const struct qstr *s1,
>> + const struct qstr *s2);
>> + int (*strncasecmp)(const struct unicode_map *um, const struct qstr *s1,
>> + const struct qstr *s2);
>> + int (*strncasecmp_folded)(const struct unicode_map *um, const struct qstr *cf,
>> + const struct qstr *s1);
>> + int (*normalize)(const struct unicode_map *um, const struct qstr *str,
>> + unsigned char *dest, size_t dlen);
>> + int (*casefold)(const struct unicode_map *um, const struct qstr *str,
>> + unsigned char *dest, size_t dlen);
>> + int (*casefold_hash)(const struct unicode_map *um, const void *salt,
>> + struct qstr *str);
>> + struct unicode_map* (*load)(const char *version);
>> +};
> Also, make sure you run checkpatch.pl on the patch series before
> submitting.
I ran checkpatch.pl over the patch, but it seems there were some
previously existing warnings
which were not introduced by any change made in this patch series.
I am not sure if I am supposed to resolve those warnings in this patch
series.

>> +
>> int unicode_validate(const struct unicode_map *um, const struct qstr *str);
>>
>> int unicode_strncmp(const struct unicode_map *um,
>> @@ -33,4 +50,7 @@ int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
>> struct unicode_map *unicode_load(const char *version);
>> void unicode_unload(struct unicode_map *um);
>>
>> +void unicode_register(struct unicode_ops *ops);
>> +void unicode_unregister(void);
>> +
>> #endif /* _LINUX_UNICODE_H */

2021-03-15 04:23:52

by Gabriel Krisman Bertazi

[permalink] [raw]

Subject: Re: [PATCH 3/3] fs: unicode: Add utf8 module and a unicode layer

Shreeya Patel <[email protected]> writes:

> On 14/03/21 7:19 am, Gabriel Krisman Bertazi wrote:
>> Shreeya Patel <[email protected]> writes:
>>
>>> utf8data.h_shipped has a large database table which is an auto-generated
>>> decodification trie for the unicode normalization functions.
>>> It is not necessary to carry this large table in the kernel hence make
>>> UTF-8 encoding loadable by converting it into a module.
>>> Also, modify the file called unicode-core which will act as a layer for
>>> unicode subsystem. It will load the UTF-8 module and access its functions
>>> whenever any filesystem that needs unicode is mounted.
>>>
>>> Signed-off-by: Shreeya Patel <[email protected]>
>> Hi Shreeya,
> Hi Gabriel,
>>
>>> ---
>>> fs/unicode/Kconfig | 7 +-
>>> fs/unicode/Makefile | 5 +-
>>> fs/unicode/unicode-core.c | 201 ++++++-------------------------
>>> fs/unicode/utf8-core.c | 112 +++++++++++++++++
>>> fs/unicode/utf8mod.c | 246 ++++++++++++++++++++++++++++++++++++++
>>> include/linux/unicode.h | 20 ++++
>>> 6 files changed, 427 insertions(+), 164 deletions(-)
>>> create mode 100644 fs/unicode/utf8-core.c
>>> create mode 100644 fs/unicode/utf8mod.c
>>>
>>> diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig
>>> index 2c27b9a5cd6c..33a27deef729 100644
>>> --- a/fs/unicode/Kconfig
>>> +++ b/fs/unicode/Kconfig
>>> @@ -8,7 +8,12 @@ config UNICODE
>>> Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding
>>> support.
>>> +config UNICODE_UTF8
>>> + tristate "UTF-8 module"
>>> + depends on UNICODE
>>> + default m
>>> +
>>> config UNICODE_NORMALIZATION_SELFTEST
>>> tristate "Test UTF-8 normalization support"
>>> - depends on UNICODE
>>> + depends on UNICODE_UTF8
>>> default n
>>> diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
>>> index fbf9a629ed0d..9dbb04194b32 100644
>>> --- a/fs/unicode/Makefile
>>> +++ b/fs/unicode/Makefile
>>> @@ -1,11 +1,14 @@
>>> # SPDX-License-Identifier: GPL-2.0
>>> obj-$(CONFIG_UNICODE) += unicode.o
>>> +obj-$(CONFIG_UNICODE_UTF8) += utf8.o
>>> obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o
>>> -unicode-y := utf8-norm.o unicode-core.o
>>> +unicode-y := unicode-core.o
>>> +utf8-y := utf8mod.o utf8-norm.o
>>> $(obj)/utf8-norm.o: $(obj)/utf8data.h
>>> +$(obj)/utf8mod.o: $(obj)/utf8-norm.o
>>> # In the normal build, the checked-in utf8data.h is just shipped.
>>> #
>>> diff --git a/fs/unicode/unicode-core.c b/fs/unicode/unicode-core.c
>>> index d5f09e022ac5..b832341f1e7b 100644
>>> --- a/fs/unicode/unicode-core.c
>>> +++ b/fs/unicode/unicode-core.c
>>> @@ -7,70 +7,29 @@
>>> #include <linux/errno.h>
>>> #include <linux/unicode.h>
>>> #include <linux/stringhash.h>
>>> +#include <linux/delay.h>
>>> -#include "utf8n.h"
>>> +struct unicode_ops *utf8_ops;
>>> +
>>> +static int unicode_load_module(void);
>> This is unnecessary
>>> int unicode_validate(const struct unicode_map *um, const struct
>>> qstr *str)
>>> {
>>> - const struct utf8data *data = utf8nfdi(um->version);
>>> -
>>> - if (utf8nlen(data, str->name, str->len) < 0)
>>> - return -1;
>>> - return 0;
>>> + return utf8_ops->validate(um, str);
>>> }
>>> EXPORT_SYMBOL(unicode_validate);
>>> int unicode_strncmp(const struct unicode_map *um,
>>> const struct qstr *s1, const struct qstr *s2)
>>> {
>>> - const struct utf8data *data = utf8nfdi(um->version);
>>> - struct utf8cursor cur1, cur2;
>>> - int c1, c2;
>>> -
>>> - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
>>> - return -EINVAL;
>>> -
>>> - if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
>>> - return -EINVAL;
>>> -
>>> - do {
>>> - c1 = utf8byte(&cur1);
>>> - c2 = utf8byte(&cur2);
>>> -
>>> - if (c1 < 0 || c2 < 0)
>>> - return -EINVAL;
>>> - if (c1 != c2)
>>> - return 1;
>>> - } while (c1);
>>> -
>>> - return 0;
>>> + return utf8_ops->strncmp(um, s1, s2);
>>> }
>> I think these would go on a header file and inlined.
>>
>>> EXPORT_SYMBOL(unicode_strncmp);
>>> int unicode_strncasecmp(const struct unicode_map *um,
>>> const struct qstr *s1, const struct qstr *s2)
>>> {
>>> - const struct utf8data *data = utf8nfdicf(um->version);
>>> - struct utf8cursor cur1, cur2;
>>> - int c1, c2;
>>> -
>>> - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
>>> - return -EINVAL;
>>> -
>>> - if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
>>> - return -EINVAL;
>>> -
>>> - do {
>>> - c1 = utf8byte(&cur1);
>>> - c2 = utf8byte(&cur2);
>>> -
>>> - if (c1 < 0 || c2 < 0)
>>> - return -EINVAL;
>>> - if (c1 != c2)
>>> - return 1;
>>> - } while (c1);
>>> -
>>> - return 0;
>>> + return utf8_ops->strncasecmp(um, s1, s2);
>>> }
>>> EXPORT_SYMBOL(unicode_strncasecmp);
>>> @@ -81,155 +40,73 @@ int unicode_strncasecmp_folded(const struct
>>> unicode_map *um,
>>> const struct qstr *cf,
>>> const struct qstr *s1)
>>> {
>>> - const struct utf8data *data = utf8nfdicf(um->version);
>>> - struct utf8cursor cur1;
>>> - int c1, c2;
>>> - int i = 0;
>>> -
>>> - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
>>> - return -EINVAL;
>>> -
>>> - do {
>>> - c1 = utf8byte(&cur1);
>>> - c2 = cf->name[i++];
>>> - if (c1 < 0)
>>> - return -EINVAL;
>>> - if (c1 != c2)
>>> - return 1;
>>> - } while (c1);
>>> -
>>> - return 0;
>>> + return utf8_ops->strncasecmp_folded(um, cf, s1);
>>> }
>>> EXPORT_SYMBOL(unicode_strncasecmp_folded);
>>> int unicode_casefold(const struct unicode_map *um, const struct
>>> qstr *str,
>>> unsigned char *dest, size_t dlen)
>>> {
>>> - const struct utf8data *data = utf8nfdicf(um->version);
>>> - struct utf8cursor cur;
>>> - size_t nlen = 0;
>>> -
>>> - if (utf8ncursor(&cur, data, str->name, str->len) < 0)
>>> - return -EINVAL;
>>> -
>>> - for (nlen = 0; nlen < dlen; nlen++) {
>>> - int c = utf8byte(&cur);
>>> -
>>> - dest[nlen] = c;
>>> - if (!c)
>>> - return nlen;
>>> - if (c == -1)
>>> - break;
>>> - }
>>> - return -EINVAL;
>>> + return utf8_ops->casefold(um, str, dest, dlen);
>>> }
>>> EXPORT_SYMBOL(unicode_casefold);
>>> int unicode_casefold_hash(const struct unicode_map *um, const
>>> void *salt,
>>> struct qstr *str)
>>> {
>>> - const struct utf8data *data = utf8nfdicf(um->version);
>>> - struct utf8cursor cur;
>>> - int c;
>>> - unsigned long hash = init_name_hash(salt);
>>> -
>>> - if (utf8ncursor(&cur, data, str->name, str->len) < 0)
>>> - return -EINVAL;
>>> -
>>> - while ((c = utf8byte(&cur))) {
>>> - if (c < 0)
>>> - return -EINVAL;
>>> - hash = partial_name_hash((unsigned char)c, hash);
>>> - }
>>> - str->hash = end_name_hash(hash);
>>> - return 0;
>>> + return utf8_ops->casefold_hash(um, salt, str);
>>> }
>>> EXPORT_SYMBOL(unicode_casefold_hash);
>>> int unicode_normalize(const struct unicode_map *um, const struct
>>> qstr *str,
>>> unsigned char *dest, size_t dlen)
>>> {
>>> - const struct utf8data *data = utf8nfdi(um->version);
>>> - struct utf8cursor cur;
>>> - ssize_t nlen = 0;
>>> + return utf8_ops->normalize(um, str, dest, dlen);
>>> +}
>>> +EXPORT_SYMBOL(unicode_normalize);
>>> - if (utf8ncursor(&cur, data, str->name, str->len) < 0)
>>> - return -EINVAL;
>>> +struct unicode_map *unicode_load(const char *version)
>>> +{
>>> + int ret = unicode_load_module();
>>> - for (nlen = 0; nlen < dlen; nlen++) {
>>> - int c = utf8byte(&cur);
>>> + if (ret)
>>> + return ERR_PTR(ret);
>>> - dest[nlen] = c;
>>> - if (!c)
>>> - return nlen;
>>> - if (c == -1)
>>> - break;
>>> - }
>>> - return -EINVAL;
>>> + else
>>> + return utf8_ops->load(version);
>>> }
>>> -EXPORT_SYMBOL(unicode_normalize);
>>> +EXPORT_SYMBOL(unicode_load);
>>> -static int unicode_parse_version(const char *version, unsigned int
>>> *maj,
>>> - unsigned int *min, unsigned int *rev)
>>> +void unicode_unload(struct unicode_map *um)
>>> {
>>> - substring_t args[3];
>>> - char version_string[12];
>>> - static const struct match_token token[] = {
>>> - {1, "%d.%d.%d"},
>>> - {0, NULL}
>>> - };
>>> + kfree(um);
>>> +}
>>> +EXPORT_SYMBOL(unicode_unload);
>>> - strncpy(version_string, version, sizeof(version_string));
>>> +static int unicode_load_module(void)
>>> +{
>>> + int ret = request_module("utf8");
>>> - if (match_token(version_string, token, args) != 1)
>>> - return -EINVAL;
>>> + msleep(100);
>> I think I misunderstood when you mentioned you did this msleep. It was
>> ok to debug the issue you were observing, but it is not a solution.
>> Setting an arbitrary amount of time will either waste time, or you can
>> still fail if things take longer than expected. There are mechanisms to
>> load and wait on a module. See how fs/nls/nls_base.c does exactly this.
>>
>>> - if (match_int(&args[0], maj) || match_int(&args[1], min) ||
>>> - match_int(&args[2], rev))
>>> - return -EINVAL;
>>> + if (ret) {
>>> + pr_err("Failed to load UTF-8 module\n");
>>> + return ret;
>>> + }
>>> return 0;
>>> }
>>> -struct unicode_map *unicode_load(const char *version)
>>> +void unicode_register(struct unicode_ops *ops)
>>> {
>>> - struct unicode_map *um = NULL;
>>> - int unicode_version;
>>> -
>>> - if (version) {
>>> - unsigned int maj, min, rev;
>>> -
>>> - if (unicode_parse_version(version, &maj, &min, &rev) < 0)
>>> - return ERR_PTR(-EINVAL);
>>> -
>>> - if (!utf8version_is_supported(maj, min, rev))
>>> - return ERR_PTR(-EINVAL);
>>> -
>>> - unicode_version = UNICODE_AGE(maj, min, rev);
>>> - } else {
>>> - unicode_version = utf8version_latest();
>>> - printk(KERN_WARNING"UTF-8 version not specified. "
>>> - "Assuming latest supported version (%d.%d.%d).",
>>> - (unicode_version >> 16) & 0xff,
>>> - (unicode_version >> 8) & 0xff,
>>> - (unicode_version & 0xff));
>>> - }
>>> -
>>> - um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
>>> - if (!um)
>>> - return ERR_PTR(-ENOMEM);
>>> -
>>> - um->charset = "UTF-8";
>>> - um->version = unicode_version;
>>> -
>>> - return um;
>>> + utf8_ops = ops;
>>> }
>>> -EXPORT_SYMBOL(unicode_load);
>>> +EXPORT_SYMBOL(unicode_register);
>>> -void unicode_unload(struct unicode_map *um)
>>> +void unicode_unregister(void)
>>> {
>>> - kfree(um);
>>> + utf8_ops = NULL;
>>> }
>>> -EXPORT_SYMBOL(unicode_unload);
>>> +EXPORT_SYMBOL(unicode_unregister);
>>> MODULE_LICENSE("GPL v2");
>>> diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
>>> new file mode 100644
>>> index 000000000000..009faa68330c
>>> --- /dev/null
>>> +++ b/fs/unicode/utf8-core.c
>>> @@ -0,0 +1,112 @@
>>> +/* SPDX-License-Identifier: GPL-2.0 */
>>> +#include <linux/module.h>
>>> +#include <linux/kernel.h>
>>> +#include <linux/string.h>
>>> +#include <linux/slab.h>
>>> +#include <linux/parser.h>
>>> +#include <linux/errno.h>
>>> +#include <linux/unicode.h>
>>> +#include <linux/stringhash.h>
>>> +#include <linux/delay.h>
>>> +
>>> +struct unicode_ops *utf8_ops;
>>> +
>>> +static int unicode_load_module(void);
>>> +
>>> +int unicode_validate(const struct unicode_map *um, const struct qstr *str)
>>> +{
>>> + return utf8_ops->validate(um, str);
>>> +}
>>> +EXPORT_SYMBOL(unicode_validate);
>>> +
>>> +int unicode_strncmp(const struct unicode_map *um,
>>> + const struct qstr *s1, const struct qstr *s2)
>>> +{
>>> + return utf8_ops->strncmp(um, s1, s2);
>>> +}
>>> +EXPORT_SYMBOL(unicode_strncmp);
>> I'm confused now. Isn't this redefining unicode_strncmp? It was
>> defined in unicode-core.c in the hunk above and now it is redefined in
>> utf8-core.c. There is something odd here.
> sorry, I think I messed up patches while using git send-email and that
> is why you might see
> two copies of the last patch. Let me resend the series and then it might
> make sense. One question

Hi Shreeya,

Yes, I noticed that too, but this comment was just about the patch I reviewed.
If I read the patch correctly, unicode_strncmp is defined in two places,
in the files unicode-core.c and utf8-core.c, in this single patch?

> though, why would unicode_strncmp go into the header file?

Since unicode_strncmp is now just a wrapper that calls the ->ops hook,
putting it in the header file will allow it to be inlined by the
compiler inside the caller function.

>>> +
>>> +int unicode_strncasecmp(const struct unicode_map *um,
>>> + const struct qstr *s1, const struct qstr *s2)
>>> +{
>>> + return utf8_ops->strncasecmp(um, s1, s2);
>>> +}
>>> +EXPORT_SYMBOL(unicode_strncasecmp);
>>> +
>>> +/* String cf is expected to be a valid UTF-8 casefolded
>>> + * string.
>>> + */
>>> +int unicode_strncasecmp_folded(const struct unicode_map *um,
>>> + const struct qstr *cf,
>>> + const struct qstr *s1)
>>> +{
>>> + return utf8_ops->strncasecmp_folded(um, cf, s1);
>>> +}
>>> +EXPORT_SYMBOL(unicode_strncasecmp_folded);
>>> +
>>> +int unicode_casefold(const struct unicode_map *um, const struct qstr *str,
>>> + unsigned char *dest, size_t dlen)
>>> +{
>>> + return utf8_ops->casefold(um, str, dest, dlen);
>>> +}
>>> +EXPORT_SYMBOL(unicode_casefold);
>>> +
>>> +int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
>>> + struct qstr *str)
>>> +{
>>> + return utf8_ops->casefold_hash(um, salt, str);
>>> +}
>>> +EXPORT_SYMBOL(unicode_casefold_hash);
>>> +
>>> +int unicode_normalize(const struct unicode_map *um, const struct qstr *str,
>>> + unsigned char *dest, size_t dlen)
>>> +{
>>> + return utf8_ops->normalize(um, str, dest, dlen);
>>> +}
>>> +EXPORT_SYMBOL(unicode_normalize);
>>> +
>>> +struct unicode_map *unicode_load(const char *version)
>>> +{
>>> + int ret = unicode_load_module();
>>> +
>>> + if (ret)
>>> + return ERR_PTR(ret);
>>> +
>>> + else
>>> + return utf8_ops->load(version);
>>> +}
>>> +EXPORT_SYMBOL(unicode_load);
>>> +
>>> +void unicode_unload(struct unicode_map *um)
>>> +{
>>> + kfree(um);
>>> +}
>>> +EXPORT_SYMBOL(unicode_unload);
>>> +
>>> +void unicode_register(struct unicode_ops *ops)
>>> +{
>>> + utf8_ops = ops;
>>> +}
>>> +EXPORT_SYMBOL(unicode_register);
>>> +
>>> +void unicode_unregister(void)
>>> +{
>>> + utf8_ops = NULL;
>>> +}
>>> +EXPORT_SYMBOL(unicode_unregister);
>>> +
>>> +static int unicode_load_module(void)
>>> +{
>>> + int ret = request_module("utf8");
>>> +
>>> + msleep(100);
>>> +
>>> + if (ret) {
>>> + pr_err("Failed to load UTF-8 module\n");
>>> + return ret;
>>> + }
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +MODULE_LICENSE("GPL v2");
>>> diff --git a/fs/unicode/utf8mod.c b/fs/unicode/utf8mod.c
>>> new file mode 100644
>>> index 000000000000..8eaeeb27255c
>>> --- /dev/null
>>> +++ b/fs/unicode/utf8mod.c
>>> @@ -0,0 +1,246 @@
>>> +/* SPDX-License-Identifier: GPL-2.0 */
>>> +#include <linux/module.h>
>>> +#include <linux/kernel.h>
>>> +#include <linux/string.h>
>>> +#include <linux/slab.h>
>>> +#include <linux/parser.h>
>>> +#include <linux/errno.h>
>>> +#include <linux/unicode.h>
>>> +#include <linux/stringhash.h>
>>> +
>>> +#include "utf8n.h"
>>> +
>>> +static int utf8_validate(const struct unicode_map *um, const struct qstr *str)
>>> +{
>>> + const struct utf8data *data = utf8nfdi(um->version);
>>> +
>>> + if (utf8nlen(data, str->name, str->len) < 0)
>>> + return -1;
>>> + return 0;
>>> +}
>>> +
>>> +static int utf8_strncmp(const struct unicode_map *um,
>>> + const struct qstr *s1, const struct qstr *s2)
>>> +{
>>> + const struct utf8data *data = utf8nfdi(um->version);
>>> + struct utf8cursor cur1, cur2;
>>> + int c1, c2;
>>> +
>>> + if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
>>> + return -EINVAL;
>>> +
>>> + if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
>>> + return -EINVAL;
>>> +
>>> + do {
>>> + c1 = utf8byte(&cur1);
>>> + c2 = utf8byte(&cur2);
>>> +
>>> + if (c1 < 0 || c2 < 0)
>>> + return -EINVAL;
>>> + if (c1 != c2)
>>> + return 1;
>>> + } while (c1);
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static int utf8_strncasecmp(const struct unicode_map *um,
>>> + const struct qstr *s1, const struct qstr *s2)
>>> +{
>>> + const struct utf8data *data = utf8nfdicf(um->version);
>>> + struct utf8cursor cur1, cur2;
>>> + int c1, c2;
>>> +
>>> + if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
>>> + return -EINVAL;
>>> +
>>> + if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
>>> + return -EINVAL;
>>> +
>>> + do {
>>> + c1 = utf8byte(&cur1);
>>> + c2 = utf8byte(&cur2);
>>> +
>>> + if (c1 < 0 || c2 < 0)
>>> + return -EINVAL;
>>> + if (c1 != c2)
>>> + return 1;
>>> + } while (c1);
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +/* String cf is expected to be a valid UTF-8 casefolded
>>> + * string.
>>> + */
>>> +static int utf8_strncasecmp_folded(const struct unicode_map *um,
>>> + const struct qstr *cf,
>>> + const struct qstr *s1)
>>> +{
>>> + const struct utf8data *data = utf8nfdicf(um->version);
>>> + struct utf8cursor cur1;
>>> + int c1, c2;
>>> + int i = 0;
>>> +
>>> + if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
>>> + return -EINVAL;
>>> +
>>> + do {
>>> + c1 = utf8byte(&cur1);
>>> + c2 = cf->name[i++];
>>> + if (c1 < 0)
>>> + return -EINVAL;
>>> + if (c1 != c2)
>>> + return 1;
>>> + } while (c1);
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
>>> + unsigned char *dest, size_t dlen)
>>> +{
>>> + const struct utf8data *data = utf8nfdicf(um->version);
>>> + struct utf8cursor cur;
>>> + size_t nlen = 0;
>>> +
>>> + if (utf8ncursor(&cur, data, str->name, str->len) < 0)
>>> + return -EINVAL;
>>> +
>>> + for (nlen = 0; nlen < dlen; nlen++) {
>>> + int c = utf8byte(&cur);
>>> +
>>> + dest[nlen] = c;
>>> + if (!c)
>>> + return nlen;
>>> + if (c == -1)
>>> + break;
>>> + }
>>> + return -EINVAL;
>>> +}
>>> +
>>> +static int utf8_casefold_hash(const struct unicode_map *um, const void *salt,
>>> + struct qstr *str)
>>> +{
>>> + const struct utf8data *data = utf8nfdicf(um->version);
>>> + struct utf8cursor cur;
>>> + int c;
>>> + unsigned long hash = init_name_hash(salt);
>>> +
>>> + if (utf8ncursor(&cur, data, str->name, str->len) < 0)
>>> + return -EINVAL;
>>> +
>>> + while ((c = utf8byte(&cur))) {
>>> + if (c < 0)
>>> + return -EINVAL;
>>> + hash = partial_name_hash((unsigned char)c, hash);
>>> + }
>>> + str->hash = end_name_hash(hash);
>>> + return 0;
>>> +}
>>> +
>>> +static int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
>>> + unsigned char *dest, size_t dlen)
>>> +{
>>> + const struct utf8data *data = utf8nfdi(um->version);
>>> + struct utf8cursor cur;
>>> + ssize_t nlen = 0;
>>> +
>>> + if (utf8ncursor(&cur, data, str->name, str->len) < 0)
>>> + return -EINVAL;
>>> +
>>> + for (nlen = 0; nlen < dlen; nlen++) {
>>> + int c = utf8byte(&cur);
>>> +
>>> + dest[nlen] = c;
>>> + if (!c)
>>> + return nlen;
>>> + if (c == -1)
>>> + break;
>>> + }
>>> + return -EINVAL;
>>> +}
>>> +
>>> +static int utf8_parse_version(const char *version, unsigned int *maj,
>>> + unsigned int *min, unsigned int *rev)
>>> +{
>>> + substring_t args[3];
>>> + char version_string[12];
>>> + static const struct match_token token[] = {
>>> + {1, "%d.%d.%d"},
>>> + {0, NULL}
>>> + };
>>> +
>>> + strncpy(version_string, version, sizeof(version_string));
>>> +
>>> + if (match_token(version_string, token, args) != 1)
>>> + return -EINVAL;
>>> +
>>> + if (match_int(&args[0], maj) || match_int(&args[1], min) ||
>>> + match_int(&args[2], rev))
>>> + return -EINVAL;
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static struct unicode_map *utf8_load(const char *version)
>>> +{
>>> + struct unicode_map *um = NULL;
>>> + int unicode_version;
>>> +
>>> + if (version) {
>>> + unsigned int maj, min, rev;
>>> +
>>> + if (utf8_parse_version(version, &maj, &min, &rev) < 0)
>>> + return ERR_PTR(-EINVAL);
>>> +
>>> + if (!utf8version_is_supported(maj, min, rev))
>>> + return ERR_PTR(-EINVAL);
>>> +
>>> + unicode_version = UNICODE_AGE(maj, min, rev);
>>> + } else {
>>> + unicode_version = utf8version_latest();
>>> + printk(KERN_WARNING"UTF-8 version not specified. "
>>> + "Assuming latest supported version (%d.%d.%d).",
>>> + (unicode_version >> 16) & 0xff,
>>> + (unicode_version >> 8) & 0xff,
>>> + (unicode_version & 0xff));
>>> + }
>>> +
>>> + um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
>>> + if (!um)
>>> + return ERR_PTR(-ENOMEM);
>>> +
>>> + um->charset = "UTF-8";
>>> + um->version = unicode_version;
>>> +
>>> + return um;
>>> +}
>>> +
>>> +static struct unicode_ops ops = {
>>> + .validate = utf8_validate,
>>> + .strncmp = utf8_strncmp,
>>> + .strncasecmp = utf8_strncasecmp,
>>> + .strncasecmp_folded = utf8_strncasecmp_folded,
>>> + .casefold = utf8_casefold,
>>> + .casefold_hash = utf8_casefold_hash,
>>> + .normalize = utf8_normalize,
>>> + .load = utf8_load,
>>> +};
>>> +
>>> +static int __init utf8_init(void)
>>> +{
>>> + unicode_register(&ops);
>>> + return 0;
>>> +}
>>> +
>>> +static void __exit utf8_exit(void)
>>> +{
>>> + unicode_unregister();
>>> +}
>>> +
>>> +module_init(utf8_init);
>>> +module_exit(utf8_exit);
>>> +
>>> +MODULE_LICENSE("GPL v2");
>>> diff --git a/include/linux/unicode.h b/include/linux/unicode.h
>>> index de23f9ee720b..b0d59069e438 100644
>>> --- a/include/linux/unicode.h
>>> +++ b/include/linux/unicode.h
>>> @@ -10,6 +10,23 @@ struct unicode_map {
>>> int version;
>>> };
>>> +struct unicode_ops {
>>> + int (*validate)(const struct unicode_map *um, const struct qstr *str);
>>> + int (*strncmp)(const struct unicode_map *um, const struct qstr *s1,
>>> + const struct qstr *s2);
>>> + int (*strncasecmp)(const struct unicode_map *um, const struct qstr *s1,
>>> + const struct qstr *s2);
>>> + int (*strncasecmp_folded)(const struct unicode_map *um, const struct qstr *cf,
>>> + const struct qstr *s1);
>>> + int (*normalize)(const struct unicode_map *um, const struct qstr *str,
>>> + unsigned char *dest, size_t dlen);
>>> + int (*casefold)(const struct unicode_map *um, const struct qstr *str,
>>> + unsigned char *dest, size_t dlen);
>>> + int (*casefold_hash)(const struct unicode_map *um, const void *salt,
>>> + struct qstr *str);
>>> + struct unicode_map* (*load)(const char *version);
>>> +};
>> Also, make sure you run checkpatch.pl on the patch series before
>> submitting.
> I ran checkpatch.pl over the patch series, but the warnings it reported
> already existed before this series and were not introduced by any of
> the changes made here.
> I am not sure whether I am supposed to resolve those warnings as part
> of this patch series.
>
>>> +
>>> int unicode_validate(const struct unicode_map *um, const struct qstr *str);
>>> int unicode_strncmp(const struct unicode_map *um,
>>> @@ -33,4 +50,7 @@ int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
>>> struct unicode_map *unicode_load(const char *version);
>>> void unicode_unload(struct unicode_map *um);
>>> +void unicode_register(struct unicode_ops *ops);
>>> +void unicode_unregister(void);
>>> +
>>> #endif /* _LINUX_UNICODE_H */
>

--
Gabriel Krisman Bertazi