LinuxLists.cc - [PATCH] e2fsprogs: Introduce script used to generate utf8data.h

2018-12-03 23:04:39

Subject: [PATCH] e2fsprogs: Introduce script used to generate utf8data.h

The mkutf8data program was written by SGI for the kernel. We integrate
it here, but it should be kept in sync with the kernel code, when
applicable. The header generated by this script is also included in
this repository, such that we don't have to carry the UCD files too.
They are only necessary when the header has to be recreated (for
instance, to update the Unicode version supported)

The mkutf8data is not a dependency of utf8data.h because we don't want
the utf8data.h rule to be called after a make clean that would remove
mkutf8data. It is simpler to call it from inside the rule, and make
sure we only trigger the utf8data.h rule if we really need to,
particularly if we decide not to integrate the ucd files in the tree.

The UCD files are not distributed along with this commit, but we add
information on how to fetch them.

Signed-off-by: Gabriel Krisman Bertazi <[email protected]>
---
MCONFIG.in | 8 +
lib/ext2fs/Makefile.in | 14 +-
util/Makefile.in | 10 +-
util/mkutf8data.c | 3392 ++++++++++++++++++++++++++++++++++++++++
util/ucd/README | 37 +
5 files changed, 3457 insertions(+), 4 deletions(-)
create mode 100644 util/mkutf8data.c
create mode 100644 util/ucd/README

diff --git a/MCONFIG.in b/MCONFIG.in
index 7ca86ac6b269..d24f7ef2cb7c 100644
--- a/MCONFIG.in
+++ b/MCONFIG.in
@@ -182,6 +182,14 @@ DEP_SUBSTITUTE= $(top_builddir)/util/subst $(SUBST_CONF)
$(top_builddir)/util/subst:
cd $(top_builddir)/util ; $(MAKE) subst

+#
+# Script for generating utf8data.h
+#
+MKUTF8DATA=$(top_builddir)/util/mkutf8data
+
+$(top_builddir)/util/mkutf8data:
+ $(MAKE) -C $(top_builddir)/util mkutf8data
+
#
# Script for installing symlinks (for shared libraries)
#
diff --git a/lib/ext2fs/Makefile.in b/lib/ext2fs/Makefile.in
index 5e8907630a05..1510fc72ccec 100644
--- a/lib/ext2fs/Makefile.in
+++ b/lib/ext2fs/Makefile.in
@@ -620,6 +620,18 @@ $(top_builddir)/$(my_dir)/ext2_types.h: $(srcdir)/ext2_types.h.in \
$(top_builddir)/config.status
cd $(top_builddir); CONFIG_FILES=$(my_dir)/ext2_types.h ./config.status

+$(srcdir)/utf8data.h:
+ $(Q) $(MAKE) $(MKUTF8DATA)
+ $(E) "MKUTF8DATA $@"
+ $(Q) $(MKUTF8DATA) -a $(top_srcdir)/util/ucd/DerivedAge-11.0.0.txt \
+ -c $(top_srcdir)/util/ucd/DerivedCombiningClass-11.0.0.txt \
+ -p $(top_srcdir)/util/ucd/DerivedCoreProperties-11.0.0.txt \
+ -d $(top_srcdir)/util/ucd/UnicodeData-11.0.0.txt \
+ -f $(top_srcdir)/util/ucd/CaseFolding-11.0.0.txt \
+ -n $(top_srcdir)/util/ucd/NormalizationCorrections-11.0.0.txt \
+ -t $(top_srcdir)/util/ucd/NormalizationTest-11.0.0.txt \
+ -o $@
+
# +++ Dependency line eater +++
#
# Makefile dependencies follow. This must be the last section in
@@ -1419,4 +1431,4 @@ do_journal.o: $(top_srcdir)/debugfs/do_journal.c $(top_builddir)/lib/config.h \
$(srcdir)/jfs_compat.h $(srcdir)/kernel-list.h \
$(top_srcdir)/debugfs/journal.h $(srcdir)/../../e2fsck/jfs_user.h
$(NLS_OBJS): $(srcdir)/nls.h
-nls_ascii-norm.o: $(srcdir)/utf8n.h $(srcdir)/utf8data.h $(srcdir)/nls.h
+nls_utf8-norm.o: $(srcdir)/utf8n.h $(srcdir)/utf8data.h $(srcdir)/nls.h
diff --git a/util/Makefile.in b/util/Makefile.in
index 96117d9e0019..f3fb3c6b7cf4 100644
--- a/util/Makefile.in
+++ b/util/Makefile.in
@@ -9,7 +9,7 @@ top_builddir = ..
my_dir = util
INSTALL = @INSTALL@

-SRCS = $(srcdir)/subst.c
+SRCS = $(srcdir)/subst.c $(srcdir)/mkutf8data.c

@MCONFIG@

@@ -19,7 +19,7 @@ SRCS = $(srcdir)/subst.c
$(Q) $(CHECK_CMD) $(ALL_CFLAGS) $<
$(Q) $(CPPCHECK_CMD) $(CPPFLAGS) $<

-PROGS= subst symlinks
+PROGS= subst symlinks mkutf8data

all:: $(PROGS) gen-tarball

@@ -33,6 +33,10 @@ subst: subst.o
$(E) " LD $@"
$(Q) $(BUILD_CC) $(BUILD_LDFLAGS) -o subst subst.o

+mkutf8data: mkutf8data.o
+ $(E) " LD $@"
+ $(Q) $(BUILD_CC) $(BUILD_LDFLAGS) -o mkutf8data mkutf8data.o
+
copy_sparse: copy_sparse.o
$(E) " LD $@"
$(Q) $(BUILD_CC) $(BUILD_LDFLAGS) -o copy_sparse copy_sparse.o
@@ -53,7 +57,7 @@ tarballs: gen-tarball

clean::
$(RM) -f $(PROGS) \#* *.s *.o *.a *~ core *.tar.gz gen-tarball \
- copy-sparse dirpaths.h install-symlink
+ copy-sparse dirpaths.h install-symlink mkutf8data

mostlyclean: clean

diff --git a/util/mkutf8data.c b/util/mkutf8data.c
new file mode 100644
index 000000000000..49bb0e16669b
--- /dev/null
+++ b/util/mkutf8data.c
@@ -0,0 +1,3392 @@
+/*
+ * Copyright (c) 2014 SGI.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* Generator for a compact trie for unicode normalization */
+
+#include <sys/types.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+
+/* Default names of the in- and output files. */
+
+#define AGE_NAME "DerivedAge.txt"
+#define CCC_NAME "DerivedCombiningClass.txt"
+#define PROP_NAME "DerivedCoreProperties.txt"
+#define DATA_NAME "UnicodeData.txt"
+#define FOLD_NAME "CaseFolding.txt"
+#define NORM_NAME "NormalizationCorrections.txt"
+#define TEST_NAME "NormalizationTest.txt"
+#define UTF8_NAME "utf8data.h"
+
+const char *age_name = AGE_NAME;
+const char *ccc_name = CCC_NAME;
+const char *prop_name = PROP_NAME;
+const char *data_name = DATA_NAME;
+const char *fold_name = FOLD_NAME;
+const char *norm_name = NORM_NAME;
+const char *test_name = TEST_NAME;
+const char *utf8_name = UTF8_NAME;
+
+int verbose = 0;
+
+/* An arbitrary line size limit on input lines. */
+
+#define LINESIZE 1024
+char line[LINESIZE];
+char buf0[LINESIZE];
+char buf1[LINESIZE];
+char buf2[LINESIZE];
+char buf3[LINESIZE];
+
+const char *argv0;
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * Unicode version numbers consist of three parts: major, minor, and a
+ * revision. These numbers are packed into an unsigned int to obtain
+ * a single version number.
+ *
+ * To save space in the generated trie, the unicode version is not
+ * stored directly, instead we calculate a generation number from the
+ * unicode versions seen in the DerivedAge file, and use that as an
+ * index into a table of unicode versions.
+ */
+#define UNICODE_MAJ_SHIFT (16)
+#define UNICODE_MIN_SHIFT (8)
+
+#define UNICODE_MAJ_MAX ((unsigned short)-1)
+#define UNICODE_MIN_MAX ((unsigned char)-1)
+#define UNICODE_REV_MAX ((unsigned char)-1)
+
+#define UNICODE_AGE(MAJ,MIN,REV) \
+ (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \
+ ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \
+ ((unsigned int)(REV)))
+
+unsigned int *ages;
+int ages_count;
+
+unsigned int unicode_maxage;
+
+static int age_valid(unsigned int major, unsigned int minor,
+ unsigned int revision)
+{
+ if (major > UNICODE_MAJ_MAX)
+ return 0;
+ if (minor > UNICODE_MIN_MAX)
+ return 0;
+ if (revision > UNICODE_REV_MAX)
+ return 0;
+ return 1;
+}
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * utf8trie_t
+ *
+ * A compact binary tree, used to decode UTF-8 characters.
+ *
+ * Internal nodes are one byte for the node itself, and up to three
+ * bytes for an offset into the tree. The first byte contains the
+ * following information:
+ * NEXTBYTE - flag - advance to next byte if set
+ * BITNUM - 3 bit field - the bit number to tested
+ * OFFLEN - 2 bit field - number of bytes in the offset
+ * if offlen == 0 (non-branching node)
+ * RIGHTPATH - 1 bit field - set if the following node is for the
+ * right-hand path (tested bit is set)
+ * TRIENODE - 1 bit field - set if the following node is an internal
+ * node, otherwise it is a leaf node
+ * if offlen != 0 (branching node)
+ * LEFTNODE - 1 bit field - set if the left-hand node is internal
+ * RIGHTNODE - 1 bit field - set if the right-hand node is internal
+ *
+ * Due to the way utf8 works, there cannot be branching nodes with
+ * NEXTBYTE set, and moreover those nodes always have a righthand
+ * descendant.
+ */
+typedef unsigned char utf8trie_t;
+#define BITNUM 0x07
+#define NEXTBYTE 0x08
+#define OFFLEN 0x30
+#define OFFLEN_SHIFT 4
+#define RIGHTPATH 0x40
+#define TRIENODE 0x80
+#define RIGHTNODE 0x40
+#define LEFTNODE 0x80
+
+/*
+ * utf8leaf_t
+ *
+ * The leaves of the trie are embedded in the trie, and so the same
+ * underlying datatype, unsigned char.
+ *
+ * leaf[0]: The unicode version, stored as a generation number that is
+ * an index into utf8agetab[]. With this we can filter code
+ * points based on the unicode version in which they were
+ * defined. The CCC of a non-defined code point is 0.
+ * leaf[1]: Canonical Combining Class. During normalization, we need
+ * to do a stable sort into ascending order of all characters
+ * with a non-zero CCC that occur between two characters with
+ * a CCC of 0, or at the begin or end of a string.
+ * The unicode standard guarantees that all CCC values are
+ * between 0 and 254 inclusive, which leaves 255 available as
+ * a special value.
+ * Code points with CCC 0 are known as stoppers.
+ * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
+ * start of a NUL-terminated string that is the decomposition
+ * of the character.
+ * The CCC of a decomposable character is the same as the CCC
+ * of the first character of its decomposition.
+ * Some characters decompose as the empty string: these are
+ * characters with the Default_Ignorable_Code_Point property.
+ * These do affect normalization, as they all have CCC 0.
+ *
+ * The decompositions in the trie have been fully expanded.
+ *
+ * Casefolding, if applicable, is also done using decompositions.
+ */
+typedef unsigned char utf8leaf_t;
+
+#define LEAF_GEN(LEAF) ((LEAF)[0])
+#define LEAF_CCC(LEAF) ((LEAF)[1])
+#define LEAF_STR(LEAF) ((const char*)((LEAF) + 2))
+
+#define MAXGEN (255)
+
+#define MINCCC (0)
+#define MAXCCC (254)
+#define STOPPER (0)
+#define DECOMPOSE (255)
+#define HANGUL ((char)(255))
+
+#define UTF8HANGULLEAF (12)
+
+struct tree;
+static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
+ const char *, size_t);
+static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
+
+unsigned char *utf8data;
+size_t utf8data_size;
+
+utf8trie_t *nfkdi;
+utf8trie_t *nfkdicf;
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * UTF8 valid ranges.
+ *
+ * The UTF-8 encoding spreads the bits of a 32bit word over several
+ * bytes. This table gives the ranges that can be held and how they'd
+ * be represented.
+ *
+ * 0x00000000 0x0000007F: 0xxxxxxx
+ * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
+ * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
+ * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *
+ * There is an additional requirement on UTF-8, in that only the
+ * shortest representation of a 32bit value is to be used. A decoder
+ * must not decode sequences that do not satisfy this requirement.
+ * Thus the allowed ranges have a lower bound.
+ *
+ * 0x00000000 0x0000007F: 0xxxxxxx
+ * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
+ * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
+ * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *
+ * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
+ * 17 planes of 65536 values. This limits the sequences actually seen
+ * even more, to just the following.
+ *
+ * 0 - 0x7f: 0 0x7f
+ * 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf
+ * 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf
+ * 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf
+ *
+ * Even within those ranges not all values are allowed: the surrogates
+ * 0xd800 - 0xdfff should never be seen.
+ *
+ * Note that the longest sequence seen with valid usage is 4 bytes,
+ * the same a single UTF-32 character. This makes the UTF-8
+ * representation of Unicode strictly smaller than UTF-32.
+ *
+ * The shortest sequence requirement was introduced by:
+ * Corrigendum #1: UTF-8 Shortest Form
+ * It can be found here:
+ * http://www.unicode.org/versions/corrigendum1.html
+ *
+ */
+
+#define UTF8_2_BITS 0xC0
+#define UTF8_3_BITS 0xE0
+#define UTF8_4_BITS 0xF0
+#define UTF8_N_BITS 0x80
+#define UTF8_2_MASK 0xE0
+#define UTF8_3_MASK 0xF0
+#define UTF8_4_MASK 0xF8
+#define UTF8_N_MASK 0xC0
+#define UTF8_V_MASK 0x3F
+#define UTF8_V_SHIFT 6
+
+static int utf8encode(char *str, unsigned int val)
+{
+ int len;
+
+ if (val < 0x80) {
+ str[0] = val;
+ len = 1;
+ } else if (val < 0x800) {
+ str[1] = val & UTF8_V_MASK;
+ str[1] |= UTF8_N_BITS;
+ val >>= UTF8_V_SHIFT;
+ str[0] = val;
+ str[0] |= UTF8_2_BITS;
+ len = 2;
+ } else if (val < 0x10000) {
+ str[2] = val & UTF8_V_MASK;
+ str[2] |= UTF8_N_BITS;
+ val >>= UTF8_V_SHIFT;
+ str[1] = val & UTF8_V_MASK;
+ str[1] |= UTF8_N_BITS;
+ val >>= UTF8_V_SHIFT;
+ str[0] = val;
+ str[0] |= UTF8_3_BITS;
+ len = 3;
+ } else if (val < 0x110000) {
+ str[3] = val & UTF8_V_MASK;
+ str[3] |= UTF8_N_BITS;
+ val >>= UTF8_V_SHIFT;
+ str[2] = val & UTF8_V_MASK;
+ str[2] |= UTF8_N_BITS;
+ val >>= UTF8_V_SHIFT;
+ str[1] = val & UTF8_V_MASK;
+ str[1] |= UTF8_N_BITS;
+ val >>= UTF8_V_SHIFT;
+ str[0] = val;
+ str[0] |= UTF8_4_BITS;
+ len = 4;
+ } else {
+ printf("%#x: illegal val\n", val);
+ len = 0;
+ }
+ return len;
+}
+
+static unsigned int utf8decode(const char *str)
+{
+ const unsigned char *s = (const unsigned char*)str;
+ unsigned int unichar = 0;
+
+ if (*s < 0x80) {
+ unichar = *s;
+ } else if (*s < UTF8_3_BITS) {
+ unichar = *s++ & 0x1F;
+ unichar <<= UTF8_V_SHIFT;
+ unichar |= *s & 0x3F;
+ } else if (*s < UTF8_4_BITS) {
+ unichar = *s++ & 0x0F;
+ unichar <<= UTF8_V_SHIFT;
+ unichar |= *s++ & 0x3F;
+ unichar <<= UTF8_V_SHIFT;
+ unichar |= *s & 0x3F;
+ } else {
+ unichar = *s++ & 0x0F;
+ unichar <<= UTF8_V_SHIFT;
+ unichar |= *s++ & 0x3F;
+ unichar <<= UTF8_V_SHIFT;
+ unichar |= *s++ & 0x3F;
+ unichar <<= UTF8_V_SHIFT;
+ unichar |= *s & 0x3F;
+ }
+ return unichar;
+}
+
+static int utf32valid(unsigned int unichar)
+{
+ return unichar < 0x110000;
+}
+
+#define HANGUL_SYLLABLE(U) ((U) >= 0xAC00 && (U) <= 0xD7A3)
+
+#define NODE 1
+#define LEAF 0
+
+struct tree {
+ void *root;
+ int childnode;
+ const char *type;
+ unsigned int maxage;
+ struct tree *next;
+ int (*leaf_equal)(void *, void *);
+ void (*leaf_print)(void *, int);
+ int (*leaf_mark)(void *);
+ int (*leaf_size)(void *);
+ int *(*leaf_index)(struct tree *, void *);
+ unsigned char *(*leaf_emit)(void *, unsigned char *);
+ int leafindex[0x110000];
+ int index;
+};
+
+struct node {
+ int index;
+ int offset;
+ int mark;
+ int size;
+ struct node *parent;
+ void *left;
+ void *right;
+ unsigned char bitnum;
+ unsigned char nextbyte;
+ unsigned char leftnode;
+ unsigned char rightnode;
+ unsigned int keybits;
+ unsigned int keymask;
+};
+
+/*
+ * Example lookup function for a tree.
+ */
+static void *lookup(struct tree *tree, const char *key)
+{
+ struct node *node;
+ void *leaf = NULL;
+
+ node = tree->root;
+ while (!leaf && node) {
+ if (node->nextbyte)
+ key++;
+ if (*key & (1 << (node->bitnum & 7))) {
+ /* Right leg */
+ if (node->rightnode == NODE) {
+ node = node->right;
+ } else if (node->rightnode == LEAF) {
+ leaf = node->right;
+ } else {
+ node = NULL;
+ }
+ } else {
+ /* Left leg */
+ if (node->leftnode == NODE) {
+ node = node->left;
+ } else if (node->leftnode == LEAF) {
+ leaf = node->left;
+ } else {
+ node = NULL;
+ }
+ }
+ }
+
+ return leaf;
+}
+
+/*
+ * A simple non-recursive tree walker: keep track of visits to the
+ * left and right branches in the leftmask and rightmask.
+ */
+static void tree_walk(struct tree *tree)
+{
+ struct node *node;
+ unsigned int leftmask;
+ unsigned int rightmask;
+ unsigned int bitmask;
+ int indent = 1;
+ int nodes, singletons, leaves;
+
+ nodes = singletons = leaves = 0;
+
+ printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
+ if (tree->childnode == LEAF) {
+ assert(tree->root);
+ tree->leaf_print(tree->root, indent);
+ leaves = 1;
+ } else {
+ assert(tree->childnode == NODE);
+ node = tree->root;
+ leftmask = rightmask = 0;
+ while (node) {
+ printf("%*snode @ %p bitnum %d nextbyte %d"
+ " left %p right %p mask %x bits %x\n",
+ indent, "", node,
+ node->bitnum, node->nextbyte,
+ node->left, node->right,
+ node->keymask, node->keybits);
+ nodes += 1;
+ if (!(node->left && node->right))
+ singletons += 1;
+
+ while (node) {
+ bitmask = 1 << node->bitnum;
+ if ((leftmask & bitmask) == 0) {
+ leftmask |= bitmask;
+ if (node->leftnode == LEAF) {
+ assert(node->left);
+ tree->leaf_print(node->left,
+ indent+1);
+ leaves += 1;
+ } else if (node->left) {
+ assert(node->leftnode == NODE);
+ indent += 1;
+ node = node->left;
+ break;
+ }
+ }
+ if ((rightmask & bitmask) == 0) {
+ rightmask |= bitmask;
+ if (node->rightnode == LEAF) {
+ assert(node->right);
+ tree->leaf_print(node->right,
+ indent+1);
+ leaves += 1;
+ } else if (node->right) {
+ assert(node->rightnode == NODE);
+ indent += 1;
+ node = node->right;
+ break;
+ }
+ }
+ leftmask &= ~bitmask;
+ rightmask &= ~bitmask;
+ node = node->parent;
+ indent -= 1;
+ }
+ }
+ }
+ printf("nodes %d leaves %d singletons %d\n",
+ nodes, leaves, singletons);
+}
+
+/*
+ * Allocate an initialize a new internal node.
+ */
+static struct node *alloc_node(struct node *parent)
+{
+ struct node *node;
+ int bitnum;
+
+ node = malloc(sizeof(*node));
+ node->left = node->right = NULL;
+ node->parent = parent;
+ node->leftnode = NODE;
+ node->rightnode = NODE;
+ node->keybits = 0;
+ node->keymask = 0;
+ node->mark = 0;
+ node->index = 0;
+ node->offset = -1;
+ node->size = 4;
+
+ if (node->parent) {
+ bitnum = parent->bitnum;
+ if ((bitnum & 7) == 0) {
+ node->bitnum = bitnum + 7 + 8;
+ node->nextbyte = 1;
+ } else {
+ node->bitnum = bitnum - 1;
+ node->nextbyte = 0;
+ }
+ } else {
+ node->bitnum = 7;
+ node->nextbyte = 0;
+ }
+
+ return node;
+}
+
+/*
+ * Insert a new leaf into the tree, and collapse any subtrees that are
+ * fully populated and end in identical leaves. A nextbyte tagged
+ * internal node will not be removed to preserve the tree's integrity.
+ * Note that due to the structure of utf8, no nextbyte tagged node
+ * will be a candidate for removal.
+ */
+static int insert(struct tree *tree, char *key, int keylen, void *leaf)
+{
+ struct node *node;
+ struct node *parent;
+ void **cursor;
+ int keybits;
+
+ assert(keylen >= 1 && keylen <= 4);
+
+ node = NULL;
+ cursor = &tree->root;
+ keybits = 8 * keylen;
+
+ /* Insert, creating path along the way. */
+ while (keybits) {
+ if (!*cursor)
+ *cursor = alloc_node(node);
+ node = *cursor;
+ if (node->nextbyte)
+ key++;
+ if (*key & (1 << (node->bitnum & 7)))
+ cursor = &node->right;
+ else
+ cursor = &node->left;
+ keybits--;
+ }
+ *cursor = leaf;
+
+ /* Merge subtrees if possible. */
+ while (node) {
+ if (*key & (1 << (node->bitnum & 7)))
+ node->rightnode = LEAF;
+ else
+ node->leftnode = LEAF;
+ if (node->nextbyte)
+ break;
+ if (node->leftnode == NODE || node->rightnode == NODE)
+ break;
+ assert(node->left);
+ assert(node->right);
+ /* Compare */
+ if (! tree->leaf_equal(node->left, node->right))
+ break;
+ /* Keep left, drop right leaf. */
+ leaf = node->left;
+ /* Check in parent */
+ parent = node->parent;
+ if (!parent) {
+ /* root of tree! */
+ tree->root = leaf;
+ tree->childnode = LEAF;
+ } else if (parent->left == node) {
+ parent->left = leaf;
+ parent->leftnode = LEAF;
+ if (parent->right) {
+ parent->keymask = 0;
+ parent->keybits = 0;
+ } else {
+ parent->keymask |= (1 << node->bitnum);
+ }
+ } else if (parent->right == node) {
+ parent->right = leaf;
+ parent->rightnode = LEAF;
+ if (parent->left) {
+ parent->keymask = 0;
+ parent->keybits = 0;
+ } else {
+ parent->keymask |= (1 << node->bitnum);
+ parent->keybits |= (1 << node->bitnum);
+ }
+ } else {
+ /* internal tree error */
+ assert(0);
+ }
+ free(node);
+ node = parent;
+ }
+
+ /* Propagate keymasks up along singleton chains. */
+ while (node) {
+ parent = node->parent;
+ if (!parent)
+ break;
+ /* Nix the mask for parents with two children. */
+ if (node->keymask == 0) {
+ parent->keymask = 0;
+ parent->keybits = 0;
+ } else if (parent->left && parent->right) {
+ parent->keymask = 0;
+ parent->keybits = 0;
+ } else {
+ assert((parent->keymask & node->keymask) == 0);
+ parent->keymask |= node->keymask;
+ parent->keymask |= (1 << parent->bitnum);
+ parent->keybits |= node->keybits;
+ if (parent->right)
+ parent->keybits |= (1 << parent->bitnum);
+ }
+ node = parent;
+ }
+
+ return 0;
+}
+
+/*
+ * Prune internal nodes.
+ *
+ * Fully populated subtrees that end at the same leaf have already
+ * been collapsed. There are still internal nodes that have for both
+ * their left and right branches a sequence of singletons that make
+ * identical choices and end in identical leaves. The keymask and
+ * keybits collected in the nodes describe the choices made in these
+ * singleton chains. When they are identical for the left and right
+ * branch of a node, and the two leaves comare identical, the node in
+ * question can be removed.
+ *
+ * Note that nodes with the nextbyte tag set will not be removed by
+ * this to ensure tree integrity. Note as well that the structure of
+ * utf8 ensures that these nodes would not have been candidates for
+ * removal in any case.
+ */
+static void prune(struct tree *tree)
+{
+ struct node *node;
+ struct node *left;
+ struct node *right;
+ struct node *parent;
+ void *leftleaf;
+ void *rightleaf;
+ unsigned int leftmask;
+ unsigned int rightmask;
+ unsigned int bitmask;
+ int count;
+
+ if (verbose > 0)
+ printf("Pruning %s_%x\n", tree->type, tree->maxage);
+
+ count = 0;
+ if (tree->childnode == LEAF)
+ return;
+ if (!tree->root)
+ return;
+
+ leftmask = rightmask = 0;
+ node = tree->root;
+ while (node) {
+ if (node->nextbyte)
+ goto advance;
+ if (node->leftnode == LEAF)
+ goto advance;
+ if (node->rightnode == LEAF)
+ goto advance;
+ if (!node->left)
+ goto advance;
+ if (!node->right)
+ goto advance;
+ left = node->left;
+ right = node->right;
+ if (left->keymask == 0)
+ goto advance;
+ if (right->keymask == 0)
+ goto advance;
+ if (left->keymask != right->keymask)
+ goto advance;
+ if (left->keybits != right->keybits)
+ goto advance;
+ leftleaf = NULL;
+ while (!leftleaf) {
+ assert(left->left || left->right);
+ if (left->leftnode == LEAF)
+ leftleaf = left->left;
+ else if (left->rightnode == LEAF)
+ leftleaf = left->right;
+ else if (left->left)
+ left = left->left;
+ else if (left->right)
+ left = left->right;
+ else
+ assert(0);
+ }
+ rightleaf = NULL;
+ while (!rightleaf) {
+ assert(right->left || right->right);
+ if (right->leftnode == LEAF)
+ rightleaf = right->left;
+ else if (right->rightnode == LEAF)
+ rightleaf = right->right;
+ else if (right->left)
+ right = right->left;
+ else if (right->right)
+ right = right->right;
+ else
+ assert(0);
+ }
+ if (! tree->leaf_equal(leftleaf, rightleaf))
+ goto advance;
+ /*
+ * This node has identical singleton-only subtrees.
+ * Remove it.
+ */
+ parent = node->parent;
+ left = node->left;
+ right = node->right;
+ if (parent->left == node)
+ parent->left = left;
+ else if (parent->right == node)
+ parent->right = left;
+ else
+ assert(0);
+ left->parent = parent;
+ left->keymask |= (1 << node->bitnum);
+ node->left = NULL;
+ while (node) {
+ bitmask = 1 << node->bitnum;
+ leftmask &= ~bitmask;
+ rightmask &= ~bitmask;
+ if (node->leftnode == NODE && node->left) {
+ left = node->left;
+ free(node);
+ count++;
+ node = left;
+ } else if (node->rightnode == NODE && node->right) {
+ right = node->right;
+ free(node);
+ count++;
+ node = right;
+ } else {
+ node = NULL;
+ }
+ }
+ /* Propagate keymasks up along singleton chains. */
+ node = parent;
+ /* Force re-check */
+ bitmask = 1 << node->bitnum;
+ leftmask &= ~bitmask;
+ rightmask &= ~bitmask;
+ for (;;) {
+ if (node->left && node->right)
+ break;
+ if (node->left) {
+ left = node->left;
+ node->keymask |= left->keymask;
+ node->keybits |= left->keybits;
+ }
+ if (node->right) {
+ right = node->right;
+ node->keymask |= right->keymask;
+ node->keybits |= right->keybits;
+ }
+ node->keymask |= (1 << node->bitnum);
+ node = node->parent;
+ /* Force re-check */
+ bitmask = 1 << node->bitnum;
+ leftmask &= ~bitmask;
+ rightmask &= ~bitmask;
+ }
+ advance:
+ bitmask = 1 << node->bitnum;
+ if ((leftmask & bitmask) == 0 &&
+ node->leftnode == NODE &&
+ node->left) {
+ leftmask |= bitmask;
+ node = node->left;
+ } else if ((rightmask & bitmask) == 0 &&
+ node->rightnode == NODE &&
+ node->right) {
+ rightmask |= bitmask;
+ node = node->right;
+ } else {
+ leftmask &= ~bitmask;
+ rightmask &= ~bitmask;
+ node = node->parent;
+ }
+ }
+ if (verbose > 0)
+ printf("Pruned %d nodes\n", count);
+}
+
+/*
+ * Mark the nodes in the tree that lead to leaves that must be
+ * emitted.
+ */
+static void mark_nodes(struct tree *tree)
+{
+ struct node *node;
+ struct node *n;
+ unsigned int leftmask;
+ unsigned int rightmask;
+ unsigned int bitmask;
+ int marked;
+
+ marked = 0;
+ if (verbose > 0)
+ printf("Marking %s_%x\n", tree->type, tree->maxage);
+ if (tree->childnode == LEAF)
+ goto done;
+
+ assert(tree->childnode == NODE);
+ node = tree->root;
+ leftmask = rightmask = 0;
+ while (node) {
+ bitmask = 1 << node->bitnum;
+ if ((leftmask & bitmask) == 0) {
+ leftmask |= bitmask;
+ if (node->leftnode == LEAF) {
+ assert(node->left);
+ if (tree->leaf_mark(node->left)) {
+ n = node;
+ while (n && !n->mark) {
+ marked++;
+ n->mark = 1;
+ n = n->parent;
+ }
+ }
+ } else if (node->left) {
+ assert(node->leftnode == NODE);
+ node = node->left;
+ continue;
+ }
+ }
+ if ((rightmask & bitmask) == 0) {
+ rightmask |= bitmask;
+ if (node->rightnode == LEAF) {
+ assert(node->right);
+ if (tree->leaf_mark(node->right)) {
+ n = node;
+ while (n && !n->mark) {
+ marked++;
+ n->mark = 1;
+ n = n->parent;
+ }
+ }
+ } else if (node->right) {
+ assert(node->rightnode == NODE);
+ node = node->right;
+ continue;
+ }
+ }
+ leftmask &= ~bitmask;
+ rightmask &= ~bitmask;
+ node = node->parent;
+ }
+
+ /* second pass: left siblings and singletons */
+
+ assert(tree->childnode == NODE);
+ node = tree->root;
+ leftmask = rightmask = 0;
+ while (node) {
+ bitmask = 1 << node->bitnum;
+ if ((leftmask & bitmask) == 0) {
+ leftmask |= bitmask;
+ if (node->leftnode == LEAF) {
+ assert(node->left);
+ if (tree->leaf_mark(node->left)) {
+ n = node;
+ while (n && !n->mark) {
+ marked++;
+ n->mark = 1;
+ n = n->parent;
+ }
+ }
+ } else if (node->left) {
+ assert(node->leftnode == NODE);
+ node = node->left;
+ if (!node->mark && node->parent->mark) {
+ marked++;
+ node->mark = 1;
+ }
+ continue;
+ }
+ }
+ if ((rightmask & bitmask) == 0) {
+ rightmask |= bitmask;
+ if (node->rightnode == LEAF) {
+ assert(node->right);
+ if (tree->leaf_mark(node->right)) {
+ n = node;
+ while (n && !n->mark) {
+ marked++;
+ n->mark = 1;
+ n = n->parent;
+ }
+ }
+ } else if (node->right) {
+ assert(node->rightnode == NODE);
+ node = node->right;
+ if (!node->mark && node->parent->mark &&
+ !node->parent->left) {
+ marked++;
+ node->mark = 1;
+ }
+ continue;
+ }
+ }
+ leftmask &= ~bitmask;
+ rightmask &= ~bitmask;
+ node = node->parent;
+ }
+done:
+ if (verbose > 0)
+ printf("Marked %d nodes\n", marked);
+}
+
+/*
+ * Compute the index of each node and leaf, which is the offset in the
+ * emitted trie. These values must be pre-computed because relative
+ * offsets between nodes are used to navigate the tree.
+ */
+static int index_nodes(struct tree *tree, int index)
+{
+ struct node *node;
+ unsigned int leftmask;
+ unsigned int rightmask;
+ unsigned int bitmask;
+ int count;
+ int indent;
+
+ /* Align to a cache line (or half a cache line?). */
+ while (index % 64)
+ index++;
+ tree->index = index;
+ indent = 1;
+ count = 0;
+
+ if (verbose > 0)
+ printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
+ if (tree->childnode == LEAF) {
+ index += tree->leaf_size(tree->root);
+ goto done;
+ }
+
+ assert(tree->childnode == NODE);
+ node = tree->root;
+ leftmask = rightmask = 0;
+ while (node) {
+ if (!node->mark)
+ goto skip;
+ count++;
+ if (node->index != index)
+ node->index = index;
+ index += node->size;
+skip:
+ while (node) {
+ bitmask = 1 << node->bitnum;
+ if (node->mark && (leftmask & bitmask) == 0) {
+ leftmask |= bitmask;
+ if (node->leftnode == LEAF) {
+ assert(node->left);
+ *tree->leaf_index(tree, node->left) =
+ index;
+ index += tree->leaf_size(node->left);
+ count++;
+ } else if (node->left) {
+ assert(node->leftnode == NODE);
+ indent += 1;
+ node = node->left;
+ break;
+ }
+ }
+ if (node->mark && (rightmask & bitmask) == 0) {
+ rightmask |= bitmask;
+ if (node->rightnode == LEAF) {
+ assert(node->right);
+ *tree->leaf_index(tree, node->right) = index;
+ index += tree->leaf_size(node->right);
+ count++;
+ } else if (node->right) {
+ assert(node->rightnode == NODE);
+ indent += 1;
+ node = node->right;
+ break;
+ }
+ }
+ leftmask &= ~bitmask;
+ rightmask &= ~bitmask;
+ node = node->parent;
+ indent -= 1;
+ }
+ }
+done:
+ /* Round up to a multiple of 16 */
+ while (index % 16)
+ index++;
+ if (verbose > 0)
+ printf("Final index %d\n", index);
+ return index;
+}
+
+/*
+ * Mark the nodes in a subtree, helper for size_nodes().
+ */
+static int mark_subtree(struct node *node)
+{
+ int changed;
+
+ if (!node || node->mark)
+ return 0;
+ node->mark = 1;
+ node->index = node->parent->index;
+ changed = 1;
+ if (node->leftnode == NODE)
+ changed += mark_subtree(node->left);
+ if (node->rightnode == NODE)
+ changed += mark_subtree(node->right);
+ return changed;
+}
+
+/*
+ * Compute the size of nodes and leaves. We start by assuming that
+ * each node needs to store a three-byte offset. The indexes of the
+ * nodes are calculated based on that, and then this function is
+ * called to see if the sizes of some nodes can be reduced. This is
+ * repeated until no more changes are seen.
+ */
+static int size_nodes(struct tree *tree)
+{
+ struct tree *next;
+ struct node *node;
+ struct node *right;
+ struct node *n;
+ unsigned int leftmask;
+ unsigned int rightmask;
+ unsigned int bitmask;
+ unsigned int pathbits;
+ unsigned int pathmask;
+ unsigned int nbit;
+ int changed;
+ int offset;
+ int size;
+ int indent;
+
+ indent = 1;
+ changed = 0;
+ size = 0;
+
+ if (verbose > 0)
+ printf("Sizing %s_%x\n", tree->type, tree->maxage);
+ if (tree->childnode == LEAF)
+ goto done;
+
+ assert(tree->childnode == NODE);
+ pathbits = 0;
+ pathmask = 0;
+ node = tree->root;
+ leftmask = rightmask = 0;
+ while (node) {
+ if (!node->mark)
+ goto skip;
+ offset = 0;
+ if (!node->left || !node->right) {
+ size = 1;
+ } else {
+ if (node->rightnode == NODE) {
+ /*
+ * If the right node is not marked,
+ * look for a corresponding node in
+ * the next tree. Such a node need
+ * not exist.
+ */
+ right = node->right;
+ next = tree->next;
+ while (!right->mark) {
+ assert(next);
+ n = next->root;
+ while (n->bitnum != node->bitnum) {
+ nbit = 1 << n->bitnum;
+ if (!(pathmask & nbit))
+ break;
+ if (pathbits & nbit) {
+ if (n->rightnode == LEAF)
+ break;
+ n = n->right;
+ } else {
+ if (n->leftnode == LEAF)
+ break;
+ n = n->left;
+ }
+ }
+ if (n->bitnum != node->bitnum)
+ break;
+ n = n->right;
+ right = n;
+ next = next->next;
+ }
+ /* Make sure the right node is marked. */
+ if (!right->mark)
+ changed += mark_subtree(right);
+ offset = right->index - node->index;
+ } else {
+ offset = *tree->leaf_index(tree, node->right);
+ offset -= node->index;
+ }
+ assert(offset >= 0);
+ assert(offset <= 0xffffff);
+ if (offset <= 0xff) {
+ size = 2;
+ } else if (offset <= 0xffff) {
+ size = 3;
+ } else { /* offset <= 0xffffff */
+ size = 4;
+ }
+ }
+ if (node->size != size || node->offset != offset) {
+ node->size = size;
+ node->offset = offset;
+ changed++;
+ }
+skip:
+ while (node) {
+ bitmask = 1 << node->bitnum;
+ pathmask |= bitmask;
+ if (node->mark && (leftmask & bitmask) == 0) {
+ leftmask |= bitmask;
+ if (node->leftnode == LEAF) {
+ assert(node->left);
+ } else if (node->left) {
+ assert(node->leftnode == NODE);
+ indent += 1;
+ node = node->left;
+ break;
+ }
+ }
+ if (node->mark && (rightmask & bitmask) == 0) {
+ rightmask |= bitmask;
+ pathbits |= bitmask;
+ if (node->rightnode == LEAF) {
+ assert(node->right);
+ } else if (node->right) {
+ assert(node->rightnode == NODE);
+ indent += 1;
+ node = node->right;
+ break;
+ }
+ }
+ leftmask &= ~bitmask;
+ rightmask &= ~bitmask;
+ pathmask &= ~bitmask;
+ pathbits &= ~bitmask;
+ node = node->parent;
+ indent -= 1;
+ }
+ }
+done:
+ if (verbose > 0)
+ printf("Found %d changes\n", changed);
+ return changed;
+}
+
+/*
+ * Emit a trie for the given tree into the data array.
+ */
+static void emit(struct tree *tree, unsigned char *data)
+{
+ struct node *node;
+ unsigned int leftmask;
+ unsigned int rightmask;
+ unsigned int bitmask;
+ int offlen;
+ int offset;
+ int index;
+ int indent;
+ int size;
+ int bytes;
+ int leaves;
+ int nodes[4];
+ unsigned char byte;
+
+ nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
+ leaves = 0;
+ bytes = 0;
+ index = tree->index;
+ data += index;
+ indent = 1;
+ if (verbose > 0)
+ printf("Emitting %s_%x\n", tree->type, tree->maxage);
+ if (tree->childnode == LEAF) {
+ assert(tree->root);
+ tree->leaf_emit(tree->root, data);
+ size = tree->leaf_size(tree->root);
+ index += size;
+ leaves++;
+ goto done;
+ }
+
+ assert(tree->childnode == NODE);
+ node = tree->root;
+ leftmask = rightmask = 0;
+ while (node) {
+ if (!node->mark)
+ goto skip;
+ assert(node->offset != -1);
+ assert(node->index == index);
+
+ byte = 0;
+ if (node->nextbyte)
+ byte |= NEXTBYTE;
+ byte |= (node->bitnum & BITNUM);
+ if (node->left && node->right) {
+ if (node->leftnode == NODE)
+ byte |= LEFTNODE;
+ if (node->rightnode == NODE)
+ byte |= RIGHTNODE;
+ if (node->offset <= 0xff)
+ offlen = 1;
+ else if (node->offset <= 0xffff)
+ offlen = 2;
+ else
+ offlen = 3;
+ nodes[offlen]++;
+ offset = node->offset;
+ byte |= offlen << OFFLEN_SHIFT;
+ *data++ = byte;
+ index++;
+ while (offlen--) {
+ *data++ = offset & 0xff;
+ index++;
+ offset >>= 8;
+ }
+ } else if (node->left) {
+ if (node->leftnode == NODE)
+ byte |= TRIENODE;
+ nodes[0]++;
+ *data++ = byte;
+ index++;
+ } else if (node->right) {
+ byte |= RIGHTNODE;
+ if (node->rightnode == NODE)
+ byte |= TRIENODE;
+ nodes[0]++;
+ *data++ = byte;
+ index++;
+ } else {
+ assert(0);
+ }
+skip:
+ while (node) {
+ bitmask = 1 << node->bitnum;
+ if (node->mark && (leftmask & bitmask) == 0) {
+ leftmask |= bitmask;
+ if (node->leftnode == LEAF) {
+ assert(node->left);
+ data = tree->leaf_emit(node->left,
+ data);
+ size = tree->leaf_size(node->left);
+ index += size;
+ bytes += size;
+ leaves++;
+ } else if (node->left) {
+ assert(node->leftnode == NODE);
+ indent += 1;
+ node = node->left;
+ break;
+ }
+ }
+ if (node->mark && (rightmask & bitmask) == 0) {
+ rightmask |= bitmask;
+ if (node->rightnode == LEAF) {
+ assert(node->right);
+ data = tree->leaf_emit(node->right,
+ data);
+ size = tree->leaf_size(node->right);
+ index += size;
+ bytes += size;
+ leaves++;
+ } else if (node->right) {
+ assert(node->rightnode == NODE);
+ indent += 1;
+ node = node->right;
+ break;
+ }
+ }
+ leftmask &= ~bitmask;
+ rightmask &= ~bitmask;
+ node = node->parent;
+ indent -= 1;
+ }
+ }
+done:
+ if (verbose > 0) {
+ printf("Emitted %d (%d) leaves",
+ leaves, bytes);
+ printf(" %d (%d+%d+%d+%d) nodes",
+ nodes[0] + nodes[1] + nodes[2] + nodes[3],
+ nodes[0], nodes[1], nodes[2], nodes[3]);
+ printf(" %d total\n", index - tree->index);
+ }
+}
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * Unicode data.
+ *
+ * We need to keep track of the Canonical Combining Class, the Age,
+ * and decompositions for a code point.
+ *
+ * For the Age, we store the index into the ages table. Effectively
+ * this is a generation number that the table maps to a unicode
+ * version.
+ *
+ * The correction field is used to indicate that this entry is in the
+ * corrections array, which contains decompositions that were
+ * corrected in later revisions. The value of the correction field is
+ * the Unicode version in which the mapping was corrected.
+ */
+struct unicode_data {
+ unsigned int code;
+ int ccc;
+ int gen;
+ int correction;
+ unsigned int *utf32nfkdi;
+ unsigned int *utf32nfkdicf;
+ char *utf8nfkdi;
+ char *utf8nfkdicf;
+};
+
+struct unicode_data unicode_data[0x110000];
+struct unicode_data *corrections;
+int corrections_count;
+
+struct tree *nfkdi_tree;
+struct tree *nfkdicf_tree;
+
+struct tree *trees;
+int trees_count;
+
+/*
+ * Check the corrections array to see if this entry was corrected at
+ * some point.
+ */
+static struct unicode_data *corrections_lookup(struct unicode_data *u)
+{
+ int i;
+
+ for (i = 0; i != corrections_count; i++)
+ if (u->code == corrections[i].code)
+ return &corrections[i];
+ return u;
+}
+
+static int nfkdi_equal(void *l, void *r)
+{
+ struct unicode_data *left = l;
+ struct unicode_data *right = r;
+
+ if (left->gen != right->gen)
+ return 0;
+ if (left->ccc != right->ccc)
+ return 0;
+ if (left->utf8nfkdi && right->utf8nfkdi &&
+ strcmp(left->utf8nfkdi, right->utf8nfkdi) == 0)
+ return 1;
+ if (left->utf8nfkdi || right->utf8nfkdi)
+ return 0;
+ return 1;
+}
+
+static int nfkdicf_equal(void *l, void *r)
+{
+ struct unicode_data *left = l;
+ struct unicode_data *right = r;
+
+ if (left->gen != right->gen)
+ return 0;
+ if (left->ccc != right->ccc)
+ return 0;
+ if (left->utf8nfkdicf && right->utf8nfkdicf &&
+ strcmp(left->utf8nfkdicf, right->utf8nfkdicf) == 0)
+ return 1;
+ if (left->utf8nfkdicf && right->utf8nfkdicf)
+ return 0;
+ if (left->utf8nfkdicf || right->utf8nfkdicf)
+ return 0;
+ if (left->utf8nfkdi && right->utf8nfkdi &&
+ strcmp(left->utf8nfkdi, right->utf8nfkdi) == 0)
+ return 1;
+ if (left->utf8nfkdi || right->utf8nfkdi)
+ return 0;
+ return 1;
+}
+
+static void nfkdi_print(void *l, int indent)
+{
+ struct unicode_data *leaf = l;
+
+ printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
+ leaf->code, leaf->ccc, leaf->gen);
+ if (leaf->utf8nfkdi && leaf->utf8nfkdi[0] == HANGUL)
+ printf(" nfkdi \"%s\"", "HANGUL SYLLABLE");
+ else if (leaf->utf8nfkdi)
+ printf(" nfkdi \"%s\"", (const char*)leaf->utf8nfkdi);
+ printf("\n");
+}
+
+static void nfkdicf_print(void *l, int indent)
+{
+ struct unicode_data *leaf = l;
+
+ printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
+ leaf->code, leaf->ccc, leaf->gen);
+ if (leaf->utf8nfkdicf)
+ printf(" nfkdicf \"%s\"", (const char*)leaf->utf8nfkdicf);
+ else if (leaf->utf8nfkdi && leaf->utf8nfkdi[0] == HANGUL)
+ printf(" nfkdi \"%s\"", "HANGUL SYLLABLE");
+ else if (leaf->utf8nfkdi)
+ printf(" nfkdi \"%s\"", (const char*)leaf->utf8nfkdi);
+ printf("\n");
+}
+
+static int nfkdi_mark(void *l)
+{
+ return 1;
+}
+
+static int nfkdicf_mark(void *l)
+{
+ struct unicode_data *leaf = l;
+
+ if (leaf->utf8nfkdicf)
+ return 1;
+ return 0;
+}
+
+static int correction_mark(void *l)
+{
+ struct unicode_data *leaf = l;
+
+ return leaf->correction;
+}
+
+static int nfkdi_size(void *l)
+{
+ struct unicode_data *leaf = l;
+
+ int size = 2;
+ if (HANGUL_SYLLABLE(leaf->code))
+ size += 1;
+ else if (leaf->utf8nfkdi)
+ size += strlen(leaf->utf8nfkdi) + 1;
+ return size;
+}
+
+static int nfkdicf_size(void *l)
+{
+ struct unicode_data *leaf = l;
+
+ int size = 2;
+ if (HANGUL_SYLLABLE(leaf->code))
+ size += 1;
+ else if (leaf->utf8nfkdicf)
+ size += strlen(leaf->utf8nfkdicf) + 1;
+ else if (leaf->utf8nfkdi)
+ size += strlen(leaf->utf8nfkdi) + 1;
+ return size;
+}
+
+static int *nfkdi_index(struct tree *tree, void *l)
+{
+ struct unicode_data *leaf = l;
+
+ return &tree->leafindex[leaf->code];
+}
+
+static int *nfkdicf_index(struct tree *tree, void *l)
+{
+ struct unicode_data *leaf = l;
+
+ return &tree->leafindex[leaf->code];
+}
+
+static unsigned char *nfkdi_emit(void *l, unsigned char *data)
+{
+ struct unicode_data *leaf = l;
+ unsigned char *s;
+
+ *data++ = leaf->gen;
+ if (HANGUL_SYLLABLE(leaf->code)) {
+ *data++ = DECOMPOSE;
+ *data++ = HANGUL;
+ } else if (leaf->utf8nfkdi) {
+ *data++ = DECOMPOSE;
+ s = (unsigned char*)leaf->utf8nfkdi;
+ while ((*data++ = *s++) != 0)
+ ;
+ } else {
+ *data++ = leaf->ccc;
+ }
+ return data;
+}
+
+static unsigned char *nfkdicf_emit(void *l, unsigned char *data)
+{
+ struct unicode_data *leaf = l;
+ unsigned char *s;
+
+ *data++ = leaf->gen;
+ if (HANGUL_SYLLABLE(leaf->code)) {
+ *data++ = DECOMPOSE;
+ *data++ = HANGUL;
+ } else if (leaf->utf8nfkdicf) {
+ *data++ = DECOMPOSE;
+ s = (unsigned char*)leaf->utf8nfkdicf;
+ while ((*data++ = *s++) != 0)
+ ;
+ } else if (leaf->utf8nfkdi) {
+ *data++ = DECOMPOSE;
+ s = (unsigned char*)leaf->utf8nfkdi;
+ while ((*data++ = *s++) != 0)
+ ;
+ } else {
+ *data++ = leaf->ccc;
+ }
+ return data;
+}
+
+static void utf8_create(struct unicode_data *data)
+{
+ char utf[18*4+1];
+ char *u;
+ unsigned int *um;
+ int i;
+
+ if (data->utf8nfkdi) {
+ assert(data->utf8nfkdi[0] == HANGUL);
+ return;
+ }
+
+ u = utf;
+ um = data->utf32nfkdi;
+ if (um) {
+ for (i = 0; um[i]; i++)
+ u += utf8encode(u, um[i]);
+ *u = '\0';
+ data->utf8nfkdi = strdup(utf);
+ }
+ u = utf;
+ um = data->utf32nfkdicf;
+ if (um) {
+ for (i = 0; um[i]; i++)
+ u += utf8encode(u, um[i]);
+ *u = '\0';
+ if (!data->utf8nfkdi || strcmp(data->utf8nfkdi, utf))
+ data->utf8nfkdicf = strdup(utf);
+ }
+}
+
+static void utf8_init(void)
+{
+ unsigned int unichar;
+ int i;
+
+ for (unichar = 0; unichar != 0x110000; unichar++)
+ utf8_create(&unicode_data[unichar]);
+
+ for (i = 0; i != corrections_count; i++)
+ utf8_create(&corrections[i]);
+}
+
+static void trees_init(void)
+{
+ struct unicode_data *data;
+ unsigned int maxage;
+ unsigned int nextage;
+ int count;
+ int i;
+ int j;
+
+ /* Count the number of different ages. */
+ count = 0;
+ nextage = (unsigned int)-1;
+ do {
+ maxage = nextage;
+ nextage = 0;
+ for (i = 0; i <= corrections_count; i++) {
+ data = &corrections[i];
+ if (nextage < data->correction &&
+ data->correction < maxage)
+ nextage = data->correction;
+ }
+ count++;
+ } while (nextage);
+
+ /* Two trees per age: nfkdi and nfkdicf */
+ trees_count = count * 2;
+ trees = calloc(trees_count, sizeof(struct tree));
+
+ /* Assign ages to the trees. */
+ count = trees_count;
+ nextage = (unsigned int)-1;
+ do {
+ maxage = nextage;
+ trees[--count].maxage = maxage;
+ trees[--count].maxage = maxage;
+ nextage = 0;
+ for (i = 0; i <= corrections_count; i++) {
+ data = &corrections[i];
+ if (nextage < data->correction &&
+ data->correction < maxage)
+ nextage = data->correction;
+ }
+ } while (nextage);
+
+ /* The ages assigned above are off by one. */
+ for (i = 0; i != trees_count; i++) {
+ j = 0;
+ while (ages[j] < trees[i].maxage)
+ j++;
+ trees[i].maxage = ages[j-1];
+ }
+
+ /* Set up the forwarding between trees. */
+ trees[trees_count-2].next = &trees[trees_count-1];
+ trees[trees_count-1].leaf_mark = nfkdi_mark;
+ trees[trees_count-2].leaf_mark = nfkdicf_mark;
+ for (i = 0; i != trees_count-2; i += 2) {
+ trees[i].next = &trees[trees_count-2];
+ trees[i].leaf_mark = correction_mark;
+ trees[i+1].next = &trees[trees_count-1];
+ trees[i+1].leaf_mark = correction_mark;
+ }
+
+ /* Assign the callouts. */
+ for (i = 0; i != trees_count; i += 2) {
+ trees[i].type = "nfkdicf";
+ trees[i].leaf_equal = nfkdicf_equal;
+ trees[i].leaf_print = nfkdicf_print;
+ trees[i].leaf_size = nfkdicf_size;
+ trees[i].leaf_index = nfkdicf_index;
+ trees[i].leaf_emit = nfkdicf_emit;
+
+ trees[i+1].type = "nfkdi";
+ trees[i+1].leaf_equal = nfkdi_equal;
+ trees[i+1].leaf_print = nfkdi_print;
+ trees[i+1].leaf_size = nfkdi_size;
+ trees[i+1].leaf_index = nfkdi_index;
+ trees[i+1].leaf_emit = nfkdi_emit;
+ }
+
+ /* Finish init. */
+ for (i = 0; i != trees_count; i++)
+ trees[i].childnode = NODE;
+}
+
+static void trees_populate(void)
+{
+ struct unicode_data *data;
+ unsigned int unichar;
+ char keyval[4];
+ int keylen;
+ int i;
+
+ for (i = 0; i != trees_count; i++) {
+ if (verbose > 0) {
+ printf("Populating %s_%x\n",
+ trees[i].type, trees[i].maxage);
+ }
+ for (unichar = 0; unichar != 0x110000; unichar++) {
+ if (unicode_data[unichar].gen < 0)
+ continue;
+ keylen = utf8encode(keyval, unichar);
+ data = corrections_lookup(&unicode_data[unichar]);
+ if (data->correction <= trees[i].maxage)
+ data = &unicode_data[unichar];
+ insert(&trees[i], keyval, keylen, data);
+ }
+ }
+}
+
+static void trees_reduce(void)
+{
+ int i;
+ int size;
+ int changed;
+
+ for (i = 0; i != trees_count; i++)
+ prune(&trees[i]);
+ for (i = 0; i != trees_count; i++)
+ mark_nodes(&trees[i]);
+ do {
+ size = 0;
+ for (i = 0; i != trees_count; i++)
+ size = index_nodes(&trees[i], size);
+ changed = 0;
+ for (i = 0; i != trees_count; i++)
+ changed += size_nodes(&trees[i]);
+ } while (changed);
+
+ utf8data = calloc(size, 1);
+ utf8data_size = size;
+ for (i = 0; i != trees_count; i++)
+ emit(&trees[i], utf8data);
+
+ if (verbose > 0) {
+ for (i = 0; i != trees_count; i++) {
+ printf("%s_%x idx %d\n",
+ trees[i].type, trees[i].maxage, trees[i].index);
+ }
+ }
+
+ nfkdi = utf8data + trees[trees_count-1].index;
+ nfkdicf = utf8data + trees[trees_count-2].index;
+
+ nfkdi_tree = &trees[trees_count-1];
+ nfkdicf_tree = &trees[trees_count-2];
+}
+
+static void verify(struct tree *tree)
+{
+ struct unicode_data *data;
+ utf8leaf_t *leaf;
+ unsigned int unichar;
+ char key[4];
+ unsigned char hangul[UTF8HANGULLEAF];
+ int report;
+ int nocf;
+
+ if (verbose > 0)
+ printf("Verifying %s_%x\n", tree->type, tree->maxage);
+ nocf = strcmp(tree->type, "nfkdicf");
+
+ for (unichar = 0; unichar != 0x110000; unichar++) {
+ report = 0;
+ data = corrections_lookup(&unicode_data[unichar]);
+ if (data->correction <= tree->maxage)
+ data = &unicode_data[unichar];
+ utf8encode(key,unichar);
+ leaf = utf8lookup(tree, hangul, key);
+
+ if (!leaf) {
+ if (data->gen != -1)
+ report++;
+ if (unichar < 0xd800 || unichar > 0xdfff)
+ report++;
+ } else {
+ if (unichar >= 0xd800 && unichar <= 0xdfff)
+ report++;
+ if (data->gen == -1)
+ report++;
+ if (data->gen != LEAF_GEN(leaf))
+ report++;
+ if (LEAF_CCC(leaf) == DECOMPOSE) {
+ if (HANGUL_SYLLABLE(data->code)) {
+ if (data->utf8nfkdi[0] != HANGUL)
+ report++;
+ } else if (nocf) {
+ if (!data->utf8nfkdi) {
+ report++;
+ } else if (strcmp(data->utf8nfkdi,
+ LEAF_STR(leaf))) {
+ report++;
+ }
+ } else {
+ if (!data->utf8nfkdicf &&
+ !data->utf8nfkdi) {
+ report++;
+ } else if (data->utf8nfkdicf) {
+ if (strcmp(data->utf8nfkdicf,
+ LEAF_STR(leaf)))
+ report++;
+ } else if (strcmp(data->utf8nfkdi,
+ LEAF_STR(leaf))) {
+ report++;
+ }
+ }
+ } else if (data->ccc != LEAF_CCC(leaf)) {
+ report++;
+ }
+ }
+ if (report) {
+ printf("%X code %X gen %d ccc %d"
+ " nfkdi -> \"%s\"",
+ unichar, data->code, data->gen,
+ data->ccc,
+ data->utf8nfkdi);
+ if (leaf) {
+ printf(" gen %d ccc %d"
+ " nfkdi -> \"%s\"",
+ LEAF_GEN(leaf),
+ LEAF_CCC(leaf),
+ LEAF_CCC(leaf) == DECOMPOSE ?
+ LEAF_STR(leaf) : "");
+ }
+ printf("\n");
+ }
+ }
+}
+
+static void trees_verify(void)
+{
+ int i;
+
+ for (i = 0; i != trees_count; i++)
+ verify(&trees[i]);
+}
+
+/* ------------------------------------------------------------------ */
+
+static void help(void)
+{
+ printf("Usage: %s [options]\n", argv0);
+ printf("\n");
+ printf("This program creates an a data trie used for parsing and\n");
+ printf("normalization of UTF-8 strings. The trie is derived from\n");
+ printf("a set of input files from the Unicode character database\n");
+ printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
+ printf("\n");
+ printf("The generated tree supports two normalization forms:\n");
+ printf("\n");
+ printf("\tnfkdi:\n");
+ printf("\t- Apply unicode normalization form NFKD.\n");
+ printf("\t- Remove any Default_Ignorable_Code_Point.\n");
+ printf("\n");
+ printf("\tnfkdicf:\n");
+ printf("\t- Apply unicode normalization form NFKD.\n");
+ printf("\t- Remove any Default_Ignorable_Code_Point.\n");
+ printf("\t- Apply a full casefold (C + F).\n");
+ printf("\n");
+ printf("These forms were chosen as being most useful when dealing\n");
+ printf("with file names: NFKD catches most cases where characters\n");
+ printf("should be considered equivalent. The ignorables are mostly\n");
+ printf("invisible, making names hard to type.\n");
+ printf("\n");
+ printf("The options to specify the files to be used are listed\n");
+ printf("below with their default values, which are the names used\n");
+ printf("by version 11.0.0 of the Unicode Character Database.\n");
+ printf("\n");
+ printf("The input files:\n");
+ printf("\t-a %s\n", AGE_NAME);
+ printf("\t-c %s\n", CCC_NAME);
+ printf("\t-p %s\n", PROP_NAME);
+ printf("\t-d %s\n", DATA_NAME);
+ printf("\t-f %s\n", FOLD_NAME);
+ printf("\t-n %s\n", NORM_NAME);
+ printf("\n");
+ printf("Additionally, the generated tables are tested using:\n");
+ printf("\t-t %s\n", TEST_NAME);
+ printf("\n");
+ printf("Finally, the output file:\n");
+ printf("\t-o %s\n", UTF8_NAME);
+ printf("\n");
+}
+
+static void usage(void)
+{
+ help();
+ exit(1);
+}
+
+static void open_fail(const char *name, int error)
+{
+ printf("Error %d opening %s: %s\n", error, name, strerror(error));
+ exit(1);
+}
+
+static void file_fail(const char *filename)
+{
+ printf("Error parsing %s\n", filename);
+ exit(1);
+}
+
+static void line_fail(const char *filename, const char *line)
+{
+ printf("Error parsing %s:%s\n", filename, line);
+ exit(1);
+}
+
+/* ------------------------------------------------------------------ */
+
+static void print_utf32(unsigned int *utf32str)
+{
+ int i;
+
+ for (i = 0; utf32str[i]; i++)
+ printf(" %X", utf32str[i]);
+}
+
+static void print_utf32nfkdi(unsigned int unichar)
+{
+ printf(" %X ->", unichar);
+ print_utf32(unicode_data[unichar].utf32nfkdi);
+ printf("\n");
+}
+
+static void print_utf32nfkdicf(unsigned int unichar)
+{
+ printf(" %X ->", unichar);
+ print_utf32(unicode_data[unichar].utf32nfkdicf);
+ printf("\n");
+}
+
+/* ------------------------------------------------------------------ */
+
+static void age_init(void)
+{
+ FILE *file;
+ unsigned int first;
+ unsigned int last;
+ unsigned int unichar;
+ unsigned int major;
+ unsigned int minor;
+ unsigned int revision;
+ int gen;
+ int count;
+ int ret;
+
+ if (verbose > 0)
+ printf("Parsing %s\n", age_name);
+
+ file = fopen(age_name, "r");
+ if (!file)
+ open_fail(age_name, errno);
+ count = 0;
+
+ gen = 0;
+ while (fgets(line, LINESIZE, file)) {
+ ret = sscanf(line, "# Age=V%d_%d_%d",
+ &major, &minor, &revision);
+ if (ret == 3) {
+ ages_count++;
+ if (verbose > 1)
+ printf(" Age V%d_%d_%d\n",
+ major, minor, revision);
+ if (!age_valid(major, minor, revision))
+ line_fail(age_name, line);
+ continue;
+ }
+ ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
+ if (ret == 2) {
+ ages_count++;
+ if (verbose > 1)
+ printf(" Age V%d_%d\n", major, minor);
+ if (!age_valid(major, minor, 0))
+ line_fail(age_name, line);
+ continue;
+ }
+ }
+
+ /* We must have found something above. */
+ if (verbose > 1)
+ printf("%d age entries\n", ages_count);
+ if (ages_count == 0 || ages_count > MAXGEN)
+ file_fail(age_name);
+
+ /* There is a 0 entry. */
+ ages_count++;
+ ages = calloc(ages_count + 1, sizeof(*ages));
+ /* And a guard entry. */
+ ages[ages_count] = (unsigned int)-1;
+
+ rewind(file);
+ count = 0;
+ gen = 0;
+ while (fgets(line, LINESIZE, file)) {
+ ret = sscanf(line, "# Age=V%d_%d_%d",
+ &major, &minor, &revision);
+ if (ret == 3) {
+ ages[++gen] =
+ UNICODE_AGE(major, minor, revision);
+ if (verbose > 1)
+ printf(" Age V%d_%d_%d = gen %d\n",
+ major, minor, revision, gen);
+ if (!age_valid(major, minor, revision))
+ line_fail(age_name, line);
+ continue;
+ }
+ ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
+ if (ret == 2) {
+ ages[++gen] = UNICODE_AGE(major, minor, 0);
+ if (verbose > 1)
+ printf(" Age V%d_%d = %d\n",
+ major, minor, gen);
+ if (!age_valid(major, minor, 0))
+ line_fail(age_name, line);
+ continue;
+ }
+ ret = sscanf(line, "%X..%X ; %d.%d #",
+ &first, &last, &major, &minor);
+ if (ret == 4) {
+ for (unichar = first; unichar <= last; unichar++)
+ unicode_data[unichar].gen = gen;
+ count += 1 + last - first;
+ if (verbose > 1)
+ printf(" %X..%X gen %d\n", first, last, gen);
+ if (!utf32valid(first) || !utf32valid(last))
+ line_fail(age_name, line);
+ continue;
+ }
+ ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
+ if (ret == 3) {
+ unicode_data[unichar].gen = gen;
+ count++;
+ if (verbose > 1)
+ printf(" %X gen %d\n", unichar, gen);
+ if (!utf32valid(unichar))
+ line_fail(age_name, line);
+ continue;
+ }
+ }
+ unicode_maxage = ages[gen];
+ fclose(file);
+
+ /* Nix surrogate block */
+ if (verbose > 1)
+ printf(" Removing surrogate block D800..DFFF\n");
+ for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
+ unicode_data[unichar].gen = -1;
+
+ if (verbose > 0)
+ printf("Found %d entries\n", count);
+ if (count == 0)
+ file_fail(age_name);
+}
+
+static void ccc_init(void)
+{
+ FILE *file;
+ unsigned int first;
+ unsigned int last;
+ unsigned int unichar;
+ unsigned int value;
+ int count;
+ int ret;
+
+ if (verbose > 0)
+ printf("Parsing %s\n", ccc_name);
+
+ file = fopen(ccc_name, "r");
+ if (!file)
+ open_fail(ccc_name, errno);
+
+ count = 0;
+ while (fgets(line, LINESIZE, file)) {
+ ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
+ if (ret == 3) {
+ for (unichar = first; unichar <= last; unichar++) {
+ unicode_data[unichar].ccc = value;
+ count++;
+ }
+ if (verbose > 1)
+ printf(" %X..%X ccc %d\n", first, last, value);
+ if (!utf32valid(first) || !utf32valid(last))
+ line_fail(ccc_name, line);
+ continue;
+ }
+ ret = sscanf(line, "%X ; %d #", &unichar, &value);
+ if (ret == 2) {
+ unicode_data[unichar].ccc = value;
+ count++;
+ if (verbose > 1)
+ printf(" %X ccc %d\n", unichar, value);
+ if (!utf32valid(unichar))
+ line_fail(ccc_name, line);
+ continue;
+ }
+ }
+ fclose(file);
+
+ if (verbose > 0)
+ printf("Found %d entries\n", count);
+ if (count == 0)
+ file_fail(ccc_name);
+}
+
+static void nfkdi_init(void)
+{
+ FILE *file;
+ unsigned int unichar;
+ unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
+ char *s;
+ unsigned int *um;
+ int count;
+ int i;
+ int ret;
+
+ if (verbose > 0)
+ printf("Parsing %s\n", data_name);
+ file = fopen(data_name, "r");
+ if (!file)
+ open_fail(data_name, errno);
+
+ count = 0;
+ while (fgets(line, LINESIZE, file)) {
+ ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
+ &unichar, buf0);
+ if (ret != 2)
+ continue;
+ if (!utf32valid(unichar))
+ line_fail(data_name, line);
+
+ s = buf0;
+ /* skip over <tag> */
+ if (*s == '<')
+ while (*s++ != ' ')
+ ;
+ /* decode the decomposition into UTF-32 */
+ i = 0;
+ while (*s) {
+ mapping[i] = strtoul(s, &s, 16);
+ if (!utf32valid(mapping[i]))
+ line_fail(data_name, line);
+ i++;
+ }
+ mapping[i++] = 0;
+
+ um = malloc(i * sizeof(unsigned int));
+ memcpy(um, mapping, i * sizeof(unsigned int));
+ unicode_data[unichar].utf32nfkdi = um;
+
+ if (verbose > 1)
+ print_utf32nfkdi(unichar);
+ count++;
+ }
+ fclose(file);
+ if (verbose > 0)
+ printf("Found %d entries\n", count);
+ if (count == 0)
+ file_fail(data_name);
+}
+
+static void nfkdicf_init(void)
+{
+ FILE *file;
+ unsigned int unichar;
+ unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
+ char status;
+ char *s;
+ unsigned int *um;
+ int i;
+ int count;
+ int ret;
+
+ if (verbose > 0)
+ printf("Parsing %s\n", fold_name);
+ file = fopen(fold_name, "r");
+ if (!file)
+ open_fail(fold_name, errno);
+
+ count = 0;
+ while (fgets(line, LINESIZE, file)) {
+ ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
+ if (ret != 3)
+ continue;
+ if (!utf32valid(unichar))
+ line_fail(fold_name, line);
+ /* Use the C+F casefold. */
+ if (status != 'C' && status != 'F')
+ continue;
+ s = buf0;
+ if (*s == '<')
+ while (*s++ != ' ')
+ ;
+ i = 0;
+ while (*s) {
+ mapping[i] = strtoul(s, &s, 16);
+ if (!utf32valid(mapping[i]))
+ line_fail(fold_name, line);
+ i++;
+ }
+ mapping[i++] = 0;
+
+ um = malloc(i * sizeof(unsigned int));
+ memcpy(um, mapping, i * sizeof(unsigned int));
+ unicode_data[unichar].utf32nfkdicf = um;
+
+ if (verbose > 1)
+ print_utf32nfkdicf(unichar);
+ count++;
+ }
+ fclose(file);
+ if (verbose > 0)
+ printf("Found %d entries\n", count);
+ if (count == 0)
+ file_fail(fold_name);
+}
+
+static void ignore_init(void)
+{
+ FILE *file;
+ unsigned int unichar;
+ unsigned int first;
+ unsigned int last;
+ unsigned int *um;
+ int count;
+ int ret;
+
+ if (verbose > 0)
+ printf("Parsing %s\n", prop_name);
+ file = fopen(prop_name, "r");
+ if (!file)
+ open_fail(prop_name, errno);
+ assert(file);
+ count = 0;
+ while (fgets(line, LINESIZE, file)) {
+ ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
+ if (ret == 3) {
+ if (strcmp(buf0, "Default_Ignorable_Code_Point"))
+ continue;
+ if (!utf32valid(first) || !utf32valid(last))
+ line_fail(prop_name, line);
+ for (unichar = first; unichar <= last; unichar++) {
+ free(unicode_data[unichar].utf32nfkdi);
+ um = malloc(sizeof(unsigned int));
+ *um = 0;
+ unicode_data[unichar].utf32nfkdi = um;
+ free(unicode_data[unichar].utf32nfkdicf);
+ um = malloc(sizeof(unsigned int));
+ *um = 0;
+ unicode_data[unichar].utf32nfkdicf = um;
+ count++;
+ }
+ if (verbose > 1)
+ printf(" %X..%X Default_Ignorable_Code_Point\n",
+ first, last);
+ continue;
+ }
+ ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
+ if (ret == 2) {
+ if (strcmp(buf0, "Default_Ignorable_Code_Point"))
+ continue;
+ if (!utf32valid(unichar))
+ line_fail(prop_name, line);
+ free(unicode_data[unichar].utf32nfkdi);
+ um = malloc(sizeof(unsigned int));
+ *um = 0;
+ unicode_data[unichar].utf32nfkdi = um;
+ free(unicode_data[unichar].utf32nfkdicf);
+ um = malloc(sizeof(unsigned int));
+ *um = 0;
+ unicode_data[unichar].utf32nfkdicf = um;
+ if (verbose > 1)
+ printf(" %X Default_Ignorable_Code_Point\n",
+ unichar);
+ count++;
+ continue;
+ }
+ }
+ fclose(file);
+
+ if (verbose > 0)
+ printf("Found %d entries\n", count);
+ if (count == 0)
+ file_fail(prop_name);
+}
+
+static void corrections_init(void)
+{
+ FILE *file;
+ unsigned int unichar;
+ unsigned int major;
+ unsigned int minor;
+ unsigned int revision;
+ unsigned int age;
+ unsigned int *um;
+ unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
+ char *s;
+ int i;
+ int count;
+ int ret;
+
+ if (verbose > 0)
+ printf("Parsing %s\n", norm_name);
+ file = fopen(norm_name, "r");
+ if (!file)
+ open_fail(norm_name, errno);
+
+ count = 0;
+ while (fgets(line, LINESIZE, file)) {
+ ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
+ &unichar, buf0, buf1,
+ &major, &minor, &revision);
+ if (ret != 6)
+ continue;
+ if (!utf32valid(unichar) || !age_valid(major, minor, revision))
+ line_fail(norm_name, line);
+ count++;
+ }
+ corrections = calloc(count, sizeof(struct unicode_data));
+ corrections_count = count;
+ rewind(file);
+
+ count = 0;
+ while (fgets(line, LINESIZE, file)) {
+ ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
+ &unichar, buf0, buf1,
+ &major, &minor, &revision);
+ if (ret != 6)
+ continue;
+ if (!utf32valid(unichar) || !age_valid(major, minor, revision))
+ line_fail(norm_name, line);
+ corrections[count] = unicode_data[unichar];
+ assert(corrections[count].code == unichar);
+ age = UNICODE_AGE(major, minor, revision);
+ corrections[count].correction = age;
+
+ i = 0;
+ s = buf0;
+ while (*s) {
+ mapping[i] = strtoul(s, &s, 16);
+ if (!utf32valid(mapping[i]))
+ line_fail(norm_name, line);
+ i++;
+ }
+ mapping[i++] = 0;
+
+ um = malloc(i * sizeof(unsigned int));
+ memcpy(um, mapping, i * sizeof(unsigned int));
+ corrections[count].utf32nfkdi = um;
+
+ if (verbose > 1)
+ printf(" %X -> %s -> %s V%d_%d_%d\n",
+ unichar, buf0, buf1, major, minor, revision);
+ count++;
+ }
+ fclose(file);
+
+ if (verbose > 0)
+ printf("Found %d entries\n", count);
+ if (count == 0)
+ file_fail(norm_name);
+}
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
+ *
+ * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
+ * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
+ *
+ * SBase = 0xAC00
+ * LBase = 0x1100
+ * VBase = 0x1161
+ * TBase = 0x11A7
+ * LCount = 19
+ * VCount = 21
+ * TCount = 28
+ * NCount = 588 (VCount * TCount)
+ * SCount = 11172 (LCount * NCount)
+ *
+ * Decomposition:
+ * SIndex = s - SBase
+ *
+ * LV (Canonical/Full)
+ * LIndex = SIndex / NCount
+ * VIndex = (Sindex % NCount) / TCount
+ * LPart = LBase + LIndex
+ * VPart = VBase + VIndex
+ *
+ * LVT (Canonical)
+ * LVIndex = (SIndex / TCount) * TCount
+ * TIndex = (Sindex % TCount)
+ * LVPart = SBase + LVIndex
+ * TPart = TBase + TIndex
+ *
+ * LVT (Full)
+ * LIndex = SIndex / NCount
+ * VIndex = (Sindex % NCount) / TCount
+ * TIndex = (Sindex % TCount)
+ * LPart = LBase + LIndex
+ * VPart = VBase + VIndex
+ * if (TIndex == 0) {
+ * d = <LPart, VPart>
+ * } else {
+ * TPart = TBase + TIndex
+ * d = <LPart, VPart, TPart>
+ * }
+ *
+ */
+
+static void hangul_decompose(void)
+{
+ unsigned int sb = 0xAC00;
+ unsigned int lb = 0x1100;
+ unsigned int vb = 0x1161;
+ unsigned int tb = 0x11a7;
+ /* unsigned int lc = 19; */
+ unsigned int vc = 21;
+ unsigned int tc = 28;
+ unsigned int nc = (vc * tc);
+ /* unsigned int sc = (lc * nc); */
+ unsigned int unichar;
+ unsigned int mapping[4];
+ unsigned int *um;
+ int count;
+ int i;
+
+ if (verbose > 0)
+ printf("Decomposing hangul\n");
+ /* Hangul */
+ count = 0;
+ for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
+ unsigned int si = unichar - sb;
+ unsigned int li = si / nc;
+ unsigned int vi = (si % nc) / tc;
+ unsigned int ti = si % tc;
+
+ i = 0;
+ mapping[i++] = lb + li;
+ mapping[i++] = vb + vi;
+ if (ti)
+ mapping[i++] = tb + ti;
+ mapping[i++] = 0;
+
+ assert(!unicode_data[unichar].utf32nfkdi);
+ um = malloc(i * sizeof(unsigned int));
+ memcpy(um, mapping, i * sizeof(unsigned int));
+ unicode_data[unichar].utf32nfkdi = um;
+
+ assert(!unicode_data[unichar].utf32nfkdicf);
+ um = malloc(i * sizeof(unsigned int));
+ memcpy(um, mapping, i * sizeof(unsigned int));
+ unicode_data[unichar].utf32nfkdicf = um;
+
+ /*
+ * Add a cookie as a reminder that the hangul syllable
+ * decompositions must not be stored in the generated
+ * trie.
+ */
+ unicode_data[unichar].utf8nfkdi = malloc(2);
+ unicode_data[unichar].utf8nfkdi[0] = HANGUL;
+ unicode_data[unichar].utf8nfkdi[1] = '\0';
+
+ if (verbose > 1)
+ print_utf32nfkdi(unichar);
+
+ count++;
+ }
+ if (verbose > 0)
+ printf("Created %d entries\n", count);
+}
+
+static void nfkdi_decompose(void)
+{
+ unsigned int unichar;
+ unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
+ unsigned int *um;
+ unsigned int *dc;
+ int count;
+ int i;
+ int j;
+ int ret;
+
+ if (verbose > 0)
+ printf("Decomposing nfkdi\n");
+
+ count = 0;
+ for (unichar = 0; unichar != 0x110000; unichar++) {
+ if (!unicode_data[unichar].utf32nfkdi)
+ continue;
+ for (;;) {
+ ret = 1;
+ i = 0;
+ um = unicode_data[unichar].utf32nfkdi;
+ while (*um) {
+ dc = unicode_data[*um].utf32nfkdi;
+ if (dc) {
+ for (j = 0; dc[j]; j++)
+ mapping[i++] = dc[j];
+ ret = 0;
+ } else {
+ mapping[i++] = *um;
+ }
+ um++;
+ }
+ mapping[i++] = 0;
+ if (ret)
+ break;
+ free(unicode_data[unichar].utf32nfkdi);
+ um = malloc(i * sizeof(unsigned int));
+ memcpy(um, mapping, i * sizeof(unsigned int));
+ unicode_data[unichar].utf32nfkdi = um;
+ }
+ /* Add this decomposition to nfkdicf if there is no entry. */
+ if (!unicode_data[unichar].utf32nfkdicf) {
+ um = malloc(i * sizeof(unsigned int));
+ memcpy(um, mapping, i * sizeof(unsigned int));
+ unicode_data[unichar].utf32nfkdicf = um;
+ }
+ if (verbose > 1)
+ print_utf32nfkdi(unichar);
+ count++;
+ }
+ if (verbose > 0)
+ printf("Processed %d entries\n", count);
+}
+
+static void nfkdicf_decompose(void)
+{
+ unsigned int unichar;
+ unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
+ unsigned int *um;
+ unsigned int *dc;
+ int count;
+ int i;
+ int j;
+ int ret;
+
+ if (verbose > 0)
+ printf("Decomposing nfkdicf\n");
+ count = 0;
+ for (unichar = 0; unichar != 0x110000; unichar++) {
+ if (!unicode_data[unichar].utf32nfkdicf)
+ continue;
+ for (;;) {
+ ret = 1;
+ i = 0;
+ um = unicode_data[unichar].utf32nfkdicf;
+ while (*um) {
+ dc = unicode_data[*um].utf32nfkdicf;
+ if (dc) {
+ for (j = 0; dc[j]; j++)
+ mapping[i++] = dc[j];
+ ret = 0;
+ } else {
+ mapping[i++] = *um;
+ }
+ um++;
+ }
+ mapping[i++] = 0;
+ if (ret)
+ break;
+ free(unicode_data[unichar].utf32nfkdicf);
+ um = malloc(i * sizeof(unsigned int));
+ memcpy(um, mapping, i * sizeof(unsigned int));
+ unicode_data[unichar].utf32nfkdicf = um;
+ }
+ if (verbose > 1)
+ print_utf32nfkdicf(unichar);
+ count++;
+ }
+ if (verbose > 0)
+ printf("Processed %d entries\n", count);
+}
+
+/* ------------------------------------------------------------------ */
+
+int utf8agemax(struct tree *, const char *);
+int utf8nagemax(struct tree *, const char *, size_t);
+int utf8agemin(struct tree *, const char *);
+int utf8nagemin(struct tree *, const char *, size_t);
+ssize_t utf8len(struct tree *, const char *);
+ssize_t utf8nlen(struct tree *, const char *, size_t);
+struct utf8cursor;
+int utf8cursor(struct utf8cursor *, struct tree *, const char *);
+int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t);
+int utf8byte(struct utf8cursor *);
+
+/*
+ * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
+ *
+ * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
+ * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
+ *
+ * SBase = 0xAC00
+ * LBase = 0x1100
+ * VBase = 0x1161
+ * TBase = 0x11A7
+ * LCount = 19
+ * VCount = 21
+ * TCount = 28
+ * NCount = 588 (VCount * TCount)
+ * SCount = 11172 (LCount * NCount)
+ *
+ * Decomposition:
+ * SIndex = s - SBase
+ *
+ * LV (Canonical/Full)
+ * LIndex = SIndex / NCount
+ * VIndex = (Sindex % NCount) / TCount
+ * LPart = LBase + LIndex
+ * VPart = VBase + VIndex
+ *
+ * LVT (Canonical)
+ * LVIndex = (SIndex / TCount) * TCount
+ * TIndex = (Sindex % TCount)
+ * LVPart = SBase + LVIndex
+ * TPart = TBase + TIndex
+ *
+ * LVT (Full)
+ * LIndex = SIndex / NCount
+ * VIndex = (Sindex % NCount) / TCount
+ * TIndex = (Sindex % TCount)
+ * LPart = LBase + LIndex
+ * VPart = VBase + VIndex
+ * if (TIndex == 0) {
+ * d = <LPart, VPart>
+ * } else {
+ * TPart = TBase + TIndex
+ * d = <LPart, VPart, TPart>
+ * }
+ */
+
+/* Constants */
+#define SB (0xAC00)
+#define LB (0x1100)
+#define VB (0x1161)
+#define TB (0x11A7)
+#define LC (19)
+#define VC (21)
+#define TC (28)
+#define NC (VC * TC)
+#define SC (LC * NC)
+
+/* Algorithmic decomposition of hangul syllable. */
+static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
+{
+ unsigned int si;
+ unsigned int li;
+ unsigned int vi;
+ unsigned int ti;
+ unsigned char *h;
+
+ /* Calculate the SI, LI, VI, and TI values. */
+ si = utf8decode(str) - SB;
+ li = si / NC;
+ vi = (si % NC) / TC;
+ ti = si % TC;
+
+ /* Fill in base of leaf. */
+ h = hangul;
+ LEAF_GEN(h) = 2;
+ LEAF_CCC(h) = DECOMPOSE;
+ h += 2;
+
+ /* Add LPart, a 3-byte UTF-8 sequence. */
+ h += utf8encode((char *)h, li + LB);
+
+ /* Add VPart, a 3-byte UTF-8 sequence. */
+ h += utf8encode((char *)h, vi + VB);
+
+ /* Add TPart if required, also a 3-byte UTF-8 sequence. */
+ if (ti)
+ h += utf8encode((char *)h, ti + TB);
+
+ /* Terminate string. */
+ h[0] = '\0';
+
+ return hangul;
+}
+
+/*
+ * Use trie to scan s, touching at most len bytes.
+ * Returns the leaf if one exists, NULL otherwise.
+ *
+ * A non-NULL return guarantees that the UTF-8 sequence starting at s
+ * is well-formed and corresponds to a known unicode code point. The
+ * shorthand for this will be "is valid UTF-8 unicode".
+ */
+static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul,
+ const char *s, size_t len)
+{
+ utf8trie_t *trie = utf8data + tree->index;
+ int offlen;
+ int offset;
+ int mask;
+ int node;
+
+ if (!tree)
+ return NULL;
+ if (len == 0)
+ return NULL;
+ node = 1;
+ while (node) {
+ offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
+ if (*trie & NEXTBYTE) {
+ if (--len == 0)
+ return NULL;
+ s++;
+ }
+ mask = 1 << (*trie & BITNUM);
+ if (*s & mask) {
+ /* Right leg */
+ if (offlen) {
+ /* Right node at offset of trie */
+ node = (*trie & RIGHTNODE);
+ offset = trie[offlen];
+ while (--offlen) {
+ offset <<= 8;
+ offset |= trie[offlen];
+ }
+ trie += offset;
+ } else if (*trie & RIGHTPATH) {
+ /* Right node after this node */
+ node = (*trie & TRIENODE);
+ trie++;
+ } else {
+ /* No right node. */
+ return NULL;
+ }
+ } else {
+ /* Left leg */
+ if (offlen) {
+ /* Left node after this node. */
+ node = (*trie & LEFTNODE);
+ trie += offlen + 1;
+ } else if (*trie & RIGHTPATH) {
+ /* No left node. */
+ return NULL;
+ } else {
+ /* Left node after this node */
+ node = (*trie & TRIENODE);
+ trie++;
+ }
+ }
+ }
+ /*
+ * Hangul decomposition is done algorithmically. These are the
+ * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
+ * always 3 bytes long, so s has been advanced twice, and the
+ * start of the sequence is at s-2.
+ */
+ if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
+ trie = utf8hangul(s - 2, hangul);
+ return trie;
+}
+
+/*
+ * Use trie to scan s.
+ * Returns the leaf if one exists, NULL otherwise.
+ *
+ * Forwards to trie_nlookup().
+ */
+static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul,
+ const char *s)
+{
+ return utf8nlookup(tree, hangul, s, (size_t)-1);
+}
+
+/*
+ * Return the number of bytes used by the current UTF-8 sequence.
+ * Assumes the input points to the first byte of a valid UTF-8
+ * sequence.
+ */
+static inline int utf8clen(const char *s)
+{
+ unsigned char c = *s;
+ return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
+}
+
+/*
+ * Maximum age of any character in s.
+ * Return -1 if s is not valid UTF-8 unicode.
+ * Return 0 if only non-assigned code points are used.
+ */
+int utf8agemax(struct tree *tree, const char *s)
+{
+ utf8leaf_t *leaf;
+ int age = 0;
+ int leaf_age;
+ unsigned char hangul[UTF8HANGULLEAF];
+
+ if (!tree)
+ return -1;
+
+ while (*s) {
+ leaf = utf8lookup(tree, hangul, s);
+ if (!leaf)
+ return -1;
+ leaf_age = ages[LEAF_GEN(leaf)];
+ if (leaf_age <= tree->maxage && leaf_age > age)
+ age = leaf_age;
+ s += utf8clen(s);
+ }
+ return age;
+}
+
+/*
+ * Minimum age of any character in s.
+ * Return -1 if s is not valid UTF-8 unicode.
+ * Return 0 if non-assigned code points are used.
+ */
+int utf8agemin(struct tree *tree, const char *s)
+{
+ utf8leaf_t *leaf;
+ int age;
+ int leaf_age;
+ unsigned char hangul[UTF8HANGULLEAF];
+
+ if (!tree)
+ return -1;
+ age = tree->maxage;
+ while (*s) {
+ leaf = utf8lookup(tree, hangul, s);
+ if (!leaf)
+ return -1;
+ leaf_age = ages[LEAF_GEN(leaf)];
+ if (leaf_age <= tree->maxage && leaf_age < age)
+ age = leaf_age;
+ s += utf8clen(s);
+ }
+ return age;
+}
+
+/*
+ * Maximum age of any character in s, touch at most len bytes.
+ * Return -1 if s is not valid UTF-8 unicode.
+ */
+int utf8nagemax(struct tree *tree, const char *s, size_t len)
+{
+ utf8leaf_t *leaf;
+ int age = 0;
+ int leaf_age;
+ unsigned char hangul[UTF8HANGULLEAF];
+
+ if (!tree)
+ return -1;
+
+ while (len && *s) {
+ leaf = utf8nlookup(tree, hangul, s, len);
+ if (!leaf)
+ return -1;
+ leaf_age = ages[LEAF_GEN(leaf)];
+ if (leaf_age <= tree->maxage && leaf_age > age)
+ age = leaf_age;
+ len -= utf8clen(s);
+ s += utf8clen(s);
+ }
+ return age;
+}
+
+/*
+ * Maximum age of any character in s, touch at most len bytes.
+ * Return -1 if s is not valid UTF-8 unicode.
+ */
+int utf8nagemin(struct tree *tree, const char *s, size_t len)
+{
+ utf8leaf_t *leaf;
+ int leaf_age;
+ int age;
+ unsigned char hangul[UTF8HANGULLEAF];
+
+ if (!tree)
+ return -1;
+ age = tree->maxage;
+ while (len && *s) {
+ leaf = utf8nlookup(tree, hangul, s, len);
+ if (!leaf)
+ return -1;
+ leaf_age = ages[LEAF_GEN(leaf)];
+ if (leaf_age <= tree->maxage && leaf_age < age)
+ age = leaf_age;
+ len -= utf8clen(s);
+ s += utf8clen(s);
+ }
+ return age;
+}
+
+/*
+ * Length of the normalization of s.
+ * Return -1 if s is not valid UTF-8 unicode.
+ *
+ * A string of Default_Ignorable_Code_Point has length 0.
+ */
+ssize_t utf8len(struct tree *tree, const char *s)
+{
+ utf8leaf_t *leaf;
+ size_t ret = 0;
+ unsigned char hangul[UTF8HANGULLEAF];
+
+ if (!tree)
+ return -1;
+ while (*s) {
+ leaf = utf8lookup(tree, hangul, s);
+ if (!leaf)
+ return -1;
+ if (ages[LEAF_GEN(leaf)] > tree->maxage)
+ ret += utf8clen(s);
+ else if (LEAF_CCC(leaf) == DECOMPOSE)
+ ret += strlen(LEAF_STR(leaf));
+ else
+ ret += utf8clen(s);
+ s += utf8clen(s);
+ }
+ return ret;
+}
+
+/*
+ * Length of the normalization of s, touch at most len bytes.
+ * Return -1 if s is not valid UTF-8 unicode.
+ */
+ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
+{
+ utf8leaf_t *leaf;
+ size_t ret = 0;
+ unsigned char hangul[UTF8HANGULLEAF];
+
+ if (!tree)
+ return -1;
+ while (len && *s) {
+ leaf = utf8nlookup(tree, hangul, s, len);
+ if (!leaf)
+ return -1;
+ if (ages[LEAF_GEN(leaf)] > tree->maxage)
+ ret += utf8clen(s);
+ else if (LEAF_CCC(leaf) == DECOMPOSE)
+ ret += strlen(LEAF_STR(leaf));
+ else
+ ret += utf8clen(s);
+ len -= utf8clen(s);
+ s += utf8clen(s);
+ }
+ return ret;
+}
+
+/*
+ * Cursor structure used by the normalizer.
+ */
+struct utf8cursor {
+ struct tree *tree;
+ const char *s;
+ const char *p;
+ const char *ss;
+ const char *sp;
+ unsigned int len;
+ unsigned int slen;
+ short int ccc;
+ short int nccc;
+ unsigned int unichar;
+ unsigned char hangul[UTF8HANGULLEAF];
+};
+
+/*
+ * Set up an utf8cursor for use by utf8byte().
+ *
+ * s : string.
+ * len : length of s.
+ * u8c : pointer to cursor.
+ * trie : utf8trie_t to use for normalization.
+ *
+ * Returns -1 on error, 0 on success.
+ */
+int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
+ size_t len)
+{
+ if (!tree)
+ return -1;
+ if (!s)
+ return -1;
+ u8c->tree = tree;
+ u8c->s = s;
+ u8c->p = NULL;
+ u8c->ss = NULL;
+ u8c->sp = NULL;
+ u8c->len = len;
+ u8c->slen = 0;
+ u8c->ccc = STOPPER;
+ u8c->nccc = STOPPER;
+ u8c->unichar = 0;
+ /* Check we didn't clobber the maximum length. */
+ if (u8c->len != len)
+ return -1;
+ /* The first byte of s may not be an utf8 continuation. */
+ if (len > 0 && (*s & 0xC0) == 0x80)
+ return -1;
+ return 0;
+}
+
+/*
+ * Set up an utf8cursor for use by utf8byte().
+ *
+ * s : NUL-terminated string.
+ * u8c : pointer to cursor.
+ * trie : utf8trie_t to use for normalization.
+ *
+ * Returns -1 on error, 0 on success.
+ */
+int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
+{
+ return utf8ncursor(u8c, tree, s, (unsigned int)-1);
+}
+
+/*
+ * Get one byte from the normalized form of the string described by u8c.
+ *
+ * Returns the byte cast to an unsigned char on succes, and -1 on failure.
+ *
+ * The cursor keeps track of the location in the string in u8c->s.
+ * When a character is decomposed, the current location is stored in
+ * u8c->p, and u8c->s is set to the start of the decomposition. Note
+ * that bytes from a decomposition do not count against u8c->len.
+ *
+ * Characters are emitted if they match the current CCC in u8c->ccc.
+ * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
+ * and the function returns 0 in that case.
+ *
+ * Sorting by CCC is done by repeatedly scanning the string. The
+ * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
+ * the start of the scan. The first pass finds the lowest CCC to be
+ * emitted and stores it in u8c->nccc, the second pass emits the
+ * characters with this CCC and finds the next lowest CCC. This limits
+ * the number of passes to 1 + the number of different CCCs in the
+ * sequence being scanned.
+ *
+ * Therefore:
+ * u8c->p != NULL -> a decomposition is being scanned.
+ * u8c->ss != NULL -> this is a repeating scan.
+ * u8c->ccc == -1 -> this is the first scan of a repeating scan.
+ */
+int utf8byte(struct utf8cursor *u8c)
+{
+ utf8leaf_t *leaf;
+ int ccc;
+
+ for (;;) {
+ /* Check for the end of a decomposed character. */
+ if (u8c->p && *u8c->s == '\0') {
+ u8c->s = u8c->p;
+ u8c->p = NULL;
+ }
+
+ /* Check for end-of-string. */
+ if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
+ /* There is no next byte. */
+ if (u8c->ccc == STOPPER)
+ return 0;
+ /* End-of-string during a scan counts as a stopper. */
+ ccc = STOPPER;
+ goto ccc_mismatch;
+ } else if ((*u8c->s & 0xC0) == 0x80) {
+ /* This is a continuation of the current character. */
+ if (!u8c->p)
+ u8c->len--;
+ return (unsigned char)*u8c->s++;
+ }
+
+ /* Look up the data for the current character. */
+ if (u8c->p) {
+ leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
+ } else {
+ leaf = utf8nlookup(u8c->tree, u8c->hangul,
+ u8c->s, u8c->len);
+ }
+
+ /* No leaf found implies that the input is a binary blob. */
+ if (!leaf)
+ return -1;
+
+ /* Characters that are too new have CCC 0. */
+ if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
+ ccc = STOPPER;
+ } else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
+ u8c->len -= utf8clen(u8c->s);
+ u8c->p = u8c->s + utf8clen(u8c->s);
+ u8c->s = LEAF_STR(leaf);
+ /* Empty decomposition implies CCC 0. */
+ if (*u8c->s == '\0') {
+ if (u8c->ccc == STOPPER)
+ continue;
+ ccc = STOPPER;
+ goto ccc_mismatch;
+ }
+ leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
+ ccc = LEAF_CCC(leaf);
+ }
+ u8c->unichar = utf8decode(u8c->s);
+
+ /*
+ * If this is not a stopper, then see if it updates
+ * the next canonical class to be emitted.
+ */
+ if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
+ u8c->nccc = ccc;
+
+ /*
+ * Return the current byte if this is the current
+ * combining class.
+ */
+ if (ccc == u8c->ccc) {
+ if (!u8c->p)
+ u8c->len--;
+ return (unsigned char)*u8c->s++;
+ }
+
+ /* Current combining class mismatch. */
+ ccc_mismatch:
+ if (u8c->nccc == STOPPER) {
+ /*
+ * Scan forward for the first canonical class
+ * to be emitted. Save the position from
+ * which to restart.
+ */
+ assert(u8c->ccc == STOPPER);
+ u8c->ccc = MINCCC - 1;
+ u8c->nccc = ccc;
+ u8c->sp = u8c->p;
+ u8c->ss = u8c->s;
+ u8c->slen = u8c->len;
+ if (!u8c->p)
+ u8c->len -= utf8clen(u8c->s);
+ u8c->s += utf8clen(u8c->s);
+ } else if (ccc != STOPPER) {
+ /* Not a stopper, and not the ccc we're emitting. */
+ if (!u8c->p)
+ u8c->len -= utf8clen(u8c->s);
+ u8c->s += utf8clen(u8c->s);
+ } else if (u8c->nccc != MAXCCC + 1) {
+ /* At a stopper, restart for next ccc. */
+ u8c->ccc = u8c->nccc;
+ u8c->nccc = MAXCCC + 1;
+ u8c->s = u8c->ss;
+ u8c->p = u8c->sp;
+ u8c->len = u8c->slen;
+ } else {
+ /* All done, proceed from here. */
+ u8c->ccc = STOPPER;
+ u8c->nccc = STOPPER;
+ u8c->sp = NULL;
+ u8c->ss = NULL;
+ u8c->slen = 0;
+ }
+ }
+}
+
+/* ------------------------------------------------------------------ */
+
+static int normalize_line(struct tree *tree)
+{
+ char *s;
+ char *t;
+ int c;
+ struct utf8cursor u8c;
+
+ /* First test: null-terminated string. */
+ s = buf2;
+ t = buf3;
+ if (utf8cursor(&u8c, tree, s))
+ return -1;
+ while ((c = utf8byte(&u8c)) > 0)
+ if (c != (unsigned char)*t++)
+ return -1;
+ if (c < 0)
+ return -1;
+ if (*t != 0)
+ return -1;
+
+ /* Second test: length-limited string. */
+ s = buf2;
+ /* Replace NUL with a value that will cause an error if seen. */
+ s[strlen(s) + 1] = -1;
+ t = buf3;
+ if (utf8cursor(&u8c, tree, s))
+ return -1;
+ while ((c = utf8byte(&u8c)) > 0)
+ if (c != (unsigned char)*t++)
+ return -1;
+ if (c < 0)
+ return -1;
+ if (*t != 0)
+ return -1;
+
+ return 0;
+}
+
+static void normalization_test(void)
+{
+ FILE *file;
+ unsigned int unichar;
+ struct unicode_data *data;
+ char *s;
+ char *t;
+ int ret;
+ int ignorables;
+ int tests = 0;
+ int failures = 0;
+
+ if (verbose > 0)
+ printf("Parsing %s\n", test_name);
+ /* Step one, read data from file. */
+ file = fopen(test_name, "r");
+ if (!file)
+ open_fail(test_name, errno);
+
+ while (fgets(line, LINESIZE, file)) {
+ ret = sscanf(line, "%[^;];%*[^;];%*[^;];%*[^;];%[^;];",
+ buf0, buf1);
+ if (ret != 2 || *line == '#')
+ continue;
+ s = buf0;
+ t = buf2;
+ while (*s) {
+ unichar = strtoul(s, &s, 16);
+ t += utf8encode(t, unichar);
+ }
+ *t = '\0';
+
+ ignorables = 0;
+ s = buf1;
+ t = buf3;
+ while (*s) {
+ unichar = strtoul(s, &s, 16);
+ data = &unicode_data[unichar];
+ if (data->utf8nfkdi && !*data->utf8nfkdi)
+ ignorables = 1;
+ else
+ t += utf8encode(t, unichar);
+ }
+ *t = '\0';
+
+ tests++;
+ if (normalize_line(nfkdi_tree) < 0) {
+ printf("Line %s -> %s", buf0, buf1);
+ if (ignorables)
+ printf(" (ignorables removed)");
+ printf(" failure\n");
+ failures++;
+ }
+ }
+ fclose(file);
+ if (verbose > 0)
+ printf("Ran %d tests with %d failures\n", tests, failures);
+ if (failures)
+ file_fail(test_name);
+}
+
+/* ------------------------------------------------------------------ */
+
+static void write_file(void)
+{
+ FILE *file;
+ int i;
+ int j;
+ int t;
+ int gen;
+
+ if (verbose > 0)
+ printf("Writing %s\n", utf8_name);
+ file = fopen(utf8_name, "w");
+ if (!file)
+ open_fail(utf8_name, errno);
+
+ fprintf(file, "/* This file is generated code, do not edit. */\n");
+ fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
+ fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n");
+ fprintf(file, "#endif\n");
+ fprintf(file, "\n");
+ fprintf(file, "static const unsigned int utf8vers = %#x;\n",
+ unicode_maxage);
+ fprintf(file, "\n");
+ fprintf(file, "static const unsigned int utf8agetab[] = {\n");
+ for (i = 0; i != ages_count; i++)
+ fprintf(file, "\t%#x%s\n", ages[i],
+ ages[i] == unicode_maxage ? "" : ",");
+ fprintf(file, "};\n");
+ fprintf(file, "\n");
+ fprintf(file, "static const struct utf8data utf8nfkdicfdata[] = {\n");
+ t = 0;
+ for (gen = 0; gen < ages_count; gen++) {
+ fprintf(file, "\t{ %#x, %d }%s\n",
+ ages[gen], trees[t].index,
+ ages[gen] == unicode_maxage ? "" : ",");
+ if (trees[t].maxage == ages[gen])
+ t += 2;
+ }
+ fprintf(file, "};\n");
+ fprintf(file, "\n");
+ fprintf(file, "static const struct utf8data utf8nfkdidata[] = {\n");
+ t = 1;
+ for (gen = 0; gen < ages_count; gen++) {
+ fprintf(file, "\t{ %#x, %d }%s\n",
+ ages[gen], trees[t].index,
+ ages[gen] == unicode_maxage ? "" : ",");
+ if (trees[t].maxage == ages[gen])
+ t += 2;
+ }
+ fprintf(file, "};\n");
+ fprintf(file, "\n");
+ fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
+ utf8data_size);
+ t = 0;
+ for (i = 0; i != utf8data_size; i += 16) {
+ if (i == trees[t].index) {
+ fprintf(file, "\t/* %s_%x */\n",
+ trees[t].type, trees[t].maxage);
+ if (t < trees_count-1)
+ t++;
+ }
+ fprintf(file, "\t");
+ for (j = i; j != i + 16; j++)
+ fprintf(file, "0x%.2x%s", utf8data[j],
+ (j < utf8data_size -1 ? "," : ""));
+ fprintf(file, "\n");
+ }
+ fprintf(file, "};\n");
+ fclose(file);
+}
+
+/* ------------------------------------------------------------------ */
+
+int main(int argc, char *argv[])
+{
+ unsigned int unichar;
+ int opt;
+
+ argv0 = argv[0];
+
+ while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
+ switch (opt) {
+ case 'a':
+ age_name = optarg;
+ break;
+ case 'c':
+ ccc_name = optarg;
+ break;
+ case 'd':
+ data_name = optarg;
+ break;
+ case 'f':
+ fold_name = optarg;
+ break;
+ case 'n':
+ norm_name = optarg;
+ break;
+ case 'o':
+ utf8_name = optarg;
+ break;
+ case 'p':
+ prop_name = optarg;
+ break;
+ case 't':
+ test_name = optarg;
+ break;
+ case 'v':
+ verbose++;
+ break;
+ case 'h':
+ help();
+ exit(0);
+ default:
+ usage();
+ }
+ }
+
+ if (verbose > 1)
+ help();
+ for (unichar = 0; unichar != 0x110000; unichar++)
+ unicode_data[unichar].code = unichar;
+ age_init();
+ ccc_init();
+ nfkdi_init();
+ nfkdicf_init();
+ ignore_init();
+ corrections_init();
+ hangul_decompose();
+ nfkdi_decompose();
+ nfkdicf_decompose();
+ utf8_init();
+ trees_init();
+ trees_populate();
+ trees_reduce();
+ trees_verify();
+ /* Prevent "unused function" warning. */
+ (void)lookup(nfkdi_tree, " ");
+ if (verbose > 2)
+ tree_walk(nfkdi_tree);
+ if (verbose > 2)
+ tree_walk(nfkdicf_tree);
+ normalization_test();
+ write_file();
+
+ return 0;
+}
diff --git a/util/ucd/README b/util/ucd/README
new file mode 100644
index 000000000000..9fed514084b7
--- /dev/null
+++ b/util/ucd/README
@@ -0,0 +1,37 @@
+The files in this directory are part of the Unicode Character Database
+for version 11.0.0 of the Unicode standard.
+
+The full set of UCD files are not distributed with e2fsprogs, since they
+are very large and only needed to regenerate the lib/ext2fs/utf8data.h
+during an Unicode version update. They can be found in the link below
+and also in the Linux kernel source tree:
+
+ http://www.unicode.org/Public/11.0.0/ucd/
+
+The latest released version of the UCD can be found here:
+
+ http://www.unicode.org/Public/UCD/latest/
+
+The files in this directory are identical, except that they have been
+renamed with a suffix indicating the unicode version.
+
+Individual source links:
+
+ http://www.unicode.org/Public/11.0.0/ucd/CaseFolding.txt
+ http://www.unicode.org/Public/11.0.0/ucd/DerivedAge.txt
+ http://www.unicode.org/Public/11.0.0/ucd/extracted/DerivedCombiningClass.txt
+ http://www.unicode.org/Public/11.0.0/ucd/DerivedCoreProperties.txt
+ http://www.unicode.org/Public/11.0.0/ucd/NormalizationCorrections.txt
+ http://www.unicode.org/Public/11.0.0/ucd/NormalizationTest.txt
+ http://www.unicode.org/Public/11.0.0/ucd/UnicodeData.txt
+
+md5sums
+
+ 414436796cf097df55f798e1585448ee CaseFolding-11.0.0.txt
+ 6032a595fbb782694456491d86eecfac DerivedAge-11.0.0.txt
+ 3240997d671297ac754ab0d27577acf7 DerivedCombiningClass-11.0.0.txt
+ d41d8cd98f00b204e9800998ecf8427e DerivedCombiningClass.txt
+ 2a4fe257d9d8184518e036194d2248ec DerivedCoreProperties-11.0.0.txt
+ 4e7d383fa0dd3cd9d49d64e5b7b7c9e0 NormalizationCorrections-11.0.0.txt
+ c9500c5b8b88e584469f056023ecc3f2 NormalizationTest-11.0.0.txt
+ acc291106c3758d2025f8d7bd5518bee UnicodeData-11.0.0.txt
--
2.20.0.rc2

2018-12-09 20:55:53

by Gabriel Krisman Bertazi

[permalink] [raw]

Subject: Re: [PATCH] e2fsprogs: Introduce script used to generate utf8data.h

"Theodore Y. Ts'o" <[email protected]> writes:

> On Mon, Dec 03, 2018 at 06:04:28PM -0500, Gabriel Krisman Bertazi wrote:
>> The mkutf8data program was written by SGI for the kernel. We integrate
>> it here, but it should be kept in sync with the kernel code, when
>> applicable. The header generated by this script is also included in
>> this repository, such that we don't have to carry the UCD files too.
>> They are only necessary when the header has to be recreated (for
>> instance, to update the Unicode version supported)
>>
>> The mkutf8data is not a dependency of utf8data.h because we don't want
>> the utf8data.h rule to be called after a make clean that would remove
>> mkutf8data. It is simpler to call it from inside the rule, and make
>> sure we only trigger the utf8data.h rule if we really need to,
>> particularly if we decide not to integrate the ucd files in the tree.
>>
>> The UCD files are not distributed along with this commit, but we add
>> information on how to fetch them.
>>
>> Signed-off-by: Gabriel Krisman Bertazi <[email protected]>
>
> Thanks, applied, with one minor fixup:
>
>> diff --git a/util/ucd/README b/util/ucd/README
>> new file mode 100644
>> index 000000000000..9fed514084b7
>> --- /dev/null
>> +++ b/util/ucd/README
> ...
>> +md5sums
>> +
>> + 414436796cf097df55f798e1585448ee CaseFolding-11.0.0.txt
>> + 6032a595fbb782694456491d86eecfac DerivedAge-11.0.0.txt
>> + 3240997d671297ac754ab0d27577acf7 DerivedCombiningClass-11.0.0.txt
>> + d41d8cd98f00b204e9800998ecf8427e DerivedCombiningClass.txt
> ^^^^^^^^^^^^^^^^^^^^^^^^^
>
> This looks like a leftover, so I removed it.

Indeed.

Thanks for applying.

--
Gabriel Krisman Bertazi

2018-12-08 18:13:35

by Theodore Ts'o

[permalink] [raw]

Subject: Re: [PATCH] e2fsprogs: Introduce script used to generate utf8data.h

On Mon, Dec 03, 2018 at 06:04:28PM -0500, Gabriel Krisman Bertazi wrote:
> The mkutf8data program was written by SGI for the kernel. We integrate
> it here, but it should be kept in sync with the kernel code, when
> applicable. The header generated by this script is also included in
> this repository, such that we don't have to carry the UCD files too.
> They are only necessary when the header has to be recreated (for
> instance, to update the Unicode version supported)
>
> The mkutf8data is not a dependency of utf8data.h because we don't want
> the utf8data.h rule to be called after a make clean that would remove
> mkutf8data. It is simpler to call it from inside the rule, and make
> sure we only trigger the utf8data.h rule if we really need to,
> particularly if we decide not to integrate the ucd files in the tree.
>
> The UCD files are not distributed along with this commit, but we add
> information on how to fetch them.
>
> Signed-off-by: Gabriel Krisman Bertazi <[email protected]>

Thanks, applied, with one minor fixup:

> diff --git a/util/ucd/README b/util/ucd/README
> new file mode 100644
> index 000000000000..9fed514084b7
> --- /dev/null
> +++ b/util/ucd/README
...
> +md5sums
> +
> + 414436796cf097df55f798e1585448ee CaseFolding-11.0.0.txt
> + 6032a595fbb782694456491d86eecfac DerivedAge-11.0.0.txt
> + 3240997d671297ac754ab0d27577acf7 DerivedCombiningClass-11.0.0.txt
> + d41d8cd98f00b204e9800998ecf8427e DerivedCombiningClass.txt
^^^^^^^^^^^^^^^^^^^^^^^^^

This looks like a leftover, so I removed it.

- Ted