linux/fs/unicode/mkutf8data.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2014 SGI.
   3 * All rights reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public License as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it would be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write the Free Software Foundation,
  16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17 */
  18
  19/* Generator for a compact trie for unicode normalization */
  20
  21#include <sys/types.h>
  22#include <stddef.h>
  23#include <stdlib.h>
  24#include <stdio.h>
  25#include <assert.h>
  26#include <string.h>
  27#include <unistd.h>
  28#include <errno.h>
  29
  30/* Default names of the in- and output files. */
  31
  32#define AGE_NAME        "DerivedAge.txt"
  33#define CCC_NAME        "DerivedCombiningClass.txt"
  34#define PROP_NAME       "DerivedCoreProperties.txt"
  35#define DATA_NAME       "UnicodeData.txt"
  36#define FOLD_NAME       "CaseFolding.txt"
  37#define NORM_NAME       "NormalizationCorrections.txt"
  38#define TEST_NAME       "NormalizationTest.txt"
  39#define UTF8_NAME       "utf8data.h"
  40
  41const char      *age_name  = AGE_NAME;
  42const char      *ccc_name  = CCC_NAME;
  43const char      *prop_name = PROP_NAME;
  44const char      *data_name = DATA_NAME;
  45const char      *fold_name = FOLD_NAME;
  46const char      *norm_name = NORM_NAME;
  47const char      *test_name = TEST_NAME;
  48const char      *utf8_name = UTF8_NAME;
  49
  50int verbose = 0;
  51
  52/* An arbitrary line size limit on input lines. */
  53
  54#define LINESIZE        1024
  55char line[LINESIZE];
  56char buf0[LINESIZE];
  57char buf1[LINESIZE];
  58char buf2[LINESIZE];
  59char buf3[LINESIZE];
  60
  61const char *argv0;
  62
  63#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
  64
  65/* ------------------------------------------------------------------ */
  66
  67/*
  68 * Unicode version numbers consist of three parts: major, minor, and a
  69 * revision.  These numbers are packed into an unsigned int to obtain
  70 * a single version number.
  71 *
  72 * To save space in the generated trie, the unicode version is not
  73 * stored directly, instead we calculate a generation number from the
  74 * unicode versions seen in the DerivedAge file, and use that as an
  75 * index into a table of unicode versions.
  76 */
  77#define UNICODE_MAJ_SHIFT               (16)
  78#define UNICODE_MIN_SHIFT               (8)
  79
  80#define UNICODE_MAJ_MAX                 ((unsigned short)-1)
  81#define UNICODE_MIN_MAX                 ((unsigned char)-1)
  82#define UNICODE_REV_MAX                 ((unsigned char)-1)
  83
  84#define UNICODE_AGE(MAJ,MIN,REV)                        \
  85        (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |   \
  86         ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |   \
  87         ((unsigned int)(REV)))
  88
  89unsigned int *ages;
  90int ages_count;
  91
  92unsigned int unicode_maxage;
  93
  94static int age_valid(unsigned int major, unsigned int minor,
  95                     unsigned int revision)
  96{
  97        if (major > UNICODE_MAJ_MAX)
  98                return 0;
  99        if (minor > UNICODE_MIN_MAX)
 100                return 0;
 101        if (revision > UNICODE_REV_MAX)
 102                return 0;
 103        return 1;
 104}
 105
 106/* ------------------------------------------------------------------ */
 107
 108/*
 109 * utf8trie_t
 110 *
 111 * A compact binary tree, used to decode UTF-8 characters.
 112 *
 113 * Internal nodes are one byte for the node itself, and up to three
 114 * bytes for an offset into the tree.  The first byte contains the
 115 * following information:
 116 *  NEXTBYTE  - flag        - advance to next byte if set
 117 *  BITNUM    - 3 bit field - the bit number to tested
 118 *  OFFLEN    - 2 bit field - number of bytes in the offset
 119 * if offlen == 0 (non-branching node)
 120 *  RIGHTPATH - 1 bit field - set if the following node is for the
 121 *                            right-hand path (tested bit is set)
 122 *  TRIENODE  - 1 bit field - set if the following node is an internal
 123 *                            node, otherwise it is a leaf node
 124 * if offlen != 0 (branching node)
 125 *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
 126 *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
 127 *
 128 * Due to the way utf8 works, there cannot be branching nodes with
 129 * NEXTBYTE set, and moreover those nodes always have a righthand
 130 * descendant.
 131 */
 132typedef unsigned char utf8trie_t;
 133#define BITNUM          0x07
 134#define NEXTBYTE        0x08
 135#define OFFLEN          0x30
 136#define OFFLEN_SHIFT    4
 137#define RIGHTPATH       0x40
 138#define TRIENODE        0x80
 139#define RIGHTNODE       0x40
 140#define LEFTNODE        0x80
 141
 142/*
 143 * utf8leaf_t
 144 *
 145 * The leaves of the trie are embedded in the trie, and so the same
 146 * underlying datatype, unsigned char.
 147 *
 148 * leaf[0]: The unicode version, stored as a generation number that is
 149 *          an index into utf8agetab[].  With this we can filter code
 150 *          points based on the unicode version in which they were
 151 *          defined.  The CCC of a non-defined code point is 0.
 152 * leaf[1]: Canonical Combining Class. During normalization, we need
 153 *          to do a stable sort into ascending order of all characters
 154 *          with a non-zero CCC that occur between two characters with
 155 *          a CCC of 0, or at the begin or end of a string.
 156 *          The unicode standard guarantees that all CCC values are
 157 *          between 0 and 254 inclusive, which leaves 255 available as
 158 *          a special value.
 159 *          Code points with CCC 0 are known as stoppers.
 160 * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
 161 *          start of a NUL-terminated string that is the decomposition
 162 *          of the character.
 163 *          The CCC of a decomposable character is the same as the CCC
 164 *          of the first character of its decomposition.
 165 *          Some characters decompose as the empty string: these are
 166 *          characters with the Default_Ignorable_Code_Point property.
 167 *          These do affect normalization, as they all have CCC 0.
 168 *
 169 * The decompositions in the trie have been fully expanded.
 170 *
 171 * Casefolding, if applicable, is also done using decompositions.
 172 */
 173typedef unsigned char utf8leaf_t;
 174
 175#define LEAF_GEN(LEAF)  ((LEAF)[0])
 176#define LEAF_CCC(LEAF)  ((LEAF)[1])
 177#define LEAF_STR(LEAF)  ((const char*)((LEAF) + 2))
 178
 179#define MAXGEN          (255)
 180
 181#define MINCCC          (0)
 182#define MAXCCC          (254)
 183#define STOPPER         (0)
 184#define DECOMPOSE       (255)
 185#define HANGUL          ((char)(255))
 186
 187#define UTF8HANGULLEAF  (12)
 188
 189struct tree;
 190static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
 191                               const char *, size_t);
 192static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
 193
 194unsigned char *utf8data;
 195size_t utf8data_size;
 196
 197utf8trie_t *nfdi;
 198utf8trie_t *nfdicf;
 199
 200/* ------------------------------------------------------------------ */
 201
 202/*
 203 * UTF8 valid ranges.
 204 *
 205 * The UTF-8 encoding spreads the bits of a 32bit word over several
 206 * bytes. This table gives the ranges that can be held and how they'd
 207 * be represented.
 208 *
 209 * 0x00000000 0x0000007F: 0xxxxxxx
 210 * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
 211 * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 212 * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 213 * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 214 * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 215 *
 216 * There is an additional requirement on UTF-8, in that only the
 217 * shortest representation of a 32bit value is to be used.  A decoder
 218 * must not decode sequences that do not satisfy this requirement.
 219 * Thus the allowed ranges have a lower bound.
 220 *
 221 * 0x00000000 0x0000007F: 0xxxxxxx
 222 * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
 223 * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 224 * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 225 * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 226 * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 227 *
 228 * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
 229 * 17 planes of 65536 values.  This limits the sequences actually seen
 230 * even more, to just the following.
 231 *
 232 *          0 -     0x7f: 0                     0x7f
 233 *       0x80 -    0x7ff: 0xc2 0x80             0xdf 0xbf
 234 *      0x800 -   0xffff: 0xe0 0xa0 0x80        0xef 0xbf 0xbf
 235 *    0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80   0xf4 0x8f 0xbf 0xbf
 236 *
 237 * Even within those ranges not all values are allowed: the surrogates
 238 * 0xd800 - 0xdfff should never be seen.
 239 *
 240 * Note that the longest sequence seen with valid usage is 4 bytes,
 241 * the same a single UTF-32 character.  This makes the UTF-8
 242 * representation of Unicode strictly smaller than UTF-32.
 243 *
 244 * The shortest sequence requirement was introduced by:
 245 *    Corrigendum #1: UTF-8 Shortest Form
 246 * It can be found here:
 247 *    http://www.unicode.org/versions/corrigendum1.html
 248 *
 249 */
 250
 251#define UTF8_2_BITS     0xC0
 252#define UTF8_3_BITS     0xE0
 253#define UTF8_4_BITS     0xF0
 254#define UTF8_N_BITS     0x80
 255#define UTF8_2_MASK     0xE0
 256#define UTF8_3_MASK     0xF0
 257#define UTF8_4_MASK     0xF8
 258#define UTF8_N_MASK     0xC0
 259#define UTF8_V_MASK     0x3F
 260#define UTF8_V_SHIFT    6
 261
 262static int utf8encode(char *str, unsigned int val)
 263{
 264        int len;
 265
 266        if (val < 0x80) {
 267                str[0] = val;
 268                len = 1;
 269        } else if (val < 0x800) {
 270                str[1] = val & UTF8_V_MASK;
 271                str[1] |= UTF8_N_BITS;
 272                val >>= UTF8_V_SHIFT;
 273                str[0] = val;
 274                str[0] |= UTF8_2_BITS;
 275                len = 2;
 276        } else if (val < 0x10000) {
 277                str[2] = val & UTF8_V_MASK;
 278                str[2] |= UTF8_N_BITS;
 279                val >>= UTF8_V_SHIFT;
 280                str[1] = val & UTF8_V_MASK;
 281                str[1] |= UTF8_N_BITS;
 282                val >>= UTF8_V_SHIFT;
 283                str[0] = val;
 284                str[0] |= UTF8_3_BITS;
 285                len = 3;
 286        } else if (val < 0x110000) {
 287                str[3] = val & UTF8_V_MASK;
 288                str[3] |= UTF8_N_BITS;
 289                val >>= UTF8_V_SHIFT;
 290                str[2] = val & UTF8_V_MASK;
 291                str[2] |= UTF8_N_BITS;
 292                val >>= UTF8_V_SHIFT;
 293                str[1] = val & UTF8_V_MASK;
 294                str[1] |= UTF8_N_BITS;
 295                val >>= UTF8_V_SHIFT;
 296                str[0] = val;
 297                str[0] |= UTF8_4_BITS;
 298                len = 4;
 299        } else {
 300                printf("%#x: illegal val\n", val);
 301                len = 0;
 302        }
 303        return len;
 304}
 305
 306static unsigned int utf8decode(const char *str)
 307{
 308        const unsigned char *s = (const unsigned char*)str;
 309        unsigned int unichar = 0;
 310
 311        if (*s < 0x80) {
 312                unichar = *s;
 313        } else if (*s < UTF8_3_BITS) {
 314                unichar = *s++ & 0x1F;
 315                unichar <<= UTF8_V_SHIFT;
 316                unichar |= *s & 0x3F;
 317        } else if (*s < UTF8_4_BITS) {
 318                unichar = *s++ & 0x0F;
 319                unichar <<= UTF8_V_SHIFT;
 320                unichar |= *s++ & 0x3F;
 321                unichar <<= UTF8_V_SHIFT;
 322                unichar |= *s & 0x3F;
 323        } else {
 324                unichar = *s++ & 0x0F;
 325                unichar <<= UTF8_V_SHIFT;
 326                unichar |= *s++ & 0x3F;
 327                unichar <<= UTF8_V_SHIFT;
 328                unichar |= *s++ & 0x3F;
 329                unichar <<= UTF8_V_SHIFT;
 330                unichar |= *s & 0x3F;
 331        }
 332        return unichar;
 333}
 334
 335static int utf32valid(unsigned int unichar)
 336{
 337        return unichar < 0x110000;
 338}
 339
 340#define HANGUL_SYLLABLE(U)      ((U) >= 0xAC00 && (U) <= 0xD7A3)
 341
 342#define NODE 1
 343#define LEAF 0
 344
 345struct tree {
 346        void *root;
 347        int childnode;
 348        const char *type;
 349        unsigned int maxage;
 350        struct tree *next;
 351        int (*leaf_equal)(void *, void *);
 352        void (*leaf_print)(void *, int);
 353        int (*leaf_mark)(void *);
 354        int (*leaf_size)(void *);
 355        int *(*leaf_index)(struct tree *, void *);
 356        unsigned char *(*leaf_emit)(void *, unsigned char *);
 357        int leafindex[0x110000];
 358        int index;
 359};
 360
 361struct node {
 362        int index;
 363        int offset;
 364        int mark;
 365        int size;
 366        struct node *parent;
 367        void *left;
 368        void *right;
 369        unsigned char bitnum;
 370        unsigned char nextbyte;
 371        unsigned char leftnode;
 372        unsigned char rightnode;
 373        unsigned int keybits;
 374        unsigned int keymask;
 375};
 376
 377/*
 378 * Example lookup function for a tree.
 379 */
 380static void *lookup(struct tree *tree, const char *key)
 381{
 382        struct node *node;
 383        void *leaf = NULL;
 384
 385        node = tree->root;
 386        while (!leaf && node) {
 387                if (node->nextbyte)
 388                        key++;
 389                if (*key & (1 << (node->bitnum & 7))) {
 390                        /* Right leg */
 391                        if (node->rightnode == NODE) {
 392                                node = node->right;
 393                        } else if (node->rightnode == LEAF) {
 394                                leaf = node->right;
 395                        } else {
 396                                node = NULL;
 397                        }
 398                } else {
 399                        /* Left leg */
 400                        if (node->leftnode == NODE) {
 401                                node = node->left;
 402                        } else if (node->leftnode == LEAF) {
 403                                leaf = node->left;
 404                        } else {
 405                                node = NULL;
 406                        }
 407                }
 408        }
 409
 410        return leaf;
 411}
 412
 413/*
 414 * A simple non-recursive tree walker: keep track of visits to the
 415 * left and right branches in the leftmask and rightmask.
 416 */
 417static void tree_walk(struct tree *tree)
 418{
 419        struct node *node;
 420        unsigned int leftmask;
 421        unsigned int rightmask;
 422        unsigned int bitmask;
 423        int indent = 1;
 424        int nodes, singletons, leaves;
 425
 426        nodes = singletons = leaves = 0;
 427
 428        printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
 429        if (tree->childnode == LEAF) {
 430                assert(tree->root);
 431                tree->leaf_print(tree->root, indent);
 432                leaves = 1;
 433        } else {
 434                assert(tree->childnode == NODE);
 435                node = tree->root;
 436                leftmask = rightmask = 0;
 437                while (node) {
 438                        printf("%*snode @ %p bitnum %d nextbyte %d"
 439                               " left %p right %p mask %x bits %x\n",
 440                                indent, "", node,
 441                                node->bitnum, node->nextbyte,
 442                                node->left, node->right,
 443                                node->keymask, node->keybits);
 444                        nodes += 1;
 445                        if (!(node->left && node->right))
 446                                singletons += 1;
 447
 448                        while (node) {
 449                                bitmask = 1 << node->bitnum;
 450                                if ((leftmask & bitmask) == 0) {
 451                                        leftmask |= bitmask;
 452                                        if (node->leftnode == LEAF) {
 453                                                assert(node->left);
 454                                                tree->leaf_print(node->left,
 455                                                                 indent+1);
 456                                                leaves += 1;
 457                                        } else if (node->left) {
 458                                                assert(node->leftnode == NODE);
 459                                                indent += 1;
 460                                                node = node->left;
 461                                                break;
 462                                        }
 463                                }
 464                                if ((rightmask & bitmask) == 0) {
 465                                        rightmask |= bitmask;
 466                                        if (node->rightnode == LEAF) {
 467                                                assert(node->right);
 468                                                tree->leaf_print(node->right,
 469                                                                 indent+1);
 470                                                leaves += 1;
 471                                        } else if (node->right) {
 472                                                assert(node->rightnode == NODE);
 473                                                indent += 1;
 474                                                node = node->right;
 475                                                break;
 476                                        }
 477                                }
 478                                leftmask &= ~bitmask;
 479                                rightmask &= ~bitmask;
 480                                node = node->parent;
 481                                indent -= 1;
 482                        }
 483                }
 484        }
 485        printf("nodes %d leaves %d singletons %d\n",
 486               nodes, leaves, singletons);
 487}
 488
 489/*
 490 * Allocate an initialize a new internal node.
 491 */
 492static struct node *alloc_node(struct node *parent)
 493{
 494        struct node *node;
 495        int bitnum;
 496
 497        node = malloc(sizeof(*node));
 498        node->left = node->right = NULL;
 499        node->parent = parent;
 500        node->leftnode = NODE;
 501        node->rightnode = NODE;
 502        node->keybits = 0;
 503        node->keymask = 0;
 504        node->mark = 0;
 505        node->index = 0;
 506        node->offset = -1;
 507        node->size = 4;
 508
 509        if (node->parent) {
 510                bitnum = parent->bitnum;
 511                if ((bitnum & 7) == 0) {
 512                        node->bitnum = bitnum + 7 + 8;
 513                        node->nextbyte = 1;
 514                } else {
 515                        node->bitnum = bitnum - 1;
 516                        node->nextbyte = 0;
 517                }
 518        } else {
 519                node->bitnum = 7;
 520                node->nextbyte = 0;
 521        }
 522
 523        return node;
 524}
 525
 526/*
 527 * Insert a new leaf into the tree, and collapse any subtrees that are
 528 * fully populated and end in identical leaves. A nextbyte tagged
 529 * internal node will not be removed to preserve the tree's integrity.
 530 * Note that due to the structure of utf8, no nextbyte tagged node
 531 * will be a candidate for removal.
 532 */
 533static int insert(struct tree *tree, char *key, int keylen, void *leaf)
 534{
 535        struct node *node;
 536        struct node *parent;
 537        void **cursor;
 538        int keybits;
 539
 540        assert(keylen >= 1 && keylen <= 4);
 541
 542        node = NULL;
 543        cursor = &tree->root;
 544        keybits = 8 * keylen;
 545
 546        /* Insert, creating path along the way. */
 547        while (keybits) {
 548                if (!*cursor)
 549                        *cursor = alloc_node(node);
 550                node = *cursor;
 551                if (node->nextbyte)
 552                        key++;
 553                if (*key & (1 << (node->bitnum & 7)))
 554                        cursor = &node->right;
 555                else
 556                        cursor = &node->left;
 557                keybits--;
 558        }
 559        *cursor = leaf;
 560
 561        /* Merge subtrees if possible. */
 562        while (node) {
 563                if (*key & (1 << (node->bitnum & 7)))
 564                        node->rightnode = LEAF;
 565                else
 566                        node->leftnode = LEAF;
 567                if (node->nextbyte)
 568                        break;
 569                if (node->leftnode == NODE || node->rightnode == NODE)
 570                        break;
 571                assert(node->left);
 572                assert(node->right);
 573                /* Compare */
 574                if (! tree->leaf_equal(node->left, node->right))
 575                        break;
 576                /* Keep left, drop right leaf. */
 577                leaf = node->left;
 578                /* Check in parent */
 579                parent = node->parent;
 580                if (!parent) {
 581                        /* root of tree! */
 582                        tree->root = leaf;
 583                        tree->childnode = LEAF;
 584                } else if (parent->left == node) {
 585                        parent->left = leaf;
 586                        parent->leftnode = LEAF;
 587                        if (parent->right) {
 588                                parent->keymask = 0;
 589                                parent->keybits = 0;
 590                        } else {
 591                                parent->keymask |= (1 << node->bitnum);
 592                        }
 593                } else if (parent->right == node) {
 594                        parent->right = leaf;
 595                        parent->rightnode = LEAF;
 596                        if (parent->left) {
 597                                parent->keymask = 0;
 598                                parent->keybits = 0;
 599                        } else {
 600                                parent->keymask |= (1 << node->bitnum);
 601                                parent->keybits |= (1 << node->bitnum);
 602                        }
 603                } else {
 604                        /* internal tree error */
 605                        assert(0);
 606                }
 607                free(node);
 608                node = parent;
 609        }
 610
 611        /* Propagate keymasks up along singleton chains. */
 612        while (node) {
 613                parent = node->parent;
 614                if (!parent)
 615                        break;
 616                /* Nix the mask for parents with two children. */
 617                if (node->keymask == 0) {
 618                        parent->keymask = 0;
 619                        parent->keybits = 0;
 620                } else if (parent->left && parent->right) {
 621                        parent->keymask = 0;
 622                        parent->keybits = 0;
 623                } else {
 624                        assert((parent->keymask & node->keymask) == 0);
 625                        parent->keymask |= node->keymask;
 626                        parent->keymask |= (1 << parent->bitnum);
 627                        parent->keybits |= node->keybits;
 628                        if (parent->right)
 629                                parent->keybits |= (1 << parent->bitnum);
 630                }
 631                node = parent;
 632        }
 633
 634        return 0;
 635}
 636
 637/*
 638 * Prune internal nodes.
 639 *
 640 * Fully populated subtrees that end at the same leaf have already
 641 * been collapsed.  There are still internal nodes that have for both
 642 * their left and right branches a sequence of singletons that make
 643 * identical choices and end in identical leaves.  The keymask and
 644 * keybits collected in the nodes describe the choices made in these
 645 * singleton chains.  When they are identical for the left and right
 646 * branch of a node, and the two leaves comare identical, the node in
 647 * question can be removed.
 648 *
 649 * Note that nodes with the nextbyte tag set will not be removed by
 650 * this to ensure tree integrity.  Note as well that the structure of
 651 * utf8 ensures that these nodes would not have been candidates for
 652 * removal in any case.
 653 */
 654static void prune(struct tree *tree)
 655{
 656        struct node *node;
 657        struct node *left;
 658        struct node *right;
 659        struct node *parent;
 660        void *leftleaf;
 661        void *rightleaf;
 662        unsigned int leftmask;
 663        unsigned int rightmask;
 664        unsigned int bitmask;
 665        int count;
 666
 667        if (verbose > 0)
 668                printf("Pruning %s_%x\n", tree->type, tree->maxage);
 669
 670        count = 0;
 671        if (tree->childnode == LEAF)
 672                return;
 673        if (!tree->root)
 674                return;
 675
 676        leftmask = rightmask = 0;
 677        node = tree->root;
 678        while (node) {
 679                if (node->nextbyte)
 680                        goto advance;
 681                if (node->leftnode == LEAF)
 682                        goto advance;
 683                if (node->rightnode == LEAF)
 684                        goto advance;
 685                if (!node->left)
 686                        goto advance;
 687                if (!node->right)
 688                        goto advance;
 689                left = node->left;
 690                right = node->right;
 691                if (left->keymask == 0)
 692                        goto advance;
 693                if (right->keymask == 0)
 694                        goto advance;
 695                if (left->keymask != right->keymask)
 696                        goto advance;
 697                if (left->keybits != right->keybits)
 698                        goto advance;
 699                leftleaf = NULL;
 700                while (!leftleaf) {
 701                        assert(left->left || left->right);
 702                        if (left->leftnode == LEAF)
 703                                leftleaf = left->left;
 704                        else if (left->rightnode == LEAF)
 705                                leftleaf = left->right;
 706                        else if (left->left)
 707                                left = left->left;
 708                        else if (left->right)
 709                                left = left->right;
 710                        else
 711                                assert(0);
 712                }
 713                rightleaf = NULL;
 714                while (!rightleaf) {
 715                        assert(right->left || right->right);
 716                        if (right->leftnode == LEAF)
 717                                rightleaf = right->left;
 718                        else if (right->rightnode == LEAF)
 719                                rightleaf = right->right;
 720                        else if (right->left)
 721                                right = right->left;
 722                        else if (right->right)
 723                                right = right->right;
 724                        else
 725                                assert(0);
 726                }
 727                if (! tree->leaf_equal(leftleaf, rightleaf))
 728                        goto advance;
 729                /*
 730                 * This node has identical singleton-only subtrees.
 731                 * Remove it.
 732                 */
 733                parent = node->parent;
 734                left = node->left;
 735                right = node->right;
 736                if (parent->left == node)
 737                        parent->left = left;
 738                else if (parent->right == node)
 739                        parent->right = left;
 740                else
 741                        assert(0);
 742                left->parent = parent;
 743                left->keymask |= (1 << node->bitnum);
 744                node->left = NULL;
 745                while (node) {
 746                        bitmask = 1 << node->bitnum;
 747                        leftmask &= ~bitmask;
 748                        rightmask &= ~bitmask;
 749                        if (node->leftnode == NODE && node->left) {
 750                                left = node->left;
 751                                free(node);
 752                                count++;
 753                                node = left;
 754                        } else if (node->rightnode == NODE && node->right) {
 755                                right = node->right;
 756                                free(node);
 757                                count++;
 758                                node = right;
 759                        } else {
 760                                node = NULL;
 761                        }
 762                }
 763                /* Propagate keymasks up along singleton chains. */
 764                node = parent;
 765                /* Force re-check */
 766                bitmask = 1 << node->bitnum;
 767                leftmask &= ~bitmask;
 768                rightmask &= ~bitmask;
 769                for (;;) {
 770                        if (node->left && node->right)
 771                                break;
 772                        if (node->left) {
 773                                left = node->left;
 774                                node->keymask |= left->keymask;
 775                                node->keybits |= left->keybits;
 776                        }
 777                        if (node->right) {
 778                                right = node->right;
 779                                node->keymask |= right->keymask;
 780                                node->keybits |= right->keybits;
 781                        }
 782                        node->keymask |= (1 << node->bitnum);
 783                        node = node->parent;
 784                        /* Force re-check */
 785                        bitmask = 1 << node->bitnum;
 786                        leftmask &= ~bitmask;
 787                        rightmask &= ~bitmask;
 788                }
 789        advance:
 790                bitmask = 1 << node->bitnum;
 791                if ((leftmask & bitmask) == 0 &&
 792                    node->leftnode == NODE &&
 793                    node->left) {
 794                        leftmask |= bitmask;
 795                        node = node->left;
 796                } else if ((rightmask & bitmask) == 0 &&
 797                           node->rightnode == NODE &&
 798                           node->right) {
 799                        rightmask |= bitmask;
 800                        node = node->right;
 801                } else {
 802                        leftmask &= ~bitmask;
 803                        rightmask &= ~bitmask;
 804                        node = node->parent;
 805                }
 806        }
 807        if (verbose > 0)
 808                printf("Pruned %d nodes\n", count);
 809}
 810
 811/*
 812 * Mark the nodes in the tree that lead to leaves that must be
 813 * emitted.
 814 */
 815static void mark_nodes(struct tree *tree)
 816{
 817        struct node *node;
 818        struct node *n;
 819        unsigned int leftmask;
 820        unsigned int rightmask;
 821        unsigned int bitmask;
 822        int marked;
 823
 824        marked = 0;
 825        if (verbose > 0)
 826                printf("Marking %s_%x\n", tree->type, tree->maxage);
 827        if (tree->childnode == LEAF)
 828                goto done;
 829
 830        assert(tree->childnode == NODE);
 831        node = tree->root;
 832        leftmask = rightmask = 0;
 833        while (node) {
 834                bitmask = 1 << node->bitnum;
 835                if ((leftmask & bitmask) == 0) {
 836                        leftmask |= bitmask;
 837                        if (node->leftnode == LEAF) {
 838                                assert(node->left);
 839                                if (tree->leaf_mark(node->left)) {
 840                                        n = node;
 841                                        while (n && !n->mark) {
 842                                                marked++;
 843                                                n->mark = 1;
 844                                                n = n->parent;
 845                                        }
 846                                }
 847                        } else if (node->left) {
 848                                assert(node->leftnode == NODE);
 849                                node = node->left;
 850                                continue;
 851                        }
 852                }
 853                if ((rightmask & bitmask) == 0) {
 854                        rightmask |= bitmask;
 855                        if (node->rightnode == LEAF) {
 856                                assert(node->right);
 857                                if (tree->leaf_mark(node->right)) {
 858                                        n = node;
 859                                        while (n && !n->mark) {
 860                                                marked++;
 861                                                n->mark = 1;
 862                                                n = n->parent;
 863                                        }
 864                                }
 865                        } else if (node->right) {
 866                                assert(node->rightnode == NODE);
 867                                node = node->right;
 868                                continue;
 869                        }
 870                }
 871                leftmask &= ~bitmask;
 872                rightmask &= ~bitmask;
 873                node = node->parent;
 874        }
 875
 876        /* second pass: left siblings and singletons */
 877
 878        assert(tree->childnode == NODE);
 879        node = tree->root;
 880        leftmask = rightmask = 0;
 881        while (node) {
 882                bitmask = 1 << node->bitnum;
 883                if ((leftmask & bitmask) == 0) {
 884                        leftmask |= bitmask;
 885                        if (node->leftnode == LEAF) {
 886                                assert(node->left);
 887                                if (tree->leaf_mark(node->left)) {
 888                                        n = node;
 889                                        while (n && !n->mark) {
 890                                                marked++;
 891                                                n->mark = 1;
 892                                                n = n->parent;
 893                                        }
 894                                }
 895                        } else if (node->left) {
 896                                assert(node->leftnode == NODE);
 897                                node = node->left;
 898                                if (!node->mark && node->parent->mark) {
 899                                        marked++;
 900                                        node->mark = 1;
 901                                }
 902                                continue;
 903                        }
 904                }
 905                if ((rightmask & bitmask) == 0) {
 906                        rightmask |= bitmask;
 907                        if (node->rightnode == LEAF) {
 908                                assert(node->right);
 909                                if (tree->leaf_mark(node->right)) {
 910                                        n = node;
 911                                        while (n && !n->mark) {
 912                                                marked++;
 913                                                n->mark = 1;
 914                                                n = n->parent;
 915                                        }
 916                                }
 917                        } else if (node->right) {
 918                                assert(node->rightnode == NODE);
 919                                node = node->right;
 920                                if (!node->mark && node->parent->mark &&
 921                                    !node->parent->left) {
 922                                        marked++;
 923                                        node->mark = 1;
 924                                }
 925                                continue;
 926                        }
 927                }
 928                leftmask &= ~bitmask;
 929                rightmask &= ~bitmask;
 930                node = node->parent;
 931        }
 932done:
 933        if (verbose > 0)
 934                printf("Marked %d nodes\n", marked);
 935}
 936
 937/*
 938 * Compute the index of each node and leaf, which is the offset in the
 939 * emitted trie.  These values must be pre-computed because relative
 940 * offsets between nodes are used to navigate the tree.
 941 */
 942static int index_nodes(struct tree *tree, int index)
 943{
 944        struct node *node;
 945        unsigned int leftmask;
 946        unsigned int rightmask;
 947        unsigned int bitmask;
 948        int count;
 949        int indent;
 950
 951        /* Align to a cache line (or half a cache line?). */
 952        while (index % 64)
 953                index++;
 954        tree->index = index;
 955        indent = 1;
 956        count = 0;
 957
 958        if (verbose > 0)
 959                printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
 960        if (tree->childnode == LEAF) {
 961                index += tree->leaf_size(tree->root);
 962                goto done;
 963        }
 964
 965        assert(tree->childnode == NODE);
 966        node = tree->root;
 967        leftmask = rightmask = 0;
 968        while (node) {
 969                if (!node->mark)
 970                        goto skip;
 971                count++;
 972                if (node->index != index)
 973                        node->index = index;
 974                index += node->size;
 975skip:
 976                while (node) {
 977                        bitmask = 1 << node->bitnum;
 978                        if (node->mark && (leftmask & bitmask) == 0) {
 979                                leftmask |= bitmask;
 980                                if (node->leftnode == LEAF) {
 981                                        assert(node->left);
 982                                        *tree->leaf_index(tree, node->left) =
 983                                                                        index;
 984                                        index += tree->leaf_size(node->left);
 985                                        count++;
 986                                } else if (node->left) {
 987                                        assert(node->leftnode == NODE);
 988                                        indent += 1;
 989                                        node = node->left;
 990                                        break;
 991                                }
 992                        }
 993                        if (node->mark && (rightmask & bitmask) == 0) {
 994                                rightmask |= bitmask;
 995                                if (node->rightnode == LEAF) {
 996                                        assert(node->right);
 997                                        *tree->leaf_index(tree, node->right) = index;
 998                                        index += tree->leaf_size(node->right);
 999                                        count++;
1000                                } else if (node->right) {
1001                                        assert(node->rightnode == NODE);
1002                                        indent += 1;
1003                                        node = node->right;
1004                                        break;
1005                                }
1006                        }
1007                        leftmask &= ~bitmask;
1008                        rightmask &= ~bitmask;
1009                        node = node->parent;
1010                        indent -= 1;
1011                }
1012        }
1013done:
1014        /* Round up to a multiple of 16 */
1015        while (index % 16)
1016                index++;
1017        if (verbose > 0)
1018                printf("Final index %d\n", index);
1019        return index;
1020}
1021
1022/*
1023 * Mark the nodes in a subtree, helper for size_nodes().
1024 */
1025static int mark_subtree(struct node *node)
1026{
1027        int changed;
1028
1029        if (!node || node->mark)
1030                return 0;
1031        node->mark = 1;
1032        node->index = node->parent->index;
1033        changed = 1;
1034        if (node->leftnode == NODE)
1035                changed += mark_subtree(node->left);
1036        if (node->rightnode == NODE)
1037                changed += mark_subtree(node->right);
1038        return changed;
1039}
1040
1041/*
1042 * Compute the size of nodes and leaves. We start by assuming that
1043 * each node needs to store a three-byte offset. The indexes of the
1044 * nodes are calculated based on that, and then this function is
1045 * called to see if the sizes of some nodes can be reduced.  This is
1046 * repeated until no more changes are seen.
1047 */
1048static int size_nodes(struct tree *tree)
1049{
1050        struct tree *next;
1051        struct node *node;
1052        struct node *right;
1053        struct node *n;
1054        unsigned int leftmask;
1055        unsigned int rightmask;
1056        unsigned int bitmask;
1057        unsigned int pathbits;
1058        unsigned int pathmask;
1059        unsigned int nbit;
1060        int changed;
1061        int offset;
1062        int size;
1063        int indent;
1064
1065        indent = 1;
1066        changed = 0;
1067        size = 0;
1068
1069        if (verbose > 0)
1070                printf("Sizing %s_%x\n", tree->type, tree->maxage);
1071        if (tree->childnode == LEAF)
1072                goto done;
1073
1074        assert(tree->childnode == NODE);
1075        pathbits = 0;
1076        pathmask = 0;
1077        node = tree->root;
1078        leftmask = rightmask = 0;
1079        while (node) {
1080                if (!node->mark)
1081                        goto skip;
1082                offset = 0;
1083                if (!node->left || !node->right) {
1084                        size = 1;
1085                } else {
1086                        if (node->rightnode == NODE) {
1087                                /*
1088                                 * If the right node is not marked,
1089                                 * look for a corresponding node in
1090                                 * the next tree.  Such a node need
1091                                 * not exist.
1092                                 */
1093                                right = node->right;
1094                                next = tree->next;
1095                                while (!right->mark) {
1096                                        assert(next);
1097                                        n = next->root;
1098                                        while (n->bitnum != node->bitnum) {
1099                                                nbit = 1 << n->bitnum;
1100                                                if (!(pathmask & nbit))
1101                                                        break;
1102                                                if (pathbits & nbit) {
1103                                                        if (n->rightnode == LEAF)
1104                                                                break;
1105                                                        n = n->right;
1106                                                } else {
1107                                                        if (n->leftnode == LEAF)
1108                                                                break;
1109                                                        n = n->left;
1110                                                }
1111                                        }
1112                                        if (n->bitnum != node->bitnum)
1113                                                break;
1114                                        n = n->right;
1115                                        right = n;
1116                                        next = next->next;
1117                                }
1118                                /* Make sure the right node is marked. */
1119                                if (!right->mark)
1120                                        changed += mark_subtree(right);
1121                                offset = right->index - node->index;
1122                        } else {
1123                                offset = *tree->leaf_index(tree, node->right);
1124                                offset -= node->index;
1125                        }
1126                        assert(offset >= 0);
1127                        assert(offset <= 0xffffff);
1128                        if (offset <= 0xff) {
1129                                size = 2;
1130                        } else if (offset <= 0xffff) {
1131                                size = 3;
1132                        } else { /* offset <= 0xffffff */
1133                                size = 4;
1134                        }
1135                }
1136                if (node->size != size || node->offset != offset) {
1137                        node->size = size;
1138                        node->offset = offset;
1139                        changed++;
1140                }
1141skip:
1142                while (node) {
1143                        bitmask = 1 << node->bitnum;
1144                        pathmask |= bitmask;
1145                        if (node->mark && (leftmask & bitmask) == 0) {
1146                                leftmask |= bitmask;
1147                                if (node->leftnode == LEAF) {
1148                                        assert(node->left);
1149                                } else if (node->left) {
1150                                        assert(node->leftnode == NODE);
1151                                        indent += 1;
1152                                        node = node->left;
1153                                        break;
1154                                }
1155                        }
1156                        if (node->mark && (rightmask & bitmask) == 0) {
1157                                rightmask |= bitmask;
1158                                pathbits |= bitmask;
1159                                if (node->rightnode == LEAF) {
1160                                        assert(node->right);
1161                                } else if (node->right) {
1162                                        assert(node->rightnode == NODE);
1163                                        indent += 1;
1164                                        node = node->right;
1165                                        break;
1166                                }
1167                        }
1168                        leftmask &= ~bitmask;
1169                        rightmask &= ~bitmask;
1170                        pathmask &= ~bitmask;
1171                        pathbits &= ~bitmask;
1172                        node = node->parent;
1173                        indent -= 1;
1174                }
1175        }
1176done:
1177        if (verbose > 0)
1178                printf("Found %d changes\n", changed);
1179        return changed;
1180}
1181
1182/*
1183 * Emit a trie for the given tree into the data array.
1184 */
1185static void emit(struct tree *tree, unsigned char *data)
1186{
1187        struct node *node;
1188        unsigned int leftmask;
1189        unsigned int rightmask;
1190        unsigned int bitmask;
1191        int offlen;
1192        int offset;
1193        int index;
1194        int indent;
1195        int size;
1196        int bytes;
1197        int leaves;
1198        int nodes[4];
1199        unsigned char byte;
1200
1201        nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
1202        leaves = 0;
1203        bytes = 0;
1204        index = tree->index;
1205        data += index;
1206        indent = 1;
1207        if (verbose > 0)
1208                printf("Emitting %s_%x\n", tree->type, tree->maxage);
1209        if (tree->childnode == LEAF) {
1210                assert(tree->root);
1211                tree->leaf_emit(tree->root, data);
1212                size = tree->leaf_size(tree->root);
1213                index += size;
1214                leaves++;
1215                goto done;
1216        }
1217
1218        assert(tree->childnode == NODE);
1219        node = tree->root;
1220        leftmask = rightmask = 0;
1221        while (node) {
1222                if (!node->mark)
1223                        goto skip;
1224                assert(node->offset != -1);
1225                assert(node->index == index);
1226
1227                byte = 0;
1228                if (node->nextbyte)
1229                        byte |= NEXTBYTE;
1230                byte |= (node->bitnum & BITNUM);
1231                if (node->left && node->right) {
1232                        if (node->leftnode == NODE)
1233                                byte |= LEFTNODE;
1234                        if (node->rightnode == NODE)
1235                                byte |= RIGHTNODE;
1236                        if (node->offset <= 0xff)
1237                                offlen = 1;
1238                        else if (node->offset <= 0xffff)
1239                                offlen = 2;
1240                        else
1241                                offlen = 3;
1242                        nodes[offlen]++;
1243                        offset = node->offset;
1244                        byte |= offlen << OFFLEN_SHIFT;
1245                        *data++ = byte;
1246                        index++;
1247                        while (offlen--) {
1248                                *data++ = offset & 0xff;
1249                                index++;
1250                                offset >>= 8;
1251                        }
1252                } else if (node->left) {
1253                        if (node->leftnode == NODE)
1254                                byte |= TRIENODE;
1255                        nodes[0]++;
1256                        *data++ = byte;
1257                        index++;
1258                } else if (node->right) {
1259                        byte |= RIGHTNODE;
1260                        if (node->rightnode == NODE)
1261                                byte |= TRIENODE;
1262                        nodes[0]++;
1263                        *data++ = byte;
1264                        index++;
1265                } else {
1266                        assert(0);
1267                }
1268skip:
1269                while (node) {
1270                        bitmask = 1 << node->bitnum;
1271                        if (node->mark && (leftmask & bitmask) == 0) {
1272                                leftmask |= bitmask;
1273                                if (node->leftnode == LEAF) {
1274                                        assert(node->left);
1275                                        data = tree->leaf_emit(node->left,
1276                                                               data);
1277                                        size = tree->leaf_size(node->left);
1278                                        index += size;
1279                                        bytes += size;
1280                                        leaves++;
1281                                } else if (node->left) {
1282                                        assert(node->leftnode == NODE);
1283                                        indent += 1;
1284                                        node = node->left;
1285                                        break;
1286                                }
1287                        }
1288                        if (node->mark && (rightmask & bitmask) == 0) {
1289                                rightmask |= bitmask;
1290                                if (node->rightnode == LEAF) {
1291                                        assert(node->right);
1292                                        data = tree->leaf_emit(node->right,
1293                                                               data);
1294                                        size = tree->leaf_size(node->right);
1295                                        index += size;
1296                                        bytes += size;
1297                                        leaves++;
1298                                } else if (node->right) {
1299                                        assert(node->rightnode == NODE);
1300                                        indent += 1;
1301                                        node = node->right;
1302                                        break;
1303                                }
1304                        }
1305                        leftmask &= ~bitmask;
1306                        rightmask &= ~bitmask;
1307                        node = node->parent;
1308                        indent -= 1;
1309                }
1310        }
1311done:
1312        if (verbose > 0) {
1313                printf("Emitted %d (%d) leaves",
1314                        leaves, bytes);
1315                printf(" %d (%d+%d+%d+%d) nodes",
1316                        nodes[0] + nodes[1] + nodes[2] + nodes[3],
1317                        nodes[0], nodes[1], nodes[2], nodes[3]);
1318                printf(" %d total\n", index - tree->index);
1319        }
1320}
1321
1322/* ------------------------------------------------------------------ */
1323
1324/*
1325 * Unicode data.
1326 *
1327 * We need to keep track of the Canonical Combining Class, the Age,
1328 * and decompositions for a code point.
1329 *
1330 * For the Age, we store the index into the ages table.  Effectively
1331 * this is a generation number that the table maps to a unicode
1332 * version.
1333 *
1334 * The correction field is used to indicate that this entry is in the
1335 * corrections array, which contains decompositions that were
1336 * corrected in later revisions.  The value of the correction field is
1337 * the Unicode version in which the mapping was corrected.
1338 */
1339struct unicode_data {
1340        unsigned int code;
1341        int ccc;
1342        int gen;
1343        int correction;
1344        unsigned int *utf32nfdi;
1345        unsigned int *utf32nfdicf;
1346        char *utf8nfdi;
1347        char *utf8nfdicf;
1348};
1349
1350struct unicode_data unicode_data[0x110000];
1351struct unicode_data *corrections;
1352int    corrections_count;
1353
1354struct tree *nfdi_tree;
1355struct tree *nfdicf_tree;
1356
1357struct tree *trees;
1358int          trees_count;
1359
1360/*
1361 * Check the corrections array to see if this entry was corrected at
1362 * some point.
1363 */
1364static struct unicode_data *corrections_lookup(struct unicode_data *u)
1365{
1366        int i;
1367
1368        for (i = 0; i != corrections_count; i++)
1369                if (u->code == corrections[i].code)
1370                        return &corrections[i];
1371        return u;
1372}
1373
1374static int nfdi_equal(void *l, void *r)
1375{
1376        struct unicode_data *left = l;
1377        struct unicode_data *right = r;
1378
1379        if (left->gen != right->gen)
1380                return 0;
1381        if (left->ccc != right->ccc)
1382                return 0;
1383        if (left->utf8nfdi && right->utf8nfdi &&
1384            strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
1385                return 1;
1386        if (left->utf8nfdi || right->utf8nfdi)
1387                return 0;
1388        return 1;
1389}
1390
1391static int nfdicf_equal(void *l, void *r)
1392{
1393        struct unicode_data *left = l;
1394        struct unicode_data *right = r;
1395
1396        if (left->gen != right->gen)
1397                return 0;
1398        if (left->ccc != right->ccc)
1399                return 0;
1400        if (left->utf8nfdicf && right->utf8nfdicf &&
1401            strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0)
1402                return 1;
1403        if (left->utf8nfdicf && right->utf8nfdicf)
1404                return 0;
1405        if (left->utf8nfdicf || right->utf8nfdicf)
1406                return 0;
1407        if (left->utf8nfdi && right->utf8nfdi &&
1408            strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
1409                return 1;
1410        if (left->utf8nfdi || right->utf8nfdi)
1411                return 0;
1412        return 1;
1413}
1414
1415static void nfdi_print(void *l, int indent)
1416{
1417        struct unicode_data *leaf = l;
1418
1419        printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1420                leaf->code, leaf->ccc, leaf->gen);
1421
1422        if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
1423                printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
1424        else if (leaf->utf8nfdi)
1425                printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
1426
1427        printf("\n");
1428}
1429
1430static void nfdicf_print(void *l, int indent)
1431{
1432        struct unicode_data *leaf = l;
1433
1434        printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1435                leaf->code, leaf->ccc, leaf->gen);
1436
1437        if (leaf->utf8nfdicf)
1438                printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf);
1439        else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
1440                printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
1441        else if (leaf->utf8nfdi)
1442                printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
1443        printf("\n");
1444}
1445
1446static int nfdi_mark(void *l)
1447{
1448        return 1;
1449}
1450
1451static int nfdicf_mark(void *l)
1452{
1453        struct unicode_data *leaf = l;
1454
1455        if (leaf->utf8nfdicf)
1456                return 1;
1457        return 0;
1458}
1459
1460static int correction_mark(void *l)
1461{
1462        struct unicode_data *leaf = l;
1463
1464        return leaf->correction;
1465}
1466
1467static int nfdi_size(void *l)
1468{
1469        struct unicode_data *leaf = l;
1470        int size = 2;
1471
1472        if (HANGUL_SYLLABLE(leaf->code))
1473                size += 1;
1474        else if (leaf->utf8nfdi)
1475                size += strlen(leaf->utf8nfdi) + 1;
1476        return size;
1477}
1478
1479static int nfdicf_size(void *l)
1480{
1481        struct unicode_data *leaf = l;
1482        int size = 2;
1483
1484        if (HANGUL_SYLLABLE(leaf->code))
1485                size += 1;
1486        else if (leaf->utf8nfdicf)
1487                size += strlen(leaf->utf8nfdicf) + 1;
1488        else if (leaf->utf8nfdi)
1489                size += strlen(leaf->utf8nfdi) + 1;
1490        return size;
1491}
1492
1493static int *nfdi_index(struct tree *tree, void *l)
1494{
1495        struct unicode_data *leaf = l;
1496
1497        return &tree->leafindex[leaf->code];
1498}
1499
1500static int *nfdicf_index(struct tree *tree, void *l)
1501{
1502        struct unicode_data *leaf = l;
1503
1504        return &tree->leafindex[leaf->code];
1505}
1506
1507static unsigned char *nfdi_emit(void *l, unsigned char *data)
1508{
1509        struct unicode_data *leaf = l;
1510        unsigned char *s;
1511
1512        *data++ = leaf->gen;
1513
1514        if (HANGUL_SYLLABLE(leaf->code)) {
1515                *data++ = DECOMPOSE;
1516                *data++ = HANGUL;
1517        } else if (leaf->utf8nfdi) {
1518                *data++ = DECOMPOSE;
1519                s = (unsigned char*)leaf->utf8nfdi;
1520                while ((*data++ = *s++) != 0)
1521                        ;
1522        } else {
1523                *data++ = leaf->ccc;
1524        }
1525        return data;
1526}
1527
1528static unsigned char *nfdicf_emit(void *l, unsigned char *data)
1529{
1530        struct unicode_data *leaf = l;
1531        unsigned char *s;
1532
1533        *data++ = leaf->gen;
1534
1535        if (HANGUL_SYLLABLE(leaf->code)) {
1536                *data++ = DECOMPOSE;
1537                *data++ = HANGUL;
1538        } else if (leaf->utf8nfdicf) {
1539                *data++ = DECOMPOSE;
1540                s = (unsigned char*)leaf->utf8nfdicf;
1541                while ((*data++ = *s++) != 0)
1542                        ;
1543        } else if (leaf->utf8nfdi) {
1544                *data++ = DECOMPOSE;
1545                s = (unsigned char*)leaf->utf8nfdi;
1546                while ((*data++ = *s++) != 0)
1547                        ;
1548        } else {
1549                *data++ = leaf->ccc;
1550        }
1551        return data;
1552}
1553
1554static void utf8_create(struct unicode_data *data)
1555{
1556        char utf[18*4+1];
1557        char *u;
1558        unsigned int *um;
1559        int i;
1560
1561        if (data->utf8nfdi) {
1562                assert(data->utf8nfdi[0] == HANGUL);
1563                return;
1564        }
1565
1566        u = utf;
1567        um = data->utf32nfdi;
1568        if (um) {
1569                for (i = 0; um[i]; i++)
1570                        u += utf8encode(u, um[i]);
1571                *u = '\0';
1572                data->utf8nfdi = strdup(utf);
1573        }
1574        u = utf;
1575        um = data->utf32nfdicf;
1576        if (um) {
1577                for (i = 0; um[i]; i++)
1578                        u += utf8encode(u, um[i]);
1579                *u = '\0';
1580                if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf))
1581                        data->utf8nfdicf = strdup(utf);
1582        }
1583}
1584
1585static void utf8_init(void)
1586{
1587        unsigned int unichar;
1588        int i;
1589
1590        for (unichar = 0; unichar != 0x110000; unichar++)
1591                utf8_create(&unicode_data[unichar]);
1592
1593        for (i = 0; i != corrections_count; i++)
1594                utf8_create(&corrections[i]);
1595}
1596
1597static void trees_init(void)
1598{
1599        struct unicode_data *data;
1600        unsigned int maxage;
1601        unsigned int nextage;
1602        int count;
1603        int i;
1604        int j;
1605
1606        /* Count the number of different ages. */
1607        count = 0;
1608        nextage = (unsigned int)-1;
1609        do {
1610                maxage = nextage;
1611                nextage = 0;
1612                for (i = 0; i <= corrections_count; i++) {
1613                        data = &corrections[i];
1614                        if (nextage < data->correction &&
1615                            data->correction < maxage)
1616                                nextage = data->correction;
1617                }
1618                count++;
1619        } while (nextage);
1620
1621        /* Two trees per age: nfdi and nfdicf */
1622        trees_count = count * 2;
1623        trees = calloc(trees_count, sizeof(struct tree));
1624
1625        /* Assign ages to the trees. */
1626        count = trees_count;
1627        nextage = (unsigned int)-1;
1628        do {
1629                maxage = nextage;
1630                trees[--count].maxage = maxage;
1631                trees[--count].maxage = maxage;
1632                nextage = 0;
1633                for (i = 0; i <= corrections_count; i++) {
1634                        data = &corrections[i];
1635                        if (nextage < data->correction &&
1636                            data->correction < maxage)
1637                                nextage = data->correction;
1638                }
1639        } while (nextage);
1640
1641        /* The ages assigned above are off by one. */
1642        for (i = 0; i != trees_count; i++) {
1643                j = 0;
1644                while (ages[j] < trees[i].maxage)
1645                        j++;
1646                trees[i].maxage = ages[j-1];
1647        }
1648
1649        /* Set up the forwarding between trees. */
1650        trees[trees_count-2].next = &trees[trees_count-1];
1651        trees[trees_count-1].leaf_mark = nfdi_mark;
1652        trees[trees_count-2].leaf_mark = nfdicf_mark;
1653        for (i = 0; i != trees_count-2; i += 2) {
1654                trees[i].next = &trees[trees_count-2];
1655                trees[i].leaf_mark = correction_mark;
1656                trees[i+1].next = &trees[trees_count-1];
1657                trees[i+1].leaf_mark = correction_mark;
1658        }
1659
1660        /* Assign the callouts. */
1661        for (i = 0; i != trees_count; i += 2) {
1662                trees[i].type = "nfdicf";
1663                trees[i].leaf_equal = nfdicf_equal;
1664                trees[i].leaf_print = nfdicf_print;
1665                trees[i].leaf_size = nfdicf_size;
1666                trees[i].leaf_index = nfdicf_index;
1667                trees[i].leaf_emit = nfdicf_emit;
1668
1669                trees[i+1].type = "nfdi";
1670                trees[i+1].leaf_equal = nfdi_equal;
1671                trees[i+1].leaf_print = nfdi_print;
1672                trees[i+1].leaf_size = nfdi_size;
1673                trees[i+1].leaf_index = nfdi_index;
1674                trees[i+1].leaf_emit = nfdi_emit;
1675        }
1676
1677        /* Finish init. */
1678        for (i = 0; i != trees_count; i++)
1679                trees[i].childnode = NODE;
1680}
1681
1682static void trees_populate(void)
1683{
1684        struct unicode_data *data;
1685        unsigned int unichar;
1686        char keyval[4];
1687        int keylen;
1688        int i;
1689
1690        for (i = 0; i != trees_count; i++) {
1691                if (verbose > 0) {
1692                        printf("Populating %s_%x\n",
1693                                trees[i].type, trees[i].maxage);
1694                }
1695                for (unichar = 0; unichar != 0x110000; unichar++) {
1696                        if (unicode_data[unichar].gen < 0)
1697                                continue;
1698                        keylen = utf8encode(keyval, unichar);
1699                        data = corrections_lookup(&unicode_data[unichar]);
1700                        if (data->correction <= trees[i].maxage)
1701                                data = &unicode_data[unichar];
1702                        insert(&trees[i], keyval, keylen, data);
1703                }
1704        }
1705}
1706
1707static void trees_reduce(void)
1708{
1709        int i;
1710        int size;
1711        int changed;
1712
1713        for (i = 0; i != trees_count; i++)
1714                prune(&trees[i]);
1715        for (i = 0; i != trees_count; i++)
1716                mark_nodes(&trees[i]);
1717        do {
1718                size = 0;
1719                for (i = 0; i != trees_count; i++)
1720                        size = index_nodes(&trees[i], size);
1721                changed = 0;
1722                for (i = 0; i != trees_count; i++)
1723                        changed += size_nodes(&trees[i]);
1724        } while (changed);
1725
1726        utf8data = calloc(size, 1);
1727        utf8data_size = size;
1728        for (i = 0; i != trees_count; i++)
1729                emit(&trees[i], utf8data);
1730
1731        if (verbose > 0) {
1732                for (i = 0; i != trees_count; i++) {
1733                        printf("%s_%x idx %d\n",
1734                                trees[i].type, trees[i].maxage, trees[i].index);
1735                }
1736        }
1737
1738        nfdi = utf8data + trees[trees_count-1].index;
1739        nfdicf = utf8data + trees[trees_count-2].index;
1740
1741        nfdi_tree = &trees[trees_count-1];
1742        nfdicf_tree = &trees[trees_count-2];
1743}
1744
1745static void verify(struct tree *tree)
1746{
1747        struct unicode_data *data;
1748        utf8leaf_t      *leaf;
1749        unsigned int    unichar;
1750        char            key[4];
1751        unsigned char   hangul[UTF8HANGULLEAF];
1752        int             report;
1753        int             nocf;
1754
1755        if (verbose > 0)
1756                printf("Verifying %s_%x\n", tree->type, tree->maxage);
1757        nocf = strcmp(tree->type, "nfdicf");
1758
1759        for (unichar = 0; unichar != 0x110000; unichar++) {
1760                report = 0;
1761                data = corrections_lookup(&unicode_data[unichar]);
1762                if (data->correction <= tree->maxage)
1763                        data = &unicode_data[unichar];
1764                utf8encode(key,unichar);
1765                leaf = utf8lookup(tree, hangul, key);
1766
1767                if (!leaf) {
1768                        if (data->gen != -1)
1769                                report++;
1770                        if (unichar < 0xd800 || unichar > 0xdfff)
1771                                report++;
1772                } else {
1773                        if (unichar >= 0xd800 && unichar <= 0xdfff)
1774                                report++;
1775                        if (data->gen == -1)
1776                                report++;
1777                        if (data->gen != LEAF_GEN(leaf))
1778                                report++;
1779                        if (LEAF_CCC(leaf) == DECOMPOSE) {
1780                                if (HANGUL_SYLLABLE(data->code)) {
1781                                        if (data->utf8nfdi[0] != HANGUL)
1782                                                report++;
1783                                } else if (nocf) {
1784                                        if (!data->utf8nfdi) {
1785                                                report++;
1786                                        } else if (strcmp(data->utf8nfdi,
1787                                                          LEAF_STR(leaf))) {
1788                                                report++;
1789                                        }
1790                                } else {
1791                                        if (!data->utf8nfdicf &&
1792                                            !data->utf8nfdi) {
1793                                                report++;
1794                                        } else if (data->utf8nfdicf) {
1795                                                if (strcmp(data->utf8nfdicf,
1796                                                           LEAF_STR(leaf)))
1797                                                        report++;
1798                                        } else if (strcmp(data->utf8nfdi,
1799                                                          LEAF_STR(leaf))) {
1800                                                report++;
1801                                        }
1802                                }
1803                        } else if (data->ccc != LEAF_CCC(leaf)) {
1804                                report++;
1805                        }
1806                }
1807                if (report) {
1808                        printf("%X code %X gen %d ccc %d"
1809                                " nfdi -> \"%s\"",
1810                                unichar, data->code, data->gen,
1811                                data->ccc,
1812                                data->utf8nfdi);
1813                        if (leaf) {
1814                                printf(" gen %d ccc %d"
1815                                        " nfdi -> \"%s\"",
1816                                        LEAF_GEN(leaf),
1817                                        LEAF_CCC(leaf),
1818                                        LEAF_CCC(leaf) == DECOMPOSE ?
1819                                                LEAF_STR(leaf) : "");
1820                        }
1821                        printf("\n");
1822                }
1823        }
1824}
1825
1826static void trees_verify(void)
1827{
1828        int i;
1829
1830        for (i = 0; i != trees_count; i++)
1831                verify(&trees[i]);
1832}
1833
1834/* ------------------------------------------------------------------ */
1835
1836static void help(void)
1837{
1838        printf("Usage: %s [options]\n", argv0);
1839        printf("\n");
1840        printf("This program creates an a data trie used for parsing and\n");
1841        printf("normalization of UTF-8 strings. The trie is derived from\n");
1842        printf("a set of input files from the Unicode character database\n");
1843        printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
1844        printf("\n");
1845        printf("The generated tree supports two normalization forms:\n");
1846        printf("\n");
1847        printf("\tnfdi:\n");
1848        printf("\t- Apply unicode normalization form NFD.\n");
1849        printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1850        printf("\n");
1851        printf("\tnfdicf:\n");
1852        printf("\t- Apply unicode normalization form NFD.\n");
1853        printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1854        printf("\t- Apply a full casefold (C + F).\n");
1855        printf("\n");
1856        printf("These forms were chosen as being most useful when dealing\n");
1857        printf("with file names: NFD catches most cases where characters\n");
1858        printf("should be considered equivalent. The ignorables are mostly\n");
1859        printf("invisible, making names hard to type.\n");
1860        printf("\n");
1861        printf("The options to specify the files to be used are listed\n");
1862        printf("below with their default values, which are the names used\n");
1863        printf("by version 11.0.0 of the Unicode Character Database.\n");
1864        printf("\n");
1865        printf("The input files:\n");
1866        printf("\t-a %s\n", AGE_NAME);
1867        printf("\t-c %s\n", CCC_NAME);
1868        printf("\t-p %s\n", PROP_NAME);
1869        printf("\t-d %s\n", DATA_NAME);
1870        printf("\t-f %s\n", FOLD_NAME);
1871        printf("\t-n %s\n", NORM_NAME);
1872        printf("\n");
1873        printf("Additionally, the generated tables are tested using:\n");
1874        printf("\t-t %s\n", TEST_NAME);
1875        printf("\n");
1876        printf("Finally, the output file:\n");
1877        printf("\t-o %s\n", UTF8_NAME);
1878        printf("\n");
1879}
1880
1881static void usage(void)
1882{
1883        help();
1884        exit(1);
1885}
1886
1887static void open_fail(const char *name, int error)
1888{
1889        printf("Error %d opening %s: %s\n", error, name, strerror(error));
1890        exit(1);
1891}
1892
1893static void file_fail(const char *filename)
1894{
1895        printf("Error parsing %s\n", filename);
1896        exit(1);
1897}
1898
1899static void line_fail(const char *filename, const char *line)
1900{
1901        printf("Error parsing %s:%s\n", filename, line);
1902        exit(1);
1903}
1904
1905/* ------------------------------------------------------------------ */
1906
1907static void print_utf32(unsigned int *utf32str)
1908{
1909        int     i;
1910
1911        for (i = 0; utf32str[i]; i++)
1912                printf(" %X", utf32str[i]);
1913}
1914
1915static void print_utf32nfdi(unsigned int unichar)
1916{
1917        printf(" %X ->", unichar);
1918        print_utf32(unicode_data[unichar].utf32nfdi);
1919        printf("\n");
1920}
1921
1922static void print_utf32nfdicf(unsigned int unichar)
1923{
1924        printf(" %X ->", unichar);
1925        print_utf32(unicode_data[unichar].utf32nfdicf);
1926        printf("\n");
1927}
1928
1929/* ------------------------------------------------------------------ */
1930
1931static void age_init(void)
1932{
1933        FILE *file;
1934        unsigned int first;
1935        unsigned int last;
1936        unsigned int unichar;
1937        unsigned int major;
1938        unsigned int minor;
1939        unsigned int revision;
1940        int gen;
1941        int count;
1942        int ret;
1943
1944        if (verbose > 0)
1945                printf("Parsing %s\n", age_name);
1946
1947        file = fopen(age_name, "r");
1948        if (!file)
1949                open_fail(age_name, errno);
1950        count = 0;
1951
1952        gen = 0;
1953        while (fgets(line, LINESIZE, file)) {
1954                ret = sscanf(line, "# Age=V%d_%d_%d",
1955                                &major, &minor, &revision);
1956                if (ret == 3) {
1957                        ages_count++;
1958                        if (verbose > 1)
1959                                printf(" Age V%d_%d_%d\n",
1960                                        major, minor, revision);
1961                        if (!age_valid(major, minor, revision))
1962                                line_fail(age_name, line);
1963                        continue;
1964                }
1965                ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
1966                if (ret == 2) {
1967                        ages_count++;
1968                        if (verbose > 1)
1969                                printf(" Age V%d_%d\n", major, minor);
1970                        if (!age_valid(major, minor, 0))
1971                                line_fail(age_name, line);
1972                        continue;
1973                }
1974        }
1975
1976        /* We must have found something above. */
1977        if (verbose > 1)
1978                printf("%d age entries\n", ages_count);
1979        if (ages_count == 0 || ages_count > MAXGEN)
1980                file_fail(age_name);
1981
1982        /* There is a 0 entry. */
1983        ages_count++;
1984        ages = calloc(ages_count + 1, sizeof(*ages));
1985        /* And a guard entry. */
1986        ages[ages_count] = (unsigned int)-1;
1987
1988        rewind(file);
1989        count = 0;
1990        gen = 0;
1991        while (fgets(line, LINESIZE, file)) {
1992                ret = sscanf(line, "# Age=V%d_%d_%d",
1993                                &major, &minor, &revision);
1994                if (ret == 3) {
1995                        ages[++gen] =
1996                                UNICODE_AGE(major, minor, revision);
1997                        if (verbose > 1)
1998                                printf(" Age V%d_%d_%d = gen %d\n",
1999                                        major, minor, revision, gen);
2000                        if (!age_valid(major, minor, revision))
2001                                line_fail(age_name, line);
2002                        continue;
2003                }
2004                ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
2005                if (ret == 2) {
2006                        ages[++gen] = UNICODE_AGE(major, minor, 0);
2007                        if (verbose > 1)
2008                                printf(" Age V%d_%d = %d\n",
2009                                        major, minor, gen);
2010                        if (!age_valid(major, minor, 0))
2011                                line_fail(age_name, line);
2012                        continue;
2013                }
2014                ret = sscanf(line, "%X..%X ; %d.%d #",
2015                             &first, &last, &major, &minor);
2016                if (ret == 4) {
2017                        for (unichar = first; unichar <= last; unichar++)
2018                                unicode_data[unichar].gen = gen;
2019                        count += 1 + last - first;
2020                        if (verbose > 1)
2021                                printf("  %X..%X gen %d\n", first, last, gen);
2022                        if (!utf32valid(first) || !utf32valid(last))
2023                                line_fail(age_name, line);
2024                        continue;
2025                }
2026                ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
2027                if (ret == 3) {
2028                        unicode_data[unichar].gen = gen;
2029                        count++;
2030                        if (verbose > 1)
2031                                printf("  %X gen %d\n", unichar, gen);
2032                        if (!utf32valid(unichar))
2033                                line_fail(age_name, line);
2034                        continue;
2035                }
2036        }
2037        unicode_maxage = ages[gen];
2038        fclose(file);
2039
2040        /* Nix surrogate block */
2041        if (verbose > 1)
2042                printf(" Removing surrogate block D800..DFFF\n");
2043        for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
2044                unicode_data[unichar].gen = -1;
2045
2046        if (verbose > 0)
2047                printf("Found %d entries\n", count);
2048        if (count == 0)
2049                file_fail(age_name);
2050}
2051
2052static void ccc_init(void)
2053{
2054        FILE *file;
2055        unsigned int first;
2056        unsigned int last;
2057        unsigned int unichar;
2058        unsigned int value;
2059        int count;
2060        int ret;
2061
2062        if (verbose > 0)
2063                printf("Parsing %s\n", ccc_name);
2064
2065        file = fopen(ccc_name, "r");
2066        if (!file)
2067                open_fail(ccc_name, errno);
2068
2069        count = 0;
2070        while (fgets(line, LINESIZE, file)) {
2071                ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
2072                if (ret == 3) {
2073                        for (unichar = first; unichar <= last; unichar++) {
2074                                unicode_data[unichar].ccc = value;
2075                                count++;
2076                        }
2077                        if (verbose > 1)
2078                                printf(" %X..%X ccc %d\n", first, last, value);
2079                        if (!utf32valid(first) || !utf32valid(last))
2080                                line_fail(ccc_name, line);
2081                        continue;
2082                }
2083                ret = sscanf(line, "%X ; %d #", &unichar, &value);
2084                if (ret == 2) {
2085                        unicode_data[unichar].ccc = value;
2086                        count++;
2087                        if (verbose > 1)
2088                                printf(" %X ccc %d\n", unichar, value);
2089                        if (!utf32valid(unichar))
2090                                line_fail(ccc_name, line);
2091                        continue;
2092                }
2093        }
2094        fclose(file);
2095
2096        if (verbose > 0)
2097                printf("Found %d entries\n", count);
2098        if (count == 0)
2099                file_fail(ccc_name);
2100}
2101
2102static int ignore_compatibility_form(char *type)
2103{
2104        int i;
2105        char *ignored_types[] = {"font", "noBreak", "initial", "medial",
2106                                 "final", "isolated", "circle", "super",
2107                                 "sub", "vertical", "wide", "narrow",
2108                                 "small", "square", "fraction", "compat"};
2109
2110        for (i = 0 ; i < ARRAY_SIZE(ignored_types); i++)
2111                if (strcmp(type, ignored_types[i]) == 0)
2112                        return 1;
2113        return 0;
2114}
2115
2116static void nfdi_init(void)
2117{
2118        FILE *file;
2119        unsigned int unichar;
2120        unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2121        char *s;
2122        char *type;
2123        unsigned int *um;
2124        int count;
2125        int i;
2126        int ret;
2127
2128        if (verbose > 0)
2129                printf("Parsing %s\n", data_name);
2130        file = fopen(data_name, "r");
2131        if (!file)
2132                open_fail(data_name, errno);
2133
2134        count = 0;
2135        while (fgets(line, LINESIZE, file)) {
2136                ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
2137                             &unichar, buf0);
2138                if (ret != 2)
2139                        continue;
2140                if (!utf32valid(unichar))
2141                        line_fail(data_name, line);
2142
2143                s = buf0;
2144                /* skip over <tag> */
2145                if (*s == '<') {
2146                        type = ++s;
2147                        while (*++s != '>');
2148                        *s++ = '\0';
2149                        if(ignore_compatibility_form(type))
2150                                continue;
2151                }
2152                /* decode the decomposition into UTF-32 */
2153                i = 0;
2154                while (*s) {
2155                        mapping[i] = strtoul(s, &s, 16);
2156                        if (!utf32valid(mapping[i]))
2157                                line_fail(data_name, line);
2158                        i++;
2159                }
2160                mapping[i++] = 0;
2161
2162                um = malloc(i * sizeof(unsigned int));
2163                memcpy(um, mapping, i * sizeof(unsigned int));
2164                unicode_data[unichar].utf32nfdi = um;
2165
2166                if (verbose > 1)
2167                        print_utf32nfdi(unichar);
2168                count++;
2169        }
2170        fclose(file);
2171        if (verbose > 0)
2172                printf("Found %d entries\n", count);
2173        if (count == 0)
2174                file_fail(data_name);
2175}
2176
2177static void nfdicf_init(void)
2178{
2179        FILE *file;
2180        unsigned int unichar;
2181        unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2182        char status;
2183        char *s;
2184        unsigned int *um;
2185        int i;
2186        int count;
2187        int ret;
2188
2189        if (verbose > 0)
2190                printf("Parsing %s\n", fold_name);
2191        file = fopen(fold_name, "r");
2192        if (!file)
2193                open_fail(fold_name, errno);
2194
2195        count = 0;
2196        while (fgets(line, LINESIZE, file)) {
2197                ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
2198                if (ret != 3)
2199                        continue;
2200                if (!utf32valid(unichar))
2201                        line_fail(fold_name, line);
2202                /* Use the C+F casefold. */
2203                if (status != 'C' && status != 'F')
2204                        continue;
2205                s = buf0;
2206                if (*s == '<')
2207                        while (*s++ != ' ')
2208                                ;
2209                i = 0;
2210                while (*s) {
2211                        mapping[i] = strtoul(s, &s, 16);
2212                        if (!utf32valid(mapping[i]))
2213                                line_fail(fold_name, line);
2214                        i++;
2215                }
2216                mapping[i++] = 0;
2217
2218                um = malloc(i * sizeof(unsigned int));
2219                memcpy(um, mapping, i * sizeof(unsigned int));
2220                unicode_data[unichar].utf32nfdicf = um;
2221
2222                if (verbose > 1)
2223                        print_utf32nfdicf(unichar);
2224                count++;
2225        }
2226        fclose(file);
2227        if (verbose > 0)
2228                printf("Found %d entries\n", count);
2229        if (count == 0)
2230                file_fail(fold_name);
2231}
2232
2233static void ignore_init(void)
2234{
2235        FILE *file;
2236        unsigned int unichar;
2237        unsigned int first;
2238        unsigned int last;
2239        unsigned int *um;
2240        int count;
2241        int ret;
2242
2243        if (verbose > 0)
2244                printf("Parsing %s\n", prop_name);
2245        file = fopen(prop_name, "r");
2246        if (!file)
2247                open_fail(prop_name, errno);
2248        assert(file);
2249        count = 0;
2250        while (fgets(line, LINESIZE, file)) {
2251                ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
2252                if (ret == 3) {
2253                        if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2254                                continue;
2255                        if (!utf32valid(first) || !utf32valid(last))
2256                                line_fail(prop_name, line);
2257                        for (unichar = first; unichar <= last; unichar++) {
2258                                free(unicode_data[unichar].utf32nfdi);
2259                                um = malloc(sizeof(unsigned int));
2260                                *um = 0;
2261                                unicode_data[unichar].utf32nfdi = um;
2262                                free(unicode_data[unichar].utf32nfdicf);
2263                                um = malloc(sizeof(unsigned int));
2264                                *um = 0;
2265                                unicode_data[unichar].utf32nfdicf = um;
2266                                count++;
2267                        }
2268                        if (verbose > 1)
2269                                printf(" %X..%X Default_Ignorable_Code_Point\n",
2270                                        first, last);
2271                        continue;
2272                }
2273                ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
2274                if (ret == 2) {
2275                        if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2276                                continue;
2277                        if (!utf32valid(unichar))
2278                                line_fail(prop_name, line);
2279                        free(unicode_data[unichar].utf32nfdi);
2280                        um = malloc(sizeof(unsigned int));
2281                        *um = 0;
2282                        unicode_data[unichar].utf32nfdi = um;
2283                        free(unicode_data[unichar].utf32nfdicf);
2284                        um = malloc(sizeof(unsigned int));
2285                        *um = 0;
2286                        unicode_data[unichar].utf32nfdicf = um;
2287                        if (verbose > 1)
2288                                printf(" %X Default_Ignorable_Code_Point\n",
2289                                        unichar);
2290                        count++;
2291                        continue;
2292                }
2293        }
2294        fclose(file);
2295
2296        if (verbose > 0)
2297                printf("Found %d entries\n", count);
2298        if (count == 0)
2299                file_fail(prop_name);
2300}
2301
2302static void corrections_init(void)
2303{
2304        FILE *file;
2305        unsigned int unichar;
2306        unsigned int major;
2307        unsigned int minor;
2308        unsigned int revision;
2309        unsigned int age;
2310        unsigned int *um;
2311        unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2312        char *s;
2313        int i;
2314        int count;
2315        int ret;
2316
2317        if (verbose > 0)
2318                printf("Parsing %s\n", norm_name);
2319        file = fopen(norm_name, "r");
2320        if (!file)
2321                open_fail(norm_name, errno);
2322
2323        count = 0;
2324        while (fgets(line, LINESIZE, file)) {
2325                ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2326                                &unichar, buf0, buf1,
2327                                &major, &minor, &revision);
2328                if (ret != 6)
2329                        continue;
2330                if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2331                        line_fail(norm_name, line);
2332                count++;
2333        }
2334        corrections = calloc(count, sizeof(struct unicode_data));
2335        corrections_count = count;
2336        rewind(file);
2337
2338        count = 0;
2339        while (fgets(line, LINESIZE, file)) {
2340                ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2341                                &unichar, buf0, buf1,
2342                                &major, &minor, &revision);
2343                if (ret != 6)
2344                        continue;
2345                if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2346                        line_fail(norm_name, line);
2347                corrections[count] = unicode_data[unichar];
2348                assert(corrections[count].code == unichar);
2349                age = UNICODE_AGE(major, minor, revision);
2350                corrections[count].correction = age;
2351
2352                i = 0;
2353                s = buf0;
2354                while (*s) {
2355                        mapping[i] = strtoul(s, &s, 16);
2356                        if (!utf32valid(mapping[i]))
2357                                line_fail(norm_name, line);
2358                        i++;
2359                }
2360                mapping[i++] = 0;
2361
2362                um = malloc(i * sizeof(unsigned int));
2363                memcpy(um, mapping, i * sizeof(unsigned int));
2364                corrections[count].utf32nfdi = um;
2365
2366                if (verbose > 1)
2367                        printf(" %X -> %s -> %s V%d_%d_%d\n",
2368                                unichar, buf0, buf1, major, minor, revision);
2369                count++;
2370        }
2371        fclose(file);
2372
2373        if (verbose > 0)
2374                printf("Found %d entries\n", count);
2375        if (count == 0)
2376                file_fail(norm_name);
2377}
2378
2379/* ------------------------------------------------------------------ */
2380
2381/*
2382 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2383 *
2384 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2385 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2386 *
2387 * SBase = 0xAC00
2388 * LBase = 0x1100
2389 * VBase = 0x1161
2390 * TBase = 0x11A7
2391 * LCount = 19
2392 * VCount = 21
2393 * TCount = 28
2394 * NCount = 588 (VCount * TCount)
2395 * SCount = 11172 (LCount * NCount)
2396 *
2397 * Decomposition:
2398 *   SIndex = s - SBase
2399 *
2400 * LV (Canonical/Full)
2401 *   LIndex = SIndex / NCount
2402 *   VIndex = (Sindex % NCount) / TCount
2403 *   LPart = LBase + LIndex
2404 *   VPart = VBase + VIndex
2405 *
2406 * LVT (Canonical)
2407 *   LVIndex = (SIndex / TCount) * TCount
2408 *   TIndex = (Sindex % TCount)
2409 *   LVPart = SBase + LVIndex
2410 *   TPart = TBase + TIndex
2411 *
2412 * LVT (Full)
2413 *   LIndex = SIndex / NCount
2414 *   VIndex = (Sindex % NCount) / TCount
2415 *   TIndex = (Sindex % TCount)
2416 *   LPart = LBase + LIndex
2417 *   VPart = VBase + VIndex
2418 *   if (TIndex == 0) {
2419 *          d = <LPart, VPart>
2420 *   } else {
2421 *          TPart = TBase + TIndex
2422 *          d = <LPart, VPart, TPart>
2423 *   }
2424 *
2425 */
2426
2427static void hangul_decompose(void)
2428{
2429        unsigned int sb = 0xAC00;
2430        unsigned int lb = 0x1100;
2431        unsigned int vb = 0x1161;
2432        unsigned int tb = 0x11a7;
2433        /* unsigned int lc = 19; */
2434        unsigned int vc = 21;
2435        unsigned int tc = 28;
2436        unsigned int nc = (vc * tc);
2437        /* unsigned int sc = (lc * nc); */
2438        unsigned int unichar;
2439        unsigned int mapping[4];
2440        unsigned int *um;
2441        int count;
2442        int i;
2443
2444        if (verbose > 0)
2445                printf("Decomposing hangul\n");
2446        /* Hangul */
2447        count = 0;
2448        for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
2449                unsigned int si = unichar - sb;
2450                unsigned int li = si / nc;
2451                unsigned int vi = (si % nc) / tc;
2452                unsigned int ti = si % tc;
2453
2454                i = 0;
2455                mapping[i++] = lb + li;
2456                mapping[i++] = vb + vi;
2457                if (ti)
2458                        mapping[i++] = tb + ti;
2459                mapping[i++] = 0;
2460
2461                assert(!unicode_data[unichar].utf32nfdi);
2462                um = malloc(i * sizeof(unsigned int));
2463                memcpy(um, mapping, i * sizeof(unsigned int));
2464                unicode_data[unichar].utf32nfdi = um;
2465
2466                assert(!unicode_data[unichar].utf32nfdicf);
2467                um = malloc(i * sizeof(unsigned int));
2468                memcpy(um, mapping, i * sizeof(unsigned int));
2469                unicode_data[unichar].utf32nfdicf = um;
2470
2471                /*
2472                 * Add a cookie as a reminder that the hangul syllable
2473                 * decompositions must not be stored in the generated
2474                 * trie.
2475                 */
2476                unicode_data[unichar].utf8nfdi = malloc(2);
2477                unicode_data[unichar].utf8nfdi[0] = HANGUL;
2478                unicode_data[unichar].utf8nfdi[1] = '\0';
2479
2480                if (verbose > 1)
2481                        print_utf32nfdi(unichar);
2482
2483                count++;
2484        }
2485        if (verbose > 0)
2486                printf("Created %d entries\n", count);
2487}
2488
2489static void nfdi_decompose(void)
2490{
2491        unsigned int unichar;
2492        unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2493        unsigned int *um;
2494        unsigned int *dc;
2495        int count;
2496        int i;
2497        int j;
2498        int ret;
2499
2500        if (verbose > 0)
2501                printf("Decomposing nfdi\n");
2502
2503        count = 0;
2504        for (unichar = 0; unichar != 0x110000; unichar++) {
2505                if (!unicode_data[unichar].utf32nfdi)
2506                        continue;
2507                for (;;) {
2508                        ret = 1;
2509                        i = 0;
2510                        um = unicode_data[unichar].utf32nfdi;
2511                        while (*um) {
2512                                dc = unicode_data[*um].utf32nfdi;
2513                                if (dc) {
2514                                        for (j = 0; dc[j]; j++)
2515                                                mapping[i++] = dc[j];
2516                                        ret = 0;
2517                                } else {
2518                                        mapping[i++] = *um;
2519                                }
2520                                um++;
2521                        }
2522                        mapping[i++] = 0;
2523                        if (ret)
2524                                break;
2525                        free(unicode_data[unichar].utf32nfdi);
2526                        um = malloc(i * sizeof(unsigned int));
2527                        memcpy(um, mapping, i * sizeof(unsigned int));
2528                        unicode_data[unichar].utf32nfdi = um;
2529                }
2530                /* Add this decomposition to nfdicf if there is no entry. */
2531                if (!unicode_data[unichar].utf32nfdicf) {
2532                        um = malloc(i * sizeof(unsigned int));
2533                        memcpy(um, mapping, i * sizeof(unsigned int));
2534                        unicode_data[unichar].utf32nfdicf = um;
2535                }
2536                if (verbose > 1)
2537                        print_utf32nfdi(unichar);
2538                count++;
2539        }
2540        if (verbose > 0)
2541                printf("Processed %d entries\n", count);
2542}
2543
2544static void nfdicf_decompose(void)
2545{
2546        unsigned int unichar;
2547        unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2548        unsigned int *um;
2549        unsigned int *dc;
2550        int count;
2551        int i;
2552        int j;
2553        int ret;
2554
2555        if (verbose > 0)
2556                printf("Decomposing nfdicf\n");
2557        count = 0;
2558        for (unichar = 0; unichar != 0x110000; unichar++) {
2559                if (!unicode_data[unichar].utf32nfdicf)
2560                        continue;
2561                for (;;) {
2562                        ret = 1;
2563                        i = 0;
2564                        um = unicode_data[unichar].utf32nfdicf;
2565                        while (*um) {
2566                                dc = unicode_data[*um].utf32nfdicf;
2567                                if (dc) {
2568                                        for (j = 0; dc[j]; j++)
2569                                                mapping[i++] = dc[j];
2570                                        ret = 0;
2571                                } else {
2572                                        mapping[i++] = *um;
2573                                }
2574                                um++;
2575                        }
2576                        mapping[i++] = 0;
2577                        if (ret)
2578                                break;
2579                        free(unicode_data[unichar].utf32nfdicf);
2580                        um = malloc(i * sizeof(unsigned int));
2581                        memcpy(um, mapping, i * sizeof(unsigned int));
2582                        unicode_data[unichar].utf32nfdicf = um;
2583                }
2584                if (verbose > 1)
2585                        print_utf32nfdicf(unichar);
2586                count++;
2587        }
2588        if (verbose > 0)
2589                printf("Processed %d entries\n", count);
2590}
2591
2592/* ------------------------------------------------------------------ */
2593
2594int utf8agemax(struct tree *, const char *);
2595int utf8nagemax(struct tree *, const char *, size_t);
2596int utf8agemin(struct tree *, const char *);
2597int utf8nagemin(struct tree *, const char *, size_t);
2598ssize_t utf8len(struct tree *, const char *);
2599ssize_t utf8nlen(struct tree *, const char *, size_t);
2600struct utf8cursor;
2601int utf8cursor(struct utf8cursor *, struct tree *, const char *);
2602int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t);
2603int utf8byte(struct utf8cursor *);
2604
2605/*
2606 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2607 *
2608 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2609 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2610 *
2611 * SBase = 0xAC00
2612 * LBase = 0x1100
2613 * VBase = 0x1161
2614 * TBase = 0x11A7
2615 * LCount = 19
2616 * VCount = 21
2617 * TCount = 28
2618 * NCount = 588 (VCount * TCount)
2619 * SCount = 11172 (LCount * NCount)
2620 *
2621 * Decomposition:
2622 *   SIndex = s - SBase
2623 *
2624 * LV (Canonical/Full)
2625 *   LIndex = SIndex / NCount
2626 *   VIndex = (Sindex % NCount) / TCount
2627 *   LPart = LBase + LIndex
2628 *   VPart = VBase + VIndex
2629 *
2630 * LVT (Canonical)
2631 *   LVIndex = (SIndex / TCount) * TCount
2632 *   TIndex = (Sindex % TCount)
2633 *   LVPart = SBase + LVIndex
2634 *   TPart = TBase + TIndex
2635 *
2636 * LVT (Full)
2637 *   LIndex = SIndex / NCount
2638 *   VIndex = (Sindex % NCount) / TCount
2639 *   TIndex = (Sindex % TCount)
2640 *   LPart = LBase + LIndex
2641 *   VPart = VBase + VIndex
2642 *   if (TIndex == 0) {
2643 *          d = <LPart, VPart>
2644 *   } else {
2645 *          TPart = TBase + TIndex
2646 *          d = <LPart, VPart, TPart>
2647 *   }
2648 */
2649
2650/* Constants */
2651#define SB      (0xAC00)
2652#define LB      (0x1100)
2653#define VB      (0x1161)
2654#define TB      (0x11A7)
2655#define LC      (19)
2656#define VC      (21)
2657#define TC      (28)
2658#define NC      (VC * TC)
2659#define SC      (LC * NC)
2660
2661/* Algorithmic decomposition of hangul syllable. */
2662static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
2663{
2664        unsigned int    si;
2665        unsigned int    li;
2666        unsigned int    vi;
2667        unsigned int    ti;
2668        unsigned char   *h;
2669
2670        /* Calculate the SI, LI, VI, and TI values. */
2671        si = utf8decode(str) - SB;
2672        li = si / NC;
2673        vi = (si % NC) / TC;
2674        ti = si % TC;
2675
2676        /* Fill in base of leaf. */
2677        h = hangul;
2678        LEAF_GEN(h) = 2;
2679        LEAF_CCC(h) = DECOMPOSE;
2680        h += 2;
2681
2682        /* Add LPart, a 3-byte UTF-8 sequence. */
2683        h += utf8encode((char *)h, li + LB);
2684
2685        /* Add VPart, a 3-byte UTF-8 sequence. */
2686        h += utf8encode((char *)h, vi + VB);
2687
2688        /* Add TPart if required, also a 3-byte UTF-8 sequence. */
2689        if (ti)
2690                h += utf8encode((char *)h, ti + TB);
2691
2692        /* Terminate string. */
2693        h[0] = '\0';
2694
2695        return hangul;
2696}
2697
2698/*
2699 * Use trie to scan s, touching at most len bytes.
2700 * Returns the leaf if one exists, NULL otherwise.
2701 *
2702 * A non-NULL return guarantees that the UTF-8 sequence starting at s
2703 * is well-formed and corresponds to a known unicode code point.  The
2704 * shorthand for this will be "is valid UTF-8 unicode".
2705 */
2706static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul,
2707                               const char *s, size_t len)
2708{
2709        utf8trie_t      *trie;
2710        int             offlen;
2711        int             offset;
2712        int             mask;
2713        int             node;
2714
2715        if (!tree)
2716                return NULL;
2717        if (len == 0)
2718                return NULL;
2719        node = 1;
2720        trie = utf8data + tree->index;
2721        while (node) {
2722                offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
2723                if (*trie & NEXTBYTE) {
2724                        if (--len == 0)
2725                                return NULL;
2726                        s++;
2727                }
2728                mask = 1 << (*trie & BITNUM);
2729                if (*s & mask) {
2730                        /* Right leg */
2731                        if (offlen) {
2732                                /* Right node at offset of trie */
2733                                node = (*trie & RIGHTNODE);
2734                                offset = trie[offlen];
2735                                while (--offlen) {
2736                                        offset <<= 8;
2737                                        offset |= trie[offlen];
2738                                }
2739                                trie += offset;
2740                        } else if (*trie & RIGHTPATH) {
2741                                /* Right node after this node */
2742                                node = (*trie & TRIENODE);
2743                                trie++;
2744                        } else {
2745                                /* No right node. */
2746                                return NULL;
2747                        }
2748                } else {
2749                        /* Left leg */
2750                        if (offlen) {
2751                                /* Left node after this node. */
2752                                node = (*trie & LEFTNODE);
2753                                trie += offlen + 1;
2754                        } else if (*trie & RIGHTPATH) {
2755                                /* No left node. */
2756                                return NULL;
2757                        } else {
2758                                /* Left node after this node */
2759                                node = (*trie & TRIENODE);
2760                                trie++;
2761                        }
2762                }
2763        }
2764        /*
2765         * Hangul decomposition is done algorithmically. These are the
2766         * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
2767         * always 3 bytes long, so s has been advanced twice, and the
2768         * start of the sequence is at s-2.
2769         */
2770        if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
2771                trie = utf8hangul(s - 2, hangul);
2772        return trie;
2773}
2774
2775/*
2776 * Use trie to scan s.
2777 * Returns the leaf if one exists, NULL otherwise.
2778 *
2779 * Forwards to trie_nlookup().
2780 */
2781static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul,
2782                              const char *s)
2783{
2784        return utf8nlookup(tree, hangul, s, (size_t)-1);
2785}
2786
2787/*
2788 * Return the number of bytes used by the current UTF-8 sequence.
2789 * Assumes the input points to the first byte of a valid UTF-8
2790 * sequence.
2791 */
2792static inline int utf8clen(const char *s)
2793{
2794        unsigned char c = *s;
2795        return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
2796}
2797
2798/*
2799 * Maximum age of any character in s.
2800 * Return -1 if s is not valid UTF-8 unicode.
2801 * Return 0 if only non-assigned code points are used.
2802 */
2803int utf8agemax(struct tree *tree, const char *s)
2804{
2805        utf8leaf_t      *leaf;
2806        int             age = 0;
2807        int             leaf_age;
2808        unsigned char   hangul[UTF8HANGULLEAF];
2809
2810        if (!tree)
2811                return -1;
2812
2813        while (*s) {
2814                leaf = utf8lookup(tree, hangul, s);
2815                if (!leaf)
2816                        return -1;
2817                leaf_age = ages[LEAF_GEN(leaf)];
2818                if (leaf_age <= tree->maxage && leaf_age > age)
2819                        age = leaf_age;
2820                s += utf8clen(s);
2821        }
2822        return age;
2823}
2824
2825/*
2826 * Minimum age of any character in s.
2827 * Return -1 if s is not valid UTF-8 unicode.
2828 * Return 0 if non-assigned code points are used.
2829 */
2830int utf8agemin(struct tree *tree, const char *s)
2831{
2832        utf8leaf_t      *leaf;
2833        int             age;
2834        int             leaf_age;
2835        unsigned char   hangul[UTF8HANGULLEAF];
2836
2837        if (!tree)
2838                return -1;
2839        age = tree->maxage;
2840        while (*s) {
2841                leaf = utf8lookup(tree, hangul, s);
2842                if (!leaf)
2843                        return -1;
2844                leaf_age = ages[LEAF_GEN(leaf)];
2845                if (leaf_age <= tree->maxage && leaf_age < age)
2846                        age = leaf_age;
2847                s += utf8clen(s);
2848        }
2849        return age;
2850}
2851
2852/*
2853 * Maximum age of any character in s, touch at most len bytes.
2854 * Return -1 if s is not valid UTF-8 unicode.
2855 */
2856int utf8nagemax(struct tree *tree, const char *s, size_t len)
2857{
2858        utf8leaf_t      *leaf;
2859        int             age = 0;
2860        int             leaf_age;
2861        unsigned char   hangul[UTF8HANGULLEAF];
2862
2863        if (!tree)
2864                return -1;
2865
2866        while (len && *s) {
2867                leaf = utf8nlookup(tree, hangul, s, len);
2868                if (!leaf)
2869                        return -1;
2870                leaf_age = ages[LEAF_GEN(leaf)];
2871                if (leaf_age <= tree->maxage && leaf_age > age)
2872                        age = leaf_age;
2873                len -= utf8clen(s);
2874                s += utf8clen(s);
2875        }
2876        return age;
2877}
2878
2879/*
2880 * Maximum age of any character in s, touch at most len bytes.
2881 * Return -1 if s is not valid UTF-8 unicode.
2882 */
2883int utf8nagemin(struct tree *tree, const char *s, size_t len)
2884{
2885        utf8leaf_t      *leaf;
2886        int             leaf_age;
2887        int             age;
2888        unsigned char   hangul[UTF8HANGULLEAF];
2889
2890        if (!tree)
2891                return -1;
2892        age = tree->maxage;
2893        while (len && *s) {
2894                leaf = utf8nlookup(tree, hangul, s, len);
2895                if (!leaf)
2896                        return -1;
2897                leaf_age = ages[LEAF_GEN(leaf)];
2898                if (leaf_age <= tree->maxage && leaf_age < age)
2899                        age = leaf_age;
2900                len -= utf8clen(s);
2901                s += utf8clen(s);
2902        }
2903        return age;
2904}
2905
2906/*
2907 * Length of the normalization of s.
2908 * Return -1 if s is not valid UTF-8 unicode.
2909 *
2910 * A string of Default_Ignorable_Code_Point has length 0.
2911 */
2912ssize_t utf8len(struct tree *tree, const char *s)
2913{
2914        utf8leaf_t      *leaf;
2915        size_t          ret = 0;
2916        unsigned char   hangul[UTF8HANGULLEAF];
2917
2918        if (!tree)
2919                return -1;
2920        while (*s) {
2921                leaf = utf8lookup(tree, hangul, s);
2922                if (!leaf)
2923                        return -1;
2924                if (ages[LEAF_GEN(leaf)] > tree->maxage)
2925                        ret += utf8clen(s);
2926                else if (LEAF_CCC(leaf) == DECOMPOSE)
2927                        ret += strlen(LEAF_STR(leaf));
2928                else
2929                        ret += utf8clen(s);
2930                s += utf8clen(s);
2931        }
2932        return ret;
2933}
2934
2935/*
2936 * Length of the normalization of s, touch at most len bytes.
2937 * Return -1 if s is not valid UTF-8 unicode.
2938 */
2939ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
2940{
2941        utf8leaf_t      *leaf;
2942        size_t          ret = 0;
2943        unsigned char   hangul[UTF8HANGULLEAF];
2944
2945        if (!tree)
2946                return -1;
2947        while (len && *s) {
2948                leaf = utf8nlookup(tree, hangul, s, len);
2949                if (!leaf)
2950                        return -1;
2951                if (ages[LEAF_GEN(leaf)] > tree->maxage)
2952                        ret += utf8clen(s);
2953                else if (LEAF_CCC(leaf) == DECOMPOSE)
2954                        ret += strlen(LEAF_STR(leaf));
2955                else
2956                        ret += utf8clen(s);
2957                len -= utf8clen(s);
2958                s += utf8clen(s);
2959        }
2960        return ret;
2961}
2962
2963/*
2964 * Cursor structure used by the normalizer.
2965 */
2966struct utf8cursor {
2967        struct tree     *tree;
2968        const char      *s;
2969        const char      *p;
2970        const char      *ss;
2971        const char      *sp;
2972        unsigned int    len;
2973        unsigned int    slen;
2974        short int       ccc;
2975        short int       nccc;
2976        unsigned int    unichar;
2977        unsigned char   hangul[UTF8HANGULLEAF];
2978};
2979
2980/*
2981 * Set up an utf8cursor for use by utf8byte().
2982 *
2983 *   s      : string.
2984 *   len    : length of s.
2985 *   u8c    : pointer to cursor.
2986 *   trie   : utf8trie_t to use for normalization.
2987 *
2988 * Returns -1 on error, 0 on success.
2989 */
2990int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
2991                size_t len)
2992{
2993        if (!tree)
2994                return -1;
2995        if (!s)
2996                return -1;
2997        u8c->tree = tree;
2998        u8c->s = s;
2999        u8c->p = NULL;
3000        u8c->ss = NULL;
3001        u8c->sp = NULL;
3002        u8c->len = len;
3003        u8c->slen = 0;
3004        u8c->ccc = STOPPER;
3005        u8c->nccc = STOPPER;
3006        u8c->unichar = 0;
3007        /* Check we didn't clobber the maximum length. */
3008        if (u8c->len != len)
3009                return -1;
3010        /* The first byte of s may not be an utf8 continuation. */
3011        if (len > 0 && (*s & 0xC0) == 0x80)
3012                return -1;
3013        return 0;
3014}
3015
3016/*
3017 * Set up an utf8cursor for use by utf8byte().
3018 *
3019 *   s      : NUL-terminated string.
3020 *   u8c    : pointer to cursor.
3021 *   trie   : utf8trie_t to use for normalization.
3022 *
3023 * Returns -1 on error, 0 on success.
3024 */
3025int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
3026{
3027        return utf8ncursor(u8c, tree, s, (unsigned int)-1);
3028}
3029
3030/*
3031 * Get one byte from the normalized form of the string described by u8c.
3032 *
3033 * Returns the byte cast to an unsigned char on succes, and -1 on failure.
3034 *
3035 * The cursor keeps track of the location in the string in u8c->s.
3036 * When a character is decomposed, the current location is stored in
3037 * u8c->p, and u8c->s is set to the start of the decomposition. Note
3038 * that bytes from a decomposition do not count against u8c->len.
3039 *
3040 * Characters are emitted if they match the current CCC in u8c->ccc.
3041 * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
3042 * and the function returns 0 in that case.
3043 *
3044 * Sorting by CCC is done by repeatedly scanning the string.  The
3045 * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
3046 * the start of the scan.  The first pass finds the lowest CCC to be
3047 * emitted and stores it in u8c->nccc, the second pass emits the
3048 * characters with this CCC and finds the next lowest CCC. This limits
3049 * the number of passes to 1 + the number of different CCCs in the
3050 * sequence being scanned.
3051 *
3052 * Therefore:
3053 *  u8c->p  != NULL -> a decomposition is being scanned.
3054 *  u8c->ss != NULL -> this is a repeating scan.
3055 *  u8c->ccc == -1  -> this is the first scan of a repeating scan.
3056 */
3057int utf8byte(struct utf8cursor *u8c)
3058{
3059        utf8leaf_t *leaf;
3060        int ccc;
3061
3062        for (;;) {
3063                /* Check for the end of a decomposed character. */
3064                if (u8c->p && *u8c->s == '\0') {
3065                        u8c->s = u8c->p;
3066                        u8c->p = NULL;
3067                }
3068
3069                /* Check for end-of-string. */
3070                if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
3071                        /* There is no next byte. */
3072                        if (u8c->ccc == STOPPER)
3073                                return 0;
3074                        /* End-of-string during a scan counts as a stopper. */
3075                        ccc = STOPPER;
3076                        goto ccc_mismatch;
3077                } else if ((*u8c->s & 0xC0) == 0x80) {
3078                        /* This is a continuation of the current character. */
3079                        if (!u8c->p)
3080                                u8c->len--;
3081                        return (unsigned char)*u8c->s++;
3082                }
3083
3084                /* Look up the data for the current character. */
3085                if (u8c->p) {
3086                        leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
3087                } else {
3088                        leaf = utf8nlookup(u8c->tree, u8c->hangul,
3089                                           u8c->s, u8c->len);
3090                }
3091
3092                /* No leaf found implies that the input is a binary blob. */
3093                if (!leaf)
3094                        return -1;
3095
3096                /* Characters that are too new have CCC 0. */
3097                if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
3098                        ccc = STOPPER;
3099                } else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
3100                        u8c->len -= utf8clen(u8c->s);
3101                        u8c->p = u8c->s + utf8clen(u8c->s);
3102                        u8c->s = LEAF_STR(leaf);
3103                        /* Empty decomposition implies CCC 0. */
3104                        if (*u8c->s == '\0') {
3105                                if (u8c->ccc == STOPPER)
3106                                        continue;
3107                                ccc = STOPPER;
3108                                goto ccc_mismatch;
3109                        }
3110                        leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
3111                        ccc = LEAF_CCC(leaf);
3112                }
3113                u8c->unichar = utf8decode(u8c->s);
3114
3115                /*
3116                 * If this is not a stopper, then see if it updates
3117                 * the next canonical class to be emitted.
3118                 */
3119                if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
3120                        u8c->nccc = ccc;
3121
3122                /*
3123                 * Return the current byte if this is the current
3124                 * combining class.
3125                 */
3126                if (ccc == u8c->ccc) {
3127                        if (!u8c->p)
3128                                u8c->len--;
3129                        return (unsigned char)*u8c->s++;
3130                }
3131
3132                /* Current combining class mismatch. */
3133        ccc_mismatch:
3134                if (u8c->nccc == STOPPER) {
3135                        /*
3136                         * Scan forward for the first canonical class
3137                         * to be emitted.  Save the position from
3138                         * which to restart.
3139                         */
3140                        assert(u8c->ccc == STOPPER);
3141                        u8c->ccc = MINCCC - 1;
3142                        u8c->nccc = ccc;
3143                        u8c->sp = u8c->p;
3144                        u8c->ss = u8c->s;
3145                        u8c->slen = u8c->len;
3146                        if (!u8c->p)
3147                                u8c->len -= utf8clen(u8c->s);
3148                        u8c->s += utf8clen(u8c->s);
3149                } else if (ccc != STOPPER) {
3150                        /* Not a stopper, and not the ccc we're emitting. */
3151                        if (!u8c->p)
3152                                u8c->len -= utf8clen(u8c->s);
3153                        u8c->s += utf8clen(u8c->s);
3154                } else if (u8c->nccc != MAXCCC + 1) {
3155                        /* At a stopper, restart for next ccc. */
3156                        u8c->ccc = u8c->nccc;
3157                        u8c->nccc = MAXCCC + 1;
3158                        u8c->s = u8c->ss;
3159                        u8c->p = u8c->sp;
3160                        u8c->len = u8c->slen;
3161                } else {
3162                        /* All done, proceed from here. */
3163                        u8c->ccc = STOPPER;
3164                        u8c->nccc = STOPPER;
3165                        u8c->sp = NULL;
3166                        u8c->ss = NULL;
3167                        u8c->slen = 0;
3168                }
3169        }
3170}
3171
3172/* ------------------------------------------------------------------ */
3173
3174static int normalize_line(struct tree *tree)
3175{
3176        char *s;
3177        char *t;
3178        int c;
3179        struct utf8cursor u8c;
3180
3181        /* First test: null-terminated string. */
3182        s = buf2;
3183        t = buf3;
3184        if (utf8cursor(&u8c, tree, s))
3185                return -1;
3186        while ((c = utf8byte(&u8c)) > 0)
3187                if (c != (unsigned char)*t++)
3188                        return -1;
3189        if (c < 0)
3190                return -1;
3191        if (*t != 0)
3192                return -1;
3193
3194        /* Second test: length-limited string. */
3195        s = buf2;
3196        /* Replace NUL with a value that will cause an error if seen. */
3197        s[strlen(s) + 1] = -1;
3198        t = buf3;
3199        if (utf8cursor(&u8c, tree, s))
3200                return -1;
3201        while ((c = utf8byte(&u8c)) > 0)
3202                if (c != (unsigned char)*t++)
3203                        return -1;
3204        if (c < 0)
3205                return -1;
3206        if (*t != 0)
3207                return -1;
3208
3209        return 0;
3210}
3211
3212static void normalization_test(void)
3213{
3214        FILE *file;
3215        unsigned int unichar;
3216        struct unicode_data *data;
3217        char *s;
3218        char *t;
3219        int ret;
3220        int ignorables;
3221        int tests = 0;
3222        int failures = 0;
3223
3224        if (verbose > 0)
3225                printf("Parsing %s\n", test_name);
3226        /* Step one, read data from file. */
3227        file = fopen(test_name, "r");
3228        if (!file)
3229                open_fail(test_name, errno);
3230
3231        while (fgets(line, LINESIZE, file)) {
3232                ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];",
3233                             buf0, buf1);
3234                if (ret != 2 || *line == '#')
3235                        continue;
3236                s = buf0;
3237                t = buf2;
3238                while (*s) {
3239                        unichar = strtoul(s, &s, 16);
3240                        t += utf8encode(t, unichar);
3241                }
3242                *t = '\0';
3243
3244                ignorables = 0;
3245                s = buf1;
3246                t = buf3;
3247                while (*s) {
3248                        unichar = strtoul(s, &s, 16);
3249                        data = &unicode_data[unichar];
3250                        if (data->utf8nfdi && !*data->utf8nfdi)
3251                                ignorables = 1;
3252                        else
3253                                t += utf8encode(t, unichar);
3254                }
3255                *t = '\0';
3256
3257                tests++;
3258                if (normalize_line(nfdi_tree) < 0) {
3259                        printf("Line %s -> %s", buf0, buf1);
3260                        if (ignorables)
3261                                printf(" (ignorables removed)");
3262                        printf(" failure\n");
3263                        failures++;
3264                }
3265        }
3266        fclose(file);
3267        if (verbose > 0)
3268                printf("Ran %d tests with %d failures\n", tests, failures);
3269        if (failures)
3270                file_fail(test_name);
3271}
3272
3273/* ------------------------------------------------------------------ */
3274
3275static void write_file(void)
3276{
3277        FILE *file;
3278        int i;
3279        int j;
3280        int t;
3281        int gen;
3282
3283        if (verbose > 0)
3284                printf("Writing %s\n", utf8_name);
3285        file = fopen(utf8_name, "w");
3286        if (!file)
3287                open_fail(utf8_name, errno);
3288
3289        fprintf(file, "/* This file is generated code, do not edit. */\n");
3290        fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
3291        fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n");
3292        fprintf(file, "#endif\n");
3293        fprintf(file, "\n");
3294        fprintf(file, "static const unsigned int utf8vers = %#x;\n",
3295                unicode_maxage);
3296        fprintf(file, "\n");
3297        fprintf(file, "static const unsigned int utf8agetab[] = {\n");
3298        for (i = 0; i != ages_count; i++)
3299                fprintf(file, "\t%#x%s\n", ages[i],
3300                        ages[i] == unicode_maxage ? "" : ",");
3301        fprintf(file, "};\n");
3302        fprintf(file, "\n");
3303        fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n");
3304        t = 0;
3305        for (gen = 0; gen < ages_count; gen++) {
3306                fprintf(file, "\t{ %#x, %d }%s\n",
3307                        ages[gen], trees[t].index,
3308                        ages[gen] == unicode_maxage ? "" : ",");
3309                if (trees[t].maxage == ages[gen])
3310                        t += 2;
3311        }
3312        fprintf(file, "};\n");
3313        fprintf(file, "\n");
3314        fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n");
3315        t = 1;
3316        for (gen = 0; gen < ages_count; gen++) {
3317                fprintf(file, "\t{ %#x, %d }%s\n",
3318                        ages[gen], trees[t].index,
3319                        ages[gen] == unicode_maxage ? "" : ",");
3320                if (trees[t].maxage == ages[gen])
3321                        t += 2;
3322        }
3323        fprintf(file, "};\n");
3324        fprintf(file, "\n");
3325        fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
3326                utf8data_size);
3327        t = 0;
3328        for (i = 0; i != utf8data_size; i += 16) {
3329                if (i == trees[t].index) {
3330                        fprintf(file, "\t/* %s_%x */\n",
3331                                trees[t].type, trees[t].maxage);
3332                        if (t < trees_count-1)
3333                                t++;
3334                }
3335                fprintf(file, "\t");
3336                for (j = i; j != i + 16; j++)
3337                        fprintf(file, "0x%.2x%s", utf8data[j],
3338                                (j < utf8data_size -1 ? "," : ""));
3339                fprintf(file, "\n");
3340        }
3341        fprintf(file, "};\n");
3342        fclose(file);
3343}
3344
3345/* ------------------------------------------------------------------ */
3346
3347int main(int argc, char *argv[])
3348{
3349        unsigned int unichar;
3350        int opt;
3351
3352        argv0 = argv[0];
3353
3354        while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
3355                switch (opt) {
3356                case 'a':
3357                        age_name = optarg;
3358                        break;
3359                case 'c':
3360                        ccc_name = optarg;
3361                        break;
3362                case 'd':
3363                        data_name = optarg;
3364                        break;
3365                case 'f':
3366                        fold_name = optarg;
3367                        break;
3368                case 'n':
3369                        norm_name = optarg;
3370                        break;
3371                case 'o':
3372                        utf8_name = optarg;
3373                        break;
3374                case 'p':
3375                        prop_name = optarg;
3376                        break;
3377                case 't':
3378                        test_name = optarg;
3379                        break;
3380                case 'v':
3381                        verbose++;
3382                        break;
3383                case 'h':
3384                        help();
3385                        exit(0);
3386                default:
3387                        usage();
3388                }
3389        }
3390
3391        if (verbose > 1)
3392                help();
3393        for (unichar = 0; unichar != 0x110000; unichar++)
3394                unicode_data[unichar].code = unichar;
3395        age_init();
3396        ccc_init();
3397        nfdi_init();
3398        nfdicf_init();
3399        ignore_init();
3400        corrections_init();
3401        hangul_decompose();
3402        nfdi_decompose();
3403        nfdicf_decompose();
3404        utf8_init();
3405        trees_init();
3406        trees_populate();
3407        trees_reduce();
3408        trees_verify();
3409        /* Prevent "unused function" warning. */
3410        (void)lookup(nfdi_tree, " ");
3411        if (verbose > 2)
3412                tree_walk(nfdi_tree);
3413        if (verbose > 2)
3414                tree_walk(nfdicf_tree);
3415        normalization_test();
3416        write_file();
3417
3418        return 0;
3419}
3420