linux/fs/unicode/utf8-norm.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright (c) 2014 SGI.
   4 * All rights reserved.
   5 */
   6
   7#include "utf8n.h"
   8
   9struct utf8data {
  10        unsigned int maxage;
  11        unsigned int offset;
  12};
  13
  14#define __INCLUDED_FROM_UTF8NORM_C__
  15#include "utf8data.h"
  16#undef __INCLUDED_FROM_UTF8NORM_C__
  17
  18int utf8version_is_supported(u8 maj, u8 min, u8 rev)
  19{
  20        int i = ARRAY_SIZE(utf8agetab) - 1;
  21        unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev);
  22
  23        while (i >= 0 && utf8agetab[i] != 0) {
  24                if (sb_utf8version == utf8agetab[i])
  25                        return 1;
  26                i--;
  27        }
  28        return 0;
  29}
  30EXPORT_SYMBOL(utf8version_is_supported);
  31
  32int utf8version_latest(void)
  33{
  34        return utf8vers;
  35}
  36EXPORT_SYMBOL(utf8version_latest);
  37
  38/*
  39 * UTF-8 valid ranges.
  40 *
  41 * The UTF-8 encoding spreads the bits of a 32bit word over several
  42 * bytes. This table gives the ranges that can be held and how they'd
  43 * be represented.
  44 *
  45 * 0x00000000 0x0000007F: 0xxxxxxx
  46 * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
  47 * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
  48 * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  49 * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  50 * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  51 *
  52 * There is an additional requirement on UTF-8, in that only the
  53 * shortest representation of a 32bit value is to be used.  A decoder
  54 * must not decode sequences that do not satisfy this requirement.
  55 * Thus the allowed ranges have a lower bound.
  56 *
  57 * 0x00000000 0x0000007F: 0xxxxxxx
  58 * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
  59 * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
  60 * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  61 * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  62 * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  63 *
  64 * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
  65 * 17 planes of 65536 values.  This limits the sequences actually seen
  66 * even more, to just the following.
  67 *
  68 *          0 -     0x7F: 0                   - 0x7F
  69 *       0x80 -    0x7FF: 0xC2 0x80           - 0xDF 0xBF
  70 *      0x800 -   0xFFFF: 0xE0 0xA0 0x80      - 0xEF 0xBF 0xBF
  71 *    0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
  72 *
  73 * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
  74 *
  75 * Note that the longest sequence seen with valid usage is 4 bytes,
  76 * the same a single UTF-32 character.  This makes the UTF-8
  77 * representation of Unicode strictly smaller than UTF-32.
  78 *
  79 * The shortest sequence requirement was introduced by:
  80 *    Corrigendum #1: UTF-8 Shortest Form
  81 * It can be found here:
  82 *    http://www.unicode.org/versions/corrigendum1.html
  83 *
  84 */
  85
  86/*
  87 * Return the number of bytes used by the current UTF-8 sequence.
  88 * Assumes the input points to the first byte of a valid UTF-8
  89 * sequence.
  90 */
  91static inline int utf8clen(const char *s)
  92{
  93        unsigned char c = *s;
  94
  95        return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
  96}
  97
  98/*
  99 * Decode a 3-byte UTF-8 sequence.
 100 */
 101static unsigned int
 102utf8decode3(const char *str)
 103{
 104        unsigned int            uc;
 105
 106        uc = *str++ & 0x0F;
 107        uc <<= 6;
 108        uc |= *str++ & 0x3F;
 109        uc <<= 6;
 110        uc |= *str++ & 0x3F;
 111
 112        return uc;
 113}
 114
 115/*
 116 * Encode a 3-byte UTF-8 sequence.
 117 */
 118static int
 119utf8encode3(char *str, unsigned int val)
 120{
 121        str[2] = (val & 0x3F) | 0x80;
 122        val >>= 6;
 123        str[1] = (val & 0x3F) | 0x80;
 124        val >>= 6;
 125        str[0] = val | 0xE0;
 126
 127        return 3;
 128}
 129
 130/*
 131 * utf8trie_t
 132 *
 133 * A compact binary tree, used to decode UTF-8 characters.
 134 *
 135 * Internal nodes are one byte for the node itself, and up to three
 136 * bytes for an offset into the tree.  The first byte contains the
 137 * following information:
 138 *  NEXTBYTE  - flag        - advance to next byte if set
 139 *  BITNUM    - 3 bit field - the bit number to tested
 140 *  OFFLEN    - 2 bit field - number of bytes in the offset
 141 * if offlen == 0 (non-branching node)
 142 *  RIGHTPATH - 1 bit field - set if the following node is for the
 143 *                            right-hand path (tested bit is set)
 144 *  TRIENODE  - 1 bit field - set if the following node is an internal
 145 *                            node, otherwise it is a leaf node
 146 * if offlen != 0 (branching node)
 147 *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
 148 *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
 149 *
 150 * Due to the way utf8 works, there cannot be branching nodes with
 151 * NEXTBYTE set, and moreover those nodes always have a righthand
 152 * descendant.
 153 */
 154typedef const unsigned char utf8trie_t;
 155#define BITNUM          0x07
 156#define NEXTBYTE        0x08
 157#define OFFLEN          0x30
 158#define OFFLEN_SHIFT    4
 159#define RIGHTPATH       0x40
 160#define TRIENODE        0x80
 161#define RIGHTNODE       0x40
 162#define LEFTNODE        0x80
 163
 164/*
 165 * utf8leaf_t
 166 *
 167 * The leaves of the trie are embedded in the trie, and so the same
 168 * underlying datatype: unsigned char.
 169 *
 170 * leaf[0]: The unicode version, stored as a generation number that is
 171 *          an index into utf8agetab[].  With this we can filter code
 172 *          points based on the unicode version in which they were
 173 *          defined.  The CCC of a non-defined code point is 0.
 174 * leaf[1]: Canonical Combining Class. During normalization, we need
 175 *          to do a stable sort into ascending order of all characters
 176 *          with a non-zero CCC that occur between two characters with
 177 *          a CCC of 0, or at the begin or end of a string.
 178 *          The unicode standard guarantees that all CCC values are
 179 *          between 0 and 254 inclusive, which leaves 255 available as
 180 *          a special value.
 181 *          Code points with CCC 0 are known as stoppers.
 182 * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
 183 *          start of a NUL-terminated string that is the decomposition
 184 *          of the character.
 185 *          The CCC of a decomposable character is the same as the CCC
 186 *          of the first character of its decomposition.
 187 *          Some characters decompose as the empty string: these are
 188 *          characters with the Default_Ignorable_Code_Point property.
 189 *          These do affect normalization, as they all have CCC 0.
 190 *
 191 * The decompositions in the trie have been fully expanded, with the
 192 * exception of Hangul syllables, which are decomposed algorithmically.
 193 *
 194 * Casefolding, if applicable, is also done using decompositions.
 195 *
 196 * The trie is constructed in such a way that leaves exist for all
 197 * UTF-8 sequences that match the criteria from the "UTF-8 valid
 198 * ranges" comment above, and only for those sequences.  Therefore a
 199 * lookup in the trie can be used to validate the UTF-8 input.
 200 */
 201typedef const unsigned char utf8leaf_t;
 202
 203#define LEAF_GEN(LEAF)  ((LEAF)[0])
 204#define LEAF_CCC(LEAF)  ((LEAF)[1])
 205#define LEAF_STR(LEAF)  ((const char *)((LEAF) + 2))
 206
 207#define MINCCC          (0)
 208#define MAXCCC          (254)
 209#define STOPPER         (0)
 210#define DECOMPOSE       (255)
 211
 212/* Marker for hangul syllable decomposition. */
 213#define HANGUL          ((char)(255))
 214/* Size of the synthesized leaf used for Hangul syllable decomposition. */
 215#define UTF8HANGULLEAF  (12)
 216
 217/*
 218 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
 219 *
 220 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
 221 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
 222 *
 223 * SBase = 0xAC00
 224 * LBase = 0x1100
 225 * VBase = 0x1161
 226 * TBase = 0x11A7
 227 * LCount = 19
 228 * VCount = 21
 229 * TCount = 28
 230 * NCount = 588 (VCount * TCount)
 231 * SCount = 11172 (LCount * NCount)
 232 *
 233 * Decomposition:
 234 *   SIndex = s - SBase
 235 *
 236 * LV (Canonical/Full)
 237 *   LIndex = SIndex / NCount
 238 *   VIndex = (Sindex % NCount) / TCount
 239 *   LPart = LBase + LIndex
 240 *   VPart = VBase + VIndex
 241 *
 242 * LVT (Canonical)
 243 *   LVIndex = (SIndex / TCount) * TCount
 244 *   TIndex = (Sindex % TCount)
 245 *   LVPart = SBase + LVIndex
 246 *   TPart = TBase + TIndex
 247 *
 248 * LVT (Full)
 249 *   LIndex = SIndex / NCount
 250 *   VIndex = (Sindex % NCount) / TCount
 251 *   TIndex = (Sindex % TCount)
 252 *   LPart = LBase + LIndex
 253 *   VPart = VBase + VIndex
 254 *   if (TIndex == 0) {
 255 *          d = <LPart, VPart>
 256 *   } else {
 257 *          TPart = TBase + TIndex
 258 *          d = <LPart, TPart, VPart>
 259 *   }
 260 */
 261
 262/* Constants */
 263#define SB      (0xAC00)
 264#define LB      (0x1100)
 265#define VB      (0x1161)
 266#define TB      (0x11A7)
 267#define LC      (19)
 268#define VC      (21)
 269#define TC      (28)
 270#define NC      (VC * TC)
 271#define SC      (LC * NC)
 272
 273/* Algorithmic decomposition of hangul syllable. */
 274static utf8leaf_t *
 275utf8hangul(const char *str, unsigned char *hangul)
 276{
 277        unsigned int    si;
 278        unsigned int    li;
 279        unsigned int    vi;
 280        unsigned int    ti;
 281        unsigned char   *h;
 282
 283        /* Calculate the SI, LI, VI, and TI values. */
 284        si = utf8decode3(str) - SB;
 285        li = si / NC;
 286        vi = (si % NC) / TC;
 287        ti = si % TC;
 288
 289        /* Fill in base of leaf. */
 290        h = hangul;
 291        LEAF_GEN(h) = 2;
 292        LEAF_CCC(h) = DECOMPOSE;
 293        h += 2;
 294
 295        /* Add LPart, a 3-byte UTF-8 sequence. */
 296        h += utf8encode3((char *)h, li + LB);
 297
 298        /* Add VPart, a 3-byte UTF-8 sequence. */
 299        h += utf8encode3((char *)h, vi + VB);
 300
 301        /* Add TPart if required, also a 3-byte UTF-8 sequence. */
 302        if (ti)
 303                h += utf8encode3((char *)h, ti + TB);
 304
 305        /* Terminate string. */
 306        h[0] = '\0';
 307
 308        return hangul;
 309}
 310
 311/*
 312 * Use trie to scan s, touching at most len bytes.
 313 * Returns the leaf if one exists, NULL otherwise.
 314 *
 315 * A non-NULL return guarantees that the UTF-8 sequence starting at s
 316 * is well-formed and corresponds to a known unicode code point.  The
 317 * shorthand for this will be "is valid UTF-8 unicode".
 318 */
 319static utf8leaf_t *utf8nlookup(const struct utf8data *data,
 320                               unsigned char *hangul, const char *s, size_t len)
 321{
 322        utf8trie_t      *trie = NULL;
 323        int             offlen;
 324        int             offset;
 325        int             mask;
 326        int             node;
 327
 328        if (!data)
 329                return NULL;
 330        if (len == 0)
 331                return NULL;
 332
 333        trie = utf8data + data->offset;
 334        node = 1;
 335        while (node) {
 336                offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
 337                if (*trie & NEXTBYTE) {
 338                        if (--len == 0)
 339                                return NULL;
 340                        s++;
 341                }
 342                mask = 1 << (*trie & BITNUM);
 343                if (*s & mask) {
 344                        /* Right leg */
 345                        if (offlen) {
 346                                /* Right node at offset of trie */
 347                                node = (*trie & RIGHTNODE);
 348                                offset = trie[offlen];
 349                                while (--offlen) {
 350                                        offset <<= 8;
 351                                        offset |= trie[offlen];
 352                                }
 353                                trie += offset;
 354                        } else if (*trie & RIGHTPATH) {
 355                                /* Right node after this node */
 356                                node = (*trie & TRIENODE);
 357                                trie++;
 358                        } else {
 359                                /* No right node. */
 360                                return NULL;
 361                        }
 362                } else {
 363                        /* Left leg */
 364                        if (offlen) {
 365                                /* Left node after this node. */
 366                                node = (*trie & LEFTNODE);
 367                                trie += offlen + 1;
 368                        } else if (*trie & RIGHTPATH) {
 369                                /* No left node. */
 370                                return NULL;
 371                        } else {
 372                                /* Left node after this node */
 373                                node = (*trie & TRIENODE);
 374                                trie++;
 375                        }
 376                }
 377        }
 378        /*
 379         * Hangul decomposition is done algorithmically. These are the
 380         * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
 381         * always 3 bytes long, so s has been advanced twice, and the
 382         * start of the sequence is at s-2.
 383         */
 384        if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
 385                trie = utf8hangul(s - 2, hangul);
 386        return trie;
 387}
 388
 389/*
 390 * Use trie to scan s.
 391 * Returns the leaf if one exists, NULL otherwise.
 392 *
 393 * Forwards to utf8nlookup().
 394 */
 395static utf8leaf_t *utf8lookup(const struct utf8data *data,
 396                              unsigned char *hangul, const char *s)
 397{
 398        return utf8nlookup(data, hangul, s, (size_t)-1);
 399}
 400
 401/*
 402 * Maximum age of any character in s.
 403 * Return -1 if s is not valid UTF-8 unicode.
 404 * Return 0 if only non-assigned code points are used.
 405 */
 406int utf8agemax(const struct utf8data *data, const char *s)
 407{
 408        utf8leaf_t      *leaf;
 409        int             age = 0;
 410        int             leaf_age;
 411        unsigned char   hangul[UTF8HANGULLEAF];
 412
 413        if (!data)
 414                return -1;
 415
 416        while (*s) {
 417                leaf = utf8lookup(data, hangul, s);
 418                if (!leaf)
 419                        return -1;
 420
 421                leaf_age = utf8agetab[LEAF_GEN(leaf)];
 422                if (leaf_age <= data->maxage && leaf_age > age)
 423                        age = leaf_age;
 424                s += utf8clen(s);
 425        }
 426        return age;
 427}
 428EXPORT_SYMBOL(utf8agemax);
 429
 430/*
 431 * Minimum age of any character in s.
 432 * Return -1 if s is not valid UTF-8 unicode.
 433 * Return 0 if non-assigned code points are used.
 434 */
 435int utf8agemin(const struct utf8data *data, const char *s)
 436{
 437        utf8leaf_t      *leaf;
 438        int             age;
 439        int             leaf_age;
 440        unsigned char   hangul[UTF8HANGULLEAF];
 441
 442        if (!data)
 443                return -1;
 444        age = data->maxage;
 445        while (*s) {
 446                leaf = utf8lookup(data, hangul, s);
 447                if (!leaf)
 448                        return -1;
 449                leaf_age = utf8agetab[LEAF_GEN(leaf)];
 450                if (leaf_age <= data->maxage && leaf_age < age)
 451                        age = leaf_age;
 452                s += utf8clen(s);
 453        }
 454        return age;
 455}
 456EXPORT_SYMBOL(utf8agemin);
 457
 458/*
 459 * Maximum age of any character in s, touch at most len bytes.
 460 * Return -1 if s is not valid UTF-8 unicode.
 461 */
 462int utf8nagemax(const struct utf8data *data, const char *s, size_t len)
 463{
 464        utf8leaf_t      *leaf;
 465        int             age = 0;
 466        int             leaf_age;
 467        unsigned char   hangul[UTF8HANGULLEAF];
 468
 469        if (!data)
 470                return -1;
 471
 472        while (len && *s) {
 473                leaf = utf8nlookup(data, hangul, s, len);
 474                if (!leaf)
 475                        return -1;
 476                leaf_age = utf8agetab[LEAF_GEN(leaf)];
 477                if (leaf_age <= data->maxage && leaf_age > age)
 478                        age = leaf_age;
 479                len -= utf8clen(s);
 480                s += utf8clen(s);
 481        }
 482        return age;
 483}
 484EXPORT_SYMBOL(utf8nagemax);
 485
 486/*
 487 * Maximum age of any character in s, touch at most len bytes.
 488 * Return -1 if s is not valid UTF-8 unicode.
 489 */
 490int utf8nagemin(const struct utf8data *data, const char *s, size_t len)
 491{
 492        utf8leaf_t      *leaf;
 493        int             leaf_age;
 494        int             age;
 495        unsigned char   hangul[UTF8HANGULLEAF];
 496
 497        if (!data)
 498                return -1;
 499        age = data->maxage;
 500        while (len && *s) {
 501                leaf = utf8nlookup(data, hangul, s, len);
 502                if (!leaf)
 503                        return -1;
 504                leaf_age = utf8agetab[LEAF_GEN(leaf)];
 505                if (leaf_age <= data->maxage && leaf_age < age)
 506                        age = leaf_age;
 507                len -= utf8clen(s);
 508                s += utf8clen(s);
 509        }
 510        return age;
 511}
 512EXPORT_SYMBOL(utf8nagemin);
 513
 514/*
 515 * Length of the normalization of s.
 516 * Return -1 if s is not valid UTF-8 unicode.
 517 *
 518 * A string of Default_Ignorable_Code_Point has length 0.
 519 */
 520ssize_t utf8len(const struct utf8data *data, const char *s)
 521{
 522        utf8leaf_t      *leaf;
 523        size_t          ret = 0;
 524        unsigned char   hangul[UTF8HANGULLEAF];
 525
 526        if (!data)
 527                return -1;
 528        while (*s) {
 529                leaf = utf8lookup(data, hangul, s);
 530                if (!leaf)
 531                        return -1;
 532                if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
 533                        ret += utf8clen(s);
 534                else if (LEAF_CCC(leaf) == DECOMPOSE)
 535                        ret += strlen(LEAF_STR(leaf));
 536                else
 537                        ret += utf8clen(s);
 538                s += utf8clen(s);
 539        }
 540        return ret;
 541}
 542EXPORT_SYMBOL(utf8len);
 543
 544/*
 545 * Length of the normalization of s, touch at most len bytes.
 546 * Return -1 if s is not valid UTF-8 unicode.
 547 */
 548ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len)
 549{
 550        utf8leaf_t      *leaf;
 551        size_t          ret = 0;
 552        unsigned char   hangul[UTF8HANGULLEAF];
 553
 554        if (!data)
 555                return -1;
 556        while (len && *s) {
 557                leaf = utf8nlookup(data, hangul, s, len);
 558                if (!leaf)
 559                        return -1;
 560                if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
 561                        ret += utf8clen(s);
 562                else if (LEAF_CCC(leaf) == DECOMPOSE)
 563                        ret += strlen(LEAF_STR(leaf));
 564                else
 565                        ret += utf8clen(s);
 566                len -= utf8clen(s);
 567                s += utf8clen(s);
 568        }
 569        return ret;
 570}
 571EXPORT_SYMBOL(utf8nlen);
 572
 573/*
 574 * Set up an utf8cursor for use by utf8byte().
 575 *
 576 *   u8c    : pointer to cursor.
 577 *   data   : const struct utf8data to use for normalization.
 578 *   s      : string.
 579 *   len    : length of s.
 580 *
 581 * Returns -1 on error, 0 on success.
 582 */
 583int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
 584                const char *s, size_t len)
 585{
 586        if (!data)
 587                return -1;
 588        if (!s)
 589                return -1;
 590        u8c->data = data;
 591        u8c->s = s;
 592        u8c->p = NULL;
 593        u8c->ss = NULL;
 594        u8c->sp = NULL;
 595        u8c->len = len;
 596        u8c->slen = 0;
 597        u8c->ccc = STOPPER;
 598        u8c->nccc = STOPPER;
 599        /* Check we didn't clobber the maximum length. */
 600        if (u8c->len != len)
 601                return -1;
 602        /* The first byte of s may not be an utf8 continuation. */
 603        if (len > 0 && (*s & 0xC0) == 0x80)
 604                return -1;
 605        return 0;
 606}
 607EXPORT_SYMBOL(utf8ncursor);
 608
 609/*
 610 * Set up an utf8cursor for use by utf8byte().
 611 *
 612 *   u8c    : pointer to cursor.
 613 *   data   : const struct utf8data to use for normalization.
 614 *   s      : NUL-terminated string.
 615 *
 616 * Returns -1 on error, 0 on success.
 617 */
 618int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
 619               const char *s)
 620{
 621        return utf8ncursor(u8c, data, s, (unsigned int)-1);
 622}
 623EXPORT_SYMBOL(utf8cursor);
 624
 625/*
 626 * Get one byte from the normalized form of the string described by u8c.
 627 *
 628 * Returns the byte cast to an unsigned char on succes, and -1 on failure.
 629 *
 630 * The cursor keeps track of the location in the string in u8c->s.
 631 * When a character is decomposed, the current location is stored in
 632 * u8c->p, and u8c->s is set to the start of the decomposition. Note
 633 * that bytes from a decomposition do not count against u8c->len.
 634 *
 635 * Characters are emitted if they match the current CCC in u8c->ccc.
 636 * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
 637 * and the function returns 0 in that case.
 638 *
 639 * Sorting by CCC is done by repeatedly scanning the string.  The
 640 * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
 641 * the start of the scan.  The first pass finds the lowest CCC to be
 642 * emitted and stores it in u8c->nccc, the second pass emits the
 643 * characters with this CCC and finds the next lowest CCC. This limits
 644 * the number of passes to 1 + the number of different CCCs in the
 645 * sequence being scanned.
 646 *
 647 * Therefore:
 648 *  u8c->p  != NULL -> a decomposition is being scanned.
 649 *  u8c->ss != NULL -> this is a repeating scan.
 650 *  u8c->ccc == -1   -> this is the first scan of a repeating scan.
 651 */
 652int utf8byte(struct utf8cursor *u8c)
 653{
 654        utf8leaf_t *leaf;
 655        int ccc;
 656
 657        for (;;) {
 658                /* Check for the end of a decomposed character. */
 659                if (u8c->p && *u8c->s == '\0') {
 660                        u8c->s = u8c->p;
 661                        u8c->p = NULL;
 662                }
 663
 664                /* Check for end-of-string. */
 665                if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
 666                        /* There is no next byte. */
 667                        if (u8c->ccc == STOPPER)
 668                                return 0;
 669                        /* End-of-string during a scan counts as a stopper. */
 670                        ccc = STOPPER;
 671                        goto ccc_mismatch;
 672                } else if ((*u8c->s & 0xC0) == 0x80) {
 673                        /* This is a continuation of the current character. */
 674                        if (!u8c->p)
 675                                u8c->len--;
 676                        return (unsigned char)*u8c->s++;
 677                }
 678
 679                /* Look up the data for the current character. */
 680                if (u8c->p) {
 681                        leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
 682                } else {
 683                        leaf = utf8nlookup(u8c->data, u8c->hangul,
 684                                           u8c->s, u8c->len);
 685                }
 686
 687                /* No leaf found implies that the input is a binary blob. */
 688                if (!leaf)
 689                        return -1;
 690
 691                ccc = LEAF_CCC(leaf);
 692                /* Characters that are too new have CCC 0. */
 693                if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {
 694                        ccc = STOPPER;
 695                } else if (ccc == DECOMPOSE) {
 696                        u8c->len -= utf8clen(u8c->s);
 697                        u8c->p = u8c->s + utf8clen(u8c->s);
 698                        u8c->s = LEAF_STR(leaf);
 699                        /* Empty decomposition implies CCC 0. */
 700                        if (*u8c->s == '\0') {
 701                                if (u8c->ccc == STOPPER)
 702                                        continue;
 703                                ccc = STOPPER;
 704                                goto ccc_mismatch;
 705                        }
 706
 707                        leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
 708                        if (!leaf)
 709                                return -1;
 710                        ccc = LEAF_CCC(leaf);
 711                }
 712
 713                /*
 714                 * If this is not a stopper, then see if it updates
 715                 * the next canonical class to be emitted.
 716                 */
 717                if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
 718                        u8c->nccc = ccc;
 719
 720                /*
 721                 * Return the current byte if this is the current
 722                 * combining class.
 723                 */
 724                if (ccc == u8c->ccc) {
 725                        if (!u8c->p)
 726                                u8c->len--;
 727                        return (unsigned char)*u8c->s++;
 728                }
 729
 730                /* Current combining class mismatch. */
 731ccc_mismatch:
 732                if (u8c->nccc == STOPPER) {
 733                        /*
 734                         * Scan forward for the first canonical class
 735                         * to be emitted.  Save the position from
 736                         * which to restart.
 737                         */
 738                        u8c->ccc = MINCCC - 1;
 739                        u8c->nccc = ccc;
 740                        u8c->sp = u8c->p;
 741                        u8c->ss = u8c->s;
 742                        u8c->slen = u8c->len;
 743                        if (!u8c->p)
 744                                u8c->len -= utf8clen(u8c->s);
 745                        u8c->s += utf8clen(u8c->s);
 746                } else if (ccc != STOPPER) {
 747                        /* Not a stopper, and not the ccc we're emitting. */
 748                        if (!u8c->p)
 749                                u8c->len -= utf8clen(u8c->s);
 750                        u8c->s += utf8clen(u8c->s);
 751                } else if (u8c->nccc != MAXCCC + 1) {
 752                        /* At a stopper, restart for next ccc. */
 753                        u8c->ccc = u8c->nccc;
 754                        u8c->nccc = MAXCCC + 1;
 755                        u8c->s = u8c->ss;
 756                        u8c->p = u8c->sp;
 757                        u8c->len = u8c->slen;
 758                } else {
 759                        /* All done, proceed from here. */
 760                        u8c->ccc = STOPPER;
 761                        u8c->nccc = STOPPER;
 762                        u8c->sp = NULL;
 763                        u8c->ss = NULL;
 764                        u8c->slen = 0;
 765                }
 766        }
 767}
 768EXPORT_SYMBOL(utf8byte);
 769
 770const struct utf8data *utf8nfdi(unsigned int maxage)
 771{
 772        int i = ARRAY_SIZE(utf8nfdidata) - 1;
 773
 774        while (maxage < utf8nfdidata[i].maxage)
 775                i--;
 776        if (maxage > utf8nfdidata[i].maxage)
 777                return NULL;
 778        return &utf8nfdidata[i];
 779}
 780EXPORT_SYMBOL(utf8nfdi);
 781
 782const struct utf8data *utf8nfdicf(unsigned int maxage)
 783{
 784        int i = ARRAY_SIZE(utf8nfdicfdata) - 1;
 785
 786        while (maxage < utf8nfdicfdata[i].maxage)
 787                i--;
 788        if (maxage > utf8nfdicfdata[i].maxage)
 789                return NULL;
 790        return &utf8nfdicfdata[i];
 791}
 792EXPORT_SYMBOL(utf8nfdicf);
 793