1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21#include <sys/types.h>
22#include <stddef.h>
23#include <stdlib.h>
24#include <stdio.h>
25#include <assert.h>
26#include <string.h>
27#include <unistd.h>
28#include <errno.h>
29
30
31
32#define AGE_NAME "DerivedAge.txt"
33#define CCC_NAME "DerivedCombiningClass.txt"
34#define PROP_NAME "DerivedCoreProperties.txt"
35#define DATA_NAME "UnicodeData.txt"
36#define FOLD_NAME "CaseFolding.txt"
37#define NORM_NAME "NormalizationCorrections.txt"
38#define TEST_NAME "NormalizationTest.txt"
39#define UTF8_NAME "utf8data.h"
40
41const char *age_name = AGE_NAME;
42const char *ccc_name = CCC_NAME;
43const char *prop_name = PROP_NAME;
44const char *data_name = DATA_NAME;
45const char *fold_name = FOLD_NAME;
46const char *norm_name = NORM_NAME;
47const char *test_name = TEST_NAME;
48const char *utf8_name = UTF8_NAME;
49
50int verbose = 0;
51
52
53
54#define LINESIZE 1024
55char line[LINESIZE];
56char buf0[LINESIZE];
57char buf1[LINESIZE];
58char buf2[LINESIZE];
59char buf3[LINESIZE];
60
61const char *argv0;
62
63#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
64
65
66
67
68
69
70
71
72
73
74
75
76
77#define UNICODE_MAJ_SHIFT (16)
78#define UNICODE_MIN_SHIFT (8)
79
80#define UNICODE_MAJ_MAX ((unsigned short)-1)
81#define UNICODE_MIN_MAX ((unsigned char)-1)
82#define UNICODE_REV_MAX ((unsigned char)-1)
83
84#define UNICODE_AGE(MAJ,MIN,REV) \
85 (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \
86 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \
87 ((unsigned int)(REV)))
88
89unsigned int *ages;
90int ages_count;
91
92unsigned int unicode_maxage;
93
94static int age_valid(unsigned int major, unsigned int minor,
95 unsigned int revision)
96{
97 if (major > UNICODE_MAJ_MAX)
98 return 0;
99 if (minor > UNICODE_MIN_MAX)
100 return 0;
101 if (revision > UNICODE_REV_MAX)
102 return 0;
103 return 1;
104}
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132typedef unsigned char utf8trie_t;
133#define BITNUM 0x07
134#define NEXTBYTE 0x08
135#define OFFLEN 0x30
136#define OFFLEN_SHIFT 4
137#define RIGHTPATH 0x40
138#define TRIENODE 0x80
139#define RIGHTNODE 0x40
140#define LEFTNODE 0x80
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173typedef unsigned char utf8leaf_t;
174
175#define LEAF_GEN(LEAF) ((LEAF)[0])
176#define LEAF_CCC(LEAF) ((LEAF)[1])
177#define LEAF_STR(LEAF) ((const char*)((LEAF) + 2))
178
179#define MAXGEN (255)
180
181#define MINCCC (0)
182#define MAXCCC (254)
183#define STOPPER (0)
184#define DECOMPOSE (255)
185#define HANGUL ((char)(255))
186
187#define UTF8HANGULLEAF (12)
188
189struct tree;
190static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
191 const char *, size_t);
192static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
193
194unsigned char *utf8data;
195size_t utf8data_size;
196
197utf8trie_t *nfdi;
198utf8trie_t *nfdicf;
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251#define UTF8_2_BITS 0xC0
252#define UTF8_3_BITS 0xE0
253#define UTF8_4_BITS 0xF0
254#define UTF8_N_BITS 0x80
255#define UTF8_2_MASK 0xE0
256#define UTF8_3_MASK 0xF0
257#define UTF8_4_MASK 0xF8
258#define UTF8_N_MASK 0xC0
259#define UTF8_V_MASK 0x3F
260#define UTF8_V_SHIFT 6
261
262static int utf8encode(char *str, unsigned int val)
263{
264 int len;
265
266 if (val < 0x80) {
267 str[0] = val;
268 len = 1;
269 } else if (val < 0x800) {
270 str[1] = val & UTF8_V_MASK;
271 str[1] |= UTF8_N_BITS;
272 val >>= UTF8_V_SHIFT;
273 str[0] = val;
274 str[0] |= UTF8_2_BITS;
275 len = 2;
276 } else if (val < 0x10000) {
277 str[2] = val & UTF8_V_MASK;
278 str[2] |= UTF8_N_BITS;
279 val >>= UTF8_V_SHIFT;
280 str[1] = val & UTF8_V_MASK;
281 str[1] |= UTF8_N_BITS;
282 val >>= UTF8_V_SHIFT;
283 str[0] = val;
284 str[0] |= UTF8_3_BITS;
285 len = 3;
286 } else if (val < 0x110000) {
287 str[3] = val & UTF8_V_MASK;
288 str[3] |= UTF8_N_BITS;
289 val >>= UTF8_V_SHIFT;
290 str[2] = val & UTF8_V_MASK;
291 str[2] |= UTF8_N_BITS;
292 val >>= UTF8_V_SHIFT;
293 str[1] = val & UTF8_V_MASK;
294 str[1] |= UTF8_N_BITS;
295 val >>= UTF8_V_SHIFT;
296 str[0] = val;
297 str[0] |= UTF8_4_BITS;
298 len = 4;
299 } else {
300 printf("%#x: illegal val\n", val);
301 len = 0;
302 }
303 return len;
304}
305
306static unsigned int utf8decode(const char *str)
307{
308 const unsigned char *s = (const unsigned char*)str;
309 unsigned int unichar = 0;
310
311 if (*s < 0x80) {
312 unichar = *s;
313 } else if (*s < UTF8_3_BITS) {
314 unichar = *s++ & 0x1F;
315 unichar <<= UTF8_V_SHIFT;
316 unichar |= *s & 0x3F;
317 } else if (*s < UTF8_4_BITS) {
318 unichar = *s++ & 0x0F;
319 unichar <<= UTF8_V_SHIFT;
320 unichar |= *s++ & 0x3F;
321 unichar <<= UTF8_V_SHIFT;
322 unichar |= *s & 0x3F;
323 } else {
324 unichar = *s++ & 0x0F;
325 unichar <<= UTF8_V_SHIFT;
326 unichar |= *s++ & 0x3F;
327 unichar <<= UTF8_V_SHIFT;
328 unichar |= *s++ & 0x3F;
329 unichar <<= UTF8_V_SHIFT;
330 unichar |= *s & 0x3F;
331 }
332 return unichar;
333}
334
335static int utf32valid(unsigned int unichar)
336{
337 return unichar < 0x110000;
338}
339
340#define HANGUL_SYLLABLE(U) ((U) >= 0xAC00 && (U) <= 0xD7A3)
341
342#define NODE 1
343#define LEAF 0
344
345struct tree {
346 void *root;
347 int childnode;
348 const char *type;
349 unsigned int maxage;
350 struct tree *next;
351 int (*leaf_equal)(void *, void *);
352 void (*leaf_print)(void *, int);
353 int (*leaf_mark)(void *);
354 int (*leaf_size)(void *);
355 int *(*leaf_index)(struct tree *, void *);
356 unsigned char *(*leaf_emit)(void *, unsigned char *);
357 int leafindex[0x110000];
358 int index;
359};
360
361struct node {
362 int index;
363 int offset;
364 int mark;
365 int size;
366 struct node *parent;
367 void *left;
368 void *right;
369 unsigned char bitnum;
370 unsigned char nextbyte;
371 unsigned char leftnode;
372 unsigned char rightnode;
373 unsigned int keybits;
374 unsigned int keymask;
375};
376
377
378
379
380static void *lookup(struct tree *tree, const char *key)
381{
382 struct node *node;
383 void *leaf = NULL;
384
385 node = tree->root;
386 while (!leaf && node) {
387 if (node->nextbyte)
388 key++;
389 if (*key & (1 << (node->bitnum & 7))) {
390
391 if (node->rightnode == NODE) {
392 node = node->right;
393 } else if (node->rightnode == LEAF) {
394 leaf = node->right;
395 } else {
396 node = NULL;
397 }
398 } else {
399
400 if (node->leftnode == NODE) {
401 node = node->left;
402 } else if (node->leftnode == LEAF) {
403 leaf = node->left;
404 } else {
405 node = NULL;
406 }
407 }
408 }
409
410 return leaf;
411}
412
413
414
415
416
417static void tree_walk(struct tree *tree)
418{
419 struct node *node;
420 unsigned int leftmask;
421 unsigned int rightmask;
422 unsigned int bitmask;
423 int indent = 1;
424 int nodes, singletons, leaves;
425
426 nodes = singletons = leaves = 0;
427
428 printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
429 if (tree->childnode == LEAF) {
430 assert(tree->root);
431 tree->leaf_print(tree->root, indent);
432 leaves = 1;
433 } else {
434 assert(tree->childnode == NODE);
435 node = tree->root;
436 leftmask = rightmask = 0;
437 while (node) {
438 printf("%*snode @ %p bitnum %d nextbyte %d"
439 " left %p right %p mask %x bits %x\n",
440 indent, "", node,
441 node->bitnum, node->nextbyte,
442 node->left, node->right,
443 node->keymask, node->keybits);
444 nodes += 1;
445 if (!(node->left && node->right))
446 singletons += 1;
447
448 while (node) {
449 bitmask = 1 << node->bitnum;
450 if ((leftmask & bitmask) == 0) {
451 leftmask |= bitmask;
452 if (node->leftnode == LEAF) {
453 assert(node->left);
454 tree->leaf_print(node->left,
455 indent+1);
456 leaves += 1;
457 } else if (node->left) {
458 assert(node->leftnode == NODE);
459 indent += 1;
460 node = node->left;
461 break;
462 }
463 }
464 if ((rightmask & bitmask) == 0) {
465 rightmask |= bitmask;
466 if (node->rightnode == LEAF) {
467 assert(node->right);
468 tree->leaf_print(node->right,
469 indent+1);
470 leaves += 1;
471 } else if (node->right) {
472 assert(node->rightnode == NODE);
473 indent += 1;
474 node = node->right;
475 break;
476 }
477 }
478 leftmask &= ~bitmask;
479 rightmask &= ~bitmask;
480 node = node->parent;
481 indent -= 1;
482 }
483 }
484 }
485 printf("nodes %d leaves %d singletons %d\n",
486 nodes, leaves, singletons);
487}
488
489
490
491
492static struct node *alloc_node(struct node *parent)
493{
494 struct node *node;
495 int bitnum;
496
497 node = malloc(sizeof(*node));
498 node->left = node->right = NULL;
499 node->parent = parent;
500 node->leftnode = NODE;
501 node->rightnode = NODE;
502 node->keybits = 0;
503 node->keymask = 0;
504 node->mark = 0;
505 node->index = 0;
506 node->offset = -1;
507 node->size = 4;
508
509 if (node->parent) {
510 bitnum = parent->bitnum;
511 if ((bitnum & 7) == 0) {
512 node->bitnum = bitnum + 7 + 8;
513 node->nextbyte = 1;
514 } else {
515 node->bitnum = bitnum - 1;
516 node->nextbyte = 0;
517 }
518 } else {
519 node->bitnum = 7;
520 node->nextbyte = 0;
521 }
522
523 return node;
524}
525
526
527
528
529
530
531
532
533static int insert(struct tree *tree, char *key, int keylen, void *leaf)
534{
535 struct node *node;
536 struct node *parent;
537 void **cursor;
538 int keybits;
539
540 assert(keylen >= 1 && keylen <= 4);
541
542 node = NULL;
543 cursor = &tree->root;
544 keybits = 8 * keylen;
545
546
547 while (keybits) {
548 if (!*cursor)
549 *cursor = alloc_node(node);
550 node = *cursor;
551 if (node->nextbyte)
552 key++;
553 if (*key & (1 << (node->bitnum & 7)))
554 cursor = &node->right;
555 else
556 cursor = &node->left;
557 keybits--;
558 }
559 *cursor = leaf;
560
561
562 while (node) {
563 if (*key & (1 << (node->bitnum & 7)))
564 node->rightnode = LEAF;
565 else
566 node->leftnode = LEAF;
567 if (node->nextbyte)
568 break;
569 if (node->leftnode == NODE || node->rightnode == NODE)
570 break;
571 assert(node->left);
572 assert(node->right);
573
574 if (! tree->leaf_equal(node->left, node->right))
575 break;
576
577 leaf = node->left;
578
579 parent = node->parent;
580 if (!parent) {
581
582 tree->root = leaf;
583 tree->childnode = LEAF;
584 } else if (parent->left == node) {
585 parent->left = leaf;
586 parent->leftnode = LEAF;
587 if (parent->right) {
588 parent->keymask = 0;
589 parent->keybits = 0;
590 } else {
591 parent->keymask |= (1 << node->bitnum);
592 }
593 } else if (parent->right == node) {
594 parent->right = leaf;
595 parent->rightnode = LEAF;
596 if (parent->left) {
597 parent->keymask = 0;
598 parent->keybits = 0;
599 } else {
600 parent->keymask |= (1 << node->bitnum);
601 parent->keybits |= (1 << node->bitnum);
602 }
603 } else {
604
605 assert(0);
606 }
607 free(node);
608 node = parent;
609 }
610
611
612 while (node) {
613 parent = node->parent;
614 if (!parent)
615 break;
616
617 if (node->keymask == 0) {
618 parent->keymask = 0;
619 parent->keybits = 0;
620 } else if (parent->left && parent->right) {
621 parent->keymask = 0;
622 parent->keybits = 0;
623 } else {
624 assert((parent->keymask & node->keymask) == 0);
625 parent->keymask |= node->keymask;
626 parent->keymask |= (1 << parent->bitnum);
627 parent->keybits |= node->keybits;
628 if (parent->right)
629 parent->keybits |= (1 << parent->bitnum);
630 }
631 node = parent;
632 }
633
634 return 0;
635}
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654static void prune(struct tree *tree)
655{
656 struct node *node;
657 struct node *left;
658 struct node *right;
659 struct node *parent;
660 void *leftleaf;
661 void *rightleaf;
662 unsigned int leftmask;
663 unsigned int rightmask;
664 unsigned int bitmask;
665 int count;
666
667 if (verbose > 0)
668 printf("Pruning %s_%x\n", tree->type, tree->maxage);
669
670 count = 0;
671 if (tree->childnode == LEAF)
672 return;
673 if (!tree->root)
674 return;
675
676 leftmask = rightmask = 0;
677 node = tree->root;
678 while (node) {
679 if (node->nextbyte)
680 goto advance;
681 if (node->leftnode == LEAF)
682 goto advance;
683 if (node->rightnode == LEAF)
684 goto advance;
685 if (!node->left)
686 goto advance;
687 if (!node->right)
688 goto advance;
689 left = node->left;
690 right = node->right;
691 if (left->keymask == 0)
692 goto advance;
693 if (right->keymask == 0)
694 goto advance;
695 if (left->keymask != right->keymask)
696 goto advance;
697 if (left->keybits != right->keybits)
698 goto advance;
699 leftleaf = NULL;
700 while (!leftleaf) {
701 assert(left->left || left->right);
702 if (left->leftnode == LEAF)
703 leftleaf = left->left;
704 else if (left->rightnode == LEAF)
705 leftleaf = left->right;
706 else if (left->left)
707 left = left->left;
708 else if (left->right)
709 left = left->right;
710 else
711 assert(0);
712 }
713 rightleaf = NULL;
714 while (!rightleaf) {
715 assert(right->left || right->right);
716 if (right->leftnode == LEAF)
717 rightleaf = right->left;
718 else if (right->rightnode == LEAF)
719 rightleaf = right->right;
720 else if (right->left)
721 right = right->left;
722 else if (right->right)
723 right = right->right;
724 else
725 assert(0);
726 }
727 if (! tree->leaf_equal(leftleaf, rightleaf))
728 goto advance;
729
730
731
732
733 parent = node->parent;
734 left = node->left;
735 right = node->right;
736 if (parent->left == node)
737 parent->left = left;
738 else if (parent->right == node)
739 parent->right = left;
740 else
741 assert(0);
742 left->parent = parent;
743 left->keymask |= (1 << node->bitnum);
744 node->left = NULL;
745 while (node) {
746 bitmask = 1 << node->bitnum;
747 leftmask &= ~bitmask;
748 rightmask &= ~bitmask;
749 if (node->leftnode == NODE && node->left) {
750 left = node->left;
751 free(node);
752 count++;
753 node = left;
754 } else if (node->rightnode == NODE && node->right) {
755 right = node->right;
756 free(node);
757 count++;
758 node = right;
759 } else {
760 node = NULL;
761 }
762 }
763
764 node = parent;
765
766 bitmask = 1 << node->bitnum;
767 leftmask &= ~bitmask;
768 rightmask &= ~bitmask;
769 for (;;) {
770 if (node->left && node->right)
771 break;
772 if (node->left) {
773 left = node->left;
774 node->keymask |= left->keymask;
775 node->keybits |= left->keybits;
776 }
777 if (node->right) {
778 right = node->right;
779 node->keymask |= right->keymask;
780 node->keybits |= right->keybits;
781 }
782 node->keymask |= (1 << node->bitnum);
783 node = node->parent;
784
785 bitmask = 1 << node->bitnum;
786 leftmask &= ~bitmask;
787 rightmask &= ~bitmask;
788 }
789 advance:
790 bitmask = 1 << node->bitnum;
791 if ((leftmask & bitmask) == 0 &&
792 node->leftnode == NODE &&
793 node->left) {
794 leftmask |= bitmask;
795 node = node->left;
796 } else if ((rightmask & bitmask) == 0 &&
797 node->rightnode == NODE &&
798 node->right) {
799 rightmask |= bitmask;
800 node = node->right;
801 } else {
802 leftmask &= ~bitmask;
803 rightmask &= ~bitmask;
804 node = node->parent;
805 }
806 }
807 if (verbose > 0)
808 printf("Pruned %d nodes\n", count);
809}
810
811
812
813
814
815static void mark_nodes(struct tree *tree)
816{
817 struct node *node;
818 struct node *n;
819 unsigned int leftmask;
820 unsigned int rightmask;
821 unsigned int bitmask;
822 int marked;
823
824 marked = 0;
825 if (verbose > 0)
826 printf("Marking %s_%x\n", tree->type, tree->maxage);
827 if (tree->childnode == LEAF)
828 goto done;
829
830 assert(tree->childnode == NODE);
831 node = tree->root;
832 leftmask = rightmask = 0;
833 while (node) {
834 bitmask = 1 << node->bitnum;
835 if ((leftmask & bitmask) == 0) {
836 leftmask |= bitmask;
837 if (node->leftnode == LEAF) {
838 assert(node->left);
839 if (tree->leaf_mark(node->left)) {
840 n = node;
841 while (n && !n->mark) {
842 marked++;
843 n->mark = 1;
844 n = n->parent;
845 }
846 }
847 } else if (node->left) {
848 assert(node->leftnode == NODE);
849 node = node->left;
850 continue;
851 }
852 }
853 if ((rightmask & bitmask) == 0) {
854 rightmask |= bitmask;
855 if (node->rightnode == LEAF) {
856 assert(node->right);
857 if (tree->leaf_mark(node->right)) {
858 n = node;
859 while (n && !n->mark) {
860 marked++;
861 n->mark = 1;
862 n = n->parent;
863 }
864 }
865 } else if (node->right) {
866 assert(node->rightnode == NODE);
867 node = node->right;
868 continue;
869 }
870 }
871 leftmask &= ~bitmask;
872 rightmask &= ~bitmask;
873 node = node->parent;
874 }
875
876
877
878 assert(tree->childnode == NODE);
879 node = tree->root;
880 leftmask = rightmask = 0;
881 while (node) {
882 bitmask = 1 << node->bitnum;
883 if ((leftmask & bitmask) == 0) {
884 leftmask |= bitmask;
885 if (node->leftnode == LEAF) {
886 assert(node->left);
887 if (tree->leaf_mark(node->left)) {
888 n = node;
889 while (n && !n->mark) {
890 marked++;
891 n->mark = 1;
892 n = n->parent;
893 }
894 }
895 } else if (node->left) {
896 assert(node->leftnode == NODE);
897 node = node->left;
898 if (!node->mark && node->parent->mark) {
899 marked++;
900 node->mark = 1;
901 }
902 continue;
903 }
904 }
905 if ((rightmask & bitmask) == 0) {
906 rightmask |= bitmask;
907 if (node->rightnode == LEAF) {
908 assert(node->right);
909 if (tree->leaf_mark(node->right)) {
910 n = node;
911 while (n && !n->mark) {
912 marked++;
913 n->mark = 1;
914 n = n->parent;
915 }
916 }
917 } else if (node->right) {
918 assert(node->rightnode == NODE);
919 node = node->right;
920 if (!node->mark && node->parent->mark &&
921 !node->parent->left) {
922 marked++;
923 node->mark = 1;
924 }
925 continue;
926 }
927 }
928 leftmask &= ~bitmask;
929 rightmask &= ~bitmask;
930 node = node->parent;
931 }
932done:
933 if (verbose > 0)
934 printf("Marked %d nodes\n", marked);
935}
936
937
938
939
940
941
942static int index_nodes(struct tree *tree, int index)
943{
944 struct node *node;
945 unsigned int leftmask;
946 unsigned int rightmask;
947 unsigned int bitmask;
948 int count;
949 int indent;
950
951
952 while (index % 64)
953 index++;
954 tree->index = index;
955 indent = 1;
956 count = 0;
957
958 if (verbose > 0)
959 printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
960 if (tree->childnode == LEAF) {
961 index += tree->leaf_size(tree->root);
962 goto done;
963 }
964
965 assert(tree->childnode == NODE);
966 node = tree->root;
967 leftmask = rightmask = 0;
968 while (node) {
969 if (!node->mark)
970 goto skip;
971 count++;
972 if (node->index != index)
973 node->index = index;
974 index += node->size;
975skip:
976 while (node) {
977 bitmask = 1 << node->bitnum;
978 if (node->mark && (leftmask & bitmask) == 0) {
979 leftmask |= bitmask;
980 if (node->leftnode == LEAF) {
981 assert(node->left);
982 *tree->leaf_index(tree, node->left) =
983 index;
984 index += tree->leaf_size(node->left);
985 count++;
986 } else if (node->left) {
987 assert(node->leftnode == NODE);
988 indent += 1;
989 node = node->left;
990 break;
991 }
992 }
993 if (node->mark && (rightmask & bitmask) == 0) {
994 rightmask |= bitmask;
995 if (node->rightnode == LEAF) {
996 assert(node->right);
997 *tree->leaf_index(tree, node->right) = index;
998 index += tree->leaf_size(node->right);
999 count++;
1000 } else if (node->right) {
1001 assert(node->rightnode == NODE);
1002 indent += 1;
1003 node = node->right;
1004 break;
1005 }
1006 }
1007 leftmask &= ~bitmask;
1008 rightmask &= ~bitmask;
1009 node = node->parent;
1010 indent -= 1;
1011 }
1012 }
1013done:
1014
1015 while (index % 16)
1016 index++;
1017 if (verbose > 0)
1018 printf("Final index %d\n", index);
1019 return index;
1020}
1021
1022
1023
1024
1025static int mark_subtree(struct node *node)
1026{
1027 int changed;
1028
1029 if (!node || node->mark)
1030 return 0;
1031 node->mark = 1;
1032 node->index = node->parent->index;
1033 changed = 1;
1034 if (node->leftnode == NODE)
1035 changed += mark_subtree(node->left);
1036 if (node->rightnode == NODE)
1037 changed += mark_subtree(node->right);
1038 return changed;
1039}
1040
1041
1042
1043
1044
1045
1046
1047
1048static int size_nodes(struct tree *tree)
1049{
1050 struct tree *next;
1051 struct node *node;
1052 struct node *right;
1053 struct node *n;
1054 unsigned int leftmask;
1055 unsigned int rightmask;
1056 unsigned int bitmask;
1057 unsigned int pathbits;
1058 unsigned int pathmask;
1059 unsigned int nbit;
1060 int changed;
1061 int offset;
1062 int size;
1063 int indent;
1064
1065 indent = 1;
1066 changed = 0;
1067 size = 0;
1068
1069 if (verbose > 0)
1070 printf("Sizing %s_%x\n", tree->type, tree->maxage);
1071 if (tree->childnode == LEAF)
1072 goto done;
1073
1074 assert(tree->childnode == NODE);
1075 pathbits = 0;
1076 pathmask = 0;
1077 node = tree->root;
1078 leftmask = rightmask = 0;
1079 while (node) {
1080 if (!node->mark)
1081 goto skip;
1082 offset = 0;
1083 if (!node->left || !node->right) {
1084 size = 1;
1085 } else {
1086 if (node->rightnode == NODE) {
1087
1088
1089
1090
1091
1092
1093 right = node->right;
1094 next = tree->next;
1095 while (!right->mark) {
1096 assert(next);
1097 n = next->root;
1098 while (n->bitnum != node->bitnum) {
1099 nbit = 1 << n->bitnum;
1100 if (!(pathmask & nbit))
1101 break;
1102 if (pathbits & nbit) {
1103 if (n->rightnode == LEAF)
1104 break;
1105 n = n->right;
1106 } else {
1107 if (n->leftnode == LEAF)
1108 break;
1109 n = n->left;
1110 }
1111 }
1112 if (n->bitnum != node->bitnum)
1113 break;
1114 n = n->right;
1115 right = n;
1116 next = next->next;
1117 }
1118
1119 if (!right->mark)
1120 changed += mark_subtree(right);
1121 offset = right->index - node->index;
1122 } else {
1123 offset = *tree->leaf_index(tree, node->right);
1124 offset -= node->index;
1125 }
1126 assert(offset >= 0);
1127 assert(offset <= 0xffffff);
1128 if (offset <= 0xff) {
1129 size = 2;
1130 } else if (offset <= 0xffff) {
1131 size = 3;
1132 } else {
1133 size = 4;
1134 }
1135 }
1136 if (node->size != size || node->offset != offset) {
1137 node->size = size;
1138 node->offset = offset;
1139 changed++;
1140 }
1141skip:
1142 while (node) {
1143 bitmask = 1 << node->bitnum;
1144 pathmask |= bitmask;
1145 if (node->mark && (leftmask & bitmask) == 0) {
1146 leftmask |= bitmask;
1147 if (node->leftnode == LEAF) {
1148 assert(node->left);
1149 } else if (node->left) {
1150 assert(node->leftnode == NODE);
1151 indent += 1;
1152 node = node->left;
1153 break;
1154 }
1155 }
1156 if (node->mark && (rightmask & bitmask) == 0) {
1157 rightmask |= bitmask;
1158 pathbits |= bitmask;
1159 if (node->rightnode == LEAF) {
1160 assert(node->right);
1161 } else if (node->right) {
1162 assert(node->rightnode == NODE);
1163 indent += 1;
1164 node = node->right;
1165 break;
1166 }
1167 }
1168 leftmask &= ~bitmask;
1169 rightmask &= ~bitmask;
1170 pathmask &= ~bitmask;
1171 pathbits &= ~bitmask;
1172 node = node->parent;
1173 indent -= 1;
1174 }
1175 }
1176done:
1177 if (verbose > 0)
1178 printf("Found %d changes\n", changed);
1179 return changed;
1180}
1181
1182
1183
1184
1185static void emit(struct tree *tree, unsigned char *data)
1186{
1187 struct node *node;
1188 unsigned int leftmask;
1189 unsigned int rightmask;
1190 unsigned int bitmask;
1191 int offlen;
1192 int offset;
1193 int index;
1194 int indent;
1195 int size;
1196 int bytes;
1197 int leaves;
1198 int nodes[4];
1199 unsigned char byte;
1200
1201 nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
1202 leaves = 0;
1203 bytes = 0;
1204 index = tree->index;
1205 data += index;
1206 indent = 1;
1207 if (verbose > 0)
1208 printf("Emitting %s_%x\n", tree->type, tree->maxage);
1209 if (tree->childnode == LEAF) {
1210 assert(tree->root);
1211 tree->leaf_emit(tree->root, data);
1212 size = tree->leaf_size(tree->root);
1213 index += size;
1214 leaves++;
1215 goto done;
1216 }
1217
1218 assert(tree->childnode == NODE);
1219 node = tree->root;
1220 leftmask = rightmask = 0;
1221 while (node) {
1222 if (!node->mark)
1223 goto skip;
1224 assert(node->offset != -1);
1225 assert(node->index == index);
1226
1227 byte = 0;
1228 if (node->nextbyte)
1229 byte |= NEXTBYTE;
1230 byte |= (node->bitnum & BITNUM);
1231 if (node->left && node->right) {
1232 if (node->leftnode == NODE)
1233 byte |= LEFTNODE;
1234 if (node->rightnode == NODE)
1235 byte |= RIGHTNODE;
1236 if (node->offset <= 0xff)
1237 offlen = 1;
1238 else if (node->offset <= 0xffff)
1239 offlen = 2;
1240 else
1241 offlen = 3;
1242 nodes[offlen]++;
1243 offset = node->offset;
1244 byte |= offlen << OFFLEN_SHIFT;
1245 *data++ = byte;
1246 index++;
1247 while (offlen--) {
1248 *data++ = offset & 0xff;
1249 index++;
1250 offset >>= 8;
1251 }
1252 } else if (node->left) {
1253 if (node->leftnode == NODE)
1254 byte |= TRIENODE;
1255 nodes[0]++;
1256 *data++ = byte;
1257 index++;
1258 } else if (node->right) {
1259 byte |= RIGHTNODE;
1260 if (node->rightnode == NODE)
1261 byte |= TRIENODE;
1262 nodes[0]++;
1263 *data++ = byte;
1264 index++;
1265 } else {
1266 assert(0);
1267 }
1268skip:
1269 while (node) {
1270 bitmask = 1 << node->bitnum;
1271 if (node->mark && (leftmask & bitmask) == 0) {
1272 leftmask |= bitmask;
1273 if (node->leftnode == LEAF) {
1274 assert(node->left);
1275 data = tree->leaf_emit(node->left,
1276 data);
1277 size = tree->leaf_size(node->left);
1278 index += size;
1279 bytes += size;
1280 leaves++;
1281 } else if (node->left) {
1282 assert(node->leftnode == NODE);
1283 indent += 1;
1284 node = node->left;
1285 break;
1286 }
1287 }
1288 if (node->mark && (rightmask & bitmask) == 0) {
1289 rightmask |= bitmask;
1290 if (node->rightnode == LEAF) {
1291 assert(node->right);
1292 data = tree->leaf_emit(node->right,
1293 data);
1294 size = tree->leaf_size(node->right);
1295 index += size;
1296 bytes += size;
1297 leaves++;
1298 } else if (node->right) {
1299 assert(node->rightnode == NODE);
1300 indent += 1;
1301 node = node->right;
1302 break;
1303 }
1304 }
1305 leftmask &= ~bitmask;
1306 rightmask &= ~bitmask;
1307 node = node->parent;
1308 indent -= 1;
1309 }
1310 }
1311done:
1312 if (verbose > 0) {
1313 printf("Emitted %d (%d) leaves",
1314 leaves, bytes);
1315 printf(" %d (%d+%d+%d+%d) nodes",
1316 nodes[0] + nodes[1] + nodes[2] + nodes[3],
1317 nodes[0], nodes[1], nodes[2], nodes[3]);
1318 printf(" %d total\n", index - tree->index);
1319 }
1320}
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339struct unicode_data {
1340 unsigned int code;
1341 int ccc;
1342 int gen;
1343 int correction;
1344 unsigned int *utf32nfdi;
1345 unsigned int *utf32nfdicf;
1346 char *utf8nfdi;
1347 char *utf8nfdicf;
1348};
1349
1350struct unicode_data unicode_data[0x110000];
1351struct unicode_data *corrections;
1352int corrections_count;
1353
1354struct tree *nfdi_tree;
1355struct tree *nfdicf_tree;
1356
1357struct tree *trees;
1358int trees_count;
1359
1360
1361
1362
1363
1364static struct unicode_data *corrections_lookup(struct unicode_data *u)
1365{
1366 int i;
1367
1368 for (i = 0; i != corrections_count; i++)
1369 if (u->code == corrections[i].code)
1370 return &corrections[i];
1371 return u;
1372}
1373
1374static int nfdi_equal(void *l, void *r)
1375{
1376 struct unicode_data *left = l;
1377 struct unicode_data *right = r;
1378
1379 if (left->gen != right->gen)
1380 return 0;
1381 if (left->ccc != right->ccc)
1382 return 0;
1383 if (left->utf8nfdi && right->utf8nfdi &&
1384 strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
1385 return 1;
1386 if (left->utf8nfdi || right->utf8nfdi)
1387 return 0;
1388 return 1;
1389}
1390
1391static int nfdicf_equal(void *l, void *r)
1392{
1393 struct unicode_data *left = l;
1394 struct unicode_data *right = r;
1395
1396 if (left->gen != right->gen)
1397 return 0;
1398 if (left->ccc != right->ccc)
1399 return 0;
1400 if (left->utf8nfdicf && right->utf8nfdicf &&
1401 strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0)
1402 return 1;
1403 if (left->utf8nfdicf && right->utf8nfdicf)
1404 return 0;
1405 if (left->utf8nfdicf || right->utf8nfdicf)
1406 return 0;
1407 if (left->utf8nfdi && right->utf8nfdi &&
1408 strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
1409 return 1;
1410 if (left->utf8nfdi || right->utf8nfdi)
1411 return 0;
1412 return 1;
1413}
1414
1415static void nfdi_print(void *l, int indent)
1416{
1417 struct unicode_data *leaf = l;
1418
1419 printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1420 leaf->code, leaf->ccc, leaf->gen);
1421
1422 if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
1423 printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
1424 else if (leaf->utf8nfdi)
1425 printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
1426
1427 printf("\n");
1428}
1429
1430static void nfdicf_print(void *l, int indent)
1431{
1432 struct unicode_data *leaf = l;
1433
1434 printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1435 leaf->code, leaf->ccc, leaf->gen);
1436
1437 if (leaf->utf8nfdicf)
1438 printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf);
1439 else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
1440 printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
1441 else if (leaf->utf8nfdi)
1442 printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
1443 printf("\n");
1444}
1445
1446static int nfdi_mark(void *l)
1447{
1448 return 1;
1449}
1450
1451static int nfdicf_mark(void *l)
1452{
1453 struct unicode_data *leaf = l;
1454
1455 if (leaf->utf8nfdicf)
1456 return 1;
1457 return 0;
1458}
1459
1460static int correction_mark(void *l)
1461{
1462 struct unicode_data *leaf = l;
1463
1464 return leaf->correction;
1465}
1466
1467static int nfdi_size(void *l)
1468{
1469 struct unicode_data *leaf = l;
1470 int size = 2;
1471
1472 if (HANGUL_SYLLABLE(leaf->code))
1473 size += 1;
1474 else if (leaf->utf8nfdi)
1475 size += strlen(leaf->utf8nfdi) + 1;
1476 return size;
1477}
1478
1479static int nfdicf_size(void *l)
1480{
1481 struct unicode_data *leaf = l;
1482 int size = 2;
1483
1484 if (HANGUL_SYLLABLE(leaf->code))
1485 size += 1;
1486 else if (leaf->utf8nfdicf)
1487 size += strlen(leaf->utf8nfdicf) + 1;
1488 else if (leaf->utf8nfdi)
1489 size += strlen(leaf->utf8nfdi) + 1;
1490 return size;
1491}
1492
1493static int *nfdi_index(struct tree *tree, void *l)
1494{
1495 struct unicode_data *leaf = l;
1496
1497 return &tree->leafindex[leaf->code];
1498}
1499
1500static int *nfdicf_index(struct tree *tree, void *l)
1501{
1502 struct unicode_data *leaf = l;
1503
1504 return &tree->leafindex[leaf->code];
1505}
1506
1507static unsigned char *nfdi_emit(void *l, unsigned char *data)
1508{
1509 struct unicode_data *leaf = l;
1510 unsigned char *s;
1511
1512 *data++ = leaf->gen;
1513
1514 if (HANGUL_SYLLABLE(leaf->code)) {
1515 *data++ = DECOMPOSE;
1516 *data++ = HANGUL;
1517 } else if (leaf->utf8nfdi) {
1518 *data++ = DECOMPOSE;
1519 s = (unsigned char*)leaf->utf8nfdi;
1520 while ((*data++ = *s++) != 0)
1521 ;
1522 } else {
1523 *data++ = leaf->ccc;
1524 }
1525 return data;
1526}
1527
1528static unsigned char *nfdicf_emit(void *l, unsigned char *data)
1529{
1530 struct unicode_data *leaf = l;
1531 unsigned char *s;
1532
1533 *data++ = leaf->gen;
1534
1535 if (HANGUL_SYLLABLE(leaf->code)) {
1536 *data++ = DECOMPOSE;
1537 *data++ = HANGUL;
1538 } else if (leaf->utf8nfdicf) {
1539 *data++ = DECOMPOSE;
1540 s = (unsigned char*)leaf->utf8nfdicf;
1541 while ((*data++ = *s++) != 0)
1542 ;
1543 } else if (leaf->utf8nfdi) {
1544 *data++ = DECOMPOSE;
1545 s = (unsigned char*)leaf->utf8nfdi;
1546 while ((*data++ = *s++) != 0)
1547 ;
1548 } else {
1549 *data++ = leaf->ccc;
1550 }
1551 return data;
1552}
1553
1554static void utf8_create(struct unicode_data *data)
1555{
1556 char utf[18*4+1];
1557 char *u;
1558 unsigned int *um;
1559 int i;
1560
1561 if (data->utf8nfdi) {
1562 assert(data->utf8nfdi[0] == HANGUL);
1563 return;
1564 }
1565
1566 u = utf;
1567 um = data->utf32nfdi;
1568 if (um) {
1569 for (i = 0; um[i]; i++)
1570 u += utf8encode(u, um[i]);
1571 *u = '\0';
1572 data->utf8nfdi = strdup(utf);
1573 }
1574 u = utf;
1575 um = data->utf32nfdicf;
1576 if (um) {
1577 for (i = 0; um[i]; i++)
1578 u += utf8encode(u, um[i]);
1579 *u = '\0';
1580 if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf))
1581 data->utf8nfdicf = strdup(utf);
1582 }
1583}
1584
1585static void utf8_init(void)
1586{
1587 unsigned int unichar;
1588 int i;
1589
1590 for (unichar = 0; unichar != 0x110000; unichar++)
1591 utf8_create(&unicode_data[unichar]);
1592
1593 for (i = 0; i != corrections_count; i++)
1594 utf8_create(&corrections[i]);
1595}
1596
1597static void trees_init(void)
1598{
1599 struct unicode_data *data;
1600 unsigned int maxage;
1601 unsigned int nextage;
1602 int count;
1603 int i;
1604 int j;
1605
1606
1607 count = 0;
1608 nextage = (unsigned int)-1;
1609 do {
1610 maxage = nextage;
1611 nextage = 0;
1612 for (i = 0; i <= corrections_count; i++) {
1613 data = &corrections[i];
1614 if (nextage < data->correction &&
1615 data->correction < maxage)
1616 nextage = data->correction;
1617 }
1618 count++;
1619 } while (nextage);
1620
1621
1622 trees_count = count * 2;
1623 trees = calloc(trees_count, sizeof(struct tree));
1624
1625
1626 count = trees_count;
1627 nextage = (unsigned int)-1;
1628 do {
1629 maxage = nextage;
1630 trees[--count].maxage = maxage;
1631 trees[--count].maxage = maxage;
1632 nextage = 0;
1633 for (i = 0; i <= corrections_count; i++) {
1634 data = &corrections[i];
1635 if (nextage < data->correction &&
1636 data->correction < maxage)
1637 nextage = data->correction;
1638 }
1639 } while (nextage);
1640
1641
1642 for (i = 0; i != trees_count; i++) {
1643 j = 0;
1644 while (ages[j] < trees[i].maxage)
1645 j++;
1646 trees[i].maxage = ages[j-1];
1647 }
1648
1649
1650 trees[trees_count-2].next = &trees[trees_count-1];
1651 trees[trees_count-1].leaf_mark = nfdi_mark;
1652 trees[trees_count-2].leaf_mark = nfdicf_mark;
1653 for (i = 0; i != trees_count-2; i += 2) {
1654 trees[i].next = &trees[trees_count-2];
1655 trees[i].leaf_mark = correction_mark;
1656 trees[i+1].next = &trees[trees_count-1];
1657 trees[i+1].leaf_mark = correction_mark;
1658 }
1659
1660
1661 for (i = 0; i != trees_count; i += 2) {
1662 trees[i].type = "nfdicf";
1663 trees[i].leaf_equal = nfdicf_equal;
1664 trees[i].leaf_print = nfdicf_print;
1665 trees[i].leaf_size = nfdicf_size;
1666 trees[i].leaf_index = nfdicf_index;
1667 trees[i].leaf_emit = nfdicf_emit;
1668
1669 trees[i+1].type = "nfdi";
1670 trees[i+1].leaf_equal = nfdi_equal;
1671 trees[i+1].leaf_print = nfdi_print;
1672 trees[i+1].leaf_size = nfdi_size;
1673 trees[i+1].leaf_index = nfdi_index;
1674 trees[i+1].leaf_emit = nfdi_emit;
1675 }
1676
1677
1678 for (i = 0; i != trees_count; i++)
1679 trees[i].childnode = NODE;
1680}
1681
1682static void trees_populate(void)
1683{
1684 struct unicode_data *data;
1685 unsigned int unichar;
1686 char keyval[4];
1687 int keylen;
1688 int i;
1689
1690 for (i = 0; i != trees_count; i++) {
1691 if (verbose > 0) {
1692 printf("Populating %s_%x\n",
1693 trees[i].type, trees[i].maxage);
1694 }
1695 for (unichar = 0; unichar != 0x110000; unichar++) {
1696 if (unicode_data[unichar].gen < 0)
1697 continue;
1698 keylen = utf8encode(keyval, unichar);
1699 data = corrections_lookup(&unicode_data[unichar]);
1700 if (data->correction <= trees[i].maxage)
1701 data = &unicode_data[unichar];
1702 insert(&trees[i], keyval, keylen, data);
1703 }
1704 }
1705}
1706
1707static void trees_reduce(void)
1708{
1709 int i;
1710 int size;
1711 int changed;
1712
1713 for (i = 0; i != trees_count; i++)
1714 prune(&trees[i]);
1715 for (i = 0; i != trees_count; i++)
1716 mark_nodes(&trees[i]);
1717 do {
1718 size = 0;
1719 for (i = 0; i != trees_count; i++)
1720 size = index_nodes(&trees[i], size);
1721 changed = 0;
1722 for (i = 0; i != trees_count; i++)
1723 changed += size_nodes(&trees[i]);
1724 } while (changed);
1725
1726 utf8data = calloc(size, 1);
1727 utf8data_size = size;
1728 for (i = 0; i != trees_count; i++)
1729 emit(&trees[i], utf8data);
1730
1731 if (verbose > 0) {
1732 for (i = 0; i != trees_count; i++) {
1733 printf("%s_%x idx %d\n",
1734 trees[i].type, trees[i].maxage, trees[i].index);
1735 }
1736 }
1737
1738 nfdi = utf8data + trees[trees_count-1].index;
1739 nfdicf = utf8data + trees[trees_count-2].index;
1740
1741 nfdi_tree = &trees[trees_count-1];
1742 nfdicf_tree = &trees[trees_count-2];
1743}
1744
1745static void verify(struct tree *tree)
1746{
1747 struct unicode_data *data;
1748 utf8leaf_t *leaf;
1749 unsigned int unichar;
1750 char key[4];
1751 unsigned char hangul[UTF8HANGULLEAF];
1752 int report;
1753 int nocf;
1754
1755 if (verbose > 0)
1756 printf("Verifying %s_%x\n", tree->type, tree->maxage);
1757 nocf = strcmp(tree->type, "nfdicf");
1758
1759 for (unichar = 0; unichar != 0x110000; unichar++) {
1760 report = 0;
1761 data = corrections_lookup(&unicode_data[unichar]);
1762 if (data->correction <= tree->maxage)
1763 data = &unicode_data[unichar];
1764 utf8encode(key,unichar);
1765 leaf = utf8lookup(tree, hangul, key);
1766
1767 if (!leaf) {
1768 if (data->gen != -1)
1769 report++;
1770 if (unichar < 0xd800 || unichar > 0xdfff)
1771 report++;
1772 } else {
1773 if (unichar >= 0xd800 && unichar <= 0xdfff)
1774 report++;
1775 if (data->gen == -1)
1776 report++;
1777 if (data->gen != LEAF_GEN(leaf))
1778 report++;
1779 if (LEAF_CCC(leaf) == DECOMPOSE) {
1780 if (HANGUL_SYLLABLE(data->code)) {
1781 if (data->utf8nfdi[0] != HANGUL)
1782 report++;
1783 } else if (nocf) {
1784 if (!data->utf8nfdi) {
1785 report++;
1786 } else if (strcmp(data->utf8nfdi,
1787 LEAF_STR(leaf))) {
1788 report++;
1789 }
1790 } else {
1791 if (!data->utf8nfdicf &&
1792 !data->utf8nfdi) {
1793 report++;
1794 } else if (data->utf8nfdicf) {
1795 if (strcmp(data->utf8nfdicf,
1796 LEAF_STR(leaf)))
1797 report++;
1798 } else if (strcmp(data->utf8nfdi,
1799 LEAF_STR(leaf))) {
1800 report++;
1801 }
1802 }
1803 } else if (data->ccc != LEAF_CCC(leaf)) {
1804 report++;
1805 }
1806 }
1807 if (report) {
1808 printf("%X code %X gen %d ccc %d"
1809 " nfdi -> \"%s\"",
1810 unichar, data->code, data->gen,
1811 data->ccc,
1812 data->utf8nfdi);
1813 if (leaf) {
1814 printf(" gen %d ccc %d"
1815 " nfdi -> \"%s\"",
1816 LEAF_GEN(leaf),
1817 LEAF_CCC(leaf),
1818 LEAF_CCC(leaf) == DECOMPOSE ?
1819 LEAF_STR(leaf) : "");
1820 }
1821 printf("\n");
1822 }
1823 }
1824}
1825
1826static void trees_verify(void)
1827{
1828 int i;
1829
1830 for (i = 0; i != trees_count; i++)
1831 verify(&trees[i]);
1832}
1833
1834
1835
1836static void help(void)
1837{
1838 printf("Usage: %s [options]\n", argv0);
1839 printf("\n");
1840 printf("This program creates an a data trie used for parsing and\n");
1841 printf("normalization of UTF-8 strings. The trie is derived from\n");
1842 printf("a set of input files from the Unicode character database\n");
1843 printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
1844 printf("\n");
1845 printf("The generated tree supports two normalization forms:\n");
1846 printf("\n");
1847 printf("\tnfdi:\n");
1848 printf("\t- Apply unicode normalization form NFD.\n");
1849 printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1850 printf("\n");
1851 printf("\tnfdicf:\n");
1852 printf("\t- Apply unicode normalization form NFD.\n");
1853 printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1854 printf("\t- Apply a full casefold (C + F).\n");
1855 printf("\n");
1856 printf("These forms were chosen as being most useful when dealing\n");
1857 printf("with file names: NFD catches most cases where characters\n");
1858 printf("should be considered equivalent. The ignorables are mostly\n");
1859 printf("invisible, making names hard to type.\n");
1860 printf("\n");
1861 printf("The options to specify the files to be used are listed\n");
1862 printf("below with their default values, which are the names used\n");
1863 printf("by version 11.0.0 of the Unicode Character Database.\n");
1864 printf("\n");
1865 printf("The input files:\n");
1866 printf("\t-a %s\n", AGE_NAME);
1867 printf("\t-c %s\n", CCC_NAME);
1868 printf("\t-p %s\n", PROP_NAME);
1869 printf("\t-d %s\n", DATA_NAME);
1870 printf("\t-f %s\n", FOLD_NAME);
1871 printf("\t-n %s\n", NORM_NAME);
1872 printf("\n");
1873 printf("Additionally, the generated tables are tested using:\n");
1874 printf("\t-t %s\n", TEST_NAME);
1875 printf("\n");
1876 printf("Finally, the output file:\n");
1877 printf("\t-o %s\n", UTF8_NAME);
1878 printf("\n");
1879}
1880
1881static void usage(void)
1882{
1883 help();
1884 exit(1);
1885}
1886
1887static void open_fail(const char *name, int error)
1888{
1889 printf("Error %d opening %s: %s\n", error, name, strerror(error));
1890 exit(1);
1891}
1892
1893static void file_fail(const char *filename)
1894{
1895 printf("Error parsing %s\n", filename);
1896 exit(1);
1897}
1898
1899static void line_fail(const char *filename, const char *line)
1900{
1901 printf("Error parsing %s:%s\n", filename, line);
1902 exit(1);
1903}
1904
1905
1906
1907static void print_utf32(unsigned int *utf32str)
1908{
1909 int i;
1910
1911 for (i = 0; utf32str[i]; i++)
1912 printf(" %X", utf32str[i]);
1913}
1914
1915static void print_utf32nfdi(unsigned int unichar)
1916{
1917 printf(" %X ->", unichar);
1918 print_utf32(unicode_data[unichar].utf32nfdi);
1919 printf("\n");
1920}
1921
1922static void print_utf32nfdicf(unsigned int unichar)
1923{
1924 printf(" %X ->", unichar);
1925 print_utf32(unicode_data[unichar].utf32nfdicf);
1926 printf("\n");
1927}
1928
1929
1930
1931static void age_init(void)
1932{
1933 FILE *file;
1934 unsigned int first;
1935 unsigned int last;
1936 unsigned int unichar;
1937 unsigned int major;
1938 unsigned int minor;
1939 unsigned int revision;
1940 int gen;
1941 int count;
1942 int ret;
1943
1944 if (verbose > 0)
1945 printf("Parsing %s\n", age_name);
1946
1947 file = fopen(age_name, "r");
1948 if (!file)
1949 open_fail(age_name, errno);
1950 count = 0;
1951
1952 gen = 0;
1953 while (fgets(line, LINESIZE, file)) {
1954 ret = sscanf(line, "# Age=V%d_%d_%d",
1955 &major, &minor, &revision);
1956 if (ret == 3) {
1957 ages_count++;
1958 if (verbose > 1)
1959 printf(" Age V%d_%d_%d\n",
1960 major, minor, revision);
1961 if (!age_valid(major, minor, revision))
1962 line_fail(age_name, line);
1963 continue;
1964 }
1965 ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
1966 if (ret == 2) {
1967 ages_count++;
1968 if (verbose > 1)
1969 printf(" Age V%d_%d\n", major, minor);
1970 if (!age_valid(major, minor, 0))
1971 line_fail(age_name, line);
1972 continue;
1973 }
1974 }
1975
1976
1977 if (verbose > 1)
1978 printf("%d age entries\n", ages_count);
1979 if (ages_count == 0 || ages_count > MAXGEN)
1980 file_fail(age_name);
1981
1982
1983 ages_count++;
1984 ages = calloc(ages_count + 1, sizeof(*ages));
1985
1986 ages[ages_count] = (unsigned int)-1;
1987
1988 rewind(file);
1989 count = 0;
1990 gen = 0;
1991 while (fgets(line, LINESIZE, file)) {
1992 ret = sscanf(line, "# Age=V%d_%d_%d",
1993 &major, &minor, &revision);
1994 if (ret == 3) {
1995 ages[++gen] =
1996 UNICODE_AGE(major, minor, revision);
1997 if (verbose > 1)
1998 printf(" Age V%d_%d_%d = gen %d\n",
1999 major, minor, revision, gen);
2000 if (!age_valid(major, minor, revision))
2001 line_fail(age_name, line);
2002 continue;
2003 }
2004 ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
2005 if (ret == 2) {
2006 ages[++gen] = UNICODE_AGE(major, minor, 0);
2007 if (verbose > 1)
2008 printf(" Age V%d_%d = %d\n",
2009 major, minor, gen);
2010 if (!age_valid(major, minor, 0))
2011 line_fail(age_name, line);
2012 continue;
2013 }
2014 ret = sscanf(line, "%X..%X ; %d.%d #",
2015 &first, &last, &major, &minor);
2016 if (ret == 4) {
2017 for (unichar = first; unichar <= last; unichar++)
2018 unicode_data[unichar].gen = gen;
2019 count += 1 + last - first;
2020 if (verbose > 1)
2021 printf(" %X..%X gen %d\n", first, last, gen);
2022 if (!utf32valid(first) || !utf32valid(last))
2023 line_fail(age_name, line);
2024 continue;
2025 }
2026 ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
2027 if (ret == 3) {
2028 unicode_data[unichar].gen = gen;
2029 count++;
2030 if (verbose > 1)
2031 printf(" %X gen %d\n", unichar, gen);
2032 if (!utf32valid(unichar))
2033 line_fail(age_name, line);
2034 continue;
2035 }
2036 }
2037 unicode_maxage = ages[gen];
2038 fclose(file);
2039
2040
2041 if (verbose > 1)
2042 printf(" Removing surrogate block D800..DFFF\n");
2043 for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
2044 unicode_data[unichar].gen = -1;
2045
2046 if (verbose > 0)
2047 printf("Found %d entries\n", count);
2048 if (count == 0)
2049 file_fail(age_name);
2050}
2051
2052static void ccc_init(void)
2053{
2054 FILE *file;
2055 unsigned int first;
2056 unsigned int last;
2057 unsigned int unichar;
2058 unsigned int value;
2059 int count;
2060 int ret;
2061
2062 if (verbose > 0)
2063 printf("Parsing %s\n", ccc_name);
2064
2065 file = fopen(ccc_name, "r");
2066 if (!file)
2067 open_fail(ccc_name, errno);
2068
2069 count = 0;
2070 while (fgets(line, LINESIZE, file)) {
2071 ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
2072 if (ret == 3) {
2073 for (unichar = first; unichar <= last; unichar++) {
2074 unicode_data[unichar].ccc = value;
2075 count++;
2076 }
2077 if (verbose > 1)
2078 printf(" %X..%X ccc %d\n", first, last, value);
2079 if (!utf32valid(first) || !utf32valid(last))
2080 line_fail(ccc_name, line);
2081 continue;
2082 }
2083 ret = sscanf(line, "%X ; %d #", &unichar, &value);
2084 if (ret == 2) {
2085 unicode_data[unichar].ccc = value;
2086 count++;
2087 if (verbose > 1)
2088 printf(" %X ccc %d\n", unichar, value);
2089 if (!utf32valid(unichar))
2090 line_fail(ccc_name, line);
2091 continue;
2092 }
2093 }
2094 fclose(file);
2095
2096 if (verbose > 0)
2097 printf("Found %d entries\n", count);
2098 if (count == 0)
2099 file_fail(ccc_name);
2100}
2101
2102static int ignore_compatibility_form(char *type)
2103{
2104 int i;
2105 char *ignored_types[] = {"font", "noBreak", "initial", "medial",
2106 "final", "isolated", "circle", "super",
2107 "sub", "vertical", "wide", "narrow",
2108 "small", "square", "fraction", "compat"};
2109
2110 for (i = 0 ; i < ARRAY_SIZE(ignored_types); i++)
2111 if (strcmp(type, ignored_types[i]) == 0)
2112 return 1;
2113 return 0;
2114}
2115
2116static void nfdi_init(void)
2117{
2118 FILE *file;
2119 unsigned int unichar;
2120 unsigned int mapping[19];
2121 char *s;
2122 char *type;
2123 unsigned int *um;
2124 int count;
2125 int i;
2126 int ret;
2127
2128 if (verbose > 0)
2129 printf("Parsing %s\n", data_name);
2130 file = fopen(data_name, "r");
2131 if (!file)
2132 open_fail(data_name, errno);
2133
2134 count = 0;
2135 while (fgets(line, LINESIZE, file)) {
2136 ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
2137 &unichar, buf0);
2138 if (ret != 2)
2139 continue;
2140 if (!utf32valid(unichar))
2141 line_fail(data_name, line);
2142
2143 s = buf0;
2144
2145 if (*s == '<') {
2146 type = ++s;
2147 while (*++s != '>');
2148 *s++ = '\0';
2149 if(ignore_compatibility_form(type))
2150 continue;
2151 }
2152
2153 i = 0;
2154 while (*s) {
2155 mapping[i] = strtoul(s, &s, 16);
2156 if (!utf32valid(mapping[i]))
2157 line_fail(data_name, line);
2158 i++;
2159 }
2160 mapping[i++] = 0;
2161
2162 um = malloc(i * sizeof(unsigned int));
2163 memcpy(um, mapping, i * sizeof(unsigned int));
2164 unicode_data[unichar].utf32nfdi = um;
2165
2166 if (verbose > 1)
2167 print_utf32nfdi(unichar);
2168 count++;
2169 }
2170 fclose(file);
2171 if (verbose > 0)
2172 printf("Found %d entries\n", count);
2173 if (count == 0)
2174 file_fail(data_name);
2175}
2176
2177static void nfdicf_init(void)
2178{
2179 FILE *file;
2180 unsigned int unichar;
2181 unsigned int mapping[19];
2182 char status;
2183 char *s;
2184 unsigned int *um;
2185 int i;
2186 int count;
2187 int ret;
2188
2189 if (verbose > 0)
2190 printf("Parsing %s\n", fold_name);
2191 file = fopen(fold_name, "r");
2192 if (!file)
2193 open_fail(fold_name, errno);
2194
2195 count = 0;
2196 while (fgets(line, LINESIZE, file)) {
2197 ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
2198 if (ret != 3)
2199 continue;
2200 if (!utf32valid(unichar))
2201 line_fail(fold_name, line);
2202
2203 if (status != 'C' && status != 'F')
2204 continue;
2205 s = buf0;
2206 if (*s == '<')
2207 while (*s++ != ' ')
2208 ;
2209 i = 0;
2210 while (*s) {
2211 mapping[i] = strtoul(s, &s, 16);
2212 if (!utf32valid(mapping[i]))
2213 line_fail(fold_name, line);
2214 i++;
2215 }
2216 mapping[i++] = 0;
2217
2218 um = malloc(i * sizeof(unsigned int));
2219 memcpy(um, mapping, i * sizeof(unsigned int));
2220 unicode_data[unichar].utf32nfdicf = um;
2221
2222 if (verbose > 1)
2223 print_utf32nfdicf(unichar);
2224 count++;
2225 }
2226 fclose(file);
2227 if (verbose > 0)
2228 printf("Found %d entries\n", count);
2229 if (count == 0)
2230 file_fail(fold_name);
2231}
2232
2233static void ignore_init(void)
2234{
2235 FILE *file;
2236 unsigned int unichar;
2237 unsigned int first;
2238 unsigned int last;
2239 unsigned int *um;
2240 int count;
2241 int ret;
2242
2243 if (verbose > 0)
2244 printf("Parsing %s\n", prop_name);
2245 file = fopen(prop_name, "r");
2246 if (!file)
2247 open_fail(prop_name, errno);
2248 assert(file);
2249 count = 0;
2250 while (fgets(line, LINESIZE, file)) {
2251 ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
2252 if (ret == 3) {
2253 if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2254 continue;
2255 if (!utf32valid(first) || !utf32valid(last))
2256 line_fail(prop_name, line);
2257 for (unichar = first; unichar <= last; unichar++) {
2258 free(unicode_data[unichar].utf32nfdi);
2259 um = malloc(sizeof(unsigned int));
2260 *um = 0;
2261 unicode_data[unichar].utf32nfdi = um;
2262 free(unicode_data[unichar].utf32nfdicf);
2263 um = malloc(sizeof(unsigned int));
2264 *um = 0;
2265 unicode_data[unichar].utf32nfdicf = um;
2266 count++;
2267 }
2268 if (verbose > 1)
2269 printf(" %X..%X Default_Ignorable_Code_Point\n",
2270 first, last);
2271 continue;
2272 }
2273 ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
2274 if (ret == 2) {
2275 if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2276 continue;
2277 if (!utf32valid(unichar))
2278 line_fail(prop_name, line);
2279 free(unicode_data[unichar].utf32nfdi);
2280 um = malloc(sizeof(unsigned int));
2281 *um = 0;
2282 unicode_data[unichar].utf32nfdi = um;
2283 free(unicode_data[unichar].utf32nfdicf);
2284 um = malloc(sizeof(unsigned int));
2285 *um = 0;
2286 unicode_data[unichar].utf32nfdicf = um;
2287 if (verbose > 1)
2288 printf(" %X Default_Ignorable_Code_Point\n",
2289 unichar);
2290 count++;
2291 continue;
2292 }
2293 }
2294 fclose(file);
2295
2296 if (verbose > 0)
2297 printf("Found %d entries\n", count);
2298 if (count == 0)
2299 file_fail(prop_name);
2300}
2301
2302static void corrections_init(void)
2303{
2304 FILE *file;
2305 unsigned int unichar;
2306 unsigned int major;
2307 unsigned int minor;
2308 unsigned int revision;
2309 unsigned int age;
2310 unsigned int *um;
2311 unsigned int mapping[19];
2312 char *s;
2313 int i;
2314 int count;
2315 int ret;
2316
2317 if (verbose > 0)
2318 printf("Parsing %s\n", norm_name);
2319 file = fopen(norm_name, "r");
2320 if (!file)
2321 open_fail(norm_name, errno);
2322
2323 count = 0;
2324 while (fgets(line, LINESIZE, file)) {
2325 ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2326 &unichar, buf0, buf1,
2327 &major, &minor, &revision);
2328 if (ret != 6)
2329 continue;
2330 if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2331 line_fail(norm_name, line);
2332 count++;
2333 }
2334 corrections = calloc(count, sizeof(struct unicode_data));
2335 corrections_count = count;
2336 rewind(file);
2337
2338 count = 0;
2339 while (fgets(line, LINESIZE, file)) {
2340 ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2341 &unichar, buf0, buf1,
2342 &major, &minor, &revision);
2343 if (ret != 6)
2344 continue;
2345 if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2346 line_fail(norm_name, line);
2347 corrections[count] = unicode_data[unichar];
2348 assert(corrections[count].code == unichar);
2349 age = UNICODE_AGE(major, minor, revision);
2350 corrections[count].correction = age;
2351
2352 i = 0;
2353 s = buf0;
2354 while (*s) {
2355 mapping[i] = strtoul(s, &s, 16);
2356 if (!utf32valid(mapping[i]))
2357 line_fail(norm_name, line);
2358 i++;
2359 }
2360 mapping[i++] = 0;
2361
2362 um = malloc(i * sizeof(unsigned int));
2363 memcpy(um, mapping, i * sizeof(unsigned int));
2364 corrections[count].utf32nfdi = um;
2365
2366 if (verbose > 1)
2367 printf(" %X -> %s -> %s V%d_%d_%d\n",
2368 unichar, buf0, buf1, major, minor, revision);
2369 count++;
2370 }
2371 fclose(file);
2372
2373 if (verbose > 0)
2374 printf("Found %d entries\n", count);
2375 if (count == 0)
2376 file_fail(norm_name);
2377}
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427static void hangul_decompose(void)
2428{
2429 unsigned int sb = 0xAC00;
2430 unsigned int lb = 0x1100;
2431 unsigned int vb = 0x1161;
2432 unsigned int tb = 0x11a7;
2433
2434 unsigned int vc = 21;
2435 unsigned int tc = 28;
2436 unsigned int nc = (vc * tc);
2437
2438 unsigned int unichar;
2439 unsigned int mapping[4];
2440 unsigned int *um;
2441 int count;
2442 int i;
2443
2444 if (verbose > 0)
2445 printf("Decomposing hangul\n");
2446
2447 count = 0;
2448 for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
2449 unsigned int si = unichar - sb;
2450 unsigned int li = si / nc;
2451 unsigned int vi = (si % nc) / tc;
2452 unsigned int ti = si % tc;
2453
2454 i = 0;
2455 mapping[i++] = lb + li;
2456 mapping[i++] = vb + vi;
2457 if (ti)
2458 mapping[i++] = tb + ti;
2459 mapping[i++] = 0;
2460
2461 assert(!unicode_data[unichar].utf32nfdi);
2462 um = malloc(i * sizeof(unsigned int));
2463 memcpy(um, mapping, i * sizeof(unsigned int));
2464 unicode_data[unichar].utf32nfdi = um;
2465
2466 assert(!unicode_data[unichar].utf32nfdicf);
2467 um = malloc(i * sizeof(unsigned int));
2468 memcpy(um, mapping, i * sizeof(unsigned int));
2469 unicode_data[unichar].utf32nfdicf = um;
2470
2471
2472
2473
2474
2475
2476 unicode_data[unichar].utf8nfdi = malloc(2);
2477 unicode_data[unichar].utf8nfdi[0] = HANGUL;
2478 unicode_data[unichar].utf8nfdi[1] = '\0';
2479
2480 if (verbose > 1)
2481 print_utf32nfdi(unichar);
2482
2483 count++;
2484 }
2485 if (verbose > 0)
2486 printf("Created %d entries\n", count);
2487}
2488
2489static void nfdi_decompose(void)
2490{
2491 unsigned int unichar;
2492 unsigned int mapping[19];
2493 unsigned int *um;
2494 unsigned int *dc;
2495 int count;
2496 int i;
2497 int j;
2498 int ret;
2499
2500 if (verbose > 0)
2501 printf("Decomposing nfdi\n");
2502
2503 count = 0;
2504 for (unichar = 0; unichar != 0x110000; unichar++) {
2505 if (!unicode_data[unichar].utf32nfdi)
2506 continue;
2507 for (;;) {
2508 ret = 1;
2509 i = 0;
2510 um = unicode_data[unichar].utf32nfdi;
2511 while (*um) {
2512 dc = unicode_data[*um].utf32nfdi;
2513 if (dc) {
2514 for (j = 0; dc[j]; j++)
2515 mapping[i++] = dc[j];
2516 ret = 0;
2517 } else {
2518 mapping[i++] = *um;
2519 }
2520 um++;
2521 }
2522 mapping[i++] = 0;
2523 if (ret)
2524 break;
2525 free(unicode_data[unichar].utf32nfdi);
2526 um = malloc(i * sizeof(unsigned int));
2527 memcpy(um, mapping, i * sizeof(unsigned int));
2528 unicode_data[unichar].utf32nfdi = um;
2529 }
2530
2531 if (!unicode_data[unichar].utf32nfdicf) {
2532 um = malloc(i * sizeof(unsigned int));
2533 memcpy(um, mapping, i * sizeof(unsigned int));
2534 unicode_data[unichar].utf32nfdicf = um;
2535 }
2536 if (verbose > 1)
2537 print_utf32nfdi(unichar);
2538 count++;
2539 }
2540 if (verbose > 0)
2541 printf("Processed %d entries\n", count);
2542}
2543
2544static void nfdicf_decompose(void)
2545{
2546 unsigned int unichar;
2547 unsigned int mapping[19];
2548 unsigned int *um;
2549 unsigned int *dc;
2550 int count;
2551 int i;
2552 int j;
2553 int ret;
2554
2555 if (verbose > 0)
2556 printf("Decomposing nfdicf\n");
2557 count = 0;
2558 for (unichar = 0; unichar != 0x110000; unichar++) {
2559 if (!unicode_data[unichar].utf32nfdicf)
2560 continue;
2561 for (;;) {
2562 ret = 1;
2563 i = 0;
2564 um = unicode_data[unichar].utf32nfdicf;
2565 while (*um) {
2566 dc = unicode_data[*um].utf32nfdicf;
2567 if (dc) {
2568 for (j = 0; dc[j]; j++)
2569 mapping[i++] = dc[j];
2570 ret = 0;
2571 } else {
2572 mapping[i++] = *um;
2573 }
2574 um++;
2575 }
2576 mapping[i++] = 0;
2577 if (ret)
2578 break;
2579 free(unicode_data[unichar].utf32nfdicf);
2580 um = malloc(i * sizeof(unsigned int));
2581 memcpy(um, mapping, i * sizeof(unsigned int));
2582 unicode_data[unichar].utf32nfdicf = um;
2583 }
2584 if (verbose > 1)
2585 print_utf32nfdicf(unichar);
2586 count++;
2587 }
2588 if (verbose > 0)
2589 printf("Processed %d entries\n", count);
2590}
2591
2592
2593
2594int utf8agemax(struct tree *, const char *);
2595int utf8nagemax(struct tree *, const char *, size_t);
2596int utf8agemin(struct tree *, const char *);
2597int utf8nagemin(struct tree *, const char *, size_t);
2598ssize_t utf8len(struct tree *, const char *);
2599ssize_t utf8nlen(struct tree *, const char *, size_t);
2600struct utf8cursor;
2601int utf8cursor(struct utf8cursor *, struct tree *, const char *);
2602int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t);
2603int utf8byte(struct utf8cursor *);
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651#define SB (0xAC00)
2652#define LB (0x1100)
2653#define VB (0x1161)
2654#define TB (0x11A7)
2655#define LC (19)
2656#define VC (21)
2657#define TC (28)
2658#define NC (VC * TC)
2659#define SC (LC * NC)
2660
2661
2662static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
2663{
2664 unsigned int si;
2665 unsigned int li;
2666 unsigned int vi;
2667 unsigned int ti;
2668 unsigned char *h;
2669
2670
2671 si = utf8decode(str) - SB;
2672 li = si / NC;
2673 vi = (si % NC) / TC;
2674 ti = si % TC;
2675
2676
2677 h = hangul;
2678 LEAF_GEN(h) = 2;
2679 LEAF_CCC(h) = DECOMPOSE;
2680 h += 2;
2681
2682
2683 h += utf8encode((char *)h, li + LB);
2684
2685
2686 h += utf8encode((char *)h, vi + VB);
2687
2688
2689 if (ti)
2690 h += utf8encode((char *)h, ti + TB);
2691
2692
2693 h[0] = '\0';
2694
2695 return hangul;
2696}
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul,
2707 const char *s, size_t len)
2708{
2709 utf8trie_t *trie;
2710 int offlen;
2711 int offset;
2712 int mask;
2713 int node;
2714
2715 if (!tree)
2716 return NULL;
2717 if (len == 0)
2718 return NULL;
2719 node = 1;
2720 trie = utf8data + tree->index;
2721 while (node) {
2722 offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
2723 if (*trie & NEXTBYTE) {
2724 if (--len == 0)
2725 return NULL;
2726 s++;
2727 }
2728 mask = 1 << (*trie & BITNUM);
2729 if (*s & mask) {
2730
2731 if (offlen) {
2732
2733 node = (*trie & RIGHTNODE);
2734 offset = trie[offlen];
2735 while (--offlen) {
2736 offset <<= 8;
2737 offset |= trie[offlen];
2738 }
2739 trie += offset;
2740 } else if (*trie & RIGHTPATH) {
2741
2742 node = (*trie & TRIENODE);
2743 trie++;
2744 } else {
2745
2746 return NULL;
2747 }
2748 } else {
2749
2750 if (offlen) {
2751
2752 node = (*trie & LEFTNODE);
2753 trie += offlen + 1;
2754 } else if (*trie & RIGHTPATH) {
2755
2756 return NULL;
2757 } else {
2758
2759 node = (*trie & TRIENODE);
2760 trie++;
2761 }
2762 }
2763 }
2764
2765
2766
2767
2768
2769
2770 if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
2771 trie = utf8hangul(s - 2, hangul);
2772 return trie;
2773}
2774
2775
2776
2777
2778
2779
2780
2781static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul,
2782 const char *s)
2783{
2784 return utf8nlookup(tree, hangul, s, (size_t)-1);
2785}
2786
2787
2788
2789
2790
2791
2792static inline int utf8clen(const char *s)
2793{
2794 unsigned char c = *s;
2795 return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
2796}
2797
2798
2799
2800
2801
2802
2803int utf8agemax(struct tree *tree, const char *s)
2804{
2805 utf8leaf_t *leaf;
2806 int age = 0;
2807 int leaf_age;
2808 unsigned char hangul[UTF8HANGULLEAF];
2809
2810 if (!tree)
2811 return -1;
2812
2813 while (*s) {
2814 leaf = utf8lookup(tree, hangul, s);
2815 if (!leaf)
2816 return -1;
2817 leaf_age = ages[LEAF_GEN(leaf)];
2818 if (leaf_age <= tree->maxage && leaf_age > age)
2819 age = leaf_age;
2820 s += utf8clen(s);
2821 }
2822 return age;
2823}
2824
2825
2826
2827
2828
2829
2830int utf8agemin(struct tree *tree, const char *s)
2831{
2832 utf8leaf_t *leaf;
2833 int age;
2834 int leaf_age;
2835 unsigned char hangul[UTF8HANGULLEAF];
2836
2837 if (!tree)
2838 return -1;
2839 age = tree->maxage;
2840 while (*s) {
2841 leaf = utf8lookup(tree, hangul, s);
2842 if (!leaf)
2843 return -1;
2844 leaf_age = ages[LEAF_GEN(leaf)];
2845 if (leaf_age <= tree->maxage && leaf_age < age)
2846 age = leaf_age;
2847 s += utf8clen(s);
2848 }
2849 return age;
2850}
2851
2852
2853
2854
2855
2856int utf8nagemax(struct tree *tree, const char *s, size_t len)
2857{
2858 utf8leaf_t *leaf;
2859 int age = 0;
2860 int leaf_age;
2861 unsigned char hangul[UTF8HANGULLEAF];
2862
2863 if (!tree)
2864 return -1;
2865
2866 while (len && *s) {
2867 leaf = utf8nlookup(tree, hangul, s, len);
2868 if (!leaf)
2869 return -1;
2870 leaf_age = ages[LEAF_GEN(leaf)];
2871 if (leaf_age <= tree->maxage && leaf_age > age)
2872 age = leaf_age;
2873 len -= utf8clen(s);
2874 s += utf8clen(s);
2875 }
2876 return age;
2877}
2878
2879
2880
2881
2882
2883int utf8nagemin(struct tree *tree, const char *s, size_t len)
2884{
2885 utf8leaf_t *leaf;
2886 int leaf_age;
2887 int age;
2888 unsigned char hangul[UTF8HANGULLEAF];
2889
2890 if (!tree)
2891 return -1;
2892 age = tree->maxage;
2893 while (len && *s) {
2894 leaf = utf8nlookup(tree, hangul, s, len);
2895 if (!leaf)
2896 return -1;
2897 leaf_age = ages[LEAF_GEN(leaf)];
2898 if (leaf_age <= tree->maxage && leaf_age < age)
2899 age = leaf_age;
2900 len -= utf8clen(s);
2901 s += utf8clen(s);
2902 }
2903 return age;
2904}
2905
2906
2907
2908
2909
2910
2911
2912ssize_t utf8len(struct tree *tree, const char *s)
2913{
2914 utf8leaf_t *leaf;
2915 size_t ret = 0;
2916 unsigned char hangul[UTF8HANGULLEAF];
2917
2918 if (!tree)
2919 return -1;
2920 while (*s) {
2921 leaf = utf8lookup(tree, hangul, s);
2922 if (!leaf)
2923 return -1;
2924 if (ages[LEAF_GEN(leaf)] > tree->maxage)
2925 ret += utf8clen(s);
2926 else if (LEAF_CCC(leaf) == DECOMPOSE)
2927 ret += strlen(LEAF_STR(leaf));
2928 else
2929 ret += utf8clen(s);
2930 s += utf8clen(s);
2931 }
2932 return ret;
2933}
2934
2935
2936
2937
2938
2939ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
2940{
2941 utf8leaf_t *leaf;
2942 size_t ret = 0;
2943 unsigned char hangul[UTF8HANGULLEAF];
2944
2945 if (!tree)
2946 return -1;
2947 while (len && *s) {
2948 leaf = utf8nlookup(tree, hangul, s, len);
2949 if (!leaf)
2950 return -1;
2951 if (ages[LEAF_GEN(leaf)] > tree->maxage)
2952 ret += utf8clen(s);
2953 else if (LEAF_CCC(leaf) == DECOMPOSE)
2954 ret += strlen(LEAF_STR(leaf));
2955 else
2956 ret += utf8clen(s);
2957 len -= utf8clen(s);
2958 s += utf8clen(s);
2959 }
2960 return ret;
2961}
2962
2963
2964
2965
2966struct utf8cursor {
2967 struct tree *tree;
2968 const char *s;
2969 const char *p;
2970 const char *ss;
2971 const char *sp;
2972 unsigned int len;
2973 unsigned int slen;
2974 short int ccc;
2975 short int nccc;
2976 unsigned int unichar;
2977 unsigned char hangul[UTF8HANGULLEAF];
2978};
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
2991 size_t len)
2992{
2993 if (!tree)
2994 return -1;
2995 if (!s)
2996 return -1;
2997 u8c->tree = tree;
2998 u8c->s = s;
2999 u8c->p = NULL;
3000 u8c->ss = NULL;
3001 u8c->sp = NULL;
3002 u8c->len = len;
3003 u8c->slen = 0;
3004 u8c->ccc = STOPPER;
3005 u8c->nccc = STOPPER;
3006 u8c->unichar = 0;
3007
3008 if (u8c->len != len)
3009 return -1;
3010
3011 if (len > 0 && (*s & 0xC0) == 0x80)
3012 return -1;
3013 return 0;
3014}
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
3026{
3027 return utf8ncursor(u8c, tree, s, (unsigned int)-1);
3028}
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057int utf8byte(struct utf8cursor *u8c)
3058{
3059 utf8leaf_t *leaf;
3060 int ccc;
3061
3062 for (;;) {
3063
3064 if (u8c->p && *u8c->s == '\0') {
3065 u8c->s = u8c->p;
3066 u8c->p = NULL;
3067 }
3068
3069
3070 if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
3071
3072 if (u8c->ccc == STOPPER)
3073 return 0;
3074
3075 ccc = STOPPER;
3076 goto ccc_mismatch;
3077 } else if ((*u8c->s & 0xC0) == 0x80) {
3078
3079 if (!u8c->p)
3080 u8c->len--;
3081 return (unsigned char)*u8c->s++;
3082 }
3083
3084
3085 if (u8c->p) {
3086 leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
3087 } else {
3088 leaf = utf8nlookup(u8c->tree, u8c->hangul,
3089 u8c->s, u8c->len);
3090 }
3091
3092
3093 if (!leaf)
3094 return -1;
3095
3096
3097 if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
3098 ccc = STOPPER;
3099 } else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
3100 u8c->len -= utf8clen(u8c->s);
3101 u8c->p = u8c->s + utf8clen(u8c->s);
3102 u8c->s = LEAF_STR(leaf);
3103
3104 if (*u8c->s == '\0') {
3105 if (u8c->ccc == STOPPER)
3106 continue;
3107 ccc = STOPPER;
3108 goto ccc_mismatch;
3109 }
3110 leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
3111 ccc = LEAF_CCC(leaf);
3112 }
3113 u8c->unichar = utf8decode(u8c->s);
3114
3115
3116
3117
3118
3119 if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
3120 u8c->nccc = ccc;
3121
3122
3123
3124
3125
3126 if (ccc == u8c->ccc) {
3127 if (!u8c->p)
3128 u8c->len--;
3129 return (unsigned char)*u8c->s++;
3130 }
3131
3132
3133 ccc_mismatch:
3134 if (u8c->nccc == STOPPER) {
3135
3136
3137
3138
3139
3140 assert(u8c->ccc == STOPPER);
3141 u8c->ccc = MINCCC - 1;
3142 u8c->nccc = ccc;
3143 u8c->sp = u8c->p;
3144 u8c->ss = u8c->s;
3145 u8c->slen = u8c->len;
3146 if (!u8c->p)
3147 u8c->len -= utf8clen(u8c->s);
3148 u8c->s += utf8clen(u8c->s);
3149 } else if (ccc != STOPPER) {
3150
3151 if (!u8c->p)
3152 u8c->len -= utf8clen(u8c->s);
3153 u8c->s += utf8clen(u8c->s);
3154 } else if (u8c->nccc != MAXCCC + 1) {
3155
3156 u8c->ccc = u8c->nccc;
3157 u8c->nccc = MAXCCC + 1;
3158 u8c->s = u8c->ss;
3159 u8c->p = u8c->sp;
3160 u8c->len = u8c->slen;
3161 } else {
3162
3163 u8c->ccc = STOPPER;
3164 u8c->nccc = STOPPER;
3165 u8c->sp = NULL;
3166 u8c->ss = NULL;
3167 u8c->slen = 0;
3168 }
3169 }
3170}
3171
3172
3173
3174static int normalize_line(struct tree *tree)
3175{
3176 char *s;
3177 char *t;
3178 int c;
3179 struct utf8cursor u8c;
3180
3181
3182 s = buf2;
3183 t = buf3;
3184 if (utf8cursor(&u8c, tree, s))
3185 return -1;
3186 while ((c = utf8byte(&u8c)) > 0)
3187 if (c != (unsigned char)*t++)
3188 return -1;
3189 if (c < 0)
3190 return -1;
3191 if (*t != 0)
3192 return -1;
3193
3194
3195 s = buf2;
3196
3197 s[strlen(s) + 1] = -1;
3198 t = buf3;
3199 if (utf8cursor(&u8c, tree, s))
3200 return -1;
3201 while ((c = utf8byte(&u8c)) > 0)
3202 if (c != (unsigned char)*t++)
3203 return -1;
3204 if (c < 0)
3205 return -1;
3206 if (*t != 0)
3207 return -1;
3208
3209 return 0;
3210}
3211
3212static void normalization_test(void)
3213{
3214 FILE *file;
3215 unsigned int unichar;
3216 struct unicode_data *data;
3217 char *s;
3218 char *t;
3219 int ret;
3220 int ignorables;
3221 int tests = 0;
3222 int failures = 0;
3223
3224 if (verbose > 0)
3225 printf("Parsing %s\n", test_name);
3226
3227 file = fopen(test_name, "r");
3228 if (!file)
3229 open_fail(test_name, errno);
3230
3231 while (fgets(line, LINESIZE, file)) {
3232 ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];",
3233 buf0, buf1);
3234 if (ret != 2 || *line == '#')
3235 continue;
3236 s = buf0;
3237 t = buf2;
3238 while (*s) {
3239 unichar = strtoul(s, &s, 16);
3240 t += utf8encode(t, unichar);
3241 }
3242 *t = '\0';
3243
3244 ignorables = 0;
3245 s = buf1;
3246 t = buf3;
3247 while (*s) {
3248 unichar = strtoul(s, &s, 16);
3249 data = &unicode_data[unichar];
3250 if (data->utf8nfdi && !*data->utf8nfdi)
3251 ignorables = 1;
3252 else
3253 t += utf8encode(t, unichar);
3254 }
3255 *t = '\0';
3256
3257 tests++;
3258 if (normalize_line(nfdi_tree) < 0) {
3259 printf("Line %s -> %s", buf0, buf1);
3260 if (ignorables)
3261 printf(" (ignorables removed)");
3262 printf(" failure\n");
3263 failures++;
3264 }
3265 }
3266 fclose(file);
3267 if (verbose > 0)
3268 printf("Ran %d tests with %d failures\n", tests, failures);
3269 if (failures)
3270 file_fail(test_name);
3271}
3272
3273
3274
3275static void write_file(void)
3276{
3277 FILE *file;
3278 int i;
3279 int j;
3280 int t;
3281 int gen;
3282
3283 if (verbose > 0)
3284 printf("Writing %s\n", utf8_name);
3285 file = fopen(utf8_name, "w");
3286 if (!file)
3287 open_fail(utf8_name, errno);
3288
3289 fprintf(file, "/* This file is generated code, do not edit. */\n");
3290 fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
3291 fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n");
3292 fprintf(file, "#endif\n");
3293 fprintf(file, "\n");
3294 fprintf(file, "static const unsigned int utf8vers = %#x;\n",
3295 unicode_maxage);
3296 fprintf(file, "\n");
3297 fprintf(file, "static const unsigned int utf8agetab[] = {\n");
3298 for (i = 0; i != ages_count; i++)
3299 fprintf(file, "\t%#x%s\n", ages[i],
3300 ages[i] == unicode_maxage ? "" : ",");
3301 fprintf(file, "};\n");
3302 fprintf(file, "\n");
3303 fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n");
3304 t = 0;
3305 for (gen = 0; gen < ages_count; gen++) {
3306 fprintf(file, "\t{ %#x, %d }%s\n",
3307 ages[gen], trees[t].index,
3308 ages[gen] == unicode_maxage ? "" : ",");
3309 if (trees[t].maxage == ages[gen])
3310 t += 2;
3311 }
3312 fprintf(file, "};\n");
3313 fprintf(file, "\n");
3314 fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n");
3315 t = 1;
3316 for (gen = 0; gen < ages_count; gen++) {
3317 fprintf(file, "\t{ %#x, %d }%s\n",
3318 ages[gen], trees[t].index,
3319 ages[gen] == unicode_maxage ? "" : ",");
3320 if (trees[t].maxage == ages[gen])
3321 t += 2;
3322 }
3323 fprintf(file, "};\n");
3324 fprintf(file, "\n");
3325 fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
3326 utf8data_size);
3327 t = 0;
3328 for (i = 0; i != utf8data_size; i += 16) {
3329 if (i == trees[t].index) {
3330 fprintf(file, "\t/* %s_%x */\n",
3331 trees[t].type, trees[t].maxage);
3332 if (t < trees_count-1)
3333 t++;
3334 }
3335 fprintf(file, "\t");
3336 for (j = i; j != i + 16; j++)
3337 fprintf(file, "0x%.2x%s", utf8data[j],
3338 (j < utf8data_size -1 ? "," : ""));
3339 fprintf(file, "\n");
3340 }
3341 fprintf(file, "};\n");
3342 fclose(file);
3343}
3344
3345
3346
3347int main(int argc, char *argv[])
3348{
3349 unsigned int unichar;
3350 int opt;
3351
3352 argv0 = argv[0];
3353
3354 while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
3355 switch (opt) {
3356 case 'a':
3357 age_name = optarg;
3358 break;
3359 case 'c':
3360 ccc_name = optarg;
3361 break;
3362 case 'd':
3363 data_name = optarg;
3364 break;
3365 case 'f':
3366 fold_name = optarg;
3367 break;
3368 case 'n':
3369 norm_name = optarg;
3370 break;
3371 case 'o':
3372 utf8_name = optarg;
3373 break;
3374 case 'p':
3375 prop_name = optarg;
3376 break;
3377 case 't':
3378 test_name = optarg;
3379 break;
3380 case 'v':
3381 verbose++;
3382 break;
3383 case 'h':
3384 help();
3385 exit(0);
3386 default:
3387 usage();
3388 }
3389 }
3390
3391 if (verbose > 1)
3392 help();
3393 for (unichar = 0; unichar != 0x110000; unichar++)
3394 unicode_data[unichar].code = unichar;
3395 age_init();
3396 ccc_init();
3397 nfdi_init();
3398 nfdicf_init();
3399 ignore_init();
3400 corrections_init();
3401 hangul_decompose();
3402 nfdi_decompose();
3403 nfdicf_decompose();
3404 utf8_init();
3405 trees_init();
3406 trees_populate();
3407 trees_reduce();
3408 trees_verify();
3409
3410 (void)lookup(nfdi_tree, " ");
3411 if (verbose > 2)
3412 tree_walk(nfdi_tree);
3413 if (verbose > 2)
3414 tree_walk(nfdicf_tree);
3415 normalization_test();
3416 write_file();
3417
3418 return 0;
3419}
3420