1
2
3
4
5
6
7#include "utf8n.h"
8
/*
 * Descriptor for one normalization table.  Instances are looked up by
 * utf8nfdi()/utf8nfdicf() and handed to the lookup and cursor routines.
 */
struct utf8data {
	/* Ages from utf8agetab greater than this are treated as too new. */
	unsigned int maxage;
	/* Added to the utf8data base to locate the root of the trie. */
	unsigned int offset;
};
13
14#define __INCLUDED_FROM_UTF8NORM_C__
15#include "utf8data.h"
16#undef __INCLUDED_FROM_UTF8NORM_C__
17
18int utf8version_is_supported(u8 maj, u8 min, u8 rev)
19{
20 int i = ARRAY_SIZE(utf8agetab) - 1;
21 unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev);
22
23 while (i >= 0 && utf8agetab[i] != 0) {
24 if (sb_utf8version == utf8agetab[i])
25 return 1;
26 i--;
27 }
28 return 0;
29}
30EXPORT_SYMBOL(utf8version_is_supported);
31
32int utf8version_latest(void)
33{
34 return utf8vers;
35}
36EXPORT_SYMBOL(utf8version_latest);
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
/*
 * utf8clen - length in bytes implied by the first byte of a sequence.
 * @s: pointer to the first byte; only that one byte is read.
 *
 * Returns 4 for a lead byte >= 0xF0, 3 for >= 0xE0, 2 for >= 0xC0 and
 * 1 for everything else (including continuation bytes and NUL).  The
 * sequence itself is not validated.
 */
static inline int utf8clen(const char *s)
{
	const unsigned char lead = (unsigned char)*s;

	if (lead >= 0xF0)
		return 4;
	if (lead >= 0xE0)
		return 3;
	if (lead >= 0xC0)
		return 2;
	return 1;
}
97
98
99
100
/*
 * utf8decode3 - decode a three-byte UTF-8 sequence.
 * @str: pointer to exactly three readable bytes.
 *
 * Takes the low 4 bits of the first byte and the low 6 bits of each of
 * the next two bytes and packs them into a 16-bit value.  No validation
 * of the lead or continuation markers is performed.
 */
static unsigned int
utf8decode3(const char *str)
{
	const unsigned int top = str[0] & 0x0F;
	const unsigned int mid = str[1] & 0x3F;
	const unsigned int low = str[2] & 0x3F;

	return (top << 12) | (mid << 6) | low;
}
114
115
116
117
/*
 * utf8encode3 - write @val as a three-byte UTF-8 sequence.
 * @str: destination with room for at least three bytes.
 * @val: value to encode; bits above bit 11 all land in the first byte.
 *
 * No terminator is written.  Always returns 3, the number of bytes
 * stored, so callers can advance their output pointer by the result.
 */
static int
utf8encode3(char *str, unsigned int val)
{
	const unsigned int low = val & 0x3F;
	const unsigned int mid = (val >> 6) & 0x3F;
	const unsigned int top = val >> 12;

	str[0] = top | 0xE0;
	str[1] = mid | 0x80;
	str[2] = low | 0x80;

	return 3;
}
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
/*
 * A trie is stored as a flat array of read-only bytes.  The first byte
 * of every node is a flag byte, decoded by utf8nlookup() as follows:
 *
 *  BITNUM       bit number (0-7) of the input byte to test.
 *  NEXTBYTE     advance to the next input byte before testing.
 *  OFFLEN       number of offset bytes that follow the flag byte,
 *               extracted with OFFLEN_SHIFT.
 *
 * When the offset length is zero there is a single child directly after
 * this node:
 *  RIGHTPATH    the child is reached on a set bit (otherwise on a clear
 *               bit); taking the other branch fails the lookup.
 *  TRIENODE     the child is another node rather than a leaf.
 *
 * When the offset length is non-zero both children exist:
 *  RIGHTNODE    the child reached via the stored offset is a node.
 *  LEFTNODE     the child directly after the offset bytes is a node.
 *
 * RIGHTPATH/RIGHTNODE and TRIENODE/LEFTNODE share bit values; which
 * interpretation applies depends on the offset length.
 */
typedef const unsigned char utf8trie_t;
#define BITNUM		0x07
#define NEXTBYTE	0x08
#define OFFLEN		0x30
#define OFFLEN_SHIFT	4
#define RIGHTPATH	0x40
#define TRIENODE	0x80
#define RIGHTNODE	0x40
#define LEFTNODE	0x80
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
/*
 * A leaf is a read-only byte sequence:
 *  byte 0   generation, used as an index into utf8agetab.
 *  byte 1   combining class value, or DECOMPOSE.
 *  byte 2+  only meaningful when byte 1 is DECOMPOSE: a NUL-terminated
 *           replacement string.
 */
typedef const unsigned char utf8leaf_t;

#define LEAF_GEN(LEAF)	((LEAF)[0])
#define LEAF_CCC(LEAF)	((LEAF)[1])
#define LEAF_STR(LEAF)	((const char *)((LEAF) + 2))

/*
 * Combining class values used by utf8byte().  STOPPER (0) marks the
 * boundary at which a reordering scan ends; DECOMPOSE in the CCC slot
 * means the leaf carries a replacement string instead.
 */
#define MINCCC		(0)
#define MAXCCC		(254)
#define STOPPER		(0)
#define DECOMPOSE	(255)

/*
 * Marker compared against the first byte of a leaf's replacement
 * string; when it matches, utf8nlookup() builds the leaf with
 * utf8hangul() instead of using the stored string.
 */
#define HANGUL		((char)(255))

/*
 * Size of the scratch buffer utf8hangul() fills: 2 header bytes, up to
 * three 3-byte sequences, and a terminating NUL.
 */
#define UTF8HANGULLEAF	(12)
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
/*
 * Constants used by utf8hangul() to split a precomposed Hangul syllable
 * into its parts arithmetically.  The values match the SBase/LBase/
 * VBase/TBase and LCount/VCount/TCount constants of the Unicode
 * standard's Hangul decomposition algorithm.
 */
#define SB	(0xAC00)	/* subtracted from the decoded code point */
#define LB	(0x1100)	/* base added to the first (leading) index */
#define VB	(0x1161)	/* base added to the second (vowel) index */
#define TB	(0x11A7)	/* base added to a non-zero third (trailing) index */
#define LC	(19)		/* number of leading values */
#define VC	(21)		/* number of vowel values */
#define TC	(28)		/* number of trailing values, including "none" */
#define NC	(VC * TC)	/* syllables per leading value */
#define SC	(LC * NC)	/* total number of syllables */
272
273
274static utf8leaf_t *
275utf8hangul(const char *str, unsigned char *hangul)
276{
277 unsigned int si;
278 unsigned int li;
279 unsigned int vi;
280 unsigned int ti;
281 unsigned char *h;
282
283
284 si = utf8decode3(str) - SB;
285 li = si / NC;
286 vi = (si % NC) / TC;
287 ti = si % TC;
288
289
290 h = hangul;
291 LEAF_GEN(h) = 2;
292 LEAF_CCC(h) = DECOMPOSE;
293 h += 2;
294
295
296 h += utf8encode3((char *)h, li + LB);
297
298
299 h += utf8encode3((char *)h, vi + VB);
300
301
302 if (ti)
303 h += utf8encode3((char *)h, ti + TB);
304
305
306 h[0] = '\0';
307
308 return hangul;
309}
310
311
312
313
314
315
316
317
318
/*
 * utf8nlookup - walk the trie for the sequence at @s, using at most
 * @len bytes of input.
 * @data:   table descriptor; its offset selects the trie root.
 * @hangul: scratch buffer used when the leaf must be built by
 *          utf8hangul().
 * @s:      input bytes.
 * @len:    number of bytes available at @s.
 *
 * Returns the leaf reached by the walk, or NULL when @data is NULL,
 * @len is zero, the input runs out before a leaf is reached, or the
 * trie has no branch for the input.
 */
static utf8leaf_t *utf8nlookup(const struct utf8data *data,
			       unsigned char *hangul, const char *s, size_t len)
{
	utf8trie_t *trie = NULL;
	int offlen;
	int offset;
	int mask;
	int node;

	if (!data)
		return NULL;
	if (len == 0)
		return NULL;

	trie = utf8data + data->offset;
	node = 1;
	/* Descend until the current position is a leaf, not a node. */
	while (node) {
		offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
		if (*trie & NEXTBYTE) {
			/* Need another input byte; fail if none is left. */
			if (--len == 0)
				return NULL;
			s++;
		}
		mask = 1 << (*trie & BITNUM);
		if (*s & mask) {
			/* Tested bit is set: take the right branch. */
			if (offlen) {
				/*
				 * Right child lives at an offset stored in
				 * the offlen bytes after the flag byte,
				 * most significant byte last.
				 */
				node = (*trie & RIGHTNODE);
				offset = trie[offlen];
				while (--offlen) {
					offset <<= 8;
					offset |= trie[offlen];
				}
				trie += offset;
			} else if (*trie & RIGHTPATH) {
				/* Only child is the right one, stored next. */
				node = (*trie & TRIENODE);
				trie++;
			} else {
				/* There is no right child. */
				return NULL;
			}
		} else {
			/* Tested bit is clear: take the left branch. */
			if (offlen) {
				/* Left child follows the offset bytes. */
				node = (*trie & LEFTNODE);
				trie += offlen + 1;
			} else if (*trie & RIGHTPATH) {
				/* Only child is the right one: no left child. */
				return NULL;
			} else {
				/* Only child is the left one, stored next. */
				node = (*trie & TRIENODE);
				trie++;
			}
		}
	}

	/*
	 * A leaf whose replacement string starts with the HANGUL marker is
	 * decomposed arithmetically rather than from stored data.  The
	 * sequence handed to utf8hangul() is three bytes long, so the walk
	 * is expected to have advanced s twice and the sequence starts at
	 * s - 2.
	 */
	if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
		trie = utf8hangul(s - 2, hangul);
	return trie;
}
388
389
390
391
392
393
394
395static utf8leaf_t *utf8lookup(const struct utf8data *data,
396 unsigned char *hangul, const char *s)
397{
398 return utf8nlookup(data, hangul, s, (size_t)-1);
399}
400
401
402
403
404
405
406int utf8agemax(const struct utf8data *data, const char *s)
407{
408 utf8leaf_t *leaf;
409 int age = 0;
410 int leaf_age;
411 unsigned char hangul[UTF8HANGULLEAF];
412
413 if (!data)
414 return -1;
415
416 while (*s) {
417 leaf = utf8lookup(data, hangul, s);
418 if (!leaf)
419 return -1;
420
421 leaf_age = utf8agetab[LEAF_GEN(leaf)];
422 if (leaf_age <= data->maxage && leaf_age > age)
423 age = leaf_age;
424 s += utf8clen(s);
425 }
426 return age;
427}
428EXPORT_SYMBOL(utf8agemax);
429
430
431
432
433
434
435int utf8agemin(const struct utf8data *data, const char *s)
436{
437 utf8leaf_t *leaf;
438 int age;
439 int leaf_age;
440 unsigned char hangul[UTF8HANGULLEAF];
441
442 if (!data)
443 return -1;
444 age = data->maxage;
445 while (*s) {
446 leaf = utf8lookup(data, hangul, s);
447 if (!leaf)
448 return -1;
449 leaf_age = utf8agetab[LEAF_GEN(leaf)];
450 if (leaf_age <= data->maxage && leaf_age < age)
451 age = leaf_age;
452 s += utf8clen(s);
453 }
454 return age;
455}
456EXPORT_SYMBOL(utf8agemin);
457
458
459
460
461
462int utf8nagemax(const struct utf8data *data, const char *s, size_t len)
463{
464 utf8leaf_t *leaf;
465 int age = 0;
466 int leaf_age;
467 unsigned char hangul[UTF8HANGULLEAF];
468
469 if (!data)
470 return -1;
471
472 while (len && *s) {
473 leaf = utf8nlookup(data, hangul, s, len);
474 if (!leaf)
475 return -1;
476 leaf_age = utf8agetab[LEAF_GEN(leaf)];
477 if (leaf_age <= data->maxage && leaf_age > age)
478 age = leaf_age;
479 len -= utf8clen(s);
480 s += utf8clen(s);
481 }
482 return age;
483}
484EXPORT_SYMBOL(utf8nagemax);
485
486
487
488
489
490int utf8nagemin(const struct utf8data *data, const char *s, size_t len)
491{
492 utf8leaf_t *leaf;
493 int leaf_age;
494 int age;
495 unsigned char hangul[UTF8HANGULLEAF];
496
497 if (!data)
498 return -1;
499 age = data->maxage;
500 while (len && *s) {
501 leaf = utf8nlookup(data, hangul, s, len);
502 if (!leaf)
503 return -1;
504 leaf_age = utf8agetab[LEAF_GEN(leaf)];
505 if (leaf_age <= data->maxage && leaf_age < age)
506 age = leaf_age;
507 len -= utf8clen(s);
508 s += utf8clen(s);
509 }
510 return age;
511}
512EXPORT_SYMBOL(utf8nagemin);
513
514
515
516
517
518
519
520ssize_t utf8len(const struct utf8data *data, const char *s)
521{
522 utf8leaf_t *leaf;
523 size_t ret = 0;
524 unsigned char hangul[UTF8HANGULLEAF];
525
526 if (!data)
527 return -1;
528 while (*s) {
529 leaf = utf8lookup(data, hangul, s);
530 if (!leaf)
531 return -1;
532 if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
533 ret += utf8clen(s);
534 else if (LEAF_CCC(leaf) == DECOMPOSE)
535 ret += strlen(LEAF_STR(leaf));
536 else
537 ret += utf8clen(s);
538 s += utf8clen(s);
539 }
540 return ret;
541}
542EXPORT_SYMBOL(utf8len);
543
544
545
546
547
548ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len)
549{
550 utf8leaf_t *leaf;
551 size_t ret = 0;
552 unsigned char hangul[UTF8HANGULLEAF];
553
554 if (!data)
555 return -1;
556 while (len && *s) {
557 leaf = utf8nlookup(data, hangul, s, len);
558 if (!leaf)
559 return -1;
560 if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
561 ret += utf8clen(s);
562 else if (LEAF_CCC(leaf) == DECOMPOSE)
563 ret += strlen(LEAF_STR(leaf));
564 else
565 ret += utf8clen(s);
566 len -= utf8clen(s);
567 s += utf8clen(s);
568 }
569 return ret;
570}
571EXPORT_SYMBOL(utf8nlen);
572
573
574
575
576
577
578
579
580
581
582
583int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
584 const char *s, size_t len)
585{
586 if (!data)
587 return -1;
588 if (!s)
589 return -1;
590 u8c->data = data;
591 u8c->s = s;
592 u8c->p = NULL;
593 u8c->ss = NULL;
594 u8c->sp = NULL;
595 u8c->len = len;
596 u8c->slen = 0;
597 u8c->ccc = STOPPER;
598 u8c->nccc = STOPPER;
599
600 if (u8c->len != len)
601 return -1;
602
603 if (len > 0 && (*s & 0xC0) == 0x80)
604 return -1;
605 return 0;
606}
607EXPORT_SYMBOL(utf8ncursor);
608
609
610
611
612
613
614
615
616
617
/*
 * utf8cursor - initialise a cursor over a NUL-terminated string.
 * @u8c:  cursor to initialise.
 * @data: table descriptor to normalise with.
 * @s:    NUL-terminated input.
 *
 * Delegates to utf8ncursor() with the length set to the largest
 * unsigned int value, and returns its result.
 */
int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
	       const char *s)
{
	const unsigned int no_limit = (unsigned int)-1;

	return utf8ncursor(u8c, data, s, no_limit);
}
EXPORT_SYMBOL(utf8cursor);
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
/*
 * utf8byte - produce the next byte of the normalized form of the string
 * described by @u8c.
 *
 * Returns the byte as an unsigned char value, 0 at the end of the
 * string, or -1 when a character cannot be looked up.
 *
 * Cursor state used below:
 *  s / len    current read position and the bytes remaining in the
 *             original input (len is only adjusted while reading the
 *             original input, i.e. while p is NULL).
 *  p          while a replacement string is being read through s, the
 *             position in the original input to resume from.
 *  ccc        combining class currently being emitted.
 *  nccc       next combining class to emit once the current pass ends.
 *  ss/sp/slen saved s/p/len marking where a reordering scan started.
 */
int utf8byte(struct utf8cursor *u8c)
{
	utf8leaf_t *leaf;
	int ccc;

	for (;;) {
		/* End of a replacement string: resume the original input. */
		if (u8c->p && *u8c->s == '\0') {
			u8c->s = u8c->p;
			u8c->p = NULL;
		}

		/* End of the original input? */
		if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
			/* Not scanning: there is simply nothing left. */
			if (u8c->ccc == STOPPER)
				return 0;
			/* While scanning, end-of-string acts as a stopper. */
			ccc = STOPPER;
			goto ccc_mismatch;
		} else if ((*u8c->s & 0xC0) == 0x80) {
			/* Continuation byte of the current character. */
			if (!u8c->p)
				u8c->len--;
			return (unsigned char)*u8c->s++;
		}

		/*
		 * Look up the character at s.  Replacement strings are
		 * NUL-terminated, so no length limit is applied to them.
		 */
		if (u8c->p) {
			leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
		} else {
			leaf = utf8nlookup(u8c->data, u8c->hangul,
					   u8c->s, u8c->len);
		}

		/* Lookup failure is reported to the caller. */
		if (!leaf)
			return -1;

		ccc = LEAF_CCC(leaf);
		/* Characters newer than maxage are treated as stoppers. */
		if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {
			ccc = STOPPER;
		} else if (ccc == DECOMPOSE) {
			/*
			 * Switch to the replacement string, remembering
			 * where the original input continues.
			 */
			u8c->len -= utf8clen(u8c->s);
			u8c->p = u8c->s + utf8clen(u8c->s);
			u8c->s = LEAF_STR(leaf);
			/* An empty replacement behaves like a stopper. */
			if (*u8c->s == '\0') {
				if (u8c->ccc == STOPPER)
					continue;
				ccc = STOPPER;
				goto ccc_mismatch;
			}

			/* Classify the first replacement character. */
			leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
			if (!leaf)
				return -1;
			ccc = LEAF_CCC(leaf);
		}

		/*
		 * A non-stopper whose class lies strictly between the
		 * class being emitted and the current candidate becomes
		 * the new candidate for the next pass.
		 */
		if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
			u8c->nccc = ccc;

		/*
		 * The character belongs to the class being emitted:
		 * return its first byte.
		 */
		if (ccc == u8c->ccc) {
			if (!u8c->p)
				u8c->len--;
			return (unsigned char)*u8c->s++;
		}

		/* The character is not in the class being emitted. */
ccc_mismatch:
		if (u8c->nccc == STOPPER) {
			/*
			 * Start of a scan: remember this position as the
			 * restart point, seed the candidate class with this
			 * character's class, and skip the character.
			 */
			u8c->ccc = MINCCC - 1;
			u8c->nccc = ccc;
			u8c->sp = u8c->p;
			u8c->ss = u8c->s;
			u8c->slen = u8c->len;
			if (!u8c->p)
				u8c->len -= utf8clen(u8c->s);
			u8c->s += utf8clen(u8c->s);
		} else if (ccc != STOPPER) {
			/* Mid-scan, wrong class: skip the character. */
			if (!u8c->p)
				u8c->len -= utf8clen(u8c->s);
			u8c->s += utf8clen(u8c->s);
		} else if (u8c->nccc != MAXCCC + 1) {
			/*
			 * Reached a stopper with a candidate pending: rewind
			 * to the restart point and emit that class next.
			 */
			u8c->ccc = u8c->nccc;
			u8c->nccc = MAXCCC + 1;
			u8c->s = u8c->ss;
			u8c->p = u8c->sp;
			u8c->len = u8c->slen;
		} else {
			/*
			 * Reached a stopper with no candidate left: the scan
			 * is finished, continue normally from here.
			 */
			u8c->ccc = STOPPER;
			u8c->nccc = STOPPER;
			u8c->sp = NULL;
			u8c->ss = NULL;
			u8c->slen = 0;
		}
	}
}
EXPORT_SYMBOL(utf8byte);
769
770const struct utf8data *utf8nfdi(unsigned int maxage)
771{
772 int i = ARRAY_SIZE(utf8nfdidata) - 1;
773
774 while (maxage < utf8nfdidata[i].maxage)
775 i--;
776 if (maxage > utf8nfdidata[i].maxage)
777 return NULL;
778 return &utf8nfdidata[i];
779}
780EXPORT_SYMBOL(utf8nfdi);
781
782const struct utf8data *utf8nfdicf(unsigned int maxage)
783{
784 int i = ARRAY_SIZE(utf8nfdicfdata) - 1;
785
786 while (maxage < utf8nfdicfdata[i].maxage)
787 i--;
788 if (maxage > utf8nfdicfdata[i].maxage)
789 return NULL;
790 return &utf8nfdicfdata[i];
791}
792EXPORT_SYMBOL(utf8nfdicf);
793