1
2
3
4
5
6
7
8
9
10
11
12
13#include "qemu/osdep.h"
14#include "qemu/unicode.h"
15
/*
 * Report whether @codepoint is acceptable for encoding/decoding.
 *
 * A codepoint is accepted when it lies in 0..0x10FFFF and is neither
 * a surrogate (0xD800..0xDFFF) nor one of the rejected values
 * 0xFDD0..0xFDEF or any value whose low 16 bits are 0xFFFE or 0xFFFF.
 * Negative values are rejected.
 */
static bool is_valid_codepoint(int codepoint)
{
    /* Negative input and anything past the last plane is out of range */
    bool in_range = codepoint >= 0 && codepoint <= 0x10FFFF;
    bool is_surrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
    /* Masking off bit 0 catches both xxFFFE and xxFFFF in every plane */
    bool is_nonchar = (codepoint >= 0xFDD0 && codepoint <= 0xFDEF)
                      || (codepoint & 0xFFFE) == 0xFFFE;

    return in_range && !is_surrogate && !is_nonchar;
}
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
/*
 * Decode the modified UTF-8 sequence at the start of @s.
 *
 * @s:   bytes to decode
 * @n:   number of bytes that may be read from @s
 * @end: receives a pointer just past the bytes consumed
 *
 * If @n is zero or the first byte is zero, *@end is set to @s and -1
 * is returned.  If the first byte is 0xFE, 0xFF or a continuation
 * byte, *@end is set to @s + 1 and -1 is returned.  Otherwise the
 * first byte announces the sequence length; decoding stops at the
 * first missing continuation byte (bytes at or beyond @n count as
 * missing), with *@end pointing at it, and -1 is returned.
 *
 * A complete sequence still yields -1 when is_valid_codepoint()
 * rejects its value or when it is overlong.  The one permitted
 * overlong form is "\xC0\x80", which decodes to 0.
 *
 * Returns: the decoded codepoint, or -1 on failure.
 */
int mod_utf8_codepoint(const char *s, size_t n, char **end)
{
    /* Smallest codepoint that legitimately needs 2, 3, 4, 5, 6 bytes */
    static const int shortest[5] = {
        0x80, 0x800, 0x10000, 0x200000, 0x4000000
    };
    const unsigned char *cur = (const unsigned char *)s;
    unsigned lead, marker, seq_len, idx;
    int acc;
    int result = -1;

    if (n == 0 || *s == 0) {
        /* nothing to decode: consume no bytes */
        *end = (char *)s;
        return -1;
    }

    lead = *cur++;

    if (lead < 0x80) {
        /* single-byte sequence */
        result = lead;
        goto done;
    }
    if (lead >= 0xFE || (lead & 0x40) == 0) {
        /* impossible byte 0xFE/0xFF, or a stray continuation byte */
        goto done;
    }

    /* The count of leading one bits in the first byte is the length */
    seq_len = 0;
    marker = 0x80;
    while (lead & marker) {
        seq_len++;
        marker >>= 1;
    }
    assert(seq_len > 1 && seq_len < 7);

    /* Payload bits of the lead byte sit below the first zero bit */
    acc = lead & (marker - 1);
    for (idx = 1; idx < seq_len; idx++) {
        /* Treat anything past the @n limit as a non-continuation byte */
        unsigned cont = idx < n ? *cur : 0;

        if ((cont & 0xC0) != 0x80) {
            /* continuation byte missing; leave it unconsumed */
            goto done;
        }
        cur++;
        acc <<= 6;
        acc |= cont & 0x3F;
    }

    /* Accept only valid, shortest-form values, plus "\xC0\x80" for 0 */
    if (is_valid_codepoint(acc)
        && (acc >= shortest[seq_len - 2] || (acc == 0 && seq_len == 2))) {
        result = acc;
    }

done:
    *end = (char *)cur;
    return result;
}
112
113
114
115
116
117
118
119
120
121
122
123
/*
 * Encode @codepoint as modified UTF-8 into @buf, followed by a
 * terminating zero byte.
 *
 * @buf:       destination buffer
 * @bufsz:     size of @buf; asserted to be at least 5
 * @codepoint: value to encode
 *
 * Codepoint 0 is written as the two bytes 0xC0 0x80 rather than as a
 * single zero byte.
 *
 * Returns: the number of bytes written, not counting the terminating
 * zero byte, or -1 if is_valid_codepoint() rejects @codepoint (in
 * which case @buf is left untouched).
 */
ssize_t mod_utf8_encode(char buf[], size_t bufsz, int codepoint)
{
    int lead_tag, lead_mask, trail, pos;

    assert(bufsz >= 5);

    if (!is_valid_codepoint(codepoint)) {
        return -1;
    }

    /* 1..0x7F map to themselves; 0 falls through to the 2-byte form */
    if (codepoint > 0 && codepoint <= 0x7F) {
        buf[0] = codepoint & 0x7F;
        buf[1] = 0;
        return 1;
    }

    /* Pick the lead-byte tag, its payload mask, and the trailer count */
    if (codepoint <= 0x7FF) {
        trail = 1;
        lead_tag = 0xC0;
        lead_mask = 0x1F;
    } else if (codepoint <= 0xFFFF) {
        trail = 2;
        lead_tag = 0xE0;
        lead_mask = 0x0F;
    } else {
        trail = 3;
        lead_tag = 0xF0;
        lead_mask = 0x07;
    }

    /* Lead byte carries the top bits; each trailer carries 6 more */
    buf[0] = lead_tag | ((codepoint >> (6 * trail)) & lead_mask);
    for (pos = 1; pos <= trail; pos++) {
        buf[pos] = 0x80 | ((codepoint >> (6 * (trail - pos))) & 0x3F);
    }
    buf[trail + 1] = 0;
    return trail + 1;
}
157