This source file includes the following definitions.
- gb18030_mbc_enc_len
- gb18030_mbc_to_code
- gb18030_code_to_mbc
- gb18030_mbc_case_fold
- gb18030_is_mbc_ambiguous
- gb18030_is_code_ctype
- gb18030_left_adjust_char_head
- gb18030_is_allowed_reverse_match
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31 #include "regenc.h"
32
/*
 * Debug trace hook.  Call sites pass one parenthesized printf argument
 * list: DEBUG_GB18030(("fmt", args)).  Tracing is compiled out by default;
 * change the condition below to 1 to forward the arguments to printf.
 */
#if 0
#define DEBUG_GB18030(arg) printf arg
#else
#define DEBUG_GB18030(arg)
#endif
38
/*
 * Byte classes stored in GB18030_MAP.  The numeric values are spelled out
 * because the table stores them in `char` elements.
 */
enum {
  C1 = 0, /* assigned to 0x00-0x2F, 0x3A-0x3F, 0x7F and 0xFF in GB18030_MAP */
  C2 = 1, /* assigned to 0x40-0x7E and 0x80 in GB18030_MAP */
  C4 = 2, /* assigned to 0x30-0x39; after a CM byte it selects length 4 */
  CM = 3  /* assigned to 0x81-0xFE; the only class that starts a multibyte length */
};
45
46 static const char GB18030_MAP[] = {
47 C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
48 C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
49 C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
50 C4, C4, C4, C4, C4, C4, C4, C4, C4, C4, C1, C1, C1, C1, C1, C1,
51 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
52 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
53 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
54 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C1,
55 C2, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
56 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
57 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
58 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
59 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
60 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
61 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
62 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, C1
63 };
64
65 static int
66 gb18030_mbc_enc_len(const UChar* p)
67 {
68 if (GB18030_MAP[*p] != CM)
69 return 1;
70 p++;
71 if (GB18030_MAP[*p] == C4)
72 return 4;
73 if (GB18030_MAP[*p] == C1)
74 return 1;
75 return 2;
76 }
77
78 static OnigCodePoint
79 gb18030_mbc_to_code(const UChar* p, const UChar* end)
80 {
81 return onigenc_mbn_mbc_to_code(ONIG_ENCODING_GB18030, p, end);
82 }
83
84 static int
85 gb18030_code_to_mbc(OnigCodePoint code, UChar *buf)
86 {
87 return onigenc_mb4_code_to_mbc(ONIG_ENCODING_GB18030, code, buf);
88 }
89
90 static int
91 gb18030_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end,
92 UChar* lower)
93 {
94 return onigenc_mbn_mbc_case_fold(ONIG_ENCODING_GB18030, flag,
95 pp, end, lower);
96 }
97
#if 0
/*
 * Compiled out.  Forwards to the shared onigenc_mbn_is_mbc_ambiguous helper
 * with the GB18030 encoding and returns its result unchanged.
 */
static int
gb18030_is_mbc_ambiguous(OnigCaseFoldType flag,
                         const UChar** pp, const UChar* end)
{
  int result;

  result = onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_GB18030, flag, pp, end);
  return result;
}
#endif
106
107 static int
108 gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype)
109 {
110 return onigenc_mb4_is_code_ctype(ONIG_ENCODING_GB18030, code, ctype);
111 }
112
/*
 * States of the backward byte scan in gb18030_left_adjust_char_head.
 * The scan starts in S_START and moves between states according to the
 * GB18030_MAP class of each byte it visits while walking toward the start
 * of the buffer.
 * NOTE(review): the odd/even parts of the names presumably record the
 * parity of the CM, CM+C4 or C4+CM runs seen so far -- confirm against the
 * transition logic before relying on it.
 */
enum state {
  S_START = 0,          /* nothing scanned yet */

  /* entered from S_START after exactly one byte of the named class */
  S_one_C2,
  S_one_C4,
  S_one_CM,

  /* a run of CM bytes in front of a single trailing byte */
  S_odd_CM_one_CX,
  S_even_CM_one_CX,

  /* scans whose first byte was C4 (entered through S_one_C4) */
  S_one_CMC4,
  S_odd_CMC4,
  S_one_C4_odd_CMC4,
  S_even_CMC4,
  S_one_C4_even_CMC4,

  S_odd_CM_odd_CMC4,
  S_even_CM_odd_CMC4,

  S_odd_CM_even_CMC4,
  S_even_CM_even_CMC4,

  /* scans whose first byte was CM followed by C4 (entered through S_one_CM) */
  S_odd_C4CM,
  S_one_CM_odd_C4CM,
  S_even_C4CM,
  S_one_CM_even_C4CM,

  S_even_CM_odd_C4CM,
  S_odd_CM_odd_C4CM,
  S_even_CM_even_C4CM,
  S_odd_CM_even_C4CM
};
146
147 static UChar*
148 gb18030_left_adjust_char_head(const UChar* start, const UChar* s)
149 {
150 const UChar *p;
151 enum state state = S_START;
152
153 DEBUG_GB18030(("----------------\n"));
154 for (p = s; p >= start; p--) {
155 DEBUG_GB18030(("state %d --(%02x)-->\n", state, *p));
156 switch (state) {
157 case S_START:
158 switch (GB18030_MAP[*p]) {
159 case C1:
160 return (UChar *)s;
161 case C2:
162 state = S_one_C2;
163 break;
164 case C4:
165 state = S_one_C4;
166 break;
167 case CM:
168 state = S_one_CM;
169 break;
170 }
171 break;
172 case S_one_C2:
173 switch (GB18030_MAP[*p]) {
174 case C1:
175 case C2:
176 case C4:
177 return (UChar *)s;
178 case CM:
179 state = S_odd_CM_one_CX;
180 break;
181 }
182 break;
183 case S_one_C4:
184 switch (GB18030_MAP[*p]) {
185 case C1:
186 case C2:
187 case C4:
188 return (UChar *)s;
189 case CM:
190 state = S_one_CMC4;
191 break;
192 }
193 break;
194 case S_one_CM:
195 switch (GB18030_MAP[*p]) {
196 case C1:
197 case C2:
198 return (UChar *)s;
199 case C4:
200 state = S_odd_C4CM;
201 break;
202 case CM:
203 state = S_odd_CM_one_CX;
204 break;
205 }
206 break;
207
208 case S_odd_CM_one_CX:
209 switch (GB18030_MAP[*p]) {
210 case C1:
211 case C2:
212 case C4:
213 return (UChar *)(s - 1);
214 case CM:
215 state = S_even_CM_one_CX;
216 break;
217 }
218 break;
219 case S_even_CM_one_CX:
220 switch (GB18030_MAP[*p]) {
221 case C1:
222 case C2:
223 case C4:
224 return (UChar *)s;
225 case CM:
226 state = S_odd_CM_one_CX;
227 break;
228 }
229 break;
230
231 case S_one_CMC4:
232 switch (GB18030_MAP[*p]) {
233 case C1:
234 case C2:
235 return (UChar *)(s - 1);
236 case C4:
237 state = S_one_C4_odd_CMC4;
238 break;
239 case CM:
240 state = S_even_CM_one_CX;
241 break;
242 }
243 break;
244 case S_odd_CMC4:
245 switch (GB18030_MAP[*p]) {
246 case C1:
247 case C2:
248 return (UChar *)(s - 1);
249 case C4:
250 state = S_one_C4_odd_CMC4;
251 break;
252 case CM:
253 state = S_odd_CM_odd_CMC4;
254 break;
255 }
256 break;
257 case S_one_C4_odd_CMC4:
258 switch (GB18030_MAP[*p]) {
259 case C1:
260 case C2:
261 case C4:
262 return (UChar *)(s - 1);
263 case CM:
264 state = S_even_CMC4;
265 break;
266 }
267 break;
268 case S_even_CMC4:
269 switch (GB18030_MAP[*p]) {
270 case C1:
271 case C2:
272 return (UChar *)(s - 3);
273 case C4:
274 state = S_one_C4_even_CMC4;
275 break;
276 case CM:
277 state = S_odd_CM_even_CMC4;
278 break;
279 }
280 break;
281 case S_one_C4_even_CMC4:
282 switch (GB18030_MAP[*p]) {
283 case C1:
284 case C2:
285 case C4:
286 return (UChar *)(s - 3);
287 case CM:
288 state = S_odd_CMC4;
289 break;
290 }
291 break;
292
293 case S_odd_CM_odd_CMC4:
294 switch (GB18030_MAP[*p]) {
295 case C1:
296 case C2:
297 case C4:
298 return (UChar *)(s - 3);
299 case CM:
300 state = S_even_CM_odd_CMC4;
301 break;
302 }
303 break;
304 case S_even_CM_odd_CMC4:
305 switch (GB18030_MAP[*p]) {
306 case C1:
307 case C2:
308 case C4:
309 return (UChar *)(s - 1);
310 case CM:
311 state = S_odd_CM_odd_CMC4;
312 break;
313 }
314 break;
315
316 case S_odd_CM_even_CMC4:
317 switch (GB18030_MAP[*p]) {
318 case C1:
319 case C2:
320 case C4:
321 return (UChar *)(s - 1);
322 case CM:
323 state = S_even_CM_even_CMC4;
324 break;
325 }
326 break;
327 case S_even_CM_even_CMC4:
328 switch (GB18030_MAP[*p]) {
329 case C1:
330 case C2:
331 case C4:
332 return (UChar *)(s - 3);
333 case CM:
334 state = S_odd_CM_even_CMC4;
335 break;
336 }
337 break;
338
339 case S_odd_C4CM:
340 switch (GB18030_MAP[*p]) {
341 case C1:
342 case C2:
343 case C4:
344 return (UChar *)s;
345 case CM:
346 state = S_one_CM_odd_C4CM;
347 break;
348 }
349 break;
350 case S_one_CM_odd_C4CM:
351 switch (GB18030_MAP[*p]) {
352 case C1:
353 case C2:
354 return (UChar *)(s - 2);
355 case C4:
356 state = S_even_C4CM;
357 break;
358 case CM:
359 state = S_even_CM_odd_C4CM;
360 break;
361 }
362 break;
363 case S_even_C4CM:
364 switch (GB18030_MAP[*p]) {
365 case C1:
366 case C2:
367 case C4:
368 return (UChar *)(s - 2);
369 case CM:
370 state = S_one_CM_even_C4CM;
371 break;
372 }
373 break;
374 case S_one_CM_even_C4CM:
375 switch (GB18030_MAP[*p]) {
376 case C1:
377 case C2:
378 return (UChar *)(s - 0);
379 case C4:
380 state = S_odd_C4CM;
381 break;
382 case CM:
383 state = S_even_CM_even_C4CM;
384 break;
385 }
386 break;
387
388 case S_even_CM_odd_C4CM:
389 switch (GB18030_MAP[*p]) {
390 case C1:
391 case C2:
392 case C4:
393 return (UChar *)(s - 0);
394 case CM:
395 state = S_odd_CM_odd_C4CM;
396 break;
397 }
398 break;
399 case S_odd_CM_odd_C4CM:
400 switch (GB18030_MAP[*p]) {
401 case C1:
402 case C2:
403 case C4:
404 return (UChar *)(s - 2);
405 case CM:
406 state = S_even_CM_odd_C4CM;
407 break;
408 }
409 break;
410
411 case S_even_CM_even_C4CM:
412 switch (GB18030_MAP[*p]) {
413 case C1:
414 case C2:
415 case C4:
416 return (UChar *)(s - 2);
417 case CM:
418 state = S_odd_CM_even_C4CM;
419 break;
420 }
421 break;
422 case S_odd_CM_even_C4CM:
423 switch (GB18030_MAP[*p]) {
424 case C1:
425 case C2:
426 case C4:
427 return (UChar *)(s - 0);
428 case CM:
429 state = S_even_CM_even_C4CM;
430 break;
431 }
432 break;
433 }
434 }
435
436 DEBUG_GB18030(("state %d\n", state));
437 switch (state) {
438 case S_START: return (UChar *)(s - 0);
439 case S_one_C2: return (UChar *)(s - 0);
440 case S_one_C4: return (UChar *)(s - 0);
441 case S_one_CM: return (UChar *)(s - 0);
442
443 case S_odd_CM_one_CX: return (UChar *)(s - 1);
444 case S_even_CM_one_CX: return (UChar *)(s - 0);
445
446 case S_one_CMC4: return (UChar *)(s - 1);
447 case S_odd_CMC4: return (UChar *)(s - 1);
448 case S_one_C4_odd_CMC4: return (UChar *)(s - 1);
449 case S_even_CMC4: return (UChar *)(s - 3);
450 case S_one_C4_even_CMC4: return (UChar *)(s - 3);
451
452 case S_odd_CM_odd_CMC4: return (UChar *)(s - 3);
453 case S_even_CM_odd_CMC4: return (UChar *)(s - 1);
454
455 case S_odd_CM_even_CMC4: return (UChar *)(s - 1);
456 case S_even_CM_even_CMC4: return (UChar *)(s - 3);
457
458 case S_odd_C4CM: return (UChar *)(s - 0);
459 case S_one_CM_odd_C4CM: return (UChar *)(s - 2);
460 case S_even_C4CM: return (UChar *)(s - 2);
461 case S_one_CM_even_C4CM: return (UChar *)(s - 0);
462
463 case S_even_CM_odd_C4CM: return (UChar *)(s - 0);
464 case S_odd_CM_odd_C4CM: return (UChar *)(s - 2);
465 case S_even_CM_even_C4CM: return (UChar *)(s - 2);
466 case S_odd_CM_even_C4CM: return (UChar *)(s - 0);
467 }
468
469 return (UChar* )s;
470 }
471
472 static int
473 gb18030_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED)
474 {
475 return GB18030_MAP[*s] == C1 ? TRUE : FALSE;
476 }
477
478 OnigEncodingType OnigEncodingGB18030 = {
479 gb18030_mbc_enc_len,
480 "GB18030",
481 4,
482 1,
483 onigenc_is_mbc_newline_0x0a,
484 gb18030_mbc_to_code,
485 onigenc_mb4_code_to_mbclen,
486 gb18030_code_to_mbc,
487 gb18030_mbc_case_fold,
488 onigenc_ascii_apply_all_case_fold,
489 onigenc_ascii_get_case_fold_codes_by_str,
490 onigenc_minimum_property_name_to_ctype,
491 gb18030_is_code_ctype,
492 onigenc_not_support_get_ctype_code_range,
493 gb18030_left_adjust_char_head,
494 gb18030_is_allowed_reverse_match
495 };