root/ext/mbstring/oniguruma/enc/gb18030.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. gb18030_mbc_enc_len
  2. gb18030_mbc_to_code
  3. gb18030_code_to_mbc
  4. gb18030_mbc_case_fold
  5. gb18030_is_mbc_ambiguous
  6. gb18030_is_code_ctype
  7. gb18030_left_adjust_char_head
  8. gb18030_is_allowed_reverse_match

   1 /**********************************************************************
   2   gb18030.c -  Oniguruma (regular expression library)
   3 **********************************************************************/
   4 /*-
   5  * Copyright (c) 2005-2007  KUBO Takehiro <kubo AT jiubao DOT org>
   6  *                          K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
   7  * All rights reserved.
   8  *
   9  * Redistribution and use in source and binary forms, with or without
  10  * modification, are permitted provided that the following conditions
  11  * are met:
  12  * 1. Redistributions of source code must retain the above copyright
  13  *    notice, this list of conditions and the following disclaimer.
  14  * 2. Redistributions in binary form must reproduce the above copyright
  15  *    notice, this list of conditions and the following disclaimer in the
  16  *    documentation and/or other materials provided with the distribution.
  17  *
  18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  28  * SUCH DAMAGE.
  29  */
  30 
  31 #include "regenc.h"
  32 
  33 #if 1
  34 #define DEBUG_GB18030(arg)
  35 #else
  36 #define DEBUG_GB18030(arg) printf arg
  37 #endif
  38 
  39 enum {
  40   C1, /* one-byte char */
  41   C2, /* one-byte or second of two-byte char */
  42   C4, /* one-byte or second or fourth of four-byte char */
  43   CM  /* first of two- or four-byte char or second of two-byte char */
  44 };
  45 
  46 static const char GB18030_MAP[] = {
  47   C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
  48   C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
  49   C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
  50   C4, C4, C4, C4, C4, C4, C4, C4, C4, C4, C1, C1, C1, C1, C1, C1,
  51   C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
  52   C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
  53   C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
  54   C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C1,
  55   C2, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
  56   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
  57   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
  58   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
  59   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
  60   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
  61   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
  62   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, C1
  63 };
  64 
  65 static int
  66 gb18030_mbc_enc_len(const UChar* p)
  67 {
  68   if (GB18030_MAP[*p] != CM)
  69     return 1;
  70   p++;
  71   if (GB18030_MAP[*p] == C4)
  72     return 4;
  73   if (GB18030_MAP[*p] == C1)
  74     return 1; /* illegal sequence */
  75   return 2;
  76 }
  77 
  78 static OnigCodePoint
  79 gb18030_mbc_to_code(const UChar* p, const UChar* end)
  80 {
  81   return onigenc_mbn_mbc_to_code(ONIG_ENCODING_GB18030, p, end);
  82 }
  83 
  84 static int
  85 gb18030_code_to_mbc(OnigCodePoint code, UChar *buf)
  86 {
  87   return onigenc_mb4_code_to_mbc(ONIG_ENCODING_GB18030, code, buf);
  88 }
  89 
  90 static int
  91 gb18030_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end,
  92                       UChar* lower)
  93 {
  94   return onigenc_mbn_mbc_case_fold(ONIG_ENCODING_GB18030, flag,
  95                                    pp, end, lower);
  96 }
  97 
  98 #if 0
  99 static int
 100 gb18030_is_mbc_ambiguous(OnigCaseFoldType flag,
 101                          const UChar** pp, const UChar* end)
 102 {
 103   return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_GB18030, flag, pp, end);
 104 }
 105 #endif
 106 
 107 static int
 108 gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype)
 109 {
 110   return onigenc_mb4_is_code_ctype(ONIG_ENCODING_GB18030, code, ctype);
 111 }
 112 
 113 enum state {
 114   S_START,
 115   S_one_C2,
 116   S_one_C4,
 117   S_one_CM,
 118 
 119   S_odd_CM_one_CX,
 120   S_even_CM_one_CX,
 121 
 122   /* CMC4 : pair of "CM C4" */
 123   S_one_CMC4,
 124   S_odd_CMC4,
 125   S_one_C4_odd_CMC4,
 126   S_even_CMC4,
 127   S_one_C4_even_CMC4,
 128 
 129   S_odd_CM_odd_CMC4,
 130   S_even_CM_odd_CMC4,
 131 
 132   S_odd_CM_even_CMC4,
 133   S_even_CM_even_CMC4,
 134 
 135   /* C4CM : pair of "C4 CM" */
 136   S_odd_C4CM,
 137   S_one_CM_odd_C4CM,
 138   S_even_C4CM,
 139   S_one_CM_even_C4CM,
 140 
 141   S_even_CM_odd_C4CM,
 142   S_odd_CM_odd_C4CM,
 143   S_even_CM_even_C4CM,
 144   S_odd_CM_even_C4CM,
 145 };
 146 
 147 static UChar*
 148 gb18030_left_adjust_char_head(const UChar* start, const UChar* s)
 149 {
 150   const UChar *p;
 151   enum state state = S_START;
 152 
 153   DEBUG_GB18030(("----------------\n"));
 154   for (p = s; p >= start; p--) {
 155     DEBUG_GB18030(("state %d --(%02x)-->\n", state, *p));
 156     switch (state) {
 157     case S_START:
 158       switch (GB18030_MAP[*p]) {
 159       case C1:
 160         return (UChar *)s;
 161       case C2:
 162         state = S_one_C2; /* C2 */
 163         break;
 164       case C4:
 165         state = S_one_C4; /* C4 */
 166         break;
 167       case CM:
 168         state = S_one_CM; /* CM */
 169         break;
 170       }
 171       break;
 172     case S_one_C2: /* C2 */
 173       switch (GB18030_MAP[*p]) {
 174       case C1:
 175       case C2:
 176       case C4:
 177         return (UChar *)s;
 178       case CM:
 179         state = S_odd_CM_one_CX; /* CM C2 */
 180         break;
 181       }
 182       break;
 183     case S_one_C4: /* C4 */
 184       switch (GB18030_MAP[*p]) {
 185       case C1:
 186       case C2:
 187       case C4:
 188         return (UChar *)s;
 189       case CM:
 190         state = S_one_CMC4;
 191         break;
 192       }
 193       break;
 194     case S_one_CM: /* CM */
 195       switch (GB18030_MAP[*p]) {
 196       case C1:
 197       case C2:
 198         return (UChar *)s;
 199       case C4:
 200         state = S_odd_C4CM;
 201         break;
 202       case CM:
 203         state = S_odd_CM_one_CX; /* CM CM */
 204         break;
 205       }
 206       break;
 207 
 208     case S_odd_CM_one_CX: /* CM C2 */ /* CM CM */ /* CM CM CM C4 */
 209       switch (GB18030_MAP[*p]) {
 210       case C1:
 211       case C2:
 212       case C4:
 213         return (UChar *)(s - 1);
 214       case CM:
 215         state = S_even_CM_one_CX;
 216         break;
 217       }
 218       break;
 219     case S_even_CM_one_CX: /* CM CM C2 */ /* CM CM CM */ /* CM CM C4 */
 220       switch (GB18030_MAP[*p]) {
 221       case C1:
 222       case C2:
 223       case C4:
 224         return (UChar *)s;
 225       case CM:
 226         state = S_odd_CM_one_CX;
 227         break;
 228       }
 229       break;
 230 
 231     case S_one_CMC4: /* CM C4 */
 232       switch (GB18030_MAP[*p]) {
 233       case C1:
 234       case C2:
 235         return (UChar *)(s - 1);
 236       case C4:
 237         state = S_one_C4_odd_CMC4; /* C4 CM C4 */
 238         break;
 239       case CM:
 240         state = S_even_CM_one_CX; /* CM CM C4 */
 241         break;
 242       }
 243       break;
 244     case S_odd_CMC4: /* CM C4 CM C4 CM C4 */
 245       switch (GB18030_MAP[*p]) {
 246       case C1:
 247       case C2:
 248         return (UChar *)(s - 1);
 249       case C4:
 250         state = S_one_C4_odd_CMC4;
 251         break;
 252       case CM:
 253         state = S_odd_CM_odd_CMC4;
 254         break;
 255       }
 256       break;
 257     case S_one_C4_odd_CMC4: /* C4 CM C4 */
 258       switch (GB18030_MAP[*p]) {
 259       case C1:
 260       case C2:
 261       case C4:
 262         return (UChar *)(s - 1);
 263       case CM:
 264         state = S_even_CMC4; /* CM C4 CM C4 */
 265         break;
 266       }
 267       break;
 268     case S_even_CMC4: /* CM C4 CM C4 */
 269       switch (GB18030_MAP[*p]) {
 270       case C1:
 271       case C2:
 272         return (UChar *)(s - 3);
 273       case C4:
 274         state = S_one_C4_even_CMC4;
 275         break;
 276       case CM:
 277         state = S_odd_CM_even_CMC4;
 278         break;
 279       }
 280       break;
 281     case S_one_C4_even_CMC4: /* C4 CM C4 CM C4 */
 282       switch (GB18030_MAP[*p]) {
 283       case C1:
 284       case C2:
 285       case C4:
 286         return (UChar *)(s - 3);
 287       case CM:
 288         state = S_odd_CMC4;
 289         break;
 290       }
 291       break;
 292 
 293     case S_odd_CM_odd_CMC4: /* CM CM C4 CM C4 CM C4 */
 294       switch (GB18030_MAP[*p]) {
 295       case C1:
 296       case C2:
 297       case C4:
 298         return (UChar *)(s - 3);
 299       case CM:
 300         state = S_even_CM_odd_CMC4;
 301         break;
 302       }
 303       break;
 304     case S_even_CM_odd_CMC4: /* CM CM CM C4 CM C4 CM C4 */
 305       switch (GB18030_MAP[*p]) {
 306       case C1:
 307       case C2:
 308       case C4:
 309         return (UChar *)(s - 1);
 310       case CM:
 311         state = S_odd_CM_odd_CMC4;
 312         break;
 313       }
 314       break;
 315 
 316     case S_odd_CM_even_CMC4: /* CM CM C4 CM C4 */
 317       switch (GB18030_MAP[*p]) {
 318       case C1:
 319       case C2:
 320       case C4:
 321         return (UChar *)(s - 1);
 322       case CM:
 323         state = S_even_CM_even_CMC4;
 324         break;
 325       }
 326       break;
 327     case S_even_CM_even_CMC4: /* CM CM CM C4 CM C4 */
 328       switch (GB18030_MAP[*p]) {
 329       case C1:
 330       case C2:
 331       case C4:
 332         return (UChar *)(s - 3);
 333       case CM:
 334         state = S_odd_CM_even_CMC4;
 335         break;
 336       }
 337       break;
 338 
 339     case S_odd_C4CM: /* C4 CM */  /* C4 CM C4 CM C4 CM*/
 340       switch (GB18030_MAP[*p]) {
 341       case C1:
 342       case C2:
 343       case C4:
 344         return (UChar *)s;
 345       case CM:
 346         state = S_one_CM_odd_C4CM; /* CM C4 CM */
 347         break;
 348       }
 349       break;
 350     case S_one_CM_odd_C4CM: /* CM C4 CM */ /* CM C4 CM C4 CM C4 CM */
 351       switch (GB18030_MAP[*p]) {
 352       case C1:
 353       case C2:
 354         return (UChar *)(s - 2); /* |CM C4 CM */
 355       case C4:
 356         state = S_even_C4CM;
 357         break;
 358       case CM:
 359         state = S_even_CM_odd_C4CM;
 360         break;
 361       }
 362       break;
 363     case S_even_C4CM: /* C4 CM C4 CM */
 364       switch (GB18030_MAP[*p]) {
 365       case C1:
 366       case C2:
 367       case C4:
 368         return (UChar *)(s - 2);  /* C4|CM C4 CM */
 369       case CM:
 370         state = S_one_CM_even_C4CM;
 371         break;
 372       }
 373       break;
 374     case S_one_CM_even_C4CM: /* CM C4 CM C4 CM */
 375       switch (GB18030_MAP[*p]) {
 376       case C1:
 377       case C2:
 378         return (UChar *)(s - 0);  /*|CM C4 CM C4|CM */
 379       case C4:
 380         state = S_odd_C4CM;
 381         break;
 382       case CM:
 383         state = S_even_CM_even_C4CM;
 384         break;
 385       }
 386       break;
 387 
 388     case S_even_CM_odd_C4CM: /* CM CM C4 CM */
 389       switch (GB18030_MAP[*p]) {
 390       case C1:
 391       case C2:
 392       case C4:
 393         return (UChar *)(s - 0); /* |CM CM|C4|CM */
 394       case CM:
 395         state = S_odd_CM_odd_C4CM;
 396         break;
 397       }
 398       break;
 399     case S_odd_CM_odd_C4CM: /* CM CM CM C4 CM */
 400       switch (GB18030_MAP[*p]) {
 401       case C1:
 402       case C2:
 403       case C4:
 404         return (UChar *)(s - 2); /* |CM CM|CM C4 CM */
 405       case CM:
 406         state = S_even_CM_odd_C4CM;
 407         break;
 408       }
 409       break;
 410 
 411     case S_even_CM_even_C4CM: /* CM CM C4 CM C4 CM */
 412       switch (GB18030_MAP[*p]) {
 413       case C1:
 414       case C2:
 415       case C4:
 416         return (UChar *)(s - 2); /* |CM CM|C4|CM C4 CM */
 417       case CM:
 418         state = S_odd_CM_even_C4CM;
 419         break;
 420       }
 421       break;
 422     case S_odd_CM_even_C4CM: /* CM CM CM C4 CM C4 CM */
 423       switch (GB18030_MAP[*p]) {
 424       case C1:
 425       case C2:
 426       case C4:
 427         return (UChar *)(s - 0);  /* |CM CM|CM C4 CM C4|CM */
 428       case CM:
 429         state = S_even_CM_even_C4CM;
 430         break;
 431       }
 432       break;
 433     }
 434   }
 435 
 436   DEBUG_GB18030(("state %d\n", state));
 437   switch (state) {
 438   case S_START:             return (UChar *)(s - 0);
 439   case S_one_C2:            return (UChar *)(s - 0);
 440   case S_one_C4:            return (UChar *)(s - 0);
 441   case S_one_CM:            return (UChar *)(s - 0);
 442 
 443   case S_odd_CM_one_CX:     return (UChar *)(s - 1);
 444   case S_even_CM_one_CX:    return (UChar *)(s - 0);
 445 
 446   case S_one_CMC4:          return (UChar *)(s - 1);
 447   case S_odd_CMC4:          return (UChar *)(s - 1);
 448   case S_one_C4_odd_CMC4:   return (UChar *)(s - 1);
 449   case S_even_CMC4:         return (UChar *)(s - 3);
 450   case S_one_C4_even_CMC4:  return (UChar *)(s - 3);
 451 
 452   case S_odd_CM_odd_CMC4:   return (UChar *)(s - 3);
 453   case S_even_CM_odd_CMC4:  return (UChar *)(s - 1);
 454 
 455   case S_odd_CM_even_CMC4:  return (UChar *)(s - 1);
 456   case S_even_CM_even_CMC4: return (UChar *)(s - 3);
 457 
 458   case S_odd_C4CM:          return (UChar *)(s - 0);
 459   case S_one_CM_odd_C4CM:   return (UChar *)(s - 2);
 460   case S_even_C4CM:         return (UChar *)(s - 2);
 461   case S_one_CM_even_C4CM:  return (UChar *)(s - 0);
 462 
 463   case S_even_CM_odd_C4CM:  return (UChar *)(s - 0);
 464   case S_odd_CM_odd_C4CM:   return (UChar *)(s - 2);
 465   case S_even_CM_even_C4CM: return (UChar *)(s - 2);
 466   case S_odd_CM_even_C4CM:  return (UChar *)(s - 0);
 467   }
 468 
 469   return (UChar* )s;  /* never come here. (escape warning) */
 470 }
 471 
 472 static int
 473 gb18030_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED)
 474 {
 475   return GB18030_MAP[*s] == C1 ? TRUE : FALSE;
 476 }
 477 
 478 OnigEncodingType OnigEncodingGB18030 = {
 479   gb18030_mbc_enc_len,
 480   "GB18030",   /* name */
 481   4,          /* max enc length */
 482   1,          /* min enc length */
 483   onigenc_is_mbc_newline_0x0a,
 484   gb18030_mbc_to_code,
 485   onigenc_mb4_code_to_mbclen,
 486   gb18030_code_to_mbc,
 487   gb18030_mbc_case_fold,
 488   onigenc_ascii_apply_all_case_fold,
 489   onigenc_ascii_get_case_fold_codes_by_str,
 490   onigenc_minimum_property_name_to_ctype,
 491   gb18030_is_code_ctype,
 492   onigenc_not_support_get_ctype_code_range,
 493   gb18030_left_adjust_char_head,
 494   gb18030_is_allowed_reverse_match
 495 };

/* [<][>][^][v][top][bottom][index][help] */