root/ext/mbstring/libmbfl/filters/mbfilter_gb18030.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. mbfl_bisec_srch
  2. mbfl_bisec_srch2
  3. mbfl_filt_conv_gb18030_wchar
  4. mbfl_filt_conv_wchar_gb18030
  5. mbfl_filt_ident_gb18030

   1 /*
   2  * "streamable kanji code filter and converter"
   3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
   4  *
   5  * LICENSE NOTICES
   6  *
   7  * This file is part of "streamable kanji code filter and converter",
   8  * which is distributed under the terms of GNU Lesser General Public
   9  * License (version 2) as published by the Free Software Foundation.
  10  *
  11  * This software is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with "streamable kanji code filter and converter";
  18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
  19  * Suite 330, Boston, MA  02111-1307  USA
  20  *
  21  * The author of this file:
  22  *
  23  */
  24 /*
  25  * the source code included in this file was separated from mbfilter_cp936.c
  26  * by rui hirokawa <hirokawa@php.net> on 11 Aug 2011.
  27  *
  28  */
  29 
  30 #ifdef HAVE_CONFIG_H
  31 #include "config.h"
  32 #endif
  33 
  34 #include "mbfilter.h"
  35 #include "mbfilter_gb18030.h"
  36 
  37 #include "unicode_table_cp936.h"
  38 #include "unicode_table_gb18030.h"
  39 
  40 static int mbfl_filt_ident_gb18030(int c, mbfl_identify_filter *filter);
  41 
  /* Alternative names for the GB18030 encoding; the list is NULL-terminated. */
  42 static const char *mbfl_encoding_gb18030_aliases[] = {"gb-18030", "gb-18030-2000", NULL};
  43 
  /*
   * Encoding descriptor for GB18030.
   *
   * Members are given positionally: the encoding id, two "GB18030" strings
   * (presumably the canonical name and the MIME name -- confirm against the
   * mbfl_encoding declaration), a pointer to the alias list, a NULL member
   * (NOTE(review): likely a per-byte length table that GB18030 does not
   * provide -- confirm), and the encoding-type flags.
   */
  44 const mbfl_encoding mbfl_encoding_gb18030 = {
  45         mbfl_no_encoding_gb18030,
  46         "GB18030",
  47         "GB18030",
  48         (const char *(*)[])&mbfl_encoding_gb18030_aliases,
  49         NULL,
  50         MBFL_ENCTYPE_MBCS | MBFL_ENCTYPE_GL_UNSAFE
  51 };
  52 
  /*
   * Identify-filter vtable for GB18030: the encoding id, the common
   * constructor and destructor, and the per-byte identification function
   * mbfl_filt_ident_gb18030 defined in this file.
   */
  53 const struct mbfl_identify_vtbl vtbl_identify_gb18030 = {
  54         mbfl_no_encoding_gb18030,
  55         mbfl_filt_ident_common_ctor,
  56         mbfl_filt_ident_common_dtor,
  57         mbfl_filt_ident_gb18030
  58 };
  59 
  /*
   * Conversion vtable for GB18030 => wchar: the two encoding ids
   * (GB18030 first, wchar second), the common constructor and destructor,
   * the per-byte conversion function mbfl_filt_conv_gb18030_wchar, and the
   * common flush function.
   */
  60 const struct mbfl_convert_vtbl vtbl_gb18030_wchar = {
  61         mbfl_no_encoding_gb18030,
  62         mbfl_no_encoding_wchar,
  63         mbfl_filt_conv_common_ctor,
  64         mbfl_filt_conv_common_dtor,
  65         mbfl_filt_conv_gb18030_wchar,
  66         mbfl_filt_conv_common_flush
  67 };
  68 
  /*
   * Conversion vtable for wchar => GB18030: the two encoding ids
   * (wchar first, GB18030 second), the common constructor and destructor,
   * the per-character conversion function mbfl_filt_conv_wchar_gb18030, and
   * the common flush function.
   */
  69 const struct mbfl_convert_vtbl vtbl_wchar_gb18030 = {
  70         mbfl_no_encoding_wchar,
  71         mbfl_no_encoding_gb18030,
  72         mbfl_filt_conv_common_ctor,
  73         mbfl_filt_conv_common_dtor,
  74         mbfl_filt_conv_wchar_gb18030,
  75         mbfl_filt_conv_common_flush
  76 };
  77 
  /*
   * CK(statement): evaluate `statement` once and, if its value is negative,
   * return -1 from the enclosing function.  Only usable inside functions
   * that return int.
   */
  78 #define CK(statement)   do { if ((statement) < 0) return (-1); } while (0)
  79 
  80 
  81 int
  82 mbfl_bisec_srch(int w, const unsigned short *tbl, int n)
  83 {
  84         int k, k1 = 0, k2 = n-1;
  85 
  86         while (k1 < k2) {
  87                 k = (k1+k2) >> 1;
  88                 if (w <= tbl[2*k+1]) {
  89                         k2 = k;
  90                 } else if (w >= tbl[2*k+2]) {
  91                         k1 = k + 1;
  92                 } else {
  93                         return -1;
  94                 }
  95         }
  96         return k1;
  97 }
  98 
  99 int
 100 mbfl_bisec_srch2(int w, const unsigned short tbl[], int n)
 101 {
 102         int k, k1 = 0, k2 = n;
 103 
 104         if (w == tbl[0]) {
 105                 return 0;
 106         }
 107 
 108         while (k2 - k1 > 1) {
 109                 k = (k1 + k2) >> 1;
 110                 if (w < tbl[k]) {
 111                         k2 = k;
 112                 } else if (w > tbl[k]) {
 113                         k1 = k;
 114                 } else {
 115                         return k;
 116                 }
 117         }
 118         return -1;
 119 }
 120 
 121 /*
 122  * GB18030 => wchar
 123  */
 124 int
 125 mbfl_filt_conv_gb18030_wchar(int c, mbfl_convert_filter *filter)
 126 {
 127         int k;
 128         int c1, c2, c3, w = -1;
 129 
 130         switch (filter->status) {
 131         case 0:
 132                 if (c >= 0 && c < 0x80) {       /* latin */
 133                         CK((*filter->output_function)(c, filter->data));
 134                 } else if (c == 0x80) { /* euro sign */
 135                         CK((*filter->output_function)(0x20ac, filter->data));
 136                 } else if (c == 0xff) {
 137                         CK((*filter->output_function)(0x00ff, filter->data));
 138                 } else if (c > 0x80 && c < 0xff) {      /* dbcs/qbcs lead byte */
 139                         filter->status = 1;
 140                         filter->cache = c;
 141                 } else {
 142                         w = c & MBFL_WCSGROUP_MASK;
 143                         w |= MBFL_WCSGROUP_THROUGH;
 144                         CK((*filter->output_function)(w, filter->data));
 145                 }
 146                 break;
 147 
 148         case 1:         /* dbcs/qbcs second byte */
 149                 c1 = filter->cache;
 150                 filter->status = 0;
 151 
 152                 if (c1 >= 0x81 && c1 <= 0x84 && c >= 0x30 && c <= 0x39) { /* 4 byte range: Unicode BMP */
 153                         filter->status = 2;
 154                         filter->cache = (c1 << 8) | c;
 155                         return c;
 156                 } else if (c1 >= 0x90 && c1 <= 0xe3 && c >= 0x30 && c <= 0x39) {
 157                         /* 4 byte range: Unicode 16 planes */
 158                         filter->status = 2;
 159                         filter->cache = (c1 << 8) | c;
 160                         return c;
 161                 } else if (((c1 >= 0xaa && c1 <= 0xaf) || (c1 >= 0xf8 && c1 <= 0xfe)) &&
 162                                    (c >= 0xa1 && c <= 0xfe)) { /* UDA part1,2: U+E000-U+E4C5 */
 163                         w = 94*(c1 >= 0xf8 ? c1 - 0xf2 : c1 - 0xaa) + (c - 0xa1) + 0xe000;
 164                         CK((*filter->output_function)(w, filter->data));
 165                 } else if (c1 >= 0xa1 && c1 <= 0xa7 && c >= 0x40 && c < 0xa1 && c != 0x7f) {
 166                         /* UDA part3 : U+E4C6-U+E765*/
 167                         w = 96*(c1 - 0xa1) + c - (c >= 0x80 ? 0x41 : 0x40) + 0xe4c6;
 168                         CK((*filter->output_function)(w, filter->data));
 169                 }
 170 
 171                 c2 = (c1 << 8) | c;
 172 
 173                 if (w <= 0 &&
 174                         ((c2 >= 0xa2ab && c2 <= 0xa9f0 + (0xe80f-0xe801)) ||
 175                          (c2 >= 0xd7fa && c2 <= 0xd7fa + (0xe814-0xe810)) ||
 176                          (c2 >= 0xfe50 && c2 <= 0xfe80 + (0xe864-0xe844)))) {
 177                         for (k = 0; k < mbfl_gb18030_pua_tbl_max; k++) {
 178                                 if (c2 >= mbfl_gb18030_pua_tbl[k][2] &&
 179                                         c2 <= mbfl_gb18030_pua_tbl[k][2] +  mbfl_gb18030_pua_tbl[k][1]
 180                                         -  mbfl_gb18030_pua_tbl[k][0]) {
 181                                         w = c2 -  mbfl_gb18030_pua_tbl[k][2] + mbfl_gb18030_pua_tbl[k][0];
 182                                         CK((*filter->output_function)(w, filter->data));
 183                                         break;
 184                                 }
 185                         }
 186                 }
 187 
 188                 if (w <= 0) {
 189                         if ((c1 >= 0xa1 && c1 <= 0xa9 && c >= 0xa1 && c <= 0xfe) ||
 190                                 (c1 >= 0xb0 && c1 <= 0xf7 && c >= 0xa1 && c <= 0xfe) ||
 191                                 (c1 >= 0x81 && c1 <= 0xa0 && c >= 0x40 && c <= 0xfe && c != 0x7f) ||
 192                                 (c1 >= 0xaa && c1 <= 0xfe && c >= 0x40 && c <= 0xa0 && c != 0x7f) ||
 193                                 (c1 >= 0xa8 && c1 <= 0xa9 && c >= 0x40 && c <= 0xa0 && c != 0x7f)) {
 194                                 w = (c1 - 0x81)*192 + (c - 0x40);
 195                                 if (w >= 0 && w < cp936_ucs_table_size) {
 196                                         w = cp936_ucs_table[w];
 197                                 } else {
 198                                         w = 0;
 199                                 }
 200                                 if (w <= 0) {
 201                                         w = (c1 << 8) | c;
 202                                         w &= MBFL_WCSPLANE_MASK;
 203                                         w |= MBFL_WCSPLANE_GB18030;
 204                                 }
 205                                 CK((*filter->output_function)(w, filter->data));
 206                         } else if ((c >= 0 && c < 0x21) || c == 0x7f) {         /* CTLs */
 207                                 CK((*filter->output_function)(c, filter->data));
 208                         } else {
 209                                 w = (c1 << 8) | c;
 210                                 w &= MBFL_WCSGROUP_MASK;
 211                                 w |= MBFL_WCSGROUP_THROUGH;
 212                                 CK((*filter->output_function)(w, filter->data));
 213                         }
 214                 }
 215                 break;
 216         case 2: /* qbcs third byte */
 217                 c1 = (filter->cache >> 8) & 0xff;
 218                 c2 = filter->cache & 0xff;
 219                 filter->status = 0;
 220                 filter->cache = 0;
 221                 if (((c1 >= 0x81 && c1 <= 0x84) || (c1 >= 0x90 && c1 <= 0xe3)) &&
 222                         c2 >= 0x30 && c2 <= 0x39 && c >= 0x81 && c <= 0xfe) {
 223                         filter->cache = (c1 << 16) | (c2 << 8) | c;
 224                         filter->status = 3;
 225                 } else {
 226                         w = (c1 << 16) | (c2 << 8) | c;
 227                         w &= MBFL_WCSGROUP_MASK;
 228                         w |= MBFL_WCSGROUP_THROUGH;
 229                         CK((*filter->output_function)(w, filter->data));
 230                 }
 231                 break;
 232 
 233         case 3: /* qbcs fourth byte */
 234                 c1 = (filter->cache >> 16) & 0xff;
 235                 c2 = (filter->cache >> 8) & 0xff;
 236                 c3 = filter->cache & 0xff;
 237                 filter->status = 0;
 238                 filter->cache = 0;
 239                 if (((c1 >= 0x81 && c1 <= 0x84) || (c1 >= 0x90 && c1 <= 0xe3)) &&
 240                         c2 >= 0x30 && c2 <= 0x39 && c3 >= 0x81 && c3 <= 0xfe && c >= 0x30 && c <= 0x39) {
 241                         if (c1 >= 0x90 && c1 <= 0xe3) {
 242                                 w = ((((c1 - 0x90)*10 + (c2 - 0x30))*126 + (c3 - 0x81)))*10 + (c - 0x30) + 0x10000;
 243                         } else { /* Unicode BMP */
 244                                 w = (((c1 - 0x81)*10 + (c2 - 0x30))*126 + (c3 - 0x81))*10 + (c - 0x30);
 245                                 if (w >= 0 && w <= 39419) {
 246                                         k = mbfl_bisec_srch(w, mbfl_gb2uni_tbl, mbfl_gb_uni_max);
 247                                         if (k<0) {
 248                                                 /* error */
 249                                                 w = (c1 << 24) | (c2 << 16) | (c3 << 8) | c;
 250                                                 w &= MBFL_WCSGROUP_MASK;
 251                                                 w |= MBFL_WCSGROUP_THROUGH;
 252                                                 CK((*filter->output_function)(w, filter->data));
 253                                                 return c;
 254                                         }
 255                                         w += mbfl_gb_uni_ofst[k];
 256                                 } else {
 257                                         w = (c1 << 24) | (c2 << 16) | (c3 << 8) | c;
 258                                         w &= MBFL_WCSGROUP_MASK;
 259                                         w |= MBFL_WCSGROUP_THROUGH;
 260                                         CK((*filter->output_function)(w, filter->data));
 261                                         return c;
 262                                 }
 263                         }
 264                         CK((*filter->output_function)(w, filter->data));
 265                 } else {
 266                         w = (c1 << 24) | (c2 << 16) | (c3 << 8) | c;
 267                         w &= MBFL_WCSGROUP_MASK;
 268                         w |= MBFL_WCSGROUP_THROUGH;
 269                         CK((*filter->output_function)(w, filter->data));
 270                 }
 271                 break;
 272 
 273         default:
 274                 filter->status = 0;
 275                 break;
 276         }
 277 
 278         return c;
 279 }
 280 
 281 /*
 282  * wchar => GB18030
 283  */
 284 int
 285 mbfl_filt_conv_wchar_gb18030(int c, mbfl_convert_filter *filter)
 286 {
 287         int k, k1, k2;
 288         int c1, s = 0, s1 = 0;
 289 
 290         if (c >= ucs_a1_cp936_table_min && c < ucs_a1_cp936_table_max) {
 291                 s = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min];
 292         } else if (c >= ucs_a2_cp936_table_min && c < ucs_a2_cp936_table_max) {
 293                 s = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min];
 294         } else if (c >= ucs_a3_cp936_table_min && c < ucs_a3_cp936_table_max) {
 295                 s = ucs_a3_cp936_table[c - ucs_a3_cp936_table_min];
 296         } else if (c >= ucs_i_cp936_table_min && c < ucs_i_cp936_table_max) {
 297                 s = ucs_i_cp936_table[c - ucs_i_cp936_table_min];
 298         } else if (c >= ucs_ci_cp936_table_min && c < ucs_ci_cp936_table_max) {
 299                 /* U+F900-FA2F CJK Compatibility Ideographs */
 300                 if (c == 0xf92c) {
 301                         s = 0xfd9c;
 302                 } else if (c == 0xf979) {
 303                         s = 0xfd9d;
 304                 } else if (c == 0xf995) {
 305                         s = 0xfd9e;
 306                 } else if (c == 0xf9e7) {
 307                         s = 0xfd9f;
 308                 } else if (c == 0xf9f1) {
 309                         s = 0xfda0;
 310                 } else if (c >= 0xfa0c && c <= 0xfa29) {
 311                         s = ucs_ci_s_cp936_table[c - 0xfa0c];
 312                 }
 313         } else if (c >= ucs_cf_cp936_table_min && c < ucs_cf_cp936_table_max) {
 314                 /* FE30h CJK Compatibility Forms  */
 315                 s = ucs_cf_cp936_table[c - ucs_cf_cp936_table_min];
 316         } else if (c >= ucs_sfv_cp936_table_min && c < ucs_sfv_cp936_table_max) {
 317                 /* U+FE50-FE6F Small Form Variants */
 318                 s = ucs_sfv_cp936_table[c - ucs_sfv_cp936_table_min];
 319         } else if (c >= ucs_hff_cp936_table_min && c < ucs_hff_cp936_table_max) {
 320                 /* U+FF00-FFFF HW/FW Forms */
 321                 if (c == 0xff04) {
 322                         s = 0xa1e7;
 323                 } else if (c == 0xff5e) {
 324                         s = 0xa1ab;
 325                 } else if (c >= 0xff01 && c <= 0xff5d) {
 326                         s = c - 0xff01 + 0xa3a1;
 327                 } else if (c >= 0xffe0 && c <= 0xffe5) {
 328                         s = ucs_hff_s_cp936_table[c-0xffe0];
 329                 }
 330         }
 331 
 332         if (c == 0x20ac) { /* euro-sign */
 333                 s = 0xa2e3;
 334         }
 335 
 336         if (s <= 0 && c >= mbfl_gb18030_c_tbl_key[0] &&
 337                 c <= mbfl_gb18030_c_tbl_key[mbfl_gb18030_c_tbl_max-1]) {
 338                 k1 = mbfl_bisec_srch2(c, mbfl_gb18030_c_tbl_key, mbfl_gb18030_c_tbl_max);
 339                 if (k1 >= 0) {
 340                         s = mbfl_gb18030_c_tbl_val[k1];
 341                 }
 342         }
 343 
 344         if (c >= 0xe000 && c <= 0xe864) { /* PUA */
 345                 if (c < 0xe766) {
 346                         if (c < 0xe4c6) {
 347                                 c1 = c - 0xe000;
 348                                 s = (c1 % 94) + 0xa1; c1 /= 94;
 349                                 s |= (c1 < 0x06 ? c1 + 0xaa : c1 + 0xf2) << 8;
 350                         } else {
 351                                 c1 = c - 0xe4c6;
 352                                 s = ((c1 / 96) + 0xa1) << 8; c1 %= 96;
 353                                 s |= c1 + (c1 >= 0x3f ? 0x41 : 0x40);
 354                         }
 355                 } else {
 356                         /* U+E766..U+E864 */
 357                         k1 = 0; k2 = mbfl_gb18030_pua_tbl_max;
 358                         while (k1 < k2) {
 359                                 k = (k1 + k2) >> 1;
 360                                 if (c < mbfl_gb18030_pua_tbl[k][0]) {
 361                                         k2 = k;
 362                                 } else if (c > mbfl_gb18030_pua_tbl[k][1]) {
 363                                         k1 = k + 1;
 364                                 } else {
 365                                         s = c - mbfl_gb18030_pua_tbl[k][0] + mbfl_gb18030_pua_tbl[k][2];
 366                                         break;
 367                                 }
 368                         }
 369                 }
 370         }
 371 
 372         if (s <= 0 && c >= 0x0080 && c <= 0xffff) { /* BMP */
 373                 s = mbfl_bisec_srch(c, mbfl_uni2gb_tbl, mbfl_gb_uni_max);
 374                 if (s >= 0) {
 375                         c1 = c - mbfl_gb_uni_ofst[s];
 376                         s = (c1 % 10) + 0x30; c1 /= 10;
 377                         s |= ((c1 % 126) + 0x81) << 8; c1 /= 126;
 378                         s |= ((c1 % 10) + 0x30) << 16; c1 /= 10;
 379                         s1 = c1 + 0x81;
 380                 }
 381         } else if (c >= 0x10000 && c <= 0x10ffff) { /* Code set 3: Unicode U+10000..U+10FFFF */
 382                 c1 = c - 0x10000;
 383                 s = (c1 % 10) + 0x30; c1 /= 10;
 384                 s |= ((c1 % 126) + 0x81) << 8; c1 /= 126;
 385                 s |= ((c1 % 10) + 0x30) << 16; c1 /= 10;
 386                 s1 = c1 + 0x90;
 387         }
 388 
 389         if (s <= 0) {
 390                 c1 = c & ~MBFL_WCSPLANE_MASK;
 391                 if (c1 == MBFL_WCSPLANE_WINCP936) {
 392                         s = c & MBFL_WCSPLANE_MASK;
 393                 }
 394                 if (c == 0) {
 395                         s = 0;
 396                 } else if (s <= 0) {
 397                         s = -1;
 398                 }
 399         }
 400         if (s >= 0) {
 401                 if (s <= 0x80) {        /* latin */
 402                         CK((*filter->output_function)(s, filter->data));
 403                 } else if (s1 > 0) { /* qbcs */
 404                         CK((*filter->output_function)(s1 & 0xff, filter->data));
 405                         CK((*filter->output_function)((s >> 16) & 0xff, filter->data));
 406                         CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
 407                         CK((*filter->output_function)(s & 0xff, filter->data));
 408                 } else { /* dbcs */
 409                         CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
 410                         CK((*filter->output_function)(s & 0xff, filter->data));
 411                 }
 412         } else {
 413                 if (filter->illegal_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
 414                         CK(mbfl_filt_conv_illegal_output(c, filter));
 415                 }
 416         }
 417 
 418         return c;
 419 }
 420 
 421 static int mbfl_filt_ident_gb18030(int c, mbfl_identify_filter *filter)
 422 {
 423         int c1;
 424 
 425         c1 = (filter->status >> 8) & 0xff;
 426         filter->status &= 0xff;
 427 
 428         if (filter->status == 0) {
 429                 if (c <= 0x80 || c == 0xff) {
 430                         filter->status = 0;
 431                 } else {
 432                         filter->status = 1;
 433                         filter->status |= (c << 8);
 434                 }
 435         } else if (filter->status == 1) { /* dbcs/qbcs 2nd byte */
 436                 if (((c1 >= 0x81 && c1 <= 0x84) || (c1 >= 0x90 && c1 <= 0xe3)) && c >= 0x30 && c <= 0x39) { /* qbcs */
 437                         filter->status = 2;
 438                 } else if (((c1 >= 0xaa && c1 <= 0xaf) || (c1 >= 0xf8 && c1 <= 0xfe)) && (c >= 0xa1 && c <= 0xfe)) {
 439                         filter->status = 0; /* UDA part 1,2 */
 440                 } else if (c1 >= 0xa1 && c1 <= 0xa7 && c >= 0x40 && c < 0xa1 && c != 0x7f) {
 441                         filter->status = 0; /* UDA part 3 */
 442                 } else if ((c1 >= 0xa1 && c1 <= 0xa9 && c >= 0xa1 && c <= 0xfe) ||
 443                                    (c1 >= 0xb0 && c1 <= 0xf7 && c >= 0xa1 && c <= 0xfe) ||
 444                                    (c1 >= 0x81 && c1 <= 0xa0 && c >= 0x40 && c <= 0xfe && c != 0x7f) ||
 445                                    (c1 >= 0xaa && c1 <= 0xfe && c >= 0x40 && c <= 0xa0 && c != 0x7f) ||
 446                                    (c1 >= 0xa8 && c1 <= 0xa9 && c >= 0x40 && c <= 0xa0 && c != 0x7f)) {
 447                         filter->status = 0; /* DBCS */
 448                 } else {
 449                         filter->flag = 1; /* bad */
 450                         filter->status = 0;
 451                 }
 452         } else if (filter->status == 2) { /* qbcs 3rd byte */
 453                 if (c > 0x80 && c < 0xff) {
 454                         filter->status = 3;
 455                 } else {
 456                         filter->flag = 1; /* bad */
 457                         filter->status = 0;
 458                 }
 459         } else if (filter->status == 3) { /* qbcs 4th byte */
 460                 if (c >= 0x30 && c < 0x40) {
 461                         filter->status = 0;
 462                 } else {
 463                         filter->flag = 1; /* bad */
 464                         filter->status = 0;
 465                 }
 466         } else {                                                        /* bad */
 467                 filter->flag = 1;
 468         }
 469 
 470         return c;
 471 }
 472 
 473 

/* [<][>][^][v][top][bottom][index][help] */