root/ext/mbstring/libmbfl/filters/mbfilter_big5.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. mbfl_filt_conv_big5_wchar
  2. mbfl_filt_conv_wchar_big5
  3. mbfl_filt_ident_big5

   1 /*
   2  * "streamable kanji code filter and converter"
   3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
   4  *
   5  * LICENSE NOTICES
   6  *
   7  * This file is part of "streamable kanji code filter and converter",
   8  * which is distributed under the terms of GNU Lesser General Public
   9  * License (version 2) as published by the Free Software Foundation.
  10  *
  11  * This software is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with "streamable kanji code filter and converter";
  18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
  19  * Suite 330, Boston, MA  02111-1307  USA
  20  *
  21  * The author of this file: Rui Hirokawa <hirokawa@php.net>
  22  *
  23  */
  24 /*
  25  * The source code included in this file was separated from mbfilter_tw.c
  26  * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
  27  *
  28  */
  29 
  30 #ifdef HAVE_CONFIG_H
  31 #include "config.h"
  32 #endif
  33 
  34 #include "mbfilter.h"
  35 #include "mbfilter_big5.h"
  36 
  37 #include "unicode_table_big5.h"
  38 
  39 static int mbfl_filt_ident_big5(int c, mbfl_identify_filter *filter);
  40 
  41 static const unsigned char mblen_table_big5[] = { /* 0x81-0xFE */
  42   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  43   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  44   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  45   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  46   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  47   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  48   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  49   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  50   1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  51   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  52   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  53   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  54   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  55   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  56   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  57   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
  58 };
  59 
  60 static const char *mbfl_encoding_big5_aliases[] = {"CN-BIG5", "BIG-FIVE", "BIGFIVE", NULL};
  61 
  62 const mbfl_encoding mbfl_encoding_big5 = {
  63         mbfl_no_encoding_big5,
  64         "BIG-5",
  65         "BIG5",
  66         (const char *(*)[])&mbfl_encoding_big5_aliases,
  67         mblen_table_big5,
  68         MBFL_ENCTYPE_MBCS | MBFL_ENCTYPE_GL_UNSAFE
  69 };
  70 
  71 const mbfl_encoding mbfl_encoding_cp950 = {
  72         mbfl_no_encoding_cp950,
  73         "CP950",
  74         "BIG5",
  75         NULL,
  76         mblen_table_big5,
  77         MBFL_ENCTYPE_MBCS | MBFL_ENCTYPE_GL_UNSAFE
  78 };
  79 
  80 const struct mbfl_identify_vtbl vtbl_identify_big5 = {
  81         mbfl_no_encoding_big5,
  82         mbfl_filt_ident_common_ctor,
  83         mbfl_filt_ident_common_dtor,
  84         mbfl_filt_ident_big5
  85 };
  86 
  87 const struct mbfl_identify_vtbl vtbl_identify_cp950 = {
  88         mbfl_no_encoding_cp950,
  89         mbfl_filt_ident_common_ctor,
  90         mbfl_filt_ident_common_dtor,
  91         mbfl_filt_ident_big5
  92 };
  93 
  94 const struct mbfl_convert_vtbl vtbl_big5_wchar = {
  95         mbfl_no_encoding_big5,
  96         mbfl_no_encoding_wchar,
  97         mbfl_filt_conv_common_ctor,
  98         mbfl_filt_conv_common_dtor,
  99         mbfl_filt_conv_big5_wchar,
 100         mbfl_filt_conv_common_flush
 101 };
 102 
 103 const struct mbfl_convert_vtbl vtbl_wchar_big5 = {
 104         mbfl_no_encoding_wchar,
 105         mbfl_no_encoding_big5,
 106         mbfl_filt_conv_common_ctor,
 107         mbfl_filt_conv_common_dtor,
 108         mbfl_filt_conv_wchar_big5,
 109         mbfl_filt_conv_common_flush
 110 };
 111 
 112 const struct mbfl_convert_vtbl vtbl_cp950_wchar = {
 113         mbfl_no_encoding_cp950,
 114         mbfl_no_encoding_wchar,
 115         mbfl_filt_conv_common_ctor,
 116         mbfl_filt_conv_common_dtor,
 117         mbfl_filt_conv_big5_wchar,
 118         mbfl_filt_conv_common_flush
 119 };
 120 
 121 const struct mbfl_convert_vtbl vtbl_wchar_cp950 = {
 122         mbfl_no_encoding_wchar,
 123         mbfl_no_encoding_cp950,
 124         mbfl_filt_conv_common_ctor,
 125         mbfl_filt_conv_common_dtor,
 126         mbfl_filt_conv_wchar_big5,
 127         mbfl_filt_conv_common_flush
 128 };
 129 
 130 #define CK(statement)   do { if ((statement) < 0) return (-1); } while (0)
 131 
 132 /* 63 + 94 = 157 or 94 */
 133 static unsigned short cp950_pua_tbl[][4] = {
 134         {0xe000,0xe310,0xfa40,0xfefe},
 135         {0xe311,0xeeb7,0x8e40,0xa0fe},
 136         {0xeeb8,0xf6b0,0x8140,0x8dfe},
 137         {0xf6b1,0xf70e,0xc6a1,0xc6fe},
 138         {0xf70f,0xf848,0xc740,0xc8fe},
 139 };
 140 
 141 /*
 142  * Big5 => wchar
 143  */
 144 int
 145 mbfl_filt_conv_big5_wchar(int c, mbfl_convert_filter *filter)
 146 {
 147         int k;
 148         int c1, w, c2;
 149 
 150         switch (filter->status) {
 151         case 0:
 152                 if (filter->from->no_encoding == mbfl_no_encoding_cp950) {
 153                         c1 = 0x80;
 154                 } else {
 155                         c1 = 0xa0;
 156                 }
 157 
 158                 if (c >= 0 && c <= 0x80) {      /* latin */
 159                         CK((*filter->output_function)(c, filter->data));
 160                 } else if (c == 0xff) {
 161                         CK((*filter->output_function)(0xf8f8, filter->data));
 162                 } else if (c > c1 && c < 0xff) {        /* dbcs lead byte */
 163                         filter->status = 1;
 164                         filter->cache = c;
 165                 } else {
 166                         w = c & MBFL_WCSGROUP_MASK;
 167                         w |= MBFL_WCSGROUP_THROUGH;
 168                         CK((*filter->output_function)(w, filter->data));
 169                 }
 170                 break;
 171 
 172         case 1:         /* dbcs second byte */
 173                 filter->status = 0;
 174                 c1 = filter->cache;
 175                 if ((c > 0x39 && c < 0x7f) | (c > 0xa0 && c < 0xff)) {
 176                         if (c < 0x7f){
 177                                 w = (c1 - 0xa1)*157 + (c - 0x40);
 178                         } else {
 179                                 w = (c1 - 0xa1)*157 + (c - 0xa1) + 0x3f;
 180                         }
 181                         if (w >= 0 && w < big5_ucs_table_size) {
 182                                 w = big5_ucs_table[w];
 183                         } else {
 184                                 w = 0;
 185                         }
 186 
 187                         if (filter->from->no_encoding == mbfl_no_encoding_cp950) {
 188                                 /* PUA for CP950 */
 189                                 if (w <= 0 &&
 190                                         (((c1 >= 0xfa && c1 <= 0xfe) || (c1 >= 0x8e && c1 <= 0xa0) ||
 191                                           (c1 >= 0x81 && c1 <= 0x8d) ||(c1 >= 0xc7 && c1 <= 0xc8))
 192                                          && ((c > 0x39 && c < 0x7f) || (c > 0xa0 && c < 0xff))) ||
 193                                         ((c1 == 0xc6) && (c > 0xa0 && c < 0xff))) {
 194                                         c2 = c1 << 8 | c;
 195                                         for (k = 0; k < sizeof(cp950_pua_tbl)/(sizeof(unsigned short)*4); k++) {
 196                                                 if (c2 >= cp950_pua_tbl[k][2] && c2 <= cp950_pua_tbl[k][3]) {
 197                                                         break;
 198                                                 }
 199                                         }
 200 
 201                                         if ((cp950_pua_tbl[k][2] & 0xff) == 0x40) {
 202                                                 w = 157*(c1 - (cp950_pua_tbl[k][2]>>8)) + c - (c >= 0xa1 ? 0x62 : 0x40)
 203                                                         + cp950_pua_tbl[k][0];
 204                                         } else {
 205                                                 w = c2 - cp950_pua_tbl[k][2] + cp950_pua_tbl[k][0];
 206                                         }
 207                                 }
 208                         }
 209 
 210                         if (w <= 0) {
 211                                 w = (c1 << 8) | c;
 212                                 w &= MBFL_WCSPLANE_MASK;
 213                                 w |= MBFL_WCSPLANE_BIG5;
 214                         }
 215                         CK((*filter->output_function)(w, filter->data));
 216                 } else if ((c >= 0 && c < 0x21) || c == 0x7f) {         /* CTLs */
 217                         CK((*filter->output_function)(c, filter->data));
 218                 } else {
 219                         w = (c1 << 8) | c;
 220                         w &= MBFL_WCSGROUP_MASK;
 221                         w |= MBFL_WCSGROUP_THROUGH;
 222                         CK((*filter->output_function)(w, filter->data));
 223                 }
 224                 break;
 225 
 226         default:
 227                 filter->status = 0;
 228                 break;
 229         }
 230 
 231         return c;
 232 }
 233 
 234 /*
 235  * wchar => Big5
 236  */
 237 int
 238 mbfl_filt_conv_wchar_big5(int c, mbfl_convert_filter *filter)
 239 {
 240         int k;
 241         int c1, s, c2;
 242 
 243         s = 0;
 244         if (c >= ucs_a1_big5_table_min && c < ucs_a1_big5_table_max) {
 245                 s = ucs_a1_big5_table[c - ucs_a1_big5_table_min];
 246         } else if (c >= ucs_a2_big5_table_min && c < ucs_a2_big5_table_max) {
 247                 s = ucs_a2_big5_table[c - ucs_a2_big5_table_min];
 248         } else if (c >= ucs_a3_big5_table_min && c < ucs_a3_big5_table_max) {
 249                 s = ucs_a3_big5_table[c - ucs_a3_big5_table_min];
 250         } else if (c >= ucs_i_big5_table_min && c < ucs_i_big5_table_max) {
 251                 s = ucs_i_big5_table[c - ucs_i_big5_table_min];
 252         } else if (c >= ucs_pua_big5_table_min && c < ucs_pua_big5_table_max) {
 253                 s = ucs_pua_big5_table[c - ucs_pua_big5_table_min];
 254         } else if (c >= ucs_r1_big5_table_min && c < ucs_r1_big5_table_max) {
 255                 s = ucs_r1_big5_table[c - ucs_r1_big5_table_min];
 256         } else if (c >= ucs_r2_big5_table_min && c < ucs_r2_big5_table_max) {
 257                 s = ucs_r2_big5_table[c - ucs_r2_big5_table_min];
 258         }
 259 
 260         if (filter->to->no_encoding == mbfl_no_encoding_cp950) {
 261                 if (c >= 0xe000 && c <= 0xf848) { /* PUA for CP950 */
 262                         for (k = 0; k < sizeof(cp950_pua_tbl)/(sizeof(unsigned short)*4); k++) {
 263                                 if (c <= cp950_pua_tbl[k][1]) {
 264                                         break;
 265                                 }
 266                         }
 267                         c1 = c - cp950_pua_tbl[k][0];
 268                         if ((cp950_pua_tbl[k][2] & 0xff) == 0x40) {
 269                                 c2 = cp950_pua_tbl[k][2] >> 8;
 270                                 s = ((c1 / 157) + c2) << 8; c1 %= 157;
 271                                 s |= c1 + (c1 >= 0x3f ? 0x62 : 0x40);
 272                         } else {
 273                                 s = c1 + cp950_pua_tbl[k][2];
 274                         }
 275                 }
 276 
 277                 if (c == 0x80) {
 278                         s = 0x80;
 279                 } else if (c == 0xf8f8) {
 280                         s = 0xff;
 281                 } else if (c == 0x256d) {
 282                         s = 0xa27e;
 283                 } else if (c == 0x256e) {
 284                         s = 0xa2a1;
 285                 } else if (c == 0x256f) {
 286                         s = 0xa2a3;
 287                 } else if (c == 0x2570) {
 288                         s = 0xa2a2;
 289                 }
 290         }
 291 
 292         if (s <= 0) {
 293                 c1 = c & ~MBFL_WCSPLANE_MASK;
 294                 if (c1 == MBFL_WCSPLANE_BIG5) {
 295                         s = c & MBFL_WCSPLANE_MASK;
 296                 }
 297                 if (c == 0) {
 298                         s = 0;
 299                 } else if (s <= 0) {
 300                         s = -1;
 301                 }
 302         }
 303         if (s >= 0) {
 304                 if (s <= 0x80 || s == 0xff) {   /* latin */
 305                         CK((*filter->output_function)(s, filter->data));
 306                 } else {
 307                         CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
 308                         CK((*filter->output_function)(s & 0xff, filter->data));
 309                 }
 310         } else {
 311                 if (filter->illegal_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
 312                         CK(mbfl_filt_conv_illegal_output(c, filter));
 313                 }
 314         }
 315 
 316         return c;
 317 }
 318 
 319 static int mbfl_filt_ident_big5(int c, mbfl_identify_filter *filter)
 320 {
 321         int c1;
 322         if (filter->encoding->no_encoding == mbfl_no_encoding_cp950) {
 323                 c1 = 0x80;
 324         } else {
 325                 c1 = 0xa0;
 326         }
 327 
 328         if (filter->status) {           /* kanji second char */
 329                 if (c < 0x40 || (c > 0x7e && c < 0xa1) ||c > 0xfe) {    /* bad */
 330                     filter->flag = 1;
 331                 }
 332                 filter->status = 0;
 333         } else if (c >= 0 && c < 0x80) {        /* latin  ok */
 334                 ;
 335         } else if (c > c1 && c < 0xff) {        /* DBCS lead byte */
 336                 filter->status = 1;
 337         } else {                                                        /* bad */
 338                 filter->flag = 1;
 339         }
 340 
 341         return c;
 342 }
 343 
 344 

/* [<][>][^][v][top][bottom][index][help] */