root/ext/mbstring/oniguruma/enc/sjis.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. mbc_enc_len
  2. code_to_mbclen
  3. mbc_to_code
  4. code_to_mbc
  5. mbc_case_fold
  6. is_mbc_ambiguous
  7. is_code_ctype
  8. left_adjust_char_head
  9. is_allowed_reverse_match
  10. init_property_list
  11. property_name_to_ctype
  12. is_code_ctype
  13. get_ctype_code_range

   1 /**********************************************************************
   2   sjis.c -  Oniguruma (regular expression library)
   3 **********************************************************************/
   4 /*-
   5  * Copyright (c) 2002-2008  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
   6  * All rights reserved.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27  * SUCH DAMAGE.
  28  */
  29 
  30 #include "regint.h"
  31 
  32 static const int EncLen_SJIS[] = {
  33   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  34   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  35   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  36   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  37   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  38   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  39   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  40   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  41   1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  42   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  43   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  44   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  45   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  46   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  47   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  48   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1
  49 };
  50 
  51 static const char SJIS_CAN_BE_TRAIL_TABLE[256] = {
  52   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  53   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  54   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  55   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  56   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  57   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  58   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  59   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
  60   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  61   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  62   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  63   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  64   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  65   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  66   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  67   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
  68 };
  69 
  70 #define SJIS_ISMB_FIRST(byte)  (EncLen_SJIS[byte] > 1)
  71 #define SJIS_ISMB_TRAIL(byte)  SJIS_CAN_BE_TRAIL_TABLE[(byte)]
  72 
  73 static int
  74 mbc_enc_len(const UChar* p)
  75 {
  76   return EncLen_SJIS[*p];
  77 }
  78 
  79 static int
  80 code_to_mbclen(OnigCodePoint code)
  81 {
  82   if (code < 256) {
  83     if (EncLen_SJIS[(int )code] == 1)
  84       return 1;
  85     else
  86       return 0;
  87   }
  88   else if (code <= 0xffff) {
  89     return 2;
  90   }
  91   else
  92     return ONIGERR_INVALID_CODE_POINT_VALUE;
  93 }
  94 
  95 static OnigCodePoint
  96 mbc_to_code(const UChar* p, const UChar* end)
  97 {
  98   int c, i, len;
  99   OnigCodePoint n;
 100 
 101   len = enclen(ONIG_ENCODING_SJIS, p);
 102   c = *p++;
 103   n = c;
 104   if (len == 1) return n;
 105 
 106   for (i = 1; i < len; i++) {
 107     if (p >= end) break;
 108     c = *p++;
 109     n <<= 8;  n += c;
 110   }
 111   return n;
 112 }
 113 
 114 static int
 115 code_to_mbc(OnigCodePoint code, UChar *buf)
 116 {
 117   UChar *p = buf;
 118 
 119   if ((code & 0xff00) != 0) *p++ = (UChar )(((code >>  8) & 0xff));
 120   *p++ = (UChar )(code & 0xff);
 121 
 122 #if 0
 123   if (enclen(ONIG_ENCODING_SJIS, buf) != (p - buf))
 124     return REGERR_INVALID_CODE_POINT_VALUE;
 125 #endif
 126   return p - buf;
 127 }
 128 
 129 static int
 130 mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED,
 131               const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower)
 132 {
 133   const UChar* p = *pp;
 134 
 135   if (ONIGENC_IS_MBC_ASCII(p)) {
 136     *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
 137     (*pp)++;
 138     return 1;
 139   }
 140   else {
 141     int i;
 142     int len = enclen(ONIG_ENCODING_SJIS, p);
 143 
 144     for (i = 0; i < len; i++) {
 145       *lower++ = *p++;
 146     }
 147     (*pp) += len;
 148     return len; /* return byte length of converted char to lower */
 149   }
 150 }
 151 
 152 #if 0
 153 static int
 154 is_mbc_ambiguous(OnigCaseFoldType flag,
 155                  const UChar** pp, const UChar* end)
 156 {
 157   return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_SJIS, flag, pp, end);
 158                                       
 159 }
 160 #endif
 161 
 162 #if 0
 163 static int
 164 is_code_ctype(OnigCodePoint code, unsigned int ctype)
 165 {
 166   if (code < 128)
 167     return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
 168   else {
 169     if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
 170       return (code_to_mbclen(code) > 1 ? TRUE : FALSE);
 171     }
 172   }
 173 
 174   return FALSE;
 175 }
 176 #endif
 177 
 178 static UChar*
 179 left_adjust_char_head(const UChar* start, const UChar* s)
 180 {
 181   const UChar *p;
 182   int len;
 183 
 184   if (s <= start) return (UChar* )s;
 185   p = s;
 186 
 187   if (SJIS_ISMB_TRAIL(*p)) {
 188     while (p > start) {
 189       if (! SJIS_ISMB_FIRST(*--p)) {
 190         p++;
 191         break;
 192       }
 193     } 
 194   }
 195   len = enclen(ONIG_ENCODING_SJIS, p);
 196   if (p + len > s) return (UChar* )p;
 197   p += len;
 198   return (UChar* )(p + ((s - p) & ~1));
 199 }
 200 
 201 static int
 202 is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED)
 203 {
 204   const UChar c = *s;
 205   return (SJIS_ISMB_TRAIL(c) ? FALSE : TRUE);
 206 }
 207 
 208 
 209 static int PropertyInited = 0;
 210 static const OnigCodePoint** PropertyList;
 211 static int PropertyListNum;
 212 static int PropertyListSize;
 213 static hash_table_type* PropertyNameTable;
 214 
 215 static const OnigCodePoint CR_Hiragana[] = {
 216   1,
 217   0x829f, 0x82f1
 218 }; /* CR_Hiragana */
 219 
 220 static const OnigCodePoint CR_Katakana[] = {
 221   4,
 222   0x00a6, 0x00af,
 223   0x00b1, 0x00dd,
 224   0x8340, 0x837e,
 225   0x8380, 0x8396,
 226 }; /* CR_Katakana */
 227 
 228 static int
 229 init_property_list(void)
 230 {
       /*
        * Registers the CR_Hiragana and CR_Katakana tables under the names
        * "Hiragana" and "Katakana", then marks the registry as initialized.
        * Returns the value left in `r`.
        *
        * NOTE(review): `r` and the `end` label are never touched directly in
        * this function; PROPERTY_LIST_ADD_PROP (defined outside this file)
        * presumably assigns `r` and jumps to `end` on failure -- confirm
        * against the macro definition before restructuring.
        */
 231   int r;
 232 
 233   PROPERTY_LIST_ADD_PROP("Hiragana", CR_Hiragana);
 234   PROPERTY_LIST_ADD_PROP("Katakana", CR_Katakana);
       /* Reached only after both registrations above have run. */
 235   PropertyInited = 1;
 236 
 237  end:
 238   return r;
 239 }
 240 
 241 static int
 242 property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end)
 243 {
 244   hash_data_type ctype;
 245 
 246   PROPERTY_LIST_INIT_CHECK;
 247 
 248   if (onig_st_lookup_strend(PropertyNameTable, p, end, &ctype) == 0) {
 249     return onigenc_minimum_property_name_to_ctype(enc, p, end);
 250   }
 251 
 252   return (int )ctype;
 253 }
 254 
 255 static int
 256 is_code_ctype(OnigCodePoint code, unsigned int ctype)
 257 {
 258   if (ctype <= ONIGENC_MAX_STD_CTYPE) {
 259     if (code < 128)
 260       return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
 261     else {
 262       if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
 263         return (code_to_mbclen(code) > 1 ? TRUE : FALSE);
 264       }
 265     }
 266   }
 267   else {
 268     PROPERTY_LIST_INIT_CHECK;
 269 
 270     ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
 271     if (ctype >= (unsigned int )PropertyListNum)
 272       return ONIGERR_TYPE_BUG;
 273 
 274     return onig_is_in_code_range((UChar* )PropertyList[ctype], code);
 275   }
 276 
 277   return FALSE;
 278 }
 279 
 280 static int
 281 get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out,
 282                      const OnigCodePoint* ranges[])
 283 {
 284   if (ctype <= ONIGENC_MAX_STD_CTYPE) {
 285     return ONIG_NO_SUPPORT_CONFIG;
 286   }
 287   else {
 288     *sb_out = 0x80;
 289 
 290     PROPERTY_LIST_INIT_CHECK;
 291 
 292     ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
 293     if (ctype >= (OnigCtype )PropertyListNum)
 294       return ONIGERR_TYPE_BUG;
 295 
 296     *ranges = PropertyList[ctype];
 297     return 0;
 298   }
 299 }
 300 
 301 OnigEncodingType OnigEncodingSJIS = {
       /*
        * Encoding descriptor for Shift_JIS.  The initializer is positional,
        * so entry order must match the member order of OnigEncodingType
        * (declared outside this file).  Entries named onigenc_* are not
        * defined in this file; all other function entries are the static
        * functions defined above.
        */
 302   mbc_enc_len,
 303   "Shift_JIS",   /* name */
 304   2,             /* max byte length */
 305   1,             /* min byte length */
 306   onigenc_is_mbc_newline_0x0a,
 307   mbc_to_code,
 308   code_to_mbclen,
 309   code_to_mbc,
 310   mbc_case_fold,
 311   onigenc_ascii_apply_all_case_fold,
 312   onigenc_ascii_get_case_fold_codes_by_str,
 313   property_name_to_ctype,
 314   is_code_ctype,         /* the active definition, not the #if 0 one */
 315   get_ctype_code_range,
 316   left_adjust_char_head,
 317   is_allowed_reverse_match
 318 };

/* [<][>][^][v][top][bottom][index][help] */