root/ext/mbstring/oniguruma/enc/euc_jp.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. mbc_enc_len
  2. mbc_to_code
  3. code_to_mbclen
  4. code_to_mbc_first
  5. code_to_mbc
  6. mbc_case_fold
  7. left_adjust_char_head
  8. is_allowed_reverse_match
  9. init_property_list
  10. property_name_to_ctype
  11. is_code_ctype
  12. get_ctype_code_range

   1 /**********************************************************************
   2   euc_jp.c -  Oniguruma (regular expression library)
   3 **********************************************************************/
   4 /*-
   5  * Copyright (c) 2002-2008  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
   6  * All rights reserved.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27  * SUCH DAMAGE.
  28  */
  29 
  30 #include "regint.h"
  31 
/* Nonzero when byte c lies OUTSIDE the range 0xa1..0xfe.
   The difference (c - 0xa1) is narrowed to UChar before the comparison, so
   values below 0xa1 are expected to wrap around to large values
   (NOTE(review): assumes UChar is an unsigned 8-bit type -- confirm).
   Despite the name, this is also true for ASCII and other single bytes;
   left_adjust_char_head() scans backwards until it holds. */
#define eucjp_islead(c)    ((UChar )((c) - 0xa1) > 0xfe - 0xa1)
  33 
  34 static const int EncLen_EUCJP[] = {
  35   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  36   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  37   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  38   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  39   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  40   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  41   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  42   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  43   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
  44   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  45   1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  46   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  47   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  48   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  49   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  50   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
  51 };
  52 
  53 static int
  54 mbc_enc_len(const UChar* p)
  55 {
  56   return EncLen_EUCJP[*p];
  57 }
  58 
  59 static OnigCodePoint
  60 mbc_to_code(const UChar* p, const UChar* end)
  61 {
  62   int c, i, len;
  63   OnigCodePoint n;
  64 
  65   len = enclen(ONIG_ENCODING_EUC_JP, p);
  66   n = (OnigCodePoint )*p++;
  67   if (len == 1) return n;
  68 
  69   for (i = 1; i < len; i++) {
  70     if (p >= end) break;
  71     c = *p++;
  72     n <<= 8;  n += c;
  73   }
  74   return n;
  75 }
  76 
  77 static int
  78 code_to_mbclen(OnigCodePoint code)
  79 {
  80   if (ONIGENC_IS_CODE_ASCII(code)) return 1;
  81   else if ((code & 0xff0000) != 0) return 3;
  82   else if ((code &   0xff00) != 0) return 2;
  83   else
  84     return ONIGERR_INVALID_CODE_POINT_VALUE;
  85 }
  86 
  87 #if 0
  88 static int
  89 code_to_mbc_first(OnigCodePoint code)
  90 {
  91   int first;
  92 
  93   if ((code & 0xff0000) != 0) {
  94     first = (code >> 16) & 0xff;
  95   }
  96   else if ((code & 0xff00) != 0) {
  97     first = (code >> 8) & 0xff;
  98   }
  99   else {
 100     return (int )code;
 101   }
 102   return first;
 103 }
 104 #endif
 105 
 106 static int
 107 code_to_mbc(OnigCodePoint code, UChar *buf)
 108 {
 109   UChar *p = buf;
 110 
 111   if ((code & 0xff0000) != 0) *p++ = (UChar )(((code >> 16) & 0xff));
 112   if ((code &   0xff00) != 0) *p++ = (UChar )(((code >>  8) & 0xff));
 113   *p++ = (UChar )(code & 0xff);
 114 
 115 #if 1
 116   if (enclen(ONIG_ENCODING_EUC_JP, buf) != (p - buf))
 117     return ONIGERR_INVALID_CODE_POINT_VALUE;
 118 #endif  
 119   return p - buf;
 120 }
 121 
 122 static int
 123 mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED,
 124               const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower)
 125 {
 126   int len;
 127   const UChar* p = *pp;
 128 
 129   if (ONIGENC_IS_MBC_ASCII(p)) {
 130     *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
 131     (*pp)++;
 132     return 1;
 133   }
 134   else {
 135     int i;
 136 
 137     len = enclen(ONIG_ENCODING_EUC_JP, p);
 138     for (i = 0; i < len; i++) {
 139       *lower++ = *p++;
 140     }
 141     (*pp) += len;
 142     return len; /* return byte length of converted char to lower */
 143   }
 144 }
 145 
 146 static UChar*
 147 left_adjust_char_head(const UChar* start, const UChar* s)
 148 {
 149   /* In this encoding
 150      mb-trail bytes doesn't mix with single bytes.
 151   */
 152   const UChar *p;
 153   int len;
 154 
 155   if (s <= start) return (UChar* )s;
 156   p = s;
 157 
 158   while (!eucjp_islead(*p) && p > start) p--;
 159   len = enclen(ONIG_ENCODING_EUC_JP, p);
 160   if (p + len > s) return (UChar* )p;
 161   p += len;
 162   return (UChar* )(p + ((s - p) & ~1));
 163 }
 164 
 165 static int
 166 is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED)
 167 {
 168   const UChar c = *s;
 169   if (c <= 0x7e || c == 0x8e || c == 0x8f)
 170     return TRUE;
 171   else
 172     return FALSE;
 173 }
 174 
 175 
/* File-scope state for the encoding-specific property list. */
static int PropertyInited = 0;            /* set to 1 at the end of init_property_list() */
static const OnigCodePoint** PropertyList; /* code-range tables, indexed by (ctype - ONIGENC_MAX_STD_CTYPE - 1) */
static int PropertyListNum;               /* upper bound used to validate indexes into PropertyList */
static int PropertyListSize;              /* not referenced directly in this file; presumably used by the PROPERTY_LIST_* macros -- confirm */
static hash_table_type* PropertyNameTable; /* looked up by property name in property_name_to_ctype() */
 181 
/* Code ranges registered under the property name "Hiragana".
   The leading element matches the number of (first, last) pairs that
   follow. */
static const OnigCodePoint CR_Hiragana[] = {
  1,
  0xa4a1, 0xa4f3
}; /* CR_Hiragana */
 186 
 187 static const OnigCodePoint CR_Katakana[] = {
 188   3,
 189   0xa5a1, 0xa5f6,
 190   0xaaa6, 0xaaaf,
 191   0xaab1, 0xaadd
 192 }; /* CR_Katakana */
 193 
/* Registers this encoding's extra properties, "Hiragana" and "Katakana",
   and then marks the list as initialized by setting PropertyInited.
   Returns r.
   NOTE(review): `r` is never assigned in the visible code and nothing
   visible jumps to `end`, so PROPERTY_LIST_ADD_PROP presumably assigns `r`
   and does `goto end` on failure -- confirm at the macro's definition. */
static int
init_property_list(void)
{
  int r;

  PROPERTY_LIST_ADD_PROP("Hiragana", CR_Hiragana);
  PROPERTY_LIST_ADD_PROP("Katakana", CR_Katakana);
  PropertyInited = 1;

 end:
  return r;
}
 206 
 207 static int
 208 property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end)
 209 {
 210   hash_data_type ctype;
 211 
 212   PROPERTY_LIST_INIT_CHECK;
 213 
 214   if (onig_st_lookup_strend(PropertyNameTable, p, end, &ctype) == 0) {
 215     return onigenc_minimum_property_name_to_ctype(enc, p, end);
 216   }
 217 
 218   return (int )ctype;
 219 }
 220 
 221 static int
 222 is_code_ctype(OnigCodePoint code, unsigned int ctype)
 223 {
 224   if (ctype <= ONIGENC_MAX_STD_CTYPE) {
 225     if (code < 128)
 226       return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
 227     else {
 228       if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
 229         return (code_to_mbclen(code) > 1 ? TRUE : FALSE);
 230       }
 231     }
 232   }
 233   else {
 234     PROPERTY_LIST_INIT_CHECK;
 235 
 236     ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
 237     if (ctype >= (unsigned int )PropertyListNum)
 238       return ONIGERR_TYPE_BUG;
 239 
 240     return onig_is_in_code_range((UChar* )PropertyList[ctype], code);
 241   }
 242 
 243   return FALSE;
 244 }
 245 
 246 static int
 247 get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out,
 248                      const OnigCodePoint* ranges[])
 249 {
 250   if (ctype <= ONIGENC_MAX_STD_CTYPE) {
 251     return ONIG_NO_SUPPORT_CONFIG;
 252   }
 253   else {
 254     *sb_out = 0x80;
 255 
 256     PROPERTY_LIST_INIT_CHECK;
 257 
 258     ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
 259     if (ctype >= (OnigCtype )PropertyListNum)
 260       return ONIGERR_TYPE_BUG;
 261 
 262     *ranges = PropertyList[ctype];
 263     return 0;
 264   }
 265 }
 266 
 267 
/* Encoding descriptor for EUC-JP.  It combines the static callbacks defined
   in this file with shared onigenc_* helpers (newline test and the
   ASCII-only case-fold routines).  The initializer is positional, so the
   entries must stay in the order of the OnigEncodingType members. */
OnigEncodingType OnigEncodingEUC_JP = {
  mbc_enc_len,
  "EUC-JP",   /* name */
  3,          /* max enc length */
  1,          /* min enc length */
  onigenc_is_mbc_newline_0x0a,
  mbc_to_code,
  code_to_mbclen,
  code_to_mbc,
  mbc_case_fold,
  onigenc_ascii_apply_all_case_fold,
  onigenc_ascii_get_case_fold_codes_by_str,
  property_name_to_ctype,
  is_code_ctype,
  get_ctype_code_range,
  left_adjust_char_head,
  is_allowed_reverse_match
};

/* [<][>][^][v][top][bottom][index][help] */