root/ext/mbstring/oniguruma/enc/utf8.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. mbc_enc_len
  2. is_mbc_newline
  3. mbc_to_code
  4. code_to_mbclen
  5. code_to_mbc
  6. mbc_case_fold
  7. is_mbc_ambiguous
  8. get_ctype_code_range
  9. left_adjust_char_head
  10. get_case_fold_codes_by_str

   1 /**********************************************************************
   2   utf8.c -  Oniguruma (regular expression library)
   3 **********************************************************************/
   4 /*-
   5  * Copyright (c) 2002-2007  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
   6  * All rights reserved.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27  * SUCH DAMAGE.
  28  */
  29 
  30 #include "regenc.h"
  31 
  32 #define USE_INVALID_CODE_SCHEME /* enables the 0xfe/0xff virtual code point handling guarded by #ifdef below */
  33 
  34 #ifdef USE_INVALID_CODE_SCHEME
  35 /* virtual codepoint values for invalid encoding byte 0xfe and 0xff */
  36 #define INVALID_CODE_FE   0xfffffffe /* code point reported for a lone 0xfe byte */
  37 #define INVALID_CODE_FF   0xffffffff /* code point reported for a lone 0xff byte */
  38 #define VALID_CODE_LIMIT  0x7fffffff /* largest 31-bit value; not referenced elsewhere in this listing */
  39 #endif
  40 
  41 #define utf8_islead(c)     ((UChar )((c) & 0xc0) != 0x80) /* true unless c has the 10xxxxxx continuation-byte pattern */
  42 
  43 static const int EncLen_UTF8[] = { /* sequence length in bytes, indexed by the value of the first byte */
  44   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00-0x0f */
  45   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10-0x1f */
  46   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20-0x2f */
  47   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30-0x3f */
  48   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4f */
  49   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50-0x5f */
  50   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6f */
  51   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70-0x7f */
  52   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x80-0x8f: continuation bytes, counted as length 1 */
  53   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x90-0x9f */
  54   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xa0-0xaf */
  55   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xb0-0xbf */
  56   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xc0-0xcf: 2-byte lead */
  57   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xd0-0xdf: 2-byte lead */
  58   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xe0-0xef: 3-byte lead */
  59   4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1  /* 0xf0-0xf7: 4, 0xf8-0xfb: 5, 0xfc-0xfd: 6, 0xfe/0xff: 1 */
  60 };
  61 
  62 static int
  63 mbc_enc_len(const UChar* p)
  64 {
  65   return EncLen_UTF8[*p];
  66 }
  67 
  68 static int
  69 is_mbc_newline(const UChar* p, const UChar* end)
  70 {
  71   if (p < end) {
  72     if (*p == 0x0a) return 1;
  73 
  74 #ifdef USE_UNICODE_ALL_LINE_TERMINATORS
  75 #ifndef USE_CRNL_AS_LINE_TERMINATOR
  76     if (*p == 0x0d) return 1;
  77 #endif
  78     if (p + 1 < end) {
  79       if (*(p+1) == 0x85 && *p == 0xc2) /* U+0085 */
  80         return 1;
  81       if (p + 2 < end) {
  82         if ((*(p+2) == 0xa8 || *(p+2) == 0xa9)
  83             && *(p+1) == 0x80 && *p == 0xe2)  /* U+2028, U+2029 */
  84           return 1;
  85       }
  86     }
  87 #endif
  88   }
  89 
  90   return 0;
  91 }
  92 
  93 static OnigCodePoint
  94 mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED)
  95 {
  96   int c, len;
  97   OnigCodePoint n;
  98 
  99   len = enclen(ONIG_ENCODING_UTF8, p);
 100   c = *p++;
 101   if (len > 1) {
 102     len--;
 103     n = c & ((1 << (6 - len)) - 1);
 104     while (len--) {
 105       c = *p++;
 106       n = (n << 6) | (c & ((1 << 6) - 1));
 107     }
 108     return n;
 109   }
 110   else {
 111 #ifdef USE_INVALID_CODE_SCHEME
 112     if (c > 0xfd) {
 113       return ((c == 0xfe) ? INVALID_CODE_FE : INVALID_CODE_FF);
 114     }
 115 #endif
 116     return (OnigCodePoint )c;
 117   }
 118 }
 119 
 120 static int
 121 code_to_mbclen(OnigCodePoint code)
 122 {
 123   if      ((code & 0xffffff80) == 0) return 1;
 124   else if ((code & 0xfffff800) == 0) return 2;
 125   else if ((code & 0xffff0000) == 0) return 3;
 126   else if ((code & 0xffe00000) == 0) return 4;
 127   else if ((code & 0xfc000000) == 0) return 5;
 128   else if ((code & 0x80000000) == 0) return 6;
 129 #ifdef USE_INVALID_CODE_SCHEME
 130   else if (code == INVALID_CODE_FE) return 1;
 131   else if (code == INVALID_CODE_FF) return 1;
 132 #endif
 133   else
 134     return ONIGERR_INVALID_CODE_POINT_VALUE;
 135 }
 136 
 137 static int
 138 code_to_mbc(OnigCodePoint code, UChar *buf)
 139 {
 140 #define UTF8_TRAILS(code, shift) (UChar )((((code) >> (shift)) & 0x3f) | 0x80)
 141 #define UTF8_TRAIL0(code)        (UChar )(((code) & 0x3f) | 0x80)
 142 
 143   if ((code & 0xffffff80) == 0) {
 144     *buf = (UChar )code;
 145     return 1;
 146   }
 147   else {
 148     UChar *p = buf;
 149 
 150     if ((code & 0xfffff800) == 0) {
 151       *p++ = (UChar )(((code>>6)& 0x1f) | 0xc0);
 152     }
 153     else if ((code & 0xffff0000) == 0) {
 154       *p++ = (UChar )(((code>>12) & 0x0f) | 0xe0);
 155       *p++ = UTF8_TRAILS(code, 6);
 156     }
 157     else if ((code & 0xffe00000) == 0) {
 158       *p++ = (UChar )(((code>>18) & 0x07) | 0xf0);
 159       *p++ = UTF8_TRAILS(code, 12);
 160       *p++ = UTF8_TRAILS(code,  6);
 161     }
 162     else if ((code & 0xfc000000) == 0) {
 163       *p++ = (UChar )(((code>>24) & 0x03) | 0xf8);
 164       *p++ = UTF8_TRAILS(code, 18);
 165       *p++ = UTF8_TRAILS(code, 12);
 166       *p++ = UTF8_TRAILS(code,  6);
 167     }
 168     else if ((code & 0x80000000) == 0) {
 169       *p++ = (UChar )(((code>>30) & 0x01) | 0xfc);
 170       *p++ = UTF8_TRAILS(code, 24);
 171       *p++ = UTF8_TRAILS(code, 18);
 172       *p++ = UTF8_TRAILS(code, 12);
 173       *p++ = UTF8_TRAILS(code,  6);
 174     }
 175 #ifdef USE_INVALID_CODE_SCHEME
 176     else if (code == INVALID_CODE_FE) {
 177       *p = 0xfe;
 178       return 1;
 179     }
 180     else if (code == INVALID_CODE_FF) {
 181       *p = 0xff;
 182       return 1;
 183     }
 184 #endif
 185     else {
 186       return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
 187     }
 188 
 189     *p++ = UTF8_TRAIL0(code);
 190     return p - buf;
 191   }
 192 }
 193 
 194 static int
 195 mbc_case_fold(OnigCaseFoldType flag, const UChar** pp,
 196               const UChar* end, UChar* fold)
 197 {
 198   const UChar* p = *pp;
 199 
 200   if (ONIGENC_IS_MBC_ASCII(p)) {
 201 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
 202     if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
 203       if (*p == 0x49) {
 204         *fold++ = 0xc4;
 205         *fold   = 0xb1;
 206         (*pp)++;
 207         return 2;
 208       }
 209     }
 210 #endif
 211 
 212     *fold = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
 213     (*pp)++;
 214     return 1; /* return byte length of converted char to lower */
 215   }
 216   else {
 217     return onigenc_unicode_mbc_case_fold(ONIG_ENCODING_UTF8, flag,
 218                                          pp, end, fold);
 219   }
 220 }
 221 
 222 #if 0 /* is_mbc_ambiguous is excluded from compilation by this #if 0 */
 223 static int
 224 is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end)
 225 {
 226   const UChar* p = *pp; /* start of the character; *pp itself is advanced past it below */
 227 
 228   if (ONIGENC_IS_MBC_ASCII(p)) {
 229     (*pp)++;
 230     return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); /* p still points at the consumed byte */
 231   }
 232   else {
 233     (*pp) += enclen(ONIG_ENCODING_UTF8, p);
 234 
 235     if (*p == 0xc3) { /* lead byte 0xc3 with trail 0x80-0xbf encodes U+00C0-U+00FF */
 236       int c = *(p + 1);
 237       if (c >= 0x80) {
 238         if (c <= (UChar )0x9e) { /* upper */
 239           if (c == (UChar )0x97) return FALSE; /* 0xc3 0x97 is U+00D7, excluded */
 240           return TRUE;
 241         }
 242         else if (c >= (UChar )0xa0 && c <= (UChar )0xbe) { /* lower */
 243           if (c == (UChar )'\267') return FALSE; /* octal 267 == 0xb7: 0xc3 0xb7 is U+00F7, excluded */
 244           return TRUE;
 245         }
 246         else if (c == (UChar )0x9f && /* 0xc3 0x9f is U+00DF; ambiguous only with multi-char folding */
 247                  (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
 248           return TRUE;
 249         }
 250       }
 251     }
 252   }
 253 
 254   return FALSE; /* any other non-ASCII character is reported as not ambiguous */
 255 }
 256 #endif
 257 
 258 
 259 static int
 260 get_ctype_code_range(OnigCtype ctype, OnigCodePoint *sb_out,
 261                      const OnigCodePoint* ranges[])
 262 {
 263   *sb_out = 0x80;
 264   return onigenc_unicode_ctype_code_range(ctype, ranges);
 265 }
 266 
 267 
 268 static UChar*
 269 left_adjust_char_head(const UChar* start, const UChar* s)
 270 {
 271   const UChar *p;
 272 
 273   if (s <= start) return (UChar* )s;
 274   p = s;
 275 
 276   while (!utf8_islead(*p) && p > start) p--;
 277   return (UChar* )p;
 278 }
 279 
 280 static int
 281 get_case_fold_codes_by_str(OnigCaseFoldType flag,
 282     const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
 283 {
 284   return onigenc_unicode_get_case_fold_codes_by_str(ONIG_ENCODING_UTF8,
 285                                                     flag, p, end, items);
 286 }
 287 
 288 OnigEncodingType OnigEncodingUTF8 = { /* UTF-8 encoding descriptor: positional initializer, entry order must match OnigEncodingType */
 289   mbc_enc_len, /* character byte length from its first byte (EncLen_UTF8 lookup) */
 290   "UTF-8",     /* name */
 291   6,           /* max byte length */
 292   1,           /* min byte length */
 293   is_mbc_newline, /* line-terminator test at a position */
 294   mbc_to_code, /* decode bytes to a code point */
 295   code_to_mbclen, /* encoded byte length of a code point */
 296   code_to_mbc, /* encode a code point into a byte buffer */
 297   mbc_case_fold, /* case-fold one character */
 298   onigenc_unicode_apply_all_case_fold, /* not defined in this file */
 299   get_case_fold_codes_by_str, /* wrapper around onigenc_unicode_get_case_fold_codes_by_str */
 300   onigenc_unicode_property_name_to_ctype, /* not defined in this file */
 301   onigenc_unicode_is_code_ctype, /* not defined in this file */
 302   get_ctype_code_range, /* sets *sb_out to 0x80, then calls onigenc_unicode_ctype_code_range */
 303   left_adjust_char_head, /* back up to the start of the character containing a position */
 304   onigenc_always_true_is_allowed_reverse_match /* not defined in this file */
 305 };

/* [<][>][^][v][top][bottom][index][help] */