root/ext/mbstring/oniguruma/enc/utf16_be.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. utf16be_mbc_enc_len
  2. utf16be_is_mbc_newline
  3. utf16be_mbc_to_code
  4. utf16be_code_to_mbclen
  5. utf16be_code_to_mbc
  6. utf16be_mbc_case_fold
  7. utf16be_is_mbc_ambiguous
  8. utf16be_left_adjust_char_head
  9. utf16be_get_case_fold_codes_by_str

   1 /**********************************************************************
   2   utf16_be.c -  Oniguruma (regular expression library)
   3 **********************************************************************/
   4 /*-
   5  * Copyright (c) 2002-2008  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
   6  * All rights reserved.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27  * SUCH DAMAGE.
  28  */
  29 
  30 #include "regenc.h"
  31 
  32 static const int EncLen_UTF16[] = {
  33   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  34   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  35   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  36   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  37   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  38   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  39   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  40   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  41   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  42   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  43   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  44   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  45   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  46   2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2,
  47   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  48   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
  49 };
  50 
  51 static int
  52 utf16be_mbc_enc_len(const UChar* p)
  53 {
  54   return EncLen_UTF16[*p];
  55 }
  56 
  57 static int
  58 utf16be_is_mbc_newline(const UChar* p, const UChar* end)
  59 {
  60   if (p + 1 < end) {
  61     if (*(p+1) == 0x0a && *p == 0x00)
  62       return 1;
  63 #ifdef USE_UNICODE_ALL_LINE_TERMINATORS
  64     if ((
  65 #ifndef USE_CRNL_AS_LINE_TERMINATOR
  66          *(p+1) == 0x0d ||
  67 #endif
  68          *(p+1) == 0x85) && *p == 0x00)
  69       return 1;
  70     if (*p == 0x20 && (*(p+1) == 0x29 || *(p+1) == 0x28))
  71       return 1;
  72 #endif
  73   }
  74   return 0;
  75 }
  76 
  77 static OnigCodePoint
  78 utf16be_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED)
  79 {
  80   OnigCodePoint code;
  81 
  82   if (UTF16_IS_SURROGATE_FIRST(*p)) {
  83     code = ((((p[0] - 0xd8) << 2) + ((p[1] & 0xc0) >> 6) + 1) << 16)
  84          + ((((p[1] & 0x3f) << 2) + (p[2] - 0xdc)) << 8)
  85          + p[3];
  86   }
  87   else {
  88     code = p[0] * 256 + p[1];
  89   }
  90   return code;
  91 }
  92 
  93 static int
  94 utf16be_code_to_mbclen(OnigCodePoint code)
  95 {
  96   return (code > 0xffff ? 4 : 2);
  97 }
  98 
  99 static int
 100 utf16be_code_to_mbc(OnigCodePoint code, UChar *buf)
 101 {
 102   UChar* p = buf;
 103 
 104   if (code > 0xffff) {
 105     unsigned int plane, high;
 106 
 107     plane = (code >> 16) - 1;
 108     *p++ = (plane >> 2) + 0xd8;
 109     high = (code & 0xff00) >> 8;
 110     *p++ = ((plane & 0x03) << 6) + (high >> 2);
 111     *p++ = (high & 0x03) + 0xdc;
 112     *p   = (UChar )(code & 0xff);
 113     return 4;
 114   }
 115   else {
 116     *p++ = (UChar )((code & 0xff00) >> 8);
 117     *p++ = (UChar )(code & 0xff);
 118     return 2;
 119   }
 120 }
 121 
 122 static int
 123 utf16be_mbc_case_fold(OnigCaseFoldType flag,
 124                       const UChar** pp, const UChar* end, UChar* fold)
 125 {
 126   const UChar* p = *pp;
 127 
 128   if (ONIGENC_IS_ASCII_CODE(*(p+1)) && *p == 0) {
 129     p++;
 130 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
 131     if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
 132       if (*p == 0x49) {
 133         *fold++ = 0x01;
 134         *fold   = 0x31;
 135         (*pp) += 2;
 136         return 2;
 137       }
 138     }
 139 #endif
 140 
 141     *fold++ = 0;
 142     *fold   = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
 143     *pp += 2;
 144     return 2;
 145   }
 146   else
 147     return onigenc_unicode_mbc_case_fold(ONIG_ENCODING_UTF16_BE, flag,
 148                                          pp, end, fold);
 149 }
 150 
 151 #if 0
 152 static int
 153 utf16be_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end)
 154 {
 155   const UChar* p = *pp;
 156 
 157   (*pp) += EncLen_UTF16[*p];
 158 
 159   if (*p == 0) {
 160     int c, v;
 161 
 162     p++;
 163     if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
 164       return TRUE;
 165     }
 166 
 167     c = *p;
 168     v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c,
 169                 (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER));
 170 
 171     if ((v | BIT_CTYPE_LOWER) != 0) {
 172       /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
 173       if (c >= 0xaa && c <= 0xba)
 174         return FALSE;
 175       else
 176         return TRUE;
 177     }
 178     return (v != 0 ? TRUE : FALSE);
 179   }
 180 
 181   return FALSE;
 182 }
 183 #endif
 184 
 185 static UChar*
 186 utf16be_left_adjust_char_head(const UChar* start, const UChar* s)
 187 {
 188   if (s <= start) return (UChar* )s;
 189 
 190   if ((s - start) % 2 == 1) {
 191     s--;
 192   }
 193 
 194   if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1)
 195     s -= 2;
 196 
 197   return (UChar* )s;
 198 }
 199 
 200 static int
 201 utf16be_get_case_fold_codes_by_str(OnigCaseFoldType flag,
 202     const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
 203 {
 204   return onigenc_unicode_get_case_fold_codes_by_str(ONIG_ENCODING_UTF16_BE,
 205                                                     flag, p, end, items);
 206 }
 207 
 208 OnigEncodingType OnigEncodingUTF16_BE = {
 209   utf16be_mbc_enc_len,
 210   "UTF-16BE",   /* name */
 211   4,            /* max byte length */
 212   2,            /* min byte length */
 213   utf16be_is_mbc_newline,
 214   utf16be_mbc_to_code,
 215   utf16be_code_to_mbclen,
 216   utf16be_code_to_mbc,
 217   utf16be_mbc_case_fold,
 218   onigenc_unicode_apply_all_case_fold,
 219   utf16be_get_case_fold_codes_by_str,
 220   onigenc_unicode_property_name_to_ctype,
 221   onigenc_unicode_is_code_ctype,
 222   onigenc_utf16_32_get_ctype_code_range,
 223   utf16be_left_adjust_char_head,
 224   onigenc_always_false_is_allowed_reverse_match
 225 };

/* [<][>][^][v][top][bottom][index][help] */