root/ext/mbstring/php_unicode.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. prop_lookup
  2. php_unicode_is_prop
  3. case_lookup
  4. php_turkish_toupper
  5. php_turkish_tolower
  6. php_unicode_toupper
  7. php_unicode_tolower
  8. php_unicode_totitle
  9. php_unicode_convert_case

   1 /*
   2    +----------------------------------------------------------------------+
   3    | PHP Version 7                                                        |
   4    +----------------------------------------------------------------------+
   5    | Copyright (c) 1997-2016 The PHP Group                                |
   6    +----------------------------------------------------------------------+
   7    | This source file is subject to version 3.01 of the PHP license,      |
   8    | that is bundled with this package in the file LICENSE, and is        |
   9    | available through the world-wide-web at the following url:           |
  10    | http://www.php.net/license/3_01.txt                                  |
  11    | If you did not receive a copy of the PHP license and are unable to   |
  12    | obtain it through the world-wide-web, please send a note to          |
  13    | license@php.net so we can mail you a copy immediately.               |
  14    +----------------------------------------------------------------------+
  15    | Author: Wez Furlong (wez@thebrainroom.com)                           |
  16    +----------------------------------------------------------------------+
  17 
  18         Based on code from ucdata-2.5, which has the following Copyright:
  19 
  20         Copyright 2001 Computing Research Labs, New Mexico State University
  21 
  22         Permission is hereby granted, free of charge, to any person obtaining a
  23         copy of this software and associated documentation files (the "Software"),
  24         to deal in the Software without restriction, including without limitation
  25         the rights to use, copy, modify, merge, publish, distribute, sublicense,
  26         and/or sell copies of the Software, and to permit persons to whom the
  27         Software is furnished to do so, subject to the following conditions:
  28 
  29         The above copyright notice and this permission notice shall be included in
  30         all copies or substantial portions of the Software.
  31 */
  32 
  33 #ifdef HAVE_CONFIG_H
  34 #include "config.h"
  35 #endif
  36 
  37 #include "php.h"
  38 #include "php_ini.h"
  39 
  40 #if HAVE_MBSTRING
  41 
  42 /* include case folding data generated from the official UnicodeData.txt file */
  43 #include "mbstring.h"
  44 #include "php_unicode.h"
  45 #include "unicode_data.h"
  46 
  47 ZEND_EXTERN_MODULE_GLOBALS(mbstring)
  48 
  /*
   * Single-bit masks for a 32-bit word: masks32[i] has only bit i set.
   * The table is only ever read, so it is declared const.
   */
  static const unsigned long masks32[32] = {
      0x00000001, 0x00000002, 0x00000004, 0x00000008,
      0x00000010, 0x00000020, 0x00000040, 0x00000080,
      0x00000100, 0x00000200, 0x00000400, 0x00000800,
      0x00001000, 0x00002000, 0x00004000, 0x00008000,
      0x00010000, 0x00020000, 0x00040000, 0x00080000,
      0x00100000, 0x00200000, 0x00400000, 0x00800000,
      0x01000000, 0x02000000, 0x04000000, 0x08000000,
      0x10000000, 0x20000000, 0x40000000, 0x80000000
  };
  60 
  61 
  62 static int prop_lookup(unsigned long code, unsigned long n)
  63 {
  64         long l, r, m;
  65 
  66         /*
  67          * There is an extra node on the end of the offsets to allow this routine
  68          * to work right.  If the index is 0xffff, then there are no nodes for the
  69          * property.
  70          */
  71         if ((l = _ucprop_offsets[n]) == 0xffff)
  72                 return 0;
  73 
  74         /*
  75          * Locate the next offset that is not 0xffff.  The sentinel at the end of
  76          * the array is the max index value.
  77          */
  78         for (m = 1; n + m < _ucprop_size && _ucprop_offsets[n + m] == 0xffff; m++)
  79                 ;
  80 
  81         r = _ucprop_offsets[n + m] - 1;
  82 
  83         while (l <= r) {
  84                 /*
  85                  * Determine a "mid" point and adjust to make sure the mid point is at
  86                  * the beginning of a range pair.
  87                  */
  88                 m = (l + r) >> 1;
  89                 m -= (m & 1);
  90                 if (code > _ucprop_ranges[m + 1])
  91                         l = m + 2;
  92                 else if (code < _ucprop_ranges[m])
  93                         r = m - 2;
  94                 else if (code >= _ucprop_ranges[m] && code <= _ucprop_ranges[m + 1])
  95                         return 1;
  96         }
  97         return 0;
  98 
  99 }
 100 
 101 MBSTRING_API int php_unicode_is_prop(unsigned long code, unsigned long mask1,
 102                 unsigned long mask2)
 103 {
 104         unsigned long i;
 105 
 106         if (mask1 == 0 && mask2 == 0)
 107                 return 0;
 108 
 109         for (i = 0; mask1 && i < 32; i++) {
 110                 if ((mask1 & masks32[i]) && prop_lookup(code, i))
 111                         return 1;
 112         }
 113 
 114         for (i = 32; mask2 && i < _ucprop_size; i++) {
 115                 if ((mask2 & masks32[i & 31]) && prop_lookup(code, i))
 116                         return 1;
 117         }
 118 
 119         return 0;
 120 }
 121 
 122 static unsigned long case_lookup(unsigned long code, long l, long r, int field)
 123 {
 124         long m;
 125 
 126         /*
 127          * Do the binary search.
 128          */
 129         while (l <= r) {
 130                 /*
 131                  * Determine a "mid" point and adjust to make sure the mid point is at
 132                  * the beginning of a case mapping triple.
 133                  */
 134                 m = (l + r) >> 1;
 135                 m -= (m % 3);
 136                 if (code > _uccase_map[m])
 137                         l = m + 3;
 138                 else if (code < _uccase_map[m])
 139                         r = m - 3;
 140                 else if (code == _uccase_map[m])
 141                         return _uccase_map[m + field];
 142         }
 143 
 144         return code;
 145 }
 146 
 147 MBSTRING_API unsigned long php_turkish_toupper(unsigned long code, long l, long r, int field)
 148 {
 149         if (code == 0x0069L) {
 150                 return 0x0130L;
 151         }
 152         return case_lookup(code, l, r, field);
 153 }
 154 
 155 MBSTRING_API unsigned long php_turkish_tolower(unsigned long code, long l, long r, int field)
 156 {
 157         if (code == 0x0049L) {
 158                 return 0x0131L;
 159         }
 160         return case_lookup(code, l, r, field);
 161 }
 162 
 163 MBSTRING_API unsigned long php_unicode_toupper(unsigned long code, enum mbfl_no_encoding enc)
 164 {
 165         int field;
 166         long l, r;
 167 
 168         if (php_unicode_is_upper(code))
 169                 return code;
 170 
 171         if (php_unicode_is_lower(code)) {
 172                 /*
 173                  * The character is lower case.
 174                  */
 175                 field = 2;
 176                 l = _uccase_len[0];
 177                 r = (l + _uccase_len[1]) - 3;
 178 
 179                 if (enc == mbfl_no_encoding_8859_9) {
 180                         return php_turkish_toupper(code, l, r, field);
 181                 }
 182 
 183         } else {
 184                 /*
 185                  * The character is title case.
 186                  */
 187                 field = 1;
 188                 l = _uccase_len[0] + _uccase_len[1];
 189                 r = _uccase_size - 3;
 190         }
 191         return case_lookup(code, l, r, field);
 192 }
 193 
 194 MBSTRING_API unsigned long php_unicode_tolower(unsigned long code, enum mbfl_no_encoding enc)
 195 {
 196         int field;
 197         long l, r;
 198 
 199         if (php_unicode_is_lower(code))
 200                 return code;
 201 
 202         if (php_unicode_is_upper(code)) {
 203                 /*
 204                  * The character is upper case.
 205                  */
 206                 field = 1;
 207                 l = 0;
 208                 r = _uccase_len[0] - 3;
 209 
 210                 if (enc == mbfl_no_encoding_8859_9) {
 211                         return php_turkish_tolower(code, l, r, field);
 212                 }
 213 
 214         } else {
 215                 /*
 216                  * The character is title case.
 217                  */
 218                 field = 2;
 219                 l = _uccase_len[0] + _uccase_len[1];
 220                 r = _uccase_size - 3;
 221         }
 222         return case_lookup(code, l, r, field);
 223 }
 224 
 225 MBSTRING_API unsigned long php_unicode_totitle(unsigned long code, enum mbfl_no_encoding enc)
 226 {
 227         int field;
 228         long l, r;
 229 
 230         if (php_unicode_is_title(code))
 231                 return code;
 232 
 233         /*
 234          * The offset will always be the same for converting to title case.
 235          */
 236         field = 2;
 237 
 238         if (php_unicode_is_upper(code)) {
 239                 /*
 240                  * The character is upper case.
 241                  */
 242                 l = 0;
 243                 r = _uccase_len[0] - 3;
 244         } else {
 245                 /*
 246                  * The character is lower case.
 247                  */
 248                 l = _uccase_len[0];
 249                 r = (l + _uccase_len[1]) - 3;
 250         }
 251         return case_lookup(code, l, r, field);
 252 
 253 }
 254 
 255 
 /*
  * Assemble the four bytes at `ptr` (most significant byte first) into one
  * unsigned value.
  *
  * Each byte is widened to unsigned long before it is shifted.  Without the
  * cast the byte is promoted to (signed) int, and shifting a value >= 0x80
  * left by 24 overflows it; the negative result would then be sign-extended
  * when passed to the unsigned long parameters used in this file.
  */
 #define BE_ARY_TO_UINT32(ptr) (\
     ((unsigned long)((const unsigned char *)(ptr))[0] << 24) |\
     ((unsigned long)((const unsigned char *)(ptr))[1] << 16) |\
     ((unsigned long)((const unsigned char *)(ptr))[2] <<  8) |\
     ((unsigned long)((const unsigned char *)(ptr))[3]      ) )

 /*
  * Store the low 32 bits of `val` into the four bytes at `ptr`, most
  * significant byte first.
  *
  * Wrapped in do { } while (0) so the macro behaves as a single statement
  * (safe in an unbraced if/else).  `val` and `ptr` are each evaluated exactly
  * once, and the temporaries use names unlikely to collide with identifiers
  * appearing in the arguments.
  */
 #define UINT32_TO_BE_ARY(ptr, val) do { \
     unsigned long be_ary_val_ = (unsigned long)(val); \
     unsigned char *be_ary_dst_ = (unsigned char *)(ptr); \
     be_ary_dst_[0] = (unsigned char)((be_ary_val_ >> 24) & 0xff); \
     be_ary_dst_[1] = (unsigned char)((be_ary_val_ >> 16) & 0xff); \
     be_ary_dst_[2] = (unsigned char)((be_ary_val_ >>  8) & 0xff); \
     be_ary_dst_[3] = (unsigned char)((be_ary_val_      ) & 0xff); \
 } while (0)
 269 
 270 MBSTRING_API char *php_unicode_convert_case(int case_mode, const char *srcstr, size_t srclen, size_t *ret_len,
 271                 const char *src_encoding)
 272 {
 273         char *unicode, *newstr;
 274         size_t unicode_len;
 275         unsigned char *unicode_ptr;
 276         size_t i;
 277         enum mbfl_no_encoding _src_encoding = mbfl_name2no_encoding(src_encoding);
 278 
 279         if (_src_encoding == mbfl_no_encoding_invalid) {
 280                 php_error_docref(NULL, E_WARNING, "Unknown encoding \"%s\"", src_encoding);
 281                 return NULL;
 282         }
 283 
 284         unicode = php_mb_convert_encoding(srcstr, srclen, "UCS-4BE", src_encoding, &unicode_len);
 285         if (unicode == NULL)
 286                 return NULL;
 287 
 288         unicode_ptr = (unsigned char *)unicode;
 289 
 290         switch(case_mode) {
 291                 case PHP_UNICODE_CASE_UPPER:
 292                         for (i = 0; i < unicode_len; i+=4) {
 293                                 UINT32_TO_BE_ARY(&unicode_ptr[i],
 294                                         php_unicode_toupper(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding));
 295                         }
 296                         break;
 297 
 298                 case PHP_UNICODE_CASE_LOWER:
 299                         for (i = 0; i < unicode_len; i+=4) {
 300                                 UINT32_TO_BE_ARY(&unicode_ptr[i],
 301                                         php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding));
 302                         }
 303                         break;
 304 
 305                 case PHP_UNICODE_CASE_TITLE: {
 306                         int mode = 0;
 307 
 308                         for (i = 0; i < unicode_len; i+=4) {
 309                                 int res = php_unicode_is_prop(
 310                                         BE_ARY_TO_UINT32(&unicode_ptr[i]),
 311                                         UC_MN|UC_ME|UC_CF|UC_LM|UC_SK|UC_LU|UC_LL|UC_LT|UC_PO|UC_OS, 0);
 312                                 if (mode) {
 313                                         if (res) {
 314                                                 UINT32_TO_BE_ARY(&unicode_ptr[i],
 315                                                         php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding));
 316                                         } else {
 317                                                 mode = 0;
 318                                         }
 319                                 } else {
 320                                         if (res) {
 321                                                 mode = 1;
 322                                                 UINT32_TO_BE_ARY(&unicode_ptr[i],
 323                                                         php_unicode_totitle(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding));
 324                                         }
 325                                 }
 326                         }
 327                 } break;
 328 
 329         }
 330 
 331         newstr = php_mb_convert_encoding(unicode, unicode_len, src_encoding, "UCS-4BE", ret_len);
 332         efree(unicode);
 333 
 334         return newstr;
 335 }
 336 
 337 
 338 #endif /* HAVE_MBSTRING */
 339 
 340 /*
 341  * Local variables:
 342  * tab-width: 4
 343  * c-basic-offset: 4
 344  * End:
 345  * vim600: sw=4 ts=4 fdm=marker
 346  * vim<600: sw=4 ts=4
 347  */

/* [<][>][^][v][top][bottom][index][help] */