root/ext/intl/grapheme/grapheme_string.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. grapheme_register_constants
  2. PHP_FUNCTION
  3. PHP_FUNCTION
  4. PHP_FUNCTION
  5. PHP_FUNCTION
  6. PHP_FUNCTION
  7. PHP_FUNCTION
  8. strstr_common_handler
  9. PHP_FUNCTION
  10. PHP_FUNCTION
  11. grapheme_extract_charcount_iter
  12. grapheme_extract_bytecount_iter
  13. grapheme_extract_count_iter
  14. PHP_FUNCTION

   1 /*
   2    +----------------------------------------------------------------------+
   3    | PHP Version 7                                                                                                                |
   4    +----------------------------------------------------------------------+
   5    | This source file is subject to version 3.01 of the PHP license,      |
   6    | that is bundled with this package in the file LICENSE, and is                |
   7    | available through the world-wide-web at the following url:                   |
   8    | http://www.php.net/license/3_01.txt                                                                  |
   9    | If you did not receive a copy of the PHP license and are unable to   |
  10    | obtain it through the world-wide-web, please send a note to                  |
  11    | license@php.net so we can mail you a copy immediately.                               |
  12    +----------------------------------------------------------------------+
  13    | Author: Ed Batutis <ed@batutis.com>                                                                  |
  14    +----------------------------------------------------------------------+
  15  */
  16 
  17 /* {{{ includes */
  18 #ifdef HAVE_CONFIG_H
  19 #include "config.h"
  20 #endif
  21 
  22 #include <php.h>
  23 #include "grapheme.h"
  24 #include "grapheme_util.h"
  25 
  26 #include <unicode/utypes.h>
  27 #include <unicode/ucol.h>
  28 #include <unicode/ustring.h>
  29 #include <unicode/ubrk.h>
  30 
  31 #include "ext/standard/php_string.h"
  32 
  33 /* }}} */
  34 
  35 #define GRAPHEME_EXTRACT_TYPE_COUNT             0
  36 #define GRAPHEME_EXTRACT_TYPE_MAXBYTES  1
  37 #define GRAPHEME_EXTRACT_TYPE_MAXCHARS  2
  38 #define GRAPHEME_EXTRACT_TYPE_MIN       GRAPHEME_EXTRACT_TYPE_COUNT
  39 #define GRAPHEME_EXTRACT_TYPE_MAX       GRAPHEME_EXTRACT_TYPE_MAXCHARS
  40 
  41 
  42 /* {{{ grapheme_register_constants
  43  * Register API constants
  44  */
  45 void grapheme_register_constants( INIT_FUNC_ARGS )
  46 {
  47         REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT);
  48         REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT);
  49         REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT);
  50 }
  51 /* }}} */
  52 
  53 /* {{{ proto size_t grapheme_strlen(string str)
  54    Get number of graphemes in a string */
  55 PHP_FUNCTION(grapheme_strlen)
  56 {
  57         char* string;
  58         size_t string_len;
  59         UChar* ustring = NULL;
  60         int ustring_len = 0;
  61         zend_long ret_len;
  62         UErrorCode status;
  63 
  64         if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &string, &string_len) == FAILURE) {
  65                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  66                          "grapheme_strlen: unable to parse input param", 0 );
  67                 RETURN_FALSE;
  68         }
  69 
  70         ret_len = grapheme_ascii_check((unsigned char *)string, string_len);
  71 
  72         if ( ret_len >= 0 )
  73                 RETURN_LONG(string_len);
  74 
  75         /* convert the string to UTF-16. */
  76         status = U_ZERO_ERROR;
  77         intl_convert_utf8_to_utf16(&ustring, &ustring_len, string, string_len, &status );
  78 
  79         if ( U_FAILURE( status ) ) {
  80                 /* Set global error code. */
  81                 intl_error_set_code( NULL, status );
  82 
  83                 /* Set error messages. */
  84                 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
  85                 if (ustring) {
  86                         efree( ustring );
  87                 }
  88                 RETURN_NULL();
  89         }
  90 
  91         ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 );
  92 
  93         if (ustring) {
  94                 efree( ustring );
  95         }
  96 
  97         if (ret_len >= 0) {
  98                 RETVAL_LONG(ret_len);
  99         } else {
 100                 RETVAL_FALSE;
 101         }
 102 }
 103 /* }}} */
 104 
 105 /* {{{ proto int grapheme_strpos(string haystack, string needle [, int offset ])
 106    Find position of first occurrence of a string within another */
 107 PHP_FUNCTION(grapheme_strpos)
 108 {
 109         char *haystack, *needle;
 110         size_t haystack_len, needle_len;
 111         const char *found;
 112         zend_long loffset = 0;
 113         int32_t offset = 0;
 114         zend_long ret_pos;
 115 
 116         if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
 117                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
 118                          "grapheme_strpos: unable to parse input param", 0 );
 119                 RETURN_FALSE;
 120         }
 121 
 122         if ( OUTSIDE_STRING(loffset, haystack_len) ) {
 123                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 );
 124                 RETURN_FALSE;
 125         }
 126 
 127         /* we checked that it will fit: */
 128         offset = (int32_t) loffset;
 129 
 130         /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
 131 
 132         if (needle_len == 0) {
 133                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
 134                 RETURN_FALSE;
 135         }
 136 
 137 
 138         /* quick check to see if the string might be there
 139          * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
 140         */
 141         found = php_memnstr(haystack + offset, needle, needle_len, haystack + haystack_len);
 142 
 143         /* if it isn't there the we are done */
 144         if (!found) {
 145                 RETURN_FALSE;
 146         }
 147 
 148         /* if it is there, and if the haystack is ascii, we are all done */
 149         if ( grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0 ) {
 150                 RETURN_LONG(found - haystack);
 151         }
 152 
 153         /* do utf16 part of the strpos */
 154         ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* fIgnoreCase */, 0 /* last */ );
 155 
 156         if ( ret_pos >= 0 ) {
 157                 RETURN_LONG(ret_pos);
 158         } else {
 159                 RETURN_FALSE;
 160         }
 161 
 162 }
 163 /* }}} */
 164 
 165 /* {{{ proto int grapheme_stripos(string haystack, string needle [, int offset ])
 166    Find position of first occurrence of a string within another, ignoring case differences */
 167 PHP_FUNCTION(grapheme_stripos)
 168 {
 169         char *haystack, *needle, *haystack_dup, *needle_dup;
 170         size_t haystack_len, needle_len;
 171         const char *found;
 172         zend_long loffset = 0;
 173         int32_t offset = 0;
 174         zend_long ret_pos;
 175         int is_ascii;
 176 
 177         if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
 178                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
 179                          "grapheme_stripos: unable to parse input param", 0 );
 180                 RETURN_FALSE;
 181         }
 182 
 183         if ( OUTSIDE_STRING(loffset, haystack_len) ) {
 184                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Offset not contained in string", 1 );
 185                 RETURN_FALSE;
 186         }
 187 
 188         /* we checked that it will fit: */
 189         offset = (int32_t) loffset;
 190 
 191         /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
 192 
 193         if (needle_len == 0) {
 194                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Empty delimiter", 1 );
 195                 RETURN_FALSE;
 196         }
 197 
 198 
 199         is_ascii = ( grapheme_ascii_check((unsigned char*)haystack, haystack_len) >= 0 );
 200 
 201         if ( is_ascii ) {
 202                 needle_dup = estrndup(needle, needle_len);
 203                 php_strtolower(needle_dup, needle_len);
 204                 haystack_dup = estrndup(haystack, haystack_len);
 205                 php_strtolower(haystack_dup, haystack_len);
 206 
 207                 found = php_memnstr(haystack_dup + offset, needle_dup, needle_len, haystack_dup + haystack_len);
 208 
 209                 efree(haystack_dup);
 210                 efree(needle_dup);
 211 
 212                 if (found) {
 213                         RETURN_LONG(found - haystack_dup);
 214                 }
 215 
 216                 /* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
 217                 if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
 218                         RETURN_FALSE;
 219                 }
 220         }
 221 
 222         /* do utf16 part of the strpos */
 223         ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* fIgnoreCase */, 0 /*last */ );
 224 
 225         if ( ret_pos >= 0 ) {
 226                 RETURN_LONG(ret_pos);
 227         } else {
 228                 RETURN_FALSE;
 229         }
 230 
 231 }
 232 /* }}} */
 233 
 234 /* {{{ proto int grapheme_strrpos(string haystack, string needle [, int offset])
 235    Find position of last occurrence of a string within another */
 236 PHP_FUNCTION(grapheme_strrpos)
 237 {
 238         char *haystack, *needle;
 239         size_t haystack_len, needle_len;
 240         zend_long loffset = 0;
 241         int32_t offset = 0;
 242         zend_long ret_pos;
 243         int is_ascii;
 244 
 245         if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
 246                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
 247                          "grapheme_strrpos: unable to parse input param", 0 );
 248                 RETURN_FALSE;
 249         }
 250 
 251         if ( OUTSIDE_STRING(loffset, haystack_len) ) {
 252                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 );
 253                 RETURN_FALSE;
 254         }
 255 
 256         /* we checked that it will fit: */
 257         offset = (int32_t) loffset;
 258 
 259         /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
 260 
 261         if (needle_len == 0) {
 262                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
 263                 RETURN_FALSE;
 264         }
 265 
 266         is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
 267 
 268         if ( is_ascii ) {
 269 
 270                 ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
 271 
 272                 if ( ret_pos >= 0 ) {
 273                         RETURN_LONG(ret_pos);
 274                 }
 275 
 276                 /* if the needle was ascii too, we are done */
 277 
 278                 if (  grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
 279                         RETURN_FALSE;
 280                 }
 281 
 282                 /* else we need to continue via utf16 */
 283         }
 284 
 285         ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* f_ignore_case */, 1/* last */);
 286 
 287         if ( ret_pos >= 0 ) {
 288                 RETURN_LONG(ret_pos);
 289         } else {
 290                 RETURN_FALSE;
 291         }
 292 
 293 
 294 }
 295 /* }}} */
 296 
 297 /* {{{ proto int grapheme_strripos(string haystack, string needle [, int offset])
 298    Find position of last occurrence of a string within another, ignoring case */
 299 PHP_FUNCTION(grapheme_strripos)
 300 {
 301         char *haystack, *needle;
 302         size_t haystack_len, needle_len;
 303         zend_long loffset = 0;
 304         int32_t offset = 0;
 305         zend_long ret_pos;
 306         int is_ascii;
 307 
 308         if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
 309                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
 310                          "grapheme_strrpos: unable to parse input param", 0 );
 311                 RETURN_FALSE;
 312         }
 313 
 314         if ( OUTSIDE_STRING(loffset, haystack_len) ) {
 315                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 );
 316                 RETURN_FALSE;
 317         }
 318 
 319         /* we checked that it will fit: */
 320         offset = (int32_t) loffset;
 321 
 322         /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
 323 
 324         if (needle_len == 0) {
 325                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
 326                 RETURN_FALSE;
 327         }
 328 
 329         is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
 330 
 331         if ( is_ascii ) {
 332                 char *needle_dup, *haystack_dup;
 333 
 334                 needle_dup = estrndup(needle, needle_len);
 335                 php_strtolower(needle_dup, needle_len);
 336                 haystack_dup = estrndup(haystack, haystack_len);
 337                 php_strtolower(haystack_dup, haystack_len);
 338 
 339                 ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
 340 
 341                 efree(haystack_dup);
 342                 efree(needle_dup);
 343 
 344                 if ( ret_pos >= 0 ) {
 345                         RETURN_LONG(ret_pos);
 346                 }
 347 
 348                 /* if the needle was ascii too, we are done */
 349 
 350                 if (  grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
 351                         RETURN_FALSE;
 352                 }
 353 
 354                 /* else we need to continue via utf16 */
 355         }
 356 
 357         ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL,  1 /* f_ignore_case */, 1 /*last */);
 358 
 359         if ( ret_pos >= 0 ) {
 360                 RETURN_LONG(ret_pos);
 361         } else {
 362                 RETURN_FALSE;
 363         }
 364 
 365 
 366 }
 367 /* }}} */
 368 
 369 /* {{{ proto string grapheme_substr(string str, int start [, int length])
 370    Returns part of a string */
 371 PHP_FUNCTION(grapheme_substr)
 372 {
 373         char *str;
 374         zend_string *u8_sub_str;
 375         UChar *ustr;
 376         size_t str_len;
 377         int32_t ustr_len;
 378         zend_long lstart = 0, length = 0;
 379         int32_t start = 0;
 380         int iter_val;
 381         UErrorCode status;
 382         unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
 383         UBreakIterator* bi = NULL;
 384         int sub_str_start_pos, sub_str_end_pos;
 385         int32_t (*iter_func)(UBreakIterator *);
 386         zend_bool no_length = 1;
 387 
 388         if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|l!", &str, &str_len, &lstart, &length, &no_length) == FAILURE) {
 389                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
 390                          "grapheme_substr: unable to parse input param", 0 );
 391                 RETURN_FALSE;
 392         }
 393 
 394         if ( OUTSIDE_STRING(lstart, str_len)) {
 395                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 );
 396                 RETURN_FALSE;
 397         }
 398 
 399         /* we checked that it will fit: */
 400         start = (int32_t) lstart;
 401 
 402         if(no_length) {
 403                 length = str_len;
 404         }
 405 
 406         if(length < INT32_MIN) {
 407                 length = INT32_MIN;
 408         } else if(length > INT32_MAX) {
 409                 length = INT32_MAX;
 410         }
 411 
 412         /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
 413 
 414         if ( grapheme_ascii_check((unsigned char *)str, str_len) >= 0 ) {
 415                 int32_t asub_str_len;
 416                 char *sub_str;
 417                 grapheme_substr_ascii(str, str_len, start, (int32_t)length, &sub_str, &asub_str_len);
 418 
 419                 if ( NULL == sub_str ) {
 420                         intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: invalid parameters", 1 );
 421                         RETURN_FALSE;
 422                 }
 423 
 424                 RETURN_STRINGL(sub_str, asub_str_len);
 425         }
 426 
 427         ustr = NULL;
 428         ustr_len = 0;
 429         status = U_ZERO_ERROR;
 430         intl_convert_utf8_to_utf16(&ustr, &ustr_len, str, str_len, &status);
 431 
 432         if ( U_FAILURE( status ) ) {
 433                 /* Set global error code. */
 434                 intl_error_set_code( NULL, status );
 435 
 436                 /* Set error messages. */
 437                 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
 438                 if (ustr) {
 439                         efree( ustr );
 440                 }
 441                 RETURN_FALSE;
 442         }
 443 
 444         bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status );
 445 
 446         if( U_FAILURE(status) ) {
 447                 RETURN_FALSE;
 448         }
 449 
 450         ubrk_setText(bi, ustr, ustr_len,        &status);
 451 
 452         if ( start < 0 ) {
 453                 iter_func = ubrk_previous;
 454                 ubrk_last(bi);
 455                 iter_val = 1;
 456         }
 457         else {
 458                 iter_func = ubrk_next;
 459                 iter_val = -1;
 460         }
 461 
 462         sub_str_start_pos = 0;
 463 
 464         while ( start ) {
 465                 sub_str_start_pos = iter_func(bi);
 466 
 467                 if ( UBRK_DONE == sub_str_start_pos ) {
 468                         break;
 469                 }
 470 
 471                 start += iter_val;
 472         }
 473 
 474         if ( 0 != start || sub_str_start_pos >= ustr_len ) {
 475 
 476                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 );
 477 
 478                 if (ustr) {
 479                         efree(ustr);
 480                 }
 481                 ubrk_close(bi);
 482                 RETURN_FALSE;
 483         }
 484 
 485         /* OK to convert here since if str_len were big, convert above would fail */
 486         if (length >= (int32_t)str_len) {
 487 
 488                 /* no length supplied or length is too big, return the rest of the string */
 489 
 490                 status = U_ZERO_ERROR;
 491                 u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
 492 
 493                 if (ustr) {
 494                         efree( ustr );
 495                 }
 496                 ubrk_close( bi );
 497 
 498                 if ( !u8_sub_str ) {
 499                         /* Set global error code. */
 500                         intl_error_set_code( NULL, status );
 501 
 502                         /* Set error messages. */
 503                         intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
 504 
 505                         RETURN_FALSE;
 506                 }
 507 
 508                 /* return the allocated string, not a duplicate */
 509                 RETVAL_NEW_STR(u8_sub_str);
 510                 return;
 511         }
 512 
 513         if(length == 0) {
 514                 /* empty length - we've validated start, we can return "" now */
 515                 if (ustr) {
 516                         efree(ustr);
 517                 }
 518                 ubrk_close(bi);
 519                 RETURN_EMPTY_STRING();
 520         }
 521 
 522         /* find the end point of the string to return */
 523 
 524         if ( length < 0 ) {
 525                 iter_func = ubrk_previous;
 526                 ubrk_last(bi);
 527                 iter_val = 1;
 528         }
 529         else {
 530                 iter_func = ubrk_next;
 531                 iter_val = -1;
 532         }
 533 
 534         sub_str_end_pos = 0;
 535 
 536         while ( length ) {
 537                 sub_str_end_pos = iter_func(bi);
 538 
 539                 if ( UBRK_DONE == sub_str_end_pos ) {
 540                         break;
 541                 }
 542 
 543                 length += iter_val;
 544         }
 545 
 546         ubrk_close(bi);
 547 
 548         if ( UBRK_DONE == sub_str_end_pos) {
 549                 if(length < 0) {
 550                         intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length not contained in string", 1 );
 551 
 552                         efree(ustr);
 553                         RETURN_FALSE;
 554                 } else {
 555                         sub_str_end_pos = ustr_len;
 556                 }
 557         }
 558 
 559         if(sub_str_start_pos > sub_str_end_pos) {
 560                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length is beyond start", 1 );
 561 
 562                 efree(ustr);
 563                 RETURN_FALSE;
 564         }
 565 
 566         status = U_ZERO_ERROR;
 567         u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
 568 
 569         efree( ustr );
 570 
 571         if ( !u8_sub_str ) {
 572                 /* Set global error code. */
 573                 intl_error_set_code( NULL, status );
 574 
 575                 /* Set error messages. */
 576                 intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
 577 
 578                 RETURN_FALSE;
 579         }
 580 
 581          /* return the allocated string, not a duplicate */
 582         RETVAL_NEW_STR(u8_sub_str);
 583 }
 584 /* }}} */
 585 
 586 /* {{{  strstr_common_handler */
 587 static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
 588 {
 589         char *haystack, *needle;
 590         const char *found;
 591         size_t haystack_len, needle_len;
 592         int32_t ret_pos, uchar_pos;
 593         zend_bool part = 0;
 594 
 595         if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|b", &haystack, &haystack_len, &needle, &needle_len, &part) == FAILURE) {
 596 
 597                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
 598                          "grapheme_strstr: unable to parse input param", 0 );
 599 
 600                 RETURN_FALSE;
 601         }
 602 
 603         if (needle_len == 0) {
 604 
 605                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
 606 
 607                 RETURN_FALSE;
 608         }
 609 
 610 
 611         if ( !f_ignore_case ) {
 612 
 613                 /* ASCII optimization: quick check to see if the string might be there
 614                  * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
 615                 */
 616                 found = php_memnstr(haystack, needle, needle_len, haystack + haystack_len);
 617 
 618                 /* if it isn't there the we are done */
 619                 if ( !found ) {
 620                         RETURN_FALSE;
 621                 }
 622 
 623                 /* if it is there, and if the haystack is ascii, we are all done */
 624                 if ( grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0 ) {
 625                         size_t found_offset = found - haystack;
 626 
 627                         if (part) {
 628                                 RETURN_STRINGL(haystack, found_offset);
 629                         } else {
 630                                 RETURN_STRINGL(found, haystack_len - found_offset);
 631                         }
 632                 }
 633 
 634         }
 635 
 636         /* need to work in utf16 */
 637         ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case, 0 /*last */ );
 638 
 639         if ( ret_pos < 0 ) {
 640                 RETURN_FALSE;
 641         }
 642 
 643         /* uchar_pos is the 'nth' Unicode character position of the needle */
 644 
 645         ret_pos = 0;
 646         U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos);
 647 
 648         if (part) {
 649                 RETURN_STRINGL(haystack, ret_pos);
 650         } else {
 651                 RETURN_STRINGL(haystack + ret_pos, haystack_len - ret_pos);
 652         }
 653 
 654 }
 655 /* }}} */
 656 
 657 /* {{{ proto string grapheme_strstr(string haystack, string needle[, bool part])
 658    Finds first occurrence of a string within another */
 659 PHP_FUNCTION(grapheme_strstr)
 660 {
 661         strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
 662 }
 663 /* }}} */
 664 
 665 /* {{{ proto string grapheme_stristr(string haystack, string needle[, bool part])
 666    Finds first occurrence of a string within another */
 667 PHP_FUNCTION(grapheme_stristr)
 668 {
 669         strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
 670 }
 671 /* }}} */
 672 
 673 /* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
 674 static inline int32_t
 675 grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
 676 {
 677         int pos = 0, prev_pos = 0;
 678         int ret_pos = 0, prev_ret_pos = 0;
 679 
 680         while ( 1 ) {
 681                 pos = ubrk_next(bi);
 682 
 683                 if ( UBRK_DONE == pos ) {
 684                         break;
 685                 }
 686 
 687                 /* if we are beyond our limit, then the loop is done */
 688                 if ( pos > csize ) {
 689                         break;
 690                 }
 691 
 692                 /* update our pointer in the original UTF-8 buffer by as many characters
 693                    as ubrk_next iterated over */
 694 
 695                 prev_ret_pos = ret_pos;
 696                 U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
 697 
 698                 if ( prev_ret_pos == ret_pos ) {
 699                         /* something wrong - malformed utf8? */
 700                         break;
 701                 }
 702 
 703                 prev_pos = pos;
 704         }
 705 
 706         return ret_pos;
 707 }
 708 /* }}} */
 709 
 710 /* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
 711 static inline int32_t
 712 grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
 713 {
 714         int pos = 0, prev_pos = 0;
 715         int ret_pos = 0, prev_ret_pos = 0;
 716 
 717         while ( 1 ) {
 718                 pos = ubrk_next(bi);
 719 
 720                 if ( UBRK_DONE == pos ) {
 721                         break;
 722                 }
 723 
 724                 prev_ret_pos = ret_pos;
 725                 U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
 726 
 727                 if ( ret_pos > bsize ) {
 728                         ret_pos = prev_ret_pos;
 729                         break;
 730                 }
 731 
 732                 if ( prev_ret_pos == ret_pos ) {
 733                         /* something wrong - malformed utf8? */
 734                         break;
 735                 }
 736 
 737                 prev_pos = pos;
 738         }
 739 
 740         return ret_pos;
 741 }
 742 /* }}} */
 743 
 744 /* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
 745 static inline int32_t
 746 grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
 747 {
 748         int pos = 0, next_pos = 0;
 749         int ret_pos = 0;
 750 
 751         while ( size ) {
 752                 next_pos = ubrk_next(bi);
 753 
 754                 if ( UBRK_DONE == next_pos ) {
 755                         break;
 756                 }
 757                 pos = next_pos;
 758                 size--;
 759         }
 760 
 761         /* pos is one past the last UChar - and represent the number of code units to
 762                 advance in the utf-8 buffer
 763         */
 764 
 765         U8_FWD_N(pstr, ret_pos, str_len, pos);
 766 
 767         return ret_pos;
 768 }
 769 /* }}} */
 770 
 771 /* {{{ grapheme extract iter function pointer array */
 772 typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
 773 
 774 static grapheme_extract_iter grapheme_extract_iters[] = {
 775         &grapheme_extract_count_iter,
 776         &grapheme_extract_bytecount_iter,
 777         &grapheme_extract_charcount_iter,
 778 };
 779 /* }}} */
 780 
 781 /* {{{ proto string grapheme_extract(string str, int size[, int extract_type[, int start[, int next]]])
 782         Function to extract a sequence of default grapheme clusters */
 783 PHP_FUNCTION(grapheme_extract)
 784 {
 785         char *str, *pstr;
 786         UChar *ustr;
 787         size_t str_len;
 788         int32_t ustr_len;
 789         zend_long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
 790         zend_long lstart = 0; /* starting position in str in bytes */
 791         int32_t start = 0;
 792         zend_long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
 793         UErrorCode status;
 794         unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
 795         UBreakIterator* bi = NULL;
 796         int ret_pos;
 797         zval *next = NULL; /* return offset of next part of the string */
 798 
 799         if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|llz", &str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
 800                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
 801                          "grapheme_extract: unable to parse input param", 0 );
 802                 RETURN_FALSE;
 803         }
 804 
 805         if ( NULL != next ) {
 806                 if ( !Z_ISREF_P(next) ) {
 807                         intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
 808                                  "grapheme_extract: 'next' was not passed by reference", 0 );
 809                         RETURN_FALSE;
 810                 } else {
 811                         ZVAL_DEREF(next);
 812                         /* initialize next */
 813                         SEPARATE_ZVAL_NOREF(next);
 814                         zval_dtor(next);
 815             ZVAL_LONG(next, lstart);
 816                 }
 817         }
 818 
 819         if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
 820                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
 821                          "grapheme_extract: unknown extract type param", 0 );
 822                 RETURN_FALSE;
 823         }
 824 
 825         if ( lstart > INT32_MAX || lstart < 0 || lstart >= str_len ) {
 826                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 );
 827                 RETURN_FALSE;
 828         }
 829 
 830         if ( size > INT32_MAX || size < 0) {
 831                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: size is invalid", 0 );
 832                 RETURN_FALSE;
 833         }
 834         if (size == 0) {
 835                 RETURN_EMPTY_STRING();
 836         }
 837 
 838         /* we checked that it will fit: */
 839         start = (int32_t) lstart;
 840 
 841         pstr = str + start;
 842 
 843         /* just in case pstr points in the middle of a character, move forward to the start of the next char */
 844         if ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
 845                 char *str_end = str + str_len;
 846 
 847                 while ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
 848                         pstr++;
 849                         if ( pstr >= str_end ) {
 850                                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
 851                                                                 "grapheme_extract: invalid input string", 0 );
 852 
 853                                 RETURN_FALSE;
 854                         }
 855                 }
 856         }
 857 
 858         str_len -= (pstr - str);
 859 
 860         /* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
 861                 (size + 1 because the size-th character might be the beginning of a grapheme cluster)
 862          */
 863 
 864         if ( -1 != grapheme_ascii_check((unsigned char *)pstr, MIN(size + 1, str_len)) ) {
 865         size_t nsize = MIN(size, str_len);
 866                 if ( NULL != next ) {
 867                         ZVAL_LONG(next, start+nsize);
 868                 }
 869                 RETURN_STRINGL(pstr, nsize);
 870         }
 871 
 872         /* convert the strings to UTF-16. */
 873         ustr = NULL;
 874         ustr_len = 0;
 875         status = U_ZERO_ERROR;
 876         intl_convert_utf8_to_utf16(&ustr, &ustr_len, pstr, str_len, &status );
 877 
 878         if ( U_FAILURE( status ) ) {
 879                 /* Set global error code. */
 880                 intl_error_set_code( NULL, status );
 881 
 882                 /* Set error messages. */
 883                 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
 884 
 885                 if ( NULL != ustr )
 886                         efree( ustr );
 887 
 888                 RETURN_FALSE;
 889         }
 890 
 891         bi = NULL;
 892         status = U_ZERO_ERROR;
 893         bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status );
 894 
 895         ubrk_setText(bi, ustr, ustr_len, &status);
 896 
 897         /* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
 898                 can't back up. So, we will not do anything. */
 899 
 900         /* now we need to find the end of the chunk the user wants us to return */
 901         /* it's ok to convert str_len to in32_t since if it were too big intl_convert_utf8_to_utf16 above would fail */
 902         ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, (unsigned char *)pstr, (int32_t)str_len);
 903 
 904         if (ustr) {
 905                 efree(ustr);
 906         }
 907         ubrk_close(bi);
 908 
 909         if ( NULL != next ) {
 910                 ZVAL_LONG(next, start+ret_pos);
 911         }
 912 
 913         RETURN_STRINGL(((char *)pstr), ret_pos);
 914 }
 915 
 916 /* }}} */
 917 
 918 /*
 919  * Local variables:
 920  * tab-width: 4
 921  * c-basic-offset: 4
 922  * End:
 923  * vim600: fdm=marker
 924  * vim: noet sw=4 ts=4
 925  */
 926 

/* [<][>][^][v][top][bottom][index][help] */