root/ext/intl/grapheme/grapheme_util.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. ZEND_EXTERN_MODULE_GLOBALS
  2. grapheme_substr_ascii
  3. grapheme_strpos_utf16
  4. grapheme_ascii_check
  5. grapheme_split_string
  6. grapheme_count_graphemes
  7. grapheme_get_haystack_offset
  8. grapheme_strrpos_ascii
  9. grapheme_get_break_iterator

   1 /*
   2    +----------------------------------------------------------------------+
   3    | PHP Version 7                                                        |
   4    +----------------------------------------------------------------------+
   5    | This source file is subject to version 3.01 of the PHP license,      |
   6    | that is bundled with this package in the file LICENSE, and is        |
   7    | available through the world-wide-web at the following url:           |
   8    | http://www.php.net/license/3_01.txt                                  |
   9    | If you did not receive a copy of the PHP license and are unable to   |
  10    | obtain it through the world-wide-web, please send a note to          |
  11    | license@php.net so we can mail you a copy immediately.               |
  12    +----------------------------------------------------------------------+
  13    | Author: Ed Batutis <ed@batutis.com>                                  |
  14    +----------------------------------------------------------------------+
  15  */
  16 
  17 /* {{{ includes */
  18 #ifdef HAVE_CONFIG_H
  19 #include "config.h"
  20 #endif
  21 
  22 #include <php.h>
  23 #include "grapheme.h"
  24 #include "grapheme_util.h"
  25 #include "intl_common.h"
  26 
  27 #include <unicode/utypes.h>
  28 #include <unicode/ucol.h>
  29 #include <unicode/ustring.h>
  30 #include <unicode/ubrk.h>
  31 #include <unicode/usearch.h>
  32 
  33 #include "ext/standard/php_string.h"
  34 
  35 ZEND_EXTERN_MODULE_GLOBALS( intl )
  36 
  37 /* }}} */
  38 
  39 /* {{{ grapheme_close_global_iterator - clean up */
  40 void
  41 grapheme_close_global_iterator( void )
  42 {
  43         UBreakIterator *global_break_iterator = INTL_G( grapheme_iterator );
  44 
  45         if ( NULL != global_break_iterator ) {
  46                 ubrk_close(global_break_iterator);
  47         }
  48 }
  49 /* }}} */
  50 
  51 /* {{{ grapheme_substr_ascii f='from' - starting point, l='length' */
  /*
   * Locate a substring of an ASCII string by byte position, without copying.
   *
   * str, str_len - input buffer and its length in bytes
   * f            - "from": start position; a negative value counts back from
   *                the end of the string
   * l            - "length": number of bytes wanted; a negative value means
   *                "stop that many bytes before the end of the string"
   * sub_str      - receives a pointer into str, or NULL when the request is
   *                out of range (or str_len does not fit in an int32_t)
   * sub_str_len  - receives the substring length; 0 whenever *sub_str is NULL
   *
   * All range checks are written so that no intermediate value can overflow
   * int32_t, including f == INT32_MIN or l == INT32_MIN.
   */
  void grapheme_substr_ascii(char *str, size_t str_len, int32_t f, int32_t l, char **sub_str, int32_t *sub_str_len)
  {
      int32_t len;

      *sub_str = NULL;
      *sub_str_len = 0;

      if (str_len > INT32_MAX) {
          /* We can not return long strings from ICU functions, so we won't here too */
          return;
      }
      len = (int32_t)str_len; /* safe: range-checked above */

      /* "-l > len" rewritten as "l < -len": -len cannot overflow, -l can */
      if (l < -len) {
          return;
      }
      if (l > len) {
          l = len;
      }

      if (f > len || f < -len) {
          return;
      }

      /* "len < f - l" rewritten as "f > len + l"; len + l is in [0, len) here */
      if (l < 0 && f > len + l) {
          return;
      }

      /* a negative "from" counts from the end; f >= -len, so the result is >= 0 */
      if (f < 0) {
          f += len;
      }

      /* a negative "length" stops that many bytes before the end of the string */
      if (l < 0) {
          l += len - f;
          if (l < 0) {
              l = 0;
          }
      }

      if (f >= len) {
          return;
      }

      /* clamp to the end of the string; "f + l > len" without the addition */
      if (l > len - f) {
          l = len - f;
      }

      *sub_str = str + f;
      *sub_str_len = l;
  }
 110 /* }}} */
 111 
 /*
  * Error exit for grapheme_strpos_utf16(): when `status` is an ICU failure,
  * record the code and `error` message through the intl error API, release
  * whatever has been acquired so far and return -1 from the enclosing function.
  *
  * The macro reads the enclosing function's locals uhaystack, uneedle, bi and
  * src; each must be NULL until it has actually been acquired.
  *
  * Wrapped in do { } while (0) so that it behaves as a single statement
  * (safe in an unbraced if/else); invoke it with a trailing semicolon.
  */
 #define STRPOS_CHECK_STATUS(status, error)                  \
     do {                                                    \
         if ( U_FAILURE( (status) ) ) {                      \
             intl_error_set_code( NULL, (status) );          \
             intl_error_set_custom_msg( NULL, (error), 0 );  \
             if (uhaystack) {                                \
                 efree( uhaystack );                         \
             }                                               \
             if (uneedle) {                                  \
                 efree( uneedle );                           \
             }                                               \
             if(bi) {                                        \
                 ubrk_close (bi);                            \
             }                                               \
             if(src) {                                       \
                 usearch_close(src);                         \
             }                                               \
             return -1;                                      \
         }                                                   \
     } while (0)
 130 
 131 
 132 /* {{{ grapheme_strpos_utf16 - strpos/strrpos using UTF-16 */
 133 int32_t grapheme_strpos_utf16(char *haystack, size_t haystack_len, char *needle, size_t needle_len, int32_t offset, int32_t *puchar_pos, int f_ignore_case, int last)
 134 {
 135         UChar *uhaystack = NULL, *uneedle = NULL;
 136         int32_t uhaystack_len = 0, uneedle_len = 0, char_pos, ret_pos, offset_pos = 0;
 137         unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
 138         UBreakIterator* bi = NULL;
 139         UErrorCode status;
 140         UStringSearch* src = NULL;
 141         UCollator *coll;
 142 
 143         if(puchar_pos) {
 144                 *puchar_pos = -1;
 145         }
 146         /* convert the strings to UTF-16. */
 147 
 148         status = U_ZERO_ERROR;
 149         intl_convert_utf8_to_utf16(&uhaystack, &uhaystack_len, haystack, haystack_len, &status );
 150         STRPOS_CHECK_STATUS(status, "Error converting input string to UTF-16");
 151 
 152         status = U_ZERO_ERROR;
 153         intl_convert_utf8_to_utf16(&uneedle, &uneedle_len, needle, needle_len, &status );
 154         STRPOS_CHECK_STATUS(status, "Error converting needle string to UTF-16");
 155 
 156         /* get a pointer to the haystack taking into account the offset */
 157         status = U_ZERO_ERROR;
 158         bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status );
 159         STRPOS_CHECK_STATUS(status, "Failed to get iterator");
 160         status = U_ZERO_ERROR;
 161         ubrk_setText(bi, uhaystack, uhaystack_len, &status);
 162         STRPOS_CHECK_STATUS(status, "Failed to set up iterator");
 163 
 164         status = U_ZERO_ERROR;
 165         src = usearch_open(uneedle, uneedle_len, uhaystack, uhaystack_len, "", bi, &status);
 166         STRPOS_CHECK_STATUS(status, "Error creating search object");
 167 
 168         if(f_ignore_case) {
 169                 coll = usearch_getCollator(src);
 170                 status = U_ZERO_ERROR;
 171                 ucol_setAttribute(coll, UCOL_STRENGTH, UCOL_SECONDARY, &status);
 172                 STRPOS_CHECK_STATUS(status, "Error setting collation strength");
 173                 usearch_reset(src);
 174         }
 175 
 176         if(offset != 0) {
 177                 offset_pos = grapheme_get_haystack_offset(bi, offset);
 178                 if(offset_pos == -1) {
 179                         status = U_ILLEGAL_ARGUMENT_ERROR;
 180                         STRPOS_CHECK_STATUS(status, "Invalid search offset");
 181                 }
 182                 status = U_ZERO_ERROR;
 183                 usearch_setOffset(src, offset_pos, &status);
 184                 STRPOS_CHECK_STATUS(status, "Invalid search offset");
 185         }
 186 
 187 
 188         if(last) {
 189                 char_pos = usearch_last(src, &status);
 190                 if(char_pos < offset_pos) {
 191                         /* last one is beyound our start offset */
 192                         char_pos = USEARCH_DONE;
 193                 }
 194         } else {
 195                 char_pos = usearch_next(src, &status);
 196         }
 197         STRPOS_CHECK_STATUS(status, "Error looking up string");
 198         if(char_pos != USEARCH_DONE && ubrk_isBoundary(bi, char_pos)) {
 199                 ret_pos = grapheme_count_graphemes(bi, uhaystack,char_pos);
 200                 if(puchar_pos) {
 201                         *puchar_pos = char_pos;
 202                 }
 203         } else {
 204                 ret_pos = -1;
 205         }
 206 
 207         if (uhaystack) {
 208                 efree( uhaystack );
 209         }
 210         if (uneedle) {
 211                 efree( uneedle );
 212         }
 213         ubrk_close (bi);
 214         usearch_close (src);
 215 
 216         return ret_pos;
 217 }
 218 
 219 /* }}} */
 220 
 221 /* {{{ grapheme_ascii_check: ASCII check */
 222 zend_long grapheme_ascii_check(const unsigned char *day, size_t len)
 223 {
 224         int ret_len = len;
 225         while ( len-- ) {
 226         if ( *day++ > 0x7f )
 227                 return -1;
 228         }
 229 
 230         return ret_len;
 231 }
 232 
 233 /* }}} */
 234 
 235 /* {{{ grapheme_split_string: find and optionally return grapheme boundaries */
 236 int32_t grapheme_split_string(const UChar *text, int32_t text_length, int boundary_array[], int boundary_array_len )
 237 {
 238         unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
 239         UErrorCode              status = U_ZERO_ERROR;
 240         int ret_len, pos;
 241         UBreakIterator* bi;
 242 
 243         bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status );
 244 
 245         if( U_FAILURE(status) ) {
 246                 return -1;
 247         }
 248 
 249         ubrk_setText(bi, text, text_length,     &status);
 250 
 251         pos = 0;
 252 
 253         for ( ret_len = 0; pos != UBRK_DONE; ) {
 254 
 255                 pos = ubrk_next(bi);
 256 
 257                 if ( pos != UBRK_DONE ) {
 258 
 259                         if ( NULL != boundary_array && ret_len < boundary_array_len ) {
 260                                 boundary_array[ret_len] = pos;
 261                         }
 262 
 263                         ret_len++;
 264                 }
 265         }
 266 
 267         ubrk_close(bi);
 268 
 269         return ret_len;
 270 }
 271 /* }}} */
 272 
 273 /* {{{ grapheme_count_graphemes */
 274 int32_t grapheme_count_graphemes(UBreakIterator *bi, UChar *string, int32_t string_len)
 275 {
 276         int ret_len = 0;
 277         int pos = 0;
 278         UErrorCode              status = U_ZERO_ERROR;
 279 
 280         ubrk_setText(bi, string, string_len, &status);
 281 
 282         do {
 283 
 284                 pos = ubrk_next(bi);
 285 
 286                 if ( UBRK_DONE != pos ) {
 287                         ret_len++;
 288                 }
 289 
 290         } while ( UBRK_DONE != pos );
 291 
 292         return ret_len;
 293 }
 294 /* }}} */
 295 
 296 
 297 /* {{{  grapheme_get_haystack_offset - bump the haystack pointer based on the grapheme count offset */
 298 int32_t grapheme_get_haystack_offset(UBreakIterator* bi, int32_t offset)
 299 {
 300         int32_t pos;
 301         int32_t (*iter_op)(UBreakIterator* bi);
 302         int iter_incr;
 303 
 304         if ( 0 == offset ) {
 305                 return 0;
 306         }
 307 
 308         if ( offset < 0 ) {
 309                 iter_op = ubrk_previous;
 310                 ubrk_last(bi); /* one past the end */
 311                 iter_incr = 1;
 312         }
 313         else {
 314                 iter_op = ubrk_next;
 315                 iter_incr = -1;
 316         }
 317 
 318         pos = 0;
 319 
 320         while ( pos != UBRK_DONE && offset != 0 ) {
 321 
 322                 pos = iter_op(bi);
 323 
 324                 if ( UBRK_DONE != pos ) {
 325                         offset += iter_incr;
 326                 }
 327         }
 328 
 329         if ( offset != 0 ) {
 330                 return -1;
 331         }
 332 
 333         return pos;
 334 }
 335 /* }}} */
 336 
 337 /* {{{ grapheme_strrpos_ascii: borrowed from the php ext/standard/string.c */
 338  zend_long
 339 grapheme_strrpos_ascii(char *haystack, size_t haystack_len, char *needle, size_t needle_len, int32_t offset)
 340 {
 341         char *p, *e;
 342 
 343         if (offset >= 0) {
 344                 p = haystack + offset;
 345                 e = haystack + haystack_len - needle_len;
 346         } else {
 347                 p = haystack;
 348                 if (needle_len > -offset) {
 349                         e = haystack + haystack_len - needle_len;
 350                 } else {
 351                         e = haystack + haystack_len + offset;
 352                 }
 353         }
 354 
 355         if (needle_len == 1) {
 356                 /* Single character search can shortcut memcmps */
 357                 while (e >= p) {
 358                         if (*e == *needle) {
 359                                 return (e - p + (offset > 0 ? offset : 0));
 360                         }
 361                         e--;
 362                 }
 363                 return -1;
 364         }
 365 
 366         while (e >= p) {
 367                 if (memcmp(e, needle, needle_len) == 0) {
 368                         return (e - p + (offset > 0 ? offset : 0));
 369                 }
 370                 e--;
 371         }
 372 
 373         return -1;
 374 }
 375 
 376 /* }}} */
 377 
 378 /* {{{ grapheme_get_break_iterator: get a clone of the global character break iterator */
 379 UBreakIterator* grapheme_get_break_iterator(void *stack_buffer, UErrorCode *status )
 380 {
 381         int32_t buffer_size;
 382 
 383         UBreakIterator *global_break_iterator = INTL_G( grapheme_iterator );
 384 
 385         if ( NULL == global_break_iterator ) {
 386 
 387                 global_break_iterator = ubrk_open(UBRK_CHARACTER,
 388                                                                                         NULL,   /* icu default locale - locale has no effect on this iterator */
 389                                                                                         NULL,   /* text not set in global iterator */
 390                                                                                         0,              /* text length = 0 */
 391                                                                                         status);
 392 
 393                 INTL_G(grapheme_iterator) = global_break_iterator;
 394         }
 395 
 396         buffer_size = U_BRK_SAFECLONE_BUFFERSIZE;
 397 
 398         return ubrk_safeClone(global_break_iterator, stack_buffer, &buffer_size, status);
 399 }
 400 /* }}} */
 401 
 402 /*
 403  * Local variables:
 404  * tab-width: 4
 405  * c-basic-offset: 4
 406  * End:
 407  * vim600: fdm=marker
 408  * vim: noet sw=4 ts=4
 409  */
 410 

/* [<][>][^][v][top][bottom][index][help] */