root/ext/standard/metaphone.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. PHP_FUNCTION
  2. Lookahead
  3. metaphone

   1 /*
   2    +----------------------------------------------------------------------+
   3    | PHP Version 7                                                        |
   4    +----------------------------------------------------------------------+
   5    | Copyright (c) 1997-2016 The PHP Group                                |
   6    +----------------------------------------------------------------------+
   7    | This source file is subject to version 3.01 of the PHP license,      |
   8    | that is bundled with this package in the file LICENSE, and is        |
   9    | available through the world-wide-web at the following url:           |
  10    | http://www.php.net/license/3_01.txt                                  |
  11    | If you did not receive a copy of the PHP license and are unable to   |
  12    | obtain it through the world-wide-web, please send a note to          |
  13    | license@php.net so we can mail you a copy immediately.               |
  14    +----------------------------------------------------------------------+
  15    | Author: Thies C. Arntzen <thies@thieso.net>                          |
  16    +----------------------------------------------------------------------+
  17 */
  18 
  19 /* $Id$ */
  20 
  21 /*
  22         Based on CPANs "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com>
  23 */
  24 
  25 #include "php.h"
  26 #include "php_metaphone.h"
  27 
  28 static int metaphone(unsigned char *word, size_t word_len, zend_long max_phonemes, zend_string **phoned_word, int traditional);
  29 
  30 /* {{{ proto string metaphone(string text[, int phones])
  31    Break english phrases down into their phonemes */
  32 PHP_FUNCTION(metaphone)
  33 {
  34         zend_string *str;
  35         zend_string *result = NULL;
  36         zend_long phones = 0;
  37 
  38         if (zend_parse_parameters(ZEND_NUM_ARGS(), "S|l", &str, &phones) == FAILURE) {
  39                 return;
  40         }
  41 
  42         if (metaphone((unsigned char *)ZSTR_VAL(str), ZSTR_LEN(str), phones, &result, 1) == 0) {
  43                 RETVAL_STR(result);
  44         } else {
  45                 if (result) {
  46                         zend_string_free(result);
  47                 }
  48                 RETURN_FALSE;
  49         }
  50 }
  51 /* }}} */
  52 
  53 /*
  54    this is now the original code by Michael G Schwern:
  55    i've changed it just a slightly bit (use emalloc,
  56    get rid of includes etc)
  57         - thies - 13.09.1999
  58 */
  59 
  60 /*-----------------------------  */
  61 /* this used to be "metaphone.h" */
  62 /*-----------------------------  */
  63 
  64 /* Special encodings */
  65 #define  SH     'X'
  66 #define  TH             '0'
  67 
  68 /*-----------------------------  */
  69 /* end of "metaphone.h"          */
  70 /*-----------------------------  */
  71 
  72 /*----------------------------- */
  73 /* this used to be "metachar.h" */
  74 /*----------------------------- */
  75 
  76 /* Metachar.h ... little bits about characters for metaphone */
  77 /*-- Character encoding array & accessing macros --*/
  78 /* Stolen directly out of the book... */
  79 char _codes[26] =
  80 {
  81         1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0
  82 /*  a  b c  d e f g  h i j k l m n o p q r s t u v w x y z */
  83 };
  84 
  85 
  86 #define ENCODE(c) (isalpha(c) ? _codes[((toupper(c)) - 'A')] : 0)
  87 
  88 #define isvowel(c)  (ENCODE(c) & 1)             /* AEIOU */
  89 
  90 /* These letters are passed through unchanged */
  91 #define NOCHANGE(c) (ENCODE(c) & 2)             /* FJMNR */
  92 
  93 /* These form diphthongs when preceding H */
  94 #define AFFECTH(c)  (ENCODE(c) & 4)             /* CGPST */
  95 
  96 /* These make C and G soft */
  97 #define MAKESOFT(c) (ENCODE(c) & 8)             /* EIY */
  98 
  99 /* These prevent GH from becoming F */
 100 #define NOGHTOF(c)  (ENCODE(c) & 16)    /* BDH */
 101 
 102 /*----------------------------- */
 103 /* end of "metachar.h"          */
 104 /*----------------------------- */
 105 
 106 /* I suppose I could have been using a character pointer instead of
 107  * accesssing the array directly... */
 108 
 109 /* Look at the next letter in the word */
 110 #define Next_Letter (toupper(word[w_idx+1]))
 111 /* Look at the current letter in the word */
 112 #define Curr_Letter (toupper(word[w_idx]))
 113 /* Go N letters back. */
 114 #define Look_Back_Letter(n)     (w_idx >= n ? toupper(word[w_idx-n]) : '\0')
 115 /* Previous letter.  I dunno, should this return null on failure? */
 116 #define Prev_Letter (Look_Back_Letter(1))
 117 /* Look two letters down.  It makes sure you don't walk off the string. */
 118 #define After_Next_Letter       (Next_Letter != '\0' ? toupper(word[w_idx+2]) \
 119                                                                                              : '\0')
 120 #define Look_Ahead_Letter(n) (toupper(Lookahead((char *) word+w_idx, n)))
 121 
 122 
 123 /* Allows us to safely look ahead an arbitrary # of letters */
 124 /* I probably could have just used strlen... */
 125 static char Lookahead(char *word, int how_far)
 126 {
 127         char letter_ahead = '\0';       /* null by default */
 128         int idx;
 129         for (idx = 0; word[idx] != '\0' && idx < how_far; idx++);
 130         /* Edge forward in the string... */
 131 
 132         letter_ahead = word[idx];       /* idx will be either == to how_far or
 133                                                                  * at the end of the string
 134                                                                  */
 135         return letter_ahead;
 136 }
 137 
 138 
 139 /* phonize one letter
 140  * We don't know the buffers size in advance. On way to solve this is to just
 141  * re-allocate the buffer size. We're using an extra of 2 characters (this
 142  * could be one though; or more too). */
 143 #define Phonize(c)      { \
 144                                                 if (p_idx >= max_buffer_len) { \
 145                                                         *phoned_word = zend_string_extend(*phoned_word, 2 * sizeof(char) + max_buffer_len, 0); \
 146                                                         max_buffer_len += 2; \
 147                                                 } \
 148                                                 ZSTR_VAL(*phoned_word)[p_idx++] = c; \
 149                                                 ZSTR_LEN(*phoned_word) = p_idx; \
 150                                         }
 151 /* Slap a null character on the end of the phoned word */
 152 #define End_Phoned_Word { \
 153                                                         if (p_idx == max_buffer_len) { \
 154                                                                 *phoned_word = zend_string_extend(*phoned_word, 1 * sizeof(char) + max_buffer_len, 0); \
 155                                                                 max_buffer_len += 1; \
 156                                                         } \
 157                                                         ZSTR_VAL(*phoned_word)[p_idx] = '\0'; \
 158                                                         ZSTR_LEN(*phoned_word) = p_idx; \
 159                                                 }
 160 /* How long is the phoned word? */
 161 #define Phone_Len       (p_idx)
 162 
 163 /* Note is a letter is a 'break' in the word */
 164 #define Isbreak(c)  (!isalpha(c))
 165 
 166 /* {{{ metaphone
 167  */
 168 static int metaphone(unsigned char *word, size_t word_len, zend_long max_phonemes, zend_string **phoned_word, int traditional)
 169 {
 170         int w_idx = 0;                          /* point in the phonization we're at. */
 171         int p_idx = 0;                          /* end of the phoned phrase */
 172         size_t max_buffer_len = 0;              /* maximum length of the destination buffer */
 173 
 174 /*-- Parameter checks --*/
 175         /* Negative phoneme length is meaningless */
 176 
 177         if (max_phonemes < 0)
 178                 return -1;
 179 
 180         /* Empty/null string is meaningless */
 181         /* Overly paranoid */
 182         /* assert(word != NULL && word[0] != '\0'); */
 183 
 184         if (word == NULL)
 185                 return -1;
 186 
 187 /*-- Allocate memory for our phoned_phrase --*/
 188         if (max_phonemes == 0) {        /* Assume largest possible */
 189                 max_buffer_len = word_len;
 190                 *phoned_word = zend_string_alloc(sizeof(char) * word_len + 1, 0);
 191         } else {
 192                 max_buffer_len = max_phonemes;
 193                 *phoned_word = zend_string_alloc(sizeof(char) * max_phonemes + 1, 0);
 194         }
 195 
 196 
 197 /*-- The first phoneme has to be processed specially. --*/
 198         /* Find our first letter */
 199         for (; !isalpha(Curr_Letter); w_idx++) {
 200                 /* On the off chance we were given nothing but crap... */
 201                 if (Curr_Letter == '\0') {
 202                         End_Phoned_Word
 203                                 return SUCCESS; /* For testing */
 204                 }
 205         }
 206 
 207         switch (Curr_Letter) {
 208                 /* AE becomes E */
 209         case 'A':
 210                 if (Next_Letter == 'E') {
 211                         Phonize('E');
 212                         w_idx += 2;
 213                 }
 214                 /* Remember, preserve vowels at the beginning */
 215                 else {
 216                         Phonize('A');
 217                         w_idx++;
 218                 }
 219                 break;
 220                 /* [GKP]N becomes N */
 221         case 'G':
 222         case 'K':
 223         case 'P':
 224                 if (Next_Letter == 'N') {
 225                         Phonize('N');
 226                         w_idx += 2;
 227                 }
 228                 break;
 229                 /* WH becomes W,
 230                    WR becomes R
 231                    W if followed by a vowel */
 232         case 'W':
 233                 if (Next_Letter == 'R') {
 234                         Phonize(Next_Letter);
 235                         w_idx += 2;
 236                 } else if (Next_Letter == 'H' || isvowel(Next_Letter)) {
 237                         Phonize('W');
 238                         w_idx += 2;
 239                 }
 240                 /* else ignore */
 241                 break;
 242                 /* X becomes S */
 243         case 'X':
 244                 Phonize('S');
 245                 w_idx++;
 246                 break;
 247                 /* Vowels are kept */
 248                 /* We did A already
 249                    case 'A':
 250                    case 'a':
 251                  */
 252         case 'E':
 253         case 'I':
 254         case 'O':
 255         case 'U':
 256                 Phonize(Curr_Letter);
 257                 w_idx++;
 258                 break;
 259         default:
 260                 /* do nothing */
 261                 break;
 262         }
 263 
 264 
 265 
 266         /* On to the metaphoning */
 267         for (; Curr_Letter != '\0' &&
 268                  (max_phonemes == 0 || Phone_Len < max_phonemes);
 269                  w_idx++) {
 270                 /* How many letters to skip because an eariler encoding handled
 271                  * multiple letters */
 272                 unsigned short int skip_letter = 0;
 273 
 274 
 275                 /* THOUGHT:  It would be nice if, rather than having things like...
 276                  * well, SCI.  For SCI you encode the S, then have to remember
 277                  * to skip the C.  So the phonome SCI invades both S and C.  It would
 278                  * be better, IMHO, to skip the C from the S part of the encoding.
 279                  * Hell, I'm trying it.
 280                  */
 281 
 282                 /* Ignore non-alphas */
 283                 if (!isalpha(Curr_Letter))
 284                         continue;
 285 
 286                 /* Drop duplicates, except CC */
 287                 if (Curr_Letter == Prev_Letter &&
 288                         Curr_Letter != 'C')
 289                         continue;
 290 
 291                 switch (Curr_Letter) {
 292                         /* B -> B unless in MB */
 293                 case 'B':
 294                         if (Prev_Letter != 'M')
 295                                 Phonize('B');
 296                         break;
 297                         /* 'sh' if -CIA- or -CH, but not SCH, except SCHW.
 298                          * (SCHW is handled in S)
 299                          *  S if -CI-, -CE- or -CY-
 300                          *  dropped if -SCI-, SCE-, -SCY- (handed in S)
 301                          *  else K
 302                          */
 303                 case 'C':
 304                         if (MAKESOFT(Next_Letter)) {    /* C[IEY] */
 305                                 if (After_Next_Letter == 'A' &&
 306                                         Next_Letter == 'I') {   /* CIA */
 307                                         Phonize(SH);
 308                                 }
 309                                 /* SC[IEY] */
 310                                 else if (Prev_Letter == 'S') {
 311                                         /* Dropped */
 312                                 } else {
 313                                         Phonize('S');
 314                                 }
 315                         } else if (Next_Letter == 'H') {
 316                                 if ((!traditional) && (After_Next_Letter == 'R' || Prev_Letter == 'S')) {       /* Christ, School */
 317                                         Phonize('K');
 318                                 } else {
 319                                         Phonize(SH);
 320                                 }
 321                                 skip_letter++;
 322                         } else {
 323                                 Phonize('K');
 324                         }
 325                         break;
 326                         /* J if in -DGE-, -DGI- or -DGY-
 327                          * else T
 328                          */
 329                 case 'D':
 330                         if (Next_Letter == 'G' &&
 331                                 MAKESOFT(After_Next_Letter)) {
 332                                 Phonize('J');
 333                                 skip_letter++;
 334                         } else
 335                                 Phonize('T');
 336                         break;
 337                         /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH
 338                          * else dropped if -GNED, -GN,
 339                          * else dropped if -DGE-, -DGI- or -DGY- (handled in D)
 340                          * else J if in -GE-, -GI, -GY and not GG
 341                          * else K
 342                          */
 343                 case 'G':
 344                         if (Next_Letter == 'H') {
 345                                 if (!(NOGHTOF(Look_Back_Letter(3)) ||
 346                                           Look_Back_Letter(4) == 'H')) {
 347                                         Phonize('F');
 348                                         skip_letter++;
 349                                 } else {
 350                                         /* silent */
 351                                 }
 352                         } else if (Next_Letter == 'N') {
 353                                 if (Isbreak(After_Next_Letter) ||
 354                                         (After_Next_Letter == 'E' &&
 355                                          Look_Ahead_Letter(3) == 'D')) {
 356                                         /* dropped */
 357                                 } else
 358                                         Phonize('K');
 359                         } else if (MAKESOFT(Next_Letter) &&
 360                                            Prev_Letter != 'G') {
 361                                 Phonize('J');
 362                         } else {
 363                                 Phonize('K');
 364                         }
 365                         break;
 366                         /* H if before a vowel and not after C,G,P,S,T */
 367                 case 'H':
 368                         if (isvowel(Next_Letter) &&
 369                                 !AFFECTH(Prev_Letter))
 370                                 Phonize('H');
 371                         break;
 372                         /* dropped if after C
 373                          * else K
 374                          */
 375                 case 'K':
 376                         if (Prev_Letter != 'C')
 377                                 Phonize('K');
 378                         break;
 379                         /* F if before H
 380                          * else P
 381                          */
 382                 case 'P':
 383                         if (Next_Letter == 'H') {
 384                                 Phonize('F');
 385                         } else {
 386                                 Phonize('P');
 387                         }
 388                         break;
 389                         /* K
 390                          */
 391                 case 'Q':
 392                         Phonize('K');
 393                         break;
 394                         /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW-
 395                          * else S
 396                          */
 397                 case 'S':
 398                         if (Next_Letter == 'I' &&
 399                                 (After_Next_Letter == 'O' ||
 400                                  After_Next_Letter == 'A')) {
 401                                 Phonize(SH);
 402                         } else if (Next_Letter == 'H') {
 403                                 Phonize(SH);
 404                                 skip_letter++;
 405                         } else if ((!traditional) && (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) {
 406                                 Phonize(SH);
 407                                 skip_letter += 2;
 408                         } else {
 409                                 Phonize('S');
 410                         }
 411                         break;
 412                         /* 'sh' in -TIA- or -TIO-
 413                          * else 'th' before H
 414                          * else T
 415                          */
 416                 case 'T':
 417                         if (Next_Letter == 'I' &&
 418                                 (After_Next_Letter == 'O' ||
 419                                  After_Next_Letter == 'A')) {
 420                                 Phonize(SH);
 421                         } else if (Next_Letter == 'H') {
 422                                 Phonize(TH);
 423                                 skip_letter++;
 424                         } else if (!(Next_Letter == 'C' && After_Next_Letter == 'H')) {
 425                                 Phonize('T');
 426                         }
 427                         break;
 428                         /* F */
 429                 case 'V':
 430                         Phonize('F');
 431                         break;
 432                         /* W before a vowel, else dropped */
 433                 case 'W':
 434                         if (isvowel(Next_Letter))
 435                                 Phonize('W');
 436                         break;
 437                         /* KS */
 438                 case 'X':
 439                         Phonize('K');
 440                         Phonize('S');
 441                         break;
 442                         /* Y if followed by a vowel */
 443                 case 'Y':
 444                         if (isvowel(Next_Letter))
 445                                 Phonize('Y');
 446                         break;
 447                         /* S */
 448                 case 'Z':
 449                         Phonize('S');
 450                         break;
 451                         /* No transformation */
 452                 case 'F':
 453                 case 'J':
 454                 case 'L':
 455                 case 'M':
 456                 case 'N':
 457                 case 'R':
 458                         Phonize(Curr_Letter);
 459                         break;
 460                 default:
 461                         /* nothing */
 462                         break;
 463                 }                                               /* END SWITCH */
 464 
 465                 w_idx += skip_letter;
 466         }                                                       /* END FOR */
 467 
 468         End_Phoned_Word;
 469 
 470         return 0;
 471 }                                                               /* END metaphone */
 472 /* }}} */
 473 
 474 /*
 475  * Local variables:
 476  * tab-width: 4
 477  * c-basic-offset: 4
 478  * End:
 479  * vim600: sw=4 ts=4 fdm=marker
 480  * vim<600: sw=4 ts=4
 481  */

/* [<][>][^][v][top][bottom][index][help] */