This source file includes the following definitions.
- onig_null_warn
- onig_set_warn_func
- onig_set_verb_warn_func
- bbuf_free
- bbuf_clone
- bitset_set_range
- bitset_set_all
- bitset_invert
- bitset_invert_to
- bitset_and
- bitset_or
- bitset_copy
- onig_strncmp
- onig_strcpy
- strdup_with_null
- strcat_capa
- strcat_capa_from_static
- str_end_cmp
- str_end_hash
- onig_st_init_strend_table_with_size
- onig_st_lookup_strend
- onig_st_insert_strend
- i_print_name_entry
- onig_print_names
- i_free_name_entry
- names_clear
- onig_names_free
- name_find
- i_names
- onig_foreach_name
- i_renumber_name
- onig_renumber_name_table
- onig_number_of_names
- onig_print_names
- names_clear
- onig_names_free
- name_find
- onig_foreach_name
- onig_number_of_names
- name_add
- onig_name_to_group_numbers
- onig_name_to_backref_number
- onig_name_to_group_numbers
- onig_name_to_backref_number
- onig_foreach_name
- onig_number_of_names
- onig_noname_group_capture_is_active
- scan_env_clear
- scan_env_add_mem_entry
- scan_env_set_mem_node
- onig_node_free
- onig_free_node_list
- node_new
- initialize_cclass
- node_new_cclass
- node_new_cclass_by_codepoint_range
- node_new_ctype
- node_new_anychar
- node_new_list
- onig_node_new_list
- onig_node_list_add
- onig_node_new_alt
- onig_node_new_anchor
- node_new_backref
- node_new_call
- node_new_quantifier
- node_new_enclose
- onig_node_new_enclose
- node_new_enclose_memory
- node_new_option
- onig_node_str_cat
- onig_node_str_set
- node_str_cat_char
- onig_node_conv_to_str_node
- onig_node_str_clear
- node_new_str
- onig_node_new_str
- node_new_str_raw
- node_new_empty
- node_new_str_raw_char
- str_node_split_last_char
- str_node_can_be_split
- node_str_head_pad
- onig_scan_unsigned_number
- scan_unsigned_hexadecimal_number
- scan_unsigned_octal_number
- new_code_range
- add_code_range_to_buf
- add_code_range
- not_code_range_buf
- or_code_range_buf
- and_code_range1
- and_code_range_buf
- and_cclass
- or_cclass
- conv_backslash_value
- is_invalid_quantifier_target
- popular_quantifier_num
- onig_reduce_nested_quantifier
- fetch_range_quantifier
- fetch_escaped_value
- get_name_end_code_point
- fetch_name_with_level
- fetch_name
- fetch_name
- CC_ESC_WARN
- CLOSE_BRACKET_WITHOUT_ESC_WARN
- find_str_position
- str_exist_check_with_esc
- fetch_token_in_cc
- fetch_token
- add_ctype_to_cc_by_range
- add_ctype_to_cc
- parse_posix_bracket
- fetch_char_property_to_ctype
- parse_char_property
- next_state_class
- next_state_val
- code_exist_check
- parse_char_class
- parse_enclose
- set_quantifier
- type_cclass_cmp
- type_cclass_hash
- i_free_shared_class
- onig_free_shared_cclass_table
- clear_not_flag_cclass
- i_apply_case_fold
- parse_exp
- parse_branch
- parse_subexp
- parse_regexp
- onig_parse_make_tree
- onig_scan_env_set_error_string
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30 #include "regparse.h"
31 #include "st.h"
32
33 #define WARN_BUFSIZE 256
34
35 #define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
36
37
38 OnigSyntaxType OnigSyntaxRuby = {
39 (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
40 ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
41 ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS |
42 ONIG_SYN_OP_ESC_C_CONTROL )
43 & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
44 , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
45 ONIG_SYN_OP2_OPTION_RUBY |
46 ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
47 ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
48 ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY |
49 ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
50 ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
51 ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
52 ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
53 ONIG_SYN_OP2_ESC_H_XDIGIT )
54 , ( SYN_GNU_REGEX_BV |
55 ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
56 ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
57 ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
58 ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
59 ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
60 ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
61 ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
62 , ONIG_OPTION_NONE
63 ,
64 {
65 (OnigCodePoint )'\\'
66 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR
67 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR
68 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR
69 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR
70 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR
71 }
72 };
73
74 OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_RUBY;
75
76 extern void onig_null_warn(const char* s ARG_UNUSED) { }
77
78 #ifdef DEFAULT_WARN_FUNCTION
79 static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION;
80 #else
81 static OnigWarnFunc onig_warn = onig_null_warn;
82 #endif
83
84 #ifdef DEFAULT_VERB_WARN_FUNCTION
85 static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION;
86 #else
87 static OnigWarnFunc onig_verb_warn = onig_null_warn;
88 #endif
89
90 extern void onig_set_warn_func(OnigWarnFunc f)
91 {
92 onig_warn = f;
93 }
94
95 extern void onig_set_verb_warn_func(OnigWarnFunc f)
96 {
97 onig_verb_warn = f;
98 }
99
100 static void
101 bbuf_free(BBuf* bbuf)
102 {
103 if (IS_NOT_NULL(bbuf)) {
104 if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
105 xfree(bbuf);
106 }
107 }
108
109 static int
110 bbuf_clone(BBuf** rto, BBuf* from)
111 {
112 int r;
113 BBuf *to;
114
115 *rto = to = (BBuf* )xmalloc(sizeof(BBuf));
116 CHECK_NULL_RETURN_MEMERR(to);
117 r = BBUF_INIT(to, from->alloc);
118 if (r != 0) return r;
119 to->used = from->used;
120 xmemcpy(to->p, from->p, from->used);
121 return 0;
122 }
123
124 #define BACKREF_REL_TO_ABS(rel_no, env) \
125 ((env)->num_mem + 1 + (rel_no))
126
/* Clear the bits `f` in `v` when `negative` is non-zero, otherwise set them.
   The whole expansion is parenthesized so it stays one operand when used
   inside a larger expression. */
#define ONOFF(v,f,negative)    ((negative) ? ((v) &= ~(f)) : ((v) |= (f)))
128
129 #define MBCODE_START_POS(enc) \
130 (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80)
131
132 #define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
133 add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0))
134
135 #define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
136 if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
137 r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
138 if (r) return r;\
139 }\
140 } while (0)
141
142
143 #define BITSET_IS_EMPTY(bs,empty) do {\
144 int i;\
145 empty = 1;\
146 for (i = 0; i < (int )BITSET_SIZE; i++) {\
147 if ((bs)[i] != 0) {\
148 empty = 0; break;\
149 }\
150 }\
151 } while (0)
152
153 static void
154 bitset_set_range(BitSetRef bs, int from, int to)
155 {
156 int i;
157 for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) {
158 BITSET_SET_BIT(bs, i);
159 }
160 }
161
162 #if 0
163 static void
164 bitset_set_all(BitSetRef bs)
165 {
166 int i;
167 for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~((Bits )0); }
168 }
169 #endif
170
171 static void
172 bitset_invert(BitSetRef bs)
173 {
174 int i;
175 for (i = 0; i < (int )BITSET_SIZE; i++) { bs[i] = ~(bs[i]); }
176 }
177
178 static void
179 bitset_invert_to(BitSetRef from, BitSetRef to)
180 {
181 int i;
182 for (i = 0; i < (int )BITSET_SIZE; i++) { to[i] = ~(from[i]); }
183 }
184
185 static void
186 bitset_and(BitSetRef dest, BitSetRef bs)
187 {
188 int i;
189 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] &= bs[i]; }
190 }
191
192 static void
193 bitset_or(BitSetRef dest, BitSetRef bs)
194 {
195 int i;
196 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] |= bs[i]; }
197 }
198
199 static void
200 bitset_copy(BitSetRef dest, BitSetRef bs)
201 {
202 int i;
203 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] = bs[i]; }
204 }
205
206 extern int
207 onig_strncmp(const UChar* s1, const UChar* s2, int n)
208 {
209 int x;
210
211 while (n-- > 0) {
212 x = *s2++ - *s1++;
213 if (x) return x;
214 }
215 return 0;
216 }
217
218 extern void
219 onig_strcpy(UChar* dest, const UChar* src, const UChar* end)
220 {
221 int len = end - src;
222 if (len > 0) {
223 xmemcpy(dest, src, len);
224 dest[len] = (UChar )0;
225 }
226 }
227
228 #ifdef USE_NAMED_GROUP
229 static UChar*
230 strdup_with_null(OnigEncoding enc, UChar* s, UChar* end)
231 {
232 int slen, term_len, i;
233 UChar *r;
234
235 slen = end - s;
236 term_len = ONIGENC_MBC_MINLEN(enc);
237
238 r = (UChar* )xmalloc(slen + term_len);
239 CHECK_NULL_RETURN(r);
240 xmemcpy(r, s, slen);
241
242 for (i = 0; i < term_len; i++)
243 r[slen + i] = (UChar )0;
244
245 return r;
246 }
247 #endif
248
249
250 #define PEND_VALUE 0
251
252 #define PFETCH_READY UChar* pfetch_prev
253 #define PEND (p < end ? 0 : 1)
254 #define PUNFETCH p = pfetch_prev
255 #define PINC do { \
256 pfetch_prev = p; \
257 p += ONIGENC_MBC_ENC_LEN(enc, p); \
258 } while (0)
259 #define PFETCH(c) do { \
260 c = ONIGENC_MBC_TO_CODE(enc, p, end); \
261 pfetch_prev = p; \
262 p += ONIGENC_MBC_ENC_LEN(enc, p); \
263 } while (0)
264
265 #define PINC_S do { \
266 p += ONIGENC_MBC_ENC_LEN(enc, p); \
267 } while (0)
268 #define PFETCH_S(c) do { \
269 c = ONIGENC_MBC_TO_CODE(enc, p, end); \
270 p += ONIGENC_MBC_ENC_LEN(enc, p); \
271 } while (0)
272
273 #define PPEEK (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
274 #define PPEEK_IS(c) (PPEEK == (OnigCodePoint )c)
275
276 static UChar*
277 strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end,
278 int capa)
279 {
280 UChar* r;
281
282 if (dest)
283 r = (UChar* )xrealloc(dest, capa + 1);
284 else
285 r = (UChar* )xmalloc(capa + 1);
286
287 CHECK_NULL_RETURN(r);
288 onig_strcpy(r + (dest_end - dest), src, src_end);
289 return r;
290 }
291
292
293 static UChar*
294 strcat_capa_from_static(UChar* dest, UChar* dest_end,
295 const UChar* src, const UChar* src_end, int capa)
296 {
297 UChar* r;
298
299 r = (UChar* )xmalloc(capa + 1);
300 CHECK_NULL_RETURN(r);
301 onig_strcpy(r, dest, dest_end);
302 onig_strcpy(r + (dest_end - dest), src, src_end);
303 return r;
304 }
305
306
307 #ifdef USE_ST_LIBRARY
308
309 typedef struct {
310 UChar* s;
311 UChar* end;
312 } st_str_end_key;
313
314 static int
315 str_end_cmp(st_str_end_key* x, st_str_end_key* y)
316 {
317 UChar *p, *q;
318 int c;
319
320 if ((x->end - x->s) != (y->end - y->s))
321 return 1;
322
323 p = x->s;
324 q = y->s;
325 while (p < x->end) {
326 c = (int )*p - (int )*q;
327 if (c != 0) return c;
328
329 p++; q++;
330 }
331
332 return 0;
333 }
334
335 static int
336 str_end_hash(st_str_end_key* x)
337 {
338 UChar *p;
339 int val = 0;
340
341 p = x->s;
342 while (p < x->end) {
343 val = val * 997 + (int )*p++;
344 }
345
346 return val + (val >> 5);
347 }
348
349 extern hash_table_type*
350 onig_st_init_strend_table_with_size(int size)
351 {
352 static struct st_hash_type hashType = {
353 str_end_cmp,
354 str_end_hash,
355 };
356
357 return (hash_table_type* )
358 onig_st_init_table_with_size(&hashType, size);
359 }
360
361 extern int
362 onig_st_lookup_strend(hash_table_type* table, const UChar* str_key,
363 const UChar* end_key, hash_data_type *value)
364 {
365 st_str_end_key key;
366
367 key.s = (UChar* )str_key;
368 key.end = (UChar* )end_key;
369
370 return onig_st_lookup(table, (st_data_t )(&key), value);
371 }
372
373 extern int
374 onig_st_insert_strend(hash_table_type* table, const UChar* str_key,
375 const UChar* end_key, hash_data_type value)
376 {
377 st_str_end_key* key;
378 int result;
379
380 key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key));
381 key->s = (UChar* )str_key;
382 key->end = (UChar* )end_key;
383 result = onig_st_insert(table, (st_data_t )key, value);
384 if (result) {
385 xfree(key);
386 }
387 return result;
388 }
389
390 #endif
391
392
393 #ifdef USE_NAMED_GROUP
394
395 #define INIT_NAME_BACKREFS_ALLOC_NUM 8
396
397 typedef struct {
398 UChar* name;
399 int name_len;
400 int back_num;
401 int back_alloc;
402 int back_ref1;
403 int* back_refs;
404 } NameEntry;
405
406 #ifdef USE_ST_LIBRARY
407
408 typedef st_table NameTable;
409 typedef st_data_t HashDataType;
410
411 #define NAMEBUF_SIZE 24
412 #define NAMEBUF_SIZE_1 25
413
414 #ifdef ONIG_DEBUG
415 static int
416 i_print_name_entry(UChar* key, NameEntry* e, void* arg)
417 {
418 int i;
419 FILE* fp = (FILE* )arg;
420
421 fprintf(fp, "%s: ", e->name);
422 if (e->back_num == 0)
423 fputs("-", fp);
424 else if (e->back_num == 1)
425 fprintf(fp, "%d", e->back_ref1);
426 else {
427 for (i = 0; i < e->back_num; i++) {
428 if (i > 0) fprintf(fp, ", ");
429 fprintf(fp, "%d", e->back_refs[i]);
430 }
431 }
432 fputs("\n", fp);
433 return ST_CONTINUE;
434 }
435
436 extern int
437 onig_print_names(FILE* fp, regex_t* reg)
438 {
439 NameTable* t = (NameTable* )reg->name_table;
440
441 if (IS_NOT_NULL(t)) {
442 fprintf(fp, "name table\n");
443 onig_st_foreach(t, i_print_name_entry, (HashDataType )fp);
444 fputs("\n", fp);
445 }
446 return 0;
447 }
448 #endif
449
450 static int
451 i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED)
452 {
453 xfree(e->name);
454 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
455 xfree(key);
456 xfree(e);
457 return ST_DELETE;
458 }
459
460 static int
461 names_clear(regex_t* reg)
462 {
463 NameTable* t = (NameTable* )reg->name_table;
464
465 if (IS_NOT_NULL(t)) {
466 onig_st_foreach(t, i_free_name_entry, 0);
467 }
468 return 0;
469 }
470
471 extern int
472 onig_names_free(regex_t* reg)
473 {
474 int r;
475 NameTable* t;
476
477 r = names_clear(reg);
478 if (r) return r;
479
480 t = (NameTable* )reg->name_table;
481 if (IS_NOT_NULL(t)) onig_st_free_table(t);
482 reg->name_table = (void* )NULL;
483 return 0;
484 }
485
486 static NameEntry*
487 name_find(regex_t* reg, const UChar* name, const UChar* name_end)
488 {
489 NameEntry* e;
490 NameTable* t = (NameTable* )reg->name_table;
491
492 e = (NameEntry* )NULL;
493 if (IS_NOT_NULL(t)) {
494 onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
495 }
496 return e;
497 }
498
499 typedef struct {
500 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*);
501 regex_t* reg;
502 void* arg;
503 int ret;
504 OnigEncoding enc;
505 } INamesArg;
506
507 static int
508 i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg)
509 {
510 int r = (*(arg->func))(e->name,
511 e->name + e->name_len,
512 e->back_num,
513 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
514 arg->reg, arg->arg);
515 if (r != 0) {
516 arg->ret = r;
517 return ST_STOP;
518 }
519 return ST_CONTINUE;
520 }
521
522 extern int
523 onig_foreach_name(regex_t* reg,
524 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
525 {
526 INamesArg narg;
527 NameTable* t = (NameTable* )reg->name_table;
528
529 narg.ret = 0;
530 if (IS_NOT_NULL(t)) {
531 narg.func = func;
532 narg.reg = reg;
533 narg.arg = arg;
534 narg.enc = reg->enc;
535 onig_st_foreach(t, i_names, (HashDataType )&narg);
536 }
537 return narg.ret;
538 }
539
540 static int
541 i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumRemap* map)
542 {
543 int i;
544
545 if (e->back_num > 1) {
546 for (i = 0; i < e->back_num; i++) {
547 e->back_refs[i] = map[e->back_refs[i]].new_val;
548 }
549 }
550 else if (e->back_num == 1) {
551 e->back_ref1 = map[e->back_ref1].new_val;
552 }
553
554 return ST_CONTINUE;
555 }
556
557 extern int
558 onig_renumber_name_table(regex_t* reg, GroupNumRemap* map)
559 {
560 NameTable* t = (NameTable* )reg->name_table;
561
562 if (IS_NOT_NULL(t)) {
563 onig_st_foreach(t, i_renumber_name, (HashDataType )map);
564 }
565 return 0;
566 }
567
568
569 extern int
570 onig_number_of_names(regex_t* reg)
571 {
572 NameTable* t = (NameTable* )reg->name_table;
573
574 if (IS_NOT_NULL(t))
575 return t->num_entries;
576 else
577 return 0;
578 }
579
580 #else
581
582 #define INIT_NAMES_ALLOC_NUM 8
583
584 typedef struct {
585 NameEntry* e;
586 int num;
587 int alloc;
588 } NameTable;
589
590 #ifdef ONIG_DEBUG
591 extern int
592 onig_print_names(FILE* fp, regex_t* reg)
593 {
594 int i, j;
595 NameEntry* e;
596 NameTable* t = (NameTable* )reg->name_table;
597
598 if (IS_NOT_NULL(t) && t->num > 0) {
599 fprintf(fp, "name table\n");
600 for (i = 0; i < t->num; i++) {
601 e = &(t->e[i]);
602 fprintf(fp, "%s: ", e->name);
603 if (e->back_num == 0) {
604 fputs("-", fp);
605 }
606 else if (e->back_num == 1) {
607 fprintf(fp, "%d", e->back_ref1);
608 }
609 else {
610 for (j = 0; j < e->back_num; j++) {
611 if (j > 0) fprintf(fp, ", ");
612 fprintf(fp, "%d", e->back_refs[j]);
613 }
614 }
615 fputs("\n", fp);
616 }
617 fputs("\n", fp);
618 }
619 return 0;
620 }
621 #endif
622
623 static int
624 names_clear(regex_t* reg)
625 {
626 int i;
627 NameEntry* e;
628 NameTable* t = (NameTable* )reg->name_table;
629
630 if (IS_NOT_NULL(t)) {
631 for (i = 0; i < t->num; i++) {
632 e = &(t->e[i]);
633 if (IS_NOT_NULL(e->name)) {
634 xfree(e->name);
635 e->name = NULL;
636 e->name_len = 0;
637 e->back_num = 0;
638 e->back_alloc = 0;
639 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
640 e->back_refs = (int* )NULL;
641 }
642 }
643 if (IS_NOT_NULL(t->e)) {
644 xfree(t->e);
645 t->e = NULL;
646 }
647 t->num = 0;
648 }
649 return 0;
650 }
651
652 extern int
653 onig_names_free(regex_t* reg)
654 {
655 int r;
656 NameTable* t;
657
658 r = names_clear(reg);
659 if (r) return r;
660
661 t = (NameTable* )reg->name_table;
662 if (IS_NOT_NULL(t)) xfree(t);
663 reg->name_table = NULL;
664 return 0;
665 }
666
667 static NameEntry*
668 name_find(regex_t* reg, UChar* name, UChar* name_end)
669 {
670 int i, len;
671 NameEntry* e;
672 NameTable* t = (NameTable* )reg->name_table;
673
674 if (IS_NOT_NULL(t)) {
675 len = name_end - name;
676 for (i = 0; i < t->num; i++) {
677 e = &(t->e[i]);
678 if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
679 return e;
680 }
681 }
682 return (NameEntry* )NULL;
683 }
684
685 extern int
686 onig_foreach_name(regex_t* reg,
687 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
688 {
689 int i, r;
690 NameEntry* e;
691 NameTable* t = (NameTable* )reg->name_table;
692
693 if (IS_NOT_NULL(t)) {
694 for (i = 0; i < t->num; i++) {
695 e = &(t->e[i]);
696 r = (*func)(e->name, e->name + e->name_len, e->back_num,
697 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
698 reg, arg);
699 if (r != 0) return r;
700 }
701 }
702 return 0;
703 }
704
705 extern int
706 onig_number_of_names(regex_t* reg)
707 {
708 NameTable* t = (NameTable* )reg->name_table;
709
710 if (IS_NOT_NULL(t))
711 return t->num;
712 else
713 return 0;
714 }
715
716 #endif
717
718 static int
719 name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
720 {
721 int alloc;
722 NameEntry* e;
723 NameTable* t = (NameTable* )reg->name_table;
724
725 if (name_end - name <= 0)
726 return ONIGERR_EMPTY_GROUP_NAME;
727
728 e = name_find(reg, name, name_end);
729 if (IS_NULL(e)) {
730 #ifdef USE_ST_LIBRARY
731 if (IS_NULL(t)) {
732 t = onig_st_init_strend_table_with_size(5);
733 reg->name_table = (void* )t;
734 }
735 e = (NameEntry* )xmalloc(sizeof(NameEntry));
736 CHECK_NULL_RETURN_MEMERR(e);
737
738 e->name = strdup_with_null(reg->enc, name, name_end);
739 if (IS_NULL(e->name)) {
740 xfree(e); return ONIGERR_MEMORY;
741 }
742 onig_st_insert_strend(t, e->name, (e->name + (name_end - name)),
743 (HashDataType )e);
744
745 e->name_len = name_end - name;
746 e->back_num = 0;
747 e->back_alloc = 0;
748 e->back_refs = (int* )NULL;
749
750 #else
751
752 if (IS_NULL(t)) {
753 alloc = INIT_NAMES_ALLOC_NUM;
754 t = (NameTable* )xmalloc(sizeof(NameTable));
755 CHECK_NULL_RETURN_MEMERR(t);
756 t->e = NULL;
757 t->alloc = 0;
758 t->num = 0;
759
760 t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc);
761 if (IS_NULL(t->e)) {
762 xfree(t);
763 return ONIGERR_MEMORY;
764 }
765 t->alloc = alloc;
766 reg->name_table = t;
767 goto clear;
768 }
769 else if (t->num == t->alloc) {
770 int i;
771
772 alloc = t->alloc * 2;
773 t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
774 CHECK_NULL_RETURN_MEMERR(t->e);
775 t->alloc = alloc;
776
777 clear:
778 for (i = t->num; i < t->alloc; i++) {
779 t->e[i].name = NULL;
780 t->e[i].name_len = 0;
781 t->e[i].back_num = 0;
782 t->e[i].back_alloc = 0;
783 t->e[i].back_refs = (int* )NULL;
784 }
785 }
786 e = &(t->e[t->num]);
787 t->num++;
788 e->name = strdup_with_null(reg->enc, name, name_end);
789 if (IS_NULL(e->name)) return ONIGERR_MEMORY;
790 e->name_len = name_end - name;
791 #endif
792 }
793
794 if (e->back_num >= 1 &&
795 ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) {
796 onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
797 name, name_end);
798 return ONIGERR_MULTIPLEX_DEFINED_NAME;
799 }
800
801 e->back_num++;
802 if (e->back_num == 1) {
803 e->back_ref1 = backref;
804 }
805 else {
806 if (e->back_num == 2) {
807 alloc = INIT_NAME_BACKREFS_ALLOC_NUM;
808 e->back_refs = (int* )xmalloc(sizeof(int) * alloc);
809 CHECK_NULL_RETURN_MEMERR(e->back_refs);
810 e->back_alloc = alloc;
811 e->back_refs[0] = e->back_ref1;
812 e->back_refs[1] = backref;
813 }
814 else {
815 if (e->back_num > e->back_alloc) {
816 alloc = e->back_alloc * 2;
817 e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc);
818 CHECK_NULL_RETURN_MEMERR(e->back_refs);
819 e->back_alloc = alloc;
820 }
821 e->back_refs[e->back_num - 1] = backref;
822 }
823 }
824
825 return 0;
826 }
827
828 extern int
829 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
830 const UChar* name_end, int** nums)
831 {
832 NameEntry* e = name_find(reg, name, name_end);
833
834 if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE;
835
836 switch (e->back_num) {
837 case 0:
838 break;
839 case 1:
840 *nums = &(e->back_ref1);
841 break;
842 default:
843 *nums = e->back_refs;
844 break;
845 }
846 return e->back_num;
847 }
848
849 extern int
850 onig_name_to_backref_number(regex_t* reg, const UChar* name,
851 const UChar* name_end, OnigRegion *region)
852 {
853 int i, n, *nums;
854
855 n = onig_name_to_group_numbers(reg, name, name_end, &nums);
856 if (n < 0)
857 return n;
858 else if (n == 0)
859 return ONIGERR_PARSER_BUG;
860 else if (n == 1)
861 return nums[0];
862 else {
863 if (IS_NOT_NULL(region)) {
864 for (i = n - 1; i >= 0; i--) {
865 if (region->beg[nums[i]] != ONIG_REGION_NOTPOS)
866 return nums[i];
867 }
868 }
869 return nums[n - 1];
870 }
871 }
872
873 #else
874
875 extern int
876 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
877 const UChar* name_end, int** nums)
878 {
879 return ONIG_NO_SUPPORT_CONFIG;
880 }
881
882 extern int
883 onig_name_to_backref_number(regex_t* reg, const UChar* name,
884 const UChar* name_end, OnigRegion* region)
885 {
886 return ONIG_NO_SUPPORT_CONFIG;
887 }
888
889 extern int
890 onig_foreach_name(regex_t* reg,
891 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
892 {
893 return ONIG_NO_SUPPORT_CONFIG;
894 }
895
896 extern int
897 onig_number_of_names(regex_t* reg)
898 {
899 return 0;
900 }
901 #endif
902
903 extern int
904 onig_noname_group_capture_is_active(regex_t* reg)
905 {
906 if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP))
907 return 0;
908
909 #ifdef USE_NAMED_GROUP
910 if (onig_number_of_names(reg) > 0 &&
911 IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
912 !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
913 return 0;
914 }
915 #endif
916
917 return 1;
918 }
919
920
921 #define INIT_SCANENV_MEMNODES_ALLOC_SIZE 16
922
923 static void
924 scan_env_clear(ScanEnv* env)
925 {
926 int i;
927
928 BIT_STATUS_CLEAR(env->capture_history);
929 BIT_STATUS_CLEAR(env->bt_mem_start);
930 BIT_STATUS_CLEAR(env->bt_mem_end);
931 BIT_STATUS_CLEAR(env->backrefed_mem);
932 env->error = (UChar* )NULL;
933 env->error_end = (UChar* )NULL;
934 env->num_call = 0;
935 env->num_mem = 0;
936 #ifdef USE_NAMED_GROUP
937 env->num_named = 0;
938 #endif
939 env->mem_alloc = 0;
940 env->mem_nodes_dynamic = (Node** )NULL;
941
942 for (i = 0; i < SCANENV_MEMNODES_SIZE; i++)
943 env->mem_nodes_static[i] = NULL_NODE;
944
945 #ifdef USE_COMBINATION_EXPLOSION_CHECK
946 env->num_comb_exp_check = 0;
947 env->comb_exp_max_regnum = 0;
948 env->curr_max_regnum = 0;
949 env->has_recursion = 0;
950 #endif
951 }
952
953 static int
954 scan_env_add_mem_entry(ScanEnv* env)
955 {
956 int i, need, alloc;
957 Node** p;
958
959 need = env->num_mem + 1;
960 if (need >= SCANENV_MEMNODES_SIZE) {
961 if (env->mem_alloc <= need) {
962 if (IS_NULL(env->mem_nodes_dynamic)) {
963 alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE;
964 p = (Node** )xmalloc(sizeof(Node*) * alloc);
965 xmemcpy(p, env->mem_nodes_static,
966 sizeof(Node*) * SCANENV_MEMNODES_SIZE);
967 }
968 else {
969 alloc = env->mem_alloc * 2;
970 p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc);
971 }
972 CHECK_NULL_RETURN_MEMERR(p);
973
974 for (i = env->num_mem + 1; i < alloc; i++)
975 p[i] = NULL_NODE;
976
977 env->mem_nodes_dynamic = p;
978 env->mem_alloc = alloc;
979 }
980 }
981
982 env->num_mem++;
983 return env->num_mem;
984 }
985
986 static int
987 scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
988 {
989 if (env->num_mem >= num)
990 SCANENV_MEM_NODES(env)[num] = node;
991 else
992 return ONIGERR_PARSER_BUG;
993 return 0;
994 }
995
996
997 #ifdef USE_PARSE_TREE_NODE_RECYCLE
998 typedef struct _FreeNode {
999 struct _FreeNode* next;
1000 } FreeNode;
1001
1002 static FreeNode* FreeNodeList = (FreeNode* )NULL;
1003 #endif
1004
1005 extern void
1006 onig_node_free(Node* node)
1007 {
1008 start:
1009 if (IS_NULL(node)) return ;
1010
1011 switch (NTYPE(node)) {
1012 case NT_STR:
1013 if (NSTR(node)->capa != 0 &&
1014 IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1015 xfree(NSTR(node)->s);
1016 }
1017 break;
1018
1019 case NT_LIST:
1020 case NT_ALT:
1021 onig_node_free(NCAR(node));
1022 {
1023 Node* next_node = NCDR(node);
1024
1025 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1026 {
1027 FreeNode* n = (FreeNode* )node;
1028
1029 THREAD_ATOMIC_START;
1030 n->next = FreeNodeList;
1031 FreeNodeList = n;
1032 THREAD_ATOMIC_END;
1033 }
1034 #else
1035 xfree(node);
1036 #endif
1037 node = next_node;
1038 goto start;
1039 }
1040 break;
1041
1042 case NT_CCLASS:
1043 {
1044 CClassNode* cc = NCCLASS(node);
1045
1046 if (IS_NCCLASS_SHARE(cc)) return ;
1047 if (cc->mbuf)
1048 bbuf_free(cc->mbuf);
1049 }
1050 break;
1051
1052 case NT_QTFR:
1053 if (NQTFR(node)->target)
1054 onig_node_free(NQTFR(node)->target);
1055 break;
1056
1057 case NT_ENCLOSE:
1058 if (NENCLOSE(node)->target)
1059 onig_node_free(NENCLOSE(node)->target);
1060 break;
1061
1062 case NT_BREF:
1063 if (IS_NOT_NULL(NBREF(node)->back_dynamic))
1064 xfree(NBREF(node)->back_dynamic);
1065 break;
1066
1067 case NT_ANCHOR:
1068 if (NANCHOR(node)->target)
1069 onig_node_free(NANCHOR(node)->target);
1070 break;
1071 }
1072
1073 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1074 {
1075 FreeNode* n = (FreeNode* )node;
1076
1077 THREAD_ATOMIC_START;
1078 n->next = FreeNodeList;
1079 FreeNodeList = n;
1080 THREAD_ATOMIC_END;
1081 }
1082 #else
1083 xfree(node);
1084 #endif
1085 }
1086
1087 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1088 extern int
1089 onig_free_node_list(void)
1090 {
1091 FreeNode* n;
1092
1093
1094 while (IS_NOT_NULL(FreeNodeList)) {
1095 n = FreeNodeList;
1096 FreeNodeList = FreeNodeList->next;
1097 xfree(n);
1098 }
1099
1100 return 0;
1101 }
1102 #endif
1103
1104 static Node*
1105 node_new(void)
1106 {
1107 Node* node;
1108
1109 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1110 THREAD_ATOMIC_START;
1111 if (IS_NOT_NULL(FreeNodeList)) {
1112 node = (Node* )FreeNodeList;
1113 FreeNodeList = FreeNodeList->next;
1114 THREAD_ATOMIC_END;
1115 return node;
1116 }
1117 THREAD_ATOMIC_END;
1118 #endif
1119
1120 node = (Node* )xmalloc(sizeof(Node));
1121
1122 return node;
1123 }
1124
1125
1126 static void
1127 initialize_cclass(CClassNode* cc)
1128 {
1129 BITSET_CLEAR(cc->bs);
1130
1131 cc->flags = 0;
1132 cc->mbuf = NULL;
1133 }
1134
1135 static Node*
1136 node_new_cclass(void)
1137 {
1138 Node* node = node_new();
1139 CHECK_NULL_RETURN(node);
1140
1141 SET_NTYPE(node, NT_CCLASS);
1142 initialize_cclass(NCCLASS(node));
1143 return node;
1144 }
1145
1146 static Node*
1147 node_new_cclass_by_codepoint_range(int not, OnigCodePoint sb_out,
1148 const OnigCodePoint ranges[])
1149 {
1150 int n, i;
1151 CClassNode* cc;
1152 OnigCodePoint j;
1153
1154 Node* node = node_new_cclass();
1155 CHECK_NULL_RETURN(node);
1156
1157 cc = NCCLASS(node);
1158 if (not != 0) NCCLASS_SET_NOT(cc);
1159
1160 BITSET_CLEAR(cc->bs);
1161 if (sb_out > 0 && IS_NOT_NULL(ranges)) {
1162 n = ONIGENC_CODE_RANGE_NUM(ranges);
1163 for (i = 0; i < n; i++) {
1164 for (j = ONIGENC_CODE_RANGE_FROM(ranges, i);
1165 j <= (OnigCodePoint )ONIGENC_CODE_RANGE_TO(ranges, i); j++) {
1166 if (j >= sb_out) goto sb_end;
1167
1168 BITSET_SET_BIT(cc->bs, j);
1169 }
1170 }
1171 }
1172
1173 sb_end:
1174 if (IS_NULL(ranges)) {
1175 is_null:
1176 cc->mbuf = NULL;
1177 }
1178 else {
1179 BBuf* bbuf;
1180
1181 n = ONIGENC_CODE_RANGE_NUM(ranges);
1182 if (n == 0) goto is_null;
1183
1184 bbuf = (BBuf* )xmalloc(sizeof(BBuf));
1185 CHECK_NULL_RETURN(bbuf);
1186 bbuf->alloc = n + 1;
1187 bbuf->used = n + 1;
1188 bbuf->p = (UChar* )((void* )ranges);
1189
1190 cc->mbuf = bbuf;
1191 }
1192
1193 return node;
1194 }
1195
1196 static Node*
1197 node_new_ctype(int type, int not)
1198 {
1199 Node* node = node_new();
1200 CHECK_NULL_RETURN(node);
1201
1202 SET_NTYPE(node, NT_CTYPE);
1203 NCTYPE(node)->ctype = type;
1204 NCTYPE(node)->not = not;
1205 return node;
1206 }
1207
1208 static Node*
1209 node_new_anychar(void)
1210 {
1211 Node* node = node_new();
1212 CHECK_NULL_RETURN(node);
1213
1214 SET_NTYPE(node, NT_CANY);
1215 return node;
1216 }
1217
1218 static Node*
1219 node_new_list(Node* left, Node* right)
1220 {
1221 Node* node = node_new();
1222 CHECK_NULL_RETURN(node);
1223
1224 SET_NTYPE(node, NT_LIST);
1225 NCAR(node) = left;
1226 NCDR(node) = right;
1227 return node;
1228 }
1229
1230 extern Node*
1231 onig_node_new_list(Node* left, Node* right)
1232 {
1233 return node_new_list(left, right);
1234 }
1235
1236 extern Node*
1237 onig_node_list_add(Node* list, Node* x)
1238 {
1239 Node *n;
1240
1241 n = onig_node_new_list(x, NULL);
1242 if (IS_NULL(n)) return NULL_NODE;
1243
1244 if (IS_NOT_NULL(list)) {
1245 while (IS_NOT_NULL(NCDR(list)))
1246 list = NCDR(list);
1247
1248 NCDR(list) = n;
1249 }
1250
1251 return n;
1252 }
1253
1254 extern Node*
1255 onig_node_new_alt(Node* left, Node* right)
1256 {
1257 Node* node = node_new();
1258 CHECK_NULL_RETURN(node);
1259
1260 SET_NTYPE(node, NT_ALT);
1261 NCAR(node) = left;
1262 NCDR(node) = right;
1263 return node;
1264 }
1265
1266 extern Node*
1267 onig_node_new_anchor(int type)
1268 {
1269 Node* node = node_new();
1270 CHECK_NULL_RETURN(node);
1271
1272 SET_NTYPE(node, NT_ANCHOR);
1273 NANCHOR(node)->type = type;
1274 NANCHOR(node)->target = NULL;
1275 NANCHOR(node)->char_len = -1;
1276 return node;
1277 }
1278
1279 static Node*
1280 node_new_backref(int back_num, int* backrefs, int by_name,
1281 #ifdef USE_BACKREF_WITH_LEVEL
1282 int exist_level, int nest_level,
1283 #endif
1284 ScanEnv* env)
1285 {
1286 int i;
1287 Node* node = node_new();
1288
1289 CHECK_NULL_RETURN(node);
1290
1291 SET_NTYPE(node, NT_BREF);
1292 NBREF(node)->state = 0;
1293 NBREF(node)->back_num = back_num;
1294 NBREF(node)->back_dynamic = (int* )NULL;
1295 if (by_name != 0)
1296 NBREF(node)->state |= NST_NAME_REF;
1297
1298 #ifdef USE_BACKREF_WITH_LEVEL
1299 if (exist_level != 0) {
1300 NBREF(node)->state |= NST_NEST_LEVEL;
1301 NBREF(node)->nest_level = nest_level;
1302 }
1303 #endif
1304
1305 for (i = 0; i < back_num; i++) {
1306 if (backrefs[i] <= env->num_mem &&
1307 IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) {
1308 NBREF(node)->state |= NST_RECURSION;
1309 break;
1310 }
1311 }
1312
1313 if (back_num <= NODE_BACKREFS_SIZE) {
1314 for (i = 0; i < back_num; i++)
1315 NBREF(node)->back_static[i] = backrefs[i];
1316 }
1317 else {
1318 int* p = (int* )xmalloc(sizeof(int) * back_num);
1319 if (IS_NULL(p)) {
1320 onig_node_free(node);
1321 return NULL;
1322 }
1323 NBREF(node)->back_dynamic = p;
1324 for (i = 0; i < back_num; i++)
1325 p[i] = backrefs[i];
1326 }
1327 return node;
1328 }
1329
1330 #ifdef USE_SUBEXP_CALL
1331 static Node*
1332 node_new_call(UChar* name, UChar* name_end, int gnum)
1333 {
1334 Node* node = node_new();
1335 CHECK_NULL_RETURN(node);
1336
1337 SET_NTYPE(node, NT_CALL);
1338 NCALL(node)->state = 0;
1339 NCALL(node)->target = NULL_NODE;
1340 NCALL(node)->name = name;
1341 NCALL(node)->name_end = name_end;
1342 NCALL(node)->group_num = gnum;
1343 return node;
1344 }
1345 #endif
1346
1347 static Node*
1348 node_new_quantifier(int lower, int upper, int by_number)
1349 {
1350 Node* node = node_new();
1351 CHECK_NULL_RETURN(node);
1352
1353 SET_NTYPE(node, NT_QTFR);
1354 NQTFR(node)->state = 0;
1355 NQTFR(node)->target = NULL;
1356 NQTFR(node)->lower = lower;
1357 NQTFR(node)->upper = upper;
1358 NQTFR(node)->greedy = 1;
1359 NQTFR(node)->target_empty_info = NQ_TARGET_ISNOT_EMPTY;
1360 NQTFR(node)->head_exact = NULL_NODE;
1361 NQTFR(node)->next_head_exact = NULL_NODE;
1362 NQTFR(node)->is_refered = 0;
1363 if (by_number != 0)
1364 NQTFR(node)->state |= NST_BY_NUMBER;
1365
1366 #ifdef USE_COMBINATION_EXPLOSION_CHECK
1367 NQTFR(node)->comb_exp_check_num = 0;
1368 #endif
1369
1370 return node;
1371 }
1372
1373 static Node*
1374 node_new_enclose(int type)
1375 {
1376 Node* node = node_new();
1377 CHECK_NULL_RETURN(node);
1378
1379 SET_NTYPE(node, NT_ENCLOSE);
1380 NENCLOSE(node)->type = type;
1381 NENCLOSE(node)->state = 0;
1382 NENCLOSE(node)->regnum = 0;
1383 NENCLOSE(node)->option = 0;
1384 NENCLOSE(node)->target = NULL;
1385 NENCLOSE(node)->call_addr = -1;
1386 NENCLOSE(node)->opt_count = 0;
1387 return node;
1388 }
1389
1390 extern Node*
1391 onig_node_new_enclose(int type)
1392 {
1393 return node_new_enclose(type);
1394 }
1395
1396 static Node*
1397 node_new_enclose_memory(OnigOptionType option, int is_named)
1398 {
1399 Node* node = node_new_enclose(ENCLOSE_MEMORY);
1400 CHECK_NULL_RETURN(node);
1401 if (is_named != 0)
1402 SET_ENCLOSE_STATUS(node, NST_NAMED_GROUP);
1403
1404 #ifdef USE_SUBEXP_CALL
1405 NENCLOSE(node)->option = option;
1406 #endif
1407 return node;
1408 }
1409
1410 static Node*
1411 node_new_option(OnigOptionType option)
1412 {
1413 Node* node = node_new_enclose(ENCLOSE_OPTION);
1414 CHECK_NULL_RETURN(node);
1415 NENCLOSE(node)->option = option;
1416 return node;
1417 }
1418
1419 extern int
1420 onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
1421 {
1422 int addlen = end - s;
1423
1424 if (addlen > 0) {
1425 int len = NSTR(node)->end - NSTR(node)->s;
1426
1427 if (NSTR(node)->capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) {
1428 UChar* p;
1429 int capa = len + addlen + NODE_STR_MARGIN;
1430
1431 if (capa <= NSTR(node)->capa) {
1432 onig_strcpy(NSTR(node)->s + len, s, end);
1433 }
1434 else {
1435 if (NSTR(node)->s == NSTR(node)->buf)
1436 p = strcat_capa_from_static(NSTR(node)->s, NSTR(node)->end,
1437 s, end, capa);
1438 else
1439 p = strcat_capa(NSTR(node)->s, NSTR(node)->end, s, end, capa);
1440
1441 CHECK_NULL_RETURN_MEMERR(p);
1442 NSTR(node)->s = p;
1443 NSTR(node)->capa = capa;
1444 }
1445 }
1446 else {
1447 onig_strcpy(NSTR(node)->s + len, s, end);
1448 }
1449 NSTR(node)->end = NSTR(node)->s + len + addlen;
1450 }
1451
1452 return 0;
1453 }
1454
1455 extern int
1456 onig_node_str_set(Node* node, const UChar* s, const UChar* end)
1457 {
1458 onig_node_str_clear(node);
1459 return onig_node_str_cat(node, s, end);
1460 }
1461
1462 static int
1463 node_str_cat_char(Node* node, UChar c)
1464 {
1465 UChar s[1];
1466
1467 s[0] = c;
1468 return onig_node_str_cat(node, s, s + 1);
1469 }
1470
1471 extern void
1472 onig_node_conv_to_str_node(Node* node, int flag)
1473 {
1474 SET_NTYPE(node, NT_STR);
1475 NSTR(node)->flag = flag;
1476 NSTR(node)->capa = 0;
1477 NSTR(node)->s = NSTR(node)->buf;
1478 NSTR(node)->end = NSTR(node)->buf;
1479 }
1480
1481 extern void
1482 onig_node_str_clear(Node* node)
1483 {
1484 if (NSTR(node)->capa != 0 &&
1485 IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1486 xfree(NSTR(node)->s);
1487 }
1488
1489 NSTR(node)->capa = 0;
1490 NSTR(node)->flag = 0;
1491 NSTR(node)->s = NSTR(node)->buf;
1492 NSTR(node)->end = NSTR(node)->buf;
1493 }
1494
1495 static Node*
1496 node_new_str(const UChar* s, const UChar* end)
1497 {
1498 Node* node = node_new();
1499 CHECK_NULL_RETURN(node);
1500
1501 SET_NTYPE(node, NT_STR);
1502 NSTR(node)->capa = 0;
1503 NSTR(node)->flag = 0;
1504 NSTR(node)->s = NSTR(node)->buf;
1505 NSTR(node)->end = NSTR(node)->buf;
1506 if (onig_node_str_cat(node, s, end)) {
1507 onig_node_free(node);
1508 return NULL;
1509 }
1510 return node;
1511 }
1512
1513 extern Node*
1514 onig_node_new_str(const UChar* s, const UChar* end)
1515 {
1516 return node_new_str(s, end);
1517 }
1518
1519 static Node*
1520 node_new_str_raw(UChar* s, UChar* end)
1521 {
1522 Node* node = node_new_str(s, end);
1523 NSTRING_SET_RAW(node);
1524 return node;
1525 }
1526
1527 static Node*
1528 node_new_empty(void)
1529 {
1530 return node_new_str(NULL, NULL);
1531 }
1532
1533 static Node*
1534 node_new_str_raw_char(UChar c)
1535 {
1536 UChar p[1];
1537
1538 p[0] = c;
1539 return node_new_str_raw(p, p + 1);
1540 }
1541
1542 static Node*
1543 str_node_split_last_char(StrNode* sn, OnigEncoding enc)
1544 {
1545 const UChar *p;
1546 Node* n = NULL_NODE;
1547
1548 if (sn->end > sn->s) {
1549 p = onigenc_get_prev_char_head(enc, sn->s, sn->end);
1550 if (p && p > sn->s) {
1551 n = node_new_str(p, sn->end);
1552 if ((sn->flag & NSTR_RAW) != 0)
1553 NSTRING_SET_RAW(n);
1554 sn->end = (UChar* )p;
1555 }
1556 }
1557 return n;
1558 }
1559
1560 static int
1561 str_node_can_be_split(StrNode* sn, OnigEncoding enc)
1562 {
1563 if (sn->end > sn->s) {
1564 return ((enclen(enc, sn->s) < sn->end - sn->s) ? 1 : 0);
1565 }
1566 return 0;
1567 }
1568
1569 #ifdef USE_PAD_TO_SHORT_BYTE_CHAR
1570 static int
1571 node_str_head_pad(StrNode* sn, int num, UChar val)
1572 {
1573 UChar buf[NODE_STR_BUF_SIZE];
1574 int i, len;
1575
1576 len = sn->end - sn->s;
1577 onig_strcpy(buf, sn->s, sn->end);
1578 onig_strcpy(&(sn->s[num]), buf, buf + len);
1579 sn->end += num;
1580
1581 for (i = 0; i < num; i++) {
1582 sn->s[i] = val;
1583 }
1584 }
1585 #endif
1586
1587 extern int
1588 onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc)
1589 {
1590 unsigned int num, val;
1591 OnigCodePoint c;
1592 UChar* p = *src;
1593 PFETCH_READY;
1594
1595 num = 0;
1596 while (!PEND) {
1597 PFETCH(c);
1598 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
1599 val = (unsigned int )DIGITVAL(c);
1600 if ((INT_MAX_LIMIT - val) / 10UL < num)
1601 return -1;
1602
1603 num = num * 10 + val;
1604 }
1605 else {
1606 PUNFETCH;
1607 break;
1608 }
1609 }
1610 *src = p;
1611 return num;
1612 }
1613
1614 static int
1615 scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen,
1616 OnigEncoding enc)
1617 {
1618 OnigCodePoint c;
1619 unsigned int num, val;
1620 UChar* p = *src;
1621 PFETCH_READY;
1622
1623 num = 0;
1624 while (!PEND && maxlen-- != 0) {
1625 PFETCH(c);
1626 if (ONIGENC_IS_CODE_XDIGIT(enc, c)) {
1627 val = (unsigned int )XDIGITVAL(enc,c);
1628 if ((INT_MAX_LIMIT - val) / 16UL < num)
1629 return -1;
1630
1631 num = (num << 4) + XDIGITVAL(enc,c);
1632 }
1633 else {
1634 PUNFETCH;
1635 break;
1636 }
1637 }
1638 *src = p;
1639 return num;
1640 }
1641
1642 static int
1643 scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen,
1644 OnigEncoding enc)
1645 {
1646 OnigCodePoint c;
1647 unsigned int num, val;
1648 UChar* p = *src;
1649 PFETCH_READY;
1650
1651 num = 0;
1652 while (!PEND && maxlen-- != 0) {
1653 PFETCH(c);
1654 if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') {
1655 val = ODIGITVAL(c);
1656 if ((INT_MAX_LIMIT - val) / 8UL < num)
1657 return -1;
1658
1659 num = (num << 3) + val;
1660 }
1661 else {
1662 PUNFETCH;
1663 break;
1664 }
1665 }
1666 *src = p;
1667 return num;
1668 }
1669
1670
/* Write one code point (SIZE_CODE_POINT bytes, taken from the lvalue
   `code`) into `bbuf` at byte offset `pos`. */
#define BBUF_WRITE_CODE_POINT(bbuf,pos,code) BBUF_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT)
1673
1674
1675
1676
1677
1678 static int
1679 new_code_range(BBuf** pbuf)
1680 {
1681 #define INIT_MULTI_BYTE_RANGE_SIZE (SIZE_CODE_POINT * 5)
1682 int r;
1683 OnigCodePoint n;
1684 BBuf* bbuf;
1685
1686 bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
1687 CHECK_NULL_RETURN_MEMERR(*pbuf);
1688 r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE);
1689 if (r) return r;
1690
1691 n = 0;
1692 BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1693 return 0;
1694 }
1695
1696 static int
1697 add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to)
1698 {
1699 int r, inc_n, pos;
1700 int low, high, bound, x;
1701 OnigCodePoint n, *data;
1702 BBuf* bbuf;
1703
1704 if (from > to) {
1705 n = from; from = to; to = n;
1706 }
1707
1708 if (IS_NULL(*pbuf)) {
1709 r = new_code_range(pbuf);
1710 if (r) return r;
1711 bbuf = *pbuf;
1712 n = 0;
1713 }
1714 else {
1715 bbuf = *pbuf;
1716 GET_CODE_POINT(n, bbuf->p);
1717 }
1718 data = (OnigCodePoint* )(bbuf->p);
1719 data++;
1720
1721 for (low = 0, bound = n; low < bound; ) {
1722 x = (low + bound) >> 1;
1723 if (from > data[x*2 + 1])
1724 low = x + 1;
1725 else
1726 bound = x;
1727 }
1728
1729 for (high = low, bound = n; high < bound; ) {
1730 x = (high + bound) >> 1;
1731 if (to >= data[x*2] - 1)
1732 high = x + 1;
1733 else
1734 bound = x;
1735 }
1736
1737 inc_n = low + 1 - high;
1738 if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
1739 return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;
1740
1741 if (inc_n != 1) {
1742 if (from > data[low*2])
1743 from = data[low*2];
1744 if (to < data[(high - 1)*2 + 1])
1745 to = data[(high - 1)*2 + 1];
1746 }
1747
1748 if (inc_n != 0 && (OnigCodePoint )high < n) {
1749 int from_pos = SIZE_CODE_POINT * (1 + high * 2);
1750 int to_pos = SIZE_CODE_POINT * (1 + (low + 1) * 2);
1751 int size = (n - high) * 2 * SIZE_CODE_POINT;
1752
1753 if (inc_n > 0) {
1754 BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
1755 }
1756 else {
1757 BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
1758 }
1759 }
1760
1761 pos = SIZE_CODE_POINT * (1 + low * 2);
1762 BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
1763 BBUF_WRITE_CODE_POINT(bbuf, pos, from);
1764 BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
1765 n += inc_n;
1766 BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1767
1768 return 0;
1769 }
1770
1771 static int
1772 add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
1773 {
1774 if (from > to) {
1775 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
1776 return 0;
1777 else
1778 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
1779 }
1780
1781 return add_code_range_to_buf(pbuf, from, to);
1782 }
1783
1784 static int
1785 not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf)
1786 {
1787 int r, i, n;
1788 OnigCodePoint pre, from, *data, to = 0;
1789
1790 *pbuf = (BBuf* )NULL;
1791 if (IS_NULL(bbuf)) {
1792 set_all:
1793 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1794 }
1795
1796 data = (OnigCodePoint* )(bbuf->p);
1797 GET_CODE_POINT(n, data);
1798 data++;
1799 if (n <= 0) goto set_all;
1800
1801 r = 0;
1802 pre = MBCODE_START_POS(enc);
1803 for (i = 0; i < n; i++) {
1804 from = data[i*2];
1805 to = data[i*2+1];
1806 if (pre <= from - 1) {
1807 r = add_code_range_to_buf(pbuf, pre, from - 1);
1808 if (r != 0) return r;
1809 }
1810 if (to == ~((OnigCodePoint )0)) break;
1811 pre = to + 1;
1812 }
1813 if (to < ~((OnigCodePoint )0)) {
1814 r = add_code_range_to_buf(pbuf, to + 1, ~((OnigCodePoint )0));
1815 }
1816 return r;
1817 }
1818
/* Exchange two (range buffer, negation flag) operand pairs in place. */
#define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\
  int   tnot; \
  BBuf *tbuf; \
  tbuf = bbuf1; bbuf1 = bbuf2; bbuf2 = tbuf; \
  tnot = not1;  not1  = not2;  not2  = tnot; \
} while (0)
1825
1826 static int
1827 or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
1828 BBuf* bbuf2, int not2, BBuf** pbuf)
1829 {
1830 int r;
1831 OnigCodePoint i, n1, *data1;
1832 OnigCodePoint from, to;
1833
1834 *pbuf = (BBuf* )NULL;
1835 if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
1836 if (not1 != 0 || not2 != 0)
1837 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1838 return 0;
1839 }
1840
1841 r = 0;
1842 if (IS_NULL(bbuf2))
1843 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1844
1845 if (IS_NULL(bbuf1)) {
1846 if (not1 != 0) {
1847 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1848 }
1849 else {
1850 if (not2 == 0) {
1851 return bbuf_clone(pbuf, bbuf2);
1852 }
1853 else {
1854 return not_code_range_buf(enc, bbuf2, pbuf);
1855 }
1856 }
1857 }
1858
1859 if (not1 != 0)
1860 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1861
1862 data1 = (OnigCodePoint* )(bbuf1->p);
1863 GET_CODE_POINT(n1, data1);
1864 data1++;
1865
1866 if (not2 == 0 && not1 == 0) {
1867 r = bbuf_clone(pbuf, bbuf2);
1868 }
1869 else if (not1 == 0) {
1870 r = not_code_range_buf(enc, bbuf2, pbuf);
1871 }
1872 if (r != 0) return r;
1873
1874 for (i = 0; i < n1; i++) {
1875 from = data1[i*2];
1876 to = data1[i*2+1];
1877 r = add_code_range_to_buf(pbuf, from, to);
1878 if (r != 0) return r;
1879 }
1880 return 0;
1881 }
1882
1883 static int
1884 and_code_range1(BBuf** pbuf, OnigCodePoint from1, OnigCodePoint to1,
1885 OnigCodePoint* data, int n)
1886 {
1887 int i, r;
1888 OnigCodePoint from2, to2;
1889
1890 for (i = 0; i < n; i++) {
1891 from2 = data[i*2];
1892 to2 = data[i*2+1];
1893 if (from2 < from1) {
1894 if (to2 < from1) continue;
1895 else {
1896 from1 = to2 + 1;
1897 }
1898 }
1899 else if (from2 <= to1) {
1900 if (to2 < to1) {
1901 if (from1 <= from2 - 1) {
1902 r = add_code_range_to_buf(pbuf, from1, from2-1);
1903 if (r != 0) return r;
1904 }
1905 from1 = to2 + 1;
1906 }
1907 else {
1908 to1 = from2 - 1;
1909 }
1910 }
1911 else {
1912 from1 = from2;
1913 }
1914 if (from1 > to1) break;
1915 }
1916 if (from1 <= to1) {
1917 r = add_code_range_to_buf(pbuf, from1, to1);
1918 if (r != 0) return r;
1919 }
1920 return 0;
1921 }
1922
1923 static int
1924 and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf)
1925 {
1926 int r;
1927 OnigCodePoint i, j, n1, n2, *data1, *data2;
1928 OnigCodePoint from, to, from1, to1, from2, to2;
1929
1930 *pbuf = (BBuf* )NULL;
1931 if (IS_NULL(bbuf1)) {
1932 if (not1 != 0 && IS_NOT_NULL(bbuf2))
1933 return bbuf_clone(pbuf, bbuf2);
1934 return 0;
1935 }
1936 else if (IS_NULL(bbuf2)) {
1937 if (not2 != 0)
1938 return bbuf_clone(pbuf, bbuf1);
1939 return 0;
1940 }
1941
1942 if (not1 != 0)
1943 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1944
1945 data1 = (OnigCodePoint* )(bbuf1->p);
1946 data2 = (OnigCodePoint* )(bbuf2->p);
1947 GET_CODE_POINT(n1, data1);
1948 GET_CODE_POINT(n2, data2);
1949 data1++;
1950 data2++;
1951
1952 if (not2 == 0 && not1 == 0) {
1953 for (i = 0; i < n1; i++) {
1954 from1 = data1[i*2];
1955 to1 = data1[i*2+1];
1956 for (j = 0; j < n2; j++) {
1957 from2 = data2[j*2];
1958 to2 = data2[j*2+1];
1959 if (from2 > to1) break;
1960 if (to2 < from1) continue;
1961 from = MAX(from1, from2);
1962 to = MIN(to1, to2);
1963 r = add_code_range_to_buf(pbuf, from, to);
1964 if (r != 0) return r;
1965 }
1966 }
1967 }
1968 else if (not1 == 0) {
1969 for (i = 0; i < n1; i++) {
1970 from1 = data1[i*2];
1971 to1 = data1[i*2+1];
1972 r = and_code_range1(pbuf, from1, to1, data2, n2);
1973 if (r != 0) return r;
1974 }
1975 }
1976
1977 return 0;
1978 }
1979
1980 static int
1981 and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
1982 {
1983 int r, not1, not2;
1984 BBuf *buf1, *buf2, *pbuf;
1985 BitSetRef bsr1, bsr2;
1986 BitSet bs1, bs2;
1987
1988 not1 = IS_NCCLASS_NOT(dest);
1989 bsr1 = dest->bs;
1990 buf1 = dest->mbuf;
1991 not2 = IS_NCCLASS_NOT(cc);
1992 bsr2 = cc->bs;
1993 buf2 = cc->mbuf;
1994
1995 if (not1 != 0) {
1996 bitset_invert_to(bsr1, bs1);
1997 bsr1 = bs1;
1998 }
1999 if (not2 != 0) {
2000 bitset_invert_to(bsr2, bs2);
2001 bsr2 = bs2;
2002 }
2003 bitset_and(bsr1, bsr2);
2004 if (bsr1 != dest->bs) {
2005 bitset_copy(dest->bs, bsr1);
2006 bsr1 = dest->bs;
2007 }
2008 if (not1 != 0) {
2009 bitset_invert(dest->bs);
2010 }
2011
2012 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2013 if (not1 != 0 && not2 != 0) {
2014 r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf);
2015 }
2016 else {
2017 r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf);
2018 if (r == 0 && not1 != 0) {
2019 BBuf *tbuf;
2020 r = not_code_range_buf(enc, pbuf, &tbuf);
2021 if (r != 0) {
2022 bbuf_free(pbuf);
2023 return r;
2024 }
2025 bbuf_free(pbuf);
2026 pbuf = tbuf;
2027 }
2028 }
2029 if (r != 0) return r;
2030
2031 dest->mbuf = pbuf;
2032 bbuf_free(buf1);
2033 return r;
2034 }
2035 return 0;
2036 }
2037
2038 static int
2039 or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
2040 {
2041 int r, not1, not2;
2042 BBuf *buf1, *buf2, *pbuf;
2043 BitSetRef bsr1, bsr2;
2044 BitSet bs1, bs2;
2045
2046 not1 = IS_NCCLASS_NOT(dest);
2047 bsr1 = dest->bs;
2048 buf1 = dest->mbuf;
2049 not2 = IS_NCCLASS_NOT(cc);
2050 bsr2 = cc->bs;
2051 buf2 = cc->mbuf;
2052
2053 if (not1 != 0) {
2054 bitset_invert_to(bsr1, bs1);
2055 bsr1 = bs1;
2056 }
2057 if (not2 != 0) {
2058 bitset_invert_to(bsr2, bs2);
2059 bsr2 = bs2;
2060 }
2061 bitset_or(bsr1, bsr2);
2062 if (bsr1 != dest->bs) {
2063 bitset_copy(dest->bs, bsr1);
2064 bsr1 = dest->bs;
2065 }
2066 if (not1 != 0) {
2067 bitset_invert(dest->bs);
2068 }
2069
2070 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2071 if (not1 != 0 && not2 != 0) {
2072 r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf);
2073 }
2074 else {
2075 r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf);
2076 if (r == 0 && not1 != 0) {
2077 BBuf *tbuf;
2078 r = not_code_range_buf(enc, pbuf, &tbuf);
2079 if (r != 0) {
2080 bbuf_free(pbuf);
2081 return r;
2082 }
2083 bbuf_free(pbuf);
2084 pbuf = tbuf;
2085 }
2086 }
2087 if (r != 0) return r;
2088
2089 dest->mbuf = pbuf;
2090 bbuf_free(buf1);
2091 return r;
2092 }
2093 else
2094 return 0;
2095 }
2096
2097 static int
2098 conv_backslash_value(int c, ScanEnv* env)
2099 {
2100 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
2101 switch (c) {
2102 case 'n': return '\n';
2103 case 't': return '\t';
2104 case 'r': return '\r';
2105 case 'f': return '\f';
2106 case 'a': return '\007';
2107 case 'b': return '\010';
2108 case 'e': return '\033';
2109 case 'v':
2110 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
2111 return '\v';
2112 break;
2113
2114 default:
2115 break;
2116 }
2117 }
2118 return c;
2119 }
2120
2121 static int
2122 is_invalid_quantifier_target(Node* node)
2123 {
2124 switch (NTYPE(node)) {
2125 case NT_ANCHOR:
2126 return 1;
2127 break;
2128
2129 case NT_ENCLOSE:
2130
2131
2132 break;
2133
2134 case NT_LIST:
2135 do {
2136 if (! is_invalid_quantifier_target(NCAR(node))) return 0;
2137 } while (IS_NOT_NULL(node = NCDR(node)));
2138 return 0;
2139 break;
2140
2141 case NT_ALT:
2142 do {
2143 if (is_invalid_quantifier_target(NCAR(node))) return 1;
2144 } while (IS_NOT_NULL(node = NCDR(node)));
2145 break;
2146
2147 default:
2148 break;
2149 }
2150 return 0;
2151 }
2152
2153
2154 static int
2155 popular_quantifier_num(QtfrNode* q)
2156 {
2157 if (q->greedy) {
2158 if (q->lower == 0) {
2159 if (q->upper == 1) return 0;
2160 else if (IS_REPEAT_INFINITE(q->upper)) return 1;
2161 }
2162 else if (q->lower == 1) {
2163 if (IS_REPEAT_INFINITE(q->upper)) return 2;
2164 }
2165 }
2166 else {
2167 if (q->lower == 0) {
2168 if (q->upper == 1) return 3;
2169 else if (IS_REPEAT_INFINITE(q->upper)) return 4;
2170 }
2171 else if (q->lower == 1) {
2172 if (IS_REPEAT_INFINITE(q->upper)) return 5;
2173 }
2174 }
2175 return -1;
2176 }
2177
2178
/*
 * How onig_reduce_nested_quantifier() rewrites a quantifier nested
 * directly inside another one (parent applied to child).
 */
enum ReduceType {
  RQ_ASIS = 0,  /* keep both; the child simply becomes the parent's target */
  RQ_DEL  = 1,  /* the parent is overwritten by the child                  */
  RQ_A    = 2,  /* collapse into a greedy {0,inf}                          */
  RQ_AQ   = 3,  /* collapse into a non-greedy {0,inf}                      */
  RQ_QQ   = 4,  /* collapse into a non-greedy {0,1}                        */
  RQ_P_QQ = 5,  /* child: greedy {1,inf};     parent: non-greedy {0,1}     */
  RQ_PQ_Q = 6   /* child: non-greedy {1,inf}; parent: greedy {0,1}         */
};
2188
2189 static enum ReduceType ReduceTypeTable[6][6] = {
2190 {RQ_DEL, RQ_A, RQ_A, RQ_QQ, RQ_AQ, RQ_ASIS},
2191 {RQ_DEL, RQ_DEL, RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL},
2192 {RQ_A, RQ_A, RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL},
2193 {RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL, RQ_AQ, RQ_AQ},
2194 {RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL},
2195 {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL}
2196 };
2197
2198 extern void
2199 onig_reduce_nested_quantifier(Node* pnode, Node* cnode)
2200 {
2201 int pnum, cnum;
2202 QtfrNode *p, *c;
2203
2204 p = NQTFR(pnode);
2205 c = NQTFR(cnode);
2206 pnum = popular_quantifier_num(p);
2207 cnum = popular_quantifier_num(c);
2208 if (pnum < 0 || cnum < 0) return ;
2209
2210 switch(ReduceTypeTable[cnum][pnum]) {
2211 case RQ_DEL:
2212 *pnode = *cnode;
2213 break;
2214 case RQ_A:
2215 p->target = c->target;
2216 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1;
2217 break;
2218 case RQ_AQ:
2219 p->target = c->target;
2220 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0;
2221 break;
2222 case RQ_QQ:
2223 p->target = c->target;
2224 p->lower = 0; p->upper = 1; p->greedy = 0;
2225 break;
2226 case RQ_P_QQ:
2227 p->target = cnode;
2228 p->lower = 0; p->upper = 1; p->greedy = 0;
2229 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1;
2230 return ;
2231 break;
2232 case RQ_PQ_Q:
2233 p->target = cnode;
2234 p->lower = 0; p->upper = 1; p->greedy = 1;
2235 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0;
2236 return ;
2237 break;
2238 case RQ_ASIS:
2239 p->target = cnode;
2240 return ;
2241 break;
2242 }
2243
2244 c->target = NULL_NODE;
2245 onig_node_free(cnode);
2246 }
2247
2248
/* Token kinds stored in OnigToken.type. */
enum TokenSyms {
  TK_EOT                = 0,
  TK_RAW_BYTE           = 1,
  TK_CHAR               = 2,
  TK_STRING             = 3,
  TK_CODE_POINT         = 4,
  TK_ANYCHAR            = 5,
  TK_CHAR_TYPE          = 6,
  TK_BACKREF            = 7,
  TK_CALL               = 8,
  TK_ANCHOR             = 9,
  TK_OP_REPEAT          = 10,
  TK_INTERVAL           = 11,
  TK_ANYCHAR_ANYTIME    = 12,
  TK_ALT                = 13,
  TK_SUBEXP_OPEN        = 14,
  TK_SUBEXP_CLOSE       = 15,
  TK_CC_OPEN            = 16,
  TK_QUOTE_OPEN         = 17,
  TK_CHAR_PROPERTY      = 18,

  /* the TK_CC_* / POSIX-bracket group */
  TK_CC_CLOSE           = 19,
  TK_CC_RANGE           = 20,
  TK_POSIX_BRACKET_OPEN = 21,
  TK_CC_AND             = 22,
  TK_CC_CC_OPEN         = 23
};
2276
2277 typedef struct {
2278 enum TokenSyms type;
2279 int escaped;
2280 int base;
2281 UChar* backp;
2282 union {
2283 UChar* s;
2284 int c;
2285 OnigCodePoint code;
2286 int anchor;
2287 int subtype;
2288 struct {
2289 int lower;
2290 int upper;
2291 int greedy;
2292 int possessive;
2293 } repeat;
2294 struct {
2295 int num;
2296 int ref1;
2297 int* refs;
2298 int by_name;
2299 #ifdef USE_BACKREF_WITH_LEVEL
2300 int exist_level;
2301 int level;
2302 #endif
2303 } backref;
2304 struct {
2305 UChar* name;
2306 UChar* name_end;
2307 int gnum;
2308 } call;
2309 struct {
2310 int ctype;
2311 int not;
2312 } prop;
2313 } u;
2314 } OnigToken;
2315
2316
2317 static int
2318 fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
2319 {
2320 int low, up, syn_allow, non_low = 0;
2321 int r = 0;
2322 OnigCodePoint c;
2323 OnigEncoding enc = env->enc;
2324 UChar* p = *src;
2325 PFETCH_READY;
2326
2327 syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);
2328
2329 if (PEND) {
2330 if (syn_allow)
2331 return 1;
2332 else
2333 return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
2334 }
2335
2336 if (! syn_allow) {
2337 c = PPEEK;
2338 if (c == ')' || c == '(' || c == '|') {
2339 return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
2340 }
2341 }
2342
2343 low = onig_scan_unsigned_number(&p, end, env->enc);
2344 if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2345 if (low > ONIG_MAX_REPEAT_NUM)
2346 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2347
2348 if (p == *src) {
2349 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
2350
2351 low = 0;
2352 non_low = 1;
2353 }
2354 else
2355 goto invalid;
2356 }
2357
2358 if (PEND) goto invalid;
2359 PFETCH(c);
2360 if (c == ',') {
2361 UChar* prev = p;
2362 up = onig_scan_unsigned_number(&p, end, env->enc);
2363 if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2364 if (up > ONIG_MAX_REPEAT_NUM)
2365 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2366
2367 if (p == prev) {
2368 if (non_low != 0)
2369 goto invalid;
2370 up = REPEAT_INFINITE;
2371 }
2372 }
2373 else {
2374 if (non_low != 0)
2375 goto invalid;
2376
2377 PUNFETCH;
2378 up = low;
2379 r = 2;
2380 }
2381
2382 if (PEND) goto invalid;
2383 PFETCH(c);
2384 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
2385 if (c != MC_ESC(env->syntax)) goto invalid;
2386 PFETCH(c);
2387 }
2388 if (c != '}') goto invalid;
2389
2390 if (!IS_REPEAT_INFINITE(up) && low > up) {
2391 return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;
2392 }
2393
2394 tok->type = TK_INTERVAL;
2395 tok->u.repeat.lower = low;
2396 tok->u.repeat.upper = up;
2397 *src = p;
2398 return r;
2399
2400 invalid:
2401 if (syn_allow)
2402 return 1;
2403 else
2404 return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
2405 }
2406
2407
2408 static int
2409 fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
2410 {
2411 int v;
2412 OnigCodePoint c;
2413 OnigEncoding enc = env->enc;
2414 UChar* p = *src;
2415
2416 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2417
2418 PFETCH_S(c);
2419 switch (c) {
2420 case 'M':
2421 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
2422 if (PEND) return ONIGERR_END_PATTERN_AT_META;
2423 PFETCH_S(c);
2424 if (c != '-') return ONIGERR_META_CODE_SYNTAX;
2425 if (PEND) return ONIGERR_END_PATTERN_AT_META;
2426 PFETCH_S(c);
2427 if (c == MC_ESC(env->syntax)) {
2428 v = fetch_escaped_value(&p, end, env);
2429 if (v < 0) return v;
2430 c = (OnigCodePoint )v;
2431 }
2432 c = ((c & 0xff) | 0x80);
2433 }
2434 else
2435 goto backslash;
2436 break;
2437
2438 case 'C':
2439 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
2440 if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2441 PFETCH_S(c);
2442 if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
2443 goto control;
2444 }
2445 else
2446 goto backslash;
2447
2448 case 'c':
2449 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
2450 control:
2451 if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2452 PFETCH_S(c);
2453 if (c == '?') {
2454 c = 0177;
2455 }
2456 else {
2457 if (c == MC_ESC(env->syntax)) {
2458 v = fetch_escaped_value(&p, end, env);
2459 if (v < 0) return v;
2460 c = (OnigCodePoint )v;
2461 }
2462 c &= 0x9f;
2463 }
2464 break;
2465 }
2466
2467
2468 default:
2469 {
2470 backslash:
2471 c = conv_backslash_value(c, env);
2472 }
2473 break;
2474 }
2475
2476 *src = p;
2477 return c;
2478 }
2479
2480 static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env);
2481
2482 static OnigCodePoint
2483 get_name_end_code_point(OnigCodePoint start)
2484 {
2485 switch (start) {
2486 case '<': return (OnigCodePoint )'>'; break;
2487 case '\'': return (OnigCodePoint )'\''; break;
2488 default:
2489 break;
2490 }
2491
2492 return (OnigCodePoint )0;
2493 }
2494
2495 #ifdef USE_NAMED_GROUP
2496 #ifdef USE_BACKREF_WITH_LEVEL
2497
2498
2499
2500
2501
2502 static int
2503 fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
2504 UChar** rname_end, ScanEnv* env,
2505 int* rback_num, int* rlevel)
2506 {
2507 int r, sign, is_num, exist_level;
2508 OnigCodePoint end_code;
2509 OnigCodePoint c = 0;
2510 OnigEncoding enc = env->enc;
2511 UChar *name_end;
2512 UChar *pnum_head;
2513 UChar *p = *src;
2514 PFETCH_READY;
2515
2516 *rback_num = 0;
2517 is_num = exist_level = 0;
2518 sign = 1;
2519 pnum_head = *src;
2520
2521 end_code = get_name_end_code_point(start_code);
2522
2523 name_end = end;
2524 r = 0;
2525 if (PEND) {
2526 return ONIGERR_EMPTY_GROUP_NAME;
2527 }
2528 else {
2529 PFETCH(c);
2530 if (c == end_code)
2531 return ONIGERR_EMPTY_GROUP_NAME;
2532
2533 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2534 is_num = 1;
2535 }
2536 else if (c == '-') {
2537 is_num = 2;
2538 sign = -1;
2539 pnum_head = p;
2540 }
2541 else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2542 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2543 }
2544 }
2545
2546 while (!PEND) {
2547 name_end = p;
2548 PFETCH(c);
2549 if (c == end_code || c == ')' || c == '+' || c == '-') {
2550 if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME;
2551 break;
2552 }
2553
2554 if (is_num != 0) {
2555 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2556 is_num = 1;
2557 }
2558 else {
2559 r = ONIGERR_INVALID_GROUP_NAME;
2560 is_num = 0;
2561 }
2562 }
2563 else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2564 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2565 }
2566 }
2567
2568 if (r == 0 && c != end_code) {
2569 if (c == '+' || c == '-') {
2570 int level;
2571 int flag = (c == '-' ? -1 : 1);
2572
2573 PFETCH(c);
2574 if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err;
2575 PUNFETCH;
2576 level = onig_scan_unsigned_number(&p, end, enc);
2577 if (level < 0) return ONIGERR_TOO_BIG_NUMBER;
2578 *rlevel = (level * flag);
2579 exist_level = 1;
2580
2581 PFETCH(c);
2582 if (c == end_code)
2583 goto end;
2584 }
2585
2586 err:
2587 r = ONIGERR_INVALID_GROUP_NAME;
2588 name_end = end;
2589 }
2590
2591 end:
2592 if (r == 0) {
2593 if (is_num != 0) {
2594 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2595 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2596 else if (*rback_num == 0) goto err;
2597
2598 *rback_num *= sign;
2599 }
2600
2601 *rname_end = name_end;
2602 *src = p;
2603 return (exist_level ? 1 : 0);
2604 }
2605 else {
2606 onig_scan_env_set_error_string(env, r, *src, name_end);
2607 return r;
2608 }
2609 }
2610 #endif
2611
2612
2613
2614
2615
2616 static int
2617 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2618 UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2619 {
2620 int r, is_num, sign;
2621 OnigCodePoint end_code;
2622 OnigCodePoint c = 0;
2623 OnigEncoding enc = env->enc;
2624 UChar *name_end;
2625 UChar *pnum_head;
2626 UChar *p = *src;
2627
2628 *rback_num = 0;
2629
2630 end_code = get_name_end_code_point(start_code);
2631
2632 name_end = end;
2633 pnum_head = *src;
2634 r = 0;
2635 is_num = 0;
2636 sign = 1;
2637 if (PEND) {
2638 return ONIGERR_EMPTY_GROUP_NAME;
2639 }
2640 else {
2641 PFETCH_S(c);
2642 if (c == end_code)
2643 return ONIGERR_EMPTY_GROUP_NAME;
2644
2645 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2646 if (ref == 1)
2647 is_num = 1;
2648 else {
2649 r = ONIGERR_INVALID_GROUP_NAME;
2650 is_num = 0;
2651 }
2652 }
2653 else if (c == '-') {
2654 if (ref == 1) {
2655 is_num = 2;
2656 sign = -1;
2657 pnum_head = p;
2658 }
2659 else {
2660 r = ONIGERR_INVALID_GROUP_NAME;
2661 is_num = 0;
2662 }
2663 }
2664 else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2665 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2666 }
2667 }
2668
2669 if (r == 0) {
2670 while (!PEND) {
2671 name_end = p;
2672 PFETCH_S(c);
2673 if (c == end_code || c == ')') {
2674 if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME;
2675 break;
2676 }
2677
2678 if (is_num != 0) {
2679 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2680 is_num = 1;
2681 }
2682 else {
2683 if (!ONIGENC_IS_CODE_WORD(enc, c))
2684 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2685 else
2686 r = ONIGERR_INVALID_GROUP_NAME;
2687 is_num = 0;
2688 }
2689 }
2690 else {
2691 if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2692 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2693 }
2694 }
2695 }
2696
2697 if (c != end_code) {
2698 r = ONIGERR_INVALID_GROUP_NAME;
2699 name_end = end;
2700 }
2701
2702 if (is_num != 0) {
2703 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2704 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2705 else if (*rback_num == 0) {
2706 r = ONIGERR_INVALID_GROUP_NAME;
2707 goto err;
2708 }
2709
2710 *rback_num *= sign;
2711 }
2712
2713 *rname_end = name_end;
2714 *src = p;
2715 return 0;
2716 }
2717 else {
2718 while (!PEND) {
2719 name_end = p;
2720 PFETCH_S(c);
2721 if (c == end_code || c == ')')
2722 break;
2723 }
2724 if (PEND)
2725 name_end = end;
2726
2727 err:
2728 onig_scan_env_set_error_string(env, r, *src, name_end);
2729 return r;
2730 }
2731 }
2732 #else
2733 static int
2734 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2735 UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2736 {
2737 int r, is_num, sign;
2738 OnigCodePoint end_code;
2739 OnigCodePoint c = 0;
2740 UChar *name_end;
2741 OnigEncoding enc = env->enc;
2742 UChar *pnum_head;
2743 UChar *p = *src;
2744 PFETCH_READY;
2745
2746 *rback_num = 0;
2747
2748 end_code = get_name_end_code_point(start_code);
2749
2750 *rname_end = name_end = end;
2751 r = 0;
2752 pnum_head = *src;
2753 is_num = 0;
2754 sign = 1;
2755
2756 if (PEND) {
2757 return ONIGERR_EMPTY_GROUP_NAME;
2758 }
2759 else {
2760 PFETCH(c);
2761 if (c == end_code)
2762 return ONIGERR_EMPTY_GROUP_NAME;
2763
2764 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2765 is_num = 1;
2766 }
2767 else if (c == '-') {
2768 is_num = 2;
2769 sign = -1;
2770 pnum_head = p;
2771 }
2772 else {
2773 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2774 }
2775 }
2776
2777 while (!PEND) {
2778 name_end = p;
2779
2780 PFETCH(c);
2781 if (c == end_code || c == ')') break;
2782 if (! ONIGENC_IS_CODE_DIGIT(enc, c))
2783 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2784 }
2785 if (r == 0 && c != end_code) {
2786 r = ONIGERR_INVALID_GROUP_NAME;
2787 name_end = end;
2788 }
2789
2790 if (r == 0) {
2791 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2792 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2793 else if (*rback_num == 0) {
2794 r = ONIGERR_INVALID_GROUP_NAME;
2795 goto err;
2796 }
2797 *rback_num *= sign;
2798
2799 *rname_end = name_end;
2800 *src = p;
2801 return 0;
2802 }
2803 else {
2804 err:
2805 onig_scan_env_set_error_string(env, r, *src, name_end);
2806 return r;
2807 }
2808 }
2809 #endif
2810
2811 static void
2812 CC_ESC_WARN(ScanEnv* env, UChar *c)
2813 {
2814 if (onig_warn == onig_null_warn) return ;
2815
2816 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
2817 IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
2818 UChar buf[WARN_BUFSIZE];
2819 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
2820 env->pattern, env->pattern_end,
2821 (UChar* )"character class has '%s' without escape", c);
2822 (*onig_warn)((char* )buf);
2823 }
2824 }
2825
2826 static void
2827 CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c)
2828 {
2829 if (onig_warn == onig_null_warn) return ;
2830
2831 if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
2832 UChar buf[WARN_BUFSIZE];
2833 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, (env)->enc,
2834 (env)->pattern, (env)->pattern_end,
2835 (UChar* )"regular expression has '%s' without escape", c);
2836 (*onig_warn)((char* )buf);
2837 }
2838 }
2839
2840 static UChar*
2841 find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
2842 UChar **next, OnigEncoding enc)
2843 {
2844 int i;
2845 OnigCodePoint x;
2846 UChar *q;
2847 UChar *p = from;
2848
2849 while (p < to) {
2850 x = ONIGENC_MBC_TO_CODE(enc, p, to);
2851 q = p + enclen(enc, p);
2852 if (x == s[0]) {
2853 for (i = 1; i < n && q < to; i++) {
2854 x = ONIGENC_MBC_TO_CODE(enc, q, to);
2855 if (x != s[i]) break;
2856 q += enclen(enc, q);
2857 }
2858 if (i >= n) {
2859 if (IS_NOT_NULL(next))
2860 *next = q;
2861 return p;
2862 }
2863 }
2864 p = q;
2865 }
2866 return NULL_UCHARP;
2867 }
2868
2869 static int
2870 str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
2871 OnigCodePoint bad, OnigEncoding enc, OnigSyntaxType* syn)
2872 {
2873 int i, in_esc;
2874 OnigCodePoint x;
2875 UChar *q;
2876 UChar *p = from;
2877
2878 in_esc = 0;
2879 while (p < to) {
2880 if (in_esc) {
2881 in_esc = 0;
2882 p += enclen(enc, p);
2883 }
2884 else {
2885 x = ONIGENC_MBC_TO_CODE(enc, p, to);
2886 q = p + enclen(enc, p);
2887 if (x == s[0]) {
2888 for (i = 1; i < n && q < to; i++) {
2889 x = ONIGENC_MBC_TO_CODE(enc, q, to);
2890 if (x != s[i]) break;
2891 q += enclen(enc, q);
2892 }
2893 if (i >= n) return 1;
2894 p += enclen(enc, p);
2895 }
2896 else {
2897 x = ONIGENC_MBC_TO_CODE(enc, p, to);
2898 if (x == bad) return 0;
2899 else if (x == MC_ESC(syn)) in_esc = 1;
2900 p = q;
2901 }
2902 }
2903 }
2904 return 0;
2905 }
2906
2907 static int
2908 fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
2909 {
2910 int num;
2911 OnigCodePoint c, c2;
2912 OnigSyntaxType* syn = env->syntax;
2913 OnigEncoding enc = env->enc;
2914 UChar* prev;
2915 UChar* p = *src;
2916 PFETCH_READY;
2917
2918 if (PEND) {
2919 tok->type = TK_EOT;
2920 return tok->type;
2921 }
2922
2923 PFETCH(c);
2924 tok->type = TK_CHAR;
2925 tok->base = 0;
2926 tok->u.c = c;
2927 tok->escaped = 0;
2928
2929 if (c == ']') {
2930 tok->type = TK_CC_CLOSE;
2931 }
2932 else if (c == '-') {
2933 tok->type = TK_CC_RANGE;
2934 }
2935 else if (c == MC_ESC(syn)) {
2936 if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
2937 goto end;
2938
2939 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2940
2941 PFETCH(c);
2942 tok->escaped = 1;
2943 tok->u.c = c;
2944 switch (c) {
2945 case 'w':
2946 tok->type = TK_CHAR_TYPE;
2947 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
2948 tok->u.prop.not = 0;
2949 break;
2950 case 'W':
2951 tok->type = TK_CHAR_TYPE;
2952 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
2953 tok->u.prop.not = 1;
2954 break;
2955 case 'd':
2956 tok->type = TK_CHAR_TYPE;
2957 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
2958 tok->u.prop.not = 0;
2959 break;
2960 case 'D':
2961 tok->type = TK_CHAR_TYPE;
2962 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
2963 tok->u.prop.not = 1;
2964 break;
2965 case 's':
2966 tok->type = TK_CHAR_TYPE;
2967 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
2968 tok->u.prop.not = 0;
2969 break;
2970 case 'S':
2971 tok->type = TK_CHAR_TYPE;
2972 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
2973 tok->u.prop.not = 1;
2974 break;
2975 case 'h':
2976 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2977 tok->type = TK_CHAR_TYPE;
2978 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
2979 tok->u.prop.not = 0;
2980 break;
2981 case 'H':
2982 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2983 tok->type = TK_CHAR_TYPE;
2984 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
2985 tok->u.prop.not = 1;
2986 break;
2987
2988 case 'p':
2989 case 'P':
2990 c2 = PPEEK;
2991 if (c2 == '{' &&
2992 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
2993 PINC;
2994 tok->type = TK_CHAR_PROPERTY;
2995 tok->u.prop.not = (c == 'P' ? 1 : 0);
2996
2997 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
2998 PFETCH(c2);
2999 if (c2 == '^') {
3000 tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3001 }
3002 else
3003 PUNFETCH;
3004 }
3005 }
3006 break;
3007
3008 case 'x':
3009 if (PEND) break;
3010
3011 prev = p;
3012 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3013 PINC;
3014 num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
3015 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3016 if (!PEND) {
3017 c2 = PPEEK;
3018 if (ONIGENC_IS_CODE_XDIGIT(enc, c2))
3019 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3020 }
3021
3022 if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) {
3023 PINC;
3024 tok->type = TK_CODE_POINT;
3025 tok->base = 16;
3026 tok->u.code = (OnigCodePoint )num;
3027 }
3028 else {
3029
3030 p = prev;
3031 }
3032 }
3033 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3034 num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
3035 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3036 if (p == prev) {
3037 num = 0;
3038 }
3039 tok->type = TK_RAW_BYTE;
3040 tok->base = 16;
3041 tok->u.c = num;
3042 }
3043 break;
3044
3045 case 'u':
3046 if (PEND) break;
3047
3048 prev = p;
3049 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3050 num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
3051 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3052 if (p == prev) {
3053 num = 0;
3054 }
3055 tok->type = TK_CODE_POINT;
3056 tok->base = 16;
3057 tok->u.code = (OnigCodePoint )num;
3058 }
3059 break;
3060
3061 case '0':
3062 case '1': case '2': case '3': case '4': case '5': case '6': case '7':
3063 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3064 PUNFETCH;
3065 prev = p;
3066 num = scan_unsigned_octal_number(&p, end, 3, enc);
3067 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3068 if (p == prev) {
3069 num = 0;
3070 }
3071 tok->type = TK_RAW_BYTE;
3072 tok->base = 8;
3073 tok->u.c = num;
3074 }
3075 break;
3076
3077 default:
3078 PUNFETCH;
3079 num = fetch_escaped_value(&p, end, env);
3080 if (num < 0) return num;
3081 if (tok->u.c != num) {
3082 tok->u.code = (OnigCodePoint )num;
3083 tok->type = TK_CODE_POINT;
3084 }
3085 break;
3086 }
3087 }
3088 else if (c == '[') {
3089 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
3090 OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
3091 tok->backp = p;
3092 PINC;
3093 if (str_exist_check_with_esc(send, 2, p, end,
3094 (OnigCodePoint )']', enc, syn)) {
3095 tok->type = TK_POSIX_BRACKET_OPEN;
3096 }
3097 else {
3098 PUNFETCH;
3099 goto cc_in_cc;
3100 }
3101 }
3102 else {
3103 cc_in_cc:
3104 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
3105 tok->type = TK_CC_CC_OPEN;
3106 }
3107 else {
3108 CC_ESC_WARN(env, (UChar* )"[");
3109 }
3110 }
3111 }
3112 else if (c == '&') {
3113 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
3114 !PEND && (PPEEK_IS('&'))) {
3115 PINC;
3116 tok->type = TK_CC_AND;
3117 }
3118 }
3119
3120 end:
3121 *src = p;
3122 return tok->type;
3123 }
3124
3125 static int
3126 fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
3127 {
3128 int r, num;
3129 OnigCodePoint c;
3130 OnigEncoding enc = env->enc;
3131 OnigSyntaxType* syn = env->syntax;
3132 UChar* prev;
3133 UChar* p = *src;
3134 PFETCH_READY;
3135
3136 start:
3137 if (PEND) {
3138 tok->type = TK_EOT;
3139 return tok->type;
3140 }
3141
3142 tok->type = TK_STRING;
3143 tok->base = 0;
3144 tok->backp = p;
3145
3146 PFETCH(c);
3147 if (IS_MC_ESC_CODE(c, syn)) {
3148 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
3149
3150 tok->backp = p;
3151 PFETCH(c);
3152
3153 tok->u.c = c;
3154 tok->escaped = 1;
3155 switch (c) {
3156 case '*':
3157 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;
3158 tok->type = TK_OP_REPEAT;
3159 tok->u.repeat.lower = 0;
3160 tok->u.repeat.upper = REPEAT_INFINITE;
3161 goto greedy_check;
3162 break;
3163
3164 case '+':
3165 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;
3166 tok->type = TK_OP_REPEAT;
3167 tok->u.repeat.lower = 1;
3168 tok->u.repeat.upper = REPEAT_INFINITE;
3169 goto greedy_check;
3170 break;
3171
3172 case '?':
3173 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break;
3174 tok->type = TK_OP_REPEAT;
3175 tok->u.repeat.lower = 0;
3176 tok->u.repeat.upper = 1;
3177 greedy_check:
3178 if (!PEND && PPEEK_IS('?') &&
3179 IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) {
3180 PFETCH(c);
3181 tok->u.repeat.greedy = 0;
3182 tok->u.repeat.possessive = 0;
3183 }
3184 else {
3185 possessive_check:
3186 if (!PEND && PPEEK_IS('+') &&
3187 ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
3188 tok->type != TK_INTERVAL) ||
3189 (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
3190 tok->type == TK_INTERVAL))) {
3191 PFETCH(c);
3192 tok->u.repeat.greedy = 1;
3193 tok->u.repeat.possessive = 1;
3194 }
3195 else {
3196 tok->u.repeat.greedy = 1;
3197 tok->u.repeat.possessive = 0;
3198 }
3199 }
3200 break;
3201
3202 case '{':
3203 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
3204 r = fetch_range_quantifier(&p, end, tok, env);
3205 if (r < 0) return r;
3206 if (r == 0) goto greedy_check;
3207 else if (r == 2) {
3208 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3209 goto possessive_check;
3210
3211 goto greedy_check;
3212 }
3213
3214 break;
3215
3216 case '|':
3217 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break;
3218 tok->type = TK_ALT;
3219 break;
3220
3221 case '(':
3222 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3223 tok->type = TK_SUBEXP_OPEN;
3224 break;
3225
3226 case ')':
3227 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3228 tok->type = TK_SUBEXP_CLOSE;
3229 break;
3230
3231 case 'w':
3232 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3233 tok->type = TK_CHAR_TYPE;
3234 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3235 tok->u.prop.not = 0;
3236 break;
3237
3238 case 'W':
3239 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3240 tok->type = TK_CHAR_TYPE;
3241 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3242 tok->u.prop.not = 1;
3243 break;
3244
3245 case 'b':
3246 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3247 tok->type = TK_ANCHOR;
3248 tok->u.anchor = ANCHOR_WORD_BOUND;
3249 break;
3250
3251 case 'B':
3252 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3253 tok->type = TK_ANCHOR;
3254 tok->u.anchor = ANCHOR_NOT_WORD_BOUND;
3255 break;
3256
3257 #ifdef USE_WORD_BEGIN_END
3258 case '<':
3259 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3260 tok->type = TK_ANCHOR;
3261 tok->u.anchor = ANCHOR_WORD_BEGIN;
3262 break;
3263
3264 case '>':
3265 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3266 tok->type = TK_ANCHOR;
3267 tok->u.anchor = ANCHOR_WORD_END;
3268 break;
3269 #endif
3270
3271 case 's':
3272 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3273 tok->type = TK_CHAR_TYPE;
3274 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3275 tok->u.prop.not = 0;
3276 break;
3277
3278 case 'S':
3279 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3280 tok->type = TK_CHAR_TYPE;
3281 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3282 tok->u.prop.not = 1;
3283 break;
3284
3285 case 'd':
3286 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3287 tok->type = TK_CHAR_TYPE;
3288 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3289 tok->u.prop.not = 0;
3290 break;
3291
3292 case 'D':
3293 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3294 tok->type = TK_CHAR_TYPE;
3295 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3296 tok->u.prop.not = 1;
3297 break;
3298
3299 case 'h':
3300 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3301 tok->type = TK_CHAR_TYPE;
3302 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3303 tok->u.prop.not = 0;
3304 break;
3305
3306 case 'H':
3307 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3308 tok->type = TK_CHAR_TYPE;
3309 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3310 tok->u.prop.not = 1;
3311 break;
3312
3313 case 'A':
3314 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3315 begin_buf:
3316 tok->type = TK_ANCHOR;
3317 tok->u.subtype = ANCHOR_BEGIN_BUF;
3318 break;
3319
3320 case 'Z':
3321 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3322 tok->type = TK_ANCHOR;
3323 tok->u.subtype = ANCHOR_SEMI_END_BUF;
3324 break;
3325
3326 case 'z':
3327 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3328 end_buf:
3329 tok->type = TK_ANCHOR;
3330 tok->u.subtype = ANCHOR_END_BUF;
3331 break;
3332
3333 case 'G':
3334 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break;
3335 tok->type = TK_ANCHOR;
3336 tok->u.subtype = ANCHOR_BEGIN_POSITION;
3337 break;
3338
3339 case '`':
3340 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3341 goto begin_buf;
3342 break;
3343
3344 case '\'':
3345 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3346 goto end_buf;
3347 break;
3348
3349 case 'x':
3350 if (PEND) break;
3351
3352 prev = p;
3353 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3354 PINC;
3355 num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
3356 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3357 if (!PEND) {
3358 if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK))
3359 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3360 }
3361
3362 if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) {
3363 PINC;
3364 tok->type = TK_CODE_POINT;
3365 tok->u.code = (OnigCodePoint )num;
3366 }
3367 else {
3368
3369 p = prev;
3370 }
3371 }
3372 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3373 num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
3374 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3375 if (p == prev) {
3376 num = 0;
3377 }
3378 tok->type = TK_RAW_BYTE;
3379 tok->base = 16;
3380 tok->u.c = num;
3381 }
3382 break;
3383
3384 case 'u':
3385 if (PEND) break;
3386
3387 prev = p;
3388 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3389 num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
3390 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3391 if (p == prev) {
3392 num = 0;
3393 }
3394 tok->type = TK_CODE_POINT;
3395 tok->base = 16;
3396 tok->u.code = (OnigCodePoint )num;
3397 }
3398 break;
3399
3400 case '1': case '2': case '3': case '4':
3401 case '5': case '6': case '7': case '8': case '9':
3402 PUNFETCH;
3403 prev = p;
3404 num = onig_scan_unsigned_number(&p, end, enc);
3405 if (num < 0 || num > ONIG_MAX_BACKREF_NUM) {
3406 goto skip_backref;
3407 }
3408
3409 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
3410 (num <= env->num_mem || num <= 9)) {
3411 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3412 if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num]))
3413 return ONIGERR_INVALID_BACKREF;
3414 }
3415
3416 tok->type = TK_BACKREF;
3417 tok->u.backref.num = 1;
3418 tok->u.backref.ref1 = num;
3419 tok->u.backref.by_name = 0;
3420 #ifdef USE_BACKREF_WITH_LEVEL
3421 tok->u.backref.exist_level = 0;
3422 #endif
3423 break;
3424 }
3425
3426 skip_backref:
3427 if (c == '8' || c == '9') {
3428
3429 p = prev; PINC;
3430 break;
3431 }
3432
3433 p = prev;
3434
3435 case '0':
3436 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3437 prev = p;
3438 num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc);
3439 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3440 if (p == prev) {
3441 num = 0;
3442 }
3443 tok->type = TK_RAW_BYTE;
3444 tok->base = 8;
3445 tok->u.c = num;
3446 }
3447 else if (c != '0') {
3448 PINC;
3449 }
3450 break;
3451
3452 #ifdef USE_NAMED_GROUP
3453 case 'k':
3454 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
3455 PFETCH(c);
3456 if (c == '<' || c == '\'') {
3457 UChar* name_end;
3458 int* backs;
3459 int back_num;
3460
3461 prev = p;
3462
3463 #ifdef USE_BACKREF_WITH_LEVEL
3464 name_end = NULL_UCHARP;
3465 r = fetch_name_with_level((OnigCodePoint )c, &p, end, &name_end,
3466 env, &back_num, &tok->u.backref.level);
3467 if (r == 1) tok->u.backref.exist_level = 1;
3468 else tok->u.backref.exist_level = 0;
3469 #else
3470 r = fetch_name(&p, end, &name_end, env, &back_num, 1);
3471 #endif
3472 if (r < 0) return r;
3473
3474 if (back_num != 0) {
3475 if (back_num < 0) {
3476 back_num = BACKREF_REL_TO_ABS(back_num, env);
3477 if (back_num <= 0)
3478 return ONIGERR_INVALID_BACKREF;
3479 }
3480
3481 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3482 if (back_num > env->num_mem ||
3483 IS_NULL(SCANENV_MEM_NODES(env)[back_num]))
3484 return ONIGERR_INVALID_BACKREF;
3485 }
3486 tok->type = TK_BACKREF;
3487 tok->u.backref.by_name = 0;
3488 tok->u.backref.num = 1;
3489 tok->u.backref.ref1 = back_num;
3490 }
3491 else {
3492 num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs);
3493 if (num <= 0) {
3494 onig_scan_env_set_error_string(env,
3495 ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);
3496 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3497 }
3498 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3499 int i;
3500 for (i = 0; i < num; i++) {
3501 if (backs[i] > env->num_mem ||
3502 IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
3503 return ONIGERR_INVALID_BACKREF;
3504 }
3505 }
3506
3507 tok->type = TK_BACKREF;
3508 tok->u.backref.by_name = 1;
3509 if (num == 1) {
3510 tok->u.backref.num = 1;
3511 tok->u.backref.ref1 = backs[0];
3512 }
3513 else {
3514 tok->u.backref.num = num;
3515 tok->u.backref.refs = backs;
3516 }
3517 }
3518 }
3519 else
3520 PUNFETCH;
3521 }
3522 break;
3523 #endif
3524
3525 #ifdef USE_SUBEXP_CALL
3526 case 'g':
3527 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
3528 PFETCH(c);
3529 if (c == '<' || c == '\'') {
3530 int gnum;
3531 UChar* name_end;
3532
3533 prev = p;
3534 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1);
3535 if (r < 0) return r;
3536
3537 tok->type = TK_CALL;
3538 tok->u.call.name = prev;
3539 tok->u.call.name_end = name_end;
3540 tok->u.call.gnum = gnum;
3541 }
3542 else
3543 PUNFETCH;
3544 }
3545 break;
3546 #endif
3547
3548 case 'Q':
3549 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) {
3550 tok->type = TK_QUOTE_OPEN;
3551 }
3552 break;
3553
3554 case 'p':
3555 case 'P':
3556 if (PPEEK_IS('{') &&
3557 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
3558 PINC;
3559 tok->type = TK_CHAR_PROPERTY;
3560 tok->u.prop.not = (c == 'P' ? 1 : 0);
3561
3562 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
3563 PFETCH(c);
3564 if (c == '^') {
3565 tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3566 }
3567 else
3568 PUNFETCH;
3569 }
3570 }
3571 break;
3572
3573 default:
3574 PUNFETCH;
3575 num = fetch_escaped_value(&p, end, env);
3576 if (num < 0) return num;
3577
3578 if (tok->u.c != num) {
3579 tok->type = TK_CODE_POINT;
3580 tok->u.code = (OnigCodePoint )num;
3581 }
3582 else {
3583 p = tok->backp + enclen(enc, tok->backp);
3584 }
3585 break;
3586 }
3587 }
3588 else {
3589 tok->u.c = c;
3590 tok->escaped = 0;
3591
3592 #ifdef USE_VARIABLE_META_CHARS
3593 if ((c != ONIG_INEFFECTIVE_META_CHAR) &&
3594 IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) {
3595 if (c == MC_ANYCHAR(syn))
3596 goto any_char;
3597 else if (c == MC_ANYTIME(syn))
3598 goto anytime;
3599 else if (c == MC_ZERO_OR_ONE_TIME(syn))
3600 goto zero_or_one_time;
3601 else if (c == MC_ONE_OR_MORE_TIME(syn))
3602 goto one_or_more_time;
3603 else if (c == MC_ANYCHAR_ANYTIME(syn)) {
3604 tok->type = TK_ANYCHAR_ANYTIME;
3605 goto out;
3606 }
3607 }
3608 #endif
3609
3610 switch (c) {
3611 case '.':
3612 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break;
3613 #ifdef USE_VARIABLE_META_CHARS
3614 any_char:
3615 #endif
3616 tok->type = TK_ANYCHAR;
3617 break;
3618
3619 case '*':
3620 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break;
3621 #ifdef USE_VARIABLE_META_CHARS
3622 anytime:
3623 #endif
3624 tok->type = TK_OP_REPEAT;
3625 tok->u.repeat.lower = 0;
3626 tok->u.repeat.upper = REPEAT_INFINITE;
3627 goto greedy_check;
3628 break;
3629
3630 case '+':
3631 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break;
3632 #ifdef USE_VARIABLE_META_CHARS
3633 one_or_more_time:
3634 #endif
3635 tok->type = TK_OP_REPEAT;
3636 tok->u.repeat.lower = 1;
3637 tok->u.repeat.upper = REPEAT_INFINITE;
3638 goto greedy_check;
3639 break;
3640
3641 case '?':
3642 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break;
3643 #ifdef USE_VARIABLE_META_CHARS
3644 zero_or_one_time:
3645 #endif
3646 tok->type = TK_OP_REPEAT;
3647 tok->u.repeat.lower = 0;
3648 tok->u.repeat.upper = 1;
3649 goto greedy_check;
3650 break;
3651
3652 case '{':
3653 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break;
3654 r = fetch_range_quantifier(&p, end, tok, env);
3655 if (r < 0) return r;
3656 if (r == 0) goto greedy_check;
3657 else if (r == 2) {
3658 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3659 goto possessive_check;
3660
3661 goto greedy_check;
3662 }
3663
3664 break;
3665
3666 case '|':
3667 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break;
3668 tok->type = TK_ALT;
3669 break;
3670
3671 case '(':
3672 if (PPEEK_IS('?') &&
3673 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
3674 PINC;
3675 if (PPEEK_IS('#')) {
3676 PFETCH(c);
3677 while (1) {
3678 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
3679 PFETCH(c);
3680 if (c == MC_ESC(syn)) {
3681 if (!PEND) PFETCH(c);
3682 }
3683 else {
3684 if (c == ')') break;
3685 }
3686 }
3687 goto start;
3688 }
3689 PUNFETCH;
3690 }
3691
3692 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3693 tok->type = TK_SUBEXP_OPEN;
3694 break;
3695
3696 case ')':
3697 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3698 tok->type = TK_SUBEXP_CLOSE;
3699 break;
3700
3701 case '^':
3702 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3703 tok->type = TK_ANCHOR;
3704 tok->u.subtype = (IS_SINGLELINE(env->option)
3705 ? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE);
3706 break;
3707
3708 case '$':
3709 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3710 tok->type = TK_ANCHOR;
3711 tok->u.subtype = (IS_SINGLELINE(env->option)
3712 ? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE);
3713 break;
3714
3715 case '[':
3716 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break;
3717 tok->type = TK_CC_OPEN;
3718 break;
3719
3720 case ']':
3721 if (*src > env->pattern)
3722 CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
3723 break;
3724
3725 case '#':
3726 if (IS_EXTEND(env->option)) {
3727 while (!PEND) {
3728 PFETCH(c);
3729 if (ONIGENC_IS_CODE_NEWLINE(enc, c))
3730 break;
3731 }
3732 goto start;
3733 break;
3734 }
3735 break;
3736
3737 case ' ': case '\t': case '\n': case '\r': case '\f':
3738 if (IS_EXTEND(env->option))
3739 goto start;
3740 break;
3741
3742 default:
3743
3744 break;
3745 }
3746 }
3747
3748 #ifdef USE_VARIABLE_META_CHARS
3749 out:
3750 #endif
3751 *src = p;
3752 return tok->type;
3753 }
3754
3755 static int
3756 add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not,
3757 OnigEncoding enc ARG_UNUSED,
3758 OnigCodePoint sb_out, const OnigCodePoint mbr[])
3759 {
3760 int i, r;
3761 OnigCodePoint j;
3762
3763 int n = ONIGENC_CODE_RANGE_NUM(mbr);
3764
3765 if (not == 0) {
3766 for (i = 0; i < n; i++) {
3767 for (j = ONIGENC_CODE_RANGE_FROM(mbr, i);
3768 j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) {
3769 if (j >= sb_out) {
3770 if (j == ONIGENC_CODE_RANGE_TO(mbr, i)) i++;
3771 else if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) {
3772 r = add_code_range_to_buf(&(cc->mbuf), j,
3773 ONIGENC_CODE_RANGE_TO(mbr, i));
3774 if (r != 0) return r;
3775 i++;
3776 }
3777
3778 goto sb_end;
3779 }
3780 BITSET_SET_BIT(cc->bs, j);
3781 }
3782 }
3783
3784 sb_end:
3785 for ( ; i < n; i++) {
3786 r = add_code_range_to_buf(&(cc->mbuf),
3787 ONIGENC_CODE_RANGE_FROM(mbr, i),
3788 ONIGENC_CODE_RANGE_TO(mbr, i));
3789 if (r != 0) return r;
3790 }
3791 }
3792 else {
3793 OnigCodePoint prev = 0;
3794
3795 for (i = 0; i < n; i++) {
3796 for (j = prev;
3797 j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) {
3798 if (j >= sb_out) {
3799 goto sb_end2;
3800 }
3801 BITSET_SET_BIT(cc->bs, j);
3802 }
3803 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
3804 }
3805 for (j = prev; j < sb_out; j++) {
3806 BITSET_SET_BIT(cc->bs, j);
3807 }
3808
3809 sb_end2:
3810 prev = sb_out;
3811
3812 for (i = 0; i < n; i++) {
3813 if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) {
3814 r = add_code_range_to_buf(&(cc->mbuf), prev,
3815 ONIGENC_CODE_RANGE_FROM(mbr, i) - 1);
3816 if (r != 0) return r;
3817 }
3818 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
3819 }
3820 if (prev < 0x7fffffff) {
3821 r = add_code_range_to_buf(&(cc->mbuf), prev, 0x7fffffff);
3822 if (r != 0) return r;
3823 }
3824 }
3825
3826 return 0;
3827 }
3828
3829 static int
3830 add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
3831 {
3832 int c, r;
3833 const OnigCodePoint *ranges;
3834 OnigCodePoint sb_out;
3835 OnigEncoding enc = env->enc;
3836
3837 r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
3838 if (r == 0) {
3839 return add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sb_out, ranges);
3840 }
3841 else if (r != ONIG_NO_SUPPORT_CONFIG) {
3842 return r;
3843 }
3844
3845 r = 0;
3846 switch (ctype) {
3847 case ONIGENC_CTYPE_ALPHA:
3848 case ONIGENC_CTYPE_BLANK:
3849 case ONIGENC_CTYPE_CNTRL:
3850 case ONIGENC_CTYPE_DIGIT:
3851 case ONIGENC_CTYPE_LOWER:
3852 case ONIGENC_CTYPE_PUNCT:
3853 case ONIGENC_CTYPE_SPACE:
3854 case ONIGENC_CTYPE_UPPER:
3855 case ONIGENC_CTYPE_XDIGIT:
3856 case ONIGENC_CTYPE_ASCII:
3857 case ONIGENC_CTYPE_ALNUM:
3858 if (not != 0) {
3859 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3860 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3861 BITSET_SET_BIT(cc->bs, c);
3862 }
3863 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3864 }
3865 else {
3866 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3867 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3868 BITSET_SET_BIT(cc->bs, c);
3869 }
3870 }
3871 break;
3872
3873 case ONIGENC_CTYPE_GRAPH:
3874 case ONIGENC_CTYPE_PRINT:
3875 if (not != 0) {
3876 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3877 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3878 BITSET_SET_BIT(cc->bs, c);
3879 }
3880 }
3881 else {
3882 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3883 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3884 BITSET_SET_BIT(cc->bs, c);
3885 }
3886 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3887 }
3888 break;
3889
3890 case ONIGENC_CTYPE_WORD:
3891 if (not == 0) {
3892 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3893 if (IS_CODE_SB_WORD(enc, c)) BITSET_SET_BIT(cc->bs, c);
3894 }
3895 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3896 }
3897 else {
3898 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3899 if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0)
3900 && ! ONIGENC_IS_CODE_WORD(enc, c))
3901 BITSET_SET_BIT(cc->bs, c);
3902 }
3903 }
3904 break;
3905
3906 default:
3907 return ONIGERR_PARSER_BUG;
3908 break;
3909 }
3910
3911 return r;
3912 }
3913
3914 static int
3915 parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
3916 {
3917 #define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20
3918 #define POSIX_BRACKET_NAME_MIN_LEN 4
3919
3920 static PosixBracketEntryType PBS[] = {
3921 { (UChar* )"alnum", ONIGENC_CTYPE_ALNUM, 5 },
3922 { (UChar* )"alpha", ONIGENC_CTYPE_ALPHA, 5 },
3923 { (UChar* )"blank", ONIGENC_CTYPE_BLANK, 5 },
3924 { (UChar* )"cntrl", ONIGENC_CTYPE_CNTRL, 5 },
3925 { (UChar* )"digit", ONIGENC_CTYPE_DIGIT, 5 },
3926 { (UChar* )"graph", ONIGENC_CTYPE_GRAPH, 5 },
3927 { (UChar* )"lower", ONIGENC_CTYPE_LOWER, 5 },
3928 { (UChar* )"print", ONIGENC_CTYPE_PRINT, 5 },
3929 { (UChar* )"punct", ONIGENC_CTYPE_PUNCT, 5 },
3930 { (UChar* )"space", ONIGENC_CTYPE_SPACE, 5 },
3931 { (UChar* )"upper", ONIGENC_CTYPE_UPPER, 5 },
3932 { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 },
3933 { (UChar* )"ascii", ONIGENC_CTYPE_ASCII, 5 },
3934 { (UChar* )"word", ONIGENC_CTYPE_WORD, 4 },
3935 { (UChar* )NULL, -1, 0 }
3936 };
3937
3938 PosixBracketEntryType *pb;
3939 int not, i, r;
3940 OnigCodePoint c;
3941 OnigEncoding enc = env->enc;
3942 UChar *p = *src;
3943
3944 if (PPEEK_IS('^')) {
3945 PINC_S;
3946 not = 1;
3947 }
3948 else
3949 not = 0;
3950
3951 if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3)
3952 goto not_posix_bracket;
3953
3954 for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
3955 if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
3956 p = (UChar* )onigenc_step(enc, p, end, pb->len);
3957 if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
3958 return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3959
3960 r = add_ctype_to_cc(cc, pb->ctype, not, env);
3961 if (r != 0) return r;
3962
3963 PINC_S; PINC_S;
3964 *src = p;
3965 return 0;
3966 }
3967 }
3968
3969 not_posix_bracket:
3970 c = 0;
3971 i = 0;
3972 while (!PEND && ((c = PPEEK) != ':') && c != ']') {
3973 PINC_S;
3974 if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
3975 }
3976 if (c == ':' && ! PEND) {
3977 PINC_S;
3978 if (! PEND) {
3979 PFETCH_S(c);
3980 if (c == ']')
3981 return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3982 }
3983 }
3984
3985 return 1;
3986 }
3987
3988 static int
3989 fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
3990 {
3991 int r;
3992 OnigCodePoint c;
3993 OnigEncoding enc = env->enc;
3994 UChar *prev, *start, *p = *src;
3995
3996 r = 0;
3997 start = prev = p;
3998
3999 while (!PEND) {
4000 prev = p;
4001 PFETCH_S(c);
4002 if (c == '}') {
4003 r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev);
4004 if (r < 0) break;
4005
4006 *src = p;
4007 return r;
4008 }
4009 else if (c == '(' || c == ')' || c == '{' || c == '|') {
4010 r = ONIGERR_INVALID_CHAR_PROPERTY_NAME;
4011 break;
4012 }
4013 }
4014
4015 onig_scan_env_set_error_string(env, r, *src, prev);
4016 return r;
4017 }
4018
4019 static int
4020 parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end,
4021 ScanEnv* env)
4022 {
4023 int r, ctype;
4024 CClassNode* cc;
4025
4026 ctype = fetch_char_property_to_ctype(src, end, env);
4027 if (ctype < 0) return ctype;
4028
4029 *np = node_new_cclass();
4030 CHECK_NULL_RETURN_MEMERR(*np);
4031 cc = NCCLASS(*np);
4032 r = add_ctype_to_cc(cc, ctype, 0, env);
4033 if (r != 0) return r;
4034 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
4035
4036 return 0;
4037 }
4038
4039
/* State of the character class parser between two members. */
enum CCSTATE {
  CCS_VALUE    = 0,  /* a value is pending; the next member flushes it into the class */
  CCS_RANGE    = 1,  /* a range is open; the next value closes it, a class is an error */
  CCS_COMPLETE = 2,  /* presumably: a range has just been completed -- confirm with the class parser */
  CCS_START    = 3   /* presumably: nothing has been read yet -- confirm with the class parser */
};
4046
/* Kind of the most recently read character-class element. */
enum CCVALTYPE {
  CCV_SB         = 0,  /* single-byte value, recorded in the bitset */
  CCV_CODE_POINT = 1,  /* code point, recorded through add_code_range() */
  CCV_CLASS      = 2   /* a class element (set by next_state_class()) */
};
4052
4053 static int
4054 next_state_class(CClassNode* cc, OnigCodePoint* vs, enum CCVALTYPE* type,
4055 enum CCSTATE* state, ScanEnv* env)
4056 {
4057 int r;
4058
4059 if (*state == CCS_RANGE)
4060 return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE;
4061
4062 if (*state == CCS_VALUE && *type != CCV_CLASS) {
4063 if (*type == CCV_SB)
4064 BITSET_SET_BIT(cc->bs, (int )(*vs));
4065 else if (*type == CCV_CODE_POINT) {
4066 r = add_code_range(&(cc->mbuf), env, *vs, *vs);
4067 if (r < 0) return r;
4068 }
4069 }
4070
4071 *state = CCS_VALUE;
4072 *type = CCV_CLASS;
4073 return 0;
4074 }
4075
4076 static int
4077 next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v,
4078 int* vs_israw, int v_israw,
4079 enum CCVALTYPE intype, enum CCVALTYPE* type,
4080 enum CCSTATE* state, ScanEnv* env)
4081 {
4082 int r;
4083
4084 switch (*state) {
4085 case CCS_VALUE:
4086 if (*type == CCV_SB)
4087 BITSET_SET_BIT(cc->bs, (int )(*vs));
4088 else if (*type == CCV_CODE_POINT) {
4089 r = add_code_range(&(cc->mbuf), env, *vs, *vs);
4090 if (r < 0) return r;
4091 }
4092 break;
4093
4094 case CCS_RANGE:
4095 if (intype == *type) {
4096 if (intype == CCV_SB) {
4097 if (*vs > 0xff || v > 0xff)
4098 return ONIGERR_INVALID_CODE_POINT_VALUE;
4099
4100 if (*vs > v) {
4101 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4102 goto ccs_range_end;
4103 else
4104 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4105 }
4106 bitset_set_range(cc->bs, (int )*vs, (int )v);
4107 }
4108 else {
4109 r = add_code_range(&(cc->mbuf), env, *vs, v);
4110 if (r < 0) return r;
4111 }
4112 }
4113 else {
4114 #if 0
4115 if (intype == CCV_CODE_POINT && *type == CCV_SB) {
4116 #endif
4117 if (*vs > v) {
4118 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4119 goto ccs_range_end;
4120 else
4121 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4122 }
4123 bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff));
4124 r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v);
4125 if (r < 0) return r;
4126 #if 0
4127 }
4128 else
4129 return ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE;
4130 #endif
4131 }
4132 ccs_range_end:
4133 *state = CCS_COMPLETE;
4134 break;
4135
4136 case CCS_COMPLETE:
4137 case CCS_START:
4138 *state = CCS_VALUE;
4139 break;
4140
4141 default:
4142 break;
4143 }
4144
4145 *vs_israw = v_israw;
4146 *vs = v;
4147 *type = intype;
4148 return 0;
4149 }
4150
4151 static int
4152 code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
4153 ScanEnv* env)
4154 {
4155 int in_esc;
4156 OnigCodePoint code;
4157 OnigEncoding enc = env->enc;
4158 UChar* p = from;
4159
4160 in_esc = 0;
4161 while (! PEND) {
4162 if (ignore_escaped && in_esc) {
4163 in_esc = 0;
4164 }
4165 else {
4166 PFETCH_S(code);
4167 if (code == c) return 1;
4168 if (code == MC_ESC(env->syntax)) in_esc = 1;
4169 }
4170 }
4171 return 0;
4172 }
4173
4174 static int
4175 parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
4176 ScanEnv* env)
4177 {
4178 int r, neg, len, fetched, and_start;
4179 OnigCodePoint v, vs;
4180 UChar *p;
4181 Node* node;
4182 CClassNode *cc, *prev_cc;
4183 CClassNode work_cc;
4184
4185 enum CCSTATE state;
4186 enum CCVALTYPE val_type, in_type;
4187 int val_israw, in_israw;
4188
4189 prev_cc = (CClassNode* )NULL;
4190 *np = NULL_NODE;
4191 r = fetch_token_in_cc(tok, src, end, env);
4192 if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) {
4193 neg = 1;
4194 r = fetch_token_in_cc(tok, src, end, env);
4195 }
4196 else {
4197 neg = 0;
4198 }
4199
4200 if (r < 0) return r;
4201 if (r == TK_CC_CLOSE) {
4202 if (! code_exist_check((OnigCodePoint )']',
4203 *src, env->pattern_end, 1, env))
4204 return ONIGERR_EMPTY_CHAR_CLASS;
4205
4206 CC_ESC_WARN(env, (UChar* )"]");
4207 r = tok->type = TK_CHAR;
4208 }
4209
4210 *np = node = node_new_cclass();
4211 CHECK_NULL_RETURN_MEMERR(node);
4212 cc = NCCLASS(node);
4213
4214 and_start = 0;
4215 state = CCS_START;
4216 p = *src;
4217 while (r != TK_CC_CLOSE) {
4218 fetched = 0;
4219 switch (r) {
4220 case TK_CHAR:
4221 len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c);
4222 if (len > 1) {
4223 in_type = CCV_CODE_POINT;
4224 }
4225 else if (len < 0) {
4226 r = len;
4227 goto err;
4228 }
4229 else {
4230 sb_char:
4231 in_type = CCV_SB;
4232 }
4233 v = (OnigCodePoint )tok->u.c;
4234 in_israw = 0;
4235 goto val_entry2;
4236 break;
4237
4238 case TK_RAW_BYTE:
4239
4240 if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) {
4241 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4242 UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
4243 UChar* psave = p;
4244 int i, base = tok->base;
4245
4246 buf[0] = tok->u.c;
4247 for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
4248 r = fetch_token_in_cc(tok, &p, end, env);
4249 if (r < 0) goto err;
4250 if (r != TK_RAW_BYTE || tok->base != base) {
4251 fetched = 1;
4252 break;
4253 }
4254 buf[i] = tok->u.c;
4255 }
4256
4257 if (i < ONIGENC_MBC_MINLEN(env->enc)) {
4258 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4259 goto err;
4260 }
4261
4262 len = enclen(env->enc, buf);
4263 if (i < len) {
4264 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4265 goto err;
4266 }
4267 else if (i > len) {
4268 p = psave;
4269 for (i = 1; i < len; i++) {
4270 r = fetch_token_in_cc(tok, &p, end, env);
4271 }
4272 fetched = 0;
4273 }
4274
4275 if (i == 1) {
4276 v = (OnigCodePoint )buf[0];
4277 goto raw_single;
4278 }
4279 else {
4280 v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
4281 in_type = CCV_CODE_POINT;
4282 }
4283 }
4284 else {
4285 v = (OnigCodePoint )tok->u.c;
4286 raw_single:
4287 in_type = CCV_SB;
4288 }
4289 in_israw = 1;
4290 goto val_entry2;
4291 break;
4292
4293 case TK_CODE_POINT:
4294 v = tok->u.code;
4295 in_israw = 1;
4296 val_entry:
4297 len = ONIGENC_CODE_TO_MBCLEN(env->enc, v);
4298 if (len < 0) {
4299 r = len;
4300 goto err;
4301 }
4302 in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT);
4303 val_entry2:
4304 r = next_state_val(cc, &vs, v, &val_israw, in_israw, in_type, &val_type,
4305 &state, env);
4306 if (r != 0) goto err;
4307 break;
4308
4309 case TK_POSIX_BRACKET_OPEN:
4310 r = parse_posix_bracket(cc, &p, end, env);
4311 if (r < 0) goto err;
4312 if (r == 1) {
4313 CC_ESC_WARN(env, (UChar* )"[");
4314 p = tok->backp;
4315 v = (OnigCodePoint )tok->u.c;
4316 in_israw = 0;
4317 goto val_entry;
4318 }
4319 goto next_class;
4320 break;
4321
4322 case TK_CHAR_TYPE:
4323 r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env);
4324 if (r != 0) return r;
4325
4326 next_class:
4327 r = next_state_class(cc, &vs, &val_type, &state, env);
4328 if (r != 0) goto err;
4329 break;
4330
4331 case TK_CHAR_PROPERTY:
4332 {
4333 int ctype;
4334
4335 ctype = fetch_char_property_to_ctype(&p, end, env);
4336 if (ctype < 0) return ctype;
4337 r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env);
4338 if (r != 0) return r;
4339 goto next_class;
4340 }
4341 break;
4342
4343 case TK_CC_RANGE:
4344 if (state == CCS_VALUE) {
4345 r = fetch_token_in_cc(tok, &p, end, env);
4346 if (r < 0) goto err;
4347 fetched = 1;
4348 if (r == TK_CC_CLOSE) {
4349 range_end_val:
4350 v = (OnigCodePoint )'-';
4351 in_israw = 0;
4352 goto val_entry;
4353 }
4354 else if (r == TK_CC_AND) {
4355 CC_ESC_WARN(env, (UChar* )"-");
4356 goto range_end_val;
4357 }
4358 state = CCS_RANGE;
4359 }
4360 else if (state == CCS_START) {
4361
4362 v = (OnigCodePoint )tok->u.c;
4363 in_israw = 0;
4364
4365 r = fetch_token_in_cc(tok, &p, end, env);
4366 if (r < 0) goto err;
4367 fetched = 1;
4368
4369 if (r == TK_CC_RANGE || and_start != 0)
4370 CC_ESC_WARN(env, (UChar* )"-");
4371
4372 goto val_entry;
4373 }
4374 else if (state == CCS_RANGE) {
4375 CC_ESC_WARN(env, (UChar* )"-");
4376 goto sb_char;
4377 }
4378 else {
4379 r = fetch_token_in_cc(tok, &p, end, env);
4380 if (r < 0) goto err;
4381 fetched = 1;
4382 if (r == TK_CC_CLOSE) goto range_end_val;
4383 else if (r == TK_CC_AND) {
4384 CC_ESC_WARN(env, (UChar* )"-");
4385 goto range_end_val;
4386 }
4387
4388 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
4389 CC_ESC_WARN(env, (UChar* )"-");
4390 goto sb_char;
4391 }
4392 r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
4393 goto err;
4394 }
4395 break;
4396
4397 case TK_CC_CC_OPEN:
4398 {
4399 Node *anode;
4400 CClassNode* acc;
4401
4402 r = parse_char_class(&anode, tok, &p, end, env);
4403 if (r != 0) goto cc_open_err;
4404 acc = NCCLASS(anode);
4405 r = or_cclass(cc, acc, env->enc);
4406
4407 onig_node_free(anode);
4408 cc_open_err:
4409 if (r != 0) goto err;
4410 }
4411 break;
4412
4413 case TK_CC_AND:
4414 {
4415 if (state == CCS_VALUE) {
4416 r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4417 &val_type, &state, env);
4418 if (r != 0) goto err;
4419 }
4420
4421 and_start = 1;
4422 state = CCS_START;
4423
4424 if (IS_NOT_NULL(prev_cc)) {
4425 r = and_cclass(prev_cc, cc, env->enc);
4426 if (r != 0) goto err;
4427 bbuf_free(cc->mbuf);
4428 }
4429 else {
4430 prev_cc = cc;
4431 cc = &work_cc;
4432 }
4433 initialize_cclass(cc);
4434 }
4435 break;
4436
4437 case TK_EOT:
4438 r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS;
4439 goto err;
4440 break;
4441 default:
4442 r = ONIGERR_PARSER_BUG;
4443 goto err;
4444 break;
4445 }
4446
4447 if (fetched)
4448 r = tok->type;
4449 else {
4450 r = fetch_token_in_cc(tok, &p, end, env);
4451 if (r < 0) goto err;
4452 }
4453 }
4454
4455 if (state == CCS_VALUE) {
4456 r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4457 &val_type, &state, env);
4458 if (r != 0) goto err;
4459 }
4460
4461 if (IS_NOT_NULL(prev_cc)) {
4462 r = and_cclass(prev_cc, cc, env->enc);
4463 if (r != 0) goto err;
4464 bbuf_free(cc->mbuf);
4465 cc = prev_cc;
4466 }
4467
4468 if (neg != 0)
4469 NCCLASS_SET_NOT(cc);
4470 else
4471 NCCLASS_CLEAR_NOT(cc);
4472 if (IS_NCCLASS_NOT(cc) &&
4473 IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) {
4474 int is_empty;
4475
4476 is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
4477 if (is_empty != 0)
4478 BITSET_IS_EMPTY(cc->bs, is_empty);
4479
4480 if (is_empty == 0) {
4481 #define NEWLINE_CODE 0x0a
4482
4483 if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
4484 if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
4485 BITSET_SET_BIT(cc->bs, NEWLINE_CODE);
4486 else
4487 add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
4488 }
4489 }
4490 }
4491 *src = p;
4492 return 0;
4493
4494 err:
4495 if (cc != NCCLASS(*np))
4496 bbuf_free(cc->mbuf);
4497 onig_node_free(*np);
4498 return r;
4499 }
4500
4501 static int parse_subexp(Node** top, OnigToken* tok, int term,
4502 UChar** src, UChar* end, ScanEnv* env);
4503
4504 static int
4505 parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
4506 ScanEnv* env)
4507 {
4508 int r, num;
4509 Node *target;
4510 OnigOptionType option;
4511 OnigCodePoint c;
4512 OnigEncoding enc = env->enc;
4513
4514 #ifdef USE_NAMED_GROUP
4515 int list_capture;
4516 #endif
4517
4518 UChar* p = *src;
4519 PFETCH_READY;
4520
4521 *np = NULL;
4522 if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
4523
4524 option = env->option;
4525 if (PPEEK_IS('?') &&
4526 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
4527 PINC;
4528 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4529
4530 PFETCH(c);
4531 switch (c) {
4532 case ':':
4533 group:
4534 r = fetch_token(tok, &p, end, env);
4535 if (r < 0) return r;
4536 r = parse_subexp(np, tok, term, &p, end, env);
4537 if (r < 0) return r;
4538 *src = p;
4539 return 1;
4540 break;
4541
4542 case '=':
4543 *np = onig_node_new_anchor(ANCHOR_PREC_READ);
4544 break;
4545 case '!':
4546 *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT);
4547 break;
4548 case '>':
4549 *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
4550 break;
4551
4552 #ifdef USE_NAMED_GROUP
4553 case '\'':
4554 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4555 goto named_group1;
4556 }
4557 else
4558 return ONIGERR_UNDEFINED_GROUP_OPTION;
4559 break;
4560 #endif
4561
4562 case '<':
4563 PFETCH(c);
4564 if (c == '=')
4565 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND);
4566 else if (c == '!')
4567 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT);
4568 #ifdef USE_NAMED_GROUP
4569 else {
4570 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4571 UChar *name;
4572 UChar *name_end;
4573
4574 PUNFETCH;
4575 c = '<';
4576
4577 named_group1:
4578 list_capture = 0;
4579
4580 named_group2:
4581 name = p;
4582 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0);
4583 if (r < 0) return r;
4584
4585 num = scan_env_add_mem_entry(env);
4586 if (num < 0) return num;
4587 if (list_capture != 0 && num >= (int )BIT_STATUS_BITS_NUM)
4588 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4589
4590 r = name_add(env->reg, name, name_end, num, env);
4591 if (r != 0) return r;
4592 *np = node_new_enclose_memory(env->option, 1);
4593 CHECK_NULL_RETURN_MEMERR(*np);
4594 NENCLOSE(*np)->regnum = num;
4595 if (list_capture != 0)
4596 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4597 env->num_named++;
4598 }
4599 else {
4600 return ONIGERR_UNDEFINED_GROUP_OPTION;
4601 }
4602 }
4603 #else
4604 else {
4605 return ONIGERR_UNDEFINED_GROUP_OPTION;
4606 }
4607 #endif
4608 break;
4609
4610 case '@':
4611 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {
4612 #ifdef USE_NAMED_GROUP
4613 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4614 PFETCH(c);
4615 if (c == '<' || c == '\'') {
4616 list_capture = 1;
4617 goto named_group2;
4618 }
4619 PUNFETCH;
4620 }
4621 #endif
4622 *np = node_new_enclose_memory(env->option, 0);
4623 CHECK_NULL_RETURN_MEMERR(*np);
4624 num = scan_env_add_mem_entry(env);
4625 if (num < 0) {
4626 onig_node_free(*np);
4627 return num;
4628 }
4629 else if (num >= (int )BIT_STATUS_BITS_NUM) {
4630 onig_node_free(*np);
4631 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4632 }
4633 NENCLOSE(*np)->regnum = num;
4634 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4635 }
4636 else {
4637 return ONIGERR_UNDEFINED_GROUP_OPTION;
4638 }
4639 break;
4640
4641 #ifdef USE_POSIXLINE_OPTION
4642 case 'p':
4643 #endif
4644 case '-': case 'i': case 'm': case 's': case 'x':
4645 {
4646 int neg = 0;
4647
4648 while (1) {
4649 switch (c) {
4650 case ':':
4651 case ')':
4652 break;
4653
4654 case '-': neg = 1; break;
4655 case 'x': ONOFF(option, ONIG_OPTION_EXTEND, neg); break;
4656 case 'i': ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break;
4657 case 's':
4658 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4659 ONOFF(option, ONIG_OPTION_MULTILINE, neg);
4660 }
4661 else
4662 return ONIGERR_UNDEFINED_GROUP_OPTION;
4663 break;
4664
4665 case 'm':
4666 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4667 ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0));
4668 }
4669 else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
4670 ONOFF(option, ONIG_OPTION_MULTILINE, neg);
4671 }
4672 else
4673 return ONIGERR_UNDEFINED_GROUP_OPTION;
4674 break;
4675 #ifdef USE_POSIXLINE_OPTION
4676 case 'p':
4677 ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
4678 break;
4679 #endif
4680 default:
4681 return ONIGERR_UNDEFINED_GROUP_OPTION;
4682 }
4683
4684 if (c == ')') {
4685 *np = node_new_option(option);
4686 CHECK_NULL_RETURN_MEMERR(*np);
4687 *src = p;
4688 return 2;
4689 }
4690 else if (c == ':') {
4691 OnigOptionType prev = env->option;
4692
4693 env->option = option;
4694 r = fetch_token(tok, &p, end, env);
4695 if (r < 0) return r;
4696 r = parse_subexp(&target, tok, term, &p, end, env);
4697 env->option = prev;
4698 if (r < 0) return r;
4699 *np = node_new_option(option);
4700 CHECK_NULL_RETURN_MEMERR(*np);
4701 NENCLOSE(*np)->target = target;
4702 *src = p;
4703 return 0;
4704 }
4705
4706 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4707 PFETCH(c);
4708 }
4709 }
4710 break;
4711
4712 default:
4713 return ONIGERR_UNDEFINED_GROUP_OPTION;
4714 }
4715 }
4716 else {
4717 if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP))
4718 goto group;
4719
4720 *np = node_new_enclose_memory(env->option, 0);
4721 CHECK_NULL_RETURN_MEMERR(*np);
4722 num = scan_env_add_mem_entry(env);
4723 if (num < 0) return num;
4724 NENCLOSE(*np)->regnum = num;
4725 }
4726
4727 CHECK_NULL_RETURN_MEMERR(*np);
4728 r = fetch_token(tok, &p, end, env);
4729 if (r < 0) return r;
4730 r = parse_subexp(&target, tok, term, &p, end, env);
4731 if (r < 0) return r;
4732
4733 if (NTYPE(*np) == NT_ANCHOR)
4734 NANCHOR(*np)->target = target;
4735 else {
4736 NENCLOSE(*np)->target = target;
4737 if (NENCLOSE(*np)->type == ENCLOSE_MEMORY) {
4738
4739 r = scan_env_set_mem_node(env, NENCLOSE(*np)->regnum, *np);
4740 if (r != 0) return r;
4741 }
4742 }
4743
4744 *src = p;
4745 return 0;
4746 }
4747
/* Spelling of each quantifier, indexed by the number that
   popular_quantifier_num() returns; used in nested-repeat warnings. */
static const char* PopularQStr[] = {
  "?",
  "*",
  "+",
  "??",
  "*?",
  "+?"
};
4751
/* Text describing the replacement quantifier, indexed by the value taken
   from ReduceTypeTable; used in nested-repeat warnings. */
static const char* ReduceQStr[] = {
  "",
  "",
  "*",
  "*?",
  "??",
  "+ and ??",
  "+? and ?"
};
4755
4756 static int
4757 set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env)
4758 {
4759 QtfrNode* qn;
4760
4761 qn = NQTFR(qnode);
4762 if (qn->lower == 1 && qn->upper == 1) {
4763 return 1;
4764 }
4765
4766 switch (NTYPE(target)) {
4767 case NT_STR:
4768 if (! group) {
4769 StrNode* sn = NSTR(target);
4770 if (str_node_can_be_split(sn, env->enc)) {
4771 Node* n = str_node_split_last_char(sn, env->enc);
4772 if (IS_NOT_NULL(n)) {
4773 qn->target = n;
4774 return 2;
4775 }
4776 }
4777 }
4778 break;
4779
4780 case NT_QTFR:
4781 {
4782
4783 QtfrNode* qnt = NQTFR(target);
4784 int nestq_num = popular_quantifier_num(qn);
4785 int targetq_num = popular_quantifier_num(qnt);
4786
4787 #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
4788 if (!IS_QUANTIFIER_BY_NUMBER(qn) && !IS_QUANTIFIER_BY_NUMBER(qnt) &&
4789 IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) {
4790 UChar buf[WARN_BUFSIZE];
4791
4792 switch(ReduceTypeTable[targetq_num][nestq_num]) {
4793 case RQ_ASIS:
4794 break;
4795
4796 case RQ_DEL:
4797 if (onig_verb_warn != onig_null_warn) {
4798 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4799 env->pattern, env->pattern_end,
4800 (UChar* )"redundant nested repeat operator");
4801 (*onig_verb_warn)((char* )buf);
4802 }
4803 goto warn_exit;
4804 break;
4805
4806 default:
4807 if (onig_verb_warn != onig_null_warn) {
4808 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4809 env->pattern, env->pattern_end,
4810 (UChar* )"nested repeat operator %s and %s was replaced with '%s'",
4811 PopularQStr[targetq_num], PopularQStr[nestq_num],
4812 ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]);
4813 (*onig_verb_warn)((char* )buf);
4814 }
4815 goto warn_exit;
4816 break;
4817 }
4818 }
4819
4820 warn_exit:
4821 #endif
4822 if (targetq_num >= 0) {
4823 if (nestq_num >= 0) {
4824 onig_reduce_nested_quantifier(qnode, target);
4825 goto q_exit;
4826 }
4827 else if (targetq_num == 1 || targetq_num == 2) {
4828
4829 if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) {
4830 qn->upper = (qn->lower == 0 ? 1 : qn->lower);
4831 }
4832 }
4833 }
4834 }
4835 break;
4836
4837 default:
4838 break;
4839 }
4840
4841 qn->target = target;
4842 q_exit:
4843 return 0;
4844 }
4845
4846
4847 #ifdef USE_SHARED_CCLASS_TABLE
4848
4849 #define THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS 8
4850
4851
4852
4853 typedef struct {
4854 OnigEncoding enc;
4855 int not;
4856 int type;
4857 } type_cclass_key;
4858
4859 static int type_cclass_cmp(type_cclass_key* x, type_cclass_key* y)
4860 {
4861 if (x->type != y->type) return 1;
4862 if (x->enc != y->enc) return 1;
4863 if (x->not != y->not) return 1;
4864 return 0;
4865 }
4866
4867 static int type_cclass_hash(type_cclass_key* key)
4868 {
4869 int i, val;
4870 UChar *p;
4871
4872 val = 0;
4873
4874 p = (UChar* )&(key->enc);
4875 for (i = 0; i < (int )sizeof(key->enc); i++) {
4876 val = val * 997 + (int )*p++;
4877 }
4878
4879 p = (UChar* )(&key->type);
4880 for (i = 0; i < (int )sizeof(key->type); i++) {
4881 val = val * 997 + (int )*p++;
4882 }
4883
4884 val += key->not;
4885 return val + (val >> 5);
4886 }
4887
4888 static struct st_hash_type type_type_cclass_hash = {
4889 type_cclass_cmp,
4890 type_cclass_hash,
4891 };
4892
4893 static st_table* OnigTypeCClassTable;
4894
4895
4896 static int
4897 i_free_shared_class(type_cclass_key* key, Node* node, void* arg ARG_UNUSED)
4898 {
4899 if (IS_NOT_NULL(node)) {
4900 CClassNode* cc = NCCLASS(node);
4901 if (IS_NOT_NULL(cc->mbuf)) xfree(cc->mbuf);
4902 xfree(node);
4903 }
4904
4905 if (IS_NOT_NULL(key)) xfree(key);
4906 return ST_DELETE;
4907 }
4908
4909 extern int
4910 onig_free_shared_cclass_table(void)
4911 {
4912 if (IS_NOT_NULL(OnigTypeCClassTable)) {
4913 onig_st_foreach(OnigTypeCClassTable, i_free_shared_class, 0);
4914 onig_st_free_table(OnigTypeCClassTable);
4915 OnigTypeCClassTable = NULL;
4916 }
4917
4918 return 0;
4919 }
4920
4921 #endif
4922
4923
4924 #ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
4925 static int
4926 clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc)
4927 {
4928 BBuf *tbuf;
4929 int r;
4930
4931 if (IS_NCCLASS_NOT(cc)) {
4932 bitset_invert(cc->bs);
4933
4934 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
4935 r = not_code_range_buf(enc, cc->mbuf, &tbuf);
4936 if (r != 0) return r;
4937
4938 bbuf_free(cc->mbuf);
4939 cc->mbuf = tbuf;
4940 }
4941
4942 NCCLASS_CLEAR_NOT(cc);
4943 }
4944
4945 return 0;
4946 }
4947 #endif
4948
4949 typedef struct {
4950 ScanEnv* env;
4951 CClassNode* cc;
4952 Node* alt_root;
4953 Node** ptail;
4954 } IApplyCaseFoldArg;
4955
4956 static int
4957 i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[],
4958 int to_len, void* arg)
4959 {
4960 IApplyCaseFoldArg* iarg;
4961 ScanEnv* env;
4962 CClassNode* cc;
4963 BitSetRef bs;
4964
4965 iarg = (IApplyCaseFoldArg* )arg;
4966 env = iarg->env;
4967 cc = iarg->cc;
4968 bs = cc->bs;
4969
4970 if (to_len == 1) {
4971 int is_in = onig_is_code_in_cc(env->enc, from, cc);
4972 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
4973 if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||
4974 (is_in == 0 && IS_NCCLASS_NOT(cc))) {
4975 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
4976 add_code_range(&(cc->mbuf), env, *to, *to);
4977 }
4978 else {
4979 BITSET_SET_BIT(bs, *to);
4980 }
4981 }
4982 #else
4983 if (is_in != 0) {
4984 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
4985 if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
4986 add_code_range(&(cc->mbuf), env, *to, *to);
4987 }
4988 else {
4989 if (IS_NCCLASS_NOT(cc)) {
4990 BITSET_CLEAR_BIT(bs, *to);
4991 }
4992 else
4993 BITSET_SET_BIT(bs, *to);
4994 }
4995 }
4996 #endif
4997 }
4998 else {
4999 int r, i, len;
5000 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
5001 Node *snode = NULL_NODE;
5002
5003 if (onig_is_code_in_cc(env->enc, from, cc)
5004 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
5005 && !IS_NCCLASS_NOT(cc)
5006 #endif
5007 ) {
5008 for (i = 0; i < to_len; i++) {
5009 len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf);
5010 if (i == 0) {
5011 snode = onig_node_new_str(buf, buf + len);
5012 CHECK_NULL_RETURN_MEMERR(snode);
5013
5014
5015
5016 NSTRING_SET_AMBIG(snode);
5017 }
5018 else {
5019 r = onig_node_str_cat(snode, buf, buf + len);
5020 if (r < 0) {
5021 onig_node_free(snode);
5022 return r;
5023 }
5024 }
5025 }
5026
5027 *(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE);
5028 CHECK_NULL_RETURN_MEMERR(*(iarg->ptail));
5029 iarg->ptail = &(NCDR((*(iarg->ptail))));
5030 }
5031 }
5032
5033 return 0;
5034 }
5035
5036 static int
5037 parse_exp(Node** np, OnigToken* tok, int term,
5038 UChar** src, UChar* end, ScanEnv* env)
5039 {
5040 int r, len, group = 0;
5041 Node* qn;
5042 Node** targetp;
5043
5044 *np = NULL;
5045 if (tok->type == (enum TokenSyms )term)
5046 goto end_of_token;
5047
5048 switch (tok->type) {
5049 case TK_ALT:
5050 case TK_EOT:
5051 end_of_token:
5052 *np = node_new_empty();
5053 return tok->type;
5054 break;
5055
5056 case TK_SUBEXP_OPEN:
5057 r = parse_enclose(np, tok, TK_SUBEXP_CLOSE, src, end, env);
5058 if (r < 0) return r;
5059 if (r == 1) group = 1;
5060 else if (r == 2) {
5061 Node* target;
5062 OnigOptionType prev = env->option;
5063
5064 env->option = NENCLOSE(*np)->option;
5065 r = fetch_token(tok, src, end, env);
5066 if (r < 0) return r;
5067 r = parse_subexp(&target, tok, term, src, end, env);
5068 env->option = prev;
5069 if (r < 0) return r;
5070 NENCLOSE(*np)->target = target;
5071 return tok->type;
5072 }
5073 break;
5074
5075 case TK_SUBEXP_CLOSE:
5076 if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP))
5077 return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS;
5078
5079 if (tok->escaped) goto tk_raw_byte;
5080 else goto tk_byte;
5081 break;
5082
5083 case TK_STRING:
5084 tk_byte:
5085 {
5086 *np = node_new_str(tok->backp, *src);
5087 CHECK_NULL_RETURN_MEMERR(*np);
5088
5089 while (1) {
5090 r = fetch_token(tok, src, end, env);
5091 if (r < 0) return r;
5092 if (r != TK_STRING) break;
5093
5094 r = onig_node_str_cat(*np, tok->backp, *src);
5095 if (r < 0) return r;
5096 }
5097
5098 string_end:
5099 targetp = np;
5100 goto repeat;
5101 }
5102 break;
5103
5104 case TK_RAW_BYTE:
5105 tk_raw_byte:
5106 {
5107 *np = node_new_str_raw_char((UChar )tok->u.c);
5108 CHECK_NULL_RETURN_MEMERR(*np);
5109 len = 1;
5110 while (1) {
5111 if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
5112 if (len == enclen(env->enc, NSTR(*np)->s)) {
5113 r = fetch_token(tok, src, end, env);
5114 NSTRING_CLEAR_RAW(*np);
5115 goto string_end;
5116 }
5117 }
5118
5119 r = fetch_token(tok, src, end, env);
5120 if (r < 0) return r;
5121 if (r != TK_RAW_BYTE) {
5122
5123 #ifdef USE_PAD_TO_SHORT_BYTE_CHAR
5124 int rem;
5125 if (len < ONIGENC_MBC_MINLEN(env->enc)) {
5126 rem = ONIGENC_MBC_MINLEN(env->enc) - len;
5127 (void )node_str_head_pad(NSTR(*np), rem, (UChar )0);
5128 if (len + rem == enclen(env->enc, NSTR(*np)->s)) {
5129 NSTRING_CLEAR_RAW(*np);
5130 goto string_end;
5131 }
5132 }
5133 #endif
5134 return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
5135 }
5136
5137 r = node_str_cat_char(*np, (UChar )tok->u.c);
5138 if (r < 0) return r;
5139
5140 len++;
5141 }
5142 }
5143 break;
5144
5145 case TK_CODE_POINT:
5146 {
5147 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
5148 int num = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf);
5149 if (num < 0) return num;
5150 #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
5151 *np = node_new_str_raw(buf, buf + num);
5152 #else
5153 *np = node_new_str(buf, buf + num);
5154 #endif
5155 CHECK_NULL_RETURN_MEMERR(*np);
5156 }
5157 break;
5158
5159 case TK_QUOTE_OPEN:
5160 {
5161 OnigCodePoint end_op[2];
5162 UChar *qstart, *qend, *nextp;
5163
5164 end_op[0] = (OnigCodePoint )MC_ESC(env->syntax);
5165 end_op[1] = (OnigCodePoint )'E';
5166 qstart = *src;
5167 qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc);
5168 if (IS_NULL(qend)) {
5169 nextp = qend = end;
5170 }
5171 *np = node_new_str(qstart, qend);
5172 CHECK_NULL_RETURN_MEMERR(*np);
5173 *src = nextp;
5174 }
5175 break;
5176
5177 case TK_CHAR_TYPE:
5178 {
5179 switch (tok->u.prop.ctype) {
5180 case ONIGENC_CTYPE_WORD:
5181 *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not);
5182 CHECK_NULL_RETURN_MEMERR(*np);
5183 break;
5184
5185 case ONIGENC_CTYPE_SPACE:
5186 case ONIGENC_CTYPE_DIGIT:
5187 case ONIGENC_CTYPE_XDIGIT:
5188 {
5189 CClassNode* cc;
5190
5191 #ifdef USE_SHARED_CCLASS_TABLE
5192 const OnigCodePoint *mbr;
5193 OnigCodePoint sb_out;
5194
5195 r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, tok->u.prop.ctype,
5196 &sb_out, &mbr);
5197 if (r == 0 &&
5198 ONIGENC_CODE_RANGE_NUM(mbr)
5199 >= THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS) {
5200 type_cclass_key key;
5201 type_cclass_key* new_key;
5202
5203 key.enc = env->enc;
5204 key.not = tok->u.prop.not;
5205 key.type = tok->u.prop.ctype;
5206
5207 THREAD_ATOMIC_START;
5208
5209 if (IS_NULL(OnigTypeCClassTable)) {
5210 OnigTypeCClassTable
5211 = onig_st_init_table_with_size(&type_type_cclass_hash, 10);
5212 if (IS_NULL(OnigTypeCClassTable)) {
5213 THREAD_ATOMIC_END;
5214 return ONIGERR_MEMORY;
5215 }
5216 }
5217 else {
5218 if (onig_st_lookup(OnigTypeCClassTable, (st_data_t )&key,
5219 (st_data_t* )np)) {
5220 THREAD_ATOMIC_END;
5221 break;
5222 }
5223 }
5224
5225 *np = node_new_cclass_by_codepoint_range(tok->u.prop.not,
5226 sb_out, mbr);
5227 if (IS_NULL(*np)) {
5228 THREAD_ATOMIC_END;
5229 return ONIGERR_MEMORY;
5230 }
5231
5232 cc = NCCLASS(*np);
5233 NCCLASS_SET_SHARE(cc);
5234 new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key));
5235 xmemcpy(new_key, &key, sizeof(type_cclass_key));
5236 onig_st_add_direct(OnigTypeCClassTable, (st_data_t )new_key,
5237 (st_data_t )*np);
5238
5239 THREAD_ATOMIC_END;
5240 }
5241 else {
5242 #endif
5243 *np = node_new_cclass();
5244 CHECK_NULL_RETURN_MEMERR(*np);
5245 cc = NCCLASS(*np);
5246 add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env);
5247 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
5248 #ifdef USE_SHARED_CCLASS_TABLE
5249 }
5250 #endif
5251 }
5252 break;
5253
5254 default:
5255 return ONIGERR_PARSER_BUG;
5256 break;
5257 }
5258 }
5259 break;
5260
5261 case TK_CHAR_PROPERTY:
5262 r = parse_char_property(np, tok, src, end, env);
5263 if (r != 0) return r;
5264 break;
5265
5266 case TK_CC_OPEN:
5267 {
5268 CClassNode* cc;
5269
5270 r = parse_char_class(np, tok, src, end, env);
5271 if (r != 0) return r;
5272
5273 cc = NCCLASS(*np);
5274 if (IS_IGNORECASE(env->option)) {
5275 IApplyCaseFoldArg iarg;
5276
5277 iarg.env = env;
5278 iarg.cc = cc;
5279 iarg.alt_root = NULL_NODE;
5280 iarg.ptail = &(iarg.alt_root);
5281
5282 r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag,
5283 i_apply_case_fold, &iarg);
5284 if (r != 0) {
5285 onig_node_free(iarg.alt_root);
5286 return r;
5287 }
5288 if (IS_NOT_NULL(iarg.alt_root)) {
5289 Node* work = onig_node_new_alt(*np, iarg.alt_root);
5290 if (IS_NULL(work)) {
5291 onig_node_free(iarg.alt_root);
5292 return ONIGERR_MEMORY;
5293 }
5294 *np = work;
5295 }
5296 }
5297 }
5298 break;
5299
5300 case TK_ANYCHAR:
5301 *np = node_new_anychar();
5302 CHECK_NULL_RETURN_MEMERR(*np);
5303 break;
5304
5305 case TK_ANYCHAR_ANYTIME:
5306 *np = node_new_anychar();
5307 CHECK_NULL_RETURN_MEMERR(*np);
5308 qn = node_new_quantifier(0, REPEAT_INFINITE, 0);
5309 CHECK_NULL_RETURN_MEMERR(qn);
5310 NQTFR(qn)->target = *np;
5311 *np = qn;
5312 break;
5313
5314 case TK_BACKREF:
5315 len = tok->u.backref.num;
5316 *np = node_new_backref(len,
5317 (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)),
5318 tok->u.backref.by_name,
5319 #ifdef USE_BACKREF_WITH_LEVEL
5320 tok->u.backref.exist_level,
5321 tok->u.backref.level,
5322 #endif
5323 env);
5324 CHECK_NULL_RETURN_MEMERR(*np);
5325 break;
5326
5327 #ifdef USE_SUBEXP_CALL
5328 case TK_CALL:
5329 {
5330 int gnum = tok->u.call.gnum;
5331
5332 if (gnum < 0) {
5333 gnum = BACKREF_REL_TO_ABS(gnum, env);
5334 if (gnum <= 0)
5335 return ONIGERR_INVALID_BACKREF;
5336 }
5337 *np = node_new_call(tok->u.call.name, tok->u.call.name_end, gnum);
5338 CHECK_NULL_RETURN_MEMERR(*np);
5339 env->num_call++;
5340 }
5341 break;
5342 #endif
5343
5344 case TK_ANCHOR:
5345 *np = onig_node_new_anchor(tok->u.anchor);
5346 break;
5347
5348 case TK_OP_REPEAT:
5349 case TK_INTERVAL:
5350 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) {
5351 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS))
5352 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED;
5353 else
5354 *np = node_new_empty();
5355 }
5356 else {
5357 goto tk_byte;
5358 }
5359 break;
5360
5361 default:
5362 return ONIGERR_PARSER_BUG;
5363 break;
5364 }
5365
5366 {
5367 targetp = np;
5368
5369 re_entry:
5370 r = fetch_token(tok, src, end, env);
5371 if (r < 0) return r;
5372
5373 repeat:
5374 if (r == TK_OP_REPEAT || r == TK_INTERVAL) {
5375 if (is_invalid_quantifier_target(*targetp))
5376 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID;
5377
5378 qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,
5379 (r == TK_INTERVAL ? 1 : 0));
5380 CHECK_NULL_RETURN_MEMERR(qn);
5381 NQTFR(qn)->greedy = tok->u.repeat.greedy;
5382 r = set_quantifier(qn, *targetp, group, env);
5383 if (r < 0) {
5384 onig_node_free(qn);
5385 return r;
5386 }
5387
5388 if (tok->u.repeat.possessive != 0) {
5389 Node* en;
5390 en = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
5391 if (IS_NULL(en)) {
5392 onig_node_free(qn);
5393 return ONIGERR_MEMORY;
5394 }
5395 NENCLOSE(en)->target = qn;
5396 qn = en;
5397 }
5398
5399 if (r == 0) {
5400 *targetp = qn;
5401 }
5402 else if (r == 1) {
5403 onig_node_free(qn);
5404 }
5405 else if (r == 2) {
5406 Node *tmp;
5407
5408 *targetp = node_new_list(*targetp, NULL);
5409 if (IS_NULL(*targetp)) {
5410 onig_node_free(qn);
5411 return ONIGERR_MEMORY;
5412 }
5413 tmp = NCDR(*targetp) = node_new_list(qn, NULL);
5414 if (IS_NULL(tmp)) {
5415 onig_node_free(qn);
5416 return ONIGERR_MEMORY;
5417 }
5418 targetp = &(NCAR(tmp));
5419 }
5420 goto re_entry;
5421 }
5422 }
5423
5424 return r;
5425 }
5426
5427 static int
5428 parse_branch(Node** top, OnigToken* tok, int term,
5429 UChar** src, UChar* end, ScanEnv* env)
5430 {
5431 int r;
5432 Node *node, **headp;
5433
5434 *top = NULL;
5435 r = parse_exp(&node, tok, term, src, end, env);
5436 if (r < 0) return r;
5437
5438 if (r == TK_EOT || r == term || r == TK_ALT) {
5439 *top = node;
5440 }
5441 else {
5442 *top = node_new_list(node, NULL);
5443 headp = &(NCDR(*top));
5444 while (r != TK_EOT && r != term && r != TK_ALT) {
5445 r = parse_exp(&node, tok, term, src, end, env);
5446 if (r < 0) return r;
5447
5448 if (NTYPE(node) == NT_LIST) {
5449 *headp = node;
5450 while (IS_NOT_NULL(NCDR(node))) node = NCDR(node);
5451 headp = &(NCDR(node));
5452 }
5453 else {
5454 *headp = node_new_list(node, NULL);
5455 headp = &(NCDR(*headp));
5456 }
5457 }
5458 }
5459
5460 return r;
5461 }
5462
5463
5464 static int
5465 parse_subexp(Node** top, OnigToken* tok, int term,
5466 UChar** src, UChar* end, ScanEnv* env)
5467 {
5468 int r;
5469 Node *node, **headp;
5470
5471 *top = NULL;
5472 r = parse_branch(&node, tok, term, src, end, env);
5473 if (r < 0) {
5474 onig_node_free(node);
5475 return r;
5476 }
5477
5478 if (r == term) {
5479 *top = node;
5480 }
5481 else if (r == TK_ALT) {
5482 *top = onig_node_new_alt(node, NULL);
5483 headp = &(NCDR(*top));
5484 while (r == TK_ALT) {
5485 r = fetch_token(tok, src, end, env);
5486 if (r < 0) return r;
5487 r = parse_branch(&node, tok, term, src, end, env);
5488 if (r < 0) return r;
5489
5490 *headp = onig_node_new_alt(node, NULL);
5491 headp = &(NCDR(*headp));
5492 }
5493
5494 if (tok->type != (enum TokenSyms )term)
5495 goto err;
5496 }
5497 else {
5498 err:
5499 if (term == TK_SUBEXP_CLOSE)
5500 return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
5501 else
5502 return ONIGERR_PARSER_BUG;
5503 }
5504
5505 return r;
5506 }
5507
5508 static int
5509 parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
5510 {
5511 int r;
5512 OnigToken tok;
5513
5514 r = fetch_token(&tok, src, end, env);
5515 if (r < 0) return r;
5516 r = parse_subexp(top, &tok, TK_EOT, src, end, env);
5517 if (r < 0) return r;
5518 return 0;
5519 }
5520
5521 extern int
5522 onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end,
5523 regex_t* reg, ScanEnv* env)
5524 {
5525 int r;
5526 UChar* p;
5527
5528 #ifdef USE_NAMED_GROUP
5529 names_clear(reg);
5530 #endif
5531
5532 scan_env_clear(env);
5533 env->option = reg->options;
5534 env->case_fold_flag = reg->case_fold_flag;
5535 env->enc = reg->enc;
5536 env->syntax = reg->syntax;
5537 env->pattern = (UChar* )pattern;
5538 env->pattern_end = (UChar* )end;
5539 env->reg = reg;
5540
5541 *root = NULL;
5542 p = (UChar* )pattern;
5543 r = parse_regexp(root, &p, (UChar* )end, env);
5544 reg->num_mem = env->num_mem;
5545 return r;
5546 }
5547
5548 extern void
5549 onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED,
5550 UChar* arg, UChar* arg_end)
5551 {
5552 env->error = arg;
5553 env->error_end = arg_end;
5554 }