Commit | Line | Data |
---|---|---|
970ed795 | 1 | /****************************************************************************** |
d44e3c4f | 2 | * Copyright (c) 2000-2016 Ericsson Telecom AB |
970ed795 EL |
3 | * All rights reserved. This program and the accompanying materials |
4 | * are made available under the terms of the Eclipse Public License v1.0 | |
5 | * which accompanies this distribution, and is available at | |
6 | * http://www.eclipse.org/legal/epl-v10.html | |
d44e3c4f | 7 | * |
8 | * Contributors: | |
9 | * Balasko, Jeno | |
10 | * Baranyi, Botond | |
11 | * Delic, Adam | |
12 | * Forstner, Matyas | |
13 | * Raduly, Csaba | |
14 | * Szabados, Kristof | |
15 | * Szabo, Janos Zoltan – initial implementation | |
16 | * Szalai, Gabor | |
17 | * | |
970ed795 EL |
18 | ******************************************************************************/ |
19 | ||
20 | /** | |
21 | * Parser for TTCN-3 character patterns. | |
22 | * | |
23 | * \author Matyas Forstner (Matyas.Forstner@eth.ericsson.se) | |
24 | * | |
25 | * 20031121 | |
26 | */ | |
27 | ||
28 | %{ | |
29 | ||
30 | /********************************************************************* | |
31 | * C(++) declarations | |
32 | *********************************************************************/ | |
33 | ||
34 | #include <stdio.h> | |
35 | #include <string.h> | |
36 | #include <ctype.h> | |
37 | #if defined(__CYGWIN__) && defined(__clang__) | |
38 | /* Cygwin's clang 3.0 has its own limits.h, which does not bring in | |
39 | the system's limits.h unless we define this macro: */ | |
40 | #define __STDC_HOSTED__ 1 | |
41 | #define _GCC_NEXT_LIMITS_H | |
42 | #endif | |
43 | #include <limits.h> | |
44 | ||
45 | #include <regex.h> | |
46 | #if !defined(RE_DUP_MAX) | |
47 | /* RE_DUP_MAX is defined in limits.h or regex.h, except on Cygwin 1.5 */ | |
48 | # include <sys/syslimits.h> | |
49 | #endif | |
50 | ||
51 | #include "memory.h" | |
52 | #include "pattern.hh" | |
53 | ||
54 | /* defined in lexer c-file: */ | |
55 | ||
56 | union YYSTYPE; | |
57 | extern int pattern_yylex(); | |
58 | extern void init_pattern_yylex(YYSTYPE *p); | |
59 | struct yy_buffer_state; | |
60 | extern yy_buffer_state* pattern_yy_scan_string(const char*); | |
61 | extern int pattern_yylex_destroy(); | |
62 | extern unsigned int get_nof_parentheses(); | |
63 | ||
64 | /* defined in this file: */ | |
65 | ||
66 | /** The converted regexp. */ | |
67 | static char *ret_val; | |
3abe9331 | 68 | /** Turns error messages for extended ASCII characters on or off */ |
69 | static bool allow_ext_ascii = false; | |
970ed795 EL |
70 | /** The parser error reporting function. */ |
71 | static void pattern_yyerror(const char *error_str); | |
72 | /** Creates the POSIX equivalent of literal character \a c using the | |
73 | * appropriate escape sequence when needed. */ | |
74 | static char *translate_character(char c); | |
75 | /** Returns the printable equivalent of character \a c */ | |
76 | static char *print_character(char c); | |
77 | /** Returns the printable equivalent of range \a lower .. \a upper */ | |
78 | static char *print_range(char lower, char upper); | |
79 | /** structure for manipulating character sets */ | |
80 | struct character_set; | |
81 | /** allocates, initializes and returns a new empty set */ | |
82 | static character_set *set_init(); | |
83 | /** allocates and returns a copy of \a set */ | |
84 | static character_set *set_copy(const character_set *set); | |
85 | /** deallocates set \a set */ | |
86 | static void set_free(character_set *set); | |
87 | /** returns whether set \a set is empty */ | |
88 | static int set_is_empty(const character_set *set); | |
89 | /** returns whether set \a set contains all characters in range 1..127 */ | |
90 | static int set_is_full(const character_set *set); | |
91 | /** returns whether set \a set contains the character \a c */ | |
92 | static int set_has_char(const character_set *set, char c); | |
93 | /** adds character \a c to set \a set */ | |
94 | static void set_add_char(character_set *set, char c); | |
95 | /** removes character \a c from set \a set */ |
96 | static void set_remove_char(character_set *set, char c); | |
97 | /** returns whether set \a set contains at least one character in the range | |
98 | * \a lower .. \a upper */ | |
99 | static int set_has_range(const character_set *set, char lower, char upper); | |
100 | /** adds range \a lower .. \a upper to set \a set */ | |
101 | static void set_add_range(character_set *set, char lower, char upper); | |
102 | /** returns whether sets \a set1 and \a set2 have a non-empty intersection */ |
103 | static int set_has_intersect(const character_set *set1, | |
104 | const character_set *set2); | |
105 | /** joins sets \a dst and \a src into \a dst */ | |
106 | static void set_join(character_set *dst, const character_set *src); | |
107 | /** negates the set \a set */ | |
108 | static void set_negate(character_set *set); | |
109 | /** reports the duplicate occurrences of characters and ranges in \a set1 | |
110 | * and \a set2 */ | |
111 | static void set_report_duplicates(const character_set *set1, | |
112 | const character_set *set2); | |
113 | /** generates the POSIX equivalent of \a set */ | |
114 | static char *set_generate_posix(const character_set *set); | |
115 | ||
116 | #define YYERROR_VERBOSE | |
117 | ||
118 | static void yyprint(FILE *file, int type, const YYSTYPE& value); | |
119 | #define YYPRINT(f,t,v) yyprint(f,t,v) | |
120 | ||
121 | %} | |
122 | ||
123 | /********************************************************************* | |
124 | * Bison declarations | |
125 | *********************************************************************/ | |
126 | ||
127 | %name-prefix="pattern_yy" | |
128 | %output="pattern_p.cc" | |
129 | %defines | |
130 | %verbose | |
131 | %expect 0 | |
132 | %start Pattern | |
133 | %debug | |
134 | ||
135 | /********************************************************************* | |
136 | * The union-type | |
137 | * Must be kept in sync with the one in pattern_uni.y ! | |
138 | *********************************************************************/ | |
139 | ||
%union {
  int b;                     /* boolean */
  char c;                    /* single character */
  char *s;                   /* character string */
  unsigned long u;           /* unsigned integer */
  struct character_set *set; /* used by nonterminals in pattern_p.y */

  /* single universal char, used by nonterminals in pattern_uni.y */
  union {
    unsigned int value;
    /* The four components are declared in the opposite order on SPARC;
     * presumably so that `comp' overlays `value' the same way regardless of
     * the host byte order -- TODO confirm against pattern_uni.y. */
    struct {
#if defined(__sparc__) || defined(__sparc)
      unsigned char group;
      unsigned char plane;
      unsigned char row;
      unsigned char cell;
#else
      unsigned char cell;
      unsigned char row;
      unsigned char plane;
      unsigned char group;
#endif
    } comp;
  } q;
  class QuadSet* qset;       /* used by nonterminals in pattern_uni.y */
}
167 | ||
168 | /********************************************************************* | |
169 | * Tokens | |
170 | *********************************************************************/ | |
171 | ||
172 | %token <c> TOK_Char "<ordinary character>" | |
173 | %token <u> TOK_Number "<number>" | |
174 | %token <u> TOK_Digit "<digit>" | |
175 | ||
176 | /********************************************************************* | |
177 | * Keywords | |
178 | *********************************************************************/ | |
179 | ||
180 | %token KW_BS_q "\\q" | |
181 | %token KW_BS_d "\\d" | |
182 | %token KW_BS_w "\\w" | |
183 | %token KW_BS_t "\\t" | |
184 | %token KW_BS_n "\\n" | |
185 | %token KW_BS_r "\\r" | |
186 | %token KW_BS_s "\\s" | |
187 | %token KW_BS_b "\\b" | |
188 | ||
189 | %token KW_Group_Begin "(" | |
190 | %token KW_Group_End ")" | |
191 | %token KW_Set_Begin "[" | |
192 | %token KW_Set_Begin_Neg "[^" | |
193 | %token KW_Set_Begin_Rsbrkt "[]" | |
194 | %token KW_Set_Begin_Neg_Rsbrkt "[^]" | |
195 | %token KW_Set_End "]" | |
196 | %token KW_Set_Dash_End "-]" | |
197 | ||
198 | /********************************************************************* | |
199 | * semantic types of nonterminals | |
200 | *********************************************************************/ | |
201 | ||
202 | %type <b> RE_Set_Begin RE_Set_Begin_Rsbrkt RE_Set_End | |
203 | %type <c> RE_Set_Range_Char RE_Quadruple | |
204 | %type <s> RE_Body RE_Elems RE_Alter_Elem RE_Concat_Elem | |
205 | RE_Multiply_Elem RE_Multiply_Statement RE_Group | |
206 | RE_OneCharPos | |
207 | %type <set> RE_Set RE_Set_Body RE_Set_Elem RE_Set_NoRange_Char | |
208 | ||
209 | /********************************************************************* | |
210 | * Destructors | |
211 | *********************************************************************/ | |
212 | ||
213 | %destructor { Free($$); } | |
214 | RE_Alter_Elem | |
215 | RE_Body | |
216 | RE_Concat_Elem | |
217 | RE_Elems | |
218 | RE_Group | |
219 | RE_Multiply_Elem | |
220 | RE_Multiply_Statement | |
221 | RE_OneCharPos | |
222 | ||
223 | %destructor { set_free($$); } | |
224 | RE_Set | |
225 | RE_Set_Body | |
226 | RE_Set_Elem | |
227 | RE_Set_NoRange_Char | |
228 | ||
229 | %% | |
230 | ||
231 | /********************************************************************* | |
232 | * Grammar | |
233 | *********************************************************************/ | |
234 | ||
/* Start symbol: stores the converted regexp in the file-level result
 * variable read by TTCN_pattern_to_regexp(). */
Pattern:
  RE_Body
  {
    ret_val = $1;
  }
;
238 | ||
/* The whole pattern: the result is always anchored at both ends; an empty
 * or vanished body becomes the regexp matching the empty string. */
RE_Body:
  /* empty */
  {
    $$ = mcopystr("^$");
  }
| RE_Elems
  {
    if ($1 == NULL) {
      $$ = mcopystr("^$");
    } else {
      $$ = mprintf("^%s$", $1);
      Free($1);
    }
  }
;
252 | ||
/* Alternatives separated by `|'. Outside of any group every alternative is
 * anchored separately, hence the extra end/begin anchors around the bar.
 * A NULL operand stands for an alternative that produced no regexp text and
 * is rendered as an empty group. */
RE_Elems:
  RE_Alter_Elem { $$ = $1; }
| RE_Elems '|' RE_Alter_Elem
  {
    /* NOTE(review): one is added when the lookahead is a closing
     * parenthesis, presumably because the lexer has already counted it as
     * closed -- confirm against the lexer. */
    unsigned int open_groups = get_nof_parentheses();
    if (yychar == KW_Group_End) open_groups++;
    const bool inside_group = open_groups != 0;
    if ($3 != NULL) {
      if ($1 != NULL)
        $$ = mputprintf($1, inside_group ? "|%s" : "$|^%s", $3);
      else
        $$ = mprintf(inside_group ? "()|%s" : "()$|^%s", $3);
      Free($3);
    } else if ($1 != NULL) {
      $$ = mputstr($1, inside_group ? "|()" : "$|^()");
    } else {
      $$ = NULL;
    }
  }
;
268 | ||
/* One alternative: the concatenation of its elements. */
RE_Alter_Elem:
  RE_Concat_Elem { $$ = $1; }
| RE_Alter_Elem RE_Concat_Elem
  {
    char *next_elem = $2;
    $$ = mputstr($1, next_elem);
    Free(next_elem);
  }
;
277 | ||
/* A single element with an optional repetition suffix, or the `*' wildcard.
 * If either the element or its repetition yielded NULL, the whole element
 * is dropped (NULL). */
RE_Concat_Elem:
  RE_Multiply_Elem { $$ = $1; }
| RE_Multiply_Elem RE_Multiply_Statement
  {
    if ($1 == NULL || $2 == NULL) {
      Free($1);
      Free($2);
      $$ = NULL;
    } else {
      $$ = mputstr($1, $2);
      Free($2);
    }
  }
| '*'
  {
    $$ = mcopystr(".*");
  }
;
293 | ||
/* Anything that may carry a repetition suffix: a group or a one-character
 * position. */
RE_Multiply_Elem:
  RE_Group
  {
    $$ = $1;
  }
| RE_OneCharPos
  {
    $$ = $1;
  }
;
298 | ||
/* Parenthesized group; a group without usable content becomes `()'. */
RE_Group:
  KW_Group_Begin KW_Group_End
  {
    $$ = mcopystr("()");
  }
| KW_Group_Begin RE_Elems KW_Group_End
  {
    if ($2 == NULL) {
      $$ = mcopystr("()");
    } else {
      $$ = mprintf("(%s)", $2);
      Free($2);
    }
  }
;
314 | ||
/* Repetition suffixes. The result is the POSIX repetition operator as a
 * string; NULL means "zero repetitions" (the repeated element is dropped by
 * the caller rule) and the empty string means "exactly once". */
RE_Multiply_Statement:
  '+'
  {
    $$ = mcopystr("+");
  }
| '#' '(' ',' ')'
  {
    $$ = mcopystr("*");
  }
| '#' TOK_Digit
  {
    const unsigned long count = $2;
    switch (count) {
    case 0:
      TTCN_pattern_warning("The number of repetitions is zero: `#0'.");
      $$ = NULL;
      break;
    case 1:
      $$ = memptystr();
      break;
    default:
      /* a single digit can never exceed 9 */
      if (count > 9) TTCN_pattern_warning("Internal error: Invalid number of "
        "repetitions: `#%lu'.", count);
      $$ = mprintf("{%lu}", count);
      break;
    }
  }
| '#' '(' TOK_Number ')'
  {
    const unsigned long count = $3;
    switch (count) {
    case 0:
      TTCN_pattern_warning("The number of repetitions is zero: `#(0)'.");
      $$ = NULL;
      break;
    case 1:
      $$ = memptystr();
      break;
    default:
#ifdef RE_DUP_MAX
      if (count > RE_DUP_MAX) TTCN_pattern_warning("The number of repetitions in "
        "`#(%lu)' exceeds the limit allowed by this system (%d).", count,
        RE_DUP_MAX);
#endif
      $$ = mprintf("{%lu}", count);
      break;
    }
  }
| '#' '(' TOK_Number ',' TOK_Number ')'
  {
    const unsigned long min_count = $3;
    const unsigned long max_count = $5;
#ifdef RE_DUP_MAX
    if (min_count > RE_DUP_MAX) TTCN_pattern_warning("The minimum number of "
      "repetitions in `#(%lu,%lu)' exceeds the limit allowed by this system "
      "(%d).", min_count, max_count, RE_DUP_MAX);
    if (max_count > RE_DUP_MAX) TTCN_pattern_warning("The maximum number of "
      "repetitions in `#(%lu,%lu)' exceeds the limit allowed by this system "
      "(%d).", min_count, max_count, RE_DUP_MAX);
#endif
    if (min_count > max_count) TTCN_pattern_error("The lower bound is higher than the upper "
      "bound in the number of repetitions: `#(%lu,%lu)'.", min_count, max_count);
    if (min_count != max_count) {
      if (min_count == 0 && max_count == 1) $$ = mcopystr("?");
      else $$ = mprintf("{%lu,%lu}", min_count, max_count);
    } else if (min_count == 0) {
      TTCN_pattern_warning("The number of repetitions is zero: `#(0,0)'.");
      $$ = NULL;
    } else if (min_count == 1) {
      $$ = memptystr();
    } else {
      $$ = mprintf("{%lu}", min_count);
    }
  }
| '#' '(' ',' TOK_Number ')'
  {
    const unsigned long max_count = $4;
    if (max_count == 0) {
      TTCN_pattern_warning("The number of repetitions is zero: `#(,0)'.");
      $$ = NULL;
    } else {
#ifdef RE_DUP_MAX
      if (max_count > RE_DUP_MAX) TTCN_pattern_warning("The maximum number of "
        "repetitions in `#(,%lu)' exceeds the limit allowed by this system "
        "(%d).", max_count, RE_DUP_MAX);
#endif
      if (max_count == 1) $$ = mcopystr("?");
      else $$ = mprintf("{0,%lu}", max_count);
    }
  }
| '#' '(' TOK_Number ',' ')'
  {
    const unsigned long min_count = $3;
    if (min_count == 0) {
      $$ = mcopystr("*");
    } else {
#ifdef RE_DUP_MAX
      if (min_count > RE_DUP_MAX) TTCN_pattern_warning("The minimum number of "
        "repetitions in `#(%lu,)' exceeds the limit allowed by this system "
        "(%d).", min_count, RE_DUP_MAX);
#endif
      if (min_count == 1) $$ = mcopystr("+");
      else $$ = mprintf("{%lu,}", min_count);
    }
  }
;
405 | ||
/* Everything that matches exactly one character position: wildcard,
 * backslash metacharacters, literal characters, quadruples and sets. */
RE_OneCharPos:
  '?'
  {
    $$ = mcopystr(".");
  }
| KW_BS_d
  {
    $$ = mcopystr("[0-9]");
  }
| KW_BS_w
  {
    $$ = mcopystr("[0-9A-Za-z]");
  }
| KW_BS_t
  {
    $$ = mcopystr("\t");
  }
| KW_BS_n
  {
    $$ = mcopystr("[\n-\r]");
  }
| KW_BS_r
  {
    $$ = mcopystr("\r");
  }
| KW_BS_s
  {
    $$ = mcopystr("[\t-\r ]");
  }
| KW_BS_b
  {
    TTCN_pattern_warning("Metacharacter `\\b' is not supported yet.");
    $$ = NULL;
  }
| TOK_Char
  {
    /* NUL is never accepted; codes above 127 only when extended ASCII is
     * enabled */
    const unsigned char code = $1;
    const bool rejected = code == 0 || (!allow_ext_ascii && code > 127);
    if (rejected) TTCN_pattern_error("Character "
      "with code %u (0x%02x) cannot be used in a pattern for type charstring.", code, code);
    $$ = translate_character($1);
  }
| RE_Quadruple
  {
    $$ = translate_character($1);
  }
| RE_Set
  {
    $$ = NULL;
    if (set_is_empty($1)) TTCN_pattern_error("Empty character set.");
    else $$ = set_generate_posix($1);
    set_free($1);
  }
;
439 | ||
/* A complete character set.
 *
 * RE_Set_Begin is 1 for "[^", 0 for "["
 * RE_Set_Begin_Rsbrkt is 1 for "[^]", 0 for "[]"
 * RE_Set_End is 1 for "-]", 0 for "]"
 *
 * The body may be NULL (no elements), in which case a fresh empty set is
 * used. A leading `]' or `-' and a trailing `-' are literal members; adding
 * a member that is already present only produces a warning. Negation is
 * applied last.
 */
RE_Set:
  RE_Set_Begin RE_Set_Body RE_Set_End
  {
    $$ = ($2 != NULL) ? $2 : set_init();
    if ($3) {
      if (set_has_char($$, '-'))
        TTCN_pattern_warning("Duplicate character `-' in the character set.");
      else set_add_char($$, '-');
    }
    if ($1) set_negate($$);
  }
| RE_Set_Begin '-' RE_Set_Body RE_Set_End
  {
    $$ = ($3 != NULL) ? $3 : set_init();
    /* the leading dash */
    if (set_has_char($$, '-'))
      TTCN_pattern_warning("Duplicate character `-' in the character set.");
    else set_add_char($$, '-');
    /* the trailing dash, if any */
    if ($4) {
      if (set_has_char($$, '-'))
        TTCN_pattern_warning("Duplicate character `-' in the character set.");
      else set_add_char($$, '-');
    }
    if ($1) set_negate($$);
  }
| RE_Set_Begin_Rsbrkt RE_Set_Body RE_Set_End
  {
    $$ = ($2 != NULL) ? $2 : set_init();
    if (set_has_char($$, ']'))
      TTCN_pattern_warning("Duplicate character `]' in the character set.");
    else set_add_char($$, ']');
    if ($3) {
      if (set_has_char($$, '-'))
        TTCN_pattern_warning("Duplicate character `-' in the character set.");
      else set_add_char($$, '-');
    }
    if ($1) set_negate($$);
  }
| RE_Set_Begin_Rsbrkt '-' RE_Set_Range_Char RE_Set_Body RE_Set_End
  {
    $$ = ($4 != NULL) ? $4 : set_init();
    /* Compare the bounds as unsigned character codes: the signedness of
     * plain char is implementation-defined, and the diagnostic below prints
     * the unsigned codes, so the test has to agree with it. */
    const unsigned char lower_code = (unsigned char)']';
    const unsigned char upper_code = (unsigned char)$3;
    if (lower_code > upper_code) {
      char *range_str = print_range(']', $3);
      TTCN_pattern_error("Invalid range `%s' in the character set: the "
        "character code of the lower bound (%u) is higher than that of the "
        "upper bound (%u).", range_str, (unsigned int)lower_code,
        (unsigned int)upper_code);
      Free(range_str);
    } else {
      if (set_has_range($$, ']', $3)) {
        /* report the overlap with the elements collected so far */
        character_set *tmpset = set_init();
        set_add_range(tmpset, ']', $3);
        set_report_duplicates($$, tmpset);
        set_free(tmpset);
      }
      set_add_range($$, ']', $3);
    }
    if ($5) {
      if (set_has_char($$, '-'))
        TTCN_pattern_warning("Duplicate character `-' in the character set.");
      else set_add_char($$, '-');
    }
    if ($1) set_negate($$);
  }
;
511 | ||
/* Opening bracket of a set: the value tells whether the set is negated. */
RE_Set_Begin:
  KW_Set_Begin
  {
    $$ = 0;
  }
| KW_Set_Begin_Neg
  {
    $$ = 1;
  }
;
516 | ||
/* Opening bracket immediately followed by a literal `]': the value tells
 * whether the set is negated. */
RE_Set_Begin_Rsbrkt:
  KW_Set_Begin_Rsbrkt
  {
    $$ = 0;
  }
| KW_Set_Begin_Neg_Rsbrkt
  {
    $$ = 1;
  }
;
521 | ||
/* Closing bracket of a set: the value tells whether a literal `-' directly
 * precedes it. */
RE_Set_End:
  KW_Set_End
  {
    $$ = 0;
  }
| KW_Set_Dash_End
  {
    $$ = 1;
  }
;
526 | ||
/* The elements between the brackets, accumulated into one set. NULL means
 * that no element has been seen yet. Overlaps between the accumulated set
 * and a new element are reported before the element is merged in. */
RE_Set_Body:
  /* empty */
  {
    $$ = NULL;
  }
| RE_Set_Body RE_Set_Elem
  {
    if ($1 == NULL) {
      /* first element: take it over as the accumulator */
      $$ = $2;
    } else {
      if (set_has_intersect($1, $2)) set_report_duplicates($1, $2);
      set_join($1, $2);
      set_free($2);
      $$ = $1;
    }
  }
;
539 | ||
/* One element of a set: a single character, a predefined class or a range.
 * Every alternative yields a newly allocated set. */
RE_Set_Elem:
  RE_Set_Range_Char
  {
    $$ = set_init();
    set_add_char($$, $1);
  }
| RE_Set_NoRange_Char { $$ = $1; }
| RE_Set_Range_Char '-' RE_Set_Range_Char
  {
    /* Compare the bounds as unsigned character codes: the signedness of
     * plain char is implementation-defined, and the diagnostic below prints
     * the unsigned codes, so the test has to agree with it. */
    const unsigned char lower_code = (unsigned char)$1;
    const unsigned char upper_code = (unsigned char)$3;
    if (lower_code > upper_code) {
      char *range_str = print_range($1, $3);
      TTCN_pattern_error("Invalid range `%s' in the character set: the "
        "character code of the lower bound (%u) is higher than that of the "
        "upper bound (%u).", range_str, (unsigned int)lower_code,
        (unsigned int)upper_code);
      Free(range_str);
    }
    /* an inverted range adds nothing, so the element stays empty */
    $$ = set_init();
    set_add_range($$, $1, $3);
  }
;
560 | ||
/* A character that may stand alone in a set or serve as a range bound. */
RE_Set_Range_Char:
  KW_BS_t
  {
    $$ = '\t';
  }
| KW_BS_r
  {
    $$ = '\r';
  }
| TOK_Char
  {
    /* NUL is never accepted; codes above 127 only when extended ASCII is
     * enabled */
    const unsigned char code = $1;
    const bool rejected = code == 0 || (!allow_ext_ascii && code > 127);
    if (rejected) TTCN_pattern_error("Character "
      "with code %u (0x%02x) cannot be used in a pattern for type charstring.", code, code);
    $$ = $1;
  }
| RE_Quadruple
  {
    $$ = $1;
  }
;
573 | ||
/* Backslash metacharacters that denote a whole class of characters and
 * therefore cannot be used as a range bound. Each one yields a new set. */
RE_Set_NoRange_Char:
  KW_BS_d
  {
    character_set *digits = set_init();
    set_add_range(digits, '0', '9');
    $$ = digits;
  }
| KW_BS_w
  {
    character_set *word_chars = set_init();
    set_add_range(word_chars, '0', '9');
    set_add_range(word_chars, 'A', 'Z');
    set_add_range(word_chars, 'a', 'z');
    $$ = word_chars;
  }
| KW_BS_n
  {
    character_set *newlines = set_init();
    set_add_range(newlines, '\n', '\r');
    $$ = newlines;
  }
| KW_BS_s
  {
    character_set *whitespace = set_init();
    set_add_range(whitespace, '\t', '\r');
    set_add_char(whitespace, ' ');
    $$ = whitespace;
  }
| KW_BS_b
  {
    /* reported as an error; the element contributes an empty set */
    TTCN_pattern_error("Metacharacter `\\b' does not make any sense in a "
      "character set.");
    $$ = set_init();
  }
;
605 | ||
/* Quadruple notation of a character. Every component is range checked and,
 * since this parser serves type charstring, only non-zero codes up to 127
 * in the cell component are accepted. The value of the rule is the cell
 * component converted to char, even after an error has been reported. */
RE_Quadruple:
  KW_BS_q '{' TOK_Number ',' TOK_Number ',' TOK_Number ',' TOK_Number '}'
  {
    const unsigned long group = $3;
    const unsigned long plane = $5;
    const unsigned long row = $7;
    const unsigned long cell = $9;
    if (group > 127) TTCN_pattern_error("The first number (group) of quadruple "
      "`\\q{%lu,%lu,%lu,%lu}' is too large. It should be in the range 0..127 "
      "instead of %lu.", group, plane, row, cell, group);
    if (plane > 255) TTCN_pattern_error("The second number (plane) of quadruple "
      "`\\q{%lu,%lu,%lu,%lu}' is too large. It should be in the range 0..255 "
      "instead of %lu.", group, plane, row, cell, plane);
    if (row > 255) TTCN_pattern_error("The third number (row) of quadruple "
      "`\\q{%lu,%lu,%lu,%lu}' is too large. It should be in the range 0..255 "
      "instead of %lu.", group, plane, row, cell, row);
    if (cell > 255) TTCN_pattern_error("The fourth number (cell) of quadruple "
      "`\\q{%lu,%lu,%lu,%lu}' is too large. It should be in the range 0..255 "
      "instead of %lu.", group, plane, row, cell, cell);
    const bool beyond_charstring = group > 0 || plane > 0 || row > 0 || cell > 127;
    if (beyond_charstring) TTCN_pattern_error("Quadruple "
      "`\\q{%lu,%lu,%lu,%lu}' is not valid in a pattern for type charstring.",
      group, plane, row, cell);
    const bool is_zero_char = group == 0 && plane == 0 && row == 0 && cell == 0;
    if (is_zero_char) TTCN_pattern_error("Zero "
      "character (i.e. quadruple `\\q{0,0,0,0}') is not supported in a "
      "pattern for type charstring.");
    $$ = (char)cell;
  }
;
630 | ||
631 | %% | |
632 | ||
633 | /********************************************************************* | |
634 | * Interface | |
635 | *********************************************************************/ | |
636 | ||
3abe9331 | 637 | char* TTCN_pattern_to_regexp(const char* p_pattern, bool utf8) |
970ed795 EL |
638 | { |
639 | /* if you want to debug */ | |
640 | //pattern_yydebug=1; | |
641 | ||
642 | ret_val=NULL; | |
643 | ||
3abe9331 | 644 | /* allow extended ASCII characters if the pattern is in UTF-8 format */ |
645 | allow_ext_ascii = utf8; | |
646 | ||
970ed795 EL |
647 | yy_buffer_state *flex_buffer = pattern_yy_scan_string(p_pattern); |
648 | if(flex_buffer == NULL) { | |
649 | TTCN_pattern_error("Flex buffer creation failed."); | |
650 | return NULL; | |
651 | } | |
652 | init_pattern_yylex(&yylval); | |
653 | if(pattern_yyparse()) { | |
654 | Free(ret_val); | |
655 | ret_val=NULL; | |
656 | } | |
657 | pattern_yylex_destroy(); | |
658 | return ret_val; | |
659 | } | |
660 | ||
661 | // Backwards compatibility shim | |
662 | char* TTCN_pattern_to_regexp(const char* p_pattern, int ere) | |
663 | { | |
664 | TTCN_pattern_warning( | |
665 | "TTCN_pattern_to_regexp(const char* p_pattern, int ere) is deprecated"); | |
666 | if (ere != 1) TTCN_pattern_error( | |
667 | "BRE is not supported for TTCN_pattern_to_regexp"); | |
668 | return TTCN_pattern_to_regexp(p_pattern); | |
669 | } | |
670 | ||
671 | /********************************************************************* | |
672 | * Static functions | |
673 | *********************************************************************/ | |
674 | ||
675 | /// Error reporting function | |
676 | void pattern_yyerror(const char *error_str) | |
677 | { | |
678 | TTCN_pattern_error("%s", error_str); | |
679 | } | |
680 | ||
681 | /** Escape plain characters which would be metacharacters in a regex. | |
682 | * | |
683 | * @param c plain character | |
684 | * @return a newly allocated string which must be Free() 'd | |
685 | */ | |
686 | char *translate_character(char c) | |
687 | { | |
688 | int escape_needed = 0; | |
689 | switch (c) { | |
690 | case '|': | |
691 | case '+': | |
692 | case '?': | |
693 | case '{': | |
694 | case '}': | |
695 | case '(': | |
696 | case ')': | |
697 | case '.': | |
698 | case '^': | |
699 | case '$': | |
700 | case '[': | |
701 | case '*': | |
702 | case '\\': | |
703 | escape_needed = 1; | |
704 | break; | |
705 | } | |
706 | if (escape_needed) return mprintf("\\%c", c); | |
707 | else return mputc(NULL, c); | |
708 | } | |
709 | ||
710 | char *print_character(char c) | |
711 | { | |
712 | switch (c) { | |
713 | case '\t': | |
714 | return mcopystr("\\t"); | |
715 | case '\r': | |
716 | return mcopystr("\\r"); | |
717 | default: | |
718 | if (isprint((unsigned char)c)) return mprintf("%c", c); | |
719 | else return mprintf("\\q{0,0,0,%u}", (unsigned char)c); | |
720 | } | |
721 | } | |
722 | ||
723 | char *print_range(char lower, char upper) | |
724 | { | |
725 | char *range_str = print_character(lower); | |
726 | range_str = mputc(range_str, '-'); | |
727 | char *upper_str = print_character(upper); | |
728 | range_str = mputstr(range_str, upper_str); | |
729 | Free(upper_str); | |
730 | return range_str; | |
731 | } | |
732 | ||
/* number of membership bits stored in one array element */
#define CS_BITS_PER_ELEM (sizeof(unsigned long) * 8)
/* number of array elements needed for 128 bits, rounded up */
#define CS_NOF_ELEMS ((128 + CS_BITS_PER_ELEM - 1) / CS_BITS_PER_ELEM)

/** Bitmap of character codes: the bit at position `code' tells whether the
 * character with that code is a member of the set. */
struct character_set {
  unsigned long set_members[CS_NOF_ELEMS];
};
739 | ||
740 | character_set *set_init() | |
741 | { | |
742 | character_set *set = (character_set*)Malloc(sizeof(*set)); | |
743 | memset(set->set_members, 0, sizeof(set->set_members)); | |
744 | return set; | |
745 | } | |
746 | ||
747 | character_set *set_copy(const character_set *set) | |
748 | { | |
749 | character_set *set2 = (character_set*)Malloc(sizeof(*set2)); | |
750 | memcpy(set2, set, sizeof(*set2)); | |
751 | return set2; | |
752 | } | |
753 | ||
754 | void set_free(character_set *set) | |
755 | { | |
756 | Free(set); | |
757 | } | |
758 | ||
759 | int set_is_empty(const character_set *set) | |
760 | { | |
761 | if ((set->set_members[0] & ~1UL) != 0) return 0; | |
762 | for (size_t i = 1; i < CS_NOF_ELEMS; i++) | |
763 | if (set->set_members[i] != 0) return 0; | |
764 | return 1; | |
765 | } | |
766 | ||
767 | int set_is_full(const character_set *set) | |
768 | { | |
769 | if (~(set->set_members[0] | 1UL) != 0) return 0; | |
770 | for (size_t i = 1; i < CS_NOF_ELEMS; i++) | |
771 | if (~set->set_members[i] != 0) return 0; | |
772 | return 1; | |
773 | } | |
774 | ||
775 | int set_has_char(const character_set *set, char c) | |
776 | { | |
777 | if (set->set_members[c / CS_BITS_PER_ELEM] & 1UL << c % CS_BITS_PER_ELEM) | |
778 | return 1; | |
779 | else return 0; | |
780 | } | |
781 | ||
782 | void set_add_char(character_set *set, char c) | |
783 | { | |
784 | set->set_members[c / CS_BITS_PER_ELEM] |= 1UL << c % CS_BITS_PER_ELEM; | |
785 | } | |
786 | ||
787 | void set_remove_char(character_set *set, char c) | |
788 | { | |
789 | set->set_members[c / CS_BITS_PER_ELEM] &= ~(1UL << c % CS_BITS_PER_ELEM); | |
790 | } | |
791 | ||
792 | int set_has_range(const character_set *set, char lower, char upper) | |
793 | { | |
794 | for (size_t i = lower; i <= (unsigned char)upper; i++) | |
795 | if (set->set_members[i / CS_BITS_PER_ELEM] & 1UL << i % CS_BITS_PER_ELEM) | |
796 | return 1; | |
797 | return 0; | |
798 | } | |
799 | ||
800 | void set_add_range(character_set *set, char lower, char upper) | |
801 | { | |
802 | for (size_t i = lower; i <= (unsigned char)upper; i++) | |
803 | set->set_members[i / CS_BITS_PER_ELEM] |= 1UL << i % CS_BITS_PER_ELEM; | |
804 | } | |
805 | ||
806 | int set_has_intersect(const character_set *set1, const character_set *set2) | |
807 | { | |
808 | for (size_t i = 0; i < CS_NOF_ELEMS; i++) | |
809 | if (set1->set_members[i] & set2->set_members[i]) return 1; | |
810 | return 0; | |
811 | } | |
812 | ||
813 | void set_join(character_set *dst, const character_set *src) | |
814 | { | |
815 | for (size_t i = 0; i < CS_NOF_ELEMS; i++) | |
816 | dst->set_members[i] |= src->set_members[i]; | |
817 | } | |
818 | ||
819 | void set_negate(character_set *set) | |
820 | { | |
821 | for (size_t i = 0; i < CS_NOF_ELEMS; i++) | |
822 | set->set_members[i] = ~set->set_members[i]; | |
823 | } | |
824 | ||
825 | void set_report_duplicates(const character_set *set1, | |
826 | const character_set *set2) | |
827 | { | |
828 | for (unsigned char i = 0; i <= 127; ) { | |
829 | for (i++; i <= 127; i++) | |
830 | if (set_has_char(set2, i) && set_has_char(set1, i)) break; | |
831 | if (i > 127) break; | |
832 | char lower = i; | |
833 | for (i++; i <= 127; i++) | |
834 | if (!set_has_char(set2, i) || !set_has_char(set1, i)) break; | |
835 | char upper = i - 1; | |
836 | if (lower < upper) { | |
837 | char *range_str = print_range(lower, upper); | |
838 | TTCN_pattern_warning("Duplicate range `%s' in the character set.", | |
839 | range_str); | |
840 | Free(range_str); | |
841 | } else { | |
842 | char *char_str = print_character(lower); | |
843 | if(lower == '\r' ){ | |
844 | TTCN_pattern_warning("Duplicate character `%s' in the character " | |
845 | "set. Please note the \\n includes the \\r implicitly. " | |
846 | "Use \\q{0,0,0,10} if you would like to match the LF only.", char_str); | |
847 | } else { | |
848 | TTCN_pattern_warning("Duplicate character `%s' in the character " | |
849 | "set.", char_str); | |
850 | } | |
851 | Free(char_str); | |
852 | } | |
853 | } | |
854 | } | |
855 | ||
856 | static char *append_posix_body(char *set_body, const character_set *set) | |
857 | { | |
858 | for (unsigned char i = 0; i <= 127; ) { | |
859 | for (i++; i <= 127; i++) if (set_has_char(set, i)) break; | |
860 | if (i > 127) break; | |
861 | char lower = i; | |
862 | set_body = mputc(set_body, lower); | |
863 | for (i++; i <= 127; i++) if (!set_has_char(set, i)) break; | |
864 | char upper = i - 1; | |
865 | if (lower < upper) { | |
866 | if (lower + 1 < upper) set_body = mputc(set_body, '-'); | |
867 | set_body = mputc(set_body, upper); | |
868 | } | |
869 | } | |
870 | return set_body; | |
871 | } | |
872 | ||
873 | static char *generate_posix_body(character_set *set) | |
874 | { | |
875 | int has_caret; | |
876 | if (set_has_char(set, '^') && !(set_has_char(set, '^' - 1) && | |
877 | set_has_char(set, '^' + 1))) { | |
878 | set_remove_char(set, '^'); | |
879 | has_caret = 1; | |
880 | } else has_caret = 0; | |
881 | int has_dash; | |
882 | if (set_has_char(set, '-') && !(set_has_char(set, '-' - 1) && | |
883 | set_has_char(set, '-' + 1))) { | |
884 | set_remove_char(set, '-'); | |
885 | has_dash = 1; | |
886 | } else has_dash = 0; | |
887 | int has_rsbrkt; | |
888 | if (set_has_char(set, ']') && !(set_has_char(set, ']' - 1) && | |
889 | set_has_char(set, ']' + 1))) { | |
890 | set_remove_char(set, ']'); | |
891 | has_rsbrkt = 1; | |
892 | } else has_rsbrkt = 0; | |
893 | char *set_body = memptystr(); | |
894 | if (set_is_empty(set) && !has_rsbrkt) { | |
895 | /* the `-' must precede the `^' */ | |
896 | if (has_dash) set_body = mputc(set_body, '-'); | |
897 | if (has_caret) set_body = mputc(set_body, '^'); | |
898 | } else { | |
899 | /* order: ']', others, '^', '-' */ | |
900 | if (has_rsbrkt) set_body = mputc(set_body, ']'); | |
901 | set_body = append_posix_body(set_body, set); | |
902 | if (has_caret) set_body = mputc(set_body, '^'); | |
903 | if (has_dash) set_body = mputc(set_body, '-'); | |
904 | } | |
905 | return set_body; | |
906 | } | |
907 | ||
908 | static char *generate_posix_body_compl(character_set *set) | |
909 | { | |
910 | set_negate(set); | |
911 | int has_dash; | |
912 | if (set_has_char(set, '-') && !(set_has_char(set, '-' - 1) && | |
913 | set_has_char(set, '-' + 1))) { | |
914 | set_remove_char(set, '-'); | |
915 | has_dash = 1; | |
916 | } else has_dash = 0; | |
917 | int has_rsbrkt; | |
918 | if (set_has_char(set, ']') && !(set_has_char(set, ']' - 1) && | |
919 | set_has_char(set, ']' + 1))) { | |
920 | set_remove_char(set, ']'); | |
921 | has_rsbrkt = 1; | |
922 | } else has_rsbrkt = 0; | |
923 | char *set_body = mcopystr("^"); | |
924 | /* order: ']', others, '-' */ | |
925 | if (has_rsbrkt) set_body = mputc(set_body, ']'); | |
926 | set_body = append_posix_body(set_body, set); | |
927 | if (has_dash) set_body = mputc(set_body, '-'); | |
928 | return set_body; | |
929 | } | |
930 | ||
931 | char *set_generate_posix(const character_set *set) | |
932 | { | |
933 | /* a full set can only be represented in this way: */ | |
934 | if (set_is_full(set)) return mcopystr("."); | |
935 | character_set *tempset = set_copy(set); | |
936 | char *set_body = generate_posix_body(tempset); | |
937 | set_free(tempset); | |
938 | char *posix_str; | |
939 | if (set_body[0] == '\0') { | |
940 | Free(set_body); | |
941 | TTCN_pattern_error("Internal error: empty POSIX set."); | |
942 | return NULL; | |
943 | } | |
944 | /* do not use the set notation in POSIX if the set contains only one | |
945 | * character */ | |
946 | if (set_body[1] == '\0') posix_str = translate_character(set_body[0]); | |
947 | else { | |
948 | /* create the complemented version of the same set */ | |
949 | tempset = set_copy(set); | |
950 | char *compl_body = generate_posix_body_compl(tempset); | |
951 | set_free(tempset); | |
952 | if (compl_body[0] == '\0') { | |
953 | Free(set_body); | |
954 | Free(compl_body); | |
955 | TTCN_pattern_error("Internal error: empty complemented POSIX set."); | |
956 | return NULL; | |
957 | } | |
958 | /* use the complemented form in the POSIX equivalent if it is the shorter | |
959 | * one */ | |
960 | if (mstrlen(compl_body) < mstrlen(set_body)) | |
961 | posix_str = mprintf("[%s]", compl_body); | |
962 | else posix_str = mprintf("[%s]", set_body); | |
963 | Free(compl_body); | |
964 | } | |
965 | Free(set_body); | |
966 | return posix_str; | |
967 | } | |
968 | ||
969 | void yyprint(FILE *file, int type, const YYSTYPE& value) | |
970 | { | |
971 | switch (type) { | |
972 | case TOK_Char: | |
973 | fprintf(file, "'%c'", value.c); | |
974 | break; | |
975 | case TOK_Digit: case TOK_Number: | |
976 | fprintf(file, "'%lu'", value.u); | |
977 | break; | |
978 | default: | |
979 | break; | |
980 | } | |
981 | } | |
982 |