Commit | Line | Data |
---|---|---|
970ed795 EL |
1 | /****************************************************************************** |
2 | * Copyright (c) 2000-2014 Ericsson Telecom AB | |
3 | * All rights reserved. This program and the accompanying materials | |
4 | * are made available under the terms of the Eclipse Public License v1.0 | |
5 | * which accompanies this distribution, and is available at | |
6 | * http://www.eclipse.org/legal/epl-v10.html | |
7 | ******************************************************************************/ | |
8 | ||
9 | /** | |
10 | * Parser for TTCN-3 character patterns. | |
11 | * | |
12 | * \author Matyas Forstner (Matyas.Forstner@eth.ericsson.se) | |
13 | * | |
14 | * 20031121 | |
15 | */ | |
16 | ||
17 | %{ | |
18 | ||
19 | /********************************************************************* | |
20 | * C(++) declarations | |
21 | *********************************************************************/ | |
22 | ||
23 | #include <stdio.h> | |
24 | #include <string.h> | |
25 | #include <ctype.h> | |
26 | #if defined(__CYGWIN__) && defined(__clang__) | |
27 | /* Cygwin's clang 3.0 has its own limits.h, which does not bring in | |
28 | the system's limits.h unless we define this macro: */ | |
29 | #define __STDC_HOSTED__ 1 | |
30 | #define _GCC_NEXT_LIMITS_H | |
31 | #endif | |
32 | #include <limits.h> | |
33 | ||
34 | #include <regex.h> | |
35 | #if !defined(RE_DUP_MAX) | |
36 | /* RE_DUP_MAX is defined in limits.h or regex.h, except on Cygwin 1.5 */ | |
37 | # include <sys/syslimits.h> | |
38 | #endif | |
39 | ||
40 | #include "memory.h" | |
41 | #include "pattern.hh" | |
42 | ||
43 | /* defined in lexer c-file: */ | |
44 | ||
45 | union YYSTYPE; | |
46 | extern int pattern_yylex(); | |
47 | extern void init_pattern_yylex(YYSTYPE *p); | |
48 | struct yy_buffer_state; | |
49 | extern yy_buffer_state* pattern_yy_scan_string(const char*); | |
50 | extern int pattern_yylex_destroy(); | |
51 | extern unsigned int get_nof_parentheses(); | |
52 | ||
53 | /* defined in this file: */ | |
54 | ||
55 | /** The converted regexp. */ | |
56 | static char *ret_val; | |
57 | /** The parser error reporting function. */ | |
58 | static void pattern_yyerror(const char *error_str); | |
59 | /** Creates the POSIX equivalent of literal character \a c using the | |
60 | * appropriate escape sequence when needed. */ | |
61 | static char *translate_character(char c); | |
62 | /** Returns the printable equivalent of character \a c */ | |
63 | static char *print_character(char c); | |
64 | /** Returns the printable equivalent of range \a lower .. \a upper */ | |
65 | static char *print_range(char lower, char upper); | |
66 | /** structure for manipulating character sets */ | |
67 | struct character_set; | |
68 | /** allocates, initializes and returns a new empty set */ | |
69 | static character_set *set_init(); | |
70 | /** allocates and returns a copy of \a set */ | |
71 | static character_set *set_copy(const character_set *set); | |
72 | /** deallocates set \a set */ | |
73 | static void set_free(character_set *set); | |
74 | /** returns whether set \a set is empty */ | |
75 | static int set_is_empty(const character_set *set); | |
76 | /** returns whether set \a set contains all characters in range 1..127 */ | |
77 | static int set_is_full(const character_set *set); | |
78 | /** returns whether set \a set contains the character \a c */ | |
79 | static int set_has_char(const character_set *set, char c); | |
80 | /** adds character \a c to set \a set */ | |
81 | static void set_add_char(character_set *set, char c); | |
82 | /** removes character \a c from set \a set */ | |
83 | static void set_remove_char(character_set *set, char c); | |
84 | /** returns whether set \a set contains at least one character in the range | |
85 | * \a lower .. \a upper */ | |
86 | static int set_has_range(const character_set *set, char lower, char upper); | |
87 | /** adds range \a lower .. \a upper to set \a set */ | |
88 | static void set_add_range(character_set *set, char lower, char upper); | |
89 | /** returns whether sets \a set1 and \a set2 have a non-empty intersection */ | |
90 | static int set_has_intersect(const character_set *set1, | |
91 | const character_set *set2); | |
92 | /** joins sets \a dst and \a src into \a dst */ | |
93 | static void set_join(character_set *dst, const character_set *src); | |
94 | /** negates the set \a set */ | |
95 | static void set_negate(character_set *set); | |
96 | /** reports the duplicate occurrences of characters and ranges in \a set1 | |
97 | * and \a set2 */ | |
98 | static void set_report_duplicates(const character_set *set1, | |
99 | const character_set *set2); | |
100 | /** generates the POSIX equivalent of \a set */ | |
101 | static char *set_generate_posix(const character_set *set); | |
102 | ||
103 | #define YYERROR_VERBOSE | |
104 | ||
105 | static void yyprint(FILE *file, int type, const YYSTYPE& value); | |
106 | #define YYPRINT(f,t,v) yyprint(f,t,v) | |
107 | ||
108 | %} | |
109 | ||
110 | /********************************************************************* | |
111 | * Bison declarations | |
112 | *********************************************************************/ | |
113 | ||
114 | %name-prefix="pattern_yy" | |
115 | %output="pattern_p.cc" | |
116 | %defines | |
117 | %verbose | |
118 | %expect 0 | |
119 | %start Pattern | |
120 | %debug | |
121 | ||
122 | /********************************************************************* | |
123 | * The union-type | |
124 | * Must be kept in sync with the one in pattern_uni.y ! | |
125 | *********************************************************************/ | |
126 | ||
%union {
  /* boolean flag */
  int b;
  /* one plain character */
  char c;
  /* character string */
  char *s;
  /* unsigned integer value */
  unsigned long int u;
  /* character set; used by nonterminals in pattern_p.y */
  struct character_set *set;

  /* single universal character; used by nonterminals in pattern_uni.y.
   * The order of the four components is reversed on SPARC -- presumably so
   * that `comp' overlays `value' the same way regardless of byte order
   * (TODO confirm against pattern_uni.y). */
  union {
    unsigned int value;
#if defined(__sparc__) || defined(__sparc)
    struct {
      unsigned char group;
      unsigned char plane;
      unsigned char row;
      unsigned char cell;
    } comp;
#else
    struct {
      unsigned char cell;
      unsigned char row;
      unsigned char plane;
      unsigned char group;
    } comp;
#endif
  } q;
  /* set of universal characters; used by nonterminals in pattern_uni.y */
  class QuadSet* qset;
}
154 | ||
155 | /********************************************************************* | |
156 | * Tokens | |
157 | *********************************************************************/ | |
158 | ||
159 | %token <c> TOK_Char "<ordinary character>" | |
160 | %token <u> TOK_Number "<number>" | |
161 | %token <u> TOK_Digit "<digit>" | |
162 | ||
163 | /********************************************************************* | |
164 | * Keywords | |
165 | *********************************************************************/ | |
166 | ||
167 | %token KW_BS_q "\\q" | |
168 | %token KW_BS_d "\\d" | |
169 | %token KW_BS_w "\\w" | |
170 | %token KW_BS_t "\\t" | |
171 | %token KW_BS_n "\\n" | |
172 | %token KW_BS_r "\\r" | |
173 | %token KW_BS_s "\\s" | |
174 | %token KW_BS_b "\\b" | |
175 | ||
176 | %token KW_Group_Begin "(" | |
177 | %token KW_Group_End ")" | |
178 | %token KW_Set_Begin "[" | |
179 | %token KW_Set_Begin_Neg "[^" | |
180 | %token KW_Set_Begin_Rsbrkt "[]" | |
181 | %token KW_Set_Begin_Neg_Rsbrkt "[^]" | |
182 | %token KW_Set_End "]" | |
183 | %token KW_Set_Dash_End "-]" | |
184 | ||
185 | /********************************************************************* | |
186 | * semantic types of nonterminals | |
187 | *********************************************************************/ | |
188 | ||
189 | %type <b> RE_Set_Begin RE_Set_Begin_Rsbrkt RE_Set_End | |
190 | %type <c> RE_Set_Range_Char RE_Quadruple | |
191 | %type <s> RE_Body RE_Elems RE_Alter_Elem RE_Concat_Elem | |
192 | RE_Multiply_Elem RE_Multiply_Statement RE_Group | |
193 | RE_OneCharPos | |
194 | %type <set> RE_Set RE_Set_Body RE_Set_Elem RE_Set_NoRange_Char | |
195 | ||
196 | /********************************************************************* | |
197 | * Destructors | |
198 | *********************************************************************/ | |
199 | ||
200 | %destructor { Free($$); } | |
201 | RE_Alter_Elem | |
202 | RE_Body | |
203 | RE_Concat_Elem | |
204 | RE_Elems | |
205 | RE_Group | |
206 | RE_Multiply_Elem | |
207 | RE_Multiply_Statement | |
208 | RE_OneCharPos | |
209 | ||
210 | %destructor { set_free($$); } | |
211 | RE_Set | |
212 | RE_Set_Body | |
213 | RE_Set_Elem | |
214 | RE_Set_NoRange_Char | |
215 | ||
216 | %% | |
217 | ||
218 | /********************************************************************* | |
219 | * Grammar | |
220 | *********************************************************************/ | |
221 | ||
/* Start symbol: hands the finished regular expression over to the caller
 * through the file-level variable ret_val. */
Pattern:
  RE_Body
  {
    ret_val = $1;
  }
;
225 | ||
/* Whole pattern: the translated expression is anchored with `^' and `$'.
 * An empty pattern, or one whose elements all vanished (NULL), becomes "^$". */
RE_Body:
  /* empty */
  {
    $$ = mcopystr("^$");
  }
| RE_Elems
  {
    if ($1 == NULL) {
      $$ = mcopystr("^$");
    } else {
      $$ = mprintf("^%s$", $1);
      Free($1);
    }
  }
;
239 | ||
/* Alternation.  Outside of any group every alternative gets its own anchors
 * ("$|^"); inside a group a plain "|" is emitted.  A missing (NULL)
 * alternative is written as the empty group "()". */
RE_Elems:
  RE_Alter_Elem
  {
    $$ = $1;
  }
| RE_Elems '|' RE_Alter_Elem
  {
    /* the look-ahead token is counted as well when it is a closing
     * parenthesis */
    const unsigned int open_groups =
      get_nof_parentheses() + (yychar==KW_Group_End ? 1 : 0);
    if ($3 == NULL) {
      if ($1 == NULL) $$ = NULL;
      else $$ = mputstr($1, open_groups ? "|()" : "$|^()");
    } else {
      if ($1 == NULL) $$ = mprintf(open_groups ? "()|%s" : "()$|^%s", $3);
      else $$ = mputprintf($1, open_groups ? "|%s" : "$|^%s", $3);
      Free($3);
    }
  }
;
255 | ||
/* One alternative: a sequence of elements whose translations are simply
 * concatenated. */
RE_Alter_Elem:
  RE_Concat_Elem
  {
    $$ = $1;
  }
| RE_Alter_Elem RE_Concat_Elem
  {
    /* append the next element, then release its string */
    $$ = mputstr($1, $2);
    Free($2);
  }
;
264 | ||
/* One element of a sequence: an atom, an atom followed by a repetition
 * specifier, or the `*' wildcard. */
RE_Concat_Elem:
  RE_Multiply_Elem
  {
    $$ = $1;
  }
| RE_Multiply_Elem RE_Multiply_Statement
  {
    if ($1 == NULL || $2 == NULL) {
      /* either part is missing: the whole element is dropped */
      Free($1);
      Free($2);
      $$ = NULL;
    } else {
      $$ = mputstr($1, $2);
      Free($2);
    }
  }
| '*'
  {
    $$ = mcopystr(".*");
  }
;
280 | ||
/* An atom that may carry a repetition specifier: a group or a construct
 * matching a single character position. */
RE_Multiply_Elem:
  RE_Group
  {
    $$ = $1;
  }
| RE_OneCharPos
  {
    $$ = $1;
  }
;
285 | ||
/* Parenthesized group.  A group without usable content is translated to the
 * empty group "()". */
RE_Group:
  KW_Group_Begin KW_Group_End
  {
    $$ = mcopystr("()");
  }
| KW_Group_Begin RE_Elems KW_Group_End
  {
    if ($2 == NULL) {
      $$ = mcopystr("()");
    } else {
      $$ = mprintf("(%s)", $2);
      Free($2);
    }
  }
;
301 | ||
/* Repetition specifiers.  The result is the POSIX quantifier text, an empty
 * string when exactly one repetition is requested, or NULL when the number
 * of repetitions is zero. */
RE_Multiply_Statement:
  '+'
  {
    $$ = mcopystr("+");
  }
| '#' '(' ',' ')'
  {
    /* neither bound given */
    $$ = mcopystr("*");
  }
| '#' TOK_Digit
  {
    if ($2 >= 2) {
      if ($2 > 9) TTCN_pattern_warning("Internal error: Invalid number of "
        "repetitions: `#%lu'.", $2);
      $$ = mprintf("{%lu}", $2);
    } else if ($2 == 1) {
      $$ = memptystr();
    } else {
      TTCN_pattern_warning("The number of repetitions is zero: `#0'.");
      $$ = NULL;
    }
  }
| '#' '(' TOK_Number ')'
  {
    if ($3 >= 2) {
#ifdef RE_DUP_MAX
      if ($3 > RE_DUP_MAX) TTCN_pattern_warning("The number of repetitions in "
        "`#(%lu)' exceeds the limit allowed by this system (%d).", $3,
        RE_DUP_MAX);
#endif
      $$ = mprintf("{%lu}", $3);
    } else if ($3 == 1) {
      $$ = memptystr();
    } else {
      TTCN_pattern_warning("The number of repetitions is zero: `#(0)'.");
      $$ = NULL;
    }
  }
| '#' '(' TOK_Number ',' TOK_Number ')'
  {
#ifdef RE_DUP_MAX
    if ($3 > RE_DUP_MAX) TTCN_pattern_warning("The minimum number of "
      "repetitions in `#(%lu,%lu)' exceeds the limit allowed by this system "
      "(%d).", $3, $5, RE_DUP_MAX);
    if ($5 > RE_DUP_MAX) TTCN_pattern_warning("The maximum number of "
      "repetitions in `#(%lu,%lu)' exceeds the limit allowed by this system "
      "(%d).", $3, $5, RE_DUP_MAX);
#endif
    if ($3 > $5) TTCN_pattern_error("The lower bound is higher than the upper "
      "bound in the number of repetitions: `#(%lu,%lu)'.", $3, $5);
    if ($3 != $5) {
      /* a real interval; `#(0,1)' has the shorthand `?' */
      if ($3 == 0 && $5 == 1) $$ = mcopystr("?");
      else $$ = mprintf("{%lu,%lu}", $3, $5);
    } else if ($3 == 0) {
      TTCN_pattern_warning("The number of repetitions is zero: `#(0,0)'.");
      $$ = NULL;
    } else if ($3 == 1) {
      $$ = memptystr();
    } else {
      $$ = mprintf("{%lu}", $3);
    }
  }
| '#' '(' ',' TOK_Number ')'
  {
    if ($4 != 0) {
#ifdef RE_DUP_MAX
      if ($4 > RE_DUP_MAX) TTCN_pattern_warning("The maximum number of "
        "repetitions in `#(,%lu)' exceeds the limit allowed by this system "
        "(%d).", $4, RE_DUP_MAX);
#endif
      if ($4 != 1) $$ = mprintf("{0,%lu}", $4);
      else $$ = mcopystr("?");
    } else {
      TTCN_pattern_warning("The number of repetitions is zero: `#(,0)'.");
      $$ = NULL;
    }
  }
| '#' '(' TOK_Number ',' ')'
  {
    if ($3 != 0) {
#ifdef RE_DUP_MAX
      if ($3 > RE_DUP_MAX) TTCN_pattern_warning("The minimum number of "
        "repetitions in `#(%lu,)' exceeds the limit allowed by this system "
        "(%d).", $3, RE_DUP_MAX);
#endif
      if ($3 != 1) $$ = mprintf("{%lu,}", $3);
      else $$ = mcopystr("+");
    } else {
      $$ = mcopystr("*");
    }
  }
;
392 | ||
/* Constructs that match exactly one character position: the `?' wildcard,
 * backslash metacharacters, literal characters, quadruples and character
 * sets. */
RE_OneCharPos:
  '?'
  {
    $$ = mcopystr(".");
  }
| KW_BS_d
  {
    $$ = mcopystr("[0-9]");
  }
| KW_BS_w
  {
    $$ = mcopystr("[0-9A-Za-z]");
  }
| KW_BS_t
  {
    $$ = mcopystr("\t");
  }
| KW_BS_n
  {
    $$ = mcopystr("[\n-\r]");
  }
| KW_BS_r
  {
    $$ = mcopystr("\r");
  }
| KW_BS_s
  {
    $$ = mcopystr("[\t-\r ]");
  }
| KW_BS_b
  {
    TTCN_pattern_warning("Metacharacter `\\b' is not supported yet.");
    $$ = NULL;
  }
| TOK_Char
  {
    /* only character codes 1..127 are accepted without an error */
    const unsigned int code = (unsigned char)$1;
    if (code < 1 || code > 127) TTCN_pattern_error("Character with code %u "
      "(0x%02x) cannot be used in a pattern for type charstring.", code, code);
    $$ = translate_character($1);
  }
| RE_Quadruple
  {
    $$ = translate_character($1);
  }
| RE_Set
  {
    if (!set_is_empty($1)) {
      $$ = set_generate_posix($1);
    } else {
      TTCN_pattern_error("Empty character set.");
      $$ = NULL;
    }
    /* the set object is not needed once its text form exists */
    set_free($1);
  }
;
426 | ||
/* Character set in brackets.
 * RE_Set_Begin is 1 for "[^", 0 for "["
 * RE_Set_Begin_Rsbrkt is 1 for "[^]", 0 for "[]"
 * RE_Set_End is 1 for "-]", 0 for "]"
 * The body may be NULL (no elements); an empty set object is created then.
 * A negated opening inverts the collected set as the last step. */
RE_Set:
  RE_Set_Begin RE_Set_Body RE_Set_End
  {
    $$ = ($2 != NULL) ? $2 : set_init();
    if ($3) {
      /* closing "-]" contributes a literal dash */
      if (!set_has_char($$, '-')) set_add_char($$, '-');
      else TTCN_pattern_warning("Duplicate character `-' in the character set.");
    }
    if ($1) set_negate($$);
  }
| RE_Set_Begin '-' RE_Set_Body RE_Set_End
  {
    $$ = ($3 != NULL) ? $3 : set_init();
    /* the dash right after the opening bracket is a literal dash */
    if (!set_has_char($$, '-')) set_add_char($$, '-');
    else TTCN_pattern_warning("Duplicate character `-' in the character set.");
    if ($4) {
      if (!set_has_char($$, '-')) set_add_char($$, '-');
      else TTCN_pattern_warning("Duplicate character `-' in the character set.");
    }
    if ($1) set_negate($$);
  }
| RE_Set_Begin_Rsbrkt RE_Set_Body RE_Set_End
  {
    $$ = ($2 != NULL) ? $2 : set_init();
    /* "[]" / "[^]": the bracket right after the opening is a literal `]' */
    if (!set_has_char($$, ']')) set_add_char($$, ']');
    else TTCN_pattern_warning("Duplicate character `]' in the character set.");
    if ($3) {
      if (!set_has_char($$, '-')) set_add_char($$, '-');
      else TTCN_pattern_warning("Duplicate character `-' in the character set.");
    }
    if ($1) set_negate($$);
  }
| RE_Set_Begin_Rsbrkt '-' RE_Set_Range_Char RE_Set_Body RE_Set_End
  {
    $$ = ($4 != NULL) ? $4 : set_init();
    /* the literal `]' is the lower bound of a range here */
    if (']' > $3) {
      char *range_str = print_range(']', $3);
      TTCN_pattern_error("Invalid range `%s' in the character set: the "
        "character code of the lower bound (%u) is higher than that of the "
        "upper bound (%u).", range_str, ']', (unsigned char)$3);
      Free(range_str);
    } else if (set_has_range($$, ']', $3)) {
      /* report the overlap through a temporary set holding only the range */
      character_set *range_only = set_init();
      set_add_range(range_only, ']', $3);
      set_report_duplicates($$, range_only);
      set_free(range_only);
    }
    set_add_range($$, ']', $3);
    if ($5) {
      if (!set_has_char($$, '-')) set_add_char($$, '-');
      else TTCN_pattern_warning("Duplicate character `-' in the character set.");
    }
    if ($1) set_negate($$);
  }
;
498 | ||
/* Opening of a set: the value is 1 for the negated form "[^", 0 for "[". */
RE_Set_Begin:
  KW_Set_Begin
  {
    $$ = 0;
  }
| KW_Set_Begin_Neg
  {
    $$ = 1;
  }
;
503 | ||
/* Opening of a set immediately followed by `]': the value is 1 for the
 * negated form "[^]", 0 for "[]". */
RE_Set_Begin_Rsbrkt:
  KW_Set_Begin_Rsbrkt
  {
    $$ = 0;
  }
| KW_Set_Begin_Neg_Rsbrkt
  {
    $$ = 1;
  }
;
508 | ||
/* Closing of a set: the value is 1 for "-]" (trailing dash), 0 for "]". */
RE_Set_End:
  KW_Set_End
  {
    $$ = 0;
  }
| KW_Set_Dash_End
  {
    $$ = 1;
  }
;
513 | ||
/* Elements between the brackets, merged into one set.  The value is NULL as
 * long as no element has been seen. */
RE_Set_Body:
  /* empty */
  {
    $$ = NULL;
  }
| RE_Set_Body RE_Set_Elem
  {
    if ($1 == NULL) {
      /* first element: take its set over as it is */
      $$ = $2;
    } else {
      $$ = $1;
      if (set_has_intersect($$, $2)) set_report_duplicates($$, $2);
      set_join($$, $2);
      set_free($2);
    }
  }
;
526 | ||
/* One element of a set: a single character, a metacharacter standing for
 * several characters, or a range `lower-upper'.  Each yields a new set. */
RE_Set_Elem:
  RE_Set_Range_Char
  {
    $$ = set_init();
    set_add_char($$, $1);
  }
| RE_Set_NoRange_Char
  {
    $$ = $1;
  }
| RE_Set_Range_Char '-' RE_Set_Range_Char
  {
    if ($1 > $3) {
      /* reversed bounds are reported; the range is still passed on below */
      char *bad_range = print_range($1, $3);
      TTCN_pattern_error("Invalid range `%s' in the character set: the "
        "character code of the lower bound (%u) is higher than that of the "
        "upper bound (%u).", bad_range, (unsigned char)$1, (unsigned char)$3);
      Free(bad_range);
    }
    $$ = set_init();
    set_add_range($$, $1, $3);
  }
;
547 | ||
/* Characters that may stand alone in a set or serve as a range bound. */
RE_Set_Range_Char:
  KW_BS_t
  {
    $$ = '\t';
  }
| KW_BS_r
  {
    $$ = '\r';
  }
| TOK_Char
  {
    /* only character codes 1..127 are accepted without an error */
    const unsigned int code = (unsigned char)$1;
    if (code < 1 || code > 127) TTCN_pattern_error("Character with code %u "
      "(0x%02x) cannot be used in a pattern for type charstring.", code, code);
    $$ = $1;
  }
| RE_Quadruple
  {
    $$ = $1;
  }
;
560 | ||
/* Metacharacters inside a set that stand for several characters and thus
 * cannot be used as a range bound.  Each yields a new set. */
RE_Set_NoRange_Char:
  KW_BS_d
  {
    /* decimal digits */
    $$ = set_init();
    set_add_range($$, '0', '9');
  }
| KW_BS_w
  {
    /* digits and letters */
    $$ = set_init();
    set_add_range($$, '0', '9');
    set_add_range($$, 'A', 'Z');
    set_add_range($$, 'a', 'z');
  }
| KW_BS_n
  {
    /* every character from LF up to and including CR */
    $$ = set_init();
    set_add_range($$, '\n', '\r');
  }
| KW_BS_s
  {
    /* every character from TAB up to CR, plus the space */
    $$ = set_init();
    set_add_range($$, '\t', '\r');
    set_add_char($$, ' ');
  }
| KW_BS_b
  {
    TTCN_pattern_error("Metacharacter `\\b' does not make any sense in a "
      "character set.");
    /* an empty set is handed on after the error */
    $$ = set_init();
  }
;
592 | ||
/* Quadruple notation \q{group,plane,row,cell}.  Every component is range
 * checked; for a charstring pattern only \q{0,0,0,1} .. \q{0,0,0,127} passes
 * without an error.  The value is the cell converted to char. */
RE_Quadruple:
  KW_BS_q '{' TOK_Number ',' TOK_Number ',' TOK_Number ',' TOK_Number '}'
  {
    /* per-component limits of a universal character */
    if ($3 > 127) TTCN_pattern_error("The first number (group) of quadruple "
      "`\\q{%lu,%lu,%lu,%lu}' is too large. It should be in the range 0..127 "
      "instead of %lu.", $3, $5, $7, $9, $3);
    if ($5 > 255) TTCN_pattern_error("The second number (plane) of quadruple "
      "`\\q{%lu,%lu,%lu,%lu}' is too large. It should be in the range 0..255 "
      "instead of %lu.", $3, $5, $7, $9, $5);
    if ($7 > 255) TTCN_pattern_error("The third number (row) of quadruple "
      "`\\q{%lu,%lu,%lu,%lu}' is too large. It should be in the range 0..255 "
      "instead of %lu.", $3, $5, $7, $9, $7);
    if ($9 > 255) TTCN_pattern_error("The fourth number (cell) of quadruple "
      "`\\q{%lu,%lu,%lu,%lu}' is too large. It should be in the range 0..255 "
      "instead of %lu.", $3, $5, $7, $9, $9);
    /* additional restrictions of type charstring */
    if ($3 != 0 || $5 != 0 || $7 != 0 || $9 > 127) TTCN_pattern_error("Quadruple "
      "`\\q{%lu,%lu,%lu,%lu}' is not valid in a pattern for type charstring.",
      $3, $5, $7, $9);
    if ($3 == 0 && $5 == 0 && $7 == 0 && $9 == 0) TTCN_pattern_error("Zero "
      "character (i.e. quadruple `\\q{0,0,0,0}') is not supported in a "
      "pattern for type charstring.");
    $$ = (char)$9;
  }
;
617 | ||
618 | %% | |
619 | ||
620 | /********************************************************************* | |
621 | * Interface | |
622 | *********************************************************************/ | |
623 | ||
624 | char* TTCN_pattern_to_regexp(const char* p_pattern) | |
625 | { | |
626 | /* if you want to debug */ | |
627 | //pattern_yydebug=1; | |
628 | ||
629 | ret_val=NULL; | |
630 | ||
631 | yy_buffer_state *flex_buffer = pattern_yy_scan_string(p_pattern); | |
632 | if(flex_buffer == NULL) { | |
633 | TTCN_pattern_error("Flex buffer creation failed."); | |
634 | return NULL; | |
635 | } | |
636 | init_pattern_yylex(&yylval); | |
637 | if(pattern_yyparse()) { | |
638 | Free(ret_val); | |
639 | ret_val=NULL; | |
640 | } | |
641 | pattern_yylex_destroy(); | |
642 | return ret_val; | |
643 | } | |
644 | ||
645 | // Backwards compatibility shim | |
646 | char* TTCN_pattern_to_regexp(const char* p_pattern, int ere) | |
647 | { | |
648 | TTCN_pattern_warning( | |
649 | "TTCN_pattern_to_regexp(const char* p_pattern, int ere) is deprecated"); | |
650 | if (ere != 1) TTCN_pattern_error( | |
651 | "BRE is not supported for TTCN_pattern_to_regexp"); | |
652 | return TTCN_pattern_to_regexp(p_pattern); | |
653 | } | |
654 | ||
655 | /********************************************************************* | |
656 | * Static functions | |
657 | *********************************************************************/ | |
658 | ||
659 | /// Error reporting function | |
660 | void pattern_yyerror(const char *error_str) | |
661 | { | |
662 | TTCN_pattern_error("%s", error_str); | |
663 | } | |
664 | ||
665 | /** Escape plain characters which would be metacharacters in a regex. | |
666 | * | |
667 | * @param c plain character | |
668 | * @return a newly allocated string which must be Free() 'd | |
669 | */ | |
670 | char *translate_character(char c) | |
671 | { | |
672 | int escape_needed = 0; | |
673 | switch (c) { | |
674 | case '|': | |
675 | case '+': | |
676 | case '?': | |
677 | case '{': | |
678 | case '}': | |
679 | case '(': | |
680 | case ')': | |
681 | case '.': | |
682 | case '^': | |
683 | case '$': | |
684 | case '[': | |
685 | case '*': | |
686 | case '\\': | |
687 | escape_needed = 1; | |
688 | break; | |
689 | } | |
690 | if (escape_needed) return mprintf("\\%c", c); | |
691 | else return mputc(NULL, c); | |
692 | } | |
693 | ||
694 | char *print_character(char c) | |
695 | { | |
696 | switch (c) { | |
697 | case '\t': | |
698 | return mcopystr("\\t"); | |
699 | case '\r': | |
700 | return mcopystr("\\r"); | |
701 | default: | |
702 | if (isprint((unsigned char)c)) return mprintf("%c", c); | |
703 | else return mprintf("\\q{0,0,0,%u}", (unsigned char)c); | |
704 | } | |
705 | } | |
706 | ||
707 | char *print_range(char lower, char upper) | |
708 | { | |
709 | char *range_str = print_character(lower); | |
710 | range_str = mputc(range_str, '-'); | |
711 | char *upper_str = print_character(upper); | |
712 | range_str = mputstr(range_str, upper_str); | |
713 | Free(upper_str); | |
714 | return range_str; | |
715 | } | |
716 | ||
/* Number of bits held by one storage word of a character set. */
#define CS_BITS_PER_ELEM (8 * sizeof(unsigned long))
/* Number of storage words needed for 128 bits (rounded up). */
#define CS_NOF_ELEMS ((128 + CS_BITS_PER_ELEM - 1) / CS_BITS_PER_ELEM)

/** Bit set with one bit per character code: the bit of code `c' is bit
 * `c % CS_BITS_PER_ELEM' of word `c / CS_BITS_PER_ELEM'. */
struct character_set {
  unsigned long set_members[CS_NOF_ELEMS];
};
723 | ||
724 | character_set *set_init() | |
725 | { | |
726 | character_set *set = (character_set*)Malloc(sizeof(*set)); | |
727 | memset(set->set_members, 0, sizeof(set->set_members)); | |
728 | return set; | |
729 | } | |
730 | ||
731 | character_set *set_copy(const character_set *set) | |
732 | { | |
733 | character_set *set2 = (character_set*)Malloc(sizeof(*set2)); | |
734 | memcpy(set2, set, sizeof(*set2)); | |
735 | return set2; | |
736 | } | |
737 | ||
738 | void set_free(character_set *set) | |
739 | { | |
740 | Free(set); | |
741 | } | |
742 | ||
743 | int set_is_empty(const character_set *set) | |
744 | { | |
745 | if ((set->set_members[0] & ~1UL) != 0) return 0; | |
746 | for (size_t i = 1; i < CS_NOF_ELEMS; i++) | |
747 | if (set->set_members[i] != 0) return 0; | |
748 | return 1; | |
749 | } | |
750 | ||
751 | int set_is_full(const character_set *set) | |
752 | { | |
753 | if (~(set->set_members[0] | 1UL) != 0) return 0; | |
754 | for (size_t i = 1; i < CS_NOF_ELEMS; i++) | |
755 | if (~set->set_members[i] != 0) return 0; | |
756 | return 1; | |
757 | } | |
758 | ||
759 | int set_has_char(const character_set *set, char c) | |
760 | { | |
761 | if (set->set_members[c / CS_BITS_PER_ELEM] & 1UL << c % CS_BITS_PER_ELEM) | |
762 | return 1; | |
763 | else return 0; | |
764 | } | |
765 | ||
766 | void set_add_char(character_set *set, char c) | |
767 | { | |
768 | set->set_members[c / CS_BITS_PER_ELEM] |= 1UL << c % CS_BITS_PER_ELEM; | |
769 | } | |
770 | ||
771 | void set_remove_char(character_set *set, char c) | |
772 | { | |
773 | set->set_members[c / CS_BITS_PER_ELEM] &= ~(1UL << c % CS_BITS_PER_ELEM); | |
774 | } | |
775 | ||
776 | int set_has_range(const character_set *set, char lower, char upper) | |
777 | { | |
778 | for (size_t i = lower; i <= (unsigned char)upper; i++) | |
779 | if (set->set_members[i / CS_BITS_PER_ELEM] & 1UL << i % CS_BITS_PER_ELEM) | |
780 | return 1; | |
781 | return 0; | |
782 | } | |
783 | ||
784 | void set_add_range(character_set *set, char lower, char upper) | |
785 | { | |
786 | for (size_t i = lower; i <= (unsigned char)upper; i++) | |
787 | set->set_members[i / CS_BITS_PER_ELEM] |= 1UL << i % CS_BITS_PER_ELEM; | |
788 | } | |
789 | ||
790 | int set_has_intersect(const character_set *set1, const character_set *set2) | |
791 | { | |
792 | for (size_t i = 0; i < CS_NOF_ELEMS; i++) | |
793 | if (set1->set_members[i] & set2->set_members[i]) return 1; | |
794 | return 0; | |
795 | } | |
796 | ||
797 | void set_join(character_set *dst, const character_set *src) | |
798 | { | |
799 | for (size_t i = 0; i < CS_NOF_ELEMS; i++) | |
800 | dst->set_members[i] |= src->set_members[i]; | |
801 | } | |
802 | ||
803 | void set_negate(character_set *set) | |
804 | { | |
805 | for (size_t i = 0; i < CS_NOF_ELEMS; i++) | |
806 | set->set_members[i] = ~set->set_members[i]; | |
807 | } | |
808 | ||
809 | void set_report_duplicates(const character_set *set1, | |
810 | const character_set *set2) | |
811 | { | |
812 | for (unsigned char i = 0; i <= 127; ) { | |
813 | for (i++; i <= 127; i++) | |
814 | if (set_has_char(set2, i) && set_has_char(set1, i)) break; | |
815 | if (i > 127) break; | |
816 | char lower = i; | |
817 | for (i++; i <= 127; i++) | |
818 | if (!set_has_char(set2, i) || !set_has_char(set1, i)) break; | |
819 | char upper = i - 1; | |
820 | if (lower < upper) { | |
821 | char *range_str = print_range(lower, upper); | |
822 | TTCN_pattern_warning("Duplicate range `%s' in the character set.", | |
823 | range_str); | |
824 | Free(range_str); | |
825 | } else { | |
826 | char *char_str = print_character(lower); | |
827 | if(lower == '\r' ){ | |
828 | TTCN_pattern_warning("Duplicate character `%s' in the character " | |
829 | "set. Please note the \\n includes the \\r implicitly. " | |
830 | "Use \\q{0,0,0,10} if you would like to match the LF only.", char_str); | |
831 | } else { | |
832 | TTCN_pattern_warning("Duplicate character `%s' in the character " | |
833 | "set.", char_str); | |
834 | } | |
835 | Free(char_str); | |
836 | } | |
837 | } | |
838 | } | |
839 | ||
840 | static char *append_posix_body(char *set_body, const character_set *set) | |
841 | { | |
842 | for (unsigned char i = 0; i <= 127; ) { | |
843 | for (i++; i <= 127; i++) if (set_has_char(set, i)) break; | |
844 | if (i > 127) break; | |
845 | char lower = i; | |
846 | set_body = mputc(set_body, lower); | |
847 | for (i++; i <= 127; i++) if (!set_has_char(set, i)) break; | |
848 | char upper = i - 1; | |
849 | if (lower < upper) { | |
850 | if (lower + 1 < upper) set_body = mputc(set_body, '-'); | |
851 | set_body = mputc(set_body, upper); | |
852 | } | |
853 | } | |
854 | return set_body; | |
855 | } | |
856 | ||
857 | static char *generate_posix_body(character_set *set) | |
858 | { | |
859 | int has_caret; | |
860 | if (set_has_char(set, '^') && !(set_has_char(set, '^' - 1) && | |
861 | set_has_char(set, '^' + 1))) { | |
862 | set_remove_char(set, '^'); | |
863 | has_caret = 1; | |
864 | } else has_caret = 0; | |
865 | int has_dash; | |
866 | if (set_has_char(set, '-') && !(set_has_char(set, '-' - 1) && | |
867 | set_has_char(set, '-' + 1))) { | |
868 | set_remove_char(set, '-'); | |
869 | has_dash = 1; | |
870 | } else has_dash = 0; | |
871 | int has_rsbrkt; | |
872 | if (set_has_char(set, ']') && !(set_has_char(set, ']' - 1) && | |
873 | set_has_char(set, ']' + 1))) { | |
874 | set_remove_char(set, ']'); | |
875 | has_rsbrkt = 1; | |
876 | } else has_rsbrkt = 0; | |
877 | char *set_body = memptystr(); | |
878 | if (set_is_empty(set) && !has_rsbrkt) { | |
879 | /* the `-' must precede the `^' */ | |
880 | if (has_dash) set_body = mputc(set_body, '-'); | |
881 | if (has_caret) set_body = mputc(set_body, '^'); | |
882 | } else { | |
883 | /* order: ']', others, '^', '-' */ | |
884 | if (has_rsbrkt) set_body = mputc(set_body, ']'); | |
885 | set_body = append_posix_body(set_body, set); | |
886 | if (has_caret) set_body = mputc(set_body, '^'); | |
887 | if (has_dash) set_body = mputc(set_body, '-'); | |
888 | } | |
889 | return set_body; | |
890 | } | |
891 | ||
892 | static char *generate_posix_body_compl(character_set *set) | |
893 | { | |
894 | set_negate(set); | |
895 | int has_dash; | |
896 | if (set_has_char(set, '-') && !(set_has_char(set, '-' - 1) && | |
897 | set_has_char(set, '-' + 1))) { | |
898 | set_remove_char(set, '-'); | |
899 | has_dash = 1; | |
900 | } else has_dash = 0; | |
901 | int has_rsbrkt; | |
902 | if (set_has_char(set, ']') && !(set_has_char(set, ']' - 1) && | |
903 | set_has_char(set, ']' + 1))) { | |
904 | set_remove_char(set, ']'); | |
905 | has_rsbrkt = 1; | |
906 | } else has_rsbrkt = 0; | |
907 | char *set_body = mcopystr("^"); | |
908 | /* order: ']', others, '-' */ | |
909 | if (has_rsbrkt) set_body = mputc(set_body, ']'); | |
910 | set_body = append_posix_body(set_body, set); | |
911 | if (has_dash) set_body = mputc(set_body, '-'); | |
912 | return set_body; | |
913 | } | |
914 | ||
915 | char *set_generate_posix(const character_set *set) | |
916 | { | |
917 | /* a full set can only be represented in this way: */ | |
918 | if (set_is_full(set)) return mcopystr("."); | |
919 | character_set *tempset = set_copy(set); | |
920 | char *set_body = generate_posix_body(tempset); | |
921 | set_free(tempset); | |
922 | char *posix_str; | |
923 | if (set_body[0] == '\0') { | |
924 | Free(set_body); | |
925 | TTCN_pattern_error("Internal error: empty POSIX set."); | |
926 | return NULL; | |
927 | } | |
928 | /* do not use the set notation in POSIX if the set contains only one | |
929 | * character */ | |
930 | if (set_body[1] == '\0') posix_str = translate_character(set_body[0]); | |
931 | else { | |
932 | /* create the complemented version of the same set */ | |
933 | tempset = set_copy(set); | |
934 | char *compl_body = generate_posix_body_compl(tempset); | |
935 | set_free(tempset); | |
936 | if (compl_body[0] == '\0') { | |
937 | Free(set_body); | |
938 | Free(compl_body); | |
939 | TTCN_pattern_error("Internal error: empty complemented POSIX set."); | |
940 | return NULL; | |
941 | } | |
942 | /* use the complemented form in the POSIX equivalent if it is the shorter | |
943 | * one */ | |
944 | if (mstrlen(compl_body) < mstrlen(set_body)) | |
945 | posix_str = mprintf("[%s]", compl_body); | |
946 | else posix_str = mprintf("[%s]", set_body); | |
947 | Free(compl_body); | |
948 | } | |
949 | Free(set_body); | |
950 | return posix_str; | |
951 | } | |
952 | ||
953 | void yyprint(FILE *file, int type, const YYSTYPE& value) | |
954 | { | |
955 | switch (type) { | |
956 | case TOK_Char: | |
957 | fprintf(file, "'%c'", value.c); | |
958 | break; | |
959 | case TOK_Digit: case TOK_Number: | |
960 | fprintf(file, "'%lu'", value.u); | |
961 | break; | |
962 | default: | |
963 | break; | |
964 | } | |
965 | } | |
966 |