8252f49a2c2266617d2c8a82477d439c7b70d2c8
[deliverable/titan.core.git] / compiler2 / PredefFunc.cc
1 /******************************************************************************
2 * Copyright (c) 2000-2016 Ericsson Telecom AB
3 * All rights reserved. This program and the accompanying materials
4 * are made available under the terms of the Eclipse Public License v1.0
5 * which accompanies this distribution, and is available at
6 * http://www.eclipse.org/legal/epl-v10.html
7 *
8 * Contributors:
9 * Baji, Laszlo
10 * Balasko, Jeno
11 * Baranyi, Botond
12 * Kovacs, Ferenc
13 * Raduly, Csaba
14 * Zalanyi, Balazs Andor
15 *
16 ******************************************************************************/
17 #include "PredefFunc.hh"
18 #include "error.h"
19 #include "Int.hh"
20 #include "Real.hh"
21 #include "Setting.hh"
22 #include "string.hh"
23 #include "ustring.hh"
24 #include "CompilerError.hh"
25 #include <stdio.h>
26 #include <sys/types.h>
27 #include <regex.h>
28 #include <stdint.h>
29 #include "../common/memory.h"
30 #include "../common/pattern.hh"
31 #include <iostream>
32
// Size (in bytes) of the local buffers that receive regerror() messages
// in the regexp() functions of this file.
#define ERRMSG_BUFSIZE 512
35
36 namespace Common {
37
// Byte order marks written as upper-case hexadecimal digit strings
// (two characters per octet), NUL terminated.
static const char utf32be[] = "0000FEFF";
static const char utf32le[] = "FFFE0000";
static const char utf16be[] = "FEFF";
static const char utf16le[] = "FFFE";
static const char utf8[] = "EFBBBF";
43
44 static inline unsigned char get_bit_value(char c, unsigned char bit_value)
45 {
46 switch (c) {
47 case '0':
48 return 0;
49 case '1':
50 return bit_value;
51 default:
52 FATAL_ERROR("Invalid binary digit (%c) in bitstring value", c);
53 return 0;
54 }
55 }
56
57 char toupper (const char c)
58 {
59 if (('A' <= c && 'F' >= c) ||
60 ('0' <= c && '9' >= c)) return c;
61 switch (c)
62 {
63 case 'a' : return 'A';
64 case 'b' : return 'B';
65 case 'c' : return 'C';
66 case 'd' : return 'D';
67 case 'e' : return 'E';
68 case 'f' : return 'F';
69 default:
70 FATAL_ERROR("%c cannot be converted to hex character", c);
71 break;
72 }
73 }
74
75 char hexdigit_to_char(unsigned char hexdigit)
76 {
77 if (hexdigit < 10) return '0' + hexdigit;
78 else if (hexdigit < 16) return 'A' + hexdigit - 10;
79 else {
80 FATAL_ERROR("hexdigit_to_char(): invalid argument: %d", hexdigit);
81 return '\0'; // to avoid warning
82 }
83 }
84
85 unsigned char char_to_hexdigit(char c)
86 {
87 if (c >= '0' && c <= '9') return c - '0';
88 else if (c >= 'A' && c <= 'F') return c - 'A' + 10;
89 else if (c >= 'a' && c <= 'f') return c - 'a' + 10;
90 else {
91 FATAL_ERROR("char_to_hexdigit(): invalid argument: %c", c);
92 return 0; // to avoid warning
93 }
94 }
95
96 string uchar2str(unsigned char uchar)
97 {
98 char str[2];
99 str[0] = hexdigit_to_char(uchar / 16);
100 str[1] = hexdigit_to_char(uchar % 16);
101 return string(2, str);
102 }
103
104 unsigned char str2uchar(const char& c1, const char& c2)
105 {
106 unsigned char uc = 0;
107 uc = char_to_hexdigit(c1);
108 uc <<= 4;
109 uc += char_to_hexdigit(c2);
110 return uc;
111 }
112
113 int_val_t rem(const int_val_t& left, const int_val_t& right)
114 {
115 return (left - right * (left / right));
116 }
117
118 int_val_t mod(const int_val_t& left, const int_val_t& right)
119 {
120 int_val_t r = right < 0 ? -right : right;
121 if (left > 0) {
122 return rem(left, r);
123 } else {
124 int_val_t result = rem(left, r);
125 return result == 0 ? result : result + r;
126 }
127 }
128
129 string* to_uppercase(const string& value)
130 {
131 string *s = new string(value);
132 for (size_t i = 0; i < s->size(); i++) {
133 char& c=(*s)[i];
134 if (c >= 'a' && c <= 'z') c = c - 'a' + 'A';
135 }
136 return s;
137 }
138
139 string* not4b_bit(const string& bstr)
140 {
141 string *s=new string(bstr);
142 for(size_t i=0; i<s->size(); i++) {
143 char& c=(*s)[i];
144 switch(c) {
145 case '0': c='1'; break;
146 case '1': c='0'; break;
147 default:
148 FATAL_ERROR("not4b_bit(): Invalid char in bitstring.");
149 } // switch c
150 } // for i
151 return s;
152 }
153
154 string* not4b_hex(const string& hstr)
155 {
156 string *s=new string(hstr);
157 for(size_t i=0; i<s->size(); i++) {
158 char& c=(*s)[i];
159 switch(c) {
160 case '0': c='F'; break;
161 case '1': c='E'; break;
162 case '2': c='D'; break;
163 case '3': c='C'; break;
164 case '4': c='B'; break;
165 case '5': c='A'; break;
166 case '6': c='9'; break;
167 case '7': c='8'; break;
168 case '8': c='7'; break;
169 case '9': c='6'; break;
170 case 'A': c='5'; break;
171 case 'B': c='4'; break;
172 case 'C': c='3'; break;
173 case 'D': c='2'; break;
174 case 'E': c='1'; break;
175 case 'F': c='0'; break;
176 case 'a': c='5'; break;
177 case 'b': c='4'; break;
178 case 'c': c='3'; break;
179 case 'd': c='2'; break;
180 case 'e': c='1'; break;
181 case 'f': c='0'; break;
182 default:
183 FATAL_ERROR("not4b_hex(): Invalid char in hexstring.");
184 } // switch c
185 } // for i
186 return s;
187 }
188
189 string* and4b(const string& left, const string& right)
190 {
191 string *s=new string(left);
192 for(size_t i=0; i<s->size(); i++) {
193 char& c=(*s)[i];
194 c=hexdigit_to_char(char_to_hexdigit(c) & char_to_hexdigit(right[i]));
195 } // for i
196 return s;
197 }
198
199 string* or4b(const string& left, const string& right)
200 {
201 string *s=new string(left);
202 for(size_t i=0; i<s->size(); i++) {
203 char& c=(*s)[i];
204 c=hexdigit_to_char(char_to_hexdigit(c) | char_to_hexdigit(right[i]));
205 } // for i
206 return s;
207 }
208
209 string* xor4b(const string& left, const string& right)
210 {
211 string *s=new string(left);
212 for(size_t i=0; i<s->size(); i++) {
213 char& c=(*s)[i];
214 c=hexdigit_to_char(char_to_hexdigit(c) ^ char_to_hexdigit(right[i]));
215 } // for i
216 return s;
217 }
218
219 string* shift_left(const string& value, const Int& count)
220 {
221 if (count > 0) {
222 string *s = new string;
223 if (count < static_cast<Int>(value.size())) *s = value.substr(count);
224 s->resize(value.size(), '0');
225 return s;
226 } else if (count < 0) return shift_right(value, -count);
227 else return new string(value);
228 }
229
230 string* shift_right(const string& value, const Int& count)
231 {
232 if (count > 0) {
233 string *s = new string;
234 if (count < static_cast<Int>(value.size())) {
235 s->resize(count, '0');
236 *s += value.substr(0, value.size()-count);
237 } else s->resize(value.size(), '0');
238 return s;
239 } else if (count < 0) return shift_left(value, -count);
240 else return new string(value);
241 }
242
243 string* rotate_left(const string& value, const Int& p_count)
244 {
245 size_t size = value.size();
246 if (size == 0) return new string(value);
247 else if (p_count < 0) return rotate_right(value, -p_count);
248 size_t count = p_count % size;
249 if (count == 0) return new string(value);
250 else return new string(value.substr(count) + value.substr(0, count));
251 }
252
253 string* rotate_right(const string& value, const Int& p_count)
254 {
255 size_t size = value.size();
256 if (size == 0) return new string(value);
257 else if (p_count < 0) return rotate_left(value, -p_count);
258 size_t count = p_count % size;
259 if (count == 0) return new string(value);
260 else return new string(value.substr(size - count) +
261 value.substr(0, size - count));
262 }
263
264
265 ustring* rotate_left(const ustring& value, const Int& p_count)
266 {
267 size_t size = value.size();
268 if (size == 0) return new ustring(value);
269 else if (p_count < 0) return rotate_right(value, -p_count);
270 size_t count = p_count % size;
271 if (count == 0) return new ustring(value);
272 else return new ustring(value.substr(count) + value.substr(0, count));
273 }
274
275 ustring* rotate_right(const ustring& value, const Int& p_count)
276 {
277 size_t size = value.size();
278 if (size == 0) return new ustring(value);
279 else if (p_count < 0) return rotate_left(value, -p_count);
280 size_t count = p_count % size;
281 if (count == 0) return new ustring(value);
282 else return new ustring(value.substr(size - count) +
283 value.substr(0, size - count));
284 }
285
286 int_val_t* bit2int(const string& bstr)
287 {
288 size_t nof_bits = bstr.size();
289 // skip the leading zeros
290 size_t start_index = 0;
291 while (start_index < nof_bits && bstr[start_index] == '0') start_index++;
292 int_val_t *ret_val = new int_val_t((Int)0);
293 for (size_t i = start_index; i < nof_bits; i++) {
294 *ret_val <<= 1;
295 if (bstr[i] == '1') *ret_val += 1;
296 }
297 return ret_val;
298 }
299
300 int_val_t* hex2int(const string& hstr)
301 {
302 size_t nof_digits = hstr.size();
303 size_t start_index = 0;
304 // Skip the leading zeros.
305 while (start_index < nof_digits && hstr[start_index] == '0')
306 start_index++;
307 int_val_t *ret_val = new int_val_t((Int)0);
308 for (size_t i = start_index; i < nof_digits; i++) {
309 *ret_val <<= 4;
310 *ret_val += char_to_hexdigit(hstr[i]);
311 }
312 return ret_val;
313 }
314
315 Int unichar2int(const ustring& ustr)
316 {
317 if (ustr.size() != 1) FATAL_ERROR("unichar2int(): invalid argument");
318 const ustring::universal_char& uchar = ustr.u_str()[0];
319 Int ret_val = (uchar.group << 24) | (uchar.plane << 16) | (uchar.row << 8) |
320 uchar.cell;
321 return ret_val;
322 }
323
324 string *int2bit(const int_val_t& value, const Int& length)
325 {
326 if (length < 0) FATAL_ERROR("int2bit(): negative length");
327 size_t string_length = static_cast<size_t>(length);
328 if (static_cast<Int>(string_length) != length ||
329 string_length > string::max_string_len)
330 FATAL_ERROR("int2bit(): length is too large");
331 if (value < 0) FATAL_ERROR("int2bit(): negative value");
332 string *bstr = new string;
333 bstr->resize(string_length);
334 int_val_t tmp_value = value;
335 for (size_t i = 1; i <= string_length; i++) {
336 (*bstr)[string_length - i] = (tmp_value & 1).get_val() ? '1' : '0';
337 tmp_value >>= 1;
338 }
339 if (tmp_value != 0)
340 FATAL_ERROR("int2bit(): %s does not fit in %lu bits", \
341 value.t_str().c_str(), (unsigned long)string_length);
342 return bstr;
343 }
344
345 static const char hdigits[16] = { '0', '1', '2', '3', '4', '5', '6', '7',
346 '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
347
348 string *int2hex(const int_val_t& value, const Int& length)
349 {
350 if (length < 0)
351 FATAL_ERROR("int2hex(): negative length");
352 size_t string_length = static_cast<size_t>(length);
353 if (static_cast<Int>(string_length) != length ||
354 string_length > string::max_string_len)
355 FATAL_ERROR("int2hex(): length is too large");
356 if (value < 0) FATAL_ERROR("int2hex(): negative value");
357 string *hstr = new string;
358 hstr->resize(string_length);
359 int_val_t tmp_value = value;
360 for (size_t i = 1; i <= string_length; i++) {
361 (*hstr)[string_length - i] = hdigits[(tmp_value & 0x0f).get_val()];
362 tmp_value >>= 4;
363 }
364 if (tmp_value != 0) {
365 FATAL_ERROR("int2hex(): %s does not fit in %lu hexadecimal digits",
366 value.t_str().c_str(), (unsigned long)string_length);
367 }
368 return hstr;
369 }
370
371 ustring *int2unichar(const Int& value)
372 {
373 if (value < 0 || value > 2147483647)
374 FATAL_ERROR("int2unichar(): invalid argument");
375 unsigned char group = (value >> 24) & 0xFF,
376 plane = (value >> 16) & 0xFF,
377 row = (value >> 8) & 0xFF,
378 cell = value & 0xFF;
379 return new ustring(group, plane, row, cell);
380 }
381
382 string *oct2char(const string& ostr)
383 {
384 string *cstr = new string;
385 size_t ostr_size = ostr.size();
386 if (ostr_size % 2)
387 FATAL_ERROR("oct2char(): argument has odd length: %lu",
388 (unsigned long) ostr_size);
389 size_t cstr_size = ostr_size / 2;
390 cstr->resize(cstr_size);
391 const char *ostr_ptr = ostr.c_str();
392 for (size_t i = 0; i < cstr_size; i++) {
393 unsigned char c = 16 * char_to_hexdigit(ostr_ptr[2 * i]) +
394 char_to_hexdigit(ostr_ptr[2 * i + 1]);
395 if (c > 127) FATAL_ERROR("oct2char(): resulting charstring contains " \
396 "non-ascii character: %d", c);
397 (*cstr)[i] = c;
398 }
399 return cstr;
400 }
401
402 string *char2oct(const string& cstr)
403 {
404 string *ostr = new string;
405 size_t cstr_size = cstr.size();
406 ostr->resize(cstr_size * 2, '0');
407 const char *cstr_ptr = cstr.c_str();
408 for (size_t i = 0; i < cstr_size; i++) {
409 unsigned char c = cstr_ptr[i];
410 (*ostr)[2 * i] = hexdigit_to_char(c / 16);
411 (*ostr)[2 * i + 1] = hexdigit_to_char(c % 16);
412 }
413 return ostr;
414 }
415
416 string *bit2hex(const string& bstr)
417 {
418 size_t size=bstr.size();
419 size_t hsize=(size+3)/4;
420 string *hstr = new string;
421 string *bstr4=NULL;
422 if(size%4) {
423 bstr4=new string;
424 bstr4->resize(hsize*4,'0');
425 bstr4->replace(4-(size%4),size,bstr);
426 }
427 hstr->resize(hsize,'0');
428 string b4(4,"0000");
429 for(size_t i=0;i<hsize;i++) {
430 unsigned int u;
431 if(size%4)b4=bstr4->substr(i*4,4);
432 else b4=bstr.substr(i*4,4);
433 if(b4[0]=='1')u=8;else u=0;
434 if(b4[1]=='1')u+=4;
435 if(b4[2]=='1')u+=2;
436 if(b4[3]=='1')u++;
437 (*hstr)[i]=hdigits[u];
438 }
439 if(bstr4!=NULL)delete bstr4;
440 return hstr;
441 }
442
443 string *hex2oct(const string& hstr)
444 {
445 if(hstr.size()%2==0)return new string(hstr);
446 else {
447 string *ostr=new string("0");
448 (*ostr)+=hstr;
449 return ostr;
450 }
451 }
452
453 string *asn_hex2oct(const string& hstr)
454 {
455 string *ostr = new string(hstr);
456 size_t size = ostr->size();
457 if (size % 2) ostr->resize(size + 1, '0');
458 return ostr;
459 }
460
461 string *bit2oct(const string& bstr)
462 {
463 string *s1,*s2;
464 s1=bit2hex(bstr);
465 s2=hex2oct(*s1);
466 delete s1;
467 return s2;
468 }
469
470 string *asn_bit2oct(const string& bstr)
471 {
472 size_t size = bstr.size();
473 string *ostr = new string;
474 ostr->resize(((size+7)/8)*2);
475 for(size_t i=0, j=0; i<size; ) {
476 unsigned char digit1=0, digit2=0;
477 digit1 += get_bit_value(bstr[i++], 8);
478 if (i < size) {
479 digit1 += get_bit_value(bstr[i++], 4);
480 if (i < size) {
481 digit1 += get_bit_value(bstr[i++], 2);
482 if (i < size) {
483 digit1 += get_bit_value(bstr[i++], 1);
484 if (i < size) {
485 digit2 += get_bit_value(bstr[i++], 8);
486 if (i < size) {
487 digit2 += get_bit_value(bstr[i++], 4);
488 if (i < size) {
489 digit2 += get_bit_value(bstr[i++], 2);
490 if (i < size) digit2 += get_bit_value(bstr[i++], 1);
491 }
492 }
493 }
494 }
495 }
496 }
497 (*ostr)[j++] = hexdigit_to_char(digit1);
498 (*ostr)[j++] = hexdigit_to_char(digit2);
499 }
500 return ostr;
501 }
502
503 string *hex2bit(const string& hstr)
504 {
505 size_t size=hstr.size();
506 string *bstr = new string;
507 bstr->resize(4*size);
508 for(size_t i=0; i<size; i++) {
509 switch(hstr[i]) {
510 case '0':
511 bstr->replace(4*i, 4, "0000");
512 break;
513 case '1':
514 bstr->replace(4*i, 4, "0001");
515 break;
516 case '2':
517 bstr->replace(4*i, 4, "0010");
518 break;
519 case '3':
520 bstr->replace(4*i, 4, "0011");
521 break;
522 case '4':
523 bstr->replace(4*i, 4, "0100");
524 break;
525 case '5':
526 bstr->replace(4*i, 4, "0101");
527 break;
528 case '6':
529 bstr->replace(4*i, 4, "0110");
530 break;
531 case '7':
532 bstr->replace(4*i, 4, "0111");
533 break;
534 case '8':
535 bstr->replace(4*i, 4, "1000");
536 break;
537 case '9':
538 bstr->replace(4*i, 4, "1001");
539 break;
540 case 'A':
541 case 'a':
542 bstr->replace(4*i, 4, "1010");
543 break;
544 case 'B':
545 case 'b':
546 bstr->replace(4*i, 4, "1011");
547 break;
548 case 'C':
549 case 'c':
550 bstr->replace(4*i, 4, "1100");
551 break;
552 case 'D':
553 case 'd':
554 bstr->replace(4*i, 4, "1101");
555 break;
556 case 'E':
557 case 'e':
558 bstr->replace(4*i, 4, "1110");
559 break;
560 case 'F':
561 case 'f':
562 bstr->replace(4*i, 4, "1111");
563 break;
564 default:
565 FATAL_ERROR("Common::hex2bit(): invalid hexadecimal "
566 "digit in hexstring value");
567 }
568 }
569 return bstr;
570 }
571
572 int_val_t* float2int(const Real& value, const Location& loc)
573 {
574 // We shouldn't mimic generality with `Int'.
575 if (value >= (Real)LLONG_MIN && value <= (Real)LLONG_MAX)
576 return new int_val_t((Int)value);
577 char buf[512] = "";
578 snprintf(buf, 511, "%f", value);
579 char *dot = strchr(buf, '.');
580 if (!dot) FATAL_ERROR("Conversion of float value `%f' to integer failed", value);
581 else memset(dot, 0, sizeof(buf) - (dot - buf));
582 return new int_val_t(buf, loc);
583 }
584
/* TTCN-3 float values whose absolute value is smaller than this are
   displayed in exponential notation. Same as in core/Float.hh */
#ifndef MIN_DECIMAL_FLOAT
#define MIN_DECIMAL_FLOAT 1.0E-4
#endif
/* TTCN-3 float values whose absolute value is larger than or equal to
   this are displayed in exponential notation. Same as in
   core/Float.hh */
#ifndef MAX_DECIMAL_FLOAT
#define MAX_DECIMAL_FLOAT 1.0E+10
#endif
596
597 string *float2str(const Real& value)
598 {
599 char str_buf[64];
600 if ( (value > -MAX_DECIMAL_FLOAT && value <= -MIN_DECIMAL_FLOAT)
601 || (value >= MIN_DECIMAL_FLOAT && value < MAX_DECIMAL_FLOAT)
602 || (value == 0.0))
603 snprintf(str_buf,64,"%f",value);
604 else snprintf(str_buf,64,"%e",value);
605 return new string(str_buf);
606 }
607
/** Evaluates the regexp() predefined function on charstrings.
 *
 * The TTCN-3 pattern \a expression is converted with
 * TTCN_pattern_to_regexp(), compiled as a POSIX extended regular
 * expression and executed on \a instr.
 *
 * @param instr       the string to be matched
 * @param expression  the TTCN-3 pattern
 * @param groupno     zero-based number of the requested group; it is
 *                    looked up in the match array at index groupno + 1
 * @return a newly allocated string holding the substring matched by the
 *         requested group, or an empty string if the pattern did not
 *         match or the group has no match offsets. All other failures are
 *         reported through FATAL_ERROR.
 */
string* regexp(const string& instr, const string& expression,
  const Int& groupno)
{
  string *retval=0;

  if(groupno<0) {
    FATAL_ERROR("regexp(): groupno must be a non-negative integer");
    return retval;
  }
  // do not report the warnings again
  // they were already reported while checking the operands
  unsigned orig_verb_level = verb_level;
  verb_level &= ~(1|2);
  char *posix_str=TTCN_pattern_to_regexp(expression.c_str());
  verb_level = orig_verb_level;
  if(posix_str==NULL) {
    FATAL_ERROR("regexp(): Cannot convert pattern `%s' to POSIX-equivalent.",
      expression.c_str());
    return retval;
  }

  regex_t posix_regexp;
  int ret_val=regcomp(&posix_regexp, posix_str, REG_EXTENDED);
  // the converted pattern is not needed once it has been compiled
  Free(posix_str);
  if(ret_val!=0) {
    /* regexp error */
    char msg[ERRMSG_BUFSIZE];
    regerror(ret_val, &posix_regexp, msg, sizeof(msg));
    FATAL_ERROR("regexp(): regcomp() failed: %s", msg);
    return retval;
  }

  // index of the requested group in the match array (entry 0 is used for
  // the whole match by regexec())
  size_t nmatch=groupno+1;
  if(nmatch>posix_regexp.re_nsub) {
    FATAL_ERROR("regexp(): requested groupno is %lu, but this expression "
      "contains only %lu group(s).", (unsigned long) (nmatch - 1),
      (unsigned long) posix_regexp.re_nsub);
    return retval;
  }
  regmatch_t* pmatch=(regmatch_t*)Malloc((nmatch+1)*sizeof(regmatch_t));
  ret_val=regexec(&posix_regexp, instr.c_str(), nmatch+1, pmatch, 0);
  if(ret_val==0) {
    // offsets of -1 mean that the group did not take part in the match
    if(pmatch[nmatch].rm_so != -1 && pmatch[nmatch].rm_eo != -1)
      retval = new string(instr.substr(pmatch[nmatch].rm_so,
        pmatch[nmatch].rm_eo - pmatch[nmatch].rm_so));
    else retval=new string();
  }
  Free(pmatch);
  if(ret_val!=0) {
    if(ret_val==REG_NOMATCH) {
      // no match: the result is an empty string
      regfree(&posix_regexp);
      retval=new string();
    }
    else {
      /* regexp error */
      char msg[ERRMSG_BUFSIZE];
      regerror(ret_val, &posix_regexp, msg, sizeof(msg));
      FATAL_ERROR("regexp(): regexec() failed: %s", msg);
    }
  }
  else regfree(&posix_regexp);

  return retval;
}
672
673 ustring* regexp(const ustring& instr, const ustring& expression,
674 const Int& groupno)
675 {
676 ustring *retval=0;
677
678 if(groupno<0) {
679 FATAL_ERROR("regexp(): groupno must be a non-negative integer");
680 return retval;
681 }
682 // do not report the warnings again
683 // they were already reported while checking the operands
684 unsigned orig_verb_level = verb_level;
685 verb_level &= ~(1|2);
686 int* user_groups;
687 char *posix_str = TTCN_pattern_to_regexp_uni(
688 expression.get_stringRepr_for_pattern().c_str(), &user_groups);
689 if (user_groups == 0)
690 FATAL_ERROR("regexp(): Cannot find any groups in the second argument.");
691 verb_level = orig_verb_level;
692 if(posix_str==NULL) {
693 FATAL_ERROR("regexp(): Cannot convert pattern `%s' to POSIX-equivalent.",
694 expression.get_stringRepr().c_str());
695 return retval;
696 }
697
698 regex_t posix_regexp;
699 int ret_val=regcomp(&posix_regexp, posix_str, REG_EXTENDED);
700 Free(posix_str);
701 if(ret_val!=0) {
702 /* regexp error */
703 char msg[ERRMSG_BUFSIZE];
704 regerror(ret_val, &posix_regexp, msg, sizeof(msg));
705 FATAL_ERROR("regexp(): regcomp() failed: %s", msg);
706 return retval;
707 }
708
709 size_t nmatch=user_groups[groupno+1]+1;
710 if(nmatch>posix_regexp.re_nsub) {
711 FATAL_ERROR("regexp(): requested groupno is %lu, but this expression "
712 "contains only %lu group(s).", (unsigned long) (groupno),
713 (unsigned long) user_groups[0]);
714 return retval;
715 }
716
717 Free(user_groups);
718
719 regmatch_t* pmatch = (regmatch_t*)Malloc((nmatch+1)*sizeof(regmatch_t));
720 char* tmp = instr.convert_to_regexp_form();
721 string instr_conv(tmp);
722 Free(tmp);
723 ret_val = regexec(&posix_regexp, instr_conv.c_str(), nmatch+1, pmatch, 0);
724 if(ret_val == 0) {
725 if(pmatch[nmatch].rm_so != -1 && pmatch[nmatch].rm_eo != -1) {
726 retval = new ustring(
727 instr_conv.substr(pmatch[nmatch].rm_so,
728 pmatch[nmatch].rm_eo - pmatch[nmatch].rm_so)
729 .convert_stringRepr_for_pattern());
730 } else { retval = new ustring(); }
731 }
732 Free(pmatch);
733 if(ret_val!=0) {
734 if(ret_val==REG_NOMATCH) {
735 regfree(&posix_regexp);
736 retval=new ustring();
737 }
738 else {
739 /* regexp error */
740 char msg[ERRMSG_BUFSIZE];
741 regerror(ret_val, &posix_regexp, msg, sizeof(msg));
742 FATAL_ERROR("regexp(): regexec() failed: %s", msg);
743 }
744 }
745 else regfree(&posix_regexp);
746
747 return retval;
748 }
749
750 string* remove_bom(const string& encoded_value)
751 {
752 size_t length = encoded_value.size();
753 if (0 == length) return new string();
754 if (length % 2 || 0 > length) {
755 ERROR("remove_bom(): Wrong string. The number of nibbles (%d) in string "
756 "shall be divisible by 2", static_cast<int>(length));
757 return new string(encoded_value);
758 }
759
760 int length_of_BOM = 0;
761 string str_uppercase(encoded_value);
762 size_t enough = length > sizeof(utf32be)-1 ? sizeof(utf32be)-1 : length;
763 for (size_t i = 0; i < enough; ++i) {
764 str_uppercase[i] = toupper(encoded_value[i]);
765 }
766
767 if (str_uppercase.find(utf32be, 0) < length) length_of_BOM = sizeof(utf32be)-1;
768 else if (str_uppercase.find(utf32le, 0) < length) length_of_BOM = sizeof(utf32le)-1;
769 else if (str_uppercase.find(utf16be, 0) < length) length_of_BOM = sizeof(utf16be)-1;
770 else if (str_uppercase.find(utf16le, 0) < length) length_of_BOM = sizeof(utf16le)-1;
771 else if (str_uppercase.find(utf8, 0) < length) length_of_BOM = sizeof(utf8)-1;
772 else return new string(encoded_value); // no BOM found
773
774 return new string(encoded_value.substr(length_of_BOM, length));
775 }
776
777 static CharCoding::CharCodingType is_ascii (size_t length, const unsigned char* strptr)
778 {
779 const unsigned char nonASCII = 1 << 7;// MSB is 1 in case of non ASCII character
780 CharCoding::CharCodingType ret = CharCoding::ASCII;
781 for (size_t i = 0; i < length; ++i) {
782 if ( strptr[i] & nonASCII) {
783 ret = CharCoding::UNKNOWN;
784 break;
785 }
786 }
787 return ret;
788 }
789
790 static CharCoding::CharCodingType is_utf8(size_t length, const unsigned char* strptr)
791 {
792 const char MSB = 1 << 7; // MSB is 1 in case of non ASCII character
793 const char MSBmin1 = 1 << 6; // 0100 0000
794 size_t i = 0;
795 while (length > i) {
796 if ( strptr[i] & MSB) { // non ASCII char
797 char maskUTF8 = 1 << 6; // 111x xxxx shows how many additional bytes are there
798 if (!(strptr[i] & maskUTF8)) return CharCoding::UNKNOWN; // accepted 11xxx xxxx but received 10xx xxxx
799 unsigned int noofUTF8 = 0; // 11xx xxxxx -> 2 bytes, 111x xxxxx -> 3 bytes , 1111 xxxxx -> 4 bytes in UTF-8
800 while (strptr[i] & maskUTF8) {
801 ++noofUTF8;
802 maskUTF8 >>= 1; // shift right the mask
803 }
804 // the second and third (and so on) UTF-8 byte looks like 10xx xxxx
805 while (0 < noofUTF8 ) {
806 ++i;
807 if (!(strptr[i] & MSB) || (strptr[i] & MSBmin1) || i >= length) { // if not like this: 10xx xxxx
808 return CharCoding::UNKNOWN;
809 }
810 --noofUTF8;
811 }
812 }
813 ++i;
814 }
815 return CharCoding::UTF_8;
816 }
817
818 string* get_stringencoding(const string& encoded_value)
819 {
820 size_t length = encoded_value.size();
821 if (0 == length) return new string("<unknown>");
822 if (length % 2 || 0 > length) {
823 ERROR("get_stringencoding(): Wrong string. The number of nibbles (%d) in string "
824 "shall be divisible by 2", static_cast<int>(length));
825 return new string("<unknown>");
826 }
827
828 string str_uppercase(encoded_value);
829 size_t enough = length > sizeof(utf32be)-1 ? sizeof(utf32be)-1 : length;
830 for (size_t i = 0; i < enough; ++i) {
831 str_uppercase[i] = toupper(encoded_value[i]);
832 }
833
834 if (str_uppercase.find(utf32be, 0) < length) return new string("UTF-32BE");
835 else if (str_uppercase.find(utf32le, 0) < length) return new string("UTF-32LE");
836 else if (str_uppercase.find(utf16be, 0) < length) return new string("UTF-16BE");
837 else if (str_uppercase.find(utf16le, 0) < length) return new string("UTF-16LE");
838 else if (str_uppercase.find(utf8, 0) < length) return new string("UTF-8");
839
840 unsigned char *uc_str = new unsigned char[length/2];
841 string ret;
842 for (size_t i = 0; i < length / 2; ++i) {
843 uc_str[i] = str2uchar(encoded_value[2 * i], encoded_value[2 * i + 1]);
844 }
845 if (is_ascii (length / 2, uc_str) == CharCoding::ASCII) ret = "ASCII";
846 else if (CharCoding::UTF_8 == is_utf8 (length / 2, uc_str)) ret = "UTF-8";
847 else ret = "<unknown>";
848
849 delete [] uc_str;
850 return new string(ret);
851 }
852
853 static size_t check_BOM(CharCoding::CharCodingType expected_coding, size_t n_uc, unsigned char* uc_str)
854 {
855 if (0 == n_uc) return 0;
856
857 switch (expected_coding) {
858 case CharCoding::UTF32:
859 case CharCoding::UTF32BE:
860 case CharCoding::UTF32LE:
861 if (4 > n_uc) {
862 ERROR("decode_utf32(): The string is shorter than the expected BOM");
863 return 0;
864 }
865 break;
866 case CharCoding::UTF16:
867 case CharCoding::UTF16BE:
868 case CharCoding::UTF16LE:
869 if (2 > n_uc) {
870 ERROR("decode_utf16(): The string is shorter than the expected BOM");
871 return 0;
872 }
873 break;
874 default: break;
875 }
876
877 //BOM indicates that the byte order is determined by a byte order mark,
878 //if present at the beginning the length of BOM is returned.
879 bool badBOM = false;
880 string errmsg;
881 string caller;
882 switch (expected_coding) {
883 case CharCoding::UTF32BE:
884 case CharCoding::UTF32:
885 if (0x00 == uc_str[0] && 0x00 == uc_str[1] && 0xFE == uc_str[2] && 0xFF == uc_str[3])
886 return 4;
887 badBOM = true;
888 caller = "decode_utf32()";
889 errmsg = "UTF-32BE";
890 break;
891 case CharCoding::UTF32LE:
892 if (0xFF == uc_str[0] && 0xFE == uc_str[1] && 0x00 == uc_str[2] && 0x00 == uc_str[3])
893 return 4;
894 badBOM = true;
895 caller = "decode_utf32()";
896 errmsg = "UTF-32LE";
897 break;
898 case CharCoding::UTF16BE:
899 case CharCoding::UTF16:
900 if (0xFE == uc_str[0] && 0xFF == uc_str[1])
901 return 2;
902 badBOM = true;
903 caller = "decode_utf16()";
904 errmsg = "UTF-16BE";
905 break;
906 case CharCoding::UTF16LE:
907 if (0xFF == uc_str[0] && 0xFE == uc_str[1])
908 return 2;
909 badBOM = true;
910 caller = "decode_utf16()";
911 errmsg = "UTF-16LE";
912 break;
913 case CharCoding::UTF_8:
914 if (0xEF == uc_str[0] && 0xBB == uc_str[1] && 0xBF == uc_str[2])
915 return 3;
916 return 0;
917 default:
918 if (CharCoding::UTF32 == expected_coding || CharCoding::UTF16 == expected_coding) {
919 const char* str = CharCoding::UTF32 == expected_coding ? "UTF-32" : "UTF-16";
920 ERROR("Wrong %s string. No BOM detected, however the given coding type (%s) "
921 "expects it to define the endianness", str, str);
922 }
923 else {
924 ERROR("Wrong string. No BOM detected");
925 }
926 }
927 if (badBOM) ERROR("%s: Wrong %s string. The expected coding could not be verified",
928 caller.c_str(), errmsg.c_str());
929 return 0;
930 }
931
932 static void fill_continuing_octets(int n_continuing, unsigned char *continuing_ptr,
933 size_t n_uc, const unsigned char* uc_str, int start_pos,
934 int uchar_pos)
935 {
936 for (int i = 0; i < n_continuing; i++) {
937 if (start_pos + i < static_cast<int>(n_uc)) {
938 unsigned char octet = uc_str[start_pos + i];
939 if ((octet & 0xC0) != 0x80) {
940 ERROR("decode_utf8(): Malformed: At character position %u, octet position %u: %02X is "
941 "not a valid continuing octet.", uchar_pos, start_pos + i, octet);
942 return;
943 }
944 continuing_ptr[i] = octet & 0x3F;
945 }
946 else {
947 if (start_pos + i == static_cast<int>(n_uc)) {
948 if (i > 0) {
949 // only a part of octets is missing
950 ERROR("decode_utf8(): Incomplete: At character position %d, octet position %d: %d out "
951 "of %d continuing octets %s missing from the end of the stream.",
952 uchar_pos, start_pos + i, n_continuing - i, n_continuing,
953 n_continuing - i > 1 ? "are" : "is");
954 return;
955 }
956 else {
957 // all octets are missing
958 ERROR("decode_utf8(): Incomplete: At character position %d, octet position %d: %d "
959 "continuing octet%s missing from the end of the stream.", uchar_pos,
960 start_pos, n_continuing, n_continuing > 1 ? "s are" : " is");
961 return;
962 }
963 }
964 continuing_ptr[i] = 0;
965 }
966 }
967 }
968
969 ustring decode_utf8(const string & ostr, CharCoding::CharCodingType expected_coding)
970 {
971 size_t length = ostr.size();
972 if (0 == length) return ustring();
973 if (length % 2) {
974 ERROR("decode_utf8(): Wrong UTF-8 string. The number of nibbles (%d) in octetstring "
975 "shall be divisible by 2", static_cast<int>(length));
976 return ustring();
977 }
978
979 unsigned char *uc_str = new unsigned char[length/2];
980 for (size_t i = 0; i < length / 2; ++i) {
981 uc_str[i] = str2uchar(ostr[2 * i], ostr[2 * i + 1]);
982 }
983 ustring ucstr;
984 size_t start = check_BOM(CharCoding::UTF_8, length /2, uc_str);
985
986 for (size_t i = start; i < length / 2;) {
987 // perform the decoding character by character
988 if (uc_str[i] <= 0x7F) {
989 // character encoded on a single octet: 0xxxxxxx (7 useful bits)
990 unsigned char g = 0;
991 unsigned char p = 0;
992 unsigned char r = 0;
993 unsigned char c = uc_str[i];
994 ucstr += ustring(g, p, r, c);
995 ++i;
996 }
997 else if (uc_str[i] <= 0xBF) {
998 // continuing octet (10xxxxxx) without leading octet ==> malformed
999 ERROR("decode_utf8(): Malformed: At character position %d, octet position %d: continuing "
1000 "octet %02X without leading octet.", static_cast<int>(ucstr.size()),
1001 static_cast<int>(i), uc_str[i]);
1002 goto dec_error;
1003 }
1004 else if (uc_str[i] <= 0xDF) {
1005 // character encoded on 2 octets: 110xxxxx 10xxxxxx (11 useful bits)
1006 unsigned char octets[2];
1007 octets[0] = uc_str[i] & 0x1F;
1008 fill_continuing_octets(1, octets + 1, length / 2, uc_str, i + 1, ucstr.size());
1009 unsigned char g = 0;
1010 unsigned char p = 0;
1011 unsigned char r = octets[0] >> 2;
1012 unsigned char c = octets[0] << 6 | octets[1];
1013 if (r == 0x00 && c < 0x80) {
1014 ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 2-octet "
1015 "encoding for quadruple (0, 0, 0, %u).", static_cast<int>(ucstr.size()),
1016 static_cast<int>(i), c);
1017 goto dec_error;
1018 }
1019 ucstr += ustring(g, p, r, c);
1020 i += 2;
1021 }
1022 else if (uc_str[i] <= 0xEF) {
1023 // character encoded on 3 octets: 1110xxxx 10xxxxxx 10xxxxxx
1024 // (16 useful bits)
1025 unsigned char octets[3];
1026 octets[0] = uc_str[i] & 0x0F;
1027 fill_continuing_octets(2, octets + 1, length / 2, uc_str, i + 1,ucstr.size());
1028 unsigned char g = 0;
1029 unsigned char p = 0;
1030 unsigned char r = octets[0] << 4 | octets[1] >> 2;
1031 unsigned char c = octets[1] << 6 | octets[2];
1032 if (r < 0x08) {
1033 ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 3-octet "
1034 "encoding for quadruple (0, 0, %u, %u).", static_cast<int>(ucstr.size()),
1035 static_cast<int>(i), r, c);
1036 goto dec_error;
1037 }
1038 ucstr += ustring(g, p, r, c);
1039 i += 3;
1040 }
1041 else if (uc_str[i] <= 0xF7) {
1042 // character encoded on 4 octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1043 // (21 useful bits)
1044 unsigned char octets[4];
1045 octets[0] = uc_str[i] & 0x07;
1046 fill_continuing_octets(3, octets + 1, length / 2, uc_str, i + 1, ucstr.size());
1047 unsigned char g = 0;
1048 unsigned char p = octets[0] << 2 | octets[1] >> 4;
1049 unsigned char r = octets[1] << 4 | octets[2] >> 2;
1050 unsigned char c = octets[2] << 6 | octets[3];
1051 if (p == 0x00) {
1052 ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 4-octet "
1053 "encoding for quadruple (0, 0, %u, %u).", static_cast<int>(ucstr.size()),
1054 static_cast<int>(i), r, c);
1055 goto dec_error;
1056 }
1057 ucstr += ustring(g, p, r, c);
1058 i += 4;
1059 }
1060 else if (uc_str[i] <= 0xFB) {
1061 // character encoded on 5 octets: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx
1062 // 10xxxxxx (26 useful bits)
1063 unsigned char octets[5];
1064 octets[0] = uc_str[i] & 0x03;
1065 fill_continuing_octets(4, octets + 1, length / 2, uc_str, i + 1, ucstr.size());
1066 unsigned char g = octets[0];
1067 unsigned char p = octets[1] << 2 | octets[2] >> 4;
1068 unsigned char r = octets[2] << 4 | octets[3] >> 2;
1069 unsigned char c = octets[3] << 6 | octets[4];
1070 if (g == 0x00 && p < 0x20) {
1071 ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 5-octet "
1072 "encoding for quadruple (0, %u, %u, %u).", static_cast<int>(ucstr.size()),
1073 static_cast<int>(i), p, r, c);
1074 goto dec_error;
1075 }
1076 ucstr += ustring(g, p, r, c);
1077 i += 5;
1078 }
1079 else if (uc_str[i] <= 0xFD) {
1080 // character encoded on 6 octets: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx
1081 // 10xxxxxx 10xxxxxx (31 useful bits)
1082 unsigned char octets[6];
1083 octets[0] = uc_str[i] & 0x01;
1084 fill_continuing_octets(5, octets + 1, length / 2, uc_str, i + 1,ucstr.size());
1085 unsigned char g = octets[0] << 6 | octets[1];
1086 unsigned char p = octets[2] << 2 | octets[3] >> 4;
1087 unsigned char r = octets[3] << 4 | octets[4] >> 2;
1088 unsigned char c = octets[4] << 6 | octets[5];
1089 if (g < 0x04) {
1090 ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 6-octet "
1091 "encoding for quadruple (%u, %u, %u, %u).", static_cast<int>(ucstr.size()),
1092 static_cast<int>(i), g, p, r, c);
1093 goto dec_error;
1094 }
1095 ucstr += ustring(g, p, r, c);
1096 i += 6;
1097 }
1098 else {
1099 // not used code points: FE and FF => malformed
1100 ERROR("decode_utf8(): Malformed: At character position %d, octet position %d: "
1101 "unused/reserved octet %02X.", static_cast<int>(ucstr.size()),
1102 static_cast<int>(i), uc_str[i]);
1103 goto dec_error;
1104 }
1105 }
1106
1107 dec_error:
1108 delete[] uc_str;
1109 return ucstr;
1110 }
1111
1112 }
This page took 0.054719 seconds and 4 git commands to generate.