compiler2/PredefFunc.cc

   1 /******************************************************************************
   2  * Copyright (c) 2000-2016 Ericsson Telecom AB
   3  * All rights reserved. This program and the accompanying materials
   4  * are made available under the terms of the Eclipse Public License v1.0
   5  * which accompanies this distribution, and is available at
   6  * http://www.eclipse.org/legal/epl-v10.html
   7  *
   8  * Contributors:
   9  *   Baji, Laszlo
  10  *   Balasko, Jeno
  11  *   Baranyi, Botond
  12  *   Kovacs, Ferenc
  13  *   Raduly, Csaba
  14  *   Zalanyi, Balazs Andor
  15  *
  16  ******************************************************************************/
  17 #include "PredefFunc.hh"
  18 #include "error.h"
  19 #include "Int.hh"
  20 #include "Real.hh"
  21 #include "Setting.hh"
  22 #include "string.hh"
  23 #include "ustring.hh"
  24 #include "CompilerError.hh"
  25 #include <stdio.h>
  26 #include <sys/types.h>
  27 #include <regex.h>
  28 #include <stdint.h>
  29 #include "../common/memory.h"
  30 #include "../common/pattern.hh"
  31 #include <iostream>
  32
  33 // used by regex
  34 #define ERRMSG_BUFSIZE 512
  35
  36 namespace Common {
  37
  // Byte order marks spelled as upper-case hexadecimal digit strings
  // (two digits per octet), NUL-terminated.
  static const char utf32be[] = "0000FEFF";
  static const char utf32le[] = "FFFE0000";
  static const char utf16be[] = "FEFF";
  static const char utf16le[] = "FFFE";
  static const char utf8[]    = "EFBBBF";
  43
  44   static inline unsigned char get_bit_value(char c, unsigned char bit_value)
  45   {
  46     switch (c) {
  47     case '0':
  48       return 0;
  49     case '1':
  50       return bit_value;
  51     default:
  52       FATAL_ERROR("Invalid binary digit (%c) in bitstring value", c);
  53       return 0;
  54     }
  55   }
  56
  57   char toupper (const char c)
  58   {
  59     if (('A' <= c && 'F' >= c) ||
  60         ('0' <= c && '9' >= c)) return c;
  61     switch (c)
  62     {
  63       case 'a' : return 'A';
  64       case 'b' : return 'B';
  65       case 'c' : return 'C';
  66       case 'd' : return 'D';
  67       case 'e' : return 'E';
  68       case 'f' : return 'F';
  69       default:
  70         FATAL_ERROR("%c cannot be converted to hex character", c);
  71         break;
  72     }
  73   }
  74
  75   char hexdigit_to_char(unsigned char hexdigit)
  76   {
  77     if (hexdigit < 10) return '0' + hexdigit;
  78     else if (hexdigit < 16) return 'A' + hexdigit - 10;
  79     else {
  80       FATAL_ERROR("hexdigit_to_char(): invalid argument: %d", hexdigit);
  81       return '\0'; // to avoid warning
  82     }
  83   }
  84
  85   unsigned char char_to_hexdigit(char c)
  86   {
  87     if (c >= '0' && c <= '9') return c - '0';
  88     else if (c >= 'A' && c <= 'F') return c - 'A' + 10;
  89     else if (c >= 'a' && c <= 'f') return c - 'a' + 10;
  90     else {
  91       FATAL_ERROR("char_to_hexdigit(): invalid argument: %c", c);
  92       return 0; // to avoid warning
  93     }
  94   }
  95
  96   string uchar2str(unsigned char uchar)
  97   {
  98     char str[2];
  99     str[0] = hexdigit_to_char(uchar / 16);
 100     str[1] = hexdigit_to_char(uchar % 16);
 101     return string(2, str);
 102   }
 103
 104   unsigned char str2uchar(const char& c1, const char& c2)
 105   {
 106     unsigned char uc = 0;
 107     uc = char_to_hexdigit(c1);
 108     uc <<= 4;
 109     uc += char_to_hexdigit(c2);
 110     return uc;
 111   }
 112
 113   int_val_t rem(const int_val_t& left, const int_val_t& right)
 114   {
 115     return (left - right * (left / right));
 116   }
 117
 118   int_val_t mod(const int_val_t& left, const int_val_t& right)
 119   {
 120     int_val_t r = right < 0 ? -right : right;
 121     if (left > 0) {
 122       return rem(left, r);
 123     } else {
 124       int_val_t result = rem(left, r);
 125       return result == 0 ? result : result + r;
 126     }
 127   }
 128
 129   string* to_uppercase(const string& value)
 130   {
 131     string *s = new string(value);
 132     for (size_t i = 0; i < s->size(); i++) {
 133       char& c=(*s)[i];
 134       if (c >= 'a' && c <= 'z') c = c - 'a' + 'A';
 135     }
 136     return s;
 137   }
 138
 139   string* not4b_bit(const string& bstr)
 140   {
 141     string *s=new string(bstr);
 142     for(size_t i=0; i<s->size(); i++) {
 143       char& c=(*s)[i];
 144       switch(c) {
 145       case '0': c='1'; break;
 146       case '1': c='0'; break;
 147       default:
 148         FATAL_ERROR("not4b_bit(): Invalid char in bitstring.");
 149       } // switch c
 150     } // for i
 151     return s;
 152   }
 153
 154   string* not4b_hex(const string& hstr)
 155   {
 156     string *s=new string(hstr);
 157     for(size_t i=0; i<s->size(); i++) {
 158       char& c=(*s)[i];
 159       switch(c) {
 160       case '0': c='F'; break;
 161       case '1': c='E'; break;
 162       case '2': c='D'; break;
 163       case '3': c='C'; break;
 164       case '4': c='B'; break;
 165       case '5': c='A'; break;
 166       case '6': c='9'; break;
 167       case '7': c='8'; break;
 168       case '8': c='7'; break;
 169       case '9': c='6'; break;
 170       case 'A': c='5'; break;
 171       case 'B': c='4'; break;
 172       case 'C': c='3'; break;
 173       case 'D': c='2'; break;
 174       case 'E': c='1'; break;
 175       case 'F': c='0'; break;
 176       case 'a': c='5'; break;
 177       case 'b': c='4'; break;
 178       case 'c': c='3'; break;
 179       case 'd': c='2'; break;
 180       case 'e': c='1'; break;
 181       case 'f': c='0'; break;
 182       default:
 183         FATAL_ERROR("not4b_hex(): Invalid char in hexstring.");
 184       } // switch c
 185     } // for i
 186     return s;
 187   }
 188
 189   string* and4b(const string& left, const string& right)
 190   {
 191     string *s=new string(left);
 192     for(size_t i=0; i<s->size(); i++) {
 193       char& c=(*s)[i];
 194       c=hexdigit_to_char(char_to_hexdigit(c) & char_to_hexdigit(right[i]));
 195     } // for i
 196     return s;
 197   }
 198
 199   string* or4b(const string& left, const string& right)
 200   {
 201     string *s=new string(left);
 202     for(size_t i=0; i<s->size(); i++) {
 203       char& c=(*s)[i];
 204       c=hexdigit_to_char(char_to_hexdigit(c) | char_to_hexdigit(right[i]));
 205     } // for i
 206     return s;
 207   }
 208
 209   string* xor4b(const string& left, const string& right)
 210   {
 211     string *s=new string(left);
 212     for(size_t i=0; i<s->size(); i++) {
 213       char& c=(*s)[i];
 214       c=hexdigit_to_char(char_to_hexdigit(c) ^ char_to_hexdigit(right[i]));
 215     } // for i
 216     return s;
 217   }
 218
 219   string* shift_left(const string& value, const Int& count)
 220   {
 221     if (count > 0) {
 222       string *s = new string;
 223       if (count < static_cast<Int>(value.size())) *s = value.substr(count);
 224       s->resize(value.size(), '0');
 225       return s;
 226     } else if (count < 0) return shift_right(value, -count);
 227     else return new string(value);
 228   }
 229
 230   string* shift_right(const string& value, const Int& count)
 231   {
 232     if (count > 0) {
 233       string *s = new string;
 234       if (count < static_cast<Int>(value.size())) {
 235         s->resize(count, '0');
 236         *s += value.substr(0, value.size()-count);
 237       } else s->resize(value.size(), '0');
 238       return s;
 239     } else if (count < 0) return shift_left(value, -count);
 240     else return new string(value);
 241   }
 242
 243   string* rotate_left(const string& value, const Int& p_count)
 244   {
 245     size_t size = value.size();
 246     if (size == 0) return new string(value);
 247     else if (p_count < 0) return rotate_right(value, -p_count);
 248     size_t count = p_count % size;
 249     if (count == 0) return new string(value);
 250     else return new string(value.substr(count) + value.substr(0, count));
 251   }
 252
 253   string* rotate_right(const string& value, const Int& p_count)
 254   {
 255     size_t size = value.size();
 256     if (size == 0) return new string(value);
 257     else if (p_count < 0) return rotate_left(value, -p_count);
 258     size_t count = p_count % size;
 259     if (count == 0) return new string(value);
 260     else return new string(value.substr(size - count) +
 261       value.substr(0, size - count));
 262   }
 263
 264
 265   ustring* rotate_left(const ustring& value, const Int& p_count)
 266   {
 267     size_t size = value.size();
 268     if (size == 0) return new ustring(value);
 269     else if (p_count < 0) return rotate_right(value, -p_count);
 270     size_t count = p_count % size;
 271     if (count == 0) return new ustring(value);
 272     else return new ustring(value.substr(count) + value.substr(0, count));
 273   }
 274
 275   ustring* rotate_right(const ustring& value, const Int& p_count)
 276   {
 277     size_t size = value.size();
 278     if (size == 0) return new ustring(value);
 279     else if (p_count < 0) return rotate_left(value, -p_count);
 280     size_t count = p_count % size;
 281     if (count == 0) return new ustring(value);
 282     else return new ustring(value.substr(size - count) +
 283       value.substr(0, size - count));
 284   }
 285
 286   int_val_t* bit2int(const string& bstr)
 287   {
 288     size_t nof_bits = bstr.size();
 289     // skip the leading zeros
 290     size_t start_index = 0;
 291     while (start_index < nof_bits && bstr[start_index] == '0') start_index++;
 292     int_val_t *ret_val = new int_val_t((Int)0);
 293     for (size_t i = start_index; i < nof_bits; i++) {
 294       *ret_val <<= 1;
 295       if (bstr[i] == '1') *ret_val += 1;
 296     }
 297     return ret_val;
 298   }
 299
 300   int_val_t* hex2int(const string& hstr)
 301   {
 302     size_t nof_digits = hstr.size();
 303     size_t start_index = 0;
 304     // Skip the leading zeros.
 305     while (start_index < nof_digits && hstr[start_index] == '0')
 306       start_index++;
 307     int_val_t *ret_val = new int_val_t((Int)0);
 308     for (size_t i = start_index; i < nof_digits; i++) {
 309       *ret_val <<= 4;
 310       *ret_val += char_to_hexdigit(hstr[i]);
 311     }
 312     return ret_val;
 313   }
 314
 315   Int unichar2int(const ustring& ustr)
 316   {
 317     if (ustr.size() != 1) FATAL_ERROR("unichar2int(): invalid argument");
 318     const ustring::universal_char& uchar = ustr.u_str()[0];
 319     Int ret_val = (uchar.group << 24) | (uchar.plane << 16) | (uchar.row << 8) |
 320       uchar.cell;
 321     return ret_val;
 322   }
 323
 324   string *int2bit(const int_val_t& value, const Int& length)
 325   {
 326     if (length < 0) FATAL_ERROR("int2bit(): negative length");
 327     size_t string_length = static_cast<size_t>(length);
 328     if (static_cast<Int>(string_length) != length ||
 329         string_length > string::max_string_len)
 330       FATAL_ERROR("int2bit(): length is too large");
 331     if (value < 0) FATAL_ERROR("int2bit(): negative value");
 332     string *bstr = new string;
 333     bstr->resize(string_length);
 334     int_val_t tmp_value = value;
 335     for (size_t i = 1; i <= string_length; i++) {
 336       (*bstr)[string_length - i] = (tmp_value & 1).get_val() ? '1' : '0';
 337       tmp_value >>= 1;
 338     }
 339     if (tmp_value != 0)
 340       FATAL_ERROR("int2bit(): %s does not fit in %lu bits", \
 341         value.t_str().c_str(), (unsigned long)string_length);
 342     return bstr;
 343   }
 344
 345   static const char hdigits[16] = { '0', '1', '2', '3', '4', '5', '6', '7',
 346     '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
 347
 348   string *int2hex(const int_val_t& value, const Int& length)
 349   {
 350     if (length < 0)
 351       FATAL_ERROR("int2hex(): negative length");
 352     size_t string_length = static_cast<size_t>(length);
 353     if (static_cast<Int>(string_length) != length ||
 354         string_length > string::max_string_len)
 355       FATAL_ERROR("int2hex(): length is too large");
 356     if (value < 0) FATAL_ERROR("int2hex(): negative value");
 357     string *hstr = new string;
 358     hstr->resize(string_length);
 359     int_val_t tmp_value = value;
 360     for (size_t i = 1; i <= string_length; i++) {
 361       (*hstr)[string_length - i] = hdigits[(tmp_value & 0x0f).get_val()];
 362       tmp_value >>= 4;
 363     }
 364     if (tmp_value != 0) {
 365       FATAL_ERROR("int2hex(): %s does not fit in %lu hexadecimal digits",
 366         value.t_str().c_str(), (unsigned long)string_length);
 367     }
 368     return hstr;
 369   }
 370
 371   ustring *int2unichar(const Int& value)
 372   {
 373     if (value < 0 || value > 2147483647)
 374       FATAL_ERROR("int2unichar(): invalid argument");
 375     unsigned char group = (value >> 24) & 0xFF,
 376       plane = (value >> 16) & 0xFF,
 377       row = (value >> 8) & 0xFF,
 378       cell = value & 0xFF;
 379     return new ustring(group, plane, row, cell);
 380   }
 381
 382   string *oct2char(const string& ostr)
 383   {
 384     string *cstr = new string;
 385     size_t ostr_size = ostr.size();
 386     if (ostr_size % 2)
 387       FATAL_ERROR("oct2char(): argument has odd length: %lu",
 388         (unsigned long) ostr_size);
 389     size_t cstr_size = ostr_size / 2;
 390     cstr->resize(cstr_size);
 391     const char *ostr_ptr = ostr.c_str();
 392     for (size_t i = 0; i < cstr_size; i++) {
 393       unsigned char c = 16 * char_to_hexdigit(ostr_ptr[2 * i]) +
 394       char_to_hexdigit(ostr_ptr[2 * i + 1]);
 395       if (c > 127) FATAL_ERROR("oct2char(): resulting charstring contains " \
 396                                "non-ascii character: %d", c);
 397       (*cstr)[i] = c;
 398     }
 399     return cstr;
 400   }
 401
 402   string *char2oct(const string& cstr)
 403   {
 404     string *ostr = new string;
 405     size_t cstr_size = cstr.size();
 406     ostr->resize(cstr_size * 2, '0');
 407     const char *cstr_ptr = cstr.c_str();
 408     for (size_t i = 0; i < cstr_size; i++) {
 409       unsigned char c = cstr_ptr[i];
 410       (*ostr)[2 * i] = hexdigit_to_char(c / 16);
 411       (*ostr)[2 * i + 1] = hexdigit_to_char(c % 16);
 412     }
 413     return ostr;
 414   }
 415
 416   string *bit2hex(const string& bstr)
 417   {
 418     size_t size=bstr.size();
 419     size_t hsize=(size+3)/4;
 420     string *hstr = new string;
 421     string *bstr4=NULL;
 422     if(size%4) {
 423       bstr4=new string;
 424       bstr4->resize(hsize*4,'0');
 425       bstr4->replace(4-(size%4),size,bstr);
 426     }
 427     hstr->resize(hsize,'0');
 428     string b4(4,"0000");
 429     for(size_t i=0;i<hsize;i++) {
 430       unsigned int u;
 431       if(size%4)b4=bstr4->substr(i*4,4);
 432       else b4=bstr.substr(i*4,4);
 433       if(b4[0]=='1')u=8;else u=0;
 434       if(b4[1]=='1')u+=4;
 435       if(b4[2]=='1')u+=2;
 436       if(b4[3]=='1')u++;
 437       (*hstr)[i]=hdigits[u];
 438     }
 439     if(bstr4!=NULL)delete bstr4;
 440     return hstr;
 441   }
 442
 443   string *hex2oct(const string& hstr)
 444   {
 445     if(hstr.size()%2==0)return new string(hstr);
 446     else {
 447       string *ostr=new string("0");
 448       (*ostr)+=hstr;
 449       return ostr;
 450     }
 451   }
 452
 453   string *asn_hex2oct(const string& hstr)
 454   {
 455     string *ostr = new string(hstr);
 456     size_t size = ostr->size();
 457     if (size % 2) ostr->resize(size + 1, '0');
 458     return ostr;
 459   }
 460
 461   string *bit2oct(const string& bstr)
 462   {
 463     string *s1,*s2;
 464     s1=bit2hex(bstr);
 465     s2=hex2oct(*s1);
 466     delete s1;
 467     return s2;
 468   }
 469
 470   string *asn_bit2oct(const string& bstr)
 471   {
 472     size_t size = bstr.size();
 473     string *ostr = new string;
 474     ostr->resize(((size+7)/8)*2);
 475     for(size_t i=0, j=0; i<size; ) {
 476       unsigned char digit1=0, digit2=0;
 477       digit1 += get_bit_value(bstr[i++], 8);
 478       if (i < size) {
 479         digit1 += get_bit_value(bstr[i++], 4);
 480         if (i < size) {
 481           digit1 += get_bit_value(bstr[i++], 2);
 482           if (i < size) {
 483             digit1 += get_bit_value(bstr[i++], 1);
 484             if (i < size) {
 485               digit2 += get_bit_value(bstr[i++], 8);
 486               if (i < size) {
 487                 digit2 += get_bit_value(bstr[i++], 4);
 488                 if (i < size) {
 489                   digit2 += get_bit_value(bstr[i++], 2);
 490                   if (i < size) digit2 += get_bit_value(bstr[i++], 1);
 491                 }
 492               }
 493             }
 494           }
 495         }
 496       }
 497       (*ostr)[j++] = hexdigit_to_char(digit1);
 498       (*ostr)[j++] = hexdigit_to_char(digit2);
 499     }
 500     return ostr;
 501   }
 502
 503   string *hex2bit(const string& hstr)
 504   {
 505     size_t size=hstr.size();
 506     string *bstr = new string;
 507     bstr->resize(4*size);
 508     for(size_t i=0; i<size; i++) {
 509       switch(hstr[i]) {
 510       case '0':
 511         bstr->replace(4*i, 4, "0000");
 512         break;
 513       case '1':
 514         bstr->replace(4*i, 4, "0001");
 515         break;
 516       case '2':
 517         bstr->replace(4*i, 4, "0010");
 518         break;
 519       case '3':
 520         bstr->replace(4*i, 4, "0011");
 521         break;
 522       case '4':
 523         bstr->replace(4*i, 4, "0100");
 524         break;
 525       case '5':
 526         bstr->replace(4*i, 4, "0101");
 527         break;
 528       case '6':
 529         bstr->replace(4*i, 4, "0110");
 530         break;
 531       case '7':
 532         bstr->replace(4*i, 4, "0111");
 533         break;
 534       case '8':
 535         bstr->replace(4*i, 4, "1000");
 536         break;
 537       case '9':
 538         bstr->replace(4*i, 4, "1001");
 539         break;
 540       case 'A':
 541       case 'a':
 542         bstr->replace(4*i, 4, "1010");
 543         break;
 544       case 'B':
 545       case 'b':
 546         bstr->replace(4*i, 4, "1011");
 547         break;
 548       case 'C':
 549       case 'c':
 550         bstr->replace(4*i, 4, "1100");
 551         break;
 552       case 'D':
 553       case 'd':
 554         bstr->replace(4*i, 4, "1101");
 555         break;
 556       case 'E':
 557       case 'e':
 558         bstr->replace(4*i, 4, "1110");
 559         break;
 560       case 'F':
 561       case 'f':
 562         bstr->replace(4*i, 4, "1111");
 563         break;
 564       default:
 565         FATAL_ERROR("Common::hex2bit(): invalid hexadecimal "
 566                     "digit in hexstring value");
 567       }
 568     }
 569     return bstr;
 570   }
 571
 572   int_val_t* float2int(const Real& value, const Location& loc)
 573   {
 574     // We shouldn't mimic generality with `Int'.
 575     if (value >= (Real)LLONG_MIN && value <= (Real)LLONG_MAX)
 576       return new int_val_t((Int)value);
 577     char buf[512] = "";
 578     snprintf(buf, 511, "%f", value);
 579     char *dot = strchr(buf, '.');
 580     if (!dot) FATAL_ERROR("Conversion of float value `%f' to integer failed", value);
 581     else memset(dot, 0, sizeof(buf) - (dot - buf));
 582     return new int_val_t(buf, loc);
 583   }
 584
 585 /* TTCN-3 float values that have absolute value smaller than this are
 586    displayed in exponential notation. Same as in core/Float.hh */
 587 #ifndef MIN_DECIMAL_FLOAT
 588 #define MIN_DECIMAL_FLOAT               1.0E-4
 589 #endif
 590 /* TTCN-3 float values that have absolute value larger or equal than
 591    this are displayed in exponential notation. Same as in
 592    core/Float.hh */
 593 #ifndef MAX_DECIMAL_FLOAT
 594 #define MAX_DECIMAL_FLOAT               1.0E+10
 595 #endif
 596
 597   string *float2str(const Real& value)
 598   {
 599     char str_buf[64];
 600     if ( (value > -MAX_DECIMAL_FLOAT && value <= -MIN_DECIMAL_FLOAT)
 601       || (value >= MIN_DECIMAL_FLOAT && value <   MAX_DECIMAL_FLOAT)
 602       || (value == 0.0))
 603       snprintf(str_buf,64,"%f",value);
 604     else snprintf(str_buf,64,"%e",value);
 605     return new string(str_buf);
 606   }
 607
 608   string* regexp(const string& instr, const string& expression,
 609                  const Int& groupno)
 610   {
 611     string *retval=0;
 612
 613     if(groupno<0) {
 614       FATAL_ERROR("regexp(): groupno must be a non-negative integer");
 615       return retval;
 616     }
 617     // do not report the warnings again
 618     // they were already reported while checking the operands
 619     unsigned orig_verb_level = verb_level;
 620     verb_level &= ~(1|2);
 621     char *posix_str=TTCN_pattern_to_regexp(expression.c_str());
 622     verb_level = orig_verb_level;
 623     if(posix_str==NULL) {
 624       FATAL_ERROR("regexp(): Cannot convert pattern `%s' to POSIX-equivalent.",
 625                   expression.c_str());
 626       return retval;
 627     }
 628
 629     regex_t posix_regexp;
 630     int ret_val=regcomp(&posix_regexp, posix_str, REG_EXTENDED);
 631     Free(posix_str);
 632     if(ret_val!=0) {
 633       /* regexp error */
 634       char msg[ERRMSG_BUFSIZE];
 635       regerror(ret_val, &posix_regexp, msg, sizeof(msg));
 636       FATAL_ERROR("regexp(): regcomp() failed: %s", msg);
 637       return retval;
 638     }
 639
 640     size_t nmatch=groupno+1;
 641     if(nmatch>posix_regexp.re_nsub) {
 642       FATAL_ERROR("regexp(): requested groupno is %lu, but this expression "
 643                   "contains only %lu group(s).", (unsigned long) (nmatch - 1),
 644                   (unsigned long) posix_regexp.re_nsub);
 645       return retval;
 646     }
 647     regmatch_t* pmatch=(regmatch_t*)Malloc((nmatch+1)*sizeof(regmatch_t));
 648     ret_val=regexec(&posix_regexp, instr.c_str(), nmatch+1, pmatch, 0);
 649     if(ret_val==0) {
 650       if(pmatch[nmatch].rm_so != -1 && pmatch[nmatch].rm_eo != -1)
 651         retval = new string(instr.substr(pmatch[nmatch].rm_so,
 652           pmatch[nmatch].rm_eo - pmatch[nmatch].rm_so));
 653       else retval=new string();
 654     }
 655     Free(pmatch);
 656     if(ret_val!=0) {
 657       if(ret_val==REG_NOMATCH) {
 658         regfree(&posix_regexp);
 659         retval=new string();
 660       }
 661       else {
 662         /* regexp error */
 663         char msg[ERRMSG_BUFSIZE];
 664         regerror(ret_val, &posix_regexp, msg, sizeof(msg));
 665         FATAL_ERROR("regexp(): regexec() failed: %s", msg);
 666       }
 667     }
 668     else regfree(&posix_regexp);
 669
 670     return retval;
 671   }
 672
 673   ustring* regexp(const ustring& instr, const ustring& expression,
 674     const Int& groupno)
 675   {
 676     ustring *retval=0;
 677
 678     if(groupno<0) {
 679       FATAL_ERROR("regexp(): groupno must be a non-negative integer");
 680       return retval;
 681     }
 682     // do not report the warnings again
 683     // they were already reported while checking the operands
 684     unsigned orig_verb_level = verb_level;
 685     verb_level &= ~(1|2);
 686     int* user_groups;
 687     char *posix_str = TTCN_pattern_to_regexp_uni(
 688       expression.get_stringRepr_for_pattern().c_str(), &user_groups);
 689     if (user_groups == 0)
 690       FATAL_ERROR("regexp(): Cannot find any groups in the second argument.");
 691     verb_level = orig_verb_level;
 692     if(posix_str==NULL) {
 693       FATAL_ERROR("regexp(): Cannot convert pattern `%s' to POSIX-equivalent.",
 694         expression.get_stringRepr().c_str());
 695       return retval;
 696     }
 697
 698     regex_t posix_regexp;
 699     int ret_val=regcomp(&posix_regexp, posix_str, REG_EXTENDED);
 700     Free(posix_str);
 701     if(ret_val!=0) {
 702       /* regexp error */
 703       char msg[ERRMSG_BUFSIZE];
 704       regerror(ret_val, &posix_regexp, msg, sizeof(msg));
 705       FATAL_ERROR("regexp(): regcomp() failed: %s", msg);
 706       return retval;
 707     }
 708
 709     size_t nmatch=user_groups[groupno+1]+1;
 710     if(nmatch>posix_regexp.re_nsub) {
 711       FATAL_ERROR("regexp(): requested groupno is %lu, but this expression "
 712         "contains only %lu group(s).", (unsigned long) (groupno),
 713         (unsigned long) user_groups[0]);
 714       return retval;
 715     }
 716
 717     Free(user_groups);
 718
 719     regmatch_t* pmatch = (regmatch_t*)Malloc((nmatch+1)*sizeof(regmatch_t));
 720     char* tmp = instr.convert_to_regexp_form();
 721     string instr_conv(tmp);
 722     Free(tmp);
 723     ret_val = regexec(&posix_regexp, instr_conv.c_str(), nmatch+1, pmatch, 0);
 724     if(ret_val == 0) {
 725       if(pmatch[nmatch].rm_so != -1 && pmatch[nmatch].rm_eo != -1) {
 726         retval = new ustring(
 727           instr_conv.substr(pmatch[nmatch].rm_so,
 728             pmatch[nmatch].rm_eo - pmatch[nmatch].rm_so)
 729             .convert_stringRepr_for_pattern());
 730       } else { retval = new ustring(); }
 731     }
 732     Free(pmatch);
 733     if(ret_val!=0) {
 734       if(ret_val==REG_NOMATCH) {
 735         regfree(&posix_regexp);
 736         retval=new ustring();
 737       }
 738       else {
 739         /* regexp error */
 740         char msg[ERRMSG_BUFSIZE];
 741         regerror(ret_val, &posix_regexp, msg, sizeof(msg));
 742         FATAL_ERROR("regexp(): regexec() failed: %s", msg);
 743       }
 744     }
 745     else regfree(&posix_regexp);
 746
 747     return retval;
 748   }
 749
 750 string* remove_bom(const string& encoded_value)
 751 {
 752   size_t length = encoded_value.size();
 753   if (0 == length) return new string();
 754   if (length % 2 || 0 > length) {
 755     ERROR("remove_bom(): Wrong string. The number of nibbles (%d) in string "
 756                 "shall be divisible by 2", static_cast<int>(length));
 757     return new string(encoded_value);
 758   }
 759
 760   int length_of_BOM = 0;
 761   string str_uppercase(encoded_value);
 762   size_t enough = length > sizeof(utf32be)-1 ? sizeof(utf32be)-1 : length;
 763   for (size_t i = 0; i < enough; ++i) {
 764     str_uppercase[i] = toupper(encoded_value[i]);
 765   }
 766
 767   if      (str_uppercase.find(utf32be, 0) < length) length_of_BOM = sizeof(utf32be)-1;
 768   else if (str_uppercase.find(utf32le, 0) < length) length_of_BOM = sizeof(utf32le)-1;
 769   else if (str_uppercase.find(utf16be, 0) < length) length_of_BOM = sizeof(utf16be)-1;
 770   else if (str_uppercase.find(utf16le, 0) < length) length_of_BOM = sizeof(utf16le)-1;
 771   else if (str_uppercase.find(utf8,    0) < length) length_of_BOM = sizeof(utf8)-1;
 772   else return new string(encoded_value); // no BOM found
 773
 774   return new string(encoded_value.substr(length_of_BOM, length));
 775 }
 776
 777 static CharCoding::CharCodingType is_ascii (size_t length, const unsigned char* strptr)
 778 {
 779   const unsigned char nonASCII = 1 << 7;// MSB is 1 in case of non ASCII character
 780   CharCoding::CharCodingType ret = CharCoding::ASCII;
 781   for (size_t i = 0; i < length; ++i) {
 782     if ( strptr[i] & nonASCII) {
 783       ret = CharCoding::UNKNOWN;
 784       break;
 785     }
 786   }
 787   return ret;
 788 }
 789
 790 static CharCoding::CharCodingType is_utf8(size_t length, const unsigned char* strptr)
 791 {
 792   const char MSB = 1 << 7; // MSB is 1 in case of non ASCII character
 793   const char MSBmin1 = 1 << 6; // 0100 0000
 794   size_t i = 0;
 795   while (length > i) {
 796     if ( strptr[i] & MSB) { // non ASCII char
 797     char maskUTF8 = 1 << 6; // 111x xxxx shows how many additional bytes are there
 798       if (!(strptr[i] & maskUTF8)) return CharCoding::UNKNOWN; // accepted 11xxx xxxx but received 10xx xxxx
 799       unsigned int noofUTF8 = 0; // 11xx xxxxx -> 2 bytes, 111x xxxxx -> 3 bytes , 1111 xxxxx -> 4 bytes in UTF-8
 800       while (strptr[i] & maskUTF8) {
 801         ++noofUTF8;
 802         maskUTF8 >>= 1; // shift right the mask
 803       }
 804       // the second and third (and so on) UTF-8 byte looks like 10xx xxxx
 805       while (0 < noofUTF8 ) {
 806         ++i;
 807         if (!(strptr[i] & MSB) || (strptr[i] & MSBmin1) || i >= length) { // if not like this: 10xx xxxx
 808           return CharCoding::UNKNOWN;
 809         }
 810         --noofUTF8;
 811       }
 812     }
 813     ++i;
 814   }
 815   return CharCoding::UTF_8;
 816 }
 817
 818 string* get_stringencoding(const string& encoded_value)
 819 {
 820   size_t length = encoded_value.size();
 821   if (0 == length) return new string("<unknown>");
 822   if (length % 2 || 0 > length) {
 823     ERROR("get_stringencoding(): Wrong string. The number of nibbles (%d) in string "
 824                 "shall be divisible by 2", static_cast<int>(length));
 825     return new string("<unknown>");
 826   }
 827
 828   string str_uppercase(encoded_value);
 829   size_t enough = length > sizeof(utf32be)-1 ? sizeof(utf32be)-1 : length;
 830   for (size_t i = 0; i < enough; ++i) {
 831     str_uppercase[i] = toupper(encoded_value[i]);
 832   }
 833
 834   if      (str_uppercase.find(utf32be, 0) < length) return new string("UTF-32BE");
 835   else if (str_uppercase.find(utf32le, 0) < length) return new string("UTF-32LE");
 836   else if (str_uppercase.find(utf16be, 0) < length) return new string("UTF-16BE");
 837   else if (str_uppercase.find(utf16le, 0) < length) return new string("UTF-16LE");
 838   else if (str_uppercase.find(utf8,    0) < length) return new string("UTF-8");
 839
 840   unsigned char *uc_str = new unsigned char[length/2];
 841   string ret;
 842   for (size_t i = 0; i < length / 2; ++i) {
 843     uc_str[i] = str2uchar(encoded_value[2 * i], encoded_value[2 * i + 1]);
 844   }
 845   if (is_ascii (length / 2, uc_str) == CharCoding::ASCII) ret = "ASCII";
 846   else if (CharCoding::UTF_8 == is_utf8 (length / 2, uc_str)) ret = "UTF-8";
 847   else ret = "<unknown>";
 848
 849   delete [] uc_str;
 850   return new string(ret);
 851 }
 852
 853 static size_t check_BOM(CharCoding::CharCodingType expected_coding, size_t n_uc, unsigned char* uc_str)
 854 {
 855   if (0 == n_uc) return 0;
 856
 857   switch (expected_coding) {
 858     case CharCoding::UTF32:
 859     case CharCoding::UTF32BE:
 860     case CharCoding::UTF32LE:
 861       if (4 > n_uc) {
 862         ERROR("decode_utf32(): The string is shorter than the expected BOM");
 863         return 0;
 864       }
 865       break;
 866     case CharCoding::UTF16:
 867     case CharCoding::UTF16BE:
 868     case CharCoding::UTF16LE:
 869       if (2 > n_uc) {
 870         ERROR("decode_utf16(): The string is shorter than the expected BOM");
 871         return 0;
 872       }
 873       break;
 874     default: break;
 875   }
 876
 877   //BOM indicates that the byte order is determined by a byte order mark,
 878   //if present at the beginning the length of BOM is returned.
 879   bool badBOM = false;
 880   string errmsg;
 881   string caller;
 882   switch (expected_coding) {
 883     case CharCoding::UTF32BE:
 884     case CharCoding::UTF32:
 885       if (0x00 == uc_str[0] && 0x00 == uc_str[1] && 0xFE == uc_str[2] && 0xFF == uc_str[3])
 886         return 4;
 887       badBOM = true;
 888       caller = "decode_utf32()";
 889       errmsg = "UTF-32BE";
 890       break;
 891     case CharCoding::UTF32LE:
 892       if (0xFF == uc_str[0] && 0xFE == uc_str[1] && 0x00 == uc_str[2] && 0x00 == uc_str[3])
 893         return 4;
 894       badBOM = true;
 895       caller = "decode_utf32()";
 896       errmsg = "UTF-32LE";
 897       break;
 898     case CharCoding::UTF16BE:
 899     case CharCoding::UTF16:
 900       if (0xFE == uc_str[0] && 0xFF == uc_str[1])
 901         return 2;
 902       badBOM = true;
 903       caller = "decode_utf16()";
 904       errmsg = "UTF-16BE";
 905       break;
 906     case CharCoding::UTF16LE:
 907       if (0xFF == uc_str[0] && 0xFE == uc_str[1])
 908         return 2;
 909       badBOM = true;
 910       caller = "decode_utf16()";
 911       errmsg = "UTF-16LE";
 912       break;
 913     case CharCoding::UTF_8:
 914       if (0xEF == uc_str[0] && 0xBB == uc_str[1] && 0xBF == uc_str[2])
 915         return 3;
 916       return 0;
 917     default:
 918       if (CharCoding::UTF32 == expected_coding || CharCoding::UTF16 == expected_coding) {
 919         const char* str = CharCoding::UTF32 == expected_coding ? "UTF-32" : "UTF-16";
 920         ERROR("Wrong %s string. No BOM detected, however the given coding type (%s) "
 921                "expects it to define the endianness", str, str);
 922       }
 923       else {
 924         ERROR("Wrong string. No BOM detected");
 925       }
 926     }
 927   if (badBOM) ERROR("%s: Wrong %s string. The expected coding could not be verified",
 928                     caller.c_str(), errmsg.c_str());
 929   return 0;
 930 }
 931
 932 static void fill_continuing_octets(int n_continuing, unsigned char *continuing_ptr,
 933                             size_t n_uc, const unsigned char* uc_str, int start_pos,
 934                             int uchar_pos)
 935 {
 936   for (int i = 0; i < n_continuing; i++) {
 937     if (start_pos + i < static_cast<int>(n_uc)) {
 938       unsigned char octet = uc_str[start_pos + i];
 939       if ((octet & 0xC0) != 0x80) {
 940         ERROR("decode_utf8(): Malformed: At character position %u, octet position %u: %02X is "
 941               "not a valid continuing octet.", uchar_pos, start_pos + i, octet);
 942         return;
 943       }
 944       continuing_ptr[i] = octet & 0x3F;
 945     }
 946     else {
 947       if (start_pos + i == static_cast<int>(n_uc)) {
 948         if (i > 0) {
 949     // only a part of octets is missing
 950           ERROR("decode_utf8(): Incomplete: At character position %d, octet position %d: %d out "
 951                 "of %d continuing octets %s missing from the end of the stream.",
 952                 uchar_pos, start_pos + i, n_continuing - i, n_continuing,
 953                 n_continuing - i > 1 ? "are" : "is");
 954           return;
 955         }
 956         else {
 957           // all octets are missing
 958           ERROR("decode_utf8(): Incomplete: At character position %d, octet position %d: %d "
 959                 "continuing octet%s missing from the end of the stream.", uchar_pos,
 960                 start_pos, n_continuing, n_continuing > 1 ? "s are" : " is");
 961           return;
 962         }
 963       }
 964       continuing_ptr[i] = 0;
 965     }
 966   }
 967 }
 968
 969 ustring decode_utf8(const string & ostr, CharCoding::CharCodingType expected_coding)
 970 {
 971   size_t length = ostr.size();
 972   if (0 == length) return ustring();
 973   if (length % 2) {
 974     ERROR("decode_utf8(): Wrong UTF-8 string. The number of nibbles (%d) in octetstring "
 975           "shall be divisible by 2", static_cast<int>(length));
 976     return ustring();
 977   }
 978
 979   unsigned char *uc_str = new unsigned char[length/2];
 980   for (size_t i = 0; i < length / 2; ++i) {
 981     uc_str[i] = str2uchar(ostr[2 * i], ostr[2 * i + 1]);
 982   }
 983   ustring ucstr;
 984   size_t start = check_BOM(CharCoding::UTF_8, length /2, uc_str);
 985
 986   for (size_t i = start; i < length / 2;) {
 987     // perform the decoding character by character
 988     if (uc_str[i] <= 0x7F) {
 989       // character encoded on a single octet: 0xxxxxxx (7 useful bits)
 990       unsigned char g = 0;
 991       unsigned char p = 0;
 992       unsigned char r = 0;
 993       unsigned char c = uc_str[i];
 994       ucstr += ustring(g, p, r, c);
 995       ++i;
 996     }
 997     else if (uc_str[i] <= 0xBF) {
 998       // continuing octet (10xxxxxx) without leading octet ==> malformed
 999       ERROR("decode_utf8(): Malformed: At character position %d, octet position %d: continuing "
1000              "octet %02X without leading octet.", static_cast<int>(ucstr.size()),
1001              static_cast<int>(i), uc_str[i]);
1002       goto dec_error;
1003     }
1004     else if (uc_str[i] <= 0xDF) {
1005       // character encoded on 2 octets: 110xxxxx 10xxxxxx (11 useful bits)
1006       unsigned char octets[2];
1007       octets[0] = uc_str[i] & 0x1F;
1008       fill_continuing_octets(1, octets + 1, length / 2, uc_str, i + 1, ucstr.size());
1009       unsigned char g = 0;
1010       unsigned char p = 0;
1011       unsigned char r = octets[0] >> 2;
1012       unsigned char c = octets[0] << 6 | octets[1];
1013       if (r == 0x00 && c < 0x80) {
1014         ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 2-octet "
1015               "encoding for quadruple (0, 0, 0, %u).", static_cast<int>(ucstr.size()),
1016               static_cast<int>(i), c);
1017         goto dec_error;
1018       }
1019       ucstr += ustring(g, p, r, c);
1020       i += 2;
1021     }
1022     else if (uc_str[i] <= 0xEF) {
1023       // character encoded on 3 octets: 1110xxxx 10xxxxxx 10xxxxxx
1024       // (16 useful bits)
1025       unsigned char octets[3];
1026       octets[0] = uc_str[i] & 0x0F;
1027       fill_continuing_octets(2, octets + 1, length / 2, uc_str, i + 1,ucstr.size());
1028       unsigned char g = 0;
1029       unsigned char p = 0;
1030       unsigned char r = octets[0] << 4 | octets[1] >> 2;
1031       unsigned char c = octets[1] << 6 | octets[2];
1032       if (r < 0x08) {
1033         ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 3-octet "
1034               "encoding for quadruple (0, 0, %u, %u).", static_cast<int>(ucstr.size()),
1035               static_cast<int>(i), r, c);
1036         goto dec_error;
1037       }
1038       ucstr += ustring(g, p, r, c);
1039       i += 3;
1040     }
1041     else if (uc_str[i] <= 0xF7) {
1042       // character encoded on 4 octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1043       // (21 useful bits)
1044       unsigned char octets[4];
1045       octets[0] = uc_str[i] & 0x07;
1046       fill_continuing_octets(3, octets + 1, length / 2, uc_str, i + 1, ucstr.size());
1047       unsigned char g = 0;
1048       unsigned char p = octets[0] << 2 | octets[1] >> 4;
1049       unsigned char r = octets[1] << 4 | octets[2] >> 2;
1050       unsigned char c = octets[2] << 6 | octets[3];
1051       if (p == 0x00) {
1052         ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 4-octet "
1053               "encoding for quadruple (0, 0, %u, %u).", static_cast<int>(ucstr.size()),
1054               static_cast<int>(i), r, c);
1055         goto dec_error;
1056       }
1057       ucstr += ustring(g, p, r, c);
1058       i += 4;
1059     }
1060     else if (uc_str[i] <= 0xFB) {
1061       // character encoded on 5 octets: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx
1062       // 10xxxxxx (26 useful bits)
1063       unsigned char octets[5];
1064       octets[0] = uc_str[i] & 0x03;
1065       fill_continuing_octets(4, octets + 1, length / 2, uc_str, i + 1, ucstr.size());
1066       unsigned char g = octets[0];
1067       unsigned char p = octets[1] << 2 | octets[2] >> 4;
1068       unsigned char r = octets[2] << 4 | octets[3] >> 2;
1069       unsigned char c = octets[3] << 6 | octets[4];
1070       if (g == 0x00 && p < 0x20) {
1071         ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 5-octet "
1072               "encoding for quadruple (0, %u, %u, %u).", static_cast<int>(ucstr.size()),
1073               static_cast<int>(i), p, r, c);
1074         goto dec_error;
1075       }
1076       ucstr += ustring(g, p, r, c);
1077       i += 5;
1078     }
1079     else if (uc_str[i] <= 0xFD) {
1080       // character encoded on 6 octets: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx
1081       // 10xxxxxx 10xxxxxx (31 useful bits)
1082       unsigned char octets[6];
1083       octets[0] = uc_str[i] & 0x01;
1084       fill_continuing_octets(5, octets + 1, length / 2, uc_str, i + 1,ucstr.size());
1085       unsigned char g = octets[0] << 6 | octets[1];
1086       unsigned char p = octets[2] << 2 | octets[3] >> 4;
1087       unsigned char r = octets[3] << 4 | octets[4] >> 2;
1088       unsigned char c = octets[4] << 6 | octets[5];
1089       if (g < 0x04) {
1090         ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 6-octet "
1091               "encoding for quadruple (%u, %u, %u, %u).", static_cast<int>(ucstr.size()),
1092                     static_cast<int>(i), g, p, r, c);
1093         goto dec_error;
1094       }
1095       ucstr += ustring(g, p, r, c);
1096       i += 6;
1097     }
1098     else {
1099       // not used code points: FE and FF => malformed
1100       ERROR("decode_utf8(): Malformed: At character position %d, octet position %d: "
1101             "unused/reserved octet %02X.", static_cast<int>(ucstr.size()),
1102             static_cast<int>(i), uc_str[i]);
1103       goto dec_error;
1104     }
1105   }
1106
1107   dec_error:
1108   delete[] uc_str;
1109   return ucstr;
1110 }
1111
1112 }