1 /******************************************************************************
2 * Copyright (c) 2000-2016 Ericsson Telecom AB
3 * All rights reserved. This program and the accompanying materials
4 * are made available under the terms of the Eclipse Public License v1.0
5 * which accompanies this distribution, and is available at
6 * http://www.eclipse.org/legal/epl-v10.html
14 * Zalanyi, Balazs Andor
16 ******************************************************************************/
17 #include "PredefFunc.hh"
24 #include "CompilerError.hh"
26 #include <sys/types.h>
29 #include "../common/memory.h"
30 #include "../common/pattern.hh"
34 #define ERRMSG_BUFSIZE 512
// Byte order marks spelled as upper-case hexadecimal digit strings.
// A string literal supplies the terminating NUL itself, so sizeof(x) - 1
// is the number of hex digits in each mark.
static const char utf32be[] = "0000FEFF";
static const char utf32le[] = "FFFE0000";
static const char utf16be[] = "FEFF";
static const char utf16le[] = "FFFE";
static const char utf8[]    = "EFBBBF";
44 static inline unsigned char get_bit_value(char c
, unsigned char bit_value
)
52 FATAL_ERROR("Invalid binary digit (%c) in bitstring value", c
);
57 char toupper (const char c
)
59 if (('A' <= c
&& 'F' >= c
) ||
60 ('0' <= c
&& '9' >= c
)) return c
;
63 case 'a' : return 'A';
64 case 'b' : return 'B';
65 case 'c' : return 'C';
66 case 'd' : return 'D';
67 case 'e' : return 'E';
68 case 'f' : return 'F';
70 FATAL_ERROR("%c cannot be converted to hex character", c
);
75 char hexdigit_to_char(unsigned char hexdigit
)
77 if (hexdigit
< 10) return '0' + hexdigit
;
78 else if (hexdigit
< 16) return 'A' + hexdigit
- 10;
80 FATAL_ERROR("hexdigit_to_char(): invalid argument: %d", hexdigit
);
81 return '\0'; // to avoid warning
85 unsigned char char_to_hexdigit(char c
)
87 if (c
>= '0' && c
<= '9') return c
- '0';
88 else if (c
>= 'A' && c
<= 'F') return c
- 'A' + 10;
89 else if (c
>= 'a' && c
<= 'f') return c
- 'a' + 10;
91 FATAL_ERROR("char_to_hexdigit(): invalid argument: %c", c
);
92 return 0; // to avoid warning
96 string
uchar2str(unsigned char uchar
)
99 str
[0] = hexdigit_to_char(uchar
/ 16);
100 str
[1] = hexdigit_to_char(uchar
% 16);
101 return string(2, str
);
104 unsigned char str2uchar(const char& c1
, const char& c2
)
106 unsigned char uc
= 0;
107 uc
= char_to_hexdigit(c1
);
109 uc
+= char_to_hexdigit(c2
);
113 int_val_t
rem(const int_val_t
& left
, const int_val_t
& right
)
115 return (left
- right
* (left
/ right
));
118 int_val_t
mod(const int_val_t
& left
, const int_val_t
& right
)
120 int_val_t r
= right
< 0 ? -right
: right
;
124 int_val_t result
= rem(left
, r
);
125 return result
== 0 ? result
: result
+ r
;
129 string
* to_uppercase(const string
& value
)
131 string
*s
= new string(value
);
132 for (size_t i
= 0; i
< s
->size(); i
++) {
134 if (c
>= 'a' && c
<= 'z') c
= c
- 'a' + 'A';
139 string
* not4b_bit(const string
& bstr
)
141 string
*s
=new string(bstr
);
142 for(size_t i
=0; i
<s
->size(); i
++) {
145 case '0': c
='1'; break;
146 case '1': c
='0'; break;
148 FATAL_ERROR("not4b_bit(): Invalid char in bitstring.");
154 string
* not4b_hex(const string
& hstr
)
156 string
*s
=new string(hstr
);
157 for(size_t i
=0; i
<s
->size(); i
++) {
160 case '0': c
='F'; break;
161 case '1': c
='E'; break;
162 case '2': c
='D'; break;
163 case '3': c
='C'; break;
164 case '4': c
='B'; break;
165 case '5': c
='A'; break;
166 case '6': c
='9'; break;
167 case '7': c
='8'; break;
168 case '8': c
='7'; break;
169 case '9': c
='6'; break;
170 case 'A': c
='5'; break;
171 case 'B': c
='4'; break;
172 case 'C': c
='3'; break;
173 case 'D': c
='2'; break;
174 case 'E': c
='1'; break;
175 case 'F': c
='0'; break;
176 case 'a': c
='5'; break;
177 case 'b': c
='4'; break;
178 case 'c': c
='3'; break;
179 case 'd': c
='2'; break;
180 case 'e': c
='1'; break;
181 case 'f': c
='0'; break;
183 FATAL_ERROR("not4b_hex(): Invalid char in hexstring.");
189 string
* and4b(const string
& left
, const string
& right
)
191 string
*s
=new string(left
);
192 for(size_t i
=0; i
<s
->size(); i
++) {
194 c
=hexdigit_to_char(char_to_hexdigit(c
) & char_to_hexdigit(right
[i
]));
199 string
* or4b(const string
& left
, const string
& right
)
201 string
*s
=new string(left
);
202 for(size_t i
=0; i
<s
->size(); i
++) {
204 c
=hexdigit_to_char(char_to_hexdigit(c
) | char_to_hexdigit(right
[i
]));
209 string
* xor4b(const string
& left
, const string
& right
)
211 string
*s
=new string(left
);
212 for(size_t i
=0; i
<s
->size(); i
++) {
214 c
=hexdigit_to_char(char_to_hexdigit(c
) ^ char_to_hexdigit(right
[i
]));
219 string
* shift_left(const string
& value
, const Int
& count
)
222 string
*s
= new string
;
223 if (count
< static_cast<Int
>(value
.size())) *s
= value
.substr(count
);
224 s
->resize(value
.size(), '0');
226 } else if (count
< 0) return shift_right(value
, -count
);
227 else return new string(value
);
230 string
* shift_right(const string
& value
, const Int
& count
)
233 string
*s
= new string
;
234 if (count
< static_cast<Int
>(value
.size())) {
235 s
->resize(count
, '0');
236 *s
+= value
.substr(0, value
.size()-count
);
237 } else s
->resize(value
.size(), '0');
239 } else if (count
< 0) return shift_left(value
, -count
);
240 else return new string(value
);
243 string
* rotate_left(const string
& value
, const Int
& p_count
)
245 size_t size
= value
.size();
246 if (size
== 0) return new string(value
);
247 else if (p_count
< 0) return rotate_right(value
, -p_count
);
248 size_t count
= p_count
% size
;
249 if (count
== 0) return new string(value
);
250 else return new string(value
.substr(count
) + value
.substr(0, count
));
253 string
* rotate_right(const string
& value
, const Int
& p_count
)
255 size_t size
= value
.size();
256 if (size
== 0) return new string(value
);
257 else if (p_count
< 0) return rotate_left(value
, -p_count
);
258 size_t count
= p_count
% size
;
259 if (count
== 0) return new string(value
);
260 else return new string(value
.substr(size
- count
) +
261 value
.substr(0, size
- count
));
265 ustring
* rotate_left(const ustring
& value
, const Int
& p_count
)
267 size_t size
= value
.size();
268 if (size
== 0) return new ustring(value
);
269 else if (p_count
< 0) return rotate_right(value
, -p_count
);
270 size_t count
= p_count
% size
;
271 if (count
== 0) return new ustring(value
);
272 else return new ustring(value
.substr(count
) + value
.substr(0, count
));
275 ustring
* rotate_right(const ustring
& value
, const Int
& p_count
)
277 size_t size
= value
.size();
278 if (size
== 0) return new ustring(value
);
279 else if (p_count
< 0) return rotate_left(value
, -p_count
);
280 size_t count
= p_count
% size
;
281 if (count
== 0) return new ustring(value
);
282 else return new ustring(value
.substr(size
- count
) +
283 value
.substr(0, size
- count
));
286 int_val_t
* bit2int(const string
& bstr
)
288 size_t nof_bits
= bstr
.size();
289 // skip the leading zeros
290 size_t start_index
= 0;
291 while (start_index
< nof_bits
&& bstr
[start_index
] == '0') start_index
++;
292 int_val_t
*ret_val
= new int_val_t((Int
)0);
293 for (size_t i
= start_index
; i
< nof_bits
; i
++) {
295 if (bstr
[i
] == '1') *ret_val
+= 1;
300 int_val_t
* hex2int(const string
& hstr
)
302 size_t nof_digits
= hstr
.size();
303 size_t start_index
= 0;
304 // Skip the leading zeros.
305 while (start_index
< nof_digits
&& hstr
[start_index
] == '0')
307 int_val_t
*ret_val
= new int_val_t((Int
)0);
308 for (size_t i
= start_index
; i
< nof_digits
; i
++) {
310 *ret_val
+= char_to_hexdigit(hstr
[i
]);
315 Int
unichar2int(const ustring
& ustr
)
317 if (ustr
.size() != 1) FATAL_ERROR("unichar2int(): invalid argument");
318 const ustring::universal_char
& uchar
= ustr
.u_str()[0];
319 Int ret_val
= (uchar
.group
<< 24) | (uchar
.plane
<< 16) | (uchar
.row
<< 8) |
324 string
*int2bit(const int_val_t
& value
, const Int
& length
)
326 if (length
< 0) FATAL_ERROR("int2bit(): negative length");
327 size_t string_length
= static_cast<size_t>(length
);
328 if (static_cast<Int
>(string_length
) != length
||
329 string_length
> string::max_string_len
)
330 FATAL_ERROR("int2bit(): length is too large");
331 if (value
< 0) FATAL_ERROR("int2bit(): negative value");
332 string
*bstr
= new string
;
333 bstr
->resize(string_length
);
334 int_val_t tmp_value
= value
;
335 for (size_t i
= 1; i
<= string_length
; i
++) {
336 (*bstr
)[string_length
- i
] = (tmp_value
& 1).get_val() ? '1' : '0';
340 FATAL_ERROR("int2bit(): %s does not fit in %lu bits", \
341 value
.t_str().c_str(), (unsigned long)string_length
);
// Upper-case hexadecimal digits indexed by their numeric value (0..15).
// Exactly 16 entries: the array carries no terminating NUL.
static const char hdigits[16] = {
  '0', '1', '2', '3',
  '4', '5', '6', '7',
  '8', '9', 'A', 'B',
  'C', 'D', 'E', 'F'
};
348 string
*int2hex(const int_val_t
& value
, const Int
& length
)
351 FATAL_ERROR("int2hex(): negative length");
352 size_t string_length
= static_cast<size_t>(length
);
353 if (static_cast<Int
>(string_length
) != length
||
354 string_length
> string::max_string_len
)
355 FATAL_ERROR("int2hex(): length is too large");
356 if (value
< 0) FATAL_ERROR("int2hex(): negative value");
357 string
*hstr
= new string
;
358 hstr
->resize(string_length
);
359 int_val_t tmp_value
= value
;
360 for (size_t i
= 1; i
<= string_length
; i
++) {
361 (*hstr
)[string_length
- i
] = hdigits
[(tmp_value
& 0x0f).get_val()];
364 if (tmp_value
!= 0) {
365 FATAL_ERROR("int2hex(): %s does not fit in %lu hexadecimal digits",
366 value
.t_str().c_str(), (unsigned long)string_length
);
371 ustring
*int2unichar(const Int
& value
)
373 if (value
< 0 || value
> 2147483647)
374 FATAL_ERROR("int2unichar(): invalid argument");
375 unsigned char group
= (value
>> 24) & 0xFF,
376 plane
= (value
>> 16) & 0xFF,
377 row
= (value
>> 8) & 0xFF,
379 return new ustring(group
, plane
, row
, cell
);
382 string
*oct2char(const string
& ostr
)
384 string
*cstr
= new string
;
385 size_t ostr_size
= ostr
.size();
387 FATAL_ERROR("oct2char(): argument has odd length: %lu",
388 (unsigned long) ostr_size
);
389 size_t cstr_size
= ostr_size
/ 2;
390 cstr
->resize(cstr_size
);
391 const char *ostr_ptr
= ostr
.c_str();
392 for (size_t i
= 0; i
< cstr_size
; i
++) {
393 unsigned char c
= 16 * char_to_hexdigit(ostr_ptr
[2 * i
]) +
394 char_to_hexdigit(ostr_ptr
[2 * i
+ 1]);
395 if (c
> 127) FATAL_ERROR("oct2char(): resulting charstring contains " \
396 "non-ascii character: %d", c
);
402 string
*char2oct(const string
& cstr
)
404 string
*ostr
= new string
;
405 size_t cstr_size
= cstr
.size();
406 ostr
->resize(cstr_size
* 2, '0');
407 const char *cstr_ptr
= cstr
.c_str();
408 for (size_t i
= 0; i
< cstr_size
; i
++) {
409 unsigned char c
= cstr_ptr
[i
];
410 (*ostr
)[2 * i
] = hexdigit_to_char(c
/ 16);
411 (*ostr
)[2 * i
+ 1] = hexdigit_to_char(c
% 16);
416 string
*bit2hex(const string
& bstr
)
418 size_t size
=bstr
.size();
419 size_t hsize
=(size
+3)/4;
420 string
*hstr
= new string
;
424 bstr4
->resize(hsize
*4,'0');
425 bstr4
->replace(4-(size
%4),size
,bstr
);
427 hstr
->resize(hsize
,'0');
429 for(size_t i
=0;i
<hsize
;i
++) {
431 if(size
%4)b4
=bstr4
->substr(i
*4,4);
432 else b4
=bstr
.substr(i
*4,4);
433 if(b4
[0]=='1')u
=8;else u
=0;
437 (*hstr
)[i
]=hdigits
[u
];
439 if(bstr4
!=NULL
)delete bstr4
;
443 string
*hex2oct(const string
& hstr
)
445 if(hstr
.size()%2==0)return new string(hstr
);
447 string
*ostr
=new string("0");
453 string
*asn_hex2oct(const string
& hstr
)
455 string
*ostr
= new string(hstr
);
456 size_t size
= ostr
->size();
457 if (size
% 2) ostr
->resize(size
+ 1, '0');
// bit2oct(): takes a string by const reference (bstr) and returns a pointer
// to string.
// NOTE(review): the body of this function is not present in this listing;
// presumably it converts a bitstring to an octetstring - confirm against the
// full file, including who owns the returned object.
461 string
*bit2oct(const string
& bstr
)
470 string
*asn_bit2oct(const string
& bstr
)
472 size_t size
= bstr
.size();
473 string
*ostr
= new string
;
474 ostr
->resize(((size
+7)/8)*2);
475 for(size_t i
=0, j
=0; i
<size
; ) {
476 unsigned char digit1
=0, digit2
=0;
477 digit1
+= get_bit_value(bstr
[i
++], 8);
479 digit1
+= get_bit_value(bstr
[i
++], 4);
481 digit1
+= get_bit_value(bstr
[i
++], 2);
483 digit1
+= get_bit_value(bstr
[i
++], 1);
485 digit2
+= get_bit_value(bstr
[i
++], 8);
487 digit2
+= get_bit_value(bstr
[i
++], 4);
489 digit2
+= get_bit_value(bstr
[i
++], 2);
490 if (i
< size
) digit2
+= get_bit_value(bstr
[i
++], 1);
497 (*ostr
)[j
++] = hexdigit_to_char(digit1
);
498 (*ostr
)[j
++] = hexdigit_to_char(digit2
);
503 string
*hex2bit(const string
& hstr
)
505 size_t size
=hstr
.size();
506 string
*bstr
= new string
;
507 bstr
->resize(4*size
);
508 for(size_t i
=0; i
<size
; i
++) {
511 bstr
->replace(4*i
, 4, "0000");
514 bstr
->replace(4*i
, 4, "0001");
517 bstr
->replace(4*i
, 4, "0010");
520 bstr
->replace(4*i
, 4, "0011");
523 bstr
->replace(4*i
, 4, "0100");
526 bstr
->replace(4*i
, 4, "0101");
529 bstr
->replace(4*i
, 4, "0110");
532 bstr
->replace(4*i
, 4, "0111");
535 bstr
->replace(4*i
, 4, "1000");
538 bstr
->replace(4*i
, 4, "1001");
542 bstr
->replace(4*i
, 4, "1010");
546 bstr
->replace(4*i
, 4, "1011");
550 bstr
->replace(4*i
, 4, "1100");
554 bstr
->replace(4*i
, 4, "1101");
558 bstr
->replace(4*i
, 4, "1110");
562 bstr
->replace(4*i
, 4, "1111");
565 FATAL_ERROR("Common::hex2bit(): invalid hexadecimal "
566 "digit in hexstring value");
572 int_val_t
* float2int(const Real
& value
, const Location
& loc
)
574 // We shouldn't mimic generality with `Int'.
575 if (value
>= (Real
)LLONG_MIN
&& value
<= (Real
)LLONG_MAX
)
576 return new int_val_t((Int
)value
);
578 snprintf(buf
, 511, "%f", value
);
579 char *dot
= strchr(buf
, '.');
580 if (!dot
) FATAL_ERROR("Conversion of float value `%f' to integer failed", value
);
581 else memset(dot
, 0, sizeof(buf
) - (dot
- buf
));
582 return new int_val_t(buf
, loc
);
585 /* TTCN-3 float values that have absolute value smaller than this are
586 displayed in exponential notation. Same as in core/Float.hh */
587 #ifndef MIN_DECIMAL_FLOAT
588 #define MIN_DECIMAL_FLOAT 1.0E-4
590 /* TTCN-3 float values that have absolute value larger or equal than
591 this are displayed in exponential notation. Same as in
593 #ifndef MAX_DECIMAL_FLOAT
594 #define MAX_DECIMAL_FLOAT 1.0E+10
// float2str(): formats a float into str_buf with snprintf and returns a newly
// allocated string built from that buffer.
// NOTE(review): the declaration of str_buf and the end of the condition below
// are not present in this listing; the 64 passed to snprintf suggests a
// 64-byte buffer - confirm.
597 string
*float2str(const Real
& value
)
// Magnitudes in [MIN_DECIMAL_FLOAT, MAX_DECIMAL_FLOAT), on either side of
// zero, use the fixed-point "%f" form ...
600 if ( (value
> -MAX_DECIMAL_FLOAT
&& value
<= -MIN_DECIMAL_FLOAT
)
601 || (value
>= MIN_DECIMAL_FLOAT
&& value
< MAX_DECIMAL_FLOAT
)
603 snprintf(str_buf
,64,"%f",value
);
// ... everything that fails the condition uses the exponential "%e" form.
604 else snprintf(str_buf
,64,"%e",value
);
605 return new string(str_buf
);
// regexp() for charstrings: converts the TTCN-3 pattern in expression to a
// POSIX extended regular expression, runs it against instr and returns a
// newly allocated string holding the text matched by the selected group, or
// an empty string.
// NOTE(review): several lines of this function (the rest of the parameter
// list, the declaration of retval, the release of posix_str and pmatch, the
// final return) are not present in this listing - confirm against the full
// file.
608 string
* regexp(const string
& instr
, const string
& expression
,
614 FATAL_ERROR("regexp(): groupno must be a non-negative integer");
617 // do not report the warnings again
618 // they were already reported while checking the operands
// Bits 1 and 2 of verb_level are cleared only for the duration of the
// pattern conversion; the saved value is restored right after it.
619 unsigned orig_verb_level
= verb_level
;
620 verb_level
&= ~(1|2);
621 char *posix_str
=TTCN_pattern_to_regexp(expression
.c_str());
622 verb_level
= orig_verb_level
;
623 if(posix_str
==NULL
) {
624 FATAL_ERROR("regexp(): Cannot convert pattern `%s' to POSIX-equivalent.",
// Compile the converted pattern as a POSIX extended regular expression.
629 regex_t posix_regexp
;
630 int ret_val
=regcomp(&posix_regexp
, posix_str
, REG_EXTENDED
);
// Compilation failure: fetch the library's message and abort.
634 char msg
[ERRMSG_BUFSIZE
];
635 regerror(ret_val
, &posix_regexp
, msg
, sizeof(msg
));
636 FATAL_ERROR("regexp(): regcomp() failed: %s", msg
);
// The requested group is looked up at index groupno+1 of the match array;
// it must not exceed the number of sub-expressions the compiled regexp has.
640 size_t nmatch
=groupno
+1;
641 if(nmatch
>posix_regexp
.re_nsub
) {
642 FATAL_ERROR("regexp(): requested groupno is %lu, but this expression "
643 "contains only %lu group(s).", (unsigned long) (nmatch
- 1),
644 (unsigned long) posix_regexp
.re_nsub
);
// Room for nmatch+1 entries so that pmatch[nmatch] is valid.
// NOTE(review): the matching Free(pmatch) is not visible here - confirm.
647 regmatch_t
* pmatch
=(regmatch_t
*)Malloc((nmatch
+1)*sizeof(regmatch_t
));
648 ret_val
=regexec(&posix_regexp
, instr
.c_str(), nmatch
+1, pmatch
, 0);
// A group that did not take part in the match has offsets of -1; in that
// case the result is an empty string.
650 if(pmatch
[nmatch
].rm_so
!= -1 && pmatch
[nmatch
].rm_eo
!= -1)
651 retval
= new string(instr
.substr(pmatch
[nmatch
].rm_so
,
652 pmatch
[nmatch
].rm_eo
- pmatch
[nmatch
].rm_so
));
653 else retval
=new string();
// No match at all: release the compiled regexp.
657 if(ret_val
==REG_NOMATCH
) {
658 regfree(&posix_regexp
);
// Any other regexec() failure is fatal.
663 char msg
[ERRMSG_BUFSIZE
];
664 regerror(ret_val
, &posix_regexp
, msg
, sizeof(msg
));
665 FATAL_ERROR("regexp(): regexec() failed: %s", msg
);
668 else regfree(&posix_regexp
);
// regexp() for universal charstrings: converts the TTCN-3 pattern in
// expression to a POSIX extended regular expression, runs it against the
// regexp form of instr and returns a newly allocated ustring holding the text
// matched by the selected group, or an empty ustring.
// NOTE(review): several lines of this function (the rest of the parameter
// list, the declarations of retval and user_groups, the release of tmp,
// posix_str, user_groups and pmatch, the final return) are not present in
// this listing - confirm against the full file.
673 ustring
* regexp(const ustring
& instr
, const ustring
& expression
,
679 FATAL_ERROR("regexp(): groupno must be a non-negative integer");
682 // do not report the warnings again
683 // they were already reported while checking the operands
// Bits 1 and 2 of verb_level are cleared only for the duration of the
// pattern conversion; the saved value is restored after it.
684 unsigned orig_verb_level
= verb_level
;
685 verb_level
&= ~(1|2);
// The conversion also fills user_groups; element 0 is later reported as the
// number of groups and element groupno+1 is used as a regexp group index.
687 char *posix_str
= TTCN_pattern_to_regexp_uni(
688 expression
.get_stringRepr_for_pattern().c_str(), &user_groups
);
689 if (user_groups
== 0)
690 FATAL_ERROR("regexp(): Cannot find any groups in the second argument.");
691 verb_level
= orig_verb_level
;
692 if(posix_str
==NULL
) {
693 FATAL_ERROR("regexp(): Cannot convert pattern `%s' to POSIX-equivalent.",
694 expression
.get_stringRepr().c_str());
// Compile the converted pattern as a POSIX extended regular expression.
698 regex_t posix_regexp
;
699 int ret_val
=regcomp(&posix_regexp
, posix_str
, REG_EXTENDED
);
// Compilation failure: fetch the library's message and abort.
703 char msg
[ERRMSG_BUFSIZE
];
704 regerror(ret_val
, &posix_regexp
, msg
, sizeof(msg
));
705 FATAL_ERROR("regexp(): regcomp() failed: %s", msg
);
// NOTE(review): user_groups[groupno+1] is read before groupno is compared
// with anything; no check against user_groups[0] is visible ahead of this
// line - confirm that an oversized groupno cannot index past the array.
709 size_t nmatch
=user_groups
[groupno
+1]+1;
710 if(nmatch
>posix_regexp
.re_nsub
) {
711 FATAL_ERROR("regexp(): requested groupno is %lu, but this expression "
712 "contains only %lu group(s).", (unsigned long) (groupno
),
713 (unsigned long) user_groups
[0]);
// Room for nmatch+1 entries so that pmatch[nmatch] is valid.
// NOTE(review): the matching Free(pmatch) is not visible here - confirm.
719 regmatch_t
* pmatch
= (regmatch_t
*)Malloc((nmatch
+1)*sizeof(regmatch_t
));
// The subject is matched in its regexp form, copied into a string.
720 char* tmp
= instr
.convert_to_regexp_form();
721 string
instr_conv(tmp
);
723 ret_val
= regexec(&posix_regexp
, instr_conv
.c_str(), nmatch
+1, pmatch
, 0);
// A group that did not take part in the match has offsets of -1; in that
// case the result is an empty ustring.  Otherwise the matched slice is
// converted back with convert_stringRepr_for_pattern().
725 if(pmatch
[nmatch
].rm_so
!= -1 && pmatch
[nmatch
].rm_eo
!= -1) {
726 retval
= new ustring(
727 instr_conv
.substr(pmatch
[nmatch
].rm_so
,
728 pmatch
[nmatch
].rm_eo
- pmatch
[nmatch
].rm_so
)
729 .convert_stringRepr_for_pattern());
730 } else { retval
= new ustring(); }
// No match at all: release the compiled regexp and return an empty ustring.
734 if(ret_val
==REG_NOMATCH
) {
735 regfree(&posix_regexp
);
736 retval
=new ustring();
// Any other regexec() failure is fatal.
740 char msg
[ERRMSG_BUFSIZE
];
741 regerror(ret_val
, &posix_regexp
, msg
, sizeof(msg
));
742 FATAL_ERROR("regexp(): regexec() failed: %s", msg
);
745 else regfree(&posix_regexp
);
750 string
* remove_bom(const string
& encoded_value
)
752 size_t length
= encoded_value
.size();
753 if (0 == length
) return new string();
754 if (length
% 2 || 0 > length
) {
755 ERROR("remove_bom(): Wrong string. The number of nibbles (%d) in string "
756 "shall be divisible by 2", static_cast<int>(length
));
757 return new string(encoded_value
);
760 int length_of_BOM
= 0;
761 string
str_uppercase(encoded_value
);
762 size_t enough
= length
> sizeof(utf32be
)-1 ? sizeof(utf32be
)-1 : length
;
763 for (size_t i
= 0; i
< enough
; ++i
) {
764 str_uppercase
[i
] = toupper(encoded_value
[i
]);
767 if (str_uppercase
.find(utf32be
, 0) < length
) length_of_BOM
= sizeof(utf32be
)-1;
768 else if (str_uppercase
.find(utf32le
, 0) < length
) length_of_BOM
= sizeof(utf32le
)-1;
769 else if (str_uppercase
.find(utf16be
, 0) < length
) length_of_BOM
= sizeof(utf16be
)-1;
770 else if (str_uppercase
.find(utf16le
, 0) < length
) length_of_BOM
= sizeof(utf16le
)-1;
771 else if (str_uppercase
.find(utf8
, 0) < length
) length_of_BOM
= sizeof(utf8
)-1;
772 else return new string(encoded_value
); // no BOM found
774 return new string(encoded_value
.substr(length_of_BOM
, length
));
777 static CharCoding::CharCodingType
is_ascii (size_t length
, const unsigned char* strptr
)
779 const unsigned char nonASCII
= 1 << 7;// MSB is 1 in case of non ASCII character
780 CharCoding::CharCodingType ret
= CharCoding::ASCII
;
781 for (size_t i
= 0; i
< length
; ++i
) {
782 if ( strptr
[i
] & nonASCII
) {
783 ret
= CharCoding::UNKNOWN
;
790 static CharCoding::CharCodingType
is_utf8(size_t length
, const unsigned char* strptr
)
792 const char MSB
= 1 << 7; // MSB is 1 in case of non ASCII character
793 const char MSBmin1
= 1 << 6; // 0100 0000
796 if ( strptr
[i
] & MSB
) { // non ASCII char
797 char maskUTF8
= 1 << 6; // 111x xxxx shows how many additional bytes are there
798 if (!(strptr
[i
] & maskUTF8
)) return CharCoding::UNKNOWN
; // accepted 11xxx xxxx but received 10xx xxxx
799 unsigned int noofUTF8
= 0; // 11xx xxxxx -> 2 bytes, 111x xxxxx -> 3 bytes , 1111 xxxxx -> 4 bytes in UTF-8
800 while (strptr
[i
] & maskUTF8
) {
802 maskUTF8
>>= 1; // shift right the mask
804 // the second and third (and so on) UTF-8 byte looks like 10xx xxxx
805 while (0 < noofUTF8
) {
807 if (!(strptr
[i
] & MSB
) || (strptr
[i
] & MSBmin1
) || i
>= length
) { // if not like this: 10xx xxxx
808 return CharCoding::UNKNOWN
;
815 return CharCoding::UTF_8
;
818 string
* get_stringencoding(const string
& encoded_value
)
820 size_t length
= encoded_value
.size();
821 if (0 == length
) return new string("<unknown>");
822 if (length
% 2 || 0 > length
) {
823 ERROR("get_stringencoding(): Wrong string. The number of nibbles (%d) in string "
824 "shall be divisible by 2", static_cast<int>(length
));
825 return new string("<unknown>");
828 string
str_uppercase(encoded_value
);
829 size_t enough
= length
> sizeof(utf32be
)-1 ? sizeof(utf32be
)-1 : length
;
830 for (size_t i
= 0; i
< enough
; ++i
) {
831 str_uppercase
[i
] = toupper(encoded_value
[i
]);
834 if (str_uppercase
.find(utf32be
, 0) < length
) return new string("UTF-32BE");
835 else if (str_uppercase
.find(utf32le
, 0) < length
) return new string("UTF-32LE");
836 else if (str_uppercase
.find(utf16be
, 0) < length
) return new string("UTF-16BE");
837 else if (str_uppercase
.find(utf16le
, 0) < length
) return new string("UTF-16LE");
838 else if (str_uppercase
.find(utf8
, 0) < length
) return new string("UTF-8");
840 unsigned char *uc_str
= new unsigned char[length
/2];
842 for (size_t i
= 0; i
< length
/ 2; ++i
) {
843 uc_str
[i
] = str2uchar(encoded_value
[2 * i
], encoded_value
[2 * i
+ 1]);
845 if (is_ascii (length
/ 2, uc_str
) == CharCoding::ASCII
) ret
= "ASCII";
846 else if (CharCoding::UTF_8
== is_utf8 (length
/ 2, uc_str
)) ret
= "UTF-8";
847 else ret
= "<unknown>";
850 return new string(ret
);
// check_BOM(): inspects the first octets of uc_str (n_uc octets long) for the
// byte order mark that belongs to expected_coding and reports problems
// through ERROR.  An empty input yields 0.
// NOTE(review): the return statements of the individual branches, the
// length comparisons and the declarations of caller, errmsg and badBOM are
// not present in this listing - confirm the returned values against the
// full file.
853 static size_t check_BOM(CharCoding::CharCodingType expected_coding
, size_t n_uc
, unsigned char* uc_str
)
855 if (0 == n_uc
) return 0;
// First pass: the input must be long enough to hold the expected mark.
// NOTE(review): only UTF-32 and UTF-16 cases are visible in this switch;
// whether the UTF-8 case has a length guard cannot be seen here - the UTF-8
// comparison further down reads uc_str[0..2], confirm it is protected.
857 switch (expected_coding
) {
858 case CharCoding::UTF32
:
859 case CharCoding::UTF32BE
:
860 case CharCoding::UTF32LE
:
862 ERROR("decode_utf32(): The string is shorter than the expected BOM");
866 case CharCoding::UTF16
:
867 case CharCoding::UTF16BE
:
868 case CharCoding::UTF16LE
:
870 ERROR("decode_utf16(): The string is shorter than the expected BOM");
877 //BOM indicates that the byte order is determined by a byte order mark,
878 //if present at the beginning the length of BOM is returned.
// Second pass: compare the leading octets with the mark of each coding.
882 switch (expected_coding
) {
883 case CharCoding::UTF32BE
:
884 case CharCoding::UTF32
:
// UTF-32 big endian mark: 00 00 FE FF
885 if (0x00 == uc_str
[0] && 0x00 == uc_str
[1] && 0xFE == uc_str
[2] && 0xFF == uc_str
[3])
888 caller
= "decode_utf32()";
891 case CharCoding::UTF32LE
:
// UTF-32 little endian mark: FF FE 00 00
892 if (0xFF == uc_str
[0] && 0xFE == uc_str
[1] && 0x00 == uc_str
[2] && 0x00 == uc_str
[3])
895 caller
= "decode_utf32()";
898 case CharCoding::UTF16BE
:
899 case CharCoding::UTF16
:
// UTF-16 big endian mark: FE FF
900 if (0xFE == uc_str
[0] && 0xFF == uc_str
[1])
903 caller
= "decode_utf16()";
906 case CharCoding::UTF16LE
:
// UTF-16 little endian mark: FF FE
907 if (0xFF == uc_str
[0] && 0xFE == uc_str
[1])
910 caller
= "decode_utf16()";
913 case CharCoding::UTF_8
:
// UTF-8 mark: EF BB BF
914 if (0xEF == uc_str
[0] && 0xBB == uc_str
[1] && 0xBF == uc_str
[2])
// The endianness-neutral codings UTF32 and UTF16 get a dedicated message
// because, per the message text, they rely on the mark for the byte order.
918 if (CharCoding::UTF32
== expected_coding
|| CharCoding::UTF16
== expected_coding
) {
919 const char* str
= CharCoding::UTF32
== expected_coding
? "UTF-32" : "UTF-16";
920 ERROR("Wrong %s string. No BOM detected, however the given coding type (%s) "
921 "expects it to define the endianness", str
, str
);
924 ERROR("Wrong string. No BOM detected");
927 if (badBOM
) ERROR("%s: Wrong %s string. The expected coding could not be verified",
928 caller
.c_str(), errmsg
.c_str());
// fill_continuing_octets(): copies the payload (low six bits) of up to
// n_continuing UTF-8 continuing octets, starting at uc_str[start_pos], into
// continuing_ptr.  Octets that are not of the form 10xx xxxx and octets
// missing from the end of the n_uc-octet input are reported through ERROR;
// positions beyond the end of the input are stored as 0.
// NOTE(review): the last parameter (uchar_pos is used below), the branch
// structure around the two "Incomplete" messages and whatever follows the
// "Malformed" report are not present in this listing - confirm against the
// full file.
932 static void fill_continuing_octets(int n_continuing
, unsigned char *continuing_ptr
,
933 size_t n_uc
, const unsigned char* uc_str
, int start_pos
,
936 for (int i
= 0; i
< n_continuing
; i
++) {
// Still inside the input: validate and store the octet.
937 if (start_pos
+ i
< static_cast<int>(n_uc
)) {
938 unsigned char octet
= uc_str
[start_pos
+ i
];
// A continuing octet has its two top bits equal to 10.
// NOTE(review): this message uses %u while start_pos + i is an int and the
// sibling messages below use %d for the same values - confirm the intended
// conversion specifiers.
939 if ((octet
& 0xC0) != 0x80) {
940 ERROR("decode_utf8(): Malformed: At character position %u, octet position %u: %02X is "
941 "not a valid continuing octet.", uchar_pos
, start_pos
+ i
, octet
);
// Keep only the six payload bits.
944 continuing_ptr
[i
] = octet
& 0x3F;
// Exactly at the end of the input: report the shortage.
947 if (start_pos
+ i
== static_cast<int>(n_uc
)) {
949 // only a part of octets is missing
950 ERROR("decode_utf8(): Incomplete: At character position %d, octet position %d: %d out "
951 "of %d continuing octets %s missing from the end of the stream.",
952 uchar_pos
, start_pos
+ i
, n_continuing
- i
, n_continuing
,
953 n_continuing
- i
> 1 ? "are" : "is");
957 // all octets are missing
958 ERROR("decode_utf8(): Incomplete: At character position %d, octet position %d: %d "
959 "continuing octet%s missing from the end of the stream.", uchar_pos
,
960 start_pos
, n_continuing
, n_continuing
> 1 ? "s are" : " is");
// A missing octet is stored as zero payload.
964 continuing_ptr
[i
] = 0;
969 ustring
decode_utf8(const string
& ostr
, CharCoding::CharCodingType expected_coding
)
971 size_t length
= ostr
.size();
972 if (0 == length
) return ustring();
974 ERROR("decode_utf8(): Wrong UTF-8 string. The number of nibbles (%d) in octetstring "
975 "shall be divisible by 2", static_cast<int>(length
));
979 unsigned char *uc_str
= new unsigned char[length
/2];
980 for (size_t i
= 0; i
< length
/ 2; ++i
) {
981 uc_str
[i
] = str2uchar(ostr
[2 * i
], ostr
[2 * i
+ 1]);
984 size_t start
= check_BOM(CharCoding::UTF_8
, length
/2, uc_str
);
986 for (size_t i
= start
; i
< length
/ 2;) {
987 // perform the decoding character by character
988 if (uc_str
[i
] <= 0x7F) {
989 // character encoded on a single octet: 0xxxxxxx (7 useful bits)
993 unsigned char c
= uc_str
[i
];
994 ucstr
+= ustring(g
, p
, r
, c
);
997 else if (uc_str
[i
] <= 0xBF) {
998 // continuing octet (10xxxxxx) without leading octet ==> malformed
999 ERROR("decode_utf8(): Malformed: At character position %d, octet position %d: continuing "
1000 "octet %02X without leading octet.", static_cast<int>(ucstr
.size()),
1001 static_cast<int>(i
), uc_str
[i
]);
1004 else if (uc_str
[i
] <= 0xDF) {
1005 // character encoded on 2 octets: 110xxxxx 10xxxxxx (11 useful bits)
1006 unsigned char octets
[2];
1007 octets
[0] = uc_str
[i
] & 0x1F;
1008 fill_continuing_octets(1, octets
+ 1, length
/ 2, uc_str
, i
+ 1, ucstr
.size());
1009 unsigned char g
= 0;
1010 unsigned char p
= 0;
1011 unsigned char r
= octets
[0] >> 2;
1012 unsigned char c
= octets
[0] << 6 | octets
[1];
1013 if (r
== 0x00 && c
< 0x80) {
1014 ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 2-octet "
1015 "encoding for quadruple (0, 0, 0, %u).", static_cast<int>(ucstr
.size()),
1016 static_cast<int>(i
), c
);
1019 ucstr
+= ustring(g
, p
, r
, c
);
1022 else if (uc_str
[i
] <= 0xEF) {
1023 // character encoded on 3 octets: 1110xxxx 10xxxxxx 10xxxxxx
1025 unsigned char octets
[3];
1026 octets
[0] = uc_str
[i
] & 0x0F;
1027 fill_continuing_octets(2, octets
+ 1, length
/ 2, uc_str
, i
+ 1,ucstr
.size());
1028 unsigned char g
= 0;
1029 unsigned char p
= 0;
1030 unsigned char r
= octets
[0] << 4 | octets
[1] >> 2;
1031 unsigned char c
= octets
[1] << 6 | octets
[2];
1033 ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 3-octet "
1034 "encoding for quadruple (0, 0, %u, %u).", static_cast<int>(ucstr
.size()),
1035 static_cast<int>(i
), r
, c
);
1038 ucstr
+= ustring(g
, p
, r
, c
);
1041 else if (uc_str
[i
] <= 0xF7) {
1042 // character encoded on 4 octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1044 unsigned char octets
[4];
1045 octets
[0] = uc_str
[i
] & 0x07;
1046 fill_continuing_octets(3, octets
+ 1, length
/ 2, uc_str
, i
+ 1, ucstr
.size());
1047 unsigned char g
= 0;
1048 unsigned char p
= octets
[0] << 2 | octets
[1] >> 4;
1049 unsigned char r
= octets
[1] << 4 | octets
[2] >> 2;
1050 unsigned char c
= octets
[2] << 6 | octets
[3];
1052 ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 4-octet "
1053 "encoding for quadruple (0, 0, %u, %u).", static_cast<int>(ucstr
.size()),
1054 static_cast<int>(i
), r
, c
);
1057 ucstr
+= ustring(g
, p
, r
, c
);
1060 else if (uc_str
[i
] <= 0xFB) {
1061 // character encoded on 5 octets: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx
1062 // 10xxxxxx (26 useful bits)
1063 unsigned char octets
[5];
1064 octets
[0] = uc_str
[i
] & 0x03;
1065 fill_continuing_octets(4, octets
+ 1, length
/ 2, uc_str
, i
+ 1, ucstr
.size());
1066 unsigned char g
= octets
[0];
1067 unsigned char p
= octets
[1] << 2 | octets
[2] >> 4;
1068 unsigned char r
= octets
[2] << 4 | octets
[3] >> 2;
1069 unsigned char c
= octets
[3] << 6 | octets
[4];
1070 if (g
== 0x00 && p
< 0x20) {
1071 ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 5-octet "
1072 "encoding for quadruple (0, %u, %u, %u).", static_cast<int>(ucstr
.size()),
1073 static_cast<int>(i
), p
, r
, c
);
1076 ucstr
+= ustring(g
, p
, r
, c
);
1079 else if (uc_str
[i
] <= 0xFD) {
1080 // character encoded on 6 octets: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx
1081 // 10xxxxxx 10xxxxxx (31 useful bits)
1082 unsigned char octets
[6];
1083 octets
[0] = uc_str
[i
] & 0x01;
1084 fill_continuing_octets(5, octets
+ 1, length
/ 2, uc_str
, i
+ 1,ucstr
.size());
1085 unsigned char g
= octets
[0] << 6 | octets
[1];
1086 unsigned char p
= octets
[2] << 2 | octets
[3] >> 4;
1087 unsigned char r
= octets
[3] << 4 | octets
[4] >> 2;
1088 unsigned char c
= octets
[4] << 6 | octets
[5];
1090 ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 6-octet "
1091 "encoding for quadruple (%u, %u, %u, %u).", static_cast<int>(ucstr
.size()),
1092 static_cast<int>(i
), g
, p
, r
, c
);
1095 ucstr
+= ustring(g
, p
, r
, c
);
1099 // not used code points: FE and FF => malformed
1100 ERROR("decode_utf8(): Malformed: At character position %d, octet position %d: "
1101 "unused/reserved octet %02X.", static_cast<int>(ucstr
.size()),
1102 static_cast<int>(i
), uc_str
[i
]);