1 /* Convert multibyte character to wide character.
2 Copyright (C) 1999-2002, 2005-2019 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2008.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
23 #if C_LOCALE_MAYBE_EILSEQ
24 # include "hard-locale.h"
28 #if GNULIB_defined_mbstate_t
29 /* Implement mbrtowc() on top of mbtowc(). */
34 # include "localcharset.h"
37 # include "glthread/lock.h"
41 # define FALLTHROUGH ((void) 0)
43 # define FALLTHROUGH __attribute__ ((__fallthrough__))
47 /* Returns a classification of special values of the encoding of the current locale. */
50 enc_other
, /* other */
52 enc_eucjp
, /* EUC-JP */
53 enc_94
, /* EUC-KR, GB2312, BIG5 */
54 enc_euctw
, /* EUC-TW */
55 enc_gb18030
, /* GB18030 */
61 const char *encoding
= locale_charset ();
62 if (STREQ_OPT (encoding
, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
64 if (STREQ_OPT (encoding
, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
66 if (STREQ_OPT (encoding
, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
67 || STREQ_OPT (encoding
, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
68 || STREQ_OPT (encoding
, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
70 if (STREQ_OPT (encoding
, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
72 if (STREQ_OPT (encoding
, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
74 if (STREQ_OPT (encoding
, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
79 # if GNULIB_WCHAR_SINGLE
80 /* When we know that the locale does not change, provide a speedup by
81 caching the value of locale_enc. */
82 static int cached_locale_enc
= -1;
84 locale_enc_cached (void)
86 if (cached_locale_enc
< 0)
87 cached_locale_enc
= locale_enc ();
88 return cached_locale_enc
;
91 /* By default, don't make assumptions, hence no caching. */
92 # define locale_enc_cached locale_enc
95 /* This lock protects the internal state of mbtowc against multiple simultaneous calls of mbrtowc. */
97 gl_lock_define_initialized(static, mbtowc_lock
)
99 verify (sizeof (mbstate_t) >= 4);
101 static char internal_state
[4];
104 mbrtowc (wchar_t *pwc
, const char *s
, size_t n
, mbstate_t *ps
)
106 char *pstate
= (char *)ps
;
121 pstate
= internal_state
;
124 size_t nstate
= pstate
[0];
162 enc
= locale_enc_cached ();
164 if (enc
== enc_utf8
) /* UTF-8 */
166 /* Achieve multi-thread safety by not calling mbtowc() at all. */
167 /* Cf. unistr/u8-mbtouc.c. */
168 unsigned char c
= (unsigned char) p
[0];
174 res
= (c
== 0 ? 0 : 1);
185 unsigned char c2
= (unsigned char) p
[1];
187 if ((c2
^ 0x80) < 0x40)
190 *pwc
= ((unsigned int) (c
& 0x1f) << 6)
191 | (unsigned int) (c2
^ 0x80);
203 unsigned char c2
= (unsigned char) p
[1];
205 if ((c2
^ 0x80) < 0x40
206 && (c
>= 0xe1 || c2
>= 0xa0)
207 && (c
!= 0xed || c2
< 0xa0))
213 unsigned char c3
= (unsigned char) p
[2];
215 if ((c3
^ 0x80) < 0x40)
218 *pwc
= ((unsigned int) (c
& 0x0f) << 12)
219 | ((unsigned int) (c2
^ 0x80) << 6)
220 | (unsigned int) (c3
^ 0x80);
234 unsigned char c2
= (unsigned char) p
[1];
236 if ((c2
^ 0x80) < 0x40
237 && (c
>= 0xf1 || c2
>= 0x90)
238 && (c
< 0xf4 || (c
== 0xf4 && c2
< 0x90)))
244 unsigned char c3
= (unsigned char) p
[2];
246 if ((c3
^ 0x80) < 0x40)
252 unsigned char c4
= (unsigned char) p
[3];
254 if ((c4
^ 0x80) < 0x40)
257 *pwc
= ((unsigned int) (c
& 0x07) << 18)
258 | ((unsigned int) (c2
^ 0x80) << 12)
259 | ((unsigned int) (c3
^ 0x80) << 6)
260 | (unsigned int) (c4
^ 0x80);
275 /* The hidden internal state of mbtowc would make this function not
276 multi-thread safe. Achieve multi-thread safety through a lock. */
277 gl_lock_lock (mbtowc_lock
);
279 /* Put the hidden internal state of mbtowc into its initial state.
280 This is needed at least with glibc, uClibc, and MSVC CRT.
281 See <https://sourceware.org/bugzilla/show_bug.cgi?id=9674>. */
282 mbtowc (NULL
, NULL
, 0);
284 res
= mbtowc (pwc
, p
, m
);
286 gl_lock_unlock (mbtowc_lock
);
290 if (pwc
!= NULL
&& ((*pwc
== 0) != (res
== 0)))
295 /* mbtowc does not distinguish between invalid and incomplete multibyte
296 sequences. But mbrtowc needs to make this distinction.
297 There are two possible approaches:
298 - Use iconv() and its return value.
299 - Use built-in knowledge about the possible encodings.
300 Given the low quality of implementation of iconv() on the systems
301 that lack mbrtowc(), we use the second approach.
302 The possible encodings are:
304 - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
305 - UTF-8 (already handled above).
306 Use specialized code for each. */
307 if (m
>= 4 || m
>= MB_CUR_MAX
)
309 /* Here MB_CUR_MAX > 1 and 0 < m < 4. */
312 /* As a reference for this code, you can use the GNU libiconv
313 implementation. Look for uses of the RET_TOOFEW macro. */
315 case enc_eucjp
: /* EUC-JP */
319 unsigned char c
= (unsigned char) p
[0];
321 if ((c
>= 0xa1 && c
< 0xff) || c
== 0x8e || c
== 0x8f)
326 unsigned char c
= (unsigned char) p
[0];
330 unsigned char c2
= (unsigned char) p
[1];
332 if (c2
>= 0xa1 && c2
< 0xff)
339 case enc_94
: /* EUC-KR, GB2312, BIG5 */
343 unsigned char c
= (unsigned char) p
[0];
345 if (c
>= 0xa1 && c
< 0xff)
351 case enc_euctw
: /* EUC-TW */
355 unsigned char c
= (unsigned char) p
[0];
357 if ((c
>= 0xa1 && c
< 0xff) || c
== 0x8e)
360 else /* m == 2 || m == 3 */
362 unsigned char c
= (unsigned char) p
[0];
370 case enc_gb18030
: /* GB18030 */
374 unsigned char c
= (unsigned char) p
[0];
376 if ((c
>= 0x90 && c
<= 0xe3) || (c
>= 0xf8 && c
<= 0xfe))
379 else /* m == 2 || m == 3 */
381 unsigned char c
= (unsigned char) p
[0];
383 if (c
>= 0x90 && c
<= 0xe3)
385 unsigned char c2
= (unsigned char) p
[1];
387 if (c2
>= 0x30 && c2
<= 0x39)
393 unsigned char c3
= (unsigned char) p
[2];
395 if (c3
>= 0x81 && c3
<= 0xfe)
404 case enc_sjis
: /* SJIS */
408 unsigned char c
= (unsigned char) p
[0];
410 if ((c
>= 0x81 && c
<= 0x9f) || (c
>= 0xe0 && c
<= 0xea)
411 || (c
>= 0xf0 && c
<= 0xf9))
418 /* An unknown multibyte encoding. */
424 /* res >= 0 is the corrected return value of mbtowc (pwc, p, m). */
425 if (nstate
>= (res
> 0 ? res
: 1))
434 /* Here 0 <= k < m < 4. */
450 /* The conversion state is undefined, says POSIX. */
456 /* Override the system's mbrtowc() function. */
461 rpl_mbrtowc (wchar_t *pwc
, const char *s
, size_t n
, mbstate_t *ps
)
466 # if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG
475 # if MBRTOWC_EMPTY_INPUT_BUG
483 # if MBRTOWC_RETVAL_BUG
485 static mbstate_t internal_state
;
487 /* Override mbrtowc's internal state. We cannot call mbsinit() on the
488 hidden internal state, but we can call it on our variable. */
490 ps
= &internal_state
;
494 /* Parse the rest of the multibyte character byte for byte. */
496 for (; n
> 0; s
++, n
--)
498 ret
= mbrtowc (&wc
, s
, 1, ps
);
500 if (ret
== (size_t)(-1))
503 if (ret
!= (size_t)(-2))
505 /* The multibyte character has been completed. */
507 return (wc
== 0 ? 0 : count
);
515 ret
= mbrtowc (pwc
, s
, n
, ps
);
517 # if MBRTOWC_NUL_RETVAL_BUG
518 if (ret
< (size_t) -2 && !*pwc
)
522 # if C_LOCALE_MAYBE_EILSEQ
523 if ((size_t) -2 <= ret
&& n
!= 0 && ! hard_locale (LC_CTYPE
))
525 unsigned char uc
= *s
;
This page took 0.040918 seconds and 4 git commands to generate.