cdd874bc5a8e6167db6b2dbeb2ea7b4d924e404e
[deliverable/binutils-gdb.git] / gnulib / import / mbrtowc.c
1 /* Convert multibyte character to wide character.
2 Copyright (C) 1999-2002, 2005-2016 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2008.
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17
18 #include <config.h>
19
20 /* Specification. */
21 #include <wchar.h>
22
23 #if C_LOCALE_MAYBE_EILSEQ
24 # include "hard-locale.h"
25 # include <locale.h>
26 #endif
27
28 #if GNULIB_defined_mbstate_t
29 /* Implement mbrtowc() on top of mbtowc(). */
30
31 # include <errno.h>
32 # include <stdlib.h>
33
34 # include "localcharset.h"
35 # include "streq.h"
36 # include "verify.h"
37
38
39 verify (sizeof (mbstate_t) >= 4);
40
41 static char internal_state[4];
42
43 size_t
44 mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
45 {
46 char *pstate = (char *)ps;
47
48 if (s == NULL)
49 {
50 pwc = NULL;
51 s = "";
52 n = 1;
53 }
54
55 if (n == 0)
56 return (size_t)(-2);
57
58 /* Here n > 0. */
59
60 if (pstate == NULL)
61 pstate = internal_state;
62
63 {
64 size_t nstate = pstate[0];
65 char buf[4];
66 const char *p;
67 size_t m;
68
69 switch (nstate)
70 {
71 case 0:
72 p = s;
73 m = n;
74 break;
75 case 3:
76 buf[2] = pstate[3];
77 /*FALLTHROUGH*/
78 case 2:
79 buf[1] = pstate[2];
80 /*FALLTHROUGH*/
81 case 1:
82 buf[0] = pstate[1];
83 p = buf;
84 m = nstate;
85 buf[m++] = s[0];
86 if (n >= 2 && m < 4)
87 {
88 buf[m++] = s[1];
89 if (n >= 3 && m < 4)
90 buf[m++] = s[2];
91 }
92 break;
93 default:
94 errno = EINVAL;
95 return (size_t)(-1);
96 }
97
98 /* Here m > 0. */
99
100 # if __GLIBC__ || defined __UCLIBC__
101 /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
102 mbtowc (NULL, NULL, 0);
103 # endif
104 {
105 int res = mbtowc (pwc, p, m);
106
107 if (res >= 0)
108 {
109 if (pwc != NULL && ((*pwc == 0) != (res == 0)))
110 abort ();
111 if (nstate >= (res > 0 ? res : 1))
112 abort ();
113 res -= nstate;
114 pstate[0] = 0;
115 return res;
116 }
117
118 /* mbtowc does not distinguish between invalid and incomplete multibyte
119 sequences. But mbrtowc needs to make this distinction.
120 There are two possible approaches:
121 - Use iconv() and its return value.
122 - Use built-in knowledge about the possible encodings.
123 Given the low quality of implementation of iconv() on the systems that
124 lack mbrtowc(), we use the second approach.
125 The possible encodings are:
126 - 8-bit encodings,
127 - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
128 - UTF-8.
129 Use specialized code for each. */
130 if (m >= 4 || m >= MB_CUR_MAX)
131 goto invalid;
132 /* Here MB_CUR_MAX > 1 and 0 < m < 4. */
133 {
134 const char *encoding = locale_charset ();
135
136 if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
137 {
138 /* Cf. unistr/u8-mblen.c. */
139 unsigned char c = (unsigned char) p[0];
140
141 if (c >= 0xc2)
142 {
143 if (c < 0xe0)
144 {
145 if (m == 1)
146 goto incomplete;
147 }
148 else if (c < 0xf0)
149 {
150 if (m == 1)
151 goto incomplete;
152 if (m == 2)
153 {
154 unsigned char c2 = (unsigned char) p[1];
155
156 if ((c2 ^ 0x80) < 0x40
157 && (c >= 0xe1 || c2 >= 0xa0)
158 && (c != 0xed || c2 < 0xa0))
159 goto incomplete;
160 }
161 }
162 else if (c <= 0xf4)
163 {
164 if (m == 1)
165 goto incomplete;
166 else /* m == 2 || m == 3 */
167 {
168 unsigned char c2 = (unsigned char) p[1];
169
170 if ((c2 ^ 0x80) < 0x40
171 && (c >= 0xf1 || c2 >= 0x90)
172 && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
173 {
174 if (m == 2)
175 goto incomplete;
176 else /* m == 3 */
177 {
178 unsigned char c3 = (unsigned char) p[2];
179
180 if ((c3 ^ 0x80) < 0x40)
181 goto incomplete;
182 }
183 }
184 }
185 }
186 }
187 goto invalid;
188 }
189
190 /* As a reference for this code, you can use the GNU libiconv
191 implementation. Look for uses of the RET_TOOFEW macro. */
192
193 if (STREQ_OPT (encoding,
194 "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
195 {
196 if (m == 1)
197 {
198 unsigned char c = (unsigned char) p[0];
199
200 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
201 goto incomplete;
202 }
203 if (m == 2)
204 {
205 unsigned char c = (unsigned char) p[0];
206
207 if (c == 0x8f)
208 {
209 unsigned char c2 = (unsigned char) p[1];
210
211 if (c2 >= 0xa1 && c2 < 0xff)
212 goto incomplete;
213 }
214 }
215 goto invalid;
216 }
217 if (STREQ_OPT (encoding,
218 "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
219 || STREQ_OPT (encoding,
220 "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
221 || STREQ_OPT (encoding,
222 "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
223 {
224 if (m == 1)
225 {
226 unsigned char c = (unsigned char) p[0];
227
228 if (c >= 0xa1 && c < 0xff)
229 goto incomplete;
230 }
231 goto invalid;
232 }
233 if (STREQ_OPT (encoding,
234 "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
235 {
236 if (m == 1)
237 {
238 unsigned char c = (unsigned char) p[0];
239
240 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
241 goto incomplete;
242 }
243 else /* m == 2 || m == 3 */
244 {
245 unsigned char c = (unsigned char) p[0];
246
247 if (c == 0x8e)
248 goto incomplete;
249 }
250 goto invalid;
251 }
252 if (STREQ_OPT (encoding,
253 "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
254 {
255 if (m == 1)
256 {
257 unsigned char c = (unsigned char) p[0];
258
259 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
260 goto incomplete;
261 }
262 else /* m == 2 || m == 3 */
263 {
264 unsigned char c = (unsigned char) p[0];
265
266 if (c >= 0x90 && c <= 0xe3)
267 {
268 unsigned char c2 = (unsigned char) p[1];
269
270 if (c2 >= 0x30 && c2 <= 0x39)
271 {
272 if (m == 2)
273 goto incomplete;
274 else /* m == 3 */
275 {
276 unsigned char c3 = (unsigned char) p[2];
277
278 if (c3 >= 0x81 && c3 <= 0xfe)
279 goto incomplete;
280 }
281 }
282 }
283 }
284 goto invalid;
285 }
286 if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
287 {
288 if (m == 1)
289 {
290 unsigned char c = (unsigned char) p[0];
291
292 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
293 || (c >= 0xf0 && c <= 0xf9))
294 goto incomplete;
295 }
296 goto invalid;
297 }
298
299 /* An unknown multibyte encoding. */
300 goto incomplete;
301 }
302
303 incomplete:
304 {
305 size_t k = nstate;
306 /* Here 0 <= k < m < 4. */
307 pstate[++k] = s[0];
308 if (k < m)
309 {
310 pstate[++k] = s[1];
311 if (k < m)
312 pstate[++k] = s[2];
313 }
314 if (k != m)
315 abort ();
316 }
317 pstate[0] = m;
318 return (size_t)(-2);
319
320 invalid:
321 errno = EILSEQ;
322 /* The conversion state is undefined, says POSIX. */
323 return (size_t)(-1);
324 }
325 }
326 }
327
328 #else
329 /* Override the system's mbrtowc() function. */
330
331 # undef mbrtowc
332
/* Wrapper around the system's mbrtowc() that compensates for the defects
   selected by the MBRTOWC_*_BUG and C_LOCALE_MAYBE_EILSEQ macros.

   PWC  receives the wide character; may be NULL.
   S    input bytes; may be NULL.
   N    number of bytes available at S.
   PS   conversion state; may be NULL.

   Returns what mbrtowc() is specified to return: the number of bytes
   consumed, 0 for the null character, (size_t)(-2) for an incomplete
   character, or (size_t)(-1) on error.  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
  wchar_t scratch;
  size_t result;

# if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG
  /* Spell out the null-S case as the equivalent call (NULL, "", 1, ps)
     instead of passing a null pointer to the system function.  */
  if (s == NULL)
    {
      n = 1;
      s = "";
      pwc = NULL;
    }
# endif

# if MBRTOWC_EMPTY_INPUT_BUG
  /* With no input bytes, report "incomplete" without consulting the system
     function.  */
  if (n == 0)
    return (size_t) -2;
# endif

  /* Always give the code below somewhere to store the result, so that it
     can be inspected afterwards.  */
  if (pwc == NULL)
    pwc = &scratch;

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t own_state;

    /* Replace mbrtowc's hidden internal state with our own variable:
       mbsinit() cannot be applied to the hidden one, but it can be applied
       to ours.  */
    if (ps == NULL)
      ps = &own_state;

    if (!mbsinit (ps))
      {
        /* A character is in progress: feed the remaining bytes one at a
           time and count them ourselves.  */
        size_t consumed = 0;

        while (n > 0)
          {
            result = mbrtowc (&scratch, s, 1, ps);
            if (result == (size_t)(-1))
              return (size_t)(-1);

            consumed++;
            if (result != (size_t)(-2))
              {
                /* This byte completed the character.  */
                *pwc = scratch;
                return (scratch == 0 ? 0 : consumed);
              }

            s++;
            n--;
          }
        return (size_t)(-2);
      }
  }
# endif

  result = mbrtowc (pwc, s, n, ps);

# if MBRTOWC_NUL_RETVAL_BUG
  /* A successfully converted null character must yield 0.  */
  if (result < (size_t) -2 && !*pwc)
    return 0;
# endif

# if C_LOCALE_MAYBE_EILSEQ
  /* When hard_locale (LC_CTYPE) is false, do not fail: treat the first byte
     as a character of its own.  */
  if (result >= (size_t) -2 && n != 0 && ! hard_locale (LC_CTYPE))
    {
      unsigned char first = *s;
      *pwc = first;
      return 1;
    }
# endif

  return result;
}
406
407 #endif
This page took 0.037233 seconds and 3 git commands to generate.