Import GNU Readline 8.1
[deliverable/binutils-gdb.git] / readline / readline / mbutil.c
CommitLineData
9255ee31
EZ
1/* mbutil.c -- readline multibyte character utility functions */
2
b4f26d54 3/* Copyright (C) 2001-2020 Free Software Foundation, Inc.
9255ee31 4
cc88a640
JK
5 This file is part of the GNU Readline Library (Readline), a library
6 for reading lines of text with interactive input and history editing.
9255ee31 7
cc88a640
JK
8 Readline is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
9255ee31
EZ
11 (at your option) any later version.
12
cc88a640
JK
13 Readline is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9255ee31
EZ
16 GNU General Public License for more details.
17
cc88a640
JK
18 You should have received a copy of the GNU General Public License
19 along with Readline. If not, see <http://www.gnu.org/licenses/>.
20*/
21
9255ee31
EZ
22#define READLINE_LIBRARY
23
24#if defined (HAVE_CONFIG_H)
25# include <config.h>
26#endif
27
28#include <sys/types.h>
29#include <fcntl.h>
30#include "posixjmp.h"
31
32#if defined (HAVE_UNISTD_H)
33# include <unistd.h> /* for _POSIX_VERSION */
34#endif /* HAVE_UNISTD_H */
35
36#if defined (HAVE_STDLIB_H)
37# include <stdlib.h>
38#else
39# include "ansi_stdlib.h"
40#endif /* HAVE_STDLIB_H */
41
42#include <stdio.h>
43#include <ctype.h>
44
45/* System-specific feature definitions and include files. */
46#include "rldefs.h"
47#include "rlmbutil.h"
48
49#if defined (TIOCSTAT_IN_SYS_IOCTL)
50# include <sys/ioctl.h>
51#endif /* TIOCSTAT_IN_SYS_IOCTL */
52
53/* Some standard library routines. */
54#include "readline.h"
55
56#include "rlprivate.h"
57#include "xmalloc.h"
58
/* Both flags below are defined in this file so that a single definition
   can be shared between the readline and history libraries. */

/* Starts out as 1 when the library is built without multibyte support
   and as 0 when HANDLE_MULTIBYTE is defined.  When non-zero,
   _rl_char_value () returns the raw byte at the requested index. */
#if !defined (HANDLE_MULTIBYTE)
int rl_byte_oriented = 1;
#else
int rl_byte_oriented = 0;
#endif

/* Initially 0.  When non-zero, the functions in this file take the
   UTF-8 specific shortcuts instead of calling mbrtowc ()/mbrlen () for
   every byte. */
int _rl_utf8locale = 0;
69
9255ee31
EZ
70/* **************************************************************** */
71/* */
72/* Multibyte Character Utility Functions */
73/* */
74/* **************************************************************** */
75
76#if defined(HANDLE_MULTIBYTE)
77
cb41b9e7
TT
78/* **************************************************************** */
79/* */
80/* UTF-8 specific Character Utility Functions */
81/* */
82/* **************************************************************** */
83
/* Return the length in bytes of the possibly-multibyte character beginning
   at S, looking at no more than N bytes.  Encoding is UTF-8.

   Return values:
     0       S is a null pointer (UTF-8 has no shift states) or S points
             at a NUL byte
     1..4    length of the well-formed character starting at S
     -2      the N available bytes are a valid but incomplete prefix of a
             multibyte character
     -1      N is 0, or the bytes at S do not form a valid UTF-8 character

   No byte at or beyond S + N is ever read. */
static int
_rl_utf8_mblen (const char *s, size_t n)
{
  unsigned char c, c1, c2, c3;

  if (s == 0)
    return (0);		/* no shift states */
  if (n == 0)
    return (-1);

  c = (unsigned char)*s;
  if (c < 0x80)
    return (c != 0);

  /* 0x80..0xc1 are continuation bytes or overlong two-byte leads; lead
     bytes above 0xf4 would encode code points beyond U+10FFFF. */
  if (c < 0xc2 || c > 0xf4)
    return (-1);

  /* Every remaining lead byte needs at least one continuation byte.
     Check the length before touching s[1]. */
  if (n == 1)
    return (-2);
  c1 = (unsigned char)s[1];
  if ((c1 ^ 0x80) >= 0x40)
    return (-1);

  if (c < 0xe0)
    return (2);

  if (c < 0xf0)
    {
      /* Reject overlong encodings (e0 80..9f) and UTF-16 surrogates
	 (ed a0..bf). */
      if ((c == 0xe0 && c1 < 0xa0) || (c == 0xed && c1 >= 0xa0))
	return (-1);
      if (n == 2)
	return (-2);
      c2 = (unsigned char)s[2];
      return (((c2 ^ 0x80) < 0x40) ? 3 : -1);
    }

  /* Four-byte sequence, lead byte 0xf0..0xf4.  Reject overlong encodings
     (f0 80..8f) and code points above U+10FFFF (f4 90..bf). */
  if ((c == 0xf0 && c1 < 0x90) || (c == 0xf4 && c1 >= 0x90))
    return (-1);
  if (n == 2)
    return (-2);
  c2 = (unsigned char)s[2];
  if ((c2 ^ 0x80) >= 0x40)
    return (-1);
  if (n == 3)
    return (-2);
  c3 = (unsigned char)s[3];
  return (((c3 ^ 0x80) < 0x40) ? 4 : -1);
}
149
9255ee31 150static int
cb41b9e7 151_rl_find_next_mbchar_internal (char *string, int seed, int count, int find_non_zero)
9255ee31 152{
cc88a640 153 size_t tmp, len;
9255ee31 154 mbstate_t ps;
5bdf8622 155 int point;
9255ee31
EZ
156 wchar_t wc;
157
5bdf8622
DJ
158 tmp = 0;
159
9255ee31
EZ
160 memset(&ps, 0, sizeof (mbstate_t));
161 if (seed < 0)
162 seed = 0;
163 if (count <= 0)
164 return seed;
165
5bdf8622 166 point = seed + _rl_adjust_point (string, seed, &ps);
cb41b9e7
TT
167 /* if _rl_adjust_point returns -1, the character or string is invalid.
168 treat as a byte. */
169 if (point == seed - 1) /* invalid */
170 return seed + 1;
171
cc88a640
JK
172 /* if this is true, means that seed was not pointing to a byte indicating
173 the beginning of a multibyte character. Correct the point and consume
174 one char. */
9255ee31 175 if (seed < point)
5bdf8622 176 count--;
9255ee31
EZ
177
178 while (count > 0)
179 {
cc88a640
JK
180 len = strlen (string + point);
181 if (len == 0)
182 break;
cb41b9e7
TT
183 if (_rl_utf8locale && UTF8_SINGLEBYTE(string[point]))
184 {
185 tmp = 1;
186 wc = (wchar_t) string[point];
187 memset(&ps, 0, sizeof(mbstate_t));
188 }
189 else
190 tmp = mbrtowc (&wc, string+point, len, &ps);
5bdf8622 191 if (MB_INVALIDCH ((size_t)tmp))
9255ee31 192 {
cc88a640 193 /* invalid bytes. assume a byte represents a character */
9255ee31
EZ
194 point++;
195 count--;
196 /* reset states. */
197 memset(&ps, 0, sizeof(mbstate_t));
198 }
5bdf8622
DJ
199 else if (MB_NULLWCH (tmp))
200 break; /* found wide '\0' */
9255ee31
EZ
201 else
202 {
203 /* valid bytes */
204 point += tmp;
205 if (find_non_zero)
206 {
775e241e 207 if (WCWIDTH (wc) == 0)
9255ee31
EZ
208 continue;
209 else
210 count--;
211 }
212 else
213 count--;
214 }
215 }
216
217 if (find_non_zero)
218 {
219 tmp = mbrtowc (&wc, string + point, strlen (string + point), &ps);
775e241e 220 while (MB_NULLWCH (tmp) == 0 && MB_INVALIDCH (tmp) == 0 && WCWIDTH (wc) == 0)
9255ee31
EZ
221 {
222 point += tmp;
223 tmp = mbrtowc (&wc, string + point, strlen (string + point), &ps);
9255ee31
EZ
224 }
225 }
5bdf8622
DJ
226
227 return point;
9255ee31
EZ
228}
229
b4f26d54
TT
/* Convert the character starting at byte offset IND of STRING, giving
   mbrtowc () LEN - IND bytes to look at.  Return 1 if the conversion
   reports an invalid sequence or a null wide character, or if WCWIDTH of
   the converted character is greater than 0; return 0 otherwise. */
static inline int
_rl_test_nonzero (char *string, int ind, int len)
{
  mbstate_t state;
  wchar_t wch;
  size_t nbytes;

  memset (&state, 0, sizeof (mbstate_t));
  nbytes = mbrtowc (&wch, string + ind, len - ind, &state);

  /* treat invalid multibyte sequences as non-zero-width */
  if (MB_INVALIDCH (nbytes) || MB_NULLWCH (nbytes))
    return 1;

  return (WCWIDTH (wch) > 0);
}
242
/* experimental -- needs to handle zero-width characters better */
/* Scan backwards from byte offset SEED in the UTF-8 string STRING and
   return the byte offset at which the previous character starts.  If
   FIND_NON_ZERO is non-zero, multibyte characters for which
   _rl_test_nonzero () returns 0 are stepped over and the search carries on
   to the left.  Returns 0 if the beginning of the string is reached
   without finding a character. */
static int
_rl_find_prev_utf8char (char *string, int seed, int find_non_zero)
{
  unsigned char b;
  int save, prev;
  size_t len;

  /* The string length is only needed for the width test; keep LEN
     defined on every path. */
  len = 0;
  if (find_non_zero)
    len = RL_STRLEN (string);

  prev = seed - 1;
  while (prev >= 0)
    {
      b = (unsigned char)string[prev];
      if (UTF8_SINGLEBYTE (b))
	return (prev);

      /* Remember where this backward scan started in case the sequence
	 turns out to be malformed. */
      save = prev;

      /* Move back until we're not in the middle of a multibyte char */
      if (UTF8_MBCHAR (b))
	{
	  while (prev > 0 && (b = (unsigned char)string[--prev]) && UTF8_MBCHAR (b))
	    ;
	}

      if (UTF8_MBFIRSTCHAR (b))
	{
	  if (find_non_zero)
	    {
	      if (_rl_test_nonzero (string, prev, len))
		return (prev);
	      else		/* valid but WCWIDTH (wc) == 0 */
		prev = prev - 1;
	    }
	  else
	    return (prev);
	}
      else
	return (save);		/* invalid utf-8 multibyte sequence */
    }

  return ((prev < 0) ? 0 : prev);
}
289
775e241e 290/*static*/ int
cb41b9e7 291_rl_find_prev_mbchar_internal (char *string, int seed, int find_non_zero)
9255ee31
EZ
292{
293 mbstate_t ps;
294 int prev, non_zero_prev, point, length;
295 size_t tmp;
296 wchar_t wc;
297
b4f26d54
TT
298 if (_rl_utf8locale)
299 return (_rl_find_prev_utf8char (string, seed, find_non_zero));
300
9255ee31
EZ
301 memset(&ps, 0, sizeof(mbstate_t));
302 length = strlen(string);
303
304 if (seed < 0)
305 return 0;
306 else if (length < seed)
307 return length;
308
309 prev = non_zero_prev = point = 0;
310 while (point < seed)
311 {
cb41b9e7
TT
312 if (_rl_utf8locale && UTF8_SINGLEBYTE(string[point]))
313 {
314 tmp = 1;
315 wc = (wchar_t) string[point];
316 memset(&ps, 0, sizeof(mbstate_t));
317 }
318 else
319 tmp = mbrtowc (&wc, string + point, length - point, &ps);
5bdf8622 320 if (MB_INVALIDCH ((size_t)tmp))
9255ee31 321 {
cb41b9e7 322 /* in this case, bytes are invalid or too short to compose
9255ee31
EZ
323 multibyte char, so assume that the first byte represents
324 a single character anyway. */
325 tmp = 1;
326 /* clear the state of the byte sequence, because
327 in this case effect of mbstate is undefined */
328 memset(&ps, 0, sizeof (mbstate_t));
5bdf8622
DJ
329
330 /* Since we're assuming that this byte represents a single
331 non-zero-width character, don't forget about it. */
332 prev = point;
9255ee31 333 }
5bdf8622 334 else if (MB_NULLWCH (tmp))
9255ee31
EZ
335 break; /* Found '\0' char. Can this happen? */
336 else
337 {
338 if (find_non_zero)
339 {
775e241e 340 if (WCWIDTH (wc) != 0)
9255ee31
EZ
341 prev = point;
342 }
343 else
344 prev = point;
345 }
346
347 point += tmp;
348 }
349
350 return prev;
351}
352
353/* return the number of bytes parsed from the multibyte sequence starting
354 at src, if a non-L'\0' wide character was recognized. It returns 0,
355 if a L'\0' wide character was recognized. It returns (size_t)(-1),
356 if an invalid multibyte sequence was encountered. It returns (size_t)(-2)
357 if it couldn't parse a complete multibyte character. */
358int
cb41b9e7 359_rl_get_char_len (char *src, mbstate_t *ps)
9255ee31 360{
cb41b9e7
TT
361 size_t tmp, l;
362 int mb_cur_max;
9255ee31 363
cb41b9e7
TT
364 /* Look at no more than MB_CUR_MAX characters */
365 l = (size_t)strlen (src);
366 if (_rl_utf8locale && l > 0 && UTF8_SINGLEBYTE(*src))
367 tmp = (*src != 0) ? 1 : 0;
368 else
369 {
370 mb_cur_max = MB_CUR_MAX;
371 tmp = mbrlen((const char *)src, (l < mb_cur_max) ? l : mb_cur_max, ps);
372 }
9255ee31
EZ
373 if (tmp == (size_t)(-2))
374 {
cb41b9e7 375 /* too short to compose multibyte char */
5af408ce
EZ
376 if (ps)
377 memset (ps, 0, sizeof(mbstate_t));
9255ee31
EZ
378 return -2;
379 }
380 else if (tmp == (size_t)(-1))
381 {
382 /* invalid to compose multibyte char */
383 /* initialize the conversion state */
5af408ce
EZ
384 if (ps)
385 memset (ps, 0, sizeof(mbstate_t));
9255ee31
EZ
386 return -1;
387 }
388 else if (tmp == (size_t)0)
389 return 0;
390 else
391 return (int)tmp;
392}
393
/* Compare the character at BUF1[POS1] with the character at BUF2[POS2],
   using PS1 and PS2 as the respective conversion states.  Return 1 when
   both characters have the same positive byte length (as reported by
   _rl_get_char_len ()) and identical bytes; return 0 otherwise. */
int
_rl_compare_chars (char *buf1, int pos1, mbstate_t *ps1, char *buf2, int pos2, mbstate_t *ps2)
{
  int offset, width1, width2;

  width1 = _rl_get_char_len (&buf1[pos1], ps1);
  if (width1 <= 0)
    return 0;

  width2 = _rl_get_char_len (&buf2[pos2], ps2);
  if (width2 <= 0 || width1 != width2)
    return 0;

  /* Same length: the characters match only if every byte does. */
  for (offset = 0; offset < width1; offset++)
    {
      if (buf1[pos1 + offset] != buf2[pos2 + offset])
	return 0;
    }

  return 1;
}
413
414/* adjust pointed byte and find mbstate of the point of string.
415 adjusted point will be point <= adjusted_point, and returns
416 differences of the byte(adjusted_point - point).
cb41b9e7 417 if point is invalid (point < 0 || more than string length),
9255ee31
EZ
418 it returns -1 */
419int
cb41b9e7 420_rl_adjust_point (char *string, int point, mbstate_t *ps)
9255ee31 421{
cb41b9e7
TT
422 size_t tmp;
423 int length, pos;
9255ee31 424
cb41b9e7
TT
425 tmp = 0;
426 pos = 0;
9255ee31
EZ
427 length = strlen(string);
428 if (point < 0)
429 return -1;
430 if (length < point)
431 return -1;
432
433 while (pos < point)
434 {
cb41b9e7
TT
435 if (_rl_utf8locale && UTF8_SINGLEBYTE(string[pos]))
436 tmp = 1;
437 else
438 tmp = mbrlen (string + pos, length - pos, ps);
5bdf8622 439 if (MB_INVALIDCH ((size_t)tmp))
9255ee31 440 {
cb41b9e7 441 /* in this case, bytes are invalid or too short to compose
9255ee31
EZ
442 multibyte char, so assume that the first byte represents
443 a single character anyway. */
444 pos++;
445 /* clear the state of the byte sequence, because
446 in this case effect of mbstate is undefined */
5af408ce
EZ
447 if (ps)
448 memset (ps, 0, sizeof (mbstate_t));
9255ee31 449 }
5bdf8622 450 else if (MB_NULLWCH (tmp))
5af408ce 451 pos++;
9255ee31
EZ
452 else
453 pos += tmp;
454 }
455
456 return (pos - point);
457}
458
/* Return 1 if the LENGTH bytes of MBCHAR are found in STRING starting at
   byte offset SEED, 0 otherwise.  END bounds the comparison: when fewer
   than LENGTH bytes lie between SEED and END the result is 0 without
   looking at any byte. */
int
_rl_is_mbchar_matched (char *string, int seed, int end, char *mbchar, int length)
{
  int offset;

  /* Not enough room left for MBCHAR to fit. */
  if (length > (end - seed))
    return 0;

  for (offset = 0; offset < length; offset++)
    {
      if (string[seed + offset] == mbchar[offset])
	continue;
      return 0;
    }

  return 1;
}
5bdf8622
DJ
472
473wchar_t
cb41b9e7 474_rl_char_value (char *buf, int ind)
5bdf8622
DJ
475{
476 size_t tmp;
477 wchar_t wc;
478 mbstate_t ps;
479 int l;
480
481 if (MB_LEN_MAX == 1 || rl_byte_oriented)
482 return ((wchar_t) buf[ind]);
cb41b9e7
TT
483 if (_rl_utf8locale && UTF8_SINGLEBYTE(buf[ind]))
484 return ((wchar_t) buf[ind]);
5bdf8622
DJ
485 l = strlen (buf);
486 if (ind >= l - 1)
487 return ((wchar_t) buf[ind]);
cb41b9e7
TT
488 if (l < ind) /* Sanity check */
489 l = strlen (buf+ind);
5bdf8622
DJ
490 memset (&ps, 0, sizeof (mbstate_t));
491 tmp = mbrtowc (&wc, buf + ind, l - ind, &ps);
492 if (MB_INVALIDCH (tmp) || MB_NULLWCH (tmp))
493 return ((wchar_t) buf[ind]);
494 return wc;
495}
9255ee31
EZ
496#endif /* HANDLE_MULTIBYTE */
497
/* Return the byte offset reached by moving COUNT characters forward from
   byte offset SEED in STRING.  With multibyte support compiled in, FLAGS
   is handed to _rl_find_next_mbchar_internal () as its find_non_zero
   argument (MB_FIND_NONZERO asks for non-zero-width characters).  Without
   it, every byte is one character and the result is SEED + COUNT. */
#undef _rl_find_next_mbchar
int
_rl_find_next_mbchar (char *string, int seed, int count, int flags)
{
#if !defined (HANDLE_MULTIBYTE)
  return (seed + count);
#else
  return _rl_find_next_mbchar_internal (string, seed, count, flags);
#endif
}
511
/* Return the byte offset at which the character preceding byte offset SEED
   in STRING starts; the result is never greater than SEED.  With multibyte
   support compiled in, FLAGS is handed to
   _rl_find_prev_mbchar_internal () as its find_non_zero argument
   (MB_FIND_NONZERO asks for non-zero-width characters).  Without it, the
   result is SEED - 1, except that a SEED of 0 is returned as is. */
#undef _rl_find_prev_mbchar
int
_rl_find_prev_mbchar (char *string, int seed, int flags)
{
#if !defined (HANDLE_MULTIBYTE)
  if (seed == 0)
    return seed;
  return (seed - 1);
#else
  return _rl_find_prev_mbchar_internal (string, seed, flags);
#endif
}
This page took 0.97358 seconds and 4 git commands to generate.