gdb:
[deliverable/binutils-gdb.git] / gdb / charset.c
1 /* Character set conversion support for GDB.
2
3 Copyright (C) 2001, 2003, 2007, 2008, 2009 Free Software Foundation, Inc.
4
5 This file is part of GDB.
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19
20 #include "defs.h"
21 #include "charset.h"
22 #include "gdbcmd.h"
23 #include "gdb_assert.h"
24 #include "gdb_obstack.h"
25 #include "charset-list.h"
26 #include "vec.h"
27
28 #include <stddef.h>
29 #include "gdb_string.h"
30 #include <ctype.h>
31
32 \f
33 /* How GDB's character set support works
34
35 GDB has three global settings:
36
37 - The `current host character set' is the character set GDB should
38 use in talking to the user, and which (hopefully) the user's
39 terminal knows how to display properly. Most users should not
40 change this.
41
42 - The `current target character set' is the character set the
43 program being debugged uses.
44
45 - The `current target wide character set' is the wide character set
46 the program being debugged uses, that is, the encoding used for
47 wchar_t.
48
49 There are commands to set each of these, and mechanisms for
50 choosing reasonable default values. GDB has a global list of
51 character sets that it can use as its host or target character
52 sets.
53
54 The header file `charset.h' declares various functions that
55 different pieces of GDB need to perform tasks like:
56
57 - printing target strings and characters to the user's terminal
58 (mostly target->host conversions),
59
60 - building target-appropriate representations of strings and
61 characters the user enters in expressions (mostly host->target
62 conversions),
63
64 and so on.
65
66 To avoid excessive code duplication and maintenance efforts,
67 GDB simply requires a capable iconv function. Users on platforms
68 without a suitable iconv can use the GNU iconv library. */
69
70 \f
71 #ifdef PHONY_ICONV
72
73 /* Provide a phony iconv that does as little as possible. Also,
74 arrange for there to be a single available character set. */
75
76 #undef GDB_DEFAULT_HOST_CHARSET
77 #define GDB_DEFAULT_HOST_CHARSET "ISO-8859-1"
78 #define GDB_DEFAULT_TARGET_CHARSET "ISO-8859-1"
79 #define GDB_DEFAULT_TARGET_WIDE_CHARSET "ISO-8859-1"
80 #undef DEFAULT_CHARSET_NAMES
81 #define DEFAULT_CHARSET_NAMES GDB_DEFAULT_HOST_CHARSET ,
82
83 #undef iconv_t
84 #define iconv_t int
85 #undef iconv_open
86 #undef iconv
87 #undef iconv_close
88
89 iconv_t
90 iconv_open (const char *to, const char *from)
91 {
92 /* We allow conversions from UCS-4BE, wchar_t, and the host charset.
93 We allow conversions to wchar_t and the host charset. */
94 if (strcmp (from, "UCS-4BE") && strcmp (from, "wchar_t")
95 && strcmp (from, GDB_DEFAULT_HOST_CHARSET))
96 return -1;
97 if (strcmp (to, "wchar_t") && strcmp (to, GDB_DEFAULT_HOST_CHARSET))
98 return -1;
99
100 /* Return 1 if we are converting from UCS-4BE, 0 otherwise. This is
101 used as a flag in calls to iconv. */
102 return !strcmp (from, "UCS-4BE");
103 }
104
/* Phony iconv_close.  ARG is ignored and success (0) is always
   reported.  */

int
iconv_close (iconv_t arg)
{
  (void) arg;
  return 0;
}
110
/* Phony iconv.  UCS_FLAG is the descriptor returned by the phony
   iconv_open: nonzero means the input is UCS-4BE, zero means input
   bytes are copied to the output unchanged.

   *INBUF / *INBYTESLEFT and *OUTBUF / *OUTBYTESLEFT are advanced past
   whatever was consumed and produced, as with the real iconv.

   Returns 0 on success.  On failure returns (size_t) -1 with errno set
   to:
     EILSEQ - a UCS-4BE character does not fit in a single byte;
     E2BIG  - the output buffer is full but input remains;
     EINVAL - the UCS-4BE input ends in a partial (< 4 byte) character.  */

size_t
iconv (iconv_t ucs_flag, char **inbuf, size_t *inbytesleft,
       char **outbuf, size_t *outbytesleft)
{
  if (ucs_flag)
    {
      while (*inbytesleft >= 4)
        {
          size_t j;
          unsigned long c = 0;

          /* Assemble one big-endian 32-bit character.  */
          for (j = 0; j < 4; ++j)
            {
              c <<= 8;
              c += (*inbuf)[j] & 0xff;
            }

          if (c >= 256)
            {
              errno = EILSEQ;
              return (size_t) -1;
            }

          /* Never write past the end of the caller's buffer.  */
          if (*outbytesleft < 1)
            {
              errno = E2BIG;
              return (size_t) -1;
            }

          **outbuf = c & 0xff;
          ++*outbuf;
          --*outbytesleft;

          /* A whole 4-byte character was consumed.  */
          *inbuf += 4;
          *inbytesleft -= 4;
        }

      /* Anything left over is a truncated character.  */
      if (*inbytesleft)
        {
          errno = EINVAL;
          return (size_t) -1;
        }
    }
  else
    {
      /* In all other cases we simply copy input bytes to the
         output.  */
      size_t amt = *inbytesleft;

      if (amt > *outbytesleft)
        amt = *outbytesleft;
      memcpy (*outbuf, *inbuf, amt);
      *inbuf += amt;
      *outbuf += amt;
      *inbytesleft -= amt;
      *outbytesleft -= amt;

      if (*inbytesleft)
        {
          errno = E2BIG;
          return (size_t) -1;
        }
    }

  /* The number of non-reversible conversions -- but they were all
     reversible.  */
  return 0;
}
170
171 #endif
172
173
174 \f
175 /* The global lists of character sets and translations. */
176
177
178 #ifndef GDB_DEFAULT_TARGET_CHARSET
179 #define GDB_DEFAULT_TARGET_CHARSET "ISO-8859-1"
180 #endif
181
182 #ifndef GDB_DEFAULT_TARGET_WIDE_CHARSET
183 #define GDB_DEFAULT_TARGET_WIDE_CHARSET "UCS-4"
184 #endif
185
186 static const char *auto_host_charset_name = GDB_DEFAULT_HOST_CHARSET;
187 static const char *host_charset_name = "auto";
188 static void
189 show_host_charset_name (struct ui_file *file, int from_tty,
190 struct cmd_list_element *c,
191 const char *value)
192 {
193 if (!strcmp (value, "auto"))
194 fprintf_filtered (file,
195 _("The host character set is \"auto; currently %s\".\n"),
196 auto_host_charset_name);
197 else
198 fprintf_filtered (file, _("The host character set is \"%s\".\n"), value);
199 }
200
201 static const char *target_charset_name = GDB_DEFAULT_TARGET_CHARSET;
202 static void
203 show_target_charset_name (struct ui_file *file, int from_tty,
204 struct cmd_list_element *c, const char *value)
205 {
206 fprintf_filtered (file, _("The target character set is \"%s\".\n"),
207 value);
208 }
209
210 static const char *target_wide_charset_name = GDB_DEFAULT_TARGET_WIDE_CHARSET;
211 static void
212 show_target_wide_charset_name (struct ui_file *file, int from_tty,
213 struct cmd_list_element *c, const char *value)
214 {
215 fprintf_filtered (file, _("The target wide character set is \"%s\".\n"),
216 value);
217 }
218
219 static const char *default_charset_names[] =
220 {
221 DEFAULT_CHARSET_NAMES
222 0
223 };
224
225 static const char **charset_enum;
226
227 \f
228 /* If the target wide character set has big- or little-endian
229 variants, these are the corresponding names. */
230 static const char *target_wide_charset_be_name;
231 static const char *target_wide_charset_le_name;
232
233 /* A helper function for validate which sets the target wide big- and
234 little-endian character set names, if possible. */
235
236 static void
237 set_be_le_names (void)
238 {
239 int i, len;
240
241 target_wide_charset_le_name = NULL;
242 target_wide_charset_be_name = NULL;
243
244 len = strlen (target_wide_charset_name);
245 for (i = 0; charset_enum[i]; ++i)
246 {
247 if (strncmp (target_wide_charset_name, charset_enum[i], len))
248 continue;
249 if ((charset_enum[i][len] == 'B'
250 || charset_enum[i][len] == 'L')
251 && charset_enum[i][len + 1] == 'E'
252 && charset_enum[i][len + 2] == '\0')
253 {
254 if (charset_enum[i][len] == 'B')
255 target_wide_charset_be_name = charset_enum[i];
256 else
257 target_wide_charset_le_name = charset_enum[i];
258 }
259 }
260 }
261
262 /* 'Set charset', 'set host-charset', 'set target-charset', 'set
263 target-wide-charset', 'set charset' sfunc's. */
264
265 static void
266 validate (void)
267 {
268 iconv_t desc;
269 const char *host_cset = host_charset ();
270
271 desc = iconv_open (target_wide_charset_name, host_cset);
272 if (desc == (iconv_t) -1)
273 error ("Cannot convert between character sets `%s' and `%s'",
274 target_wide_charset_name, host_cset);
275 iconv_close (desc);
276
277 desc = iconv_open (target_charset_name, host_cset);
278 if (desc == (iconv_t) -1)
279 error ("Cannot convert between character sets `%s' and `%s'",
280 target_charset_name, host_cset);
281 iconv_close (desc);
282
283 set_be_le_names ();
284 }
285
286 /* This is the sfunc for the 'set charset' command. */
287 static void
288 set_charset_sfunc (char *charset, int from_tty, struct cmd_list_element *c)
289 {
290 /* CAREFUL: set the target charset here as well. */
291 target_charset_name = host_charset_name;
292 validate ();
293 }
294
/* The sfunc for the 'set host-charset' command.  This is only a
   wrapper giving validate the signature an sfunc must have; CHARSET,
   FROM_TTY and C are unused.  */
static void
set_host_charset_sfunc (char *charset, int from_tty,
                        struct cmd_list_element *c)
{
  validate ();
}
303
/* The sfunc for the 'set target-charset' command.  This is only a
   wrapper giving validate the signature an sfunc must have; CHARSET,
   FROM_TTY and C are unused.  */
static void
set_target_charset_sfunc (char *charset, int from_tty,
                          struct cmd_list_element *c)
{
  validate ();
}
311
/* The sfunc for the 'set target-wide-charset' command.  This is only
   a wrapper giving validate the signature an sfunc must have;
   CHARSET, FROM_TTY and C are unused.  */
static void
set_target_wide_charset_sfunc (char *charset, int from_tty,
                               struct cmd_list_element *c)
{
  validate ();
}
319
320 /* sfunc for the 'show charset' command. */
321 static void
322 show_charset (struct ui_file *file, int from_tty, struct cmd_list_element *c,
323 const char *name)
324 {
325 show_host_charset_name (file, from_tty, c, host_charset_name);
326 show_target_charset_name (file, from_tty, c, target_charset_name);
327 show_target_wide_charset_name (file, from_tty, c, target_wide_charset_name);
328 }
329
330 \f
331 /* Accessor functions. */
332
333 const char *
334 host_charset (void)
335 {
336 if (!strcmp (host_charset_name, "auto"))
337 return auto_host_charset_name;
338 return host_charset_name;
339 }
340
341 const char *
342 target_charset (void)
343 {
344 return target_charset_name;
345 }
346
347 const char *
348 target_wide_charset (void)
349 {
350 if (gdbarch_byte_order (current_gdbarch) == BFD_ENDIAN_BIG)
351 {
352 if (target_wide_charset_be_name)
353 return target_wide_charset_be_name;
354 }
355 else
356 {
357 if (target_wide_charset_le_name)
358 return target_wide_charset_le_name;
359 }
360
361 return target_wide_charset_name;
362 }
363
364 \f
365 /* Host character set management. For the time being, we assume that
366 the host character set is some superset of ASCII. */
367
/* Map the host character C to its control character.  '?' maps to
   DEL (0177); every other character has the bits 0140 cleared, so
   that for example 'A' and 'a' both yield 001.  */

char
host_letter_to_control_character (char c)
{
  return c == '?' ? 0177 : (c & 0237);
}
375
/* Convert a host character, C, to its hex value (0 through 15).  C
   must already have been validated using isxdigit; anything that is
   not a decimal digit or a letter a-f / A-F trips a gdb_assert.  */

int
host_hex_value (char c)
{
  /* The <ctype.h> functions are only defined for EOF and values
     representable as unsigned char, so a plain char that may be
     negative must be converted first.  */
  if (isdigit ((unsigned char) c))
    return c - '0';
  if (c >= 'a' && c <= 'f')
    return 10 + c - 'a';
  gdb_assert (c >= 'A' && c <= 'F');
  return 10 + c - 'A';
}
389
390 \f
391 /* Public character management functions. */
392
393 /* A cleanup function which is run to close an iconv descriptor. */
394
395 static void
396 cleanup_iconv (void *p)
397 {
398 iconv_t *descp = p;
399 iconv_close (*descp);
400 }
401
402 void
403 convert_between_encodings (const char *from, const char *to,
404 const gdb_byte *bytes, unsigned int num_bytes,
405 int width, struct obstack *output,
406 enum transliterations translit)
407 {
408 iconv_t desc;
409 struct cleanup *cleanups;
410 size_t inleft;
411 char *inp;
412 unsigned int space_request;
413
414 /* Often, the host and target charsets will be the same. */
415 if (!strcmp (from, to))
416 {
417 obstack_grow (output, bytes, num_bytes);
418 return;
419 }
420
421 desc = iconv_open (to, from);
422 if (desc == (iconv_t) -1)
423 perror_with_name ("Converting character sets");
424 cleanups = make_cleanup (cleanup_iconv, &desc);
425
426 inleft = num_bytes;
427 inp = (char *) bytes;
428
429 space_request = num_bytes;
430
431 while (inleft > 0)
432 {
433 char *outp;
434 size_t outleft, r;
435 int old_size;
436
437 old_size = obstack_object_size (output);
438 obstack_blank (output, space_request);
439
440 outp = obstack_base (output) + old_size;
441 outleft = space_request;
442
443 r = iconv (desc, &inp, &inleft, &outp, &outleft);
444
445 /* Now make sure that the object on the obstack only includes
446 bytes we have converted. */
447 obstack_blank (output, - (int) outleft);
448
449 if (r == (size_t) -1)
450 {
451 switch (errno)
452 {
453 case EILSEQ:
454 {
455 int i;
456
457 /* Invalid input sequence. */
458 if (translit == translit_none)
459 error (_("Could not convert character to `%s' character set"),
460 to);
461
462 /* We emit escape sequence for the bytes, skip them,
463 and try again. */
464 for (i = 0; i < width; ++i)
465 {
466 char octal[5];
467
468 sprintf (octal, "\\%.3o", *inp & 0xff);
469 obstack_grow_str (output, octal);
470
471 ++inp;
472 --inleft;
473 }
474 }
475 break;
476
477 case E2BIG:
478 /* We ran out of space in the output buffer. Make it
479 bigger next time around. */
480 space_request *= 2;
481 break;
482
483 case EINVAL:
484 /* Incomplete input sequence. FIXME: ought to report this
485 to the caller somehow. */
486 inleft = 0;
487 break;
488
489 default:
490 perror_with_name ("Internal error while converting character sets");
491 }
492 }
493 }
494
495 do_cleanups (cleanups);
496 }
497
498 \f
499
500 /* An iterator that returns host wchar_t's from a target string. */
501 struct wchar_iterator
502 {
503 /* The underlying iconv descriptor. */
504 iconv_t desc;
505
506 /* The input string. This is updated as convert characters. */
507 char *input;
508 /* The number of bytes remaining in the input. */
509 size_t bytes;
510
511 /* The width of an input character. */
512 size_t width;
513
514 /* The output buffer and its size. */
515 gdb_wchar_t *out;
516 size_t out_size;
517 };
518
519 /* Create a new iterator. */
520 struct wchar_iterator *
521 make_wchar_iterator (const gdb_byte *input, size_t bytes, const char *charset,
522 size_t width)
523 {
524 struct wchar_iterator *result;
525 iconv_t desc;
526
527 desc = iconv_open ("wchar_t", charset);
528 if (desc == (iconv_t) -1)
529 perror_with_name ("Converting character sets");
530
531 result = XNEW (struct wchar_iterator);
532 result->desc = desc;
533 result->input = (char *) input;
534 result->bytes = bytes;
535 result->width = width;
536
537 result->out = XNEW (gdb_wchar_t);
538 result->out_size = 1;
539
540 return result;
541 }
542
543 static void
544 do_cleanup_iterator (void *p)
545 {
546 struct wchar_iterator *iter = p;
547
548 iconv_close (iter->desc);
549 xfree (iter->out);
550 xfree (iter);
551 }
552
553 struct cleanup *
554 make_cleanup_wchar_iterator (struct wchar_iterator *iter)
555 {
556 return make_cleanup (do_cleanup_iterator, iter);
557 }
558
559 int
560 wchar_iterate (struct wchar_iterator *iter,
561 enum wchar_iterate_result *out_result,
562 gdb_wchar_t **out_chars,
563 const gdb_byte **ptr,
564 size_t *len)
565 {
566 size_t out_request;
567
568 /* Try to convert some characters. At first we try to convert just
569 a single character. The reason for this is that iconv does not
570 necessarily update its outgoing arguments when it encounters an
571 invalid input sequence -- but we want to reliably report this to
572 our caller so it can emit an escape sequence. */
573 out_request = 1;
574 while (iter->bytes > 0)
575 {
576 char *outptr = (char *) &iter->out[0];
577 char *orig_inptr = iter->input;
578 size_t orig_in = iter->bytes;
579 size_t out_avail = out_request * sizeof (gdb_wchar_t);
580 size_t num;
581 gdb_wchar_t result;
582
583 size_t r = iconv (iter->desc, (char **) &iter->input, &iter->bytes,
584 &outptr, &out_avail);
585 if (r == (size_t) -1)
586 {
587 switch (errno)
588 {
589 case EILSEQ:
590 /* Invalid input sequence. Skip it, and let the caller
591 know about it. */
592 *out_result = wchar_iterate_invalid;
593 *ptr = iter->input;
594 *len = iter->width;
595 iter->input += iter->width;
596 iter->bytes -= iter->width;
597 return 0;
598
599 case E2BIG:
600 /* We ran out of space. We still might have converted a
601 character; if so, return it. Otherwise, grow the
602 buffer and try again. */
603 if (out_avail < out_request * sizeof (gdb_wchar_t))
604 break;
605
606 ++out_request;
607 if (out_request > iter->out_size)
608 {
609 iter->out_size = out_request;
610 iter->out = xrealloc (iter->out,
611 out_request * sizeof (gdb_wchar_t));
612 }
613 continue;
614
615 case EINVAL:
616 /* Incomplete input sequence. Let the caller know, and
617 arrange for future calls to see EOF. */
618 *out_result = wchar_iterate_incomplete;
619 *ptr = iter->input;
620 *len = iter->bytes;
621 iter->bytes = 0;
622 return 0;
623
624 default:
625 perror_with_name ("Internal error while converting character sets");
626 }
627 }
628
629 /* We converted something. */
630 num = out_request - out_avail / sizeof (gdb_wchar_t);
631 *out_result = wchar_iterate_ok;
632 *out_chars = iter->out;
633 *ptr = orig_inptr;
634 *len = orig_in - iter->bytes;
635 return num;
636 }
637
638 /* Really done. */
639 *out_result = wchar_iterate_eof;
640 return -1;
641 }
642
643 \f
644 /* The charset.c module initialization function. */
645
646 extern initialize_file_ftype _initialize_charset; /* -Wmissing-prototype */
647
648 typedef char *char_ptr;
649 DEF_VEC_P (char_ptr);
650
651 static VEC (char_ptr) *charsets;
652
653 #ifdef PHONY_ICONV
654
655 static void
656 find_charset_names (void)
657 {
658 VEC_safe_push (char_ptr, charsets, GDB_DEFAULT_HOST_CHARSET);
659 VEC_safe_push (char_ptr, charsets, NULL);
660 }
661
662 #else /* PHONY_ICONV */
663 #ifdef HAVE_ICONVLIST
664
665 /* A helper function that adds some character sets to the vector of
666 all character sets. This is a callback function for iconvlist. */
667
668 static int
669 add_one (unsigned int count, const char *const *names, void *data)
670 {
671 unsigned int i;
672
673 for (i = 0; i < count; ++i)
674 VEC_safe_push (char_ptr, charsets, xstrdup (names[i]));
675
676 return 0;
677 }
678
679 static void
680 find_charset_names (void)
681 {
682 iconvlist (add_one, NULL);
683 VEC_safe_push (char_ptr, charsets, NULL);
684 }
685
686 #else
687
688 static void
689 find_charset_names (void)
690 {
691 FILE *in;
692
693 in = popen ("iconv -l", "r");
694 /* It is ok to ignore errors; we'll fall back on a default. */
695 if (!in)
696 return;
697
698 /* POSIX says that iconv -l uses an unspecified format. We parse
699 the glibc format; feel free to add others as needed. */
700 while (!feof (in))
701 {
702 /* The size of buf is chosen arbitrarily. A character set name
703 longer than this would not be very nice. */
704 char buf[80];
705 int len;
706 char *r = fgets (buf, sizeof (buf), in);
707 if (!r)
708 break;
709 len = strlen (r);
710 if (len <= 3)
711 continue;
712 if (buf[len - 2] == '/' && buf[len - 3] == '/')
713 buf[len - 3] = '\0';
714 VEC_safe_push (char_ptr, charsets, xstrdup (buf));
715 }
716
717 pclose (in);
718
719 VEC_safe_push (char_ptr, charsets, NULL);
720 }
721
722 #endif /* HAVE_ICONVLIST */
723 #endif /* PHONY_ICONV */
724
725 void
726 _initialize_charset (void)
727 {
728 struct cmd_list_element *new_cmd;
729
730 /* The first element is always "auto"; then we skip it for the
731 commands where it is not allowed. */
732 VEC_safe_push (char_ptr, charsets, "auto");
733 find_charset_names ();
734
735 if (VEC_length (char_ptr, charsets) > 1)
736 charset_enum = (const char **) VEC_address (char_ptr, charsets);
737 else
738 charset_enum = default_charset_names;
739
740 #ifndef PHONY_ICONV
741 #ifdef HAVE_LANGINFO_CODESET
742 auto_host_charset_name = nl_langinfo (CODESET);
743 target_charset_name = auto_host_charset_name;
744
745 set_be_le_names ();
746 #endif
747 #endif
748
749 add_setshow_enum_cmd ("charset", class_support,
750 &charset_enum[1], &host_charset_name, _("\
751 Set the host and target character sets."), _("\
752 Show the host and target character sets."), _("\
753 The `host character set' is the one used by the system GDB is running on.\n\
754 The `target character set' is the one used by the program being debugged.\n\
755 You may only use supersets of ASCII for your host character set; GDB does\n\
756 not support any others.\n\
757 To see a list of the character sets GDB supports, type `set charset <TAB>'."),
758 /* Note that the sfunc below needs to set
759 target_charset_name, because the 'set
760 charset' command sets two variables. */
761 set_charset_sfunc,
762 show_charset,
763 &setlist, &showlist);
764
765 add_setshow_enum_cmd ("host-charset", class_support,
766 charset_enum, &host_charset_name, _("\
767 Set the host character set."), _("\
768 Show the host character set."), _("\
769 The `host character set' is the one used by the system GDB is running on.\n\
770 You may only use supersets of ASCII for your host character set; GDB does\n\
771 not support any others.\n\
772 To see a list of the character sets GDB supports, type `set host-charset <TAB>'."),
773 set_host_charset_sfunc,
774 show_host_charset_name,
775 &setlist, &showlist);
776
777 add_setshow_enum_cmd ("target-charset", class_support,
778 &charset_enum[1], &target_charset_name, _("\
779 Set the target character set."), _("\
780 Show the target character set."), _("\
781 The `target character set' is the one used by the program being debugged.\n\
782 GDB translates characters and strings between the host and target\n\
783 character sets as needed.\n\
784 To see a list of the character sets GDB supports, type `set target-charset'<TAB>"),
785 set_target_charset_sfunc,
786 show_target_charset_name,
787 &setlist, &showlist);
788
789 add_setshow_enum_cmd ("target-wide-charset", class_support,
790 &charset_enum[1], &target_wide_charset_name,
791 _("\
792 Set the target wide character set."), _("\
793 Show the target wide character set."), _("\
794 The `target wide character set' is the one used by the program being debugged.\n\
795 In particular it is the encoding used by `wchar_t'.\n\
796 GDB translates characters and strings between the host and target\n\
797 character sets as needed.\n\
798 To see a list of the character sets GDB supports, type\n\
799 `set target-wide-charset'<TAB>"),
800 set_target_wide_charset_sfunc,
801 show_target_wide_charset_name,
802 &setlist, &showlist);
803 }
This page took 0.358406 seconds and 4 git commands to generate.