2009-03-24 Tom Tromey <tromey@redhat.com>
[deliverable/binutils-gdb.git] / gdb / charset.c
1 /* Character set conversion support for GDB.
2
3 Copyright (C) 2001, 2003, 2007, 2008, 2009 Free Software Foundation, Inc.
4
5 This file is part of GDB.
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19
20 #include "defs.h"
21 #include "charset.h"
22 #include "gdbcmd.h"
23 #include "gdb_assert.h"
24 #include "gdb_obstack.h"
25 #include "charset-list.h"
26 #include "vec.h"
27
28 #include <stddef.h>
29 #include "gdb_string.h"
30 #include <ctype.h>
31
32 \f
33 /* How GDB's character set support works
34
35 GDB has three global settings:
36
37 - The `current host character set' is the character set GDB should
38 use in talking to the user, and which (hopefully) the user's
39 terminal knows how to display properly. Most users should not
40 change this.
41
42 - The `current target character set' is the character set the
43 program being debugged uses.
44
45 - The `current target wide character set' is the wide character set
46 the program being debugged uses, that is, the encoding used for
47 wchar_t.
48
49 There are commands to set each of these, and mechanisms for
50 choosing reasonable default values. GDB has a global list of
51 character sets that it can use as its host or target character
52 sets.
53
54 The header file `charset.h' declares various functions that
55 different pieces of GDB need to perform tasks like:
56
57 - printing target strings and characters to the user's terminal
58 (mostly target->host conversions),
59
60 - building target-appropriate representations of strings and
61 characters the user enters in expressions (mostly host->target
62 conversions),
63
64 and so on.
65
66 To avoid excessive code duplication and maintenance efforts,
67 GDB simply requires a capable iconv function. Users on platforms
68 without a suitable iconv can use the GNU iconv library. */
69
70 \f
71 #ifdef PHONY_ICONV
72
73 /* Provide a phony iconv that does as little as possible. Also,
74 arrange for there to be a single available character set. */
75
#undef GDB_DEFAULT_HOST_CHARSET
#define GDB_DEFAULT_HOST_CHARSET "ISO-8859-1"
#define GDB_DEFAULT_TARGET_CHARSET "ISO-8859-1"
#define GDB_DEFAULT_TARGET_WIDE_CHARSET "ISO-8859-1"
#undef DEFAULT_CHARSET_NAMES
#define DEFAULT_CHARSET_NAMES GDB_DEFAULT_HOST_CHARSET ,

/* The phony descriptor is a plain int: 1 means "input is UCS-4BE",
   0 means "copy bytes through", -1 means failure.  */
#undef iconv_t
#define iconv_t int
#undef iconv_open
#undef iconv
#undef iconv_close

#undef ICONV_CONST
#define ICONV_CONST const

/* Phony iconv_open.  Conversions are accepted from UCS-4BE, wchar_t
   and the host charset, and to wchar_t and the host charset.
   Returns -1 for anything else.  Otherwise returns 1 when FROM is
   UCS-4BE and 0 when it is not; iconv uses that value as a flag.  */

iconv_t
iconv_open (const char *to, const char *from)
{
  if (strcmp (from, "UCS-4BE") && strcmp (from, "wchar_t")
      && strcmp (from, GDB_DEFAULT_HOST_CHARSET))
    return -1;
  if (strcmp (to, "wchar_t") && strcmp (to, GDB_DEFAULT_HOST_CHARSET))
    return -1;

  return !strcmp (from, "UCS-4BE");
}

/* Phony iconv_close.  There is nothing to release; always returns 0.  */

int
iconv_close (iconv_t arg)
{
  return 0;
}

/* Phony iconv.  When UCS_FLAG is nonzero the input is treated as a
   sequence of 4-byte big-endian code points, each of which must be
   below 256 and is emitted as a single output byte.  Otherwise input
   bytes are copied to the output unchanged.

   The in/out pointers and counters are advanced past whatever was
   processed.  Returns 0 on success.  On failure returns (size_t) -1
   and sets errno:
     EILSEQ - a UCS-4BE code point does not fit in one byte;
     E2BIG  - the output buffer is full but input remains;
     EINVAL - the UCS-4BE input ends with a partial character.  */

size_t
iconv (iconv_t ucs_flag, const char **inbuf, size_t *inbytesleft,
       char **outbuf, size_t *outbytesleft)
{
  if (ucs_flag)
    {
      while (*inbytesleft >= 4)
        {
          size_t j;
          unsigned long c = 0;

          for (j = 0; j < 4; ++j)
            {
              c <<= 8;
              c += (*inbuf)[j] & 0xff;
            }

          if (c >= 256)
            {
              errno = EILSEQ;
              return (size_t) -1;
            }

          /* Never write past the caller's buffer; report the
             shortage so the caller can retry with more room.  */
          if (*outbytesleft < 1)
            {
              errno = E2BIG;
              return (size_t) -1;
            }

          **outbuf = c & 0xff;
          ++*outbuf;
          --*outbytesleft;

          /* One input character is four bytes wide.  */
          *inbuf += 4;
          *inbytesleft -= 4;
        }

      /* Anything left over is shorter than one character.  */
      if (*inbytesleft)
        {
          errno = EINVAL;
          return (size_t) -1;
        }
    }
  else
    {
      /* In all other cases we simply copy input bytes to the
         output.  */
      size_t amt = *inbytesleft;

      if (amt > *outbytesleft)
        amt = *outbytesleft;
      memcpy (*outbuf, *inbuf, amt);
      *inbuf += amt;
      *outbuf += amt;
      *inbytesleft -= amt;
      *outbytesleft -= amt;
    }

  if (*inbytesleft)
    {
      errno = E2BIG;
      return (size_t) -1;
    }

  /* The number of non-reversible conversions -- but they were all
     reversible.  */
  return 0;
}
173
174 #endif
175
176
177 \f
178 /* The global lists of character sets and translations. */
179
180
/* Fallback defaults, used only when nothing earlier has chosen a
   target or target-wide character set.  */

#if !defined (GDB_DEFAULT_TARGET_CHARSET)
# define GDB_DEFAULT_TARGET_CHARSET "ISO-8859-1"
#endif

#if !defined (GDB_DEFAULT_TARGET_WIDE_CHARSET)
# define GDB_DEFAULT_TARGET_WIDE_CHARSET "UCS-4"
#endif
188
189 static const char *auto_host_charset_name = GDB_DEFAULT_HOST_CHARSET;
190 static const char *host_charset_name = "auto";
191 static void
192 show_host_charset_name (struct ui_file *file, int from_tty,
193 struct cmd_list_element *c,
194 const char *value)
195 {
196 if (!strcmp (value, "auto"))
197 fprintf_filtered (file,
198 _("The host character set is \"auto; currently %s\".\n"),
199 auto_host_charset_name);
200 else
201 fprintf_filtered (file, _("The host character set is \"%s\".\n"), value);
202 }
203
204 static const char *target_charset_name = GDB_DEFAULT_TARGET_CHARSET;
205 static void
206 show_target_charset_name (struct ui_file *file, int from_tty,
207 struct cmd_list_element *c, const char *value)
208 {
209 fprintf_filtered (file, _("The target character set is \"%s\".\n"),
210 value);
211 }
212
213 static const char *target_wide_charset_name = GDB_DEFAULT_TARGET_WIDE_CHARSET;
214 static void
215 show_target_wide_charset_name (struct ui_file *file, int from_tty,
216 struct cmd_list_element *c, const char *value)
217 {
218 fprintf_filtered (file, _("The target wide character set is \"%s\".\n"),
219 value);
220 }
221
222 static const char *default_charset_names[] =
223 {
224 DEFAULT_CHARSET_NAMES
225 0
226 };
227
228 static const char **charset_enum;
229
230 \f
231 /* If the target wide character set has big- or little-endian
232 variants, these are the corresponding names. */
233 static const char *target_wide_charset_be_name;
234 static const char *target_wide_charset_le_name;
235
236 /* A helper function for validate which sets the target wide big- and
237 little-endian character set names, if possible. */
238
239 static void
240 set_be_le_names (void)
241 {
242 int i, len;
243
244 target_wide_charset_le_name = NULL;
245 target_wide_charset_be_name = NULL;
246
247 len = strlen (target_wide_charset_name);
248 for (i = 0; charset_enum[i]; ++i)
249 {
250 if (strncmp (target_wide_charset_name, charset_enum[i], len))
251 continue;
252 if ((charset_enum[i][len] == 'B'
253 || charset_enum[i][len] == 'L')
254 && charset_enum[i][len + 1] == 'E'
255 && charset_enum[i][len + 2] == '\0')
256 {
257 if (charset_enum[i][len] == 'B')
258 target_wide_charset_be_name = charset_enum[i];
259 else
260 target_wide_charset_le_name = charset_enum[i];
261 }
262 }
263 }
264
265 /* 'Set charset', 'set host-charset', 'set target-charset', 'set
266 target-wide-charset', 'set charset' sfunc's. */
267
268 static void
269 validate (void)
270 {
271 iconv_t desc;
272 const char *host_cset = host_charset ();
273
274 desc = iconv_open (target_wide_charset_name, host_cset);
275 if (desc == (iconv_t) -1)
276 error ("Cannot convert between character sets `%s' and `%s'",
277 target_wide_charset_name, host_cset);
278 iconv_close (desc);
279
280 desc = iconv_open (target_charset_name, host_cset);
281 if (desc == (iconv_t) -1)
282 error ("Cannot convert between character sets `%s' and `%s'",
283 target_charset_name, host_cset);
284 iconv_close (desc);
285
286 set_be_le_names ();
287 }
288
289 /* This is the sfunc for the 'set charset' command. */
290 static void
291 set_charset_sfunc (char *charset, int from_tty, struct cmd_list_element *c)
292 {
293 /* CAREFUL: set the target charset here as well. */
294 target_charset_name = host_charset_name;
295 validate ();
296 }
297
/* The sfunc for 'set host-charset'.  It exists only to give
   validate the signature the command machinery expects; all three
   arguments are unused.  */

static void
set_host_charset_sfunc (char *charset, int from_tty,
                        struct cmd_list_element *c)
{
  validate ();
}
306
/* The sfunc for 'set target-charset': re-validate the settings.
   The arguments are unused.  */

static void
set_target_charset_sfunc (char *charset, int from_tty,
                          struct cmd_list_element *c)
{
  validate ();
}
314
/* The sfunc for 'set target-wide-charset': re-validate the
   settings.  The arguments are unused.  */

static void
set_target_wide_charset_sfunc (char *charset, int from_tty,
                               struct cmd_list_element *c)
{
  validate ();
}
322
323 /* sfunc for the 'show charset' command. */
324 static void
325 show_charset (struct ui_file *file, int from_tty, struct cmd_list_element *c,
326 const char *name)
327 {
328 show_host_charset_name (file, from_tty, c, host_charset_name);
329 show_target_charset_name (file, from_tty, c, target_charset_name);
330 show_target_wide_charset_name (file, from_tty, c, target_wide_charset_name);
331 }
332
333 \f
334 /* Accessor functions. */
335
336 const char *
337 host_charset (void)
338 {
339 if (!strcmp (host_charset_name, "auto"))
340 return auto_host_charset_name;
341 return host_charset_name;
342 }
343
344 const char *
345 target_charset (void)
346 {
347 return target_charset_name;
348 }
349
350 const char *
351 target_wide_charset (void)
352 {
353 if (gdbarch_byte_order (current_gdbarch) == BFD_ENDIAN_BIG)
354 {
355 if (target_wide_charset_be_name)
356 return target_wide_charset_be_name;
357 }
358 else
359 {
360 if (target_wide_charset_le_name)
361 return target_wide_charset_le_name;
362 }
363
364 return target_wide_charset_name;
365 }
366
367 \f
368 /* Host character set management. For the time being, we assume that
369 the host character set is some superset of ASCII. */
370
/* Return the control character that corresponds to host character C.
   '?' maps to DEL (0177); any other character is masked with 0237.  */

char
host_letter_to_control_character (char c)
{
  const char delete_char = 0177;

  return c == '?' ? delete_char : (char) (c & 0237);
}
378
/* Convert a host character, C, to its hex value (0 through 15).  C
   must already have been validated using isxdigit; anything else
   trips the assertion.  */

int
host_hex_value (char c)
{
  /* The <ctype.h> functions are only defined for EOF and values
     representable as unsigned char, and plain char may be signed.  */
  if (isdigit ((unsigned char) c))
    return c - '0';
  if (c >= 'a' && c <= 'f')
    return 10 + c - 'a';
  gdb_assert (c >= 'A' && c <= 'F');
  return 10 + c - 'A';
}
392
393 \f
394 /* Public character management functions. */
395
396 /* A cleanup function which is run to close an iconv descriptor. */
397
398 static void
399 cleanup_iconv (void *p)
400 {
401 iconv_t *descp = p;
402 iconv_close (*descp);
403 }
404
405 void
406 convert_between_encodings (const char *from, const char *to,
407 const gdb_byte *bytes, unsigned int num_bytes,
408 int width, struct obstack *output,
409 enum transliterations translit)
410 {
411 iconv_t desc;
412 struct cleanup *cleanups;
413 size_t inleft;
414 char *inp;
415 unsigned int space_request;
416
417 /* Often, the host and target charsets will be the same. */
418 if (!strcmp (from, to))
419 {
420 obstack_grow (output, bytes, num_bytes);
421 return;
422 }
423
424 desc = iconv_open (to, from);
425 if (desc == (iconv_t) -1)
426 perror_with_name ("Converting character sets");
427 cleanups = make_cleanup (cleanup_iconv, &desc);
428
429 inleft = num_bytes;
430 inp = (char *) bytes;
431
432 space_request = num_bytes;
433
434 while (inleft > 0)
435 {
436 char *outp;
437 size_t outleft, r;
438 int old_size;
439
440 old_size = obstack_object_size (output);
441 obstack_blank (output, space_request);
442
443 outp = obstack_base (output) + old_size;
444 outleft = space_request;
445
446 r = iconv (desc, (ICONV_CONST char **) &inp, &inleft, &outp, &outleft);
447
448 /* Now make sure that the object on the obstack only includes
449 bytes we have converted. */
450 obstack_blank (output, - (int) outleft);
451
452 if (r == (size_t) -1)
453 {
454 switch (errno)
455 {
456 case EILSEQ:
457 {
458 int i;
459
460 /* Invalid input sequence. */
461 if (translit == translit_none)
462 error (_("Could not convert character to `%s' character set"),
463 to);
464
465 /* We emit escape sequence for the bytes, skip them,
466 and try again. */
467 for (i = 0; i < width; ++i)
468 {
469 char octal[5];
470
471 sprintf (octal, "\\%.3o", *inp & 0xff);
472 obstack_grow_str (output, octal);
473
474 ++inp;
475 --inleft;
476 }
477 }
478 break;
479
480 case E2BIG:
481 /* We ran out of space in the output buffer. Make it
482 bigger next time around. */
483 space_request *= 2;
484 break;
485
486 case EINVAL:
487 /* Incomplete input sequence. FIXME: ought to report this
488 to the caller somehow. */
489 inleft = 0;
490 break;
491
492 default:
493 perror_with_name ("Internal error while converting character sets");
494 }
495 }
496 }
497
498 do_cleanups (cleanups);
499 }
500
501 \f
502
503 /* An iterator that returns host wchar_t's from a target string. */
504 struct wchar_iterator
505 {
506 /* The underlying iconv descriptor. */
507 iconv_t desc;
508
509 /* The input string. This is updated as convert characters. */
510 char *input;
511 /* The number of bytes remaining in the input. */
512 size_t bytes;
513
514 /* The width of an input character. */
515 size_t width;
516
517 /* The output buffer and its size. */
518 gdb_wchar_t *out;
519 size_t out_size;
520 };
521
522 /* Create a new iterator. */
523 struct wchar_iterator *
524 make_wchar_iterator (const gdb_byte *input, size_t bytes, const char *charset,
525 size_t width)
526 {
527 struct wchar_iterator *result;
528 iconv_t desc;
529
530 desc = iconv_open ("wchar_t", charset);
531 if (desc == (iconv_t) -1)
532 perror_with_name ("Converting character sets");
533
534 result = XNEW (struct wchar_iterator);
535 result->desc = desc;
536 result->input = (char *) input;
537 result->bytes = bytes;
538 result->width = width;
539
540 result->out = XNEW (gdb_wchar_t);
541 result->out_size = 1;
542
543 return result;
544 }
545
546 static void
547 do_cleanup_iterator (void *p)
548 {
549 struct wchar_iterator *iter = p;
550
551 iconv_close (iter->desc);
552 xfree (iter->out);
553 xfree (iter);
554 }
555
556 struct cleanup *
557 make_cleanup_wchar_iterator (struct wchar_iterator *iter)
558 {
559 return make_cleanup (do_cleanup_iterator, iter);
560 }
561
562 int
563 wchar_iterate (struct wchar_iterator *iter,
564 enum wchar_iterate_result *out_result,
565 gdb_wchar_t **out_chars,
566 const gdb_byte **ptr,
567 size_t *len)
568 {
569 size_t out_request;
570
571 /* Try to convert some characters. At first we try to convert just
572 a single character. The reason for this is that iconv does not
573 necessarily update its outgoing arguments when it encounters an
574 invalid input sequence -- but we want to reliably report this to
575 our caller so it can emit an escape sequence. */
576 out_request = 1;
577 while (iter->bytes > 0)
578 {
579 char *outptr = (char *) &iter->out[0];
580 char *orig_inptr = iter->input;
581 size_t orig_in = iter->bytes;
582 size_t out_avail = out_request * sizeof (gdb_wchar_t);
583 size_t num;
584 gdb_wchar_t result;
585
586 size_t r = iconv (iter->desc,
587 (ICONV_CONST char **) &iter->input, &iter->bytes,
588 &outptr, &out_avail);
589 if (r == (size_t) -1)
590 {
591 switch (errno)
592 {
593 case EILSEQ:
594 /* Invalid input sequence. Skip it, and let the caller
595 know about it. */
596 *out_result = wchar_iterate_invalid;
597 *ptr = iter->input;
598 *len = iter->width;
599 iter->input += iter->width;
600 iter->bytes -= iter->width;
601 return 0;
602
603 case E2BIG:
604 /* We ran out of space. We still might have converted a
605 character; if so, return it. Otherwise, grow the
606 buffer and try again. */
607 if (out_avail < out_request * sizeof (gdb_wchar_t))
608 break;
609
610 ++out_request;
611 if (out_request > iter->out_size)
612 {
613 iter->out_size = out_request;
614 iter->out = xrealloc (iter->out,
615 out_request * sizeof (gdb_wchar_t));
616 }
617 continue;
618
619 case EINVAL:
620 /* Incomplete input sequence. Let the caller know, and
621 arrange for future calls to see EOF. */
622 *out_result = wchar_iterate_incomplete;
623 *ptr = iter->input;
624 *len = iter->bytes;
625 iter->bytes = 0;
626 return 0;
627
628 default:
629 perror_with_name ("Internal error while converting character sets");
630 }
631 }
632
633 /* We converted something. */
634 num = out_request - out_avail / sizeof (gdb_wchar_t);
635 *out_result = wchar_iterate_ok;
636 *out_chars = iter->out;
637 *ptr = orig_inptr;
638 *len = orig_in - iter->bytes;
639 return num;
640 }
641
642 /* Really done. */
643 *out_result = wchar_iterate_eof;
644 return -1;
645 }
646
647 \f
648 /* The charset.c module initialization function. */
649
650 extern initialize_file_ftype _initialize_charset; /* -Wmissing-prototype */
651
652 typedef char *char_ptr;
653 DEF_VEC_P (char_ptr);
654
655 static VEC (char_ptr) *charsets;
656
657 #ifdef PHONY_ICONV
658
659 static void
660 find_charset_names (void)
661 {
662 VEC_safe_push (char_ptr, charsets, GDB_DEFAULT_HOST_CHARSET);
663 VEC_safe_push (char_ptr, charsets, NULL);
664 }
665
666 #else /* PHONY_ICONV */
667
668 /* Sometimes, libiconv redefines iconvlist as libiconvlist -- but
669 provides different symbols in the static and dynamic libraries.
670 So, configure may see libiconvlist but not iconvlist. But, calling
671 iconvlist is the right thing to do and will work. Hence we do a
672 check here but unconditionally call iconvlist below. */
673 #if defined (HAVE_ICONVLIST) || defined (HAVE_LIBICONVLIST)
674
675 /* A helper function that adds some character sets to the vector of
676 all character sets. This is a callback function for iconvlist. */
677
678 static int
679 add_one (unsigned int count, const char *const *names, void *data)
680 {
681 unsigned int i;
682
683 for (i = 0; i < count; ++i)
684 VEC_safe_push (char_ptr, charsets, xstrdup (names[i]));
685
686 return 0;
687 }
688
689 static void
690 find_charset_names (void)
691 {
692 iconvlist (add_one, NULL);
693 VEC_safe_push (char_ptr, charsets, NULL);
694 }
695
696 #else
697
698 static void
699 find_charset_names (void)
700 {
701 FILE *in;
702
703 in = popen ("iconv -l", "r");
704 /* It is ok to ignore errors; we'll fall back on a default. */
705 if (!in)
706 return;
707
708 /* POSIX says that iconv -l uses an unspecified format. We parse
709 the glibc format; feel free to add others as needed. */
710 while (!feof (in))
711 {
712 /* The size of buf is chosen arbitrarily. A character set name
713 longer than this would not be very nice. */
714 char buf[80];
715 int len;
716 char *r = fgets (buf, sizeof (buf), in);
717 if (!r)
718 break;
719 len = strlen (r);
720 if (len <= 3)
721 continue;
722 if (buf[len - 2] == '/' && buf[len - 3] == '/')
723 buf[len - 3] = '\0';
724 VEC_safe_push (char_ptr, charsets, xstrdup (buf));
725 }
726
727 pclose (in);
728
729 VEC_safe_push (char_ptr, charsets, NULL);
730 }
731
732 #endif /* HAVE_ICONVLIST || HAVE_LIBICONVLIST */
733 #endif /* PHONY_ICONV */
734
735 void
736 _initialize_charset (void)
737 {
738 struct cmd_list_element *new_cmd;
739
740 /* The first element is always "auto"; then we skip it for the
741 commands where it is not allowed. */
742 VEC_safe_push (char_ptr, charsets, "auto");
743 find_charset_names ();
744
745 if (VEC_length (char_ptr, charsets) > 1)
746 charset_enum = (const char **) VEC_address (char_ptr, charsets);
747 else
748 charset_enum = default_charset_names;
749
750 #ifndef PHONY_ICONV
751 #ifdef HAVE_LANGINFO_CODESET
752 auto_host_charset_name = nl_langinfo (CODESET);
753 target_charset_name = auto_host_charset_name;
754
755 set_be_le_names ();
756 #endif
757 #endif
758
759 add_setshow_enum_cmd ("charset", class_support,
760 &charset_enum[1], &host_charset_name, _("\
761 Set the host and target character sets."), _("\
762 Show the host and target character sets."), _("\
763 The `host character set' is the one used by the system GDB is running on.\n\
764 The `target character set' is the one used by the program being debugged.\n\
765 You may only use supersets of ASCII for your host character set; GDB does\n\
766 not support any others.\n\
767 To see a list of the character sets GDB supports, type `set charset <TAB>'."),
768 /* Note that the sfunc below needs to set
769 target_charset_name, because the 'set
770 charset' command sets two variables. */
771 set_charset_sfunc,
772 show_charset,
773 &setlist, &showlist);
774
775 add_setshow_enum_cmd ("host-charset", class_support,
776 charset_enum, &host_charset_name, _("\
777 Set the host character set."), _("\
778 Show the host character set."), _("\
779 The `host character set' is the one used by the system GDB is running on.\n\
780 You may only use supersets of ASCII for your host character set; GDB does\n\
781 not support any others.\n\
782 To see a list of the character sets GDB supports, type `set host-charset <TAB>'."),
783 set_host_charset_sfunc,
784 show_host_charset_name,
785 &setlist, &showlist);
786
787 add_setshow_enum_cmd ("target-charset", class_support,
788 &charset_enum[1], &target_charset_name, _("\
789 Set the target character set."), _("\
790 Show the target character set."), _("\
791 The `target character set' is the one used by the program being debugged.\n\
792 GDB translates characters and strings between the host and target\n\
793 character sets as needed.\n\
794 To see a list of the character sets GDB supports, type `set target-charset'<TAB>"),
795 set_target_charset_sfunc,
796 show_target_charset_name,
797 &setlist, &showlist);
798
799 add_setshow_enum_cmd ("target-wide-charset", class_support,
800 &charset_enum[1], &target_wide_charset_name,
801 _("\
802 Set the target wide character set."), _("\
803 Show the target wide character set."), _("\
804 The `target wide character set' is the one used by the program being debugged.\n\
805 In particular it is the encoding used by `wchar_t'.\n\
806 GDB translates characters and strings between the host and target\n\
807 character sets as needed.\n\
808 To see a list of the character sets GDB supports, type\n\
809 `set target-wide-charset'<TAB>"),
810 set_target_wide_charset_sfunc,
811 show_target_wide_charset_name,
812 &setlist, &showlist);
813 }
This page took 0.045317 seconds and 5 git commands to generate.