1 /* Character set conversion support for GDB.
3 Copyright (C) 2001, 2003, 2007, 2008, 2009 Free Software Foundation, Inc.
5 This file is part of GDB.
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
23 #include "gdb_assert.h"
24 #include "gdb_obstack.h"
25 #include "charset-list.h"
29 #include "gdb_string.h"
33 /* How GDB's character set support works
35 GDB has three global settings:
37 - The `current host character set' is the character set GDB should
38 use in talking to the user, and which (hopefully) the user's
39 terminal knows how to display properly. Most users should not
42 - The `current target character set' is the character set the
43 program being debugged uses.
45 - The `current target wide character set' is the wide character set
46 the program being debugged uses, that is, the encoding used for
49 There are commands to set each of these, and mechanisms for
50 choosing reasonable default values. GDB has a global list of
51 character sets that it can use as its host or target character
54 The header file `charset.h' declares various functions that
55 different pieces of GDB need to perform tasks like:
57 - printing target strings and characters to the user's terminal
58 (mostly target->host conversions),
60 - building target-appropriate representations of strings and
61 characters the user enters in expressions (mostly host->target
66 To avoid excessive code duplication and maintenance efforts,
67 GDB simply requires a capable iconv function. Users on platforms
68 without a suitable iconv can use the GNU iconv library. */
73 /* Provide a phony iconv that does as little as possible. Also,
74 arrange for there to be a single available character set. */
76 #undef GDB_DEFAULT_HOST_CHARSET
77 #define GDB_DEFAULT_HOST_CHARSET "ISO-8859-1"
78 #define GDB_DEFAULT_TARGET_CHARSET "ISO-8859-1"
79 #define GDB_DEFAULT_TARGET_WIDE_CHARSET "ISO-8859-1"
80 #undef DEFAULT_CHARSET_NAMES
81 #define DEFAULT_CHARSET_NAMES GDB_DEFAULT_HOST_CHARSET ,
90 iconv_open (const char *to
, const char *from
)
92 /* We allow conversions from UCS-4BE, wchar_t, and the host charset.
93 We allow conversions to wchar_t and the host charset. */
94 if (strcmp (from
, "UCS-4BE") && strcmp (from
, "wchar_t")
95 && strcmp (from
, GDB_DEFAULT_HOST_CHARSET
))
97 if (strcmp (to
, "wchar_t") && strcmp (to
, GDB_DEFAULT_HOST_CHARSET
))
100 /* Return 1 if we are converting from UCS-4BE, 0 otherwise. This is
101 used as a flag in calls to iconv. */
102 return !strcmp (from
, "UCS-4BE");
106 iconv_close (iconv_t arg
)
112 iconv (iconv_t ucs_flag
, char **inbuf
, size_t *inbytesleft
,
113 char **outbuf
, size_t *outbytesleft
)
117 while (*inbytesleft
>= 4)
122 for (j
= 0; j
< 4; ++j
)
125 c
+= (*inbuf
)[j
] & 0xff;
140 if (*inbytesleft
< 4)
148 /* In all other cases we simply copy input bytes to the
150 size_t amt
= *inbytesleft
;
151 if (amt
> *outbytesleft
)
153 memcpy (*outbuf
, *inbuf
, amt
);
157 *outbytesleft
-= amt
;
166 /* The number of non-reversible conversions -- but they were all
175 /* The global lists of character sets and translations. */
178 #ifndef GDB_DEFAULT_TARGET_CHARSET
179 #define GDB_DEFAULT_TARGET_CHARSET "ISO-8859-1"
182 #ifndef GDB_DEFAULT_TARGET_WIDE_CHARSET
183 #define GDB_DEFAULT_TARGET_WIDE_CHARSET "UCS-4"
186 static const char *auto_host_charset_name
= GDB_DEFAULT_HOST_CHARSET
;
187 static const char *host_charset_name
= "auto";
189 show_host_charset_name (struct ui_file
*file
, int from_tty
,
190 struct cmd_list_element
*c
,
193 if (!strcmp (value
, "auto"))
194 fprintf_filtered (file
,
195 _("The host character set is \"auto; currently %s\".\n"),
196 auto_host_charset_name
);
198 fprintf_filtered (file
, _("The host character set is \"%s\".\n"), value
);
201 static const char *target_charset_name
= GDB_DEFAULT_TARGET_CHARSET
;
203 show_target_charset_name (struct ui_file
*file
, int from_tty
,
204 struct cmd_list_element
*c
, const char *value
)
206 fprintf_filtered (file
, _("The target character set is \"%s\".\n"),
210 static const char *target_wide_charset_name
= GDB_DEFAULT_TARGET_WIDE_CHARSET
;
212 show_target_wide_charset_name (struct ui_file
*file
, int from_tty
,
213 struct cmd_list_element
*c
, const char *value
)
215 fprintf_filtered (file
, _("The target wide character set is \"%s\".\n"),
219 static const char *default_charset_names
[] =
221 DEFAULT_CHARSET_NAMES
225 static const char **charset_enum
;
228 /* If the target wide character set has big- or little-endian
229 variants, these are the corresponding names. */
230 static const char *target_wide_charset_be_name
;
231 static const char *target_wide_charset_le_name
;
233 /* A helper function for validate which sets the target wide big- and
234 little-endian character set names, if possible. */
237 set_be_le_names (void)
241 target_wide_charset_le_name
= NULL
;
242 target_wide_charset_be_name
= NULL
;
244 len
= strlen (target_wide_charset_name
);
245 for (i
= 0; charset_enum
[i
]; ++i
)
247 if (strncmp (target_wide_charset_name
, charset_enum
[i
], len
))
249 if ((charset_enum
[i
][len
] == 'B'
250 || charset_enum
[i
][len
] == 'L')
251 && charset_enum
[i
][len
+ 1] == 'E'
252 && charset_enum
[i
][len
+ 2] == '\0')
254 if (charset_enum
[i
][len
] == 'B')
255 target_wide_charset_be_name
= charset_enum
[i
];
257 target_wide_charset_le_name
= charset_enum
[i
];
262 /* The 'set charset', 'set host-charset', 'set target-charset', and
263 'set target-wide-charset' sfuncs. */
269 const char *host_cset
= host_charset ();
271 desc
= iconv_open (target_wide_charset_name
, host_cset
);
272 if (desc
== (iconv_t
) -1)
273 error ("Cannot convert between character sets `%s' and `%s'",
274 target_wide_charset_name
, host_cset
);
277 desc
= iconv_open (target_charset_name
, host_cset
);
278 if (desc
== (iconv_t
) -1)
279 error ("Cannot convert between character sets `%s' and `%s'",
280 target_charset_name
, host_cset
);
286 /* This is the sfunc for the 'set charset' command. */
288 set_charset_sfunc (char *charset
, int from_tty
, struct cmd_list_element
*c
)
290 /* CAREFUL: set the target charset here as well. */
291 target_charset_name
= host_charset_name
;
295 /* 'set host-charset' command sfunc. We need a wrapper here because
296 the function needs to have a specific signature. */
298 set_host_charset_sfunc (char *charset
, int from_tty
,
299 struct cmd_list_element
*c
)
304 /* Wrapper for the 'set target-charset' command. */
306 set_target_charset_sfunc (char *charset
, int from_tty
,
307 struct cmd_list_element
*c
)
312 /* Wrapper for the 'set target-wide-charset' command. */
314 set_target_wide_charset_sfunc (char *charset
, int from_tty
,
315 struct cmd_list_element
*c
)
320 /* sfunc for the 'show charset' command. */
322 show_charset (struct ui_file
*file
, int from_tty
, struct cmd_list_element
*c
,
325 show_host_charset_name (file
, from_tty
, c
, host_charset_name
);
326 show_target_charset_name (file
, from_tty
, c
, target_charset_name
);
327 show_target_wide_charset_name (file
, from_tty
, c
, target_wide_charset_name
);
331 /* Accessor functions. */
336 if (!strcmp (host_charset_name
, "auto"))
337 return auto_host_charset_name
;
338 return host_charset_name
;
342 target_charset (void)
344 return target_charset_name
;
348 target_wide_charset (void)
350 if (gdbarch_byte_order (current_gdbarch
) == BFD_ENDIAN_BIG
)
352 if (target_wide_charset_be_name
)
353 return target_wide_charset_be_name
;
357 if (target_wide_charset_le_name
)
358 return target_wide_charset_le_name
;
361 return target_wide_charset_name
;
365 /* Host character set management. For the time being, we assume that
366 the host character set is some superset of ASCII. */
369 host_letter_to_control_character (char c
)
376 /* Convert a host character, C, to its hex value. C must already have
377 been validated using isxdigit. */
380 host_hex_value (char c
)
384 if (c
>= 'a' && c
<= 'f')
386 gdb_assert (c
>= 'A' && c
<= 'F');
391 /* Public character management functions. */
393 /* A cleanup function which is run to close an iconv descriptor. */
396 cleanup_iconv (void *p
)
399 iconv_close (*descp
);
403 convert_between_encodings (const char *from
, const char *to
,
404 const gdb_byte
*bytes
, unsigned int num_bytes
,
405 int width
, struct obstack
*output
,
406 enum transliterations translit
)
409 struct cleanup
*cleanups
;
412 unsigned int space_request
;
414 /* Often, the host and target charsets will be the same. */
415 if (!strcmp (from
, to
))
417 obstack_grow (output
, bytes
, num_bytes
);
421 desc
= iconv_open (to
, from
);
422 if (desc
== (iconv_t
) -1)
423 perror_with_name ("Converting character sets");
424 cleanups
= make_cleanup (cleanup_iconv
, &desc
);
427 inp
= (char *) bytes
;
429 space_request
= num_bytes
;
437 old_size
= obstack_object_size (output
);
438 obstack_blank (output
, space_request
);
440 outp
= obstack_base (output
) + old_size
;
441 outleft
= space_request
;
443 r
= iconv (desc
, &inp
, &inleft
, &outp
, &outleft
);
445 /* Now make sure that the object on the obstack only includes
446 bytes we have converted. */
447 obstack_blank (output
, - (int) outleft
);
449 if (r
== (size_t) -1)
457 /* Invalid input sequence. */
458 if (translit
== translit_none
)
459 error (_("Could not convert character to `%s' character set"),
462 /* We emit an escape sequence for the bytes, skip them,
464 for (i
= 0; i
< width
; ++i
)
468 sprintf (octal
, "\\%.3o", *inp
& 0xff);
469 obstack_grow_str (output
, octal
);
478 /* We ran out of space in the output buffer. Make it
479 bigger next time around. */
484 /* Incomplete input sequence. FIXME: ought to report this
485 to the caller somehow. */
490 perror_with_name ("Internal error while converting character sets");
495 do_cleanups (cleanups
);
500 /* An iterator that returns host wchar_t's from a target string. */
501 struct wchar_iterator
503 /* The underlying iconv descriptor. */
506 /* The input string. This is updated as we convert characters. */
508 /* The number of bytes remaining in the input. */
511 /* The width of an input character. */
514 /* The output buffer and its size. */
519 /* Create a new iterator. */
520 struct wchar_iterator
*
521 make_wchar_iterator (const gdb_byte
*input
, size_t bytes
, const char *charset
,
524 struct wchar_iterator
*result
;
527 desc
= iconv_open ("wchar_t", charset
);
528 if (desc
== (iconv_t
) -1)
529 perror_with_name ("Converting character sets");
531 result
= XNEW (struct wchar_iterator
);
533 result
->input
= (char *) input
;
534 result
->bytes
= bytes
;
535 result
->width
= width
;
537 result
->out
= XNEW (gdb_wchar_t
);
538 result
->out_size
= 1;
544 do_cleanup_iterator (void *p
)
546 struct wchar_iterator
*iter
= p
;
548 iconv_close (iter
->desc
);
554 make_cleanup_wchar_iterator (struct wchar_iterator
*iter
)
556 return make_cleanup (do_cleanup_iterator
, iter
);
560 wchar_iterate (struct wchar_iterator
*iter
,
561 enum wchar_iterate_result
*out_result
,
562 gdb_wchar_t
**out_chars
,
563 const gdb_byte
**ptr
,
568 /* Try to convert some characters. At first we try to convert just
569 a single character. The reason for this is that iconv does not
570 necessarily update its outgoing arguments when it encounters an
571 invalid input sequence -- but we want to reliably report this to
572 our caller so it can emit an escape sequence. */
574 while (iter
->bytes
> 0)
576 char *outptr
= (char *) &iter
->out
[0];
577 char *orig_inptr
= iter
->input
;
578 size_t orig_in
= iter
->bytes
;
579 size_t out_avail
= out_request
* sizeof (gdb_wchar_t
);
583 size_t r
= iconv (iter
->desc
, (char **) &iter
->input
, &iter
->bytes
,
584 &outptr
, &out_avail
);
585 if (r
== (size_t) -1)
590 /* Invalid input sequence. Skip it, and let the caller
592 *out_result
= wchar_iterate_invalid
;
595 iter
->input
+= iter
->width
;
596 iter
->bytes
-= iter
->width
;
600 /* We ran out of space. We still might have converted a
601 character; if so, return it. Otherwise, grow the
602 buffer and try again. */
603 if (out_avail
< out_request
* sizeof (gdb_wchar_t
))
607 if (out_request
> iter
->out_size
)
609 iter
->out_size
= out_request
;
610 iter
->out
= xrealloc (iter
->out
,
611 out_request
* sizeof (gdb_wchar_t
));
616 /* Incomplete input sequence. Let the caller know, and
617 arrange for future calls to see EOF. */
618 *out_result
= wchar_iterate_incomplete
;
625 perror_with_name ("Internal error while converting character sets");
629 /* We converted something. */
630 num
= out_request
- out_avail
/ sizeof (gdb_wchar_t
);
631 *out_result
= wchar_iterate_ok
;
632 *out_chars
= iter
->out
;
634 *len
= orig_in
- iter
->bytes
;
639 *out_result
= wchar_iterate_eof
;
644 /* The charset.c module initialization function. */
646 extern initialize_file_ftype _initialize_charset
; /* -Wmissing-prototypes */
648 typedef char *char_ptr
;
649 DEF_VEC_P (char_ptr
);
651 static VEC (char_ptr
) *charsets
;
656 find_charset_names (void)
658 VEC_safe_push (char_ptr
, charsets
, GDB_DEFAULT_HOST_CHARSET
);
659 VEC_safe_push (char_ptr
, charsets
, NULL
);
662 #else /* PHONY_ICONV */
663 #ifdef HAVE_ICONVLIST
665 /* A helper function that adds some character sets to the vector of
666 all character sets. This is a callback function for iconvlist. */
669 add_one (unsigned int count
, const char *const *names
, void *data
)
673 for (i
= 0; i
< count
; ++i
)
674 VEC_safe_push (char_ptr
, charsets
, xstrdup (names
[i
]));
680 find_charset_names (void)
682 iconvlist (add_one
, NULL
);
683 VEC_safe_push (char_ptr
, charsets
, NULL
);
689 find_charset_names (void)
693 in
= popen ("iconv -l", "r");
694 /* It is ok to ignore errors; we'll fall back on a default. */
698 /* POSIX says that iconv -l uses an unspecified format. We parse
699 the glibc format; feel free to add others as needed. */
702 /* The size of buf is chosen arbitrarily. A character set name
703 longer than this would not be very nice. */
706 char *r
= fgets (buf
, sizeof (buf
), in
);
712 if (buf
[len
- 2] == '/' && buf
[len
- 3] == '/')
714 VEC_safe_push (char_ptr
, charsets
, xstrdup (buf
));
719 VEC_safe_push (char_ptr
, charsets
, NULL
);
722 #endif /* HAVE_ICONVLIST */
723 #endif /* PHONY_ICONV */
726 _initialize_charset (void)
728 struct cmd_list_element
*new_cmd
;
730 /* The first element is always "auto"; then we skip it for the
731 commands where it is not allowed. */
732 VEC_safe_push (char_ptr
, charsets
, "auto");
733 find_charset_names ();
735 if (VEC_length (char_ptr
, charsets
) > 1)
736 charset_enum
= (const char **) VEC_address (char_ptr
, charsets
);
738 charset_enum
= default_charset_names
;
741 #ifdef HAVE_LANGINFO_CODESET
742 auto_host_charset_name
= nl_langinfo (CODESET
);
743 target_charset_name
= auto_host_charset_name
;
749 add_setshow_enum_cmd ("charset", class_support
,
750 &charset_enum
[1], &host_charset_name
, _("\
751 Set the host and target character sets."), _("\
752 Show the host and target character sets."), _("\
753 The `host character set' is the one used by the system GDB is running on.\n\
754 The `target character set' is the one used by the program being debugged.\n\
755 You may only use supersets of ASCII for your host character set; GDB does\n\
756 not support any others.\n\
757 To see a list of the character sets GDB supports, type `set charset <TAB>'."),
758 /* Note that the sfunc below needs to set
759 target_charset_name, because the 'set
760 charset' command sets two variables. */
763 &setlist
, &showlist
);
765 add_setshow_enum_cmd ("host-charset", class_support
,
766 charset_enum
, &host_charset_name
, _("\
767 Set the host character set."), _("\
768 Show the host character set."), _("\
769 The `host character set' is the one used by the system GDB is running on.\n\
770 You may only use supersets of ASCII for your host character set; GDB does\n\
771 not support any others.\n\
772 To see a list of the character sets GDB supports, type `set host-charset <TAB>'."),
773 set_host_charset_sfunc
,
774 show_host_charset_name
,
775 &setlist
, &showlist
);
777 add_setshow_enum_cmd ("target-charset", class_support
,
778 &charset_enum
[1], &target_charset_name
, _("\
779 Set the target character set."), _("\
780 Show the target character set."), _("\
781 The `target character set' is the one used by the program being debugged.\n\
782 GDB translates characters and strings between the host and target\n\
783 character sets as needed.\n\
784 To see a list of the character sets GDB supports, type `set target-charset'<TAB>"),
785 set_target_charset_sfunc
,
786 show_target_charset_name
,
787 &setlist
, &showlist
);
789 add_setshow_enum_cmd ("target-wide-charset", class_support
,
790 &charset_enum
[1], &target_wide_charset_name
,
792 Set the target wide character set."), _("\
793 Show the target wide character set."), _("\
794 The `target wide character set' is the one used by the program being debugged.\n\
795 In particular it is the encoding used by `wchar_t'.\n\
796 GDB translates characters and strings between the host and target\n\
797 character sets as needed.\n\
798 To see a list of the character sets GDB supports, type\n\
799 `set target-wide-charset'<TAB>"),
800 set_target_wide_charset_sfunc
,
801 show_target_wide_charset_name
,
802 &setlist
, &showlist
);