479fcb9d85a36b2b0b1e0c567287b122d85fc6ab
1 /* Determine a canonical name for the current locale's character encoding.
3 Copyright (C) 2000-2006, 2008-2019 Free Software Foundation, Inc.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License along
16 with this program; if not, see <https://www.gnu.org/licenses/>. */
18 /* Written by Bruno Haible <bruno@clisp.org>. */
23 #include "localcharset.h"
30 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
31 # define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */
34 #if defined _WIN32 && !defined __CYGWIN__
35 # define WINDOWS_NATIVE
40 /* Assume EMX program runs on OS/2, even if compiled under DOS. */
46 #if !defined WINDOWS_NATIVE
47 # if HAVE_LANGINFO_CODESET
48 # include <langinfo.h>
50 # if 0 /* see comment regarding use of setlocale(), below */
55 # define WIN32_LEAN_AND_MEAN
58 #elif defined WINDOWS_NATIVE
59 # define WIN32_LEAN_AND_MEAN
67 /* For MB_CUR_MAX_L */
73 #if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2
75 /* On these platforms, we use a mapping from non-canonical encoding name
76 to GNU canonical encoding name. */
78 /* With glibc-2.1 or newer, we don't need any canonicalization,
79 because glibc has iconv and both glibc and libiconv support all
80 GNU canonical names directly. */
81 # if !((defined __GNU_LIBRARY__ && __GLIBC__ >= 2) || defined __UCLIBC__)
85 const char alias
[11+1];
86 const char canonical
[11+1];
89 /* Table of platform-dependent mappings, sorted in ascending order. */
90 static const struct table_entry alias_table
[] =
92 # if defined __FreeBSD__ /* FreeBSD */
93 /*{ "ARMSCII-8", "ARMSCII-8" },*/
96 /*{ "CP1131", "CP1131" },*/
97 /*{ "CP1251", "CP1251" },*/
98 /*{ "CP866", "CP866" },*/
99 /*{ "GB18030", "GB18030" },*/
100 /*{ "GB2312", "GB2312" },*/
101 /*{ "GBK", "GBK" },*/
102 /*{ "ISCII-DEV", "?" },*/
103 { "ISO8859-1", "ISO-8859-1" },
104 { "ISO8859-13", "ISO-8859-13" },
105 { "ISO8859-15", "ISO-8859-15" },
106 { "ISO8859-2", "ISO-8859-2" },
107 { "ISO8859-5", "ISO-8859-5" },
108 { "ISO8859-7", "ISO-8859-7" },
109 { "ISO8859-9", "ISO-8859-9" },
110 /*{ "KOI8-R", "KOI8-R" },*/
111 /*{ "KOI8-U", "KOI8-U" },*/
112 { "SJIS", "SHIFT_JIS" },
113 { "US-ASCII", "ASCII" },
114 { "eucCN", "GB2312" },
115 { "eucJP", "EUC-JP" },
116 { "eucKR", "EUC-KR" }
117 # define alias_table_defined
119 # if defined __NetBSD__ /* NetBSD */
121 /*{ "ARMSCII-8", "ARMSCII-8" },*/
122 /*{ "BIG5", "BIG5" },*/
123 { "Big5-HKSCS", "BIG5-HKSCS" },
124 /*{ "CP1251", "CP1251" },*/
125 /*{ "CP866", "CP866" },*/
126 /*{ "GB18030", "GB18030" },*/
127 /*{ "GB2312", "GB2312" },*/
128 { "ISO8859-1", "ISO-8859-1" },
129 { "ISO8859-13", "ISO-8859-13" },
130 { "ISO8859-15", "ISO-8859-15" },
131 { "ISO8859-2", "ISO-8859-2" },
132 { "ISO8859-4", "ISO-8859-4" },
133 { "ISO8859-5", "ISO-8859-5" },
134 { "ISO8859-7", "ISO-8859-7" },
135 /*{ "KOI8-R", "KOI8-R" },*/
136 /*{ "KOI8-U", "KOI8-U" },*/
137 /*{ "PT154", "PT154" },*/
138 { "SJIS", "SHIFT_JIS" },
139 { "eucCN", "GB2312" },
140 { "eucJP", "EUC-JP" },
141 { "eucKR", "EUC-KR" },
142 { "eucTW", "EUC-TW" }
143 # define alias_table_defined
145 # if defined __OpenBSD__ /* OpenBSD */
147 { "ISO8859-1", "ISO-8859-1" },
148 { "ISO8859-13", "ISO-8859-13" },
149 { "ISO8859-15", "ISO-8859-15" },
150 { "ISO8859-2", "ISO-8859-2" },
151 { "ISO8859-4", "ISO-8859-4" },
152 { "ISO8859-5", "ISO-8859-5" },
153 { "ISO8859-7", "ISO-8859-7" }
154 # define alias_table_defined
156 # if defined __APPLE__ && defined __MACH__ /* Mac OS X */
157 /* Darwin 7.5 has nl_langinfo(CODESET), but sometimes its value is
158 useless:
159 - It returns the empty string when LANG is set to a locale of the
160 form ll_CC, although ll_CC/LC_CTYPE is a symlink to an UTF-8
161 LC_CTYPE file.
162 - The environment variables LANG, LC_CTYPE, LC_ALL are not set by
163 the system; nl_langinfo(CODESET) returns "US-ASCII" in this case.
164 - The documentation says:
165 "... all code that calls BSD system routines should ensure
166 that the const *char parameters of these routines are in UTF-8
167 encoding. All BSD system functions expect their string
168 parameters to be in UTF-8 encoding and nothing else."
169 It also says
170 "An additional caveat is that string parameters for files,
171 paths, and other file-system entities must be in canonical
172 UTF-8. In a canonical UTF-8 Unicode string, all decomposable
173 characters are decomposed ..."
174 but this is not true: You can pass non-decomposed UTF-8 strings
175 to file system functions, and it is the OS which will convert
176 them to decomposed UTF-8 before accessing the file system.
177 - The Apple Terminal application displays UTF-8 by default.
178 - However, other applications are free to use different encodings:
179 - xterm uses ISO-8859-1 by default.
180 - TextEdit uses MacRoman by default.
181 We prefer UTF-8 over decomposed UTF-8-MAC because one should
182 minimize the use of decomposed Unicode. Unfortunately, through the
183 Darwin file system, decomposed UTF-8 strings are leaked into user
184 space nevertheless.
185 Then there are also the locales with encodings other than US-ASCII
186 and UTF-8. These locales can be occasionally useful to users (e.g.
187 when grepping through ISO-8859-1 encoded text files), when all their
188 file names are in US-ASCII.
189 */
190 { "ARMSCII-8", "ARMSCII-8" },
192 { "Big5HKSCS", "BIG5-HKSCS" },
193 { "CP1131", "CP1131" },
194 { "CP1251", "CP1251" },
195 { "CP866", "CP866" },
196 { "CP949", "CP949" },
197 { "GB18030", "GB18030" },
198 { "GB2312", "GB2312" },
200 /*{ "ISCII-DEV", "?" },*/
201 { "ISO8859-1", "ISO-8859-1" },
202 { "ISO8859-13", "ISO-8859-13" },
203 { "ISO8859-15", "ISO-8859-15" },
204 { "ISO8859-2", "ISO-8859-2" },
205 { "ISO8859-4", "ISO-8859-4" },
206 { "ISO8859-5", "ISO-8859-5" },
207 { "ISO8859-7", "ISO-8859-7" },
208 { "ISO8859-9", "ISO-8859-9" },
209 { "KOI8-R", "KOI8-R" },
210 { "KOI8-U", "KOI8-U" },
211 { "PT154", "PT154" },
212 { "SJIS", "SHIFT_JIS" },
213 { "eucCN", "GB2312" },
214 { "eucJP", "EUC-JP" },
215 { "eucKR", "EUC-KR" }
216 # define alias_table_defined
218 # if defined _AIX /* AIX */
219 /*{ "GBK", "GBK" },*/
220 { "IBM-1046", "CP1046" },
221 { "IBM-1124", "CP1124" },
222 { "IBM-1129", "CP1129" },
223 { "IBM-1252", "CP1252" },
224 { "IBM-850", "CP850" },
225 { "IBM-856", "CP856" },
226 { "IBM-921", "ISO-8859-13" },
227 { "IBM-922", "CP922" },
228 { "IBM-932", "CP932" },
229 { "IBM-943", "CP943" },
230 { "IBM-eucCN", "GB2312" },
231 { "IBM-eucJP", "EUC-JP" },
232 { "IBM-eucKR", "EUC-KR" },
233 { "IBM-eucTW", "EUC-TW" },
234 { "ISO8859-1", "ISO-8859-1" },
235 { "ISO8859-15", "ISO-8859-15" },
236 { "ISO8859-2", "ISO-8859-2" },
237 { "ISO8859-5", "ISO-8859-5" },
238 { "ISO8859-6", "ISO-8859-6" },
239 { "ISO8859-7", "ISO-8859-7" },
240 { "ISO8859-8", "ISO-8859-8" },
241 { "ISO8859-9", "ISO-8859-9" },
242 { "TIS-620", "TIS-620" },
243 /*{ "UTF-8", "UTF-8" },*/
245 # define alias_table_defined
247 # if defined __hpux /* HP-UX */
248 { "SJIS", "SHIFT_JIS" },
249 { "arabic8", "HP-ARABIC8" },
251 { "cp1251", "CP1251" },
252 { "eucJP", "EUC-JP" },
253 { "eucKR", "EUC-KR" },
254 { "eucTW", "EUC-TW" },
255 { "gb18030", "GB18030" },
256 { "greek8", "HP-GREEK8" },
257 { "hebrew8", "HP-HEBREW8" },
258 { "hkbig5", "BIG5-HKSCS" },
259 { "hp15CN", "GB2312" },
260 { "iso88591", "ISO-8859-1" },
261 { "iso885913", "ISO-8859-13" },
262 { "iso885915", "ISO-8859-15" },
263 { "iso88592", "ISO-8859-2" },
264 { "iso88594", "ISO-8859-4" },
265 { "iso88595", "ISO-8859-5" },
266 { "iso88596", "ISO-8859-6" },
267 { "iso88597", "ISO-8859-7" },
268 { "iso88598", "ISO-8859-8" },
269 { "iso88599", "ISO-8859-9" },
270 { "kana8", "HP-KANA8" },
271 { "koi8r", "KOI8-R" },
272 { "roman8", "HP-ROMAN8" },
273 { "tis620", "TIS-620" },
274 { "turkish8", "HP-TURKISH8" },
276 # define alias_table_defined
278 # if defined __sgi /* IRIX */
279 { "ISO8859-1", "ISO-8859-1" },
280 { "ISO8859-15", "ISO-8859-15" },
281 { "ISO8859-2", "ISO-8859-2" },
282 { "ISO8859-5", "ISO-8859-5" },
283 { "ISO8859-7", "ISO-8859-7" },
284 { "ISO8859-9", "ISO-8859-9" },
285 { "eucCN", "GB2312" },
286 { "eucJP", "EUC-JP" },
287 { "eucKR", "EUC-KR" },
288 { "eucTW", "EUC-TW" }
289 # define alias_table_defined
291 # if defined __osf__ /* OSF/1 */
292 /*{ "GBK", "GBK" },*/
293 { "ISO8859-1", "ISO-8859-1" },
294 { "ISO8859-15", "ISO-8859-15" },
295 { "ISO8859-2", "ISO-8859-2" },
296 { "ISO8859-4", "ISO-8859-4" },
297 { "ISO8859-5", "ISO-8859-5" },
298 { "ISO8859-7", "ISO-8859-7" },
299 { "ISO8859-8", "ISO-8859-8" },
300 { "ISO8859-9", "ISO-8859-9" },
301 { "KSC5601", "CP949" },
302 { "SJIS", "SHIFT_JIS" },
303 { "TACTIS", "TIS-620" },
304 /*{ "UTF-8", "UTF-8" },*/
306 { "cp850", "CP850" },
307 { "dechanyu", "DEC-HANYU" },
308 { "dechanzi", "GB2312" },
309 { "deckanji", "DEC-KANJI" },
310 { "deckorean", "EUC-KR" },
311 { "eucJP", "EUC-JP" },
312 { "eucKR", "EUC-KR" },
313 { "eucTW", "EUC-TW" },
314 { "sdeckanji", "EUC-JP" }
315 # define alias_table_defined
317 # if defined __sun /* Solaris */
318 { "5601", "EUC-KR" },
320 /*{ "BIG5", "BIG5" },*/
321 { "Big5-HKSCS", "BIG5-HKSCS" },
322 { "GB18030", "GB18030" },
323 /*{ "GBK", "GBK" },*/
324 { "ISO8859-1", "ISO-8859-1" },
325 { "ISO8859-11", "TIS-620" },
326 { "ISO8859-13", "ISO-8859-13" },
327 { "ISO8859-15", "ISO-8859-15" },
328 { "ISO8859-2", "ISO-8859-2" },
329 { "ISO8859-3", "ISO-8859-3" },
330 { "ISO8859-4", "ISO-8859-4" },
331 { "ISO8859-5", "ISO-8859-5" },
332 { "ISO8859-6", "ISO-8859-6" },
333 { "ISO8859-7", "ISO-8859-7" },
334 { "ISO8859-8", "ISO-8859-8" },
335 { "ISO8859-9", "ISO-8859-9" },
336 { "PCK", "SHIFT_JIS" },
337 { "TIS620.2533", "TIS-620" },
338 /*{ "UTF-8", "UTF-8" },*/
339 { "ansi-1251", "CP1251" },
340 { "cns11643", "EUC-TW" },
341 { "eucJP", "EUC-JP" },
342 { "gb2312", "GB2312" },
343 { "koi8-r", "KOI8-R" }
344 # define alias_table_defined
346 # if defined __minix /* Minix */
348 # define alias_table_defined
350 # if defined WINDOWS_NATIVE || defined __CYGWIN__ /* Windows */
351 { "CP1361", "JOHAB" },
352 { "CP20127", "ASCII" },
353 { "CP20866", "KOI8-R" },
354 { "CP20936", "GB2312" },
355 { "CP21866", "KOI8-RU" },
356 { "CP28591", "ISO-8859-1" },
357 { "CP28592", "ISO-8859-2" },
358 { "CP28593", "ISO-8859-3" },
359 { "CP28594", "ISO-8859-4" },
360 { "CP28595", "ISO-8859-5" },
361 { "CP28596", "ISO-8859-6" },
362 { "CP28597", "ISO-8859-7" },
363 { "CP28598", "ISO-8859-8" },
364 { "CP28599", "ISO-8859-9" },
365 { "CP28605", "ISO-8859-15" },
366 { "CP38598", "ISO-8859-8" },
367 { "CP51932", "EUC-JP" },
368 { "CP51936", "GB2312" },
369 { "CP51949", "EUC-KR" },
370 { "CP51950", "EUC-TW" },
371 { "CP54936", "GB18030" },
372 { "CP65001", "UTF-8" },
374 # define alias_table_defined
376 # if defined OS2 /* OS/2 */
377 /* The list of encodings is taken from "List of OS/2 Codepages"
378 by Alex Taylor:
379 <http://altsan.org/os2/toolkits/uls/index.html#codepages>.
380 See also "__convcp() of kLIBC":
381 <https://github.com/bitwiseworks/libc/blob/master/src/emx/src/lib/locale/__convcp.c>. */
382 { "CP1004", "CP1252" },
383 /*{ "CP1041", "CP943" },*/
384 /*{ "CP1088", "CP949" },*/
385 { "CP1089", "ISO-8859-6" },
386 /*{ "CP1114", "CP950" },*/
387 /*{ "CP1115", "GB2312" },*/
388 { "CP1208", "UTF-8" },
389 /*{ "CP1380", "GB2312" },*/
390 { "CP1381", "GB2312" },
391 { "CP1383", "GB2312" },
393 /*{ "CP301", "CP943" },*/
394 { "CP3372", "EUC-JP" },
395 { "CP4946", "CP850" },
396 /*{ "CP5048", "JIS_X0208-1990" },*/
397 /*{ "CP5049", "JIS_X0212-1990" },*/
398 /*{ "CP5067", "KS_C_5601-1987" },*/
399 { "CP813", "ISO-8859-7" },
400 { "CP819", "ISO-8859-1" },
401 { "CP878", "KOI8-R" },
402 /*{ "CP897", "CP943" },*/
403 { "CP912", "ISO-8859-2" },
404 { "CP913", "ISO-8859-3" },
405 { "CP914", "ISO-8859-4" },
406 { "CP915", "ISO-8859-5" },
407 { "CP916", "ISO-8859-8" },
408 { "CP920", "ISO-8859-9" },
409 { "CP921", "ISO-8859-13" },
410 { "CP923", "ISO-8859-15" },
411 /*{ "CP941", "CP943" },*/
412 /*{ "CP947", "CP950" },*/
413 /*{ "CP951", "CP949" },*/
414 /*{ "CP952", "JIS_X0208-1990" },*/
415 /*{ "CP953", "JIS_X0212-1990" },*/
416 { "CP954", "EUC-JP" },
417 { "CP964", "EUC-TW" },
418 { "CP970", "EUC-KR" },
419 /*{ "CP971", "KS_C_5601-1987" },*/
420 { "IBM-1004", "CP1252" },
421 /*{ "IBM-1006", "?" },*/
422 /*{ "IBM-1008", "?" },*/
423 /*{ "IBM-1041", "CP943" },*/
424 /*{ "IBM-1051", "?" },*/
425 /*{ "IBM-1088", "CP949" },*/
426 { "IBM-1089", "ISO-8859-6" },
427 /*{ "IBM-1098", "?" },*/
428 /*{ "IBM-1114", "CP950" },*/
429 /*{ "IBM-1115", "GB2312" },*/
430 /*{ "IBM-1116", "?" },*/
431 /*{ "IBM-1117", "?" },*/
432 /*{ "IBM-1118", "?" },*/
433 /*{ "IBM-1119", "?" },*/
434 { "IBM-1124", "CP1124" },
435 { "IBM-1125", "CP1125" },
436 { "IBM-1131", "CP1131" },
437 { "IBM-1208", "UTF-8" },
438 { "IBM-1250", "CP1250" },
439 { "IBM-1251", "CP1251" },
440 { "IBM-1252", "CP1252" },
441 { "IBM-1253", "CP1253" },
442 { "IBM-1254", "CP1254" },
443 { "IBM-1255", "CP1255" },
444 { "IBM-1256", "CP1256" },
445 { "IBM-1257", "CP1257" },
446 /*{ "IBM-1275", "?" },*/
447 /*{ "IBM-1276", "?" },*/
448 /*{ "IBM-1277", "?" },*/
449 /*{ "IBM-1280", "?" },*/
450 /*{ "IBM-1281", "?" },*/
451 /*{ "IBM-1282", "?" },*/
452 /*{ "IBM-1283", "?" },*/
453 /*{ "IBM-1380", "GB2312" },*/
454 { "IBM-1381", "GB2312" },
455 { "IBM-1383", "GB2312" },
456 { "IBM-1386", "GBK" },
457 /*{ "IBM-301", "CP943" },*/
458 { "IBM-3372", "EUC-JP" },
459 { "IBM-367", "ASCII" },
460 { "IBM-437", "CP437" },
461 { "IBM-4946", "CP850" },
462 /*{ "IBM-5048", "JIS_X0208-1990" },*/
463 /*{ "IBM-5049", "JIS_X0212-1990" },*/
464 /*{ "IBM-5067", "KS_C_5601-1987" },*/
465 { "IBM-813", "ISO-8859-7" },
466 { "IBM-819", "ISO-8859-1" },
467 { "IBM-850", "CP850" },
468 /*{ "IBM-851", "?" },*/
469 { "IBM-852", "CP852" },
470 { "IBM-855", "CP855" },
471 { "IBM-856", "CP856" },
472 { "IBM-857", "CP857" },
473 /*{ "IBM-859", "?" },*/
474 { "IBM-860", "CP860" },
475 { "IBM-861", "CP861" },
476 { "IBM-862", "CP862" },
477 { "IBM-863", "CP863" },
478 { "IBM-864", "CP864" },
479 { "IBM-865", "CP865" },
480 { "IBM-866", "CP866" },
481 /*{ "IBM-868", "?" },*/
482 { "IBM-869", "CP869" },
483 { "IBM-874", "CP874" },
484 { "IBM-878", "KOI8-R" },
485 /*{ "IBM-895", "?" },*/
486 /*{ "IBM-897", "CP943" },*/
487 /*{ "IBM-907", "?" },*/
488 /*{ "IBM-909", "?" },*/
489 { "IBM-912", "ISO-8859-2" },
490 { "IBM-913", "ISO-8859-3" },
491 { "IBM-914", "ISO-8859-4" },
492 { "IBM-915", "ISO-8859-5" },
493 { "IBM-916", "ISO-8859-8" },
494 { "IBM-920", "ISO-8859-9" },
495 { "IBM-921", "ISO-8859-13" },
496 { "IBM-922", "CP922" },
497 { "IBM-923", "ISO-8859-15" },
498 { "IBM-932", "CP932" },
499 /*{ "IBM-941", "CP943" },*/
500 /*{ "IBM-942", "?" },*/
501 { "IBM-943", "CP943" },
502 /*{ "IBM-947", "CP950" },*/
503 { "IBM-949", "CP949" },
504 { "IBM-950", "CP950" },
505 /*{ "IBM-951", "CP949" },*/
506 /*{ "IBM-952", "JIS_X0208-1990" },*/
507 /*{ "IBM-953", "JIS_X0212-1990" },*/
508 { "IBM-954", "EUC-JP" },
509 /*{ "IBM-955", "?" },*/
510 { "IBM-964", "EUC-TW" },
511 { "IBM-970", "EUC-KR" },
512 /*{ "IBM-971", "KS_C_5601-1987" },*/
513 { "IBM-eucCN", "GB2312" },
514 { "IBM-eucJP", "EUC-JP" },
515 { "IBM-eucKR", "EUC-KR" },
516 { "IBM-eucTW", "EUC-TW" },
517 { "IBM33722", "EUC-JP" },
518 { "ISO8859-1", "ISO-8859-1" },
519 { "ISO8859-2", "ISO-8859-2" },
520 { "ISO8859-3", "ISO-8859-3" },
521 { "ISO8859-4", "ISO-8859-4" },
522 { "ISO8859-5", "ISO-8859-5" },
523 { "ISO8859-6", "ISO-8859-6" },
524 { "ISO8859-7", "ISO-8859-7" },
525 { "ISO8859-8", "ISO-8859-8" },
526 { "ISO8859-9", "ISO-8859-9" },
527 /*{ "JISX0201-1976", "JISX0201-1976" },*/
528 /*{ "JISX0208-1978", "?" },*/
529 /*{ "JISX0208-1983", "JIS_X0208-1983" },*/
530 /*{ "JISX0208-1990", "JIS_X0208-1990" },*/
531 /*{ "JISX0212-1990", "JIS_X0212-1990" },*/
532 /*{ "KSC5601-1987", "KS_C_5601-1987" },*/
533 { "SJIS-1", "CP943" },
534 { "SJIS-2", "CP943" },
535 { "eucJP", "EUC-JP" },
536 { "eucKR", "EUC-KR" },
537 { "eucTW-1993", "EUC-TW" }
538 # define alias_table_defined
540 # if defined VMS /* OpenVMS */
541 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
542 "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
543 section 10.7 "Handling Different Character Sets". */
544 { "DECHANYU", "DEC-HANYU" },
545 { "DECHANZI", "GB2312" },
546 { "DECKANJI", "DEC-KANJI" },
547 { "DECKOREAN", "EUC-KR" },
548 { "ISO8859-1", "ISO-8859-1" },
549 { "ISO8859-2", "ISO-8859-2" },
550 { "ISO8859-5", "ISO-8859-5" },
551 { "ISO8859-7", "ISO-8859-7" },
552 { "ISO8859-8", "ISO-8859-8" },
553 { "ISO8859-9", "ISO-8859-9" },
554 { "SDECKANJI", "EUC-JP" },
555 { "SJIS", "SHIFT_JIS" },
556 { "eucJP", "EUC-JP" },
557 { "eucTW", "EUC-TW" }
558 # define alias_table_defined
560 # ifndef alias_table_defined
561 /* Just a dummy entry, to avoid a C syntax error. */
570 /* On these platforms, we use a mapping from locale name to GNU canonical
571 encoding name. */
575 const char locale
[17+1];
576 const char canonical
[11+1];
579 /* Table of platform-dependent mappings, sorted in ascending order. */
580 static const struct table_entry locale_table
[] =
582 # if defined __FreeBSD__ /* FreeBSD 4.2 */
583 { "cs_CZ.ISO_8859-2", "ISO-8859-2" },
584 { "da_DK.DIS_8859-15", "ISO-8859-15" },
585 { "da_DK.ISO_8859-1", "ISO-8859-1" },
586 { "de_AT.DIS_8859-15", "ISO-8859-15" },
587 { "de_AT.ISO_8859-1", "ISO-8859-1" },
588 { "de_CH.DIS_8859-15", "ISO-8859-15" },
589 { "de_CH.ISO_8859-1", "ISO-8859-1" },
590 { "de_DE.DIS_8859-15", "ISO-8859-15" },
591 { "de_DE.ISO_8859-1", "ISO-8859-1" },
592 { "en_AU.DIS_8859-15", "ISO-8859-15" },
593 { "en_AU.ISO_8859-1", "ISO-8859-1" },
594 { "en_CA.DIS_8859-15", "ISO-8859-15" },
595 { "en_CA.ISO_8859-1", "ISO-8859-1" },
596 { "en_GB.DIS_8859-15", "ISO-8859-15" },
597 { "en_GB.ISO_8859-1", "ISO-8859-1" },
598 { "en_US.DIS_8859-15", "ISO-8859-15" },
599 { "en_US.ISO_8859-1", "ISO-8859-1" },
600 { "es_ES.DIS_8859-15", "ISO-8859-15" },
601 { "es_ES.ISO_8859-1", "ISO-8859-1" },
602 { "fi_FI.DIS_8859-15", "ISO-8859-15" },
603 { "fi_FI.ISO_8859-1", "ISO-8859-1" },
604 { "fr_BE.DIS_8859-15", "ISO-8859-15" },
605 { "fr_BE.ISO_8859-1", "ISO-8859-1" },
606 { "fr_CA.DIS_8859-15", "ISO-8859-15" },
607 { "fr_CA.ISO_8859-1", "ISO-8859-1" },
608 { "fr_CH.DIS_8859-15", "ISO-8859-15" },
609 { "fr_CH.ISO_8859-1", "ISO-8859-1" },
610 { "fr_FR.DIS_8859-15", "ISO-8859-15" },
611 { "fr_FR.ISO_8859-1", "ISO-8859-1" },
612 { "hr_HR.ISO_8859-2", "ISO-8859-2" },
613 { "hu_HU.ISO_8859-2", "ISO-8859-2" },
614 { "is_IS.DIS_8859-15", "ISO-8859-15" },
615 { "is_IS.ISO_8859-1", "ISO-8859-1" },
616 { "it_CH.DIS_8859-15", "ISO-8859-15" },
617 { "it_CH.ISO_8859-1", "ISO-8859-1" },
618 { "it_IT.DIS_8859-15", "ISO-8859-15" },
619 { "it_IT.ISO_8859-1", "ISO-8859-1" },
620 { "ja_JP.EUC", "EUC-JP" },
621 { "ja_JP.SJIS", "SHIFT_JIS" },
622 { "ja_JP.Shift_JIS", "SHIFT_JIS" },
623 { "ko_KR.EUC", "EUC-KR" },
624 { "la_LN.ASCII", "ASCII" },
625 { "la_LN.DIS_8859-15", "ISO-8859-15" },
626 { "la_LN.ISO_8859-1", "ISO-8859-1" },
627 { "la_LN.ISO_8859-2", "ISO-8859-2" },
628 { "la_LN.ISO_8859-4", "ISO-8859-4" },
629 { "lt_LN.ASCII", "ASCII" },
630 { "lt_LN.DIS_8859-15", "ISO-8859-15" },
631 { "lt_LN.ISO_8859-1", "ISO-8859-1" },
632 { "lt_LN.ISO_8859-2", "ISO-8859-2" },
633 { "lt_LT.ISO_8859-4", "ISO-8859-4" },
634 { "nl_BE.DIS_8859-15", "ISO-8859-15" },
635 { "nl_BE.ISO_8859-1", "ISO-8859-1" },
636 { "nl_NL.DIS_8859-15", "ISO-8859-15" },
637 { "nl_NL.ISO_8859-1", "ISO-8859-1" },
638 { "no_NO.DIS_8859-15", "ISO-8859-15" },
639 { "no_NO.ISO_8859-1", "ISO-8859-1" },
640 { "pl_PL.ISO_8859-2", "ISO-8859-2" },
641 { "pt_PT.DIS_8859-15", "ISO-8859-15" },
642 { "pt_PT.ISO_8859-1", "ISO-8859-1" },
643 { "ru_RU.CP866", "CP866" },
644 { "ru_RU.ISO_8859-5", "ISO-8859-5" },
645 { "ru_RU.KOI8-R", "KOI8-R" },
646 { "ru_SU.CP866", "CP866" },
647 { "ru_SU.ISO_8859-5", "ISO-8859-5" },
648 { "ru_SU.KOI8-R", "KOI8-R" },
649 { "sl_SI.ISO_8859-2", "ISO-8859-2" },
650 { "sv_SE.DIS_8859-15", "ISO-8859-15" },
651 { "sv_SE.ISO_8859-1", "ISO-8859-1" },
652 { "uk_UA.KOI8-U", "KOI8-U" },
653 { "zh_CN.EUC", "GB2312" },
654 { "zh_TW.BIG5", "BIG5" },
655 { "zh_TW.Big5", "BIG5" }
656 # define locale_table_defined
658 # if defined __DJGPP__ /* DOS / DJGPP 2.03 */
659 /* The encodings given here may not all be correct.
660 If you find that the encoding given for your language and
661 country is not the one your DOS machine actually uses, just
662 correct it in this file, and send a mail to
663 Juan Manuel Guerrero <juan.guerrero@gmx.de>
664 and <bug-gnulib@gnu.org>. */
667 { "ar_AE", "CP864" },
668 { "ar_DZ", "CP864" },
669 { "ar_EG", "CP864" },
670 { "ar_IQ", "CP864" },
671 { "ar_IR", "CP864" },
672 { "ar_JO", "CP864" },
673 { "ar_KW", "CP864" },
674 { "ar_MA", "CP864" },
675 { "ar_OM", "CP864" },
676 { "ar_QA", "CP864" },
677 { "ar_SA", "CP864" },
678 { "ar_SY", "CP864" },
680 { "be_BE", "CP866" },
681 { "bg", "CP866" }, /* not CP855 ?? */
682 { "bg_BG", "CP866" }, /* not CP855 ?? */
684 { "ca_ES", "CP850" },
686 { "cs_CZ", "CP852" },
687 { "da", "CP865" }, /* not CP850 ?? */
688 { "da_DK", "CP865" }, /* not CP850 ?? */
690 { "de_AT", "CP850" },
691 { "de_CH", "CP850" },
692 { "de_DE", "CP850" },
694 { "el_GR", "CP869" },
696 { "en_AU", "CP850" }, /* not CP437 ?? */
697 { "en_CA", "CP850" },
698 { "en_GB", "CP850" },
699 { "en_NZ", "CP437" },
700 { "en_US", "CP437" },
701 { "en_ZA", "CP850" }, /* not CP437 ?? */
703 { "eo_EO", "CP850" },
705 { "es_AR", "CP850" },
706 { "es_BO", "CP850" },
707 { "es_CL", "CP850" },
708 { "es_CO", "CP850" },
709 { "es_CR", "CP850" },
710 { "es_CU", "CP850" },
711 { "es_DO", "CP850" },
712 { "es_EC", "CP850" },
713 { "es_ES", "CP850" },
714 { "es_GT", "CP850" },
715 { "es_HN", "CP850" },
716 { "es_MX", "CP850" },
717 { "es_NI", "CP850" },
718 { "es_PA", "CP850" },
719 { "es_PE", "CP850" },
720 { "es_PY", "CP850" },
721 { "es_SV", "CP850" },
722 { "es_UY", "CP850" },
723 { "es_VE", "CP850" },
725 { "et_EE", "CP850" },
727 { "eu_ES", "CP850" },
729 { "fi_FI", "CP850" },
731 { "fr_BE", "CP850" },
732 { "fr_CA", "CP850" },
733 { "fr_CH", "CP850" },
734 { "fr_FR", "CP850" },
736 { "ga_IE", "CP850" },
738 { "gd_GB", "CP850" },
740 { "gl_ES", "CP850" },
742 { "he_IL", "CP862" },
744 { "hr_HR", "CP852" },
746 { "hu_HU", "CP852" },
747 { "id", "CP850" }, /* not CP437 ?? */
748 { "id_ID", "CP850" }, /* not CP437 ?? */
749 { "is", "CP861" }, /* not CP850 ?? */
750 { "is_IS", "CP861" }, /* not CP850 ?? */
752 { "it_CH", "CP850" },
753 { "it_IT", "CP850" },
755 { "ja_JP", "CP932" },
756 { "kr", "CP949" }, /* not CP934 ?? */
757 { "kr_KR", "CP949" }, /* not CP934 ?? */
759 { "lt_LT", "CP775" },
761 { "lv_LV", "CP775" },
762 { "mk", "CP866" }, /* not CP855 ?? */
763 { "mk_MK", "CP866" }, /* not CP855 ?? */
765 { "mt_MT", "CP850" },
766 { "nb", "CP865" }, /* not CP850 ?? */
767 { "nb_NO", "CP865" }, /* not CP850 ?? */
769 { "nl_BE", "CP850" },
770 { "nl_NL", "CP850" },
771 { "nn", "CP865" }, /* not CP850 ?? */
772 { "nn_NO", "CP865" }, /* not CP850 ?? */
773 { "no", "CP865" }, /* not CP850 ?? */
774 { "no_NO", "CP865" }, /* not CP850 ?? */
776 { "pl_PL", "CP852" },
778 { "pt_BR", "CP850" },
779 { "pt_PT", "CP850" },
781 { "ro_RO", "CP852" },
783 { "ru_RU", "CP866" },
785 { "sk_SK", "CP852" },
787 { "sl_SI", "CP852" },
789 { "sq_AL", "CP852" },
790 { "sr", "CP852" }, /* CP852 or CP866 or CP855 ?? */
791 { "sr_CS", "CP852" }, /* CP852 or CP866 or CP855 ?? */
792 { "sr_YU", "CP852" }, /* CP852 or CP866 or CP855 ?? */
794 { "sv_SE", "CP850" },
796 { "th_TH", "CP874" },
798 { "tr_TR", "CP857" },
800 { "uk_UA", "CP1125" },
802 { "zh_TW", "CP950" } /* not CP938 ?? */
803 # define locale_table_defined
805 # ifndef locale_table_defined
806 /* Just a dummy entry, to avoid a C syntax error. */
814 /* Determine the current locale's character encoding, and canonicalize it
815 into one of the canonical names listed in localcharset.h.
816 The result must not be freed; it is statically allocated.
817 If the canonical name cannot be determined, the result is a non-canonical
818 name. */
824 locale_charset (void)
828 #if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2
830 # if HAVE_LANGINFO_CODESET
832 /* Most systems support nl_langinfo (CODESET) nowadays. */
833 codeset
= nl_langinfo (CODESET
);
836 /* Cygwin < 1.7 does not have locales. nl_langinfo (CODESET) always
837 returns "US-ASCII". Return the suffix of the locale name from the
838 environment variables (if present) or the codepage as a number. */
839 if (codeset
!= NULL
&& strcmp (codeset
, "US-ASCII") == 0)
842 static char buf
[2 + 10 + 1];
844 locale
= getenv ("LC_ALL");
845 if (locale
== NULL
|| locale
[0] == '\0')
847 locale
= getenv ("LC_CTYPE");
848 if (locale
== NULL
|| locale
[0] == '\0')
849 locale
= getenv ("LANG");
851 if (locale
!= NULL
&& locale
[0] != '\0')
853 /* If the locale name contains an encoding after the dot, return
854 it. */
855 const char *dot
= strchr (locale
, '.');
859 const char *modifier
;
862 /* Look for the possible @... trailer and remove it, if any. */
863 modifier
= strchr (dot
, '@');
864 if (modifier
== NULL
)
866 if (modifier
- dot
< sizeof (buf
))
868 memcpy (buf
, dot
, modifier
- dot
);
869 buf
[modifier
- dot
] = '\0';
875 /* The Windows API has a function returning the locale's codepage as a
876 number: GetACP(). This encoding is used by Cygwin, unless the user
877 has set the environment variable CYGWIN=codepage:oem (which very few
878 people do).
879 Output directed to console windows needs to be converted (to
880 GetOEMCP() if the console is using a raster font, or to
881 GetConsoleOutputCP() if it is using a TrueType font). Cygwin does
882 this conversion transparently (see winsup/cygwin/fhandler_console.cc),
883 converting to GetConsoleOutputCP(). This leads to correct results,
884 except when SetConsoleOutputCP has been called and a raster font is
885 in use. */
886 sprintf (buf
, "CP%u", GetACP ());
892 /* The canonical name cannot be determined. */
895 # elif defined WINDOWS_NATIVE
897 static char buf
[2 + 10 + 1];
899 /* The Windows API has a function returning the locale's codepage as
900 a number, but the value doesn't change according to what the
901 'setlocale' call specified. So we use it as a last resort, in
902 case the string returned by 'setlocale' doesn't specify the
903 codepage. */
904 char *current_locale
= setlocale (LC_ALL
, NULL
);
907 /* If they set different locales for different categories,
908 'setlocale' will return a semi-colon separated list of locale
909 values. To make sure we use the correct one, we choose LC_CTYPE. */
910 if (strchr (current_locale
, ';'))
911 current_locale
= setlocale (LC_CTYPE
, NULL
);
913 pdot
= strrchr (current_locale
, '.');
914 if (pdot
&& 2 + strlen (pdot
+ 1) + 1 <= sizeof (buf
))
915 sprintf (buf
, "CP%s", pdot
+ 1);
918 /* The Windows API has a function returning the locale's codepage as a
919 number: GetACP().
920 When the output goes to a console window, it needs to be provided in
921 GetOEMCP() encoding if the console is using a raster font, or in
922 GetConsoleOutputCP() encoding if it is using a TrueType font.
923 But in GUI programs and for output sent to files and pipes, GetACP()
924 encoding is the best bet. */
925 sprintf (buf
, "CP%u", GetACP ());
927 /* For a locale name such as "French_France.65001", in Windows 10,
928 setlocale now returns "French_France.utf8" instead. */
929 if (strcmp (buf
+ 2, "65001") == 0 || strcmp (buf
+ 2, "utf8") == 0)
937 static char buf
[2 + 10 + 1];
943 /* Allow user to override the codeset, as set in the operating system,
944 with standard language environment variables. */
945 locale
= getenv ("LC_ALL");
946 if (locale
== NULL
|| locale
[0] == '\0')
948 locale
= getenv ("LC_CTYPE");
949 if (locale
== NULL
|| locale
[0] == '\0')
950 locale
= getenv ("LANG");
952 if (locale
!= NULL
&& locale
[0] != '\0')
954 /* If the locale name contains an encoding after the dot, return it. */
955 const char *dot
= strchr (locale
, '.');
959 const char *modifier
;
962 /* Look for the possible @... trailer and remove it, if any. */
963 modifier
= strchr (dot
, '@');
964 if (modifier
== NULL
)
966 if (modifier
- dot
< sizeof (buf
))
968 memcpy (buf
, dot
, modifier
- dot
);
969 buf
[modifier
- dot
] = '\0';
974 /* For the POSIX locale, don't use the system's codepage. */
975 if (strcmp (locale
, "C") == 0 || strcmp (locale
, "POSIX") == 0)
981 /* OS/2 has a function returning the locale's codepage as a number. */
982 if (DosQueryCp (sizeof (cp
), cp
, &cplen
))
986 sprintf (buf
, "CP%u", cp
[0]);
993 # error "Add code for other platforms here."
999 # ifdef alias_table_defined
1000 /* On some platforms, UTF-8 locales are the most frequently used ones.
1001 Speed up the common case and slow down the less common cases by
1002 testing for this case first. */
1003 # if defined __OpenBSD__ || (defined __APPLE__ && defined __MACH__) || defined __sun || defined __CYGWIN__
1004 if (strcmp (codeset
, "UTF-8") == 0)
1005 goto done_table_lookup
;
1009 const struct table_entry
* const table
= alias_table
;
1010 size_t const table_size
=
1011 sizeof (alias_table
) / sizeof (struct table_entry
);
1012 /* The table is sorted. Perform a binary search. */
1013 size_t hi
= table_size
;
1017 /* Invariant:
1018 for i < lo, strcmp (table[i].alias, codeset) < 0,
1019 for i >= hi, strcmp (table[i].alias, codeset) > 0. */
1020 size_t mid
= (hi
+ lo
) >> 1; /* >= lo, < hi */
1021 int cmp
= strcmp (table
[mid
].alias
, codeset
);
1028 /* Found an i with
1029 strcmp (table[i].alias, codeset) == 0. */
1030 codeset
= table
[mid
].canonical
;
1031 goto done_table_lookup
;
1036 done_table_lookup
: ;
1040 /* Did not find it in the table. */
1041 /* On Mac OS X, all modern locales use the UTF-8 encoding.
1042 BeOS and Haiku have a single locale, and it has UTF-8 encoding. */
1043 # if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__
1046 /* Don't return an empty string. GNU libc and GNU libiconv interpret
1047 the empty string as denoting "the locale's character encoding",
1048 thus GNU libiconv would call this function a second time. */
1049 if (codeset
[0] == '\0')
1057 /* On old systems which lack it, use setlocale or getenv. */
1058 const char *locale
= NULL
;
1060 /* But most old systems don't have a complete set of locales. Some
1061 (like DJGPP) have only the C locale. Therefore we don't use setlocale
1062 here; it would return "C" when it doesn't support the locale name the
1063 user has set. */
1065 locale
= setlocale (LC_CTYPE
, NULL
);
1067 if (locale
== NULL
|| locale
[0] == '\0')
1069 locale
= getenv ("LC_ALL");
1070 if (locale
== NULL
|| locale
[0] == '\0')
1072 locale
= getenv ("LC_CTYPE");
1073 if (locale
== NULL
|| locale
[0] == '\0')
1074 locale
= getenv ("LANG");
1080 /* Map locale name to canonical encoding name. */
1082 # ifdef locale_table_defined
1083 const struct table_entry
* const table
= locale_table
;
1084 size_t const table_size
=
1085 sizeof (locale_table
) / sizeof (struct table_entry
);
1086 /* The table is sorted. Perform a binary search. */
1087 size_t hi
= table_size
;
1091 /* Invariant:
1092 for i < lo, strcmp (table[i].locale, locale) < 0,
1093 for i >= hi, strcmp (table[i].locale, locale) > 0. */
1094 size_t mid
= (hi
+ lo
) >> 1; /* >= lo, < hi */
1095 int cmp
= strcmp (table
[mid
].locale
, locale
);
1102 /* Found an i with
1103 strcmp (table[i].locale, locale) == 0. */
1104 codeset
= table
[mid
].canonical
;
1105 goto done_table_lookup
;
1109 done_table_lookup
: ;
1113 /* Did not find it in the table. */
1114 /* On Mac OS X, all modern locales use the UTF-8 encoding.
1115 BeOS and Haiku have a single locale, and it has UTF-8 encoding. */
1116 # if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__
1119 /* The canonical name cannot be determined. */
1120 /* Don't return an empty string. GNU libc and GNU libiconv interpret
1121 the empty string as denoting "the locale's character encoding",
1122 thus GNU libiconv would call this function a second time. */
1131 /* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8"
1132 (the default codeset) does not work when MB_CUR_MAX is 1. */
1133 if (strcmp (codeset
, "UTF-8") == 0 && MB_CUR_MAX_L (uselocale (NULL
)) <= 1)
This page took 0.10004 seconds and 4 git commands to generate.