/* * $LynxId: UCdomap.c,v 1.96 2014/02/04 01:29:44 tom Exp $ * * UCdomap.c * ========= * * This is a Lynx chartrans engine, its external calls are in UCMap.h * * Derived from code in the Linux kernel console driver. * The GNU Public Licence therefore applies, see * the file COPYING in the top-level directory * which should come with every Lynx distribution. * * [ original comment: - KW ] * * Mapping from internal code (such as Latin-1 or Unicode or IBM PC code) * to font positions. * * aeb, 950210 */ #include #include #include #include #include #include #include #include #include #include #if defined(USE_LOCALE_CHARSET) && defined(HAVE_LANGINFO_CODESET) #include #endif #ifdef EXP_JAPANESEUTF8_SUPPORT #include #endif #include /* * Include chartrans tables: */ #include /* WinLatin2 (cp1250) */ #include /* WinCyrillic (cp1251) */ #include /* WinLatin1 (cp1252) */ #include /* WinGreek (cp1253) */ #include /* WinHebrew (cp1255) */ #include /* WinArabic (cp1256) */ #include /* WinBaltRim (cp1257) */ #include /* DosLatinUS (cp437) */ #include /* DosGreek (cp737) */ #include /* DosBaltRim (cp775) */ #include /* DosLatin1 (cp850) */ #include /* DosLatin2 (cp852) */ #include /* DosTurkish (cp857) */ #include /* DosHebrew (cp862) */ #include /* DosArabic (cp864) */ #include /* DosCyrillic (cp866) */ #include /* DosGreek2 (cp869) */ #include /* 7 bit approximations */ #include /* DEC Multinational */ #include /* HP Roman8 */ #include /* ISO Latin 1 */ #include /* ISO Latin 2 */ #include /* ISO Latin 3 */ #include /* ISO Latin 4 */ #include /* ISO 8859-5 Cyrillic */ #include /* ISO 8859-6 Arabic */ #include /* ISO 8859-7 Greek */ #include /* ISO 8859-8 Hebrew */ #include /* ISO 8859-9 (Latin 5) */ #include /* ISO 8859-10 */ #include /* ISO 8859-13 (Latin 7) */ #include /* ISO 8859-14 (Latin 8) */ #include /* ISO 8859-15 (Latin 9) */ #include /* KOI8-R Cyrillic */ #include /* Macintosh (8 bit) */ #include /* RFC 1345 Mnemonic */ #include /* NeXT character set */ #include /* RFC 
1345 w/o Intro */ /* #include */ /* UNICODE UTF 8 */ #include /* Vietnamese (VISCII) */ #include /* Ukrainian Cyrillic (866) */ #include /* Ukrainian Cyrillic (koi8-u */ #include /* Cyrillic-Asian (PT154) */ #ifdef CAN_AUTODETECT_DISPLAY_CHARSET int auto_display_charset = -1; #endif static const char *UC_GNsetMIMEnames[4] = { "iso-8859-1", "x-dec-graphics", "cp437", "x-transparent" }; static int UC_GNhandles[4] = { -1, -1, -1, -1 }; /* * Some of the code below, and some of the comments, are left in for * historical reasons. Not all those tables below are currently * really needed (and what with all those hardwired codepoints), * but let's keep them around for now. They may come in handy if we * decide to make more extended use of the mechanisms (including e.g. * for chars < 127...). - KW */ static u16 translations[][256] = { /* * 8-bit Latin-1 mapped to Unicode -- trivial mapping. */ { 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 
0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff }, /* * VT100 graphics mapped to Unicode. */ { 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x00a0, 0x25c6, 0x2592, 0x2409, 0x240c, 0x240d, 0x240a, 0x00b0, 0x00b1, 0x2424, 0x240b, 0x2518, 0x2510, 0x250c, 0x2514, 0x253c, 0xf800, 0xf801, 0x2500, 0xf803, 0xf804, 0x251c, 0x2524, 0x2534, 0x252c, 0x2502, 0x2264, 0x2265, 0x03c0, 0x2260, 
0x00a3, 0x00b7, 0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff }, /* * IBM Codepage 437 mapped to Unicode. 
*/ { 0x0000, 0x263a, 0x263b, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022, 0x25d8, 0x25cb, 0x25d9, 0x2642, 0x2640, 0x266a, 0x266b, 0x263c, 0x25ba, 0x25c4, 0x2195, 0x203c, 0x00b6, 0x00a7, 0x25ac, 0x21a8, 0x2191, 0x2193, 0x2192, 0x2190, 0x221f, 0x2194, 0x25b2, 0x25bc, 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x2302, 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, 0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5, 0x00c9, 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9, 0x00ff, 0x00d6, 0x00dc, 0x00a2, 0x00a3, 0x00a5, 0x20a7, 0x0192, 0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba, 0x00bf, 0x2310, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb, 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, 0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510, 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f, 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567, 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b, 0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580, 0x03b1, 0x00df, 0x0393, 0x03c0, 0x03a3, 0x03c3, 0x00b5, 0x03c4, 0x03a6, 0x0398, 0x03a9, 0x03b4, 0x221e, 0x03c6, 0x03b5, 0x2229, 0x2261, 0x00b1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00f7, 0x2248, 0x00b0, 
0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, 0x25a0, 0x00a0 }, /* * User mapping -- default to codes for direct font mapping. */ { 0xf000, 0xf001, 0xf002, 0xf003, 0xf004, 0xf005, 0xf006, 0xf007, 0xf008, 0xf009, 0xf00a, 0xf00b, 0xf00c, 0xf00d, 0xf00e, 0xf00f, 0xf010, 0xf011, 0xf012, 0xf013, 0xf014, 0xf015, 0xf016, 0xf017, 0xf018, 0xf019, 0xf01a, 0xf01b, 0xf01c, 0xf01d, 0xf01e, 0xf01f, 0xf020, 0xf021, 0xf022, 0xf023, 0xf024, 0xf025, 0xf026, 0xf027, 0xf028, 0xf029, 0xf02a, 0xf02b, 0xf02c, 0xf02d, 0xf02e, 0xf02f, 0xf030, 0xf031, 0xf032, 0xf033, 0xf034, 0xf035, 0xf036, 0xf037, 0xf038, 0xf039, 0xf03a, 0xf03b, 0xf03c, 0xf03d, 0xf03e, 0xf03f, 0xf040, 0xf041, 0xf042, 0xf043, 0xf044, 0xf045, 0xf046, 0xf047, 0xf048, 0xf049, 0xf04a, 0xf04b, 0xf04c, 0xf04d, 0xf04e, 0xf04f, 0xf050, 0xf051, 0xf052, 0xf053, 0xf054, 0xf055, 0xf056, 0xf057, 0xf058, 0xf059, 0xf05a, 0xf05b, 0xf05c, 0xf05d, 0xf05e, 0xf05f, 0xf060, 0xf061, 0xf062, 0xf063, 0xf064, 0xf065, 0xf066, 0xf067, 0xf068, 0xf069, 0xf06a, 0xf06b, 0xf06c, 0xf06d, 0xf06e, 0xf06f, 0xf070, 0xf071, 0xf072, 0xf073, 0xf074, 0xf075, 0xf076, 0xf077, 0xf078, 0xf079, 0xf07a, 0xf07b, 0xf07c, 0xf07d, 0xf07e, 0xf07f, 0xf080, 0xf081, 0xf082, 0xf083, 0xf084, 0xf085, 0xf086, 0xf087, 0xf088, 0xf089, 0xf08a, 0xf08b, 0xf08c, 0xf08d, 0xf08e, 0xf08f, 0xf090, 0xf091, 0xf092, 0xf093, 0xf094, 0xf095, 0xf096, 0xf097, 0xf098, 0xf099, 0xf09a, 0xf09b, 0xf09c, 0xf09d, 0xf09e, 0xf09f, 0xf0a0, 0xf0a1, 0xf0a2, 0xf0a3, 0xf0a4, 0xf0a5, 0xf0a6, 0xf0a7, 0xf0a8, 0xf0a9, 0xf0aa, 0xf0ab, 0xf0ac, 0xf0ad, 0xf0ae, 0xf0af, 0xf0b0, 0xf0b1, 0xf0b2, 0xf0b3, 0xf0b4, 0xf0b5, 0xf0b6, 0xf0b7, 0xf0b8, 0xf0b9, 0xf0ba, 0xf0bb, 0xf0bc, 0xf0bd, 0xf0be, 0xf0bf, 0xf0c0, 0xf0c1, 0xf0c2, 0xf0c3, 0xf0c4, 0xf0c5, 0xf0c6, 0xf0c7, 0xf0c8, 0xf0c9, 0xf0ca, 0xf0cb, 0xf0cc, 0xf0cd, 0xf0ce, 0xf0cf, 0xf0d0, 0xf0d1, 0xf0d2, 0xf0d3, 0xf0d4, 0xf0d5, 0xf0d6, 0xf0d7, 0xf0d8, 0xf0d9, 0xf0da, 0xf0db, 0xf0dc, 0xf0dd, 0xf0de, 0xf0df, 0xf0e0, 0xf0e1, 0xf0e2, 0xf0e3, 0xf0e4, 0xf0e5, 0xf0e6, 0xf0e7, 0xf0e8, 0xf0e9, 
0xf0ea, 0xf0eb, 0xf0ec, 0xf0ed, 0xf0ee, 0xf0ef, 0xf0f0, 0xf0f1, 0xf0f2, 0xf0f3, 0xf0f4, 0xf0f5, 0xf0f6, 0xf0f7, 0xf0f8, 0xf0f9, 0xf0fa, 0xf0fb, 0xf0fc, 0xf0fd, 0xf0fe, 0xf0ff } }; static u16 *UC_translate = NULL; static struct UC_charset UCInfo[MAXCHARSETS]; /* * The standard kernel character-to-font mappings are not invertible * -- this is just a best effort. */ #define MAX_GLYPH 512 /* Max possible glyph value */ static unsigned char *inv_translate = NULL; static unsigned char inv_norm_transl[MAX_GLYPH]; static unsigned char *inverse_translations[4] = {NULL, NULL, NULL, NULL}; static void set_inverse_transl(int i); static u16 *set_translate(int m); static int UC_valid_UC_charset(int UC_charset_hndl); static void UC_con_set_trans(int UC_charset_in_hndl, int Gn, int update_flag); static int con_insert_unipair(unsigned unicode, unsigned fontpos, int fordefault); static int con_insert_unipair_str(unsigned unicode, const char *replace_str, int fordefault); static void con_clear_unimap(int fordefault); static void con_clear_unimap_str(int fordefault); static void con_set_default_unimap(void); static int UC_con_set_unimap(int UC_charset_out_hndl, int update_flag); static int UC_con_set_unimap_str(unsigned ct, struct unipair_str *list, int fordefault); static int conv_uni_to_pc(long ucs, int usedefault); static int conv_uni_to_str(char *outbuf, int buflen, UCode_t ucs, int usedefault); static void UCconsole_map_init(void); static int UC_MapGN(int UChndl, int update_flag); static int UC_FindGN_byMIME(const char *UC_MIMEcharset); static void UCreset_allocated_LYCharSets(void); static STRING2PTR UC_setup_LYCharSets_repl(int UC_charset_in_hndl, unsigned lowest8); static int UC_Register_with_LYCharSets(int s, const char *UC_MIMEcharset, const char *UC_LYNXcharset, int lowest_eightbit); #ifdef LY_FIND_LEAKS static void UCfree_allocated_LYCharSets(void); static void UCcleanup_mem(void); #endif static int default_UChndl = -1; static void set_inverse_transl(int i) { int j, glyph; 
u16 *p = translations[i]; unsigned char *q = inverse_translations[i]; if (!q) { /* * Slightly messy to avoid calling kmalloc too early. */ q = inverse_translations[i] = ((i == LAT1_MAP) ? inv_norm_transl : typeMallocn(unsigned char, MAX_GLYPH)); if (!q) return; } for (j = 0; j < MAX_GLYPH; j++) q[j] = 0; for (j = 0; j < E_TABSZ; j++) { glyph = conv_uni_to_pc((long) p[j], 0); if (glyph >= 0 && glyph < MAX_GLYPH && q[glyph] < 32) { /* * Prefer '-' above SHY etc. */ q[glyph] = UCH(j); } } } static u16 *set_translate(int m) { if (!inverse_translations[m]) set_inverse_transl(m); inv_translate = inverse_translations[m]; return translations[m]; } static int UC_valid_UC_charset(int UC_charset_hndl) { return (UC_charset_hndl >= 0 && UC_charset_hndl < UCNumCharsets); } static void UC_con_set_trans(int UC_charset_in_hndl, int Gn, int update_flag) { int i, j; const u16 *p; u16 *ptrans; if (!UC_valid_UC_charset(UC_charset_in_hndl)) { CTRACE((tfp, "UC_con_set_trans: Invalid charset handle %d.\n", UC_charset_in_hndl)); return; } ptrans = translations[Gn]; p = UCInfo[UC_charset_in_hndl].unitable; #if(0) if (p == UC_current_unitable) { /* test whether pointers are equal */ return; /* nothing to be done */ } /* * The font is always 256 characters - so far. */ con_clear_unimap(); #endif for (i = 0; i < 256; i++) { if ((j = UCInfo[UC_charset_in_hndl].unicount[i])) { ptrans[i] = *p; for (; j; j--) { p++; } } else { ptrans[i] = 0xfffd; } } if (update_flag) { set_inverse_transl(Gn); /* Update inverse translation for this one */ } } /* * Unicode -> current font conversion * * A font has at most 512 chars, usually 256. * But one font position may represent several Unicode chars. * A hashtable is somewhat of a pain to deal with, so use a * "paged table" instead. Simulation has shown the memory cost of * this 3-level paged table scheme to be comparable to a hash table. 
*/ static int hashtable_contents_valid = 0; /* Use ASCII-only mode for bootup */ static int hashtable_str_contents_valid = 0; static u16 **uni_pagedir[32] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL }; static char ***uni_pagedir_str[32] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL }; static const u16 *UC_current_unitable = NULL; static struct unimapdesc_str *UC_current_unitable_str = NULL; /* * Keep a second set of structures for the translation designated * as "default" - kw */ static int unidefault_contents_valid = 0; /* Use ASCII-only mode for bootup */ static int unidefault_str_contents_valid = 0; static u16 **unidefault_pagedir[32] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL }; static char ***unidefault_pagedir_str[32] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL }; static const u16 *UC_default_unitable = 0; static const struct unimapdesc_str *UC_default_unitable_str = 0; static int con_insert_unipair(unsigned unicode, unsigned fontpos, int fordefault) { int i; unsigned n; u16 **p1, *p2; if (fordefault) p1 = unidefault_pagedir[n = unicode >> 11]; else p1 = uni_pagedir[n = unicode >> 11]; if (!p1) { p1 = (u16 * *)malloc(32 * sizeof(u16 *)); if (fordefault) unidefault_pagedir[n] = p1; else uni_pagedir[n] = p1; if (!p1) return ucError; for (i = 0; i < 32; i++) { p1[i] = NULL; } } if (!(p2 = p1[n = (unicode >> 6) & 0x1f])) { p2 = p1[n] = (u16 *) malloc(64 * sizeof(u16)); if 
(!p2) return ucError; for (i = 0; i < 64; i++) { p2[i] = 0xffff; /* No glyph for this character (yet) */ } } p2[unicode & 0x3f] = (u16) fontpos; return 0; } static int con_insert_unipair_str(unsigned unicode, const char *replace_str, int fordefault) { int i; unsigned n; char ***p1; const char **p2; if (fordefault) p1 = unidefault_pagedir_str[n = unicode >> 11]; else p1 = uni_pagedir_str[n = unicode >> 11]; if (!p1) { p1 = (char ***) malloc(32 * sizeof(char **)); if (fordefault) unidefault_pagedir_str[n] = p1; else uni_pagedir_str[n] = p1; if (!p1) return ucError; for (i = 0; i < 32; i++) { p1[i] = NULL; } } n = ((unicode >> 6) & 0x1f); if (!p1[n]) { p1[n] = (char **) malloc(64 * sizeof(char *)); if (!p1[n]) return ucError; p2 = (const char **) p1[n]; for (i = 0; i < 64; i++) { p2[i] = NULL; /* No replace string this character (yet) */ } } p2 = (const char **) p1[n]; p2[unicode & 0x3f] = replace_str; return 0; } /* * ui arg was a leftover, deleted. - KW */ static void con_clear_unimap(int fordefault) { int i, j; u16 **p1; if (fordefault) { for (i = 0; i < 32; i++) { if ((p1 = unidefault_pagedir[i]) != NULL) { for (j = 0; j < 32; j++) { FREE(p1[j]); } FREE(p1); } unidefault_pagedir[i] = NULL; } unidefault_contents_valid = 1; } else { for (i = 0; i < 32; i++) { if ((p1 = uni_pagedir[i]) != NULL) { for (j = 0; j < 32; j++) { FREE(p1[j]); } FREE(p1); } uni_pagedir[i] = NULL; } hashtable_contents_valid = 1; } } static void con_clear_unimap_str(int fordefault) { int i, j; char ***p1; if (fordefault) { for (i = 0; i < 32; i++) { if ((p1 = unidefault_pagedir_str[i]) != NULL) { for (j = 0; j < 32; j++) { FREE(p1[j]); } FREE(p1); } unidefault_pagedir_str[i] = NULL; } unidefault_str_contents_valid = 1; /* ??? probably no use... */ } else { for (i = 0; i < 32; i++) { if ((p1 = uni_pagedir_str[i]) != NULL) { for (j = 0; j < 32; j++) { FREE(p1[j]); } FREE(p1); } uni_pagedir_str[i] = NULL; } hashtable_str_contents_valid = 1; /* ??? probably no use... 
*/ } } /* * Loads the unimap for the hardware font, as defined in uni_hash.tbl. * The representation used was the most compact I could come up * with. This routine is executed at sys_setup time, and when the * PIO_FONTRESET ioctl is called. */ static void con_set_default_unimap(void) { int i, j; const u16 *p; /* * The default font is always 256 characters. */ con_clear_unimap(1); p = dfont_unitable; for (i = 0; i < 256; i++) { for (j = dfont_unicount[i]; j; j--) { con_insert_unipair(*(p++), (u16) i, 1); } } UC_default_unitable = dfont_unitable; con_clear_unimap_str(1); UC_con_set_unimap_str(dfont_replacedesc.entry_ct, repl_map, 1); UC_default_unitable_str = &dfont_replacedesc; } int UCNumCharsets = 0; int UCLYhndl_HTFile_for_unspec = -1; int UCLYhndl_HTFile_for_unrec = -1; int UCLYhndl_for_unspec = -1; int UCLYhndl_for_unrec = -1; /* easy to type, will initialize later */ int LATIN1 = -1; /* UCGetLYhndl_byMIME("iso-8859-1") */ int US_ASCII = -1; /* UCGetLYhndl_byMIME("us-ascii") */ int UTF8_handle = -1; /* UCGetLYhndl_byMIME("utf-8") */ int TRANSPARENT = -1; /* UCGetLYhndl_byMIME("x-transparent") */ static int UC_con_set_unimap(int UC_charset_out_hndl, int update_flag) { int i, j; const u16 *p; if (!UC_valid_UC_charset(UC_charset_out_hndl)) { CTRACE((tfp, "UC_con_set_unimap: Invalid charset handle %d.\n", UC_charset_out_hndl)); return ucError; } p = UCInfo[UC_charset_out_hndl].unitable; if (p == UC_current_unitable) { /* test whether pointers are equal */ return update_flag; /* nothing to be done */ } UC_current_unitable = p; /* * The font is always 256 characters - so far. 
*/ con_clear_unimap(0); for (i = 0; i < 256; i++) { for (j = UCInfo[UC_charset_out_hndl].unicount[i]; j; j--) { con_insert_unipair(*(p++), (u16) i, 0); } } if (update_flag) { for (i = 0; i <= 3; i++) { set_inverse_transl(i); /* Update all inverse translations */ } } return 0; } static int UC_con_set_unimap_str(unsigned ct, struct unipair_str *list, int fordefault) { int err = 0, err1; while (ct--) { if ((err1 = con_insert_unipair_str(list->unicode, list->replace_str, fordefault)) != 0) { err = err1; } list++; } /* * No inverse translations for replacement strings! */ if (!err) { if (fordefault) unidefault_str_contents_valid = 1; else hashtable_str_contents_valid = 1; } return err; } static int conv_uni_to_pc(long ucs, int usedefault) { int h; u16 **p1, *p2; /* * Only 16-bit codes supported at this time. */ if (ucs > 0xffff) { /* * U+FFFD: REPLACEMENT CHARACTER. */ ucs = 0xfffd; } else if (ucs < 0x20 || ucs >= 0xfffe) { /* * Not a printable character. */ return ucError; } else if (ucs == 0xfeff || (ucs >= 0x200b && ucs <= 0x200f)) { /* * Zero-width space. */ return ucZeroWidth; } else if ((ucs & ~UNI_DIRECT_MASK) == UNI_DIRECT_BASE) { /* * UNI_DIRECT_BASE indicates the start of the region in the * User Zone which always has a 1:1 mapping to the currently * loaded font. The UNI_DIRECT_MASK indicates the bit span * of the region. */ return (ucs & UNI_DIRECT_MASK); } if (usedefault) { if (!unidefault_contents_valid) return ucInvalidHash; p1 = unidefault_pagedir[ucs >> 11]; } else { if (!hashtable_contents_valid) return ucInvalidHash; p1 = uni_pagedir[ucs >> 11]; } if (p1 && (p2 = p1[(ucs >> 6) & 0x1f]) && (h = p2[ucs & 0x3f]) < MAX_GLYPH) { return h; } /* * Not found. */ return ucNotFound; } /* * Note: contents of outbuf is not changes for negative return value! */ static int conv_uni_to_str(char *outbuf, int buflen, UCode_t ucs, int usedefault) { char *h; char ***p1, **p2; /* * Only 16-bit codes supported at this time. 
*/ if (ucs > 0xffff) { /* * U+FFFD: REPLACEMENT CHARACTER. */ ucs = 0xfffd; /* * Maybe the following two cases should be allowed here?? - KW */ } else if (ucs < 0x20 || ucs >= 0xfffe) { /* * Not a printable character. */ return ucError; } else if (ucs == 0xfeff || (ucs >= 0x200b && ucs <= 0x200f)) { /* * Zero-width space. */ return ucZeroWidth; } if (usedefault) { if (!unidefault_str_contents_valid) return ucInvalidHash; p1 = unidefault_pagedir_str[ucs >> 11]; } else { if (!hashtable_str_contents_valid) return ucInvalidHash; p1 = uni_pagedir_str[ucs >> 11]; } if (p1 && (p2 = p1[(ucs >> 6) & 0x1f]) && (h = p2[ucs & 0x3f])) { StrNCpy(outbuf, h, (buflen - 1)); return 1; /* ok ! */ } /* * Not found. */ return ucNotFound; } int UCInitialized = 0; /* * [ original comment: - KW ] * This is called at sys_setup time, after memory and the console are * initialized. It must be possible to call kmalloc(..., GFP_KERNEL) * from this function, hence the call from sys_setup. */ static void UCconsole_map_init(void) { con_set_default_unimap(); UCInitialized = 1; } /* * OK now, finally, some stuff that is more specifically for Lynx: - KW */ int UCTransUniChar(UCode_t unicode, int charset_out) { int rc = 0; int UChndl_out; int isdefault, trydefault = 0; const u16 *ut; if ((UChndl_out = LYCharSet_UC[charset_out].UChndl) < 0) { if (LYCharSet_UC[charset_out].codepage < 0) { if (unicode < 128) { rc = (int) unicode; } else { rc = LYCharSet_UC[charset_out].codepage; } return rc; } if ((UChndl_out = default_UChndl) < 0) { return ucCannotOutput; } isdefault = 1; } else { isdefault = UCInfo[UChndl_out].replacedesc.isdefault; trydefault = UCInfo[UChndl_out].replacedesc.trydefault; } if (!isdefault) { ut = UCInfo[UChndl_out].unitable; if (ut != UC_current_unitable) { rc = UC_con_set_unimap(UChndl_out, 1); if (rc < 0) { return rc; } } rc = conv_uni_to_pc(unicode, 0); if (rc >= 0) { return rc; } } if (isdefault || trydefault) { rc = conv_uni_to_pc(unicode, 1); if (rc >= 0) { return rc; } } if 
(!isdefault && (rc == ucNotFound)) { rc = conv_uni_to_pc(0xfffdL, 0); } if ((isdefault || trydefault) && (rc == ucNotFound)) { rc = conv_uni_to_pc(0xfffdL, 1); } return rc; } /* * Returns string length, or negative value for error. */ int UCTransUniCharStr(char *outbuf, int buflen, UCode_t unicode, int charset_out, int chk_single_flag) { int rc = ucUnknown, src = 0; int UChndl_out; int isdefault, trydefault = 0; struct unimapdesc_str *repl; const u16 *ut; if (buflen < 2) return ucBufferTooSmall; if ((UChndl_out = LYCharSet_UC[charset_out].UChndl) < 0) { if (LYCharSet_UC[charset_out].codepage < 0) return LYCharSet_UC[charset_out].codepage; if ((UChndl_out = default_UChndl) < 0) return ucCannotOutput; isdefault = 1; } else { isdefault = UCInfo[UChndl_out].replacedesc.isdefault; trydefault = UCInfo[UChndl_out].replacedesc.trydefault; } if (chk_single_flag) { if (!isdefault) { ut = UCInfo[UChndl_out].unitable; if (ut != UC_current_unitable) { src = UC_con_set_unimap(UChndl_out, 1); if (src < 0) { return src; } } } src = conv_uni_to_pc(unicode, isdefault); if (src >= 32) { outbuf[0] = (char) src; outbuf[1] = '\0'; return 1; } } repl = &(UCInfo[UChndl_out].replacedesc); if (!isdefault) { if (repl != UC_current_unitable_str) { con_clear_unimap_str(0); (void) UC_con_set_unimap_str(repl->entry_ct, repl->entries, 0); UC_current_unitable_str = repl; } rc = conv_uni_to_str(outbuf, buflen, unicode, 0); if (rc >= 0) return (int) strlen(outbuf); } if (trydefault && chk_single_flag) { src = conv_uni_to_pc(unicode, 1); if (src >= 32) { outbuf[0] = (char) src; outbuf[1] = '\0'; return 1; } } if (isdefault || trydefault) { #ifdef EXP_JAPANESEUTF8_SUPPORT if (LYCharSet_UC[charset_out].codepage == 0 && LYCharSet_UC[charset_out].codepoints == 0) { iconv_t cd; char str[3], *pin, *pout; size_t inleft, outleft; char *tocode = NULL; str[0] = (char) (unicode >> 8); str[1] = (char) (unicode & 0xFF); str[2] = 0; pin = str; inleft = 2; pout = outbuf; outleft = (size_t) buflen; /* * Try TRANSLIT 
first, since it is an extension which can provide * translations when there is no available exact translation to * the target character set. */ HTSprintf0(&tocode, "%s//TRANSLIT", LYCharSet_UC[charset_out].MIMEname); cd = iconv_open(tocode, "UTF-16BE"); if (cd == (iconv_t) -1) { /* * Try again, without TRANSLIT */ HTSprintf0(&tocode, "%s", LYCharSet_UC[charset_out].MIMEname); cd = iconv_open(tocode, "UTF-16BE"); if (cd == (iconv_t) -1) { CTRACE((tfp, "Warning: Cannot transcode form charset %s to %s!\n", "UTF-16BE", tocode)); } } FREE(tocode); if (cd != (iconv_t) -1) { rc = (int) iconv(cd, (ICONV_CONST char **) &pin, &inleft, &pout, &outleft); iconv_close(cd); if ((pout - outbuf) == 3) { CTRACE((tfp, "It seems to be a JIS X 0201 code(%" PRI_UCode_t "). Not supported.\n", unicode)); pin = str; inleft = 2; pout = outbuf; outleft = (size_t) buflen; } else if (rc >= 0) { *pout = '\0'; return (int) strlen(outbuf); } } } #endif rc = conv_uni_to_str(outbuf, buflen, unicode, 1); if (rc >= 0) return (int) strlen(outbuf); } if (rc == ucNotFound) { if (!isdefault) rc = conv_uni_to_str(outbuf, buflen, 0xfffdL, 0); if ((rc == ucNotFound) && (isdefault || trydefault)) rc = conv_uni_to_str(outbuf, buflen, 0xfffdL, 1); if (rc >= 0) return (int) strlen(outbuf); } if (chk_single_flag && src == ucNotFound) { if (!isdefault) rc = conv_uni_to_pc(0xfffdL, 0); if ((rc == ucNotFound) && (isdefault || trydefault)) rc = conv_uni_to_pc(0xfffdL, 1); if (rc >= 32) { outbuf[0] = (char) rc; outbuf[1] = '\0'; return 1; } return rc; } return ucNotFound; } static int UC_lastautoGN = 0; static int UC_MapGN(int UChndl, int update_flag) { int i, Gn, found, lasthndl; found = 0; Gn = -1; for (i = 0; i < 4 && Gn < 0; i++) { if (UC_GNhandles[i] < 0) { Gn = i; } else if (UC_GNhandles[i] == UChndl) { Gn = i; found = 1; } } if (found) return Gn; if (Gn >= 0) { UCInfo[UChndl].GN = Gn; UC_GNhandles[Gn] = UChndl; } else { if (UC_lastautoGN == GRAF_MAP) { Gn = IBMPC_MAP; } else { Gn = GRAF_MAP; } UC_lastautoGN = 
Gn; lasthndl = UC_GNhandles[Gn]; UCInfo[lasthndl].GN = -1; UCInfo[UChndl].GN = Gn; UC_GNhandles[Gn] = UChndl; } CTRACE((tfp, "UC_MapGN: Using %d <- %d (%s)\n", Gn, UChndl, UCInfo[UChndl].MIMEname)); UC_con_set_trans(UChndl, Gn, update_flag); return Gn; } int UCTransChar(int ch_in, int charset_in, int charset_out) { UCode_t unicode; int Gn; int rc = ucNotFound; int UChndl_in, UChndl_out; int isdefault, trydefault = 0; const u16 *ut; int upd = 0; if (charset_in == charset_out) return UCH(ch_in); if (charset_in < 0) return ucCannotConvert; if ((UChndl_in = LYCharSet_UC[charset_in].UChndl) < 0) return ucCannotConvert; if ((UChndl_out = LYCharSet_UC[charset_out].UChndl) < 0) { if (LYCharSet_UC[charset_out].codepage < 0) return LYCharSet_UC[charset_out].codepage; if ((UChndl_out = default_UChndl) < 0) return ucCannotOutput; isdefault = 1; } else { isdefault = UCInfo[UChndl_out].replacedesc.isdefault; trydefault = UCInfo[UChndl_out].replacedesc.trydefault; } if (!UCInfo[UChndl_in].num_uni) return ucCannotConvert; if ((Gn = UCInfo[UChndl_in].GN) < 0) { Gn = UC_MapGN(UChndl_in, 0); upd = 1; } ut = UCInfo[UChndl_out].unitable; if (!isdefault) { if (ut == UC_current_unitable) { if (upd) { set_inverse_transl(Gn); } } else { rc = UC_con_set_unimap(UChndl_out, 1); if (rc > 0) { set_inverse_transl(Gn); } else if (rc < 0) { return rc; } } } UC_translate = set_translate(Gn); unicode = UC_translate[UCH(ch_in)]; if (!isdefault) { rc = conv_uni_to_pc(unicode, 0); if (rc >= 0) return rc; } if ((rc == ucNotFound) && (isdefault || trydefault)) { rc = conv_uni_to_pc(unicode, 1); } if ((rc == ucNotFound) && !isdefault) { rc = conv_uni_to_pc(0xfffdL, 0); } if ((rc == ucNotFound) && (isdefault || trydefault)) { rc = conv_uni_to_pc(0xfffdL, 1); } return rc; } #ifdef EXP_JAPANESEUTF8_SUPPORT UCode_t UCTransJPToUni(char *inbuf, int buflen, int charset_in) { char outbuf[3], *pin, *pout; size_t ilen, olen; iconv_t cd; pin = inbuf; pout = outbuf; ilen = 2; olen = (size_t) buflen; cd = 
iconv_open("UTF-16BE", LYCharSet_UC[charset_in].MIMEname); (void) iconv(cd, (ICONV_CONST char **) &pin, &ilen, &pout, &olen); iconv_close(cd); if ((ilen == 0) && (olen == 0)) { return (((unsigned char) outbuf[0]) << 8) + (unsigned char) outbuf[1]; } return ucCannotConvert; } #endif /* * Translate a character to Unicode. If additional bytes are needed, this * returns ucNeedMore, based on its internal state. To reset the state, * call this with charset_in < 0. */ UCode_t UCTransToUni(int ch_in, int charset_in) { static char buffer[10]; static unsigned inx = 0; UCode_t unicode; int Gn; unsigned char ch_iu = UCH(ch_in); int UChndl_in; /* * Reset saved-state. */ if (charset_in < 0) { inx = 0; return ucCannotConvert; } else if (charset_in == LATIN1) { return ch_iu; } else if (charset_in == UTF8_handle) { if (is8bits(ch_iu)) { unsigned need; char *ptr; buffer[inx++] = (char) ch_iu; buffer[inx] = '\0'; need = (unsigned) utf8_length(TRUE, buffer); if (need && (need + 1) == inx) { inx = 0; ptr = buffer; return UCGetUniFromUtf8String(&ptr); } else if (inx < sizeof(buffer) - 1) { return ucNeedMore; } else { inx = 0; } } else { inx = 0; } } #ifdef EXP_JAPANESEUTF8_SUPPORT if ((strcmp(LYCharSet_UC[charset_in].MIMEname, "shift_jis") == 0) || (strcmp(LYCharSet_UC[charset_in].MIMEname, "euc-jp") == 0)) { char obuffer[3], *pin, *pout; size_t ilen, olen; iconv_t cd; pin = buffer; pout = obuffer; ilen = olen = 2; if (strcmp(LYCharSet_UC[charset_in].MIMEname, "shift_jis") == 0) { if (inx == 0) { if (IS_SJIS_HI1(ch_iu) || IS_SJIS_HI2(ch_iu)) { buffer[0] = (char) ch_in; inx = 1; return ucNeedMore; } } else { if (IS_SJIS_LO(ch_iu)) { buffer[1] = (char) ch_in; buffer[2] = 0; cd = iconv_open("UTF-16BE", "Shift_JIS"); (void) iconv(cd, (ICONV_CONST char **) &pin, &ilen, &pout, &olen); iconv_close(cd); inx = 0; if ((ilen == 0) && (olen == 0)) { return (UCH(obuffer[0]) << 8) + UCH(obuffer[1]); } } } } if (strcmp(LYCharSet_UC[charset_in].MIMEname, "euc-jp") == 0) { if (inx == 0) { if 
(IS_EUC_HI(ch_iu)) { buffer[0] = (char) ch_in; inx = 1; return ucNeedMore; } } else { if (IS_EUC_LOX(ch_iu)) { buffer[1] = (char) ch_in; buffer[2] = 0; cd = iconv_open("UTF-16BE", "EUC-JP"); (void) iconv(cd, (ICONV_CONST char **) &pin, &ilen, &pout, &olen); iconv_close(cd); inx = 0; if ((ilen == 0) && (olen == 0)) { return (UCH(obuffer[0]) << 8) + UCH(obuffer[1]); } } } } inx = 0; } #endif if (ch_iu < 128 && ch_iu >= 32) return ch_iu; if (ch_iu < 32 && LYCharSet_UC[charset_in].enc != UCT_ENC_8BIT_C0) { /* * Don't translate C0 chars except for specific charsets. */ return ch_iu; } else if ((UChndl_in = LYCharSet_UC[charset_in].UChndl) < 0) { return ucCannotConvert; } else if (!UCInfo[UChndl_in].num_uni) { return ucCannotConvert; } if ((Gn = UCInfo[UChndl_in].GN) < 0) { Gn = UC_MapGN(UChndl_in, 1); } UC_translate = set_translate(Gn); unicode = UC_translate[ch_iu]; return unicode; } int UCReverseTransChar(int ch_out, int charset_in, int charset_out) { int Gn; int rc = ucError; int UChndl_in, UChndl_out; int isdefault; int i_ch = UCH(ch_out); const u16 *ut; if (charset_in == charset_out) return UCH(ch_out); if (charset_in < 0) return ucCannotConvert; if ((UChndl_in = LYCharSet_UC[charset_in].UChndl) < 0) return ucCannotConvert; if (!UCInfo[UChndl_in].num_uni) return ucCannotConvert; if (charset_out < 0) return ucCannotOutput; if ((UChndl_out = LYCharSet_UC[charset_out].UChndl) < 0) { if (LYCharSet_UC[charset_out].codepage < 0) return LYCharSet_UC[charset_out].codepage; if ((UChndl_out = default_UChndl) < 0) return ucCannotOutput; isdefault = 1; } else { isdefault = UCInfo[UChndl_out].replacedesc.isdefault; } if (!isdefault) { /* * Try to use the inverse table if charset_out is not equivalent * to using just the default table. If it is, it should have * just ASCII chars and trying to back-translate those should * not give anything but themselves. 
- kw */ ut = UCInfo[UChndl_out].unitable; if (ut == UC_current_unitable) { if ((Gn = UCInfo[UChndl_in].GN) < 0) { Gn = UC_MapGN(UChndl_in, 1); } UC_translate = set_translate(Gn); if (inv_translate) rc = inv_translate[i_ch]; if (rc >= 32) { return rc; } } } return UCTransChar(ch_out, charset_out, charset_in); } /* * Returns string length, or negative value for error. */ int UCTransCharStr(char *outbuf, int buflen, int ch_in, int charset_in, int charset_out, int chk_single_flag) { UCode_t unicode; int Gn; int rc = ucUnknown, src = 0; int UChndl_in, UChndl_out; int isdefault, trydefault = 0; struct unimapdesc_str *repl; const u16 *ut; int upd = 0; if (buflen < 2) return ucBufferTooSmall; if (chk_single_flag && charset_in == charset_out) { outbuf[0] = (char) ch_in; outbuf[1] = '\0'; return 1; } if (charset_in < 0) return ucCannotConvert; if ((UChndl_in = LYCharSet_UC[charset_in].UChndl) < 0) return ucCannotConvert; if (!UCInfo[UChndl_in].num_uni) return ucCannotConvert; if ((UChndl_out = LYCharSet_UC[charset_out].UChndl) < 0) { if (LYCharSet_UC[charset_out].codepage < 0) return LYCharSet_UC[charset_out].codepage; if ((UChndl_out = default_UChndl) < 0) return ucCannotOutput; isdefault = 1; } else { isdefault = UCInfo[UChndl_out].replacedesc.isdefault; trydefault = UCInfo[UChndl_out].replacedesc.trydefault; } if ((Gn = UCInfo[UChndl_in].GN) < 0) { Gn = UC_MapGN(UChndl_in, !chk_single_flag); upd = chk_single_flag; } UC_translate = set_translate(Gn); unicode = UC_translate[UCH(ch_in)]; if (chk_single_flag) { if (!isdefault) { ut = UCInfo[UChndl_out].unitable; if (ut == UC_current_unitable) { if (upd) set_inverse_transl(Gn); } else { src = UC_con_set_unimap(UChndl_out, 1); if (src > 0) { set_inverse_transl(Gn); } else if (src < 0) { return src; } } } src = conv_uni_to_pc(unicode, isdefault); if (src >= 32) { outbuf[0] = (char) src; outbuf[1] = '\0'; return 1; } } repl = &(UCInfo[UChndl_out].replacedesc); if (!isdefault) { if (repl != UC_current_unitable_str) { 
con_clear_unimap_str(0); (void) UC_con_set_unimap_str(repl->entry_ct, repl->entries, 0); UC_current_unitable_str = repl; } rc = conv_uni_to_str(outbuf, buflen, unicode, 0); if (rc >= 0) return (int) strlen(outbuf); } if (trydefault && chk_single_flag) { src = conv_uni_to_pc(unicode, 1); if (src >= 32) { outbuf[0] = (char) src; outbuf[1] = '\0'; return 1; } } if (isdefault || trydefault) { rc = conv_uni_to_str(outbuf, buflen, unicode, 1); if (rc >= 0) return (int) strlen(outbuf); } if (rc == ucNotFound) { if (!isdefault) rc = conv_uni_to_str(outbuf, buflen, 0xfffdL, 0); if ((rc == ucNotFound) && (isdefault || trydefault)) rc = conv_uni_to_str(outbuf, buflen, 0xfffdL, 1); if (rc >= 0) return (int) strlen(outbuf); } if (chk_single_flag && src == ucNotFound) { if (!isdefault) rc = conv_uni_to_pc(0xfffdL, 0); if ((rc == ucNotFound) && (isdefault || trydefault)) rc = conv_uni_to_pc(0xfffdL, 1); if (rc >= 32) { outbuf[0] = (char) rc; outbuf[1] = '\0'; return 1; } else if (rc <= 0) { outbuf[0] = '\0'; return rc; } return rc; } return ucNotFound; } static int UC_FindGN_byMIME(const char *UC_MIMEcharset) { int i; for (i = 0; i < 4; i++) { if (!strcmp(UC_MIMEcharset, UC_GNsetMIMEnames[i])) { return i; } } return ucError; } int UCGetRawUniMode_byLYhndl(int i) { if (i < 0) return 0; return LYCharSet_UC[i].enc; } /* * Construct a new charset name, given prefix and codepage. This introduces * potentially unchecked recursion into UCGetLYhntl_byMIME if neither the "cp" * nor "windows-" prefixes are configured, so we check it here. */ static int getLYhndl_byCP(const char *prefix, const char *codepage) { static int nested; int result = ucError; if (!nested++) { char *cptmp = NULL; StrAllocCopy(cptmp, prefix); StrAllocCat(cptmp, codepage); result = UCGetLYhndl_byMIME(cptmp); FREE(cptmp); } nested--; return result; } /* * Get Lynx internal charset handler from MIME name, * return -1 if we got NULL or did not recognize value. 
* According to RFC, MIME headers should match case-insensitively.
 */
/*
 * The registered MIME names are tried first; after that a fixed list of
 * synonyms is checked, each of which re-enters this function with the
 * canonical name (or goes through getLYhndl_byCP() for the numeric
 * "iso", "ibm", "cp-" and "windows-" families).  Synonyms for a charset
 * are only compiled in when its NO_CHARSET_* macro is false.
 *
 * Returns the charset handle (>= 0), or ucError.
 */
int UCGetLYhndl_byMIME(const char *value)
{
    int i;
    int LYhndl = -1;

    if (isEmpty(value)) {
        CTRACE((tfp,
                "UCGetLYhndl_byMIME: NULL argument instead of MIME name.\n"));
        return ucError;
    }

    /* exact (case-insensitive) match against the registered MIME names */
    for (i = 0;
         (i < MAXCHARSETS && i < LYNumCharsets &&
          LYchar_set_names[i]); i++) {
        if (LYCharSet_UC[i].MIMEname &&
            !strcasecomp(value, LYCharSet_UC[i].MIMEname)) {
            return i;
        }
    }

    /*
     * Not yet found, try synonyms. - FM
     */
#if !NO_CHARSET_utf_8
    if (!strcasecomp(value, "unicode-1-1-utf-8") ||
        !strcasecomp(value, "utf8")) {
        /*
         * Treat these as synonyms for the IANA registered name. - FM
         */
        return UCGetLYhndl_byMIME("utf-8");
    }
#endif
    /* "iso8859..." (no hyphen after "iso") is retried as "iso-8859..." */
    if (!strncasecomp(value, "iso", 3) && !StrNCmp(value + 3, "8859", 4)) {
        return getLYhndl_byCP("iso-", value + 3);
    }
    if (!strcasecomp(value, "iso-8859-8-i") ||
        !strcasecomp(value, "iso-8859-8-e")) {
        return UCGetLYhndl_byMIME("iso-8859-8");
    }
#if !NO_CHARSET_euc_jp
    if (!strcasecomp(value, "x-euc-jp") ||
        !strcasecomp(value, "eucjp")) {
        return UCGetLYhndl_byMIME("euc-jp");
    }
#endif
#if !NO_CHARSET_shift_jis
    if ((!strcasecomp(value, "x-shift-jis")) ||
        (!strcasecomp(value, "x-sjis")) ||
        (!strcasecomp(value, "pck"))) {
        return UCGetLYhndl_byMIME("shift_jis");
    }
#endif
#if !NO_CHARSET_euc_kr
    if (!strcasecomp(value, "iso-2022-kr")) {
        return UCGetLYhndl_byMIME("euc-kr");
    }
#endif
#if !NO_CHARSET_euc_cn
    if (!strcasecomp(value, "gb2312") ||
        !strncasecomp(value, "cn-gb", 5) ||
        !strcasecomp(value, "iso-2022-cn")) {
        return UCGetLYhndl_byMIME("euc-cn");
    }
#endif
#if !NO_CHARSET_big5
    if (!strcasecomp(value, "cn-big5")) {
        return UCGetLYhndl_byMIME("big5");
    }
#endif
#if !NO_CHARSET_macintosh
    if (!strcasecomp(value, "x-mac-roman") ||
        !strcasecomp(value, "mac-roman")) {
        return UCGetLYhndl_byMIME("macintosh");
    }
#endif
#if !NO_CHARSET_next
    if (!strcasecomp(value, "x-next") ||
        !strcasecomp(value, "nextstep") ||
        !strcasecomp(value, "x-nextstep")) {
        return UCGetLYhndl_byMIME("next");
    }
#endif
#if !NO_CHARSET_windows_1252
    if (!strcasecomp(value, "iso-8859-1-windows-3.1-latin-1") ||
        !strcasecomp(value, "cp1252") ||
        !strcasecomp(value, "cp-1252") ||
        !strcasecomp(value, "ibm1252") ||
        !strcasecomp(value, "iso-8859-1-windows-3.0-latin-1")) {
        /*
         * Treat these as synonyms for windows-1252, which is more
         * commonly used than the IANA registered name. - FM
         */
        return UCGetLYhndl_byMIME("windows-1252");
    }
#endif
#if !NO_CHARSET_windows_1251
    if (!strcasecomp(value, "ansi-1251")) {
        return UCGetLYhndl_byMIME("windows-1251");
    }
#endif
#if !NO_CHARSET_windows_1250
    if (!strcasecomp(value, "iso-8859-2-windows-latin-2") ||
        !strcasecomp(value, "cp1250") ||
        !strcasecomp(value, "cp-1250") ||
        !strcasecomp(value, "ibm1250")) {
        /*
         * Treat these as synonyms for windows-1250. - FM
         */
        return UCGetLYhndl_byMIME("windows-1250");
    }
#endif
    /*
     * The isdigit() tests are evaluated left to right, so they stop at
     * the terminating NUL of a short name.
     */
    if ((!strncasecomp(value, "ibm", 3) ||
         !strncasecomp(value, "cp-", 3)) &&
        isdigit(UCH(value[3])) &&
        isdigit(UCH(value[4])) &&
        isdigit(UCH(value[5]))) {
        /*
         * For "ibmNNN<...>" or "cp-NNN", try "cpNNN<...>"
         * if not yet found. - KW & FM
         */
        if ((LYhndl = getLYhndl_byCP("cp", value + 3)) >= 0)
            return LYhndl;
        /*
         * Try windows-NNN<...> if not yet found. - FM
         */
        return getLYhndl_byCP("windows-", value + 3);
    }
    if (!strncasecomp(value, "windows-", 8) &&
        isdigit(UCH(value[8])) &&
        isdigit(UCH(value[9])) &&
        isdigit(UCH(value[10]))) {
        /*
         * For "windows-NNN<...>", try "cpNNN<...>" - FM
         */
        return getLYhndl_byCP("cp", value + 8);
    }
#if !NO_CHARSET_koi8_r
    if (!strcasecomp(value, "koi-8")) {         /* accentsoft bugosity */
        return UCGetLYhndl_byMIME("koi8-r");
    }
#endif
    if (!strcasecomp(value, "ANSI_X3.4-1968")) {
        return US_ASCII;
    }
    /* no more synonyms if come here... */
    CTRACE((tfp, "UCGetLYhndl_byMIME: unrecognized MIME name \"%s\"\n", value));
    return ucError;             /* returns -1 if no charset found by that MIME name */
}

/*
 * Function UC_setup_LYCharSets_repl() tries to set up a subtable in
 * LYCharSets[] appropriate for this new charset, for compatibility with the
 * "old method". 
Maybe not nice (maybe not even necessary any more), but it
 * works (as far as it goes..).
 *
 * We try to be conservative and only allocate new memory for this if needed.
 * If not needed, just point to SevenBitApproximations[i]. [Could do the same
 * for ISO_Latin1[] if it's identical to that, but would make it even *more*
 * messy than it already is...] This the only function in this file that knows,
 * or cares, about the HTMLDTD or details of LYCharSets[] subtables (and
 * therefore somewhat violates the idea that this file should be independent of
 * those). As in other places, we rely on ISO_Latin1 being the *first* table
 * in LYCharSets. - KW
 */

/*
 * We need to remember which ones were allocated and which are static.
 */
static STRING2PTR remember_allocated_LYCharSets[MAXCHARSETS];

/*
 * Mark every slot of remember_allocated_LYCharSets[] as "not allocated".
 */
static void UCreset_allocated_LYCharSets(void)
{
    int i = 0;

    for (; i < MAXCHARSETS; i++) {
        remember_allocated_LYCharSets[i] = NULL;
    }
}

#ifdef LY_FIND_LEAKS
/*
 * Release every table recorded in remember_allocated_LYCharSets[].
 */
static void UCfree_allocated_LYCharSets(void)
{
    int i = 0;

    for (; i < MAXCHARSETS; i++) {
        if (remember_allocated_LYCharSets[i] != NULL) {
            FREE(remember_allocated_LYCharSets[i]);
        }
    }
}
#endif

/*
 * UC_charset_in_hndl  index into UCInfo[] of the charset being set up.
 * lowest8             lowest code of that charset that may be used as an
 *                     eightbit replacement character.
 *
 * Returns a newly malloc'ed table with HTML_dtd.number_of_entities
 * string pointers, or NULL if an allocation failed or if the table
 * would not differ from SevenBitApproximations[].
 */
static STRING2PTR UC_setup_LYCharSets_repl(int UC_charset_in_hndl,
                                           unsigned lowest8)
{
    STRING2PTR ISO_Latin1 = LYCharSets[0];
    const char **p;
    char **prepl;
    const u16 *pp;
    const char **tp;
    const char *s7;
    const char *s8;
    size_t i;
    int j, changed;
    u16 k;
    u8 *ti;

    /*
     * Create a temporary table for reverse lookup of latin1 codes:
     */
    /* both tables have 96 slots, indexed by (code - 160) for 160..255 */
    tp = (const char **) malloc(96 * sizeof(char *));
    if (!tp)
        return NULL;
    for (i = 0; i < 96; i++)
        tp[i] = NULL;
    ti = (u8 *) malloc(96 * sizeof(u8));
    if (!ti) {
        FREE(tp);
        return NULL;
    }
    for (i = 0; i < 96; i++)
        ti[i] = 0;

    pp = UCInfo[UC_charset_in_hndl].unitable;

    /*
     * Determine if we have any mapping of a Unicode in the range 160-255
     * to an allowed code point > 0x80 in our new charset...
     * Store any mappings found in ti[].
     */
    if (UCInfo[UC_charset_in_hndl].num_uni > 0) {
        for (i = 0; i < 256; i++) {
            if ((j = UCInfo[UC_charset_in_hndl].unicount[i])) {
                /* only the first Unicode listed for code i is examined */
                if ((k = *pp) >= 160 && k < 256 && i >= lowest8) {
                    ti[k - 160] = UCH(i);
                }
                /* step over all unicount[i] entries belonging to code i */
                for (; j; j--) {
                    pp++;
                }
            }
        }
    }
    {
        u16 ct;
        struct unipair_str *list;

        /*
         * Determine if we have any mapping of a Unicode in the range
         * 160-255 to a replacement string for our new charset...
         * Store any mappings found in tp[].
         */
        ct = UCInfo[UC_charset_in_hndl].replacedesc.entry_ct;
        list = UCInfo[UC_charset_in_hndl].replacedesc.entries;
        while (ct--) {
            if ((k = list->unicode) >= 160 && k < 256) {
                tp[k - 160] = list->replace_str;
            }
            list++;
        }
    }
    /*
     * Now allocate a new table compatible with LYCharSets[]
     * and with the HTMLDTD for entities.
     * We don't know yet whether we'll keep it around.
     */
    prepl = (char **) malloc(HTML_dtd.number_of_entities * sizeof(char *));
    if (!prepl) {
        FREE(tp);
        FREE(ti);
        return 0;
    }
    p = (const char **) prepl;
    changed = 0;
    for (i = 0; i < HTML_dtd.number_of_entities; i++, p++) {
        /*
         * For each of those entities, we check what the "old method"
         * ISO_Latin1[] mapping does with them. If it is nothing we
         * want to use, just point to the SevenBitApproximations[] string.
         */
        s7 = SevenBitApproximations[i];
        s8 = ISO_Latin1[i];
        *p = s7;
        if (s8 && UCH(*s8) >= 160 && s8[1] == '\0') {
            /*
             * We have an entity that is mapped to
             * one valid eightbit latin1 char.
             */
            if (ti[UCH(*s8) - 160] >= UCH(lowest8) &&
                !(UCH(s7[0]) == ti[UCH(*s8) - 160] &&
                  s7[1] == '\0')) {
                /*
                 * ...which in turn is mapped, by our "new method",
                 * to another valid eightbit char for this new
                 * charset: either to itself...
                 */
                if (ti[UCH(*s8) - 160] == UCH(*s8)) {
                    *p = s8;
                } else {
                    /*
                     * make those 1-char strings
                     * into HTAtoms, so they will be cleaned up
                     * at exit... all for the sake of preventing
                     * memory leaks, sigh.
                     */
                    static char dummy[2];       /* one char dummy string */

                    dummy[0] = (char) ti[UCH(*s8) - 160];
                    *p = HTAtom_name(HTAtom_for(dummy));
                }
                changed = 1;
            } else if (tp[UCH(*s8) - 160] &&
                       strcmp(s7, tp[UCH(*s8) - 160])) {
                /*
                 * ...or which is mapped, by our "new method",
                 * to a replacement string for this new charset.
                 */
                *p = tp[UCH(*s8) - 160];
                changed = 1;
            }
        }
    }
    FREE(tp);
    FREE(ti);
    /* a table identical to SevenBitApproximations[] is not worth keeping */
    if (!changed) {
        FREE(prepl);
        return NULL;
    }
    return (STRING2PTR) prepl;
}

/*
 * "New method" meets "Old method" ...
 *
 * Enter the chartrans table UCInfo[s] into the LYCharSets.c lists,
 * reusing the entry whose MIME name equals UC_MIMEcharset if there is
 * one and appending a new entry otherwise.
 *
 * Returns the index of the entry, or ucError if the lists are full.
 */
static int UC_Register_with_LYCharSets(int s,
                                       const char *UC_MIMEcharset,
                                       const char *UC_LYNXcharset,
                                       int lowest_eightbit)
{
    int i, LYhndl, found;
    STRING2PTR repl;

    LYhndl = -1;
    if (LYNumCharsets == 0) {
        /*
         * Initialize here; so whoever changes
         * LYCharSets.c doesn't have to count...
         */
        for (i = 0; (i < MAXCHARSETS) && LYchar_set_names[i]; i++) {
            LYNumCharsets = i + 1;
        }
    }

    /*
     * Search by MIME name, (LYchar_set_names may differ...)
     */
    for (i = 0; i < MAXCHARSETS && LYchar_set_names[i] && LYhndl < 0; i++) {
        if (LYCharSet_UC[i].MIMEname &&
            !strcmp(UC_MIMEcharset, LYCharSet_UC[i].MIMEname)) {
            LYhndl = i;
        }
    }

    if (LYhndl < 0) {           /* not found */
        found = 0;
        if (LYNumCharsets >= MAXCHARSETS) {
            CTRACE((tfp, "UC_Register_with_LYCharSets: Too many. Ignoring %s/%s.",
                    UC_MIMEcharset, UC_LYNXcharset));
            return ucError;
        }
        /*
         * Add to LYCharSets.c lists.
         */
        LYhndl = LYNumCharsets;
        LYNumCharsets++;
        LYlowest_eightbit[LYhndl] = 999;
        LYCharSets[LYhndl] = SevenBitApproximations;
        /*
         * Hmm, try to be conservative here.
         */
        LYchar_set_names[LYhndl] = UC_LYNXcharset;
        LYchar_set_names[LYhndl + 1] = NULL;
        /*
         * Terminating NULL may be looked for by Lynx code.
         */
    } else {
        found = 1;
    }
    LYCharSet_UC[LYhndl].UChndl = s;
    /*
     * Can we just copy the pointer? Hope so... 
*/ LYCharSet_UC[LYhndl].MIMEname = UC_MIMEcharset; LYCharSet_UC[LYhndl].enc = UCInfo[s].enc; LYCharSet_UC[LYhndl].codepage = UCInfo[s].codepage; /* * @@@ We really SHOULD get more info from the table files, * and set relevant flags in the LYCharSet_UC[] entry with * that info... For now, let's try it without. - KW */ if (lowest_eightbit < LYlowest_eightbit[LYhndl]) { LYlowest_eightbit[LYhndl] = lowest_eightbit; } else if (lowest_eightbit > LYlowest_eightbit[LYhndl]) { UCInfo[s].lowest_eight = LYlowest_eightbit[LYhndl]; } if (!found && LYhndl > 0) { repl = UC_setup_LYCharSets_repl(s, (unsigned) UCInfo[s].lowest_eight); if (repl) { LYCharSets[LYhndl] = repl; /* * Remember to FREE at exit. */ remember_allocated_LYCharSets[LYhndl] = repl; } } return LYhndl; } /* * This only sets up the structure - no initialization of the tables * is done here yet. */ void UC_Charset_Setup(const char *UC_MIMEcharset, const char *UC_LYNXcharset, const u8 * unicount, const u16 * unitable, int nnuni, struct unimapdesc_str replacedesc, int lowest_eight, int UC_rawuni, int codepage) { int s, Gn; int i, status = 0, found; /* * Get (new?) slot. */ found = -1; for (i = 0; i < UCNumCharsets && found < 0; i++) { if (!strcmp(UCInfo[i].MIMEname, UC_MIMEcharset)) { found = i; } } if (found >= 0) { s = found; } else { if (UCNumCharsets >= MAXCHARSETS) { CTRACE((tfp, "UC_Charset_Setup: Too many. 
Ignoring %s/%s.", UC_MIMEcharset, UC_LYNXcharset)); return; } s = UCNumCharsets; UCInfo[s].MIMEname = UC_MIMEcharset; } UCInfo[s].LYNXname = UC_LYNXcharset; UCInfo[s].unicount = unicount; UCInfo[s].unitable = unitable; UCInfo[s].num_uni = nnuni; UCInfo[s].replacedesc = replacedesc; if (replacedesc.isdefault) { default_UChndl = s; } Gn = UC_FindGN_byMIME(UC_MIMEcharset); if (Gn >= 0) UC_GNhandles[Gn] = s; UCInfo[s].GN = Gn; if (UC_rawuni == UCT_ENC_UTF8) lowest_eight = 128; /* cheat here */ UCInfo[s].lowest_eight = lowest_eight; UCInfo[s].enc = UC_rawuni; UCInfo[s].codepage = codepage; UCInfo[s].LYhndl = UC_Register_with_LYCharSets(s, UC_MIMEcharset, UC_LYNXcharset, lowest_eight); CTRACE2(TRACE_CFG, (tfp, "registered charset %d mime \"%s\" lynx \"%s\"\n", s, UC_MIMEcharset, UC_LYNXcharset)); UCInfo[s].uc_status = status; if (found < 0) UCNumCharsets++; return; } /* * UC_NoUctb_Register_with_LYCharSets, UC_Charset_NoUctb_Setup - * Alternative functions for adding character set info to the lists * kept in LYCharSets.c. * * These are for character sets without any real tables of their own. * We don't keep an entry in UCinfo[] for them. */ static int UC_NoUctb_Register_with_LYCharSets(const char *UC_MIMEcharset, const char *UC_LYNXcharset, int lowest_eightbit, int UC_rawuni, int codepage) { int i, LYhndl = -1; if (LYNumCharsets == 0) { /* * Initialize here; so whoever changes * LYCharSets.c doesn't have to count... */ for (i = 0; (i < MAXCHARSETS) && LYchar_set_names[i]; i++) { LYNumCharsets = i + 1; } } /* * Search by MIME name, (LYchar_set_names may differ...) * ignore if already present! */ for (i = 0; i < MAXCHARSETS && LYchar_set_names[i] && LYhndl < 0; i++) { if (LYCharSet_UC[i].MIMEname && !strcmp(UC_MIMEcharset, LYCharSet_UC[i].MIMEname)) { return ucError; } } /* not found */ if (LYNumCharsets >= MAXCHARSETS) { CTRACE((tfp, "UC_NoUctb_Register_with_LYCharSets: Too many. 
Ignoring %s/%s.", UC_MIMEcharset, UC_LYNXcharset)); return ucError; } /* * Add to LYCharSets.c lists. */ LYhndl = LYNumCharsets; LYNumCharsets++; LYlowest_eightbit[LYhndl] = lowest_eightbit; LYCharSets[LYhndl] = SevenBitApproximations; LYchar_set_names[LYhndl] = UC_LYNXcharset; LYchar_set_names[LYhndl + 1] = NULL; /* * Terminating NULL may be looked for by Lynx code. */ LYCharSet_UC[LYhndl].UChndl = -1; /* no corresponding UChndl ! */ LYCharSet_UC[LYhndl].MIMEname = UC_MIMEcharset; LYCharSet_UC[LYhndl].enc = UC_rawuni; LYCharSet_UC[LYhndl].codepage = codepage; /* * @@@ We really SHOULD get more info from the table files, * and set relevant flags in the LYCharSet_UC[] entry with * that info... For now, let's try it without. - KW */ return LYhndl; } /* * A wrapper for the previous function. */ static void UC_Charset_NoUctb_Setup(const char *UC_MIMEcharset, const char *UC_LYNXcharset, int trydefault, int lowest_eight, int UC_rawuni, int codepage) { int i; /* * Ignore completely if already in slot. */ for (i = 0; i < UCNumCharsets; i++) { if (!strcmp(UCInfo[i].MIMEname, UC_MIMEcharset)) { return; } } if (UC_rawuni == UCT_ENC_UTF8) lowest_eight = 128; /* cheat here */ /* 'codepage' doubles as a flag for 'do not try any table * lookup, not even default' when negative. The value will * be returned immediately by UCTrans* functions. */ if (!trydefault && codepage == 0) codepage = ucCannotOutput; /* if not already set; any negative should do. */ UC_NoUctb_Register_with_LYCharSets(UC_MIMEcharset, UC_LYNXcharset, lowest_eight, UC_rawuni, codepage); return; } #ifdef LY_FIND_LEAKS static void UCcleanup_mem(void) { int i; UCfree_allocated_LYCharSets(); con_clear_unimap_str(0); con_clear_unimap_str(1); con_clear_unimap(0); con_clear_unimap(1); for (i = 1; i < 4; i++) { /* first one is static! 
*/
        FREE(inverse_translations[i]);
    }
}
#endif /* LY_FIND_LEAKS */

#ifdef EXP_CHARTRANS_AUTOSWITCH
#ifdef CAN_AUTODETECT_DISPLAY_CHARSET
# ifdef __EMX__
/*
 * Register an "AutoDetect" alias for codepage cp and return its Lynx
 * charset handle.
 *
 * The alias gets the MIME name "auto-cpNNN" ("auto2-cpNNN" when 'other'
 * is nonzero) and the Lynx name "AutoDetect (cpNNN)" ("AutoDetect-2
 * (cpNNN)"); its tables are those of the existing charset whose MIME
 * name is "cpNNN".  If the alias is already registered its handle is
 * simply looked up.  Returns ucError when no "cpNNN" charset exists.
 */
static int CpOrdinal(const unsigned UCode_t cp, const int other)
{
    char lyName[80];
    char myMimeName[80];
    char *mimeName, *mName = NULL, *lName = NULL;
    int s, i, exists = 0, ret;

    CTRACE((tfp, "CpOrdinal(cp=%lu, other=%d).\n", cp, other));
    sprintf(myMimeName, "auto%s-cp%lu", (other ? "2" : ""), cp);
    /* skip the "auto-" (5 chars) or "auto2-" (6 chars) prefix: "cpNNN" */
    mimeName = myMimeName + 5 + (other != 0);
    sprintf(lyName, "AutoDetect%s (cp%lu)", (other ? "-2" : ""), cp);
    /* Find slot. */
    s = -1;
    for (i = 0; i < UCNumCharsets; i++) {
        if (!strcmp(UCInfo[i].LYNXname, lyName))
            return UCGetLYhndl_byMIME(myMimeName);
        else if (!strcasecomp(UCInfo[i].MIMEname, mimeName))
            s = i;
    }
    if (s < 0)
        return ucError;
    /* Store the "real" charset info */
    real_charsets[other != 0] = UCGetLYhndl_byMIME(mimeName);
    /* Duplicate the record. */
    /* the copied names are handed to UC_Charset_Setup(), which stores the pointers */
    StrAllocCopy(mName, myMimeName);
    StrAllocCopy(lName, lyName);
    UC_Charset_Setup(mName, lName,
                     UCInfo[s].unicount, UCInfo[s].unitable,
                     UCInfo[s].num_uni, UCInfo[s].replacedesc,
                     UCInfo[s].lowest_eight, UCInfo[s].enc,
                     UCInfo[s].codepage);
    ret = UCGetLYhndl_byMIME(myMimeName);
    CTRACE((tfp, "Found %i.\n", ret));
    return ret;
}
# endif /* __EMX__ */
#endif /* CAN_AUTODETECT_DISPLAY_CHARSET */
#endif /* EXP_CHARTRANS_AUTOSWITCH */

/*
 * One-time initialization of the chartrans engine: reset the allocation
 * bookkeeping, initialize the console maps, register every configured
 * charset and finally cache the handles of the most frequently used
 * charsets in LATIN1, US_ASCII, UTF8_handle and TRANSPARENT.
 */
void UCInit(void)
{
    UCreset_allocated_LYCharSets();
#ifdef LY_FIND_LEAKS
    atexit(UCcleanup_mem);
#endif
    UCconsole_map_init();

    /*
     * The order of charset names visible in Lynx Options menu correspond to
     * the order of lines below, except the first two described in LYCharSet.c
     *
     * Entries whose comment is marked with *** are declared in UCdomap.h,
     * others are based on the included tables - UCdomap.c, near the top.
     */

    UC_CHARSET_SETUP_iso_8859_1;        /* ISO Latin 1 */
    UC_CHARSET_SETUP_iso_8859_15;       /* ISO 8859-15 (Latin 9) */
    UC_CHARSET_SETUP_cp850;             /* DosLatin1 (cp850) */
    UC_CHARSET_SETUP_windows_1252;      /* WinLatin1 (cp1252) */
    UC_CHARSET_SETUP_cp437;             /* DosLatinUS (cp437) */

    UC_CHARSET_SETUP_dec_mcs;           /* DEC Multinational */
    UC_CHARSET_SETUP_macintosh;         /* Macintosh (8 bit) */
    UC_CHARSET_SETUP_next;              /* NeXT character set */
    UC_CHARSET_SETUP_hp_roman8;         /* HP Roman8 */

    UC_CHARSET_SETUP_euc_cn;            /*** Chinese */
    UC_CHARSET_SETUP_euc_jp;            /*** Japanese (EUC_JP) */
    UC_CHARSET_SETUP_shift_jis;         /*** Japanese (Shift_JIS) */
    UC_CHARSET_SETUP_euc_kr;            /*** Korean */
    UC_CHARSET_SETUP_big5;              /*** Taipei (Big5) */

    UC_CHARSET_SETUP_viscii;            /* Vietnamese (VISCII) */
    UC_CHARSET_SETUP;                   /* us-ascii */ /* 7 bit approximations */

    UC_CHARSET_SETUP_x_transparent;     /*** Transparent */

    UC_CHARSET_SETUP_iso_8859_2;        /* ISO Latin 2 */
    UC_CHARSET_SETUP_cp852;             /* DosLatin2 (cp852) */
    UC_CHARSET_SETUP_windows_1250;      /* WinLatin2 (cp1250) */
    UC_CHARSET_SETUP_iso_8859_3;        /* ISO Latin 3 */
    UC_CHARSET_SETUP_iso_8859_4;        /* ISO Latin 4 */
    UC_CHARSET_SETUP_iso_8859_13;       /* ISO 8859-13 Baltic Rim */
    UC_CHARSET_SETUP_cp775;             /* DosBaltRim (cp775) */
    UC_CHARSET_SETUP_windows_1257;      /* WinBaltRim (cp1257) */
    UC_CHARSET_SETUP_iso_8859_5;        /* ISO 8859-5 Cyrillic */
    UC_CHARSET_SETUP_cp866;             /* DosCyrillic (cp866) */
    UC_CHARSET_SETUP_windows_1251;      /* WinCyrillic (cp1251) */
    UC_CHARSET_SETUP_koi8_r;            /* KOI8-R Cyrillic */
    UC_CHARSET_SETUP_iso_8859_6;        /* ISO 8859-6 Arabic */
    UC_CHARSET_SETUP_cp864;             /* DosArabic (cp864) */
    UC_CHARSET_SETUP_windows_1256;      /* WinArabic (cp1256) */
    UC_CHARSET_SETUP_iso_8859_14;       /* ISO 8859-14 Celtic */
    UC_CHARSET_SETUP_iso_8859_7;        /* ISO 8859-7 Greek */
    UC_CHARSET_SETUP_cp737;             /* DosGreek (cp737) */
    UC_CHARSET_SETUP_cp869;             /* DosGreek2 (cp869) */
    UC_CHARSET_SETUP_windows_1253;      /* WinGreek (cp1253) */
    UC_CHARSET_SETUP_iso_8859_8;        /* ISO 8859-8 Hebrew */
    UC_CHARSET_SETUP_cp862;             /* DosHebrew (cp862) */
    UC_CHARSET_SETUP_windows_1255;      /* WinHebrew (cp1255) */
    UC_CHARSET_SETUP_iso_8859_9;        /* ISO 8859-9 (Latin 5) */
    UC_CHARSET_SETUP_cp857;             /* DosTurkish (cp857) */
    UC_CHARSET_SETUP_iso_8859_10;       /* ISO 8859-10 North European */

    UC_CHARSET_SETUP_utf_8;             /*** UNICODE UTF-8 */
    UC_CHARSET_SETUP_mnemonic_ascii_0;  /* RFC 1345 w/o Intro */
    UC_CHARSET_SETUP_mnemonic;          /* RFC 1345 Mnemonic */
    UC_CHARSET_SETUP_cp866u;            /* Ukrainian Cyrillic (866) */
    UC_CHARSET_SETUP_koi8_u;            /* Ukrainian Cyrillic (koi8-u) */
    UC_CHARSET_SETUP_ptcp154;           /* Cyrillic-Asian (PT154) */

#ifdef EXP_CHARTRANS_AUTOSWITCH
#ifdef CAN_AUTODETECT_DISPLAY_CHARSET
# ifdef __EMX__
    {
        unsigned UCode_t lst[3];
        unsigned UCode_t len, rc;

        /* ask the system for its codepage list and register aliases for it */
        rc = DosQueryCp(sizeof(lst), lst, &len);
        if (rc == 0) {
            if (len >= 1)
                auto_display_charset = CpOrdinal(lst[0], 0);
# ifdef CAN_SWITCH_DISPLAY_CHARSET
            if (len >= 3) {
                codepages[0] = lst[0];
                /* the "other" codepage is the listed one that differs from lst[0] */
                codepages[1] = (lst[0] == lst[1] ? lst[2] : lst[1]);
                auto_other_display_charset = CpOrdinal(codepages[1], 1);
            }
# endif
        } else {
            CTRACE((tfp, "DosQueryCp() returned %#lx=%lu.\n", rc, rc));
        }
    }
# endif
#endif
#endif

    /*
     * To add synonyms for any charset name check function UCGetLYhndl_byMIME in
     * this file.
     */

    /* for coding/performance - easy to type: */
    LATIN1 = UCGetLYhndl_byMIME("iso-8859-1");
    US_ASCII = UCGetLYhndl_byMIME("us-ascii");
    UTF8_handle = UCGetLYhndl_byMIME("utf-8");
    TRANSPARENT = UCGetLYhndl_byMIME("x-transparent");
}

/*
 * Safe variant of UCGetLYhndl_byMIME, with blind recovery from typo in user
 * input: lynx.cfg, userdefs.h, command line switches.
 *
 * Returns the handle found for 'value', or LATIN1 when the lookup
 * returned -1.
 */
int safeUCGetLYhndl_byMIME(const char *value)
{
    int i = UCGetLYhndl_byMIME(value);

    if (i == -1) {              /* was user's typo or not yet recognized value */
        i = LATIN1;             /* error recovery? 
*/
        CTRACE((tfp, "safeUCGetLYhndl_byMIME: ISO-8859-1 assumed.\n"));
    }

    return (i);
}

#ifdef USE_LOCALE_CHARSET

#if defined(USE_LOCALE_CHARSET) && !defined(HAVE_LANGINFO_CODESET)
/*
 * This is a quick-and-dirty emulator of the nl_langinfo(CODESET)
 * function defined in the Single Unix Specification for those systems
 * (FreeBSD, etc.) that don't have one yet. It behaves as if it had
 * been called after setlocale(LC_CTYPE, ""), that is it looks at
 * the locale environment variables.
 *
 * http://www.opengroup.org/onlinepubs/7908799/xsh/langinfo.h.html
 *
 * Please extend it as needed and suggest improvements to the author.
 * This emulator will hopefully become redundant soon as
 * nl_langinfo(CODESET) becomes more widely implemented.
 *
 * Since the proposed Li18nux encoding name registry is still not mature,
 * the output follows the MIME registry where possible:
 *
 * http://www.iana.org/assignments/character-sets
 *
 * A possible autoconf test for the availability of nl_langinfo(CODESET)
 * can be found in
 *
 * http://www.cl.cam.ac.uk/~mgk25/unicode.html#activate
 *
 * Markus.Kuhn@cl.cam.ac.uk -- 2002-03-11
 * Permission to use, copy, modify, and distribute this software
 * for any purpose and without fee is hereby granted. The author
 * disclaims all warranties with regard to this software.
 *
 * Latest version:
 *
 * http://www.cl.cam.ac.uk/~mgk25/ucs/langinfo.c
 */

/* #include "langinfo.h" */

/* local stand-ins for the declarations <langinfo.h> would provide */
typedef int nl_item;

/* the only item the emulated nl_langinfo() answers */
#define CODESET 1

#define C_CODESET "US-ASCII"    /* Return this as the encoding of the
                                 * C/POSIX locale. Could as well one day
                                 * become "UTF-8". 
*/

/* true for an ASCII decimal digit; note that x is evaluated twice */
#define digit(x) ((x) >= '0' && (x) <= '9')

/* holds the "ISO-8859-N" / "ISO-8859-NN" name built by nl_langinfo() */
static char buf[16];

/*
 * Emulated nl_langinfo(): guess the character encoding from the first of
 * the environment variables LC_ALL, LC_CTYPE and LANG that is set.
 *
 * Returns NULL for any item other than CODESET.  Otherwise returns an
 * encoding name: C_CODESET if none of the variables is set or the locale
 * is "C"/"POSIX", else the first match of the substring tests below
 * (their order matters), else "ISO-8859-1".  The result is either a
 * string literal or the static buffer 'buf', so it must not be freed and
 * a later call may overwrite it.
 */
static char *nl_langinfo(nl_item item)
{
    char *l, *p;

    if (item != CODESET)
        return NULL;

    if (((l = LYGetEnv("LC_ALL")) != 0) ||
        ((l = LYGetEnv("LC_CTYPE")) != 0) ||
        ((l = LYGetEnv("LANG")) != 0)) {
        /* check standardized locales */
        if (!strcmp(l, "C") || !strcmp(l, "POSIX"))
            return C_CODESET;
        /* check for encoding name fragment */
        if (strstr(l, "UTF") || strstr(l, "utf"))
            return "UTF-8";
        if ((p = strstr(l, "8859-"))) {
            /*
             * The literal is 12 bytes including its implicit terminator:
             * buf[9] and buf[10] take up to two digits and buf[11] stays
             * NUL.  Without a digit after "8859-" the checks below run.
             */
            memcpy(buf, "ISO-8859-\0\0", 12);
            p += 5;
            if (digit(*p)) {
                buf[9] = *p++;
                if (digit(*p))
                    buf[10] = *p++;
                return buf;
            }
        }
        if (strstr(l, "KOI8-R"))
            return "KOI8-R";
        if (strstr(l, "KOI8-U"))
            return "KOI8-U";
        if (strstr(l, "620"))
            return "TIS-620";
        if (strstr(l, "2312"))
            return "GB2312";
        if (strstr(l, "HKSCS"))
            return "Big5HKSCS"; /* no MIME charset */
        if (strstr(l, "Big5") || strstr(l, "BIG5"))
            return "Big5";
        if (strstr(l, "GBK"))
            return "GBK";       /* no MIME charset */
        if (strstr(l, "18030"))
            return "GB18030";   /* no MIME charset */
        if (strstr(l, "Shift_JIS") || strstr(l, "SJIS"))
            return "Shift_JIS";
        /* check for conclusive modifier */
        if (strstr(l, "euro"))
            return "ISO-8859-15";
        /* check for language (and perhaps country) codes */
        if (strstr(l, "zh_TW"))
            return "Big5";
        if (strstr(l, "zh_HK"))
            return "Big5HKSCS"; /* no MIME charset */
        if (strstr(l, "zh"))
            return "GB2312";
        if (strstr(l, "ja"))
            return "EUC-JP";
        if (strstr(l, "ko"))
            return "EUC-KR";
        if (strstr(l, "ru"))
            return "KOI8-R";
        if (strstr(l, "uk"))
            return "KOI8-U";
        if (strstr(l, "pl") || strstr(l, "hr") ||
            strstr(l, "hu") || strstr(l, "cs") ||
            strstr(l, "sk") || strstr(l, "sl"))
            return "ISO-8859-2";
        if (strstr(l, "eo") || strstr(l, "mt"))
            return "ISO-8859-3";
        if (strstr(l, "el"))
            return "ISO-8859-7";
        if (strstr(l, "he"))
            return "ISO-8859-8";
        if (strstr(l, "tr"))
            return "ISO-8859-9";
        if (strstr(l, "th"))
            return "TIS-620";   /* or ISO-8859-11 */
        if (strstr(l, "lt"))
            return "ISO-8859-13";
        if (strstr(l, "cy"))
            return "ISO-8859-14";
        if (strstr(l, "ro"))
            return "ISO-8859-2";        /* or ISO-8859-16 */
        if (strstr(l, "am") || strstr(l, "vi"))
            return "UTF-8";
        /* Send me further rules if you like, but don't forget that we are
         * *only* interested in locale naming conventions on platforms
         * that do not already provide an nl_langinfo(CODESET) implementation. */
        return "ISO-8859-1";    /* should perhaps be "UTF-8" instead */
    }
    return C_CODESET;
}
#endif /* defined(USE_LOCALE_CHARSET) && !defined(HAVE_LANGINFO_CODESET) */

/*
 * If LYLocaleCharset is true, use the current locale to lookup a MIME name
 * that corresponds, and use that as the display charset. This feature is
 * experimental because while nl_langinfo(CODESET) itself is standardized,
 * the return values and their relationship to the locale value is not.
 * GNU libiconv happens to give useful values, but other implementations are
 * not guaranteed to do this.
 *
 * Not all Linux versions provide useful information. GNU libc 2.2 returns
 * "ANSI_X3.4-1968"
 * whether locale is POSIX or en_US.UTF-8.
 *
 * Another possible thing to investigate is the locale_charset() function
 * provided in libiconv 1.5.1.
 */
void LYFindLocaleCharset(void)
{
    char *name;

    CTRACE((tfp, "LYFindLocaleCharset(%d)\n", LYLocaleCharset));
    name = nl_langinfo(CODESET);

    if (name != 0) {
        int value = UCGetLYhndl_byMIME(name);

        if (value >= 0) {
            /* the line-drawing charset follows the locale unconditionally */
            linedrawing_char_set = value;
            CTRACE((tfp, "Found name \"%s\" -> %d\n", name, value));
            /*
             * If no locale was set, we will get the POSIX character set, which
             * in Lynx is treated as US-ASCII. However, Lynx's longstanding
             * behavior has been to default to ISO-8859-1. So we treat that
             * encoding specially. Otherwise, if LOCALE_CHARSET is set, then
             * we will use the locale encoding -- unless overridden by the
             * ASSUME_CHARSET value and/or command-line option. 
*/
            if (LYLocaleCharset) {
                CTRACE((tfp, "...prior LocaleCharset '%s'\n",
                        NonNull(UCAssume_MIMEcharset)));
                if (value == US_ASCII) {
                    CTRACE((tfp, "...prefer existing charset to ASCII\n"));
                } else if (assumed_charset) {
                    CTRACE((tfp, "...already assumed-charset\n"));
                } else {
                    /* adopt the locale charset for display and as the assumed charset */
                    current_char_set = linedrawing_char_set;
                    UCLYhndl_for_unspec = current_char_set;
                    StrAllocCopy(UCAssume_MIMEcharset, name);
                    CTRACE((tfp, "...using LocaleCharset '%s'\n",
                            NonNull(UCAssume_MIMEcharset)));
                }
            }
        } else {
            CTRACE((tfp, "Cannot find a handle for MIME name \"%s\"\n", name));
        }
    } else {
        CTRACE((tfp, "Cannot find a MIME name for locale\n"));
    }
}
#endif /* USE_LOCALE_CHARSET */

/*
 * Parse 'source' as a character code, hexadecimal if isHex is true and
 * decimal otherwise.
 *
 * On success the value is stored in *target and TRUE is returned.
 * Otherwise *target is left at 0 and FALSE is returned: that happens
 * when the value is negative, no digits were consumed, anything follows
 * the number, or the number is out of range (detected through errno
 * where ERANGE/LONG_MAX/LONG_MIN are available, and by limiting the
 * number of consumed characters otherwise).
 */
BOOL UCScanCode(UCode_t *target, const char *source, BOOL isHex)
{
    BOOL status = FALSE;
    long lcode;
    char *endptr;

    /* errno must be cleared first so that ERANGE can be attributed to strtol */
    errno = 0;
    *target = 0;
    lcode = strtol(source, &endptr, isHex ? 16 : 10);
    if (lcode >= 0
        && (endptr > source)
#if defined(ERANGE) && defined(LONG_MAX) && defined(LONG_MIN)
        && (errno != ERANGE
            || (lcode != LONG_MAX
                && lcode != LONG_MIN))
#else
        && (endptr - source) < (isHex ? 8 : 10)
#endif
        && (endptr != 0)
        && (*endptr == '\0')) {
        *target = (UCode_t) lcode;
        status = TRUE;
    }
    return status;
}