diff options
Diffstat (limited to 'camel/camel-charset-map.c')
-rw-r--r-- | camel/camel-charset-map.c | 361 |
1 files changed, 361 insertions, 0 deletions
diff --git a/camel/camel-charset-map.c b/camel/camel-charset-map.c new file mode 100644 index 0000000000..59f916c700 --- /dev/null +++ b/camel/camel-charset-map.c @@ -0,0 +1,361 @@ +/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8; -*- */ +/* + * Authors: + * Michael Zucchi <notzed@ximian.com> + * Jeffrey Stedfast <fejj@ximian.com> + * Dan Winship <danw@ximian.com> + * + * Copyright 2000-2003 Ximian, Inc. (www.ximian.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + */ + + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> + +/* + if you want to build the charset map, compile this with something like: + gcc -DBUILD_MAP camel-charset-map.c `glib-config --cflags` + (plus any -I/-L/-l flags you need for iconv), then run it as + ./a.out > camel-charset-map-private.h + + Note that the big-endian variant isn't tested... + + The tables genereated work like this: + + An indirect array for each page of unicode character + Each array element has an indirect pointer to one of the bytes of + the generated bitmask. +*/ + +#ifdef BUILD_MAP +#include <iconv.h> +#include <glib.h> + +static struct { + char *name; + unsigned int bit; /* assigned bit */ +} tables[] = { + /* These are the 8bit character sets (other than iso-8859-1, + * which is special-cased) which are supported by both other + * mailers and the GNOME environment. Note that the order + * they're listed in is the order they'll be tried in, so put + * the more-popular ones first. + */ + { "iso-8859-2", 0 }, /* Central/Eastern European */ + { "iso-8859-4", 0 }, /* Baltic */ + { "koi8-r", 0 }, /* Russian */ + { "koi8-u", 0 }, /* Ukranian */ + { "iso-8859-5", 0 }, /* Least-popular Russian encoding */ + { "iso-8859-7", 0 }, /* Greek */ + { "iso-8859-8", 0 }, /* Hebrew; Visual */ + { "iso-8859-9", 0 }, /* Turkish */ + { "iso-8859-13", 0 }, /* Baltic again */ + { "iso-8859-15", 0 }, /* New-and-improved iso-8859-1, but most + * programs that support this support UTF8 + */ + { "windows-1251", 0 }, /* Russian */ + { 0, 0 } +}; + +unsigned int encoding_map[256 * 256]; + +#if G_BYTE_ORDER == G_BIG_ENDIAN +#define UCS "UCS-4BE" +#else +#define UCS "UCS-4LE" +#endif + +int main (void) +{ + int i, j; + int max, min; + int bit = 0x01; + int k; + int bytes; + iconv_t cd; + char in[128]; + guint32 out[128]; + char *inptr, *outptr; + size_t inlen, outlen; + + /* dont count the terminator */ + bytes = ((sizeof(tables)/sizeof(tables[0]))+7-1)/8; + + for (i = 0; i < 128; i++) + in[i] = i + 128; + + for (j = 0; tables[j].name; j++) { + cd = iconv_open (UCS, tables[j].name); + inptr = in; + outptr = (char *)(out); + inlen = sizeof (in); + outlen = sizeof (out); + while (iconv (cd, &inptr, &inlen, &outptr, &outlen) == -1) { + if (errno == EILSEQ) { + inptr++; + inlen--; + } else { + printf ("%s\n", strerror (errno)); + exit (1); + } + } + iconv_close (cd); + + for (i = 0; i < 128 - outlen / 4; i++) { + encoding_map[i] |= bit; + encoding_map[out[i]] |= bit; + } + + tables[j].bit = bit; + bit <<= 1; + } + + printf("/* This file is automatically generated: DO NOT EDIT */\n\n"); + + for (i=0;i<256;i++) { + /* first, do we need this block? */ + for (k=0;k<bytes;k++) { + for (j=0;j<256;j++) { + if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0) + break; + } + if (j < 256) { + /* yes, dump it */ + printf("static unsigned char m%02x%x[256] = {\n\t", i, k); + for (j=0;j<256;j++) { + printf("0x%02x, ", (encoding_map[i*256+j] >> (k*8)) & 0xff ); + if (((j+1)&7) == 0 && j<255) + printf("\n\t"); + } + printf("\n};\n\n"); + } + } + } + + printf("struct {\n"); + for (k=0;k<bytes;k++) { + printf("\tunsigned char *bits%d;\n", k); + } + printf("} camel_charmap[256] = {\n\t"); + for (i=0;i<256;i++) { + /* first, do we need this block? */ + printf("{ "); + for (k=0;k<bytes;k++) { + for (j=0;j<256;j++) { + if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0) + break; + } + if (j < 256) { + printf("m%02x%x, ", i, k); + } else { + printf("0, "); + } + } + printf("}, "); + if (((i+1)&7) == 0 && i<255) + printf("\n\t"); + } + printf("\n};\n\n"); + + printf("struct {\n\tconst char *name;\n\tunsigned int bit;\n} camel_charinfo[] = {\n"); + for (j=0;tables[j].name;j++) { + printf("\t{ \"%s\", 0x%04x },\n", tables[j].name, tables[j].bit); + } + printf("};\n\n"); + + printf("#define charset_mask(x) \\\n"); + for (k=0;k<bytes;k++) { + if (k!=0) + printf("\t| "); + else + printf("\t"); + printf("(camel_charmap[(x)>>8].bits%d?camel_charmap[(x)>>8].bits%d[(x)&0xff]<<%d:0)", k, k, k*8); + if (k<bytes-1) + printf("\t\\\n"); + } + printf("\n\n"); + + return 0; +} + +#else + +#include "camel-charset-map.h" +#include "camel-charset-map-private.h" + +#include <gal/util/e-iconv.h> + +#include <glib.h> +#include <locale.h> +#include <ctype.h> +#include <pthread.h> +#ifdef HAVE_CODESET +#include <langinfo.h> +#endif + +void +camel_charset_init (CamelCharset *c) +{ + c->mask = (unsigned int) ~0; + c->level = 0; +} + +void +camel_charset_step (CamelCharset *c, const char *in, int len) +{ + register unsigned int mask; + register int level; + const char *inptr = in, *inend = in+len; + + mask = c->mask; + level = c->level; + + /* check what charset a given string will fit in */ + while (inptr < inend) { + gunichar c; + const char *newinptr; + newinptr = g_utf8_next_char(inptr); + c = g_utf8_get_char(inptr); + if (newinptr == NULL || !g_unichar_validate (c)) { + inptr++; + continue; + } + + inptr = newinptr; + if (c<=0xffff) { + mask &= charset_mask(c); + + if (c>=128 && c<256) + level = MAX(level, 1); + else if (c>=256) + level = MAX(level, 2); + } else { + mask = 0; + level = MAX(level, 2); + } + } + + c->mask = mask; + c->level = level; +} + +/* gets the best charset from the mask of chars in it */ +static const char * +camel_charset_best_mask(unsigned int mask) +{ + const char *locale_lang, *lang; + int i; + + locale_lang = e_iconv_locale_language (); + for (i = 0; i < G_N_ELEMENTS (camel_charinfo); i++) { + if (camel_charinfo[i].bit & mask) { + lang = e_iconv_charset_language (camel_charinfo[i].name); + + if (!lang || (locale_lang && !strncmp (locale_lang, lang, 2))) + return camel_charinfo[i].name; + } + } + + return "UTF-8"; +} + +const char * +camel_charset_best_name (CamelCharset *charset) +{ + if (charset->level == 1) + return "ISO-8859-1"; + else if (charset->level == 2) + return camel_charset_best_mask (charset->mask); + else + return NULL; + +} + +/* finds the minimum charset for this string NULL means US-ASCII */ +const char * +camel_charset_best (const char *in, int len) +{ + CamelCharset charset; + + camel_charset_init (&charset); + camel_charset_step (&charset, in, len); + return camel_charset_best_name (&charset); +} + + +/** + * camel_charset_iso_to_windows: + * @isocharset: a canonicalised ISO charset + * + * Returns the equivalent Windows charset. + **/ +const char * +camel_charset_iso_to_windows (const char *isocharset) +{ + /* According to http://czyborra.com/charsets/codepages.html, + * the charset mapping is as follows: + * + * us-ascii maps to windows-cp1252 + * iso-8859-1 maps to windows-cp1252 + * iso-8859-2 maps to windows-cp1250 + * iso-8859-3 maps to windows-cp???? + * iso-8859-4 maps to windows-cp???? + * iso-8859-5 maps to windows-cp1251 + * iso-8859-6 maps to windows-cp1256 + * iso-8859-7 maps to windows-cp1253 + * iso-8859-8 maps to windows-cp1255 + * iso-8859-9 maps to windows-cp1254 + * iso-8859-10 maps to windows-cp???? + * iso-8859-11 maps to windows-cp???? + * iso-8859-12 maps to windows-cp???? + * iso-8859-13 maps to windows-cp1257 + * + * Assumptions: + * - I'm going to assume that since iso-8859-4 and + * iso-8859-13 are Baltic that it also maps to + * windows-cp1257. + */ + + if (!g_ascii_strcasecmp (isocharset, "iso-8859-1") || !g_ascii_strcasecmp (isocharset, "us-ascii")) + return "windows-cp1252"; + else if (!g_ascii_strcasecmp (isocharset, "iso-8859-2")) + return "windows-cp1250"; + else if (!g_ascii_strcasecmp (isocharset, "iso-8859-4")) + return "windows-cp1257"; + else if (!g_ascii_strcasecmp (isocharset, "iso-8859-5")) + return "windows-cp1251"; + else if (!g_ascii_strcasecmp (isocharset, "iso-8859-6")) + return "windows-cp1256"; + else if (!g_ascii_strcasecmp (isocharset, "iso-8859-7")) + return "windows-cp1253"; + else if (!g_ascii_strcasecmp (isocharset, "iso-8859-8")) + return "windows-cp1255"; + else if (!g_ascii_strcasecmp (isocharset, "iso-8859-9")) + return "windows-cp1254"; + else if (!g_ascii_strcasecmp (isocharset, "iso-8859-13")) + return "windows-cp1257"; + + return isocharset; +} + +#endif /* !BUILD_MAP */ |