diff options
Diffstat (limited to 'camel/camel-charset-map.c')
-rw-r--r-- | camel/camel-charset-map.c | 447 |
1 files changed, 0 insertions, 447 deletions
diff --git a/camel/camel-charset-map.c b/camel/camel-charset-map.c deleted file mode 100644 index 02ea31a44c..0000000000 --- a/camel/camel-charset-map.c +++ /dev/null @@ -1,447 +0,0 @@ -/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8; -*- */ - -/* - * Authors: - * Michael Zucchi <notzed@ximian.com> - * Dan Winship <danw@ximian.com> - * - * Copyright 2000, 2001 Ximian, Inc. (www.ximian.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 - * USA - */ - -#ifdef HAVE_CONFIG_H -#include <config.h> -#endif - -#include <stdio.h> - -/* - if you want to build the charset map, compile this with something like: - gcc -DBUILD_MAP camel-charset-map.c `glib-config --cflags` - (plus any -I/-L/-l flags you need for iconv), then run it as - ./a.out > camel-charset-map-private.h - - Note that the big-endian variant isn't tested... - - The tables generated work like this: - - An indirect array for each page of unicode character - Each array element has an indirect pointer to one of the bytes of - the generated bitmask. 
-*/ - -#ifdef BUILD_MAP -#include <iconv.h> -#include <glib.h> - -static struct { - char *name; - unsigned int bit; /* assigned bit */ -} tables[] = { - /* These are the 8bit character sets (other than iso-8859-1, - * which is special-cased) which are supported by both other - * mailers and the GNOME environment. Note that the order - * they're listed in is the order they'll be tried in, so put - * the more-popular ones first. - */ - { "iso-8859-2", 0 }, /* Central/Eastern European */ - { "iso-8859-4", 0 }, /* Baltic */ - { "koi8-r", 0 }, /* Russian */ - { "windows-1251", 0 }, /* Russian */ - { "koi8-u", 0 }, /* Ukranian */ - { "iso-8859-5", 0 }, /* Least-popular Russian encoding */ - { "iso-8859-7", 0 }, /* Greek */ - { "iso-8859-9", 0 }, /* Turkish */ - { "iso-8859-13", 0 }, /* Baltic again */ - { "iso-8859-15", 0 }, /* New-and-improved iso-8859-1, but most - * programs that support this support UTF8 - */ - { 0, 0 } -}; - -unsigned int encoding_map[256 * 256]; - -#if G_BYTE_ORDER == G_BIG_ENDIAN -#define UCS "UCS-4BE" -#else -#define UCS "UCS-4LE" -#endif - -void main(void) -{ - int i, j; - int max, min; - int bit = 0x01; - int k; - int bytes; - iconv_t cd; - char in[128]; - guint32 out[128]; - char *inptr, *outptr; - size_t inlen, outlen; - - /* dont count the terminator */ - bytes = ((sizeof(tables)/sizeof(tables[0]))+7-1)/8; - - for (i = 0; i < 128; i++) - in[i] = i + 128; - - for (j = 0; tables[j].name; j++) { - cd = iconv_open (UCS, tables[j].name); - inptr = in; - outptr = (char *)(out); - inlen = sizeof (in); - outlen = sizeof (out); - while (iconv (cd, &inptr, &inlen, &outptr, &outlen) == -1) { - if (errno == EILSEQ) { - inptr++; - inlen--; - } else { - printf ("%s\n", strerror (errno)); - exit (1); - } - } - iconv_close (cd); - - for (i = 0; i < 128 - outlen / 4; i++) { - encoding_map[i] |= bit; - encoding_map[out[i]] |= bit; - } - - tables[j].bit = bit; - bit <<= 1; - } - - printf("/* This file is automatically generated: DO NOT EDIT */\n\n"); - - for 
(i=0;i<256;i++) { - /* first, do we need this block? */ - for (k=0;k<bytes;k++) { - for (j=0;j<256;j++) { - if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0) - break; - } - if (j < 256) { - /* yes, dump it */ - printf("static unsigned char m%02x%x[256] = {\n\t", i, k); - for (j=0;j<256;j++) { - printf("0x%02x, ", (encoding_map[i*256+j] >> (k*8)) & 0xff ); - if (((j+1)&7) == 0 && j<255) - printf("\n\t"); - } - printf("\n};\n\n"); - } - } - } - - printf("struct {\n"); - for (k=0;k<bytes;k++) { - printf("\tunsigned char *bits%d;\n", k); - } - printf("} camel_charmap[256] = {\n\t"); - for (i=0;i<256;i++) { - /* first, do we need this block? */ - printf("{ "); - for (k=0;k<bytes;k++) { - for (j=0;j<256;j++) { - if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0) - break; - } - if (j < 256) { - printf("m%02x%x, ", i, k); - } else { - printf("0, "); - } - } - printf("}, "); - if (((i+1)&7) == 0 && i<255) - printf("\n\t"); - } - printf("\n};\n\n"); - - printf("struct {\n\tconst char *name;\n\tunsigned int bit;\n} camel_charinfo[] = {\n"); - for (j=0;tables[j].name;j++) { - printf("\t{ \"%s\", 0x%04x },\n", tables[j].name, tables[j].bit); - } - printf("};\n\n"); - - printf("#define charset_mask(x) \\\n"); - for (k=0;k<bytes;k++) { - if (k!=0) - printf("\t| "); - else - printf("\t"); - printf("(camel_charmap[(x)>>8].bits%d?camel_charmap[(x)>>8].bits%d[(x)&0xff]<<%d:0)", k, k, k*8); - if (k<bytes-1) - printf("\t\\\n"); - } - printf("\n\n"); - -} - -#else - -#include "camel-charset-map.h" -#include "camel-charset-map-private.h" -#include "hash-table-utils.h" -#include <gal/unicode/gunicode.h> -#include <locale.h> -#include <string.h> -#include <ctype.h> -#include <glib.h> -#ifdef ENABLE_THREADS -#include <pthread.h> -#endif - - -#ifdef ENABLE_THREADS -static pthread_mutex_t iconv_charsets_lock = PTHREAD_MUTEX_INITIALIZER; -#define ICONV_CHARSETS_LOCK() pthread_mutex_lock (&iconv_charsets_lock) -#define ICONV_CHARSETS_UNLOCK() pthread_mutex_unlock (&iconv_charsets_lock) 
-#else -#define ICONV_CHARSETS_LOCK() -#define ICONV_CHARSETS_UNLOCK() -#endif /* ENABLE_THREADS */ - -static GHashTable *iconv_charsets = NULL; -static char *locale_charset = NULL; - -struct { - char *charset; - char *iconv_name; -} known_iconv_charsets[] = { - /* charset name, iconv-friendly charset name */ - { "iso-8859-1", "iso-8859-1" }, - { "iso8859-1", "iso-8859-1" }, - /* the above mostly serves as an example for iso-style charsets, - but we have code that will populate the iso-*'s if/when they - show up in camel_charset_map_to_iconv() so I'm - not going to bother putting them all in here... */ - { "windows-cp1251", "cp1251" }, - { "windows-1251", "cp1251" }, - { "cp1251", "cp1251" }, - /* the above mostly serves as an example for windows-style - charsets, but we have code that will parse and convert them - to their cp#### equivalents if/when they show up in - camel_charset_map_to_iconv() so I'm not going to bother - putting them all in here... */ - { "ks_c_5601-1987", "euc-kr" }, - { NULL, NULL } -}; - - -static void -shutdown_foreach (gpointer key, gpointer value, gpointer data) -{ - g_free (key); - g_free (value); -} - -static void -camel_charset_map_shutdown (void) -{ - g_hash_table_foreach (iconv_charsets, shutdown_foreach, NULL); - g_hash_table_destroy (iconv_charsets); - g_free (locale_charset); -} - -void -camel_charset_map_init (void) -{ - char *locale; - int i; - - if (iconv_charsets) - return; - - iconv_charsets = g_hash_table_new (g_strcase_hash, g_strcase_equal); - for (i = 0; known_iconv_charsets[i].charset != NULL; i++) { - g_hash_table_insert (iconv_charsets, g_strdup (known_iconv_charsets[i].charset), - g_strdup (known_iconv_charsets[i].iconv_name)); - } - - locale = setlocale (LC_ALL, NULL); - - if (!locale || !strcmp (locale, "C") || !strcmp (locale, "POSIX")) { - /* The locale "C" or "POSIX" is a portable locale; its - * LC_CTYPE part corresponds to the 7-bit ASCII character - * set. 
- */ - - locale_charset = NULL; - } else { - /* A locale name is typically of the form language[_terri- - * tory][.codeset][@modifier], where language is an ISO 639 - * language code, territory is an ISO 3166 country code, and - * codeset is a character set or encoding identifier like - * ISO-8859-1 or UTF-8. - */ - char *p; - int len; - - p = strchr (locale, '@'); - len = p ? (p - locale) : strlen (locale); - if ((p = strchr (locale, '.'))) { - locale_charset = g_strndup (p + 1, len - (p - locale) + 1); - g_strdown (locale_charset); - } - } - - g_atexit (camel_charset_map_shutdown); -} - -void -camel_charset_init (CamelCharset *c) -{ - c->mask = ~0; - c->level = 0; -} - -void -camel_charset_step (CamelCharset *c, const char *in, int len) -{ - register unsigned int mask; - register int level; - const char *inptr = in, *inend = in+len; - - mask = c->mask; - level = c->level; - - /* check what charset a given string will fit in */ - while (inptr < inend) { - gunichar c; - const char *newinptr; - newinptr = g_utf8_next_char(inptr); - c = g_utf8_get_char(inptr); - if (newinptr == NULL || !g_unichar_validate (c)) { - inptr++; - continue; - } - - inptr = newinptr; - if (c<=0xffff) { - mask &= charset_mask(c); - - if (c>=128 && c<256) - level = MAX(level, 1); - else if (c>=256) - level = MAX(level, 2); - } else { - mask = 0; - level = MAX(level, 2); - } - } - - c->mask = mask; - c->level = level; -} - -/* gets the best charset from the mask of chars in it */ -static const char * -camel_charset_best_mask(unsigned int mask) -{ - int i; - - for (i=0;i<sizeof(camel_charinfo)/sizeof(camel_charinfo[0]);i++) { - if (camel_charinfo[i].bit & mask) - return camel_charinfo[i].name; - } - return "UTF-8"; -} - -const char * -camel_charset_best_name (CamelCharset *charset) -{ - if (charset->level == 1) - return "ISO-8859-1"; - else if (charset->level == 2) - return camel_charset_best_mask (charset->mask); - else - return NULL; - -} - -/* finds the minimum charset for this string NULL 
means US-ASCII */ -const char * -camel_charset_best (const char *in, int len) -{ - CamelCharset charset; - - camel_charset_init (&charset); - camel_charset_step (&charset, in, len); - return camel_charset_best_name (&charset); -} - -const char * -camel_charset_locale_name (void) -{ - return locale_charset; -} - -const char * -camel_charset_to_iconv (const char *name) -{ - const char *charset; - - if (name == NULL) - return NULL; - - /* special-case hack... */ - if (!g_strcasecmp (name, "x-unknown")) - return locale_charset ? locale_charset : "iso-8859-1"; - - ICONV_CHARSETS_LOCK (); - charset = g_hash_table_lookup (iconv_charsets, name); - if (!charset) { - /* Attempt to friendlyify the charset */ - char *new_charset, *p; - int len; - - if (!g_strncasecmp (name, "iso", 3) && name[3] != '-' && name[3] != '_') { - /* Hack to convert charsets like ISO8859-1 to iconv-friendly ISO-8859-1 */ - len = strlen (name); - new_charset = g_malloc (len + 2); - memcpy (new_charset, name, 3); - new_charset[3] = '-'; - memcpy (new_charset + 4, name + 3, len - 3); - new_charset[len + 1] = '\0'; - } else if (!g_strncasecmp (name, "windows-", 8)) { - /* Convert charsets like windows-1251 and windows-cp1251 to iconv-friendly cp1251 */ - new_charset = (char *) name + 8; - if (!g_strncasecmp (new_charset, "cp", 2)) - new_charset += 2; - - for (p = new_charset; *p && isdigit ((unsigned) *p); p++); - if (*p == '\0') - new_charset = g_strdup_printf ("cp%s", new_charset); - else - new_charset = g_strdup (name); - } else { - /* *shrug* - add it to the hash table just the way it is? */ - new_charset = g_strdup (name); - } - - g_hash_table_insert (iconv_charsets, g_strdup (name), new_charset); - charset = new_charset; - } - ICONV_CHARSETS_UNLOCK (); - - return charset; -} - -#endif /* !BUILD_MAP */ - |