diff options
Diffstat (limited to 'camel/camel-iconv.c')
-rw-r--r-- | camel/camel-iconv.c | 161 |
1 files changed, 156 insertions, 5 deletions
diff --git a/camel/camel-iconv.c b/camel/camel-iconv.c
index e2798a16e1..439adf2df2 100644
--- a/camel/camel-iconv.c
+++ b/camel/camel-iconv.c
@@ -27,6 +27,7 @@
 
 #include <glib.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
 #include <errno.h>
 
@@ -47,6 +48,73 @@ struct _iconv_cache_bucket {
 };
 
 
+/* a useful website on charset aliases:
+ * http://www.li18nux.org/subgroups/sa/locnameguide/v1.1draft/CodesetAliasTable-V11.html */
+
+struct {
+	char *charset;
+	char *iconv_name;
+} known_iconv_charsets[] = {
+#if 0
+	/* charset name, iconv-friendly charset name */
+	{ "iso-8859-1", "iso-8859-1" },
+	{ "iso8859-1", "iso-8859-1" },
+	/* the above mostly serves as an example for iso-style charsets,
+	   but we have code that will populate the iso-*'s if/when they
+	   show up in camel_iconv_charset_name() so I'm
+	   not going to bother putting them all in here... */
+	{ "windows-cp1251", "cp1251" },
+	{ "windows-1251", "cp1251" },
+	{ "cp1251", "cp1251" },
+	/* the above mostly serves as an example for windows-style
+	   charsets, but we have code that will parse and convert them
+	   to their cp#### equivalents if/when they show up in
+	   camel_iconv_charset_name() so I'm not going to bother
+	   putting them all in here either... */
+#endif
+	/* charset name (lowercase!), iconv-friendly name (sometimes case sensitive) */
+	{ "utf-8", "UTF-8" },
+	{ "utf8", "UTF-8" },
+
+	/* 10646 is a special case, it's usually UCS-2 big endian */
+	/* This might need some checking but should be ok for solaris/linux */
+	{ "iso-10646-1", "UCS-2BE" },
+	{ "iso_10646-1", "UCS-2BE" },
+	{ "iso10646-1", "UCS-2BE" },
+	{ "iso-10646", "UCS-2BE" },
+	{ "iso_10646", "UCS-2BE" },
+	{ "iso10646", "UCS-2BE" },
+
+	/* "ks_c_5601-1987" seems to be the most common of this lot */
+	{ "ks_c_5601-1987", "EUC-KR" },
+	{ "5601", "EUC-KR" },
+	{ "ksc-5601", "EUC-KR" },
+	{ "ksc-5601-1987", "EUC-KR" },
+	{ "ksc-5601_1987", "EUC-KR" },
+
+	/* FIXME: Japanese/Korean/Chinese stuff needs checking */
+	{ "euckr-0", "EUC-KR" },
+	/* "5601" is already mapped to EUC-KR above; each key must appear only once */
+	{ "big5-0", "BIG5" },
+	{ "big5.eten-0", "BIG5" },
+	{ "big5hkscs-0", "BIG5-HKSCS" },
+	{ "gb2312-0", "gb2312" },
+	{ "gb2312.1980-0", "gb2312" },
+	{ "euc-cn", "gb2312" },
+	{ "gb18030-0", "gb18030" },
+	{ "gbk-0", "GBK" },
+
+	{ "eucjp-0", "eucJP" }, /* should this map to "EUC-JP" instead? */
+	{ "ujis-0", "ujis" }, /* we might want to map this to EUC-JP */
+	{ "jisx0208.1983-0", "SJIS" },
+	{ "jisx0212.1990-0", "SJIS" },
+	{ "pck", "SJIS" },
+	{ NULL, NULL }
+};
+
+
+static GHashTable *iconv_charsets;
+
 static EMemChunk *cache_chunk;
 static struct _iconv_cache_bucket *iconv_cache_buckets;
 static GHashTable *iconv_cache;
@@ -55,11 +123,16 @@ static unsigned int iconv_cache_size = 0;
 
 #ifdef G_THREADS_ENABLED
 static GStaticMutex iconv_cache_lock = G_STATIC_MUTEX_INIT;
+static GStaticMutex iconv_charset_lock = G_STATIC_MUTEX_INIT;
 #define ICONV_CACHE_LOCK() g_static_mutex_lock (&iconv_cache_lock)
 #define ICONV_CACHE_UNLOCK() g_static_mutex_unlock (&iconv_cache_lock)
+#define ICONV_CHARSET_LOCK() g_static_mutex_lock (&iconv_charset_lock)
+#define ICONV_CHARSET_UNLOCK() g_static_mutex_unlock (&iconv_charset_lock)
 #else
 #define ICONV_CACHE_LOCK()
 #define ICONV_CACHE_UNLOCK()
+#define ICONV_CHARSET_LOCK()
+#define ICONV_CHARSET_UNLOCK()
 #endif /* G_THREADS_ENABLED */
 
 
@@ -158,11 +231,21 @@ iconv_cache_expire_unused (void)
 }
 
 
+static void
+iconv_charset_free (char *name, char *iname, gpointer user_data)
+{
+	g_free (name);
+	g_free (iname);
+}
+
 void
 camel_iconv_shutdown (void)
 {
 	struct _iconv_cache_bucket *bucket, *next;
 
+	g_hash_table_foreach (iconv_charsets, (GHFunc) iconv_charset_free, NULL);
+	g_hash_table_destroy (iconv_charsets);
+
 	bucket = iconv_cache_buckets;
 	while (bucket) {
 		next = bucket->next;
@@ -191,10 +274,22 @@ void
 camel_iconv_init (void)
 {
 	static int initialized = FALSE;
+	char *from, *to;
+	int i;
 
 	if (initialized)
 		return;
 
+	iconv_charsets = g_hash_table_new (g_str_hash, g_str_equal);
+
+	for (i = 0; known_iconv_charsets[i].charset != NULL; i++) {
+		/* keys are stored lowercased; g_ascii_strdown() returns a new string */
+		from = g_ascii_strdown (known_iconv_charsets[i].charset, -1);
+		to = g_strdup (known_iconv_charsets[i].iconv_name);
+
+		g_hash_table_insert (iconv_charsets, from, to);
+	}
+
 	iconv_cache_buckets = NULL;
 	iconv_cache = g_hash_table_new (g_str_hash, g_str_equal);
 	iconv_open_hash = g_hash_table_new (g_direct_hash, g_direct_equal);
@@ -206,6 +301,62 @@ camel_iconv_init (void)
 
 
 /**
+ * camel_iconv_charset_name:
+ * @charset: charset name
+ *
+ * Maps charset names to the names that glib's g_iconv_open() is more
+ * likely able to handle.
+ *
+ * Returns an iconv-friendly name for @charset.
+ **/
+const char *
+camel_iconv_charset_name (const char *charset)
+{
+	char *name, *iname, *tmp;
+
+	if (charset == NULL)
+		return NULL;
+
+	name = g_alloca (strlen (charset) + 1);
+	for (tmp = strcpy (name, charset); *tmp != '\0'; tmp++)
+		*tmp = g_ascii_tolower (*tmp); /* in place; g_ascii_strdown() would return a copy */
+
+	ICONV_CHARSET_LOCK ();
+	if ((iname = g_hash_table_lookup (iconv_charsets, name)) != NULL) {
+		ICONV_CHARSET_UNLOCK ();
+		return iname;
+	}
+
+	/* Unknown, try to convert some basic charset types to something that should work */
+	if (!strncmp (name, "iso", 3)) {
+		/* camel_charset_canonical_name() can handle this case */
+		ICONV_CHARSET_UNLOCK ();
+		return camel_charset_canonical_name (charset);
+	} else if (strncmp (name, "windows-", 8) == 0) {
+		/* Convert windows-#### or windows-cp#### to cp#### */
+		tmp = name + 8;
+		if (!strncmp (tmp, "cp", 2))
+			tmp += 2;
+		iname = g_strdup_printf ("CP%s", tmp);
+	} else if (strncmp (name, "microsoft-", 10) == 0) {
+		/* Convert microsoft-#### or microsoft-cp#### to cp#### */
+		tmp = name + 10;
+		if (!strncmp (tmp, "cp", 2))
+			tmp += 2;
+		iname = g_strdup_printf ("CP%s", tmp);
+	} else {
+		/* Just assume it's ok enough as is, case and all - let g_iconv_open() handle this */
+		iname = g_strdup (charset);
+	}
+
+	g_hash_table_insert (iconv_charsets, g_strdup (name), iname);
+	ICONV_CHARSET_UNLOCK ();
+
+	return iname;
+}
+
+
+/**
  * camel_iconv_open:
  * @to: charset to convert to
  * @from: charset to convert from
@@ -235,11 +386,11 @@ camel_iconv_open (const char *to, const char *from)
 		from = camel_charset_locale_name ();
 
 	/* Even tho g_iconv_open will find the appropriate charset
-	 * format(s) for the to/from charset strings, we still convert
-	 * them to their canonical format here so that our key is in a
-	 * standard format */
-	from = camel_charset_canonical_name (from);
-	to = camel_charset_canonical_name (to);
+	 * format(s) for the to/from charset strings (hahaha, yea
+	 * right), we still convert them to their canonical format
+	 * here so that our key is in a standard format */
+	from = camel_iconv_charset_name (from);
+	to = camel_iconv_charset_name (to);
 
 	key = g_alloca (strlen (from) + strlen (to) + 2);
 	sprintf (key, "%s:%s", from, to);