1 files changed, 361 insertions, 0 deletions
diff --git a/camel/camel-charset-map.c b/camel/camel-charset-map.c
new file mode 100644
index 0000000000..59f916c700
--- /dev/null
+++ b/camel/camel-charset-map.c
@@ -0,0 +1,361 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8; -*- */
+/* 
+ * Authors:
+ *   Michael Zucchi <notzed@ximian.com>
+ *   Jeffrey Stedfast <fejj@ximian.com>
+ *   Dan Winship <danw@ximian.com>
+ *
+ * Copyright 2000-2003 Ximian, Inc. (www.ximian.com)
+ *
+ * This program is free software; you can redistribute it and/or 
+ * modify it under the terms of version 2 of the GNU General Public 
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ */
+
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+/*
+  if you want to build the charset map, compile this with something like:
+    gcc -DBUILD_MAP camel-charset-map.c `glib-config --cflags`
+  (plus any -I/-L/-l flags you need for iconv), then run it as 
+    ./a.out > camel-charset-map-private.h
+
+  Note that the big-endian variant isn't tested...
+
+  The generated tables work like this:
+
+   An indirect array for each page of unicode character
+   Each array element has an indirect pointer to one of the bytes of
+   the generated bitmask.
+*/
+
+#ifdef BUILD_MAP
+#include <iconv.h>
+#include <glib.h>
+
+static struct {
+	char *name;		/* charset name handed to iconv_open() by main() */
+	unsigned int bit;	/* assigned bit */
+} tables[] = {
+	/* These are the 8bit character sets (other than iso-8859-1,
+	 * which is special-cased) which are supported by both other
+	 * mailers and the GNOME environment. Note that the order
+	 * they're listed in is the order they'll be tried in, so put
+	 * the more-popular ones first.
+	 */
+	{ "iso-8859-2", 0 },	/* Central/Eastern European */
+	{ "iso-8859-4", 0 },	/* Baltic */
+	{ "koi8-r", 0 },	/* Russian */
+	{ "koi8-u", 0 },	/* Ukrainian */
+	{ "iso-8859-5", 0 },	/* Least-popular Russian encoding */
+	{ "iso-8859-7", 0 },	/* Greek */
+	{ "iso-8859-8", 0 },    /* Hebrew; Visual */
+	{ "iso-8859-9", 0 },	/* Turkish */
+	{ "iso-8859-13", 0 },	/* Baltic again */
+	{ "iso-8859-15", 0 },	/* New-and-improved iso-8859-1, but most
+				 * programs that support this support UTF8
+				 */
+	{ "windows-1251", 0 },	/* Russian */
+	{ 0, 0 }		/* terminator: the loops in main() stop at the null name */
+};
+
+unsigned int encoding_map[256 * 256];	/* indexed by code point 0x0000-0xffff: OR of the tables[] bits of the charsets containing it */
+
+#if G_BYTE_ORDER == G_BIG_ENDIAN	/* pick the UCS-4 flavour matching the host, since main() reads the iconv output as guint32 */
+#define UCS "UCS-4BE"
+#else
+#define UCS "UCS-4LE"
+#endif
+
+/* Returns non-zero if any of the 256 characters of unicode page `page'
+ * has a bit set in byte number `byte' of its encoding_map entry, i.e.
+ * whether a table has to be emitted for that page/byte pair.
+ */
+static int
+page_uses_byte (int page, int byte)
+{
+	unsigned int mask = 0xffU << (byte * 8);
+	int j;
+
+	for (j = 0; j < 256; j++) {
+		if ((encoding_map[page * 256 + j] & mask) != 0)
+			return 1;
+	}
+
+	return 0;
+}
+
+/* Generator entry point.
+ *
+ * Converts the upper half (0x80-0xff) of every charset in tables[] to
+ * UCS-4 with iconv, records in encoding_map which charsets contain each
+ * code point, and then writes the lookup tables, the camel_charmap and
+ * camel_charinfo arrays and the charset_mask() macro to stdout.
+ *
+ * Diagnostics go to stderr so they can never end up inside the
+ * generated header; the exit status is 1 on any iconv failure.
+ */
+int main (void)
+{
+	int i, j, k;
+	int bytes;
+	unsigned int bit = 0x01;
+	iconv_t cd;
+	char in[128];
+	guint32 out[128];
+	char *inptr, *outptr;
+	size_t inlen, outlen;
+	size_t n, nconv;
+
+	/* bytes of mask needed at one bit per charset; the "- 1"
+	 * keeps the terminator entry out of the count */
+	bytes = (int) (((sizeof (tables) / sizeof (tables[0])) + 7 - 1) / 8);
+
+	/* the input is simply every byte value 0x80..0xff */
+	for (i = 0; i < 128; i++)
+		in[i] = (char) (i + 128);
+
+	for (j = 0; tables[j].name; j++) {
+		cd = iconv_open (UCS, tables[j].name);
+		if (cd == (iconv_t) -1) {
+			fprintf (stderr, "iconv_open (\"%s\", \"%s\"): %s\n",
+				 UCS, tables[j].name, strerror (errno));
+			exit (1);
+		}
+
+		inptr = in;
+		outptr = (char *)(out);
+		inlen = sizeof (in);
+		outlen = sizeof (out);
+		while (iconv (cd, &inptr, &inlen, &outptr, &outlen) == (size_t) -1) {
+			if (errno == EILSEQ) {
+				/* byte is not assigned in this charset: skip it */
+				inptr++;
+				inlen--;
+			} else {
+				fprintf (stderr, "%s: %s\n", tables[j].name, strerror (errno));
+				exit (1);
+			}
+		}
+		iconv_close (cd);
+
+		/* The low half of each of these charsets is plain ASCII.
+		 * Mark all 128 of those code points regardless of how many
+		 * upper-half bytes had to be skipped above. */
+		for (i = 0; i < 128; i++)
+			encoding_map[i] |= bit;
+
+		/* number of UCS-4 characters iconv actually produced */
+		nconv = (sizeof (out) - outlen) / sizeof (out[0]);
+		for (n = 0; n < nconv; n++) {
+			/* encoding_map only covers 0x0000-0xffff */
+			if (out[n] < sizeof (encoding_map) / sizeof (encoding_map[0]))
+				encoding_map[out[n]] |= bit;
+		}
+
+		tables[j].bit = bit;
+		bit <<= 1;
+	}
+
+	printf("/* This file is automatically generated: DO NOT EDIT */\n\n");
+
+	/* one 256-entry table for each page/byte pair that has any bit set */
+	for (i = 0; i < 256; i++) {
+		for (k = 0; k < bytes; k++) {
+			if (!page_uses_byte (i, k))
+				continue;
+
+			printf("static unsigned char m%02x%x[256] = {\n\t", i, k);
+			for (j = 0; j < 256; j++) {
+				printf("0x%02x, ", (encoding_map[i*256+j] >> (k*8)) & 0xff );
+				if (((j+1)&7) == 0 && j<255)
+					printf("\n\t");
+			}
+			printf("\n};\n\n");
+		}
+	}
+
+	/* per-page index: a pointer to each emitted table, 0 where none was needed */
+	printf("struct {\n");
+	for (k = 0; k < bytes; k++) {
+		printf("\tunsigned char *bits%d;\n", k);
+	}
+	printf("} camel_charmap[256] = {\n\t");
+	for (i = 0; i < 256; i++) {
+		printf("{ ");
+		for (k = 0; k < bytes; k++) {
+			if (page_uses_byte (i, k))
+				printf("m%02x%x, ", i, k);
+			else
+				printf("0, ");
+		}
+		printf("}, ");
+		if (((i+1)&7) == 0 && i<255)
+			printf("\n\t");
+	}
+	printf("\n};\n\n");
+
+	/* charset name <-> bit assignments, in tables[] order */
+	printf("struct {\n\tconst char *name;\n\tunsigned int bit;\n} camel_charinfo[] = {\n");
+	for (j = 0; tables[j].name; j++) {
+		printf("\t{ \"%s\", 0x%04x },\n", tables[j].name, tables[j].bit);
+	}
+	printf("};\n\n");
+
+	/* charset_mask(x): reassemble the mask of x from its per-byte tables */
+	printf("#define charset_mask(x) \\\n");
+	for (k = 0; k < bytes; k++) {
+		if (k != 0)
+			printf("\t| ");
+		else
+			printf("\t");
+		printf("(camel_charmap[(x)>>8].bits%d?camel_charmap[(x)>>8].bits%d[(x)&0xff]<<%d:0)", k, k, k*8);
+		if (k < bytes-1)
+			printf("\t\\\n");
+	}
+	printf("\n\n");
+
+	return 0;
+}
+
+#else
+
+#include "camel-charset-map.h"
+#include "camel-charset-map-private.h"
+
+#include <gal/util/e-iconv.h>
+
+#include <glib.h>
+#include <locale.h>
+#include <ctype.h>
+#include <pthread.h>
+#ifdef HAVE_CODESET
+#include <langinfo.h>
+#endif
+
+/* Puts @c into its starting state: level 0 and a mask with every bit set. */
+void
+camel_charset_init (CamelCharset *c)
+{
+	c->level = 0;
+	c->mask = ~0U;
+}
+
+/* Feeds @len bytes of UTF-8 text at @in into the running state in @c.
+ *
+ * For every character that decodes cleanly:
+ *   - c->mask is ANDed with charset_mask() of the character, or cleared
+ *     to 0 when the character lies above 0xffff;
+ *   - c->level is raised to 1 for a character in 128..255 and to 2 for
+ *     anything from 256 upwards.
+ *
+ * A byte that does not start a valid, complete UTF-8 sequence inside the
+ * buffer is skipped on its own and changes neither field.
+ */
+void
+camel_charset_step (CamelCharset *c, const char *in, int len)
+{
+	const char *inptr = in, *inend = in + len;
+	unsigned int mask = c->mask;
+	int level = c->level;
+
+	/* check what charset a given string will fit in */
+	while (inptr < inend) {
+		/* Decode with an explicit bound so that a sequence cut off
+		 * by the end of the buffer is never read past inend. */
+		gunichar ch = g_utf8_get_char_validated (inptr, inend - inptr);
+
+		/* (gunichar) -1: malformed, (gunichar) -2: incomplete */
+		if (ch == (gunichar) -1 || ch == (gunichar) -2) {
+			inptr++;
+			continue;
+		}
+
+		/* the sequence was validated, so this stays within the buffer */
+		inptr = g_utf8_next_char (inptr);
+
+		if (ch <= 0xffff) {
+			mask &= charset_mask (ch);
+
+			if (ch >= 128 && ch < 256)
+				level = MAX (level, 1);
+			else if (ch >= 256)
+				level = MAX (level, 2);
+		} else {
+			/* the generated tables stop at 0xffff */
+			mask = 0;
+			level = MAX (level, 2);
+		}
+	}
+
+	c->mask = mask;
+	c->level = level;
+}
+
+/* Picks a charset name for @mask.
+ *
+ * camel_charinfo is walked in order; of the entries whose bit is set in
+ * @mask, the first one that either has no language associated with it or
+ * whose language shares its first two characters with the locale
+ * language is returned.  "UTF-8" is the answer when no entry qualifies.
+ */
+static const char *
+camel_charset_best_mask(unsigned int mask)
+{
+	const char *locale_lang = e_iconv_locale_language ();
+	int i;
+
+	for (i = 0; i < G_N_ELEMENTS (camel_charinfo); i++) {
+		const char *lang;
+
+		if ((camel_charinfo[i].bit & mask) == 0)
+			continue;
+
+		lang = e_iconv_charset_language (camel_charinfo[i].name);
+		if (lang == NULL)
+			return camel_charinfo[i].name;
+
+		if (locale_lang != NULL && strncmp (locale_lang, lang, 2) == 0)
+			return camel_charinfo[i].name;
+	}
+
+	return "UTF-8";
+}
+
+/* Maps the accumulated state in @charset to a charset name:
+ * level 1 gives "ISO-8859-1", level 2 gives whatever
+ * camel_charset_best_mask() chooses for the mask, and any other
+ * level gives NULL.
+ */
+const char *
+camel_charset_best_name (CamelCharset *charset)
+{
+	switch (charset->level) {
+	case 1:
+		return "ISO-8859-1";
+	case 2:
+		return camel_charset_best_mask (charset->mask);
+	default:
+		return NULL;
+	}
+}
+
+/* One-shot helper: runs init, a single step over the @len bytes at @in,
+ * and returns the resulting best name.  A NULL result means US-ASCII.
+ */
+const char *
+camel_charset_best (const char *in, int len)
+{
+	CamelCharset state;
+
+	camel_charset_init (&state);
+	camel_charset_step (&state, in, len);
+
+	return camel_charset_best_name (&state);
+}
+
+
+/**
+ * camel_charset_iso_to_windows:
+ * @isocharset: a canonicalised ISO charset
+ *
+ * Looks @isocharset up, ignoring ASCII case, in a fixed ISO-to-Windows
+ * table.
+ *
+ * Returns the equivalent Windows charset, or @isocharset itself when
+ * the table has no entry for it.
+ **/
+const char *
+camel_charset_iso_to_windows (const char *isocharset)
+{
+	/* According to http://czyborra.com/charsets/codepages.html,
+	 * the charset mapping is as follows:
+	 *
+	 * us-ascii    maps to windows-cp1252
+	 * iso-8859-1  maps to windows-cp1252
+	 * iso-8859-2  maps to windows-cp1250
+	 * iso-8859-3  maps to windows-cp????
+	 * iso-8859-4  maps to windows-cp????
+	 * iso-8859-5  maps to windows-cp1251
+	 * iso-8859-6  maps to windows-cp1256
+	 * iso-8859-7  maps to windows-cp1253
+	 * iso-8859-8  maps to windows-cp1255
+	 * iso-8859-9  maps to windows-cp1254
+	 * iso-8859-10 maps to windows-cp????
+	 * iso-8859-11 maps to windows-cp????
+	 * iso-8859-12 maps to windows-cp????
+	 * iso-8859-13 maps to windows-cp1257
+	 *
+	 * Assumptions:
+	 *  - iso-8859-4 is Baltic just like iso-8859-13, so it is
+	 *    assumed to map to windows-cp1257 as well.
+	 */
+	static const struct {
+		const char *iso;
+		const char *windows;
+	} map[] = {
+		/* entries are tried top to bottom */
+		{ "iso-8859-1",  "windows-cp1252" },
+		{ "us-ascii",    "windows-cp1252" },
+		{ "iso-8859-2",  "windows-cp1250" },
+		{ "iso-8859-4",  "windows-cp1257" },
+		{ "iso-8859-5",  "windows-cp1251" },
+		{ "iso-8859-6",  "windows-cp1256" },
+		{ "iso-8859-7",  "windows-cp1253" },
+		{ "iso-8859-8",  "windows-cp1255" },
+		{ "iso-8859-9",  "windows-cp1254" },
+		{ "iso-8859-13", "windows-cp1257" }
+	};
+	unsigned int i;
+
+	for (i = 0; i < G_N_ELEMENTS (map); i++) {
+		if (g_ascii_strcasecmp (isocharset, map[i].iso) == 0)
+			return map[i].windows;
+	}
+
+	/* no known Windows counterpart */
+	return isocharset;
+}
+
+#endif /* !BUILD_MAP */