aboutsummaryrefslogtreecommitdiffstats
path: root/camel/camel-charset-map.c
diff options
context:
space:
mode:
Diffstat (limited to 'camel/camel-charset-map.c')
-rw-r--r--camel/camel-charset-map.c361
1 files changed, 361 insertions, 0 deletions
diff --git a/camel/camel-charset-map.c b/camel/camel-charset-map.c
new file mode 100644
index 0000000000..59f916c700
--- /dev/null
+++ b/camel/camel-charset-map.c
@@ -0,0 +1,361 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8; -*- */
+/*
+ * Authors:
+ * Michael Zucchi <notzed@ximian.com>
+ * Jeffrey Stedfast <fejj@ximian.com>
+ * Dan Winship <danw@ximian.com>
+ *
+ * Copyright 2000-2003 Ximian, Inc. (www.ximian.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ */
+
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+/*
+ if you want to build the charset map, compile this with something like:
+ gcc -DBUILD_MAP camel-charset-map.c `glib-config --cflags`
+ (plus any -I/-L/-l flags you need for iconv), then run it as
+ ./a.out > camel-charset-map-private.h
+
+ Note that the big-endian variant isn't tested...
+
+ The generated tables work like this:
+
+ There is one indirect array entry for each page (256 code points) of
+ Unicode characters. Each entry holds one pointer per byte of the
+ generated bitmask, pointing to a 256-entry table (or 0 if unused).
+*/
+
+#ifdef BUILD_MAP
+#include <iconv.h>
+#include <glib.h>
+
+/* Charsets probed by the map generator.  main() stores in `bit' the mask
+ * bit it assigned to each entry; the list ends at the entry whose name
+ * is NULL.
+ */
+static struct {
+	const char *name;	/* iconv charset name (string literal, never written to) */
+	unsigned int bit;	/* assigned bit */
+} tables[] = {
+	/* These are the 8bit character sets (other than iso-8859-1,
+	 * which is special-cased) which are supported by both other
+	 * mailers and the GNOME environment. Note that the order
+	 * they're listed in is the order they'll be tried in, so put
+	 * the more-popular ones first.
+	 */
+	{ "iso-8859-2", 0 },	/* Central/Eastern European */
+	{ "iso-8859-4", 0 },	/* Baltic */
+	{ "koi8-r", 0 },	/* Russian */
+	{ "koi8-u", 0 },	/* Ukrainian */
+	{ "iso-8859-5", 0 },	/* Least-popular Russian encoding */
+	{ "iso-8859-7", 0 },	/* Greek */
+	{ "iso-8859-8", 0 },	/* Hebrew; Visual */
+	{ "iso-8859-9", 0 },	/* Turkish */
+	{ "iso-8859-13", 0 },	/* Baltic again */
+	{ "iso-8859-15", 0 },	/* New-and-improved iso-8859-1, but most
+				 * programs that support this support UTF8
+				 */
+	{ "windows-1251", 0 },	/* Russian */
+	{ NULL, 0 }		/* terminator */
+};
+
+/* One bitmask per 16-bit code point (index = UCS-4 value, 0x0000-0xffff).
+ * main() ORs in the bit assigned to a tables[] entry for every code point
+ * it obtained while converting that charset.
+ */
+unsigned int encoding_map[256 * 256];
+
+/* iconv target encoding: UCS-4 in the host's byte order, so the converted
+ * output buffer can be read directly as an array of guint32 values.
+ */
+#if G_BYTE_ORDER == G_BIG_ENDIAN
+#define UCS "UCS-4BE"
+#else
+#define UCS "UCS-4LE"
+#endif
+
+/* Returns nonzero when at least one code point of Unicode page `page'
+ * (code points page*256 .. page*256+255) has a bit set in byte `k' of its
+ * mask, i.e. when the table m<page><k> has to be emitted.
+ */
+static int
+page_byte_used (int page, int k)
+{
+	int j;
+
+	for (j = 0; j < 256; j++) {
+		if ((encoding_map[page * 256 + j] & (0xffU << (k * 8))) != 0)
+			return 1;
+	}
+
+	return 0;
+}
+
+/* Map generator: converts the bytes 128..255 of every charset listed in
+ * tables[] to UCS-4, records which code points each charset yields in
+ * encoding_map[], and prints the resulting lookup tables, the
+ * camel_charmap / camel_charinfo arrays and the charset_mask() macro as C
+ * source on stdout.
+ *
+ * Diagnostics go to stderr because stdout is meant to be redirected into
+ * the generated header.  Returns 0 on success, exits with status 1 when
+ * iconv fails.
+ */
+int main (void)
+{
+	int i, j, k;
+	unsigned int bit = 0x01;	/* unsigned: it is shifted left once per charset */
+	int bytes;
+	iconv_t cd;
+	char in[128];
+	guint32 out[128];
+	char *inptr, *outptr;
+	size_t inlen, outlen;
+	size_t n, converted;
+
+	/* dont count the terminator */
+	bytes = ((sizeof (tables) / sizeof (tables[0])) + 7 - 1) / 8;
+
+	/* the input is every byte value with the high bit set */
+	for (i = 0; i < 128; i++)
+		in[i] = (char) (i + 128);
+
+	for (j = 0; tables[j].name; j++) {
+		cd = iconv_open (UCS, tables[j].name);
+		if (cd == (iconv_t) -1) {
+			fprintf (stderr, "%s: %s\n", tables[j].name, strerror (errno));
+			exit (1);
+		}
+
+		inptr = in;
+		outptr = (char *) out;
+		inlen = sizeof (in);
+		outlen = sizeof (out);
+		while (iconv (cd, &inptr, &inlen, &outptr, &outlen) == (size_t) -1) {
+			if (errno == EILSEQ && inlen > 0) {
+				/* byte has no mapping in this charset: skip it */
+				inptr++;
+				inlen--;
+			} else {
+				fprintf (stderr, "%s: %s\n", tables[j].name, strerror (errno));
+				exit (1);
+			}
+		}
+		iconv_close (cd);
+
+		/* The ASCII range belongs to every charset.  Mark all 128 code
+		 * points regardless of how many high bytes were converted,
+		 * otherwise a charset with unmapped bytes would lose the last
+		 * few ASCII characters.
+		 */
+		for (i = 0; i < 128; i++)
+			encoding_map[i] |= bit;
+
+		/* outlen is the space left over, so this is the number of
+		 * UCS-4 values iconv actually produced */
+		converted = (sizeof (out) - outlen) / sizeof (out[0]);
+		for (n = 0; n < converted; n++) {
+			/* only code points that fit the 16-bit map can be recorded */
+			if (out[n] < sizeof (encoding_map) / sizeof (encoding_map[0]))
+				encoding_map[out[n]] |= bit;
+		}
+
+		tables[j].bit = bit;
+		bit <<= 1;
+	}
+
+	printf("/* This file is automatically generated: DO NOT EDIT */\n\n");
+
+	/* one 256-entry table per (page, mask byte) pair that is in use */
+	for (i = 0; i < 256; i++) {
+		for (k = 0; k < bytes; k++) {
+			if (!page_byte_used (i, k))
+				continue;
+
+			printf("static unsigned char m%02x%x[256] = {\n\t", i, k);
+			for (j = 0; j < 256; j++) {
+				printf("0x%02x, ", (encoding_map[i*256+j] >> (k*8)) & 0xff );
+				if (((j+1)&7) == 0 && j<255)
+					printf("\n\t");
+			}
+			printf("\n};\n\n");
+		}
+	}
+
+	/* per-page index: pointer to the table above, or 0 when unused */
+	printf("struct {\n");
+	for (k = 0; k < bytes; k++) {
+		printf("\tunsigned char *bits%d;\n", k);
+	}
+	printf("} camel_charmap[256] = {\n\t");
+	for (i = 0; i < 256; i++) {
+		printf("{ ");
+		for (k = 0; k < bytes; k++) {
+			if (page_byte_used (i, k)) {
+				printf("m%02x%x, ", i, k);
+			} else {
+				printf("0, ");
+			}
+		}
+		printf("}, ");
+		if (((i+1)&7) == 0 && i<255)
+			printf("\n\t");
+	}
+	printf("\n};\n\n");
+
+	/* charset name / assigned bit pairs, in tables[] order */
+	printf("struct {\n\tconst char *name;\n\tunsigned int bit;\n} camel_charinfo[] = {\n");
+	for (j = 0; tables[j].name; j++) {
+		printf("\t{ \"%s\", 0x%04x },\n", tables[j].name, tables[j].bit);
+	}
+	printf("};\n\n");
+
+	/* charset_mask(x): reassembles the mask of x from the per-byte tables */
+	printf("#define charset_mask(x) \\\n");
+	for (k = 0; k < bytes; k++) {
+		if (k != 0)
+			printf("\t| ");
+		else
+			printf("\t");
+		printf("(camel_charmap[(x)>>8].bits%d?camel_charmap[(x)>>8].bits%d[(x)&0xff]<<%d:0)", k, k, k*8);
+		if (k < bytes - 1)
+			printf("\t\\\n");
+	}
+	printf("\n\n");
+
+	return 0;
+}
+
+#else
+
+#include "camel-charset-map.h"
+#include "camel-charset-map-private.h"
+
+#include <gal/util/e-iconv.h>
+
+#include <glib.h>
+#include <locale.h>
+#include <ctype.h>
+#include <pthread.h>
+#ifdef HAVE_CODESET
+#include <langinfo.h>
+#endif
+
+/**
+ * camel_charset_init:
+ * @c: charset state to reset
+ *
+ * Puts @c into its starting state: level 0 and every mask bit set.
+ **/
+void
+camel_charset_init (CamelCharset *c)
+{
+	c->level = 0;
+	c->mask = ~0U;
+}
+
+/**
+ * camel_charset_step:
+ * @c: charset state, previously set up with camel_charset_init()
+ * @in: UTF-8 text to scan
+ * @len: number of bytes of @in to scan
+ *
+ * Scans @in and narrows @c->mask to the charsets whose charset_mask()
+ * bit is set for every character seen, and raises @c->level to 1 when a
+ * character in 128..255 is seen and to 2 when a character >= 256 is
+ * seen.  Characters above 0xffff clear the mask entirely.
+ *
+ * Bytes that do not start a valid, complete UTF-8 sequence within the
+ * @len bytes given are skipped one at a time; nothing beyond @in + @len
+ * is read.
+ **/
+void
+camel_charset_step (CamelCharset *c, const char *in, int len)
+{
+	const char *inptr = in, *inend = in + len;
+	unsigned int mask = c->mask;
+	int level = c->level;
+
+	/* check what charset a given string will fit in */
+	while (inptr < inend) {
+		/* bounded decode: fails with (gunichar) -1 for an invalid
+		 * sequence and (gunichar) -2 for one cut off by inend */
+		gunichar ch = g_utf8_get_char_validated (inptr, inend - inptr);
+
+		if (ch == (gunichar) -1 || ch == (gunichar) -2) {
+			inptr++;
+			continue;
+		}
+
+		/* the sequence was validated, so this stays within inend */
+		inptr = g_utf8_next_char (inptr);
+
+		if (ch <= 0xffff) {
+			mask &= charset_mask (ch);
+
+			if (ch >= 128 && ch < 256)
+				level = MAX (level, 1);
+			else if (ch >= 256)
+				level = MAX (level, 2);
+		} else {
+			/* the generated map has no entry above 0xffff */
+			mask = 0;
+			level = MAX (level, 2);
+		}
+	}
+
+	c->mask = mask;
+	c->level = level;
+}
+
+/* Picks a charset name for a mask of candidate charsets.
+ *
+ * Walks camel_charinfo in order and returns the first entry whose bit is
+ * present in @mask and whose language is either unknown or shares its
+ * first two characters with the locale language.  Falls back to "UTF-8"
+ * when no entry qualifies.
+ */
+static const char *
+camel_charset_best_mask(unsigned int mask)
+{
+	const char *user_lang = e_iconv_locale_language ();
+	const char *charset_lang;
+	guint idx;
+
+	for (idx = 0; idx < G_N_ELEMENTS (camel_charinfo); idx++) {
+		if ((camel_charinfo[idx].bit & mask) == 0)
+			continue;
+
+		charset_lang = e_iconv_charset_language (camel_charinfo[idx].name);
+
+		/* no language tied to this charset: accept it */
+		if (charset_lang == NULL)
+			return camel_charinfo[idx].name;
+
+		/* otherwise it has to match the locale's language */
+		if (user_lang != NULL && strncmp (user_lang, charset_lang, 2) == 0)
+			return camel_charinfo[idx].name;
+	}
+
+	return "UTF-8";
+}
+
+/**
+ * camel_charset_best_name:
+ * @charset: charset state filled in by camel_charset_step()
+ *
+ * Returns "ISO-8859-1" for level 1, the result of
+ * camel_charset_best_mask() for level 2, and NULL for any other level.
+ **/
+const char *
+camel_charset_best_name (CamelCharset *charset)
+{
+	switch (charset->level) {
+	case 1:
+		return "ISO-8859-1";
+	case 2:
+		return camel_charset_best_mask (charset->mask);
+	default:
+		return NULL;
+	}
+}
+
+/* One-shot helper: runs a fresh charset state over the @len bytes at @in
+ * and returns the name of the minimum charset for that string.  A NULL
+ * result means US-ASCII.
+ */
+const char *
+camel_charset_best (const char *in, int len)
+{
+	CamelCharset state;
+
+	camel_charset_init (&state);
+	camel_charset_step (&state, in, len);
+
+	return camel_charset_best_name (&state);
+}
+
+
+/**
+ * camel_charset_iso_to_windows:
+ * @isocharset: a canonicalised ISO charset
+ *
+ * Looks @isocharset up, ignoring ASCII case, in a fixed list of
+ * ISO charset / Windows codepage pairs.
+ *
+ * Returns the equivalent Windows charset, or @isocharset itself when the
+ * list has no entry for it.
+ **/
+const char *
+camel_charset_iso_to_windows (const char *isocharset)
+{
+	/* Pairs taken from http://czyborra.com/charsets/codepages.html.
+	 *
+	 * That page gives no Windows codepage for iso-8859-3, -4, -10,
+	 * -11 and -12.  iso-8859-4 is listed here anyway on the
+	 * assumption that, being Baltic like iso-8859-13, it also maps
+	 * to windows-cp1257.
+	 */
+	static const struct {
+		const char *iso;
+		const char *windows;
+	} equivalents[] = {
+		{ "iso-8859-1",  "windows-cp1252" },
+		{ "us-ascii",    "windows-cp1252" },
+		{ "iso-8859-2",  "windows-cp1250" },
+		{ "iso-8859-4",  "windows-cp1257" },
+		{ "iso-8859-5",  "windows-cp1251" },
+		{ "iso-8859-6",  "windows-cp1256" },
+		{ "iso-8859-7",  "windows-cp1253" },
+		{ "iso-8859-8",  "windows-cp1255" },
+		{ "iso-8859-9",  "windows-cp1254" },
+		{ "iso-8859-13", "windows-cp1257" },
+		{ NULL, NULL }
+	};
+	int n;
+
+	for (n = 0; equivalents[n].iso != NULL; n++) {
+		if (g_ascii_strcasecmp (isocharset, equivalents[n].iso) == 0)
+			return equivalents[n].windows;
+	}
+
+	/* nothing known: hand the caller's name back unchanged */
+	return isocharset;
+}
+
+#endif /* !BUILD_MAP */