diff options
Diffstat (limited to 'camel/camel-charset-map.c')
-rw-r--r-- | camel/camel-charset-map.c | 257 |
1 files changed, 257 insertions, 0 deletions
diff --git a/camel/camel-charset-map.c b/camel/camel-charset-map.c new file mode 100644 index 0000000000..b6ad0a5f37 --- /dev/null +++ b/camel/camel-charset-map.c @@ -0,0 +1,257 @@ + +#include <stdio.h> + +/* + if you want to build the charset map, add the root directory of + libunicode to the include path and define BUILD_MAP, + then run it as + ./a.out > camel-charset-map-private.h + + The tables genereated work like this: + + An indirect array for each page of unicode character + Each array element has an indirect pointer to one of the bytes of + the generated bitmask. +*/ + +#ifdef BUILD_MAP +#include "iso/iso8859-2.h" +#include "iso/iso8859-3.h" +#include "iso/iso8859-4.h" +#include "iso/iso8859-5.h" +#include "iso/iso8859-6.h" +#include "iso/iso8859-7.h" +#include "iso/iso8859-8.h" +#include "iso/iso8859-9.h" +#include "iso/iso8859-10.h" +#include "iso/iso8859-14.h" +#include "iso/iso8859-15.h" +#include "iso/koi8-r.h" +#include "iso/koi8-u.h" +#include "msft/cp932.h" +#include "jis/shiftjis.h" + +static struct { + unsigned short *table; + char *name; + int type; /* type of table */ + unsigned int bit; /* assigned bit */ +} tables[] = { + { iso8859_2_table, "iso-8859-2", 0, 0} , + { iso8859_3_table, "iso-8859-3", 0, 0} , + { iso8859_4_table, "iso-8859-4", 0, 0}, + { iso8859_5_table, "iso-8859-5", 0, 0}, +/* apparently -6 has special digits? */ + { iso8859_6_table, "iso-8859-6", 0, 0}, + { iso8859_7_table, "iso-8859-7", 0, 0}, + { iso8859_8_table, "iso-8859-8", 0, 0}, + { iso8859_9_table, "iso-8859-9", 0, 0}, + { iso8859_10_table, "iso-8859-10", 0, 0}, + { iso8859_14_table, "iso-8859-14", 0, 0}, + { iso8859_15_table, "iso-8859-15", 0, 0}, + { koi8_r_table, "koi8-r", 0, 0}, + { koi8_u_table, "koi8-u", 0, 0}, + { cp932_table, "CP932", 1, 0}, + { sjis_table, "Shift-JIS", 1, 0}, + { 0, 0} +}; + +unsigned int encoding_map[256 * 256]; + +static void +add_bigmap(unsigned short **table, int bit) +{ + int i; + int j; + + for (i=0;i<256;i++) { + unsigned short *tab = table[i]; + if (tab) { + for (j=0;j<256;j++) { + if (tab[j]) + encoding_map[tab[j]] |= bit; + } + } + } +} + +main() +{ + int i, j; + unsigned short *tab; + int max, min; + int bit = 0x01; + int k; + int bytes; + +#if 0 + /* iso-latin-1 (not needed-detected in code) */ + for (i=0;i<256;i++) { + encoding_map[i] |= bit; + } + bit <<= 1; +#endif + + /* dont count the terminator */ + bytes = ((sizeof(tables)/sizeof(tables[0]))+7-1)/8; + + /* the other latin charsets */ + for (j=0;tables[j].table;j++) { + switch (tables[j].type) { + case 0: /* table from 128-256 */ + tab = tables[j].table; + for (i=0;i<128;i++) { + /* 0-127 is the common */ + encoding_map[i] |= bit; + encoding_map[tab[i]] |= bit; + } + break; + case 1: /* sparse table */ + add_bigmap(tables[j].table, bit); + break; + } + tables[j].bit = bit; + bit <<= 1; + } + + printf("/* This file is automatically generated: DO NOT EDIT */\n\n"); + + for (i=0;i<256;i++) { + /* first, do we need this block? */ + for (k=0;k<bytes;k++) { + for (j=0;j<256;j++) { + if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0) + break; + } + if (j < 256) { + /* yes, dump it */ + printf("static unsigned char m%02x%x[256] = {\n\t", i, k); + for (j=0;j<256;j++) { + printf("0x%02x, ", (encoding_map[i*256+j] >> (k*8)) & 0xff ); + if (((j+1)&7) == 0 && j<255) + printf("\n\t"); + } + printf("\n};\n\n"); + } + } + } + + printf("struct {\n"); + for (k=0;k<bytes;k++) { + printf("\tunsigned char *bits%d;\n", k); + } + printf("} camel_charmap[256] = {\n\t"); + for (i=0;i<256;i++) { + /* first, do we need this block? */ + printf("{ "); + for (k=0;k<bytes;k++) { + for (j=0;j<256;j++) { + if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0) + break; + } + if (j < 256) { + printf("m%02x%x, ", i, k); + } else { + printf("0, "); + } + } + printf("}, "); + if (((i+1)&7) == 0 && i<255) + printf("\n\t"); + } + printf("\n};\n\n"); + + printf("struct {\n\tconst char *name;\n\tunsigned int bit;\n} camel_charinfo[] = {\n"); + for (j=0;tables[j].table;j++) { + printf("\t{ \"%s\", 0x%04x },\n", tables[j].name, tables[j].bit); + } + printf("};\n\n"); + + printf("#define charset_mask(x) \\\n"); + for (k=0;k<bytes;k++) { + if (k!=0) + printf("\t| "); + else + printf("\t"); + printf("(camel_charmap[(x)>>8].bits%d?camel_charmap[(x)>>8].bits%d[(x)&0xff]<<%d:0)", k, k, k*8); + if (k<bytes-1) + printf("\t\\\n"); + } + printf("\n\n"); + +} + +#else + +#include "camel-charset-map.h" +#include "camel-charset-map-private.h" +#include <unicode.h> +#include <glib.h> + +unsigned int +camel_charset_mask(unsigned int c) +{ + if (c>0xffff) + return 0; + + return charset_mask(c); +} + +/* gets the best charset from the mask of chars in it */ +const char * +camel_charset_best_mask(unsigned int mask) +{ + int i; + + for (i=0;i<sizeof(camel_charinfo)/sizeof(camel_charinfo[0]);i++) { + if (camel_charinfo[i].bit & mask) + return camel_charinfo[i].name; + } + return "UTF-8"; +} + +/* finds the minimum charset for this string NULL means US-ASCII */ +const char * +camel_charset_best(const char *in, int len) +{ + int i; + unsigned int mask = ~0; + int level = 0; + const char *inptr = in, *inend = in+len; + + /* check what charset a given string will fit in */ + while (inptr < inend) { + unicode_char_t c; + const char *newinptr; + newinptr = unicode_get_utf8(inptr, &c); + if (newinptr == NULL) { + inptr++; + continue; + } + inptr = newinptr; + if (c<=0xffff) { + mask |= camel_charset_mask(c); + + if (c>=128 && c<256) + level = MAX(level, 1); + else if (c>=256) + level = MAX(level, 2); + } else { + mask = 0; + level = MAX(level, 2); + } + } + + switch(level) { + case 0: + return NULL; + case 1: + return "ISO-8859-1"; + case 2: + return camel_charset_best_mask(mask); + } +} + + +#endif /* !BUILD_MAP */ + |