#include <glib.h>

#include "camel-utf8.h"

/**
 * camel_utf8_putc:
 * @ptr: in/out pointer to the output position
 * @c: the character to write
 *
 * Output a 32 bit unicode character as utf8 octets.  At most 4 octets will
 * be written to @ptr.  @ptr will be advanced to the next character position.
 *
 * No bounds checking is done; the caller must make sure at least 4 octets
 * are available at *@ptr.  Values above 0x1fffff do not fit in 4 octets and
 * are not encoded meaningfully.
 **/
void
camel_utf8_putc(unsigned char **ptr, guint32 c)
{
	unsigned char *p = *ptr;

	if (c <= 0x7f) {
		*p++ = c;
	} else if (c <= 0x7ff) {
		*p++ = 0xc0 | c >> 6;
		*p++ = 0x80 | (c & 0x3f);
	} else if (c <= 0xffff) {
		*p++ = 0xe0 | c >> 12;
		*p++ = 0x80 | ((c >> 6) & 0x3f);
		*p++ = 0x80 | (c & 0x3f);
	} else {
		/* see unicode standard 3.0, S 3.8, max 4 octets */
		*p++ = 0xf0 | c >> 18;
		*p++ = 0x80 | ((c >> 12) & 0x3f);
		*p++ = 0x80 | ((c >> 6) & 0x3f);
		*p++ = 0x80 | (c & 0x3f);
	}

	*ptr = p;
}

/**
 * camel_utf8_getc:
 * @ptr: in/out pointer to the read position
 *
 * Get a Unicode character from a utf8 stream.  @ptr will be advanced
 * to the next character position.  @ptr should point to a NUL terminated
 * array.
 *
 * Octets that cannot start a sequence (0x80-0xbf and 0xf8-0xff) are
 * silently skipped.  A multi-octet sequence that is cut short by an octet
 * which is not a continuation octet is dropped, and decoding restarts at
 * that octet.  Overlong encodings are not rejected.
 *
 * Return value: The next Unicode character, or 0 when the terminating NUL
 * is reached.  @ptr is advanced past everything that was consumed,
 * including the NUL.
 **/
guint32
camel_utf8_getc(const unsigned char **ptr)
{
	const unsigned char *p = *ptr;
	unsigned char c, r;
	guint32 v, m;

again:
	r = *p++;
loop:
	if (r < 0x80) {
		*ptr = p;
		v = r;
	} else if (r >= 0xc0 && r < 0xf8) {
		/* valid start char (2 to 4 octets) */
		v = r;
		m = 0x7f80;	/* used to mask out the length bits */
		do {
			c = *p++;
			if ((c & 0xc0) != 0x80) {
				/* truncated sequence: restart with this octet */
				r = c;
				goto loop;
			}
			v = (v << 6) | (c & 0x3f);
			/* each remaining length bit of the lead octet moves up to 0x40 */
			r <<= 1;
			m <<= 5;
		} while (r & 0x40);

		*ptr = p;

		v &= ~m;
	} else {
		/* stray continuation octet or invalid lead octet: skip it */
		goto again;
	}

	return v;
}

/**
 * g_string_append_u:
 * @out: string to append to
 * @c: the character to append
 *
 * Append @c to @out encoded as utf8.  Because the encoded octets are
 * appended as a NUL terminated string, a @c of 0 appends nothing.
 **/
void
g_string_append_u(GString *out, guint32 c)
{
	unsigned char buffer[8];
	unsigned char *p = buffer;

	camel_utf8_putc(&p, c);
	*p = 0;
	g_string_append(out, (const char *)buffer);
}

/* The modified base64 alphabet: ',' takes the place of '/' */
static const char utf7_alphabet[] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";

/* Maps an octet to its index in utf7_alphabet, 0xff if it is not in it */
static const unsigned char utf7_rank[256] = {
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x3e,0x3f,0xff,0xff,0xff,
	0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0xff,0xff,0xff,0xff,0xff,0xff,
	0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,
	0x0f,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0xff,0xff,0xff,0xff,0xff,
	0xff,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,
	0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,0x30,0x31,0x32,0x33,0xff,0xff,0xff,0xff,0xff,
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
};

/* Decoder states for camel_utf7_utf8() */
enum {
	UTF7_PLAIN = 0,		/* outside a shifted sequence */
	UTF7_SHIFT_START = 1,	/* just seen '&' */
	UTF7_BASE64 = 2		/* inside a base64 run */
};

/* Write out a high surrogate that never got its low half, if there is one. */
static void
utf7_flush_surrogate(GString *out, guint32 *high)
{
	if (*high != 0) {
		g_string_append_u(out, *high);
		*high = 0;
	}
}

/* Append one decoded utf16 code unit, joining surrogate pairs into a
 * single character.  Unpaired surrogates are written out individually. */
static void
utf7_append_utf16(GString *out, guint32 *high, guint32 unit)
{
	if (*high != 0 && unit >= 0xdc00 && unit <= 0xdfff) {
		g_string_append_u(out, 0x10000 + ((*high - 0xd800) << 10) + (unit - 0xdc00));
		*high = 0;
		return;
	}

	utf7_flush_surrogate(out, high);

	if (unit >= 0xd800 && unit <= 0xdbff)
		*high = unit;	/* hold it until we see what follows */
	else
		g_string_append_u(out, unit);
}

/**
 * camel_utf7_utf8:
 * @ptr: a NUL terminated modified utf7 string, may be %NULL
 *
 * Convert a modified utf7 string to utf8.  If the utf7 string
 * contains 8 bit characters, they are treated as iso-8859-1.
 *
 * The IMAP rules [rfc2060] are used in the utf7 encoding.  The base64
 * runs carry utf16, so surrogate pairs are combined into one character.
 *
 * Malformed input is tolerated: an '&' followed by a character that is
 * neither '-' nor in the base64 alphabet is replaced (together with that
 * character) by "&-", and a base64 run ended by any other character than
 * '-' is closed with that character copied to the output.
 *
 * Return value: The converted string, to be freed with g_free(), or %NULL
 * if @ptr is %NULL.
 **/
char *
camel_utf7_utf8(const char *ptr)
{
	const unsigned char *p = (const unsigned char *)ptr;
	unsigned int c;
	guint32 v = 0, x;
	guint32 high = 0;	/* pending high surrogate, 0 if none */
	GString *out;
	int i = 0;		/* number of undelivered bits held in v */
	int state = UTF7_PLAIN;

	if (ptr == NULL)
		return NULL;

	out = g_string_new("");
	/* The terminating NUL is run through the state machine as well, so
	 * that an unterminated shift sequence is closed off. */
	do {
		c = *p++;
		switch (state) {
		case UTF7_PLAIN:
			if (c == '&')
				state = UTF7_SHIFT_START;
			else
				g_string_append_u(out, c);
			break;
		case UTF7_SHIFT_START:
			if (c == '-') {
				/* "&-" is a literal '&' */
				g_string_append_c(out, '&');
				state = UTF7_PLAIN;
			} else if (utf7_rank[c] != 0xff) {
				v = utf7_rank[c];
				i = 6;
				state = UTF7_BASE64;
			} else {
				/* invalid */
				g_string_append(out, "&-");
				state = UTF7_PLAIN;
			}
			break;
		case UTF7_BASE64:
			if (c == '-') {
				utf7_flush_surrogate(out, &high);
				state = UTF7_PLAIN;
			} else if (utf7_rank[c] != 0xff) {
				v = (v << 6) | utf7_rank[c];
				i += 6;
				if (i >= 16) {
					x = (v >> (i - 16)) & 0xffff;
					i -= 16;
					utf7_append_utf16(out, &high, x);
				}
			} else {
				utf7_flush_surrogate(out, &high);
				g_string_append_u(out, c);
				state = UTF7_PLAIN;
			}
			break;
		default:
			break;
		}
	} while (c);

	/* hand the buffer over to the caller instead of copying it */
	return g_string_free(out, FALSE);
}

/* Terminate a base64 run: write out the i (less than 6) bits left in v,
 * padded with zero bits, followed by the closing '-'. */
static void
utf7_closeb64(GString *out, guint32 v, guint32 i)
{
	guint32 x;

	if (i > 0) {
		x = (v << (6 - i)) & 0x3f;
		g_string_append_c(out, utf7_alphabet[x]);
	}
	g_string_append_c(out, '-');
}

/* Feed one utf16 code unit into the base64 bit buffer (*v holding *i
 * pending bits) and write out every complete 6 bit group. */
static void
utf7_put_utf16(GString *out, guint32 *v, int *i, guint32 unit)
{
	*v = (*v << 16) | (unit & 0xffff);
	*i += 16;
	while (*i >= 6) {
		g_string_append_c(out, utf7_alphabet[(*v >> (*i - 6)) & 0x3f]);
		*i -= 6;
	}
}

/**
 * camel_utf8_utf7:
 * @ptr: a NUL terminated utf8 string, may be %NULL
 *
 * Convert a utf8 string to a modified utf7 format.
 *
 * The IMAP rules [rfc2060] are used in the utf7 encoding.  Characters in
 * the range 0x20 to 0x7e are copied as they are, except '&' which becomes
 * "&-".  Everything else is written as base64 encoded utf16 between '&'
 * and '-'; characters above 0xffff are written as a surrogate pair, and
 * values above 0x10ffff are replaced with U+FFFD.
 *
 * Return value: The converted string, to be freed with g_free(), or %NULL
 * if @ptr is %NULL.
 **/
char *
camel_utf8_utf7(const char *ptr)
{
	const unsigned char *p = (const unsigned char *)ptr;
	unsigned int c;
	guint32 v = 0;
	int state = 0;		/* 1 while inside a base64 run */
	GString *out;
	int i = 0;		/* number of unwritten bits held in v */

	if (ptr == NULL)
		return NULL;

	out = g_string_new("");

	while ((c = camel_utf8_getc(&p))) {
		if (c >= 0x20 && c <= 0x7e) {
			if (state == 1) {
				utf7_closeb64(out, v, i);
				state = 0;
				i = 0;
			}
			if (c == '&')
				g_string_append(out, "&-");
			else
				g_string_append_c(out, c);
		} else {
			if (state == 0) {
				g_string_append_c(out, '&');
				state = 1;
			}
			if (c > 0x10ffff)
				c = 0xfffd;	/* not representable in utf16 */
			if (c > 0xffff) {
				c -= 0x10000;
				utf7_put_utf16(out, &v, &i, 0xd800 | (c >> 10));
				utf7_put_utf16(out, &v, &i, 0xdc00 | (c & 0x3ff));
			} else {
				utf7_put_utf16(out, &v, &i, c);
			}
		}
	}

	if (state == 1)
		utf7_closeb64(out, v, i);

	/* hand the buffer over to the caller instead of copying it */
	return g_string_free(out, FALSE);
}