aboutsummaryrefslogtreecommitdiffstats
path: root/camel/camel-mime-part-utils.c
diff options
context:
space:
mode:
Diffstat (limited to 'camel/camel-mime-part-utils.c')
-rw-r--r--camel/camel-mime-part-utils.c361
1 files changed, 346 insertions, 15 deletions
diff --git a/camel/camel-mime-part-utils.c b/camel/camel-mime-part-utils.c
index af4d7c1161..800f23372e 100644
--- a/camel/camel-mime-part-utils.c
+++ b/camel/camel-mime-part-utils.c
@@ -5,7 +5,7 @@
* Michael Zucchi <notzed@ximian.com>
* Jeffrey Stedfast <fejj@ximian.com>
*
- * Copyright 1999-2003 Ximian, Inc. (www.ximian.com)
+ * Copyright 1999, 2000 Ximian, Inc. (www.ximian.com)
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -34,6 +34,7 @@
#include <gal/util/e-iconv.h>
+#include "camel-string-utils.h"
#include "camel-charset-map.h"
#include "camel-mime-part-utils.h"
#include "camel-mime-message.h"
@@ -53,21 +54,278 @@
#define d(x) /*(printf("%s(%d): ", __FILE__, __LINE__),(x))
#include <stdio.h>*/
+/* example: <META http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"> */
+
+static const char *
+check_html_charset(char *buffer, int length)
+{
+ CamelHTMLParser *hp;
+ const char *charset = NULL;
+ camel_html_parser_t state;
+ struct _header_content_type *ct;
+
+ /* if we need to first base64/qp decode, do this here, sigh */
+ hp = camel_html_parser_new();
+ camel_html_parser_set_data(hp, buffer, length, TRUE);
+
+ do {
+ const char *data;
+ int len;
+ const char *val;
+
+ state = camel_html_parser_step(hp, &data, &len);
+
+ /* example: <META http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"> */
+
+ switch(state) {
+ case CAMEL_HTML_PARSER_ELEMENT:
+ val = camel_html_parser_tag(hp);
+ d(printf("Got tag: %s\n", val));
+ if (strcasecmp(val, "meta") == 0
+ && (val = camel_html_parser_attr(hp, "http-equiv"))
+ && strcasecmp(val, "content-type") == 0
+ && (val = camel_html_parser_attr(hp, "content"))
+ && (ct = header_content_type_decode(val))) {
+ charset = header_content_type_param(ct, "charset");
+ charset = e_iconv_charset_name (charset);
+ header_content_type_unref(ct);
+ }
+ break;
+ default:
+ /* ignore everything else */
+ break;
+ }
+ } while (charset == NULL && state != CAMEL_HTML_PARSER_EOF);
+
+ camel_object_unref (hp);
+
+ return charset;
+}
+
+static GByteArray *
+convert_buffer (GByteArray *in, const char *to, const char *from)
+{
+ size_t inleft, outleft, outlen, converted = 0;
+ GByteArray *out = NULL;
+ const char *inbuf;
+ char *outbuf;
+ iconv_t cd;
+
+ if (in->len == 0)
+ return g_byte_array_new();
+
+ d(printf("converting buffer from %s to %s:\n", from, to));
+ d(fwrite(in->data, 1, (int)in->len, stdout));
+ d(printf("\n"));
+
+ cd = e_iconv_open(to, from);
+ if (cd == (iconv_t) -1) {
+ g_warning ("Cannot convert from '%s' to '%s': %s", from, to, strerror (errno));
+ return NULL;
+ }
+
+ outlen = in->len * 2 + 16;
+ out = g_byte_array_new ();
+ g_byte_array_set_size (out, outlen);
+
+ inbuf = in->data;
+ inleft = in->len;
+
+ do {
+ outbuf = out->data + converted;
+ outleft = outlen - converted;
+
+ converted = e_iconv (cd, &inbuf, &inleft, &outbuf, &outleft);
+ if (converted == (size_t) -1) {
+ if (errno != E2BIG && errno != EINVAL)
+ goto fail;
+ }
+
+ /*
+ * E2BIG There is not sufficient room at *outbuf.
+ *
+ * We just need to grow our outbuffer and try again.
+ */
+
+ converted = outbuf - (char *)out->data;
+ if (errno == E2BIG) {
+ outlen += inleft * 2 + 16;
+ out = g_byte_array_set_size (out, outlen);
+ outbuf = out->data + converted;
+ }
+
+ } while (errno == E2BIG && inleft > 0);
+
+ /*
+ * EINVAL An incomplete multibyte sequence has been encoun­
+ * tered in the input.
+ *
+ * We'll just have to ignore it...
+ */
+
+ /* flush the iconv conversion */
+ e_iconv (cd, NULL, NULL, &outbuf, &outleft);
+
+ /* now set the true length on the GByteArray */
+ converted = outbuf - (char *)out->data;
+ g_byte_array_set_size (out, converted);
+
+ d(printf("converted data:\n"));
+ d(fwrite(out->data, 1, (int)out->len, stdout));
+ d(printf("\n"));
+
+ e_iconv_close (cd);
+
+ return out;
+
+ fail:
+ g_warning ("Cannot convert from '%s' to '%s': %s", from, to, strerror (errno));
+
+ g_byte_array_free (out, TRUE);
+
+ e_iconv_close (cd);
+
+ return NULL;
+}
+
+/* We don't really use the charset argument except for debugging... */
+static gboolean
+broken_windows_charset (GByteArray *buffer, const char *charset)
+{
+ register unsigned char *inptr;
+ unsigned char *inend;
+
+ inptr = buffer->data;
+ inend = inptr + buffer->len;
+
+ while (inptr < inend) {
+ register unsigned char c = *inptr++;
+
+ if (c >= 128 && c <= 159) {
+ g_warning ("Encountered Windows charset parading as %s", charset);
+ return TRUE;
+ }
+ }
+
+ return FALSE;
+}
+
+static gboolean
+is_7bit (GByteArray *buffer)
+{
+ register unsigned int i;
+
+ for (i = 0; i < buffer->len; i++)
+ if (buffer->data[i] > 127)
+ return FALSE;
+
+ return TRUE;
+}
+
+static const char *iso_charsets[] = {
+ "us-ascii",
+ "iso-8859-1",
+ "iso-8859-2",
+ "iso-8859-3",
+ "iso-8859-4",
+ "iso-8859-5",
+ "iso-8859-6",
+ "iso-8859-7",
+ "iso-8859-8",
+ "iso-8859-9",
+ "iso-8859-10",
+ "iso-8859-11",
+ "iso-8859-12",
+ "iso-8859-13",
+ "iso-8859-14",
+ "iso-8859-15",
+ "iso-8859-16"
+};
+
+#define NUM_ISO_CHARSETS (sizeof (iso_charsets) / sizeof (iso_charsets[0]))
+
+static const char *
+canon_charset_name (const char *charset)
+{
+ const char *ptr;
+ char *endptr;
+ int iso;
+
+ if (strncasecmp (charset, "iso", 3) != 0)
+ return charset;
+
+ ptr = charset + 3;
+ if (*ptr == '-' || *ptr == '_')
+ ptr++;
+
+ /* if it's not an iso-8859-# charset, we don't care about it */
+ if (strncmp (ptr, "8859", 4) != 0)
+ return charset;
+
+ ptr += 4;
+ if (*ptr == '-' || *ptr == '_')
+ ptr++;
+
+ iso = strtoul (ptr, &endptr, 10);
+ if (endptr == ptr || *endptr != '\0')
+ return charset;
+
+ if (iso >= NUM_ISO_CHARSETS)
+ return charset;
+
+ return iso_charsets[iso];
+}
+
/* simple data wrapper */
static void
simple_data_wrapper_construct_from_parser (CamelDataWrapper *dw, CamelMimeParser *mp)
{
+ CamelMimeFilter *fdec = NULL, *fcrlf = NULL;
+ CamelMimeFilterBasicType enctype = 0;
+ size_t len;
+ int decid = -1, crlfid = -1;
struct _header_content_type *ct;
const char *charset = NULL;
char *encoding, *buf;
GByteArray *buffer;
CamelStream *mem;
- size_t len;
d(printf ("simple_data_wrapper_construct_from_parser()\n"));
/* first, work out conversion, if any, required, we dont care about what we dont know about */
encoding = header_content_encoding_decode (camel_mime_parser_header (mp, "Content-Transfer-Encoding", NULL));
+ if (encoding) {
+ if (!strcasecmp (encoding, "base64")) {
+ d(printf("Adding base64 decoder ...\n"));
+ enctype = CAMEL_MIME_FILTER_BASIC_BASE64_DEC;
+ } else if (!strcasecmp (encoding, "quoted-printable")) {
+ d(printf("Adding quoted-printable decoder ...\n"));
+ enctype = CAMEL_MIME_FILTER_BASIC_QP_DEC;
+ } else if (!strcasecmp (encoding, "x-uuencode")) {
+ d(printf("Adding uudecoder ...\n"));
+ enctype = CAMEL_MIME_FILTER_BASIC_UU_DEC;
+ }
+ g_free (encoding);
+
+ if (enctype != 0) {
+ fdec = (CamelMimeFilter *)camel_mime_filter_basic_new_type(enctype);
+ decid = camel_mime_parser_filter_add (mp, fdec);
+ }
+ }
+
+ /* If we're doing text, we also need to do CRLF->LF and may have to convert it to UTF8 as well. */
+ ct = camel_mime_parser_content_type (mp);
+ if (header_content_type_is (ct, "text", "*")) {
+ charset = header_content_type_param (ct, "charset");
+ charset = e_iconv_charset_name (charset);
+
+ if (fdec) {
+ d(printf ("Adding CRLF conversion filter\n"));
+ fcrlf = camel_mime_filter_crlf_new (CAMEL_MIME_FILTER_CRLF_DECODE,
+ CAMEL_MIME_FILTER_CRLF_MODE_CRLF_ONLY);
+ crlfid = camel_mime_parser_filter_add (mp, fcrlf);
+ }
+ }
/* read in the entire content */
buffer = g_byte_array_new ();
@@ -76,22 +334,86 @@ simple_data_wrapper_construct_from_parser (CamelDataWrapper *dw, CamelMimeParser
g_byte_array_append (buffer, buf, len);
}
- d(printf("message part kept in memory!\n"));
+ /* check for broken Outlook/Web mailers that like to send html marked as text/plain */
+ if (header_content_type_is (ct, "text", "plain")) {
+ register const unsigned char *inptr;
+ const unsigned char *inend;
+
+ inptr = buffer->data;
+ inend = inptr + buffer->len;
+
+ while (inptr < inend && isspace ((int) *inptr))
+ inptr++;
+
+ if (((inend-inptr) > 5 && g_ascii_strncasecmp(inptr, "<html", 5) == 0)
+ || ((inend-inptr) > 9 && g_ascii_strncasecmp(inptr, "<!doctype", 9) == 0)) {
+ /* re-tag as text/html */
+ g_free (ct->subtype);
+ ct->subtype = g_strdup ("html");
+ }
+ }
- mem = camel_stream_mem_new_with_byte_array (buffer);
- camel_data_wrapper_construct_from_stream (dw, mem);
- camel_object_unref (mem);
+ /* Possible Lame Mailer Alert... check the META tags for a charset */
+ if (!charset && header_content_type_is (ct, "text", "html")) {
+ if ((charset = check_html_charset (buffer->data, buffer->len)))
+ header_content_type_set_param (ct, "charset", charset);
+ }
- if (encoding) {
- if (!strcasecmp (encoding, "base64")) {
- dw->encoding = CAMEL_MIME_PART_ENCODING_BASE64;
- } else if (!strcasecmp (encoding, "quoted-printable")) {
- dw->encoding = CAMEL_MIME_PART_ENCODING_QUOTEDPRINTABLE;
- } else if (!strcasecmp (encoding, "x-uuencode")) {
- dw->encoding = CAMEL_MIME_PART_ENCODING_UUENCODE;
+ /* if we need to do charset conversion, see if we can/it works/etc */
+ if (charset && !(strcasecmp (charset, "us-ascii") == 0
+ || strcasecmp (charset, "utf-8") == 0
+ || strncasecmp (charset, "x-", 2) == 0)) {
+ GByteArray *out;
+
+ /* You often see Microsoft Windows users announcing their texts
+ * as being in ISO-8859-1 even when in fact they contain funny
+ * characters from the Windows-CP1252 superset.
+ */
+ charset = canon_charset_name (charset);
+ if (!strncasecmp (charset, "iso-8859", 8)) {
+ /* check for Windows-specific chars... */
+ if (broken_windows_charset (buffer, charset))
+ charset = camel_charset_iso_to_windows (charset);
+ }
+
+ out = convert_buffer (buffer, "UTF-8", charset);
+ if (out) {
+ /* converted ok, use this data instead */
+ g_byte_array_free(buffer, TRUE);
+ dw->rawtext = FALSE;
+ buffer = out;
+ } else {
+ /* else failed to convert, leave as raw? */
+ g_warning("Storing text as raw, unknown charset '%s' or invalid format", charset);
+ dw->rawtext = TRUE;
+ }
+ } else if (header_content_type_is (ct, "text", "*")) {
+ if (charset == NULL || !strcasecmp (charset, "us-ascii")) {
+ /* check that it's 7bit */
+ dw->rawtext = !is_7bit (buffer);
+ } else if (!strncasecmp (charset, "x-", 2)) {
+ /* we're not even going to bother trying to convert, so set the
+ rawtext bit to TRUE and let the mailer deal with it. */
+ dw->rawtext = TRUE;
+ } else if (!strcasecmp (charset, "utf-8") && buffer->len) {
+ /* check that it is valid utf8 */
+ dw->rawtext = !g_utf8_validate (buffer->data, buffer->len, NULL);
}
- g_free (encoding);
}
+
+ d(printf("message part kept in memory!\n"));
+
+ mem = camel_stream_mem_new_with_byte_array(buffer);
+ camel_data_wrapper_construct_from_stream(dw, mem);
+ camel_object_unref((CamelObject *)mem);
+
+ camel_mime_parser_filter_remove(mp, decid);
+ camel_mime_parser_filter_remove(mp, crlfid);
+
+ if (fdec)
+ camel_object_unref((CamelObject *)fdec);
+ if (fcrlf)
+ camel_object_unref((CamelObject *)fcrlf);
}
/* This replaces the data wrapper repository ... and/or could be replaced by it? */
@@ -102,7 +424,7 @@ camel_mime_part_construct_content_from_parser (CamelMimePart *dw, CamelMimeParse
CamelContentType *ct;
ct = camel_mime_parser_content_type (mp);
-
+
switch (camel_mime_parser_state (mp)) {
case HSCAN_HEADER:
d(printf("Creating body part\n"));
@@ -139,6 +461,15 @@ camel_mime_part_construct_content_from_parser (CamelMimePart *dw, CamelMimeParse
/* would you believe you have to set this BEFORE you set the content object??? oh my god !!!! */
camel_data_wrapper_set_mime_type_field (content, camel_mime_part_get_content_type (dw));
camel_medium_set_content_object ((CamelMedium *)dw, content);
+
+ /* Note: we don't set ct as the content-object's mime-type above because
+ * camel_medium_set_content_object() may re-write the Content-Type header
+ * (see CamelMimePart::set_content_object) if we did that (which is a Bad Thing).
+ * However, if we set it *afterward*, we can still use any special auto-detections
+ * that we found in simple_data_wrapper_construct_from_parser(). This is important
+ * later when we go to render the MIME parts in mail-format.c */
+ camel_data_wrapper_set_mime_type_field (content, ct);
+
camel_object_unref (content);
}
}