1 files changed, 346 insertions, 15 deletions
diff --git a/camel/camel-mime-part-utils.c b/camel/camel-mime-part-utils.c
index af4d7c1161..800f23372e 100644
--- a/camel/camel-mime-part-utils.c
+++ b/camel/camel-mime-part-utils.c
@@ -5,7 +5,7 @@
  *          Michael Zucchi <notzed@ximian.com>
  *          Jeffrey Stedfast <fejj@ximian.com>
  *
- * Copyright 1999-2003 Ximian, Inc. (www.ximian.com)
+ * Copyright 1999, 2000 Ximian, Inc. (www.ximian.com)
  *
  * This program is free software; you can redistribute it and/or 
  * modify it under the terms of version 2 of the GNU General Public 
@@ -34,6 +34,7 @@
 
 #include <gal/util/e-iconv.h>
 
+#include "camel-string-utils.h"
 #include "camel-charset-map.h"
 #include "camel-mime-part-utils.h"
 #include "camel-mime-message.h"
@@ -53,21 +54,278 @@
 #define d(x) /*(printf("%s(%d): ", __FILE__, __LINE__),(x))
 	       #include <stdio.h>*/
 
+/* example: <META http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"> */
+
+/* Look for <META http-equiv="Content-Type" content="..."> in an HTML buffer and
+ * return the charset parameter of its content attribute, run through
+ * e_iconv_charset_name(). Returns NULL when the parser hits EOF without one. */
+static const char *
+check_html_charset(char *buffer, int length)
+{
+	struct _header_content_type *content_type;
+	camel_html_parser_t step;
+	const char *found = NULL;
+	CamelHTMLParser *parser;
+
+	parser = camel_html_parser_new();
+	camel_html_parser_set_data(parser, buffer, length, TRUE);
+
+	for (;;) {
+		const char *chunk, *value;
+		int chunk_len;
+
+		step = camel_html_parser_step(parser, &chunk, &chunk_len);
+		/* only element tokens can carry the META tag; skip everything else */
+		if (step == CAMEL_HTML_PARSER_ELEMENT) {
+			value = camel_html_parser_tag(parser);
+			d(printf("Got tag: %s\n", value));
+			if (strcasecmp(value, "meta") == 0) {
+				value = camel_html_parser_attr(parser, "http-equiv");
+				if (value != NULL && strcasecmp(value, "content-type") == 0) {
+					value = camel_html_parser_attr(parser, "content");
+					content_type = value ? header_content_type_decode(value) : NULL;
+					if (content_type != NULL) {
+						found = header_content_type_param(content_type, "charset");
+						found = e_iconv_charset_name (found);
+						header_content_type_unref(content_type);
+					}
+				}
+			}
+		}
+
+		/* stop at the first usable charset, or once the input is exhausted */
+		if (found != NULL || step == CAMEL_HTML_PARSER_EOF)
+			break;
+	}
+
+	camel_object_unref (parser);
+	return found;
+}
+
+/* Convert the bytes of @in from charset @from to charset @to with e_iconv.
+ *
+ * Returns a newly created GByteArray holding the converted data (an empty
+ * array when @in is empty), or NULL if the converter cannot be opened or the
+ * conversion fails with an error other than E2BIG or EINVAL. @in itself is
+ * only read, never modified or freed. */
+static GByteArray *
+convert_buffer (GByteArray *in, const char *to, const char *from)
+{
+	size_t inleft, outleft, outlen, converted = 0;
+	GByteArray *out = NULL;
+	const char *inbuf;
+	char *outbuf;
+	iconv_t cd;
+	int err;
+
+	if (in->len == 0)
+		return g_byte_array_new();
+
+	d(printf("converting buffer from %s to %s:\n", from, to));
+	d(fwrite(in->data, 1, (int)in->len, stdout));
+	d(printf("\n"));
+
+	cd = e_iconv_open(to, from);
+	if (cd == (iconv_t) -1) {
+		g_warning ("Cannot convert from '%s' to '%s': %s", from, to, strerror (errno));
+		return NULL;
+	}
+
+	/* initial guess at the output size; grown below whenever E2BIG is reported */
+	outlen = in->len * 2 + 16;
+	out = g_byte_array_new ();
+	g_byte_array_set_size (out, outlen);
+
+	inbuf = (const char *) in->data;
+	inleft = in->len;
+
+	do {
+		outbuf = (char *) out->data + converted;
+		outleft = outlen - converted;
+		/* errno is only meaningful when the call reported failure; library
+		 * calls do not clear it on success, so a stale E2BIG left over from
+		 * earlier code must not be mistaken for a result of this call. */
+		if (e_iconv (cd, &inbuf, &inleft, &outbuf, &outleft) == (size_t) -1)
+			err = errno;
+		else
+			err = 0;
+		if (err != 0 && err != E2BIG && err != EINVAL)
+			goto fail;
+
+		/* bytes written so far, derived from how far outbuf advanced */
+		converted = outbuf - (char *) out->data;
+		/* E2BIG: there is not sufficient room at *outbuf, so grow the
+		 * output buffer and try again from where we stopped. */
+		if (err == E2BIG) {
+			outlen += inleft * 2 + 16;
+			out = g_byte_array_set_size (out, outlen);
+			/* resizing may have moved the array's storage */
+			outbuf = (char *) out->data + converted;
+			outleft = outlen - converted;
+		}
+	} while (err == E2BIG && inleft > 0);
+
+	/* EINVAL: an incomplete multibyte sequence has been encountered in the
+	 * input. We'll just have to ignore it... */
+	/* flush the iconv conversion */
+	e_iconv (cd, NULL, NULL, &outbuf, &outleft);
+
+	/* now set the true length on the GByteArray */
+	converted = outbuf - (char *) out->data;
+	g_byte_array_set_size (out, converted);
+
+	d(printf("converted data:\n"));
+	d(fwrite(out->data, 1, (int)out->len, stdout));
+	d(printf("\n"));
+
+	e_iconv_close (cd);
+	return out;
+
+ fail:
+	g_warning ("Cannot convert from '%s' to '%s': %s", from, to, strerror (err));
+	g_byte_array_free (out, TRUE);
+	e_iconv_close (cd);
+	return NULL;
+}
+
+/* Report whether @buffer contains any byte in the range 128..159. The first
+ * such byte triggers a warning that mentions @charset; @charset is not used
+ * for anything else. */
+static gboolean
+broken_windows_charset (GByteArray *buffer, const char *charset)
+{
+	const unsigned char *cursor = buffer->data;
+	const unsigned char *limit = cursor + buffer->len;
+
+	for (; cursor < limit; cursor++) {
+		/* bytes outside 128..159 are of no interest here */
+		if (*cursor < 128 || *cursor > 159)
+			continue;
+
+		g_warning ("Encountered Windows charset parading as %s", charset);
+		return TRUE;
+	}
+
+	/* scanned the whole buffer without finding one */
+	return FALSE;
+}
+
+/* Return TRUE when every byte of @buffer is <= 127, FALSE otherwise. */
+static gboolean
+is_7bit (GByteArray *buffer)
+{
+	const unsigned char *byte = buffer->data;
+	const unsigned char *end = byte + buffer->len;
+
+	while (byte < end && *byte <= 127)
+		byte++;
+	return byte == end;
+}
+
+/* Canonical charset names indexed by ISO-8859 part number: entry n is
+ * "iso-8859-n". Entry 0 is filler and is never handed out by the lookup below. */
+static const char *iso_charsets[] = {
+	"us-ascii",
+	"iso-8859-1", "iso-8859-2", "iso-8859-3", "iso-8859-4",
+	"iso-8859-5", "iso-8859-6", "iso-8859-7", "iso-8859-8",
+	"iso-8859-9", "iso-8859-10", "iso-8859-11", "iso-8859-12",
+	"iso-8859-13", "iso-8859-14", "iso-8859-15", "iso-8859-16"
+};
+
+#define NUM_ISO_CHARSETS (sizeof (iso_charsets) / sizeof (iso_charsets[0]))
+
+/* Map spellings such as "ISO8859-1", "iso_8859_1" or "ISO-8859-1" to the
+ * canonical "iso-8859-1" form. Any name that is not iso[-_]8859[-_]<n>, with n
+ * a plain decimal number in 1..16, is returned unchanged. The result is either
+ * @charset itself or an entry of iso_charsets[]; nothing is allocated. */
+static const char *
+canon_charset_name (const char *charset)
+{
+	const char *ptr;
+	char *endptr;
+	unsigned long iso;
+	
+	if (strncasecmp (charset, "iso", 3) != 0)
+		return charset;
+	
+	ptr = charset + 3;
+	if (*ptr == '-' || *ptr == '_')
+		ptr++;
+	
+	/* if it's not an iso-8859-# charset, we don't care about it */
+	if (strncmp (ptr, "8859", 4) != 0)
+		return charset;
+	
+	ptr += 4;
+	if (*ptr == '-' || *ptr == '_')
+		ptr++;
+	
+	/* strtoul() would also accept leading whitespace or a sign; insist on a digit */
+	if (*ptr < '0' || *ptr > '9')
+		return charset;
+	
+	iso = strtoul (ptr, &endptr, 10);
+	if (*endptr != '\0')
+		return charset;
+	
+	/* the value stays unsigned long so huge numbers cannot wrap into range;
+	 * 0 is rejected because its table slot is only filler */
+	if (iso == 0 || iso >= NUM_ISO_CHARSETS)
+		return charset;
+	
+	return iso_charsets[iso];
+}
+
 /* simple data wrapper */
 static void
 simple_data_wrapper_construct_from_parser (CamelDataWrapper *dw, CamelMimeParser *mp)
 {
+	CamelMimeFilter *fdec = NULL, *fcrlf = NULL;
+	CamelMimeFilterBasicType enctype = 0;
+	size_t len;
+	int decid = -1, crlfid = -1;
 	struct _header_content_type *ct;
 	const char *charset = NULL;
 	char *encoding, *buf;
 	GByteArray *buffer;
 	CamelStream *mem;
-	size_t len;
 	
 	d(printf ("simple_data_wrapper_construct_from_parser()\n"));
 	
 	/* first, work out conversion, if any, required, we dont care about what we dont know about */
 	encoding = header_content_encoding_decode (camel_mime_parser_header (mp, "Content-Transfer-Encoding", NULL));
+	if (encoding) {
+		if (!strcasecmp (encoding, "base64")) {
+			d(printf("Adding base64 decoder ...\n"));
+			enctype = CAMEL_MIME_FILTER_BASIC_BASE64_DEC;
+		} else if (!strcasecmp (encoding, "quoted-printable")) {
+			d(printf("Adding quoted-printable decoder ...\n"));
+			enctype = CAMEL_MIME_FILTER_BASIC_QP_DEC;
+		} else if (!strcasecmp (encoding, "x-uuencode")) {
+			d(printf("Adding uudecoder ...\n"));
+			enctype = CAMEL_MIME_FILTER_BASIC_UU_DEC;
+		}
+		g_free (encoding);
+		
+		if (enctype != 0) {
+			fdec = (CamelMimeFilter *)camel_mime_filter_basic_new_type(enctype);
+			decid = camel_mime_parser_filter_add (mp, fdec);
+		}
+	}
+	
+	/* If we're doing text, we also need to do CRLF->LF and may have to convert it to UTF8 as well. */
+	ct = camel_mime_parser_content_type (mp);
+	if (header_content_type_is (ct, "text", "*")) {
+		charset = header_content_type_param (ct, "charset");
+		charset = e_iconv_charset_name (charset);
+		
+		if (fdec) {
+			d(printf ("Adding CRLF conversion filter\n"));
+			fcrlf = camel_mime_filter_crlf_new (CAMEL_MIME_FILTER_CRLF_DECODE,
+							    CAMEL_MIME_FILTER_CRLF_MODE_CRLF_ONLY);
+			crlfid = camel_mime_parser_filter_add (mp, fcrlf);
+		}
+	}
 	
 	/* read in the entire content */
 	buffer = g_byte_array_new ();
@@ -76,22 +334,86 @@ simple_data_wrapper_construct_from_parser (CamelDataWrapper *dw, CamelMimeParser
 		g_byte_array_append (buffer, buf, len);
 	}
 	
-	d(printf("message part kept in memory!\n"));
+	/* check for broken Outlook/Web mailers that like to send html marked as text/plain */
+	if (header_content_type_is (ct, "text", "plain")) {
+		register const unsigned char *inptr;
+		const unsigned char *inend;
+		
+		inptr = buffer->data;
+		inend = inptr + buffer->len;
+		
+		while (inptr < inend && isspace ((int) *inptr))
+			inptr++;
+
+		if (((inend-inptr) > 5 && g_ascii_strncasecmp(inptr, "<html", 5) == 0)
+		    || ((inend-inptr) > 9 && g_ascii_strncasecmp(inptr, "<!doctype", 9) == 0)) {
+			/* re-tag as text/html */
+			g_free (ct->subtype);
+			ct->subtype = g_strdup ("html");
+		}
+	}
 	
-	mem = camel_stream_mem_new_with_byte_array (buffer);
-	camel_data_wrapper_construct_from_stream (dw, mem);
-	camel_object_unref (mem);
+	/* Possible Lame Mailer Alert... check the META tags for a charset */
+	if (!charset && header_content_type_is (ct, "text", "html")) {
+		if ((charset = check_html_charset (buffer->data, buffer->len)))
+			header_content_type_set_param (ct, "charset", charset);
+	}
 	
-	if (encoding) {
-		if (!strcasecmp (encoding, "base64")) {
-			dw->encoding = CAMEL_MIME_PART_ENCODING_BASE64;
-		} else if (!strcasecmp (encoding, "quoted-printable")) {
-			dw->encoding = CAMEL_MIME_PART_ENCODING_QUOTEDPRINTABLE;
-		} else if (!strcasecmp (encoding, "x-uuencode")) {
-			dw->encoding = CAMEL_MIME_PART_ENCODING_UUENCODE;
+	/* if we need to do charset conversion, see if we can/it works/etc */
+	if (charset && !(strcasecmp (charset, "us-ascii") == 0
+			 || strcasecmp (charset, "utf-8") == 0
+			 || strncasecmp (charset, "x-", 2) == 0)) {
+		GByteArray *out;
+		
+		/* You often see Microsoft Windows users announcing their texts
+		 * as being in ISO-8859-1 even when in fact they contain funny
+		 * characters from the Windows-CP1252 superset.
+		 */
+		charset = canon_charset_name (charset);
+		if (!strncasecmp (charset, "iso-8859", 8)) {
+			/* check for Windows-specific chars... */
+			if (broken_windows_charset (buffer, charset))
+				charset = camel_charset_iso_to_windows (charset);
+		}
+		
+		out = convert_buffer (buffer, "UTF-8", charset);
+		if (out) {
+			/* converted ok, use this data instead */
+			g_byte_array_free(buffer, TRUE);
+			dw->rawtext = FALSE;
+			buffer = out;
+		} else {
+			/* else failed to convert, leave as raw? */
+			g_warning("Storing text as raw, unknown charset '%s' or invalid format", charset);
+			dw->rawtext = TRUE;
+		}
+	} else if (header_content_type_is (ct, "text", "*")) {
+		if (charset == NULL || !strcasecmp (charset, "us-ascii")) {
+			/* check that it's 7bit */
+			dw->rawtext = !is_7bit (buffer);
+		} else if (!strncasecmp (charset, "x-", 2)) {
+			/* we're not even going to bother trying to convert, so set the
+			   rawtext bit to TRUE and let the mailer deal with it. */
+			dw->rawtext = TRUE;
+		} else if (!strcasecmp (charset, "utf-8") && buffer->len) {
+			/* check that it is valid utf8 */
+			dw->rawtext = !g_utf8_validate (buffer->data, buffer->len, NULL);
 		}
-		g_free (encoding);
 	}
+	
+	d(printf("message part kept in memory!\n"));
+	
+	mem = camel_stream_mem_new_with_byte_array(buffer);
+	camel_data_wrapper_construct_from_stream(dw, mem);
+	camel_object_unref((CamelObject *)mem);
+
+	camel_mime_parser_filter_remove(mp, decid);
+	camel_mime_parser_filter_remove(mp, crlfid);
+	
+	if (fdec)
+		camel_object_unref((CamelObject *)fdec);
+	if (fcrlf)
+		camel_object_unref((CamelObject *)fcrlf);
 }
 
 /* This replaces the data wrapper repository ... and/or could be replaced by it? */
@@ -102,7 +424,7 @@ camel_mime_part_construct_content_from_parser (CamelMimePart *dw, CamelMimeParse
 	CamelContentType *ct;
 	
 	ct = camel_mime_parser_content_type (mp);
-	
+
 	switch (camel_mime_parser_state (mp)) {
 	case HSCAN_HEADER:
 		d(printf("Creating body part\n"));
@@ -139,6 +461,15 @@ camel_mime_part_construct_content_from_parser (CamelMimePart *dw, CamelMimeParse
 		/* would you believe you have to set this BEFORE you set the content object???  oh my god !!!! */
 		camel_data_wrapper_set_mime_type_field (content, camel_mime_part_get_content_type (dw));
 		camel_medium_set_content_object ((CamelMedium *)dw, content);
+		
+		/* Note: we don't set ct as the content-object's mime-type above because
+		 * camel_medium_set_content_object() may re-write the Content-Type header
+		 * (see CamelMimePart::set_content_object) if we did that (which is a Bad Thing).
+		 * However, if we set it *afterward*, we can still use any special auto-detections
+		 * that we found in simple_data_wrapper_construct_from_parser(). This is important
+		 * later when we go to render the MIME parts in mail-format.c */
+		camel_data_wrapper_set_mime_type_field (content, ct);
+		
 		camel_object_unref (content);
 	}
 }