New function to map ISO charsets to the Windows charsets.

2001-12-17 Jeffrey Stedfast <fejj@ximian.com>

* camel-charset-map.c (camel_charset_iso_to_windows): New function to map ISO charsets to the Windows charsets.

* camel-mime-part-utils.c (broken_windows_charset): Detect Windows charsets.
(simple_data_wrapper_construct_from_parser): Simplify a tad and also check for iso-8859-* charsets that are really Windows charsets. Fixes bug #12631.

svn path=/trunk/; revision=15144
author: Jeffrey Stedfast <fejj@ximian.com> 2001-12-18 09:28:27 +0800
committer: Jeffrey Stedfast <fejj@src.gnome.org> 2001-12-18 09:28:27 +0800
commit: f6408daa103092f18789a719a4123224b259f71f (patch)
tree: 838b491516e1b3669428136d73019aa9afe5f2c3
parent: 13299ab7e073cf4d412cec019e4240a7634c1cf5 (diff)
download: gsoc2013-evolution-f6408daa103092f18789a719a4123224b259f71f.tar.gz
gsoc2013-evolution-f6408daa103092f18789a719a4123224b259f71f.tar.zst
gsoc2013-evolution-f6408daa103092f18789a719a4123224b259f71f.zip
4 files changed, 127 insertions, 31 deletions
diff --git a/camel/ChangeLog b/camel/ChangeLog
index cb8a9eaf6a..b82266ff9e 100644
--- a/camel/ChangeLog
+++ b/camel/ChangeLog
@@ -1,3 +1,14 @@
+2001-12-17  Jeffrey Stedfast  <fejj@ximian.com>
+
+	* camel-charset-map.c (camel_charset_iso_to_windows): New function
+	to map ISO charsets to the Windows charsets.
+
+	* camel-mime-part-utils.c (broken_windows_charset): Detect Windows
+	charsets.
+	(simple_data_wrapper_construct_from_parser): Simplify a tad and
+	also check for iso-8859-* charsets that are really Windows
+	charsets. Fixes bug #12631.
+
 2001-12-17  Dan Winship  <danw@ximian.com>
 
 	* Makefile.am (INCLUDES): define CAMEL_PROVIDERDIR to be the
@@ -7,11 +18,16 @@
 
 	* providers/imap/Makefile.am (camel_provider_LTLIBRARIES,
 	camel_provider_DATA): renamed from provider_LTLIBRARIES,
-	provider_DATA.
-	* providers/local/Makefile.am: Likewise
-	* providers/nntp/Makefile.am: Likewise
-	* providers/pop3/Makefile.am: Likewise
-	* providers/sendmail/Makefile.am: Likewise
+	provider_DATA.  
+
+	* providers/local/Makefile.am: Likewise 
+
+	* providers/nntp/Makefile.am: Likewise 
+
+	* providers/pop3/Makefile.am: Likewise 
+
+	* providers/sendmail/Makefile.am: Likewise 
+
 	* providers/smtp/Makefile.am: Likewise
 
 2001-12-16  Jeffrey Stedfast  <fejj@ximian.com>
diff --git a/camel/camel-charset-map.c b/camel/camel-charset-map.c
index 17962d74be..2416dd2504 100644
--- a/camel/camel-charset-map.c
+++ b/camel/camel-charset-map.c
@@ -292,5 +292,60 @@ camel_charset_best (const char *in, int len)
 	return camel_charset_best_name (&charset);
 }
 
+
+/**
+ * camel_charset_iso_to_windows:
+ * @isocharset: an ISO charset name (matched case-insensitively)
+ *
+ * Returns the equivalent Windows charset, or @isocharset if none is known.
+ **/
+const char *
+camel_charset_iso_to_windows (const char *isocharset)
+{
+	/* According to http://czyborra.com/charsets/codepages.html,
+	 * the charset mapping is as follows:
+	 *
+	 * iso-8859-1  maps to windows-cp1252
+	 * iso-8859-2  maps to windows-cp1250
+	 * iso-8859-3  maps to windows-cp????
+	 * iso-8859-4  maps to windows-cp????
+	 * iso-8859-5  maps to windows-cp1251
+	 * iso-8859-6  maps to windows-cp1256
+	 * iso-8859-7  maps to windows-cp1253
+	 * iso-8859-8  maps to windows-cp1255
+	 * iso-8859-9  maps to windows-cp1254
+	 * iso-8859-10 maps to windows-cp????
+	 * iso-8859-11 maps to windows-cp????
+	 * iso-8859-12 maps to windows-cp????
+	 * iso-8859-13 maps to windows-cp1257
+	 *
+	 * Assumptions:
+	 *  - iso-8859-4 has no listed mapping; since it is Baltic
+	 *    like iso-8859-13, assume that it also maps to
+	 *    windows-cp1257.
+	 */
+	
+	if (!strcasecmp (isocharset, "iso-8859-1"))
+		return "windows-cp1252";
+	else if (!strcasecmp (isocharset, "iso-8859-2"))
+		return "windows-cp1250";
+	else if (!strcasecmp (isocharset, "iso-8859-4"))
+		return "windows-cp1257";
+	else if (!strcasecmp (isocharset, "iso-8859-5"))
+		return "windows-cp1251";
+	else if (!strcasecmp (isocharset, "iso-8859-6"))
+		return "windows-cp1256";
+	else if (!strcasecmp (isocharset, "iso-8859-7"))
+		return "windows-cp1253";
+	else if (!strcasecmp (isocharset, "iso-8859-8"))
+		return "windows-cp1255";
+	else if (!strcasecmp (isocharset, "iso-8859-9"))
+		return "windows-cp1254";
+	else if (!strcasecmp (isocharset, "iso-8859-13"))
+		return "windows-cp1257";
+	
+	return isocharset; /* no mapping above matched: hand back the caller's own string */
+}
+
 #endif /* !BUILD_MAP */
 
diff --git a/camel/camel-charset-map.h b/camel/camel-charset-map.h
index 7c7022c0a1..0cae1916a6 100644
--- a/camel/camel-charset-map.h
+++ b/camel/camel-charset-map.h
@@ -37,4 +37,6 @@ const char *camel_charset_best_name(CamelCharset *);
 /* helper function */
 const char *camel_charset_best(const char *in, int len);
 
+const char *camel_charset_iso_to_windows (const char *isocharset);
+
 #endif /* ! _CAMEL_CHARSET_MAP_H */
diff --git a/camel/camel-mime-part-utils.c b/camel/camel-mime-part-utils.c
index 65c99c6dc8..08787df2cd 100644
--- a/camel/camel-mime-part-utils.c
+++ b/camel/camel-mime-part-utils.c
@@ -155,6 +155,28 @@ convert_buffer (GByteArray *in, const char *to, const char *from)
 	return out;
 }
 
+/* TRUE if @buffer has any byte in 128..159; @charset is only used in the warning text. */
+static gboolean
+broken_windows_charset (GByteArray *buffer, const char *charset)
+{
+	register unsigned char *inptr;
+	unsigned char *inend;
+	
+	inptr = buffer->data;
+	inend = inptr + buffer->len;
+	
+	while (inptr < inend) {
+		register unsigned char c = *inptr++;
+		
+		if (c >= 128 && c <= 159) { /* 0x80-0x9F: treated here as the sign of a Windows charset */
+			g_warning ("Encountered Windows charset parading as %s", charset);
+			return TRUE; /* one offending byte is enough; stop scanning */
+		}
+	}
+	
+	return FALSE;
+}
+
 static gboolean
 is_7bit (GByteArray *buffer)
 {
@@ -172,33 +194,24 @@ static void
 simple_data_wrapper_construct_from_parser (CamelDataWrapper *dw, CamelMimeParser *mp)
 {
 	CamelMimeFilter *fdec = NULL, *fcrlf = NULL;
+	CamelMimeFilterBasicType enctype;
 	int len, decid = -1, crlfid = -1;
 	struct _header_content_type *ct;
+	const char *charset = NULL;
 	GByteArray *buffer;
 	char *encoding, *buf;
-	const char *charset = NULL;
-	CamelMimeFilterBasicType enctype = 0;
 	CamelStream *mem;
-
-	d(printf("constructing data-wrapper\n"));
+	
+	d(printf ("simple_data_wrapper_construct_from_parser()\n"));
 	
 	/* first, work out conversion, if any, required, we dont care about what we dont know about */
-	encoding = header_content_encoding_decode(camel_mime_parser_header(mp, "content-transfer-encoding", NULL));
+	encoding = header_content_encoding_decode (camel_mime_parser_header (mp, "Content-Transfer-Encoding", NULL));
 	if (encoding) {
-		if (!strcasecmp(encoding, "base64")) {
-			d(printf("Adding base64 decoder ...\n"));
-			enctype = CAMEL_MIME_FILTER_BASIC_BASE64_DEC;
-		} else if (!strcasecmp(encoding, "quoted-printable")) {
-			d(printf("Adding quoted-printable decoder ...\n"));
-			enctype = CAMEL_MIME_FILTER_BASIC_QP_DEC;
-		} else if (!strcasecmp (encoding, "x-uuencode")) {
-			d(printf("Adding uudecoder ...\n"));
-			enctype = CAMEL_MIME_FILTER_BASIC_UU_DEC;
-		}
+		enctype = camel_mime_part_encoding_from_string (encoding);
 		g_free (encoding);
 		
-		if (enctype != 0) {
-			fdec = (CamelMimeFilter *)camel_mime_filter_basic_new_type(enctype);
+		if (enctype != CAMEL_MIME_PART_ENCODING_DEFAULT) {
+			fdec = (CamelMimeFilter *) camel_mime_filter_basic_new_type (enctype);
 			decid = camel_mime_parser_filter_add (mp, fdec);
 		}
 	}
@@ -229,21 +242,32 @@ simple_data_wrapper_construct_from_parser (CamelDataWrapper *dw, CamelMimeParser
 		charset = check_html_charset(buffer->data, buffer->len);
 	
 	/* if we need to do charset conversion, see if we can/it works/etc */
-	if (charset && !(strcasecmp(charset, "us-ascii") == 0
-			 || strcasecmp(charset, "utf-8") == 0
-			 || strncasecmp(charset, "x-", 2) == 0)) {
+	if (charset && !(strcasecmp (charset, "us-ascii") == 0
+			 || strcasecmp (charset, "utf-8") == 0
+			 || strncasecmp (charset, "x-", 2) == 0)) {
 		GByteArray *out;
 		
-		out = convert_buffer(buffer, "UTF-8", charset);
+		/* You often see Microsoft Windows users announcing their texts
+		 * as being in ISO-8859-1 even when in fact they contain funny
+		 * characters from the Windows-CP1252 superset.
+		 */
+		if (!strncasecmp (charset, "iso-8859", 8)) {
+			/* check for Windows-specific chars... */
+			if (broken_windows_charset (buffer, charset)) {
+				charset = camel_charset_iso_to_windows (charset);
+				charset = e_iconv_charset_name (charset);
+			}
+		}
+		
+		out = convert_buffer (buffer, "UTF-8", charset);
 		if (out) {
 			/* converted ok, use this data instead */
 			g_byte_array_free(buffer, TRUE);
 			buffer = out;
 		} else {
-			g_warning("Storing text as raw, unknown charset '%s' or invalid format", charset);
 			/* else failed to convert, leave as raw? */
+			g_warning("Storing text as raw, unknown charset '%s' or invalid format", charset);
 			dw->rawtext = TRUE;
-			/* should we change the content-type header? */
 		}
 	} else if (header_content_type_is (ct, "text", "*")) {
 		if (charset == NULL) {
@@ -258,10 +282,9 @@ simple_data_wrapper_construct_from_parser (CamelDataWrapper *dw, CamelMimeParser
 			dw->rawtext = !g_utf8_validate (buffer->data, buffer->len, NULL);
 		}
 	}
-			
-
+	
 	d(printf("message part kept in memory!\n"));
-		
+	
 	mem = camel_stream_mem_new_with_byte_array(buffer);
 	camel_data_wrapper_construct_from_stream(dw, mem);
 	camel_object_unref((CamelObject *)mem);
author	Jeffrey Stedfast <fejj@ximian.com>	2001-12-18 09:28:27 +0800
committer	Jeffrey Stedfast <fejj@src.gnome.org>	2001-12-18 09:28:27 +0800
commit	f6408daa103092f18789a719a4123224b259f71f (patch)
tree	838b491516e1b3669428136d73019aa9afe5f2c3
parent	13299ab7e073cf4d412cec019e4240a7634c1cf5 (diff)
download	gsoc2013-evolution-f6408daa103092f18789a719a4123224b259f71f.tar.gz gsoc2013-evolution-f6408daa103092f18789a719a4123224b259f71f.tar.zst gsoc2013-evolution-f6408daa103092f18789a719a4123224b259f71f.zip