diff options
author | Not Zed <NotZed@Ximian.com> | 2001-07-12 13:02:11 +0800 |
---|---|---|
committer | Michael Zucci <zucchi@src.gnome.org> | 2001-07-12 13:02:11 +0800 |
commit | b88f6b9593ad0a6fda85ca8d01b623583f714bcc (patch) | |
tree | e09fdaf2a329a81f097f932efd050977239783dd /camel | |
parent | 421aa80ae6961cb4ddef8e79133ce89fcfbbf52d (diff) | |
download | gsoc2013-evolution-b88f6b9593ad0a6fda85ca8d01b623583f714bcc.tar.gz gsoc2013-evolution-b88f6b9593ad0a6fda85ca8d01b623583f714bcc.tar.zst gsoc2013-evolution-b88f6b9593ad0a6fda85ca8d01b623583f714bcc.zip |
camel-mime-part-utils.c (extract_metatag_charset): Removed. (check_html_charset): Replaced with this.
2001-07-12 Not Zed <NotZed@Ximian.com>
* camel-mime-part-utils.c (extract_metatag_charset): Removed.
(check_html_charset): Replaced with this.
(simple_data_wrapper_construct_from_parser): Call
check_html_charset if we don't have a charset on the content-type,
and we have text/html data.
(check_html_charset): We also need to do qp/base64 decoding
ourselves, sigh.
* camel-mime-utils.c (html_meta_param_list_decode): Removed. This
was very wrong; the rules for mail headers are vastly different from
the rules for decoding HTML elements.
(rfc2184_decode): Move the malloc inside the branch where iconv_open
worked, otherwise we have a memleak.
* camel-mime-filter-html.c (camel_mime_filter_html_finalize, init,
run, reset): Changed to use camelhtmlparser, and fixed a tiny
memleak.
* camel-html-parser.c: Made the html indexer tokeniser re-usable.
ONLY TO BE USED INTERNAL TO CAMEL HOWEVER.
(tokenise_step): Slight fix to non-quoted values.
svn path=/trunk/; revision=11028
Diffstat (limited to 'camel')
-rw-r--r-- | camel/ChangeLog | 22 | ||||
-rw-r--r-- | camel/Makefile.am | 4 | ||||
-rw-r--r-- | camel/camel-html-parser.c | 807 | ||||
-rw-r--r-- | camel/camel-html-parser.h | 78 | ||||
-rw-r--r-- | camel/camel-mime-filter-html.c | 821 | ||||
-rw-r--r-- | camel/camel-mime-part-utils.c | 151 | ||||
-rw-r--r-- | camel/camel-mime-utils.c | 73 | ||||
-rw-r--r-- | camel/camel-mime-utils.h | 3 |
8 files changed, 1017 insertions, 942 deletions
diff --git a/camel/ChangeLog b/camel/ChangeLog index 38d2d8d142..520f7c99a8 100644 --- a/camel/ChangeLog +++ b/camel/ChangeLog @@ -1,5 +1,27 @@ 2001-07-12 Not Zed <NotZed@Ximian.com> + * camel-mime-part-utils.c (extract_metatag_charset): Removed. + (check_html_charset): Replaced with this. + (simple_data_wrapper_construct_from_parser): Call + check_html_charset if we dont have a charset on the content-type, + and we have text/html data. + (check_html_charset): We also need to do qp/base64 decoding + ourselves, sigh. + + * camel-mime-utils.c (html_meta_param_list_decode): Removed. This + was very wrong, the rules for mail headers vastly different from + rules for decoding html elements. + (rfc2184_decode): Move the malloc inside the iconv_open worked, + otherwise we have a memleak. + + * camel-mime-filter-html.c (camel_mime_filter_html_finalize, init, + run, reset): Changed to use camelhtmlparser, and fixed a tiny + memleak. + + * camel-html-parser.c: Made the html indexer tokeniser re-usable. + ONLY TO BE USED INTERNAL TO CAMEL HOWEVER. + (tokenise_step): Slight fix to non-quoted values. + * camel-folder-summary.c (camel_folder_summary_info_new_from_message): Removed some code i wasn't supposed to commit. 
diff --git a/camel/Makefile.am b/camel/Makefile.am index e8e0253efb..408a0e33d0 100644 --- a/camel/Makefile.am +++ b/camel/Makefile.am @@ -39,6 +39,7 @@ libcamel_la_SOURCES = \ camel-folder-summary.c \ camel-folder-thread.c \ camel-folder.c \ + camel-html-parser.c \ camel-internet-address.c \ camel-lock.c \ camel-lock-client.c \ @@ -217,7 +218,8 @@ noinst_HEADERS = \ camel-charset-map-private.h \ camel-private.h \ camel-search-private.h \ - camel-lock-helper.h + camel-lock-helper.h \ + camel-html-parser.h # manually include camel-lock-helper.c since we build it manually EXTRA_DIST = \ diff --git a/camel/camel-html-parser.c b/camel/camel-html-parser.c new file mode 100644 index 0000000000..9169f97318 --- /dev/null +++ b/camel/camel-html-parser.c @@ -0,0 +1,807 @@ +/* + * Copyright (C) 2001 Ximian Inc. + * + * Authors: Michael Zucchi <notzed@ximian.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public License + * as published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +/** WARNING + ** + ** DO NOT USE THIS CODE OUTSIDE OF CAMEL + ** + ** IT IS SUBJECT TO CHANGE OR MAY VANISH AT ANY TIME + **/ + +#include "camel-html-parser.h" + +#include <stdio.h> +#include <string.h> + +#include <gal/unicode/gunicode.h> +#include <ctype.h> + +/* if defined, must also compile in dump_tag() below somewhere */ +#define d(x) + +static void camel_html_parser_class_init (CamelHTMLParserClass *klass); +static void camel_html_parser_init (CamelObject *o); +static void camel_html_parser_finalize (CamelObject *o); + +static CamelObjectClass *camel_html_parser_parent; + +/* Parser definitions, see below object code for details */ + +typedef struct _CamelHTMLParserPrivate CamelHTMLParserPrivate; + +struct _CamelHTMLParserPrivate { + char *inbuf, + *inptr, + *inend, + *start; + enum _camel_html_parser_t state; + char *charset; + int eof; + GString *tag; + GString *ent; + char ent_utf8[8]; + int attr; + GPtrArray *attrs; + GPtrArray *values; + int quote; +}; + +static void tokenise_setup(void); +static CamelHTMLParserPrivate *tokenise_init(void); +static void tokenise_free(CamelHTMLParserPrivate *p); +static int tokenise_step(CamelHTMLParserPrivate *p, char **datap, int *lenp); + +/* ********************************************************************** */ + +CamelType +camel_html_parser_get_type (void) +{ + static CamelType type = CAMEL_INVALID_TYPE; + + if (type == CAMEL_INVALID_TYPE) { + type = camel_type_register (camel_object_get_type (), "CamelHTMLParser", + sizeof (CamelHTMLParser), + sizeof (CamelHTMLParserClass), + (CamelObjectClassInitFunc) camel_html_parser_class_init, + NULL, + (CamelObjectInitFunc) camel_html_parser_init, + (CamelObjectFinalizeFunc) camel_html_parser_finalize); + } + + return type; +} + +static void +camel_html_parser_finalize(CamelObject *o) +{ + CamelHTMLParser *f = (CamelHTMLParser *)o; + + tokenise_free(f->priv); +} + +static void +camel_html_parser_init (CamelObject *o) +{ + CamelHTMLParser *f = (CamelHTMLParser *)o; 
+ + f->priv = tokenise_init(); +} + +static void +camel_html_parser_class_init (CamelHTMLParserClass *klass) +{ + camel_html_parser_parent = CAMEL_OBJECT_CLASS (camel_type_get_global_classfuncs (camel_object_get_type ())); + + tokenise_setup(); +} + +/** + * camel_html_parser_new: + * + * Create a new CamelHTMLParser object. + * + * Return value: A new CamelHTMLParser widget. + **/ +CamelHTMLParser * +camel_html_parser_new (void) +{ + CamelHTMLParser *new = CAMEL_HTML_PARSER ( camel_object_new (camel_html_parser_get_type ())); + return new; +} + + +void camel_html_parser_set_data(CamelHTMLParser *hp, const char *start, int len, int last) +{ + CamelHTMLParserPrivate *p = hp->priv; + + p->inptr = p->inbuf = (char *)start; + p->inend = (char *)start+len; + p->eof = last; +} + +camel_html_parser_t camel_html_parser_step(CamelHTMLParser *hp, const char **datap, int *lenp) +{ + return tokenise_step(hp->priv, (char **)datap, lenp); +} + +const char *camel_html_parser_left(CamelHTMLParser *hp, int *lenp) +{ + CamelHTMLParserPrivate *p = hp->priv; + + if (lenp) + *lenp = p->inend - p->inptr; + + return p->inptr; +} + +const char *camel_html_parser_tag(CamelHTMLParser *hp) +{ + return hp->priv->tag->str; +} + +const char *camel_html_parser_attr(CamelHTMLParser *hp, const char *name) +{ + int i; + CamelHTMLParserPrivate *p = hp->priv; + + for (i=0;i<p->attrs->len;i++) { + if (!strcasecmp(((GString *)p->attrs->pdata[i])->str, name)) { + return ((GString *)p->values->pdata[i])->str; + } + } + + return NULL; +} + +const GPtrArray *camel_html_parser_attr_list(CamelHTMLParser *hp, const GPtrArray **values) +{ + if (values) + *values = hp->priv->values; + + return hp->priv->attrs; +} + +/* this map taken out of libxml */ +static struct { + unsigned int val; + const char *name; +} entity_map[] = { +/* + * the 4 absolute ones, + */ + { 34, "quot", /* quotation mark = APL quote, U+0022 ISOnum */ }, + { 38, "amp", /* ampersand, U+0026 ISOnum */ }, + { 60, "lt", /* less-than sign, 
U+003C ISOnum */ }, + { 62, "gt", /* greater-than sign, U+003E ISOnum */ }, + +/* + * A bunch still in the 128-255 range + * Replacing them depend really on the charset used. + */ + { 39, "apos", /* single quote */ }, + { 160, "nbsp", /* no-break space = non-breaking space, U+00A0 ISOnum */ }, + { 161, "iexcl",/* inverted exclamation mark, U+00A1 ISOnum */ }, + { 162, "cent", /* cent sign, U+00A2 ISOnum */ }, + { 163, "pound",/* pound sign, U+00A3 ISOnum */ }, + { 164, "curren",/* currency sign, U+00A4 ISOnum */ }, + { 165, "yen", /* yen sign = yuan sign, U+00A5 ISOnum */ }, + { 166, "brvbar",/* broken bar = broken vertical bar, U+00A6 ISOnum */ }, + { 167, "sect", /* section sign, U+00A7 ISOnum */ }, + { 168, "uml", /* diaeresis = spacing diaeresis, U+00A8 ISOdia */ }, + { 169, "copy", /* copyright sign, U+00A9 ISOnum */ }, + { 170, "ordf", /* feminine ordinal indicator, U+00AA ISOnum */ }, + { 171, "laquo",/* left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum */ }, + { 172, "not", /* not sign, U+00AC ISOnum */ }, + { 173, "shy", /* soft hyphen = discretionary hyphen, U+00AD ISOnum */ }, + { 174, "reg", /* registered sign = registered trade mark sign, U+00AE ISOnum */ }, + { 175, "macr", /* macron = spacing macron = overline = APL overbar, U+00AF ISOdia */ }, + { 176, "deg", /* degree sign, U+00B0 ISOnum */ }, + { 177, "plusmn",/* plus-minus sign = plus-or-minus sign, U+00B1 ISOnum */ }, + { 178, "sup2", /* superscript two = superscript digit two = squared, U+00B2 ISOnum */ }, + { 179, "sup3", /* superscript three = superscript digit three = cubed, U+00B3 ISOnum */ }, + { 180, "acute",/* acute accent = spacing acute, U+00B4 ISOdia */ }, + { 181, "micro",/* micro sign, U+00B5 ISOnum */ }, + { 182, "para", /* pilcrow sign = paragraph sign, U+00B6 ISOnum */ }, + { 183, "middot",/* middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum */ }, + { 184, "cedil",/* cedilla = spacing cedilla, U+00B8 ISOdia */ }, + { 185, "sup1", /* 
superscript one = superscript digit one, U+00B9 ISOnum */ }, + { 186, "ordm", /* masculine ordinal indicator, U+00BA ISOnum */ }, + { 187, "raquo",/* right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum */ }, + { 188, "frac14",/* vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum */ }, + { 189, "frac12",/* vulgar fraction one half = fraction one half, U+00BD ISOnum */ }, + { 190, "frac34",/* vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum */ }, + { 191, "iquest",/* inverted question mark = turned question mark, U+00BF ISOnum */ }, + { 192, "Agrave",/* latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1 */ }, + { 193, "Aacute",/* latin capital letter A with acute, U+00C1 ISOlat1 */ }, + { 194, "Acirc",/* latin capital letter A with circumflex, U+00C2 ISOlat1 */ }, + { 195, "Atilde",/* latin capital letter A with tilde, U+00C3 ISOlat1 */ }, + { 196, "Auml", /* latin capital letter A with diaeresis, U+00C4 ISOlat1 */ }, + { 197, "Aring",/* latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1 */ }, + { 198, "AElig",/* latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1 */ }, + { 199, "Ccedil",/* latin capital letter C with cedilla, U+00C7 ISOlat1 */ }, + { 200, "Egrave",/* latin capital letter E with grave, U+00C8 ISOlat1 */ }, + { 201, "Eacute",/* latin capital letter E with acute, U+00C9 ISOlat1 */ }, + { 202, "Ecirc",/* latin capital letter E with circumflex, U+00CA ISOlat1 */ }, + { 203, "Euml", /* latin capital letter E with diaeresis, U+00CB ISOlat1 */ }, + { 204, "Igrave",/* latin capital letter I with grave, U+00CC ISOlat1 */ }, + { 205, "Iacute",/* latin capital letter I with acute, U+00CD ISOlat1 */ }, + { 206, "Icirc",/* latin capital letter I with circumflex, U+00CE ISOlat1 */ }, + { 207, "Iuml", /* latin capital letter I with diaeresis, U+00CF ISOlat1 */ }, + { 208, "ETH", /* latin capital letter ETH, U+00D0 
ISOlat1 */ }, + { 209, "Ntilde",/* latin capital letter N with tilde, U+00D1 ISOlat1 */ }, + { 210, "Ograve",/* latin capital letter O with grave, U+00D2 ISOlat1 */ }, + { 211, "Oacute",/* latin capital letter O with acute, U+00D3 ISOlat1 */ }, + { 212, "Ocirc",/* latin capital letter O with circumflex, U+00D4 ISOlat1 */ }, + { 213, "Otilde",/* latin capital letter O with tilde, U+00D5 ISOlat1 */ }, + { 214, "Ouml", /* latin capital letter O with diaeresis, U+00D6 ISOlat1 */ }, + { 215, "times",/* multiplication sign, U+00D7 ISOnum */ }, + { 216, "Oslash",/* latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1 */ }, + { 217, "Ugrave",/* latin capital letter U with grave, U+00D9 ISOlat1 */ }, + { 218, "Uacute",/* latin capital letter U with acute, U+00DA ISOlat1 */ }, + { 219, "Ucirc",/* latin capital letter U with circumflex, U+00DB ISOlat1 */ }, + { 220, "Uuml", /* latin capital letter U with diaeresis, U+00DC ISOlat1 */ }, + { 221, "Yacute",/* latin capital letter Y with acute, U+00DD ISOlat1 */ }, + { 222, "THORN",/* latin capital letter THORN, U+00DE ISOlat1 */ }, + { 223, "szlig",/* latin small letter sharp s = ess-zed, U+00DF ISOlat1 */ }, + { 224, "agrave",/* latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1 */ }, + { 225, "aacute",/* latin small letter a with acute, U+00E1 ISOlat1 */ }, + { 226, "acirc",/* latin small letter a with circumflex, U+00E2 ISOlat1 */ }, + { 227, "atilde",/* latin small letter a with tilde, U+00E3 ISOlat1 */ }, + { 228, "auml", /* latin small letter a with diaeresis, U+00E4 ISOlat1 */ }, + { 229, "aring",/* latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1 */ }, + { 230, "aelig",/* latin small letter ae = latin small ligature ae, U+00E6 ISOlat1 */ }, + { 231, "ccedil",/* latin small letter c with cedilla, U+00E7 ISOlat1 */ }, + { 232, "egrave",/* latin small letter e with grave, U+00E8 ISOlat1 */ }, + { 233, "eacute",/* latin small letter e with 
acute, U+00E9 ISOlat1 */ }, + { 234, "ecirc",/* latin small letter e with circumflex, U+00EA ISOlat1 */ }, + { 235, "euml", /* latin small letter e with diaeresis, U+00EB ISOlat1 */ }, + { 236, "igrave",/* latin small letter i with grave, U+00EC ISOlat1 */ }, + { 237, "iacute",/* latin small letter i with acute, U+00ED ISOlat1 */ }, + { 238, "icirc",/* latin small letter i with circumflex, U+00EE ISOlat1 */ }, + { 239, "iuml", /* latin small letter i with diaeresis, U+00EF ISOlat1 */ }, + { 240, "eth", /* latin small letter eth, U+00F0 ISOlat1 */ }, + { 241, "ntilde",/* latin small letter n with tilde, U+00F1 ISOlat1 */ }, + { 242, "ograve",/* latin small letter o with grave, U+00F2 ISOlat1 */ }, + { 243, "oacute",/* latin small letter o with acute, U+00F3 ISOlat1 */ }, + { 244, "ocirc",/* latin small letter o with circumflex, U+00F4 ISOlat1 */ }, + { 245, "otilde",/* latin small letter o with tilde, U+00F5 ISOlat1 */ }, + { 246, "ouml", /* latin small letter o with diaeresis, U+00F6 ISOlat1 */ }, + { 247, "divide",/* division sign, U+00F7 ISOnum */ }, + { 248, "oslash",/* latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1 */ }, + { 249, "ugrave",/* latin small letter u with grave, U+00F9 ISOlat1 */ }, + { 250, "uacute",/* latin small letter u with acute, U+00FA ISOlat1 */ }, + { 251, "ucirc",/* latin small letter u with circumflex, U+00FB ISOlat1 */ }, + { 252, "uuml", /* latin small letter u with diaeresis, U+00FC ISOlat1 */ }, + { 253, "yacute",/* latin small letter y with acute, U+00FD ISOlat1 */ }, + { 254, "thorn",/* latin small letter thorn with, U+00FE ISOlat1 */ }, + { 255, "yuml", /* latin small letter y with diaeresis, U+00FF ISOlat1 */ }, + +/* + * Anything below should really be kept as entities references + */ + { 402, "fnof", /* latin small f with hook = function = florin, U+0192 ISOtech */ }, + + { 913, "Alpha",/* greek capital letter alpha, U+0391 */ }, + { 914, "Beta", /* greek capital letter beta, U+0392 */ }, + { 915, 
"Gamma",/* greek capital letter gamma, U+0393 ISOgrk3 */ }, + { 916, "Delta",/* greek capital letter delta, U+0394 ISOgrk3 */ }, + { 917, "Epsilon",/* greek capital letter epsilon, U+0395 */ }, + { 918, "Zeta", /* greek capital letter zeta, U+0396 */ }, + { 919, "Eta", /* greek capital letter eta, U+0397 */ }, + { 920, "Theta",/* greek capital letter theta, U+0398 ISOgrk3 */ }, + { 921, "Iota", /* greek capital letter iota, U+0399 */ }, + { 922, "Kappa",/* greek capital letter kappa, U+039A */ }, + { 923, "Lambda"/* greek capital letter lambda, U+039B ISOgrk3 */ }, + { 924, "Mu", /* greek capital letter mu, U+039C */ }, + { 925, "Nu", /* greek capital letter nu, U+039D */ }, + { 926, "Xi", /* greek capital letter xi, U+039E ISOgrk3 */ }, + { 927, "Omicron",/* greek capital letter omicron, U+039F */ }, + { 928, "Pi", /* greek capital letter pi, U+03A0 ISOgrk3 */ }, + { 929, "Rho", /* greek capital letter rho, U+03A1 */ }, + { 931, "Sigma",/* greek capital letter sigma, U+03A3 ISOgrk3 */ }, + { 932, "Tau", /* greek capital letter tau, U+03A4 */ }, + { 933, "Upsilon",/* greek capital letter upsilon, U+03A5 ISOgrk3 */ }, + { 934, "Phi", /* greek capital letter phi, U+03A6 ISOgrk3 */ }, + { 935, "Chi", /* greek capital letter chi, U+03A7 */ }, + { 936, "Psi", /* greek capital letter psi, U+03A8 ISOgrk3 */ }, + { 937, "Omega",/* greek capital letter omega, U+03A9 ISOgrk3 */ }, + + { 945, "alpha",/* greek small letter alpha, U+03B1 ISOgrk3 */ }, + { 946, "beta", /* greek small letter beta, U+03B2 ISOgrk3 */ }, + { 947, "gamma",/* greek small letter gamma, U+03B3 ISOgrk3 */ }, + { 948, "delta",/* greek small letter delta, U+03B4 ISOgrk3 */ }, + { 949, "epsilon",/* greek small letter epsilon, U+03B5 ISOgrk3 */ }, + { 950, "zeta", /* greek small letter zeta, U+03B6 ISOgrk3 */ }, + { 951, "eta", /* greek small letter eta, U+03B7 ISOgrk3 */ }, + { 952, "theta",/* greek small letter theta, U+03B8 ISOgrk3 */ }, + { 953, "iota", /* greek small letter iota, U+03B9 ISOgrk3 */ }, + 
{ 954, "kappa",/* greek small letter kappa, U+03BA ISOgrk3 */ }, + { 955, "lambda",/* greek small letter lambda, U+03BB ISOgrk3 */ }, + { 956, "mu", /* greek small letter mu, U+03BC ISOgrk3 */ }, + { 957, "nu", /* greek small letter nu, U+03BD ISOgrk3 */ }, + { 958, "xi", /* greek small letter xi, U+03BE ISOgrk3 */ }, + { 959, "omicron",/* greek small letter omicron, U+03BF NEW */ }, + { 960, "pi", /* greek small letter pi, U+03C0 ISOgrk3 */ }, + { 961, "rho", /* greek small letter rho, U+03C1 ISOgrk3 */ }, + { 962, "sigmaf",/* greek small letter final sigma, U+03C2 ISOgrk3 */ }, + { 963, "sigma",/* greek small letter sigma, U+03C3 ISOgrk3 */ }, + { 964, "tau", /* greek small letter tau, U+03C4 ISOgrk3 */ }, + { 965, "upsilon",/* greek small letter upsilon, U+03C5 ISOgrk3 */ }, + { 966, "phi", /* greek small letter phi, U+03C6 ISOgrk3 */ }, + { 967, "chi", /* greek small letter chi, U+03C7 ISOgrk3 */ }, + { 968, "psi", /* greek small letter psi, U+03C8 ISOgrk3 */ }, + { 969, "omega",/* greek small letter omega, U+03C9 ISOgrk3 */ }, + { 977, "thetasym",/* greek small letter theta symbol, U+03D1 NEW */ }, + { 978, "upsih",/* greek upsilon with hook symbol, U+03D2 NEW */ }, + { 982, "piv", /* greek pi symbol, U+03D6 ISOgrk3 */ }, + + { 8226, "bull", /* bullet = black small circle, U+2022 ISOpub */ }, + { 8230, "hellip",/* horizontal ellipsis = three dot leader, U+2026 ISOpub */ }, + { 8242, "prime",/* prime = minutes = feet, U+2032 ISOtech */ }, + { 8243, "Prime",/* double prime = seconds = inches, U+2033 ISOtech */ }, + { 8254, "oline",/* overline = spacing overscore, U+203E NEW */ }, + { 8260, "frasl",/* fraction slash, U+2044 NEW */ }, + + { 8472, "weierp",/* script capital P = power set = Weierstrass p, U+2118 ISOamso */ }, + { 8465, "image",/* blackletter capital I = imaginary part, U+2111 ISOamso */ }, + { 8476, "real", /* blackletter capital R = real part symbol, U+211C ISOamso */ }, + { 8482, "trade",/* trade mark sign, U+2122 ISOnum */ }, + { 8501, 
"alefsym",/* alef symbol = first transfinite cardinal, U+2135 NEW */ }, + { 8592, "larr", /* leftwards arrow, U+2190 ISOnum */ }, + { 8593, "uarr", /* upwards arrow, U+2191 ISOnum */ }, + { 8594, "rarr", /* rightwards arrow, U+2192 ISOnum */ }, + { 8595, "darr", /* downwards arrow, U+2193 ISOnum */ }, + { 8596, "harr", /* left right arrow, U+2194 ISOamsa */ }, + { 8629, "crarr",/* downwards arrow with corner leftwards = carriage return, U+21B5 NEW */ }, + { 8656, "lArr", /* leftwards double arrow, U+21D0 ISOtech */ }, + { 8657, "uArr", /* upwards double arrow, U+21D1 ISOamsa */ }, + { 8658, "rArr", /* rightwards double arrow, U+21D2 ISOtech */ }, + { 8659, "dArr", /* downwards double arrow, U+21D3 ISOamsa */ }, + { 8660, "hArr", /* left right double arrow, U+21D4 ISOamsa */ }, + + + { 8704, "forall",/* for all, U+2200 ISOtech */ }, + { 8706, "part", /* partial differential, U+2202 ISOtech */ }, + { 8707, "exist",/* there exists, U+2203 ISOtech */ }, + { 8709, "empty",/* empty set = null set = diameter, U+2205 ISOamso */ }, + { 8711, "nabla",/* nabla = backward difference, U+2207 ISOtech */ }, + { 8712, "isin", /* element of, U+2208 ISOtech */ }, + { 8713, "notin",/* not an element of, U+2209 ISOtech */ }, + { 8715, "ni", /* contains as member, U+220B ISOtech */ }, + { 8719, "prod", /* n-ary product = product sign, U+220F ISOamsb */ }, + { 8721, "sum", /* n-ary sumation, U+2211 ISOamsb */ }, + { 8722, "minus",/* minus sign, U+2212 ISOtech */ }, + { 8727, "lowast",/* asterisk operator, U+2217 ISOtech */ }, + { 8730, "radic",/* square root = radical sign, U+221A ISOtech */ }, + { 8733, "prop", /* proportional to, U+221D ISOtech */ }, + { 8734, "infin",/* infinity, U+221E ISOtech */ }, + { 8736, "ang", /* angle, U+2220 ISOamso */ }, + { 8743, "and", /* logical and = wedge, U+2227 ISOtech */ }, + { 8744, "or", /* logical or = vee, U+2228 ISOtech */ }, + { 8745, "cap", /* intersection = cap, U+2229 ISOtech */ }, + { 8746, "cup", /* union = cup, U+222A ISOtech */ }, + { 
8747, "int", /* integral, U+222B ISOtech */ }, + { 8756, "there4",/* therefore, U+2234 ISOtech */ }, + { 8764, "sim", /* tilde operator = varies with = similar to, U+223C ISOtech */ }, + { 8773, "cong", /* approximately equal to, U+2245 ISOtech */ }, + { 8776, "asymp",/* almost equal to = asymptotic to, U+2248 ISOamsr */ }, + { 8800, "ne", /* not equal to, U+2260 ISOtech */ }, + { 8801, "equiv",/* identical to, U+2261 ISOtech */ }, + { 8804, "le", /* less-than or equal to, U+2264 ISOtech */ }, + { 8805, "ge", /* greater-than or equal to, U+2265 ISOtech */ }, + { 8834, "sub", /* subset of, U+2282 ISOtech */ }, + { 8835, "sup", /* superset of, U+2283 ISOtech */ }, + { 8836, "nsub", /* not a subset of, U+2284 ISOamsn */ }, + { 8838, "sube", /* subset of or equal to, U+2286 ISOtech */ }, + { 8839, "supe", /* superset of or equal to, U+2287 ISOtech */ }, + { 8853, "oplus",/* circled plus = direct sum, U+2295 ISOamsb */ }, + { 8855, "otimes",/* circled times = vector product, U+2297 ISOamsb */ }, + { 8869, "perp", /* up tack = orthogonal to = perpendicular, U+22A5 ISOtech */ }, + { 8901, "sdot", /* dot operator, U+22C5 ISOamsb */ }, + { 8968, "lceil",/* left ceiling = apl upstile, U+2308 ISOamsc */ }, + { 8969, "rceil",/* right ceiling, U+2309 ISOamsc */ }, + { 8970, "lfloor",/* left floor = apl downstile, U+230A ISOamsc */ }, + { 8971, "rfloor",/* right floor, U+230B ISOamsc */ }, + { 9001, "lang", /* left-pointing angle bracket = bra, U+2329 ISOtech */ }, + { 9002, "rang", /* right-pointing angle bracket = ket, U+232A ISOtech */ }, + { 9674, "loz", /* lozenge, U+25CA ISOpub */ }, + + { 9824, "spades",/* black spade suit, U+2660 ISOpub */ }, + { 9827, "clubs",/* black club suit = shamrock, U+2663 ISOpub */ }, + { 9829, "hearts",/* black heart suit = valentine, U+2665 ISOpub */ }, + { 9830, "diams",/* black diamond suit, U+2666 ISOpub */ }, + + { 338, "OElig",/* latin capital ligature OE, U+0152 ISOlat2 */ }, + { 339, "oelig",/* latin small ligature oe, U+0153 ISOlat2 */ 
}, + { 352, "Scaron",/* latin capital letter S with caron, U+0160 ISOlat2 */ }, + { 353, "scaron",/* latin small letter s with caron, U+0161 ISOlat2 */ }, + { 376, "Yuml", /* latin capital letter Y with diaeresis, U+0178 ISOlat2 */ }, + { 710, "circ", /* modifier letter circumflex accent, U+02C6 ISOpub */ }, + { 732, "tilde",/* small tilde, U+02DC ISOdia */ }, + + { 8194, "ensp", /* en space, U+2002 ISOpub */ }, + { 8195, "emsp", /* em space, U+2003 ISOpub */ }, + { 8201, "thinsp",/* thin space, U+2009 ISOpub */ }, + { 8204, "zwnj", /* zero width non-joiner, U+200C NEW RFC 2070 */ }, + { 8205, "zwj", /* zero width joiner, U+200D NEW RFC 2070 */ }, + { 8206, "lrm", /* left-to-right mark, U+200E NEW RFC 2070 */ }, + { 8207, "rlm", /* right-to-left mark, U+200F NEW RFC 2070 */ }, + { 8211, "ndash",/* en dash, U+2013 ISOpub */ }, + { 8212, "mdash",/* em dash, U+2014 ISOpub */ }, + { 8216, "lsquo",/* left single quotation mark, U+2018 ISOnum */ }, + { 8217, "rsquo",/* right single quotation mark, U+2019 ISOnum */ }, + { 8218, "sbquo",/* single low-9 quotation mark, U+201A NEW */ }, + { 8220, "ldquo",/* left double quotation mark, U+201C ISOnum */ }, + { 8221, "rdquo",/* right double quotation mark, U+201D ISOnum */ }, + { 8222, "bdquo",/* double low-9 quotation mark, U+201E NEW */ }, + { 8224, "dagger",/* dagger, U+2020 ISOpub */ }, + { 8225, "Dagger",/* double dagger, U+2021 ISOpub */ }, + { 8240, "permil",/* per mille sign, U+2030 ISOtech */ }, + { 8249, "lsaquo",/* single left-pointing angle quotation mark, U+2039 ISO proposed */ }, + { 8250, "rsaquo",/* single right-pointing angle quotation mark, U+203A ISO proposed */ }, + { 8364, "euro", /* euro sign, U+20AC NEW */ } +}; + +static GHashTable *entities; + +/* this cannot be called in a thread context */ +static void tokenise_setup(void) +{ + int i; + + if (entities == NULL) { + entities = g_hash_table_new(g_str_hash, g_str_equal); + for (i=0;i<sizeof(entity_map)/sizeof(entity_map[0]);i++) { + 
g_hash_table_insert(entities, (char *)entity_map[i].name, (void *)entity_map[i].val); + } + } +} + +static CamelHTMLParserPrivate *tokenise_init(void) +{ + CamelHTMLParserPrivate *p; + + p = g_malloc(sizeof(*p)); + p->state = CAMEL_HTML_PARSER_DATA; + + p->attr = 0; + p->attrs = g_ptr_array_new(); + p->values = g_ptr_array_new(); + p->tag = g_string_new(""); + p->ent = g_string_new(""); + p->charset = NULL; + + if (entities == NULL) + tokenise_setup(); + + return p; +} + +static void tokenise_free(CamelHTMLParserPrivate *p) +{ + int i; + + g_string_free(p->tag, TRUE); + g_string_free(p->ent, TRUE); + g_free(p->charset); + + for (i=0;i<p->attrs->len;i++) + g_string_free(p->attrs->pdata[i], TRUE); + + for (i=0;i<p->values->len;i++) + g_string_free(p->values->pdata[i], TRUE); + + g_free(p); +} + +static int convert_entity(const char *e, char *ent) +{ + unsigned int val; + + if (e[0] == '#') + return g_unichar_to_utf8(atoi(e+1), ent); + + val = (unsigned int)g_hash_table_lookup(entities, e); + if (ent) + return g_unichar_to_utf8(val, ent); + else + return 0; +} + +#if 0 +static void dump_tag(CamelHTMLParserPrivate *p) +{ + int i; + + printf("got tag: %s\n", p->tag->str); + printf("%d attributes:\n", p->attr); + for (i=0;i<p->attr;i++) { + printf(" %s = '%s'\n", ((GString *)p->attrs->pdata[i])->str, ((GString *)p->values->pdata[i])->str); + } +} +#endif + +static int tokenise_step(CamelHTMLParserPrivate *p, char **datap, int *lenp) +{ + char *in = p->inptr; + char *inend = p->inend; + char c; + int state = p->state, ret, len; + char *start = p->inptr; + + d(printf("Tokenise step\n")); + + while (in < inend) { + c = *in++; + switch (state) { + case CAMEL_HTML_PARSER_DATA: + if (c == '<') { + ret = state; + state = CAMEL_HTML_PARSER_TAG; + p->attr = 0; + g_string_truncate(p->tag, 0); + d(printf("got data '%.*s'\n", in-start-1, start)); + *datap = start; + *lenp = in-start-1; + goto done; + } else if (c=='&') { + ret = state; + state = CAMEL_HTML_PARSER_ENT; + 
g_string_truncate(p->ent, 0); + g_string_append_c(p->ent, c); + d(printf("got data '%.*s'\n", in-start-1, start)); + *datap = start; + *lenp = in-start-1; + goto done; + } + break; + case CAMEL_HTML_PARSER_ENT: + if (c==';') { + len = convert_entity(p->ent->str+1, p->ent_utf8); + if (len == 0) { + /* handle broken entity */ + g_string_append_c(p->ent, c); + ret = state = CAMEL_HTML_PARSER_DATA; + *datap = p->ent->str; + *lenp = p->ent->len; + goto done; + } else { + d(printf("got entity: %s = %s\n", p->ent->str, p->ent_utf8)); + ret = state; + state = CAMEL_HTML_PARSER_DATA; + *datap = p->ent_utf8; + *lenp = len; + goto done; + } + } else if (isalnum(c) || c=='#') { /* FIXME: right type */ + g_string_append_c(p->ent, c); + } else { + /* handle broken entity */ + g_string_append_c(p->ent, c); + ret = state = CAMEL_HTML_PARSER_DATA; + *datap = p->ent->str; + *lenp = p->ent->len; + goto done; + } + break; + case CAMEL_HTML_PARSER_TAG: + if (c == '!') { + state = CAMEL_HTML_PARSER_COMMENT0; + g_string_append_c(p->tag, c); + } else if (c == '>') { + d(dump_tag(p)); + ret = CAMEL_HTML_PARSER_ELEMENT; + state = CAMEL_HTML_PARSER_DATA; + goto done; + } else if (c == ' ' || c=='\n' || c=='\t') { + state = CAMEL_HTML_PARSER_ATTR0; + } else { + g_string_append_c(p->tag, c); + } + break; + /* check for <!-- */ + case CAMEL_HTML_PARSER_COMMENT0: + if (c == '-') { + g_string_append_c(p->tag, c); + if (p->tag->len == 3) { + g_string_truncate(p->tag, 0); + state = CAMEL_HTML_PARSER_COMMENT; + } + } else { + /* got something else, probbly dtd entity */ + state = CAMEL_HTML_PARSER_DTDENT; + } + break; + case CAMEL_HTML_PARSER_DTDENT: + if (c == '>') { + ret = CAMEL_HTML_PARSER_DTDENT; + state = CAMEL_HTML_PARSER_DATA; + *datap = start; + *lenp = in-start-1; + goto done; + } + break; + case CAMEL_HTML_PARSER_COMMENT: + if (c == '>' && p->tag->len == 2) { + ret = CAMEL_HTML_PARSER_COMMENT; + state = CAMEL_HTML_PARSER_DATA; + *datap = start; + *lenp = in-start-1; + goto done; + } else 
if (c=='-') { + /* we dont care if we get 'n' --'s before the > */ + if (p->tag->len < 2) + g_string_append_c(p->tag, c); + } else { + g_string_truncate(p->tag, 0); + } + break; + case CAMEL_HTML_PARSER_ATTR0: /* pre-attribute whitespace */ + if (c == '>') { + d(dump_tag(p)); + ret = CAMEL_HTML_PARSER_ELEMENT; + state = CAMEL_HTML_PARSER_DATA; + goto done; + } else if (c == ' ' || c=='\n' || c=='\t') { + } else { + if (p->attrs->len <= p->attr) { + g_ptr_array_add(p->attrs, g_string_new("")); + g_ptr_array_add(p->values, g_string_new("")); + } else { + g_string_truncate(p->attrs->pdata[p->attr], 0); + g_string_truncate(p->values->pdata[p->attr], 0); + } + g_string_append_c(p->attrs->pdata[p->attr], c); + state = CAMEL_HTML_PARSER_ATTR; + } + break; + case CAMEL_HTML_PARSER_ATTR: + if (c == '>') { + d(dump_tag(p)); + ret = CAMEL_HTML_PARSER_ELEMENT; + state = CAMEL_HTML_PARSER_DATA; + goto done; + } else if (c == '=') { + state = CAMEL_HTML_PARSER_VAL0; + } else if (c == ' ' || c=='\n' || c=='\t') { + state = CAMEL_HTML_PARSER_ATTR0; + p->attr++; + } else { + g_string_append_c(p->attrs->pdata[p->attr], c); + } + break; + case CAMEL_HTML_PARSER_VAL0: + if (c == '>') { + d(printf("value truncated\n")); + d(dump_tag(p)); + ret = CAMEL_HTML_PARSER_ELEMENT; + state = CAMEL_HTML_PARSER_DATA; + goto done; + } else if (c == '\'' || c == '\"') { + p->quote = c; + state = CAMEL_HTML_PARSER_VAL; + } else if (c == ' ' || c=='\n' || c=='\t') { + } else { + g_string_append_c(p->values->pdata[p->attr], c); + p->quote = 0; + state = CAMEL_HTML_PARSER_VAL; + } + break; + case CAMEL_HTML_PARSER_VAL: + do_val: + if (p->quote) { + if (c == '>') { + d(printf("value truncated\n")); + d(dump_tag(p)); + ret = CAMEL_HTML_PARSER_ELEMENT; + state = CAMEL_HTML_PARSER_DATA; + p->attr++; + goto done; + } else if (c == p->quote) { + state = CAMEL_HTML_PARSER_ATTR0; + p->attr++; + } else if (c=='&') { + state = CAMEL_HTML_PARSER_VAL_ENT; + g_string_truncate(p->ent, 0); + } else { + 
g_string_append_c(p->values->pdata[p->attr], c); + } + } else if (c == '>') { + d(dump_tag(p)); + ret = CAMEL_HTML_PARSER_ELEMENT; + state = CAMEL_HTML_PARSER_DATA; + p->attr++; + goto done; + } else if (c == ' ' || c=='\n' || c=='\t') { + state = CAMEL_HTML_PARSER_ATTR0; + p->attr++; + } else if (c=='&') { + state = CAMEL_HTML_PARSER_VAL_ENT; + g_string_truncate(p->ent, 0); + } else { + g_string_append_c(p->values->pdata[p->attr], c); + } + break; + case CAMEL_HTML_PARSER_VAL_ENT: + if (c==';') { + state = CAMEL_HTML_PARSER_VAL; + len = convert_entity(p->ent->str+1, p->ent_utf8); + if (len == 0) { + /* fallback; broken entity, just output it and see why we ended */ + g_string_append(p->values->pdata[p->attr], p->ent->str); + g_string_append_c(p->values->pdata[p->attr], ';'); + } else { + d(printf("got entity: %s = %s\n", p->ent->str, p->ent_utf8)); + g_string_append(p->values->pdata[p->attr], p->ent_utf8); + } + } else if (isalnum(c) || c=='#') { /* FIXME: right type */ + g_string_append_c(p->ent, c); + } else { + /* fallback; broken entity, just output it and see why we ended */ + g_string_append(p->values->pdata[p->attr], p->ent->str); + goto do_val; + } + break; + } + } + + if (p->eof) { + /* FIXME: what about other truncated states? 
*/ + switch (state) { + case CAMEL_HTML_PARSER_DATA: + case CAMEL_HTML_PARSER_COMMENT: + if (in > start) { + ret = state; + *datap = start; + *lenp = in-start-1; + } else { + ret = CAMEL_HTML_PARSER_EOF; + state = CAMEL_HTML_PARSER_EOF; + } + break; + default: + ret = CAMEL_HTML_PARSER_EOF; + state = CAMEL_HTML_PARSER_EOF; + } + } else { + /* we only care about remaining data for this buffer, everything else has its own copy */ + switch (state) { + case CAMEL_HTML_PARSER_DATA: + case CAMEL_HTML_PARSER_COMMENT: + if (in > start) { + ret = state; + *datap = start; + *lenp = in-start-1; + } else { + ret = CAMEL_HTML_PARSER_EOD; + } + break; + default: + ret = CAMEL_HTML_PARSER_EOD; + } + } + +done: + p->start = start; + p->state = state; + p->inptr = in; + + return ret; +} diff --git a/camel/camel-html-parser.h b/camel/camel-html-parser.h new file mode 100644 index 0000000000..41ac2ac9ec --- /dev/null +++ b/camel/camel-html-parser.h @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2001 Ximian Inc. + * + * Authors: Michael Zucchi <notzed@ximian.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public License + * as published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +/** WARNING + ** + ** DO NOT USE THIS CODE OUTSIDE OF CAMEL + ** + ** IT IS SUBJECT TO CHANGE OR MAY VANISH AT ANY TIME + **/ + +#ifndef _CAMEL_HTML_PARSER_H +#define _CAMEL_HTML_PARSER_H + +#include <camel/camel-object.h> + +#define CAMEL_HTML_PARSER(obj) CAMEL_CHECK_CAST (obj, camel_html_parser_get_type (), CamelHTMLParser) +#define CAMEL_HTML_PARSER_CLASS(klass) CAMEL_CHECK_CLASS_CAST (klass, camel_html_parser_get_type (), CamelHTMLParserClass) +#define CAMEL_IS_HTML_PARSER(obj) CAMEL_CHECK_TYPE (obj, camel_html_parser_get_type ()) + +typedef struct _CamelHTMLParserClass CamelHTMLParserClass; +typedef struct _CamelHTMLParser CamelHTMLParser; + +/* Parser/tokeniser states */ +typedef enum _camel_html_parser_t { + CAMEL_HTML_PARSER_DATA, /* raw data */ + CAMEL_HTML_PARSER_ENT, /* entity in data */ + CAMEL_HTML_PARSER_ELEMENT, /* element (tag + attributes scanned) */ + CAMEL_HTML_PARSER_TAG, /* tag */ + CAMEL_HTML_PARSER_DTDENT, /* dtd entity? <! blah blah > */ + CAMEL_HTML_PARSER_COMMENT0, /* start of comment */ + CAMEL_HTML_PARSER_COMMENT, /* body of comment */ + CAMEL_HTML_PARSER_ATTR0, /* start of attribute */ + CAMEL_HTML_PARSER_ATTR, /* attribute */ + CAMEL_HTML_PARSER_VAL0, /* start of value */ + CAMEL_HTML_PARSER_VAL, /* value */ + CAMEL_HTML_PARSER_VAL_ENT, /* entity in value */ + CAMEL_HTML_PARSER_EOD, /* end of current data */ + CAMEL_HTML_PARSER_EOF, /* end of file */ +} camel_html_parser_t; + +struct _CamelHTMLParser { + CamelObject parent; + + struct _CamelHTMLParserPrivate *priv; +}; + +struct _CamelHTMLParserClass { + CamelObjectClass parent_class; +}; + +guint camel_html_parser_get_type (void); +CamelHTMLParser *camel_html_parser_new (void); + +void camel_html_parser_set_data(CamelHTMLParser *hp, const char *start, int len, int last); +camel_html_parser_t camel_html_parser_step(CamelHTMLParser *hp, const char **datap, int *lenp); +const char *camel_html_parser_left(CamelHTMLParser *hp, int *lenp); +const char 
*camel_html_parser_tag(CamelHTMLParser *hp); +const char *camel_html_parser_attr(CamelHTMLParser *hp, const char *name); +const GPtrArray *camel_html_parser_attr_list(CamelHTMLParser *hp, const GPtrArray **values); + +#endif /* ! _CAMEL_HTML_PARSER_H */ diff --git a/camel/camel-mime-filter-html.c b/camel/camel-mime-filter-html.c index 548e8fdaba..311b85a652 100644 --- a/camel/camel-mime-filter-html.c +++ b/camel/camel-mime-filter-html.c @@ -18,8 +18,6 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include "camel-mime-filter-html.h" - #include <stdio.h> #include <string.h> #include <stdarg.h> @@ -31,6 +29,9 @@ #include <unistd.h> #include <stdlib.h> +#include "camel-mime-filter-html.h" +#include "camel-html-parser.h" + #define d(x) static void camel_mime_filter_html_class_init (CamelMimeFilterHTMLClass *klass); @@ -39,52 +40,8 @@ static void camel_mime_filter_html_finalize (CamelObject *o); static CamelMimeFilterClass *camel_mime_filter_html_parent; -/* Parser definitions, see below object code for details */ -enum _parser_t { - H_DATA, /* raw data */ - H_ENT, /* entity in data */ - H_ELEMENT, /* element (tag + attributes scanned) */ - H_TAG, /* tag */ - H_DTDENT, /* dtd entity? <! 
blah blah > */ - H_COMMENT0, /* start of comment */ - H_COMMENT, /* body of comment */ - H_ATTR0, /* start of attribute */ - H_ATTR, /* attribute */ - H_VAL0, /* start of value */ - H_VAL, /* value */ - H_VAL_ENT, /* entity in value */ - H_EOD, /* end of current data */ - H_EOF, /* end of file */ -}; - -struct _parser { - char *inbuf, - *inptr, - *inend, - *start; - enum _parser_t state; - char *charset; - int eof; - GString *tag; - GString *ent; - char ent_utf8[8]; - int attr; - GPtrArray *attrs; - GPtrArray *values; - int quote; -}; - -static void tokenise_setup(void); -static struct _parser *tokenise_init(void); -static void tokenise_free(struct _parser *p); -static int tokenise_step(struct _parser *p, char **datap, int *lenp); -static const char *tokenise_data(struct _parser *p, int *len); -static const char *tokenise_tag(struct _parser *p); -static const char *tokenise_left(struct _parser *p, int *len); -static void tokenise_set_data(struct _parser *p, char *start, int length, int last); - struct _CamelMimeFilterHTMLPrivate { - struct _parser *ctxt; + CamelHTMLParser *ctxt; }; /* ********************************************************************** */ @@ -140,7 +97,8 @@ camel_mime_filter_html_finalize(CamelObject *o) { CamelMimeFilterHTML *f = (CamelMimeFilterHTML *)o; - tokenise_free(f->priv->ctxt); + camel_object_unref((CamelObject *)f->priv->ctxt); + g_free(f->priv); } static void @@ -149,13 +107,13 @@ camel_mime_filter_html_init (CamelObject *o) CamelMimeFilterHTML *f = (CamelMimeFilterHTML *)o; f->priv = g_malloc0(sizeof(*f->priv)); - f->priv->ctxt = tokenise_init(); + f->priv->ctxt = camel_html_parser_new(); } static void run(CamelMimeFilter *mf, char *in, size_t len, size_t prespace, char **out, size_t *outlenptr, size_t *outprespace, int last) { - int state; + camel_html_parser_t state; char *outp; CamelMimeFilterHTML *f = (CamelMimeFilterHTML *)mf; @@ -165,27 +123,27 @@ run(CamelMimeFilter *mf, char *in, size_t len, size_t prespace, char **out, size 
camel_mime_filter_set_size(mf, len*2+256, FALSE); outp = mf->outbuf; - tokenise_set_data(f->priv->ctxt, in, len, last); + camel_html_parser_set_data(f->priv->ctxt, in, len, last); do { - char *data; + const char *data; int len; - state = tokenise_step(f->priv->ctxt, &data, &len); + state = camel_html_parser_step(f->priv->ctxt, &data, &len); switch(state) { - case H_DATA: - case H_ENT: + case CAMEL_HTML_PARSER_DATA: + case CAMEL_HTML_PARSER_ENT: memcpy(outp, data, len); outp += len; break; - case H_ELEMENT: + case CAMEL_HTML_PARSER_ELEMENT: /* FIXME: do some whitespace processing here */ break; default: /* ignore everything else */ break; } - } while (state != H_EOF && state != H_EOD); + } while (state != CAMEL_HTML_PARSER_EOF && state != CAMEL_HTML_PARSER_EOD); *out = mf->outbuf; *outlenptr = outp - mf->outbuf; @@ -211,8 +169,8 @@ reset(CamelMimeFilter *mf) { CamelMimeFilterHTML *f = (CamelMimeFilterHTML *)mf; - tokenise_free(f->priv->ctxt); - f->priv->ctxt = tokenise_init(); + camel_object_unref((CamelObject *)f->priv->ctxt); + f->priv->ctxt = camel_html_parser_new(); } static void @@ -225,8 +183,6 @@ camel_mime_filter_html_class_init (CamelMimeFilterHTMLClass *klass) filter_class->reset = reset; filter_class->filter = filter; filter_class->complete = complete; - - tokenise_setup(); } /** @@ -242,746 +198,3 @@ camel_mime_filter_html_new (void) CamelMimeFilterHTML *new = CAMEL_MIME_FILTER_HTML ( camel_object_new (camel_mime_filter_html_get_type ())); return new; } - - -/* - What follows is a simple, high-speed html parser. - - It is not complete, but should be complete enough for its intended purpose. 
-*/ - -#include <gal/unicode/gunicode.h> -#include <ctype.h> - -/* this map taken out of libxml */ -static struct { - unsigned int val; - const char *name; -} entity_map[] = { -/* - * the 4 absolute ones, - */ - { 34, "quot", /* quotation mark = APL quote, U+0022 ISOnum */ }, - { 38, "amp", /* ampersand, U+0026 ISOnum */ }, - { 60, "lt", /* less-than sign, U+003C ISOnum */ }, - { 62, "gt", /* greater-than sign, U+003E ISOnum */ }, - -/* - * A bunch still in the 128-255 range - * Replacing them depend really on the charset used. - */ - { 39, "apos", /* single quote */ }, - { 160, "nbsp", /* no-break space = non-breaking space, U+00A0 ISOnum */ }, - { 161, "iexcl",/* inverted exclamation mark, U+00A1 ISOnum */ }, - { 162, "cent", /* cent sign, U+00A2 ISOnum */ }, - { 163, "pound",/* pound sign, U+00A3 ISOnum */ }, - { 164, "curren",/* currency sign, U+00A4 ISOnum */ }, - { 165, "yen", /* yen sign = yuan sign, U+00A5 ISOnum */ }, - { 166, "brvbar",/* broken bar = broken vertical bar, U+00A6 ISOnum */ }, - { 167, "sect", /* section sign, U+00A7 ISOnum */ }, - { 168, "uml", /* diaeresis = spacing diaeresis, U+00A8 ISOdia */ }, - { 169, "copy", /* copyright sign, U+00A9 ISOnum */ }, - { 170, "ordf", /* feminine ordinal indicator, U+00AA ISOnum */ }, - { 171, "laquo",/* left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum */ }, - { 172, "not", /* not sign, U+00AC ISOnum */ }, - { 173, "shy", /* soft hyphen = discretionary hyphen, U+00AD ISOnum */ }, - { 174, "reg", /* registered sign = registered trade mark sign, U+00AE ISOnum */ }, - { 175, "macr", /* macron = spacing macron = overline = APL overbar, U+00AF ISOdia */ }, - { 176, "deg", /* degree sign, U+00B0 ISOnum */ }, - { 177, "plusmn",/* plus-minus sign = plus-or-minus sign, U+00B1 ISOnum */ }, - { 178, "sup2", /* superscript two = superscript digit two = squared, U+00B2 ISOnum */ }, - { 179, "sup3", /* superscript three = superscript digit three = cubed, U+00B3 ISOnum */ }, - { 180, 
"acute",/* acute accent = spacing acute, U+00B4 ISOdia */ }, - { 181, "micro",/* micro sign, U+00B5 ISOnum */ }, - { 182, "para", /* pilcrow sign = paragraph sign, U+00B6 ISOnum */ }, - { 183, "middot",/* middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum */ }, - { 184, "cedil",/* cedilla = spacing cedilla, U+00B8 ISOdia */ }, - { 185, "sup1", /* superscript one = superscript digit one, U+00B9 ISOnum */ }, - { 186, "ordm", /* masculine ordinal indicator, U+00BA ISOnum */ }, - { 187, "raquo",/* right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum */ }, - { 188, "frac14",/* vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum */ }, - { 189, "frac12",/* vulgar fraction one half = fraction one half, U+00BD ISOnum */ }, - { 190, "frac34",/* vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum */ }, - { 191, "iquest",/* inverted question mark = turned question mark, U+00BF ISOnum */ }, - { 192, "Agrave",/* latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1 */ }, - { 193, "Aacute",/* latin capital letter A with acute, U+00C1 ISOlat1 */ }, - { 194, "Acirc",/* latin capital letter A with circumflex, U+00C2 ISOlat1 */ }, - { 195, "Atilde",/* latin capital letter A with tilde, U+00C3 ISOlat1 */ }, - { 196, "Auml", /* latin capital letter A with diaeresis, U+00C4 ISOlat1 */ }, - { 197, "Aring",/* latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1 */ }, - { 198, "AElig",/* latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1 */ }, - { 199, "Ccedil",/* latin capital letter C with cedilla, U+00C7 ISOlat1 */ }, - { 200, "Egrave",/* latin capital letter E with grave, U+00C8 ISOlat1 */ }, - { 201, "Eacute",/* latin capital letter E with acute, U+00C9 ISOlat1 */ }, - { 202, "Ecirc",/* latin capital letter E with circumflex, U+00CA ISOlat1 */ }, - { 203, "Euml", /* latin capital letter E with diaeresis, U+00CB ISOlat1 */ }, - { 204, 
"Igrave",/* latin capital letter I with grave, U+00CC ISOlat1 */ }, - { 205, "Iacute",/* latin capital letter I with acute, U+00CD ISOlat1 */ }, - { 206, "Icirc",/* latin capital letter I with circumflex, U+00CE ISOlat1 */ }, - { 207, "Iuml", /* latin capital letter I with diaeresis, U+00CF ISOlat1 */ }, - { 208, "ETH", /* latin capital letter ETH, U+00D0 ISOlat1 */ }, - { 209, "Ntilde",/* latin capital letter N with tilde, U+00D1 ISOlat1 */ }, - { 210, "Ograve",/* latin capital letter O with grave, U+00D2 ISOlat1 */ }, - { 211, "Oacute",/* latin capital letter O with acute, U+00D3 ISOlat1 */ }, - { 212, "Ocirc",/* latin capital letter O with circumflex, U+00D4 ISOlat1 */ }, - { 213, "Otilde",/* latin capital letter O with tilde, U+00D5 ISOlat1 */ }, - { 214, "Ouml", /* latin capital letter O with diaeresis, U+00D6 ISOlat1 */ }, - { 215, "times",/* multiplication sign, U+00D7 ISOnum */ }, - { 216, "Oslash",/* latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1 */ }, - { 217, "Ugrave",/* latin capital letter U with grave, U+00D9 ISOlat1 */ }, - { 218, "Uacute",/* latin capital letter U with acute, U+00DA ISOlat1 */ }, - { 219, "Ucirc",/* latin capital letter U with circumflex, U+00DB ISOlat1 */ }, - { 220, "Uuml", /* latin capital letter U with diaeresis, U+00DC ISOlat1 */ }, - { 221, "Yacute",/* latin capital letter Y with acute, U+00DD ISOlat1 */ }, - { 222, "THORN",/* latin capital letter THORN, U+00DE ISOlat1 */ }, - { 223, "szlig",/* latin small letter sharp s = ess-zed, U+00DF ISOlat1 */ }, - { 224, "agrave",/* latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1 */ }, - { 225, "aacute",/* latin small letter a with acute, U+00E1 ISOlat1 */ }, - { 226, "acirc",/* latin small letter a with circumflex, U+00E2 ISOlat1 */ }, - { 227, "atilde",/* latin small letter a with tilde, U+00E3 ISOlat1 */ }, - { 228, "auml", /* latin small letter a with diaeresis, U+00E4 ISOlat1 */ }, - { 229, "aring",/* latin small letter 
a with ring above = latin small letter a ring, U+00E5 ISOlat1 */ }, - { 230, "aelig",/* latin small letter ae = latin small ligature ae, U+00E6 ISOlat1 */ }, - { 231, "ccedil",/* latin small letter c with cedilla, U+00E7 ISOlat1 */ }, - { 232, "egrave",/* latin small letter e with grave, U+00E8 ISOlat1 */ }, - { 233, "eacute",/* latin small letter e with acute, U+00E9 ISOlat1 */ }, - { 234, "ecirc",/* latin small letter e with circumflex, U+00EA ISOlat1 */ }, - { 235, "euml", /* latin small letter e with diaeresis, U+00EB ISOlat1 */ }, - { 236, "igrave",/* latin small letter i with grave, U+00EC ISOlat1 */ }, - { 237, "iacute",/* latin small letter i with acute, U+00ED ISOlat1 */ }, - { 238, "icirc",/* latin small letter i with circumflex, U+00EE ISOlat1 */ }, - { 239, "iuml", /* latin small letter i with diaeresis, U+00EF ISOlat1 */ }, - { 240, "eth", /* latin small letter eth, U+00F0 ISOlat1 */ }, - { 241, "ntilde",/* latin small letter n with tilde, U+00F1 ISOlat1 */ }, - { 242, "ograve",/* latin small letter o with grave, U+00F2 ISOlat1 */ }, - { 243, "oacute",/* latin small letter o with acute, U+00F3 ISOlat1 */ }, - { 244, "ocirc",/* latin small letter o with circumflex, U+00F4 ISOlat1 */ }, - { 245, "otilde",/* latin small letter o with tilde, U+00F5 ISOlat1 */ }, - { 246, "ouml", /* latin small letter o with diaeresis, U+00F6 ISOlat1 */ }, - { 247, "divide",/* division sign, U+00F7 ISOnum */ }, - { 248, "oslash",/* latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1 */ }, - { 249, "ugrave",/* latin small letter u with grave, U+00F9 ISOlat1 */ }, - { 250, "uacute",/* latin small letter u with acute, U+00FA ISOlat1 */ }, - { 251, "ucirc",/* latin small letter u with circumflex, U+00FB ISOlat1 */ }, - { 252, "uuml", /* latin small letter u with diaeresis, U+00FC ISOlat1 */ }, - { 253, "yacute",/* latin small letter y with acute, U+00FD ISOlat1 */ }, - { 254, "thorn",/* latin small letter thorn with, U+00FE ISOlat1 */ }, - { 255, 
"yuml", /* latin small letter y with diaeresis, U+00FF ISOlat1 */ }, - -/* - * Anything below should really be kept as entities references - */ - { 402, "fnof", /* latin small f with hook = function = florin, U+0192 ISOtech */ }, - - { 913, "Alpha",/* greek capital letter alpha, U+0391 */ }, - { 914, "Beta", /* greek capital letter beta, U+0392 */ }, - { 915, "Gamma",/* greek capital letter gamma, U+0393 ISOgrk3 */ }, - { 916, "Delta",/* greek capital letter delta, U+0394 ISOgrk3 */ }, - { 917, "Epsilon",/* greek capital letter epsilon, U+0395 */ }, - { 918, "Zeta", /* greek capital letter zeta, U+0396 */ }, - { 919, "Eta", /* greek capital letter eta, U+0397 */ }, - { 920, "Theta",/* greek capital letter theta, U+0398 ISOgrk3 */ }, - { 921, "Iota", /* greek capital letter iota, U+0399 */ }, - { 922, "Kappa",/* greek capital letter kappa, U+039A */ }, - { 923, "Lambda"/* greek capital letter lambda, U+039B ISOgrk3 */ }, - { 924, "Mu", /* greek capital letter mu, U+039C */ }, - { 925, "Nu", /* greek capital letter nu, U+039D */ }, - { 926, "Xi", /* greek capital letter xi, U+039E ISOgrk3 */ }, - { 927, "Omicron",/* greek capital letter omicron, U+039F */ }, - { 928, "Pi", /* greek capital letter pi, U+03A0 ISOgrk3 */ }, - { 929, "Rho", /* greek capital letter rho, U+03A1 */ }, - { 931, "Sigma",/* greek capital letter sigma, U+03A3 ISOgrk3 */ }, - { 932, "Tau", /* greek capital letter tau, U+03A4 */ }, - { 933, "Upsilon",/* greek capital letter upsilon, U+03A5 ISOgrk3 */ }, - { 934, "Phi", /* greek capital letter phi, U+03A6 ISOgrk3 */ }, - { 935, "Chi", /* greek capital letter chi, U+03A7 */ }, - { 936, "Psi", /* greek capital letter psi, U+03A8 ISOgrk3 */ }, - { 937, "Omega",/* greek capital letter omega, U+03A9 ISOgrk3 */ }, - - { 945, "alpha",/* greek small letter alpha, U+03B1 ISOgrk3 */ }, - { 946, "beta", /* greek small letter beta, U+03B2 ISOgrk3 */ }, - { 947, "gamma",/* greek small letter gamma, U+03B3 ISOgrk3 */ }, - { 948, "delta",/* greek small letter 
delta, U+03B4 ISOgrk3 */ }, - { 949, "epsilon",/* greek small letter epsilon, U+03B5 ISOgrk3 */ }, - { 950, "zeta", /* greek small letter zeta, U+03B6 ISOgrk3 */ }, - { 951, "eta", /* greek small letter eta, U+03B7 ISOgrk3 */ }, - { 952, "theta",/* greek small letter theta, U+03B8 ISOgrk3 */ }, - { 953, "iota", /* greek small letter iota, U+03B9 ISOgrk3 */ }, - { 954, "kappa",/* greek small letter kappa, U+03BA ISOgrk3 */ }, - { 955, "lambda",/* greek small letter lambda, U+03BB ISOgrk3 */ }, - { 956, "mu", /* greek small letter mu, U+03BC ISOgrk3 */ }, - { 957, "nu", /* greek small letter nu, U+03BD ISOgrk3 */ }, - { 958, "xi", /* greek small letter xi, U+03BE ISOgrk3 */ }, - { 959, "omicron",/* greek small letter omicron, U+03BF NEW */ }, - { 960, "pi", /* greek small letter pi, U+03C0 ISOgrk3 */ }, - { 961, "rho", /* greek small letter rho, U+03C1 ISOgrk3 */ }, - { 962, "sigmaf",/* greek small letter final sigma, U+03C2 ISOgrk3 */ }, - { 963, "sigma",/* greek small letter sigma, U+03C3 ISOgrk3 */ }, - { 964, "tau", /* greek small letter tau, U+03C4 ISOgrk3 */ }, - { 965, "upsilon",/* greek small letter upsilon, U+03C5 ISOgrk3 */ }, - { 966, "phi", /* greek small letter phi, U+03C6 ISOgrk3 */ }, - { 967, "chi", /* greek small letter chi, U+03C7 ISOgrk3 */ }, - { 968, "psi", /* greek small letter psi, U+03C8 ISOgrk3 */ }, - { 969, "omega",/* greek small letter omega, U+03C9 ISOgrk3 */ }, - { 977, "thetasym",/* greek small letter theta symbol, U+03D1 NEW */ }, - { 978, "upsih",/* greek upsilon with hook symbol, U+03D2 NEW */ }, - { 982, "piv", /* greek pi symbol, U+03D6 ISOgrk3 */ }, - - { 8226, "bull", /* bullet = black small circle, U+2022 ISOpub */ }, - { 8230, "hellip",/* horizontal ellipsis = three dot leader, U+2026 ISOpub */ }, - { 8242, "prime",/* prime = minutes = feet, U+2032 ISOtech */ }, - { 8243, "Prime",/* double prime = seconds = inches, U+2033 ISOtech */ }, - { 8254, "oline",/* overline = spacing overscore, U+203E NEW */ }, - { 8260, "frasl",/* 
fraction slash, U+2044 NEW */ }, - - { 8472, "weierp",/* script capital P = power set = Weierstrass p, U+2118 ISOamso */ }, - { 8465, "image",/* blackletter capital I = imaginary part, U+2111 ISOamso */ }, - { 8476, "real", /* blackletter capital R = real part symbol, U+211C ISOamso */ }, - { 8482, "trade",/* trade mark sign, U+2122 ISOnum */ }, - { 8501, "alefsym",/* alef symbol = first transfinite cardinal, U+2135 NEW */ }, - { 8592, "larr", /* leftwards arrow, U+2190 ISOnum */ }, - { 8593, "uarr", /* upwards arrow, U+2191 ISOnum */ }, - { 8594, "rarr", /* rightwards arrow, U+2192 ISOnum */ }, - { 8595, "darr", /* downwards arrow, U+2193 ISOnum */ }, - { 8596, "harr", /* left right arrow, U+2194 ISOamsa */ }, - { 8629, "crarr",/* downwards arrow with corner leftwards = carriage return, U+21B5 NEW */ }, - { 8656, "lArr", /* leftwards double arrow, U+21D0 ISOtech */ }, - { 8657, "uArr", /* upwards double arrow, U+21D1 ISOamsa */ }, - { 8658, "rArr", /* rightwards double arrow, U+21D2 ISOtech */ }, - { 8659, "dArr", /* downwards double arrow, U+21D3 ISOamsa */ }, - { 8660, "hArr", /* left right double arrow, U+21D4 ISOamsa */ }, - - - { 8704, "forall",/* for all, U+2200 ISOtech */ }, - { 8706, "part", /* partial differential, U+2202 ISOtech */ }, - { 8707, "exist",/* there exists, U+2203 ISOtech */ }, - { 8709, "empty",/* empty set = null set = diameter, U+2205 ISOamso */ }, - { 8711, "nabla",/* nabla = backward difference, U+2207 ISOtech */ }, - { 8712, "isin", /* element of, U+2208 ISOtech */ }, - { 8713, "notin",/* not an element of, U+2209 ISOtech */ }, - { 8715, "ni", /* contains as member, U+220B ISOtech */ }, - { 8719, "prod", /* n-ary product = product sign, U+220F ISOamsb */ }, - { 8721, "sum", /* n-ary sumation, U+2211 ISOamsb */ }, - { 8722, "minus",/* minus sign, U+2212 ISOtech */ }, - { 8727, "lowast",/* asterisk operator, U+2217 ISOtech */ }, - { 8730, "radic",/* square root = radical sign, U+221A ISOtech */ }, - { 8733, "prop", /* proportional to, 
U+221D ISOtech */ }, - { 8734, "infin",/* infinity, U+221E ISOtech */ }, - { 8736, "ang", /* angle, U+2220 ISOamso */ }, - { 8743, "and", /* logical and = wedge, U+2227 ISOtech */ }, - { 8744, "or", /* logical or = vee, U+2228 ISOtech */ }, - { 8745, "cap", /* intersection = cap, U+2229 ISOtech */ }, - { 8746, "cup", /* union = cup, U+222A ISOtech */ }, - { 8747, "int", /* integral, U+222B ISOtech */ }, - { 8756, "there4",/* therefore, U+2234 ISOtech */ }, - { 8764, "sim", /* tilde operator = varies with = similar to, U+223C ISOtech */ }, - { 8773, "cong", /* approximately equal to, U+2245 ISOtech */ }, - { 8776, "asymp",/* almost equal to = asymptotic to, U+2248 ISOamsr */ }, - { 8800, "ne", /* not equal to, U+2260 ISOtech */ }, - { 8801, "equiv",/* identical to, U+2261 ISOtech */ }, - { 8804, "le", /* less-than or equal to, U+2264 ISOtech */ }, - { 8805, "ge", /* greater-than or equal to, U+2265 ISOtech */ }, - { 8834, "sub", /* subset of, U+2282 ISOtech */ }, - { 8835, "sup", /* superset of, U+2283 ISOtech */ }, - { 8836, "nsub", /* not a subset of, U+2284 ISOamsn */ }, - { 8838, "sube", /* subset of or equal to, U+2286 ISOtech */ }, - { 8839, "supe", /* superset of or equal to, U+2287 ISOtech */ }, - { 8853, "oplus",/* circled plus = direct sum, U+2295 ISOamsb */ }, - { 8855, "otimes",/* circled times = vector product, U+2297 ISOamsb */ }, - { 8869, "perp", /* up tack = orthogonal to = perpendicular, U+22A5 ISOtech */ }, - { 8901, "sdot", /* dot operator, U+22C5 ISOamsb */ }, - { 8968, "lceil",/* left ceiling = apl upstile, U+2308 ISOamsc */ }, - { 8969, "rceil",/* right ceiling, U+2309 ISOamsc */ }, - { 8970, "lfloor",/* left floor = apl downstile, U+230A ISOamsc */ }, - { 8971, "rfloor",/* right floor, U+230B ISOamsc */ }, - { 9001, "lang", /* left-pointing angle bracket = bra, U+2329 ISOtech */ }, - { 9002, "rang", /* right-pointing angle bracket = ket, U+232A ISOtech */ }, - { 9674, "loz", /* lozenge, U+25CA ISOpub */ }, - - { 9824, "spades",/* black spade 
suit, U+2660 ISOpub */ }, - { 9827, "clubs",/* black club suit = shamrock, U+2663 ISOpub */ }, - { 9829, "hearts",/* black heart suit = valentine, U+2665 ISOpub */ }, - { 9830, "diams",/* black diamond suit, U+2666 ISOpub */ }, - - { 338, "OElig",/* latin capital ligature OE, U+0152 ISOlat2 */ }, - { 339, "oelig",/* latin small ligature oe, U+0153 ISOlat2 */ }, - { 352, "Scaron",/* latin capital letter S with caron, U+0160 ISOlat2 */ }, - { 353, "scaron",/* latin small letter s with caron, U+0161 ISOlat2 */ }, - { 376, "Yuml", /* latin capital letter Y with diaeresis, U+0178 ISOlat2 */ }, - { 710, "circ", /* modifier letter circumflex accent, U+02C6 ISOpub */ }, - { 732, "tilde",/* small tilde, U+02DC ISOdia */ }, - - { 8194, "ensp", /* en space, U+2002 ISOpub */ }, - { 8195, "emsp", /* em space, U+2003 ISOpub */ }, - { 8201, "thinsp",/* thin space, U+2009 ISOpub */ }, - { 8204, "zwnj", /* zero width non-joiner, U+200C NEW RFC 2070 */ }, - { 8205, "zwj", /* zero width joiner, U+200D NEW RFC 2070 */ }, - { 8206, "lrm", /* left-to-right mark, U+200E NEW RFC 2070 */ }, - { 8207, "rlm", /* right-to-left mark, U+200F NEW RFC 2070 */ }, - { 8211, "ndash",/* en dash, U+2013 ISOpub */ }, - { 8212, "mdash",/* em dash, U+2014 ISOpub */ }, - { 8216, "lsquo",/* left single quotation mark, U+2018 ISOnum */ }, - { 8217, "rsquo",/* right single quotation mark, U+2019 ISOnum */ }, - { 8218, "sbquo",/* single low-9 quotation mark, U+201A NEW */ }, - { 8220, "ldquo",/* left double quotation mark, U+201C ISOnum */ }, - { 8221, "rdquo",/* right double quotation mark, U+201D ISOnum */ }, - { 8222, "bdquo",/* double low-9 quotation mark, U+201E NEW */ }, - { 8224, "dagger",/* dagger, U+2020 ISOpub */ }, - { 8225, "Dagger",/* double dagger, U+2021 ISOpub */ }, - { 8240, "permil",/* per mille sign, U+2030 ISOtech */ }, - { 8249, "lsaquo",/* single left-pointing angle quotation mark, U+2039 ISO proposed */ }, - { 8250, "rsaquo",/* single right-pointing angle quotation mark, U+203A ISO 
proposed */ }, - { 8364, "euro", /* euro sign, U+20AC NEW */ } -}; - -static GHashTable *entities; - -/* this cannot be called in a thread context */ -static void tokenise_setup(void) -{ - int i; - - if (entities == NULL) { - entities = g_hash_table_new(g_str_hash, g_str_equal); - for (i=0;i<sizeof(entity_map)/sizeof(entity_map[0]);i++) { - g_hash_table_insert(entities, (char *)entity_map[i].name, (void *)entity_map[i].val); - } - } -} - -static struct _parser *tokenise_init(void) -{ - struct _parser *p; - - p = g_malloc(sizeof(*p)); - p->state = H_DATA; - - p->attr = 0; - p->attrs = g_ptr_array_new(); - p->values = g_ptr_array_new(); - p->tag = g_string_new(""); - p->ent = g_string_new(""); - p->charset = NULL; - - if (entities == NULL) - tokenise_setup(); - - return p; -} - -static void tokenise_free(struct _parser *p) -{ - int i; - - g_string_free(p->tag, TRUE); - g_string_free(p->ent, TRUE); - g_free(p->charset); - - for (i=0;i<p->attrs->len;i++) - g_string_free(p->attrs->pdata[i], TRUE); - - for (i=0;i<p->values->len;i++) - g_string_free(p->values->pdata[i], TRUE); - - g_free(p); -} - -static int convert_entity(const char *e, char *ent) -{ - unsigned int val; - - if (e[0] == '#') - return g_unichar_to_utf8(atoi(e+1), ent); - - val = (unsigned int)g_hash_table_lookup(entities, e); - if (ent) - return g_unichar_to_utf8(val, ent); - else - return 0; -} - -#if 0 -static void dump_tag(struct _parser *p) -{ - int i; - - printf("got tag: %s\n", p->tag->str); - printf("%d attributes:\n", p->attr); - for (i=0;i<p->attr;i++) { - printf(" %s = '%s'\n", ((GString *)p->attrs->pdata[i])->str, ((GString *)p->values->pdata[i])->str); - } -} -#endif - -static int tokenise_step(struct _parser *p, char **datap, int *lenp) -{ - char *in = p->inptr; - char *inend = p->inend; - char c; - int state = p->state, ret, len; - char *start = p->inptr; - - d(printf("Tokenise step\n")); - - while (in < inend) { - c = *in++; - switch (state) { - case H_DATA: - if (c == '<') { - ret = state; 
- state = H_TAG; - p->attr = 0; - g_string_truncate(p->tag, 0); - d(printf("got data '%.*s'\n", in-start-1, start)); - *datap = start; - *lenp = in-start-1; - goto done; - } else if (c=='&') { - ret = state; - state = H_ENT; - g_string_truncate(p->ent, 0); - g_string_append_c(p->ent, c); - d(printf("got data '%.*s'\n", in-start-1, start)); - *datap = start; - *lenp = in-start-1; - goto done; - } - break; - case H_ENT: - if (c==';') { - len = convert_entity(p->ent->str+1, p->ent_utf8); - if (len == 0) { - /* handle broken entity */ - g_string_append_c(p->ent, c); - ret = state = H_DATA; - *datap = p->ent->str; - *lenp = p->ent->len; - goto done; - } else { - d(printf("got entity: %s = %s\n", p->ent->str, p->ent_utf8)); - ret = state; - state = H_DATA; - *datap = p->ent_utf8; - *lenp = len; - goto done; - } - } else if (isalnum(c) || c=='#') { /* FIXME: right type */ - g_string_append_c(p->ent, c); - } else { - /* handle broken entity */ - g_string_append_c(p->ent, c); - ret = state = H_DATA; - *datap = p->ent->str; - *lenp = p->ent->len; - goto done; - } - break; - case H_TAG: - if (c == '!') { - state = H_COMMENT0; - g_string_append_c(p->tag, c); - } else if (c == '>') { - d(dump_tag(p)); - ret = H_ELEMENT; - state = H_DATA; - goto done; - } else if (c == ' ' || c=='\n' || c=='\t') { - state = H_ATTR0; - } else { - g_string_append_c(p->tag, c); - } - break; - /* check for <!-- */ - case H_COMMENT0: - if (c == '-') { - g_string_append_c(p->tag, c); - if (p->tag->len == 3) { - g_string_truncate(p->tag, 0); - state = H_COMMENT; - } - } else { - /* got something else, probbly dtd entity */ - state = H_DTDENT; - } - break; - case H_DTDENT: - if (c == '>') { - ret = H_DTDENT; - state = H_DATA; - *datap = start; - *lenp = in-start-1; - goto done; - } - break; - case H_COMMENT: - if (c == '>' && p->tag->len == 2) { - ret = H_COMMENT; - state = H_DATA; - *datap = start; - *lenp = in-start-1; - goto done; - } else if (c=='-') { - /* we dont care if we get 'n' --'s before the 
> */ - if (p->tag->len < 2) - g_string_append_c(p->tag, c); - } else { - g_string_truncate(p->tag, 0); - } - break; - case H_ATTR0: /* pre-attribute whitespace */ - if (c == '>') { - d(dump_tag(p)); - ret = H_ELEMENT; - state = H_DATA; - goto done; - } else if (c == ' ' || c=='\n' || c=='\t') { - } else { - if (p->attrs->len <= p->attr) { - g_ptr_array_add(p->attrs, g_string_new("")); - g_ptr_array_add(p->values, g_string_new("")); - } else { - g_string_truncate(p->attrs->pdata[p->attr], 0); - g_string_truncate(p->values->pdata[p->attr], 0); - } - g_string_append_c(p->attrs->pdata[p->attr], c); - state = H_ATTR; - } - break; - case H_ATTR: - if (c == '>') { - d(dump_tag(p)); - ret = H_ELEMENT; - state = H_DATA; - goto done; - } else if (c == '=') { - state = H_VAL0; - } else if (c == ' ' || c=='\n' || c=='\t') { - state = H_ATTR0; - p->attr++; - } else { - g_string_append_c(p->attrs->pdata[p->attr], c); - } - break; - case H_VAL0: - if (c == '>') { - d(printf("value truncated\n")); - d(dump_tag(p)); - ret = H_ELEMENT; - state = H_DATA; - goto done; - } else if (c == '\'' || c == '\"') { - p->quote = c; - state = H_VAL; - } else if (c == ' ' || c=='\n' || c=='\t') { - } else { - g_string_append_c(p->values->pdata[p->attr], c); - p->quote = 0; - state = H_VAL; - } - break; - case H_VAL: - do_val: - if (c == '>') { - d(printf("value truncated\n")); - d(dump_tag(p)); - ret = H_ELEMENT; - state = H_DATA; - goto done; - } else if (p->quote) { - if (c == p->quote) { - state = H_ATTR0; - p->attr++; - } else if (c=='&') { - state = H_VAL_ENT; - g_string_truncate(p->ent, 0); - } else { - g_string_append_c(p->values->pdata[p->attr], c); - } - } else if (c == ' ' || c=='\n' || c=='\t') { - state = H_ATTR0; - p->attr++; - } else if (c=='&') { - state = H_VAL_ENT; - g_string_truncate(p->ent, 0); - } else { - g_string_append_c(p->values->pdata[p->attr], c); - } - break; - case H_VAL_ENT: - if (c==';') { - state = H_VAL; - len = convert_entity(p->ent->str+1, p->ent_utf8); - if 
(len == 0) { - /* fallback; broken entity, just output it and see why we ended */ - g_string_append(p->values->pdata[p->attr], p->ent->str); - g_string_append_c(p->values->pdata[p->attr], ';'); - } else { - d(printf("got entity: %s = %s\n", p->ent->str, p->ent_utf8)); - g_string_append(p->values->pdata[p->attr], p->ent_utf8); - } - } else if (isalnum(c) || c=='#') { /* FIXME: right type */ - g_string_append_c(p->ent, c); - } else { - /* fallback; broken entity, just output it and see why we ended */ - g_string_append(p->values->pdata[p->attr], p->ent->str); - goto do_val; - } - break; - } - } - - if (p->eof) { - /* FIXME: what about other truncated states? */ - switch (state) { - case H_DATA: - case H_COMMENT: - if (in > start) { - ret = state; - *datap = start; - *lenp = in-start-1; - } else { - ret = H_EOF; - state = H_EOF; - } - break; - default: - ret = H_EOF; - state = H_EOF; - } - } else { - /* we only care about remaining data for this buffer, everything else has its own copy */ - switch (state) { - case H_DATA: - case H_COMMENT: - if (in > start) { - ret = state; - *datap = start; - *lenp = in-start-1; - } else { - ret = H_EOD; - } - break; - default: - ret = H_EOD; - } - } - -done: - p->start = start; - p->state = state; - p->inptr = in; - - return ret; -} - -#if 0 -static const char *tokenise_data(struct _parser *p, int *len) -{ - if (len) - *len = p->inptr - p->start - 1; - - return p->start; -} - -static const char *tokenise_tag(struct _parser *p) -{ - return p->tag->str; -} - -static const char *tokenise_left(struct _parser *p, int *len) -{ - if (len) - *len = p->inend - p->inptr; - - return p->inptr; -} -#endif - -static void tokenise_set_data(struct _parser *p, char *start, int length, int last) -{ - p->inptr = p->inbuf = start; - p->inend = start+length; - p->eof = last; -} - - -#if 0 -html_parse_step() -{ - do { - char *data; - const char *tag; - int len; - - state = tokenise_step(p, &data, &len); - - switch(state) { - case H_DATA: - printf("Data: 
%.*s\n", len, data); - break; - case H_ENT: - printf("Entity: %.*s\n", len, data); - break; - case H_ELEMENT: - tag = tokenise_tag(p); - if (tag[0] == '/') { - /* go up the stack, looking at what other elements need to be closed first */ - } else { - /* go up the stack, looking at what other elements need to be closed first */ - } - printf("Element: %s\n", tokenise_tag(p)); - break; - } - } while (state != H_EOF && state != H_EOD); -} -#endif - -#if 0 -int main(int argc, char **argv) -{ - struct _parser *p; - char buf[1024]; - int len; - int fd; - char *name; - - p = tokenise_init(NULL); - - if (argc == 2) - name = argv[1]; - else - name = "/home/notzed/public_html/wwwdocs/htdocs/search.html"; - - fd = open(name, O_RDONLY); - while ((len = read(fd, buf, 1024)) > 0) { - int state; - - tokenise_set_data(p, buf, len, len < 1024); - do { - char *data; - int len; - - state = tokenise_step(p, &data, &len); - - switch(state) { - case H_DATA: - /*printf("Data: %.*s\n", len, data);*/ - printf("%.*s", len, data); - break; - case H_COMMENT: - /*printf("Comment: %.*s\n", len, data);*/ - break; - case H_ENT: - printf("%.*s", len, data); - /*printf("Entity: %.*s\n", len, data);*/ - break; - case H_ELEMENT: - /*printf("Element: %s\n", tokenise_tag(p));*/ - break; - } - } while (state != H_EOF && state != H_EOD); - } - close (fd); -} - -#endif diff --git a/camel/camel-mime-part-utils.c b/camel/camel-mime-part-utils.c index 3bec8bfb3c..4b7707a195 100644 --- a/camel/camel-mime-part-utils.c +++ b/camel/camel-mime-part-utils.c @@ -41,61 +41,84 @@ #include "camel-mime-filter-basic.h" #include "camel-mime-filter-charset.h" #include "camel-mime-filter-crlf.h" +#include "camel-html-parser.h" #define d(x) /*(printf("%s(%d): ", __FILE__, __LINE__),(x))*/ +/* example: <META http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"> */ + static char * -extract_metatag_charset (GByteArray *buffer) +check_html_charset(CamelMimeParser *mp, CamelMimeFilterBasicType enctype) { - /* 
example: <META http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"> */ - const char *slashhead, *data; + const char *buf; + off_t offset; + int length; + CamelHTMLParser *hp; char *charset = NULL; - - data = buffer->data; - - slashhead = strstrcase (data, "</head"); - if (!slashhead) - slashhead = data + buffer->len; - - /* Yea, this is ugly */ - while (data < slashhead) { - struct _header_param *params; - const char *meta, *metaend; + camel_html_parser_t state; + struct _header_content_type *ct; + CamelMimeFilterBasic *fdec = NULL; + + /* if we can't find the charset within the first 2k, we ain't gonna find it */ + offset = camel_mime_parser_tell(mp); + length = camel_mime_parser_read(mp, &buf, 2048); + + d(printf("Checking html for meta content-type: '%.*s'", len, buf)); + + if (length == 0) { + camel_mime_parser_seek(mp, offset, SEEK_SET); + return NULL; + } + + /* if we need to first base64/qp decode, do this here, sigh */ + hp = camel_html_parser_new(); + if (enctype != 0) { + int dummy, len; + char *buffer; + + fdec = camel_mime_filter_basic_new_type(enctype); + camel_mime_filter_filter((CamelMimeFilter *)fdec, (char *)buf, length, 0, &buffer, &len, &dummy); + camel_html_parser_set_data(hp, buffer, len, TRUE); + } else { + camel_html_parser_set_data(hp, buf, length, TRUE); + } + + do { + const char *data; + int len; const char *val; + + state = camel_html_parser_step(hp, &data, &len); + + /* example: <META http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"> */ - meta = strstrcase (data, "<meta"); - if (!meta) - break; - - metaend = strchr (meta, '>'); - if (!metaend) - metaend = slashhead; - else - metaend++; - - params = html_meta_param_list_decode (meta, metaend - meta); - if (params) { - val = header_param (params, "http-equiv"); - if (val && !g_strcasecmp (val, "Content-Type")) { - struct _header_content_type *content_type; - - val = header_param (params, "content"); - content_type = header_content_type_decode (val); - charset 
= g_strdup (header_content_type_param (content_type, "charset")); - - header_content_type_unref (content_type); + switch(state) { + case CAMEL_HTML_PARSER_ELEMENT: + val = camel_html_parser_tag(hp); + d(printf("Got tag: %s\n", tag)); + if (strcasecmp(val, "meta") == 0 + && (val = camel_html_parser_attr(hp, "http-equiv")) + && strcasecmp(val, "content-type") == 0 + && (val = camel_html_parser_attr(hp, "content")) + && (ct = header_content_type_decode(val))) { + charset = (char *)header_content_type_param(ct, "charset"); + if (charset) + charset = g_strdup(charset); + header_content_type_unref(ct); } - - header_param_list_free (params); - - /* break as soon as we find a charset */ - if (charset) - break; + break; + default: + /* ignore everything else */ + break; } - - data = metaend; - } - + } while (charset == NULL && state != CAMEL_HTML_PARSER_EOF); + + camel_object_unref((CamelObject *)hp); + if (fdec) + camel_object_unref((CamelObject *)fdec); + + camel_mime_parser_seek(mp, offset, SEEK_SET); + return charset; } @@ -111,6 +134,7 @@ simple_data_wrapper_construct_from_parser (CamelDataWrapper *dw, CamelMimeParser GByteArray *buffer; off_t start = 0, end; char *encoding, *buf; + CamelMimeFilterBasicType enctype = 0; d(printf("constructing data-wrapper\n")); @@ -130,16 +154,19 @@ simple_data_wrapper_construct_from_parser (CamelDataWrapper *dw, CamelMimeParser /* first, work out conversion, if any, required, we dont care about what we dont know about */ encoding = header_content_encoding_decode (camel_mime_parser_header (mp, "content-transfer-encoding", NULL)); if (encoding) { - if (!g_strcasecmp (encoding, "base64")) { + if (!strcasecmp (encoding, "base64")) { d(printf("Adding base64 decoder ...\n")); - fdec = (CamelMimeFilter *)camel_mime_filter_basic_new_type (CAMEL_MIME_FILTER_BASIC_BASE64_DEC); - decid = camel_mime_parser_filter_add (mp, fdec); + enctype = CAMEL_MIME_FILTER_BASIC_BASE64_DEC; } else if (!strcasecmp(encoding, "quoted-printable")) { 
d(printf("Adding quoted-printable decoder ...\n")); - fdec = (CamelMimeFilter *)camel_mime_filter_basic_new_type (CAMEL_MIME_FILTER_BASIC_QP_DEC); - decid = camel_mime_parser_filter_add (mp, fdec); + enctype = CAMEL_MIME_FILTER_BASIC_QP_DEC; } g_free (encoding); + + if (enctype != 0) { + fdec = (CamelMimeFilter *)camel_mime_filter_basic_new_type(enctype); + decid = camel_mime_parser_filter_add (mp, fdec); + } } /* If we're doing text, we also need to do CRLF->LF and may have to convert it to UTF8 as well. */ @@ -156,29 +183,11 @@ simple_data_wrapper_construct_from_parser (CamelDataWrapper *dw, CamelMimeParser } /* Possible Lame Mailer Alert... check the META tags for a charset */ - if (!charset && header_content_type_is (ct, "text", "html")) { - GByteArray *bytes; - const char *buf; - off_t offset; - int len; - - offset = camel_mime_parser_tell (mp); - /* if we can't find the charset within the first 2k, we ain't gonna find it */ - len = camel_mime_parser_read (mp, &buf, 2048); - camel_mime_parser_seek (mp, offset, SEEK_SET); - - /* we only do this because we need it to be null terminated */ - bytes = g_byte_array_new (); - g_byte_array_append (bytes, buf, len); - g_byte_array_append (bytes, "", 1); - - acharset = extract_metatag_charset (bytes); - charset = acharset; - g_byte_array_free (bytes, TRUE); - } + if (!charset && header_content_type_is (ct, "text", "html")) + charset = acharset = check_html_charset(mp, enctype); /* if the charset is not us-ascii or utf-8, then we need to convert to utf-8 */ - if (charset && !(g_strcasecmp (charset, "us-ascii") == 0 || g_strcasecmp (charset, "utf-8") == 0)) { + if (charset && !(strcasecmp(charset, "us-ascii") == 0 || strcasecmp(charset, "utf-8") == 0)) { d(printf("Adding conversion filter from %s to UTF-8\n", charset)); fch = (CamelMimeFilter *)camel_mime_filter_charset_new_convert (charset, "UTF-8"); if (fch) { diff --git a/camel/camel-mime-utils.c b/camel/camel-mime-utils.c index eb228cd748..f051c0d596 100644 --- 
a/camel/camel-mime-utils.c +++ b/camel/camel-mime-utils.c @@ -903,7 +903,7 @@ rfc2047_decode_word(const char *in, int len) { const char *inptr = in+2; const char *inend = in+len-2; - char *inbuf; + const char *inbuf; char *encname; int tmplen; int ret; @@ -1150,7 +1150,7 @@ rfc2047_encode_word(GString *outstring, const char *in, int len, const char *typ iconv_t ic = (iconv_t *)-1; char *buffer, *out, *ascii; size_t inlen, outlen, enclen, bufflen; - char *inptr, *p; + const char *inptr, *p; int first = 1; d(printf("Converting [%d] '%.*s' to %s\n", len, len, in, type)); @@ -1159,7 +1159,7 @@ rfc2047_encode_word(GString *outstring, const char *in, int len, const char *typ bufflen = len*6+16; buffer = alloca(bufflen); inlen = len; - inptr = (char *) in; + inptr = in; ascii = alloca(bufflen); @@ -1808,20 +1808,21 @@ rfc2184_decode (const char *in, int len) inptr++; if (inptr < inend) { - char *decword, *inbuf, *outbase, *outbuf; + char *decword, *outbase, *outbuf; + const char *inbuf; int inlen, outlen; iconv_t ic; inbuf = decword = hex_decode (inptr, inend - inptr); inlen = strlen (inbuf); - - outlen = inlen * 6 + 16; - outbuf = outbase = g_malloc (outlen); - + ic = iconv_open ("UTF-8", encoding); if (ic != (iconv_t) -1) { int ret; - + + outlen = inlen * 6 + 16; + outbuf = outbase = g_malloc (outlen); + ret = iconv (ic, &inbuf, &inlen, &outbuf, &outlen); if (ret >= 0) { iconv (ic, NULL, 0, &outbuf, &outlen); @@ -2720,60 +2721,6 @@ header_param_list_decode(const char *in) return header_decode_param_list(&in); } -struct _header_param * -html_meta_param_list_decode (const char *in, int inlen) -{ - /* example: <META http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"> */ - struct _header_param *params = NULL, *last = NULL; - const char *inptr, *inend; - - if (in == NULL) - return NULL; - - inptr = in; - inend = inptr + inlen; - - if (*inptr != '<') - return NULL; - - if (!g_strncasecmp (inptr, "<meta", 5)) - inptr += 5; - else - return NULL; - - 
header_decode_lwsp (&inptr); - - while (inptr < inend && *inptr != '>') { - char *name = NULL, *value = NULL; - struct _header_param *param; - - name = decode_token (&inptr); - header_decode_lwsp (&inptr); - if (*inptr != '=') { - g_free (name); - break; - } - - inptr++; - value = header_decode_value (&inptr); - header_decode_lwsp (&inptr); - - param = g_malloc (sizeof (struct _header_param)); - param->next = NULL; - param->name = name; - param->value = value; - - if (last) { - last->next = param; - last = param; - } else { - last = params = param; - } - } - - return params; -} - /* FIXME: I wrote this in a quick & dirty fasion - it may not be 100% correct */ static char * header_encode_param (const unsigned char *in, gboolean *encoded) diff --git a/camel/camel-mime-utils.h b/camel/camel-mime-utils.h index d057b91936..c32485d000 100644 --- a/camel/camel-mime-utils.h +++ b/camel/camel-mime-utils.h @@ -109,9 +109,6 @@ void header_param_list_format_append(GString *out, struct _header_param *p); char *header_param_list_format(struct _header_param *p); void header_param_list_free(struct _header_param *p); -/* for decoding META tags in text/html stuff */ -struct _header_param *html_meta_param_list_decode (const char *in, int inlen); - /* Content-Type header */ struct _header_content_type *header_content_type_new(const char *type, const char *subtype); struct _header_content_type *header_content_type_decode(const char *in); |