aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJeffrey Stedfast <fejj@ximian.com>2004-02-04 02:52:53 +0800
committerJeffrey Stedfast <fejj@src.gnome.org>2004-02-04 02:52:53 +0800
commit3bed590653f5e6d72ffecb606f2569c1d1057326 (patch)
treec9d614774900392368137e9903ad86b88594c356
parent229c627ee11c7b5f4b7df9355b42d3bf0d35fbaa (diff)
downloadgsoc2013-evolution-3bed590653f5e6d72ffecb606f2569c1d1057326.tar.gz
gsoc2013-evolution-3bed590653f5e6d72ffecb606f2569c1d1057326.tar.zst
gsoc2013-evolution-3bed590653f5e6d72ffecb606f2569c1d1057326.zip
New test suite for url scanning.
2004-02-03 Jeffrey Stedfast <fejj@ximian.com> * tests/misc/url-scan.c: New test suite for url scanning. * camel-url-scanner.c: Added single/double quotes to url_braces[] in case the user is quoting the url. (camel_url_web_end): Add "-;:" to list of punctuation to strip off the end of urls. Also fixed to handle user@domain's. (camel_url_addrspec_start): Strip open brace characters from the beginning of the addr. (camel_url_web_start): Make sure "www" wasn't part of something not a url (like "Ewww.Gross") by checking that pos[-1] is either an open brace or whitespace. (camel_url_addrspec_end): Don't allow toplevel domain addr-specs (if we encounter something that looks like it is a toplevel domain addr, it is more likely to be bogus than correct). svn path=/trunk/; revision=24592
-rw-r--r--camel/ChangeLog17
-rw-r--r--camel/camel-url-scanner.c128
-rw-r--r--camel/tests/misc/Makefile.am3
-rw-r--r--camel/tests/misc/url-scan.c132
4 files changed, 240 insertions, 40 deletions
diff --git a/camel/ChangeLog b/camel/ChangeLog
index b222c86fa0..0a90be539a 100644
--- a/camel/ChangeLog
+++ b/camel/ChangeLog
@@ -1,3 +1,20 @@
+2004-02-03 Jeffrey Stedfast <fejj@ximian.com>
+
+ * tests/misc/url-scan.c: New test suite for url scanning.
+
+ * camel-url-scanner.c: Added single/double quotes to url_braces[]
+ in case the user is quoting the url.
+ (camel_url_web_end): Add "-;:" to list of punctuation to strip off
+ the end of urls. Also fixed to handle user@domain's
+ (camel_url_addrspec_start): Strip open brace characters from the
+ beginning of the addr.
+ (camel_url_web_start): Make sure "www" wasn't part of something
+ not a url (like "Ewww.Gross") by checking that pos[-1] is either
+ an open brace or whitespace.
+ (camel_url_addrspec_end): Don't allow toplevel domain addr-specs
+ (if we encounter something that looks like it is a toplevel domain
+ addr, it is more likely to be bogus than correct).
+
2004-02-02 Jeffrey Stedfast <fejj@ximian.com>
Fixes for bug #53091.
diff --git a/camel/camel-url-scanner.c b/camel/camel-url-scanner.c
index 8a43b05aee..3d7c0b6053 100644
--- a/camel/camel-url-scanner.c
+++ b/camel/camel-url-scanner.c
@@ -139,6 +139,46 @@ enum {
#define is_urlsafe(x) ((url_scanner_table[(unsigned char)(x)] & (IS_ALPHA|IS_DIGIT|IS_URLSAFE)) != 0)
+static struct {
+ char open;
+ char close;
+} url_braces[] = {
+ { '(', ')' },
+ { '{', '}' },
+ { '[', ']' },
+ { '<', '>' },
+ { '|', '|' },
+};
+
+static gboolean
+is_open_brace (char c)
+{
+ int i;
+
+ for (i = 0; i < G_N_ELEMENTS (url_braces); i++) {
+ if (c == url_braces[i].open)
+ return TRUE;
+ }
+
+ return FALSE;
+}
+
+static char
+url_stop_at_brace (const char *in, size_t so)
+{
+ int i;
+
+ if (so > 0) {
+ for (i = 0; i < G_N_ELEMENTS (url_braces); i++) {
+ if (in[so - 1] == url_braces[i].open)
+ return url_braces[i].close;
+ }
+ }
+
+ return '\0';
+}
+
+
gboolean
camel_url_addrspec_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
{
@@ -161,7 +201,7 @@ camel_url_addrspec_start (const char *in, const char *pos, const char *inend, ur
inptr--;
}
- if (!is_atom (*inptr))
+ if (!is_atom (*inptr) || is_open_brace (*inptr))
inptr++;
if (inptr == pos)
@@ -177,6 +217,7 @@ camel_url_addrspec_end (const char *in, const char *pos, const char *inend, urlm
{
const char *inptr = pos;
int parts = 0, digits;
+ gboolean got_dot = FALSE;
g_assert (*inptr == '@');
@@ -213,12 +254,16 @@ camel_url_addrspec_end (const char *in, const char *pos, const char *inend, urlm
while (inptr < inend && is_domain (*inptr))
inptr++;
- if (inptr < inend && *inptr == '.' && is_domain (inptr[1]))
+ if (inptr < inend && *inptr == '.' && is_domain (inptr[1])) {
+ if (*inptr == '.')
+ got_dot = TRUE;
inptr++;
+ }
}
}
- if (inptr == pos + 1)
+ /* don't allow toplevel domains */
+ if (inptr == pos + 1 || !got_dot)
return FALSE;
match->um_eo = (inptr - in);
@@ -226,31 +271,6 @@ camel_url_addrspec_end (const char *in, const char *pos, const char *inend, urlm
return TRUE;
}
-static struct {
- char open;
- char close;
-} url_braces[] = {
- { '(', ')' },
- { '{', '}' },
- { '[', ']' },
- { '<', '>' },
-};
-
-static char
-url_stop_at_brace (const char *in, size_t so)
-{
- int i;
-
- if (so > 0) {
- for (i = 0; i < 4; i++) {
- if (in[so - 1] == url_braces[i].open)
- return url_braces[i].close;
- }
- }
-
- return '\0';
-}
-
gboolean
camel_url_file_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
{
@@ -286,6 +306,12 @@ camel_url_file_end (const char *in, const char *pos, const char *inend, urlmatch
gboolean
camel_url_web_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
{
+ if (pos > in && !strncmp (pos, "www", 3)) {
+ /* make sure we aren't actually part of another word */
+ if (!is_open_brace (pos[-1]) && !isspace (pos[-1]))
+ return FALSE;
+ }
+
match->um_so = (pos - in);
return TRUE;
@@ -320,13 +346,37 @@ camel_url_web_end (const char *in, const char *pos, const char *inend, urlmatch_
inptr++;
} while (parts < 4);
- } else if (is_domain (*inptr)) {
- while (inptr < inend) {
- if (is_domain (*inptr))
+ } else if (is_atom (*inptr)) {
+ /* might be a domain or user@domain */
+ const char *save = inptr;
+
+ while (inptr < inend) {
+ if (!is_atom (*inptr))
+ break;
+
+ inptr++;
+
+ while (inptr < inend && is_atom (*inptr))
inptr++;
- else
+
+ if (inptr < inend && *inptr == '.' && is_atom (inptr[1]))
+ inptr++;
+ }
+
+ if (*inptr != '@')
+ inptr = save;
+ else
+ inptr++;
+
+ goto domain;
+ } else if (is_domain (*inptr)) {
+ domain:
+ while (inptr < inend) {
+ if (!is_domain (*inptr))
break;
+ inptr++;
+
while (inptr < inend && is_domain (*inptr))
inptr++;
@@ -359,19 +409,19 @@ camel_url_web_end (const char *in, const char *pos, const char *inend, urlmatch_
while (inptr < inend && is_urlsafe (*inptr) && *inptr != close_brace)
inptr++;
- /* urls are extremely unlikely to end with any
- * punctuation, so strip any trailing
- * punctuation off. Also strip off any closing
- * braces. */
- while (inptr > pos && strchr (",.?!)}]", inptr[-1]))
- inptr--;
-
break;
default:
break;
}
}
+ /* urls are extremely unlikely to end with any
+ * punctuation, so strip any trailing
+ * punctuation off. Also strip off any closing
+ * braces or quotes. */
+ while (inptr > pos && strchr (",.:;?!-|)}]'\"", inptr[-1]))
+ inptr--;
+
match->um_eo = (inptr - in);
return TRUE;
diff --git a/camel/tests/misc/Makefile.am b/camel/tests/misc/Makefile.am
index d3ed29f2e6..cc119e9390 100644
--- a/camel/tests/misc/Makefile.am
+++ b/camel/tests/misc/Makefile.am
@@ -18,10 +18,11 @@ LDADD = \
check_PROGRAMS = \
url \
+ url-scan \
utf7 \
split
-TESTS = url utf7 split
+TESTS = url utf7 split url-scan
diff --git a/camel/tests/misc/url-scan.c b/camel/tests/misc/url-scan.c
new file mode 100644
index 0000000000..a7bbf51cf5
--- /dev/null
+++ b/camel/tests/misc/url-scan.c
@@ -0,0 +1,132 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ * Authors: Jeffrey Stedfast <fejj@ximian.com>
+ *
+ * Copyright 2004 Ximian, Inc. (www.ximian.com)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <camel/camel-mime-filter-tohtml.h>
+
+#include "camel-test.h"
+
+struct {
+ char *text, *url;
+} url_tests[] = {
+ { "bob@foo.com", "mailto:bob@foo.com" },
+ { "Ends with bob@foo.com", "mailto:bob@foo.com" },
+ { "bob@foo.com at start", "mailto:bob@foo.com" },
+ { "bob@foo.com.", "mailto:bob@foo.com" },
+ { "\"bob@foo.com\"", "mailto:bob@foo.com" },
+ { "<bob@foo.com>", "mailto:bob@foo.com" },
+ { "(bob@foo.com)", "mailto:bob@foo.com" },
+ { "bob@foo.com, 555-9999", "mailto:bob@foo.com" },
+ { "|bob@foo.com|555-9999|", "mailto:bob@foo.com" },
+ { "bob@ no match bob@", NULL },
+ { "@foo.com no match @foo.com", NULL },
+ { "\"bob\"@foo.com", NULL },
+ { "M@ke money fast!", NULL },
+ { "ASCII art @_@ @>->-", NULL },
+
+ { "http://www.foo.com", "http://www.foo.com" },
+ { "Ends with http://www.foo.com", "http://www.foo.com" },
+ { "http://www.foo.com at start", "http://www.foo.com" },
+ { "http://www.foo.com.", "http://www.foo.com" },
+ { "http://www.foo.com/.", "http://www.foo.com/" },
+ { "<http://www.foo.com>", "http://www.foo.com" },
+ { "(http://www.foo.com)", "http://www.foo.com" },
+ { "http://www.foo.com, 555-9999", "http://www.foo.com" },
+ { "|http://www.foo.com|555-9999|", "http://www.foo.com" },
+ { "foo http://www.foo.com/ bar", "http://www.foo.com/" },
+ { "foo http://www.foo.com/index.html bar", "http://www.foo.com/index.html" },
+ { "foo http://www.foo.com/q?99 bar", "http://www.foo.com/q?99" },
+ { "foo http://www.foo.com/;foo=bar&baz=quux bar", "http://www.foo.com/;foo=bar&baz=quux" },
+ { "foo http://www.foo.com/index.html#anchor bar", "http://www.foo.com/index.html#anchor" },
+ { "http://www.foo.com/index.html; foo", "http://www.foo.com/index.html" },
+ { "http://www.foo.com/index.html: foo", "http://www.foo.com/index.html" },
+ { "http://www.foo.com/index.html-- foo", "http://www.foo.com/index.html" },
+ { "http://www.foo.com/index.html?", "http://www.foo.com/index.html" },
+ { "http://www.foo.com/index.html!", "http://www.foo.com/index.html" },
+ { "\"http://www.foo.com/index.html\"", "http://www.foo.com/index.html" },
+ { "'http://www.foo.com/index.html'", "http://www.foo.com/index.html" },
+ { "http://bob@www.foo.com/bar/baz/", "http://bob@www.foo.com/bar/baz/" },
+ { "http no match http", NULL },
+ { "http: no match http:", NULL },
+ { "http:// no match http://", NULL },
+ { "unrecognized://bob@foo.com/path", "mailto:bob@foo.com" },
+
+ { "src/www.c", NULL },
+ { "Ewwwwww.Gross.", NULL },
+
+};
+
+static int num_url_tests = G_N_ELEMENTS (url_tests);
+
+int main (int argc, char **argv)
+{
+ char *html, *url, *p;
+ int i, errors = 0;
+ guint32 flags;
+
+ camel_test_init (argc, argv);
+
+ camel_test_start ("URL scanning");
+
+ flags = CAMEL_MIME_FILTER_TOHTML_CONVERT_URLS | CAMEL_MIME_FILTER_TOHTML_CONVERT_ADDRESSES;
+ for (i = 0; i < num_url_tests; i++) {
+ camel_test_push ("'%s' => '%s'", url_tests[i].text, url_tests[i].url ? url_tests[i].url : "None");
+
+ html = camel_text_to_html (url_tests[i].text, flags, 0);
+
+ url = strstr (html, "href=\"");
+ if (url) {
+ url += 6;
+ p = strchr (url, '"');
+ if (p)
+ *p = '\0';
+
+ while ((p = strstr (url, "&amp;")))
+ memmove (p + 1, p + 5, strlen (p + 5) + 1);
+ }
+
+ if ((url && (!url_tests[i].url || strcmp (url, url_tests[i].url) != 0)) ||
+ (!url && url_tests[i].url)) {
+ printf ("FAILED on \"%s\" -> %s\n (got %s)\n\n",
+ url_tests[i].text,
+ url_tests[i].url ? url_tests[i].url : "(nothing)",
+ url ? url : "(nothing)");
+ errors++;
+ }
+
+ g_free (html);
+ }
+
+ printf ("\n%d errors\n", errors);
+
+ camel_test_end ();
+
+ return errors;
+}