| author | Not Zed <NotZed@Ximian.com> | 2002-05-21 10:54:05 +0800 |
|---|---|---|
| committer | Michael Zucci <zucchi@src.gnome.org> | 2002-05-21 10:54:05 +0800 |
| commit | e32a24e30ba3558de1ccf54b2f2aa0c348f07b65 (patch) | |
| tree | fbd985d74c9d20593d09e8be7cbf1e53ab87d985 /mail/e-searching-tokenizer.c | |
| parent | 8435ad3d2db7d4cb0cbed199cd58ef0a5e0ae366 (diff) | |
New functions to add additional search strings one at a time.
2002-05-18 Not Zed <NotZed@Ximian.com>
* e-searching-tokenizer.c
(e_searching_tokenizer_add_primary_search_string):
(e_searching_tokenizer_add_secondary_search_string): New functions
to add additional search strings one at a time. Maybe it should
just split the word itself?
(all): Basically, entirely rewritten. Now implements the
Aho-Corasick multiple-pattern search algorithm, handles
multiple search strings, and only ever has to decode each
UTF-8 character once.
svn path=/trunk/; revision=16961
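
As a usage note (not part of the patch): the new add_*_search_string entry points let a caller feed the tokenizer one search string at a time instead of a single literal string. A minimal caller-side sketch, assuming the query has already been split into words — the `set_query` helper and its arguments are invented for illustration; only the `e_searching_tokenizer_*` calls come from the code below:

```c
/* Hypothetical caller-side helper, not part of the patch. */
#include "e-searching-tokenizer.h"

static void
set_query (ESearchingTokenizer *st, char **words, int nwords, gboolean sensitive)
{
	int i;

	/* set_...() clears any previously configured strings and adds the first one */
	e_searching_tokenizer_set_primary_search_string (st, nwords > 0 ? words[0] : NULL);

	/* ...and add_...() appends further strings one at a time */
	for (i = 1; i < nwords; i++)
		e_searching_tokenizer_add_primary_search_string (st, words[i]);

	e_searching_tokenizer_set_primary_case_sensitivity (st, sensitive);
}
```

Each added string becomes its own pattern in the search trie, so every word is highlighted in a single pass over the HTML token stream.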
Diffstat (limited to 'mail/e-searching-tokenizer.c')
-rw-r--r-- | mail/e-searching-tokenizer.c | 1531 |
1 file changed, 871 insertions, 660 deletions
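
For orientation before the patch itself: the rewrite is built around the Aho-Corasick automaton — a trie of all patterns plus failure links that let the scan fall back to the longest still-viable suffix on a mismatch, so all patterns are searched for in one pass. Below is a self-contained toy sketch of that construction in plain C. It works on raw bytes, uses a fixed-size BFS queue, and reports only the longest match ending at each position (mirroring the patch's `final` propagation); the real code additionally decodes UTF-8 once per character, folds case, and threads HTML tokens through the matcher. All names in the sketch are illustrative, not taken from the patch.

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* One state per matched prefix: goto edges, a failure link, and the
 * length of the longest pattern ending in this state (0 = none). */
struct node {
	struct node *go[256];
	struct node *fail;
	int final;
};

static struct node *
node_new (void)
{
	return calloc (1, sizeof (struct node));
}

/* Step 1: merge every pattern into one trie, sharing common prefixes. */
static void
add_pattern (struct node *root, const char *pat)
{
	struct node *q = root;
	const char *p = pat;

	while (*p) {
		unsigned char c = (unsigned char) *p++;
		if (q->go[c] == NULL)
			q->go[c] = node_new ();
		q = q->go[c];
	}
	q->final = strlen (pat);
}

/* Step 2: breadth-first pass filling in failure links, so a mismatch
 * falls back to the longest suffix that is still a live prefix. */
static void
build_failure_links (struct node *root)
{
	struct node **queue = malloc (1024 * sizeof (*queue));	/* plenty for a toy */
	int head = 0, tail = 0, c;

	for (c = 0; c < 256; c++)
		if (root->go[c]) {
			root->go[c]->fail = root;
			queue[tail++] = root->go[c];
		}
	while (head < tail) {
		struct node *q = queue[head++];
		for (c = 0; c < 256; c++) {
			struct node *child = q->go[c], *r = q->fail;
			if (child == NULL)
				continue;
			while (r != root && r->go[c] == NULL)
				r = r->fail;
			child->fail = r->go[c] ? r->go[c] : root;
			if (child->fail->final > child->final)
				child->final = child->fail->final;	/* keep longest, as the patch does */
			queue[tail++] = child;
		}
	}
	free (queue);
}

/* Scan the text once, reporting the longest match ending at each offset. */
static void
search (struct node *root, const char *text)
{
	struct node *q = root;
	int i;

	for (i = 0; text[i]; i++) {
		unsigned char c = (unsigned char) text[i];
		while (q != root && q->go[c] == NULL)
			q = q->fail;
		if (q->go[c])
			q = q->go[c];
		if (q->final)
			printf ("match of length %d ending at offset %d\n", q->final, i + 1);
	}
}

int
main (void)
{
	const char *words[] = { "he", "she", "his", "hers" };
	struct node *root = node_new ();
	int i;

	for (i = 0; i < 4; i++)
		add_pattern (root, words[i]);
	build_failure_links (root);
	search (root, "ushers");	/* reports a 3-char match ("she"), then a 4-char one ("hers") */
	return 0;
}
```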
diff --git a/mail/e-searching-tokenizer.c b/mail/e-searching-tokenizer.c index 602dd3858d..ce0e9d94fa 100644 --- a/mail/e-searching-tokenizer.c +++ b/mail/e-searching-tokenizer.c @@ -3,9 +3,11 @@ /* * e-searching-tokenizer.c * - * Copyright (C) 2001 Ximian, Inc. + * Copyright (C) 2002 Ximian, Inc. * * Developed by Jon Trowbridge <trow@ximian.com> + * Rewritten significantly to handle multiple strings and improve performance + * by Michael Zucchi <notzed@ximian.com> */ /* @@ -30,15 +32,17 @@ #include <gal/unicode/gunicode.h> #include "e-searching-tokenizer.h" +#include "e-util/e-memory.h" +#include "e-util/e-msgport.h" + +#define d(x) + enum { EST_MATCH_SIGNAL, EST_LAST_SIGNAL }; guint e_searching_tokenizer_signals[EST_LAST_SIGNAL] = { 0 }; -#define START_MAGIC "<\n>S<\n>" -#define END_MAGIC "<\n>E<\n>" - static void e_searching_tokenizer_begin (HTMLTokenizer *, gchar *); static void e_searching_tokenizer_end (HTMLTokenizer *); static gchar *e_searching_tokenizer_peek_token (HTMLTokenizer *); @@ -47,34 +51,14 @@ static gboolean e_searching_tokenizer_has_more (HTMLTokenizer *); static HTMLTokenizer *e_searching_tokenizer_clone (HTMLTokenizer *); -static const gchar *ignored_tags[] = { "b", "i", NULL }; -static const gchar *space_tags[] = { "br", NULL }; +/* + static const gchar *space_tags[] = { "br", NULL };*/ GtkObjectClass *parent_class = NULL; /** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/ -typedef enum { - MATCH_FAILED = 0, - MATCH_COMPLETE, - MATCH_START, - MATCH_CONTINUES, - MATCH_END -} MatchInfo; - -typedef struct _SearchInfo SearchInfo; -struct _SearchInfo { - gchar *search; - gchar *current; - - gboolean case_sensitive; - gboolean allow_space_tags_to_match_whitespace; - - gint match_size_incr; - gchar *match_color; - gboolean match_bold; -}; - +/* ??? typedef struct _SharedState SharedState; struct _SharedState { gint refs; @@ -83,17 +67,12 @@ struct _SharedState { gboolean case_sensitive_primary; gboolean case_sensitive_secondary; }; +*/ -struct _ESearchingTokenizerPrivate { - gint match_count; - SearchInfo *search; - GList *pending; - GList *trash; - SharedState *shared; -}; +/* ********************************************************************** */ -/** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/ +#if 0 static SharedState * shared_state_new (void) { @@ -123,353 +102,878 @@ shared_state_unref (SharedState *shared) } } } +#endif /** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/ -static SearchInfo * -search_info_new (void) +/* ********************************************************************** */ + +/* Utility functions */ + +/* This is faster and safer than glib2's utf8 abomination, but isn't exported from camel as yet */ +static __inline__ guint32 +camel_utf8_getc(const unsigned char **ptr) { - SearchInfo *si; + register unsigned char *p = (unsigned char *)*ptr; + register unsigned char c, r; + register guint32 v, m; + +again: + r = *p++; +loop: + if (r < 0x80) { + *ptr = p; + v = r; + } else if (r < 0xfe) { /* valid start char? 
*/ + v = r; + m = 0x7f80; /* used to mask out the length bits */ + do { + c = *p++; + if ((c & 0xc0) != 0x80) { + r = c; + goto loop; + } + v = (v<<6) | (c & 0x3f); + r<<=1; + m<<=5; + } while (r & 0x40); + + *ptr = p; - si = g_new0 (SearchInfo, 1); - si->case_sensitive = FALSE; + v &= ~m; + } else { + goto again; + } - si->match_size_incr = 1; - si->match_color = g_strdup ("red"); - si->match_bold = FALSE; + return v; +} + + +/* note: our tags of interest are 7 bit ascii, only, no need to do any fancy utf8 stuff */ +/* tags should be upper case + if this list gets longer than 10 entries, consider binary search */ +static char *ignored_tags[] = { "B", "I", "FONT", "TT", "EM", /* and more? */}; + +static int +ignore_tag(const char *tag) +{ + char *t = alloca(strlen(tag)+1), c, *out; + const char *in; + int i; + + /* we could use a aho-corasick matcher here too ... but we wont */ + + /* normalise tag into 't'. + Note we use the property that the only tags we're interested in + are 7 bit ascii to shortcut and simplify case insensitivity */ + in = tag+2; /* skip: TAG_ESCAPE '<' */ + if (*in == '/') + in++; + out = t; + while ((c = *in++)) { + if (c >= 'A' && c <= 'Z') + *out++ = c; + else if (c >= 'a' && c <= 'z') + *out++ = c & 0xdf; /* convert ASCII to upper case */ + else + /* maybe should check for > or ' ' etc? */ + break; + } + *out = 0; - si->allow_space_tags_to_match_whitespace = TRUE; + for (i=0;i<sizeof(ignored_tags)/sizeof(ignored_tags[0]);i++) { + if (strcmp(t, ignored_tags[i]) == 0) + return 1; + } - return si; + return 0; } -static void -search_info_free (SearchInfo *si) +/* ********************************************************************** */ + +/* Aho-Corasick search tree implmeentation */ + +/* next state if we match a character */ +struct _match { + struct _match *next; + guint32 ch; + struct _state *match; +}; + +/* tree state node */ +struct _state { + struct _match *matches; + unsigned int final; /* max no of chars we just matched */ + struct _state *fail; /* where to try next if we fail */ + struct _state *next; /* next on this level? 
*/ +}; + +/* base tree structure */ +struct _trie { + struct _state root; + int max_depth; + + EMemChunk *state_chunks; + EMemChunk *match_chunks; +}; + +static void +dump_trie(struct _state *s, int d) { - if (si) { - g_free (si->search); - g_free (si->match_color); - g_free (si); + char *p = alloca(d*2+1); + struct _match *m; + + memset(p, ' ', d*2); + p[d*2]=0; + + printf("%s[state] %p: %d fail->%p\n", p, s, s->final, s->fail); + m = s->matches; + while (m) { + printf(" %s'%c' -> %p\n", p, m->ch, m->match); + if (m->match) + dump_trie(m->match, d+1); + m = m->next; } } -static SearchInfo * -search_info_clone (SearchInfo *si) +/* This builds an Aho-Corasick search trie for a set of utf8 words */ +/* See + http://www-sr.informatik.uni-tuebingen.de/~buehler/AC/AC.html + for a neat demo */ + +static __inline__ struct _match * +g(struct _state *q, guint32 c) { - SearchInfo *new_si = NULL; + struct _match *m = q->matches; + + while (m && m->ch != c) + m = m->next; - if (si) { - new_si = search_info_new (); - new_si->search = g_strdup (si->search); - new_si->case_sensitive = si->case_sensitive; + return m; +} + +static struct _trie * +build_trie(int nocase, int len, char **words) +{ + struct _state *q, *qt, *r; + char *word; + struct _match *m, *n; + int i, depth; + guint32 c; + struct _trie *trie; + int state_depth_max, state_depth_size; + struct _state **state_depth; + + trie = g_malloc(sizeof(*trie)); + trie->root.matches = 0; + trie->root.final = 0; + trie->root.fail = 0; + trie->root.next = 0; + + trie->state_chunks = e_memchunk_new(8, sizeof(struct _state)); + trie->match_chunks = e_memchunk_new(8, sizeof(struct _match)); + + /* This will correspond to the length of the longest pattern */ + state_depth_size = 0; + state_depth_max = 64; + state_depth = g_malloc(sizeof(*state_depth[0])*64); + state_depth[0] = NULL; + + /* Step 1: Build trie */ + + /* This just builds a tree that merges all common prefixes into the same branch */ + + for (i=0;i<len;i++) { + word = words[i]; + q = &trie->root; + depth = 0; + while ((c = camel_utf8_getc((const unsigned char **)&word))) { + if (nocase) + c = g_unichar_tolower(c); + m = g(q, c); + if (m == 0) { + m = e_memchunk_alloc(trie->match_chunks); + m->ch = c; + m->next = q->matches; + q->matches = m; + q = m->match = e_memchunk_alloc(trie->state_chunks); + q->matches = 0; + q->fail = &trie->root; + q->final = 0; + if (state_depth_max < depth) { + state_depth_max += 64; + state_depth = g_realloc(state_depth, sizeof(*state_depth[0])*state_depth_max); + } + if (state_depth_size < depth) { + state_depth[depth] = 0; + state_depth_size = depth; + } + q->next = state_depth[depth]; + state_depth[depth] = q; + } else { + q = m->match; + } + depth++; + } + q->final = depth; } - return new_si; + d(printf("Dumping trie:\n")); + d(dump_trie(&trie->root, 0)); + + /* Step 2: Build failure graph */ + + /* This searches for the longest substring which is a prefix of another string and + builds a graph of failure links so you can find multiple substrings concurrently, + using aho-corasick's algorithm */ + + for (i=0;i<state_depth_size;i++) { + q = state_depth[i]; + while (q) { + m = q->matches; + while (m) { + c = m->ch; + qt = m->match; + r = q->fail; + while (r != 0 && (n = g(r, c)) == NULL) + r = r->fail; + if (r != 0) { + qt->fail = n->match; + if (qt->fail->final > qt->final) + qt->final = qt->fail->final; + } else { + if ((n = g(&trie->root, c))) + qt->fail = n->match; + else + qt->fail = &trie->root; + } + m = m->next; + } + q = q->next; + } + } + + d(printf("After 
failure analysis\n")); + d(dump_trie(&trie->root, 0)); + + g_free(state_depth); + + trie->max_depth = state_depth_size; + + return trie; } static void -search_info_set_string (SearchInfo *si, const gchar *str) +free_trie(struct _trie *t) { - g_return_if_fail (si); - g_return_if_fail (str); + e_memchunk_destroy(t->match_chunks); + e_memchunk_destroy(t->state_chunks); - g_free (si->search); - si->search = g_strdup (str); - si->current = NULL; + g_free(t); } +/* ********************************************************************** */ + +/* html token searcher */ + +struct _token { + struct _token *next; + struct _token *prev; + unsigned int offset; + /* we need to copy the token for memory management, so why not copy it whole */ + char tok[1]; +}; + +/* stack of submatches currently being scanned, used for merging */ +struct _submatch { + unsigned int offstart, offend; /* in bytes */ +}; + +/* flags for new func */ +#define SEARCH_CASE (1) +#define SEARCH_BOLD (2) + +struct _searcher { + struct _trie *t; + + char *(*next_token)(); /* callbacks for more tokens */ + void *next_data; + + int words; /* how many words */ + char *tags, *tage; /* the tag we used to highlight */ + + int flags; /* case sensitive or not */ + + struct _state *state; /* state is the current trie state */ + + int matchcount; + + EDList input; /* pending 'input' tokens, processed but might match */ + EDList output; /* output tokens ready for source */ + + struct _token *current; /* for token output memory management */ + + guint32 offset; /* current offset through searchable stream? */ + guint32 offout; /* last output position */ + + unsigned int lastp; /* current position in rotating last buffer */ + guint32 *last; /* buffer that goes back last 'n' positions */ + guint32 last_mask; /* bitmask for efficient rotation calculation */ + + unsigned int submatchp; /* submatch stack */ + struct _submatch *submatches; +}; + static void -search_info_set_case_sensitivity (SearchInfo *si, gboolean flag) +searcher_set_tokenfunc(struct _searcher *s, char *(*next)(), void *data) { - g_return_if_fail (si); + s->next_token = next; + s->next_data = data; +} - si->case_sensitive = flag; +static struct _searcher * +searcher_new(int flags, int argc, char **argv, const char *tags, const char *tage) +{ + int i, m; + struct _searcher *s; + + s = g_malloc(sizeof(*s)); + + s->t = build_trie((flags&SEARCH_CASE) == 0, argc, argv); + s->words = argc; + s->tags = g_strdup(tags); + s->tage = g_strdup(tage); + s->flags = flags; + s->state = &s->t->root; + s->matchcount = 0; + + e_dlist_init(&s->input); + e_dlist_init(&s->output); + s->current = 0; + + s->offset = 0; + s->offout = 0; + + /* rotating queue of previous character positions */ + m = s->t->max_depth+1; + i = 2; + while (i<m) + i<<=2; + s->last = g_malloc(sizeof(s->last[0])*i); + s->last_mask = i-1; + s->lastp = 0; + + /* a stack of possible submatches */ + s->submatchp = 0; + s->submatches = g_malloc(sizeof(s->submatches[0])*argc+1); + + return s; } -#if 0 static void -search_info_set_match_size_increase (SearchInfo *si, gint incr) +searcher_free(struct _searcher *s) { - g_return_if_fail (si); - g_return_if_fail (incr >= 0); + struct _token *t; + + while ((t = (struct _token *)e_dlist_remhead(&s->input))) + g_free(t); + while ((t = (struct _token *)e_dlist_remhead(&s->output))) + g_free(t); + g_free(s->tags); + g_free(s->tage); + g_free(s->last); + g_free(s->submatches); + free_trie(s->t); + g_free(s); +} +static struct _token * +append_token(EDList *list, const char *tok, int len) +{ + 
struct _token *token; + + if (len == -1) + len = strlen(tok); + token = g_malloc(sizeof(*token) + len+1); + token->offset = 0; /* set by caller when required */ + memcpy(token->tok, tok, len); + token->tok[len] = 0; + e_dlist_addtail(list, (EDListNode *)token); - si->match_size_incr = incr; + return token; } -#endif + +#define free_token(x) (g_free(x)) static void -search_info_set_match_color (SearchInfo *si, const gchar *color) +output_token(struct _searcher *s, struct _token *token) { - g_return_if_fail (si); + int offend; + int left, pre; - g_free (si->match_color); - si->match_color = g_strdup (color); + if (token->tok[0] == TAG_ESCAPE) { + if (token->offset >= s->offout) { + d(printf("moving tag token '%s' from input to output\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok)); + e_dlist_addtail(&s->output, (EDListNode *)token); + } else { + d(printf("discarding tag token '%s' from input\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok)); + free_token(token); + } + } else { + offend = token->offset + strlen(token->tok); + left = offend-s->offout; + if (left > 0) { + pre = s->offout - token->offset; + if (pre>0) + memmove(token->tok, token->tok+pre, left+1); + d(printf("adding partial remaining/failed '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok)); + s->offout = offend; + e_dlist_addtail(&s->output, (EDListNode *)token); + } else { + d(printf("discarding whole token '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok)); + free_token(token); + } + } } -static void -search_info_set_match_bold (SearchInfo *si, gboolean flag) +static struct _token * +find_token(struct _searcher *s, int start) { - g_return_if_fail (si); + register struct _token *token; + + /* find token which is start token, from end of list back */ + token = (struct _token *)s->input.tailpred; + while (token->prev) { + if (token->offset <= start) + return token; + token = token->prev; + } - si->match_bold = flag; + return NULL; } static void -search_info_reset (SearchInfo *si) +output_match(struct _searcher *s, unsigned int start, unsigned int end) { - if (si == NULL) + register struct _token *token; + struct _token *starttoken, *endtoken; + char b[8]; + + d(printf("output match: %d-%d at %d\n", start, end, s->offout)); + + starttoken = find_token(s, start); + endtoken = find_token(s, end); + + if (starttoken == NULL || endtoken == NULL) { + printf("Cannot find match history for match %d-%d\n", start, end); return; - si->current = NULL; -} + } -/* ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** */ + d(printf("start in token '%s'\n", starttoken->tok[0]==TAG_ESCAPE?starttoken->tok+1:starttoken->tok)); + d(printf("end in token '%s'\n", endtoken->tok[0]==TAG_ESCAPE?endtoken->tok+1:endtoken->tok)); -static const gchar * -find_whole (SearchInfo *si, const gchar *haystack, const gchar *needle) -{ - const gchar *h, *n; - - g_return_val_if_fail (si, NULL); - g_return_val_if_fail (haystack && needle, NULL); - g_return_val_if_fail (g_utf8_validate (haystack, -1, NULL), NULL); - g_return_val_if_fail (g_utf8_validate (needle, -1, NULL), NULL); - - while (*haystack) { - h = haystack; - n = needle; - while (*h && *n) { - gunichar c1 = g_utf8_get_char (h); - gunichar c2 = g_utf8_get_char (n); - - if (!si->case_sensitive) { - c1 = g_unichar_tolower (c1); - c2 = g_unichar_tolower (c2); - } + /* output pending stuff that didn't match afterall */ + while ((struct _token *)s->input.head != starttoken) { + token = (struct _token *)e_dlist_remhead(&s->input); + d(printf("appending failed 
match '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok)); + output_token(s, token); + } - if (c1 != c2) - break; - - h = g_utf8_next_char (h); - n = g_utf8_next_char (n); + /* output any pre-match text */ + if (s->offout < start) { + token = append_token(&s->output, starttoken->tok + (s->offout-starttoken->offset), start-s->offout); + d(printf("adding pre-match text '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok)); + s->offout = start; + } + + /* output highlight/bold */ + if (s->flags & SEARCH_BOLD) { + sprintf(b, "%c<b>", (char)TAG_ESCAPE); + append_token(&s->output, b, -1); + } + if (s->tags) + append_token(&s->output, s->tags, -1); + + /* output match node(s) */ + if (starttoken != endtoken) { + while ((struct _token *)s->input.head != endtoken) { + token = (struct _token *)e_dlist_remhead(&s->input); + d(printf("appending (partial) match node (head) '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok)); + output_token(s, token); } - if (*n == '\0') - return haystack; - if (*h == '\0') - return NULL; - haystack = g_utf8_next_char (haystack); } - return NULL; + /* any remaining partial content */ + if (s->offout < end) { + token = append_token(&s->output, endtoken->tok+(s->offout-endtoken->offset), end-s->offout); + d(printf("appending (partial) match node (tail) '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok)); + s->offout = end; + } + + /* end highlight */ + if (s->tage) + append_token(&s->output, s->tage, -1); + + /* and close bold if we need to */ + if (s->flags & SEARCH_BOLD) { + sprintf(b, "%c</b>", (char)TAG_ESCAPE); + append_token(&s->output, b, -1); + } } -/* This is a really stupid implementation of this function. */ -static const gchar * -find_head (SearchInfo *si, const gchar *haystack, const gchar *needle) +/* output any sub-pending blocks */ +static void +output_subpending(struct _searcher *s) { - const gchar *h, *n; - - g_return_val_if_fail (si, NULL); - g_return_val_if_fail (haystack && needle, NULL); - g_return_val_if_fail (g_utf8_validate (haystack, -1, NULL), NULL); - g_return_val_if_fail (g_utf8_validate (needle, -1, NULL), NULL); - - while (*haystack) { - h = haystack; - n = needle; - while (*h && *n) { - gunichar c1 = g_utf8_get_char (h); - gunichar c2 = g_utf8_get_char (n); - - if (!si->case_sensitive) { - c1 = g_unichar_tolower (c1); - c2 = g_unichar_tolower (c2); - } + int i; - if (c1 != c2) - break; + for (i=s->submatchp-1;i>=0;i--) + output_match(s, s->submatches[i].offstart, s->submatches[i].offend); + s->submatchp = 0; +} - h = g_utf8_next_char (h); - n = g_utf8_next_char (n); +/* returns true if a merge took place */ +static int +merge_subpending(struct _searcher *s, int offstart, int offend) +{ + int i; + + /* merges overlapping or abutting match strings */ + if (s->submatchp && + s->submatches[s->submatchp-1].offend >= offstart) { + + /* go from end, any that match 'invalidate' follow-on ones too */ + for (i=s->submatchp-1;i>=0;i--) { + if (s->submatches[i].offend >= offstart) { + if (offstart < s->submatches[i].offstart) + s->submatches[i].offstart = offstart; + s->submatches[i].offend = offend; + if (s->submatchp > i) + s->submatchp = i+1; + } } - if (*h == '\0') - return haystack; - haystack = g_utf8_next_char (haystack); + return 1; } - return NULL; + return 0; } -static const gchar * -find_partial (SearchInfo *si, const gchar *haystack, const gchar *needle) +static void +push_subpending(struct _searcher *s, int offstart, int offend) { - g_return_val_if_fail (si, NULL); - g_return_val_if_fail (haystack && 
needle, NULL); - g_return_val_if_fail (g_utf8_validate (haystack, -1, NULL), NULL); - g_return_val_if_fail (g_utf8_validate (needle, -1, NULL), NULL); - - while (*needle) { - gunichar c1 = g_utf8_get_char (haystack); - gunichar c2 = g_utf8_get_char (needle); + /* This is really an assertion, we just ignore the last pending match instead of crashing though */ + if (s->submatchp >= s->words) { + printf("ERROR: submatch pending stack overflow\n"); + s->submatchp = s->words-1; + } - if (!si->case_sensitive) { - c1 = g_unichar_tolower (c1); - c2 = g_unichar_tolower (c2); - } + s->submatches[s->submatchp].offstart = offstart; + s->submatches[s->submatchp].offend = offend; + s->submatchp++; +} - if (c1 != c2) - return NULL; +/* move any (partial) tokens from input to output if they are beyond the current output position */ +static void +output_pending(struct _searcher *s) +{ + struct _token *token; - needle = g_utf8_next_char (needle); - haystack = g_utf8_next_char (haystack); - } - return haystack; + while ( (token = (struct _token *)e_dlist_remhead(&s->input)) ) + output_token(s, token); } -static gboolean -tag_match (const gchar *token, const gchar *tag) +/* flushes any nodes we cannot possibly match anymore */ +static void +flush_extra(struct _searcher *s) { - token += 2; /* Skip past TAG_ESCAPE and < */ - if (*token == '/') - ++token; - while (*token && *tag) { - gunichar c1 = g_unichar_tolower (g_utf8_get_char (token)); - gunichar c2 = g_unichar_tolower (g_utf8_get_char (tag)); - if (c1 != c2) - return FALSE; - token = g_utf8_next_char (token); - tag = g_utf8_next_char (tag); + unsigned int start; + int i; + struct _token *starttoken, *token; + + /* find earliest char that can be in contention */ + start = s->offset - s->t->max_depth; + for (i=0;i<s->submatchp;i++) + if (s->submatches[i].offstart < start) + start = s->submatches[i].offstart; + + /* now, flush out any tokens which are before this point */ + starttoken = find_token(s, start); + if (starttoken == NULL) + return; + + while ((struct _token *)s->input.head != starttoken) { + token = (struct _token *)e_dlist_remhead(&s->input); + output_token(s, token); } - return (*tag == '\0' && *token == '>'); } -static MatchInfo -search_info_compare (SearchInfo *si, const gchar *token, gint *start_pos, gint *end_pos) +static char * +searcher_next_token(struct _searcher *s) { - gboolean token_is_tag; - const gchar *s; - gint i; - - g_return_val_if_fail (si != NULL, MATCH_FAILED); - g_return_val_if_fail (token != NULL, MATCH_FAILED); - g_return_val_if_fail (start_pos != NULL, MATCH_FAILED); - g_return_val_if_fail (end_pos != NULL, MATCH_FAILED); + struct _token *token; + char *tok, *stok; + struct _trie *t = s->t; + struct _state *q = s->state; + struct _match *m; + int offstart, offend; + guint32 c; + + while (e_dlist_empty(&s->output)) { + /* get next token */ + tok = s->next_token(s->next_data); + if (tok == NULL) { + output_subpending(s); + output_pending(s); + break; + } - token_is_tag = (*token == TAG_ESCAPE); + /* we dont always have to copy each token, e.g. if we dont match anything */ + token = append_token(&s->input, tok, -1); + token->offset = s->offset; + tok = token->tok; - /* Try to start a new match. */ - if (si->current == NULL) { + d(printf("new token %d '%s'\n", token->offset, token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok)); - /* A match can never start on a token. */ - if (token_is_tag) - return MATCH_FAILED; - - /* Check to see if the search string is entirely embedded within the token. 
*/ - s = find_whole (si, token, si->search); - if (s) { - const gchar *pos = s; - i = g_utf8_strlen (si->search, -1); - while (i > 0) { - pos = g_utf8_next_char (pos); - --i; + /* tag test, reset state on unknown tags */ + if (tok[0] == TAG_ESCAPE) { + if (!ignore_tag(tok)) { + /* force reset */ + output_subpending(s); + output_pending(s); + q = &t->root; } - *start_pos = s - token; - *end_pos = pos - token; - return MATCH_COMPLETE; + continue; } - /* Check to see if the beginning of the search string lies in this token. */ - s = find_head (si, token, si->search); - if (s) { - *start_pos = s - token; - si->current = si->search; - while (*s) { - s = g_utf8_next_char (s); - si->current = g_utf8_next_char (si->current); + /* process whole token */ + stok = tok; + while ((c = camel_utf8_getc((const unsigned char **)&tok))) { + if ((s->flags & SEARCH_CASE) == 0) + c = g_unichar_tolower(c); + while (q && (m = g(q, c)) == NULL) + q = q->fail; + if (q == 0) { + /* mismatch ... reset state */ + output_subpending(s); + q = &t->root; + } else { + /* keep track of previous offsets of utf8 chars, rotating buffer */ + s->last[s->lastp] = s->offset + (tok-stok)-1; + s->lastp = (s->lastp+1)&s->last_mask; + + q = m->match; + /* we have a match of q->final characters for a matching word */ + if (q->final) { + s->matchcount++; + + /* use the last buffer to find the real offset of this char */ + offstart = s->last[(s->lastp - q->final)&s->last_mask]; + offend = s->offset + (tok - stok); + + if (q->matches == NULL) { + if (s->submatchp == 0) { + /* nothing pending, always put something in so we can try merge */ + push_subpending(s, offstart, offend); + } else if (!merge_subpending(s, offstart, offend)) { + /* can't merge, output what we have, and start againt */ + output_subpending(s); + push_subpending(s, offstart, offend); + /*output_match(s, offstart, offend);*/ + } else if (e_dlist_length(&s->input) > 8) { + /* we're continuing to match and merge, but we have a lot of stuff + waiting, so flush it out now since this is a safe point to do it */ + output_subpending(s); + } + } else { + /* merge/add subpending */ + if (!merge_subpending(s, offstart, offend)) + push_subpending(s, offstart, offend); + } + } } - - return MATCH_START; } - - return MATCH_FAILED; + + s->offset += (tok-stok)-1; + + flush_extra(s); } - /* Try to continue a previously-started match. */ - - /* Deal with tags that we encounter mid-match. */ - if (token_is_tag) { + s->state = q; - /* "Ignored tags" will never mess up a match. */ - for (i=0; ignored_tags[i]; ++i) { - if (tag_match (token, ignored_tags[i])) - return MATCH_CONTINUES; - } - - /* "Space tags" only match whitespace in our ongoing match. */ - if (si->allow_space_tags_to_match_whitespace - && g_unichar_isspace (g_utf8_get_char (si->current))) { - for (i=0; space_tags[i]; ++i) { - if (tag_match (token, space_tags[i])) { - si->current = g_utf8_next_char (si->current); - return MATCH_CONTINUES; - } - } - } + if (s->current) + free_token(s->current); - /* All other tags derail our match. 
*/ - return MATCH_FAILED; - } + s->current = token = (struct _token *)e_dlist_remhead(&s->output); - s = find_partial (si, token, si->current); - if (s) { - if (start_pos) - *start_pos = 0; - if (end_pos) - *end_pos = s - token; - return MATCH_END; - } + return token?token->tok:NULL; +} - s = find_partial (si, si->current, token); - if (s) { - si->current = (gchar *) s; - return MATCH_CONTINUES; +static char * +searcher_peek_token(struct _searcher *s) +{ + char *tok; + + /* we just get it and then put it back, it's fast enuf */ + tok = searcher_next_token(s); + if (tok) { + /* need to clear this so we dont free it while its still active */ + e_dlist_addhead(&s->output, (EDListNode *)s->current); + s->current = NULL; } - - return MATCH_FAILED; + + return tok; +} + +static int +searcher_pending(struct _searcher *s) +{ + return !(e_dlist_empty(&s->input) && e_dlist_empty(&s->output)); } +/* ********************************************************************** */ + +struct _search_info { + GPtrArray *strv; + char *colour; + unsigned int size:8; + unsigned int flags:8; +}; + /** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/ +static struct _search_info * +search_info_new(void) +{ + struct _search_info *s; + + s = g_malloc0(sizeof(struct _search_info)); + s->strv = g_ptr_array_new(); + + return s; +} + static void -e_searching_tokenizer_cleanup (ESearchingTokenizer *st) +search_info_set_flags(struct _search_info *si, unsigned int flags, unsigned int mask) { - g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st)); + si->flags = (si->flags & ~mask) | (flags & mask); +} - if (st->priv->trash) { - g_list_foreach (st->priv->trash, (GFunc) g_free, NULL); - g_list_free (st->priv->trash); - st->priv->trash = NULL; - } +static void +search_info_set_colour(struct _search_info *si, const char *colour) +{ + g_free(si->colour); + si->colour = g_strdup(colour); +} - if (st->priv->pending) { - g_list_foreach (st->priv->pending, (GFunc) g_free, NULL); - g_list_free (st->priv->pending); - st->priv->pending = NULL; +static void +search_info_add_string(struct _search_info *si, const char *s) +{ + const char *start; + guint32 c; + + if (s && s[0]) { + /* strip leading whitespace */ + start = s; + while ((c = camel_utf8_getc((const unsigned char **)&s))) { + if (!g_unichar_isspace(c)) { + break; + } + start = s; + } + /* should probably also strip trailing, but i'm lazy today */ + if (start[0]) + g_ptr_array_add(si->strv, g_strdup(start)); } } static void +search_info_clear(struct _search_info *si) +{ + int i; + + for (i=0;i<si->strv->len;i++) + g_free(si->strv->pdata[i]); + + g_ptr_array_set_size(si->strv, 0); +} + +static void +search_info_free(struct _search_info *si) +{ + int i; + + for (i=0;i<si->strv->len;i++) + g_free(si->strv->pdata[i]); + + g_ptr_array_free(si->strv, TRUE); + g_free(si->colour); + g_free(si); +} + +static struct _search_info * +search_info_clone(struct _search_info *si) +{ + struct _search_info *out; + int i; + + out = search_info_new(); + for (i=0;i<si->strv->len;i++) + g_ptr_array_add(out->strv, g_strdup(si->strv->pdata[i])); + out->colour = g_strdup(si->colour); + out->flags = si->flags; + out->size = si->size; + + return out; +} + +static struct _searcher * +search_info_to_searcher(struct _search_info *si) +{ + char *tags, *tage; + char *col; + + if (si->strv->len == 0) + return NULL; + + if (si->colour == NULL) + col = "red"; + else + col = si->colour; + + tags = alloca(20+strlen(col)); + sprintf(tags, "%c<font color=\"%s\">", TAG_ESCAPE, col); + tage = 
alloca(20); + sprintf(tage, "%c</font>", TAG_ESCAPE); + + return searcher_new(si->flags, si->strv->len, (char **)si->strv->pdata, tags, tage); +} + +/* ********************************************************************** */ + +struct _ESearchingTokenizerPrivate { + struct _search_info *primary, *secondary; + struct _searcher *engine; +}; + +/** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/ + +/* shoudlnt' this be finalise? */ +static void e_searching_tokenizer_destroy (GtkObject *obj) { ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (obj); + struct _ESearchingTokenizerPrivate *p = st->priv; - e_searching_tokenizer_cleanup (st); + search_info_free (p->primary); + search_info_free (p->secondary); + if (p->engine) + searcher_free(p->engine); - search_info_free (st->priv->search); + /* again wtf? shared_state_unref (st->priv->shared); + */ - g_free (st->priv); - st->priv = NULL; + g_free(p); if (parent_class->destroy) parent_class->destroy (obj); @@ -507,14 +1011,24 @@ e_searching_tokenizer_class_init (ESearchingTokenizerClass *klass) static void e_searching_tokenizer_init (ESearchingTokenizer *st) { - st->priv = g_new0 (struct _ESearchingTokenizerPrivate, 1); - st->priv->shared = shared_state_new (); + struct _ESearchingTokenizerPrivate *p; + + p = st->priv = g_new0 (struct _ESearchingTokenizerPrivate, 1); + + p->primary = search_info_new(); + search_info_set_flags(p->primary, SEARCH_BOLD, SEARCH_CASE|SEARCH_BOLD); + search_info_set_colour(p->primary, "red"); + + p->secondary = search_info_new(); + search_info_set_flags(p->secondary, SEARCH_BOLD, SEARCH_CASE|SEARCH_BOLD); + search_info_set_colour(p->secondary, "purple"); } GtkType e_searching_tokenizer_get_type (void) { static GtkType e_searching_tokenizer_type = 0; + if (! e_searching_tokenizer_type) { static GtkTypeInfo e_searching_tokenizer_info = { "ESearchingTokenizer", @@ -539,353 +1053,35 @@ e_searching_tokenizer_new (void) /** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/ -static GList * -g_list_remove_head (GList *x) -{ - GList *repl = NULL; - if (x) { - repl = g_list_remove_link (x, x); - g_list_free_1 (x); - } - return repl; -} - -/* I can't believe that there isn't a better way to do this. */ -static GList * -g_list_insert_before (GList *list, GList *llink, gpointer data) -{ - gint pos = g_list_position (list, llink); - return g_list_insert (list, data, pos); -} - -static gchar * -pop_pending (ESearchingTokenizer *st) +/* blah blah the htmltokeniser doesn't like being asked + for a token if it doens't hvae any! */ +static char *get_token(HTMLTokenizer *t) { - gchar *token = NULL; - if (st->priv->pending) { - token = (gchar *) st->priv->pending->data; - st->priv->trash = g_list_prepend (st->priv->trash, token); - st->priv->pending = g_list_remove_head (st->priv->pending); - } - return token; -} - -static inline void -add_pending (ESearchingTokenizer *st, gchar *tok) -{ - st->priv->pending = g_list_append (st->priv->pending, tok); -} - -static void -add_pending_match_begin (ESearchingTokenizer *st, SearchInfo *si) -{ - gchar *size_str = NULL; - gchar *color_str= NULL; - - if (si->match_size_incr > 0) - size_str = g_strdup_printf (" size=+%d", si->match_size_incr); - if (si->match_color) - color_str = g_strdup_printf (" color=%s", si->match_color); - - if (size_str || color_str) - add_pending (st, g_strdup_printf ("%c<font%s%s>", - TAG_ESCAPE, - size_str ? size_str : "", - color_str ? 
color_str : "")); - - g_free (size_str); - g_free (color_str); - - if (si->match_bold) - add_pending (st, g_strdup_printf ("%c<b>", TAG_ESCAPE)); -} - -static void -add_pending_match_end (ESearchingTokenizer *st, SearchInfo *si) -{ - if (si->match_bold) - add_pending (st, g_strdup_printf ("%c</b>", TAG_ESCAPE)); - - if (si->match_size_incr > 0 || si->match_color) - add_pending (st, g_strdup_printf ("%c</font>", TAG_ESCAPE)); -} - -static void -add_to_trash (ESearchingTokenizer *st, gchar *txt) -{ - st->priv->trash = g_list_prepend (st->priv->trash, txt); -} - -static gchar * -get_next_token (ESearchingTokenizer *st) -{ - HTMLTokenizer *ht = HTML_TOKENIZER (st); HTMLTokenizerClass *klass = HTML_TOKENIZER_CLASS (parent_class); - - return klass->has_more (ht) ? klass->next_token (ht) : NULL; -} - -/* - * Move the matched part of the queue into pending, replacing the start and end placeholders by - * the appropriate tokens. - */ -static GList * -queue_matched (ESearchingTokenizer *st, SearchInfo *si, GList *q) -{ - GList *qh = q; - gboolean post_start = FALSE; - - while (q != NULL) { - GList *q_next = g_list_next (q); - if (!strcmp ((gchar *) q->data, START_MAGIC)) { - add_pending_match_begin (st, si); - post_start = TRUE; - } else if (!strcmp ((gchar *) q->data, END_MAGIC)) { - add_pending_match_end (st, si); - q_next = NULL; - } else { - gboolean is_tag = *((gchar *)q->data) == TAG_ESCAPE; - if (is_tag && post_start) - add_pending_match_end (st, si); - add_pending (st, g_strdup ((gchar *) q->data)); - if (is_tag && post_start) - add_pending_match_begin (st, si); - } - qh = g_list_remove_link (qh, q); - g_list_free_1 (q); - q = q_next; - } - - return qh; -} - -/* - * Strip the start and end placeholders out of the queue. - */ -static GList * -queue_match_failed (ESearchingTokenizer *st, GList *q) -{ - GList *qh = q; - - /* If we do find the START_MAGIC token in the queue, we want - to drop everything up to and including the token immediately - following START_MAGIC. */ - while (q != NULL && strcmp ((gchar *) q->data, START_MAGIC)) - q = g_list_next (q); - if (q) { - q = g_list_next (q); - /* If there is no token following START_MAGIC, something is - very wrong. */ - if (q == NULL) { - g_assert_not_reached (); - } - } - - /* Otherwise we just want to just drop the the first token. */ - if (q == NULL) - q = qh; - - /* Now move everything up to and including q to pending. */ - while (qh && qh != q) { - if (strcmp ((gchar *) qh->data, START_MAGIC)) - add_pending (st, g_strdup (qh->data)); - qh = g_list_remove_head (qh); - } - if (qh == q) { - if (strcmp ((gchar *) qh->data, START_MAGIC)) - add_pending (st, g_strdup (qh->data)); - qh = g_list_remove_head (qh); - } - - return qh; -} - -static void -matched (ESearchingTokenizer *st) -{ - ++st->priv->match_count; - gtk_signal_emit (GTK_OBJECT (st), e_searching_tokenizer_signals[EST_MATCH_SIGNAL]); -} - -static void -get_pending_tokens (ESearchingTokenizer *st) -{ - GList *queue = NULL; - gchar *token = NULL; - MatchInfo result; - gint start_pos, end_pos; - GList *start_after = NULL; - - /* Get an initial token into the queue. 
*/ - token = get_next_token (st); - if (token) { - queue = g_list_append (queue, token); - } - - while (queue) { - GList *q; - gboolean finished = FALSE; - search_info_reset (st->priv->search); - - if (start_after) { - q = g_list_next (start_after); - start_after = NULL; - } else { - q = queue; - } - - while (q) { - GList *q_next = g_list_next (q); - token = (gchar *) q->data; - - result = search_info_compare (st->priv->search, token, &start_pos, &end_pos); - - switch (result) { - - case MATCH_FAILED: - - queue = queue_match_failed (st, queue); - - finished = TRUE; - break; - - case MATCH_COMPLETE: - - if (start_pos != 0) - add_pending (st, g_strndup (token, start_pos)); - add_pending_match_begin (st, st->priv->search); - add_pending (st, g_strndup (token+start_pos, end_pos-start_pos)); - add_pending_match_end (st, st->priv->search); - if (*(token+end_pos)) { - queue->data = g_strdup (token+end_pos); - add_to_trash (st, (gchar *) queue->data); - } else { - queue = g_list_remove_head (queue); - } - - matched (st); - - finished = TRUE; - break; - - case MATCH_START: { - - gchar *s1 = g_strndup (token, start_pos); - gchar *s2 = g_strdup (START_MAGIC); - gchar *s3 = g_strdup (token+start_pos); - - queue = g_list_insert_before (queue, q, s1); - queue = g_list_insert_before (queue, q, s2); - queue = g_list_insert_before (queue, q, s3); - - add_to_trash (st, s1); - add_to_trash (st, s2); - add_to_trash (st, s3); - - queue = g_list_remove_link (queue, q); - finished = FALSE; - break; - } - - case MATCH_CONTINUES: - /* Do nothing... */ - finished = FALSE; - break; - - case MATCH_END: { - gchar *s1 = g_strndup (token, end_pos); - gchar *s2 = g_strdup (END_MAGIC); - gchar *s3 = g_strdup (token+end_pos); - - queue = g_list_insert_before (queue, q, s1); - queue = g_list_insert_before (queue, q, s2); - queue = g_list_insert_before (queue, q, s3); - - add_to_trash (st, s1); - add_to_trash (st, s2); - add_to_trash (st, s3); - - queue = g_list_remove_link (queue, q); - queue = queue_matched (st, st->priv->search, queue); - - matched (st); - - finished = TRUE; - break; - } - - default: - g_assert_not_reached (); - } - - /* If we reach the end of the queue but we aren't finished, try to pull in another - token and stick it onto the end. */ - if (q_next == NULL && !finished) { - gchar *next_token = get_next_token (st); - if (next_token) { - queue = g_list_append (queue, next_token); - q_next = g_list_last (queue); - } - } - q = finished ? NULL : q_next; - - } /* while (q) */ - - if (!finished && queue) { /* ...we add the token at the head of the queue to pending and try again. */ - add_pending (st, g_strdup ((gchar *) queue->data)); - queue = g_list_remove_head (queue); - } - - } /* while (queue) */ + return klass->has_more(t) ? 
klass->next_token(t) : NULL; } -/** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/ - static void e_searching_tokenizer_begin (HTMLTokenizer *t, gchar *content_type) { ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (t); - SearchInfo *si; + struct _ESearchingTokenizerPrivate *p = st->priv; - /* Reset our search */ - search_info_free (st->priv->search); - st->priv->search = NULL; - - if (st->priv->shared && (st->priv->shared->str_primary || st->priv->shared->str_secondary)) { - st->priv->search = search_info_new (); + /* reset search */ + if (p->engine) { + searcher_free(p->engine); + p->engine = NULL; } - si = st->priv->search; - - if (st->priv->shared && si) { - if (st->priv->shared->str_primary) { - search_info_set_string (si, st->priv->shared->str_primary); - search_info_set_case_sensitivity (si, st->priv->shared->case_sensitive_primary); - - search_info_set_match_color (si, "red"); - search_info_set_match_bold (si, TRUE); - - } else if (st->priv->shared->str_secondary) { - - search_info_set_string (si, st->priv->shared->str_secondary); - search_info_set_case_sensitivity (si, st->priv->shared->case_sensitive_secondary); - - search_info_set_match_color (si, "purple"); - search_info_set_match_bold (si, TRUE); - } - - } else { - - search_info_free (st->priv->search); - st->priv->search = NULL; + if ((p->engine = search_info_to_searcher(p->primary)) + || (p->engine = search_info_to_searcher(p->secondary))) { + /*HTMLTokenizerClass *klass = HTML_TOKENIZER_CLASS (parent_class);*/ + /*searcher_set_tokenfunc(p->engine, klass->next_token, st);*/ + searcher_set_tokenfunc(p->engine, get_token, st); } - - e_searching_tokenizer_cleanup (st); - search_info_reset (st->priv->search); - - st->priv->match_count = 0; + /* else - no engine, no search active */ HTML_TOKENIZER_CLASS (parent_class)->begin (t, content_type); } @@ -894,7 +1090,17 @@ static void e_searching_tokenizer_end (HTMLTokenizer *t) { ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (t); - e_searching_tokenizer_cleanup (st); + struct _ESearchingTokenizerPrivate *p = st->priv; + + /* so end gets called before any get/next tokens. + I dont get it. */ +#if 0 + /* not sure if we should reset search every time ... *shrug* */ + if (p->engine) { + searcher_free(p->engine); + p->engine = NULL; + } +#endif HTML_TOKENIZER_CLASS (parent_class)->end (t); } @@ -905,26 +1111,32 @@ e_searching_tokenizer_peek_token (HTMLTokenizer *tok) ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (tok); /* If no search is active, just use the default method. */ - if (st->priv->search == NULL) + if (st->priv->engine == NULL) return HTML_TOKENIZER_CLASS (parent_class)->peek_token (tok); - if (st->priv->pending == NULL) - get_pending_tokens (st); - return st->priv->pending ? (gchar *) st->priv->pending->data : NULL; + return searcher_peek_token(st->priv->engine); } static gchar * e_searching_tokenizer_next_token (HTMLTokenizer *tok) { ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (tok); + int oldmatched; + char *token; /* If no search is active, just use the default method. 
*/ - if (st->priv->search == NULL) + if (st->priv->engine == NULL) return HTML_TOKENIZER_CLASS (parent_class)->next_token (tok); - if (st->priv->pending == NULL) - get_pending_tokens (st); - return pop_pending (st); + oldmatched = st->priv->engine->matchcount; + + token = searcher_next_token(st->priv->engine); + + /* not sure if this has to be accurate or just say we had some matches */ + if (oldmatched != st->priv->engine->matchcount) + gtk_signal_emit (GTK_OBJECT (st), e_searching_tokenizer_signals[EST_MATCH_SIGNAL]); + + return token; } static gboolean @@ -932,10 +1144,16 @@ e_searching_tokenizer_has_more (HTMLTokenizer *tok) { ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (tok); - /* If no search is active, pending will always be NULL and thus - we'll always fall back to using the default method. */ + return (st->priv->engine != NULL && searcher_pending(st->priv->engine)) + || HTML_TOKENIZER_CLASS (parent_class)->has_more (tok); +} - return st->priv->pending || HTML_TOKENIZER_CLASS (parent_class)->has_more (tok); +/* proxy matched event, not sure what its for otherwise */ +static void +matched (ESearchingTokenizer *st) +{ + /*++st->priv->match_count;*/ + gtk_signal_emit (GTK_OBJECT (st), e_searching_tokenizer_signals[EST_MATCH_SIGNAL]); } static HTMLTokenizer * @@ -944,15 +1162,20 @@ e_searching_tokenizer_clone (HTMLTokenizer *tok) ESearchingTokenizer *orig_st = E_SEARCHING_TOKENIZER (tok); ESearchingTokenizer *new_st = E_SEARCHING_TOKENIZER (e_searching_tokenizer_new ()); - if (new_st->priv->search) { - search_info_free (new_st->priv->search); - } + search_info_free(new_st->priv->primary); + search_info_free(new_st->priv->secondary); + + new_st->priv->primary = search_info_clone(orig_st->priv->primary); + new_st->priv->secondary = search_info_clone(orig_st->priv->secondary); - new_st->priv->search = search_info_clone (orig_st->priv->search); + printf("cloinging shit\n"); + /* what the fucking what???? 
*/ +#if 0 shared_state_ref (orig_st->priv->shared); shared_state_unref (new_st->priv->shared); new_st->priv->shared = orig_st->priv->shared; +#endif gtk_signal_connect_object (GTK_OBJECT (new_st), "match", @@ -963,42 +1186,29 @@ e_searching_tokenizer_clone (HTMLTokenizer *tok) } /* ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** */ -static gboolean -only_whitespace (const gchar *p) +void +e_searching_tokenizer_set_primary_search_string (ESearchingTokenizer *st, const gchar *search_str) { - gunichar c; - g_return_val_if_fail (p, FALSE); + g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st)); - while (*p && g_unichar_validate (c = g_utf8_get_char (p))) { - if (!g_unichar_isspace (c)) - return FALSE; - p = g_utf8_next_char (p); - } - return TRUE; + search_info_clear(st->priv->primary); + search_info_add_string(st->priv->primary, search_str); } void -e_searching_tokenizer_set_primary_search_string (ESearchingTokenizer *st, const gchar *search_str) +e_searching_tokenizer_add_primary_search_string (ESearchingTokenizer *st, const gchar *search_str) { g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st)); - g_free (st->priv->shared->str_primary); - st->priv->shared->str_primary = NULL; - - if (search_str != NULL - && g_utf8_validate (search_str, -1, NULL) - && !only_whitespace (search_str)) { - - st->priv->shared->str_primary = g_strdup (search_str); - } + search_info_add_string(st->priv->primary, search_str); } void -e_searching_tokenizer_set_primary_case_sensitivity (ESearchingTokenizer *st, gboolean is_case_sensitive) +e_searching_tokenizer_set_primary_case_sensitivity (ESearchingTokenizer *st, gboolean iscase) { g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st)); - st->priv->shared->case_sensitive_primary = is_case_sensitive; + search_info_set_flags(st->priv->primary, iscase?SEARCH_CASE:0, SEARCH_CASE); } void @@ -1006,23 +1216,24 @@ e_searching_tokenizer_set_secondary_search_string (ESearchingTokenizer *st, cons { g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st)); - g_free (st->priv->shared->str_secondary); - st->priv->shared->str_secondary = NULL; + search_info_clear(st->priv->secondary); + search_info_add_string(st->priv->secondary, search_str); +} - if (search_str != NULL - && g_utf8_validate (search_str, -1, NULL) - && !only_whitespace (search_str)) { - - st->priv->shared->str_secondary = g_strdup (search_str); - } +void +e_searching_tokenizer_add_secondary_search_string (ESearchingTokenizer *st, const gchar *search_str) +{ + g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st)); + + search_info_add_string(st->priv->secondary, search_str); } void -e_searching_tokenizer_set_secondary_case_sensitivity (ESearchingTokenizer *st, gboolean is_case_sensitive) +e_searching_tokenizer_set_secondary_case_sensitivity (ESearchingTokenizer *st, gboolean iscase) { g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st)); - st->priv->shared->case_sensitive_secondary = is_case_sensitive; + search_info_set_flags(st->priv->secondary, iscase?SEARCH_CASE:0, SEARCH_CASE); } gint @@ -1030,8 +1241,8 @@ e_searching_tokenizer_match_count (ESearchingTokenizer *st) { g_return_val_if_fail (st && E_IS_SEARCHING_TOKENIZER (st), -1); - return st->priv->match_count; -} - - + if (st->priv->engine) + return st->priv->engine->matchcount; + return 0; +} |
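
One implementation detail from the diff above that is easy to miss: the automaton only reports that a match of `final` characters just ended, so `searcher_new` allocates a power-of-two ring buffer (`last`, `last_mask`, `lastp`) of recent character byte offsets, and `searcher_next_token` recovers the match's starting offset with a masked lookup `final` characters back. A stripped-down sketch of just that bookkeeping — names and the sizing loop are simplified, this is not the patch's own code:

```c
#include <glib.h>

/* Record the byte offset of each decoded character in a power-of-two
 * ring buffer; "nchars back" is then a single masked index away. */
struct offsets {
	guint32 *last;
	guint32 mask;	/* size - 1, where size is a power of two */
	guint32 pos;
};

static struct offsets *
offsets_new (int longest_pattern)
{
	struct offsets *o = g_malloc (sizeof (*o));
	guint32 size = 2;

	/* round up to a power of two at least longest_pattern + 1 */
	while (size < (guint32) longest_pattern + 1)
		size <<= 1;

	o->last = g_malloc (sizeof (o->last[0]) * size);
	o->mask = size - 1;
	o->pos = 0;
	return o;
}

/* Call once per decoded character with its byte offset in the stream. */
static void
offsets_push (struct offsets *o, guint32 byte_offset)
{
	o->last[o->pos] = byte_offset;
	o->pos = (o->pos + 1) & o->mask;
}

/* Byte offset where a match of 'nchars' characters started
 * (valid while nchars is no larger than the buffer). */
static guint32
offsets_start (struct offsets *o, int nchars)
{
	return o->last[(o->pos - nchars) & o->mask];
}
```

Keeping the size a power of two means the wrap-around is a single AND with the mask rather than a modulo on every character.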