| author | Not Zed <NotZed@Ximian.com> | 2002-05-21 10:54:05 +0800 |
|---|---|---|
| committer | Michael Zucci <zucchi@src.gnome.org> | 2002-05-21 10:54:05 +0800 |
| commit | e32a24e30ba3558de1ccf54b2f2aa0c348f07b65 (patch) | |
| tree | fbd985d74c9d20593d09e8be7cbf1e53ab87d985 /mail/e-searching-tokenizer.c | |
| parent | 8435ad3d2db7d4cb0cbed199cd58ef0a5e0ae366 (diff) | |
New functions to add additional search strings one at a time.
2002-05-18 Not Zed <NotZed@Ximian.com>
* e-searching-tokenizer.c
(e_searching_tokenizer_add_primary_search_string):
(e_searching_tokenizer_add_secondary_search_string): New functions
to add additional search strings one at a time. Maybe it should
just split the word itself?
(all): Basically, entirely rewritten. Now implements the
Aho-Corasick multiple-pattern search algorithm, handles
multiple search strings, and only ever has to decode each
UTF-8 character once.
svn path=/trunk/; revision=16961
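
As a usage note (not part of the patch): the new add_*_search_string entry points let a caller feed the tokenizer one search string at a time instead of a single literal string. A minimal caller-side sketch, assuming the query has already been split into words — the `set_query` helper and its arguments are invented for illustration; only the `e_searching_tokenizer_*` calls come from the code below:

```c
/* Hypothetical caller-side helper, not part of the patch. */
#include "e-searching-tokenizer.h"

static void
set_query (ESearchingTokenizer *st, char **words, int nwords, gboolean sensitive)
{
	int i;

	/* set_...() clears any previously configured strings and adds the first one */
	e_searching_tokenizer_set_primary_search_string (st, nwords > 0 ? words[0] : NULL);

	/* ...and add_...() appends further strings one at a time */
	for (i = 1; i < nwords; i++)
		e_searching_tokenizer_add_primary_search_string (st, words[i]);

	e_searching_tokenizer_set_primary_case_sensitivity (st, sensitive);
}
```

Each added string becomes its own pattern in the search trie, so every word is highlighted in a single pass over the HTML token stream.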
Diffstat (limited to 'mail/e-searching-tokenizer.c')
-rw-r--r-- | mail/e-searching-tokenizer.c | 1531 |
1 file changed, 871 insertions, 660 deletions
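
For orientation before the patch itself: the rewrite is built around the Aho-Corasick automaton — a trie of all patterns plus failure links that let the scan fall back to the longest still-viable suffix on a mismatch, so all patterns are searched for in one pass. Below is a self-contained toy sketch of that construction in plain C. It works on raw bytes, uses a fixed-size BFS queue, and reports only the longest match ending at each position (mirroring the patch's `final` propagation); the real code additionally decodes UTF-8 once per character, folds case, and threads HTML tokens through the matcher. All names in the sketch are illustrative, not taken from the patch.

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* One state per matched prefix: goto edges, a failure link, and the
 * length of the longest pattern ending in this state (0 = none). */
struct node {
	struct node *go[256];
	struct node *fail;
	int final;
};

static struct node *
node_new (void)
{
	return calloc (1, sizeof (struct node));
}

/* Step 1: merge every pattern into one trie, sharing common prefixes. */
static void
add_pattern (struct node *root, const char *pat)
{
	struct node *q = root;
	const char *p = pat;

	while (*p) {
		unsigned char c = (unsigned char) *p++;
		if (q->go[c] == NULL)
			q->go[c] = node_new ();
		q = q->go[c];
	}
	q->final = strlen (pat);
}

/* Step 2: breadth-first pass filling in failure links, so a mismatch
 * falls back to the longest suffix that is still a live prefix. */
static void
build_failure_links (struct node *root)
{
	struct node **queue = malloc (1024 * sizeof (*queue));	/* plenty for a toy */
	int head = 0, tail = 0, c;

	for (c = 0; c < 256; c++)
		if (root->go[c]) {
			root->go[c]->fail = root;
			queue[tail++] = root->go[c];
		}
	while (head < tail) {
		struct node *q = queue[head++];
		for (c = 0; c < 256; c++) {
			struct node *child = q->go[c], *r = q->fail;
			if (child == NULL)
				continue;
			while (r != root && r->go[c] == NULL)
				r = r->fail;
			child->fail = r->go[c] ? r->go[c] : root;
			if (child->fail->final > child->final)
				child->final = child->fail->final;	/* keep longest, as the patch does */
			queue[tail++] = child;
		}
	}
	free (queue);
}

/* Scan the text once, reporting the longest match ending at each offset. */
static void
search (struct node *root, const char *text)
{
	struct node *q = root;
	int i;

	for (i = 0; text[i]; i++) {
		unsigned char c = (unsigned char) text[i];
		while (q != root && q->go[c] == NULL)
			q = q->fail;
		if (q->go[c])
			q = q->go[c];
		if (q->final)
			printf ("match of length %d ending at offset %d\n", q->final, i + 1);
	}
}

int
main (void)
{
	const char *words[] = { "he", "she", "his", "hers" };
	struct node *root = node_new ();
	int i;

	for (i = 0; i < 4; i++)
		add_pattern (root, words[i]);
	build_failure_links (root);
	search (root, "ushers");	/* reports a 3-char match ("she"), then a 4-char one ("hers") */
	return 0;
}
```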
diff --git a/mail/e-searching-tokenizer.c b/mail/e-searching-tokenizer.c index 602dd3858d..ce0e9d94fa 100644 --- a/mail/e-searching-tokenizer.c +++ b/mail/e-searching-tokenizer.c @@ -3,9 +3,11 @@ /* * e-searching-tokenizer.c * - * Copyright (C) 2001 Ximian, Inc. + * Copyright (C) 2002 Ximian, Inc. * * Developed by Jon Trowbridge <trow@ximian.com> + * Rewritten significantly to handle multiple strings and improve performance + * by Michael Zucchi <notzed@ximian.com> */ /* @@ -30,15 +32,17 @@ #include <gal/unicode/gunicode.h> #include "e-searching-tokenizer.h" +#include "e-util/e-memory.h" +#include "e-util/e-msgport.h" + +#define d(x) + enum { EST_MATCH_SIGNAL, EST_LAST_SIGNAL }; guint e_searching_tokenizer_signals[EST_LAST_SIGNAL] = { 0 }; -#define START_MAGIC "<\n>S<\n>" -#define END_MAGIC "<\n>E<\n>" - static void e_searching_tokenizer_begin (HTMLTokenizer *, gchar *); static void e_searching_tokenizer_end (HTMLTokenizer *); static gchar *e_searching_tokenizer_peek_token (HTMLTokenizer *); @@ -47,34 +51,14 @@ static gboolean e_searching_tokenizer_has_more (HTMLTokenizer *); static HTMLTokenizer *e_searching_tokenizer_clone (HTMLTokenizer *); -static const gchar *ignored_tags[] = { "b", "i", NULL }; -static const gchar *space_tags[] = { "br", NULL }; +/* + static const gchar *space_tags[] = { "br", NULL };*/ GtkObjectClass *parent_class = NULL; /** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/ -typedef enum { - MATCH_FAILED = 0, - MATCH_COMPLETE, - MATCH_START, - MATCH_CONTINUES, - MATCH_END -} MatchInfo; - -typedef struct _SearchInfo SearchInfo; -struct _SearchInfo { - gchar *search; - gchar *current; - - gboolean case_sensitive; - gboolean allow_space_tags_to_match_whitespace; - - gint match_size_incr; - gchar *match_color; - gboolean match_bold; -}; - +/* ??? typedef struct _SharedState SharedState; struct _SharedState { gint refs; @@ -83,17 +67,12 @@ struct _SharedState { gboolean case_sensitive_primary; gboolean case_sensitive_secondary; }; +*/ -struct _ESearchingTokenizerPrivate { - gint match_count; - SearchInfo *search; - GList *pending; - GList *trash; - SharedState *shared; -}; +/* ********************************************************************** */ -/** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/ +#if 0 static SharedState * shared_state_new (void) { @@ -123,353 +102,878 @@ shared_state_unref (SharedState *shared) } } } +#endif /** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/ -static SearchInfo * -search_info_new (void) +/* ********************************************************************** */ + +/* Utility functions */ + +/* This is faster and safer than glib2's utf8 abomination, but isn't exported from camel as yet */ +static __inline__ guint32 +camel_utf8_getc(const unsigned char **ptr) { - SearchInfo *si; + register unsigned char *p = (unsigned char *)*ptr; + register unsigned char c, r; + register guint32 v, m; + +again: + r = *p++; +loop: + if (r < 0x80) { + *ptr = p; + v = r; + } else if (r < 0xfe) { /* valid start char? 
*/ + v = r; + m = 0x7f80; /* used to mask out the length bits */ + do { + c = *p++; + if ((c & 0xc0) != 0x80) { + r = c; + goto loop; + } + v = (v<<6) | (c & 0x3f); + r<<=1; + m<<=5; + } while (r & 0x40); + + *ptr = p; - si = g_new0 (SearchInfo, 1); - si->case_sensitive = FALSE; + v &= ~m; + } else { + goto again; + } - si->match_size_incr = 1; - si->match_color = g_strdup ("red"); - si->match_bold = FALSE; + return v; +} + + +/* note: our tags of interest are 7 bit ascii, only, no need to do any fancy utf8 stuff */ +/* tags should be upper case + if this list gets longer than 10 entries, consider binary search */ +static char *ignored_tags[] = { "B", "I", "FONT", "TT", "EM", /* and more? */}; + +static int +ignore_tag(const char *tag) +{ + char *t = alloca(strlen(tag)+1), c, *out; + const char *in; + int i; + + /* we could use a aho-corasick matcher here too ... but we wont */ + + /* normalise tag into 't'. + Note we use the property that the only tags we're interested in + are 7 bit ascii to shortcut and simplify case insensitivity */ + in = tag+2; /* skip: TAG_ESCAPE '<' */ + if (*in == '/') + in++; + out = t; + while ((c = *in++)) { + if (c >= 'A' && c <= 'Z') + *out++ = c; + else if (c >= 'a' && c <= 'z') + *out++ = c & 0xdf; /* convert ASCII to upper case */ + else + /* maybe should check for > or ' ' etc? */ + break; + } + *out = 0; - si->allow_space_tags_to_match_whitespace = TRUE; + for (i=0;i<sizeof(ignored_tags)/sizeof(ignored_tags[0]);i++) { + if (strcmp(t, ignored_tags[i]) == 0) + return 1; + } - return si; + return 0; } -static void -search_info_free (SearchInfo *si) +/* ********************************************************************** */ + +/* Aho-Corasick search tree implmeentation */ + +/* next state if we match a character */ +struct _match { + struct _match *next; + guint32 ch; + struct _state *match; +}; + +/* tree state node */ +struct _state { + struct _match *matches; + unsigned int final; /* max no of chars we just matched */ + struct _state *fail; /* where to try next if we fail */ + struct _state *next; /* next on this level? 
*/ +}; + +/* base tree structure */ +struct _trie { + struct _state root; + int max_depth; + + EMemChunk *state_chunks; + EMemChunk *match_chunks; +}; + +static void +dump_trie(struct _state *s, int d) { - if (si) { - g_free (si->search); - g_free (si->match_color); - g_free (si); + char *p = alloca(d*2+1); + struct _match *m; + + memset(p, ' ', d*2); + p[d*2]=0; + + printf("%s[state] %p: %d fail->%p\n", p, s, s->final, s->fail); + m = s->matches; + while (m) { + printf(" %s'%c' -> %p\n", p, m->ch, m->match); + if (m->match) + dump_trie(m->match, d+1); + m = m->next; } } -static SearchInfo * -search_info_clone (SearchInfo *si) +/* This builds an Aho-Corasick search trie for a set of utf8 words */ +/* See + http://www-sr.informatik.uni-tuebingen.de/~buehler/AC/AC.html + for a neat demo */ + +static __inline__ struct _match * +g(struct _state *q, guint32 c) { - SearchInfo *new_si = NULL; + struct _match *m = q->matches; + + while (m && m->ch != c) + m = m->next; - if (si) { - new_si = search_info_new (); - new_si->search = g_strdup (si->search); - new_si->case_sensitive = si->case_sensitive; + return m; +} + +static struct _trie * +build_trie(int nocase, int len, char **words) +{ + struct _state *q, *qt, *r; + char *word; + struct _match *m, *n; + int i, depth; + guint32 c; + struct _trie *trie; + int state_depth_max, state_depth_size; + struct _state **state_depth; + + trie = g_malloc(sizeof(*trie)); + trie->root.matches = 0; + trie->root.final = 0; + trie->root.fail = 0; + trie->root.next = 0; + + trie->state_chunks = e_memchunk_new(8, sizeof(struct _state)); + trie->match_chunks = e_memchunk_new(8, sizeof(struct _match)); + + /* This will correspond to the length of the longest pattern */ + state_depth_size = 0; + state_depth_max = 64; + state_depth = g_malloc(sizeof(*state_depth[0])*64); + state_depth[0] = NULL; + + /* Step 1: Build trie */ + + /* This just builds a tree that merges all common prefixes into the same branch */ + + for (i=0;i<len;i++) { + word = words[i]; + q = &trie->root; + depth = 0; + while ((c = camel_utf8_getc((const unsigned char **)&word))) { + if (nocase) + c = g_unichar_tolower(c); + m = g(q, c); + if (m == 0) { + m = e_memchunk_alloc(trie->match_chunks); + m->ch = c; + m->next = q->matches; + q->matches = m; + q = m->match = e_memchunk_alloc(trie->state_chunks); + q->matches = 0; + q->fail = &trie->root; + q->final = 0; + if (state_depth_max < depth) { + state_depth_max += 64; + state_depth = g_realloc(state_depth, sizeof(*state_depth[0])*state_depth_max); + } + if (state_depth_size < depth) { + state_depth[depth] = 0; + state_depth_size = depth; + } + q->next = state_depth[depth]; + state_depth[depth] = q; + } else { + q = m->match; + } + depth++; + } + q->final = depth; } - return new_si; + d(printf("Dumping trie:\n")); + d(dump_trie(&trie->root, 0)); + + /* Step 2: Build failure graph */ + + /* This searches for the longest substring which is a prefix of another string and + builds a graph of failure links so you can find multiple substrings concurrently, + using aho-corasick's algorithm */ + + for (i=0;i<state_depth_size;i++) { + q = state_depth[i]; + while (q) { + m = q->matches; + while (m) { + c = m->ch; + qt = m->match; + r = q->fail; + while (r != 0 && (n = g(r, c)) == NULL) + r = r->fail; + if (r != 0) { + qt->fail = n->match; + if (qt->fail->final > qt->final) + qt->final = qt->fail->final; + } else { + if ((n = g(&trie->root, c))) + qt->fail = n->match; + else + qt->fail = &trie->root; + } + m = m->next; + } + q = q->next; + } + } + + d(printf("After 
failure analysis\n")); + d(dump_trie(&trie->root, 0)); + + g_free(state_depth); + + trie->max_depth = state_depth_size; + + return trie; } static void -search_info_set_string (SearchInfo *si, const gchar *str) +free_trie(struct _trie *t) { - g_return_if_fail (si); - g_return_if_fail (str); + e_memchunk_destroy(t->match_chunks); + e_memchunk_destroy(t->state_chunks); - g_free (si->search); - si->search = g_strdup (str); - si->current = NULL; + g_free(t); } +/* ********************************************************************** */ + +/* html token searcher */ + +struct _token { + struct _token *next; + struct _token *prev; + unsigned int offset; + /* we need to copy the token for memory management, so why not copy it whole */ + char tok[1]; +}; + +/* stack of submatches currently being scanned, used for merging */ +struct _submatch { + unsigned int offstart, offend; /* in bytes */ +}; + +/* flags for new func */ +#define SEARCH_CASE (1) +#define SEARCH_BOLD (2) + +struct _searcher { + struct _trie *t; + + char *(*next_token)(); /* callbacks for more tokens */ + void *next_data; + + int words; /* how many words */ + char *tags, *tage; /* the tag we used to highlight */ + + int flags; /* case sensitive or not */ + + struct _state *state; /* state is the current trie state */ + + int matchcount; + + EDList input; /* pending 'input' tokens, processed but might match */ + EDList output; /* output tokens ready for source */ + + struct _token *current; /* for token output memory management */ + + guint32 offset; /* current offset through searchable stream? */ + guint32 offout; /* last output position */ + + unsigned int lastp; /* current position in rotating last buffer */ + guint32 *last; /* buffer that goes back last 'n' positions */ + guint32 last_mask; /* bitmask for efficient rotation calculation */ + + unsigned int submatchp; /* submatch stack */ + struct _submatch *submatches; +}; + static void -search_info_set_case_sensitivity (SearchInfo *si, gboolean flag) +searcher_set_tokenfunc(struct _searcher *s, char *(*next)(), void *data) { - g_return_if_fail (si); + s->next_token = next; + s->next_data = data; +} - si->case_sensitive = flag; +static struct _searcher * +searcher_new(int flags, int argc, char **argv, const char *tags, const char *tage) +{ + int i, m; + struct _searcher *s; + + s = g_malloc(sizeof(*s)); + + s->t = build_trie((flags&SEARCH_CASE) == 0, argc, argv); + s->words = argc; + s->tags = g_strdup(tags); + s->tage = g_strdup(tage); + s->flags = flags; + s->state = &s->t->root; + s->matchcount = 0; + + e_dlist_init(&s->input); + e_dlist_init(&s->output); + s->current = 0; + + s->offset = 0; + s->offout = 0; + + /* rotating queue of previous character positions */ + m = s->t->max_depth+1; + i = 2; + while (i<m) + i<<=2; + s->last = g_malloc(sizeof(s->last[0])*i); + s->last_mask = i-1; + s->lastp = 0; + + /* a stack of possible submatches */ + s->submatchp = 0; + s->submatches = g_malloc(sizeof(s->submatches[0])*argc+1); + + return s; } -#if 0 static void -search_info_set_match_size_increase (SearchInfo *si, gint incr) +searcher_free(struct _searcher *s) { - g_return_if_fail (si); - g_return_if_fail (incr >= 0); + struct _token *t; + + while ((t = (struct _token *)e_dlist_remhead(&s->input))) + g_free(t); + while ((t = (struct _token *)e_dlist_remhead(&s->output))) + g_free(t); + g_free(s->tags); + g_free(s->tage); + g_free(s->last); + g_free(s->submatches); + free_trie(s->t); + g_free(s); +} +static struct _token * +append_token(EDList *list, const char *tok, int len) +{ + 
struct _token *token; + + if (len == -1) + len = strlen(tok); + token = g_malloc(sizeof(*token) + len+1); + token->offset = 0; /* set by caller when required */ + memcpy(token->tok, tok, len); + token->tok[len] = 0; + e_dlist_addtail(list, (EDListNode *)token); - si->match_size_incr = incr; + return token; } -#endif + +#define free_token(x) (g_free(x)) static void -search_info_set_match_color (SearchInfo *si, const gchar *color) +output_token(struct _searcher *s, struct _token *token) { - g_return_if_fail (si); + int offend; + int left, pre; - g_free (si->match_color); - si->match_color = g_strdup (color); + if (token->tok[0] == TAG_ESCAPE) { + if (token->offset >= s->offout) { + d(printf("moving tag token '%s' from input to output\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok)); + e_dlist_addtail(&s->output, (EDListNode *)token); + } else { + d(printf("discarding tag token '%s' from input\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok)); + free_token(token); + } + } else { + offend = token->offset + strlen(token->tok); + left = offend-s->offout; + if (left > 0) { + pre = s->offout - token->offset; + if (pre>0) + memmove(token->tok, token->tok+pre, left+1); + d(printf("adding partial remaining/failed '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok)); + s->offout = offend; + e_dlist_addtail(&s->output, (EDListNode *)token); + } else { + d(printf("discarding whole token '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok)); + free_token(token); + } + } } -static void -search_info_set_match_bold (SearchInfo *si, gboolean flag) +static struct _token * +find_token(struct _searcher *s, int start) { - g_return_if_fail (si); + register struct _token *token; + + /* find token which is start token, from end of list back */ + token = (struct _token *)s->input.tailpred; + while (token->prev) { + if (token->offset <= start) + return token; + token = token->prev; + } - si->match_bold = flag; + return NULL; } static void -search_info_reset (SearchInfo *si) +output_match(struct _searcher *s, unsigned int start, unsigned int end) { - if (si == NULL) + register struct _token *token; + struct _token *starttoken, *endtoken; + char b[8]; + + d(printf("output match: %d-%d at %d\n", start, end, s->offout)); + + starttoken = find_token(s, start); + endtoken = find_token(s, end); + + if (starttoken == NULL || endtoken == NULL) { + printf("Cannot find match history for match %d-%d\n", start, end); return; - si->current = NULL; -} + } -/* ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** */ + d(printf("start in token '%s'\n", starttoken->tok[0]==TAG_ESCAPE?starttoken->tok+1:starttoken->tok)); + d(printf("end in token '%s'\n", endtoken->tok[0]==TAG_ESCAPE?endtoken->tok+1:endtoken->tok)); -static const gchar * -find_whole (SearchInfo *si, const gchar *haystack, const gchar *needle) -{ - const gchar *h, *n; - - g_return_val_if_fail (si, NULL); - g_return_val_if_fail (haystack && needle, NULL); - g_return_val_if_fail (g_utf8_validate (haystack, -1, NULL), NULL); - g_return_val_if_fail (g_utf8_validate (needle, -1, NULL), NULL); - - while (*haystack) { - h = haystack; - n = needle; - while (*h && *n) { - gunichar c1 = g_utf8_get_char (h); - gunichar c2 = g_utf8_get_char (n); - - if (!si->case_sensitive) { - c1 = g_unichar_tolower (c1); - c2 = g_unichar_tolower (c2); - } + /* output pending stuff that didn't match afterall */ + while ((struct _token *)s->input.head != starttoken) { + token = (struct _token *)e_dlist_remhead(&s->input); + d(printf("appending failed 
match '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok)); + output_token(s, token); + } - if (c1 != c2) - break; - - h = g_utf8_next_char (h); - n = g_utf8_next_char (n); + /* output any pre-match text */ + if (s->offout < start) { + token = append_token(&s->output, starttoken->tok + (s->offout-starttoken->offset), start-s->offout); + d(printf("adding pre-match text '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok)); + s->offout = start; + } + + /* output highlight/bold */ + if (s->flags & SEARCH_BOLD) { + sprintf(b, "%c<b>", (char)TAG_ESCAPE); + append_token(&s->output, b, -1); + } + if (s->tags) + append_token(&s->output, s->tags, -1); + + /* output match node(s) */ + if (starttoken != endtoken) { + while ((struct _token *)s->input.head != endtoken) { + token = (struct _token *)e_dlist_remhead(&s->input); + d(printf("appending (partial) match node (head) '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok)); + output_token(s, token); } - if (*n == '\0') - return haystack; - if (*h == '\0') - return NULL; - haystack = g_utf8_next_char (haystack); } - return NULL; + /* any remaining partial content */ + if (s->offout < end) { + token = append_token(&s->output, endtoken->tok+(s->offout-endtoken->offset), end-s->offout); + d(printf("appending (partial) match node (tail) '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok)); + s->offout = end; + } + + /* end highlight */ + if (s->tage) + append_token(&s->output, s->tage, -1); + + /* and close bold if we need to */ + if (s->flags & SEARCH_BOLD) { + sprintf(b, "%c</b>", (char)TAG_ESCAPE); + append_token(&s->output, b, -1); + } } -/* This is a really stupid implementation of this function. */ -static const gchar * -find_head (SearchInfo *si, const gchar *haystack, const gchar *needle) +/* output any sub-pending blocks */ +static void +output_subpending(struct _searcher *s) { - const gchar *h, *n; - - g_return_val_if_fail (si, NULL); - g_return_val_if_fail (haystack && needle, NULL); - g_return_val_if_fail (g_utf8_validate (haystack, -1, NULL), NULL); - g_return_val_if_fail (g_utf8_validate (needle, -1, NULL), NULL); - - while (*haystack) { - h = haystack; - n = needle; - while (*h && *n) { - gunichar c1 = g_utf8_get_char (h); - gunichar c2 = g_utf8_get_char (n); - - if (!si->case_sensitive) { - c1 = g_unichar_tolower (c1); - c2 = g_unichar_tolower (c2); - } + int i; - if (c1 != c2) - break; + for (i=s->submatchp-1;i>=0;i--) + output_match(s, s->submatches[i].offstart, s->submatches[i].offend); + s->submatchp = 0; +} - h = g_utf8_next_char (h); - n = g_utf8_next_char (n); +/* returns true if a merge took place */ +static int +merge_subpending(struct _searcher *s, int offstart, int offend) +{ + int i; + + /* merges overlapping or abutting match strings */ + if (s->submatchp && + s->submatches[s->submatchp-1].offend >= offstart) { + + /* go from end, any that match 'invalidate' follow-on ones too */ + for (i=s->submatchp-1;i>=0;i--) { + if (s->submatches[i].offend >= offstart) { + if (offstart < s->submatches[i].offstart) + s->submatches[i].offstart = offstart; + s->submatches[i].offend = offend; + if (s->submatchp > i) + s->submatchp = i+1; + } } - if (*h == '\0') - return haystack; - haystack = g_utf8_next_char (haystack); + return 1; } - return NULL; + return 0; } -static const gchar * -find_partial (SearchInfo *si, const gchar *haystack, const gchar *needle) +static void +push_subpending(struct _searcher *s, int offstart, int offend) { - g_return_val_if_fail (si, NULL); - g_return_val_if_fail (haystack && 
needle, NULL); - g_return_val_if_fail (g_utf8_validate (haystack, -1, NULL), NULL); - g_return_val_if_fail (g_utf8_validate (needle, -1, NULL), NULL); - - while (*needle) { - gunichar c1 = g_utf8_get_char (haystack); - gunichar c2 = g_utf8_get_char (needle); + /* This is really an assertion, we just ignore the last pending match instead of crashing though */ + if (s->submatchp >= s->words) { + printf("ERROR: submatch pending stack overflow\n"); + s->submatchp = s->words-1; + } - if (!si->case_sensitive) { - c1 = g_unichar_tolower (c1); - c2 = g_unichar_tolower (c2); - } + s->submatches[s->submatchp].offstart = offstart; + s->submatches[s->submatchp].offend = offend; + s->submatchp++; +} - if (c1 != c2) - return NULL; +/* move any (partial) tokens from input to output if they are beyond the current output position */ +static void +output_pending(struct _searcher *s) +{ + struct _token *token; - needle = g_utf8_next_char (needle); - haystack = g_utf8_next_char (haystack); - } - return haystack; + while ( (token = (struct _token *)e_dlist_remhead(&s->input)) ) + output_token(s, token); } -static gboolean -tag_match (const gchar *token, const gchar *tag) +/* flushes any nodes we cannot possibly match anymore */ +static void +flush_extra(struct _searcher *s) { - token += 2; /* Skip past TAG_ESCAPE and < */ - if (*token == '/') - ++token; - while (*token && *tag) { - gunichar c1 = g_unichar_tolower (g_utf8_get_char (token)); - gunichar c2 = g_unichar_tolower (g_utf8_get_char (tag)); - if (c1 != c2) - return FALSE; - token = g_utf8_next_char (token); - tag = g_utf8_next_char (tag); + unsigned int start; + int i; + struct _token *starttoken, *token; + + /* find earliest char that can be in contention */ + start = s->offset - s->t->max_depth; + for (i=0;i<s->submatchp;i++) + if (s->submatches[i].offstart < start) + start = s->submatches[i].offstart; + + /* now, flush out any tokens which are before this point */ + starttoken = find_token(s, start); + if (starttoken == NULL) + return; + + while ((struct _token *)s->input.head != starttoken) { + token = (struct _token *)e_dlist_remhead(&s->input); + output_token(s, token); } - return (*tag == '\0' && *token == '>'); } -static MatchInfo -search_info_compare (SearchInfo *si, const gchar *token, gint *start_pos, gint *end_pos) +static char * +searcher_next_token(struct _searcher *s) { - gboolean token_is_tag; - const gchar *s; - gint i; - - g_return_val_if_fail (si != NULL, MATCH_FAILED); - g_return_val_if_fail (token != NULL, MATCH_FAILED); - g_return_val_if_fail (start_pos != NULL, MATCH_FAILED); - g_return_val_if_fail (end_pos != NULL, MATCH_FAILED); + struct _token *token; + char *tok, *stok; + struct _trie *t = s->t; + struct _state *q = s->state; + struct _match *m; + int offstart, offend; + guint32 c; + + while (e_dlist_empty(&s->output)) { + /* get next token */ + tok = s->next_token(s->next_data); + if (tok == NULL) { + output_subpending(s); + output_pending(s); + break; + } - token_is_tag = (*token == TAG_ESCAPE); + /* we dont always have to copy each token, e.g. if we dont match anything */ + token = append_token(&s->input, tok, -1); + token->offset = s->offset; + tok = token->tok; - /* Try to start a new match. */ - if (si->current == NULL) { + d(printf("new token %d '%s'\n", token->offset, token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok)); - /* A match can never start on a token. */ - if (token_is_tag) - return MATCH_FAILED; - - /* Check to see if the search string is entirely embedded within the token. 
*/ - s = find_whole (si, token, si->search); - if (s) { - const gchar *pos = s; - i = g_utf8_strlen (si->search, -1); - while (i > 0) { - pos = g_utf8_next_char (pos); - --i; + /* tag test, reset state on unknown tags */ + if (tok[0] == TAG_ESCAPE) { + if (!ignore_tag(tok)) { + /* force reset */ + output_subpending(s); + output_pending(s); + q = &t->root; } - *start_pos = s - token; - *end_pos = pos - token; - return MATCH_COMPLETE; + continue; } - /* Check to see if the beginning of the search string lies in this token. */ - s = find_head (si, token, si->search); - if (s) { - *start_pos = s - token; - si->current = si->search; - while (*s) { - s = g_utf8_next_char (s); - si->current = g_utf8_next_char (si->current); + /* process whole token */ + stok = tok; + while ((c = camel_utf8_getc((const unsigned char **)&tok))) { + if ((s->flags & SEARCH_CASE) == 0) + c = g_unichar_tolower(c); + while (q && (m = g(q, c)) == NULL) + q = q->fail; + if (q == 0) { + /* mismatch ... reset state */ + output_subpending(s); + q = &t->root; + } else { + /* keep track of previous offsets of utf8 chars, rotating buffer */ + s->last[s->lastp] = s->offset + (tok-stok)-1; + s->lastp = (s->lastp+1)&s->last_mask; + + q = m->match; + /* we have a match of q->final characters for a matching word */ + if (q->final) { + s->matchcount++; + + /* use the last buffer to find the real offset of this char */ + offstart = s->last[(s->lastp - q->final)&s->last_mask]; + offend = s->offset + (tok - stok); + + if (q->matches == NULL) { + if (s->submatchp == 0) { + /* nothing pending, always put something in so we can try merge */ + push_subpending(s, offstart, offend); + } else if (!merge_subpending(s, offstart, offend)) { + /* can't merge, output what we have, and start againt */ + output_subpending(s); + push_subpending(s, offstart, offend); + /*output_match(s, offstart, offend);*/ + } else if (e_dlist_length(&s->input) > 8) { + /* we're continuing to match and merge, but we have a lot of stuff + waiting, so flush it out now since this is a safe point to do it */ + output_subpending(s); + } + } else { + /* merge/add subpending */ + if (!merge_subpending(s, offstart, offend)) + push_subpending(s, offstart, offend); + } + } } - - return MATCH_START; } - - return MATCH_FAILED; + + s->offset += (tok-stok)-1; + + flush_extra(s); } - /* Try to continue a previously-started match. */ - - /* Deal with tags that we encounter mid-match. */ - if (token_is_tag) { + s->state = q; - /* "Ignored tags" will never mess up a match. */ - for (i=0; ignored_tags[i]; ++i) { - if (tag_match (token, ignored_tags[i])) - return MATCH_CONTINUES; - } - - /* "Space tags" only match whitespace in our ongoing match. */ - if (si->allow_space_tags_to_match_whitespace - && g_unichar_isspace (g_utf8_get_char (si->current))) { - for (i=0; space_tags[i]; ++i) { - if (tag_match (token, space_tags[i])) { - si->current = g_utf8_next_char (si->current); - return MATCH_CONTINUES; - } - } - } + if (s->current) + free_token(s->current); - /* All other tags derail our match. 
*/ - return MATCH_FAILED; - } + s->current = token = (struct _token *)e_dlist_remhead(&s->output); - s = find_partial (si, token, si->current); - if (s) { - if (start_pos) - *start_pos = 0; - if (end_pos) - *end_pos = s - token; - return MATCH_END; - } + return token?token->tok:NULL; +} - s = find_partial (si, si->current, token); - if (s) { - si->current = (gchar *) s; - return MATCH_CONTINUES; +static char * +searcher_peek_token(struct _searcher *s) +{ + char *tok; + + /* we just get it and then put it back, it's fast enuf */ + tok = searcher_next_token(s); + if (tok) { + /* need to clear this so we dont free it while its still active */ + e_dlist_addhead(&s->output, (EDListNode *)s->current); + s->current = NULL; } - - return MATCH_FAILED; + + return tok; +} + +static int +searcher_pending(struct _searcher *s) +{ + return !(e_dlist_empty(&s->input) && e_dlist_empty(&s->output)); } +/* ********************************************************************** */ + +struct _search_info { + GPtrArray *strv; + char *colour; + unsigned int size:8; + unsigned int flags:8; +}; + /** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/ +static struct _search_info * +search_info_new(void) +{ + struct _search_info *s; + + s = g_malloc0(sizeof(struct _search_info)); + s->strv = g_ptr_array_new(); + + return s; +} + static void -e_searching_tokenizer_cleanup (ESearchingTokenizer *st) +search_info_set_flags(struct _search_info *si, unsigned int flags, unsigned int mask) { - g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st)); + si->flags = (si->flags & ~mask) | (flags & mask); +} - if (st->priv->trash) { - g_list_foreach (st->priv->trash, (GFunc) g_free, NULL); - g_list_free (st->priv->trash); - st->priv->trash = NULL; - } +static void +search_info_set_colour(struct _search_info *si, const char *colour) +{ + g_free(si->colour); + si->colour = g_strdup(colour); +} - if (st->priv->pending) { - g_list_foreach (st->priv->pending, (GFunc) g_free, NULL); - g_list_free (st->priv->pending); - st->priv->pending = NULL; +static void +search_info_add_string(struct _search_info *si, const char *s) +{ + const char *start; + guint32 c; + + if (s && s[0]) { + /* strip leading whitespace */ + start = s; + while ((c = camel_utf8_getc((const unsigned char **)&s))) { + if (!g_unichar_isspace(c)) { + break; + } + start = s; + } + /* should probably also strip trailing, but i'm lazy today */ + if (start[0]) + g_ptr_array_add(si->strv, g_strdup(start)); } } static void +search_info_clear(struct _search_info *si) +{ + int i; + + for (i=0;i<si->strv->len;i++) + g_free(si->strv->pdata[i]); + + g_ptr_array_set_size(si->strv, 0); +} + +static void +search_info_free(struct _search_info *si) +{ + int i; + + for (i=0;i<si->strv->len;i++) + g_free(si->strv->pdata[i]); + + g_ptr_array_free(si->strv, TRUE); + g_free(si->colour); + g_free(si); +} + +static struct _search_info * +search_info_clone(struct _search_info *si) +{ + struct _search_info *out; + int i; + + out = search_info_new(); + for (i=0;i<si->strv->len;i++) + g_ptr_array_add(out->strv, g_strdup(si->strv->pdata[i])); + out->colour = g_strdup(si->colour); + out->flags = si->flags; + out->size = si->size; + + return out; +} + +static struct _searcher * +search_info_to_searcher(struct _search_info *si) +{ + char *tags, *tage; + char *col; + + if (si->strv->len == 0) + return NULL; + + if (si->colour == NULL) + col = "red"; + else + col = si->colour; + + tags = alloca(20+strlen(col)); + sprintf(tags, "%c<font color=\"%s\">", TAG_ESCAPE, col); + tage = 
alloca(20); + sprintf(tage, "%c</font>", TAG_ESCAPE); + + return searcher_new(si->flags, si->strv->len, (char **)si->strv->pdata, tags, tage); +} + +/* ********************************************************************** */ + +struct _ESearchingTokenizerPrivate { + struct _search_info *primary, *secondary; + struct _searcher *engine; +}; + +/** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/ + +/* shoudlnt' this be finalise? */ +static void e_searching_tokenizer_destroy (GtkObject *obj) { ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (obj); + struct _ESearchingTokenizerPrivate *p = st->priv; - e_searching_tokenizer_cleanup (st); + search_info_free (p->primary); + search_info_free (p->secondary); + if (p->engine) + searcher_free(p->engine); - search_info_free (st->priv->search); + /* again wtf? shared_state_unref (st->priv->shared); + */ - g_free (st->priv); - st->priv = NULL; + g_free(p); if (parent_class->destroy) parent_class->destroy (obj); @@ -507,14 +1011,24 @@ e_searching_tokenizer_class_init (ESearchingTokenizerClass *klass) static void e_searching_tokenizer_init (ESearchingTokenizer *st) { - st->priv = g_new0 (struct _ESearchingTokenizerPrivate, 1); - st->priv->shared = shared_state_new (); + struct _ESearchingTokenizerPrivate *p; + + p = st->priv = g_new0 (struct _ESearchingTokenizerPrivate, 1); + + p->primary = search_info_new(); + search_info_set_flags(p->primary, SEARCH_BOLD, SEARCH_CASE|SEARCH_BOLD); + search_info_set_colour(p->primary, "red"); + + p->secondary = search_info_new(); + search_info_set_flags(p->secondary, SEARCH_BOLD, SEARCH_CASE|SEARCH_BOLD); + search_info_set_colour(p->secondary, "purple"); } GtkType e_searching_tokenizer_get_type (void) { static GtkType e_searching_tokenizer_type = 0; + if (! e_searching_tokenizer_type) { static GtkTypeInfo e_searching_tokenizer_info = { "ESearchingTokenizer", @@ -539,353 +1053,35 @@ e_searching_tokenizer_new (void) /** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/ -static GList * -g_list_remove_head (GList *x) -{ - GList *repl = NULL; - if (x) { - repl = g_list_remove_link (x, x); - g_list_free_1 (x); - } - return repl; -} - -/* I can't believe that there isn't a better way to do this. */ -static GList * -g_list_insert_before (GList *list, GList *llink, gpointer data) -{ - gint pos = g_list_position (list, llink); - return g_list_insert (list, data, pos); -} - -static gchar * -pop_pending (ESearchingTokenizer *st) +/* blah blah the htmltokeniser doesn't like being asked + for a token if it doens't hvae any! */ +static char *get_token(HTMLTokenizer *t) { - gchar *token = NULL; - if (st->priv->pending) { - token = (gchar *) st->priv->pending->data; - st->priv->trash = g_list_prepend (st->priv->trash, token); - st->priv->pending = g_list_remove_head (st->priv->pending); - } - return token; -} - -static inline void -add_pending (ESearchingTokenizer *st, gchar *tok) -{ - st->priv->pending = g_list_append (st->priv->pending, tok); -} - -static void -add_pending_match_begin (ESearchingTokenizer *st, SearchInfo *si) -{ - gchar *size_str = NULL; - gchar *color_str= NULL; - - if (si->match_size_incr > 0) - size_str = g_strdup_printf (" size=+%d", si->match_size_incr); - if (si->match_color) - color_str = g_strdup_printf (" color=%s", si->match_color); - - if (size_str || color_str) - add_pending (st, g_strdup_printf ("%c<font%s%s>", - TAG_ESCAPE, - size_str ? size_str : "", - color_str ? 
color_str : "")); - - g_free (size_str); - g_free (color_str); - - if (si->match_bold) - add_pending (st, g_strdup_printf ("%c<b>", TAG_ESCAPE)); -} - -static void -add_pending_match_end (ESearchingTokenizer *st, SearchInfo *si) -{ - if (si->match_bold) - add_pending (st, g_strdup_printf ("%c</b>", TAG_ESCAPE)); - - if (si->match_size_incr > 0 || si->match_color) - add_pending (st, g_strdup_printf ("%c</font>", TAG_ESCAPE)); -} - -static void -add_to_trash (ESearchingTokenizer *st, gchar *txt) -{ - st->priv->trash = g_list_prepend (st->priv->trash, txt); -} - -static gchar * -get_next_token (ESearchingTokenizer *st) -{ - HTMLTokenizer *ht = HTML_TOKENIZER (st); HTMLTokenizerClass *klass = HTML_TOKENIZER_CLASS (parent_class); - - return klass->has_more (ht) ? klass->next_token (ht) : NULL; -} - -/* - * Move the matched part of the queue into pending, replacing the start and end placeholders by - * the appropriate tokens. - */ -static GList * -queue_matched (ESearchingTokenizer *st, SearchInfo *si, GList *q) -{ - GList *qh = q; - gboolean post_start = FALSE; - - while (q != NULL) { - GList *q_next = g_list_next (q); - if (!strcmp ((gchar *) q->data, START_MAGIC)) { - add_pending_match_begin (st, si); - post_start = TRUE; - } else if (!strcmp ((gchar *) q->data, END_MAGIC)) { - add_pending_match_end (st, si); - q_next = NULL; - } else { - gboolean is_tag = *((gchar *)q->data) == TAG_ESCAPE; - if (is_tag && post_start) - add_pending_match_end (st, si); - add_pending (st, g_strdup ((gchar *) q->data)); - if (is_tag && post_start) - add_pending_match_begin (st, si); - } - qh = g_list_remove_link (qh, q); - g_list_free_1 (q); - q = q_next; - } - - return qh; -} - -/* - * Strip the start and end placeholders out of the queue. - */ -static GList * -queue_match_failed (ESearchingTokenizer *st, GList *q) -{ - GList *qh = q; - - /* If we do find the START_MAGIC token in the queue, we want - to drop everything up to and including the token immediately - following START_MAGIC. */ - while (q != NULL && strcmp ((gchar *) q->data, START_MAGIC)) - q = g_list_next (q); - if (q) { - q = g_list_next (q); - /* If there is no token following START_MAGIC, something is - very wrong. */ - if (q == NULL) { - g_assert_not_reached (); - } - } - - /* Otherwise we just want to just drop the the first token. */ - if (q == NULL) - q = qh; - - /* Now move everything up to and including q to pending. */ - while (qh && qh != q) { - if (strcmp ((gchar *) qh->data, START_MAGIC)) - add_pending (st, g_strdup (qh->data)); - qh = g_list_remove_head (qh); - } - if (qh == q) { - if (strcmp ((gchar *) qh->data, START_MAGIC)) - add_pending (st, g_strdup (qh->data)); - qh = g_list_remove_head (qh); - } - - return qh; -} - -static void -matched (ESearchingTokenizer *st) -{ - ++st->priv->match_count; - gtk_signal_emit (GTK_OBJECT (st), e_searching_tokenizer_signals[EST_MATCH_SIGNAL]); -} - -static void -get_pending_tokens (ESearchingTokenizer *st) -{ - GList *queue = NULL; - gchar *token = NULL; - MatchInfo result; - gint start_pos, end_pos; - GList *start_after = NULL; - - /* Get an initial token into the queue. 
*/ - token = get_next_token (st); - if (token) { - queue = g_list_append (queue, token); - } - - while (queue) { - GList *q; - gboolean finished = FALSE; - search_info_reset (st->priv->search); - - if (start_after) { - q = g_list_next (start_after); - start_after = NULL; - } else { - q = queue; - } - - while (q) { - GList *q_next = g_list_next (q); - token = (gchar *) q->data; - - result = search_info_compare (st->priv->search, token, &start_pos, &end_pos); - - switch (result) { - - case MATCH_FAILED: - - queue = queue_match_failed (st, queue); - - finished = TRUE; - break; - - case MATCH_COMPLETE: - - if (start_pos != 0) - add_pending (st, g_strndup (token, start_pos)); - add_pending_match_begin (st, st->priv->search); - add_pending (st, g_strndup (token+start_pos, end_pos-start_pos)); - add_pending_match_end (st, st->priv->search); - if (*(token+end_pos)) { - queue->data = g_strdup (token+end_pos); - add_to_trash (st, (gchar *) queue->data); - } else { - queue = g_list_remove_head (queue); - } - - matched (st); - - finished = TRUE; - break; - - case MATCH_START: { - - gchar *s1 = g_strndup (token, start_pos); - gchar *s2 = g_strdup (START_MAGIC); - gchar *s3 = g_strdup (token+start_pos); - - queue = g_list_insert_before (queue, q, s1); - queue = g_list_insert_before (queue, q, s2); - queue = g_list_insert_before (queue, q, s3); - - add_to_trash (st, s1); - add_to_trash (st, s2); - add_to_trash (st, s3); - - queue = g_list_remove_link (queue, q); - finished = FALSE; - break; - } - - case MATCH_CONTINUES: - /* Do nothing... */ - finished = FALSE; - break; - - case MATCH_END: { - gchar *s1 = g_strndup (token, end_pos); - gchar *s2 = g_strdup (END_MAGIC); - gchar *s3 = g_strdup (token+end_pos); - - queue = g_list_insert_before (queue, q, s1); - queue = g_list_insert_before (queue, q, s2); - queue = g_list_insert_before (queue, q, s3); - - add_to_trash (st, s1); - add_to_trash (st, s2); - add_to_trash (st, s3); - - queue = g_list_remove_link (queue, q); - queue = queue_matched (st, st->priv->search, queue); - - matched (st); - - finished = TRUE; - break; - } - - default: - g_assert_not_reached (); - } - - /* If we reach the end of the queue but we aren't finished, try to pull in another - token and stick it onto the end. */ - if (q_next == NULL && !finished) { - gchar *next_token = get_next_token (st); - if (next_token) { - queue = g_list_append (queue, next_token); - q_next = g_list_last (queue); - } - } - q = finished ? NULL : q_next; - - } /* while (q) */ - - if (!finished && queue) { /* ...we add the token at the head of the queue to pending and try again. */ - add_pending (st, g_strdup ((gchar *) queue->data)); - queue = g_list_remove_head (queue); - } - - } /* while (queue) */ + return klass->has_more(t) ? 
klass->next_token(t) : NULL; } -/** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/ - static void e_searching_tokenizer_begin (HTMLTokenizer *t, gchar *content_type) { ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (t); - SearchInfo *si; + struct _ESearchingTokenizerPrivate *p = st->priv; - /* Reset our search */ - search_info_free (st->priv->search); - st->priv->search = NULL; - - if (st->priv->shared && (st->priv->shared->str_primary || st->priv->shared->str_secondary)) { - st->priv->search = search_info_new (); + /* reset search */ + if (p->engine) { + searcher_free(p->engine); + p->engine = NULL; } - si = st->priv->search; - - if (st->priv->shared && si) { - if (st->priv->shared->str_primary) { - search_info_set_string (si, st->priv->shared->str_primary); - search_info_set_case_sensitivity (si, st->priv->shared->case_sensitive_primary); - - search_info_set_match_color (si, "red"); - search_info_set_match_bold (si, TRUE); - - } else if (st->priv->shared->str_secondary) { - - search_info_set_string (si, st->priv->shared->str_secondary); - search_info_set_case_sensitivity (si, st->priv->shared->case_sensitive_secondary); - - search_info_set_match_color (si, "purple"); - search_info_set_match_bold (si, TRUE); - } - - } else { - - search_info_free (st->priv->search); - st->priv->search = NULL; + if ((p->engine = search_info_to_searcher(p->primary)) + || (p->engine = search_info_to_searcher(p->secondary))) { + /*HTMLTokenizerClass *klass = HTML_TOKENIZER_CLASS (parent_class);*/ + /*searcher_set_tokenfunc(p->engine, klass->next_token, st);*/ + searcher_set_tokenfunc(p->engine, get_token, st); } - - e_searching_tokenizer_cleanup (st); - search_info_reset (st->priv->search); - - st->priv->match_count = 0; + /* else - no engine, no search active */ HTML_TOKENIZER_CLASS (parent_class)->begin (t, content_type); } @@ -894,7 +1090,17 @@ static void e_searching_tokenizer_end (HTMLTokenizer *t) { ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (t); - e_searching_tokenizer_cleanup (st); + struct _ESearchingTokenizerPrivate *p = st->priv; + + /* so end gets called before any get/next tokens. + I dont get it. */ +#if 0 + /* not sure if we should reset search every time ... *shrug* */ + if (p->engine) { + searcher_free(p->engine); + p->engine = NULL; + } +#endif HTML_TOKENIZER_CLASS (parent_class)->end (t); } @@ -905,26 +1111,32 @@ e_searching_tokenizer_peek_token (HTMLTokenizer *tok) ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (tok); /* If no search is active, just use the default method. */ - if (st->priv->search == NULL) + if (st->priv->engine == NULL) return HTML_TOKENIZER_CLASS (parent_class)->peek_token (tok); - if (st->priv->pending == NULL) - get_pending_tokens (st); - return st->priv->pending ? (gchar *) st->priv->pending->data : NULL; + return searcher_peek_token(st->priv->engine); } static gchar * e_searching_tokenizer_next_token (HTMLTokenizer *tok) { ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (tok); + int oldmatched; + char *token; /* If no search is active, just use the default method. 
*/ - if (st->priv->search == NULL) + if (st->priv->engine == NULL) return HTML_TOKENIZER_CLASS (parent_class)->next_token (tok); - if (st->priv->pending == NULL) - get_pending_tokens (st); - return pop_pending (st); + oldmatched = st->priv->engine->matchcount; + + token = searcher_next_token(st->priv->engine); + + /* not sure if this has to be accurate or just say we had some matches */ + if (oldmatched != st->priv->engine->matchcount) + gtk_signal_emit (GTK_OBJECT (st), e_searching_tokenizer_signals[EST_MATCH_SIGNAL]); + + return token; } static gboolean @@ -932,10 +1144,16 @@ e_searching_tokenizer_has_more (HTMLTokenizer *tok) { ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (tok); - /* If no search is active, pending will always be NULL and thus - we'll always fall back to using the default method. */ + return (st->priv->engine != NULL && searcher_pending(st->priv->engine)) + || HTML_TOKENIZER_CLASS (parent_class)->has_more (tok); +} - return st->priv->pending || HTML_TOKENIZER_CLASS (parent_class)->has_more (tok); +/* proxy matched event, not sure what its for otherwise */ +static void +matched (ESearchingTokenizer *st) +{ + /*++st->priv->match_count;*/ + gtk_signal_emit (GTK_OBJECT (st), e_searching_tokenizer_signals[EST_MATCH_SIGNAL]); } static HTMLTokenizer * @@ -944,15 +1162,20 @@ e_searching_tokenizer_clone (HTMLTokenizer *tok) ESearchingTokenizer *orig_st = E_SEARCHING_TOKENIZER (tok); ESearchingTokenizer *new_st = E_SEARCHING_TOKENIZER (e_searching_tokenizer_new ()); - if (new_st->priv->search) { - search_info_free (new_st->priv->search); - } + search_info_free(new_st->priv->primary); + search_info_free(new_st->priv->secondary); + + new_st->priv->primary = search_info_clone(orig_st->priv->primary); + new_st->priv->secondary = search_info_clone(orig_st->priv->secondary); - new_st->priv->search = search_info_clone (orig_st->priv->search); + printf("cloinging shit\n"); + /* what the fucking what???? 
*/ +#if 0 shared_state_ref (orig_st->priv->shared); shared_state_unref (new_st->priv->shared); new_st->priv->shared = orig_st->priv->shared; +#endif gtk_signal_connect_object (GTK_OBJECT (new_st), "match", @@ -963,42 +1186,29 @@ e_searching_tokenizer_clone (HTMLTokenizer *tok) } /* ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** */ -static gboolean -only_whitespace (const gchar *p) +void +e_searching_tokenizer_set_primary_search_string (ESearchingTokenizer *st, const gchar *search_str) { - gunichar c; - g_return_val_if_fail (p, FALSE); + g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st)); - while (*p && g_unichar_validate (c = g_utf8_get_char (p))) { - if (!g_unichar_isspace (c)) - return FALSE; - p = g_utf8_next_char (p); - } - return TRUE; + search_info_clear(st->priv->primary); + search_info_add_string(st->priv->primary, search_str); } void -e_searching_tokenizer_set_primary_search_string (ESearchingTokenizer *st, const gchar *search_str) +e_searching_tokenizer_add_primary_search_string (ESearchingTokenizer *st, const gchar *search_str) { g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st)); - g_free (st->priv->shared->str_primary); - st->priv->shared->str_primary = NULL; - - if (search_str != NULL - && g_utf8_validate (search_str, -1, NULL) - && !only_whitespace (search_str)) { - - st->priv->shared->str_primary = g_strdup (search_str); - } + search_info_add_string(st->priv->primary, search_str); } void -e_searching_tokenizer_set_primary_case_sensitivity (ESearchingTokenizer *st, gboolean is_case_sensitive) +e_searching_tokenizer_set_primary_case_sensitivity (ESearchingTokenizer *st, gboolean iscase) { g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st)); - st->priv->shared->case_sensitive_primary = is_case_sensitive; + search_info_set_flags(st->priv->primary, iscase?SEARCH_CASE:0, SEARCH_CASE); } void @@ -1006,23 +1216,24 @@ e_searching_tokenizer_set_secondary_search_string (ESearchingTokenizer *st, cons { g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st)); - g_free (st->priv->shared->str_secondary); - st->priv->shared->str_secondary = NULL; + search_info_clear(st->priv->secondary); + search_info_add_string(st->priv->secondary, search_str); +} - if (search_str != NULL - && g_utf8_validate (search_str, -1, NULL) - && !only_whitespace (search_str)) { - - st->priv->shared->str_secondary = g_strdup (search_str); - } +void +e_searching_tokenizer_add_secondary_search_string (ESearchingTokenizer *st, const gchar *search_str) +{ + g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st)); + + search_info_add_string(st->priv->secondary, search_str); } void -e_searching_tokenizer_set_secondary_case_sensitivity (ESearchingTokenizer *st, gboolean is_case_sensitive) +e_searching_tokenizer_set_secondary_case_sensitivity (ESearchingTokenizer *st, gboolean iscase) { g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st)); - st->priv->shared->case_sensitive_secondary = is_case_sensitive; + search_info_set_flags(st->priv->secondary, iscase?SEARCH_CASE:0, SEARCH_CASE); } gint @@ -1030,8 +1241,8 @@ e_searching_tokenizer_match_count (ESearchingTokenizer *st) { g_return_val_if_fail (st && E_IS_SEARCHING_TOKENIZER (st), -1); - return st->priv->match_count; -} - - + if (st->priv->engine) + return st->priv->engine->matchcount; + return 0; +} |
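
One implementation detail from the diff above that is easy to miss: the automaton only reports that a match of `final` characters just ended, so `searcher_new` allocates a power-of-two ring buffer (`last`, `last_mask`, `lastp`) of recent character byte offsets, and `searcher_next_token` recovers the match's starting offset with a masked lookup `final` characters back. A stripped-down sketch of just that bookkeeping — names and the sizing loop are simplified, this is not the patch's own code:

```c
#include <glib.h>

/* Record the byte offset of each decoded character in a power-of-two
 * ring buffer; "nchars back" is then a single masked index away. */
struct offsets {
	guint32 *last;
	guint32 mask;	/* size - 1, where size is a power of two */
	guint32 pos;
};

static struct offsets *
offsets_new (int longest_pattern)
{
	struct offsets *o = g_malloc (sizeof (*o));
	guint32 size = 2;

	/* round up to a power of two at least longest_pattern + 1 */
	while (size < (guint32) longest_pattern + 1)
		size <<= 1;

	o->last = g_malloc (sizeof (o->last[0]) * size);
	o->mask = size - 1;
	o->pos = 0;
	return o;
}

/* Call once per decoded character with its byte offset in the stream. */
static void
offsets_push (struct offsets *o, guint32 byte_offset)
{
	o->last[o->pos] = byte_offset;
	o->pos = (o->pos + 1) & o->mask;
}

/* Byte offset where a match of 'nchars' characters started
 * (valid while nchars is no larger than the buffer). */
static guint32
offsets_start (struct offsets *o, int nchars)
{
	return o->last[(o->pos - nchars) & o->mask];
}
```

Keeping the size a power of two means the wrap-around is a single AND with the mask rather than a modulo on every character.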