aboutsummaryrefslogtreecommitdiffstats
path: root/mail/e-searching-tokenizer.c
diff options
context:
space:
mode:
authornobody <nobody@localhost>2003-05-03 19:02:31 +0800
committernobody <nobody@localhost>2003-05-03 19:02:31 +0800
commitd34577c91655ef19466b05f9270e2bc5c4ab83fa (patch)
tree1445967f5c0ef4c749bc1b4030295ea1a12e3c00 /mail/e-searching-tokenizer.c
parent19f2626e65d1700ff9c631a70ecb917f98dfcb38 (diff)
downloadgsoc2013-evolution-R3_1.tar.gz
gsoc2013-evolution-R3_1.tar.zst
gsoc2013-evolution-R3_1.zip
This commit was manufactured by cvs2svn to create tag 'R3_1'.R3_1
svn path=/tags/R3_1/; revision=21091
Diffstat (limited to 'mail/e-searching-tokenizer.c')
-rw-r--r--mail/e-searching-tokenizer.c1250
1 files changed, 0 insertions, 1250 deletions
diff --git a/mail/e-searching-tokenizer.c b/mail/e-searching-tokenizer.c
deleted file mode 100644
index 01e055ff84..0000000000
--- a/mail/e-searching-tokenizer.c
+++ /dev/null
@@ -1,1250 +0,0 @@
-/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-
-/*
- * e-searching-tokenizer.c
- *
- * Copyright (C) 2002 Ximian, Inc.
- *
- * Developed by Jon Trowbridge <trow@ximian.com>
- * Rewritten significantly to handle multiple strings and improve performance
- * by Michael Zucchi <notzed@ximian.com>
- */
-
-/*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA.
- */
-
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-
-#include "e-searching-tokenizer.h"
-
-#include "e-util/e-memory.h"
-#include "e-util/e-msgport.h"
-
-#define d(x)
-
-enum {
- MATCH_SIGNAL,
- LAST_SIGNAL
-};
-
-static guint signals[LAST_SIGNAL] = { 0, };
-
-static void e_searching_tokenizer_begin (HTMLTokenizer *, char *);
-static void e_searching_tokenizer_end (HTMLTokenizer *);
-static char *e_searching_tokenizer_peek_token (HTMLTokenizer *);
-static char *e_searching_tokenizer_next_token (HTMLTokenizer *);
-static gboolean e_searching_tokenizer_has_more (HTMLTokenizer *);
-
-static HTMLTokenizer *e_searching_tokenizer_clone (HTMLTokenizer *);
-
-/*
- static const gchar *space_tags[] = { "br", NULL };*/
-
-static HTMLTokenizerClass *parent_class = NULL;
-
-/** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/
-
-/* ???
-typedef struct _SharedState SharedState;
-struct _SharedState {
- gint refs;
- gchar *str_primary;
- gchar *str_secondary;
- gboolean case_sensitive_primary;
- gboolean case_sensitive_secondary;
-};
-*/
-
-/* ********************************************************************** */
-
-
-#if 0
-static SharedState *
-shared_state_new (void)
-{
- SharedState *shared = g_new0 (SharedState, 1);
- shared->refs = 1;
- return shared;
-}
-
-static void
-shared_state_ref (SharedState *shared)
-{
- g_return_if_fail (shared != NULL);
- g_return_if_fail (shared->refs > 0);
- ++shared->refs;
-}
-
-static void
-shared_state_unref (SharedState *shared)
-{
- if (shared) {
- g_return_if_fail (shared->refs > 0);
- --shared->refs;
- if (shared->refs == 0) {
- g_free (shared->str_primary);
- g_free (shared->str_secondary);
- g_free (shared);
- }
- }
-}
-#endif
-
-/** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/
-
-/* ********************************************************************** */
-
-/* Utility functions */
-
-/* This is faster and safer than glib2's utf8 abomination, but isn't exported from camel as yet */
-static inline guint32
-camel_utf8_getc(const unsigned char **ptr)
-{
- register unsigned char *p = (unsigned char *)*ptr;
- register unsigned char c, r;
- register guint32 v, m;
-
-again:
- r = *p++;
-loop:
- if (r < 0x80) {
- *ptr = p;
- v = r;
- } else if (r < 0xfe) { /* valid start char? */
- v = r;
- m = 0x7f80; /* used to mask out the length bits */
- do {
- c = *p++;
- if ((c & 0xc0) != 0x80) {
- r = c;
- goto loop;
- }
- v = (v<<6) | (c & 0x3f);
- r<<=1;
- m<<=5;
- } while (r & 0x40);
-
- *ptr = p;
-
- v &= ~m;
- } else {
- goto again;
- }
-
- return v;
-}
-
-
-/* note: our tags of interest are 7 bit ascii, only, no need to do any fancy utf8 stuff */
-/* tags should be upper case
- if this list gets longer than 10 entries, consider binary search */
-static char *ignored_tags[] = { "B", "I", "FONT", "TT", "EM", /* and more? */};
-
-static int
-ignore_tag(const char *tag)
-{
- char *t = alloca(strlen(tag)+1), c, *out;
- const char *in;
- int i;
-
- /* we could use a aho-corasick matcher here too ... but we wont */
-
- /* normalise tag into 't'.
- Note we use the property that the only tags we're interested in
- are 7 bit ascii to shortcut and simplify case insensitivity */
- in = tag+2; /* skip: TAG_ESCAPE '<' */
- if (*in == '/')
- in++;
- out = t;
- while ((c = *in++)) {
- if (c >= 'A' && c <= 'Z')
- *out++ = c;
- else if (c >= 'a' && c <= 'z')
- *out++ = c & 0xdf; /* convert ASCII to upper case */
- else
- /* maybe should check for > or ' ' etc? */
- break;
- }
- *out = 0;
-
- for (i=0;i<sizeof(ignored_tags)/sizeof(ignored_tags[0]);i++) {
- if (strcmp(t, ignored_tags[i]) == 0)
- return 1;
- }
-
- return 0;
-}
-
-/* ********************************************************************** */
-
-/* Aho-Corasick search tree implmeentation */
-
-/* next state if we match a character */
-struct _match {
- struct _match *next;
- guint32 ch;
- struct _state *match;
-};
-
-/* tree state node */
-struct _state {
- struct _match *matches;
- unsigned int final; /* max no of chars we just matched */
- struct _state *fail; /* where to try next if we fail */
- struct _state *next; /* next on this level? */
-};
-
-/* base tree structure */
-struct _trie {
- struct _state root;
- int max_depth;
-
- EMemChunk *state_chunks;
- EMemChunk *match_chunks;
-};
-
-static void
-dump_trie(struct _state *s, int d)
-{
- char *p = alloca(d*2+1);
- struct _match *m;
-
- memset(p, ' ', d*2);
- p[d*2]=0;
-
- printf("%s[state] %p: %d fail->%p\n", p, s, s->final, s->fail);
- m = s->matches;
- while (m) {
- printf(" %s'%c' -> %p\n", p, m->ch, m->match);
- if (m->match)
- dump_trie(m->match, d+1);
- m = m->next;
- }
-}
-
-/* This builds an Aho-Corasick search trie for a set of utf8 words */
-/* See
- http://www-sr.informatik.uni-tuebingen.de/~buehler/AC/AC.html
- for a neat demo */
-
-static inline struct _match *
-g(struct _state *q, guint32 c)
-{
- struct _match *m = q->matches;
-
- while (m && m->ch != c)
- m = m->next;
-
- return m;
-}
-
-static struct _trie *
-build_trie(int nocase, int len, char **words)
-{
- struct _state *q, *qt, *r;
- char *word;
- struct _match *m, *n = NULL;
- int i, depth;
- guint32 c;
- struct _trie *trie;
- int state_depth_max, state_depth_size;
- struct _state **state_depth;
-
- trie = g_malloc(sizeof(*trie));
- trie->root.matches = NULL;
- trie->root.final = 0;
- trie->root.fail = NULL;
- trie->root.next = NULL;
-
- trie->state_chunks = e_memchunk_new(8, sizeof(struct _state));
- trie->match_chunks = e_memchunk_new(8, sizeof(struct _match));
-
- /* This will correspond to the length of the longest pattern */
- state_depth_size = 0;
- state_depth_max = 64;
- state_depth = g_malloc(sizeof(*state_depth[0])*64);
- state_depth[0] = NULL;
-
- /* Step 1: Build trie */
-
- /* This just builds a tree that merges all common prefixes into the same branch */
-
- for (i=0;i<len;i++) {
- word = words[i];
- q = &trie->root;
- depth = 0;
- while ((c = camel_utf8_getc((const unsigned char **)&word))) {
- if (nocase)
- c = g_unichar_tolower(c);
- m = g(q, c);
- if (m == NULL) {
- m = e_memchunk_alloc(trie->match_chunks);
- m->ch = c;
- m->next = q->matches;
- q->matches = m;
- q = m->match = e_memchunk_alloc(trie->state_chunks);
- q->matches = NULL;
- q->fail = &trie->root;
- q->final = 0;
- if (state_depth_max < depth) {
- state_depth_max += 64;
- state_depth = g_realloc(state_depth, sizeof(*state_depth[0])*state_depth_max);
- }
- if (state_depth_size < depth) {
- state_depth[depth] = 0;
- state_depth_size = depth;
- }
- q->next = state_depth[depth];
- state_depth[depth] = q;
- } else {
- q = m->match;
- }
- depth++;
- }
- q->final = depth;
- }
-
- d(printf("Dumping trie:\n"));
- d(dump_trie(&trie->root, 0));
-
- /* Step 2: Build failure graph */
-
- /* This searches for the longest substring which is a prefix of another string and
- builds a graph of failure links so you can find multiple substrings concurrently,
- using aho-corasick's algorithm */
-
- for (i=0;i<state_depth_size;i++) {
- q = state_depth[i];
- while (q) {
- m = q->matches;
- while (m) {
- c = m->ch;
- qt = m->match;
- r = q->fail;
- while (r && (n = g(r, c)) == NULL)
- r = r->fail;
- if (r != NULL) {
- qt->fail = n->match;
- if (qt->fail->final > qt->final)
- qt->final = qt->fail->final;
- } else {
- if ((n = g(&trie->root, c)))
- qt->fail = n->match;
- else
- qt->fail = &trie->root;
- }
- m = m->next;
- }
- q = q->next;
- }
- }
-
- d(printf("After failure analysis\n"));
- d(dump_trie(&trie->root, 0));
-
- g_free(state_depth);
-
- trie->max_depth = state_depth_size;
-
- return trie;
-}
-
-static void
-free_trie(struct _trie *t)
-{
- e_memchunk_destroy(t->match_chunks);
- e_memchunk_destroy(t->state_chunks);
-
- g_free(t);
-}
-
-/* ********************************************************************** */
-
-/* html token searcher */
-
-struct _token {
- struct _token *next;
- struct _token *prev;
- unsigned int offset;
- /* we need to copy the token for memory management, so why not copy it whole */
- char tok[1];
-};
-
-/* stack of submatches currently being scanned, used for merging */
-struct _submatch {
- unsigned int offstart, offend; /* in bytes */
-};
-
-/* flags for new func */
-#define SEARCH_CASE (1)
-#define SEARCH_BOLD (2)
-
-struct _searcher {
- struct _trie *t;
-
- char *(*next_token)(); /* callbacks for more tokens */
- void *next_data;
-
- int words; /* how many words */
- char *tags, *tage; /* the tag we used to highlight */
-
- int flags; /* case sensitive or not */
-
- struct _state *state; /* state is the current trie state */
-
- int matchcount;
-
- EDList input; /* pending 'input' tokens, processed but might match */
- EDList output; /* output tokens ready for source */
-
- struct _token *current; /* for token output memory management */
-
- guint32 offset; /* current offset through searchable stream? */
- guint32 offout; /* last output position */
-
- unsigned int lastp; /* current position in rotating last buffer */
- guint32 *last; /* buffer that goes back last 'n' positions */
- guint32 last_mask; /* bitmask for efficient rotation calculation */
-
- unsigned int submatchp; /* submatch stack */
- struct _submatch *submatches;
-};
-
-static void
-searcher_set_tokenfunc(struct _searcher *s, char *(*next)(), void *data)
-{
- s->next_token = next;
- s->next_data = data;
-}
-
-static struct _searcher *
-searcher_new(int flags, int argc, char **argv, const char *tags, const char *tage)
-{
- int i, m;
- struct _searcher *s;
-
- s = g_malloc(sizeof(*s));
-
- s->t = build_trie((flags&SEARCH_CASE) == 0, argc, argv);
- s->words = argc;
- s->tags = g_strdup(tags);
- s->tage = g_strdup(tage);
- s->flags = flags;
- s->state = &s->t->root;
- s->matchcount = 0;
-
- e_dlist_init(&s->input);
- e_dlist_init(&s->output);
- s->current = 0;
-
- s->offset = 0;
- s->offout = 0;
-
- /* rotating queue of previous character positions */
- m = s->t->max_depth+1;
- i = 2;
- while (i<m)
- i<<=2;
- s->last = g_malloc(sizeof(s->last[0])*i);
- s->last_mask = i-1;
- s->lastp = 0;
-
- /* a stack of possible submatches */
- s->submatchp = 0;
- s->submatches = g_malloc(sizeof(s->submatches[0])*argc+1);
-
- return s;
-}
-
-static void
-searcher_free(struct _searcher *s)
-{
- struct _token *t;
-
- while ((t = (struct _token *)e_dlist_remhead(&s->input)))
- g_free(t);
- while ((t = (struct _token *)e_dlist_remhead(&s->output)))
- g_free(t);
- g_free(s->tags);
- g_free(s->tage);
- g_free(s->last);
- g_free(s->submatches);
- free_trie(s->t);
- g_free(s);
-}
-static struct _token *
-append_token(EDList *list, const char *tok, int len)
-{
- struct _token *token;
-
- if (len == -1)
- len = strlen(tok);
- token = g_malloc(sizeof(*token) + len+1);
- token->offset = 0; /* set by caller when required */
- memcpy(token->tok, tok, len);
- token->tok[len] = 0;
- e_dlist_addtail(list, (EDListNode *)token);
-
- return token;
-}
-
-#define free_token(x) (g_free(x))
-
-static void
-output_token(struct _searcher *s, struct _token *token)
-{
- int offend;
- int left, pre;
-
- if (token->tok[0] == TAG_ESCAPE) {
- if (token->offset >= s->offout) {
- d(printf("moving tag token '%s' from input to output\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok));
- e_dlist_addtail(&s->output, (EDListNode *)token);
- } else {
- d(printf("discarding tag token '%s' from input\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok));
- free_token(token);
- }
- } else {
- offend = token->offset + strlen(token->tok);
- left = offend-s->offout;
- if (left > 0) {
- pre = s->offout - token->offset;
- if (pre>0)
- memmove(token->tok, token->tok+pre, left+1);
- d(printf("adding partial remaining/failed '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok));
- s->offout = offend;
- e_dlist_addtail(&s->output, (EDListNode *)token);
- } else {
- d(printf("discarding whole token '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok));
- free_token(token);
- }
- }
-}
-
-static struct _token *
-find_token(struct _searcher *s, int start)
-{
- register struct _token *token;
-
- /* find token which is start token, from end of list back */
- token = (struct _token *)s->input.tailpred;
- while (token->prev) {
- if (token->offset <= start)
- return token;
- token = token->prev;
- }
-
- return NULL;
-}
-
-static void
-output_match(struct _searcher *s, unsigned int start, unsigned int end)
-{
- register struct _token *token;
- struct _token *starttoken, *endtoken;
- char b[8];
-
- d(printf("output match: %d-%d at %d\n", start, end, s->offout));
-
- starttoken = find_token(s, start);
- endtoken = find_token(s, end);
-
- if (starttoken == NULL || endtoken == NULL) {
- printf("Cannot find match history for match %d-%d\n", start, end);
- return;
- }
-
- d(printf("start in token '%s'\n", starttoken->tok[0]==TAG_ESCAPE?starttoken->tok+1:starttoken->tok));
- d(printf("end in token '%s'\n", endtoken->tok[0]==TAG_ESCAPE?endtoken->tok+1:endtoken->tok));
-
- /* output pending stuff that didn't match afterall */
- while ((struct _token *)s->input.head != starttoken) {
- token = (struct _token *)e_dlist_remhead(&s->input);
- d(printf("appending failed match '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok));
- output_token(s, token);
- }
-
- /* output any pre-match text */
- if (s->offout < start) {
- token = append_token(&s->output, starttoken->tok + (s->offout-starttoken->offset), start-s->offout);
- d(printf("adding pre-match text '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok));
- s->offout = start;
- }
-
- /* output highlight/bold */
- if (s->flags & SEARCH_BOLD) {
- sprintf(b, "%c<b>", (char)TAG_ESCAPE);
- append_token(&s->output, b, -1);
- }
- if (s->tags)
- append_token(&s->output, s->tags, -1);
-
- /* output match node(s) */
- if (starttoken != endtoken) {
- while ((struct _token *)s->input.head != endtoken) {
- token = (struct _token *)e_dlist_remhead(&s->input);
- d(printf("appending (partial) match node (head) '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok));
- output_token(s, token);
- }
- }
-
- /* any remaining partial content */
- if (s->offout < end) {
- token = append_token(&s->output, endtoken->tok+(s->offout-endtoken->offset), end-s->offout);
- d(printf("appending (partial) match node (tail) '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok));
- s->offout = end;
- }
-
- /* end highlight */
- if (s->tage)
- append_token(&s->output, s->tage, -1);
-
- /* and close bold if we need to */
- if (s->flags & SEARCH_BOLD) {
- sprintf(b, "%c</b>", (char)TAG_ESCAPE);
- append_token(&s->output, b, -1);
- }
-}
-
-/* output any sub-pending blocks */
-static void
-output_subpending(struct _searcher *s)
-{
- int i;
-
- for (i=s->submatchp-1;i>=0;i--)
- output_match(s, s->submatches[i].offstart, s->submatches[i].offend);
- s->submatchp = 0;
-}
-
-/* returns true if a merge took place */
-static int
-merge_subpending(struct _searcher *s, int offstart, int offend)
-{
- int i;
-
- /* merges overlapping or abutting match strings */
- if (s->submatchp &&
- s->submatches[s->submatchp-1].offend >= offstart) {
-
- /* go from end, any that match 'invalidate' follow-on ones too */
- for (i=s->submatchp-1;i>=0;i--) {
- if (s->submatches[i].offend >= offstart) {
- if (offstart < s->submatches[i].offstart)
- s->submatches[i].offstart = offstart;
- s->submatches[i].offend = offend;
- if (s->submatchp > i)
- s->submatchp = i+1;
- }
- }
- return 1;
- }
-
- return 0;
-}
-
-static void
-push_subpending(struct _searcher *s, int offstart, int offend)
-{
- /* This is really an assertion, we just ignore the last pending match instead of crashing though */
- if (s->submatchp >= s->words) {
- printf("ERROR: submatch pending stack overflow\n");
- s->submatchp = s->words-1;
- }
-
- s->submatches[s->submatchp].offstart = offstart;
- s->submatches[s->submatchp].offend = offend;
- s->submatchp++;
-}
-
-/* move any (partial) tokens from input to output if they are beyond the current output position */
-static void
-output_pending(struct _searcher *s)
-{
- struct _token *token;
-
- while ( (token = (struct _token *)e_dlist_remhead(&s->input)) )
- output_token(s, token);
-}
-
-/* flushes any nodes we cannot possibly match anymore */
-static void
-flush_extra(struct _searcher *s)
-{
- unsigned int start;
- int i;
- struct _token *starttoken, *token;
-
- /* find earliest char that can be in contention */
- start = s->offset - s->t->max_depth;
- for (i=0;i<s->submatchp;i++)
- if (s->submatches[i].offstart < start)
- start = s->submatches[i].offstart;
-
- /* now, flush out any tokens which are before this point */
- starttoken = find_token(s, start);
- if (starttoken == NULL)
- return;
-
- while ((struct _token *)s->input.head != starttoken) {
- token = (struct _token *)e_dlist_remhead(&s->input);
- output_token(s, token);
- }
-}
-
-static char *
-searcher_next_token(struct _searcher *s)
-{
- struct _token *token;
- char *tok, *stok;
- struct _trie *t = s->t;
- struct _state *q = s->state;
- struct _match *m = NULL;
- int offstart, offend;
- guint32 c;
-
- while (e_dlist_empty(&s->output)) {
- /* get next token */
- tok = s->next_token(s->next_data);
- if (tok == NULL) {
- output_subpending(s);
- output_pending(s);
- break;
- }
-
- /* we dont always have to copy each token, e.g. if we dont match anything */
- token = append_token(&s->input, tok, -1);
- token->offset = s->offset;
- tok = token->tok;
-
- d(printf("new token %d '%s'\n", token->offset, token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok));
-
- /* tag test, reset state on unknown tags */
- if (tok[0] == TAG_ESCAPE) {
- if (!ignore_tag(tok)) {
- /* force reset */
- output_subpending(s);
- output_pending(s);
- q = &t->root;
- }
-
- continue;
- }
-
- /* process whole token */
- stok = tok;
- while ((c = camel_utf8_getc((const unsigned char **)&tok))) {
- if ((s->flags & SEARCH_CASE) == 0)
- c = g_unichar_tolower(c);
- while (q && (m = g(q, c)) == NULL)
- q = q->fail;
- if (q == NULL) {
- /* mismatch ... reset state */
- output_subpending(s);
- q = &t->root;
- } else if (m != NULL) {
- /* keep track of previous offsets of utf8 chars, rotating buffer */
- s->last[s->lastp] = s->offset + (tok-stok)-1;
- s->lastp = (s->lastp+1)&s->last_mask;
-
- q = m->match;
- /* we have a match of q->final characters for a matching word */
- if (q->final) {
- s->matchcount++;
-
- /* use the last buffer to find the real offset of this char */
- offstart = s->last[(s->lastp - q->final)&s->last_mask];
- offend = s->offset + (tok - stok);
-
- if (q->matches == NULL) {
- if (s->submatchp == 0) {
- /* nothing pending, always put something in so we can try merge */
- push_subpending(s, offstart, offend);
- } else if (!merge_subpending(s, offstart, offend)) {
- /* can't merge, output what we have, and start againt */
- output_subpending(s);
- push_subpending(s, offstart, offend);
- /*output_match(s, offstart, offend);*/
- } else if (e_dlist_length(&s->input) > 8) {
- /* we're continuing to match and merge, but we have a lot of stuff
- waiting, so flush it out now since this is a safe point to do it */
- output_subpending(s);
- }
- } else {
- /* merge/add subpending */
- if (!merge_subpending(s, offstart, offend))
- push_subpending(s, offstart, offend);
- }
- }
- }
- }
-
- s->offset += (tok-stok)-1;
-
- flush_extra(s);
- }
-
- s->state = q;
-
- if (s->current)
- free_token(s->current);
-
- s->current = token = (struct _token *)e_dlist_remhead(&s->output);
-
- return token?token->tok:NULL;
-}
-
-static char *
-searcher_peek_token(struct _searcher *s)
-{
- char *tok;
-
- /* we just get it and then put it back, it's fast enuf */
- tok = searcher_next_token(s);
- if (tok) {
- /* need to clear this so we dont free it while its still active */
- e_dlist_addhead(&s->output, (EDListNode *)s->current);
- s->current = NULL;
- }
-
- return tok;
-}
-
-static int
-searcher_pending(struct _searcher *s)
-{
- return !(e_dlist_empty(&s->input) && e_dlist_empty(&s->output));
-}
-
-/* ********************************************************************** */
-
-struct _search_info {
- GPtrArray *strv;
- char *colour;
- unsigned int size:8;
- unsigned int flags:8;
-};
-
-/** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/
-
-static struct _search_info *
-search_info_new(void)
-{
- struct _search_info *s;
-
- s = g_malloc0(sizeof(struct _search_info));
- s->strv = g_ptr_array_new();
-
- return s;
-}
-
-static void
-search_info_set_flags(struct _search_info *si, unsigned int flags, unsigned int mask)
-{
- si->flags = (si->flags & ~mask) | (flags & mask);
-}
-
-static void
-search_info_set_colour(struct _search_info *si, const char *colour)
-{
- g_free(si->colour);
- si->colour = g_strdup(colour);
-}
-
-static void
-search_info_add_string(struct _search_info *si, const char *s)
-{
- const char *start;
- guint32 c;
-
- if (s && s[0]) {
- /* strip leading whitespace */
- start = s;
- while ((c = camel_utf8_getc((const unsigned char **)&s))) {
- if (!g_unichar_isspace(c)) {
- break;
- }
- start = s;
- }
- /* should probably also strip trailing, but i'm lazy today */
- if (start[0])
- g_ptr_array_add(si->strv, g_strdup(start));
- }
-}
-
-static void
-search_info_clear(struct _search_info *si)
-{
- int i;
-
- for (i=0;i<si->strv->len;i++)
- g_free(si->strv->pdata[i]);
-
- g_ptr_array_set_size(si->strv, 0);
-}
-
-static void
-search_info_free(struct _search_info *si)
-{
- int i;
-
- for (i=0;i<si->strv->len;i++)
- g_free(si->strv->pdata[i]);
-
- g_ptr_array_free(si->strv, TRUE);
- g_free(si->colour);
- g_free(si);
-}
-
-static struct _search_info *
-search_info_clone(struct _search_info *si)
-{
- struct _search_info *out;
- int i;
-
- out = search_info_new();
- for (i=0;i<si->strv->len;i++)
- g_ptr_array_add(out->strv, g_strdup(si->strv->pdata[i]));
- out->colour = g_strdup(si->colour);
- out->flags = si->flags;
- out->size = si->size;
-
- return out;
-}
-
-static struct _searcher *
-search_info_to_searcher(struct _search_info *si)
-{
- char *tags, *tage;
- char *col;
-
- if (si->strv->len == 0)
- return NULL;
-
- if (si->colour == NULL)
- col = "red";
- else
- col = si->colour;
-
- tags = alloca(20+strlen(col));
- sprintf(tags, "%c<font color=\"%s\">", TAG_ESCAPE, col);
- tage = alloca(20);
- sprintf(tage, "%c</font>", TAG_ESCAPE);
-
- return searcher_new(si->flags, si->strv->len, (char **)si->strv->pdata, tags, tage);
-}
-
-/* ********************************************************************** */
-
-struct _ESearchingTokenizerPrivate {
- struct _search_info *primary, *secondary;
- struct _searcher *engine;
-};
-
-/** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/
-
-static void
-e_searching_tokenizer_finalise (GObject *obj)
-{
- ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (obj);
- struct _ESearchingTokenizerPrivate *p = st->priv;
-
- search_info_free (p->primary);
- search_info_free (p->secondary);
- if (p->engine)
- searcher_free(p->engine);
-
- /* again wtf?
- shared_state_unref (st->priv->shared);
- */
-
- g_free (p);
-
- if (G_OBJECT_CLASS (parent_class)->finalize)
- G_OBJECT_CLASS (parent_class)->finalize(obj);
-}
-
-static void
-e_searching_tokenizer_class_init (ESearchingTokenizerClass *klass)
-{
- GObjectClass *obj_class = (GObjectClass *) klass;
- HTMLTokenizerClass *tok_class = HTML_TOKENIZER_CLASS (klass);
-
- parent_class = g_type_class_ref (HTML_TYPE_TOKENIZER);
-
- signals[MATCH_SIGNAL] =
- g_signal_new ("match",
- E_TYPE_SEARCHING_TOKENIZER,
- G_SIGNAL_RUN_LAST,
- G_STRUCT_OFFSET (ESearchingTokenizerClass, match),
- NULL,
- NULL,
- g_cclosure_marshal_VOID__VOID,
- G_TYPE_NONE, 0);
-
- obj_class->finalize = e_searching_tokenizer_finalise;
-
- tok_class->begin = e_searching_tokenizer_begin;
- tok_class->end = e_searching_tokenizer_end;
-
- tok_class->peek_token = e_searching_tokenizer_peek_token;
- tok_class->next_token = e_searching_tokenizer_next_token;
- tok_class->has_more = e_searching_tokenizer_has_more;
- tok_class->clone = e_searching_tokenizer_clone;
-}
-
-static void
-e_searching_tokenizer_init (ESearchingTokenizer *st)
-{
- struct _ESearchingTokenizerPrivate *p;
-
- p = st->priv = g_new0 (struct _ESearchingTokenizerPrivate, 1);
-
- p->primary = search_info_new();
- search_info_set_flags(p->primary, SEARCH_BOLD, SEARCH_CASE|SEARCH_BOLD);
- search_info_set_colour(p->primary, "red");
-
- p->secondary = search_info_new();
- search_info_set_flags(p->secondary, SEARCH_BOLD, SEARCH_CASE|SEARCH_BOLD);
- search_info_set_colour(p->secondary, "purple");
-}
-
-GType
-e_searching_tokenizer_get_type (void)
-{
- static GType type = 0;
-
- if (!type) {
- static const GTypeInfo info = {
- sizeof (ESearchingTokenizerClass),
- NULL, NULL,
- (GClassInitFunc) e_searching_tokenizer_class_init,
- NULL, NULL,
- sizeof (ESearchingTokenizer),
- 0,
- (GInstanceInitFunc) e_searching_tokenizer_init,
- };
-
- type = g_type_register_static (HTML_TYPE_TOKENIZER, "ESearchingTokenizer", &info, 0);
- }
-
- return type;
-}
-
-HTMLTokenizer *
-e_searching_tokenizer_new (void)
-{
- return (HTMLTokenizer *) g_object_new (E_TYPE_SEARCHING_TOKENIZER, NULL);
-}
-
-/** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/
-
-/* blah blah the htmltokeniser doesn't like being asked
- for a token if it doens't hvae any! */
-static char *get_token(HTMLTokenizer *t)
-{
- HTMLTokenizerClass *klass = HTML_TOKENIZER_CLASS (parent_class);
-
- return klass->has_more(t) ? klass->next_token(t) : NULL;
-}
-
-static void
-e_searching_tokenizer_begin (HTMLTokenizer *t, char *content_type)
-{
- ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (t);
- struct _ESearchingTokenizerPrivate *p = st->priv;
-
- /* reset search */
- if (p->engine) {
- searcher_free(p->engine);
- p->engine = NULL;
- }
-
- if ((p->engine = search_info_to_searcher(p->primary))
- || (p->engine = search_info_to_searcher(p->secondary))) {
- /*HTMLTokenizerClass *klass = HTML_TOKENIZER_CLASS (parent_class);*/
-
- /*searcher_set_tokenfunc(p->engine, klass->next_token, st);*/
- searcher_set_tokenfunc(p->engine, get_token, st);
- }
- /* else - no engine, no search active */
-
- HTML_TOKENIZER_CLASS (parent_class)->begin (t, content_type);
-}
-
-static void
-e_searching_tokenizer_end (HTMLTokenizer *t)
-{
- /* so end gets called before any get/next tokens.
- I dont get it. */
-#if 0
- ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (t);
- struct _ESearchingTokenizerPrivate *p = st->priv;
-
- /* not sure if we should reset search every time ... *shrug* */
- if (p->engine) {
- searcher_free(p->engine);
- p->engine = NULL;
- }
-#endif
-
- HTML_TOKENIZER_CLASS (parent_class)->end (t);
-}
-
-static char *
-e_searching_tokenizer_peek_token (HTMLTokenizer *tok)
-{
- ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (tok);
-
- /* If no search is active, just use the default method. */
- if (st->priv->engine == NULL)
- return HTML_TOKENIZER_CLASS (parent_class)->peek_token (tok);
-
- return searcher_peek_token(st->priv->engine);
-}
-
-static char *
-e_searching_tokenizer_next_token (HTMLTokenizer *tok)
-{
- ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (tok);
- int oldmatched;
- char *token;
-
- /* If no search is active, just use the default method. */
- if (st->priv->engine == NULL)
- return HTML_TOKENIZER_CLASS (parent_class)->next_token (tok);
-
- oldmatched = st->priv->engine->matchcount;
-
- token = searcher_next_token(st->priv->engine);
-
- /* not sure if this has to be accurate or just say we had some matches */
- if (oldmatched != st->priv->engine->matchcount)
- g_signal_emit (st, signals[MATCH_SIGNAL], 0);
-
- return token;
-}
-
-static gboolean
-e_searching_tokenizer_has_more (HTMLTokenizer *tok)
-{
- ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (tok);
-
- return (st->priv->engine != NULL && searcher_pending(st->priv->engine))
- || HTML_TOKENIZER_CLASS (parent_class)->has_more (tok);
-}
-
-/* proxy matched event, not sure what its for otherwise */
-static void
-matched (ESearchingTokenizer *st)
-{
- /*++st->priv->match_count;*/
- g_signal_emit (st, signals[MATCH_SIGNAL], 0);
-}
-
-static HTMLTokenizer *
-e_searching_tokenizer_clone (HTMLTokenizer *tok)
-{
- ESearchingTokenizer *orig_st = E_SEARCHING_TOKENIZER (tok);
- ESearchingTokenizer *new_st = E_SEARCHING_TOKENIZER (e_searching_tokenizer_new ());
-
- search_info_free(new_st->priv->primary);
- search_info_free(new_st->priv->secondary);
-
- new_st->priv->primary = search_info_clone(orig_st->priv->primary);
- new_st->priv->secondary = search_info_clone(orig_st->priv->secondary);
-
- /* what the fucking what???? */
-#if 0
- shared_state_ref (orig_st->priv->shared);
- shared_state_unref (new_st->priv->shared);
- new_st->priv->shared = orig_st->priv->shared;
-#endif
-
- g_signal_connect_swapped (new_st, "match", G_CALLBACK(matched), orig_st);
-
- return HTML_TOKENIZER (new_st);
-}
-/* ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** */
-
-void
-e_searching_tokenizer_set_primary_search_string (ESearchingTokenizer *st, const gchar *search_str)
-{
- g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st));
-
- search_info_clear(st->priv->primary);
- search_info_add_string(st->priv->primary, search_str);
-}
-
-void
-e_searching_tokenizer_add_primary_search_string (ESearchingTokenizer *st, const gchar *search_str)
-{
- g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st));
-
- search_info_add_string(st->priv->primary, search_str);
-}
-
-void
-e_searching_tokenizer_set_primary_case_sensitivity (ESearchingTokenizer *st, gboolean iscase)
-{
- g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st));
-
- search_info_set_flags(st->priv->primary, iscase?SEARCH_CASE:0, SEARCH_CASE);
-}
-
-void
-e_searching_tokenizer_set_secondary_search_string (ESearchingTokenizer *st, const gchar *search_str)
-{
- g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st));
-
- search_info_clear(st->priv->secondary);
- search_info_add_string(st->priv->secondary, search_str);
-}
-
-void
-e_searching_tokenizer_add_secondary_search_string (ESearchingTokenizer *st, const gchar *search_str)
-{
- g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st));
-
- search_info_add_string(st->priv->secondary, search_str);
-}
-
-void
-e_searching_tokenizer_set_secondary_case_sensitivity (ESearchingTokenizer *st, gboolean iscase)
-{
- g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st));
-
- search_info_set_flags(st->priv->secondary, iscase?SEARCH_CASE:0, SEARCH_CASE);
-}
-
-gint
-e_searching_tokenizer_match_count (ESearchingTokenizer *st)
-{
- g_return_val_if_fail (st && E_IS_SEARCHING_TOKENIZER (st), -1);
-
- if (st->priv->engine)
- return st->priv->engine->matchcount;
-
- return 0;
-}