diff options
Diffstat (limited to 'e-util/e-trie.c')
-rw-r--r-- | e-util/e-trie.c | 345 |
1 files changed, 0 insertions, 345 deletions
diff --git a/e-util/e-trie.c b/e-util/e-trie.c deleted file mode 100644 index 2edf94df34..0000000000 --- a/e-util/e-trie.c +++ /dev/null @@ -1,345 +0,0 @@ -/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ -/* - * Authors: Jeffrey Stedfast <fejj@ximian.com> - * - * Copyright 2002 Ximian, Inc. (www.ximian.com) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Street #330, Boston, MA 02111-1307, USA. - * - */ - - -#ifdef HAVE_CONFIG_H -#include <config.h> -#endif - -#include <stdio.h> -#include <string.h> - -#include "e-trie.h" -#include "e-memory.h" - -#define d(x) - -struct _trie_state { - struct _trie_state *next; - struct _trie_state *fail; - struct _trie_match *match; - unsigned int final; - int id; -}; - -struct _trie_match { - struct _trie_match *next; - struct _trie_state *state; - gunichar c; -}; - -struct _ETrie { - struct _trie_state root; - GPtrArray *fail_states; - gboolean icase; - - EMemChunk *match_chunks; - EMemChunk *state_chunks; -}; - - -static inline gunichar -trie_utf8_getc (const unsigned char **in, size_t inlen) -{ - register const unsigned char *inptr = *in; - const unsigned char *inend = inptr + inlen; - register unsigned char c, r; - register gunichar u, m; - - if (inlen == 0) - return 0; - - r = *inptr++; - if (r < 0x80) { - *in = inptr; - u = r; - } else if (r < 0xfe) { /* valid start char? */ - u = r; - m = 0x7f80; /* used to mask out the length bits */ - do { - if (inptr >= inend) - return 0; - - c = *inptr++; - if ((c & 0xc0) != 0x80) - goto error; - - u = (u << 6) | (c & 0x3f); - r <<= 1; - m <<= 5; - } while (r & 0x40); - - *in = inptr; - - u &= ~m; - } else { - error: - *in = (*in)+1; - u = 0xfffe; - } - - return u; -} - - -ETrie * -e_trie_new (gboolean icase) -{ - ETrie *trie; - - trie = g_new (ETrie, 1); - trie->root.next = NULL; - trie->root.fail = NULL; - trie->root.match = NULL; - trie->root.final = 0; - - trie->fail_states = g_ptr_array_sized_new (8); - trie->icase = icase; - - trie->match_chunks = e_memchunk_new (8, sizeof (struct _trie_match)); - trie->state_chunks = e_memchunk_new (8, sizeof (struct _trie_state)); - - return trie; -} - -void -e_trie_free (ETrie *trie) -{ - g_ptr_array_free (trie->fail_states, TRUE); - e_memchunk_destroy (trie->match_chunks); - e_memchunk_destroy (trie->state_chunks); - g_free (trie); -} - - - -static struct _trie_match * -g (struct _trie_state *s, gunichar c) -{ - struct _trie_match *m = s->match; - - while (m && m->c != c) - m = m->next; - - return m; -} - -static struct _trie_state * -trie_insert (ETrie *trie, int depth, struct _trie_state *q, gunichar c) -{ - struct _trie_match *m; - - m = e_memchunk_alloc (trie->match_chunks); - m->next = q->match; - m->c = c; - - q->match = m; - q = m->state = e_memchunk_alloc (trie->state_chunks); - q->match = NULL; - q->fail = &trie->root; - q->final = 0; - q->id = -1; - - if (trie->fail_states->len < depth + 1) { - unsigned int size = trie->fail_states->len; - - size = MAX (size + 64, depth + 1); - g_ptr_array_set_size (trie->fail_states, size); - } - - q->next = trie->fail_states->pdata[depth]; - trie->fail_states->pdata[depth] = q; - - return q; -} - - -#if 1 -static void -dump_trie (struct _trie_state *s, int depth) -{ - char *p = g_alloca ((depth * 2) + 1); - struct _trie_match *m; - - memset (p, ' ', depth * 2); - p[depth * 2] = '\0'; - - fprintf (stderr, "%s[state] %p: final=%d; pattern-id=%d; fail=%p\n", - p, s, s->final, s->id, s->fail); - m = s->match; - while (m) { - fprintf (stderr, " %s'%c' -> %p\n", p, m->c, m->state); - if (m->state) - dump_trie (m->state, depth + 1); - - m = m->next; - } -} -#endif - - -/* - * final = empty set - * FOR p = 1 TO #pat - * q = root - * FOR j = 1 TO m[p] - * IF g(q, pat[p][j]) == null - * insert(q, pat[p][j]) - * ENDIF - * q = g(q, pat[p][j]) - * ENDFOR - * final = union(final, q) - * ENDFOR -*/ - -void -e_trie_add (ETrie *trie, const char *pattern, int pattern_id) -{ - const unsigned char *inptr = (const unsigned char *) pattern; - struct _trie_state *q, *q1, *r; - struct _trie_match *m, *n; - int i, depth = 0; - gunichar c; - - /* Step 1: add the pattern to the trie */ - - q = &trie->root; - - while ((c = trie_utf8_getc (&inptr, -1))) { - if (trie->icase) - c = g_unichar_tolower (c); - - m = g (q, c); - if (m == NULL) { - q = trie_insert (trie, depth, q, c); - } else { - q = m->state; - } - - depth++; - } - - q->final = depth; - q->id = pattern_id; - - /* Step 2: compute failure graph */ - - for (i = 0; i < trie->fail_states->len; i++) { - q = trie->fail_states->pdata[i]; - while (q) { - m = q->match; - while (m) { - c = m->c; - q1 = m->state; - r = q->fail; - while (r && (n = g (r, c)) == NULL) - r = r->fail; - - if (r != NULL) { - q1->fail = n->state; - if (q1->fail->final > q1->final) - q1->final = q1->fail->final; - } else { - if ((n = g (&trie->root, c))) - q1->fail = n->state; - else - q1->fail = &trie->root; - } - - m = m->next; - } - - q = q->next; - } - } - - d(fprintf (stderr, "\nafter adding pattern '%s' to trie %p:\n", pattern, trie)); - d(dump_trie (&trie->root, 0)); -} - -/* - * Aho-Corasick - * - * q = root - * FOR i = 1 TO n - * WHILE q != fail AND g(q, text[i]) == fail - * q = h(q) - * ENDWHILE - * IF q == fail - * q = root - * ELSE - * q = g(q, text[i]) - * ENDIF - * IF isElement(q, final) - * RETURN TRUE - * ENDIF - * ENDFOR - * RETURN FALSE - */ - -const char * -e_trie_search (ETrie *trie, const char *buffer, size_t buflen, int *matched_id) -{ - const unsigned char *inptr, *inend, *prev, *pat; - register size_t inlen = buflen; - struct _trie_state *q; - struct _trie_match *m; - gunichar c; - - inptr = (const unsigned char *) buffer; - inend = inptr + buflen; - - q = &trie->root; - pat = prev = inptr; - while ((c = trie_utf8_getc (&inptr, inlen))) { - inlen = (inend - inptr); - - if (c != 0xfffe) { - if (trie->icase) - c = g_unichar_tolower (c); - - while (q != NULL && (m = g (q, c)) == NULL) - q = q->fail; - - if (q == &trie->root) - pat = prev; - - if (q == NULL) { - q = &trie->root; - pat = inptr; - } else if (m != NULL) { - q = m->state; - - if (q->final) { - if (matched_id) - *matched_id = q->id; - - return (const char *) pat; - } - } - } - - prev = inptr; - } - - return NULL; -} |