diff options
author | Not Zed <NotZed@Ximian.com> | 2002-04-18 10:18:55 +0800 |
---|---|---|
committer | Michael Zucci <zucchi@src.gnome.org> | 2002-04-18 10:18:55 +0800 |
commit | 6ccd0e6f59bec5f1900c49cd1868fca998570fc7 (patch) | |
tree | eac70d58c4d79bfbc73d7592ad5f303f7f8c044e /camel/camel-search-private.c | |
parent | e5e67a6644e4d0ac41c270a4bcd18e5c6e2b7667 (diff) | |
download | gsoc2013-evolution-6ccd0e6f59bec5f1900c49cd1868fca998570fc7.tar.gz gsoc2013-evolution-6ccd0e6f59bec5f1900c49cd1868fca998570fc7.tar.zst gsoc2013-evolution-6ccd0e6f59bec5f1900c49cd1868fca998570fc7.zip |
When doing a contains match, split the words and perform an AND on them.
2002-04-18 Not Zed <NotZed@Ximian.com>
* camel-folder-search.c (check_header): When doing a contains
match, split the words and perform an AND on them.
(match_words_messages): If we have an index, but were forced to do
a full search, first look up a subset of messages using
the index and a simplified word set. Only do a manual search of
this subset.
2002-04-17 Not Zed <NotZed@Ximian.com>
* camel-folder-search.c (match_message_index): Changed to take a
utf8 string not a regex pattern.
(match_words_index): Matches against a camel_search_words list.
(match_words_1message): Matches a single message against a
camel_search_words list.
(match_words_message): Same, but gets the message from the folder
for you.
(match_words_messages): Matches a list of messages against a words
list.
(search_body_contains): Rewritten to handle multiple word
searches. For #23371.
* providers/imap/camel-imap-search.c (sync_match): Split words
when searching, to support multiple search words. Also, try
searching with the charset specified as utf8 if we can; if that
fails, fall back to not specifying a charset. TODO: Should it
translate the strings into the locale default charset?
* providers/imap/camel-imap-store.c (connect_to_server): Added new
cap - utf8_search. If set, we tell the server we're searching
using utf8; otherwise we don't (incorrectly, since we always use
utf8 to search).
* camel-search-private.c (camel_ustrstrcase): Make this function public.
(camel_search_words_split): Split a word into multiple words based
on whitespace, and keep track of whether the word is simple
(indexable directly), or not.
(camel_search_words_free): Free 'em.
svn path=/trunk/; revision=16501
Diffstat (limited to 'camel/camel-search-private.c')
-rw-r--r-- | camel/camel-search-private.c | 158 |
1 files changed, 154 insertions, 4 deletions
diff --git a/camel/camel-search-private.c b/camel/camel-search-private.c index 6ecb64a1ae..7e8553cd35 100644 --- a/camel/camel-search-private.c +++ b/camel/camel-search-private.c @@ -194,6 +194,7 @@ header_soundex (const char *header, const char *match) return truth; } +/* FIXME: This is stupidly slow and needs to be removed */ static gunichar utf8_get (const char **inp) { @@ -209,7 +210,7 @@ utf8_get (const char **inp) return c; } -static const char * +const char * camel_ustrstrcase (const char *haystack, const char *needle) { gunichar *nuni, *puni; @@ -469,9 +470,6 @@ camel_search_message_body_contains (CamelDataWrapper *object, regex_t *pattern) if (containee == NULL) return FALSE; - /* TODO: I find it odd that get_part and get_content_object do not - add a reference, probably need fixing for multithreading */ - /* using the object types is more accurate than using the mime/types */ if (CAMEL_IS_MULTIPART (containee)) { parts = camel_multipart_get_number (CAMEL_MULTIPART (containee)); @@ -496,3 +494,155 @@ camel_search_message_body_contains (CamelDataWrapper *object, regex_t *pattern) return truth; } +static __inline__ guint32 +camel_utf8_getc(const unsigned char **ptr) +{ + register unsigned char *p = (unsigned char *)*ptr; + register unsigned char c, r; + register guint32 v=0, /* this is only required because the stupid @@@%#%# compiler thinks it can be used uninitialised */ + m; + + r = *p++; +loop: + if (r < 0x80) { + *ptr = p; + v = r; + } else if (r < 0xfe) { /* valid start char? 
*/ + v = r; + m = 0x7f80; /* used to mask out the length bits */ + do { + c = *p++; + if ((c & 0xc0) != 0x80) { + r = c; + goto loop; + } + v = (v<<6) | (c & 0x3f); + r<<=1; + m<<=5; + } while (r & 0x40); + + *ptr = p; + + v &= ~m; + } + + return v; +} + +struct _camel_search_words * +camel_search_words_split(const unsigned char *in) +{ + int type = CAMEL_SEARCH_WORD_SIMPLE, all = 0; + GString *w; + struct _camel_search_word *word; + struct _camel_search_words *words; + GPtrArray *list = g_ptr_array_new(); + guint32 c; + int utf8len; + char utf8[8]; + + words = g_malloc0(sizeof(*words)); + w = g_string_new(""); + + do { + c = camel_utf8_getc(&in); + if (c == 0 || g_unichar_isspace(c)) { + if (w->len) { + word = g_malloc0(sizeof(*word)); + word->word = g_strdup(w->str); + word->type = type; + g_ptr_array_add(list, word); + all |= type; + type = CAMEL_SEARCH_WORD_SIMPLE; + g_string_truncate(w, 0); + } + } else { + if (!g_unichar_isalnum(c)) + type = CAMEL_SEARCH_WORD_COMPLEX; + else + c = g_unichar_tolower(c); + if (c > 0x80) + type |= CAMEL_SEARCH_WORD_8BIT; + + utf8len = g_unichar_to_utf8(c, utf8); + utf8[utf8len] = 0; + g_string_append(w, utf8); + } + } while (c); + + g_string_free(w, TRUE); + words->len = list->len; + words->words = (struct _camel_search_word **)list->pdata; + words->type = all; + g_ptr_array_free(list, FALSE); + + return words; +} + +/* takes an existing 'words' list, and converts it to another consisting of + only simple words, with any punctuation etc stripped */ +struct _camel_search_words * +camel_search_words_simple(struct _camel_search_words *wordin) +{ + int i; + const unsigned char *ptr, *start, *last; + int type = CAMEL_SEARCH_WORD_SIMPLE, all = 0; + GPtrArray *list = g_ptr_array_new(); + struct _camel_search_word *word; + struct _camel_search_words *words; + guint32 c; + + words = g_malloc0(sizeof(*words)); + + for (i=0;i<wordin->len;i++) { + if ((wordin->words[i]->type & CAMEL_SEARCH_WORD_COMPLEX) == 0) { + word = 
g_malloc0(sizeof(*word)); + word->type = wordin->words[i]->type; + word->word = g_strdup(wordin->words[i]->word); + g_ptr_array_add(list, word); + } else { + ptr = wordin->words[i]->word; + start = last = ptr; + do { + c = camel_utf8_getc(&ptr); + if (c == 0 || !g_unichar_isalnum(c)) { + if (last > start) { + word = g_malloc0(sizeof(*word)); + word->word = g_strndup(start, last-start); + word->type = type; + g_ptr_array_add(list, word); + all |= type; + type = CAMEL_SEARCH_WORD_SIMPLE; + } + start = ptr; + } + if (c > 0x80) + type = CAMEL_SEARCH_WORD_8BIT; + last = ptr; + } while (c); + } + } + + words->len = list->len; + words->words = (struct _camel_search_word **)list->pdata; + words->type = all; + g_ptr_array_free(list, FALSE); + + return words; +} + +void +camel_search_words_free(struct _camel_search_words *words) +{ + int i; + + for (i=0;i<words->len;i++) { + struct _camel_search_word *word = words->words[i]; + + g_free(word->word); + g_free(word); + } + g_free(words->words); + g_free(words); +} + |