aboutsummaryrefslogtreecommitdiffstats
path: root/camel/camel-search-private.c
diff options
context:
space:
mode:
author Not Zed <NotZed@Ximian.com> 2002-04-18 10:18:55 +0800
committer Michael Zucci <zucchi@src.gnome.org> 2002-04-18 10:18:55 +0800
commit 6ccd0e6f59bec5f1900c49cd1868fca998570fc7 (patch)
tree eac70d58c4d79bfbc73d7592ad5f303f7f8c044e /camel/camel-search-private.c
parent e5e67a6644e4d0ac41c270a4bcd18e5c6e2b7667 (diff)
download gsoc2013-evolution-6ccd0e6f59bec5f1900c49cd1868fca998570fc7.tar.gz
gsoc2013-evolution-6ccd0e6f59bec5f1900c49cd1868fca998570fc7.tar.zst
gsoc2013-evolution-6ccd0e6f59bec5f1900c49cd1868fca998570fc7.zip
When doing a contains match, split the words and perform an and on it.
2002-04-18 Not Zed <NotZed@Ximian.com>

* camel-folder-search.c (check_header): When doing a contains match, split the words and perform an and on it.
(match_words_messages): If we have an index, but were forced to do a full search, first look up a subset of messages using the index and a simplified word set. Only do a manual search of this subset.

2002-04-17 Not Zed <NotZed@Ximian.com>

* camel-folder-search.c (match_message_index): Changed to take a utf8 string not a regex pattern.
(match_words_index): Matches against a camel_search_words list.
(match_words_1message): Matches a single message against a camel_search_words list.
(match_words_message): Same, but gets the message from the folder for you.
(match_words_messages): Matches a list of messages against a words list.
(search_body_contains): Rewritten to handle multiple word searches. For #23371.

* providers/imap/camel-imap-search.c (sync_match): Split words when searching, to support multiple search words. Also, try searching specifying charset of utf8 if we can, if that fails, fall back to not specifying charset. TODO: It should translate the strings into the locale default charset?

* providers/imap/camel-imap-store.c (connect_to_server): Added new cap - utf8_search, if set, we tell the server we're searching using utf8, otherwise we don't (incorrectly, since we always use utf8 to search).

* camel-search-private.c (camel_ustrstrcase): Make this function public.
(camel_search_words_split): Split a word into multiple words based on whitespace, and keep track of whether the word is simple (indexable directly), or not.
(camel_search_words_free): Free 'em.

svn path=/trunk/; revision=16501
Diffstat (limited to 'camel/camel-search-private.c')
-rw-r--r-- camel/camel-search-private.c 158
1 files changed, 154 insertions, 4 deletions
diff --git a/camel/camel-search-private.c b/camel/camel-search-private.c
index 6ecb64a1ae..7e8553cd35 100644
--- a/camel/camel-search-private.c
+++ b/camel/camel-search-private.c
@@ -194,6 +194,7 @@ header_soundex (const char *header, const char *match)
return truth;
}
+/* FIXME: This is stupidly slow and needs to be removed */
static gunichar
utf8_get (const char **inp)
{
@@ -209,7 +210,7 @@ utf8_get (const char **inp)
return c;
}
-static const char *
+const char *
camel_ustrstrcase (const char *haystack, const char *needle)
{
gunichar *nuni, *puni;
@@ -469,9 +470,6 @@ camel_search_message_body_contains (CamelDataWrapper *object, regex_t *pattern)
if (containee == NULL)
return FALSE;
- /* TODO: I find it odd that get_part and get_content_object do not
- add a reference, probably need fixing for multithreading */
-
/* using the object types is more accurate than using the mime/types */
if (CAMEL_IS_MULTIPART (containee)) {
parts = camel_multipart_get_number (CAMEL_MULTIPART (containee));
@@ -496,3 +494,155 @@ camel_search_message_body_contains (CamelDataWrapper *object, regex_t *pattern)
return truth;
}
+/*
+ * Decode one UTF-8 encoded character starting at *ptr and advance *ptr
+ * past the bytes that were consumed.
+ *
+ * ptr: in/out cursor into a NUL-terminated byte string.
+ *
+ * Returns the decoded code point.  At the terminating NUL the function
+ * returns 0 and leaves *ptr one byte past the NUL, so callers must stop
+ * as soon as they see 0.
+ *
+ * Malformed input is tolerated rather than reported:
+ *  - if a multi-byte sequence is cut short, decoding restarts at the byte
+ *    that was not a continuation byte;
+ *  - the bytes 0xfe and 0xff are skipped.  Previously they produced a
+ *    return value of 0 without advancing *ptr, which callers looping
+ *    "while (c)" mistook for the end of the string, silently dropping the
+ *    rest of the input.
+ */
+static __inline__ guint32
+camel_utf8_getc(const unsigned char **ptr)
+{
+	const unsigned char *p = *ptr;
+	unsigned char c, r;
+	guint32 v = 0,	/* assigned on every exit path; the initialiser only keeps compilers from warning */
+		m;
+
+again:
+	r = *p++;
+loop:
+	if (r < 0x80) {
+		/* single byte: ASCII, including the terminating NUL */
+		*ptr = p;
+		v = r;
+	} else if (r < 0xfe) { /* valid start char? */
+		v = r;
+		m = 0x7f80;	/* used to mask out the length bits */
+		do {
+			c = *p++;
+			if ((c & 0xc0) != 0x80) {
+				/* not a continuation byte: drop the partial
+				   sequence and decode from this byte instead */
+				r = c;
+				goto loop;
+			}
+			v = (v<<6) | (c & 0x3f);
+			/* each remaining length bit in the lead byte means one
+			   more continuation byte follows */
+			r<<=1;
+			m<<=5;
+		} while (r & 0x40);
+
+		*ptr = p;
+
+		v &= ~m;
+	} else {
+		/* 0xfe/0xff cannot start a sequence: skip and resynchronise */
+		goto again;
+	}
+
+	return v;
+}
+
+/*
+ * Split the NUL-terminated UTF-8 string 'in' into words separated by
+ * Unicode whitespace.
+ *
+ * Alphanumeric characters are lower-cased as they are copied.  Each word
+ * starts out as CAMEL_SEARCH_WORD_SIMPLE, becomes CAMEL_SEARCH_WORD_COMPLEX
+ * once it contains a non-alphanumeric character, and additionally carries
+ * CAMEL_SEARCH_WORD_8BIT when it contains a character above 0x80.
+ * words->type is the OR of the types of all words produced.
+ *
+ * Returns a newly allocated list; release it with
+ * camel_search_words_free().
+ */
+struct _camel_search_words *
+camel_search_words_split(const unsigned char *in)
+{
+	int type = CAMEL_SEARCH_WORD_SIMPLE, all = 0;
+	GString *w;
+	struct _camel_search_word *word;
+	struct _camel_search_words *words;
+	GPtrArray *list = g_ptr_array_new();
+	guint32 c;
+	int utf8len;
+	char utf8[8];
+
+	words = g_malloc0(sizeof(*words));
+	w = g_string_new("");
+
+	do {
+		c = camel_utf8_getc(&in);
+		if (c == 0 || g_unichar_isspace(c)) {
+			/* end of a word (or of the input): flush what was collected;
+			   runs of whitespace leave w empty and produce nothing */
+			if (w->len) {
+				word = g_malloc0(sizeof(*word));
+				word->word = g_strdup(w->str);
+				word->type = type;
+				g_ptr_array_add(list, word);
+				all |= type;
+				type = CAMEL_SEARCH_WORD_SIMPLE;
+				g_string_truncate(w, 0);
+			}
+		} else {
+			if (!g_unichar_isalnum(c)) {
+				/* the word is no longer simple, but keep an 8BIT flag
+				   recorded for an earlier character so the result does
+				   not depend on the order of the characters */
+				type = (type & CAMEL_SEARCH_WORD_8BIT) | CAMEL_SEARCH_WORD_COMPLEX;
+			} else
+				c = g_unichar_tolower(c);
+			if (c > 0x80)
+				type |= CAMEL_SEARCH_WORD_8BIT;
+
+			/* g_unichar_to_utf8() does not terminate its output */
+			utf8len = g_unichar_to_utf8(c, utf8);
+			utf8[utf8len] = 0;
+			g_string_append(w, utf8);
+		}
+	} while (c);
+
+	g_string_free(w, TRUE);
+	words->len = list->len;
+	words->words = (struct _camel_search_word **)list->pdata;
+	words->type = all;
+	/* free only the array wrapper; the element vector now belongs to 'words' */
+	g_ptr_array_free(list, FALSE);
+
+	return words;
+}
+
+/*
+ * Takes an existing 'words' list, and converts it to another consisting of
+ * only simple words, with any punctuation etc stripped.
+ *
+ * Words of 'wordin' that are not flagged CAMEL_SEARCH_WORD_COMPLEX are
+ * copied unchanged.  Complex words are cut at every non-alphanumeric
+ * character and each non-empty alphanumeric run becomes a word of its own,
+ * flagged CAMEL_SEARCH_WORD_8BIT as well when it contains a character
+ * above 0x80.  words->type is the OR of the types of all words returned.
+ *
+ * 'wordin' is not modified.  Returns a newly allocated list; release it
+ * with camel_search_words_free().
+ */
+struct _camel_search_words *
+camel_search_words_simple(struct _camel_search_words *wordin)
+{
+	int i;
+	const unsigned char *ptr, *start, *last;
+	int type, all = 0;
+	GPtrArray *list = g_ptr_array_new();
+	struct _camel_search_word *word;
+	struct _camel_search_words *words;
+	guint32 c;
+
+	words = g_malloc0(sizeof(*words));
+
+	for (i=0;i<wordin->len;i++) {
+		if ((wordin->words[i]->type & CAMEL_SEARCH_WORD_COMPLEX) == 0) {
+			word = g_malloc0(sizeof(*word));
+			word->type = wordin->words[i]->type;
+			word->word = g_strdup(wordin->words[i]->word);
+			g_ptr_array_add(list, word);
+			/* copied words must show up in the summary type too */
+			all |= word->type;
+		} else {
+			ptr = (const unsigned char *)wordin->words[i]->word;
+			start = last = ptr;
+			/* every input word starts with a clean slate */
+			type = CAMEL_SEARCH_WORD_SIMPLE;
+			do {
+				c = camel_utf8_getc(&ptr);
+				if (c == 0 || !g_unichar_isalnum(c)) {
+					/* delimiter or end of word: emit the run
+					   [start, last) if it is not empty */
+					if (last > start) {
+						word = g_malloc0(sizeof(*word));
+						word->word = g_strndup((const char *)start, last-start);
+						word->type = type;
+						g_ptr_array_add(list, word);
+						all |= type;
+						type = CAMEL_SEARCH_WORD_SIMPLE;
+					}
+					/* the next run begins after the delimiter */
+					start = ptr;
+				} else if (c > 0x80) {
+					/* only characters that end up in the fragment
+					   may flag it; a non-ASCII delimiter must not
+					   mark the fragment that follows it */
+					type |= CAMEL_SEARCH_WORD_8BIT;
+				}
+				last = ptr;
+			} while (c);
+		}
+	}
+
+	words->len = list->len;
+	words->words = (struct _camel_search_word **)list->pdata;
+	words->type = all;
+	/* free only the array wrapper; the element vector now belongs to 'words' */
+	g_ptr_array_free(list, FALSE);
+
+	return words;
+}
+
+/*
+ * Release a word list: the string of every entry, every entry, the
+ * vector holding the entries, and finally the list structure itself.
+ * 'words' must not be NULL.
+ */
+void
+camel_search_words_free(struct _camel_search_words *words)
+{
+	int n = 0;
+
+	while (n < words->len) {
+		struct _camel_search_word *entry = words->words[n];
+
+		g_free(entry->word);
+		g_free(entry);
+		n++;
+	}
+
+	g_free(words->words);
+	g_free(words);
+}
+