When doing a contains match, split the words and perform an AND on them.

2002-04-18 Not Zed <NotZed@Ximian.com> * camel-folder-search.c (check_header): When doing a contains match, split the words and perform an AND on them. (match_words_messages): If we have an index, but were forced to do a full search, first look up a subset of messages using the index and a simplified word set. Only do a manual search of this subset. 2002-04-17 Not Zed <NotZed@Ximian.com> * camel-folder-search.c (match_message_index): Changed to take a utf8 string not a regex pattern. (match_words_index): Matches against a camel_search_words list. (match_words_1message): Matches a single message against a camel_search_words list. (match_words_message): Same, but gets the message from the folder for you. (match_words_messages): Matches a list of messages against a words list. (search_body_contains): Rewritten to handle multiple word searches. For #23371. * providers/imap/camel-imap-search.c (sync_match): Split words when searching, to support multiple search words. Also, try searching specifying charset of utf8 if we can; if that fails, fall back to not specifying charset. TODO: It should translate the strings into the locale default charset? * providers/imap/camel-imap-store.c (connect_to_server): Added new cap - utf8_search, if set, we tell the server we're searching using utf8, otherwise we don't (incorrectly, since we always use utf8 to search). * camel-search-private.c (camel_ustrstrcase): Make this function public. (camel_search_words_split): Split a word into multiple words based on whitespace, and keep track of whether the word is simple (indexable directly), or not. (camel_search_words_free): Free 'em. svn path=/trunk/; revision=16501
author: Not Zed <NotZed@Ximian.com> 2002-04-18 10:18:55 +0800
committer: Michael Zucci <zucchi@src.gnome.org> 2002-04-18 10:18:55 +0800
commit: 6ccd0e6f59bec5f1900c49cd1868fca998570fc7 (patch)
tree: eac70d58c4d79bfbc73d7592ad5f303f7f8c044e /camel/camel-search-private.c
parent: e5e67a6644e4d0ac41c270a4bcd18e5c6e2b7667 (diff)
download: gsoc2013-evolution-6ccd0e6f59bec5f1900c49cd1868fca998570fc7.tar.gz
gsoc2013-evolution-6ccd0e6f59bec5f1900c49cd1868fca998570fc7.tar.zst
gsoc2013-evolution-6ccd0e6f59bec5f1900c49cd1868fca998570fc7.zip
1 files changed, 154 insertions, 4 deletions
diff --git a/camel/camel-search-private.c b/camel/camel-search-private.c
index 6ecb64a1ae..7e8553cd35 100644
--- a/camel/camel-search-private.c
+++ b/camel/camel-search-private.c
@@ -194,6 +194,7 @@ header_soundex (const char *header, const char *match)
 	return truth;
 }
 
+/* FIXME: This is stupidly slow and needs to be removed */
 static gunichar
 utf8_get (const char **inp)
 {
@@ -209,7 +210,7 @@ utf8_get (const char **inp)
 	return c;
 }
 
-static const char *
+const char *
 camel_ustrstrcase (const char *haystack, const char *needle)
 {
 	gunichar *nuni, *puni;
@@ -469,9 +470,6 @@ camel_search_message_body_contains (CamelDataWrapper *object, regex_t *pattern)
 	if (containee == NULL)
 		return FALSE;
 	
-	/* TODO: I find it odd that get_part and get_content_object do not
-	   add a reference, probably need fixing for multithreading */
-	
 	/* using the object types is more accurate than using the mime/types */
 	if (CAMEL_IS_MULTIPART (containee)) {
 		parts = camel_multipart_get_number (CAMEL_MULTIPART (containee));
@@ -496,3 +494,155 @@ camel_search_message_body_contains (CamelDataWrapper *object, regex_t *pattern)
 	return truth;
 }
 
+static __inline__ guint32 /* decode one UTF-8 sequence at *ptr, advance *ptr past it and return the value (0 for a NUL byte) */
+camel_utf8_getc(const unsigned char **ptr)
+{
+	register unsigned char *p = (unsigned char *)*ptr;
+	register unsigned char c, r;
+	register guint32 v=0, /* 0 is the value returned when the first byte is 0xfe/0xff: neither branch below assigns v in that case */
+		m;
+
+	r = *p++;
+loop:
+	if (r < 0x80) { /* single-byte value, including the terminating NUL */
+		*ptr = p;
+		v = r;
+	} else if (r < 0xfe) { /* treated as a lead byte; NOTE(review): stray continuation bytes 0x80-0xbf also take this branch */
+		v = r;
+		m = 0x7f80;	/* used to mask out the length bits */
+		do {
+			c = *p++;
+			if ((c & 0xc0) != 0x80) { /* not a continuation byte: drop the partial sequence and restart decoding from this byte */
+				r = c;
+				goto loop;
+			}
+			v = (v<<6) | (c & 0x3f); /* append the 6 payload bits of the continuation byte */
+			r<<=1; /* walk the lead byte's length prefix: each further 1 bit asks for one more continuation byte */
+			m<<=5; /* keep the mask lined up with the lead byte's prefix bits as v grows */
+		} while (r & 0x40);
+		
+		*ptr = p;
+
+		v &= ~m;
+	}
+	/* NOTE(review): for a first byte of 0xfe/0xff *ptr is left unchanged and 0 is returned; the do/while (c) loops in camel_search_words_split/_simple stop on 0 */
+	return v;
+}
+/* Split the string 'in' at whitespace into a newly allocated word list; alphanumeric characters are lower-cased and every word carries CAMEL_SEARCH_WORD_* type flags. */
+struct _camel_search_words *
+camel_search_words_split(const unsigned char *in)
+{
+	int type = CAMEL_SEARCH_WORD_SIMPLE, all = 0; /* type: flags of the word being built; all: OR of the flags of every emitted word */
+	GString *w;
+	struct _camel_search_word *word;
+	struct _camel_search_words *words;
+	GPtrArray *list = g_ptr_array_new();
+	guint32 c;
+	int utf8len;
+	char utf8[8]; /* scratch buffer for one re-encoded character plus its terminator */
+
+	words = g_malloc0(sizeof(*words));	
+	w = g_string_new("");
+
+	do {
+		c = camel_utf8_getc(&in);
+		if (c == 0 || g_unichar_isspace(c)) { /* word boundary: whitespace or end of input */
+			if (w->len) { /* nothing is emitted for an empty accumulator, e.g. between consecutive spaces */
+				word = g_malloc0(sizeof(*word));
+				word->word = g_strdup(w->str);
+				word->type = type;
+				g_ptr_array_add(list, word);
+				all |= type;
+				type = CAMEL_SEARCH_WORD_SIMPLE; /* reset the flags for the next word */
+				g_string_truncate(w, 0);
+			}
+		} else {
+			if (!g_unichar_isalnum(c))
+				type = CAMEL_SEARCH_WORD_COMPLEX; /* NOTE(review): plain assignment, so an 8BIT flag set by an earlier character of this word is dropped */
+			else
+				c = g_unichar_tolower(c);
+			if (c > 0x80) /* NOTE(review): U+0080 itself is not flagged; confirm whether >= was intended */
+				type |= CAMEL_SEARCH_WORD_8BIT;
+
+			utf8len = g_unichar_to_utf8(c, utf8);
+			utf8[utf8len] = 0; /* terminate explicitly so the buffer can be appended as a C string */
+			g_string_append(w, utf8);
+		}
+	} while (c); /* the 0 read at the end of input first flushes the last word above, then ends the loop */
+
+	g_string_free(w, TRUE);
+	words->len = list->len;
+	words->words = (struct _camel_search_word **)list->pdata;
+	words->type = all;
+	g_ptr_array_free(list, FALSE); /* FALSE: only the GPtrArray wrapper is freed; the pdata array is now referenced by words->words */
+
+	return words; /* presumably released by the caller with camel_search_words_free() - confirm against callers */
+}
+
+/* takes an existing 'words' list, and builds a new, separately allocated list consisting of only simple words:
+   non-complex words are duplicated unchanged, complex words are cut at every non-alphanumeric character (punctuation etc stripped) */
+struct _camel_search_words *
+camel_search_words_simple(struct _camel_search_words *wordin)
+{
+	int i;
+	const unsigned char *ptr, *start, *last;
+	int type = CAMEL_SEARCH_WORD_SIMPLE, all = 0; /* type: flags for the run being scanned; all: OR of the flags of the runs emitted from complex words */
+	GPtrArray *list = g_ptr_array_new();
+	struct _camel_search_word *word;
+	struct _camel_search_words *words;
+	guint32 c;
+
+	words = g_malloc0(sizeof(*words));	
+
+	for (i=0;i<wordin->len;i++) {
+		if ((wordin->words[i]->type & CAMEL_SEARCH_WORD_COMPLEX) == 0) { /* not complex: copy the word and its type as-is */
+			word = g_malloc0(sizeof(*word));
+			word->type = wordin->words[i]->type;
+			word->word = g_strdup(wordin->words[i]->word);
+			g_ptr_array_add(list, word); /* NOTE(review): 'all' is not updated on this path, so words->type only reflects words produced by the else branch */
+		} else {
+			ptr = wordin->words[i]->word;
+			start = last = ptr; /* [start,last) is the run of alphanumeric bytes collected so far */
+			do {
+				c = camel_utf8_getc(&ptr);
+				if (c == 0 || !g_unichar_isalnum(c)) { /* separator or end of word: emit the pending run, if any */
+					if (last > start) {
+						word = g_malloc0(sizeof(*word));
+						word->word = g_strndup(start, last-start);
+						word->type = type;
+						g_ptr_array_add(list, word);
+						all |= type;
+						type = CAMEL_SEARCH_WORD_SIMPLE;
+					}
+					start = ptr; /* the next run begins after the separator */
+				}
+				if (c > 0x80) /* NOTE(review): also true for a non-alphanumeric separator above 0x80, which then tags the following run */
+					type = CAMEL_SEARCH_WORD_8BIT; /* NOTE(review): assigns where camel_search_words_split uses |=, so any other flag in 'type' is replaced */
+				last = ptr; /* end of the bytes consumed so far */
+			} while (c);
+		}
+	}
+
+	words->len = list->len;
+	words->words = (struct _camel_search_word **)list->pdata;
+	words->type = all;
+	g_ptr_array_free(list, FALSE); /* FALSE: only the GPtrArray wrapper is freed; the pdata array is now referenced by words->words */
+
+	return words;
+}
+/* Free a word list: each word's string, each word struct, the array of word pointers, and the list struct itself. */
+void
+camel_search_words_free(struct _camel_search_words *words)
+{
+	int i; /* NOTE(review): 'words' is dereferenced unconditionally below, so it must not be NULL */
+
+	for (i=0;i<words->len;i++) {
+		struct _camel_search_word *word = words->words[i];
+
+		g_free(word->word);
+		g_free(word);
+	}
+	g_free(words->words); /* the pointer array itself, after its elements were released above */
+	g_free(words);
+}
+
author	Not Zed <NotZed@Ximian.com>	2002-04-18 10:18:55 +0800
committer	Michael Zucci <zucchi@src.gnome.org>	2002-04-18 10:18:55 +0800
commit	6ccd0e6f59bec5f1900c49cd1868fca998570fc7 (patch)
tree	eac70d58c4d79bfbc73d7592ad5f303f7f8c044e /camel/camel-search-private.c
parent	e5e67a6644e4d0ac41c270a4bcd18e5c6e2b7667 (diff)
download	gsoc2013-evolution-6ccd0e6f59bec5f1900c49cd1868fca998570fc7.tar.gz gsoc2013-evolution-6ccd0e6f59bec5f1900c49cd1868fca998570fc7.tar.zst gsoc2013-evolution-6ccd0e6f59bec5f1900c49cd1868fca998570fc7.zip