diff options
author | Not Zed <NotZed@Ximian.com> | 2002-04-18 10:18:55 +0800 |
---|---|---|
committer | Michael Zucci <zucchi@src.gnome.org> | 2002-04-18 10:18:55 +0800 |
commit | 6ccd0e6f59bec5f1900c49cd1868fca998570fc7 (patch) | |
tree | eac70d58c4d79bfbc73d7592ad5f303f7f8c044e /camel/camel-folder-search.c | |
parent | e5e67a6644e4d0ac41c270a4bcd18e5c6e2b7667 (diff) | |
download | gsoc2013-evolution-6ccd0e6f59bec5f1900c49cd1868fca998570fc7.tar.gz gsoc2013-evolution-6ccd0e6f59bec5f1900c49cd1868fca998570fc7.tar.zst gsoc2013-evolution-6ccd0e6f59bec5f1900c49cd1868fca998570fc7.zip |
When doing a contains match, split the words and perform an and on it.
2002-04-18 Not Zed <NotZed@Ximian.com>
* camel-folder-search.c (check_header): When doing a contains
match, split the words and perform an and on it.
(match_words_messages): If we have an index, but were forced to do
a full search, first lookup a subset of messages using
the index and a simplified word set. Only do a manual search of
this subset.
2002-04-17 Not Zed <NotZed@Ximian.com>
* camel-folder-search.c (match_message_index): Changed to take a
utf8 string not a regex pattern.
(match_words_index): Matches against a camel_search_words list.
(match_words_1message): Matches a single message against a
camel_search_words list.
(match_words_message): Same, but gets the message from the folder
for you.
(match_words_messages): Matches a list of messages against a words
list.
(search_body_contains): Rewritten to handle multiple word
searches. For #23371.
* providers/imap/camel-imap-search.c (sync_match): Split words
when searching, to support multiple search words. Also, try
searching specifying charset of utf8 if we can, if that fails,
fall back to not specifying charset. TODO: It should translate
the strings into the locale default charset?
* providers/imap/camel-imap-store.c (connect_to_server): Added new
cap - utf8_search, if set, we tell the server we're searching
using utf8, otherwise we dont (incorrectly, since we always use
utf8 to search).
* camel-search-private.c (camel_ustrstrcase): Make this class public.
(camel_search_words_split): Split a word into multiple words based
on whitespace, and keep track of whether the word is simple
(indexable directly), or not.
(camel_search_words_free): Free 'em.
svn path=/trunk/; revision=16501
Diffstat (limited to 'camel/camel-folder-search.c')
-rw-r--r-- | camel/camel-folder-search.c | 296 |
1 files changed, 210 insertions, 86 deletions
diff --git a/camel/camel-folder-search.c b/camel/camel-folder-search.c index e0e5052a5f..d9702706f9 100644 --- a/camel/camel-folder-search.c +++ b/camel/camel-folder-search.c @@ -621,8 +621,9 @@ check_header(struct _ESExp *f, int argc, struct _ESExpResult **argv, CamelFolder char *headername; const char *header = NULL; char strbuf[32]; - int i; + int i, j; camel_search_t type = CAMEL_SEARCH_TYPE_ASIS; + struct _camel_search_words *words; /* only a subset of headers are supported .. */ headername = argv[0]->value.string; @@ -652,9 +653,21 @@ check_header(struct _ESExp *f, int argc, struct _ESExpResult **argv, CamelFolder if (header) { /* performs an OR of all words */ for (i=1;i<argc && !truth;i++) { - if (argv[i]->type == ESEXP_RES_STRING) - truth = camel_search_header_match(header, argv[i]->value.string, - how, type, NULL); + if (argv[i]->type == ESEXP_RES_STRING) { + if (argv[i]->value.string[0] == 0) { + truth = TRUE; + } else if (how == CAMEL_SEARCH_MATCH_CONTAINS) { + /* doesn't make sense to split words on anything but contains i.e. we can't have an ending match different words */ + words = camel_search_words_split(argv[i]->value.string); + truth = TRUE; + for (j=0;j<words->len && truth;j++) { + truth = camel_search_header_match(header, words->words[j]->word, how, type, NULL); + } + camel_search_words_free(words); + } else { + truth = camel_search_header_match(header, argv[i]->value.string, how, type, NULL); + } + } } } } @@ -723,26 +736,53 @@ g_lib_sux_htor(char *key, int value, struct _glib_sux_donkeys *fuckup) g_ptr_array_add(fuckup->uids, key); } +/* and, only store duplicates */ +static void +g_lib_sux_htand(char *key, int value, struct _glib_sux_donkeys *fuckup) +{ + if (value == fuckup->count) + g_ptr_array_add(fuckup->uids, key); +} + static int -match_message(CamelFolder *folder, const char *uid, regex_t *pattern, CamelException *ex) +match_message_index(CamelIndex *idx, const char *uid, const char *match, CamelException *ex) { - CamelMimeMessage *msg; + CamelIndexCursor *wc, *nc; + const char *word, *name; int truth = FALSE; - msg = camel_folder_get_message(folder, uid, ex); - if (!camel_exception_is_set(ex) && msg!=NULL) { - truth = camel_search_message_body_contains((CamelDataWrapper *)msg, pattern); - camel_object_unref((CamelObject *)msg); - } else { - camel_exception_clear(ex); + wc = camel_index_words(idx); + if (wc) { + while (!truth && (word = camel_index_cursor_next(wc))) { + if (camel_ustrstrcase(word,match) != NULL) { + /* perf: could have the wc cursor return the name cursor */ + nc = camel_index_find(idx, word); + if (nc) { + while (!truth && (name = camel_index_cursor_next(nc))) + truth = strcmp(name, uid) == 0; + camel_object_unref((CamelObject *)nc); + } + } + } + camel_object_unref((CamelObject *)wc); } + return truth; } -/* perform a regex match against words in an index */ -/* uids = hash table of messageinfo's by uid's */ +/* + "one two" "three" "four five" + + one and two +or + three +or + four and five +*/ + +/* returns messages which contain all words listed in words */ static GPtrArray * -match_messages_index(CamelIndex *idx, regex_t *pattern, GHashTable *uids, CamelException *ex) +match_words_index(CamelFolderSearch *search, struct _camel_search_words *words, CamelException *ex) { GPtrArray *result = g_ptr_array_new(); GHashTable *ht = g_hash_table_new(g_str_hash, g_str_equal); @@ -750,123 +790,207 @@ match_messages_index(CamelIndex *idx, regex_t *pattern, GHashTable *uids, CamelE CamelIndexCursor *wc, *nc; const char *word, *name; CamelMessageInfo *mi; + int i; + + /* we can have a maximum of 32 words, as we use it as the AND mask */ - wc = camel_index_words(idx); + wc = camel_index_words(search->body_index); if (wc) { while ((word = camel_index_cursor_next(wc))) { - if (regexec(pattern, word, 0, NULL, 0) == 0) { - /* perf: could have the wc cursor return the name cursor */ - nc = camel_index_find(idx, word); - if (nc) { - while ((name = camel_index_cursor_next(nc))) { - mi = g_hash_table_lookup(uids, name); - if (mi) - g_hash_table_insert(ht, (char *)camel_message_info_uid(mi), (void *)1); + for (i=0;i<words->len;i++) { + if (camel_ustrstrcase(word, words->words[i]->word) != NULL) { + /* perf: could have the wc cursor return the name cursor */ + nc = camel_index_find(search->body_index, word); + if (nc) { + while ((name = camel_index_cursor_next(nc))) { + mi = g_hash_table_lookup(search->summary_hash, name); + if (mi) { + int mask; + const char *uid = camel_message_info_uid(mi); + + mask = ((int)g_hash_table_lookup(ht, uid)) | (1<<i); + g_hash_table_insert(ht, (char *)uid, (void *)mask); + } + } + camel_object_unref((CamelObject *)nc); } - camel_object_unref((CamelObject *)nc); } } } camel_object_unref((CamelObject *)wc); lambdafoo.uids = result; - g_hash_table_foreach(ht, (GHFunc)g_lib_sux_htor, &lambdafoo); + lambdafoo.count = (1<<words->len) - 1; + g_hash_table_foreach(ht, (GHFunc)g_lib_sux_htand, &lambdafoo); g_hash_table_destroy(ht); } return result; } -/* perform a regex match against an individual uid in an index */ -/* this would benefit greatly in practice if there was a hashtalbe of uid's to amtch against */ -static int -match_message_index(CamelIndex *idx, const char *uid, regex_t *pattern, CamelException *ex) +static gboolean +match_words_1message (CamelDataWrapper *object, struct _camel_search_words *words, guint32 *mask) { - CamelIndexCursor *wc, *nc; - const char *word, *name; + CamelDataWrapper *containee; int truth = FALSE; - - wc = camel_index_words(idx); - if (wc) { - while (!truth && (word = camel_index_cursor_next(wc))) { - if (regexec(pattern, word, 0, NULL, 0) == 0) { - /* perf: could have the wc cursor return the name cursor */ - nc = camel_index_find(idx, word); - if (nc) { - while (!truth && (name = camel_index_cursor_next(nc))) - truth = strcmp(name, uid) == 0; - camel_object_unref((CamelObject *)nc); - } + int parts, i; + + containee = camel_medium_get_content_object (CAMEL_MEDIUM (object)); + + if (containee == NULL) + return FALSE; + + /* using the object types is more accurate than using the mime/types */ + if (CAMEL_IS_MULTIPART (containee)) { + parts = camel_multipart_get_number (CAMEL_MULTIPART (containee)); + for (i = 0; i < parts && truth == FALSE; i++) { + CamelDataWrapper *part = (CamelDataWrapper *)camel_multipart_get_part (CAMEL_MULTIPART (containee), i); + if (part) + truth = match_words_1message(part, words, mask); + } + } else if (CAMEL_IS_MIME_MESSAGE (containee)) { + /* for messages we only look at its contents */ + truth = match_words_1message((CamelDataWrapper *)containee, words, mask); + } else if (header_content_type_is(CAMEL_DATA_WRAPPER (containee)->mime_type, "text", "*")) { + /* for all other text parts, we look inside, otherwise we dont care */ + CamelStreamMem *mem = (CamelStreamMem *)camel_stream_mem_new (); + + /* FIXME: The match should be part of a stream op */ + camel_data_wrapper_write_to_stream (containee, CAMEL_STREAM (mem)); + camel_stream_write (CAMEL_STREAM (mem), "", 1); + for (i=0;i<words->len;i++) { + /* FIXME: This is horridly slow, and should use a real search algorithm */ + if (camel_ustrstrcase(mem->buffer->data, words->words[i]->word) != NULL) { + *mask |= (1<<i); + /* shortcut a match */ + if (*mask == (1<<(words->len))-1) + return TRUE; } } - camel_object_unref((CamelObject *)wc); + camel_object_unref (CAMEL_OBJECT (mem)); + } + + return truth; +} + +static gboolean +match_words_message(CamelFolder *folder, const char *uid, struct _camel_search_words *words, CamelException *ex) +{ + guint32 mask; + CamelMimeMessage *msg; + int truth; + + msg = camel_folder_get_message(folder, uid, ex); + if (msg) { + mask = 0; + truth = match_words_1message((CamelDataWrapper *)msg, words, &mask); + camel_object_unref((CamelObject *)msg); + } else { + camel_exception_clear(ex); + truth = FALSE; } return truth; } +static GPtrArray * +match_words_messages(CamelFolderSearch *search, struct _camel_search_words *words, CamelException *ex) +{ + int i; + GPtrArray *matches = g_ptr_array_new(); + + if (search->body_index) { + GPtrArray *indexed; + struct _camel_search_words *simple; + + simple = camel_search_words_simple(words); + indexed = match_words_index(search, simple, ex); + camel_search_words_free(simple); + + for (i=0;i<indexed->len;i++) { + const char *uid = g_ptr_array_index(indexed, i); + + if (match_words_message(search->folder, uid, words, ex)) + g_ptr_array_add(matches, (char *)uid); + } + + g_ptr_array_free(indexed, TRUE); + } else { + for (i=0;i<search->summary->len;i++) { + CamelMessageInfo *info = g_ptr_array_index(search->summary, i); + const char *uid = camel_message_info_uid(info); + + if (match_words_message(search->folder, uid, words, ex)) + g_ptr_array_add(matches, (char *)uid); + } + } + + return matches; +} + static ESExpResult * search_body_contains(struct _ESExp *f, int argc, struct _ESExpResult **argv, CamelFolderSearch *search) { - ESExpResult *r; - int i; - regex_t pattern; + int i, j; CamelException *ex = search->priv->ex; + struct _camel_search_words *words; + ESExpResult *r; + struct _glib_sux_donkeys lambdafoo; - if (search->current) { + if (search->current) { int truth = FALSE; - if (argc == 1 && argv[0]->value.string[0] == 0 && search->folder) { + if (argc == 1 && argv[0]->value.string[0] == 0) { truth = TRUE; - } else if (search->body_index) { - if (camel_search_build_match_regex(&pattern, CAMEL_SEARCH_MATCH_ICASE, argc, argv, ex) == 0) { - truth = match_message_index(search->body_index, camel_message_info_uid(search->current), &pattern, ex); - regfree(&pattern); - } - } else if (search->folder) { - /* we do a 'slow' direct search */ - if (camel_search_build_match_regex(&pattern, CAMEL_SEARCH_MATCH_ICASE, argc, argv, ex) == 0) { - truth = match_message(search->folder, camel_message_info_uid(search->current), &pattern, ex); - regfree(&pattern); - } } else { - g_warning("Cannot perform indexed body query with no index or folder set"); + for (i=0;i<argc && !truth;i++) { + if (argv[i]->type == ESEXP_RES_STRING) { + words = camel_search_words_split(argv[i]->value.string); + truth = TRUE; + if ((words->type & CAMEL_SEARCH_WORD_COMPLEX) == 0 && search->body_index) { + for (j=0;j<words->len && truth;j++) + truth = match_message_index(search->body_index, camel_message_info_uid(search->current), words->words[j]->word, ex); + } else { + /* TODO: cache current message incase of multiple body search terms */ + truth = match_words_message(search->folder, camel_message_info_uid(search->current), words, ex); + } + camel_search_words_free(words); + } + } } r = e_sexp_result_new(f, ESEXP_RES_BOOL); r->value.bool = truth; } else { r = e_sexp_result_new(f, ESEXP_RES_ARRAY_PTR); + r->value.ptrarray = g_ptr_array_new(); - if (argc == 1 && argv[0]->value.string[0] == 0 && search->folder) { - /* optimise the match "" case - match everything */ - r->value.ptrarray = g_ptr_array_new(); + if (argc == 1 && argv[0]->value.string[0] == 0) { for (i=0;i<search->summary->len;i++) { CamelMessageInfo *info = g_ptr_array_index(search->summary, i); + g_ptr_array_add(r->value.ptrarray, (char *)camel_message_info_uid(info)); } - } else if (search->body_index) { - if (camel_search_build_match_regex(&pattern, CAMEL_SEARCH_MATCH_ICASE, argc, argv, ex) == 0) { - r->value.ptrarray = match_messages_index(search->body_index, &pattern, search->summary_hash, ex); - regfree(&pattern); - } - } else if (search->folder) { - /* do a slow search */ - r->value.ptrarray = g_ptr_array_new(); - if (camel_search_build_match_regex(&pattern, CAMEL_SEARCH_MATCH_ICASE, argc, argv, ex) == 0) { - if (search->summary) { - for (i=0;i<search->summary->len;i++) { - CamelMessageInfo *info = g_ptr_array_index(search->summary, i); - - if (match_message(search->folder, camel_message_info_uid(info), &pattern, ex)) - g_ptr_array_add(r->value.ptrarray, (char *)camel_message_info_uid(info)); + } else { + GHashTable *ht = g_hash_table_new(g_str_hash, g_str_equal); + GPtrArray *matches; + + for (i=0;i<argc;i++) { + if (argv[i]->type == ESEXP_RES_STRING) { + words = camel_search_words_split(argv[i]->value.string); + if ((words->type & CAMEL_SEARCH_WORD_COMPLEX) == 0 && search->body_index) { + matches = match_words_index(search, words, ex); + } else { + matches = match_words_messages(search, words, ex); } - } /* else? we could always get the summary from the folder, but then - we need to free it later somehow */ - regfree(&pattern); + for (j=0;j<matches->len;j++) + g_hash_table_insert(ht, matches->pdata[j], matches->pdata[j]); + g_ptr_array_free(matches, TRUE); + camel_search_words_free(words); + } } - } else { - g_warning("Cannot perform indexed body query with no index or folder set"); - r->value.ptrarray = g_ptr_array_new(); + lambdafoo.uids = r->value.ptrarray; + g_hash_table_foreach(ht, (GHFunc)g_lib_sux_htor, &lambdafoo); + g_hash_table_destroy(ht); } } |