1 files changed, 142 insertions, 26 deletions
diff --git a/libibex/wordindex.c b/libibex/wordindex.c
index d988ee1482..da25389b27 100644
--- a/libibex/wordindex.c
+++ b/libibex/wordindex.c
@@ -45,7 +45,7 @@ of words, and could then be discarded (:flush()).
 #define d(x)
 
 /*#define WORDCACHE_SIZE (256)*/
-#define WORDCACHE_SIZE (10240)
+#define WORDCACHE_SIZE (4096)
 
 extern struct _IBEXStoreClass ibex_diskarray_class;
 extern struct _IBEXIndexClass ibex_hash_class;
@@ -56,15 +56,20 @@ extern struct _IBEXIndexClass ibex_hash_class;
 */
 
 
+#define CACHE_FILE_COUNT (62)
+
 struct _wordcache {
 	struct _wordcache *next;
 	struct _wordcache *prev;
 	nameid_t wordid;	/* disk wordid */
 	blockid_t wordblock;	/* head of disk list */
 	blockid_t wordtail;	/* and the tail data */
-	int filecount;		/* how many we have in memory */
-	/*nameid_t files[32];*/	/* memory cache of files */
-	nameid_t files[62];	/* memory cache of files */
+	short filecount;	/* number of valid ids cached in memory: in file.file0 or file.files[] */
+	short filealloc;	/* capacity of file.files[] in entries; 0 means no array has been allocated */
+	union {
+		nameid_t *files;	/* heap array of cached file ids, only meaningful when filealloc != 0 */
+		nameid_t file0;	/* the single cached id, stored inline while filecount == 1 && filealloc == 0 */
+	} file;
 	char word[1];		/* actual word follows */
 };
 
@@ -106,7 +111,7 @@ ibex_create_word_index(struct _memcache *bc, blockid_t *wordroot, blockid_t *nam
 		printf("opening wordindex root = %d\n", *wordroot);
 		idx->wordindex = ibex_hash_class.open(bc, *wordroot);
 	} else {
-		idx->wordindex = ibex_hash_class.create(bc, 1024);
+		idx->wordindex = ibex_hash_class.create(bc, 2048);
 		*wordroot = idx->wordindex->root;
 		printf("creating wordindex root = %d\n", *wordroot);
 	}
@@ -114,7 +119,7 @@ ibex_create_word_index(struct _memcache *bc, blockid_t *wordroot, blockid_t *nam
 		printf("opening nameindex root = %d\n", *nameroot);
 		idx->nameindex = ibex_hash_class.open(bc, *nameroot);
 	} else {
-		idx->nameindex = ibex_hash_class.create(bc, 1024);
+		idx->nameindex = ibex_hash_class.create(bc, 2048);
 		*nameroot = idx->nameindex->root;
 		printf("creating nameindex root = %d\n", *nameroot);
 	}
@@ -126,7 +131,7 @@ static void
 cache_sanity(struct _wordcache *head)
 {
 	while (head->next) {
-		g_assert(head->filecount <= sizeof(head->files)/sizeof(head->files[0]));
+		g_assert(head->filecount <= (head->filealloc ? head->filealloc : 1));	/* with no array allocated, file0 may still hold one id */
 		g_assert(strlen(head->word) != 0);
 		head = head->next;
 	}
@@ -209,7 +214,10 @@ static GPtrArray *find(struct _IBEXWord *idx, const char *word)
 	names = idx->wordstore->klass->get(idx->wordstore, wordblock, wordtail);
 	/* .. including any memory-only data */
 	if (cache) {
-		g_array_append_vals(names, cache->files, cache->filecount);
+		if (cache->filealloc == 0 && cache->filecount == 1)	/* lone id is stored inline in the union */
+			g_array_append_val(names, cache->file.file0);
+		else	/* filecount ids (possibly 0) come from file.files */
+			g_array_append_vals(names, cache->file.files, cache->filecount);
 	}
 
 	/* walk it ... converting id's back to strings */
@@ -242,9 +250,14 @@ static gboolean find_name(struct _IBEXWord *idx, const char *name, const char *w
 		/* freshen cache entry if we touch it */
 		ibex_list_remove((struct _listnode *)cache);
 		ibex_list_addtail(&idx->wordnodes, (struct _listnode *)cache);
-		for (i=0;i<cache->filecount;i++) {
-			if (cache->files[i] == nameid)
+		if (cache->filecount == 1 && cache->filealloc == 0) {
+			if (cache->file.file0 == nameid)
 				return TRUE;
+		} else {
+			for (i=0;i<cache->filecount;i++) {
+				if (cache->file.files[i] == nameid)
+					return TRUE;
+			}
 		}
 		/* not there?  well we can use the wordid anyway */
 		wordid = cache->wordid;
@@ -265,8 +278,11 @@ sync_cache_entry(struct _IBEXWord *idx, struct _wordcache *cache)
 	GArray array; /* just use this as a header */
 	blockid_t oldblock, oldtail;
 	
-	d(printf("syncing cache entry '%s'\n", cache->word));
-	array.data = (char *)cache->files;
+	d(printf("syncing cache entry '%s' used %d\n", cache->word, cache->filecount));
+	if (cache->filecount == 1 && cache->filealloc == 0)	/* single id lives inline: point the array header at it */
+		array.data = (char *)&cache->file.file0;
+	else	/* ids (if any) are in the allocated array */
+		array.data = (char *)cache->file.files;
 	array.len = cache->filecount;
 	oldblock = cache->wordblock;
 	oldtail = cache->wordtail;
@@ -305,13 +321,35 @@ add_index_cache(struct _IBEXWord *idx, const char *word)
 	if (cache == 0) {
 		/* see if we have to flush off the last entry */
 		if (idx->wordcount >= WORDCACHE_SIZE) {
+			struct _wordcache *mincache;
+			int min, count=0;
 			/* remove last entry, and flush it */
 			cache = (struct _wordcache *)idx->wordnodes.tailpred;
+			mincache = cache;
+			min = mincache->filecount;
+
 			d(printf("flushing word from cache %s\n", cache->word));
-			ibex_list_remove((struct _listnode *)cache);
-			g_hash_table_remove(idx->wordcache, cache->word);
-			sync_cache_entry(idx, cache);
-			g_free(cache);
+			/* instead of just using the last entry, we try to find an entry with
+			   only 1 item (failing that, the smallest in the range we look at) */
+			/* this could probably benefit greatly from a more sophisticated aging algorithm */
+			while (cache->prev && count < 100) {	/* scan back from the tail; following ->next leaves the list after one node */
+				if (cache->filecount == 1) {
+					mincache = cache;
+					break;
+				}
+				if (cache->filecount > 0 && cache->filecount < min) {
+					mincache = cache;
+					min = cache->filecount;
+				}
+				cache = cache->prev;	/* NOTE(review): assumes the head sentinel has prev == NULL - confirm in ibex_list_new */
+				count++;
+			}
+			ibex_list_remove((struct _listnode *)mincache);
+			g_hash_table_remove(idx->wordcache, mincache->word);
+			sync_cache_entry(idx, mincache);
+			if (mincache->filealloc)
+				g_free(mincache->file.files);
+			g_free(mincache);
 			idx->wordcount--;
 		}
 		cache = g_malloc0(sizeof(*cache)+strlen(word));
@@ -347,14 +385,34 @@ static void add(struct _IBEXWord *idx, const char *name, const char *word)
 	add_index_key(idx->nameindex, name, &nameid, &nameblock, &nametail);
 
 	/* check for repeats of the last name - dont add anything */
-	if (cache->files[cache->filecount] == nameid)
-		return;
+	if (cache->filecount == 1 && cache->filealloc == 0) {
+		if (cache->file.file0 == nameid)	/* lone id is stored inline in the union */
+			return;
+	} else if (cache->filecount > 0) {	/* empty entry: file.files may still be NULL, nothing to compare */
+		if (cache->file.files[cache->filecount-1] == nameid)	/* last stored id, not the next free slot */
+			return;
+	}
+
+	/* first id for this word: keep it inline in the union, no allocation needed */
+	if (cache->filecount == 0 && cache->filealloc == 0) {
+		cache->file.file0 = nameid;
+	} else if (cache->filecount == 1 && cache->filealloc == 0) {
+		nameid_t saveid;
+		/* second id: switch from the inline slot to a heap array of file ids */
+		saveid = cache->file.file0;	/* file0 shares storage with files, save it before overwriting the union */
+		cache->file.files = g_malloc(sizeof(cache->file.files[0]) * CACHE_FILE_COUNT);
+		/* fixed capacity for now; the array could be grown on demand instead */
+		cache->filealloc = CACHE_FILE_COUNT;
+		cache->file.files[0] = saveid;
+		cache->file.files[1] = nameid;
+	} else {
+		cache->file.files[cache->filecount] = nameid;	/* array already allocated: store at the next free slot */
+	}
 
-	/* yay, now insert the nameindex into the word list, and the word index into the name list */
-	cache->files[cache->filecount] = nameid;
 	cache->filecount++;
+
 	/* if we are full, force a flush now */
-	if (cache->filecount >= sizeof(cache->files)/sizeof(cache->files[0])) {
+	if (cache->filealloc && cache->filecount >= cache->filealloc) {
 		sync_cache_entry(idx, cache);
 	}
 
@@ -391,14 +449,33 @@ static void add_list(struct _IBEXWord *idx, const char *name, GPtrArray *words)
 
 		/*d(cache_sanity((struct _wordcache *)idx->wordnodes.head));*/
 
-		/* check for duplicates; doesn't catch duplicates over an overflow
-		   boundary.  Watch me care. */
+		/* check for duplicates; doesn't catch duplicates over an overflow boundary.  Watch me care. */
 		if (cache->filecount == 0
-		    || cache->files[cache->filecount-1] != nameid) {
-			cache->files[cache->filecount] = nameid;
+		    /* the 1 item case */
+		    || (cache->filecount == 1 && cache->filealloc == 0 && cache->file.file0 != nameid)
+		    /* the normal case */
+		    || (cache->filealloc > 0 && cache->file.files[cache->filecount-1] != nameid)) {
+
+			/* first id for this word: keep it inline in the union, no allocation needed */
+			if (cache->filecount == 0 && cache->filealloc == 0) {
+				cache->file.file0 = nameid;
+			} else if (cache->filecount == 1 && cache->filealloc == 0) {
+				nameid_t saveid;
+				/* second id: switch from the inline slot to a heap array of file ids */
+				saveid = cache->file.file0;	/* file0 shares storage with files, save it before overwriting the union */
+				cache->file.files = g_malloc(sizeof(cache->file.files[0]) * CACHE_FILE_COUNT);
+				/* fixed capacity for now; the array could be grown on demand instead */
+				cache->filealloc = CACHE_FILE_COUNT;
+				cache->file.files[0] = saveid;
+				cache->file.files[1] = nameid;
+			} else {
+				cache->file.files[cache->filecount] = nameid;	/* array already allocated: store at the next free slot */
+			}
+
 			cache->filecount++;
+
 			/* if we are full, force a flush now */
-			if (cache->filecount >= sizeof(cache->files)/sizeof(cache->files[0])) {
+			if (cache->filealloc && cache->filecount >= cache->filealloc) {
 				sync_cache_entry(idx, cache);
 			}
 
@@ -452,30 +529,69 @@ static int
 word_flush(struct _IBEXWord *idx)
 {
 	struct _wordcache *cache = (struct _wordcache *)idx->wordnodes.head, *next;
+	extern int block_log;
+	int count= 0;
+	int used=0, wasted=0;
+
+	block_log = 0;
 
 	next = cache->next;
 	while (next) {
+		count++;	/* statistics only; reported by the printf after the loop */
+		used += sizeof(struct _wordcache) + (cache->filealloc * sizeof(nameid_t));	/* struct plus any allocated id array; the word's own length is not counted */
+		if (cache->filealloc)
+			wasted += (cache->filealloc-cache->filecount)*sizeof(nameid_t);	/* allocated but unused array slots */
+		else
+			wasted += (1-cache->filecount) * sizeof(nameid_t);	/* no array: the inline file0 slot counts as wasted only when empty */
+
+		/*printf("syncing word %s\n", cache->word);*/
 		sync_cache_entry(idx, cache);
 		g_hash_table_remove(idx->wordcache, cache->word);
+		if (cache->filealloc)
+			g_free(cache->file.files);
 		g_free(cache);
 		cache = next;
 		next = cache->next;
 	}
+
+	printf("sync cache entries = %d\n used memory = %d\n wasted memory = %d\n", count, used, wasted);
+
+	block_log = 0;
 	ibex_list_new(&idx->wordnodes);
+	idx->wordcount = 0;	/* every entry was freed above; add_index_cache compares this count against WORDCACHE_SIZE before evicting */
 	return 0;
 }
 
 static int word_close(struct _IBEXWord *idx)
 {
 	struct _wordcache *cache = (struct _wordcache *)idx->wordnodes.head, *next;
+	extern int block_log;
+	int count= 0;
+	int used=0, wasted=0;
+
+	block_log = 0;
 
 	next = cache->next;
 	while (next) {
+		count++;	/* statistics only; reported by the printf after the loop */
+		used += sizeof(struct _wordcache) + (cache->filealloc * sizeof(nameid_t));	/* struct plus any allocated id array; the word's own length is not counted */
+		if (cache->filealloc)
+			wasted += (cache->filealloc-cache->filecount)*sizeof(nameid_t);	/* allocated but unused array slots */
+		else
+			wasted += (1-cache->filecount) * sizeof(nameid_t);	/* no array: the inline file0 slot counts as wasted only when empty */
+
+		/*printf("closing word %s\n", cache->word);*/
 		sync_cache_entry(idx, cache);
+		if (cache->filealloc)
+			g_free(cache->file.files);
 		g_free(cache);
 		cache = next;
 		next = cache->next;
 	}
+	block_log = 0;
+
+	printf("cache entries = %d\n used memory = %d\n wasted memory = %d\n", count, used, wasted);
+
 	idx->namestore->klass->close(idx->namestore);
 	idx->nameindex->klass->close(idx->nameindex);
 	/*same as namestore: