From 1deca02b41aae9bf2632ea1420cc900fcf41531d Mon Sep 17 00:00:00 2001 From: Not Zed Date: Thu, 12 Oct 2000 13:40:55 +0000 Subject: Added some stat stuff. 2000-10-12 Not Zed * index.h: Added some stat stuff. * wordindex.c (struct _wordcache): Changed files[] to be a pointer to an allocated block/or an individual item. (find): Fix for changes to struct. (find_name): " (sync_cache_entry): " (add): " (add_list): " (add_index_cache): Free the cache file array if it was created. (word_flush): And here. (word_close): And here too. (ibex_create_word_index): Double the size of the hashtables. (word_flush): Make sure we reset the wordcount to 0 if we remove the list items. DOH. (add_index_cache): Use a slightly more sohpisticated aging algorithm to remove expired nodes. 2000-10-10 Not Zed * hash.c (hash_find): (hash_remove): (hash_insert): Truncate key if it is too big to fit in a single block to MAX_KEYLEN bytes. svn path=/trunk/; revision=5882 --- libibex/ChangeLog | 20 +++++++ libibex/block.c | 9 +++ libibex/hash.c | 10 ++++ libibex/index.h | 6 ++ libibex/wordindex.c | 168 ++++++++++++++++++++++++++++++++++++++++++++-------- 5 files changed, 187 insertions(+), 26 deletions(-) (limited to 'libibex') diff --git a/libibex/ChangeLog b/libibex/ChangeLog index 50aed83023..5021bc12b4 100644 --- a/libibex/ChangeLog +++ b/libibex/ChangeLog @@ -1,3 +1,23 @@ +2000-10-12 Not Zed + + * index.h: Added some stat stuff. + + * wordindex.c (struct _wordcache): Changed files[] to be a pointer + to an allocated block/or an individual item. + (find): Fix for changes to struct. + (find_name): " + (sync_cache_entry): " + (add): " + (add_list): " + (add_index_cache): Free the cache file array if it was created. + (word_flush): And here. + (word_close): And here too. + (ibex_create_word_index): Double the size of the hashtables. + (word_flush): Make sure we reset the wordcount to 0 if we remove + the list items. DOH. + (add_index_cache): Use a slightly more sohpisticated aging + algorithm to remove expired nodes. + 2000-10-10 Not Zed * hash.c (hash_find): diff --git a/libibex/block.c b/libibex/block.c index d971077dba..6f6cd60657 100644 --- a/libibex/block.c +++ b/libibex/block.c @@ -38,6 +38,7 @@ #define d(x) /*#define DEBUG*/ +int block_log; #ifdef IBEX_STATS static void @@ -202,6 +203,9 @@ ibex_block_dirty(struct _block *block) static void sync_block(struct _memcache *block_cache, struct _memblock *memblock) { + if (block_log) + printf("writing block %d\n", memblock->block); + lseek(block_cache->fd, memblock->block, SEEK_SET); if (write(block_cache->fd, &memblock->data, sizeof(memblock->data)) != -1) { memblock->flags &= ~BLOCK_DIRTY; @@ -324,6 +328,9 @@ ibex_block_read(struct _memcache *block_cache, blockid_t blockid) add_miss(block_cache, blockid); add_read(block_cache, blockid); #endif + if (block_log) + printf("miss block %d\n", blockid); + d(printf("loading blockid from disk %d\n", blockid)); memblock = g_malloc(sizeof(*memblock)); memblock->block = blockid; @@ -345,6 +352,8 @@ ibex_block_read(struct _memcache *block_cache, blockid_t blockid) struct _memblock *rootblock = g_hash_table_lookup(block_cache->index, (void *)0); struct _root *root = (struct _root *)&rootblock->data; + printf("Unsyncing root block\n"); + g_assert(rootblock != NULL); root->flags &= ~IBEX_ROOT_SYNCF; sync_block(block_cache, rootblock); diff --git a/libibex/hash.c b/libibex/hash.c index a36bef33b0..25a6e92ef9 100644 --- a/libibex/hash.c +++ b/libibex/hash.c @@ -190,6 +190,9 @@ static int hash_sync(struct _IBEXIndex *index) static int hash_close(struct _IBEXIndex *index) { +#ifdef INDEX_STAT + printf("Performed %d lookups, average %f depth\n", index->lookups, (double)index->lookup_total/index->lookups); +#endif g_free(index); return 0; } @@ -259,12 +262,19 @@ hash_find(struct _IBEXIndex *index, const char *key, int keylen) /* and its bucket */ hashbucket = table->buckets[hashentry]; +#ifdef INDEX_STAT + index->lookups++; +#endif /* go down the bucket chain, reading each entry till we are done ... */ while (hashbucket != 0) { struct _hashblock *bucket; char *start, *end; int ind; +#ifdef INDEX_STAT + index->lookup_total++; +#endif + d(printf(" checking bucket %d\n", hashbucket)); /* get the real bucket id from the hashbucket id */ diff --git a/libibex/index.h b/libibex/index.h index 35e3df23c2..c3c83c1bcf 100644 --- a/libibex/index.h +++ b/libibex/index.h @@ -25,10 +25,16 @@ /* an indexing 'class' maps a key to 1 piece of info */ +#define INDEX_STAT + struct _IBEXIndex { struct _IBEXIndexClass *klass; struct _memcache *blocks; blockid_t root; /* root block of ondisk index data */ +#ifdef INDEX_STAT + int lookups; /* how many lookups */ + int lookup_total; /* how many blocks loaded for all lookups (hash chain depth) */ +#endif }; struct _IBEXIndexClass { diff --git a/libibex/wordindex.c b/libibex/wordindex.c index d988ee1482..da25389b27 100644 --- a/libibex/wordindex.c +++ b/libibex/wordindex.c @@ -45,7 +45,7 @@ of words, and could then be discarded (:flush()). #define d(x) /*#define WORDCACHE_SIZE (256)*/ -#define WORDCACHE_SIZE (10240) +#define WORDCACHE_SIZE (4096) extern struct _IBEXStoreClass ibex_diskarray_class; extern struct _IBEXIndexClass ibex_hash_class; @@ -56,15 +56,20 @@ extern struct _IBEXIndexClass ibex_hash_class; */ +#define CACHE_FILE_COUNT (62) + struct _wordcache { struct _wordcache *next; struct _wordcache *prev; nameid_t wordid; /* disk wordid */ blockid_t wordblock; /* head of disk list */ blockid_t wordtail; /* and the tail data */ - int filecount; /* how many we have in memory */ - /*nameid_t files[32];*/ /* memory cache of files */ - nameid_t files[62]; /* memory cache of files */ + short filecount; /* how many valid items in files[] */ + short filealloc; /* how much allocated space in files[] */ + union { + nameid_t *files; /* memory cache of files */ + nameid_t file0; /* if filecount == 1 && filealloc == 0, store directly */ + } file; char word[1]; /* actual word follows */ }; @@ -106,7 +111,7 @@ ibex_create_word_index(struct _memcache *bc, blockid_t *wordroot, blockid_t *nam printf("opening wordindex root = %d\n", *wordroot); idx->wordindex = ibex_hash_class.open(bc, *wordroot); } else { - idx->wordindex = ibex_hash_class.create(bc, 1024); + idx->wordindex = ibex_hash_class.create(bc, 2048); *wordroot = idx->wordindex->root; printf("creating wordindex root = %d\n", *wordroot); } @@ -114,7 +119,7 @@ ibex_create_word_index(struct _memcache *bc, blockid_t *wordroot, blockid_t *nam printf("opening nameindex root = %d\n", *nameroot); idx->nameindex = ibex_hash_class.open(bc, *nameroot); } else { - idx->nameindex = ibex_hash_class.create(bc, 1024); + idx->nameindex = ibex_hash_class.create(bc, 2048); *nameroot = idx->nameindex->root; printf("creating nameindex root = %d\n", *nameroot); } @@ -126,7 +131,7 @@ static void cache_sanity(struct _wordcache *head) { while (head->next) { - g_assert(head->filecount <= sizeof(head->files)/sizeof(head->files[0])); + g_assert(head->filecount <= head->filealloc); g_assert(strlen(head->word) != 0); head = head->next; } @@ -209,7 +214,10 @@ static GPtrArray *find(struct _IBEXWord *idx, const char *word) names = idx->wordstore->klass->get(idx->wordstore, wordblock, wordtail); /* .. including any memory-only data */ if (cache) { - g_array_append_vals(names, cache->files, cache->filecount); + if (cache->filealloc == 0 && cache->filecount == 1) + g_array_append_val(names, cache->file.file0); + else + g_array_append_vals(names, cache->file.files, cache->filecount); } /* walk it ... converting id's back to strings */ @@ -242,9 +250,14 @@ static gboolean find_name(struct _IBEXWord *idx, const char *name, const char *w /* freshen cache entry if we touch it */ ibex_list_remove((struct _listnode *)cache); ibex_list_addtail(&idx->wordnodes, (struct _listnode *)cache); - for (i=0;ifilecount;i++) { - if (cache->files[i] == nameid) + if (cache->filecount == 1 && cache->filealloc == 0) { + if (cache->file.file0 == nameid) return TRUE; + } else { + for (i=0;ifilecount;i++) { + if (cache->file.files[i] == nameid) + return TRUE; + } } /* not there? well we can use the wordid anyway */ wordid = cache->wordid; @@ -265,8 +278,11 @@ sync_cache_entry(struct _IBEXWord *idx, struct _wordcache *cache) GArray array; /* just use this as a header */ blockid_t oldblock, oldtail; - d(printf("syncing cache entry '%s'\n", cache->word)); - array.data = (char *)cache->files; + d(printf("syncing cache entry '%s' used %d\n", cache->word, cache->filecount)); + if (cache->filecount == 1 && cache->filealloc == 0) + array.data = (char *)&cache->file.file0; + else + array.data = (char *)cache->file.files; array.len = cache->filecount; oldblock = cache->wordblock; oldtail = cache->wordtail; @@ -305,13 +321,35 @@ add_index_cache(struct _IBEXWord *idx, const char *word) if (cache == 0) { /* see if we have to flush off the last entry */ if (idx->wordcount >= WORDCACHE_SIZE) { + struct _wordcache *mincache; + int min, count=0; /* remove last entry, and flush it */ cache = (struct _wordcache *)idx->wordnodes.tailpred; + mincache = cache; + min = mincache->filecount; + d(printf("flushing word from cache %s\n", cache->word)); - ibex_list_remove((struct _listnode *)cache); - g_hash_table_remove(idx->wordcache, cache->word); - sync_cache_entry(idx, cache); - g_free(cache); + /* instead of just using the last entry, we try and find an entry with + with only 1 item (failing that, the smallest in the range we look at) */ + /* this could probably benefit greatly from a more sophisticated aging algorithm */ + while (cache->next && count < 100) { + if (cache->filecount == 1) { + mincache = cache; + break; + } + if (cache->filecount > 0 && cache->filecount < min) { + mincache = cache; + min = cache->filecount; + } + cache = cache->next; + count++; + } + ibex_list_remove((struct _listnode *)mincache); + g_hash_table_remove(idx->wordcache, mincache->word); + sync_cache_entry(idx, mincache); + if (mincache->filealloc) + g_free(mincache->file.files); + g_free(mincache); idx->wordcount--; } cache = g_malloc0(sizeof(*cache)+strlen(word)); @@ -347,14 +385,34 @@ static void add(struct _IBEXWord *idx, const char *name, const char *word) add_index_key(idx->nameindex, name, &nameid, &nameblock, &nametail); /* check for repeats of the last name - dont add anything */ - if (cache->files[cache->filecount] == nameid) - return; + if (cache->filecount == 1 && cache->filealloc == 0) { + if (cache->file.file0 == nameid) + return; + } else { + if (cache->file.files[cache->filecount] == nameid) + return; + } + + /* see if we are setting the first, drop it in the union */ + if (cache->filecount == 0 && cache->filealloc == 0) { + cache->file.file0 = nameid; + } else if (cache->filecount == 1 && cache->filealloc == 0) { + nameid_t saveid; + /* we need to allocate space for words */ + saveid = cache->file.file0; + cache->file.files = g_malloc(sizeof(cache->file.files[0]) * CACHE_FILE_COUNT); + /* this could possibly grow as needed, but i wont for now */ + cache->filealloc = CACHE_FILE_COUNT; + cache->file.files[0] = saveid; + cache->file.files[1] = nameid; + } else { + cache->file.files[cache->filecount] = nameid; + } - /* yay, now insert the nameindex into the word list, and the word index into the name list */ - cache->files[cache->filecount] = nameid; cache->filecount++; + /* if we are full, force a flush now */ - if (cache->filecount >= sizeof(cache->files)/sizeof(cache->files[0])) { + if (cache->filealloc && cache->filecount >= cache->filealloc) { sync_cache_entry(idx, cache); } @@ -391,14 +449,33 @@ static void add_list(struct _IBEXWord *idx, const char *name, GPtrArray *words) /*d(cache_sanity((struct _wordcache *)idx->wordnodes.head));*/ - /* check for duplicates; doesn't catch duplicates over an overflow - boundary. Watch me care. */ + /* check for duplicates; doesn't catch duplicates over an overflow boundary. Watch me care. */ if (cache->filecount == 0 - || cache->files[cache->filecount-1] != nameid) { - cache->files[cache->filecount] = nameid; + /* the 1 item case */ + || (cache->filecount == 1 && cache->filealloc == 0 && cache->file.file0 != nameid) + /* the normal case */ + || (cache->filealloc > 0 && cache->file.files[cache->filecount-1] != nameid)) { + + /* see if we are setting the first, drop it in the union */ + if (cache->filecount == 0 && cache->filealloc == 0) { + cache->file.file0 = nameid; + } else if (cache->filecount == 1 && cache->filealloc == 0) { + nameid_t saveid; + /* we need to allocate space for words */ + saveid = cache->file.file0; + cache->file.files = g_malloc(sizeof(cache->file.files[0]) * CACHE_FILE_COUNT); + /* this could possibly grow as needed, but i wont for now */ + cache->filealloc = CACHE_FILE_COUNT; + cache->file.files[0] = saveid; + cache->file.files[1] = nameid; + } else { + cache->file.files[cache->filecount] = nameid; + } + cache->filecount++; + /* if we are full, force a flush now */ - if (cache->filecount >= sizeof(cache->files)/sizeof(cache->files[0])) { + if (cache->filealloc && cache->filecount >= cache->filealloc) { sync_cache_entry(idx, cache); } @@ -452,30 +529,69 @@ static int word_flush(struct _IBEXWord *idx) { struct _wordcache *cache = (struct _wordcache *)idx->wordnodes.head, *next; + extern int block_log; + int count= 0; + int used=0, wasted=0; + + block_log = 0; next = cache->next; while (next) { + count++; + used += sizeof(struct _wordcache) + (cache->filealloc * sizeof(nameid_t)); + if (cache->filealloc) + wasted += (cache->filealloc-cache->filecount)*sizeof(nameid_t); + else + wasted += (1-cache->filecount) * sizeof(nameid_t); + + /*printf("syncing word %s\n", cache->word);*/ sync_cache_entry(idx, cache); g_hash_table_remove(idx->wordcache, cache->word); + if (cache->filealloc) + g_free(cache->file.files); g_free(cache); cache = next; next = cache->next; } + + printf("sync cache entries = %d\n used memory = %d\n wasted memory = %d\n", count, used, wasted); + + block_log = 0; ibex_list_new(&idx->wordnodes); + idx->wordcount = 0; return 0; } static int word_close(struct _IBEXWord *idx) { struct _wordcache *cache = (struct _wordcache *)idx->wordnodes.head, *next; + extern int block_log; + int count= 0; + int used=0, wasted=0; + + block_log = 0; next = cache->next; while (next) { + count++; + used += sizeof(struct _wordcache) + (cache->filealloc * sizeof(nameid_t)); + if (cache->filealloc) + wasted += (cache->filealloc-cache->filecount)*sizeof(nameid_t); + else + wasted += (1-cache->filecount) * sizeof(nameid_t); + + /*printf("closing word %s\n", cache->word);*/ sync_cache_entry(idx, cache); + if (cache->filealloc) + g_free(cache->file.files); g_free(cache); cache = next; next = cache->next; } + block_log = 0; + + printf("cache entries = %d\n used memory = %d\n wasted memory = %d\n", count, used, wasted); + idx->namestore->klass->close(idx->namestore); idx->nameindex->klass->close(idx->nameindex); /*same as namestore: -- cgit