diff options
-rw-r--r-- | libibex/ChangeLog | 8 | ||||
-rw-r--r-- | libibex/Makefile.am | 5 | ||||
-rw-r--r-- | libibex/testindex.c | 158 | ||||
-rw-r--r-- | libibex/wordindexmem.c | 112 |
4 files changed, 278 insertions, 5 deletions
diff --git a/libibex/ChangeLog b/libibex/ChangeLog index de9cb8dd40..1d74b73898 100644 --- a/libibex/ChangeLog +++ b/libibex/ChangeLog @@ -1,3 +1,11 @@ +2000-11-17 Not Zed <NotZed@HelixCode.com> + + * wordindexmem.c (add_list): If we have the namecache active, and + there is no name there, we add it directly and dont look it up + first. + + * testindex.c: Some performance testing & stat gathering stuff. + 2000-11-16 Not Zed <NotZed@HelixCode.com> * wordindexmem.c (ibex_create_word_index_mem): Initialise nameinit diff --git a/libibex/Makefile.am b/libibex/Makefile.am index 6cc88186d6..61f3d72004 100644 --- a/libibex/Makefile.am +++ b/libibex/Makefile.am @@ -21,11 +21,14 @@ INCLUDES = -I$(srcdir) $(GLIB_CFLAGS) $(UNICODE_CFLAGS) \ -DG_LOG_DOMAIN=\"libibex\" -noinst_PROGRAMS = dumpindex +noinst_PROGRAMS = dumpindex testindex dumpindex_SOURCES = dumpindex.c dumpindex_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS) +testindex_SOURCES = testindex.c +testindex_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS) -lm + #noinst_PROGRAMS = mkindex lookup # #mkindex_SOURCES = mkindex.c diff --git a/libibex/testindex.c b/libibex/testindex.c new file mode 100644 index 0000000000..e21d73ff06 --- /dev/null +++ b/libibex/testindex.c @@ -0,0 +1,158 @@ +/* Test code for libibex */ + +#include <stdio.h> +#include <glib.h> +#include <errno.h> +#include <string.h> +#include "ibex_internal.h" + +void word_index_mem_dump_info(struct _IBEXWord *idx); + +/* + The following is a routine to generate a Gaussian distribution + of pseudo random numbers, to make the results a little more + meaningful +*/ + +/* boxmuller.c Implements the Polar form of the Box-Muller + Transformation + + (c) Copyright 1994, Everett F. Carter Jr. + Permission is granted by the author to use + this software for any application provided this + copyright notice is preserved. + +*/ + +#include <stdlib.h> +#include <math.h> + +#define ranf() ((float)rand()/(float)RAND_MAX) + +static float box_muller(float m, float s) /* normal random variate generator */ +{ /* mean m, standard deviation s */ + float x1, x2, w, y1; + static float y2; + static int use_last = 0; + + if (use_last) /* use value from previous call */ + { + y1 = y2; + use_last = 0; + } + else + { + do { + x1 = 2.0 * ranf() - 1.0; + x2 = 2.0 * ranf() - 1.0; + w = x1 * x1 + x2 * x2; + } while ( w >= 1.0 ); + + w = sqrt( (-2.0 * log( w ) ) / w ); + y1 = x1 * w; + y2 = x2 * w; + use_last = 1; + } + + return( m + y1 * s ); +} + +/* gets a word from words, using m and s as distribution values */ +static char *getword(GPtrArray *words, float m, float s) +{ + int index; + + do { + index = (int)box_muller(m, s); + } while (index<0 || index>=words->len); + + return words->pdata[index]; +} + + +int main(int argc, char **argv) +{ + int i, j; + GPtrArray *words = g_ptr_array_new(); + char line[256]; + int len; + FILE *file; + float m, s; + ibex *ib; + GString *buffer = g_string_new(""); + int files; + char *dict; + + srand(0xABADF00D); + + files = 80000; + dict = "/usr/dict/words"; + + /* read words into an array */ + file = fopen(dict, "r"); + if (file == NULL) { + fprintf(stderr, "Cannot open word file: %s: %s\n", dict, strerror(errno)); + return 1; + } + while (fgets(line, sizeof(line), file) != NULL) { + len = strlen(line); + if (len>0 && line[len-1]=='\n') { + line[len-1]=0; + } + g_ptr_array_add(words, g_strdup(line)); + } + fclose(file); + + fprintf(stderr, "Read %d words\n", words->len); + + /* *shrug* arbitrary values really */ + m = words->len/2; + /* well, the average vocabulary of a mailbox is about 10K words */ + s = 1000.0; + + printf("mean is %f, s is %f\n", m, s); + + /* open ibex file */ + ib = ibex_open("test.ibex", O_RDWR|O_CREAT, 0600); + if (ib == NULL) { + perror("Creating ibex file\n"); + return 1; + } + + printf("Adding %d files\n", files); + + /* simulate adding new words to a bunch of files */ + for (j=0;j<files;j++) { + /* always new name */ + char *name = words->pdata[j % words->len]; + /* something like 60 words in a typical message, say */ + int count = (int)box_muller(60.0, 20.0); + + if (j%1000 == 0) + word_index_mem_dump_info(ib->words); + + /* cache the name info */ + ibex_contains_name(ib, name); + + /*printf("Adding %d words to '%s'\n", count, name);*/ + + g_string_truncate(buffer, 0); + + /* build up the word buffer */ + for (i=0;i<count;i++) { + if (i>0) + g_string_append_c(buffer, ' '); + g_string_append(buffer, getword(words, m, s)); + } + + /* and index it */ + ibex_index_buffer(ib, name, buffer->str, buffer->len, NULL); + } + + word_index_mem_dump_info(ib->words); + + ibex_close(ib); + + return 0; +} + diff --git a/libibex/wordindexmem.c b/libibex/wordindexmem.c index 4c0bca7cef..9d26bb3697 100644 --- a/libibex/wordindexmem.c +++ b/libibex/wordindexmem.c @@ -650,11 +650,16 @@ static void add_list(struct _IBEXWord *idx, const char *name, GPtrArray *words) d(cache_sanity(idx)); /* make sure we keep the namecache in sync, if it is active */ - if (idx->nameinit && g_hash_table_lookup(idx->namecache, name) == NULL) + if (idx->nameinit && g_hash_table_lookup(idx->namecache, name) == NULL) { g_hash_table_insert(idx->namecache, g_strdup(name), (void *)TRUE); - - /* get the nameid and block start for this name */ - add_index_key(idx->nameindex, name, &nameid, &nameblock, &nametail); + /* we know we dont have it in the disk hash either, so we insert anew (saves a lookup) */ + nameid = idx->nameindex->klass->insert(idx->nameindex, name, strlen(name)); + nameblock = 0; + nametail = 0; + } else { + /* get the nameid and block start for this name */ + add_index_key(idx->nameindex, name, &nameid, &nameblock, &nametail); + } d(cache_sanity(idx)); @@ -778,3 +783,102 @@ static int word_close(struct _IBEXWord *idx) return 0; } + +/* debugging/tuning function */ + +struct _stats { + int memcache; /* total memory used by cache entries */ + int memfile; /* total mem ysed by file data */ + int memfileused; /* actual memory used by file data */ + int memword; /* total mem used by words */ + int file1; /* total file entries with only 1 entry */ + int total; +}; + +static void +get_info(void *key, void *value, void *data) +{ + struct _wordcache *cache = (struct _wordcache *)value; + struct _stats *stats = (struct _stats *)data; + + /* round up to probable alignment, + malloc overheads */ + stats->memcache += ((sizeof(struct _wordcache) + strlen(cache->word) + 4 + 3) & ~3); + if (cache->filealloc > 0) { + /* size of file array data */ + stats->memcache += sizeof(nameid_t) * cache->filealloc + 4; + /* actual used memory */ + stats->memfile += sizeof(nameid_t) * cache->filealloc; + stats->memfileused += sizeof(nameid_t) * cache->filecount; + } + if (cache->filecount == 1 && cache->filealloc == 0) + stats->file1++; + + stats->memword += strlen(cache->word); + stats->total++; +} + +static char * +num(int num) +{ + int n; + char buf[256], *p = buf; + char type = 0; + + n = num; + if (n>1000000) { + p+= sprintf(p, "%d ", n/1000000); + n -= (n/1000000)*1000000; + type = 'M'; + } + if (n>1000) { + if (num>1000000) + p+= sprintf(p, "%03d ", n/1000); + else + p+= sprintf(p, "%d ", n/1000); + n -= (n/1000)*1000; + if (type == 0) + type = 'K'; + } + if (num > 1000) + p += sprintf(p, "%03d", n); + else + p += sprintf(p, "%d", n); + + n = num; + switch (type) { + case 'M': + p += sprintf(p, ", %d.%02dM", n/1024/1024, n*100/1024/1024); + break; + case 'K': + p += sprintf(p, ", %d.%02dK", n/1024, n*100/1024); + break; + case 0: + break; + } + + return buf; +} + +void word_index_mem_dump_info(struct _IBEXWord *idx); + +void word_index_mem_dump_info(struct _IBEXWord *idx) +{ + struct _stats stats = { 0 }; + int useful; + + g_hash_table_foreach(idx->wordcache, get_info, &stats); + + useful = stats.total * sizeof(struct _wordcache) + stats.memword + stats.memfile; + + printf("Word Index Stats:\n"); + printf("Total word count: %d\n", stats.total); + printf("Total memory used: %s\n", num(stats.memcache)); + printf("Total useful memory: %s\n", num(useful)); + printf("Total malloc/alignment overhead: %s\n", num(stats.memcache - useful)); + printf("Total buffer overhead: %s\n", num(stats.memfile - stats.memfileused)); + printf("Space taken by words: %s\n", num(stats.memword + stats.total)); + printf("Number of 1-word entries: %s\n", num(stats.file1)); + if (stats.memcache > 0) + printf("%% unused space: %d %%\n", (stats.memfile - stats.memfileused) * 100 / stats.memcache); +} + |