diff options
Diffstat (limited to 'libibex')
-rw-r--r-- | libibex/ChangeLog | 57 | ||||
-rw-r--r-- | libibex/Makefile.am | 29 | ||||
-rw-r--r-- | libibex/block.c | 481 | ||||
-rw-r--r-- | libibex/block.h | 102 | ||||
-rw-r--r-- | libibex/diskarray.c | 255 | ||||
-rw-r--r-- | libibex/disktail.c | 807 | ||||
-rw-r--r-- | libibex/file.c | 481 | ||||
-rw-r--r-- | libibex/find.c | 198 | ||||
-rw-r--r-- | libibex/hash.c | 701 | ||||
-rw-r--r-- | libibex/ibex_block.c (renamed from libibex/words.c) | 177 | ||||
-rw-r--r-- | libibex/ibex_db.c | 512 | ||||
-rw-r--r-- | libibex/ibex_internal.h | 16 | ||||
-rw-r--r-- | libibex/index.c | 154 | ||||
-rw-r--r-- | libibex/index.h | 82 | ||||
-rw-r--r-- | libibex/wordindex.c | 488 | ||||
-rw-r--r-- | libibex/wordindex.h | 65 |
16 files changed, 3681 insertions, 924 deletions
diff --git a/libibex/ChangeLog b/libibex/ChangeLog index a74f697308..1c80c8348b 100644 --- a/libibex/ChangeLog +++ b/libibex/ChangeLog @@ -1,3 +1,60 @@ +2000-09-19 Not Zed <NotZed@HelixCode.com> + + ** Merged from IBEX_DISK branch to head. + + * file.c: + * find.c: + * words.c: + * index.c: Removed unused files. + + * block.h: Changed block to use only 24 bits for next and 8 for + used, and fixed all relevant code. Some cleanup. + + * disktail.c (tail_get): If we use an empty tail node, then make + sure we make it dirty. + +2000-09-15 Not Zed <NotZed@HelixCode.com> + + * wordindex.c (word_close): Free hashtable on exit too. + + * disktail.c: Implemented tail-node storage for the end of long + lists, or for short lists. Should save significant disk space + (5x?). + Implemented special case for 1-item lists, where the tailnode + pointer is used to store the index entry. + +2000-09-14 Not Zed <NotZed@HelixCode.com> + + * wordindex.c (add_index_key): Keys also handle tails. + + * hash.c (hash_set_data_block): Added new parameter to keys - a + tail block (a full 32 bit block pointer). + (hash_get_data_block): And same here. + +2000-09-12 Not Zed <NotZed@HelixCode.com> + + * wordindex.c (word_close): Dont close namestore twice. + +2000-09-11 Not Zed <NotZed@HelixCode.com> + + ** Redid almost everything, on-disk hash table to store an index + to index records, mroe on the way to modularisation (more to go), + now stores reverse indexes for deleting. + +2000-08-31 Not Zed <NotZed@HelixCode.com> + + * block.c (add_key_mem): Initialise a memory based array for newly + added index entries. + (add_record): Changed to cache updates in memory until we hit a + limit, and then flush them to disk. + (get_record): Merge in-memory records with disk records. + (remove_record): Remove from memory first, and if that fails, goto + disk. + (find_record): Check memory first, then disk if that fails. + (add_datum_list): oops, copy size * sizeof(blockid_t) + (add_indexed): Make sure we link in the head node when we create a + new one. + 2000-08-09 Christopher James Lahey <clahey@helixcode.com> * file.c, find.c: Fixed some warnings. diff --git a/libibex/Makefile.am b/libibex/Makefile.am index c9e8165987..cfd8a1b4d5 100644 --- a/libibex/Makefile.am +++ b/libibex/Makefile.am @@ -2,20 +2,35 @@ noinst_LTLIBRARIES = libibex.la -libibex_la_SOURCES = file.c index.c find.c words.c ibex.h +libibex_la_SOURCES = \ + wordindex.c \ + block.c ibex.h \ + hash.c \ + disktail.c \ + ibex_block.c + libibex_la_LDFLAGS = -static -noinst_HEADERS = ibex_internal.h +noinst_HEADERS = \ + ibex_internal.h \ + block.h \ + wordindex.h \ + index.h INCLUDES = -I$(srcdir) $(GLIB_CFLAGS) $(UNICODE_CFLAGS) \ -DG_LOG_DOMAIN=\"libibex\" -noinst_PROGRAMS = mkindex lookup +#noinst_PROGRAMS = hash -mkindex_SOURCES = mkindex.c -mkindex_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS) +#hash_SOURCES = disktail.c +#hash_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS) -lookup_SOURCES = lookup.c -lookup_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS) +#noinst_PROGRAMS = mkindex lookup +# +#mkindex_SOURCES = mkindex.c +#mkindex_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS) +# +#lookup_SOURCES = lookup.c +#lookup_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS) diff --git a/libibex/block.c b/libibex/block.c new file mode 100644 index 0000000000..53579523bd --- /dev/null +++ b/libibex/block.c @@ -0,0 +1,481 @@ +/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- + * + * Copyright (C) 2000 Helix Code, Inc. + * + * Authors: Michael Zucchi <notzed@helixcode.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public License + * as published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with the Gnome Library; see the file COPYING.LIB. If not, + * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* + block file/cache/utility functions +*/ + +#include <stdio.h> + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> + +#include <glib.h> + +#include "block.h" + +#define d(x) +/*#define DEBUG*/ + + +#ifdef IBEX_STATS +static void +init_stats(struct _memcache *index) +{ + index->stats = g_hash_table_new(g_direct_hash, g_direct_equal); +} + +static void +dump_1_stat(int id, struct _stat_info *info, struct _memcache *index) +{ + printf("%d %d %d %d %d\n", id, info->read, info->write, info->cache_hit, info->cache_miss); +} + +static void +dump_stats(struct _memcache *index) +{ + printf("Block reads writes hits misses\n"); + g_hash_table_foreach(index->stats, dump_1_stat, index); +} + +static void +add_read(struct _memcache *index, int id) +{ + struct _stat_info *info; + + info = g_hash_table_lookup(index->stats, (void *)id); + if (info == NULL) { + info = g_malloc0(sizeof(*info)); + g_hash_table_insert(index->stats, (void *)id, info); + } + info->read++; +} + +static void +add_write(struct _memcache *index, int id) +{ + struct _stat_info *info; + + info = g_hash_table_lookup(index->stats, (void *)id); + if (info == NULL) { + info = g_malloc0(sizeof(*info)); + g_hash_table_insert(index->stats, (void *)id, info); + } + info->write++; +} + +static void +add_hit(struct _memcache *index, int id) +{ + struct _stat_info *info; + + info = g_hash_table_lookup(index->stats, (void *)id); + if (info == NULL) { + info = g_malloc0(sizeof(*info)); + g_hash_table_insert(index->stats, (void *)id, info); + } + info->cache_hit++; +} + +static void +add_miss(struct _memcache *index, int id) +{ + struct _stat_info *info; + + info = g_hash_table_lookup(index->stats, (void *)id); + if (info == NULL) { + info = g_malloc0(sizeof(*info)); + g_hash_table_insert(index->stats, (void *)id, info); + } + info->cache_miss++; +} +#endif /* IBEX_STATS */ + + +/* simple list routines (for simplified memory management of cache/lists) */ + +/** + * ibex_list_new: + * @v: + * + * Initialise a list header. A list header must always be initialised + * before use. + **/ +void ibex_list_new(struct _list *v) +{ + v->head = (struct _listnode *)&v->tail; + v->tail = 0; + v->tailpred = (struct _listnode *)&v->head; +} + +/** + * ibex_list_addhead: + * @l: List. + * @n: Node to append. + * + * Prepend a listnode to the head of the list @l. + * + * Return value: Always @n. + **/ +struct _listnode *ibex_list_addhead(struct _list *l, struct _listnode *n) +{ + n->next = l->head; + n->prev = (struct _listnode *)&l->head; + l->head->prev = n; + l->head = n; + return n; +} + +/** + * ibex_list_addtail: + * @l: + * @n: + * + * Append a listnode to the end of the list @l. + * + * Return value: Always the same as @n. + **/ +struct _listnode *ibex_list_addtail(struct _list *l, struct _listnode *n) +{ + n->next = (struct _listnode *)&l->tail; + n->prev = l->tailpred; + l->tailpred->next = n; + l->tailpred = n; + return n; +} + +/** + * ibex_list_remove: + * @n: The node to remove. + * + * Remove a listnode from a list. + * + * Return value: Always the same as @n. + **/ +struct _listnode *ibex_list_remove(struct _listnode *n) +{ + n->next->prev = n->prev; + n->prev->next = n->next; + return n; +} + +static struct _memblock * +memblock_addr(struct _block *block) +{ + return (struct _memblock *)(((char *)block) - G_STRUCT_OFFSET(struct _memblock, data)); +} + +/** + * ibex_block_dirty: + * @block: + * + * Dirty a block. This will cause it to be written to disk on + * a cache sync, or when the block is flushed from the cache. + **/ +void +ibex_block_dirty(struct _block *block) +{ + memblock_addr(block)->flags |= BLOCK_DIRTY; +} + +static void +sync_block(struct _memcache *block_cache, struct _memblock *memblock) +{ + lseek(block_cache->fd, memblock->block, SEEK_SET); + if (write(block_cache->fd, &memblock->data, sizeof(memblock->data)) != -1) { + memblock->flags &= ~BLOCK_DIRTY; + } +#ifdef IBEX_STATS + add_write(block_cache, memblock->block); +#endif +} + +/** + * ibex_block_cache_sync: + * @block_cache: + * + * Ensure the block cache is fully synced to disk. + **/ +void +ibex_block_cache_sync(struct _memcache *block_cache) +{ + struct _memblock *memblock; + + memblock = (struct _memblock *)block_cache->nodes.head; + while (memblock->next) { + if (memblock->flags & BLOCK_DIRTY) { + sync_block(block_cache, memblock); + } + memblock = memblock->next; + } + fsync(block_cache->fd); + +#ifdef IBEX_STATS + dump_stats(block_cache); +#endif + +} + +/** + * ibex_block_cache_flush: + * @block_cache: + * + * Ensure the block cache is fully synced to disk, and then flush + * its contents from memory. + **/ +void +ibex_block_cache_flush(struct _memcache *block_cache) +{ + struct _memblock *mw, *mn; + + ibex_block_cache_sync(block_cache); + + mw = (struct _memblock *)block_cache->nodes.head; + mn = mw->next; + while (mn) { + g_hash_table_remove(block_cache->index, (void *)mw->block); + g_free(mw); + mw = mn; + mn = mn->next; + } + ibex_list_new(&block_cache->nodes); +} + + +/** + * ibex_block_read: + * @block_cache: + * @blockid: + * + * Read the data of a block by blockid. The data contents is backed by + * the block cache, and should be considered static. + * + * TODO; should this return a NULL block on error? + * + * Return value: The address of the block data (which may be cached). + **/ +struct _block * +ibex_block_read(struct _memcache *block_cache, blockid_t blockid) +{ + struct _memblock *memblock; + + { + /* assert blockid<roof */ + if (blockid > 0) { + struct _root *root = (struct _root *)ibex_block_read(block_cache, 0); + g_assert(blockid < root->roof); + } + } + + memblock = g_hash_table_lookup(block_cache->index, (void *)blockid); + if (memblock) { + d(printf("foudn blockid in cache %d = %p\n", blockid, &memblock->data)); + /* 'access' page */ + ibex_list_remove((struct _listnode *)memblock); + ibex_list_addtail(&block_cache->nodes, (struct _listnode *)memblock); +#ifdef IBEX_STATS + add_hit(block_cache, memblock->block); +#endif + return &memblock->data; + } +#ifdef IBEX_STATS + add_miss(block_cache, blockid); + add_read(block_cache, blockid); +#endif + d(printf("loading blockid from disk %d\n", blockid)); + memblock = g_malloc(sizeof(*memblock)); + memblock->block = blockid; + memblock->flags = 0; + lseek(block_cache->fd, blockid, SEEK_SET); + memset(&memblock->data, 0, sizeof(memblock->data)); + read(block_cache->fd, &memblock->data, sizeof(memblock->data)); + ibex_list_addtail(&block_cache->nodes, (struct _listnode *)memblock); + g_hash_table_insert(block_cache->index, (void *)blockid, memblock); + if (block_cache->count >= CACHE_SIZE) { + struct _memblock *old = (struct _memblock *)block_cache->nodes.head; + d(printf("discaring cache block %d\n", old->block)); + g_hash_table_remove(block_cache->index, (void *)old->block); + ibex_list_remove((struct _listnode *)old); + if (old->flags & BLOCK_DIRTY) { + sync_block(block_cache, old); + } + g_free(old); + } else { + block_cache->count++; + } + + d(printf(" --- cached blocks : %d\n", block_cache->count)); + + return &memblock->data; +} + +/** + * ibex_block_cache_open: + * @name: + * @flags: Flags as to open(2), should use O_RDWR and optionally O_CREAT. + * @mode: Mose as to open(2) + * + * Open a block file. + * + * FIXME; this currently also initialises the word and name indexes + * because their pointers are stored in the root block. Should be + * upto the caller to manage these pointers/data. + * + * Return value: NULL if the backing file could not be opened. + **/ +struct _memcache * +ibex_block_cache_open(const char *name, int flags, int mode) +{ + struct _root *root; + struct _memcache *block_cache = g_malloc0(sizeof(*block_cache)); + + d(printf("opening ibex file: %s", name)); + + /* setup cache */ + ibex_list_new(&block_cache->nodes); + block_cache->count = 0; + block_cache->index = g_hash_table_new(g_direct_hash, g_direct_equal); + block_cache->fd = open(name, flags, mode); + + if (block_cache->fd == -1) { + g_hash_table_destroy(block_cache->index); + g_free(block_cache); + return NULL; + } + + root = (struct _root *)ibex_block_read(block_cache, 0); + if (root->roof == 0 || memcmp(root->version, "ibx3", 4)) { + d(printf("Initialising superblock\n")); + /* reset root data */ + memcpy(root->version, "ibx3", 4); + root->roof = 1024; + root->free = 0; + root->words = 0; + root->names = 0; + root->tail = 0; /* list of tail blocks */ + ibex_block_dirty((struct _block *)root); + } else { + d(printf("superblock already initialised:\n" + " roof = %d\n free = %d\n words = %d\n names = %d\n", + root->roof, root->free, root->words, root->names)); + } + /* this should be moved higher up */ + { + struct _IBEXWord *ibex_create_word_index(struct _memcache *bc, blockid_t *wordroot, blockid_t *nameroot); + + block_cache->words = ibex_create_word_index(block_cache, &root->words, &root->names); + } + +#ifdef IBEX_STATS + init_stats(block_cache); +#endif + + return block_cache; +} + +/** + * ibex_block_cache_close: + * @block_cache: + * + * Close the block file, sync any remaining cached data + * to disk, and free all resources. + **/ +void +ibex_block_cache_close(struct _memcache *block_cache) +{ + struct _memblock *mw, *mn; + + ibex_block_cache_sync(block_cache); + close(block_cache->fd); + + mw = (struct _memblock *)block_cache->nodes.head; + mn = mw->next; + while (mn) { + g_free(mw); + mw = mn; + mn = mw->next; + } + + g_hash_table_destroy(block_cache->index); + + g_free(block_cache); +} + +/** + * ibex_block_free: + * @block_cache: + * @blockid: + * + * Return a block to the free pool. + **/ +void +ibex_block_free(struct _memcache *block_cache, blockid_t blockid) +{ + struct _root *root = (struct _root *)ibex_block_read(block_cache, 0); + struct _block *block = ibex_block_read(block_cache, blockid); + + block->next = root->free; + root->free = blockid; + ibex_block_dirty((struct _block *)root); + ibex_block_dirty((struct _block *)block); +} + +/** + * ibex_block_get: + * @block_cache: + * + * Allocate a new block, or access a previously freed block and return + * its block id. The block will have zeroed contents. + * + * Return value: 0 if there are no blocks left (disk full/read only + * file, etc). + **/ +blockid_t +ibex_block_get(struct _memcache *block_cache) +{ + struct _root *root = (struct _root *)ibex_block_read(block_cache, 0); + struct _block *block; + blockid_t head; + + if (root->free) { + head = root->free; + block = ibex_block_read(block_cache, head); + root->free = block_location(block->next); + } else { + /* TODO: check the block will fit first */ + /* TODO: no need to read this block, can allocate it manually (saves a syscall/read) */ + head = root->roof; + root->roof += BLOCK_SIZE; + block = ibex_block_read(block_cache, head); + } + + g_assert(head != 0); + + d(printf("new block = %d\n", head)); + block->next = 0; + block->used = 0; + ibex_block_dirty(block); + ibex_block_dirty((struct _block *)root); + return head; +} diff --git a/libibex/block.h b/libibex/block.h new file mode 100644 index 0000000000..40262de6e2 --- /dev/null +++ b/libibex/block.h @@ -0,0 +1,102 @@ + +/* public interfaces for block io routines */ + +#ifndef _BLOCK_H +#define _BLOCK_H + +/*#define IBEX_STATS*/ /* define to get/dump block access stats */ + +#include <glib.h> + +typedef guint32 nameid_t; +typedef guint32 blockid_t; + +#define BLOCK_BITS (8) +#define BLOCK_SIZE (1<<BLOCK_BITS) +#define CACHE_SIZE 256 /* blocks in disk cache */ + +/* root block */ +struct _root { + char version[4]; + + blockid_t free; /* list of free blocks */ + blockid_t roof; /* top of allocated space, everything below is in a free or used list */ + blockid_t tail; /* list of 'tail' blocks */ + + blockid_t words; /* root of words index */ + blockid_t names; /* root of names index */ +}; + +/* basic disk structure for (data) blocks */ +struct _block { + unsigned int next:32-BLOCK_BITS; /* next block */ + unsigned int used:BLOCK_BITS; /* number of elements used */ + + nameid_t bl_data[(BLOCK_SIZE-4)/4]; /* references */ +}; + +/* custom list structure, for a simple/efficient cache */ +struct _listnode { + struct _listnode *next; + struct _listnode *prev; +}; +struct _list { + struct _listnode *head; + struct _listnode *tail; + struct _listnode *tailpred; +}; + +void ibex_list_new(struct _list *v); +struct _listnode *ibex_list_addhead(struct _list *l, struct _listnode *n); +struct _listnode *ibex_list_addtail(struct _list *l, struct _listnode *n); +struct _listnode *ibex_list_remove(struct _listnode *n); + +/* in-memory structure for block cache */ +struct _memblock { + struct _memblock *next; + struct _memblock *prev; + + blockid_t block; + int flags; + + struct _block data; +}; +#define BLOCK_DIRTY (1<<0) + +struct _memcache { + struct _list nodes; + int count; /* nodes in cache */ + + GHashTable *index; /* blockid->memblock mapping */ + int fd; /* file fd */ + +#ifdef IBEX_STATS + GHashTable *stats; +#endif + /* temporary here */ + struct _IBEXWord *words; /* word index */ +}; + +#ifdef IBEX_STATS +struct _stat_info { + int read; + int write; + int cache_hit; + int cache_miss; +}; +#endif /* IBEX_STATS */ + +struct _memcache *ibex_block_cache_open(const char *name, int flags, int mode); +void ibex_block_cache_close(struct _memcache *block_cache); +void ibex_block_cache_sync(struct _memcache *block_cache); +void ibex_block_cache_flush(struct _memcache *block_cache); + +blockid_t ibex_block_get(struct _memcache *block_cache); +void ibex_block_free(struct _memcache *block_cache, blockid_t blockid); +void ibex_block_dirty(struct _block *block); +struct _block *ibex_block_read(struct _memcache *block_cache, blockid_t blockid); + +#define block_number(x) ((x)>>BLOCK_BITS) +#define block_location(x) ((x)<<BLOCK_BITS) + +#endif /* ! _BLOCK_H */ diff --git a/libibex/diskarray.c b/libibex/diskarray.c new file mode 100644 index 0000000000..2664e0f6b5 --- /dev/null +++ b/libibex/diskarray.c @@ -0,0 +1,255 @@ +/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- + * + * Copyright (C) 2000 Helix Code, Inc. + * + * Authors: Michael Zucchi <notzed@helixcode.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public License + * as published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with the Gnome Library; see the file COPYING.LIB. If not, + * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* a disk based array storage class */ + + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> + +#include <glib.h> + +#include "block.h" +#include "index.h" + +#define d(x) +/*#define DEBUG*/ + +static struct _IBEXStore *disk_create(struct _memcache *bc); +static int disk_sync(struct _IBEXStore *store); +static int disk_close(struct _IBEXStore *store); + +static blockid_t disk_add(struct _IBEXStore *store, blockid_t head, nameid_t data); +static blockid_t disk_add_list(struct _IBEXStore *store, blockid_t head, GArray *data); +static blockid_t disk_remove(struct _IBEXStore *store, blockid_t head, nameid_t data); +static void disk_free(struct _IBEXStore *store, blockid_t head); + +static gboolean disk_find(struct _IBEXStore *store, blockid_t head, nameid_t data); +static GArray *disk_get(struct _IBEXStore *store, blockid_t head); + +struct _IBEXStoreClass ibex_diskarray_class = { + disk_create, disk_sync, disk_close, + disk_add, disk_add_list, + disk_remove, disk_free, + disk_find, disk_get +}; + +static struct _IBEXStore *disk_create(struct _memcache *bc) +{ + struct _IBEXStore *store; + + store = g_malloc0(sizeof(*store)); + store->klass = &ibex_diskarray_class; + store->blocks = bc; + + return store; +} + +static int disk_sync(struct _IBEXStore *store) +{ + /* no cache, nop */ + return 0; +} + +static int disk_close(struct _IBEXStore *store) +{ + g_free(store); + return 0; +} + +static blockid_t +disk_add(struct _IBEXStore *store, blockid_t head, nameid_t data) +{ + struct _block *block; + struct _block *newblock; + blockid_t new; + + if (head == 0) { + head = ibex_block_get(store->blocks); + } + block = ibex_block_read(store->blocks, head); + + d(printf("adding record %d to block %d (next = %d)\n", data, head, block->next)); + + if (block->used < sizeof(block->bl_data)/sizeof(block->bl_data[0])) { + d(printf("adding record into block %d %d\n", head, data)); + block->bl_data[block->used] = data; + block->used++; + ibex_block_dirty(block); + return head; + } else { + new = ibex_block_get(store->blocks); + newblock = ibex_block_read(store->blocks, new); + newblock->next = head; + newblock->bl_data[0] = data; + newblock->used = 1; + d(printf("adding record into new %d %d, next =%d\n", new, data, newblock->next)); + ibex_block_dirty(newblock); + return new; + } +} + +static blockid_t +disk_add_list(struct _IBEXStore *store, blockid_t head, GArray *data) +{ + struct _block *block; + struct _block *newblock; + blockid_t new; + int copied = 0; + int left, space, tocopy; + + if (head == 0) { + head = ibex_block_get(store->blocks); + } + block = ibex_block_read(store->blocks, head); + + while (copied < data->len) { + left = data->len - copied; + space = sizeof(block->bl_data)/sizeof(block->bl_data[0]) - block->used; + if (space) { + tocopy = MIN(left, space); + memcpy(block->bl_data+block->used, &g_array_index(data, blockid_t, copied), tocopy*sizeof(blockid_t)); + block->used += tocopy; + ibex_block_dirty(block); + } else { + new = ibex_block_get(store->blocks); + newblock = ibex_block_read(store->blocks, new); + newblock->next = head; + tocopy = MIN(left, sizeof(block->bl_data)/sizeof(block->bl_data[0])); + memcpy(newblock->bl_data, &g_array_index(data, blockid_t, copied), tocopy*sizeof(blockid_t)); + newblock->used = tocopy; + block = newblock; + head = new; + ibex_block_dirty(newblock); + } + copied += tocopy; + } + return head; +} + +static blockid_t +disk_remove(struct _IBEXStore *store, blockid_t head, nameid_t data) +{ + blockid_t node = head; + + d(printf("removing %d from %d\n", data, head)); + while (node) { + struct _block *block = ibex_block_read(store->blocks, node); + int i; + + for (i=0;i<block->used;i++) { + if (block->bl_data[i] == data) { + struct _block *start = ibex_block_read(store->blocks, head); + + start->used--; + block->bl_data[i] = start->bl_data[start->used]; + if (start->used == 0) { + struct _root *root = (struct _root *)ibex_block_read(store->blocks, 0); + blockid_t new; + + d(printf("dropping block %d, new head = %d\n", head, start->next)); + new = start->next; + start->next = root->free; + root->free = head; + head = new; + ibex_block_dirty((struct _block *)root); + } + ibex_block_dirty(block); + ibex_block_dirty(start); + return head; + } + } + node = block->next; + } + return head; +} + +static void disk_free(struct _IBEXStore *store, blockid_t head) +{ + blockid_t next; + struct _block *block; + + while (head) { + block = ibex_block_read(store->blocks, head); + next = block->next; + ibex_block_free(store->blocks, head); + head = next; + } +} + +static gboolean +disk_find(struct _IBEXStore *store, blockid_t head, nameid_t data) +{ + blockid_t node = head; + + d(printf("finding %d from %d\n", data, head)); + while (node) { + struct _block *block = ibex_block_read(store->blocks, node); + int i; + + for (i=0;i<block->used;i++) { + if (block->bl_data[i] == data) { + return TRUE; + } + } + node = block->next; + } + return FALSE; +} + +static GArray * +disk_get(struct _IBEXStore *store, blockid_t head) +{ + GArray *result = g_array_new(0, 0, sizeof(nameid_t)); + + while (head) { + struct _block *block = ibex_block_read(store->blocks, head); + + d(printf("getting data from block %d\n", head)); + + g_array_append_vals(result, block->bl_data, block->used); + head = block->next; + d(printf("next = %d\n", head)); + } + return result; +} + +void +ibex_diskarray_dump(struct _memcache *blocks, blockid_t head) +{ + blockid_t node = head; + + printf("dumping list %d\n", node); + while (node) { + struct _block *block = ibex_block_read(blocks, node); + int i; + + printf(" block %d used %d\n ", node, block->used); + for (i=0;i<block->used;i++) { + printf(" %d", block->bl_data[i]); + } + printf("\n"); + node = block->next; + } +} diff --git a/libibex/disktail.c b/libibex/disktail.c new file mode 100644 index 0000000000..90562889de --- /dev/null +++ b/libibex/disktail.c @@ -0,0 +1,807 @@ +/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- + * + * Copyright (C) 2000 Helix Code, Inc. + * + * Authors: Michael Zucchi <notzed@helixcode.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public License + * as published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with the Gnome Library; see the file COPYING.LIB. If not, + * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* a disk based array storage class that stores the tails of data lists + in common blocks */ + + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> + +#include <glib.h> +#include <string.h> + +#include <stdio.h> + +#include "block.h" +#include "index.h" + +#define d(x) +/*#define DEBUG*/ + +/* marker to define which root keys indicate a single length key */ +#define BLOCK_ONE (1<<BLOCK_BITS) + +/* tail blocks only contain tail data ... */ +/* and we pack it in, similar to the key data, only more compact */ +struct _tailblock { + unsigned int next:32-BLOCK_BITS; /* only needs to point to block numbers */ + unsigned int used:BLOCK_BITS; /* how many entries are used */ + union { + unsigned char offset[BLOCK_SIZE-4]; /* works upto blocksize of 1024 bytes */ + nameid_t data[(BLOCK_SIZE-4)/4]; + } tailblock_u; +}; +#define tb_offset tailblock_u.offset +#define tb_data tailblock_u.data + +/* map a tail index to a block index */ +#define TAIL_INDEX(b) ((b) & (BLOCK_SIZE-1)) +/* map a tail index to a block number */ +#define TAIL_BLOCK(b) ((b) & ~(BLOCK_SIZE-1)) +/* map a block + index to a tailid */ +#define TAIL_KEY(b, i) (((b) & ~(BLOCK_SIZE-1)) | ((i) & (BLOCK_SIZE-1))) + +#define TAIL_THRESHOLD ((BLOCK_SIZE-8)/6) + +static struct _IBEXStore *disk_create(struct _memcache *bc); +static int disk_sync(struct _IBEXStore *store); +static int disk_close(struct _IBEXStore *store); + +static blockid_t disk_add(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, nameid_t data); +static blockid_t disk_add_list(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, GArray *data); +static blockid_t disk_remove(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, nameid_t data); +static void disk_free(struct _IBEXStore *store, blockid_t head, blockid_t tail); + +static gboolean disk_find(struct _IBEXStore *store, blockid_t head, blockid_t tail, nameid_t data); +static GArray *disk_get(struct _IBEXStore *store, blockid_t head, blockid_t tail); + +struct _IBEXStoreClass ibex_diskarray_class = { + disk_create, disk_sync, disk_close, + disk_add, disk_add_list, + disk_remove, disk_free, + disk_find, disk_get +}; + + +static int +tail_info(struct _tailblock *bucket, nameid_t tailid, blockid_t **startptr) +{ + blockid_t *start, *end; + int index; + + /* get start/end of area to zap */ + index = TAIL_INDEX(tailid); + start = &bucket->tb_data[bucket->tb_offset[index]]; + if (index == 0) { + end = &bucket->tb_data[sizeof(bucket->tb_data)/sizeof(bucket->tb_data[0])]; + } else { + end = &bucket->tb_data[bucket->tb_offset[index-1]]; + } + if (startptr) + *startptr = start; + return end-start; +} + +/* compresses (or expand) the bucket entry, to the new size */ +static void +tail_compress(struct _tailblock *bucket, int index, int newsize) +{ + int i; + blockid_t *start, *end, *newstart; + + /* get start/end of area to zap */ + start = &bucket->tb_data[bucket->tb_offset[index]]; + if (index == 0) { + end = &bucket->tb_data[sizeof(bucket->tb_data)/sizeof(bucket->tb_data[0])]; + } else { + end = &bucket->tb_data[bucket->tb_offset[index-1]]; + } + + if (end-start == newsize) + return; + + /* + XXXXXXXXXXXXXXXXXXXXXXXXXXXXXyyyyy + 0 20 25 + + newsize = 0 + end = 25 + newstart = 0 + start = 20 + + newstart+(end-start)-newsize = 5 + i = start-newstart + + XXXXXXXXXXXXXXXXXXXXXXXXXXXXXyyyyy + 0 20 25 + + newsize = 2 + end = 25 + newstart = 0 + start = 20 + + newstart+(end-start)-newsize = 3 + i = start-newstart + MIN(end-start, newsize)) = 22 + + XXXXXXXXXXXXXXXXXXXXXXXXXXXXX + 5 25 + newsize = 5 + end = 25 + start = 25 + newstart = 5 + + newstart+(end-start)-newsize = 0 + i = start-newstart = 20 + + XXXXXXXXXXXXXXXXXXXXXXXXXXXXXyy + 3 23 25 + newsize = 5 + end = 25 + start = 23 + newstart = 3 + + newstart+(end-start)-newsize = 0 + i = start-newstart + MIN(end-start, newsize) = 22 + + */ + + + /* fixup data */ + newstart = &bucket->tb_data[bucket->tb_offset[bucket->used-1]]; + + g_assert(newstart+(end-start)-newsize <= &bucket->tb_data[sizeof(bucket->tb_data)/sizeof(bucket->tb_data[0])]); + g_assert(newstart + (start-newstart) + MIN(end-start, newsize) <= &bucket->tb_data[sizeof(bucket->tb_data)/sizeof(bucket->tb_data[0])]); + g_assert(newstart+(end-start)-newsize >= &bucket->tb_offset[bucket->used]); + g_assert(newstart + (start-newstart) + MIN(end-start, newsize) >= &bucket->tb_offset[bucket->used]); + + memmove(newstart+(end-start)-newsize, newstart, ((start-newstart)+MIN(end-start, newsize)) * sizeof(blockid_t)); + + /* fixup key pointers */ + for (i=index;i<bucket->used;i++) { + bucket->tb_offset[i] += (end-start)-newsize; + } + ibex_block_dirty((struct _block *)bucket); +} + +/* + returns the number of blockid's free +*/ +static int +tail_space(struct _tailblock *tail) +{ + if (tail->used == 0) + return sizeof(tail->tb_data)/sizeof(tail->tb_data[0])-1; + + return &tail->tb_data[tail->tb_offset[tail->used-1]] + - (blockid_t *)&tail->tb_offset[tail->used]; +} + +static void +tail_dump(struct _memcache *blocks, blockid_t tailid) +{ + int i; + struct _tailblock *tail = (struct _tailblock *)ibex_block_read(blocks, TAIL_BLOCK(tailid)); + + printf("Block %d, used %d\n", tailid, tail->used); + for (i=0;i<sizeof(struct _tailblock)/sizeof(unsigned int);i++) { + printf(" %08x", ((unsigned int *)tail)[i]); + } + printf("\n"); +} + +static blockid_t +tail_get(struct _memcache *blocks, int size) +{ + struct _root *root = (struct _root *)ibex_block_read(blocks, 0); + blockid_t tailid; + struct _tailblock *tail; + int freeindex; + blockid_t *end; + int i, count = 0; + + d(printf("looking for a tail node with %d items in it\n", size)); + + /* look for a node with enough space, if we dont find it fairly + quickly, just quit. needs a better free algorithm i think ... */ + tailid = root->tail; + while (tailid && count<5) { + int space; + + d(printf(" checking tail node %d\n", tailid)); + + tail = (struct _tailblock *)ibex_block_read(blocks, tailid); + if (tail->used == 0) { + /* assume its big enough ... */ + tail->used = 1; + tail->tb_offset[0] = sizeof(tail->tb_data)/sizeof(tail->tb_data[0]) - size; + d(printf("allocated %d (%d), used %d\n", tailid, tailid, tail->used)); + ibex_block_dirty((struct _block *)tail); + return tailid; + } + + /* see if we have a free slot first */ + freeindex = -1; + end = &tail->tb_data[sizeof(tail->tb_data)/sizeof(tail->tb_data[0])]; + for (i=0;i<tail->used;i++) { + if (end == &tail->tb_data[tail->tb_offset[i]]) { + freeindex = i; + break; + } + end = &tail->tb_data[tail->tb_offset[i]]; + } + + /* determine how much space we have available - incl any extra header we might need */ + space = ((char *)&tail->tb_data[tail->tb_offset[tail->used-1]]) + - ((char *)&tail->tb_offset[tail->used]) + /* FIXMEL work out why this is out a little bit */ + - 8; + if (freeindex == -1) + space -= sizeof(tail->tb_offset[0]); + + /* if we have enough, set it up, creating a new entry if necessary */ + /* for some really odd reason the compiler promotes this expression to unsigned, hence + the requirement for the space>0 check ... */ + if (space>0 && space > size*sizeof(blockid_t)) { + d(printf("space = %d, size = %d size*sizeof() = %d truth = %d\n", space, size, size*sizeof(blockid_t), space>size*sizeof(blockid_t))); + if (freeindex == -1) { + freeindex = tail->used; + tail->tb_offset[tail->used] = tail->tb_offset[tail->used-1]; + tail->used++; + } + tail_compress(tail, freeindex, size); + ibex_block_dirty((struct _block *)tail); + d(printf("allocated %d (%d), used %d\n", tailid, TAIL_KEY(tailid, freeindex), tail->used)); + return TAIL_KEY(tailid, freeindex); + } + count++; + tailid = block_location(tail->next); + } + + d(printf("allocating new data node for tail data\n")); + /* didn't find one, setup a new one */ + root = (struct _root *)ibex_block_read(blocks, 0); + tailid = ibex_block_get(blocks); + tail = (struct _tailblock *)ibex_block_read(blocks, tailid); + tail->next = block_number(root->tail); + root->tail = tailid; + tail->used = 1; + tail->tb_offset[0] = sizeof(tail->tb_data)/sizeof(tail->tb_data[0]) - size; + ibex_block_dirty((struct _block *)tail); + d(printf("allocated %d (%d), used %d\n", tailid, TAIL_KEY(tailid, 0), tail->used)); + return TAIL_KEY(tailid, 0); +} + +static void +tail_free(struct _memcache *blocks, blockid_t tailid) +{ + struct _tailblock *tail; + + d(printf("freeing tail id %d\n", tailid)); + + if (tailid == 0) + return; + + tail = (struct _tailblock *)ibex_block_read(blocks, TAIL_BLOCK(tailid)); + d(printf(" tail %d used %d\n", tailid, tail->used)); + g_assert(tail->used); + if (TAIL_INDEX(tailid) == tail->used - 1) { + tail->used --; + } else { + tail_compress(tail, TAIL_INDEX(tailid), 0); + } + ibex_block_dirty((struct _block *)tail); +} + +static struct _IBEXStore *disk_create(struct _memcache *bc) +{ + struct _IBEXStore *store; + + store = g_malloc0(sizeof(*store)); + store->klass = &ibex_diskarray_class; + store->blocks = bc; + + return store; +} + +static int disk_sync(struct _IBEXStore *store) +{ + /* no cache, nop */ + return 0; +} + +static int disk_close(struct _IBEXStore *store) +{ + g_free(store); + return 0; +} + +static blockid_t +disk_add(struct _IBEXStore *store, blockid_t *headptr, blockid_t *tailptr, nameid_t data) +{ + struct _block *block; + struct _block *newblock; + blockid_t new, head = *headptr /*, tail = *tailptr*/; + + g_error("Inbimplemented"); + + if (head == 0) { + head = ibex_block_get(store->blocks); + } + block = ibex_block_read(store->blocks, head); + + d(printf("adding record %d to block %d (next = %d)\n", data, head, block->next)); + + if (block->used < sizeof(block->bl_data)/sizeof(block->bl_data[0])) { + (printf("adding record into block %d %d\n", head, data)); + block->bl_data[block->used] = data; + block->used++; + ibex_block_dirty(block); + return head; + } else { + new = ibex_block_get(store->blocks); + newblock = ibex_block_read(store->blocks, new); + newblock->next = block_number(head); + newblock->bl_data[0] = data; + newblock->used = 1; + d(printf("adding record into new %d %d, next =%d\n", new, data, newblock->next)); + ibex_block_dirty(newblock); + return new; + } +} + +static blockid_t +disk_add_blocks_internal(struct _IBEXStore *store, blockid_t *headptr, blockid_t *tailptr, GArray *data) +{ + blockid_t head = *headptr, tail = *tailptr, new; + int tocopy; + struct _tailblock *tailblock; + struct _block *block, *newblock; + int space, copied = 0, left; + + /* assumes this funciton is in control of any tail creation */ + g_assert(tail == 0); + + d(printf("Adding %d items to block list\n", data->len)); + + if (head == 0) { + head = ibex_block_get(store->blocks); + d(printf("allocating new head %d\n", head)); + } + block = ibex_block_read(store->blocks, head); + + /* ensure the first block is full before proceeding */ + space = sizeof(block->bl_data)/sizeof(block->bl_data[0]) - block->used; + if (space) { + tocopy = MIN(data->len, space); + memcpy(block->bl_data+block->used, &g_array_index(data, blockid_t, copied), tocopy*sizeof(blockid_t)); + block->used += tocopy; + ibex_block_dirty(block); + copied = tocopy; + d(printf("copied %d items to left over last node\n", tocopy)); + } + + while (copied < data->len) { + left = data->len - copied; + /* do we drop the rest in a tail? */ + if (left < TAIL_THRESHOLD) { + d(printf("Storing remaining %d items in tail\n", left)); + tocopy = left; + new = tail_get(store->blocks, tocopy); + tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(new)); + memcpy(&tailblock->tb_data[tailblock->tb_offset[TAIL_INDEX(new)]], + &g_array_index(data, blockid_t, copied), tocopy*sizeof(blockid_t)); + *tailptr = new; + } else { + new = ibex_block_get(store->blocks); + newblock = (struct _block *)ibex_block_read(store->blocks, new); + newblock->next = block_number(head); + tocopy = MIN(left, sizeof(block->bl_data)/sizeof(block->bl_data[0])); + d(printf("storing %d items in own block\n", tocopy)); + memcpy(newblock->bl_data, &g_array_index(data, blockid_t, copied), tocopy*sizeof(blockid_t)); + newblock->used = tocopy; + block = newblock; + head = new; + ibex_block_dirty(newblock); + } + copied += tocopy; + } + + *headptr = head; + return head; +} +/* + case 1: + no head, no tail, adding a lot of data + add blocks until the last, which goes in a tail node + case 2: + no head, no tail, adding a little data + just add a tail node + case 3: + no head, tail, total exceeds a block + merge as much as possible into new full blocks, then the remainder in the tail + case 4: + no head, tail, room in existing tail for data + add new data to tail node + case 5: + no head, tail, no room in existing tail for data + add a new tail node, copy data across, free old tail node +*/ + +static blockid_t +disk_add_list(struct _IBEXStore *store, blockid_t *headptr, blockid_t *tailptr, GArray *data) +{ + blockid_t new, head = *headptr, tail = *tailptr, *start; + struct _tailblock *tailblock, *tailnew; + int len; + GArray *tmpdata = NULL; + + d(printf("adding %d items head=%d tail=%d\n", data->len, head, tail)); + if (data->len == 0) + return head; + + /* store length=1 data in the tail pointer */ + if (head == 0 && tail == 0 && data->len == 1) { + *headptr = BLOCK_ONE; + *tailptr = g_array_index(data, blockid_t, 0); + return BLOCK_ONE; + } + /* if we got length=1 data to append to, upgrade it to a real block list */ + if (head == BLOCK_ONE) { + tmpdata = g_array_new(0, 0, sizeof(blockid_t)); + g_array_append_vals(tmpdata, data->data, data->len); + g_array_append_val(tmpdata, tail); + head = *headptr = 0; + tail = *tailptr = 0; + } + + /* if we have no head, then check the tail */ + if (head == 0) { + if (tail == 0) { + if (data->len >= TAIL_THRESHOLD) { + /* add normally */ + head = disk_add_blocks_internal(store, headptr, tailptr, data); + } else { + /* else add to a tail block */ + new = tail_get(store->blocks, data->len); + d(printf("adding %d items to a tail block %d\n", data->len, new)); + tailnew = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(new)); + memcpy(&tailnew->tb_data[tailnew->tb_offset[TAIL_INDEX(new)]], + data->data, data->len*sizeof(blockid_t)); + *tailptr = new; + ibex_block_dirty((struct _block *)tailnew); + } + } else { + tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail)); + len = tail_info(tailblock, tail, &start); + /* case 3 */ + if (len + data->len >= TAIL_THRESHOLD) { + /* this is suboptimal, but should work - merge the tail data with + our new data, and write it out */ + if (tmpdata == NULL) { + tmpdata = g_array_new(0, 0, sizeof(blockid_t)); + g_array_append_vals(tmpdata, data->data, data->len); + } + g_array_append_vals(tmpdata, start, len); + *tailptr = 0; + tail_free(store->blocks, tail); + head = disk_add_blocks_internal(store, headptr, tailptr, tmpdata); + } else if (tail_space(tailblock) >= data->len) { + /* can we just expand this in our node, or do we need a new one? */ + tail_compress(tailblock, TAIL_INDEX(tail), data->len + len); + memcpy(&tailblock->tb_data[tailblock->tb_offset[TAIL_INDEX(tail)] + len], + data->data, data->len * sizeof(blockid_t)); + ibex_block_dirty((struct _block *)tailblock); + } else { + /* we need to allocate a new tail node */ + new = tail_get(store->blocks, data->len + len); + /* and copy the data across */ + tailnew = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(new)); + memcpy(&tailnew->tb_data[tailnew->tb_offset[TAIL_INDEX(new)]], + start, len*sizeof(blockid_t)); + memcpy(&tailnew->tb_data[tailnew->tb_offset[TAIL_INDEX(new)]+len], + data->data, data->len*sizeof(blockid_t)); + tail_free(store->blocks, tail); + ibex_block_dirty((struct _block *)tailnew); + *tailptr = new; + } + } + } else { + if (tail) { + /* read/merge the tail with the new data, rewrite out. + suboptimal, but it should be 'ok' ? */ + tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail)); + len = tail_info(tailblock, tail, &start); + tmpdata = g_array_new(0, 0, sizeof(blockid_t)); + g_array_append_vals(tmpdata, start, len); + g_array_append_vals(tmpdata, data->data, data->len); + *tailptr = 0; + tail_free(store->blocks, tail); + head = disk_add_blocks_internal(store, headptr, tailptr, tmpdata); + } else { + head = disk_add_blocks_internal(store, headptr, tailptr, data); + } + } + if (tmpdata) + g_array_free(tmpdata, TRUE); + + return head; +} + +static blockid_t +disk_remove(struct _IBEXStore *store, blockid_t *headptr, blockid_t *tailptr, nameid_t data) +{ + blockid_t head = *headptr, node = *headptr, tail = *tailptr; + int i; + + /* special case for 1-item nodes */ + if (head == BLOCK_ONE) { + if (tail == data) { + *tailptr = 0; + *headptr = 0; + head = 0; + } + return head; + } + + d(printf("removing %d from %d\n", data, head)); + while (node) { + struct _block *block = ibex_block_read(store->blocks, node); + + for (i=0;i<block->used;i++) { + if (block->bl_data[i] == data) { + struct _block *start = ibex_block_read(store->blocks, head); + + d(printf("removing data from block %d\n ", head)); + + start->used--; + block->bl_data[i] = start->bl_data[start->used]; + if (start->used == 0) { + struct _root *root = (struct _root *)ibex_block_read(store->blocks, 0); + blockid_t new; + + d(printf("dropping block %d, new head = %d\n", head, start->next)); + new = block_location(start->next); + start->next = block_number(root->free); + root->free = head; + head = new; + ibex_block_dirty((struct _block *)root); + } + ibex_block_dirty(block); + ibex_block_dirty(start); + *headptr = head; + return head; + } + } + node = block_location(block->next); + } + + if (tail) { + struct _tailblock *tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail)); + int len; + blockid_t *start; + + len = tail_info(tailblock, tail, &start); + for (i=0;i<len;i++) { + if (start[i] == data) { + for (;i<len-1;i++) + start[i] = start[i+1]; + if (len == 1) + *tailptr = 0; + if (len == 1 && (tailblock->used-1) == TAIL_INDEX(tail)) { + d(printf("dropping/unlinking tailblock %d (%d) used = %d\n", + TAIL_BLOCK(tail), tail, tailblock->used)); + tailblock->used--; + /* drop/unlink block? */ + ibex_block_dirty((struct _block *)tailblock); + *tailptr = 0; + } else { + tail_compress(tailblock, TAIL_INDEX(tail), len-1); + ibex_block_dirty((struct _block *)tailblock); + } + } + } + + } + + return head; +} + +static void disk_free(struct _IBEXStore *store, blockid_t head, blockid_t tail) +{ + blockid_t next; + struct _block *block; + + if (head == BLOCK_ONE) + return; + + while (head) { + d(printf("freeing block %d\n", head)); + block = ibex_block_read(store->blocks, head); + next = block_location(block->next); + ibex_block_free(store->blocks, head); + head = next; + } + if (tail) { + struct _tailblock *tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail)); + d(printf("freeing tail block %d (%d)\n", TAIL_BLOCK(tail), tail)); + tail_compress(tailblock, TAIL_INDEX(tail), 0); + ibex_block_dirty((struct _block *)tailblock); + } +} + +static gboolean +disk_find(struct _IBEXStore *store, blockid_t head, blockid_t tail, nameid_t data) +{ + blockid_t node = head; + int i; + + d(printf("finding %d from %d\n", data, head)); + + if (head == BLOCK_ONE) + return data == tail; + + /* first check full blocks */ + while (node) { + struct _block *block = ibex_block_read(store->blocks, node); + + for (i=0;i<block->used;i++) { + if (block->bl_data[i] == data) { + return TRUE; + } + } + node = block_location(block->next); + } + + /* then check tail */ + if (tail) { + struct _tailblock *tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail)); + int len; + blockid_t *start; + + len = tail_info(tailblock, tail, &start); + for (i=0;i<len;i++) { + if (start[i] == data) + return TRUE; + } + } + + return FALSE; +} + +static GArray * +disk_get(struct _IBEXStore *store, blockid_t head, blockid_t tail) +{ + GArray *result = g_array_new(0, 0, sizeof(nameid_t)); + + if (head == BLOCK_ONE) { + g_array_append_val(result, tail); + return result; + } + + while (head) { + struct _block *block = ibex_block_read(store->blocks, head); + + d(printf("getting data from block %d\n", head)); + + g_array_append_vals(result, block->bl_data, block->used); + head = block_location(block->next); + d(printf("next = %d\n", head)); + } + + /* chuck on any tail data as well */ + if (tail) { + struct _tailblock *tailblock; + int len; + blockid_t *start; + + tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail)); + len = tail_info(tailblock, tail, &start); + g_array_append_vals(result, start, len); + } + return result; +} + +void +ibex_diskarray_dump(struct _memcache *blocks, blockid_t head, blockid_t tail) +{ + blockid_t node = head; + + printf("dumping list %d tail %d\n", node, tail); + if (head == BLOCK_ONE) { + printf(" 1 length index: %d\n", tail); + return; + } + + while (node) { + struct _block *block = ibex_block_read(blocks, node); + int i; + + printf(" block %d used %d\n ", node, block->used); + for (i=0;i<block->used;i++) { + printf(" %d", block->bl_data[i]); + } + printf("\n"); + node = block_location(block->next); + } + + printf("tail: "); + if (tail) { + struct _tailblock *tailblock; + int len; + blockid_t *start; + int i; + + tailblock = (struct _tailblock *)ibex_block_read(blocks, TAIL_BLOCK(tail)); + len = tail_info(tailblock, tail, &start); + for (i=0;i<len;i++) + printf(" %d", start[i]); + } + printf("\n"); +} + +#if 0 +int main(int argc, char **argv) +{ + struct _memcache *bc; + struct _IBEXStore *disk; + int i; + blockid_t data[100]; + GArray adata; + blockid_t head=0, tail=0; + + for (i=0;i<100;i++) { + data[i] = i; + } + + bc = ibex_block_cache_open("index.db", O_CREAT|O_RDWR, 0600); + disk = ibex_diskarray_class.create(bc); + + head = 0; + tail = 0; + adata.data = data; + adata.len = 70; + for (i=0;i<100;i++) { + printf("Loop %d\n", i); + ibex_diskarray_class.add_list(disk, &head, &tail, &adata); + ibex_diskarray_dump(bc, head, tail); + } + +#if 0 + for (i=1;i<100;i++) { + printf("inserting %d items\n", i); + adata.data = data; + adata.len = i; + head=0; + tail=0; + ibex_diskarray_class.add_list(disk, &head, &tail, &adata); + ibex_diskarray_dump(bc, head, tail); + } +#endif + ibex_diskarray_class.close(disk); + ibex_block_cache_close(bc); + return 0; +} + +#endif diff --git a/libibex/file.c b/libibex/file.c deleted file mode 100644 index 70b7ed96cb..0000000000 --- a/libibex/file.c +++ /dev/null @@ -1,481 +0,0 @@ -/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ -/* - * Copyright (C) 2000 Helix Code, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Library General Public License - * as published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Library General Public License for more details. - * - * You should have received a copy of the GNU Library General Public - * License along with the Gnome Library; see the file COPYING.LIB. If not, - * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 02111-1307, USA. - */ - -/* file.c: index file read/write ops */ - -#include <ctype.h> -#include <errno.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <unistd.h> - -#include "ibex_internal.h" - -static unsigned long read_number (FILE *f); -static void write_number (FILE *f, unsigned long n); -static char *get_compressed_word (FILE *f, char **lastword); - -static gint free_file (gpointer key, gpointer value, gpointer data); -static void free_word (gpointer key, gpointer value, gpointer data); - -/* The file format is: - * - * version string (currently "ibex1") - * file count - * list of compressed filenames, separated by \0 - * word count - * list of compressed words, each followed by \0, a count, and that - * many references. - * - * All numbers are stored 7-bit big-endian, with the high bit telling - * whether or not the number continues to the next byte. - * - * compressed text consists of a byte telling how many characters the - * line has in common with the line before it, followed by the rest of - * the string. Obviously this only really works if the lists are sorted. - */ - -/** - * ibex_open: open (or possibly create) an ibex index - * @file: the name of the file - * @flags: open flags, see open(2). - * @mode: If O_CREAT is passed in flags, then the file mode - * to create the new file with. It will be anded with the current - * umask. - * - * Open and/or create the named ibex file and return a handle to it. - * - * Return value: an ibex handle, or NULL if an error occurred. - **/ -ibex * -ibex_open (char *file, int flags, int mode) -{ - ibex *ib; - FILE *f; - char vbuf[sizeof (IBEX_VERSION) - 1]; - char *word, *lastword; - unsigned long nfiles, nwords, nrefs, ref; - ibex_file **ibfs = NULL; - int i; - GPtrArray *refs; - int fd; - char *modestr; - - fd = open(file, flags, mode); - if (fd == -1) { - return NULL; - } - - /* yuck, this is because we use FILE * interface - internally */ - switch (flags & O_ACCMODE) { - case O_RDONLY: - modestr = "r"; - break; - case O_RDWR: - if (flags & O_APPEND) - modestr = "a+"; - else - modestr = "w+"; - break; - case O_WRONLY: - if (flags & O_APPEND) - modestr = "a"; - else - modestr = "w"; - break; - default: - if (flags & O_APPEND) - modestr = "a+"; - else - modestr = "r+"; - break; - } - - f = fdopen(fd, modestr); - if (f == NULL) { - if (errno == 0) - errno = ENOMEM; - close(fd); - return NULL; - } - - ib = g_malloc (sizeof (ibex)); - ib->dirty = FALSE; - ib->path = g_strdup (file); - ib->files = g_tree_new ((GCompareFunc) strcmp); - ib->words = g_hash_table_new (g_str_hash, g_str_equal); - ib->oldfiles = g_ptr_array_new (); - - if (!f) { - close(fd); - return ib; - } - - /* Check version. If its empty, then we have just created it */ - if (fread (vbuf, 1, sizeof (vbuf), f) != sizeof (vbuf)) { - if (feof (f)) { - fclose(f); - close(fd); - return ib; - } - } - if (strncmp (vbuf, IBEX_VERSION, sizeof (vbuf) != 0)) { - errno = EINVAL; - goto errout; - } - - /* Read list of files. */ - nfiles = read_number (f); - ibfs = g_malloc (nfiles * sizeof (ibex_file *)); - lastword = NULL; - for (i = 0; i < nfiles; i++) { - ibfs[i] = g_malloc (sizeof (ibex_file)); - ibfs[i]->name = get_compressed_word (f, &lastword); - if (!ibfs[i]->name) { - goto errout; - } - ibfs[i]->index = 0; - g_tree_insert (ib->files, ibfs[i]->name, ibfs[i]); - } - - /* Read list of words. */ - nwords = read_number (f); - lastword = NULL; - for (i = 0; i < nwords; i++) { - word = get_compressed_word (f, &lastword); - if (!word) { - goto errout; - } - - nrefs = read_number (f); - refs = g_ptr_array_new (); - g_ptr_array_set_size (refs, nrefs); - while (nrefs--) { - ref = read_number (f); - if (ref >= nfiles) { - goto errout; - } - refs->pdata[nrefs] = ibfs[ref]; - } - - g_hash_table_insert (ib->words, word, refs); - } - - g_free (ibfs); - fclose (f); - close(fd); - return ib; - -errout: - - fclose (f); - close(fd); - g_tree_traverse (ib->files, free_file, G_IN_ORDER, NULL); - g_tree_destroy (ib->files); - g_hash_table_foreach (ib->words, free_word, NULL); - g_hash_table_destroy (ib->words); - g_ptr_array_free (ib->oldfiles, TRUE); - if (ibfs) - g_free (ibfs); - g_free (ib->path); - g_free (ib); - - return NULL; -} - -struct ibex_write_data { - unsigned long index; - FILE *f; - char *lastname; -}; - -/* This is an internal function to find the longest common initial - * prefix between the last-written word and the current word. - */ -static int -get_prefix (struct ibex_write_data *iwd, char *name) -{ - int i = 0; - if (iwd->lastname) { - while (!strncmp (iwd->lastname, name, i + 1)) - i++; - } - iwd->lastname = name; - return i; -} - -static gint -write_file (gpointer key, gpointer value, gpointer data) -{ - char *file = key; - ibex_file *ibf = value; - struct ibex_write_data *iwd = data; - int prefix; - - ibf->index = iwd->index++; - prefix = get_prefix (iwd, file); - fprintf (iwd->f, "%c%s", prefix, file + prefix); - fputc (0, iwd->f); - return FALSE; -} - -/* scans for words which still exist in the index (after - index removals), and adds them to the ordered tree for - writing out in order */ -static void -store_word (gpointer key, gpointer value, gpointer data) -{ - GTree *wtree = data; - GPtrArray *refs = value; - int i; - ibex_file *ibf; - - for (i = 0; i < refs->len; i++) { - ibf = g_ptr_array_index (refs, i); - if (ibf->index == -1) { - g_ptr_array_remove_index_fast (refs, i); - i--; - } - } - - if (refs->len > 0) { - g_tree_insert (wtree, key, value); - } -} - -/* writes a word out, in order */ -static gint -write_word (gpointer key, gpointer value, gpointer data) -{ - char *word = key; - GPtrArray *refs = value; - struct ibex_write_data *iwd = data; - int i, prefix; - ibex_file *ibf; - - prefix = get_prefix (iwd, word); - fprintf (iwd->f, "%c%s", prefix, word + prefix); - fputc (0, iwd->f); - - write_number (iwd->f, refs->len); - - for (i = 0; i < refs->len; i++) { - ibf = g_ptr_array_index (refs, i); - write_number (iwd->f, ibf->index); - } - return FALSE; -} - -/** - * ibex_write: Write an ibex out to disk. - * @ib: the ibex - * - * This writes an ibex to disk. - * - * Return value: 0 for success, -1 for failure (in which case errno - * is set). - **/ -int -ibex_write (ibex *ib) -{ - struct ibex_write_data iwd; - GTree *wtree; - char *tmpfile; - - tmpfile = g_strdup_printf ("%s~", ib->path); - iwd.f = fopen (tmpfile, "w"); - if (!iwd.f) { - if (errno == 0) - errno = ENOMEM; - g_free (tmpfile); - return -1; - } - - fputs (IBEX_VERSION, iwd.f); - if (ferror (iwd.f)) - goto lose; - - iwd.index = 0; - iwd.lastname = NULL; - write_number (iwd.f, g_tree_nnodes (ib->files)); - if (ferror (iwd.f)) - goto lose; - g_tree_traverse (ib->files, write_file, G_IN_ORDER, &iwd); - if (ferror (iwd.f)) - goto lose; - - iwd.lastname = NULL; - wtree = g_tree_new ((GCompareFunc) strcmp); - g_hash_table_foreach (ib->words, store_word, wtree); - write_number (iwd.f, g_tree_nnodes(wtree)); - if (ferror (iwd.f)) - goto lose; - g_tree_traverse (wtree, write_word, G_IN_ORDER, &iwd); - g_tree_destroy (wtree); - if (ferror (iwd.f)) - goto lose; - - if (fclose (iwd.f) == 0 && rename (tmpfile, ib->path) == 0) { - g_free (tmpfile); - ib->dirty = FALSE; - return 0; - } - -lose: - unlink (tmpfile); - g_free (tmpfile); - return -1; -} - -/** - * ibex_save: - * @ib: - * - * Only write out an ibex if it is dirty. - * - * Return value: Same as ibex_write. - **/ -int -ibex_save (ibex *ib) -{ - if (ib->dirty) - return ibex_write(ib); - return 0; -} - -/** - * ibex_close: Write out the ibex file (if it has changed) and free - * the data associated with it. - * @ib: the ibex - * - * If this ibex file has been modified since it was opened, this will - * call ibex_write() to write it out to disk. It will then free all data - * associated with the ibex. After calling ibex_close(), @ib will no - * longer be a valid ibex. - * - * Return value: 0 on success, -1 on an ibex_write() failure (in which - * case @ib will not be destroyed). - **/ -int -ibex_close (ibex *ib) -{ - ibex_file *ibf; - - if (ib->dirty && ibex_write (ib) == -1) - return -1; - - g_tree_traverse (ib->files, free_file, G_IN_ORDER, NULL); - g_tree_destroy (ib->files); - g_hash_table_foreach (ib->words, free_word, NULL); - g_hash_table_destroy (ib->words); - - while (ib->oldfiles->len) { - ibf = g_ptr_array_remove_index (ib->oldfiles, 0); - g_free (ibf->name); - g_free (ibf); - } - g_ptr_array_free (ib->oldfiles, TRUE); - g_free (ib->path); - g_free (ib); - - return 0; -} - -static gint -free_file (gpointer key, gpointer value, gpointer data) -{ - ibex_file *ibf = value; - - g_free (ibf->name); - g_free (ibf); - return FALSE; -} - -static void -free_word (gpointer key, gpointer value, gpointer data) -{ - g_free (key); - g_ptr_array_free (value, TRUE); -} - -static char * -get_compressed_word (FILE *f, char **lastword) -{ - char *buf, *p; - int c, size; - - c = getc (f); - if (c == EOF) - return NULL; - - size = c + 10; - buf = g_malloc (size); - if (*lastword) - strncpy (buf, *lastword, c); - p = buf + c; - do { - c = getc (f); - if (c == EOF) - return NULL; - if (p == buf + size) { - buf = g_realloc (buf, size + 10); - p = buf + size; - size += 10; - } - *p++ = c; - } while (c != 0); - - *lastword = buf; - return buf; -} - -static void -write_number (FILE *f, unsigned long number) -{ - int i, flag = 0; - char buf[4]; - - i = 4; - do { - buf[--i] = (number & 0x7F) | flag; - number = number >> 7; - flag = 0x80; - } while (number != 0); - - fwrite (buf + i, 1, 4 - i, f); -} - -static unsigned long -read_number (FILE *f) -{ - int byte; - unsigned long num; - - num = 0; - do { - byte = getc (f); - num = num << 7 | (byte & 0x7F); - } while (byte & 0x80); - - return num; -} - diff --git a/libibex/find.c b/libibex/find.c deleted file mode 100644 index 6b9815f0c0..0000000000 --- a/libibex/find.c +++ /dev/null @@ -1,198 +0,0 @@ -/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ -/* - * Copyright (C) 2000 Helix Code, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Library General Public License - * as published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Library General Public License for more details. - * - * You should have received a copy of the GNU Library General Public - * License along with the Gnome Library; see the file COPYING.LIB. If not, - * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 02111-1307, USA. - */ - -/* find.c: index file searching ops */ - -#include <string.h> - -#include "ibex_internal.h" - -/** - * ibex_find: search an ibex for a word - * @ib: an ibex - * @word: the word - * - * This routine searches an ibex for a word and returns a GPtrArray - * containing the names of the files in the ibex that contain the word. - * If no matches are found, it will return an empty array (not NULL). - * The caller must free the array, but MUST NOT free or alter its - * elements. - * - * Return value: the array of filenames containing @word - **/ -GPtrArray * -ibex_find (ibex *ib, char *word) -{ - GPtrArray *refs, *ret; - ibex_file *ibf; - int i; - - ret = g_ptr_array_new (); - refs = g_hash_table_lookup (ib->words, word); - if (refs) { - for (i = 0; i < refs->len; i++) { - ibf = g_ptr_array_index (refs, i); - g_ptr_array_add (ret, ibf->name); - } - } - return ret; -} - -/** - * ibex_contains_name: - * @ib: - * @name: - * - * Returns #TRUE if the ibex @ib has any index entry for - * the key @name. - * - * Return value: - **/ -gboolean -ibex_contains_name(ibex *ib, char *name) -{ - return g_tree_lookup(ib->files, name) != NULL; -} - -/** - * ibex_find_name: Check if a word occurs in a given file - * @ib: an ibex - * @name: a filename - * @word: a word - * - * This checks if the given word occurs in the given file. - * - * Return value: TRUE or FALSE - **/ -gboolean -ibex_find_name (ibex *ib, char *name, char *word) -{ - GPtrArray *refs; - ibex_file *ibf; - int i; - - refs = g_hash_table_lookup (ib->words, word); - if (refs) { - for (i = 0; i < refs->len; i++) { - ibf = g_ptr_array_index (refs, i); - if (!strcmp (ibf->name, name)) - return TRUE; - } - } - return FALSE; -} - -static gint -build_array (gpointer key, gpointer value, gpointer data) -{ - char *name = key; - unsigned int count = GPOINTER_TO_UINT (value); - GPtrArray *ret = data; - - if (count == 1) - g_ptr_array_add (ret, name); - return FALSE; -} - -/** - * ibex_find_all: Find files containing multiple words - * @ib: an ibex - * @words: a GPtrArray of words - * - * This works like ibex_find(), but returns an array of filenames - * which contain all of the words in @words. - * - * Return value: an array of matches - **/ -GPtrArray * -ibex_find_all (ibex *ib, GPtrArray *words) -{ - GTree *work; - GPtrArray *wrefs, *ret; - int i, j, count; - char *word; - ibex_file *ibf; - - if (words->len == 0) - return g_ptr_array_new (); - else if (words->len == 1) - return ibex_find (ib, g_ptr_array_index (words, 0)); - - work = g_tree_new ((GCompareFunc) strcmp); - for (i = 0; i < words->len; i++) { - word = g_ptr_array_index (words, i); - wrefs = g_hash_table_lookup (ib->words, word); - if (!wrefs) { - /* One of the words isn't even in the index. */ - g_tree_destroy (work); - return g_ptr_array_new (); - } - - if (i == 0) { - /* Copy the references into a tree, using the - * filenames as keys and the size of words as - * the value. - */ - for (j = 0; j < wrefs->len; j++) { - ibf = g_ptr_array_index (wrefs, j); - g_tree_insert (work, ibf->name, - GUINT_TO_POINTER (words->len)); - } - } else { - /* Increment the counts in the working tree - * for the references for this word. - */ - for (j = 0; j < wrefs->len; j++) { - ibf = g_ptr_array_index (wrefs, j); - count = GPOINTER_TO_UINT (g_tree_lookup (work, ibf->name)); - if (count) { - g_tree_insert (work, ibf->name, - GUINT_TO_POINTER (count - 1)); - } - } - } - } - - /* Build an array with the refs that contain all the words. */ - ret = g_ptr_array_new (); - g_tree_traverse (work, build_array, G_IN_ORDER, ret); - g_tree_destroy (work); - return ret; -} - -static void -ibex_dump_foo(char *key, GPtrArray *refs, void *data) -{ - int i; - - g_print("%s: ", key); - for (i=0;i<refs->len;i++) { - ibex_file *ibf = g_ptr_array_index (refs, i); - g_print("%c%s", ibf->index==-1?'-':' ', ibf->name); - } - g_print("\n"); -} - -/* debug function to dump the tree, in key order */ -void -ibex_dump_all (ibex *ib) -{ - g_hash_table_foreach(ib->words, (GHFunc) ibex_dump_foo, 0); -} diff --git a/libibex/hash.c b/libibex/hash.c new file mode 100644 index 0000000000..02e01a1ae0 --- /dev/null +++ b/libibex/hash.c @@ -0,0 +1,701 @@ +/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- + * + * Copyright (C) 2000 Helix Code, Inc. + * + * Authors: Michael Zucchi <notzed@helixcode.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public License + * as published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with the Gnome Library; see the file COPYING.LIB. If not, + * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* hash based index mechanism */ + +#include <stdio.h> + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> +#include <string.h> + +#include "block.h" +#include "index.h" + +#define d(x) + +#define HASH_SIZE (1024) +#define KEY_THRESHOLD (sizeof(struct _hashkey) + 4) /* minimum number of free bytes we worry about + maintaining free blocks for */ +#define ARRAY_LEN(a) (sizeof(a)/sizeof(a[0])) + +typedef guint32 hashid_t; + +static struct _IBEXIndex *hash_create(struct _memcache *bc, int size); +static struct _IBEXIndex *hash_open(struct _memcache *bc, blockid_t root); +static int hash_sync(struct _IBEXIndex *index); +static int hash_close(struct _IBEXIndex *index); + +static hashid_t hash_find(struct _IBEXIndex *index, const char *key, int keylen); +static void hash_remove(struct _IBEXIndex *index, const char *key, int keylen); +static hashid_t hash_insert(struct _IBEXIndex *index, const char *key, int keylen); +static char *hash_get_key(struct _IBEXIndex *index, hashid_t hashbucket, int *len); +static void hash_set_data_block(struct _IBEXIndex *index, hashid_t keyid, blockid_t blockid, blockid_t tail); +static blockid_t hash_get_data_block(struct _IBEXIndex *index, hashid_t keyid, blockid_t *tail); + +struct _IBEXIndexClass ibex_hash_class = { + hash_create, hash_open, + hash_sync, hash_close, + hash_find, + hash_remove, + hash_insert, + hash_get_key, + hash_set_data_block, + hash_get_data_block, +}; + +/* the reason we have the tail here is that otherwise we need to + have a 32 bit blockid for the root node; which would make this + structure the same size anyway, with about 24 wasted bits */ +struct _hashkey { + blockid_t next; /* next in hash chain */ + blockid_t tail; + unsigned int root:32-BLOCK_BITS; + unsigned int keyoffset:BLOCK_BITS; +}; + +struct _hashblock { + /*blockid_t next;*/ /* all key blocks linked together? */ + guint32 used; /* elements used */ + union { + struct _hashkey keys[(BLOCK_SIZE-4)/sizeof(struct _hashkey)]; + char keydata[BLOCK_SIZE-4]; + } hashblock_u; +}; +#define hb_keys hashblock_u.keys +#define hb_keydata hashblock_u.keydata + +/* root block for a hash index */ +struct _hashroot { + hashid_t free; /* free list */ + guint32 size; /* how big the hash table is */ + hashid_t table[(BLOCK_SIZE-8)/sizeof(hashid_t)]; /* pointers to blocks of pointers */ +}; + +struct _hashtableblock { + hashid_t buckets[BLOCK_SIZE/sizeof(hashid_t)]; +}; + +/* map a hash index to a block index */ +#define HASH_INDEX(b) ((b) & (BLOCK_SIZE-1)) +/* map a hash index to a block number */ +#define HASH_BLOCK(b) ((b) & ~(BLOCK_SIZE-1)) +/* map a block + index to a hash key */ +#define HASH_KEY(b, i) (((b) & ~(BLOCK_SIZE-1)) | ((i) & (BLOCK_SIZE-1))) + +/* taken from tdb/gdbm */ +static unsigned int hash_key(const unsigned char *key, int keylen) +{ + char *newkey; + newkey = alloca(keylen+1); + memcpy(newkey, key, keylen); + newkey[keylen]=0; + return g_str_hash(newkey); +#if 0 + unsigned int value; /* Used to compute the hash value. */ + unsigned int i; /* Used to cycle through random values. */ + + /* Set the initial value from the key size. */ + value = 0x238F13AF * keylen; + for (i=0; i < keylen; i++) { + value = (value + (key[i] << (i*5 % 24))); + } + + value = (1103515243 * value + 12345); + + return value; +#endif +} + +/* create a new hash table, return a pointer to its root block */ +static struct _IBEXIndex * +hash_create(struct _memcache *bc, int size) +{ + blockid_t root, block; + struct _hashroot *hashroot; + int i; + struct _hashtableblock *table; + struct _IBEXIndex *index; + + g_assert(size<=10240); + + d(printf("initialising hash table, size = %d\n", size)); + + index = g_malloc0(sizeof(*index)); + index->blocks = bc; + index->klass = &ibex_hash_class; + root = ibex_block_get(bc); + index->root = root; + d(printf(" root = %d\n", root)); + hashroot = (struct _hashroot *)ibex_block_read(bc, root); + hashroot->free = 0; + hashroot->size = size; + ibex_block_dirty((struct _block *)hashroot); + for (i=0;i<size/(BLOCK_SIZE/sizeof(blockid_t));i++) { + d(printf("initialising hash table index block %d\n", i)); + block = hashroot->table[i] = ibex_block_get(bc); + table = (struct _hashtableblock *)ibex_block_read(bc, block); + memset(table, 0, sizeof(table)); + ibex_block_dirty((struct _block *)table); + } + + return index; +} + +static struct _IBEXIndex * +hash_open(struct _memcache *bc, blockid_t root) +{ + struct _IBEXIndex *index; + + /* FIXME: check a 'magic', and the root for validity */ + + index = g_malloc0(sizeof(*index)); + index->blocks = bc; + index->root = root; + index->klass = &ibex_hash_class; + + return index; +} + + +static int hash_sync(struct _IBEXIndex *index) +{ + /* nop, index always synced on disk (at least, to blocks) */ + return 0; +} + +static int hash_close(struct _IBEXIndex *index) +{ + g_free(index); + return 0; +} + +/* convert a hashbucket id into a name */ +static char * +hash_get_key(struct _IBEXIndex *index, hashid_t hashbucket, int *len) +{ + struct _hashblock *bucket; + int ind; + char *ret, *start, *end; + + if (hashbucket == 0) { + if (len) + *len = 0; + return g_strdup(""); + } + + bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashbucket)); + ind = HASH_INDEX(hashbucket); + + g_assert(ind < bucket->used); + + start = &bucket->hb_keydata[bucket->hb_keys[ind].keyoffset]; + if (ind == 0) { + end = &bucket->hb_keydata[sizeof(bucket->hb_keydata)/sizeof(bucket->hb_keydata[0])]; + } else { + end = &bucket->hb_keydata[bucket->hb_keys[ind-1].keyoffset]; + } + + ret = g_malloc(end-start+1); + memcpy(ret, start, end-start); + ret[end-start]=0; + if (len) + *len = end-start; + return ret; +} + +/* sigh, this is fnugly code ... */ +static hashid_t +hash_find(struct _IBEXIndex *index, const char *key, int keylen) +{ + struct _hashroot *hashroot; + guint32 hash; + int hashentry; + blockid_t hashtable; + hashid_t hashbucket; + struct _hashtableblock *table; + + g_assert(index != 0); + g_assert(index->root != 0); + + d(printf("finding hash %.*s\n", keylen, key)); + + hashroot = (struct _hashroot *)ibex_block_read(index->blocks, index->root); + + /* find the table containing this entry */ + hash = hash_key(key, keylen) % hashroot->size; + hashtable = hashroot->table[hash / (BLOCK_SIZE/sizeof(blockid_t))]; + g_assert(hashtable != 0); + table = (struct _hashtableblock *)ibex_block_read(index->blocks, hashtable); + hashentry = hash % (BLOCK_SIZE/sizeof(blockid_t)); + /* and its bucket */ + hashbucket = table->buckets[hashentry]; + + /* go down the bucket chain, reading each entry till we are done ... */ + while (hashbucket != 0) { + struct _hashblock *bucket; + char *start, *end; + int ind; + + d(printf(" checking bucket %d\n", hashbucket)); + + /* get the real bucket id from the hashbucket id */ + bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashbucket)); + /* and get the key number within the block */ + ind = HASH_INDEX(hashbucket); + + g_assert(ind < bucket->used); + + start = &bucket->hb_keydata[bucket->hb_keys[ind].keyoffset]; + if (ind == 0) { + end = &bucket->hb_keydata[sizeof(bucket->hb_keydata)/sizeof(bucket->hb_keydata[0])]; + } else { + end = &bucket->hb_keydata[bucket->hb_keys[ind-1].keyoffset]; + } + if ( (end-start) == keylen + && memcmp(start, key, keylen) == 0) { + return hashbucket; + } + hashbucket = bucket->hb_keys[ind].next; + } + return 0; +} + +/* compresses the bucket 'bucket', removing data + at index 'index' */ +static void +hash_compress(struct _hashblock *bucket, int index) +{ + int i; + char *start, *end, *newstart; + + /* get start/end of area to zap */ + start = &bucket->hb_keydata[bucket->hb_keys[index].keyoffset]; + if (index == 0) { + end = &bucket->hb_keydata[sizeof(bucket->hb_keydata)/sizeof(bucket->hb_keydata[0])]; + } else { + end = &bucket->hb_keydata[bucket->hb_keys[index-1].keyoffset]; + } + + if (start == end) + return; + + /* fixup data */ + newstart = &bucket->hb_keydata[bucket->hb_keys[bucket->used-1].keyoffset]; + memmove(newstart+(end-start), newstart, start-newstart); + + /* fixup key pointers */ + for (i=index;i<bucket->used;i++) { + bucket->hb_keys[i].keyoffset += (end-start); + } + ibex_block_dirty((struct _block *)bucket); +} + +/* make room 'len' for the key 'index' */ +/* assumes key 'index' is already empty (0 length) */ +static void +hash_expand(struct _hashblock *bucket, int index, int len) +{ + int i; + char *end, *newstart; + + /* get start/end of area to zap */ + if (index == 0) { + end = &bucket->hb_keydata[sizeof(bucket->hb_keydata)/sizeof(bucket->hb_keydata[0])]; + } else { + end = &bucket->hb_keydata[bucket->hb_keys[index-1].keyoffset]; + } + + /* fixup data */ + newstart = &bucket->hb_keydata[bucket->hb_keys[bucket->used-1].keyoffset]; + memmove(newstart-len, newstart, end-newstart); + + /* fixup key pointers */ + for (i=index;i<bucket->used;i++) { + bucket->hb_keys[i].keyoffset -= len; + } + ibex_block_dirty((struct _block *)bucket); +} + +static void +hash_remove(struct _IBEXIndex *index, const char *key, int keylen) +{ + struct _hashroot *hashroot; + guint32 hash; + int hashentry; + blockid_t hashtable; + hashid_t hashbucket, hashprev; + struct _hashtableblock *table; + + g_assert(index != 0); + g_assert(index->root != 0); + + d(printf("removing hash %.*s\n", keylen, key)); + + hashroot = (struct _hashroot *)ibex_block_read(index->blocks, index->root); + + /* find the table containing this entry */ + hash = hash_key(key, keylen) % hashroot->size; + hashtable = hashroot->table[hash / (BLOCK_SIZE/sizeof(blockid_t))]; + table = (struct _hashtableblock *)ibex_block_read(index->blocks, hashtable); + hashentry = hash % (BLOCK_SIZE/sizeof(blockid_t)); + /* and its bucket */ + hashbucket = table->buckets[hashentry]; + + /* go down the bucket chain, reading each entry till we are done ... */ + hashprev = 0; + while (hashbucket != 0) { + struct _hashblock *bucket; + char *start, *end; + int ind; + + d(printf(" checking bucket %d\n", hashbucket)); + + /* get the real bucket id from the hashbucket id */ + bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashbucket)); + /* and get the key number within the block */ + ind = HASH_INDEX(hashbucket); + + g_assert(ind < bucket->used); + + start = &bucket->hb_keydata[bucket->hb_keys[ind].keyoffset]; + if (ind == 0) { + end = &bucket->hb_keydata[sizeof(bucket->hb_keydata)/sizeof(bucket->hb_keydata[0])]; + } else { + end = &bucket->hb_keydata[bucket->hb_keys[ind-1].keyoffset]; + } + if ( (end-start) == keylen + && memcmp(start, key, keylen) == 0) { + struct _hashblock *prevbucket; + + if (hashprev == 0) { + /* unlink from hash chain */ + table->buckets[hashentry] = bucket->hb_keys[HASH_INDEX(hashbucket)].next; + /* link into free list */ + bucket->hb_keys[HASH_INDEX(hashbucket)].next = hashroot->free; + hashroot->free = hashbucket; + /* compress away data */ + hash_compress(bucket, HASH_INDEX(hashbucket)); + ibex_block_dirty((struct _block *)bucket); + ibex_block_dirty((struct _block *)table); + ibex_block_dirty((struct _block *)hashroot); + } else { + prevbucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashprev)); + prevbucket->hb_keys[HASH_INDEX(hashprev)].next = + bucket->hb_keys[ind].next; + /* link into free list */ + bucket->hb_keys[ind].next = hashroot->free; + hashroot->free = hashbucket; + /* compress entry */ + hash_compress(bucket, ind); + ibex_block_dirty((struct _block *)bucket); + ibex_block_dirty((struct _block *)prevbucket); + ibex_block_dirty((struct _block *)hashroot); + } + return; + } + hashprev = hashbucket; + hashbucket = bucket->hb_keys[ind].next; + } +} + +/* set where the datablock is located */ +static void +hash_set_data_block(struct _IBEXIndex *index, hashid_t keyid, blockid_t blockid, blockid_t tail) +{ + struct _hashblock *bucket; + + d(printf("setting data block hash %d to %d tail %d\n", keyid, blockid, tail)); + + /* map to a block number */ + g_assert((blockid & (BLOCK_SIZE-1)) == 0); + blockid >>= BLOCK_BITS; + + bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(keyid)); + if (bucket->hb_keys[HASH_INDEX(keyid)].root != blockid + || bucket->hb_keys[HASH_INDEX(keyid)].tail != tail) { + bucket->hb_keys[HASH_INDEX(keyid)].tail = tail; + bucket->hb_keys[HASH_INDEX(keyid)].root = blockid; + ibex_block_dirty((struct _block *)bucket); + } +} + +static blockid_t +hash_get_data_block(struct _IBEXIndex *index, hashid_t keyid, blockid_t *tail) +{ + struct _hashblock *bucket; + + d(printf("getting data block hash %d\n", keyid)); + + if (keyid == 0) { + if (tail) + *tail = 0; + return 0; + } + + bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(keyid)); + if (tail) + *tail = bucket->hb_keys[HASH_INDEX(keyid)].tail; + return bucket->hb_keys[HASH_INDEX(keyid)].root << BLOCK_BITS; +} + +static hashid_t +hash_insert(struct _IBEXIndex *index, const char *key, int keylen) +{ + struct _hashroot *hashroot; + guint32 hash; + int hashentry; + blockid_t hashtable; + hashid_t hashbucket, keybucket, keyprev, keyfree; + struct _hashtableblock *table; + struct _hashblock *bucket; + int count; + + g_assert(index != 0); + g_assert(index->root != 0); + + d(printf("inserting hash %.*s\n", keylen, key)); + + hashroot = (struct _hashroot *)ibex_block_read(index->blocks, index->root); + + /* find the table containing this entry */ + hash = hash_key(key, keylen) % hashroot->size; + hashtable = hashroot->table[hash / (BLOCK_SIZE/sizeof(blockid_t))]; + table = (struct _hashtableblock *)ibex_block_read(index->blocks, hashtable); + hashentry = hash % (BLOCK_SIZE/sizeof(blockid_t)); + /* and its bucket */ + hashbucket = table->buckets[hashentry]; + + /* now look for a free slot, first try the free list */ + /* but dont try too hard if our key is just too long ... so just scan upto + 4 blocks, but if we dont find a space, tough ... */ + keybucket = hashroot->free; + keyprev = 0; + count = 0; + while (keybucket && count<4) { + int space; + + d(printf(" checking free %d\n", keybucket)); + + /* read the bucket containing this free key */ + bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(keybucket)); + + /* check if there is enough space for the key */ + space = &bucket->hb_keydata[bucket->hb_keys[bucket->used-1].keyoffset] + - (char *)&bucket->hb_keys[bucket->used]; + if (space >= keylen) { + hash_expand(bucket, HASH_INDEX(keybucket), keylen); + memcpy(&bucket->hb_keydata[bucket->hb_keys[HASH_INDEX(keybucket)].keyoffset], + key, keylen); + + /* check if there is free space still in this node, and there are no other empty blocks */ + keyfree = bucket->hb_keys[HASH_INDEX(keybucket)].next; + if ((space-keylen) >= KEY_THRESHOLD) { + int i; + int head = ARRAY_LEN(bucket->hb_keydata); + int found = FALSE; + + for (i=0;i<bucket->used;i++) { + if (bucket->hb_keys[i].keyoffset == head) { + /* already have a free slot in this block, leave it */ + found = TRUE; + break; + } + head = bucket->hb_keys[i].keyoffset; + } + if (!found) { + /* we should link in a new free slot for this node */ + bucket->hb_keys[bucket->used].next = bucket->hb_keys[HASH_INDEX(keybucket)].next; + bucket->hb_keys[bucket->used].keyoffset = bucket->hb_keys[bucket->used-1].keyoffset; + keyfree = HASH_KEY(HASH_BLOCK(keybucket), bucket->used); + bucket->used++; + } + } + /* link 'keyfree' back to the parent ... */ + if (keyprev == 0) { + hashroot->free = keyfree; + ibex_block_dirty((struct _block *)hashroot); + } else { + struct _hashblock *prevbucket; + prevbucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(keyprev)); + prevbucket->hb_keys[HASH_INDEX(keyprev)].next = keyfree; + ibex_block_dirty((struct _block *)prevbucket); + } + + /* link into the hash chain */ + bucket->hb_keys[HASH_INDEX(keybucket)].next = hashbucket; + bucket->hb_keys[HASH_INDEX(keybucket)].root = 0; + bucket->hb_keys[HASH_INDEX(keybucket)].tail = 0; + table->buckets[hashentry] = keybucket; + ibex_block_dirty((struct _block *)table); + ibex_block_dirty((struct _block *)bucket); + + d(printf(" new key id %d\n", keybucket)); + d(printf(" new free id %d\n", hashroot->free)); + + return keybucket; + } + + count++; + keyprev = keybucket; + keybucket = bucket->hb_keys[HASH_INDEX(keybucket)].next; + } + + /* else create a new block ... */ + keybucket = ibex_block_get(index->blocks); + bucket = (struct _hashblock *)ibex_block_read(index->blocks, keybucket); + + d(printf("creating new key bucket %d\n", keybucket)); + + memset(bucket, 0, sizeof(*bucket)); + + bucket->used = 2; + /* first block, is the new key */ + bucket->hb_keys[0].keyoffset = ARRAY_LEN(bucket->hb_keydata) - keylen; + memcpy(&bucket->hb_keydata[bucket->hb_keys[0].keyoffset], key, keylen); + bucket->hb_keys[0].next = hashbucket; + bucket->hb_keys[0].root = 0; + bucket->hb_keys[0].tail = 0; + + table->buckets[hashentry] = HASH_KEY(keybucket, 0); + + /* next block is a free block, link into free list */ + bucket->hb_keys[1].keyoffset = bucket->hb_keys[0].keyoffset; + bucket->hb_keys[1].next = hashroot->free; + hashroot->free = HASH_KEY(keybucket, 1); + + ibex_block_dirty((struct _block *)hashroot); + ibex_block_dirty((struct _block *)table); + ibex_block_dirty((struct _block *)bucket); + + d(printf(" new key id %d\n", HASH_KEY(keybucket, 0))); + d(printf(" new free id %d\n", hashroot->free)); + + return HASH_KEY(keybucket, 0); +} + +/* debug */ +void ibex_hash_dump(struct _IBEXIndex *index); + +void +ibex_hash_dump(struct _IBEXIndex *index) +{ + int i; + struct _hashtableblock *table; + struct _hashblock *bucket; + struct _hashroot *hashroot; + blockid_t hashtable; + hashid_t hashbucket; + + printf("Walking hash tree:\n"); + hashroot = (struct _hashroot *)ibex_block_read(index->blocks, index->root); + for (i=0;i<hashroot->size;i++) { + printf("Hash table chain: %d\n", i); + hashtable = hashroot->table[i / (BLOCK_SIZE/sizeof(blockid_t))]; + table = (struct _hashtableblock *)ibex_block_read(index->blocks, hashtable); + hashbucket = table->buckets[i % (BLOCK_SIZE/sizeof(blockid_t))]; + while (hashbucket) { + int len; + + bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashbucket)); + printf(" bucket %d: [used %d]", hashbucket, bucket->used); + if (HASH_INDEX(hashbucket) == 0) { + len = ARRAY_LEN(bucket->hb_keydata) - + bucket->hb_keys[HASH_INDEX(hashbucket)].keyoffset; + } else { + len = bucket->hb_keys[HASH_INDEX(hashbucket)-1].keyoffset - + bucket->hb_keys[HASH_INDEX(hashbucket)].keyoffset; + } + printf("'%.*s' = %d next=%d\n", len, &bucket->hb_keydata[bucket->hb_keys[HASH_INDEX(hashbucket)].keyoffset], + bucket->hb_keys[HASH_INDEX(hashbucket)].root, + bucket->hb_keys[HASH_INDEX(hashbucket)].next); + ibex_diskarray_dump(index->blocks, + bucket->hb_keys[HASH_INDEX(hashbucket)].root << BLOCK_BITS, + bucket->hb_keys[HASH_INDEX(hashbucket)].tail); + + hashbucket = bucket->hb_keys[HASH_INDEX(hashbucket)].next; + } + /* make sure its still in the cache */ + hashroot = (struct _hashroot *)ibex_block_read(index->blocks, index->root); + } + + hashbucket = hashroot->free; + printf("Dumping free lists ..\n"); + while (hashbucket) { + printf(" %d", hashbucket); + bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashbucket)); + hashbucket = bucket->hb_keys[HASH_INDEX(hashbucket)].next; + } + printf("\n"); +} + +#if 0 +int main(int argc, char **argv) +{ + struct _memcache *bc; + struct _IBEXIndex *hash; + int i; + + bc = ibex_block_cache_open("index.db", O_CREAT|O_RDWR, 0600); + hash = ibex_hash_class.create(bc, 1024); + for (i=0;i<10000;i++) { + char key[16]; + sprintf(key, "key %d", i); + ibex_hash_class.insert(hash, key, strlen(key)); + } + + for (i=500;i<1000;i++) { + char key[16]; + sprintf(key, "key %d", i); + ibex_hash_class.remove(hash, key, strlen(key)); + } + + for (i=500;i<1000;i++) { + char key[16]; + sprintf(key, "key %d", i); + ibex_hash_class.insert(hash, key, strlen(key)); + } + + + ibex_hash_dump(hash); + + for (i=0;i<2000;i++) { + char key[16], *lookup; + hashid_t keyid; + blockid_t root, tail; + + sprintf(key, "key %d", i); + keyid = ibex_hash_class.find(hash, key, strlen(key)); + lookup = ibex_hash_class.get_key(hash, keyid, 0); + root = ibex_hash_class.get_data(hash, keyid, &tail); + printf("key %s = %d = '%s' root:%d tail:%d \n", key, keyid, lookup, root, tail); + g_free(lookup); + } + + ibex_hash_class.close(hash); + ibex_block_cache_close(bc); + return 0; +} + +#endif diff --git a/libibex/words.c b/libibex/ibex_block.c index 7642b0a305..bbc6fb9c8b 100644 --- a/libibex/words.c +++ b/libibex/ibex_block.c @@ -1,33 +1,16 @@ -/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ -/* - * Copyright (C) 2000 Helix Code, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Library General Public License - * as published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Library General Public License for more details. - * - * You should have received a copy of the GNU Library General Public - * License along with the Gnome Library; see the file COPYING.LIB. If not, - * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 02111-1307, USA. - */ -/* words.c: low-level indexing ops */ +#include <glib.h> +#include <db.h> +#include <stdio.h> +#include <unicode.h> #include <ctype.h> -#include <errno.h> #include <string.h> -#include <unicode.h> - #include "ibex_internal.h" +#define d(x) + static signed char utf8_trans[] = { 'A', 'A', 'A', 'A', 'A', 'A', -1, 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I', -2, 'N', 'O', 'O', 'O', 'O', 'O', '*', 'O', 'U', 'U', 'U', @@ -60,7 +43,7 @@ static char *utf8_long_trans[] = { * It is not safe to call this routine with bad UTF-8. */ static void -normalize_word (char *start, char *end, char *buf) +ibex_normalise_word(char *start, char *end, char *buf) { unsigned char *s, *d; unicode_char_t uc; @@ -151,39 +134,6 @@ utf8_category (char *sp, char **snp, char *send) } } -static ibex_file * -get_ibex_file (ibex *ib, char *name) -{ - ibex_file *ibf; - - ibf = g_tree_lookup (ib->files, name); - if (!ibf) { - ibf = g_malloc (sizeof (ibex_file)); - ibf->name = g_strdup (name); - ibf->index = 0; - g_tree_insert (ib->files, ibf->name, ibf); - ib->dirty = TRUE; - } - return ibf; -} - -static void -ref_word (ibex *ib, ibex_file *ibf, char *word) -{ - GPtrArray *refs; - - refs = g_hash_table_lookup (ib->words, word); - if (!refs) { - refs = g_ptr_array_new (); - g_hash_table_insert (ib->words, g_strdup (word), refs); - g_ptr_array_add (refs, ibf); - ib->dirty = TRUE; - } else if (g_ptr_array_index (refs, refs->len - 1) != ibf) { - g_ptr_array_add (refs, ibf); - ib->dirty = TRUE; - } -} - /** * ibex_index_buffer: the lowest-level ibex indexing interface * @ib: an ibex @@ -206,12 +156,13 @@ ref_word (ibex *ib, ibex_file *ibf, char *word) * Return value: 0 on success, -1 on failure. **/ int -ibex_index_buffer (ibex *ib, char *name, char *buffer, - size_t len, size_t *unread) +ibex_index_buffer (ibex *ib, char *name, char *buffer, size_t len, size_t *unread) { char *p, *q, *nq, *end, *word; - ibex_file *ibf = get_ibex_file (ib, name); int wordsiz, cat; + GHashTable *words = g_hash_table_new(g_str_hash, g_str_equal); + GPtrArray *wordlist = g_ptr_array_new(); + int i, ret=-1; if (unread) *unread = 0; @@ -229,12 +180,9 @@ ibex_index_buffer (ibex *ib, char *name, char *buffer, p = q; } if (p == end) { - g_free (word); - return 0; + goto done; } else if (cat == IBEX_INVALID) { - errno = EINVAL; - g_free (word); - return -1; + goto error; } else if (cat == IBEX_INCOMPLETE) q = end; @@ -246,24 +194,107 @@ ibex_index_buffer (ibex *ib, char *name, char *buffer, } if (cat == IBEX_INVALID || (cat == IBEX_INCOMPLETE && !unread)) { - errno = EINVAL; - g_free (word); - return -1; + goto error; } else if (cat == IBEX_INCOMPLETE || (q == end && unread)) { *unread = end - p; - g_free (word); - return 0; + goto done; } if (wordsiz < q - p + 1) { wordsiz = q - p + 1; word = g_realloc (word, wordsiz); } - normalize_word (p, q, word); - ref_word (ib, ibf, word); + ibex_normalise_word (p, q, word); + if (word[0]) { + if (g_hash_table_lookup(words, word) == 0) { + char *newword = g_strdup(word); + g_ptr_array_add(wordlist, newword); + g_hash_table_insert(words, newword, name); + } + } p = q; } - +done: + d(printf("name %s count %d size %d\n", name, wordlist->len, len)); + ib->words->klass->add_list(ib->words, name, wordlist); + ret = 0; +error: + for (i=0;i<wordlist->len;i++) + g_free(wordlist->pdata[i]); + g_ptr_array_free(wordlist, TRUE); + g_hash_table_destroy(words); g_free (word); + return ret; +} + + +ibex *ibex_open (char *file, int flags, int mode) +{ + ibex *ib; + + ib = g_malloc0(sizeof(*ib)); + ib->blocks = ibex_block_cache_open(file, flags, mode); + if (ib->blocks == 0) { + g_warning("create: Error occured?: %s\n", strerror(errno)); + g_free(ib); + return NULL; + } + /* FIXME: the blockcache or the wordindex needs to manage the other one */ + ib->words = ib->blocks->words; + + return ib; +} + +int ibex_save (ibex *ib) +{ + printf("syncing database\n"); + ib->words->klass->sync(ib->words); + /* FIXME: some return */ + ibex_block_cache_sync(ib->blocks); return 0; } + +int ibex_close (ibex *ib) +{ + int ret = 0; + + printf("closing database\n"); + + ib->words->klass->close(ib->words); + ibex_block_cache_close(ib->blocks); + g_free(ib); + return ret; +} + +void ibex_unindex (ibex *ib, char *name) +{ + d(printf("trying to unindex '%s'\n", name)); + ib->words->klass->unindex_name(ib->words, name); +} + +GPtrArray *ibex_find (ibex *ib, char *word) +{ + char *normal; + int len; + + len = strlen(word); + normal = alloca(len+1); + ibex_normalise_word(word, word+len, normal); + return ib->words->klass->find(ib->words, normal); +} + +gboolean ibex_find_name (ibex *ib, char *name, char *word) +{ + char *normal; + int len; + + len = strlen(word); + normal = alloca(len+1); + ibex_normalise_word(word, word+len, normal); + return ib->words->klass->find_name(ib->words, name, normal); +} + +gboolean ibex_contains_name(ibex *ib, char *name) +{ + return ib->words->klass->contains_name(ib->words, name); +} diff --git a/libibex/ibex_db.c b/libibex/ibex_db.c new file mode 100644 index 0000000000..1ef0b3d1bc --- /dev/null +++ b/libibex/ibex_db.c @@ -0,0 +1,512 @@ + +#include <glib.h> + +#include <db.h> +#include <stdio.h> +#include <unicode.h> +#include <ctype.h> + +#include "ibex_internal.h" + +#define d(x) + +/* + Uses 2 databases: + + names - HASH - lists which files are stored + words - BTREE - index words to files + +*/ + +static int +db_delete_name(DB *db, const char *name) +{ + DBC *cursor; + int err, namelen; + DBT key, data; + + printf("called to delete name %s\n", name); + return 0; + + err = db->cursor(db, NULL, &cursor, 0); + if (err != 0) { + printf("Error occured?: %s\n", db_strerror(err)); + return err; + } + + memset(&key, 0, sizeof(key)); + memset(&data, 0, sizeof(data)); + namelen = strlen(name); + + err = cursor->c_get(cursor, &key, &data, DB_FIRST); + while (err == 0) { + if (data.size == namelen && memcmp(data.data, name, namelen) == 0) { + d(printf("Found match, removing it\n")); + cursor->c_del(cursor, 0); + } else { + data.size = namelen; + data.data = (void *)name; + if (cursor->c_get(cursor, &key, &data, DB_GET_BOTH) == 0) { + d(printf("seek to match, removing it\n")); + cursor->c_del(cursor, 0); + } else { + d(printf("seek to match, not found\n")); + } + } + err = cursor->c_get(cursor, &key, &data, DB_NEXT_NODUP); + } + + cursor->c_close(cursor); + + return 0; +} + +static int +db_index_words(DB *db, char *name, char **words) +{ + DBT key, data; + int err = 0; + char *word; + + memset(&key, 0, sizeof(key)); + memset(&data, 0, sizeof(data)); + data.data = name; + data.size = strlen(name); + for (;(word=*words);words++) { + /* normalise word first ... */ + key.data = word; + key.size = strlen(word); + + err = db->put(db, NULL, &key, &data, 0); + if (err != 0) + printf("Error occured?: %s\n", db_strerror(err)); + } + + return err; +} + +static int +db_index_word(DB *db, char *name, char *word) +{ + DBT key, data; + int err = 0; + + memset(&key, 0, sizeof(key)); + memset(&data, 0, sizeof(data)); + data.data = name; + data.size = strlen(name); + key.data = word; + key.size = strlen(word); + + err = db->put(db, NULL, &key, &data, 0); + if (err != 0) + printf("Error occured?: %s\n", db_strerror(err)); + + return err; +} + +static GPtrArray * +db_find(DB *db, char *word) +{ + DBT key, data; + int err; + DBC *cursor; + GPtrArray *matches = g_ptr_array_new(); + + memset(&key, 0, sizeof(key)); + memset(&data, 0, sizeof(data)); + + err = db->cursor(db, NULL, &cursor, 0); + if (err != 0) { + printf("Error occured?: %s\n", db_strerror(err)); + return matches; + } + + memset(&key, 0, sizeof(key)); + memset(&data, 0, sizeof(data)); + + key.size = strlen(word); + key.data = word; + + err = cursor->c_get(cursor, &key, &data, DB_SET); + while (err == 0) { + char *name; + name = g_malloc(data.size+1); + memcpy(name, data.data, data.size); + name[data.size] = '\0'; + g_ptr_array_add(matches, name); + err = cursor->c_get(cursor, &key, &data, DB_NEXT_DUP); + } + if (err != DB_NOTFOUND) { + printf("Error occured?: %s\n", db_strerror(err)); + /* what to do? doesn't matter too much its just a search ... */ + } + + cursor->c_close(cursor); + + return matches; +} + +/* check that db contains key @word, optionally with contents @name */ +static gboolean +db_contains_word(DB *db, char *name, char *word) +{ + DBT key, data; + + memset(&key, 0, sizeof(key)); + memset(&data, 0, sizeof(data)); + + if (name != NULL) { + data.size = strlen(name); + data.data = name; + } + key.size = strlen(word); + key.data = word; + + return db->get(db, NULL, &key, &data, name?DB_GET_BOTH:DB_SET) == 0; +} + +static int +db_delete_word(DB *db, char *word) +{ + DBT key; + + memset(&key, 0, sizeof(key)); + key.size = strlen(word); + key.data = word; + return db->del(db, NULL, &key, 0); +} + + +static signed char utf8_trans[] = { + 'A', 'A', 'A', 'A', 'A', 'A', -1, 'C', 'E', 'E', 'E', 'E', 'I', 'I', + 'I', 'I', -2, 'N', 'O', 'O', 'O', 'O', 'O', '*', 'O', 'U', 'U', 'U', + 'U', 'Y', -3, -4, 'a', 'a', 'a', 'a', 'a', 'a', -5, 'c', 'e', 'e', + 'e', 'e', 'i', 'i', 'i', 'i', -6, 'n', 'o', 'o', 'o', 'o', 'o', '/', + 'o', 'u', 'u', 'u', 'u', 'y', -7, 'y', 'A', 'a', 'A', 'a', 'A', 'a', + 'C', 'c', 'C', 'c', 'C', 'c', 'C', 'c', 'D', 'd', 'D', 'd', 'E', 'e', + 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'G', 'g', 'G', 'g', 'G', 'g', + 'G', 'g', 'H', 'h', 'H', 'h', 'I', 'i', 'I', 'i', 'I', 'i', 'I', 'i', + 'I', 'i', -8, -9, 'J', 'j', 'K', 'k', 'k', 'L', 'l', 'L', 'l', 'L', + 'l', 'L', 'l', 'L', 'l', 'N', 'n', 'N', 'n', 'N', 'n', 'n', -10, -11, + 'O', 'o', 'O', 'o', 'O', 'o', -12, -13, 'R', 'r', 'R', 'r', 'R', 'r', + 'S', 'r', 'S', 's', 'S', 's', 'S', 's', 'T', 't', 'T', 't', 'T', 't', + 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'W', 'w', + 'Y', 'y', 'Y', 'Z', 'z', 'Z', 'z', 'Z', 'z', 's' +}; + +static char *utf8_long_trans[] = { + "AE", "TH", "TH", "ss", "ae", "th", "th", "IJ", "ij", + "NG", "ng", "OE", "oe" +}; + +/* This is a bit weird. It takes pointers to the start and end (actually + * just past the end) of a UTF-8-encoded word, and a buffer at least 1 + * byte longer than the length of the word. It copies the word into the + * buffer in all lowercase without accents, and splits up ligatures. + * (Since any ligature would be a multi-byte character in UTF-8, splitting + * them into two US-ASCII characters won't overrun the buffer.) + * + * It is not safe to call this routine with bad UTF-8. + */ +static void +ibex_normalise_word(char *start, char *end, char *buf) +{ + unsigned char *s, *d; + unicode_char_t uc; + + s = (unsigned char *)start; + d = (unsigned char *)buf; + while (s < (unsigned char *)end) { + if (*s < 0x80) { + /* US-ASCII character: copy unless it's + * an apostrophe. + */ + if (*s != '\'') + *d++ = tolower (*s); + s++; + } else { + char *next = unicode_get_utf8 (s, &uc); + if (uc >= 0xc0 && uc < 0xc0 + sizeof (utf8_trans)) { + signed char ch = utf8_trans[uc - 0xc0]; + if (ch > 0) + *d++ = tolower (ch); + else { + *d++ = tolower (utf8_long_trans[-ch - 1][0]); + *d++ = tolower (utf8_long_trans[-ch - 1][1]); + } + s = next; + } else { + while (s < (unsigned char *)next) + *d++ = *s++; + } + } + } + *d = '\0'; +} + +enum { IBEX_ALPHA, IBEX_NONALPHA, IBEX_INVALID, IBEX_INCOMPLETE }; + +/* This incorporates parts of libunicode, because there's no way to + * force libunicode to not read past a certain point. + */ +static int +utf8_category (char *sp, char **snp, char *send) +{ + unsigned char *p = (unsigned char *)sp, **np = (unsigned char **)snp; + unsigned char *end = (unsigned char *)send; + + if (isascii (*p)) { + *np = p + 1; + if (isalpha (*p) || *p == '\'') + return IBEX_ALPHA; + return IBEX_NONALPHA; + } else { + unicode_char_t uc; + int more; + + if ((*p & 0xe0) == 0xc0) { + more = 1; + uc = *p & 0x1f; + } else if ((*p & 0xf0) == 0xe0) { + more = 2; + uc = *p & 0x0f; + } else if ((*p & 0xf8) == 0xf0) { + more = 3; + uc = *p & 0x07; + } else if ((*p & 0xfc) == 0xf8) { + more = 4; + uc = *p & 0x03; + } else if ((*p & 0xfe) == 0xfc) { + more = 5; + uc = *p & 0x01; + } else + return IBEX_INVALID; + + if (p + more > end) + return IBEX_INCOMPLETE; + + while (more--) { + if ((*++p & 0xc0) != 0x80) + return IBEX_INVALID; + uc <<= 6; + uc |= *p & 0x3f; + } + + *np = p + 1; + if (unicode_isalpha (uc)) + return IBEX_ALPHA; + else + return IBEX_NONALPHA; + } +} + +static void +do_insert_words(char *key, char *name, ibex *ib) +{ + db_index_word(ib->words, name, key); + g_free(key); +} + +static void +do_free_words(char *key, char *name, void *data) +{ + g_free(key); +} + +/** + * ibex_index_buffer: the lowest-level ibex indexing interface + * @ib: an ibex + * @name: the name of the file being indexed + * @buffer: a buffer containing data from the file + * @len: the length of @buffer + * @unread: an output argument containing the number of unread bytes + * + * This routine indexes up to @len bytes from @buffer into @ib. + * If @unread is NULL, the indexer assumes that the buffer ends on a + * word boundary, and will index all the way to the end of the + * buffer. If @unread is not NULL, and the buffer ends with an + * alphabetic character, the indexer will assume that the buffer has + * been cut off in the middle of a word, and return the number of + * un-indexed bytes at the end of the buffer in *@unread. The caller + * should then read in more data through whatever means it has + * and pass in the unread bytes from the original buffer, followed + * by the new data, on its next call. + * + * Return value: 0 on success, -1 on failure. + **/ +int +ibex_index_buffer (ibex *ib, char *name, char *buffer, size_t len, size_t *unread) +{ + char *p, *q, *nq, *end, *word; + int wordsiz, cat; + GHashTable *words = g_hash_table_new(g_str_hash, g_str_equal); + + if (unread) + *unread = 0; + + end = buffer + len; + wordsiz = 20; + word = g_malloc (wordsiz); + + p = buffer; + while (p < end) { + while (p < end) { + cat = utf8_category (p, &q, end); + if (cat != IBEX_NONALPHA) + break; + p = q; + } + if (p == end) { + goto done; + } else if (cat == IBEX_INVALID) { + goto error; + } else if (cat == IBEX_INCOMPLETE) + q = end; + + while (q < end) { + cat = utf8_category (q, &nq, end); + if (cat != IBEX_ALPHA) + break; + q = nq; + } + if (cat == IBEX_INVALID || + (cat == IBEX_INCOMPLETE && !unread)) { + goto error; + } else if (cat == IBEX_INCOMPLETE || (q == end && unread)) { + *unread = end - p; + goto done; + } + + if (wordsiz < q - p + 1) { + wordsiz = q - p + 1; + word = g_realloc (word, wordsiz); + } + ibex_normalise_word (p, q, word); + if (word[0]) { + if (g_hash_table_lookup(words, word) == 0) { + g_hash_table_insert(words, g_strdup(word), name); + } + } + p = q; + } +done: + /* FIXME: do this and inserts within a transaction */ + if (!db_contains_word(ib->names, NULL, name)) { + printf("adding '%s' to database\n", name); + db_index_word(ib->names, "", name); + } + g_hash_table_foreach(words, (GHFunc)do_insert_words, ib); + g_hash_table_destroy(words); + g_free (word); + return 0; +error: + g_hash_table_foreach(words, (GHFunc)do_free_words, NULL); + g_hash_table_destroy(words); + g_free (word); + return -1; +} + + +ibex *ibex_open (char *file, int flags, int mode) +{ + ibex *ib; + u_int32_t dbflags = 0; + int err; + + ib = g_malloc0(sizeof(*ib)); + err = db_create(&ib->words, NULL, 0); + if (err != 0) { + g_warning("create: Error occured?: %s\n", db_strerror(err)); + g_free(ib); + return NULL; + } + + err = ib->words->set_flags(ib->words, DB_DUP); + if (err != 0) { + g_warning("set flags: Error occured?: %s\n", db_strerror(err)); + ib->words->close(ib->words, 0); + g_free(ib); + return NULL; + } + + if (flags & O_CREAT) + dbflags |= DB_CREATE; + if (flags & O_EXCL) + dbflags |= DB_EXCL; + if (flags & O_RDONLY) + dbflags |= DB_RDONLY; + + /* 1MB cache size? */ + ib->words->set_cachesize(ib->words, 0, 1000000, 0); + + err = ib->words->open(ib->words, file, "words", DB_BTREE, dbflags, mode); + if (err != 0) { + printf("open: Error occured?: %s\n", db_strerror(err)); + ib->words->close(ib->words, 0); + g_free(ib); + return NULL; + } + + /* FIXME: check returns */ + err = db_create(&ib->names, NULL, 0); + err = ib->names->open(ib->names, file, "names", DB_HASH, dbflags, mode); + + return ib; +} + +int ibex_save (ibex *ib) +{ + printf("syncing database\n"); + ib->names->sync(ib->names, 0); + return ib->words->sync(ib->words, 0); +} + +int ibex_close (ibex *ib) +{ + int ret; + + printf("closing database\n"); + + ret = ib->names->close(ib->names, 0); + ret = ib->words->close(ib->words, 0); + g_free(ib); + return ret; +} + +void ibex_unindex (ibex *ib, char *name) +{ + printf("trying to unindex '%s'\n", name); + if (db_contains_word(ib->names, NULL, name)) { + /* FIXME: do within transaction? */ + db_delete_name(ib->words, name); + db_delete_word(ib->names, name); + } +} + +GPtrArray *ibex_find (ibex *ib, char *word) +{ + char *normal; + int len; + + len = strlen(word); + normal = alloca(len+1); + ibex_normalise_word(word, word+len, normal); + return db_find(ib->words, normal); +} + +gboolean ibex_find_name (ibex *ib, char *name, char *word) +{ + char *normal; + int len; + + len = strlen(word); + normal = alloca(len+1); + ibex_normalise_word(word, word+len, normal); + return db_contains_word(ib->words, name, normal); +} + +gboolean ibex_contains_name(ibex *ib, char *name) +{ + return db_contains_word(ib->names, NULL, name); +} diff --git a/libibex/ibex_internal.h b/libibex/ibex_internal.h index 1647147a3a..a21867a2ab 100644 --- a/libibex/ibex_internal.h +++ b/libibex/ibex_internal.h @@ -21,19 +21,13 @@ #include <glib.h> #include "ibex.h" +#include "block.h" +#include "wordindex.h" -#define IBEX_VERSION "ibex1" +#define IBEX_VERSION "ibex3" struct ibex { char *path; - GTree *files; - GHashTable *words; - GPtrArray *oldfiles; - gboolean dirty; + struct _memcache *blocks; + struct _IBEXWord *words; }; - -struct ibex_file { - char *name; - long index; -}; -typedef struct ibex_file ibex_file; diff --git a/libibex/index.c b/libibex/index.c deleted file mode 100644 index 1afedbd251..0000000000 --- a/libibex/index.c +++ /dev/null @@ -1,154 +0,0 @@ -/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ -/* - * Copyright (C) 2000 Helix Code, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Library General Public License - * as published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Library General Public License for more details. - * - * You should have received a copy of the GNU Library General Public - * License along with the Gnome Library; see the file COPYING.LIB. If not, - * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 02111-1307, USA. - */ - -/* index.c: high-level indexing ops */ - -#include <errno.h> -#include <fcntl.h> -#include <string.h> -#include <sys/stat.h> -#include <unistd.h> - -#include "ibex_internal.h" - -/** - * ibex_index_file: Index a file by name - * @ib: an ibex - * @filename: name of the file to index - * - * This indexes the given file into @ib. If @filename is already in - * the ibex, the existing index entries for it are discarded and the - * file is indexed anew. - * - * Return value: 0 on success, -1 on failure. - **/ -int -ibex_index_file (ibex *ib, char *filename) -{ - int fd; - int status; - struct stat st; - - fd = open (filename, O_RDONLY); - if (fd < 0) - return -1; - - if (fstat (fd, &st) == -1) { - close (fd); - return -1; - } - if (!S_ISREG (st.st_mode)) { - close (fd); - errno = EINVAL; - return -1; - } - - ibex_unindex (ib, filename); - status = ibex_index_fd (ib, filename, fd, st.st_size); - close (fd); - return status; -} - -/** - * ibex_index_fd: Index a file given a file descriptor - * @ib: an ibex - * @name: the name of the file being indexed - * @fd: a file descriptor, open for reading - * @len: the number of bytes to read from the file - * - * This indexes a file, or a part of a file, given an open file - * descriptor and a size. There is no requirement that @name - * actually correspond to @fd in any particular way. - * - * If the function returns successfully, the file descriptor offset of - * @fd will be exactly @len bytes beyond where it was when the - * function was called. The indexer assumes that this point is a word - * boundary. - * - * The behavior of this function is not defined if it is not - * possible to read @len bytes off @fd. - * - * Return value: 0 on success, -1 on failure. - **/ -int -ibex_index_fd (ibex *ib, char *name, int fd, size_t len) -{ - char *buf; - int off = 0, nread, status; - - buf = g_malloc (len); - do { - nread = read (fd, buf + off, len - off); - if (nread == -1) { - g_free (buf); - return -1; - } - off += nread; - } while (off != len); - - status = ibex_index_buffer (ib, name, buf, len, NULL); - g_free (buf); - - return status; -} - -/** - * ibex_unindex: Remove a file from the ibex - * @ib: an ibex - * @name: name of the file to remove - * - * This removes all references to @name from @ib. No memory is freed - * right away, but further searches on @ib will never return @name. - **/ -void -ibex_unindex (ibex *ib, char *name) -{ - ibex_file *ibf; - - ibf = g_tree_lookup (ib->files, name); - if (ibf) { - ibf->index = -1; - g_tree_remove (ib->files, name); - g_ptr_array_add (ib->oldfiles, ibf); - ib->dirty = TRUE; - } -} - -/** - * ibex_rename: Rename a file in the ibex - * @ib: an ibex - * @oldname: the old name of the file - * @newname: the new name of the file - * - * This renames a file in the ibex. - **/ -void -ibex_rename (ibex *ib, char *oldname, char *newname) -{ - ibex_file *ibf; - - ibf = g_tree_lookup (ib->files, oldname); - if (ibf) { - g_tree_remove (ib->files, oldname); - g_free (ibf->name); - ibf->name = g_strdup (newname); - g_tree_insert (ib->files, ibf->name, ibf); - } -} diff --git a/libibex/index.h b/libibex/index.h new file mode 100644 index 0000000000..35e3df23c2 --- /dev/null +++ b/libibex/index.h @@ -0,0 +1,82 @@ +/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- + * + * Copyright (C) 2000 Helix Code, Inc. + * + * Authors: Michael Zucchi <notzed@helixcode.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public License + * as published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with the Gnome Library; see the file COPYING.LIB. If not, + * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +#ifndef _INDEX_H +#define _INDEX_H + +/* an indexing 'class' maps a key to 1 piece of info */ + +struct _IBEXIndex { + struct _IBEXIndexClass *klass; + struct _memcache *blocks; + blockid_t root; /* root block of ondisk index data */ +}; + +struct _IBEXIndexClass { + + struct _IBEXIndex *(*create)(struct _memcache *bc, int size); + struct _IBEXIndex *(*open)(struct _memcache *bc, blockid_t root); + + int (*sync)(struct _IBEXIndex *); + int (*close)(struct _IBEXIndex *); + + /* lookup a key in the index, returns the keyid of this item, or 0 if not found */ + guint32 (*find)(struct _IBEXIndex *, const char *key, int keylen); + + /* remove a key from the index */ + void (*remove)(struct _IBEXIndex *, const char *key, int keylen); + + /* insert a new key into the index, the keyid is returned */ + guint32 (*insert)(struct _IBEXIndex *, const char *key, int keylen); + + /* get the key contents/key length from the keyid */ + char *(*get_key)(struct _IBEXIndex *, guint32 keyid, int *keylenptr); + + /* set the key contents based on the keyid */ + void (*set_data)(struct _IBEXIndex *, guint32 keyid, blockid_t datablock, blockid_t tail); + + /* get the key contents based on the keyid */ + blockid_t (*get_data)(struct _IBEXIndex *, guint32 keyid, blockid_t *tail); +}; + +/* a storage class, stores lists of lists of id's */ + +struct _IBEXStore { + struct _IBEXStoreClass *klass; + struct _memcache *blocks; +}; + +struct _IBEXStoreClass { + struct _IBEXStore *(*create)(struct _memcache *bc); + int (*sync)(struct _IBEXStore *store); + int (*close)(struct _IBEXStore *store); + + blockid_t (*add)(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, nameid_t data); + blockid_t (*add_list)(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, GArray *data); + blockid_t (*remove)(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, nameid_t data); + void (*free)(struct _IBEXStore *store, blockid_t head, blockid_t tail); + + gboolean (*find)(struct _IBEXStore *store, blockid_t head, blockid_t tail, nameid_t data); + GArray *(*get)(struct _IBEXStore *store, blockid_t head, blockid_t tail); +}; + +#endif diff --git a/libibex/wordindex.c b/libibex/wordindex.c new file mode 100644 index 0000000000..dbde1f4a7f --- /dev/null +++ b/libibex/wordindex.c @@ -0,0 +1,488 @@ +/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- + * + * Copyright (C) 2000 Helix Code, Inc. + * + * Authors: Michael Zucchi <notzed@helixcode.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public License + * as published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with the Gnome Library; see the file COPYING.LIB. If not, + * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* code to manage a word index */ +/* includes a cache for word index writes, + but not for name index writes (currently), or any reads. + +Note the word cache is only needed during indexing of lots +of words, and could then be discarded (:flush()). + +*/ + +#include <stdio.h> + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> + +#include <glib.h> + +#include "block.h" +#include "index.h" +#include "wordindex.h" + +#define d(x) + +/*#define WORDCACHE_SIZE (256)*/ +#define WORDCACHE_SIZE (10240) + +extern struct _IBEXStoreClass ibex_diskarray_class; +extern struct _IBEXIndexClass ibex_hash_class; + +/* need 2 types of hash key? + one that just stores the wordid / wordblock + and one that stores the filecount/files? +*/ + + +struct _wordcache { + struct _wordcache *next; + struct _wordcache *prev; + nameid_t wordid; /* disk wordid */ + blockid_t wordblock; /* head of disk list */ + blockid_t wordtail; /* and the tail data */ + int filecount; /* how many we have in memory */ + /*nameid_t files[32];*/ /* memory cache of files */ + nameid_t files[62]; /* memory cache of files */ + char word[1]; /* actual word follows */ +}; + +static void unindex_name(struct _IBEXWord *, const char *name); /* unindex all entries for name */ +static gboolean contains_name(struct _IBEXWord *, const char *name); /* index contains data for name */ +static GPtrArray *find(struct _IBEXWord *, const char *word); /* returns all matches for word */ +static gboolean find_name(struct _IBEXWord *, const char *name, const char *word); /* find if name contains word */ +static void add(struct _IBEXWord *, const char *name, const char *word); /* adds a single word to name (slow) */ +static void add_list(struct _IBEXWord *, const char *name, GPtrArray *words);/* adds a bunch of words to a given name */ +static int word_sync(struct _IBEXWord *idx); +static int word_flush(struct _IBEXWord *idx); +static int word_close(struct _IBEXWord *idx); + +struct _IBEXWordClass ibex_word_index_class = { + word_sync, word_flush, word_close, + unindex_name, contains_name, + find, find_name, + add, add_list +}; + +/* this interface isn't the best, but it'll do for now */ +struct _IBEXWord * +ibex_create_word_index(struct _memcache *bc, blockid_t *wordroot, blockid_t *nameroot) +{ + struct _IBEXWord *idx; + + idx = g_malloc0(sizeof(*idx)); + idx->wordcache = g_hash_table_new(g_str_hash, g_str_equal); + ibex_list_new(&idx->wordnodes); + idx->wordcount = 0; + idx->klass = &ibex_word_index_class; + + /* we use the same block array storage for both indexes at the moment */ + idx->wordstore = ibex_diskarray_class.create(bc); + idx->namestore = idx->wordstore; + + /* but not the same indexes! */ + if (*wordroot) { + printf("opening wordindex root = %d\n", *wordroot); + idx->wordindex = ibex_hash_class.open(bc, *wordroot); + } else { + idx->wordindex = ibex_hash_class.create(bc, 1024); + *wordroot = idx->wordindex->root; + printf("creating wordindex root = %d\n", *wordroot); + } + if (*nameroot) { + printf("opening nameindex root = %d\n", *nameroot); + idx->nameindex = ibex_hash_class.open(bc, *nameroot); + } else { + idx->nameindex = ibex_hash_class.create(bc, 1024); + *nameroot = idx->nameindex->root; + printf("creating nameindex root = %d\n", *nameroot); + } + return idx; +} + +#if 0 +static void +cache_sanity(struct _wordcache *head) +{ + while (head->next) { + g_assert(head->filecount <= sizeof(head->files)/sizeof(head->files[0])); + g_assert(strlen(head->word) != 0); + head = head->next; + } +} +#endif + +/* unindex all entries for name */ +static void unindex_name(struct _IBEXWord *idx, const char *name) +{ + GArray *words; + int i; + nameid_t nameid, wordid; + blockid_t nameblock, wordblock, newblock, nametail, wordtail, newtail; + + d(printf("unindexing %s\n", name)); + + /* lookup the hash key */ + nameid = idx->nameindex->klass->find(idx->nameindex, name, strlen(name)); + /* get the block for this key */ + nameblock = idx->nameindex->klass->get_data(idx->nameindex, nameid, &nametail); + /* and the data for this block */ + words = idx->namestore->klass->get(idx->namestore, nameblock, nametail); + /* walk it ... */ + for (i=0;i<words->len;i++) { + /* get the word */ + wordid = g_array_index(words, nameid_t, i); + d(printf(" word %d\n", wordid)); + /* get the data block */ + wordblock = idx->wordindex->klass->get_data(idx->wordindex, wordid, &wordtail); + /* clear this name from it */ + newblock = wordblock; + newtail = wordtail; + idx->wordstore->klass->remove(idx->wordstore, &newblock, &newtail, nameid); + if (newblock != wordblock || newtail != wordtail) + idx->wordindex->klass->set_data(idx->wordindex, wordid, newblock, newtail); + + /* FIXME: check cache as well */ + } + g_array_free(words, TRUE); + + /* and remove name data and itself */ + idx->namestore->klass->free(idx->namestore, nameblock, nametail); + idx->nameindex->klass->remove(idx->nameindex, name, strlen(name)); +} + +/* index contains (any) data for name */ +static gboolean contains_name(struct _IBEXWord *idx, const char *name) +{ + return idx->nameindex->klass->find(idx->nameindex, name, strlen(name)) != 0; +} + +/* returns all matches for word */ +static GPtrArray *find(struct _IBEXWord *idx, const char *word) +{ + nameid_t wordid, nameid; + GPtrArray *res; + GArray *names; + int i; + char *new; + struct _wordcache *cache; + blockid_t wordblock, wordtail; + + res = g_ptr_array_new(); + + cache = g_hash_table_lookup(idx->wordcache, word); + if (cache) { + /* freshen cache entry if we touch it */ + ibex_list_remove((struct _listnode *)cache); + ibex_list_addtail(&idx->wordnodes, (struct _listnode *)cache); + wordid = cache->wordid; + wordblock = cache->wordblock; + wordtail = cache->wordtail; + } else { + /* lookup the hash key */ + wordid = idx->wordindex->klass->find(idx->wordindex, word, strlen(word)); + /* get the block for this key */ + wordblock = idx->wordindex->klass->get_data(idx->wordindex, wordid, &wordtail); + } + /* and the data for this block */ + names = idx->wordstore->klass->get(idx->wordstore, wordblock, wordtail); + /* .. including any memory-only data */ + if (cache) { + g_array_append_vals(names, cache->files, cache->filecount); + } + + /* walk it ... converting id's back to strings */ + g_ptr_array_set_size(res, names->len); + for (i=0;i<names->len;i++) { + nameid = g_array_index(names, nameid_t, i); + new = idx->nameindex->klass->get_key(idx->nameindex, nameid, NULL); + res->pdata[i] = new; + } + g_array_free(names, TRUE); + return res; +} + +/* find if name contains word */ +static gboolean find_name(struct _IBEXWord *idx, const char *name, const char *word) +{ + nameid_t wordid, nameid; + blockid_t nameblock, nametail; + struct _wordcache *cache; + int i; + + /* lookup the hash key for the name */ + nameid = idx->nameindex->klass->find(idx->nameindex, name, strlen(name)); + /* get the block for this name */ + nameblock = idx->nameindex->klass->get_data(idx->nameindex, nameid, &nametail); + + /* check if there is an in-memory cache for this word, check its file there first */ + cache = g_hash_table_lookup(idx->wordcache, word); + if (cache) { + /* freshen cache entry if we touch it */ + ibex_list_remove((struct _listnode *)cache); + ibex_list_addtail(&idx->wordnodes, (struct _listnode *)cache); + for (i=0;i<cache->filecount;i++) { + if (cache->files[i] == nameid) + return TRUE; + } + /* not there? well we can use the wordid anyway */ + wordid = cache->wordid; + } else { + /* lookup the hash key for word */ + wordid = idx->wordindex->klass->find(idx->wordindex, word, strlen(word)); + } + + /* see if wordid is in nameblock */ + return idx->namestore->klass->find(idx->namestore, nameblock, nametail, wordid); +} + +/* cache helper functions */ +/* flush a cache entry to disk, and empty it out */ +static void +sync_cache_entry(struct _IBEXWord *idx, struct _wordcache *cache) +{ + GArray array; /* just use this as a header */ + blockid_t oldblock, oldtail; + + d(printf("syncing cache entry '%s'\n", cache->word)); + array.data = (char *)cache->files; + array.len = cache->filecount; + oldblock = cache->wordblock; + oldtail = cache->wordtail; + idx->wordstore->klass->add_list(idx->wordstore, &cache->wordblock, &cache->wordtail, &array); + if (oldblock != cache->wordblock || oldtail != cache->wordtail) { + idx->wordindex->klass->set_data(idx->wordindex, cache->wordid, cache->wordblock, cache->wordtail); + } + cache->filecount = 0; +} + +/* create a new key in an index, returning its id and head block */ +static void +add_index_key(struct _IBEXIndex *wordindex, const char *word, nameid_t *wordid, blockid_t *wordblock, blockid_t *wordtail) +{ + /* initialise cache entry - id of word entry and head block */ + *wordid = wordindex->klass->find(wordindex, word, strlen(word)); + if (*wordid == 0) { + *wordid = wordindex->klass->insert(wordindex, word, strlen(word)); + *wordblock = 0; + *wordtail = 0; + } else { + *wordblock = wordindex->klass->get_data(wordindex, *wordid, wordtail); + } +} + +/* create a new key in a cached index (only word cache so far), flushing old keys + if too much space is being used */ +static struct _wordcache * +add_index_cache(struct _IBEXWord *idx, const char *word) +{ + struct _wordcache *cache; + + d(printf("adding %s to cache\n", word)); + + cache = g_hash_table_lookup(idx->wordcache, word); + if (cache == 0) { + /* see if we have to flush off the last entry */ + if (idx->wordcount >= WORDCACHE_SIZE) { + /* remove last entry, and flush it */ + cache = (struct _wordcache *)idx->wordnodes.tailpred; + d(printf("flushing word from cache %s\n", cache->word)); + ibex_list_remove((struct _listnode *)cache); + g_hash_table_remove(idx->wordcache, cache->word); + sync_cache_entry(idx, cache); + g_free(cache); + idx->wordcount--; + } + cache = g_malloc0(sizeof(*cache)+strlen(word)); + /* initialise cache entry - id of word entry and head block */ + add_index_key(idx->wordindex, word, &cache->wordid, &cache->wordblock, &cache->wordtail); + /* other fields */ + strcpy(cache->word, word); + cache->filecount = 0; + g_hash_table_insert(idx->wordcache, cache->word, cache); + ibex_list_addhead(&idx->wordnodes, (struct _listnode *)cache); + idx->wordcount++; + } else { + /* move cache bucket ot the head of the cache list */ + ibex_list_remove((struct _listnode *)cache); + ibex_list_addhead(&idx->wordnodes, (struct _listnode *)cache); + } + return cache; +} + +/* adds a single word to name (slow) */ +static void add(struct _IBEXWord *idx, const char *name, const char *word) +{ + nameid_t nameid; + blockid_t nameblock, newblock, nametail, newtail; + struct _wordcache *cache; + + g_error("Dont use wordindex::add()"); + abort(); + + cache = add_index_cache(idx, word); + + /* get the nameid and block start for this name */ + add_index_key(idx->nameindex, name, &nameid, &nameblock, &nametail); + + /* check for repeats of the last name - dont add anything */ + if (cache->files[cache->filecount] == nameid) + return; + + /* yay, now insert the nameindex into the word list, and the word index into the name list */ + cache->files[cache->filecount] = nameid; + cache->filecount++; + /* if we are full, force a flush now */ + if (cache->filecount >= sizeof(cache->files)/sizeof(cache->files[0])) { + sync_cache_entry(idx, cache); + } + + newtail = nametail; + newblock = nameblock; + idx->namestore->klass->add(idx->namestore, &newblock, &newtail, cache->wordid); + if (newblock != nameblock || newtail != nametail) { + idx->nameindex->klass->set_data(idx->nameindex, nameid, newblock, newtail); + } +} + +/* adds a bunch of words to a given name */ +static void add_list(struct _IBEXWord *idx, const char *name, GPtrArray *words) +{ + int i; + GArray *data = g_array_new(0, 0, sizeof(nameid_t)); + blockid_t nameblock, newblock, nametail, newtail; + nameid_t nameid; + struct _wordcache *cache; + + d(printf("Adding words to name %s\n", name)); + + d(cache_sanity((struct _wordcache *)idx->wordnodes.head)); + + /* get the nameid and block start for this name */ + add_index_key(idx->nameindex, name, &nameid, &nameblock, &nametail); + + d(cache_sanity((struct _wordcache *)idx->wordnodes.head)); + + for (i=0;i<words->len;i++) { + char *word = words->pdata[i]; + + cache = add_index_cache(idx, word); + + /*d(cache_sanity((struct _wordcache *)idx->wordnodes.head));*/ + + /* check for duplicates; doesn't catch duplicates over an overflow + boundary. Watch me care. */ + if (cache->filecount == 0 + || cache->files[cache->filecount-1] != nameid) { + cache->files[cache->filecount] = nameid; + cache->filecount++; + /* if we are full, force a flush now */ + if (cache->filecount >= sizeof(cache->files)/sizeof(cache->files[0])) { + sync_cache_entry(idx, cache); + } + + /*d(cache_sanity((struct _wordcache *)idx->wordnodes.head));*/ + + /* and append this wordid for this name in memory */ + g_array_append_val(data, cache->wordid); + } + + /*d(cache_sanity((struct _wordcache *)idx->wordnodes.head));*/ + } + + d(cache_sanity((struct _wordcache *)idx->wordnodes.head)); + + /* and append these word id's in one go */ + newblock = nameblock; + newtail = nametail; + idx->namestore->klass->add_list(idx->namestore, &newblock, &newtail, data); + if (newblock != nameblock || newtail != nametail) { + idx->nameindex->klass->set_data(idx->nameindex, nameid, newblock, newtail); + } + + d(cache_sanity((struct _wordcache *)idx->wordnodes.head)); + + g_array_free(data, TRUE); +} + +/* sync any in-memory data to disk */ +static int +word_sync(struct _IBEXWord *idx) +{ + /* we just flush also, save memory */ + word_flush(idx); + +#if 0 + struct _wordcache *cache = (struct _wordcache *)idx->wordnodes.head; + + while (cache->next) { + sync_cache_entry(idx, cache); + cache = cache->next; + } + + /*ibex_hash_dump(idx->wordindex);*/ + /*ibex_hash_dump(idx->nameindex);*/ +#endif + return 0; +} + +/* sync and flush any in-memory data to disk and free it */ +static int +word_flush(struct _IBEXWord *idx) +{ + struct _wordcache *cache = (struct _wordcache *)idx->wordnodes.head, *next; + + next = cache->next; + while (next) { + sync_cache_entry(idx, cache); + g_hash_table_remove(idx->wordcache, cache->word); + g_free(cache); + cache = next; + next = cache->next; + } + ibex_list_new(&idx->wordnodes); + return 0; +} + +static int word_close(struct _IBEXWord *idx) +{ + struct _wordcache *cache = (struct _wordcache *)idx->wordnodes.head, *next; + + next = cache->next; + while (next) { + sync_cache_entry(idx, cache); + g_free(cache); + cache = next; + next = cache->next; + } + idx->namestore->klass->close(idx->namestore); + idx->nameindex->klass->close(idx->nameindex); + /*same as namestore: + idx->wordstore->klass->close(idx->wordstore);*/ + idx->wordindex->klass->close(idx->wordindex); + g_hash_table_destroy(idx->wordcache); + g_free(idx); + + return 0; +} diff --git a/libibex/wordindex.h b/libibex/wordindex.h new file mode 100644 index 0000000000..d222ff5a87 --- /dev/null +++ b/libibex/wordindex.h @@ -0,0 +1,65 @@ +/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- + * + * Copyright (C) 2000 Helix Code, Inc. + * + * Authors: Michael Zucchi <notzed@helixcode.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public License + * as published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with the Gnome Library; see the file COPYING.LIB. If not, + * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +#ifndef _WORDINDEX_H +#define _WORDINDEX_H + +#include <glib.h> + +#include "block.h" +#include "index.h" + +struct _IBEXWord; + +/* not used yet */ +typedef void (*IBEXNormaliseFunc)(char *source, int len, char *dest); + +struct _IBEXWordClass { + int (*sync)(struct _IBEXWord *); + int (*flush)(struct _IBEXWord *); + int (*close)(struct _IBEXWord *); + + void (*unindex_name)(struct _IBEXWord *, const char *name); /* unindex all entries for name */ + gboolean (*contains_name)(struct _IBEXWord *, const char *name); /* index contains data for name */ + GPtrArray *(*find)(struct _IBEXWord *, const char *word); /* returns all matches for word */ + gboolean (*find_name)(struct _IBEXWord *, const char *name, const char *word); /* find if name contains word */ + void (*add)(struct _IBEXWord *, const char *name, const char *word); /* adds a single word to name */ + void (*add_list)(struct _IBEXWord *, const char *name, GPtrArray *words);/* adds a bunch of words to a given name */ +}; + +struct _IBEXWord { + struct _IBEXWordClass *klass; + struct _IBEXStore *wordstore; + struct _IBEXIndex *wordindex; + struct _IBEXStore *namestore; + struct _IBEXIndex *nameindex; + + /* word caching info (should probably be modularised) */ + GHashTable *wordcache; /* word->struct _wordcache mapping */ + struct _list wordnodes; /* LRU list of wordcache structures */ + int wordcount; /* how much space used in cache */ +}; + + +struct _IBEXWord *ibex_create_word_index(struct _memcache *bc, blockid_t *wordroot, blockid_t *nameroot); + +#endif /* !_WORDINDEX_H */ |