aboutsummaryrefslogtreecommitdiffstats
path: root/libibex
diff options
context:
space:
mode:
Diffstat (limited to 'libibex')
-rw-r--r--libibex/ChangeLog57
-rw-r--r--libibex/Makefile.am29
-rw-r--r--libibex/block.c481
-rw-r--r--libibex/block.h102
-rw-r--r--libibex/diskarray.c255
-rw-r--r--libibex/disktail.c807
-rw-r--r--libibex/file.c481
-rw-r--r--libibex/find.c198
-rw-r--r--libibex/hash.c701
-rw-r--r--libibex/ibex_block.c (renamed from libibex/words.c)177
-rw-r--r--libibex/ibex_db.c512
-rw-r--r--libibex/ibex_internal.h16
-rw-r--r--libibex/index.c154
-rw-r--r--libibex/index.h82
-rw-r--r--libibex/wordindex.c488
-rw-r--r--libibex/wordindex.h65
16 files changed, 3681 insertions, 924 deletions
diff --git a/libibex/ChangeLog b/libibex/ChangeLog
index a74f697308..1c80c8348b 100644
--- a/libibex/ChangeLog
+++ b/libibex/ChangeLog
@@ -1,3 +1,60 @@
+2000-09-19 Not Zed <NotZed@HelixCode.com>
+
+ ** Merged from IBEX_DISK branch to head.
+
+ * file.c:
+ * find.c:
+ * words.c:
+ * index.c: Removed unused files.
+
+ * block.h: Changed block to use only 24 bits for next and 8 for
+ used, and fixed all relevant code. Some cleanup.
+
+ * disktail.c (tail_get): If we use an empty tail node, then make
+ sure we make it dirty.
+
+2000-09-15 Not Zed <NotZed@HelixCode.com>
+
+ * wordindex.c (word_close): Free hashtable on exit too.
+
+ * disktail.c: Implemented tail-node storage for the end of long
+ lists, or for short lists. Should save significant disk space
+ (5x?).
+ Implemented special case for 1-item lists, where the tailnode
+ pointer is used to store the index entry.
+
+2000-09-14 Not Zed <NotZed@HelixCode.com>
+
+ * wordindex.c (add_index_key): Keys also handle tails.
+
+ * hash.c (hash_set_data_block): Added new parameter to keys - a
+ tail block (a full 32 bit block pointer).
+ (hash_get_data_block): And same here.
+
+2000-09-12 Not Zed <NotZed@HelixCode.com>
+
+ * wordindex.c (word_close): Dont close namestore twice.
+
+2000-09-11 Not Zed <NotZed@HelixCode.com>
+
+ ** Redid almost everything, on-disk hash table to store an index
+ to index records, mroe on the way to modularisation (more to go),
+ now stores reverse indexes for deleting.
+
+2000-08-31 Not Zed <NotZed@HelixCode.com>
+
+ * block.c (add_key_mem): Initialise a memory based array for newly
+ added index entries.
+ (add_record): Changed to cache updates in memory until we hit a
+ limit, and then flush them to disk.
+ (get_record): Merge in-memory records with disk records.
+ (remove_record): Remove from memory first, and if that fails, goto
+ disk.
+ (find_record): Check memory first, then disk if that fails.
+ (add_datum_list): oops, copy size * sizeof(blockid_t)
+ (add_indexed): Make sure we link in the head node when we create a
+ new one.
+
2000-08-09 Christopher James Lahey <clahey@helixcode.com>
* file.c, find.c: Fixed some warnings.
diff --git a/libibex/Makefile.am b/libibex/Makefile.am
index c9e8165987..cfd8a1b4d5 100644
--- a/libibex/Makefile.am
+++ b/libibex/Makefile.am
@@ -2,20 +2,35 @@
noinst_LTLIBRARIES = libibex.la
-libibex_la_SOURCES = file.c index.c find.c words.c ibex.h
+libibex_la_SOURCES = \
+ wordindex.c \
+ block.c ibex.h \
+ hash.c \
+ disktail.c \
+ ibex_block.c
+
libibex_la_LDFLAGS = -static
-noinst_HEADERS = ibex_internal.h
+noinst_HEADERS = \
+ ibex_internal.h \
+ block.h \
+ wordindex.h \
+ index.h
INCLUDES = -I$(srcdir) $(GLIB_CFLAGS) $(UNICODE_CFLAGS) \
-DG_LOG_DOMAIN=\"libibex\"
-noinst_PROGRAMS = mkindex lookup
+#noinst_PROGRAMS = hash
-mkindex_SOURCES = mkindex.c
-mkindex_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS)
+#hash_SOURCES = disktail.c
+#hash_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS)
-lookup_SOURCES = lookup.c
-lookup_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS)
+#noinst_PROGRAMS = mkindex lookup
+#
+#mkindex_SOURCES = mkindex.c
+#mkindex_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS)
+#
+#lookup_SOURCES = lookup.c
+#lookup_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS)
diff --git a/libibex/block.c b/libibex/block.c
new file mode 100644
index 0000000000..53579523bd
--- /dev/null
+++ b/libibex/block.c
@@ -0,0 +1,481 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
+ *
+ * Copyright (C) 2000 Helix Code, Inc.
+ *
+ * Authors: Michael Zucchi <notzed@helixcode.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public License
+ * as published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with the Gnome Library; see the file COPYING.LIB. If not,
+ * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/*
+ block file/cache/utility functions
+*/
+
+#include <stdio.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <glib.h>
+
+#include "block.h"
+
+#define d(x)
+/*#define DEBUG*/
+
+
+#ifdef IBEX_STATS
+static void
+init_stats(struct _memcache *index)
+{
+ index->stats = g_hash_table_new(g_direct_hash, g_direct_equal);
+}
+
+static void
+dump_1_stat(int id, struct _stat_info *info, struct _memcache *index)
+{
+ printf("%d %d %d %d %d\n", id, info->read, info->write, info->cache_hit, info->cache_miss);
+}
+
+static void
+dump_stats(struct _memcache *index)
+{
+ printf("Block reads writes hits misses\n");
+ g_hash_table_foreach(index->stats, dump_1_stat, index);
+}
+
+static void
+add_read(struct _memcache *index, int id)
+{
+ struct _stat_info *info;
+
+ info = g_hash_table_lookup(index->stats, (void *)id);
+ if (info == NULL) {
+ info = g_malloc0(sizeof(*info));
+ g_hash_table_insert(index->stats, (void *)id, info);
+ }
+ info->read++;
+}
+
+static void
+add_write(struct _memcache *index, int id)
+{
+ struct _stat_info *info;
+
+ info = g_hash_table_lookup(index->stats, (void *)id);
+ if (info == NULL) {
+ info = g_malloc0(sizeof(*info));
+ g_hash_table_insert(index->stats, (void *)id, info);
+ }
+ info->write++;
+}
+
+static void
+add_hit(struct _memcache *index, int id)
+{
+ struct _stat_info *info;
+
+ info = g_hash_table_lookup(index->stats, (void *)id);
+ if (info == NULL) {
+ info = g_malloc0(sizeof(*info));
+ g_hash_table_insert(index->stats, (void *)id, info);
+ }
+ info->cache_hit++;
+}
+
+static void
+add_miss(struct _memcache *index, int id)
+{
+ struct _stat_info *info;
+
+ info = g_hash_table_lookup(index->stats, (void *)id);
+ if (info == NULL) {
+ info = g_malloc0(sizeof(*info));
+ g_hash_table_insert(index->stats, (void *)id, info);
+ }
+ info->cache_miss++;
+}
+#endif /* IBEX_STATS */
+
+
+/* simple list routines (for simplified memory management of cache/lists) */
+
+/**
+ * ibex_list_new:
+ * @v:
+ *
+ * Initialise a list header. A list header must always be initialised
+ * before use.
+ **/
+void ibex_list_new(struct _list *v)
+{
+ v->head = (struct _listnode *)&v->tail;
+ v->tail = 0;
+ v->tailpred = (struct _listnode *)&v->head;
+}
+
+/**
+ * ibex_list_addhead:
+ * @l: List.
+ * @n: Node to append.
+ *
+ * Prepend a listnode to the head of the list @l.
+ *
+ * Return value: Always @n.
+ **/
+struct _listnode *ibex_list_addhead(struct _list *l, struct _listnode *n)
+{
+ n->next = l->head;
+ n->prev = (struct _listnode *)&l->head;
+ l->head->prev = n;
+ l->head = n;
+ return n;
+}
+
+/**
+ * ibex_list_addtail:
+ * @l:
+ * @n:
+ *
+ * Append a listnode to the end of the list @l.
+ *
+ * Return value: Always the same as @n.
+ **/
+struct _listnode *ibex_list_addtail(struct _list *l, struct _listnode *n)
+{
+ n->next = (struct _listnode *)&l->tail;
+ n->prev = l->tailpred;
+ l->tailpred->next = n;
+ l->tailpred = n;
+ return n;
+}
+
+/**
+ * ibex_list_remove:
+ * @n: The node to remove.
+ *
+ * Remove a listnode from a list.
+ *
+ * Return value: Always the same as @n.
+ **/
+struct _listnode *ibex_list_remove(struct _listnode *n)
+{
+ n->next->prev = n->prev;
+ n->prev->next = n->next;
+ return n;
+}
+
+static struct _memblock *
+memblock_addr(struct _block *block)
+{
+ return (struct _memblock *)(((char *)block) - G_STRUCT_OFFSET(struct _memblock, data));
+}
+
+/**
+ * ibex_block_dirty:
+ * @block:
+ *
+ * Dirty a block. This will cause it to be written to disk on
+ * a cache sync, or when the block is flushed from the cache.
+ **/
+void
+ibex_block_dirty(struct _block *block)
+{
+ memblock_addr(block)->flags |= BLOCK_DIRTY;
+}
+
+static void
+sync_block(struct _memcache *block_cache, struct _memblock *memblock)
+{
+ lseek(block_cache->fd, memblock->block, SEEK_SET);
+ if (write(block_cache->fd, &memblock->data, sizeof(memblock->data)) != -1) {
+ memblock->flags &= ~BLOCK_DIRTY;
+ }
+#ifdef IBEX_STATS
+ add_write(block_cache, memblock->block);
+#endif
+}
+
+/**
+ * ibex_block_cache_sync:
+ * @block_cache:
+ *
+ * Ensure the block cache is fully synced to disk.
+ **/
+void
+ibex_block_cache_sync(struct _memcache *block_cache)
+{
+ struct _memblock *memblock;
+
+ memblock = (struct _memblock *)block_cache->nodes.head;
+ while (memblock->next) {
+ if (memblock->flags & BLOCK_DIRTY) {
+ sync_block(block_cache, memblock);
+ }
+ memblock = memblock->next;
+ }
+ fsync(block_cache->fd);
+
+#ifdef IBEX_STATS
+ dump_stats(block_cache);
+#endif
+
+}
+
+/**
+ * ibex_block_cache_flush:
+ * @block_cache:
+ *
+ * Ensure the block cache is fully synced to disk, and then flush
+ * its contents from memory.
+ **/
+void
+ibex_block_cache_flush(struct _memcache *block_cache)
+{
+ struct _memblock *mw, *mn;
+
+ ibex_block_cache_sync(block_cache);
+
+ mw = (struct _memblock *)block_cache->nodes.head;
+ mn = mw->next;
+ while (mn) {
+ g_hash_table_remove(block_cache->index, (void *)mw->block);
+ g_free(mw);
+ mw = mn;
+ mn = mn->next;
+ }
+ ibex_list_new(&block_cache->nodes);
+}
+
+
+/**
+ * ibex_block_read:
+ * @block_cache:
+ * @blockid:
+ *
+ * Read the data of a block by blockid. The data contents is backed by
+ * the block cache, and should be considered static.
+ *
+ * TODO; should this return a NULL block on error?
+ *
+ * Return value: The address of the block data (which may be cached).
+ **/
+struct _block *
+ibex_block_read(struct _memcache *block_cache, blockid_t blockid)
+{
+ struct _memblock *memblock;
+
+ {
+ /* assert blockid<roof */
+ if (blockid > 0) {
+ struct _root *root = (struct _root *)ibex_block_read(block_cache, 0);
+ g_assert(blockid < root->roof);
+ }
+ }
+
+ memblock = g_hash_table_lookup(block_cache->index, (void *)blockid);
+ if (memblock) {
+ d(printf("foudn blockid in cache %d = %p\n", blockid, &memblock->data));
+ /* 'access' page */
+ ibex_list_remove((struct _listnode *)memblock);
+ ibex_list_addtail(&block_cache->nodes, (struct _listnode *)memblock);
+#ifdef IBEX_STATS
+ add_hit(block_cache, memblock->block);
+#endif
+ return &memblock->data;
+ }
+#ifdef IBEX_STATS
+ add_miss(block_cache, blockid);
+ add_read(block_cache, blockid);
+#endif
+ d(printf("loading blockid from disk %d\n", blockid));
+ memblock = g_malloc(sizeof(*memblock));
+ memblock->block = blockid;
+ memblock->flags = 0;
+ lseek(block_cache->fd, blockid, SEEK_SET);
+ memset(&memblock->data, 0, sizeof(memblock->data));
+ read(block_cache->fd, &memblock->data, sizeof(memblock->data));
+ ibex_list_addtail(&block_cache->nodes, (struct _listnode *)memblock);
+ g_hash_table_insert(block_cache->index, (void *)blockid, memblock);
+ if (block_cache->count >= CACHE_SIZE) {
+ struct _memblock *old = (struct _memblock *)block_cache->nodes.head;
+ d(printf("discaring cache block %d\n", old->block));
+ g_hash_table_remove(block_cache->index, (void *)old->block);
+ ibex_list_remove((struct _listnode *)old);
+ if (old->flags & BLOCK_DIRTY) {
+ sync_block(block_cache, old);
+ }
+ g_free(old);
+ } else {
+ block_cache->count++;
+ }
+
+ d(printf(" --- cached blocks : %d\n", block_cache->count));
+
+ return &memblock->data;
+}
+
+/**
+ * ibex_block_cache_open:
+ * @name:
+ * @flags: Flags as to open(2), should use O_RDWR and optionally O_CREAT.
+ * @mode: Mose as to open(2)
+ *
+ * Open a block file.
+ *
+ * FIXME; this currently also initialises the word and name indexes
+ * because their pointers are stored in the root block. Should be
+ * upto the caller to manage these pointers/data.
+ *
+ * Return value: NULL if the backing file could not be opened.
+ **/
+struct _memcache *
+ibex_block_cache_open(const char *name, int flags, int mode)
+{
+ struct _root *root;
+ struct _memcache *block_cache = g_malloc0(sizeof(*block_cache));
+
+ d(printf("opening ibex file: %s", name));
+
+ /* setup cache */
+ ibex_list_new(&block_cache->nodes);
+ block_cache->count = 0;
+ block_cache->index = g_hash_table_new(g_direct_hash, g_direct_equal);
+ block_cache->fd = open(name, flags, mode);
+
+ if (block_cache->fd == -1) {
+ g_hash_table_destroy(block_cache->index);
+ g_free(block_cache);
+ return NULL;
+ }
+
+ root = (struct _root *)ibex_block_read(block_cache, 0);
+ if (root->roof == 0 || memcmp(root->version, "ibx3", 4)) {
+ d(printf("Initialising superblock\n"));
+ /* reset root data */
+ memcpy(root->version, "ibx3", 4);
+ root->roof = 1024;
+ root->free = 0;
+ root->words = 0;
+ root->names = 0;
+ root->tail = 0; /* list of tail blocks */
+ ibex_block_dirty((struct _block *)root);
+ } else {
+ d(printf("superblock already initialised:\n"
+ " roof = %d\n free = %d\n words = %d\n names = %d\n",
+ root->roof, root->free, root->words, root->names));
+ }
+ /* this should be moved higher up */
+ {
+ struct _IBEXWord *ibex_create_word_index(struct _memcache *bc, blockid_t *wordroot, blockid_t *nameroot);
+
+ block_cache->words = ibex_create_word_index(block_cache, &root->words, &root->names);
+ }
+
+#ifdef IBEX_STATS
+ init_stats(block_cache);
+#endif
+
+ return block_cache;
+}
+
+/**
+ * ibex_block_cache_close:
+ * @block_cache:
+ *
+ * Close the block file, sync any remaining cached data
+ * to disk, and free all resources.
+ **/
+void
+ibex_block_cache_close(struct _memcache *block_cache)
+{
+ struct _memblock *mw, *mn;
+
+ ibex_block_cache_sync(block_cache);
+ close(block_cache->fd);
+
+ mw = (struct _memblock *)block_cache->nodes.head;
+ mn = mw->next;
+ while (mn) {
+ g_free(mw);
+ mw = mn;
+ mn = mw->next;
+ }
+
+ g_hash_table_destroy(block_cache->index);
+
+ g_free(block_cache);
+}
+
+/**
+ * ibex_block_free:
+ * @block_cache:
+ * @blockid:
+ *
+ * Return a block to the free pool.
+ **/
+void
+ibex_block_free(struct _memcache *block_cache, blockid_t blockid)
+{
+ struct _root *root = (struct _root *)ibex_block_read(block_cache, 0);
+ struct _block *block = ibex_block_read(block_cache, blockid);
+
+ block->next = root->free;
+ root->free = blockid;
+ ibex_block_dirty((struct _block *)root);
+ ibex_block_dirty((struct _block *)block);
+}
+
+/**
+ * ibex_block_get:
+ * @block_cache:
+ *
+ * Allocate a new block, or access a previously freed block and return
+ * its block id. The block will have zeroed contents.
+ *
+ * Return value: 0 if there are no blocks left (disk full/read only
+ * file, etc).
+ **/
+blockid_t
+ibex_block_get(struct _memcache *block_cache)
+{
+ struct _root *root = (struct _root *)ibex_block_read(block_cache, 0);
+ struct _block *block;
+ blockid_t head;
+
+ if (root->free) {
+ head = root->free;
+ block = ibex_block_read(block_cache, head);
+ root->free = block_location(block->next);
+ } else {
+ /* TODO: check the block will fit first */
+ /* TODO: no need to read this block, can allocate it manually (saves a syscall/read) */
+ head = root->roof;
+ root->roof += BLOCK_SIZE;
+ block = ibex_block_read(block_cache, head);
+ }
+
+ g_assert(head != 0);
+
+ d(printf("new block = %d\n", head));
+ block->next = 0;
+ block->used = 0;
+ ibex_block_dirty(block);
+ ibex_block_dirty((struct _block *)root);
+ return head;
+}
diff --git a/libibex/block.h b/libibex/block.h
new file mode 100644
index 0000000000..40262de6e2
--- /dev/null
+++ b/libibex/block.h
@@ -0,0 +1,102 @@
+
+/* public interfaces for block io routines */
+
+#ifndef _BLOCK_H
+#define _BLOCK_H
+
+/*#define IBEX_STATS*/ /* define to get/dump block access stats */
+
+#include <glib.h>
+
+typedef guint32 nameid_t;
+typedef guint32 blockid_t;
+
+#define BLOCK_BITS (8)
+#define BLOCK_SIZE (1<<BLOCK_BITS)
+#define CACHE_SIZE 256 /* blocks in disk cache */
+
+/* root block */
+struct _root {
+ char version[4];
+
+ blockid_t free; /* list of free blocks */
+ blockid_t roof; /* top of allocated space, everything below is in a free or used list */
+ blockid_t tail; /* list of 'tail' blocks */
+
+ blockid_t words; /* root of words index */
+ blockid_t names; /* root of names index */
+};
+
+/* basic disk structure for (data) blocks */
+struct _block {
+ unsigned int next:32-BLOCK_BITS; /* next block */
+ unsigned int used:BLOCK_BITS; /* number of elements used */
+
+ nameid_t bl_data[(BLOCK_SIZE-4)/4]; /* references */
+};
+
+/* custom list structure, for a simple/efficient cache */
+struct _listnode {
+ struct _listnode *next;
+ struct _listnode *prev;
+};
+struct _list {
+ struct _listnode *head;
+ struct _listnode *tail;
+ struct _listnode *tailpred;
+};
+
+void ibex_list_new(struct _list *v);
+struct _listnode *ibex_list_addhead(struct _list *l, struct _listnode *n);
+struct _listnode *ibex_list_addtail(struct _list *l, struct _listnode *n);
+struct _listnode *ibex_list_remove(struct _listnode *n);
+
+/* in-memory structure for block cache */
+struct _memblock {
+ struct _memblock *next;
+ struct _memblock *prev;
+
+ blockid_t block;
+ int flags;
+
+ struct _block data;
+};
+#define BLOCK_DIRTY (1<<0)
+
+struct _memcache {
+ struct _list nodes;
+ int count; /* nodes in cache */
+
+ GHashTable *index; /* blockid->memblock mapping */
+ int fd; /* file fd */
+
+#ifdef IBEX_STATS
+ GHashTable *stats;
+#endif
+ /* temporary here */
+ struct _IBEXWord *words; /* word index */
+};
+
+#ifdef IBEX_STATS
+struct _stat_info {
+ int read;
+ int write;
+ int cache_hit;
+ int cache_miss;
+};
+#endif /* IBEX_STATS */
+
+struct _memcache *ibex_block_cache_open(const char *name, int flags, int mode);
+void ibex_block_cache_close(struct _memcache *block_cache);
+void ibex_block_cache_sync(struct _memcache *block_cache);
+void ibex_block_cache_flush(struct _memcache *block_cache);
+
+blockid_t ibex_block_get(struct _memcache *block_cache);
+void ibex_block_free(struct _memcache *block_cache, blockid_t blockid);
+void ibex_block_dirty(struct _block *block);
+struct _block *ibex_block_read(struct _memcache *block_cache, blockid_t blockid);
+
+#define block_number(x) ((x)>>BLOCK_BITS)
+#define block_location(x) ((x)<<BLOCK_BITS)
+
+#endif /* ! _BLOCK_H */
diff --git a/libibex/diskarray.c b/libibex/diskarray.c
new file mode 100644
index 0000000000..2664e0f6b5
--- /dev/null
+++ b/libibex/diskarray.c
@@ -0,0 +1,255 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
+ *
+ * Copyright (C) 2000 Helix Code, Inc.
+ *
+ * Authors: Michael Zucchi <notzed@helixcode.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public License
+ * as published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with the Gnome Library; see the file COPYING.LIB. If not,
+ * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/* a disk based array storage class */
+
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <glib.h>
+
+#include "block.h"
+#include "index.h"
+
+#define d(x)
+/*#define DEBUG*/
+
+static struct _IBEXStore *disk_create(struct _memcache *bc);
+static int disk_sync(struct _IBEXStore *store);
+static int disk_close(struct _IBEXStore *store);
+
+static blockid_t disk_add(struct _IBEXStore *store, blockid_t head, nameid_t data);
+static blockid_t disk_add_list(struct _IBEXStore *store, blockid_t head, GArray *data);
+static blockid_t disk_remove(struct _IBEXStore *store, blockid_t head, nameid_t data);
+static void disk_free(struct _IBEXStore *store, blockid_t head);
+
+static gboolean disk_find(struct _IBEXStore *store, blockid_t head, nameid_t data);
+static GArray *disk_get(struct _IBEXStore *store, blockid_t head);
+
+struct _IBEXStoreClass ibex_diskarray_class = {
+ disk_create, disk_sync, disk_close,
+ disk_add, disk_add_list,
+ disk_remove, disk_free,
+ disk_find, disk_get
+};
+
+static struct _IBEXStore *disk_create(struct _memcache *bc)
+{
+ struct _IBEXStore *store;
+
+ store = g_malloc0(sizeof(*store));
+ store->klass = &ibex_diskarray_class;
+ store->blocks = bc;
+
+ return store;
+}
+
+static int disk_sync(struct _IBEXStore *store)
+{
+ /* no cache, nop */
+ return 0;
+}
+
+static int disk_close(struct _IBEXStore *store)
+{
+ g_free(store);
+ return 0;
+}
+
+static blockid_t
+disk_add(struct _IBEXStore *store, blockid_t head, nameid_t data)
+{
+ struct _block *block;
+ struct _block *newblock;
+ blockid_t new;
+
+ if (head == 0) {
+ head = ibex_block_get(store->blocks);
+ }
+ block = ibex_block_read(store->blocks, head);
+
+ d(printf("adding record %d to block %d (next = %d)\n", data, head, block->next));
+
+ if (block->used < sizeof(block->bl_data)/sizeof(block->bl_data[0])) {
+ d(printf("adding record into block %d %d\n", head, data));
+ block->bl_data[block->used] = data;
+ block->used++;
+ ibex_block_dirty(block);
+ return head;
+ } else {
+ new = ibex_block_get(store->blocks);
+ newblock = ibex_block_read(store->blocks, new);
+ newblock->next = head;
+ newblock->bl_data[0] = data;
+ newblock->used = 1;
+ d(printf("adding record into new %d %d, next =%d\n", new, data, newblock->next));
+ ibex_block_dirty(newblock);
+ return new;
+ }
+}
+
+static blockid_t
+disk_add_list(struct _IBEXStore *store, blockid_t head, GArray *data)
+{
+ struct _block *block;
+ struct _block *newblock;
+ blockid_t new;
+ int copied = 0;
+ int left, space, tocopy;
+
+ if (head == 0) {
+ head = ibex_block_get(store->blocks);
+ }
+ block = ibex_block_read(store->blocks, head);
+
+ while (copied < data->len) {
+ left = data->len - copied;
+ space = sizeof(block->bl_data)/sizeof(block->bl_data[0]) - block->used;
+ if (space) {
+ tocopy = MIN(left, space);
+ memcpy(block->bl_data+block->used, &g_array_index(data, blockid_t, copied), tocopy*sizeof(blockid_t));
+ block->used += tocopy;
+ ibex_block_dirty(block);
+ } else {
+ new = ibex_block_get(store->blocks);
+ newblock = ibex_block_read(store->blocks, new);
+ newblock->next = head;
+ tocopy = MIN(left, sizeof(block->bl_data)/sizeof(block->bl_data[0]));
+ memcpy(newblock->bl_data, &g_array_index(data, blockid_t, copied), tocopy*sizeof(blockid_t));
+ newblock->used = tocopy;
+ block = newblock;
+ head = new;
+ ibex_block_dirty(newblock);
+ }
+ copied += tocopy;
+ }
+ return head;
+}
+
+static blockid_t
+disk_remove(struct _IBEXStore *store, blockid_t head, nameid_t data)
+{
+ blockid_t node = head;
+
+ d(printf("removing %d from %d\n", data, head));
+ while (node) {
+ struct _block *block = ibex_block_read(store->blocks, node);
+ int i;
+
+ for (i=0;i<block->used;i++) {
+ if (block->bl_data[i] == data) {
+ struct _block *start = ibex_block_read(store->blocks, head);
+
+ start->used--;
+ block->bl_data[i] = start->bl_data[start->used];
+ if (start->used == 0) {
+ struct _root *root = (struct _root *)ibex_block_read(store->blocks, 0);
+ blockid_t new;
+
+ d(printf("dropping block %d, new head = %d\n", head, start->next));
+ new = start->next;
+ start->next = root->free;
+ root->free = head;
+ head = new;
+ ibex_block_dirty((struct _block *)root);
+ }
+ ibex_block_dirty(block);
+ ibex_block_dirty(start);
+ return head;
+ }
+ }
+ node = block->next;
+ }
+ return head;
+}
+
+static void disk_free(struct _IBEXStore *store, blockid_t head)
+{
+ blockid_t next;
+ struct _block *block;
+
+ while (head) {
+ block = ibex_block_read(store->blocks, head);
+ next = block->next;
+ ibex_block_free(store->blocks, head);
+ head = next;
+ }
+}
+
+static gboolean
+disk_find(struct _IBEXStore *store, blockid_t head, nameid_t data)
+{
+ blockid_t node = head;
+
+ d(printf("finding %d from %d\n", data, head));
+ while (node) {
+ struct _block *block = ibex_block_read(store->blocks, node);
+ int i;
+
+ for (i=0;i<block->used;i++) {
+ if (block->bl_data[i] == data) {
+ return TRUE;
+ }
+ }
+ node = block->next;
+ }
+ return FALSE;
+}
+
+static GArray *
+disk_get(struct _IBEXStore *store, blockid_t head)
+{
+ GArray *result = g_array_new(0, 0, sizeof(nameid_t));
+
+ while (head) {
+ struct _block *block = ibex_block_read(store->blocks, head);
+
+ d(printf("getting data from block %d\n", head));
+
+ g_array_append_vals(result, block->bl_data, block->used);
+ head = block->next;
+ d(printf("next = %d\n", head));
+ }
+ return result;
+}
+
+void
+ibex_diskarray_dump(struct _memcache *blocks, blockid_t head)
+{
+ blockid_t node = head;
+
+ printf("dumping list %d\n", node);
+ while (node) {
+ struct _block *block = ibex_block_read(blocks, node);
+ int i;
+
+ printf(" block %d used %d\n ", node, block->used);
+ for (i=0;i<block->used;i++) {
+ printf(" %d", block->bl_data[i]);
+ }
+ printf("\n");
+ node = block->next;
+ }
+}
diff --git a/libibex/disktail.c b/libibex/disktail.c
new file mode 100644
index 0000000000..90562889de
--- /dev/null
+++ b/libibex/disktail.c
@@ -0,0 +1,807 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
+ *
+ * Copyright (C) 2000 Helix Code, Inc.
+ *
+ * Authors: Michael Zucchi <notzed@helixcode.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public License
+ * as published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with the Gnome Library; see the file COPYING.LIB. If not,
+ * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/* a disk based array storage class that stores the tails of data lists
+ in common blocks */
+
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <glib.h>
+#include <string.h>
+
+#include <stdio.h>
+
+#include "block.h"
+#include "index.h"
+
+#define d(x)
+/*#define DEBUG*/
+
+/* marker to define which root keys indicate a single length key */
+#define BLOCK_ONE (1<<BLOCK_BITS)
+
+/* tail blocks only contain tail data ... */
+/* and we pack it in, similar to the key data, only more compact */
+struct _tailblock {
+ unsigned int next:32-BLOCK_BITS; /* only needs to point to block numbers */
+ unsigned int used:BLOCK_BITS; /* how many entries are used */
+ union {
+ unsigned char offset[BLOCK_SIZE-4]; /* works upto blocksize of 1024 bytes */
+ nameid_t data[(BLOCK_SIZE-4)/4];
+ } tailblock_u;
+};
+#define tb_offset tailblock_u.offset
+#define tb_data tailblock_u.data
+
+/* map a tail index to a block index */
+#define TAIL_INDEX(b) ((b) & (BLOCK_SIZE-1))
+/* map a tail index to a block number */
+#define TAIL_BLOCK(b) ((b) & ~(BLOCK_SIZE-1))
+/* map a block + index to a tailid */
+#define TAIL_KEY(b, i) (((b) & ~(BLOCK_SIZE-1)) | ((i) & (BLOCK_SIZE-1)))
+
+#define TAIL_THRESHOLD ((BLOCK_SIZE-8)/6)
+
+static struct _IBEXStore *disk_create(struct _memcache *bc);
+static int disk_sync(struct _IBEXStore *store);
+static int disk_close(struct _IBEXStore *store);
+
+static blockid_t disk_add(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, nameid_t data);
+static blockid_t disk_add_list(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, GArray *data);
+static blockid_t disk_remove(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, nameid_t data);
+static void disk_free(struct _IBEXStore *store, blockid_t head, blockid_t tail);
+
+static gboolean disk_find(struct _IBEXStore *store, blockid_t head, blockid_t tail, nameid_t data);
+static GArray *disk_get(struct _IBEXStore *store, blockid_t head, blockid_t tail);
+
+struct _IBEXStoreClass ibex_diskarray_class = {
+ disk_create, disk_sync, disk_close,
+ disk_add, disk_add_list,
+ disk_remove, disk_free,
+ disk_find, disk_get
+};
+
+
+static int
+tail_info(struct _tailblock *bucket, nameid_t tailid, blockid_t **startptr)
+{
+ blockid_t *start, *end;
+ int index;
+
+ /* get start/end of area to zap */
+ index = TAIL_INDEX(tailid);
+ start = &bucket->tb_data[bucket->tb_offset[index]];
+ if (index == 0) {
+ end = &bucket->tb_data[sizeof(bucket->tb_data)/sizeof(bucket->tb_data[0])];
+ } else {
+ end = &bucket->tb_data[bucket->tb_offset[index-1]];
+ }
+ if (startptr)
+ *startptr = start;
+ return end-start;
+}
+
+/* compresses (or expand) the bucket entry, to the new size */
+static void
+tail_compress(struct _tailblock *bucket, int index, int newsize)
+{
+ int i;
+ blockid_t *start, *end, *newstart;
+
+ /* get start/end of area to zap */
+ start = &bucket->tb_data[bucket->tb_offset[index]];
+ if (index == 0) {
+ end = &bucket->tb_data[sizeof(bucket->tb_data)/sizeof(bucket->tb_data[0])];
+ } else {
+ end = &bucket->tb_data[bucket->tb_offset[index-1]];
+ }
+
+ if (end-start == newsize)
+ return;
+
+ /*
+ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXyyyyy
+ 0 20 25
+
+ newsize = 0
+ end = 25
+ newstart = 0
+ start = 20
+
+ newstart+(end-start)-newsize = 5
+ i = start-newstart
+
+ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXyyyyy
+ 0 20 25
+
+ newsize = 2
+ end = 25
+ newstart = 0
+ start = 20
+
+ newstart+(end-start)-newsize = 3
+ i = start-newstart + MIN(end-start, newsize)) = 22
+
+ XXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+ 5 25
+ newsize = 5
+ end = 25
+ start = 25
+ newstart = 5
+
+ newstart+(end-start)-newsize = 0
+ i = start-newstart = 20
+
+ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXyy
+ 3 23 25
+ newsize = 5
+ end = 25
+ start = 23
+ newstart = 3
+
+ newstart+(end-start)-newsize = 0
+ i = start-newstart + MIN(end-start, newsize) = 22
+
+ */
+
+
+ /* fixup data */
+ newstart = &bucket->tb_data[bucket->tb_offset[bucket->used-1]];
+
+ g_assert(newstart+(end-start)-newsize <= &bucket->tb_data[sizeof(bucket->tb_data)/sizeof(bucket->tb_data[0])]);
+ g_assert(newstart + (start-newstart) + MIN(end-start, newsize) <= &bucket->tb_data[sizeof(bucket->tb_data)/sizeof(bucket->tb_data[0])]);
+ g_assert(newstart+(end-start)-newsize >= &bucket->tb_offset[bucket->used]);
+ g_assert(newstart + (start-newstart) + MIN(end-start, newsize) >= &bucket->tb_offset[bucket->used]);
+
+ memmove(newstart+(end-start)-newsize, newstart, ((start-newstart)+MIN(end-start, newsize)) * sizeof(blockid_t));
+
+ /* fixup key pointers */
+ for (i=index;i<bucket->used;i++) {
+ bucket->tb_offset[i] += (end-start)-newsize;
+ }
+ ibex_block_dirty((struct _block *)bucket);
+}
+
+/*
+ returns the number of blockid's free
+*/
+static int
+tail_space(struct _tailblock *tail)
+{
+ if (tail->used == 0)
+ return sizeof(tail->tb_data)/sizeof(tail->tb_data[0])-1;
+
+ return &tail->tb_data[tail->tb_offset[tail->used-1]]
+ - (blockid_t *)&tail->tb_offset[tail->used];
+}
+
+static void
+tail_dump(struct _memcache *blocks, blockid_t tailid)
+{
+ int i;
+ struct _tailblock *tail = (struct _tailblock *)ibex_block_read(blocks, TAIL_BLOCK(tailid));
+
+ printf("Block %d, used %d\n", tailid, tail->used);
+ for (i=0;i<sizeof(struct _tailblock)/sizeof(unsigned int);i++) {
+ printf(" %08x", ((unsigned int *)tail)[i]);
+ }
+ printf("\n");
+}
+
+static blockid_t
+tail_get(struct _memcache *blocks, int size)
+{
+ struct _root *root = (struct _root *)ibex_block_read(blocks, 0);
+ blockid_t tailid;
+ struct _tailblock *tail;
+ int freeindex;
+ blockid_t *end;
+ int i, count = 0;
+
+ d(printf("looking for a tail node with %d items in it\n", size));
+
+ /* look for a node with enough space, if we dont find it fairly
+ quickly, just quit. needs a better free algorithm i think ... */
+ tailid = root->tail;
+ while (tailid && count<5) {
+ int space;
+
+ d(printf(" checking tail node %d\n", tailid));
+
+ tail = (struct _tailblock *)ibex_block_read(blocks, tailid);
+ if (tail->used == 0) {
+ /* assume its big enough ... */
+ tail->used = 1;
+ tail->tb_offset[0] = sizeof(tail->tb_data)/sizeof(tail->tb_data[0]) - size;
+ d(printf("allocated %d (%d), used %d\n", tailid, tailid, tail->used));
+ ibex_block_dirty((struct _block *)tail);
+ return tailid;
+ }
+
+ /* see if we have a free slot first */
+ freeindex = -1;
+ end = &tail->tb_data[sizeof(tail->tb_data)/sizeof(tail->tb_data[0])];
+ for (i=0;i<tail->used;i++) {
+ if (end == &tail->tb_data[tail->tb_offset[i]]) {
+ freeindex = i;
+ break;
+ }
+ end = &tail->tb_data[tail->tb_offset[i]];
+ }
+
+ /* determine how much space we have available - incl any extra header we might need */
+ space = ((char *)&tail->tb_data[tail->tb_offset[tail->used-1]])
+ - ((char *)&tail->tb_offset[tail->used])
+ /* FIXMEL work out why this is out a little bit */
+ - 8;
+ if (freeindex == -1)
+ space -= sizeof(tail->tb_offset[0]);
+
+ /* if we have enough, set it up, creating a new entry if necessary */
+ /* for some really odd reason the compiler promotes this expression to unsigned, hence
+ the requirement for the space>0 check ... */
+ if (space>0 && space > size*sizeof(blockid_t)) {
+ d(printf("space = %d, size = %d size*sizeof() = %d truth = %d\n", space, size, size*sizeof(blockid_t), space>size*sizeof(blockid_t)));
+ if (freeindex == -1) {
+ freeindex = tail->used;
+ tail->tb_offset[tail->used] = tail->tb_offset[tail->used-1];
+ tail->used++;
+ }
+ tail_compress(tail, freeindex, size);
+ ibex_block_dirty((struct _block *)tail);
+ d(printf("allocated %d (%d), used %d\n", tailid, TAIL_KEY(tailid, freeindex), tail->used));
+ return TAIL_KEY(tailid, freeindex);
+ }
+ count++;
+ tailid = block_location(tail->next);
+ }
+
+ d(printf("allocating new data node for tail data\n"));
+ /* didn't find one, setup a new one */
+ root = (struct _root *)ibex_block_read(blocks, 0);
+ tailid = ibex_block_get(blocks);
+ tail = (struct _tailblock *)ibex_block_read(blocks, tailid);
+ tail->next = block_number(root->tail);
+ root->tail = tailid;
+ tail->used = 1;
+ tail->tb_offset[0] = sizeof(tail->tb_data)/sizeof(tail->tb_data[0]) - size;
+ ibex_block_dirty((struct _block *)tail);
+ d(printf("allocated %d (%d), used %d\n", tailid, TAIL_KEY(tailid, 0), tail->used));
+ return TAIL_KEY(tailid, 0);
+}
+
+static void
+tail_free(struct _memcache *blocks, blockid_t tailid)
+{
+ struct _tailblock *tail;
+
+ d(printf("freeing tail id %d\n", tailid));
+
+ if (tailid == 0)
+ return;
+
+ tail = (struct _tailblock *)ibex_block_read(blocks, TAIL_BLOCK(tailid));
+ d(printf(" tail %d used %d\n", tailid, tail->used));
+ g_assert(tail->used);
+ if (TAIL_INDEX(tailid) == tail->used - 1) {
+ tail->used --;
+ } else {
+ tail_compress(tail, TAIL_INDEX(tailid), 0);
+ }
+ ibex_block_dirty((struct _block *)tail);
+}
+
+static struct _IBEXStore *disk_create(struct _memcache *bc)
+{
+ struct _IBEXStore *store;
+
+ store = g_malloc0(sizeof(*store));
+ store->klass = &ibex_diskarray_class;
+ store->blocks = bc;
+
+ return store;
+}
+
+static int disk_sync(struct _IBEXStore *store)
+{
+ /* no cache, nop */
+ return 0;
+}
+
+static int disk_close(struct _IBEXStore *store)
+{
+ g_free(store);
+ return 0;
+}
+
+static blockid_t
+disk_add(struct _IBEXStore *store, blockid_t *headptr, blockid_t *tailptr, nameid_t data)
+{
+ struct _block *block;
+ struct _block *newblock;
+ blockid_t new, head = *headptr /*, tail = *tailptr*/;
+
+ g_error("Inbimplemented");
+
+ if (head == 0) {
+ head = ibex_block_get(store->blocks);
+ }
+ block = ibex_block_read(store->blocks, head);
+
+ d(printf("adding record %d to block %d (next = %d)\n", data, head, block->next));
+
+ if (block->used < sizeof(block->bl_data)/sizeof(block->bl_data[0])) {
+ (printf("adding record into block %d %d\n", head, data));
+ block->bl_data[block->used] = data;
+ block->used++;
+ ibex_block_dirty(block);
+ return head;
+ } else {
+ new = ibex_block_get(store->blocks);
+ newblock = ibex_block_read(store->blocks, new);
+ newblock->next = block_number(head);
+ newblock->bl_data[0] = data;
+ newblock->used = 1;
+ d(printf("adding record into new %d %d, next =%d\n", new, data, newblock->next));
+ ibex_block_dirty(newblock);
+ return new;
+ }
+}
+
+static blockid_t
+disk_add_blocks_internal(struct _IBEXStore *store, blockid_t *headptr, blockid_t *tailptr, GArray *data)
+{
+ blockid_t head = *headptr, tail = *tailptr, new;
+ int tocopy;
+ struct _tailblock *tailblock;
+ struct _block *block, *newblock;
+ int space, copied = 0, left;
+
+ /* assumes this funciton is in control of any tail creation */
+ g_assert(tail == 0);
+
+ d(printf("Adding %d items to block list\n", data->len));
+
+ if (head == 0) {
+ head = ibex_block_get(store->blocks);
+ d(printf("allocating new head %d\n", head));
+ }
+ block = ibex_block_read(store->blocks, head);
+
+ /* ensure the first block is full before proceeding */
+ space = sizeof(block->bl_data)/sizeof(block->bl_data[0]) - block->used;
+ if (space) {
+ tocopy = MIN(data->len, space);
+ memcpy(block->bl_data+block->used, &g_array_index(data, blockid_t, copied), tocopy*sizeof(blockid_t));
+ block->used += tocopy;
+ ibex_block_dirty(block);
+ copied = tocopy;
+ d(printf("copied %d items to left over last node\n", tocopy));
+ }
+
+ while (copied < data->len) {
+ left = data->len - copied;
+ /* do we drop the rest in a tail? */
+ if (left < TAIL_THRESHOLD) {
+ d(printf("Storing remaining %d items in tail\n", left));
+ tocopy = left;
+ new = tail_get(store->blocks, tocopy);
+ tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(new));
+ memcpy(&tailblock->tb_data[tailblock->tb_offset[TAIL_INDEX(new)]],
+ &g_array_index(data, blockid_t, copied), tocopy*sizeof(blockid_t));
+ *tailptr = new;
+ } else {
+ new = ibex_block_get(store->blocks);
+ newblock = (struct _block *)ibex_block_read(store->blocks, new);
+ newblock->next = block_number(head);
+ tocopy = MIN(left, sizeof(block->bl_data)/sizeof(block->bl_data[0]));
+ d(printf("storing %d items in own block\n", tocopy));
+ memcpy(newblock->bl_data, &g_array_index(data, blockid_t, copied), tocopy*sizeof(blockid_t));
+ newblock->used = tocopy;
+ block = newblock;
+ head = new;
+ ibex_block_dirty(newblock);
+ }
+ copied += tocopy;
+ }
+
+ *headptr = head;
+ return head;
+}
+/*
+ case 1:
+ no head, no tail, adding a lot of data
+ add blocks until the last, which goes in a tail node
+ case 2:
+ no head, no tail, adding a little data
+ just add a tail node
+ case 3:
+ no head, tail, total exceeds a block
+ merge as much as possible into new full blocks, then the remainder in the tail
+ case 4:
+ no head, tail, room in existing tail for data
+ add new data to tail node
+ case 5:
+ no head, tail, no room in existing tail for data
+ add a new tail node, copy data across, free old tail node
+*/
+
+static blockid_t
+disk_add_list(struct _IBEXStore *store, blockid_t *headptr, blockid_t *tailptr, GArray *data)
+{
+ blockid_t new, head = *headptr, tail = *tailptr, *start;
+ struct _tailblock *tailblock, *tailnew;
+ int len;
+ GArray *tmpdata = NULL;
+
+ d(printf("adding %d items head=%d tail=%d\n", data->len, head, tail));
+ if (data->len == 0)
+ return head;
+
+ /* store length=1 data in the tail pointer */
+ if (head == 0 && tail == 0 && data->len == 1) {
+ *headptr = BLOCK_ONE;
+ *tailptr = g_array_index(data, blockid_t, 0);
+ return BLOCK_ONE;
+ }
+ /* if we got length=1 data to append to, upgrade it to a real block list */
+ if (head == BLOCK_ONE) {
+ tmpdata = g_array_new(0, 0, sizeof(blockid_t));
+ g_array_append_vals(tmpdata, data->data, data->len);
+ g_array_append_val(tmpdata, tail);
+ head = *headptr = 0;
+ tail = *tailptr = 0;
+ }
+
+ /* if we have no head, then check the tail */
+ if (head == 0) {
+ if (tail == 0) {
+ if (data->len >= TAIL_THRESHOLD) {
+ /* add normally */
+ head = disk_add_blocks_internal(store, headptr, tailptr, data);
+ } else {
+ /* else add to a tail block */
+ new = tail_get(store->blocks, data->len);
+ d(printf("adding %d items to a tail block %d\n", data->len, new));
+ tailnew = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(new));
+ memcpy(&tailnew->tb_data[tailnew->tb_offset[TAIL_INDEX(new)]],
+ data->data, data->len*sizeof(blockid_t));
+ *tailptr = new;
+ ibex_block_dirty((struct _block *)tailnew);
+ }
+ } else {
+ tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail));
+ len = tail_info(tailblock, tail, &start);
+ /* case 3 */
+ if (len + data->len >= TAIL_THRESHOLD) {
+ /* this is suboptimal, but should work - merge the tail data with
+ our new data, and write it out */
+ if (tmpdata == NULL) {
+ tmpdata = g_array_new(0, 0, sizeof(blockid_t));
+ g_array_append_vals(tmpdata, data->data, data->len);
+ }
+ g_array_append_vals(tmpdata, start, len);
+ *tailptr = 0;
+ tail_free(store->blocks, tail);
+ head = disk_add_blocks_internal(store, headptr, tailptr, tmpdata);
+ } else if (tail_space(tailblock) >= data->len) {
+ /* can we just expand this in our node, or do we need a new one? */
+ tail_compress(tailblock, TAIL_INDEX(tail), data->len + len);
+ memcpy(&tailblock->tb_data[tailblock->tb_offset[TAIL_INDEX(tail)] + len],
+ data->data, data->len * sizeof(blockid_t));
+ ibex_block_dirty((struct _block *)tailblock);
+ } else {
+ /* we need to allocate a new tail node */
+ new = tail_get(store->blocks, data->len + len);
+ /* and copy the data across */
+ tailnew = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(new));
+ memcpy(&tailnew->tb_data[tailnew->tb_offset[TAIL_INDEX(new)]],
+ start, len*sizeof(blockid_t));
+ memcpy(&tailnew->tb_data[tailnew->tb_offset[TAIL_INDEX(new)]+len],
+ data->data, data->len*sizeof(blockid_t));
+ tail_free(store->blocks, tail);
+ ibex_block_dirty((struct _block *)tailnew);
+ *tailptr = new;
+ }
+ }
+ } else {
+ if (tail) {
+ /* read/merge the tail with the new data, rewrite out.
+ suboptimal, but it should be 'ok' ? */
+ tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail));
+ len = tail_info(tailblock, tail, &start);
+ tmpdata = g_array_new(0, 0, sizeof(blockid_t));
+ g_array_append_vals(tmpdata, start, len);
+ g_array_append_vals(tmpdata, data->data, data->len);
+ *tailptr = 0;
+ tail_free(store->blocks, tail);
+ head = disk_add_blocks_internal(store, headptr, tailptr, tmpdata);
+ } else {
+ head = disk_add_blocks_internal(store, headptr, tailptr, data);
+ }
+ }
+ if (tmpdata)
+ g_array_free(tmpdata, TRUE);
+
+ return head;
+}
+
+static blockid_t
+disk_remove(struct _IBEXStore *store, blockid_t *headptr, blockid_t *tailptr, nameid_t data)
+{
+ blockid_t head = *headptr, node = *headptr, tail = *tailptr;
+ int i;
+
+ /* special case for 1-item nodes */
+ if (head == BLOCK_ONE) {
+ if (tail == data) {
+ *tailptr = 0;
+ *headptr = 0;
+ head = 0;
+ }
+ return head;
+ }
+
+ d(printf("removing %d from %d\n", data, head));
+ while (node) {
+ struct _block *block = ibex_block_read(store->blocks, node);
+
+ for (i=0;i<block->used;i++) {
+ if (block->bl_data[i] == data) {
+ struct _block *start = ibex_block_read(store->blocks, head);
+
+ d(printf("removing data from block %d\n ", head));
+
+ start->used--;
+ block->bl_data[i] = start->bl_data[start->used];
+ if (start->used == 0) {
+ struct _root *root = (struct _root *)ibex_block_read(store->blocks, 0);
+ blockid_t new;
+
+ d(printf("dropping block %d, new head = %d\n", head, start->next));
+ new = block_location(start->next);
+ start->next = block_number(root->free);
+ root->free = head;
+ head = new;
+ ibex_block_dirty((struct _block *)root);
+ }
+ ibex_block_dirty(block);
+ ibex_block_dirty(start);
+ *headptr = head;
+ return head;
+ }
+ }
+ node = block_location(block->next);
+ }
+
+ if (tail) {
+ struct _tailblock *tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail));
+ int len;
+ blockid_t *start;
+
+ len = tail_info(tailblock, tail, &start);
+ for (i=0;i<len;i++) {
+ if (start[i] == data) {
+ for (;i<len-1;i++)
+ start[i] = start[i+1];
+ if (len == 1)
+ *tailptr = 0;
+ if (len == 1 && (tailblock->used-1) == TAIL_INDEX(tail)) {
+ d(printf("dropping/unlinking tailblock %d (%d) used = %d\n",
+ TAIL_BLOCK(tail), tail, tailblock->used));
+ tailblock->used--;
+ /* drop/unlink block? */
+ ibex_block_dirty((struct _block *)tailblock);
+ *tailptr = 0;
+ } else {
+ tail_compress(tailblock, TAIL_INDEX(tail), len-1);
+ ibex_block_dirty((struct _block *)tailblock);
+ }
+ }
+ }
+
+ }
+
+ return head;
+}
+
+static void disk_free(struct _IBEXStore *store, blockid_t head, blockid_t tail)
+{
+ blockid_t next;
+ struct _block *block;
+
+ if (head == BLOCK_ONE)
+ return;
+
+ while (head) {
+ d(printf("freeing block %d\n", head));
+ block = ibex_block_read(store->blocks, head);
+ next = block_location(block->next);
+ ibex_block_free(store->blocks, head);
+ head = next;
+ }
+ if (tail) {
+ struct _tailblock *tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail));
+ d(printf("freeing tail block %d (%d)\n", TAIL_BLOCK(tail), tail));
+ tail_compress(tailblock, TAIL_INDEX(tail), 0);
+ ibex_block_dirty((struct _block *)tailblock);
+ }
+}
+
+static gboolean
+disk_find(struct _IBEXStore *store, blockid_t head, blockid_t tail, nameid_t data)
+{
+ blockid_t node = head;
+ int i;
+
+ d(printf("finding %d from %d\n", data, head));
+
+ if (head == BLOCK_ONE)
+ return data == tail;
+
+ /* first check full blocks */
+ while (node) {
+ struct _block *block = ibex_block_read(store->blocks, node);
+
+ for (i=0;i<block->used;i++) {
+ if (block->bl_data[i] == data) {
+ return TRUE;
+ }
+ }
+ node = block_location(block->next);
+ }
+
+ /* then check tail */
+ if (tail) {
+ struct _tailblock *tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail));
+ int len;
+ blockid_t *start;
+
+ len = tail_info(tailblock, tail, &start);
+ for (i=0;i<len;i++) {
+ if (start[i] == data)
+ return TRUE;
+ }
+ }
+
+ return FALSE;
+}
+
+static GArray *
+disk_get(struct _IBEXStore *store, blockid_t head, blockid_t tail)
+{
+ GArray *result = g_array_new(0, 0, sizeof(nameid_t));
+
+ if (head == BLOCK_ONE) {
+ g_array_append_val(result, tail);
+ return result;
+ }
+
+ while (head) {
+ struct _block *block = ibex_block_read(store->blocks, head);
+
+ d(printf("getting data from block %d\n", head));
+
+ g_array_append_vals(result, block->bl_data, block->used);
+ head = block_location(block->next);
+ d(printf("next = %d\n", head));
+ }
+
+ /* chuck on any tail data as well */
+ if (tail) {
+ struct _tailblock *tailblock;
+ int len;
+ blockid_t *start;
+
+ tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail));
+ len = tail_info(tailblock, tail, &start);
+ g_array_append_vals(result, start, len);
+ }
+ return result;
+}
+
+void
+ibex_diskarray_dump(struct _memcache *blocks, blockid_t head, blockid_t tail)
+{
+ blockid_t node = head;
+
+ printf("dumping list %d tail %d\n", node, tail);
+ if (head == BLOCK_ONE) {
+ printf(" 1 length index: %d\n", tail);
+ return;
+ }
+
+ while (node) {
+ struct _block *block = ibex_block_read(blocks, node);
+ int i;
+
+ printf(" block %d used %d\n ", node, block->used);
+ for (i=0;i<block->used;i++) {
+ printf(" %d", block->bl_data[i]);
+ }
+ printf("\n");
+ node = block_location(block->next);
+ }
+
+ printf("tail: ");
+ if (tail) {
+ struct _tailblock *tailblock;
+ int len;
+ blockid_t *start;
+ int i;
+
+ tailblock = (struct _tailblock *)ibex_block_read(blocks, TAIL_BLOCK(tail));
+ len = tail_info(tailblock, tail, &start);
+ for (i=0;i<len;i++)
+ printf(" %d", start[i]);
+ }
+ printf("\n");
+}
+
+#if 0
+int main(int argc, char **argv)
+{
+ struct _memcache *bc;
+ struct _IBEXStore *disk;
+ int i;
+ blockid_t data[100];
+ GArray adata;
+ blockid_t head=0, tail=0;
+
+ for (i=0;i<100;i++) {
+ data[i] = i;
+ }
+
+ bc = ibex_block_cache_open("index.db", O_CREAT|O_RDWR, 0600);
+ disk = ibex_diskarray_class.create(bc);
+
+ head = 0;
+ tail = 0;
+ adata.data = data;
+ adata.len = 70;
+ for (i=0;i<100;i++) {
+ printf("Loop %d\n", i);
+ ibex_diskarray_class.add_list(disk, &head, &tail, &adata);
+ ibex_diskarray_dump(bc, head, tail);
+ }
+
+#if 0
+ for (i=1;i<100;i++) {
+ printf("inserting %d items\n", i);
+ adata.data = data;
+ adata.len = i;
+ head=0;
+ tail=0;
+ ibex_diskarray_class.add_list(disk, &head, &tail, &adata);
+ ibex_diskarray_dump(bc, head, tail);
+ }
+#endif
+ ibex_diskarray_class.close(disk);
+ ibex_block_cache_close(bc);
+ return 0;
+}
+
+#endif
diff --git a/libibex/file.c b/libibex/file.c
deleted file mode 100644
index 70b7ed96cb..0000000000
--- a/libibex/file.c
+++ /dev/null
@@ -1,481 +0,0 @@
-/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-/*
- * Copyright (C) 2000 Helix Code, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Library General Public License
- * as published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Library General Public License for more details.
- *
- * You should have received a copy of the GNU Library General Public
- * License along with the Gnome Library; see the file COPYING.LIB. If not,
- * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 02111-1307, USA.
- */
-
-/* file.c: index file read/write ops */
-
-#include <ctype.h>
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-
-#include "ibex_internal.h"
-
-static unsigned long read_number (FILE *f);
-static void write_number (FILE *f, unsigned long n);
-static char *get_compressed_word (FILE *f, char **lastword);
-
-static gint free_file (gpointer key, gpointer value, gpointer data);
-static void free_word (gpointer key, gpointer value, gpointer data);
-
-/* The file format is:
- *
- * version string (currently "ibex1")
- * file count
- * list of compressed filenames, separated by \0
- * word count
- * list of compressed words, each followed by \0, a count, and that
- * many references.
- *
- * All numbers are stored 7-bit big-endian, with the high bit telling
- * whether or not the number continues to the next byte.
- *
- * compressed text consists of a byte telling how many characters the
- * line has in common with the line before it, followed by the rest of
- * the string. Obviously this only really works if the lists are sorted.
- */
-
-/**
- * ibex_open: open (or possibly create) an ibex index
- * @file: the name of the file
- * @flags: open flags, see open(2).
- * @mode: If O_CREAT is passed in flags, then the file mode
- * to create the new file with. It will be anded with the current
- * umask.
- *
- * Open and/or create the named ibex file and return a handle to it.
- *
- * Return value: an ibex handle, or NULL if an error occurred.
- **/
-ibex *
-ibex_open (char *file, int flags, int mode)
-{
- ibex *ib;
- FILE *f;
- char vbuf[sizeof (IBEX_VERSION) - 1];
- char *word, *lastword;
- unsigned long nfiles, nwords, nrefs, ref;
- ibex_file **ibfs = NULL;
- int i;
- GPtrArray *refs;
- int fd;
- char *modestr;
-
- fd = open(file, flags, mode);
- if (fd == -1) {
- return NULL;
- }
-
- /* yuck, this is because we use FILE * interface
- internally */
- switch (flags & O_ACCMODE) {
- case O_RDONLY:
- modestr = "r";
- break;
- case O_RDWR:
- if (flags & O_APPEND)
- modestr = "a+";
- else
- modestr = "w+";
- break;
- case O_WRONLY:
- if (flags & O_APPEND)
- modestr = "a";
- else
- modestr = "w";
- break;
- default:
- if (flags & O_APPEND)
- modestr = "a+";
- else
- modestr = "r+";
- break;
- }
-
- f = fdopen(fd, modestr);
- if (f == NULL) {
- if (errno == 0)
- errno = ENOMEM;
- close(fd);
- return NULL;
- }
-
- ib = g_malloc (sizeof (ibex));
- ib->dirty = FALSE;
- ib->path = g_strdup (file);
- ib->files = g_tree_new ((GCompareFunc) strcmp);
- ib->words = g_hash_table_new (g_str_hash, g_str_equal);
- ib->oldfiles = g_ptr_array_new ();
-
- if (!f) {
- close(fd);
- return ib;
- }
-
- /* Check version. If its empty, then we have just created it */
- if (fread (vbuf, 1, sizeof (vbuf), f) != sizeof (vbuf)) {
- if (feof (f)) {
- fclose(f);
- close(fd);
- return ib;
- }
- }
- if (strncmp (vbuf, IBEX_VERSION, sizeof (vbuf) != 0)) {
- errno = EINVAL;
- goto errout;
- }
-
- /* Read list of files. */
- nfiles = read_number (f);
- ibfs = g_malloc (nfiles * sizeof (ibex_file *));
- lastword = NULL;
- for (i = 0; i < nfiles; i++) {
- ibfs[i] = g_malloc (sizeof (ibex_file));
- ibfs[i]->name = get_compressed_word (f, &lastword);
- if (!ibfs[i]->name) {
- goto errout;
- }
- ibfs[i]->index = 0;
- g_tree_insert (ib->files, ibfs[i]->name, ibfs[i]);
- }
-
- /* Read list of words. */
- nwords = read_number (f);
- lastword = NULL;
- for (i = 0; i < nwords; i++) {
- word = get_compressed_word (f, &lastword);
- if (!word) {
- goto errout;
- }
-
- nrefs = read_number (f);
- refs = g_ptr_array_new ();
- g_ptr_array_set_size (refs, nrefs);
- while (nrefs--) {
- ref = read_number (f);
- if (ref >= nfiles) {
- goto errout;
- }
- refs->pdata[nrefs] = ibfs[ref];
- }
-
- g_hash_table_insert (ib->words, word, refs);
- }
-
- g_free (ibfs);
- fclose (f);
- close(fd);
- return ib;
-
-errout:
-
- fclose (f);
- close(fd);
- g_tree_traverse (ib->files, free_file, G_IN_ORDER, NULL);
- g_tree_destroy (ib->files);
- g_hash_table_foreach (ib->words, free_word, NULL);
- g_hash_table_destroy (ib->words);
- g_ptr_array_free (ib->oldfiles, TRUE);
- if (ibfs)
- g_free (ibfs);
- g_free (ib->path);
- g_free (ib);
-
- return NULL;
-}
-
-struct ibex_write_data {
- unsigned long index;
- FILE *f;
- char *lastname;
-};
-
-/* This is an internal function to find the longest common initial
- * prefix between the last-written word and the current word.
- */
-static int
-get_prefix (struct ibex_write_data *iwd, char *name)
-{
- int i = 0;
- if (iwd->lastname) {
- while (!strncmp (iwd->lastname, name, i + 1))
- i++;
- }
- iwd->lastname = name;
- return i;
-}
-
-static gint
-write_file (gpointer key, gpointer value, gpointer data)
-{
- char *file = key;
- ibex_file *ibf = value;
- struct ibex_write_data *iwd = data;
- int prefix;
-
- ibf->index = iwd->index++;
- prefix = get_prefix (iwd, file);
- fprintf (iwd->f, "%c%s", prefix, file + prefix);
- fputc (0, iwd->f);
- return FALSE;
-}
-
-/* scans for words which still exist in the index (after
- index removals), and adds them to the ordered tree for
- writing out in order */
-static void
-store_word (gpointer key, gpointer value, gpointer data)
-{
- GTree *wtree = data;
- GPtrArray *refs = value;
- int i;
- ibex_file *ibf;
-
- for (i = 0; i < refs->len; i++) {
- ibf = g_ptr_array_index (refs, i);
- if (ibf->index == -1) {
- g_ptr_array_remove_index_fast (refs, i);
- i--;
- }
- }
-
- if (refs->len > 0) {
- g_tree_insert (wtree, key, value);
- }
-}
-
-/* writes a word out, in order */
-static gint
-write_word (gpointer key, gpointer value, gpointer data)
-{
- char *word = key;
- GPtrArray *refs = value;
- struct ibex_write_data *iwd = data;
- int i, prefix;
- ibex_file *ibf;
-
- prefix = get_prefix (iwd, word);
- fprintf (iwd->f, "%c%s", prefix, word + prefix);
- fputc (0, iwd->f);
-
- write_number (iwd->f, refs->len);
-
- for (i = 0; i < refs->len; i++) {
- ibf = g_ptr_array_index (refs, i);
- write_number (iwd->f, ibf->index);
- }
- return FALSE;
-}
-
-/**
- * ibex_write: Write an ibex out to disk.
- * @ib: the ibex
- *
- * This writes an ibex to disk.
- *
- * Return value: 0 for success, -1 for failure (in which case errno
- * is set).
- **/
-int
-ibex_write (ibex *ib)
-{
- struct ibex_write_data iwd;
- GTree *wtree;
- char *tmpfile;
-
- tmpfile = g_strdup_printf ("%s~", ib->path);
- iwd.f = fopen (tmpfile, "w");
- if (!iwd.f) {
- if (errno == 0)
- errno = ENOMEM;
- g_free (tmpfile);
- return -1;
- }
-
- fputs (IBEX_VERSION, iwd.f);
- if (ferror (iwd.f))
- goto lose;
-
- iwd.index = 0;
- iwd.lastname = NULL;
- write_number (iwd.f, g_tree_nnodes (ib->files));
- if (ferror (iwd.f))
- goto lose;
- g_tree_traverse (ib->files, write_file, G_IN_ORDER, &iwd);
- if (ferror (iwd.f))
- goto lose;
-
- iwd.lastname = NULL;
- wtree = g_tree_new ((GCompareFunc) strcmp);
- g_hash_table_foreach (ib->words, store_word, wtree);
- write_number (iwd.f, g_tree_nnodes(wtree));
- if (ferror (iwd.f))
- goto lose;
- g_tree_traverse (wtree, write_word, G_IN_ORDER, &iwd);
- g_tree_destroy (wtree);
- if (ferror (iwd.f))
- goto lose;
-
- if (fclose (iwd.f) == 0 && rename (tmpfile, ib->path) == 0) {
- g_free (tmpfile);
- ib->dirty = FALSE;
- return 0;
- }
-
-lose:
- unlink (tmpfile);
- g_free (tmpfile);
- return -1;
-}
-
-/**
- * ibex_save:
- * @ib:
- *
- * Only write out an ibex if it is dirty.
- *
- * Return value: Same as ibex_write.
- **/
-int
-ibex_save (ibex *ib)
-{
- if (ib->dirty)
- return ibex_write(ib);
- return 0;
-}
-
-/**
- * ibex_close: Write out the ibex file (if it has changed) and free
- * the data associated with it.
- * @ib: the ibex
- *
- * If this ibex file has been modified since it was opened, this will
- * call ibex_write() to write it out to disk. It will then free all data
- * associated with the ibex. After calling ibex_close(), @ib will no
- * longer be a valid ibex.
- *
- * Return value: 0 on success, -1 on an ibex_write() failure (in which
- * case @ib will not be destroyed).
- **/
-int
-ibex_close (ibex *ib)
-{
- ibex_file *ibf;
-
- if (ib->dirty && ibex_write (ib) == -1)
- return -1;
-
- g_tree_traverse (ib->files, free_file, G_IN_ORDER, NULL);
- g_tree_destroy (ib->files);
- g_hash_table_foreach (ib->words, free_word, NULL);
- g_hash_table_destroy (ib->words);
-
- while (ib->oldfiles->len) {
- ibf = g_ptr_array_remove_index (ib->oldfiles, 0);
- g_free (ibf->name);
- g_free (ibf);
- }
- g_ptr_array_free (ib->oldfiles, TRUE);
- g_free (ib->path);
- g_free (ib);
-
- return 0;
-}
-
-static gint
-free_file (gpointer key, gpointer value, gpointer data)
-{
- ibex_file *ibf = value;
-
- g_free (ibf->name);
- g_free (ibf);
- return FALSE;
-}
-
-static void
-free_word (gpointer key, gpointer value, gpointer data)
-{
- g_free (key);
- g_ptr_array_free (value, TRUE);
-}
-
-static char *
-get_compressed_word (FILE *f, char **lastword)
-{
- char *buf, *p;
- int c, size;
-
- c = getc (f);
- if (c == EOF)
- return NULL;
-
- size = c + 10;
- buf = g_malloc (size);
- if (*lastword)
- strncpy (buf, *lastword, c);
- p = buf + c;
- do {
- c = getc (f);
- if (c == EOF)
- return NULL;
- if (p == buf + size) {
- buf = g_realloc (buf, size + 10);
- p = buf + size;
- size += 10;
- }
- *p++ = c;
- } while (c != 0);
-
- *lastword = buf;
- return buf;
-}
-
-static void
-write_number (FILE *f, unsigned long number)
-{
- int i, flag = 0;
- char buf[4];
-
- i = 4;
- do {
- buf[--i] = (number & 0x7F) | flag;
- number = number >> 7;
- flag = 0x80;
- } while (number != 0);
-
- fwrite (buf + i, 1, 4 - i, f);
-}
-
-static unsigned long
-read_number (FILE *f)
-{
- int byte;
- unsigned long num;
-
- num = 0;
- do {
- byte = getc (f);
- num = num << 7 | (byte & 0x7F);
- } while (byte & 0x80);
-
- return num;
-}
-
diff --git a/libibex/find.c b/libibex/find.c
deleted file mode 100644
index 6b9815f0c0..0000000000
--- a/libibex/find.c
+++ /dev/null
@@ -1,198 +0,0 @@
-/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-/*
- * Copyright (C) 2000 Helix Code, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Library General Public License
- * as published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Library General Public License for more details.
- *
- * You should have received a copy of the GNU Library General Public
- * License along with the Gnome Library; see the file COPYING.LIB. If not,
- * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 02111-1307, USA.
- */
-
-/* find.c: index file searching ops */
-
-#include <string.h>
-
-#include "ibex_internal.h"
-
-/**
- * ibex_find: search an ibex for a word
- * @ib: an ibex
- * @word: the word
- *
- * This routine searches an ibex for a word and returns a GPtrArray
- * containing the names of the files in the ibex that contain the word.
- * If no matches are found, it will return an empty array (not NULL).
- * The caller must free the array, but MUST NOT free or alter its
- * elements.
- *
- * Return value: the array of filenames containing @word
- **/
-GPtrArray *
-ibex_find (ibex *ib, char *word)
-{
- GPtrArray *refs, *ret;
- ibex_file *ibf;
- int i;
-
- ret = g_ptr_array_new ();
- refs = g_hash_table_lookup (ib->words, word);
- if (refs) {
- for (i = 0; i < refs->len; i++) {
- ibf = g_ptr_array_index (refs, i);
- g_ptr_array_add (ret, ibf->name);
- }
- }
- return ret;
-}
-
-/**
- * ibex_contains_name:
- * @ib:
- * @name:
- *
- * Returns #TRUE if the ibex @ib has any index entry for
- * the key @name.
- *
- * Return value:
- **/
-gboolean
-ibex_contains_name(ibex *ib, char *name)
-{
- return g_tree_lookup(ib->files, name) != NULL;
-}
-
-/**
- * ibex_find_name: Check if a word occurs in a given file
- * @ib: an ibex
- * @name: a filename
- * @word: a word
- *
- * This checks if the given word occurs in the given file.
- *
- * Return value: TRUE or FALSE
- **/
-gboolean
-ibex_find_name (ibex *ib, char *name, char *word)
-{
- GPtrArray *refs;
- ibex_file *ibf;
- int i;
-
- refs = g_hash_table_lookup (ib->words, word);
- if (refs) {
- for (i = 0; i < refs->len; i++) {
- ibf = g_ptr_array_index (refs, i);
- if (!strcmp (ibf->name, name))
- return TRUE;
- }
- }
- return FALSE;
-}
-
-static gint
-build_array (gpointer key, gpointer value, gpointer data)
-{
- char *name = key;
- unsigned int count = GPOINTER_TO_UINT (value);
- GPtrArray *ret = data;
-
- if (count == 1)
- g_ptr_array_add (ret, name);
- return FALSE;
-}
-
-/**
- * ibex_find_all: Find files containing multiple words
- * @ib: an ibex
- * @words: a GPtrArray of words
- *
- * This works like ibex_find(), but returns an array of filenames
- * which contain all of the words in @words.
- *
- * Return value: an array of matches
- **/
-GPtrArray *
-ibex_find_all (ibex *ib, GPtrArray *words)
-{
- GTree *work;
- GPtrArray *wrefs, *ret;
- int i, j, count;
- char *word;
- ibex_file *ibf;
-
- if (words->len == 0)
- return g_ptr_array_new ();
- else if (words->len == 1)
- return ibex_find (ib, g_ptr_array_index (words, 0));
-
- work = g_tree_new ((GCompareFunc) strcmp);
- for (i = 0; i < words->len; i++) {
- word = g_ptr_array_index (words, i);
- wrefs = g_hash_table_lookup (ib->words, word);
- if (!wrefs) {
- /* One of the words isn't even in the index. */
- g_tree_destroy (work);
- return g_ptr_array_new ();
- }
-
- if (i == 0) {
- /* Copy the references into a tree, using the
- * filenames as keys and the size of words as
- * the value.
- */
- for (j = 0; j < wrefs->len; j++) {
- ibf = g_ptr_array_index (wrefs, j);
- g_tree_insert (work, ibf->name,
- GUINT_TO_POINTER (words->len));
- }
- } else {
- /* Increment the counts in the working tree
- * for the references for this word.
- */
- for (j = 0; j < wrefs->len; j++) {
- ibf = g_ptr_array_index (wrefs, j);
- count = GPOINTER_TO_UINT (g_tree_lookup (work, ibf->name));
- if (count) {
- g_tree_insert (work, ibf->name,
- GUINT_TO_POINTER (count - 1));
- }
- }
- }
- }
-
- /* Build an array with the refs that contain all the words. */
- ret = g_ptr_array_new ();
- g_tree_traverse (work, build_array, G_IN_ORDER, ret);
- g_tree_destroy (work);
- return ret;
-}
-
-static void
-ibex_dump_foo(char *key, GPtrArray *refs, void *data)
-{
- int i;
-
- g_print("%s: ", key);
- for (i=0;i<refs->len;i++) {
- ibex_file *ibf = g_ptr_array_index (refs, i);
- g_print("%c%s", ibf->index==-1?'-':' ', ibf->name);
- }
- g_print("\n");
-}
-
-/* debug function to dump the tree, in key order */
-void
-ibex_dump_all (ibex *ib)
-{
- g_hash_table_foreach(ib->words, (GHFunc) ibex_dump_foo, 0);
-}
diff --git a/libibex/hash.c b/libibex/hash.c
new file mode 100644
index 0000000000..02e01a1ae0
--- /dev/null
+++ b/libibex/hash.c
@@ -0,0 +1,701 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
+ *
+ * Copyright (C) 2000 Helix Code, Inc.
+ *
+ * Authors: Michael Zucchi <notzed@helixcode.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public License
+ * as published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with the Gnome Library; see the file COPYING.LIB. If not,
+ * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/* hash based index mechanism */
+
+#include <stdio.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "block.h"
+#include "index.h"
+
+#define d(x)
+
+#define HASH_SIZE (1024)
+#define KEY_THRESHOLD (sizeof(struct _hashkey) + 4) /* minimum number of free bytes we worry about
+ maintaining free blocks for */
+#define ARRAY_LEN(a) (sizeof(a)/sizeof(a[0]))
+
+typedef guint32 hashid_t;
+
+static struct _IBEXIndex *hash_create(struct _memcache *bc, int size);
+static struct _IBEXIndex *hash_open(struct _memcache *bc, blockid_t root);
+static int hash_sync(struct _IBEXIndex *index);
+static int hash_close(struct _IBEXIndex *index);
+
+static hashid_t hash_find(struct _IBEXIndex *index, const char *key, int keylen);
+static void hash_remove(struct _IBEXIndex *index, const char *key, int keylen);
+static hashid_t hash_insert(struct _IBEXIndex *index, const char *key, int keylen);
+static char *hash_get_key(struct _IBEXIndex *index, hashid_t hashbucket, int *len);
+static void hash_set_data_block(struct _IBEXIndex *index, hashid_t keyid, blockid_t blockid, blockid_t tail);
+static blockid_t hash_get_data_block(struct _IBEXIndex *index, hashid_t keyid, blockid_t *tail);
+
+struct _IBEXIndexClass ibex_hash_class = {
+ hash_create, hash_open,
+ hash_sync, hash_close,
+ hash_find,
+ hash_remove,
+ hash_insert,
+ hash_get_key,
+ hash_set_data_block,
+ hash_get_data_block,
+};
+
+/* the reason we have the tail here is that otherwise we need to
+ have a 32 bit blockid for the root node; which would make this
+ structure the same size anyway, with about 24 wasted bits */
+struct _hashkey {
+ blockid_t next; /* next in hash chain */
+ blockid_t tail;
+ unsigned int root:32-BLOCK_BITS;
+ unsigned int keyoffset:BLOCK_BITS;
+};
+
+struct _hashblock {
+ /*blockid_t next;*/ /* all key blocks linked together? */
+ guint32 used; /* elements used */
+ union {
+ struct _hashkey keys[(BLOCK_SIZE-4)/sizeof(struct _hashkey)];
+ char keydata[BLOCK_SIZE-4];
+ } hashblock_u;
+};
+#define hb_keys hashblock_u.keys
+#define hb_keydata hashblock_u.keydata
+
+/* root block for a hash index */
+struct _hashroot {
+ hashid_t free; /* free list */
+ guint32 size; /* how big the hash table is */
+ hashid_t table[(BLOCK_SIZE-8)/sizeof(hashid_t)]; /* pointers to blocks of pointers */
+};
+
+struct _hashtableblock {
+ hashid_t buckets[BLOCK_SIZE/sizeof(hashid_t)];
+};
+
+/* map a hash index to a block index */
+#define HASH_INDEX(b) ((b) & (BLOCK_SIZE-1))
+/* map a hash index to a block number */
+#define HASH_BLOCK(b) ((b) & ~(BLOCK_SIZE-1))
+/* map a block + index to a hash key */
+#define HASH_KEY(b, i) (((b) & ~(BLOCK_SIZE-1)) | ((i) & (BLOCK_SIZE-1)))
+
+/* taken from tdb/gdbm */
+static unsigned int hash_key(const unsigned char *key, int keylen)
+{
+ char *newkey;
+ newkey = alloca(keylen+1);
+ memcpy(newkey, key, keylen);
+ newkey[keylen]=0;
+ return g_str_hash(newkey);
+#if 0
+ unsigned int value; /* Used to compute the hash value. */
+ unsigned int i; /* Used to cycle through random values. */
+
+ /* Set the initial value from the key size. */
+ value = 0x238F13AF * keylen;
+ for (i=0; i < keylen; i++) {
+ value = (value + (key[i] << (i*5 % 24)));
+ }
+
+ value = (1103515243 * value + 12345);
+
+ return value;
+#endif
+}
+
+/* create a new hash table, return a pointer to its root block */
+static struct _IBEXIndex *
+hash_create(struct _memcache *bc, int size)
+{
+ blockid_t root, block;
+ struct _hashroot *hashroot;
+ int i;
+ struct _hashtableblock *table;
+ struct _IBEXIndex *index;
+
+ g_assert(size<=10240);
+
+ d(printf("initialising hash table, size = %d\n", size));
+
+ index = g_malloc0(sizeof(*index));
+ index->blocks = bc;
+ index->klass = &ibex_hash_class;
+ root = ibex_block_get(bc);
+ index->root = root;
+ d(printf(" root = %d\n", root));
+ hashroot = (struct _hashroot *)ibex_block_read(bc, root);
+ hashroot->free = 0;
+ hashroot->size = size;
+ ibex_block_dirty((struct _block *)hashroot);
+ for (i=0;i<size/(BLOCK_SIZE/sizeof(blockid_t));i++) {
+ d(printf("initialising hash table index block %d\n", i));
+ block = hashroot->table[i] = ibex_block_get(bc);
+ table = (struct _hashtableblock *)ibex_block_read(bc, block);
+ memset(table, 0, sizeof(table));
+ ibex_block_dirty((struct _block *)table);
+ }
+
+ return index;
+}
+
+static struct _IBEXIndex *
+hash_open(struct _memcache *bc, blockid_t root)
+{
+ struct _IBEXIndex *index;
+
+ /* FIXME: check a 'magic', and the root for validity */
+
+ index = g_malloc0(sizeof(*index));
+ index->blocks = bc;
+ index->root = root;
+ index->klass = &ibex_hash_class;
+
+ return index;
+}
+
+
+static int hash_sync(struct _IBEXIndex *index)
+{
+ /* nop, index always synced on disk (at least, to blocks) */
+ return 0;
+}
+
+static int hash_close(struct _IBEXIndex *index)
+{
+ g_free(index);
+ return 0;
+}
+
+/* convert a hashbucket id into a name */
+static char *
+hash_get_key(struct _IBEXIndex *index, hashid_t hashbucket, int *len)
+{
+ struct _hashblock *bucket;
+ int ind;
+ char *ret, *start, *end;
+
+ if (hashbucket == 0) {
+ if (len)
+ *len = 0;
+ return g_strdup("");
+ }
+
+ bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashbucket));
+ ind = HASH_INDEX(hashbucket);
+
+ g_assert(ind < bucket->used);
+
+ start = &bucket->hb_keydata[bucket->hb_keys[ind].keyoffset];
+ if (ind == 0) {
+ end = &bucket->hb_keydata[sizeof(bucket->hb_keydata)/sizeof(bucket->hb_keydata[0])];
+ } else {
+ end = &bucket->hb_keydata[bucket->hb_keys[ind-1].keyoffset];
+ }
+
+ ret = g_malloc(end-start+1);
+ memcpy(ret, start, end-start);
+ ret[end-start]=0;
+ if (len)
+ *len = end-start;
+ return ret;
+}
+
+/* sigh, this is fnugly code ... */
+static hashid_t
+hash_find(struct _IBEXIndex *index, const char *key, int keylen)
+{
+ struct _hashroot *hashroot;
+ guint32 hash;
+ int hashentry;
+ blockid_t hashtable;
+ hashid_t hashbucket;
+ struct _hashtableblock *table;
+
+ g_assert(index != 0);
+ g_assert(index->root != 0);
+
+ d(printf("finding hash %.*s\n", keylen, key));
+
+ hashroot = (struct _hashroot *)ibex_block_read(index->blocks, index->root);
+
+ /* find the table containing this entry */
+ hash = hash_key(key, keylen) % hashroot->size;
+ hashtable = hashroot->table[hash / (BLOCK_SIZE/sizeof(blockid_t))];
+ g_assert(hashtable != 0);
+ table = (struct _hashtableblock *)ibex_block_read(index->blocks, hashtable);
+ hashentry = hash % (BLOCK_SIZE/sizeof(blockid_t));
+ /* and its bucket */
+ hashbucket = table->buckets[hashentry];
+
+ /* go down the bucket chain, reading each entry till we are done ... */
+ while (hashbucket != 0) {
+ struct _hashblock *bucket;
+ char *start, *end;
+ int ind;
+
+ d(printf(" checking bucket %d\n", hashbucket));
+
+ /* get the real bucket id from the hashbucket id */
+ bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashbucket));
+ /* and get the key number within the block */
+ ind = HASH_INDEX(hashbucket);
+
+ g_assert(ind < bucket->used);
+
+ start = &bucket->hb_keydata[bucket->hb_keys[ind].keyoffset];
+ if (ind == 0) {
+ end = &bucket->hb_keydata[sizeof(bucket->hb_keydata)/sizeof(bucket->hb_keydata[0])];
+ } else {
+ end = &bucket->hb_keydata[bucket->hb_keys[ind-1].keyoffset];
+ }
+ if ( (end-start) == keylen
+ && memcmp(start, key, keylen) == 0) {
+ return hashbucket;
+ }
+ hashbucket = bucket->hb_keys[ind].next;
+ }
+ return 0;
+}
+
+/* compresses the bucket 'bucket', removing data
+ at index 'index' */
+static void
+hash_compress(struct _hashblock *bucket, int index)
+{
+ int i;
+ char *start, *end, *newstart;
+
+ /* get start/end of area to zap */
+ start = &bucket->hb_keydata[bucket->hb_keys[index].keyoffset];
+ if (index == 0) {
+ end = &bucket->hb_keydata[sizeof(bucket->hb_keydata)/sizeof(bucket->hb_keydata[0])];
+ } else {
+ end = &bucket->hb_keydata[bucket->hb_keys[index-1].keyoffset];
+ }
+
+ if (start == end)
+ return;
+
+ /* fixup data */
+ newstart = &bucket->hb_keydata[bucket->hb_keys[bucket->used-1].keyoffset];
+ memmove(newstart+(end-start), newstart, start-newstart);
+
+ /* fixup key pointers */
+ for (i=index;i<bucket->used;i++) {
+ bucket->hb_keys[i].keyoffset += (end-start);
+ }
+ ibex_block_dirty((struct _block *)bucket);
+}
+
+/* make room 'len' for the key 'index' */
+/* assumes key 'index' is already empty (0 length) */
+static void
+hash_expand(struct _hashblock *bucket, int index, int len)
+{
+ int i;
+ char *end, *newstart;
+
+ /* get start/end of area to zap */
+ if (index == 0) {
+ end = &bucket->hb_keydata[sizeof(bucket->hb_keydata)/sizeof(bucket->hb_keydata[0])];
+ } else {
+ end = &bucket->hb_keydata[bucket->hb_keys[index-1].keyoffset];
+ }
+
+ /* fixup data */
+ newstart = &bucket->hb_keydata[bucket->hb_keys[bucket->used-1].keyoffset];
+ memmove(newstart-len, newstart, end-newstart);
+
+ /* fixup key pointers */
+ for (i=index;i<bucket->used;i++) {
+ bucket->hb_keys[i].keyoffset -= len;
+ }
+ ibex_block_dirty((struct _block *)bucket);
+}
+
+static void
+hash_remove(struct _IBEXIndex *index, const char *key, int keylen)
+{
+ struct _hashroot *hashroot;
+ guint32 hash;
+ int hashentry;
+ blockid_t hashtable;
+ hashid_t hashbucket, hashprev;
+ struct _hashtableblock *table;
+
+ g_assert(index != 0);
+ g_assert(index->root != 0);
+
+ d(printf("removing hash %.*s\n", keylen, key));
+
+ hashroot = (struct _hashroot *)ibex_block_read(index->blocks, index->root);
+
+ /* find the table containing this entry */
+ hash = hash_key(key, keylen) % hashroot->size;
+ hashtable = hashroot->table[hash / (BLOCK_SIZE/sizeof(blockid_t))];
+ table = (struct _hashtableblock *)ibex_block_read(index->blocks, hashtable);
+ hashentry = hash % (BLOCK_SIZE/sizeof(blockid_t));
+ /* and its bucket */
+ hashbucket = table->buckets[hashentry];
+
+ /* go down the bucket chain, reading each entry till we are done ... */
+ hashprev = 0;
+ while (hashbucket != 0) {
+ struct _hashblock *bucket;
+ char *start, *end;
+ int ind;
+
+ d(printf(" checking bucket %d\n", hashbucket));
+
+ /* get the real bucket id from the hashbucket id */
+ bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashbucket));
+ /* and get the key number within the block */
+ ind = HASH_INDEX(hashbucket);
+
+ g_assert(ind < bucket->used);
+
+ start = &bucket->hb_keydata[bucket->hb_keys[ind].keyoffset];
+ if (ind == 0) {
+ end = &bucket->hb_keydata[sizeof(bucket->hb_keydata)/sizeof(bucket->hb_keydata[0])];
+ } else {
+ end = &bucket->hb_keydata[bucket->hb_keys[ind-1].keyoffset];
+ }
+ if ( (end-start) == keylen
+ && memcmp(start, key, keylen) == 0) {
+ struct _hashblock *prevbucket;
+
+ if (hashprev == 0) {
+ /* unlink from hash chain */
+ table->buckets[hashentry] = bucket->hb_keys[HASH_INDEX(hashbucket)].next;
+ /* link into free list */
+ bucket->hb_keys[HASH_INDEX(hashbucket)].next = hashroot->free;
+ hashroot->free = hashbucket;
+ /* compress away data */
+ hash_compress(bucket, HASH_INDEX(hashbucket));
+ ibex_block_dirty((struct _block *)bucket);
+ ibex_block_dirty((struct _block *)table);
+ ibex_block_dirty((struct _block *)hashroot);
+ } else {
+ prevbucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashprev));
+ prevbucket->hb_keys[HASH_INDEX(hashprev)].next =
+ bucket->hb_keys[ind].next;
+ /* link into free list */
+ bucket->hb_keys[ind].next = hashroot->free;
+ hashroot->free = hashbucket;
+ /* compress entry */
+ hash_compress(bucket, ind);
+ ibex_block_dirty((struct _block *)bucket);
+ ibex_block_dirty((struct _block *)prevbucket);
+ ibex_block_dirty((struct _block *)hashroot);
+ }
+ return;
+ }
+ hashprev = hashbucket;
+ hashbucket = bucket->hb_keys[ind].next;
+ }
+}
+
+/* set where the datablock is located */
+static void
+hash_set_data_block(struct _IBEXIndex *index, hashid_t keyid, blockid_t blockid, blockid_t tail)
+{
+ struct _hashblock *bucket;
+
+ d(printf("setting data block hash %d to %d tail %d\n", keyid, blockid, tail));
+
+ /* map to a block number */
+ g_assert((blockid & (BLOCK_SIZE-1)) == 0);
+ blockid >>= BLOCK_BITS;
+
+ bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(keyid));
+ if (bucket->hb_keys[HASH_INDEX(keyid)].root != blockid
+ || bucket->hb_keys[HASH_INDEX(keyid)].tail != tail) {
+ bucket->hb_keys[HASH_INDEX(keyid)].tail = tail;
+ bucket->hb_keys[HASH_INDEX(keyid)].root = blockid;
+ ibex_block_dirty((struct _block *)bucket);
+ }
+}
+
+static blockid_t
+hash_get_data_block(struct _IBEXIndex *index, hashid_t keyid, blockid_t *tail)
+{
+ struct _hashblock *bucket;
+
+ d(printf("getting data block hash %d\n", keyid));
+
+ if (keyid == 0) {
+ if (tail)
+ *tail = 0;
+ return 0;
+ }
+
+ bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(keyid));
+ if (tail)
+ *tail = bucket->hb_keys[HASH_INDEX(keyid)].tail;
+ return bucket->hb_keys[HASH_INDEX(keyid)].root << BLOCK_BITS;
+}
+
+static hashid_t
+hash_insert(struct _IBEXIndex *index, const char *key, int keylen)
+{
+ struct _hashroot *hashroot;
+ guint32 hash;
+ int hashentry;
+ blockid_t hashtable;
+ hashid_t hashbucket, keybucket, keyprev, keyfree;
+ struct _hashtableblock *table;
+ struct _hashblock *bucket;
+ int count;
+
+ g_assert(index != 0);
+ g_assert(index->root != 0);
+
+ d(printf("inserting hash %.*s\n", keylen, key));
+
+ hashroot = (struct _hashroot *)ibex_block_read(index->blocks, index->root);
+
+ /* find the table containing this entry */
+ hash = hash_key(key, keylen) % hashroot->size;
+ hashtable = hashroot->table[hash / (BLOCK_SIZE/sizeof(blockid_t))];
+ table = (struct _hashtableblock *)ibex_block_read(index->blocks, hashtable);
+ hashentry = hash % (BLOCK_SIZE/sizeof(blockid_t));
+ /* and its bucket */
+ hashbucket = table->buckets[hashentry];
+
+ /* now look for a free slot, first try the free list */
+ /* but dont try too hard if our key is just too long ... so just scan upto
+ 4 blocks, but if we dont find a space, tough ... */
+ keybucket = hashroot->free;
+ keyprev = 0;
+ count = 0;
+ while (keybucket && count<4) {
+ int space;
+
+ d(printf(" checking free %d\n", keybucket));
+
+ /* read the bucket containing this free key */
+ bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(keybucket));
+
+ /* check if there is enough space for the key */
+ space = &bucket->hb_keydata[bucket->hb_keys[bucket->used-1].keyoffset]
+ - (char *)&bucket->hb_keys[bucket->used];
+ if (space >= keylen) {
+ hash_expand(bucket, HASH_INDEX(keybucket), keylen);
+ memcpy(&bucket->hb_keydata[bucket->hb_keys[HASH_INDEX(keybucket)].keyoffset],
+ key, keylen);
+
+ /* check if there is free space still in this node, and there are no other empty blocks */
+ keyfree = bucket->hb_keys[HASH_INDEX(keybucket)].next;
+ if ((space-keylen) >= KEY_THRESHOLD) {
+ int i;
+ int head = ARRAY_LEN(bucket->hb_keydata);
+ int found = FALSE;
+
+ for (i=0;i<bucket->used;i++) {
+ if (bucket->hb_keys[i].keyoffset == head) {
+ /* already have a free slot in this block, leave it */
+ found = TRUE;
+ break;
+ }
+ head = bucket->hb_keys[i].keyoffset;
+ }
+ if (!found) {
+ /* we should link in a new free slot for this node */
+ bucket->hb_keys[bucket->used].next = bucket->hb_keys[HASH_INDEX(keybucket)].next;
+ bucket->hb_keys[bucket->used].keyoffset = bucket->hb_keys[bucket->used-1].keyoffset;
+ keyfree = HASH_KEY(HASH_BLOCK(keybucket), bucket->used);
+ bucket->used++;
+ }
+ }
+ /* link 'keyfree' back to the parent ... */
+ if (keyprev == 0) {
+ hashroot->free = keyfree;
+ ibex_block_dirty((struct _block *)hashroot);
+ } else {
+ struct _hashblock *prevbucket;
+ prevbucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(keyprev));
+ prevbucket->hb_keys[HASH_INDEX(keyprev)].next = keyfree;
+ ibex_block_dirty((struct _block *)prevbucket);
+ }
+
+ /* link into the hash chain */
+ bucket->hb_keys[HASH_INDEX(keybucket)].next = hashbucket;
+ bucket->hb_keys[HASH_INDEX(keybucket)].root = 0;
+ bucket->hb_keys[HASH_INDEX(keybucket)].tail = 0;
+ table->buckets[hashentry] = keybucket;
+ ibex_block_dirty((struct _block *)table);
+ ibex_block_dirty((struct _block *)bucket);
+
+ d(printf(" new key id %d\n", keybucket));
+ d(printf(" new free id %d\n", hashroot->free));
+
+ return keybucket;
+ }
+
+ count++;
+ keyprev = keybucket;
+ keybucket = bucket->hb_keys[HASH_INDEX(keybucket)].next;
+ }
+
+ /* else create a new block ... */
+ keybucket = ibex_block_get(index->blocks);
+ bucket = (struct _hashblock *)ibex_block_read(index->blocks, keybucket);
+
+ d(printf("creating new key bucket %d\n", keybucket));
+
+ memset(bucket, 0, sizeof(*bucket));
+
+ bucket->used = 2;
+ /* first block, is the new key */
+ bucket->hb_keys[0].keyoffset = ARRAY_LEN(bucket->hb_keydata) - keylen;
+ memcpy(&bucket->hb_keydata[bucket->hb_keys[0].keyoffset], key, keylen);
+ bucket->hb_keys[0].next = hashbucket;
+ bucket->hb_keys[0].root = 0;
+ bucket->hb_keys[0].tail = 0;
+
+ table->buckets[hashentry] = HASH_KEY(keybucket, 0);
+
+ /* next block is a free block, link into free list */
+ bucket->hb_keys[1].keyoffset = bucket->hb_keys[0].keyoffset;
+ bucket->hb_keys[1].next = hashroot->free;
+ hashroot->free = HASH_KEY(keybucket, 1);
+
+ ibex_block_dirty((struct _block *)hashroot);
+ ibex_block_dirty((struct _block *)table);
+ ibex_block_dirty((struct _block *)bucket);
+
+ d(printf(" new key id %d\n", HASH_KEY(keybucket, 0)));
+ d(printf(" new free id %d\n", hashroot->free));
+
+ return HASH_KEY(keybucket, 0);
+}
+
+/* debug */
+void ibex_hash_dump(struct _IBEXIndex *index);
+
+void
+ibex_hash_dump(struct _IBEXIndex *index)
+{
+ int i;
+ struct _hashtableblock *table;
+ struct _hashblock *bucket;
+ struct _hashroot *hashroot;
+ blockid_t hashtable;
+ hashid_t hashbucket;
+
+ printf("Walking hash tree:\n");
+ hashroot = (struct _hashroot *)ibex_block_read(index->blocks, index->root);
+ for (i=0;i<hashroot->size;i++) {
+ printf("Hash table chain: %d\n", i);
+ hashtable = hashroot->table[i / (BLOCK_SIZE/sizeof(blockid_t))];
+ table = (struct _hashtableblock *)ibex_block_read(index->blocks, hashtable);
+ hashbucket = table->buckets[i % (BLOCK_SIZE/sizeof(blockid_t))];
+ while (hashbucket) {
+ int len;
+
+ bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashbucket));
+ printf(" bucket %d: [used %d]", hashbucket, bucket->used);
+ if (HASH_INDEX(hashbucket) == 0) {
+ len = ARRAY_LEN(bucket->hb_keydata) -
+ bucket->hb_keys[HASH_INDEX(hashbucket)].keyoffset;
+ } else {
+ len = bucket->hb_keys[HASH_INDEX(hashbucket)-1].keyoffset -
+ bucket->hb_keys[HASH_INDEX(hashbucket)].keyoffset;
+ }
+ printf("'%.*s' = %d next=%d\n", len, &bucket->hb_keydata[bucket->hb_keys[HASH_INDEX(hashbucket)].keyoffset],
+ bucket->hb_keys[HASH_INDEX(hashbucket)].root,
+ bucket->hb_keys[HASH_INDEX(hashbucket)].next);
+ ibex_diskarray_dump(index->blocks,
+ bucket->hb_keys[HASH_INDEX(hashbucket)].root << BLOCK_BITS,
+ bucket->hb_keys[HASH_INDEX(hashbucket)].tail);
+
+ hashbucket = bucket->hb_keys[HASH_INDEX(hashbucket)].next;
+ }
+ /* make sure its still in the cache */
+ hashroot = (struct _hashroot *)ibex_block_read(index->blocks, index->root);
+ }
+
+ hashbucket = hashroot->free;
+ printf("Dumping free lists ..\n");
+ while (hashbucket) {
+ printf(" %d", hashbucket);
+ bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashbucket));
+ hashbucket = bucket->hb_keys[HASH_INDEX(hashbucket)].next;
+ }
+ printf("\n");
+}
+
+#if 0
+int main(int argc, char **argv)
+{
+ struct _memcache *bc;
+ struct _IBEXIndex *hash;
+ int i;
+
+ bc = ibex_block_cache_open("index.db", O_CREAT|O_RDWR, 0600);
+ hash = ibex_hash_class.create(bc, 1024);
+ for (i=0;i<10000;i++) {
+ char key[16];
+ sprintf(key, "key %d", i);
+ ibex_hash_class.insert(hash, key, strlen(key));
+ }
+
+ for (i=500;i<1000;i++) {
+ char key[16];
+ sprintf(key, "key %d", i);
+ ibex_hash_class.remove(hash, key, strlen(key));
+ }
+
+ for (i=500;i<1000;i++) {
+ char key[16];
+ sprintf(key, "key %d", i);
+ ibex_hash_class.insert(hash, key, strlen(key));
+ }
+
+
+ ibex_hash_dump(hash);
+
+ for (i=0;i<2000;i++) {
+ char key[16], *lookup;
+ hashid_t keyid;
+ blockid_t root, tail;
+
+ sprintf(key, "key %d", i);
+ keyid = ibex_hash_class.find(hash, key, strlen(key));
+ lookup = ibex_hash_class.get_key(hash, keyid, 0);
+ root = ibex_hash_class.get_data(hash, keyid, &tail);
+ printf("key %s = %d = '%s' root:%d tail:%d \n", key, keyid, lookup, root, tail);
+ g_free(lookup);
+ }
+
+ ibex_hash_class.close(hash);
+ ibex_block_cache_close(bc);
+ return 0;
+}
+
+#endif
diff --git a/libibex/words.c b/libibex/ibex_block.c
index 7642b0a305..bbc6fb9c8b 100644
--- a/libibex/words.c
+++ b/libibex/ibex_block.c
@@ -1,33 +1,16 @@
-/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-/*
- * Copyright (C) 2000 Helix Code, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Library General Public License
- * as published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Library General Public License for more details.
- *
- * You should have received a copy of the GNU Library General Public
- * License along with the Gnome Library; see the file COPYING.LIB. If not,
- * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 02111-1307, USA.
- */
-/* words.c: low-level indexing ops */
+#include <glib.h>
+#include <db.h>
+#include <stdio.h>
+#include <unicode.h>
#include <ctype.h>
-#include <errno.h>
#include <string.h>
-#include <unicode.h>
-
#include "ibex_internal.h"
+#define d(x)
+
static signed char utf8_trans[] = {
'A', 'A', 'A', 'A', 'A', 'A', -1, 'C', 'E', 'E', 'E', 'E', 'I', 'I',
'I', 'I', -2, 'N', 'O', 'O', 'O', 'O', 'O', '*', 'O', 'U', 'U', 'U',
@@ -60,7 +43,7 @@ static char *utf8_long_trans[] = {
* It is not safe to call this routine with bad UTF-8.
*/
static void
-normalize_word (char *start, char *end, char *buf)
+ibex_normalise_word(char *start, char *end, char *buf)
{
unsigned char *s, *d;
unicode_char_t uc;
@@ -151,39 +134,6 @@ utf8_category (char *sp, char **snp, char *send)
}
}
-static ibex_file *
-get_ibex_file (ibex *ib, char *name)
-{
- ibex_file *ibf;
-
- ibf = g_tree_lookup (ib->files, name);
- if (!ibf) {
- ibf = g_malloc (sizeof (ibex_file));
- ibf->name = g_strdup (name);
- ibf->index = 0;
- g_tree_insert (ib->files, ibf->name, ibf);
- ib->dirty = TRUE;
- }
- return ibf;
-}
-
-static void
-ref_word (ibex *ib, ibex_file *ibf, char *word)
-{
- GPtrArray *refs;
-
- refs = g_hash_table_lookup (ib->words, word);
- if (!refs) {
- refs = g_ptr_array_new ();
- g_hash_table_insert (ib->words, g_strdup (word), refs);
- g_ptr_array_add (refs, ibf);
- ib->dirty = TRUE;
- } else if (g_ptr_array_index (refs, refs->len - 1) != ibf) {
- g_ptr_array_add (refs, ibf);
- ib->dirty = TRUE;
- }
-}
-
/**
* ibex_index_buffer: the lowest-level ibex indexing interface
* @ib: an ibex
@@ -206,12 +156,13 @@ ref_word (ibex *ib, ibex_file *ibf, char *word)
* Return value: 0 on success, -1 on failure.
**/
int
-ibex_index_buffer (ibex *ib, char *name, char *buffer,
- size_t len, size_t *unread)
+ibex_index_buffer (ibex *ib, char *name, char *buffer, size_t len, size_t *unread)
{
char *p, *q, *nq, *end, *word;
- ibex_file *ibf = get_ibex_file (ib, name);
int wordsiz, cat;
+ GHashTable *words = g_hash_table_new(g_str_hash, g_str_equal);
+ GPtrArray *wordlist = g_ptr_array_new();
+ int i, ret=-1;
if (unread)
*unread = 0;
@@ -229,12 +180,9 @@ ibex_index_buffer (ibex *ib, char *name, char *buffer,
p = q;
}
if (p == end) {
- g_free (word);
- return 0;
+ goto done;
} else if (cat == IBEX_INVALID) {
- errno = EINVAL;
- g_free (word);
- return -1;
+ goto error;
} else if (cat == IBEX_INCOMPLETE)
q = end;
@@ -246,24 +194,107 @@ ibex_index_buffer (ibex *ib, char *name, char *buffer,
}
if (cat == IBEX_INVALID ||
(cat == IBEX_INCOMPLETE && !unread)) {
- errno = EINVAL;
- g_free (word);
- return -1;
+ goto error;
} else if (cat == IBEX_INCOMPLETE || (q == end && unread)) {
*unread = end - p;
- g_free (word);
- return 0;
+ goto done;
}
if (wordsiz < q - p + 1) {
wordsiz = q - p + 1;
word = g_realloc (word, wordsiz);
}
- normalize_word (p, q, word);
- ref_word (ib, ibf, word);
+ ibex_normalise_word (p, q, word);
+ if (word[0]) {
+ if (g_hash_table_lookup(words, word) == 0) {
+ char *newword = g_strdup(word);
+ g_ptr_array_add(wordlist, newword);
+ g_hash_table_insert(words, newword, name);
+ }
+ }
p = q;
}
-
+done:
+ d(printf("name %s count %d size %d\n", name, wordlist->len, len));
+ ib->words->klass->add_list(ib->words, name, wordlist);
+ ret = 0;
+error:
+ for (i=0;i<wordlist->len;i++)
+ g_free(wordlist->pdata[i]);
+ g_ptr_array_free(wordlist, TRUE);
+ g_hash_table_destroy(words);
g_free (word);
+ return ret;
+}
+
+
+ibex *ibex_open (char *file, int flags, int mode)
+{
+ ibex *ib;
+
+ ib = g_malloc0(sizeof(*ib));
+ ib->blocks = ibex_block_cache_open(file, flags, mode);
+ if (ib->blocks == 0) {
+ g_warning("create: Error occured?: %s\n", strerror(errno));
+ g_free(ib);
+ return NULL;
+ }
+ /* FIXME: the blockcache or the wordindex needs to manage the other one */
+ ib->words = ib->blocks->words;
+
+ return ib;
+}
+
+int ibex_save (ibex *ib)
+{
+ printf("syncing database\n");
+ ib->words->klass->sync(ib->words);
+ /* FIXME: some return */
+ ibex_block_cache_sync(ib->blocks);
return 0;
}
+
+int ibex_close (ibex *ib)
+{
+ int ret = 0;
+
+ printf("closing database\n");
+
+ ib->words->klass->close(ib->words);
+ ibex_block_cache_close(ib->blocks);
+ g_free(ib);
+ return ret;
+}
+
+void ibex_unindex (ibex *ib, char *name)
+{
+ d(printf("trying to unindex '%s'\n", name));
+ ib->words->klass->unindex_name(ib->words, name);
+}
+
+GPtrArray *ibex_find (ibex *ib, char *word)
+{
+ char *normal;
+ int len;
+
+ len = strlen(word);
+ normal = alloca(len+1);
+ ibex_normalise_word(word, word+len, normal);
+ return ib->words->klass->find(ib->words, normal);
+}
+
+gboolean ibex_find_name (ibex *ib, char *name, char *word)
+{
+ char *normal;
+ int len;
+
+ len = strlen(word);
+ normal = alloca(len+1);
+ ibex_normalise_word(word, word+len, normal);
+ return ib->words->klass->find_name(ib->words, name, normal);
+}
+
+gboolean ibex_contains_name(ibex *ib, char *name)
+{
+ return ib->words->klass->contains_name(ib->words, name);
+}
diff --git a/libibex/ibex_db.c b/libibex/ibex_db.c
new file mode 100644
index 0000000000..1ef0b3d1bc
--- /dev/null
+++ b/libibex/ibex_db.c
@@ -0,0 +1,512 @@
+
+#include <glib.h>
+
+#include <db.h>
+#include <stdio.h>
+#include <unicode.h>
+#include <ctype.h>
+
+#include "ibex_internal.h"
+
+#define d(x)
+
+/*
+ Uses 2 databases:
+
+ names - HASH - lists which files are stored
+ words - BTREE - index words to files
+
+*/
+
+static int
+db_delete_name(DB *db, const char *name)
+{
+ DBC *cursor;
+ int err, namelen;
+ DBT key, data;
+
+ printf("called to delete name %s\n", name);
+ return 0;
+
+ err = db->cursor(db, NULL, &cursor, 0);
+ if (err != 0) {
+ printf("Error occured?: %s\n", db_strerror(err));
+ return err;
+ }
+
+ memset(&key, 0, sizeof(key));
+ memset(&data, 0, sizeof(data));
+ namelen = strlen(name);
+
+ err = cursor->c_get(cursor, &key, &data, DB_FIRST);
+ while (err == 0) {
+ if (data.size == namelen && memcmp(data.data, name, namelen) == 0) {
+ d(printf("Found match, removing it\n"));
+ cursor->c_del(cursor, 0);
+ } else {
+ data.size = namelen;
+ data.data = (void *)name;
+ if (cursor->c_get(cursor, &key, &data, DB_GET_BOTH) == 0) {
+ d(printf("seek to match, removing it\n"));
+ cursor->c_del(cursor, 0);
+ } else {
+ d(printf("seek to match, not found\n"));
+ }
+ }
+ err = cursor->c_get(cursor, &key, &data, DB_NEXT_NODUP);
+ }
+
+ cursor->c_close(cursor);
+
+ return 0;
+}
+
+static int
+db_index_words(DB *db, char *name, char **words)
+{
+ DBT key, data;
+ int err = 0;
+ char *word;
+
+ memset(&key, 0, sizeof(key));
+ memset(&data, 0, sizeof(data));
+ data.data = name;
+ data.size = strlen(name);
+ for (;(word=*words);words++) {
+ /* normalise word first ... */
+ key.data = word;
+ key.size = strlen(word);
+
+ err = db->put(db, NULL, &key, &data, 0);
+ if (err != 0)
+ printf("Error occured?: %s\n", db_strerror(err));
+ }
+
+ return err;
+}
+
+static int
+db_index_word(DB *db, char *name, char *word)
+{
+ DBT key, data;
+ int err = 0;
+
+ memset(&key, 0, sizeof(key));
+ memset(&data, 0, sizeof(data));
+ data.data = name;
+ data.size = strlen(name);
+ key.data = word;
+ key.size = strlen(word);
+
+ err = db->put(db, NULL, &key, &data, 0);
+ if (err != 0)
+ printf("Error occured?: %s\n", db_strerror(err));
+
+ return err;
+}
+
+static GPtrArray *
+db_find(DB *db, char *word)
+{
+ DBT key, data;
+ int err;
+ DBC *cursor;
+ GPtrArray *matches = g_ptr_array_new();
+
+ memset(&key, 0, sizeof(key));
+ memset(&data, 0, sizeof(data));
+
+ err = db->cursor(db, NULL, &cursor, 0);
+ if (err != 0) {
+ printf("Error occured?: %s\n", db_strerror(err));
+ return matches;
+ }
+
+ memset(&key, 0, sizeof(key));
+ memset(&data, 0, sizeof(data));
+
+ key.size = strlen(word);
+ key.data = word;
+
+ err = cursor->c_get(cursor, &key, &data, DB_SET);
+ while (err == 0) {
+ char *name;
+ name = g_malloc(data.size+1);
+ memcpy(name, data.data, data.size);
+ name[data.size] = '\0';
+ g_ptr_array_add(matches, name);
+ err = cursor->c_get(cursor, &key, &data, DB_NEXT_DUP);
+ }
+ if (err != DB_NOTFOUND) {
+ printf("Error occured?: %s\n", db_strerror(err));
+ /* what to do? doesn't matter too much its just a search ... */
+ }
+
+ cursor->c_close(cursor);
+
+ return matches;
+}
+
+/* check that db contains key @word, optionally with contents @name */
+static gboolean
+db_contains_word(DB *db, char *name, char *word)
+{
+ DBT key, data;
+
+ memset(&key, 0, sizeof(key));
+ memset(&data, 0, sizeof(data));
+
+ if (name != NULL) {
+ data.size = strlen(name);
+ data.data = name;
+ }
+ key.size = strlen(word);
+ key.data = word;
+
+ return db->get(db, NULL, &key, &data, name?DB_GET_BOTH:DB_SET) == 0;
+}
+
+static int
+db_delete_word(DB *db, char *word)
+{
+ DBT key;
+
+ memset(&key, 0, sizeof(key));
+ key.size = strlen(word);
+ key.data = word;
+ return db->del(db, NULL, &key, 0);
+}
+
+
+static signed char utf8_trans[] = {
+ 'A', 'A', 'A', 'A', 'A', 'A', -1, 'C', 'E', 'E', 'E', 'E', 'I', 'I',
+ 'I', 'I', -2, 'N', 'O', 'O', 'O', 'O', 'O', '*', 'O', 'U', 'U', 'U',
+ 'U', 'Y', -3, -4, 'a', 'a', 'a', 'a', 'a', 'a', -5, 'c', 'e', 'e',
+ 'e', 'e', 'i', 'i', 'i', 'i', -6, 'n', 'o', 'o', 'o', 'o', 'o', '/',
+ 'o', 'u', 'u', 'u', 'u', 'y', -7, 'y', 'A', 'a', 'A', 'a', 'A', 'a',
+ 'C', 'c', 'C', 'c', 'C', 'c', 'C', 'c', 'D', 'd', 'D', 'd', 'E', 'e',
+ 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'G', 'g', 'G', 'g', 'G', 'g',
+ 'G', 'g', 'H', 'h', 'H', 'h', 'I', 'i', 'I', 'i', 'I', 'i', 'I', 'i',
+ 'I', 'i', -8, -9, 'J', 'j', 'K', 'k', 'k', 'L', 'l', 'L', 'l', 'L',
+ 'l', 'L', 'l', 'L', 'l', 'N', 'n', 'N', 'n', 'N', 'n', 'n', -10, -11,
+ 'O', 'o', 'O', 'o', 'O', 'o', -12, -13, 'R', 'r', 'R', 'r', 'R', 'r',
+ 'S', 'r', 'S', 's', 'S', 's', 'S', 's', 'T', 't', 'T', 't', 'T', 't',
+ 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'W', 'w',
+ 'Y', 'y', 'Y', 'Z', 'z', 'Z', 'z', 'Z', 'z', 's'
+};
+
+static char *utf8_long_trans[] = {
+ "AE", "TH", "TH", "ss", "ae", "th", "th", "IJ", "ij",
+ "NG", "ng", "OE", "oe"
+};
+
+/* This is a bit weird. It takes pointers to the start and end (actually
+ * just past the end) of a UTF-8-encoded word, and a buffer at least 1
+ * byte longer than the length of the word. It copies the word into the
+ * buffer in all lowercase without accents, and splits up ligatures.
+ * (Since any ligature would be a multi-byte character in UTF-8, splitting
+ * them into two US-ASCII characters won't overrun the buffer.)
+ *
+ * It is not safe to call this routine with bad UTF-8.
+ */
+static void
+ibex_normalise_word(char *start, char *end, char *buf)
+{
+ unsigned char *s, *d;
+ unicode_char_t uc;
+
+ s = (unsigned char *)start;
+ d = (unsigned char *)buf;
+ while (s < (unsigned char *)end) {
+ if (*s < 0x80) {
+ /* US-ASCII character: copy unless it's
+ * an apostrophe.
+ */
+ if (*s != '\'')
+ *d++ = tolower (*s);
+ s++;
+ } else {
+ char *next = unicode_get_utf8 (s, &uc);
+ if (uc >= 0xc0 && uc < 0xc0 + sizeof (utf8_trans)) {
+ signed char ch = utf8_trans[uc - 0xc0];
+ if (ch > 0)
+ *d++ = tolower (ch);
+ else {
+ *d++ = tolower (utf8_long_trans[-ch - 1][0]);
+ *d++ = tolower (utf8_long_trans[-ch - 1][1]);
+ }
+ s = next;
+ } else {
+ while (s < (unsigned char *)next)
+ *d++ = *s++;
+ }
+ }
+ }
+ *d = '\0';
+}
+
+enum { IBEX_ALPHA, IBEX_NONALPHA, IBEX_INVALID, IBEX_INCOMPLETE };
+
+/* This incorporates parts of libunicode, because there's no way to
+ * force libunicode to not read past a certain point.
+ */
+static int
+utf8_category (char *sp, char **snp, char *send)
+{
+ unsigned char *p = (unsigned char *)sp, **np = (unsigned char **)snp;
+ unsigned char *end = (unsigned char *)send;
+
+ if (isascii (*p)) {
+ *np = p + 1;
+ if (isalpha (*p) || *p == '\'')
+ return IBEX_ALPHA;
+ return IBEX_NONALPHA;
+ } else {
+ unicode_char_t uc;
+ int more;
+
+ if ((*p & 0xe0) == 0xc0) {
+ more = 1;
+ uc = *p & 0x1f;
+ } else if ((*p & 0xf0) == 0xe0) {
+ more = 2;
+ uc = *p & 0x0f;
+ } else if ((*p & 0xf8) == 0xf0) {
+ more = 3;
+ uc = *p & 0x07;
+ } else if ((*p & 0xfc) == 0xf8) {
+ more = 4;
+ uc = *p & 0x03;
+ } else if ((*p & 0xfe) == 0xfc) {
+ more = 5;
+ uc = *p & 0x01;
+ } else
+ return IBEX_INVALID;
+
+ if (p + more > end)
+ return IBEX_INCOMPLETE;
+
+ while (more--) {
+ if ((*++p & 0xc0) != 0x80)
+ return IBEX_INVALID;
+ uc <<= 6;
+ uc |= *p & 0x3f;
+ }
+
+ *np = p + 1;
+ if (unicode_isalpha (uc))
+ return IBEX_ALPHA;
+ else
+ return IBEX_NONALPHA;
+ }
+}
+
+static void
+do_insert_words(char *key, char *name, ibex *ib)
+{
+ db_index_word(ib->words, name, key);
+ g_free(key);
+}
+
+static void
+do_free_words(char *key, char *name, void *data)
+{
+ g_free(key);
+}
+
+/**
+ * ibex_index_buffer: the lowest-level ibex indexing interface
+ * @ib: an ibex
+ * @name: the name of the file being indexed
+ * @buffer: a buffer containing data from the file
+ * @len: the length of @buffer
+ * @unread: an output argument containing the number of unread bytes
+ *
+ * This routine indexes up to @len bytes from @buffer into @ib.
+ * If @unread is NULL, the indexer assumes that the buffer ends on a
+ * word boundary, and will index all the way to the end of the
+ * buffer. If @unread is not NULL, and the buffer ends with an
+ * alphabetic character, the indexer will assume that the buffer has
+ * been cut off in the middle of a word, and return the number of
+ * un-indexed bytes at the end of the buffer in *@unread. The caller
+ * should then read in more data through whatever means it has
+ * and pass in the unread bytes from the original buffer, followed
+ * by the new data, on its next call.
+ *
+ * Return value: 0 on success, -1 on failure.
+ **/
+int
+ibex_index_buffer (ibex *ib, char *name, char *buffer, size_t len, size_t *unread)
+{
+ char *p, *q, *nq, *end, *word;
+ int wordsiz, cat;
+ GHashTable *words = g_hash_table_new(g_str_hash, g_str_equal);
+
+ if (unread)
+ *unread = 0;
+
+ end = buffer + len;
+ wordsiz = 20;
+ word = g_malloc (wordsiz);
+
+ p = buffer;
+ while (p < end) {
+ while (p < end) {
+ cat = utf8_category (p, &q, end);
+ if (cat != IBEX_NONALPHA)
+ break;
+ p = q;
+ }
+ if (p == end) {
+ goto done;
+ } else if (cat == IBEX_INVALID) {
+ goto error;
+ } else if (cat == IBEX_INCOMPLETE)
+ q = end;
+
+ while (q < end) {
+ cat = utf8_category (q, &nq, end);
+ if (cat != IBEX_ALPHA)
+ break;
+ q = nq;
+ }
+ if (cat == IBEX_INVALID ||
+ (cat == IBEX_INCOMPLETE && !unread)) {
+ goto error;
+ } else if (cat == IBEX_INCOMPLETE || (q == end && unread)) {
+ *unread = end - p;
+ goto done;
+ }
+
+ if (wordsiz < q - p + 1) {
+ wordsiz = q - p + 1;
+ word = g_realloc (word, wordsiz);
+ }
+ ibex_normalise_word (p, q, word);
+ if (word[0]) {
+ if (g_hash_table_lookup(words, word) == 0) {
+ g_hash_table_insert(words, g_strdup(word), name);
+ }
+ }
+ p = q;
+ }
+done:
+ /* FIXME: do this and inserts within a transaction */
+ if (!db_contains_word(ib->names, NULL, name)) {
+ printf("adding '%s' to database\n", name);
+ db_index_word(ib->names, "", name);
+ }
+ g_hash_table_foreach(words, (GHFunc)do_insert_words, ib);
+ g_hash_table_destroy(words);
+ g_free (word);
+ return 0;
+error:
+ g_hash_table_foreach(words, (GHFunc)do_free_words, NULL);
+ g_hash_table_destroy(words);
+ g_free (word);
+ return -1;
+}
+
+
+ibex *ibex_open (char *file, int flags, int mode)
+{
+ ibex *ib;
+ u_int32_t dbflags = 0;
+ int err;
+
+ ib = g_malloc0(sizeof(*ib));
+ err = db_create(&ib->words, NULL, 0);
+ if (err != 0) {
+ g_warning("create: Error occured?: %s\n", db_strerror(err));
+ g_free(ib);
+ return NULL;
+ }
+
+ err = ib->words->set_flags(ib->words, DB_DUP);
+ if (err != 0) {
+ g_warning("set flags: Error occured?: %s\n", db_strerror(err));
+ ib->words->close(ib->words, 0);
+ g_free(ib);
+ return NULL;
+ }
+
+ if (flags & O_CREAT)
+ dbflags |= DB_CREATE;
+ if (flags & O_EXCL)
+ dbflags |= DB_EXCL;
+ if (flags & O_RDONLY)
+ dbflags |= DB_RDONLY;
+
+ /* 1MB cache size? */
+ ib->words->set_cachesize(ib->words, 0, 1000000, 0);
+
+ err = ib->words->open(ib->words, file, "words", DB_BTREE, dbflags, mode);
+ if (err != 0) {
+ printf("open: Error occured?: %s\n", db_strerror(err));
+ ib->words->close(ib->words, 0);
+ g_free(ib);
+ return NULL;
+ }
+
+ /* FIXME: check returns */
+ err = db_create(&ib->names, NULL, 0);
+ err = ib->names->open(ib->names, file, "names", DB_HASH, dbflags, mode);
+
+ return ib;
+}
+
+int ibex_save (ibex *ib)
+{
+ printf("syncing database\n");
+ ib->names->sync(ib->names, 0);
+ return ib->words->sync(ib->words, 0);
+}
+
+int ibex_close (ibex *ib)
+{
+ int ret;
+
+ printf("closing database\n");
+
+ ret = ib->names->close(ib->names, 0);
+ ret = ib->words->close(ib->words, 0);
+ g_free(ib);
+ return ret;
+}
+
+void ibex_unindex (ibex *ib, char *name)
+{
+ printf("trying to unindex '%s'\n", name);
+ if (db_contains_word(ib->names, NULL, name)) {
+ /* FIXME: do within transaction? */
+ db_delete_name(ib->words, name);
+ db_delete_word(ib->names, name);
+ }
+}
+
+GPtrArray *ibex_find (ibex *ib, char *word)
+{
+ char *normal;
+ int len;
+
+ len = strlen(word);
+ normal = alloca(len+1);
+ ibex_normalise_word(word, word+len, normal);
+ return db_find(ib->words, normal);
+}
+
+gboolean ibex_find_name (ibex *ib, char *name, char *word)
+{
+ char *normal;
+ int len;
+
+ len = strlen(word);
+ normal = alloca(len+1);
+ ibex_normalise_word(word, word+len, normal);
+ return db_contains_word(ib->words, name, normal);
+}
+
+gboolean ibex_contains_name(ibex *ib, char *name)
+{
+ return db_contains_word(ib->names, NULL, name);
+}
diff --git a/libibex/ibex_internal.h b/libibex/ibex_internal.h
index 1647147a3a..a21867a2ab 100644
--- a/libibex/ibex_internal.h
+++ b/libibex/ibex_internal.h
@@ -21,19 +21,13 @@
#include <glib.h>
#include "ibex.h"
+#include "block.h"
+#include "wordindex.h"
-#define IBEX_VERSION "ibex1"
+#define IBEX_VERSION "ibex3"
struct ibex {
char *path;
- GTree *files;
- GHashTable *words;
- GPtrArray *oldfiles;
- gboolean dirty;
+ struct _memcache *blocks;
+ struct _IBEXWord *words;
};
-
-struct ibex_file {
- char *name;
- long index;
-};
-typedef struct ibex_file ibex_file;
diff --git a/libibex/index.c b/libibex/index.c
deleted file mode 100644
index 1afedbd251..0000000000
--- a/libibex/index.c
+++ /dev/null
@@ -1,154 +0,0 @@
-/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
-/*
- * Copyright (C) 2000 Helix Code, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Library General Public License
- * as published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Library General Public License for more details.
- *
- * You should have received a copy of the GNU Library General Public
- * License along with the Gnome Library; see the file COPYING.LIB. If not,
- * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 02111-1307, USA.
- */
-
-/* index.c: high-level indexing ops */
-
-#include <errno.h>
-#include <fcntl.h>
-#include <string.h>
-#include <sys/stat.h>
-#include <unistd.h>
-
-#include "ibex_internal.h"
-
-/**
- * ibex_index_file: Index a file by name
- * @ib: an ibex
- * @filename: name of the file to index
- *
- * This indexes the given file into @ib. If @filename is already in
- * the ibex, the existing index entries for it are discarded and the
- * file is indexed anew.
- *
- * Return value: 0 on success, -1 on failure.
- **/
-int
-ibex_index_file (ibex *ib, char *filename)
-{
- int fd;
- int status;
- struct stat st;
-
- fd = open (filename, O_RDONLY);
- if (fd < 0)
- return -1;
-
- if (fstat (fd, &st) == -1) {
- close (fd);
- return -1;
- }
- if (!S_ISREG (st.st_mode)) {
- close (fd);
- errno = EINVAL;
- return -1;
- }
-
- ibex_unindex (ib, filename);
- status = ibex_index_fd (ib, filename, fd, st.st_size);
- close (fd);
- return status;
-}
-
-/**
- * ibex_index_fd: Index a file given a file descriptor
- * @ib: an ibex
- * @name: the name of the file being indexed
- * @fd: a file descriptor, open for reading
- * @len: the number of bytes to read from the file
- *
- * This indexes a file, or a part of a file, given an open file
- * descriptor and a size. There is no requirement that @name
- * actually correspond to @fd in any particular way.
- *
- * If the function returns successfully, the file descriptor offset of
- * @fd will be exactly @len bytes beyond where it was when the
- * function was called. The indexer assumes that this point is a word
- * boundary.
- *
- * The behavior of this function is not defined if it is not
- * possible to read @len bytes off @fd.
- *
- * Return value: 0 on success, -1 on failure.
- **/
-int
-ibex_index_fd (ibex *ib, char *name, int fd, size_t len)
-{
- char *buf;
- int off = 0, nread, status;
-
- buf = g_malloc (len);
- do {
- nread = read (fd, buf + off, len - off);
- if (nread == -1) {
- g_free (buf);
- return -1;
- }
- off += nread;
- } while (off != len);
-
- status = ibex_index_buffer (ib, name, buf, len, NULL);
- g_free (buf);
-
- return status;
-}
-
-/**
- * ibex_unindex: Remove a file from the ibex
- * @ib: an ibex
- * @name: name of the file to remove
- *
- * This removes all references to @name from @ib. No memory is freed
- * right away, but further searches on @ib will never return @name.
- **/
-void
-ibex_unindex (ibex *ib, char *name)
-{
- ibex_file *ibf;
-
- ibf = g_tree_lookup (ib->files, name);
- if (ibf) {
- ibf->index = -1;
- g_tree_remove (ib->files, name);
- g_ptr_array_add (ib->oldfiles, ibf);
- ib->dirty = TRUE;
- }
-}
-
-/**
- * ibex_rename: Rename a file in the ibex
- * @ib: an ibex
- * @oldname: the old name of the file
- * @newname: the new name of the file
- *
- * This renames a file in the ibex.
- **/
-void
-ibex_rename (ibex *ib, char *oldname, char *newname)
-{
- ibex_file *ibf;
-
- ibf = g_tree_lookup (ib->files, oldname);
- if (ibf) {
- g_tree_remove (ib->files, oldname);
- g_free (ibf->name);
- ibf->name = g_strdup (newname);
- g_tree_insert (ib->files, ibf->name, ibf);
- }
-}
diff --git a/libibex/index.h b/libibex/index.h
new file mode 100644
index 0000000000..35e3df23c2
--- /dev/null
+++ b/libibex/index.h
@@ -0,0 +1,82 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
+ *
+ * Copyright (C) 2000 Helix Code, Inc.
+ *
+ * Authors: Michael Zucchi <notzed@helixcode.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public License
+ * as published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with the Gnome Library; see the file COPYING.LIB. If not,
+ * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef _INDEX_H
+#define _INDEX_H
+
+/* an indexing 'class' maps a key to 1 piece of info */
+
+struct _IBEXIndex {
+ struct _IBEXIndexClass *klass;
+ struct _memcache *blocks;
+ blockid_t root; /* root block of ondisk index data */
+};
+
+struct _IBEXIndexClass {
+
+ struct _IBEXIndex *(*create)(struct _memcache *bc, int size);
+ struct _IBEXIndex *(*open)(struct _memcache *bc, blockid_t root);
+
+ int (*sync)(struct _IBEXIndex *);
+ int (*close)(struct _IBEXIndex *);
+
+ /* lookup a key in the index, returns the keyid of this item, or 0 if not found */
+ guint32 (*find)(struct _IBEXIndex *, const char *key, int keylen);
+
+ /* remove a key from the index */
+ void (*remove)(struct _IBEXIndex *, const char *key, int keylen);
+
+ /* insert a new key into the index, the keyid is returned */
+ guint32 (*insert)(struct _IBEXIndex *, const char *key, int keylen);
+
+ /* get the key contents/key length from the keyid */
+ char *(*get_key)(struct _IBEXIndex *, guint32 keyid, int *keylenptr);
+
+ /* set the key contents based on the keyid */
+ void (*set_data)(struct _IBEXIndex *, guint32 keyid, blockid_t datablock, blockid_t tail);
+
+ /* get the key contents based on the keyid */
+ blockid_t (*get_data)(struct _IBEXIndex *, guint32 keyid, blockid_t *tail);
+};
+
+/* a storage class, stores lists of lists of id's */
+
+struct _IBEXStore {
+ struct _IBEXStoreClass *klass;
+ struct _memcache *blocks;
+};
+
+struct _IBEXStoreClass {
+ struct _IBEXStore *(*create)(struct _memcache *bc);
+ int (*sync)(struct _IBEXStore *store);
+ int (*close)(struct _IBEXStore *store);
+
+ blockid_t (*add)(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, nameid_t data);
+ blockid_t (*add_list)(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, GArray *data);
+ blockid_t (*remove)(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, nameid_t data);
+ void (*free)(struct _IBEXStore *store, blockid_t head, blockid_t tail);
+
+ gboolean (*find)(struct _IBEXStore *store, blockid_t head, blockid_t tail, nameid_t data);
+ GArray *(*get)(struct _IBEXStore *store, blockid_t head, blockid_t tail);
+};
+
+#endif
diff --git a/libibex/wordindex.c b/libibex/wordindex.c
new file mode 100644
index 0000000000..dbde1f4a7f
--- /dev/null
+++ b/libibex/wordindex.c
@@ -0,0 +1,488 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
+ *
+ * Copyright (C) 2000 Helix Code, Inc.
+ *
+ * Authors: Michael Zucchi <notzed@helixcode.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public License
+ * as published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with the Gnome Library; see the file COPYING.LIB. If not,
+ * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/* code to manage a word index */
+/* includes a cache for word index writes,
+ but not for name index writes (currently), or any reads.
+
+Note the word cache is only needed during indexing of lots
+of words, and could then be discarded (:flush()).
+
+*/
+
+#include <stdio.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <glib.h>
+
+#include "block.h"
+#include "index.h"
+#include "wordindex.h"
+
+#define d(x)
+
+/*#define WORDCACHE_SIZE (256)*/
+#define WORDCACHE_SIZE (10240)
+
+extern struct _IBEXStoreClass ibex_diskarray_class;
+extern struct _IBEXIndexClass ibex_hash_class;
+
+/* need 2 types of hash key?
+ one that just stores the wordid / wordblock
+ and one that stores the filecount/files?
+*/
+
+
+struct _wordcache {
+ struct _wordcache *next;
+ struct _wordcache *prev;
+ nameid_t wordid; /* disk wordid */
+ blockid_t wordblock; /* head of disk list */
+ blockid_t wordtail; /* and the tail data */
+ int filecount; /* how many we have in memory */
+ /*nameid_t files[32];*/ /* memory cache of files */
+ nameid_t files[62]; /* memory cache of files */
+ char word[1]; /* actual word follows */
+};
+
+static void unindex_name(struct _IBEXWord *, const char *name); /* unindex all entries for name */
+static gboolean contains_name(struct _IBEXWord *, const char *name); /* index contains data for name */
+static GPtrArray *find(struct _IBEXWord *, const char *word); /* returns all matches for word */
+static gboolean find_name(struct _IBEXWord *, const char *name, const char *word); /* find if name contains word */
+static void add(struct _IBEXWord *, const char *name, const char *word); /* adds a single word to name (slow) */
+static void add_list(struct _IBEXWord *, const char *name, GPtrArray *words);/* adds a bunch of words to a given name */
+static int word_sync(struct _IBEXWord *idx);
+static int word_flush(struct _IBEXWord *idx);
+static int word_close(struct _IBEXWord *idx);
+
+struct _IBEXWordClass ibex_word_index_class = {
+ word_sync, word_flush, word_close,
+ unindex_name, contains_name,
+ find, find_name,
+ add, add_list
+};
+
+/* this interface isn't the best, but it'll do for now */
+struct _IBEXWord *
+ibex_create_word_index(struct _memcache *bc, blockid_t *wordroot, blockid_t *nameroot)
+{
+ struct _IBEXWord *idx;
+
+ idx = g_malloc0(sizeof(*idx));
+ idx->wordcache = g_hash_table_new(g_str_hash, g_str_equal);
+ ibex_list_new(&idx->wordnodes);
+ idx->wordcount = 0;
+ idx->klass = &ibex_word_index_class;
+
+ /* we use the same block array storage for both indexes at the moment */
+ idx->wordstore = ibex_diskarray_class.create(bc);
+ idx->namestore = idx->wordstore;
+
+ /* but not the same indexes! */
+ if (*wordroot) {
+ printf("opening wordindex root = %d\n", *wordroot);
+ idx->wordindex = ibex_hash_class.open(bc, *wordroot);
+ } else {
+ idx->wordindex = ibex_hash_class.create(bc, 1024);
+ *wordroot = idx->wordindex->root;
+ printf("creating wordindex root = %d\n", *wordroot);
+ }
+ if (*nameroot) {
+ printf("opening nameindex root = %d\n", *nameroot);
+ idx->nameindex = ibex_hash_class.open(bc, *nameroot);
+ } else {
+ idx->nameindex = ibex_hash_class.create(bc, 1024);
+ *nameroot = idx->nameindex->root;
+ printf("creating nameindex root = %d\n", *nameroot);
+ }
+ return idx;
+}
+
+#if 0
+static void
+cache_sanity(struct _wordcache *head)
+{
+ while (head->next) {
+ g_assert(head->filecount <= sizeof(head->files)/sizeof(head->files[0]));
+ g_assert(strlen(head->word) != 0);
+ head = head->next;
+ }
+}
+#endif
+
+/* unindex all entries for name */
+static void unindex_name(struct _IBEXWord *idx, const char *name)
+{
+ GArray *words;
+ int i;
+ nameid_t nameid, wordid;
+ blockid_t nameblock, wordblock, newblock, nametail, wordtail, newtail;
+
+ d(printf("unindexing %s\n", name));
+
+ /* lookup the hash key */
+ nameid = idx->nameindex->klass->find(idx->nameindex, name, strlen(name));
+ /* get the block for this key */
+ nameblock = idx->nameindex->klass->get_data(idx->nameindex, nameid, &nametail);
+ /* and the data for this block */
+ words = idx->namestore->klass->get(idx->namestore, nameblock, nametail);
+ /* walk it ... */
+ for (i=0;i<words->len;i++) {
+ /* get the word */
+ wordid = g_array_index(words, nameid_t, i);
+ d(printf(" word %d\n", wordid));
+ /* get the data block */
+ wordblock = idx->wordindex->klass->get_data(idx->wordindex, wordid, &wordtail);
+ /* clear this name from it */
+ newblock = wordblock;
+ newtail = wordtail;
+ idx->wordstore->klass->remove(idx->wordstore, &newblock, &newtail, nameid);
+ if (newblock != wordblock || newtail != wordtail)
+ idx->wordindex->klass->set_data(idx->wordindex, wordid, newblock, newtail);
+
+ /* FIXME: check cache as well */
+ }
+ g_array_free(words, TRUE);
+
+ /* and remove name data and itself */
+ idx->namestore->klass->free(idx->namestore, nameblock, nametail);
+ idx->nameindex->klass->remove(idx->nameindex, name, strlen(name));
+}
+
+/* index contains (any) data for name */
+static gboolean contains_name(struct _IBEXWord *idx, const char *name)
+{
+ return idx->nameindex->klass->find(idx->nameindex, name, strlen(name)) != 0;
+}
+
+/* returns all matches for word */
+static GPtrArray *find(struct _IBEXWord *idx, const char *word)
+{
+ nameid_t wordid, nameid;
+ GPtrArray *res;
+ GArray *names;
+ int i;
+ char *new;
+ struct _wordcache *cache;
+ blockid_t wordblock, wordtail;
+
+ res = g_ptr_array_new();
+
+ cache = g_hash_table_lookup(idx->wordcache, word);
+ if (cache) {
+ /* freshen cache entry if we touch it */
+ ibex_list_remove((struct _listnode *)cache);
+ ibex_list_addtail(&idx->wordnodes, (struct _listnode *)cache);
+ wordid = cache->wordid;
+ wordblock = cache->wordblock;
+ wordtail = cache->wordtail;
+ } else {
+ /* lookup the hash key */
+ wordid = idx->wordindex->klass->find(idx->wordindex, word, strlen(word));
+ /* get the block for this key */
+ wordblock = idx->wordindex->klass->get_data(idx->wordindex, wordid, &wordtail);
+ }
+ /* and the data for this block */
+ names = idx->wordstore->klass->get(idx->wordstore, wordblock, wordtail);
+ /* .. including any memory-only data */
+ if (cache) {
+ g_array_append_vals(names, cache->files, cache->filecount);
+ }
+
+ /* walk it ... converting id's back to strings */
+ g_ptr_array_set_size(res, names->len);
+ for (i=0;i<names->len;i++) {
+ nameid = g_array_index(names, nameid_t, i);
+ new = idx->nameindex->klass->get_key(idx->nameindex, nameid, NULL);
+ res->pdata[i] = new;
+ }
+ g_array_free(names, TRUE);
+ return res;
+}
+
+/* find if name contains word */
+static gboolean find_name(struct _IBEXWord *idx, const char *name, const char *word)
+{
+ nameid_t wordid, nameid;
+ blockid_t nameblock, nametail;
+ struct _wordcache *cache;
+ int i;
+
+ /* lookup the hash key for the name */
+ nameid = idx->nameindex->klass->find(idx->nameindex, name, strlen(name));
+ /* get the block for this name */
+ nameblock = idx->nameindex->klass->get_data(idx->nameindex, nameid, &nametail);
+
+ /* check if there is an in-memory cache for this word, check its file there first */
+ cache = g_hash_table_lookup(idx->wordcache, word);
+ if (cache) {
+ /* freshen cache entry if we touch it */
+ ibex_list_remove((struct _listnode *)cache);
+ ibex_list_addtail(&idx->wordnodes, (struct _listnode *)cache);
+ for (i=0;i<cache->filecount;i++) {
+ if (cache->files[i] == nameid)
+ return TRUE;
+ }
+ /* not there? well we can use the wordid anyway */
+ wordid = cache->wordid;
+ } else {
+ /* lookup the hash key for word */
+ wordid = idx->wordindex->klass->find(idx->wordindex, word, strlen(word));
+ }
+
+ /* see if wordid is in nameblock */
+ return idx->namestore->klass->find(idx->namestore, nameblock, nametail, wordid);
+}
+
+/* cache helper functions */
+/* flush a cache entry to disk, and empty it out */
+static void
+sync_cache_entry(struct _IBEXWord *idx, struct _wordcache *cache)
+{
+ GArray array; /* just use this as a header */
+ blockid_t oldblock, oldtail;
+
+ d(printf("syncing cache entry '%s'\n", cache->word));
+ array.data = (char *)cache->files;
+ array.len = cache->filecount;
+ oldblock = cache->wordblock;
+ oldtail = cache->wordtail;
+ idx->wordstore->klass->add_list(idx->wordstore, &cache->wordblock, &cache->wordtail, &array);
+ if (oldblock != cache->wordblock || oldtail != cache->wordtail) {
+ idx->wordindex->klass->set_data(idx->wordindex, cache->wordid, cache->wordblock, cache->wordtail);
+ }
+ cache->filecount = 0;
+}
+
+/* create a new key in an index, returning its id and head block */
+static void
+add_index_key(struct _IBEXIndex *wordindex, const char *word, nameid_t *wordid, blockid_t *wordblock, blockid_t *wordtail)
+{
+ /* initialise cache entry - id of word entry and head block */
+ *wordid = wordindex->klass->find(wordindex, word, strlen(word));
+ if (*wordid == 0) {
+ *wordid = wordindex->klass->insert(wordindex, word, strlen(word));
+ *wordblock = 0;
+ *wordtail = 0;
+ } else {
+ *wordblock = wordindex->klass->get_data(wordindex, *wordid, wordtail);
+ }
+}
+
+/* create a new key in a cached index (only word cache so far), flushing old keys
+ if too much space is being used */
+static struct _wordcache *
+add_index_cache(struct _IBEXWord *idx, const char *word)
+{
+ struct _wordcache *cache;
+
+ d(printf("adding %s to cache\n", word));
+
+ cache = g_hash_table_lookup(idx->wordcache, word);
+ if (cache == 0) {
+ /* see if we have to flush off the last entry */
+ if (idx->wordcount >= WORDCACHE_SIZE) {
+ /* remove last entry, and flush it */
+ cache = (struct _wordcache *)idx->wordnodes.tailpred;
+ d(printf("flushing word from cache %s\n", cache->word));
+ ibex_list_remove((struct _listnode *)cache);
+ g_hash_table_remove(idx->wordcache, cache->word);
+ sync_cache_entry(idx, cache);
+ g_free(cache);
+ idx->wordcount--;
+ }
+ cache = g_malloc0(sizeof(*cache)+strlen(word));
+ /* initialise cache entry - id of word entry and head block */
+ add_index_key(idx->wordindex, word, &cache->wordid, &cache->wordblock, &cache->wordtail);
+ /* other fields */
+ strcpy(cache->word, word);
+ cache->filecount = 0;
+ g_hash_table_insert(idx->wordcache, cache->word, cache);
+ ibex_list_addhead(&idx->wordnodes, (struct _listnode *)cache);
+ idx->wordcount++;
+ } else {
+ /* move cache bucket ot the head of the cache list */
+ ibex_list_remove((struct _listnode *)cache);
+ ibex_list_addhead(&idx->wordnodes, (struct _listnode *)cache);
+ }
+ return cache;
+}
+
+/* adds a single word to name (slow) */
+static void add(struct _IBEXWord *idx, const char *name, const char *word)
+{
+ nameid_t nameid;
+ blockid_t nameblock, newblock, nametail, newtail;
+ struct _wordcache *cache;
+
+ g_error("Dont use wordindex::add()");
+ abort();
+
+ cache = add_index_cache(idx, word);
+
+ /* get the nameid and block start for this name */
+ add_index_key(idx->nameindex, name, &nameid, &nameblock, &nametail);
+
+ /* check for repeats of the last name - dont add anything */
+ if (cache->files[cache->filecount] == nameid)
+ return;
+
+ /* yay, now insert the nameindex into the word list, and the word index into the name list */
+ cache->files[cache->filecount] = nameid;
+ cache->filecount++;
+ /* if we are full, force a flush now */
+ if (cache->filecount >= sizeof(cache->files)/sizeof(cache->files[0])) {
+ sync_cache_entry(idx, cache);
+ }
+
+ newtail = nametail;
+ newblock = nameblock;
+ idx->namestore->klass->add(idx->namestore, &newblock, &newtail, cache->wordid);
+ if (newblock != nameblock || newtail != nametail) {
+ idx->nameindex->klass->set_data(idx->nameindex, nameid, newblock, newtail);
+ }
+}
+
+/* adds a bunch of words to a given name */
+static void add_list(struct _IBEXWord *idx, const char *name, GPtrArray *words)
+{
+ int i;
+ GArray *data = g_array_new(0, 0, sizeof(nameid_t));
+ blockid_t nameblock, newblock, nametail, newtail;
+ nameid_t nameid;
+ struct _wordcache *cache;
+
+ d(printf("Adding words to name %s\n", name));
+
+ d(cache_sanity((struct _wordcache *)idx->wordnodes.head));
+
+ /* get the nameid and block start for this name */
+ add_index_key(idx->nameindex, name, &nameid, &nameblock, &nametail);
+
+ d(cache_sanity((struct _wordcache *)idx->wordnodes.head));
+
+ for (i=0;i<words->len;i++) {
+ char *word = words->pdata[i];
+
+ cache = add_index_cache(idx, word);
+
+ /*d(cache_sanity((struct _wordcache *)idx->wordnodes.head));*/
+
+ /* check for duplicates; doesn't catch duplicates over an overflow
+ boundary. Watch me care. */
+ if (cache->filecount == 0
+ || cache->files[cache->filecount-1] != nameid) {
+ cache->files[cache->filecount] = nameid;
+ cache->filecount++;
+ /* if we are full, force a flush now */
+ if (cache->filecount >= sizeof(cache->files)/sizeof(cache->files[0])) {
+ sync_cache_entry(idx, cache);
+ }
+
+ /*d(cache_sanity((struct _wordcache *)idx->wordnodes.head));*/
+
+ /* and append this wordid for this name in memory */
+ g_array_append_val(data, cache->wordid);
+ }
+
+ /*d(cache_sanity((struct _wordcache *)idx->wordnodes.head));*/
+ }
+
+ d(cache_sanity((struct _wordcache *)idx->wordnodes.head));
+
+ /* and append these word id's in one go */
+ newblock = nameblock;
+ newtail = nametail;
+ idx->namestore->klass->add_list(idx->namestore, &newblock, &newtail, data);
+ if (newblock != nameblock || newtail != nametail) {
+ idx->nameindex->klass->set_data(idx->nameindex, nameid, newblock, newtail);
+ }
+
+ d(cache_sanity((struct _wordcache *)idx->wordnodes.head));
+
+ g_array_free(data, TRUE);
+}
+
+/* sync any in-memory data to disk */
+static int
+word_sync(struct _IBEXWord *idx)
+{
+ /* we just flush also, save memory */
+ word_flush(idx);
+
+#if 0
+ struct _wordcache *cache = (struct _wordcache *)idx->wordnodes.head;
+
+ while (cache->next) {
+ sync_cache_entry(idx, cache);
+ cache = cache->next;
+ }
+
+ /*ibex_hash_dump(idx->wordindex);*/
+ /*ibex_hash_dump(idx->nameindex);*/
+#endif
+ return 0;
+}
+
+/* sync and flush any in-memory data to disk and free it */
+static int
+word_flush(struct _IBEXWord *idx)
+{
+ struct _wordcache *cache = (struct _wordcache *)idx->wordnodes.head, *next;
+
+ next = cache->next;
+ while (next) {
+ sync_cache_entry(idx, cache);
+ g_hash_table_remove(idx->wordcache, cache->word);
+ g_free(cache);
+ cache = next;
+ next = cache->next;
+ }
+ ibex_list_new(&idx->wordnodes);
+ return 0;
+}
+
+static int word_close(struct _IBEXWord *idx)
+{
+ struct _wordcache *cache = (struct _wordcache *)idx->wordnodes.head, *next;
+
+ next = cache->next;
+ while (next) {
+ sync_cache_entry(idx, cache);
+ g_free(cache);
+ cache = next;
+ next = cache->next;
+ }
+ idx->namestore->klass->close(idx->namestore);
+ idx->nameindex->klass->close(idx->nameindex);
+ /*same as namestore:
+ idx->wordstore->klass->close(idx->wordstore);*/
+ idx->wordindex->klass->close(idx->wordindex);
+ g_hash_table_destroy(idx->wordcache);
+ g_free(idx);
+
+ return 0;
+}
diff --git a/libibex/wordindex.h b/libibex/wordindex.h
new file mode 100644
index 0000000000..d222ff5a87
--- /dev/null
+++ b/libibex/wordindex.h
@@ -0,0 +1,65 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
+ *
+ * Copyright (C) 2000 Helix Code, Inc.
+ *
+ * Authors: Michael Zucchi <notzed@helixcode.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public License
+ * as published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with the Gnome Library; see the file COPYING.LIB. If not,
+ * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef _WORDINDEX_H
+#define _WORDINDEX_H
+
+#include <glib.h>
+
+#include "block.h"
+#include "index.h"
+
+struct _IBEXWord;
+
+/* not used yet */
+typedef void (*IBEXNormaliseFunc)(char *source, int len, char *dest);
+
+struct _IBEXWordClass {
+ int (*sync)(struct _IBEXWord *);
+ int (*flush)(struct _IBEXWord *);
+ int (*close)(struct _IBEXWord *);
+
+ void (*unindex_name)(struct _IBEXWord *, const char *name); /* unindex all entries for name */
+ gboolean (*contains_name)(struct _IBEXWord *, const char *name); /* index contains data for name */
+ GPtrArray *(*find)(struct _IBEXWord *, const char *word); /* returns all matches for word */
+ gboolean (*find_name)(struct _IBEXWord *, const char *name, const char *word); /* find if name contains word */
+ void (*add)(struct _IBEXWord *, const char *name, const char *word); /* adds a single word to name */
+ void (*add_list)(struct _IBEXWord *, const char *name, GPtrArray *words);/* adds a bunch of words to a given name */
+};
+
+struct _IBEXWord {
+ struct _IBEXWordClass *klass;
+ struct _IBEXStore *wordstore;
+ struct _IBEXIndex *wordindex;
+ struct _IBEXStore *namestore;
+ struct _IBEXIndex *nameindex;
+
+ /* word caching info (should probably be modularised) */
+ GHashTable *wordcache; /* word->struct _wordcache mapping */
+ struct _list wordnodes; /* LRU list of wordcache structures */
+ int wordcount; /* how much space used in cache */
+};
+
+
+struct _IBEXWord *ibex_create_word_index(struct _memcache *bc, blockid_t *wordroot, blockid_t *nameroot);
+
+#endif /* !_WORDINDEX_H */