/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996, 1997, 1998, 1999
 *	Sleepycat Software.  All rights reserved.
 */
/*
 * Copyright (c) 1990, 1993, 1994
 *	Margo Seltzer.  All rights reserved.
 */
/*
 * Copyright (c) 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Margo Seltzer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "db_config.h"

#ifndef lint
static const char sccsid[] = "@(#)hash.c	11.29 (Sleepycat) 11/14/99";
#endif /* not lint */

#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>

#include <errno.h>
#include <stdlib.h>
#include <string.h>
#endif

#include "db_int.h"
#include "db_page.h"
#include "db_am.h"
#include "db_ext.h"
#include "db_shash.h"
#include "db_swap.h"
#include "hash.h"
#include "btree.h"
#include "log.h"
#include "lock.h"
#include "txn.h"

static int  CDB___ham_c_close __P((DBC *));
static int  CDB___ham_c_del __P((DBC *, u_int32_t));
static int  CDB___ham_c_destroy __P((DBC *));
static int  CDB___ham_c_get __P((DBC *, DBT *, DBT *, u_int32_t));
static int  CDB___ham_c_put __P((DBC *, DBT *, DBT *, u_int32_t));
static int  CDB___ham_delete __P((DB *, DB_TXN *, DBT *, u_int32_t));
static int  CDB___ham_dup_return __P((DBC *, DBT *, u_int32_t));
static int  CDB___ham_expand_table __P((DBC *));
static int  CDB___ham_init_htab __P((DBC *,
		const char *, db_pgno_t, u_int32_t, u_int32_t));
static int  CDB___ham_lookup __P((DBC *, const DBT *, u_int32_t, db_lockmode_t));
static int  CDB___ham_overwrite __P((DBC *, DBT *));

/*
 * CDB___ham_metachk --
 *
 * PUBLIC: int CDB___ham_metachk __P((DB *, const char *, HMETA *));
 */
int
CDB___ham_metachk(dbp, name, hashm)
	DB *dbp;
	const char *name;
	HMETA *hashm;
{
	DB_ENV *dbenv;
	u_int32_t vers;
	int ret;

	dbenv = dbp->dbenv;

	/*
	 * At this point, all we know is that the magic number is for a Hash.
	 * Check the version, the database may be out of date.
	 */
	vers = hashm->dbmeta.version;
	if (F_ISSET(dbp, DB_AM_SWAP))
		M_32_SWAP(vers);
	switch (vers) {
	case 4:
		/* FALLTHROUGH */
	case 5:
		CDB___db_err(dbenv,
		    "%s: hash version %lu requires a version upgrade",
		    name, (u_long)vers);
		return (DB_OLD_VERSION);
	case 6:
		break;
	default:
		CDB___db_err(dbenv,
		    "%s: unsupported hash version: %lu", name, (u_long)vers);
		return (EINVAL);
	}

	/* Swap the page if we need to. */
	if (F_ISSET(dbp, DB_AM_SWAP) && (ret = CDB___ham_mswap((PAGE *)hashm)) != 0)
		return (ret);

	/* Check the type. */
	if (dbp->type != DB_HASH && dbp->type != DB_UNKNOWN)
		return (EINVAL);
	dbp->type = DB_HASH;
	DB_ILLEGAL_METHOD(dbp, DB_OK_HASH);

	/*
	 * Check application info against metadata info, and set info, flags,
	 * and type based on metadata info.
	 */
	if ((ret = CDB___db_fchk(dbenv,
	    "DB->open", hashm->dbmeta.flags,
	    DB_HASH_DUP | DB_HASH_SUBDB)) != 0)
		return (ret);

	if (F_ISSET(&hashm->dbmeta, DB_HASH_DUP))
		F_SET(&hashm->dbmeta, DB_HASH_DUP);
	else
		if (F_ISSET(dbp, DB_AM_DUP)) {
			CDB___db_err(dbenv,
		"%s: DB_DUP specified to open method but not set in database",
			    name);
			return (EINVAL);
		}

	if (F_ISSET(&hashm->dbmeta, DB_HASH_SUBDB))
		F_SET(dbp, DB_AM_SUBDB);
	else
		if (F_ISSET(dbp, DB_AM_SUBDB)) {
			CDB___db_err(dbenv,
		    "%s: subdatabase specified but not supported in database",
			name);
			return (EINVAL);
		}

	/* Set the page size. */
	dbp->pgsize = hashm->dbmeta.pagesize;
	F_CLR(dbp, DB_AM_PGDEF);

	/* Copy the file's ID. */
	memcpy(dbp->fileid, hashm->dbmeta.uid, DB_FILE_ID_LEN);

	return (0);
}

/*
 * CDB___ham_open --
 *
 * PUBLIC: int CDB___ham_open __P((DB *, const char *, db_pgno_t));
 */
int
CDB___ham_open(dbp, name, base_pgno)
	DB *dbp;
	const char *name;
	db_pgno_t base_pgno;
{
	DB_ENV *dbenv;
	DBC *dbc;
	HASH_CURSOR *hcp;
	HASH *hashp;
	int need_sync, ret, t_ret;

	dbc = NULL;
	dbenv = dbp->dbenv;
	need_sync = 0;

	/* Initialize the remaining fields/methods of the DB. */
	dbp->del = CDB___ham_delete;
	dbp->stat = CDB___ham_stat;

	/* Get a cursor we can use for the rest of this function. */
	if ((ret = dbp->cursor(dbp, dbp->open_txn, &dbc, 0)) != 0)
		return (ret);

	hcp = (HASH_CURSOR *)dbc->internal;
	hashp = dbp->h_internal;
	hashp->meta_pgno = base_pgno;
	if ((ret = CDB___ham_get_meta(dbc)) != 0)
		goto err1;

	/*
	 * If this is a new file, initialize it, and put it back dirty.
	 *
	 * Initialize the hdr structure.
	 */
	if (hcp->hdr->dbmeta.magic == DB_HASHMAGIC) {
		/* File exists, verify the data in the header. */
		if (hashp->h_hash == NULL)
			hashp->h_hash = hcp->hdr->dbmeta.version < 5
			? CDB___ham_func4 : CDB___ham_func5;
		if (hashp->h_hash(CHARKEY, sizeof(CHARKEY)) !=
		    hcp->hdr->h_charkey) {
			CDB___db_err(dbp->dbenv,
			    "hash: incompatible hash function");
			ret = EINVAL;
			goto err2;
		}
		if (F_ISSET(&hcp->hdr->dbmeta, DB_HASH_DUP))
			F_SET(dbp, DB_AM_DUP);
		if (F_ISSET(&hcp->hdr->dbmeta, DB_HASH_SUBDB))
			F_SET(dbp, DB_AM_SUBDB);
	} else {
		/*
		 * File does not exist, we must initialize the header.  If
		 * locking is enabled that means getting a write lock first.
		 */
		dbc->lock.pgno = base_pgno;

		if (F_ISSET(dbenv, DB_ENV_LOCKING) &&
		    ((ret = CDB_lock_put(dbenv, &hcp->hlock)) != 0 ||
		    (ret = CDB_lock_get(dbenv, dbc->locker,
		    DB_NONBLOCK(dbc) ? DB_LOCK_NOWAIT : 0,
		    &dbc->lock_dbt, DB_LOCK_WRITE, &hcp->hlock)) != 0))
			goto err2;

		if ((ret = CDB___ham_init_htab(dbc, name,
		    base_pgno, hashp->h_nelem, hashp->h_ffactor)) != 0)
			goto err2;

		need_sync = 1;
	}

	/* Make sure we always have a valid hashp->h_hash function. */
	if (hashp->h_hash == NULL)
		hashp->h_hash = hcp->hdr->dbmeta.version < 5
		? CDB___ham_func4 : CDB___ham_func5;

err2:	/* Release the meta data page */
	if ((t_ret = CDB___ham_release_meta(dbc)) != 0 && ret == 0)
		ret = t_ret;
err1:	if ((t_ret  = dbc->c_close(dbc)) != 0 && ret == 0)
		ret = t_ret;

	/* Sync the file so that we know that the meta data goes to disk. */
	if (ret == 0 && need_sync)
		ret = dbp->sync(dbp, 0);
#if CONFIG_TEST
	if (ret == 0)
		DB_TEST_RECOVERY(dbp, DB_TEST_POSTSYNC, ret, name);

DB_TEST_RECOVERY_LABEL
#endif
	if (ret != 0)
		(void)CDB___ham_db_close(dbp);

	return (ret);
}

/************************** LOCAL CREATION ROUTINES **********************/
/*
 * Returns 0 on No Error
 */
static int
CDB___ham_init_htab(dbc, name, pgno, nelem, ffactor)
	DBC *dbc;
	const char *name;
	db_pgno_t pgno;
	u_int32_t nelem, ffactor;
{
	DB *dbp;
	DB_LOCK metalock;
	DB_LSN orig_lsn;
	DBMETA *mmeta;
	HASH_CURSOR *hcp;
	HASH *hashp;
	PAGE *h;
	db_pgno_t mpgno;
	int32_t l2, nbuckets;
	int dirty_mmeta, i, ret, t_ret;

	hcp = (HASH_CURSOR *)dbc->internal;
	dbp = dbc->dbp;
	hashp = dbp->h_internal;
	mmeta = NULL;
	dirty_mmeta = 0;
	metalock.off = LOCK_INVALID;

	if (hashp->h_hash == NULL)
		hashp->h_hash = DB_HASHVERSION < 5 ? CDB___ham_func4 : CDB___ham_func5;

	if (nelem != 0 && ffactor != 0) {
		nelem = (nelem - 1) / ffactor + 1;
		l2 = CDB___db_log2(nelem > 2 ? nelem : 2);
	} else
		l2 = 1;
	nbuckets = 1 << l2;

	orig_lsn = hcp->hdr->dbmeta.lsn;
	memset(hcp->hdr, 0, sizeof(HMETA));
	ZERO_LSN(hcp->hdr->dbmeta.lsn);
	hcp->hdr->dbmeta.pgno = pgno;
	hcp->hdr->dbmeta.magic = DB_HASHMAGIC;
	hcp->hdr->dbmeta.version = DB_HASHVERSION;
	hcp->hdr->dbmeta.pagesize = dbp->pgsize;
	hcp->hdr->dbmeta.type = P_HASHMETA;
	hcp->hdr->dbmeta.free = PGNO_INVALID;
	hcp->hdr->max_bucket = hcp->hdr->high_mask = nbuckets - 1;
	hcp->hdr->low_mask = (nbuckets >> 1) - 1;
	hcp->hdr->ffactor = ffactor;
	hcp->hdr->h_charkey = hashp->h_hash(CHARKEY, sizeof(CHARKEY));
	memcpy(hcp->hdr->dbmeta.uid, dbp->fileid, DB_FILE_ID_LEN);

	if (F_ISSET(dbp, DB_AM_DUP))
		F_SET(&hcp->hdr->dbmeta, DB_HASH_DUP);
	if (F_ISSET(dbp, DB_AM_SUBDB)) {
		F_SET(&hcp->hdr->dbmeta, DB_HASH_SUBDB);

		/*
		 * If this is a subdatabase, then we need to get the LSN
		 * off the master meta data page because that's where free
		 * pages are linked and during recovery we need to access
		 * that page and roll it backward/forward correctly with
		 * respect to LSN.
		 */
		mpgno = PGNO_BASE_MD;
		if ((ret = CDB___db_lget(dbc,
		   0, mpgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
			return (ret);
		if ((ret = CDB_memp_fget(dbp->mpf,
		    &mpgno, 0, (PAGE **)&mmeta)) != 0)
			goto err;
	}
	if ((ret = CDB___ham_dirty_page(dbp, (PAGE *)hcp->hdr)) != 0)
		goto err;

	/*
	 * Create the first and second buckets pages so that we have the
	 * page numbers for them and we can store that page number
	 * in the meta-data header (spares[0]).
	 */
	hcp->hdr->spares[0] = nbuckets;
	if ((ret = CDB_memp_fget(dbp->mpf,
	    &hcp->hdr->spares[0], DB_MPOOL_NEW_GROUP, &h)) != 0)
		goto err;

	P_INIT(h, dbp->pgsize, hcp->hdr->spares[0], PGNO_INVALID,
	    PGNO_INVALID, 0, P_HASH);

	/* Fill in the last fields of the meta data page. */
	hcp->hdr->spares[0] -= (nbuckets - 1);
	for (i = 1; i <= l2; i++)
		hcp->hdr->spares[i] = hcp->hdr->spares[0];
	for (; i < NCACHED; i++)
		hcp->hdr->spares[i] = PGNO_INVALID;

	/*
	 * Before we are about to put any dirty pages, we need to log
	 * the meta-data page create.
	 */
	ret = CDB___db_log_page(dbp, name, &orig_lsn, pgno, (PAGE *)hcp->hdr);

	if (dbp->open_txn != NULL) {
		if ((t_ret = CDB___ham_groupalloc_log(dbp->dbenv, dbp->open_txn,
		    &hcp->hdr->dbmeta.lsn, 0, dbp->log_fileid,
		    hcp->hdr->dbmeta.pgno, &hcp->hdr->dbmeta.lsn,
		    mmeta == NULL ? &hcp->hdr->dbmeta.lsn : &mmeta->lsn,
		    hcp->hdr->spares[0], hcp->hdr->max_bucket + 1)) != 0 &&
		    ret == 0)
			ret = t_ret;
		if (t_ret == 0 && mmeta != NULL) {
			mmeta->lsn = hcp->hdr->dbmeta.lsn;
			dirty_mmeta = 1;
		}
	}

	DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOG, ret, name);

DB_TEST_RECOVERY_LABEL
	if ((t_ret = CDB_memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0 && ret == 0)
		ret = t_ret;

err:	if (mmeta != NULL)
		if ((t_ret = CDB_memp_fput(dbp->mpf, mmeta,
		    dirty_mmeta ? DB_MPOOL_DIRTY : 0)) != 0 && ret == 0)
			ret = t_ret;
	if (metalock.off != LOCK_INVALID)
		(void)__TLPUT(dbc, metalock);

	return (ret);
}

static int
CDB___ham_delete(dbp, txn, key, flags)
	DB *dbp;
	DB_TXN *txn;
	DBT *key;
	u_int32_t flags;
{
	DBC *dbc;
	HASH_CURSOR *hcp;
	int ret, t_ret;

	PANIC_CHECK(dbp->dbenv);
	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->del");

	if ((ret =
	    CDB___db_delchk(dbp, key, flags, F_ISSET(dbp, DB_AM_RDONLY))) != 0)
		return (ret);

	if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0)
		return (ret);

	DEBUG_LWRITE(dbc, txn, "ham_delete", key, NULL, flags);

	hcp = (HASH_CURSOR *)dbc->internal;
	if ((ret = CDB___ham_get_meta(dbc)) != 0)
		goto out;

	if ((ret = CDB___ham_lookup(dbc, key, 0, DB_LOCK_WRITE)) == 0) {
		if (F_ISSET(hcp, H_OK))
			ret = CDB___ham_del_pair(dbc, 1);
		else
			ret = DB_NOTFOUND;
	}

	if ((t_ret = CDB___ham_release_meta(dbc)) != 0 && ret == 0)
		ret = t_ret;

out:	if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
		ret = t_ret;
	return (ret);
}

/* ****************** CURSORS ********************************** */
/*
 * CDB___ham_c_init --
 *	Initialize the hash-specific portion of a cursor.
 *
 * PUBLIC: int CDB___ham_c_init __P((DBC *));
 */
int
CDB___ham_c_init(dbc)
	DBC *dbc;
{
	HASH_CURSOR *new_curs;
	int ret;

	if ((ret = CDB___os_calloc(1, sizeof(struct cursor_t), &new_curs)) != 0)
		return (ret);
	if ((ret =
	    CDB___os_malloc(dbc->dbp->pgsize, NULL, &new_curs->split_buf)) != 0) {
		CDB___os_free(new_curs, sizeof(*new_curs));
		return (ret);
	}

	new_curs->dbc = dbc;

	dbc->internal = new_curs;
	dbc->c_am_close = CDB___ham_c_close;
	dbc->c_am_destroy = CDB___ham_c_destroy;
	dbc->c_del = CDB___ham_c_del;
	dbc->c_get = CDB___ham_c_get;
	dbc->c_put = CDB___ham_c_put;

	CDB___ham_item_init(new_curs);

	return (0);
}

/*
 * CDB___ham_c_close --
 *	Close down the cursor from a single use.
 */
static int
CDB___ham_c_close(dbc)
	DBC *dbc;
{
	int ret;

	if ((ret = CDB___ham_item_done(dbc, 0)) != 0)
		return (ret);

	CDB___ham_item_init((HASH_CURSOR *)dbc->internal);
	return (0);
}

/*
 * CDB___ham_c_destroy --
 *	Cleanup the access method private part of a cursor.
 */
static int
CDB___ham_c_destroy(dbc)
	DBC *dbc;
{
	HASH_CURSOR *hcp;

	hcp = (HASH_CURSOR *)dbc->internal;
	if (hcp->split_buf != NULL)
		CDB___os_free(hcp->split_buf, dbc->dbp->pgsize);
	CDB___os_free(hcp, sizeof(HASH_CURSOR));

	return (0);
}

static int
CDB___ham_c_del(dbc, flags)
	DBC *dbc;
	u_int32_t flags;
{
	DB *dbp;
	DBT repldbt;
	HASH_CURSOR *hcp;
	HASH_CURSOR save_curs;
	db_pgno_t ppgno, chg_pgno;
	int ret, t_ret;

	DEBUG_LWRITE(dbc, dbc->txn, "ham_c_del", NULL, NULL, flags);
	dbp = dbc->dbp;
	PANIC_CHECK(dbp->dbenv);
	hcp = (HASH_CURSOR *)dbc->internal;

	if ((ret = CDB___db_cdelchk(dbc->dbp, flags,
	    F_ISSET(dbc->dbp, DB_AM_RDONLY), IS_VALID(hcp))) != 0)
		return (ret);

	if (F_ISSET(hcp, H_DELETED))
		return (DB_NOTFOUND);

	/*
	 * If we are in the concurrent DB product and this cursor
	 * is not a write cursor, then this request is invalid.
	 * If it is a simple write cursor, then we need to upgrade its
	 * lock.
	 */
	if (F_ISSET(dbp->dbenv, DB_ENV_CDB)) {
		/* Make sure it's a valid update cursor. */
		if (!F_ISSET(dbc, DBC_WRITECURSOR | DBC_WRITER))
			return (EPERM);

		if (F_ISSET(dbc, DBC_WRITECURSOR) &&
		    (ret = CDB_lock_get(dbp->dbenv, dbc->locker,
		    DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE,
		    &dbc->mylock)) != 0)
			return (ret);
	}

	SAVE_CURSOR(hcp, &save_curs);

	if ((ret = CDB___ham_get_meta(dbc)) != 0)
		goto out;

	if ((ret = CDB___ham_get_cpage(dbc, DB_LOCK_WRITE)) != 0)
		goto out;
	if (F_ISSET(hcp, H_ISDUP) && hcp->dpgno != PGNO_INVALID) {
		/*
		 * We are about to remove a duplicate from offpage.
		 *
		 * There are 4 cases.
		 * 1. We will remove an item on a page, but there are more
		 *    items on that page.
		 * 2. We will remove the last item on a page, but there is a
		 *    following page of duplicates.
		 * 3. We will remove the last item on a page, this page was the
		 *    last page in a duplicate set, but there were dups before
		 *    it.
		 * 4. We will remove the last item on a page, removing the last
		 *    duplicate.
		 * In case 1 hcp->dpagep is unchanged.
		 * In case 2 hcp->dpagep comes back pointing to the next dup
		 *     page.
		 * In case 3 hcp->dpagep comes back NULL.
		 * In case 4 hcp->dpagep comes back NULL.
		 *
		 * Case 4 results in deleting the pair off the master page.
		 * The normal code for doing this knows how to delete the
		 * duplicates, so we will handle this case in the normal code.
		 */
		ppgno = PREV_PGNO(hcp->dpagep);
		if (ppgno == PGNO_INVALID &&
		    NEXT_PGNO(hcp->dpagep) == PGNO_INVALID &&
		    NUM_ENT(hcp->dpagep) == 1)
			goto normal;

		/* Remove item from duplicate page. */
		chg_pgno = hcp->dpgno;
		if ((ret = CDB___db_drem(dbc, &hcp->dpagep, hcp->dndx)) != 0)
			goto out;

		if (hcp->dpagep == NULL) {
			if (ppgno != PGNO_INVALID) {		/* Case 3 */
				hcp->dpgno = ppgno;
				if ((ret = CDB___ham_get_cpage(dbc,
				    DB_LOCK_READ)) != 0)
					goto out;
				hcp->dndx = NUM_ENT(hcp->dpagep);
				F_SET(hcp, H_DELETED);
			} else {				/* Case 4 */
				ret = CDB___ham_del_pair(dbc, 1);
				hcp->dpgno = PGNO_INVALID;
				/*
				 * Delpair updated the cursor queue, so we
				 * don't have to do that here.
				 */
				chg_pgno = PGNO_INVALID;
			}
		} else if (PGNO(hcp->dpagep) != hcp->dpgno) {
			hcp->dndx = 0;				/* Case 2 */
			hcp->dpgno = PGNO(hcp->dpagep);
			if (ppgno == PGNO_INVALID)
				memcpy(HOFFDUP_PGNO(P_ENTRY(hcp->pagep,
				    H_DATAINDEX(hcp->bndx))),
				    &hcp->dpgno, sizeof(db_pgno_t));
			/*
			 * We need to put the master page here, because
			 * although we have a duplicate page, the master
			 * page is dirty, and ham_item_done assumes that
			 * if you have a duplicate page, it's the only one
			 * that can be dirty.
			 */
			ret = CDB___ham_put_page(dbp, hcp->pagep, 1);
			hcp->pagep = NULL;
			F_SET(hcp, H_DELETED);
		} else						/* Case 1 */
			F_SET(hcp, H_DELETED);
		if (chg_pgno != PGNO_INVALID)
			CDB___ham_c_update(hcp, chg_pgno, 0, 0, 1);
	} else if (F_ISSET(hcp, H_ISDUP)) {			/* on page */
		if (hcp->dup_off == 0 && DUP_SIZE(hcp->dup_len) ==
		    LEN_HDATA(hcp->pagep, hcp->hdr->dbmeta.pagesize, hcp->bndx))
			ret = CDB___ham_del_pair(dbc, 1);
		else {
			repldbt.flags = 0;
			F_SET(&repldbt, DB_DBT_PARTIAL);
			repldbt.doff = hcp->dup_off;
			repldbt.dlen = DUP_SIZE(hcp->dup_len);
			repldbt.size = 0;
			repldbt.data =
			    HKEYDATA_DATA(H_PAIRDATA(hcp->pagep, hcp->bndx));
			ret = CDB___ham_replpair(dbc, &repldbt, 0);
			hcp->dup_tlen -= DUP_SIZE(hcp->dup_len);
			F_SET(hcp, H_DELETED);
			CDB___ham_c_update(hcp, hcp->pgno,
			    DUP_SIZE(hcp->dup_len), 0, 1);
		}

	} else
		/* Not a duplicate */
normal:		ret = CDB___ham_del_pair(dbc, 1);

out:	if ((t_ret = CDB___ham_item_done(dbc, ret == 0)) != 0 && ret == 0)
		ret = t_ret;
	if ((t_ret = CDB___ham_release_meta(dbc)) != 0 && ret == 0)
		ret = t_ret;
	RESTORE_CURSOR(dbp, hcp, &save_curs, ret);
	if (F_ISSET(dbc, DBC_WRITECURSOR))
		(void)CDB___lock_downgrade(dbp->dbenv,
		    &dbc->mylock, DB_LOCK_IWRITE, 0);
	return (ret);
}

/*
 * CDB___ham_c_dup --
 *	Duplicate a hash cursor, such that the new one holds appropriate
 *	locks for the position of the original.
 *
 * PUBLIC: int CDB___ham_c_dup __P((DBC *, DBC *));
 */
int
CDB___ham_c_dup(orig_dbc, new_dbc)
	DBC *orig_dbc, *new_dbc;
{
	HASH_CURSOR *orig, *new;

	orig = (HASH_CURSOR *)orig_dbc->internal;
	new = (HASH_CURSOR *)new_dbc->internal;

#ifdef DIAGNOSTIC
	memset(new, 0, sizeof(*new));
#endif
	new->dbc = orig->dbc;
	new->bucket = orig->bucket;
	new->lbucket = orig->lbucket;
	new->pgno = orig->pgno;
	new->bndx = orig->bndx;
	new->dpgno = orig->dpgno;
	new->dndx = orig->dndx;
	new->dup_off = orig->dup_off;
	new->dup_len = orig->dup_len;
	new->dup_tlen = orig->dup_tlen;

	if (F_ISSET(orig, H_DELETED))
		F_SET(new, H_DELETED);
	if (F_ISSET(orig, H_ISDUP))
		F_SET(new, H_ISDUP);

	/*
	 * If the old cursor held a lock and we're not in transactions, get one
	 * for the new one.   The reason that we don't need a new lock if we're
	 * in a transaction is because we already hold a lock and will continue
	 * to do so until commit, so there is no point in reaquiring it. We
	 * don't know if the old lock was a read or write lock, but it doesn't
	 * matter. We'll get a read lock.  We know that this locker already
	 * holds a lock of the correct type, so if we need a write lock and
	 * request it, we know that we'll get it.
	 */
	if (orig->lock.off == LOCK_INVALID || orig_dbc->txn != NULL) {
		new->lock.off = LOCK_INVALID;
		return (0);
	}

	return (CDB___ham_lock_bucket(new_dbc, DB_LOCK_READ));
}

static int
CDB___ham_c_get(dbc, key, data, flags)
	DBC *dbc;
	DBT *key;
	DBT *data;
	u_int32_t flags;
{
	DB *dbp;
	HASH_CURSOR *hcp, save_curs;
	db_lockmode_t lock_type;
	int get_key, ret, t_ret;

	DEBUG_LREAD(dbc, dbc->txn, "ham_c_get",
	    flags == DB_SET || flags == DB_SET_RANGE ? key : NULL,
	    NULL, flags);

	hcp = (HASH_CURSOR *)dbc->internal;
	dbp = dbc->dbp;
	PANIC_CHECK(dbp->dbenv);
	if ((ret = CDB___db_cgetchk(dbp, key, data, flags, IS_VALID(hcp))) != 0)
		return (ret);

	/* Clear OR'd in additional bits so we can check for flag equality. */
	if (LF_ISSET(DB_RMW)) {
		lock_type = DB_LOCK_WRITE;
		LF_CLR(DB_RMW);
	} else
		lock_type = DB_LOCK_READ;

	SAVE_CURSOR(hcp, &save_curs);
	if ((ret = CDB___ham_get_meta(dbc)) != 0)
		return (ret);
	hcp->seek_size = 0;

	ret = 0;
	get_key = 1;
	switch (flags) {
	case DB_PREV:
		if (hcp->bucket != BUCKET_INVALID) {
			ret = CDB___ham_item_prev(dbc, lock_type);
			break;
		}
		/* FALLTHROUGH */
	case DB_LAST:
		ret = CDB___ham_item_last(dbc, lock_type);
		break;
	case DB_NEXT:
		if (hcp->bucket != BUCKET_INVALID) {
			ret = CDB___ham_item_next(dbc, lock_type);
			break;
		}
		/* FALLTHROUGH */
	case DB_FIRST:
		ret = CDB___ham_item_first(dbc, lock_type);
		break;
	case DB_NEXT_DUP:
		/* cgetchk has already determined that the cursor is set. */
		F_SET(hcp, H_DUPONLY);
		ret = CDB___ham_item_next(dbc, lock_type);
		break;
	case DB_SET:
	case DB_SET_RANGE:
	case DB_GET_BOTH:
		if (F_ISSET(dbc, DBC_CONTINUE)) {
			F_SET(hcp, H_DUPONLY);
			ret = CDB___ham_item_next(dbc, lock_type);
		} else
			ret = CDB___ham_lookup(dbc, key, 0, lock_type);
		get_key = 0;
		break;
	case DB_CURRENT:
		/* cgetchk has already determined that the cursor is set. */
		if (F_ISSET(hcp, H_DELETED)) {
			ret = DB_KEYEMPTY;
			goto err1;
		}

		ret = CDB___ham_item(dbc, lock_type);
		break;
	}

	/*
	 * Must always enter this loop to do error handling and
	 * check for big key/data pair.
	 */
	while (1) {
		if (ret != 0 && ret != DB_NOTFOUND)
			goto err2;
		else if (F_ISSET(hcp, H_OK)) {
			/* Get the key. */
			if (get_key && (ret = CDB___db_ret(dbp, hcp->pagep,
			    H_KEYINDEX(hcp->bndx), key, &dbc->rkey.data,
			    &dbc->rkey.size)) != 0)
				goto err2;

			ret = CDB___ham_dup_return(dbc, data, flags);
			break;
		} else if (!F_ISSET(hcp, H_NOMORE)) {
			abort();
			break;
		}

		/*
		 * Ran out of entries in a bucket; change buckets.
		 */
		switch (flags) {
			case DB_LAST:
			case DB_PREV:
				ret = CDB___ham_item_done(dbc, 0);
				if (hcp->bucket == 0) {
					ret = DB_NOTFOUND;
					goto err2;
				}
				hcp->bucket--;
				hcp->bndx = NDX_INVALID;
				if (ret == 0)
					ret = CDB___ham_item_prev(dbc, lock_type);
				break;
			case DB_FIRST:
			case DB_NEXT:
				ret = CDB___ham_item_done(dbc, 0);
				hcp->bndx = NDX_INVALID;
				hcp->bucket++;
				hcp->pgno = PGNO_INVALID;
				hcp->pagep = NULL;
				if (hcp->bucket > hcp->hdr->max_bucket) {
					ret = DB_NOTFOUND;
					goto err2;
				}
				if (ret == 0)
					ret = CDB___ham_item_next(dbc, lock_type);
				break;
			case DB_GET_BOTH:
			case DB_NEXT_DUP:
			case DB_SET:
			case DB_SET_RANGE:
				/* Key not found. */
				ret = DB_NOTFOUND;
				goto err2;
			case DB_CURRENT:
				/*
				 * This should only happen if you are doing
				 * deletes and reading with concurrent threads
				 * and not doing proper locking.  We return
				 * the same error code as we would if the
				 * cursor were deleted.
				 */
				ret = DB_KEYEMPTY;
				goto err2;
		}
	}

err2:	if ((t_ret = CDB___ham_item_done(dbc, 0)) != 0 && ret == 0)
		ret = t_ret;
err1:	if ((t_ret = CDB___ham_release_meta(dbc)) != 0 && ret == 0)
		ret = t_ret;

	RESTORE_CURSOR(dbp, hcp, &save_curs, ret);

	F_CLR(hcp, H_DUPONLY);

	return (ret);
}

static int
CDB___ham_c_put(dbc, key, data, flags)
	DBC *dbc;
	DBT *key;
	DBT *data;
	u_int32_t flags;
{
	DB *dbp;
	DBT tmp_val, *myval;
	HASH_CURSOR *hcp, save_curs;
	u_int32_t nbytes;
	int ret, t_ret;

	/*
	 * The compiler doesn't realize that we only use this when ret is
	 * equal to 0 and that if ret is equal to 0, that we must have set
	 * myval.  So, we initialize it here to shut the compiler up.
	 */
	COMPQUIET(myval, NULL);

	dbp = dbc->dbp;
	PANIC_CHECK(dbp->dbenv);
	DEBUG_LWRITE(dbc, dbc->txn, "ham_c_put",
	    flags == DB_KEYFIRST || flags == DB_KEYLAST ? key : NULL,
	    data, flags);
	hcp = (HASH_CURSOR *)dbc->internal;

	if ((ret = CDB___db_cputchk(dbp, key, data, flags,
	    F_ISSET(dbp, DB_AM_RDONLY), IS_VALID(hcp))) != 0)
		return (ret);

	if (F_ISSET(hcp, H_DELETED) &&
	    flags != DB_KEYFIRST && flags != DB_KEYLAST)
		return (DB_NOTFOUND);

	/*
	 * If we are in the concurrent DB product and this cursor
	 * is not a write cursor, then this request is invalid.
	 * If it is a simple write cursor, then we need to upgrade its
	 * lock.
	 */
	if (F_ISSET(dbp->dbenv, DB_ENV_CDB)) {
		/* Make sure it's a valid update cursor. */
		if (!F_ISSET(dbc, DBC_WRITECURSOR | DBC_WRITER))
			return (EPERM);

		if (F_ISSET(dbc, DBC_WRITECURSOR) &&
		    (ret = CDB_lock_get(dbp->dbenv, dbc->locker,
		    DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE,
		    &dbc->mylock)) != 0)
			return (ret);
	}

	SAVE_CURSOR(hcp, &save_curs);

	if ((ret = CDB___ham_get_meta(dbc)) != 0)
		goto err1;

	switch (flags) {
	case DB_KEYLAST:
	case DB_KEYFIRST:
		nbytes = (ISBIG(hcp, key->size) ? HOFFPAGE_PSIZE :
		    HKEYDATA_PSIZE(key->size)) +
		    (ISBIG(hcp, data->size) ? HOFFPAGE_PSIZE :
		    HKEYDATA_PSIZE(data->size));
		if ((ret = CDB___ham_lookup(dbc,
		    key, nbytes, DB_LOCK_WRITE)) == DB_NOTFOUND) {
			ret = 0;
			if (hcp->seek_found_page != PGNO_INVALID &&
			    hcp->seek_found_page != hcp->pgno) {
				if ((ret = CDB___ham_item_done(dbc, 0)) != 0)
					goto err2;
				hcp->pgno = hcp->seek_found_page;
				hcp->bndx = NDX_INVALID;
			}

			if (F_ISSET(data, DB_DBT_PARTIAL) && data->doff != 0) {
				/*
				 * A partial put, but the key does not exist
				 * and we are not beginning the write at 0.
				 * We must create a data item padded up to doff
				 * and then write the new bytes represented by
				 * val.
				 */
				if ((ret = CDB___ham_init_dbt(&tmp_val,
				    data->size + data->doff,
				    &dbc->rdata.data, &dbc->rdata.size)) == 0) {
					memset(tmp_val.data, 0, data->doff);
					memcpy((u_int8_t *)tmp_val.data +
					    data->doff, data->data, data->size);
					myval = &tmp_val;
				}
			} else
				myval = (DBT *)data;

			if (ret == 0)
				ret = CDB___ham_add_el(dbc, key, myval, H_KEYDATA);
			goto done;
		}
		break;
	case DB_BEFORE:
	case DB_AFTER:
	case DB_CURRENT:
		ret = CDB___ham_item(dbc, DB_LOCK_WRITE);
		break;
	}

	if (ret == 0) {
		if (flags == DB_CURRENT ||
		    ((flags == DB_KEYFIRST || flags == DB_KEYLAST) &&
		    !F_ISSET(dbp, DB_AM_DUP)))
			ret = CDB___ham_overwrite(dbc, data);
		else
			ret = CDB___ham_add_dup(dbc, data, flags);
	}

done:	if (ret == 0 && F_ISSET(hcp, H_EXPAND)) {
		ret = CDB___ham_expand_table(dbc);
		F_CLR(hcp, H_EXPAND);
	}

	if ((t_ret = CDB___ham_item_done(dbc, ret == 0)) != 0 && ret == 0)
		ret = t_ret;

err2:	if ((t_ret = CDB___ham_release_meta(dbc)) != 0 && ret == 0)
		ret = t_ret;

err1:	RESTORE_CURSOR(dbp, hcp, &save_curs, ret);


	if (F_ISSET(dbc, DBC_WRITECURSOR))
		(void)CDB___lock_downgrade(dbp->dbenv,
		    &dbc->mylock, DB_LOCK_IWRITE, 0);

	return (ret);
}

/********************************* UTILITIES ************************/

/*
 * CDB___ham_expand_table --
 */
static int
CDB___ham_expand_table(dbc)
	DBC *dbc;
{
	DB *dbp;
	PAGE *h;
	HASH_CURSOR *hcp;
	db_pgno_t pgno;
	u_int32_t old_bucket, new_bucket;
	int ret;

	dbp = dbc->dbp;
	hcp = (HASH_CURSOR *)dbc->internal;
	if ((ret = CDB___ham_dirty_meta(dbc)) != 0)
		return (ret);

	/*
	 * If the split point is about to increase, make sure that we
	 * have enough extra pages.  The calculation here is weird.
	 * We'd like to do this after we've upped max_bucket, but it's
	 * too late then because we've logged the meta-data split.  What
	 * we'll do between then and now is increment max bucket and then
	 * see what the log of one greater than that is; here we have to
	 * look at the log of max + 2.  VERY NASTY STUFF.
	 *
	 * It just got even nastier.  With subdatabases, we have to request
	 * a chunk of contiguous pages, so we do that here using an
	 * undocumented feature of mpool (the MPOOL_NEW_GROUP flag) to
	 * give us a number of contiguous pages.  Ouch.
	 */
	if (hcp->hdr->max_bucket == hcp->hdr->high_mask) {
		/*
		 * Ask mpool to give us a set of contiguous page numbers
		 * large enough to contain the next doubling.
		 *
		 * Figure out how many new pages we need.   This will return
		 * us the last page.  We calculate its page number, initialize
		 * the page and then write it back to reserve all the pages
		 * in between.  It is possible that the allocation of new pages
		 * has already been done, but the tranaction aborted.  Since
		 * we don't undo the allocation, check for a valid pgno before
		 * doing the allocation.
		 */
		pgno = hcp->hdr->max_bucket + 1;
		if (hcp->hdr->spares[CDB___db_log2(pgno) + 1] == PGNO_INVALID)
			/* Allocate a group of pages. */
			ret = CDB_memp_fget(dbp->mpf,
			    &pgno, DB_MPOOL_NEW_GROUP, &h);
		else {
			/* Just read in the last page of the batch */
			pgno = hcp->hdr->spares[CDB___db_log2(pgno) + 1] +
			    hcp->hdr->max_bucket + 1;
			ret = CDB_memp_fget(dbp->mpf,
			    &pgno, DB_MPOOL_CREATE, &h);
		}
		if (ret != 0)
			return (ret);

		P_INIT(h, dbp->pgsize, pgno,
		    PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
		pgno -= hcp->hdr->max_bucket;
	} else {
		pgno = BUCKET_TO_PAGE(hcp, hcp->hdr->max_bucket + 1);
		if ((ret =
		    CDB_memp_fget(dbp->mpf, &pgno, DB_MPOOL_CREATE, &h)) != 0)
			return (ret);
	}

	/* Now we can log the meta-data split. */
	if (DB_LOGGING(dbc)) {
		if ((ret = CDB___ham_metagroup_log(dbp->dbenv,
		    dbc->txn, &h->lsn, 0, dbp->log_fileid,
		    hcp->hdr->max_bucket, pgno, &hcp->hdr->dbmeta.lsn,
		    &h->lsn)) != 0)
			return (ret);

		hcp->hdr->dbmeta.lsn = h->lsn;
	}

	/* If we allocated some new pages, write out the last page. */
	if ((ret = CDB_memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0)
		return (ret);

	new_bucket = ++hcp->hdr->max_bucket;
	old_bucket = (hcp->hdr->max_bucket & hcp->hdr->low_mask);

	/*
	 * If we started a new doubling, fill in the spares array with
	 * the starting page number negatively offset by the bucket number.
	 */
	if (new_bucket > hcp->hdr->high_mask) {
		/* Starting a new doubling */
		hcp->hdr->low_mask = hcp->hdr->high_mask;
		hcp->hdr->high_mask = new_bucket | hcp->hdr->low_mask;
		if (hcp->hdr->spares[CDB___db_log2(new_bucket) + 1] == PGNO_INVALID)
			hcp->hdr->spares[CDB___db_log2(new_bucket) + 1] =
			    pgno - new_bucket;
	}

	/* Relocate records to the new bucket */
	return (CDB___ham_split_page(dbc, old_bucket, new_bucket));
}

/*
 * PUBLIC: u_int32_t CDB___ham_call_hash __P((HASH_CURSOR *, u_int8_t *, int32_t));
 */
u_int32_t
CDB___ham_call_hash(hcp, k, len)
	HASH_CURSOR *hcp;
	u_int8_t *k;
	int32_t len;
{
	u_int32_t n, bucket;
	HASH *hashp;

	hashp = hcp->dbc->dbp->h_internal;
	n = (u_int32_t)(hashp->h_hash(k, len));

	bucket = n & hcp->hdr->high_mask;
	if (bucket > hcp->hdr->max_bucket)
		bucket = bucket & hcp->hdr->low_mask;
	return (bucket);
}

/*
 * Check for duplicates, and call CDB___db_ret appropriately.  Release
 * everything held by the cursor.
 */
static int
CDB___ham_dup_return(dbc, val, flags)
	DBC *dbc;
	DBT *val;
	u_int32_t flags;
{
	DB *dbp;
	HASH_CURSOR *hcp;
	PAGE *pp;
	DBT *myval, tmp_val;
	db_indx_t ndx;
	db_pgno_t pgno;
	u_int32_t off, tlen;
	u_int8_t *hk, type;
	int cmp, ret;
	db_indx_t len;

	/* Check for duplicate and return the first one. */
	dbp = dbc->dbp;
	hcp = (HASH_CURSOR *)dbc->internal;
	ndx = H_DATAINDEX(hcp->bndx);
	type = HPAGE_TYPE(hcp->pagep, ndx);
	pp = hcp->pagep;
	myval = val;

	/*
	 * There are 4 cases:
	 * 1. We are not in duplicate, simply call db_ret.
	 * 2. We are looking at keys and stumbled onto a duplicate.
	 * 3. We are in the middle of a duplicate set. (ISDUP set)
	 * 4. This is a duplicate and we need to return a specific item.
	 */

	/*
	 * Here we check for the case where we just stumbled onto a
	 * duplicate.  In this case, we do initialization and then
	 * let the normal duplicate code handle it.
	 */
	if (!F_ISSET(hcp, H_ISDUP)) {
		if (type == H_DUPLICATE) {
			F_SET(hcp, H_ISDUP);
			hcp->dup_tlen = LEN_HDATA(hcp->pagep,
			    hcp->hdr->dbmeta.pagesize, hcp->bndx);
			hk = H_PAIRDATA(hcp->pagep, hcp->bndx);
			if (flags == DB_LAST || flags == DB_PREV) {
				hcp->dndx = 0;
				hcp->dup_off = 0;
				do {
					memcpy(&len,
					    HKEYDATA_DATA(hk) + hcp->dup_off,
					    sizeof(db_indx_t));
					hcp->dup_off += DUP_SIZE(len);
					hcp->dndx++;
				} while (hcp->dup_off < hcp->dup_tlen);
				hcp->dup_off -= DUP_SIZE(len);
				hcp->dndx--;
			} else {
				memcpy(&len,
				    HKEYDATA_DATA(hk), sizeof(db_indx_t));
				hcp->dup_off = 0;
				hcp->dndx = 0;
			}
			hcp->dup_len = len;
		} else if (type == H_OFFDUP) {
			F_SET(hcp, H_ISDUP);
			if (flags == DB_CURRENT) {
				pgno = hcp->dpgno;
				ndx = hcp->dndx;
			} else
				memcpy(&pgno,
				    HOFFDUP_PGNO(P_ENTRY(hcp->pagep, ndx)),
				    sizeof(db_pgno_t));
			if (flags == DB_LAST || flags == DB_PREV) {
				if ((ret = CDB___db_dend(dbc,
				    pgno, &hcp->dpagep)) != 0)
					return (ret);
				hcp->dpgno = PGNO(hcp->dpagep);
				hcp->dndx = NUM_ENT(hcp->dpagep) - 1;
			} else if ((ret = CDB___ham_next_cpage(dbc,
			    pgno, 0, H_ISDUP)) != 0)
				return (ret);
			if (flags == DB_CURRENT)
				hcp->dndx = ndx;
		}
	}

	/*
	 * If we are retrieving a specific key/data pair, then we
	 * may need to adjust the cursor before returning data.
	 */
	if (flags == DB_GET_BOTH) {
		if (F_ISSET(hcp, H_ISDUP)) {
			if (hcp->dpgno != PGNO_INVALID) {
				if ((ret = CDB___db_dsearch(dbc, 0, val,
				    hcp->dpgno, &hcp->dndx, &hcp->dpagep, &cmp))
				    != 0)
					return (ret);
				if (cmp == 0)
					hcp->dpgno = PGNO(hcp->dpagep);
			} else {
				CDB___ham_dsearch(dbc, val, &off, &cmp);
				hcp->dup_off = off;
			}
		} else {
			hk = H_PAIRDATA(hcp->pagep, hcp->bndx);
			if (((HKEYDATA *)hk)->type == H_OFFPAGE) {
				memcpy(&tlen,
				    HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
				memcpy(&pgno,
				    HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
				if ((ret = CDB___db_moff(dbp, val,
				    pgno, tlen, dbp->dup_compare, &cmp)) != 0)
					return (ret);
			} else {
				/*
				 * We do not zero tmp_val since the comparison
				 * routines may only look at data and size.
				 */
				tmp_val.data = HKEYDATA_DATA(hk);
				tmp_val.size = LEN_HDATA(hcp->pagep,
				    dbp->pgsize, hcp->bndx);
				cmp = dbp->dup_compare == NULL ?
				    CDB___bam_defcmp(&tmp_val, val) :
				    dbp->dup_compare(&tmp_val, val);
			}
		}

		if (cmp != 0)
			return (DB_NOTFOUND);
	}

	/*
	 * Now, everything is initialized, grab a duplicate if
	 * necessary.
	 */
	if (F_ISSET(hcp, H_ISDUP)) {
		if (hcp->dpgno != PGNO_INVALID) {
			pp = hcp->dpagep;
			ndx = hcp->dndx;
		} else {
			/*
			 * Copy the DBT in case we are retrieving into user
			 * memory and we need the parameters for it.  If the
			 * user requested a partial, then we need to adjust
			 * the user's parameters to get the partial of the
			 * duplicate which is itself a partial.
			 */
			memcpy(&tmp_val, val, sizeof(*val));
			if (F_ISSET(&tmp_val, DB_DBT_PARTIAL)) {
				/*
				 * Take the user's length unless it would go
				 * beyond the end of the duplicate.
				 */
				if (tmp_val.doff + hcp->dup_off > hcp->dup_len)
					tmp_val.dlen = 0;
				else if (tmp_val.dlen + tmp_val.doff >
				    hcp->dup_len)
					tmp_val.dlen =
					    hcp->dup_len - tmp_val.doff;

				/*
				 * Calculate the new offset.
				 */
				tmp_val.doff += hcp->dup_off;
			} else {
				F_SET(&tmp_val, DB_DBT_PARTIAL);
				tmp_val.dlen = hcp->dup_len;
				tmp_val.doff = hcp->dup_off + sizeof(db_indx_t);
			}
			myval = &tmp_val;
		}
	}


	/*
	 * Finally, if we had a duplicate, pp, ndx, and myval should be
	 * set appropriately.
	 */
	if ((ret = CDB___db_ret(dbp, pp, ndx, myval, &dbc->rdata.data,
	    &dbc->rdata.size)) != 0)
		return (ret);

	/*
	 * In case we sent a temporary off to db_ret, set the real
	 * return values.
	 */
	val->data = myval->data;
	val->size = myval->size;

	return (0);
}

static int
CDB___ham_overwrite(dbc, nval)
	DBC *dbc;
	DBT *nval;
{
	HASH_CURSOR *hcp;
	DBT *myval, tmp_val, tmp_val2;
	void *newrec;
	u_int8_t *hk, *p;
	u_int32_t len, nondup_size;
	db_pgno_t prev;
	db_indx_t newsize, dndx;
	int ret;

	hcp = (HASH_CURSOR *)dbc->internal;
	if (F_ISSET(hcp, H_ISDUP)) {
		/*
		 * This is an overwrite of a duplicate; check for
		 * onpage versus offpage and whether it's partial.
		 */
		if (hcp->dpagep != NULL) {
do_offpage:		if (F_ISSET(nval, DB_DBT_PARTIAL)) {
				/*
				 * We are using btree routines that are
				 * actually OK for hash to use.  Since all
				 * dbps have bt_internal initialized, this
				 * *should* just work.
				 */
				newsize = CDB___bam_partsize(
				    DB_CURRENT, nval, hcp->dpagep, hcp->dndx);
				memcpy(&tmp_val, nval, sizeof(tmp_val));
				if ((ret =
				   CDB___bam_build(dbc, DB_CURRENT, &tmp_val,
				   hcp->dpagep, hcp->dndx, newsize)) != 0)
					return (ret);
				myval = &tmp_val;
			} else
				myval = nval;

			/*
			 * Make sure that the caller isn't corrupting
			 * the sort order.
			 */
			if (dbc->dbp->dup_compare != NULL &&
			    CDB___bam_cmp(dbc->dbp, myval, hcp->dpagep,
			    hcp->dndx, dbc->dbp->dup_compare) != 0)
				return (EINVAL);

			prev = PREV_PGNO(hcp->dpagep);
			if ((ret =
			    CDB___db_drem(dbc, &hcp->dpagep, hcp->dndx)) != 0)
				return (ret);
			/*
			 * It's possible that hcp->dpagep is now NULL.  If
			 * we have a prev, we can deal pretty easily; if not
			 * this gets ugly.
			 */
			if (hcp->dpagep == NULL) {
				if (prev == PGNO_INVALID) {
					/*
					 * This was a duplicate page with
					 * a single item.  Pretend to reenter
					 * this routine simply overwriting the
					 * entry on the main page.
					 */
					F_CLR(hcp, H_ISDUP);
					goto doreplace;
				}
				if ((ret = CDB___ham_next_cpage(dbc,
				    prev, 0, H_ISDUP)) != 0)
					return (ret);
				hcp->dndx = NUM_ENT(hcp->dpagep);
			}

			/*
			 * On page splits, the 4th parameter of db_dput returns
			 * the location the new item was put.  We cannot pass
			 * in permanent fields from the cursor, they may have
			 * been updated in cursor adjustment.
			 */
			dndx = hcp->dndx;
			ret = CDB___db_dput(dbc, myval, &hcp->dpagep, &dndx);
			hcp->dpgno = PGNO(hcp->dpagep);
			hcp->dndx = dndx;
			return (ret);
		}

		/* On page dups */
		if (F_ISSET(nval, DB_DBT_PARTIAL)) {
			/*
			 * We're going to have to get the current item, then
			 * construct the record, do any padding and do a
			 * replace.
			 */
			memset(&tmp_val, 0, sizeof(tmp_val));
			if ((ret =
			    CDB___ham_dup_return(dbc, &tmp_val, DB_CURRENT)) != 0)
				return (ret);

			/* Figure out new size. */
			nondup_size = tmp_val.size;
			newsize = nondup_size;

			/*
			 * Three cases:
			 * 1. strictly append (may need to allocate space
			 * 	for pad bytes; really gross).
			 * 2. overwrite some and append.
			 * 3. strictly overwrite.
			 */
			if (nval->doff > nondup_size)
				newsize +=
				    (nval->doff - nondup_size + nval->size);
			else if (nval->doff + nval->dlen > nondup_size)
				newsize += nval->size -
				    (nondup_size - nval->doff);
			else
				newsize += nval->size - nval->dlen;

			/*
			 * Make sure that the new size doesn't put us over
			 * the onpage duplicate size in which case we need
			 * to convert to off-page duplicates.
			 */
			if (ISBIG(hcp, hcp->dup_tlen - nondup_size + newsize)) {
				if ((ret = CDB___ham_dup_convert(dbc)) != 0)
					return (ret);
				goto do_offpage;
			}

			if ((ret =
			    CDB___os_malloc(DUP_SIZE(newsize), NULL, &newrec)) != 0)
				return (ret);
			memset(&tmp_val2, 0, sizeof(tmp_val2));
			F_SET(&tmp_val2, DB_DBT_PARTIAL);

			/* Construct the record. */
			p = newrec;
			/* Initial size. */
			memcpy(p, &newsize, sizeof(db_indx_t));
			p += sizeof(db_indx_t);

			/* First part of original record. */
			len = nval->doff > tmp_val.size
			    ? tmp_val.size : nval->doff;
			memcpy(p, tmp_val.data, len);
			p += len;

			if (nval->doff > tmp_val.size) {
				/* Padding */
				memset(p, 0, nval->doff - tmp_val.size);
				p += nval->doff - tmp_val.size;
			}

			/* New bytes */
			memcpy(p, nval->data, nval->size);
			p += nval->size;

			/* End of original record (if there is any) */
			if (nval->doff + nval->dlen < tmp_val.size) {
				len = tmp_val.size - nval->doff - nval->dlen;
				memcpy(p, (u_int8_t *)tmp_val.data +
				    nval->doff + nval->dlen, len);
				p += len;
			}

			/* Final size. */
			memcpy(p, &newsize, sizeof(db_indx_t));

			/*
			 * Make sure that the caller isn't corrupting
			 * the sort order.
			 */
			if (dbc->dbp->dup_compare != NULL) {
				tmp_val2.data =
				    (u_int8_t *)newrec + sizeof(db_indx_t);
				tmp_val2.size = newsize;
				if (dbc->dbp->dup_compare(&tmp_val, &tmp_val2)
				    != 0) {
					(void)CDB___os_free(newrec,
					    DUP_SIZE(newsize));
					return (EINVAL);
				}
			}

			tmp_val2.data = newrec;
			tmp_val2.size = DUP_SIZE(newsize);
			tmp_val2.doff = hcp->dup_off;
			tmp_val2.dlen = DUP_SIZE(hcp->dup_len);

			ret = CDB___ham_replpair(dbc, &tmp_val2, 0);
			(void)CDB___os_free(newrec, DUP_SIZE(newsize));

			/* Update cursor */
			if (ret != 0)
				return (ret);

			if (newsize > nondup_size)
				hcp->dup_tlen += (newsize - nondup_size);
			else
				hcp->dup_tlen -= (nondup_size - newsize);
			hcp->dup_len = DUP_SIZE(newsize);
			return (0);
		} else {
			/* Check whether we need to convert to off page. */
			if (ISBIG(hcp,
			    hcp->dup_tlen - hcp->dup_len + nval->size)) {
				if ((ret = CDB___ham_dup_convert(dbc)) != 0)
					return (ret);
				goto do_offpage;
			}

			/* Make sure we maintain sort order. */
			if (dbc->dbp->dup_compare != NULL) {
				tmp_val2.data =
				    HKEYDATA_DATA(H_PAIRDATA(hcp->pagep,
				    hcp->bndx)) + hcp->dup_off +
				    sizeof(db_indx_t);
				tmp_val2.size = hcp->dup_len;
				if (dbc->dbp->dup_compare(nval, &tmp_val2) != 0)
					return (EINVAL);
			}
			/* Overwriting a complete duplicate. */
			if ((ret = CDB___ham_make_dup(nval, &tmp_val,
			    &dbc->rdata.data, &dbc->rdata.size)) != 0)
				return (ret);
			/* Now fix what we are replacing. */
			tmp_val.doff = hcp->dup_off;
			tmp_val.dlen = DUP_SIZE(hcp->dup_len);

			/* Update cursor */
			if (nval->size > hcp->dup_len)
				hcp->dup_tlen += (nval->size - hcp->dup_len);
			else
				hcp->dup_tlen -= (hcp->dup_len - nval->size);
			hcp->dup_len = DUP_SIZE(nval->size);
		}
		myval = &tmp_val;
	} else if (!F_ISSET(nval, DB_DBT_PARTIAL)) {
		/* Put/overwrite */
		memcpy(&tmp_val, nval, sizeof(*nval));
		F_SET(&tmp_val, DB_DBT_PARTIAL);
		tmp_val.doff = 0;
		hk = H_PAIRDATA(hcp->pagep, hcp->bndx);
		if (HPAGE_PTYPE(hk) == H_OFFPAGE)
			memcpy(&tmp_val.dlen,
			    HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
		else
			tmp_val.dlen = LEN_HDATA(hcp->pagep,
			    hcp->hdr->dbmeta.pagesize,hcp->bndx);
		myval = &tmp_val;
	} else
		/* Regular partial put */
		myval = nval;
doreplace:
	return (CDB___ham_replpair(dbc, myval, 0));
}

/*
 * Given a key and a cursor, sets the cursor to the page/ndx on which
 * the key resides.  If the key is found, the cursor H_OK flag is set
 * and the pagep, bndx, pgno (dpagep, dndx, dpgno) fields are set.
 * If the key is not found, the H_OK flag is not set.  If the sought
 * field is non-0, the pagep, bndx, pgno (dpagep, dndx, dpgno) fields
 * are set indicating where an add might take place.  If it is 0,
 * non of the cursor pointer field are valid.
 */
static int
CDB___ham_lookup(dbc, key, sought, mode)
	DBC *dbc;
	const DBT *key;
	u_int32_t sought;
	db_lockmode_t mode;
{
	DB *dbp;
	HASH_CURSOR *hcp;
	db_pgno_t pgno;
	u_int32_t tlen;
	int match, ret, t_ret;
	u_int8_t *hk;

	dbp = dbc->dbp;
	hcp = (HASH_CURSOR *)dbc->internal;
	/*
	 * Set up cursor so that we're looking for space to add an item
	 * as we cycle through the pages looking for the key.
	 */
	if ((ret = CDB___ham_item_reset(dbc)) != 0)
		return (ret);
	hcp->seek_size = sought;

	hcp->bucket = CDB___ham_call_hash(hcp, (u_int8_t *)key->data, key->size);
	while (1) {
		if ((ret = CDB___ham_item_next(dbc, mode)) != 0)
			return (ret);

		if (F_ISSET(hcp, H_NOMORE))
			break;

		hk = H_PAIRKEY(hcp->pagep, hcp->bndx);
		switch (HPAGE_PTYPE(hk)) {
		case H_OFFPAGE:
			memcpy(&tlen, HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
			if (tlen == key->size) {
				memcpy(&pgno,
				    HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
				if ((ret = CDB___db_moff(dbp,
				    key, pgno, tlen, NULL, &match)) != 0)
					return (ret);
				if (match == 0) {
					F_SET(hcp, H_OK);
					return (0);
				}
			}
			break;
		case H_KEYDATA:
			if (key->size ==
			    LEN_HKEY(hcp->pagep, dbp->pgsize, hcp->bndx) &&
			    memcmp(key->data,
			    HKEYDATA_DATA(hk), key->size) == 0) {
				F_SET(hcp, H_OK);
				return (0);
			}
			break;
		case H_DUPLICATE:
		case H_OFFDUP:
			/*
			 * These are errors because keys are never
			 * duplicated, only data items are.
			 */
			return (CDB___db_pgfmt(dbp, PGNO(hcp->pagep)));
		}
	}

	/*
	 * Item was not found.
	 */

	if (sought != 0)
		return (ret);

	if ((t_ret = CDB___ham_item_done(dbc, 0)) != 0 && ret == 0)
		ret = t_ret;
	return (ret);
}

/*
 * CDB___ham_init_dbt --
 *	Initialize a dbt using some possibly already allocated storage
 *	for items.
 *
 * PUBLIC: int CDB___ham_init_dbt __P((DBT *, u_int32_t, void **, u_int32_t *));
 */
int
CDB___ham_init_dbt(dbt, size, bufp, sizep)
	DBT *dbt;
	u_int32_t size;
	void **bufp;
	u_int32_t *sizep;
{
	int ret;

	memset(dbt, 0, sizeof(*dbt));
	if (*sizep < size) {
		if ((ret = CDB___os_realloc(size, NULL, bufp)) != 0) {
			*sizep = 0;
			return (ret);
		}
		*sizep = size;
	}
	dbt->data = *bufp;
	dbt->size = size;
	return (0);
}

/*
 * Adjust the cursor after an insert or delete.  The cursor passed is
 * the one that was operated upon; we just need to check any of the
 * others.
 *
 * len indicates the length of the item added/deleted
 * add indicates if the item indicated by the cursor has just been
 * added (add == 1) or deleted (add == 0).
 * dup indicates if the addition occurred into a duplicate set.
 *
 * PUBLIC: void CDB___ham_c_update
 * PUBLIC:    __P((HASH_CURSOR *, db_pgno_t, u_int32_t, int, int));
 */
void
CDB___ham_c_update(hcp, chg_pgno, len, add, is_dup)
	HASH_CURSOR *hcp;
	db_pgno_t chg_pgno;
	u_int32_t len;
	int add, is_dup;
{
	DB *dbp;
	DBC *cp;
	HASH_CURSOR *lcp;
	int page_deleted;

	/*
	 * Regular adds are always at the end of a given page, so we never
	 * have to adjust anyone's cursor after a regular add.
	 */
	if (!is_dup && add)
		return;

	/*
	 * Determine if a page was deleted.    If this is a regular update
	 * (i.e., not is_dup) then the deleted page's number will be that in
	 * chg_pgno, and the pgno in the cursor will be different.  If this
	 * was an onpage-duplicate, then the same conditions apply.  If this
	 * was an off-page duplicate, then we need to verify if hcp->dpgno
	 * is the same (no delete) or different (delete) than chg_pgno.
	 */
	if (!is_dup || hcp->dpgno == PGNO_INVALID)
		page_deleted =
		    chg_pgno != PGNO_INVALID && chg_pgno != hcp->pgno;
	else
		page_deleted =
		    chg_pgno != PGNO_INVALID && chg_pgno != hcp->dpgno;

	dbp = hcp->dbc->dbp;
	MUTEX_THREAD_LOCK(dbp->mutexp);

	for (cp = TAILQ_FIRST(&dbp->active_queue); cp != NULL;
	    cp = TAILQ_NEXT(cp, links)) {
		if (cp->internal == hcp)
			continue;

		lcp = (HASH_CURSOR *)cp->internal;

		if (!is_dup && lcp->pgno != chg_pgno)
			continue;

		if (is_dup &&
		    ((lcp->dpgno == PGNO_INVALID && lcp->pgno != chg_pgno) ||
		    (lcp->dpgno != PGNO_INVALID && lcp->dpgno != chg_pgno)))
			continue;

		if (is_dup && F_ISSET(hcp, H_DELETED)) {
			if (lcp->dpgno == PGNO_INVALID) {
				if (lcp->pgno != chg_pgno)
					continue;
			} else if (lcp->dpgno != chg_pgno)
				continue;
		}

		if (page_deleted) {
			if (is_dup) {
				lcp->dpgno = hcp->dpgno;
				lcp->dndx = hcp->dndx;
			} else {
				lcp->pgno = hcp->pgno;
				lcp->bndx = hcp->bndx;
				lcp->bucket = hcp->bucket;
			}
			F_CLR(lcp, H_ISDUP);
			continue;
		}

		if (!is_dup && lcp->bndx > hcp->bndx)
			lcp->bndx--;
		else if (!is_dup && lcp->bndx == hcp->bndx)
			if (add)
				lcp->bndx++;
			else
				F_SET(lcp, H_DELETED);
		else if (is_dup && hcp->dpgno != PGNO_INVALID &&
		    hcp->dpgno == lcp->dpgno) {
			/* Off-page duplicate. */
			if (add && lcp->dndx >= hcp->dndx )
				lcp->dndx++;
			else if (!add && lcp->dndx > hcp->dndx)
				lcp->dndx--;
			else if (!add && lcp->dndx == hcp->dndx)
				F_SET(lcp, H_DELETED);

		} else if (is_dup && lcp->pgno == chg_pgno &&
		    lcp->bndx == hcp->bndx) {
			/* On-page duplicate. */
			if (add) {
				lcp->dup_tlen += len;
				if (lcp->dup_off > hcp->dup_off)
					lcp->dup_off += len;
				if (lcp->dup_off == hcp->dup_off)
					lcp->dup_len = len;
			} else {
				lcp->dup_tlen -= len;
				if (lcp->dup_off > hcp->dup_off)
					lcp->dup_off -= len;
				else if (lcp->dup_off == hcp->dup_off)
					F_SET(lcp, H_DELETED);
			}
		}
	}
	MUTEX_THREAD_UNLOCK(dbp->mutexp);
}

/*
 * CDB___ham_get_clist --
 *
 * Get a list of cursors either on a particular bucket or on a particular
 * page and index combination.  The former is so that we can update
 * cursors on a split.  The latter is so we can update cursors when we
 * move items off page.
 *
 * PUBLIC: int CDB___ham_get_clist __P((DB *,
 * PUBLIC:     db_pgno_t, u_int32_t, HASH_CURSOR ***));
 */
int
CDB___ham_get_clist(dbp, bucket, indx, listp)
	DB *dbp;
	db_pgno_t bucket;
	u_int32_t indx;
	HASH_CURSOR ***listp;
{
	DBC *cp;
	int nalloc, nused, ret;

	/*
	 * Assume that finding anything is the exception, so optimize for
	 * the case where there aren't any.
	 */
	nalloc = nused = 0;
	*listp = NULL;

	MUTEX_THREAD_LOCK(dbp->mutexp);

	for (cp = TAILQ_FIRST(&dbp->active_queue); cp != NULL;
	    cp = TAILQ_NEXT(cp, links))
		if ((indx == NDX_INVALID &&
		    ((HASH_CURSOR *)(cp->internal))->bucket == bucket) ||
		    (indx != NDX_INVALID &&
		    ((HASH_CURSOR *)(cp->internal))->pgno == bucket &&
		    ((HASH_CURSOR *)(cp->internal))->bndx == indx)) {
			if (nused >= nalloc) {
				nalloc += 10;
				if ((ret =
				    CDB___os_realloc(nalloc * sizeof(HASH_CURSOR *),
				    NULL, listp)) != 0)
					return (ret);
			}
			(*listp)[nused++] = (HASH_CURSOR *)cp->internal;
		}

	MUTEX_THREAD_UNLOCK(dbp->mutexp);
	if (listp != NULL) {
		if (nused >= nalloc) {
			nalloc++;
			if ((ret = CDB___os_realloc(nalloc * sizeof(HASH_CURSOR *),
			    NULL, listp)) != 0)
				return (ret);
		}
		(*listp)[nused] = NULL;
	}
	return (0);
}