You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

615 lines
17 KiB

/*-
* See the file LICENSE for redistribution information.
*
* Copyright (c) 1998, 1999
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
static const char sccsid[] = "@(#)db_join.c 11.6 (Sleepycat) 10/19/99";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>
#include <errno.h>
#include <string.h>
#endif
#include "db_int.h"
#include "db_page.h"
#include "db_join.h"
#include "db_am.h"
#include "btree.h"
static int CDB___db_join_close __P((DBC *));
static int CDB___db_join_del __P((DBC *, u_int32_t));
static int CDB___db_join_get __P((DBC *, DBT *, DBT *, u_int32_t));
static int CDB___db_join_getnext __P((DBC *, DBT *, DBT *, DBT *, u_int32_t));
static int CDB___db_join_put __P((DBC *, DBT *, DBT *, u_int32_t));
/*
* This is the duplicate-assisted join functionality. Right now we're
* going to write it such that we return one item at a time, although
* I think we may need to optimize it to return them all at once.
* It should be easier to get it working this way, and I believe that
* changing it should be fairly straightforward.
*
* XXX
* Right now we do not maintain the number of duplicates so we do
* not optimize the join. If the caller does, then best performance
* will be achieved by putting the cursor with the smallest cardinality
* first.
*
* The first cursor moves sequentially through the duplicate set while
* the others search explicitly for the duplicate in question.
*
*/
/*
* CDB___db_join --
* This is the interface to the duplicate-assisted join functionality.
* In the same way that cursors mark a position in a database, a cursor
* can mark a position in a join. While most cursors are created by the
* cursor method of a DB, join cursors are created through an explicit
* call to DB->join.
*
* The curslist is an array of existing, initialized cursors and primary
* is the DB of the primary file. The data item that joins all the
* cursors in the curslist is used as the key into the primary and that
* key and data are returned. When no more items are left in the join
* set, the c_next operation off the join cursor will return DB_NOTFOUND.
*
* PUBLIC: int CDB___db_join __P((DB *, DBC **, DBC **, u_int32_t));
*/
int
CDB___db_join(primary, curslist, dbcp, flags)
	DB *primary;
	DBC **curslist, **dbcp;
	u_int32_t flags;
{
	DBC *dbc;
	JOIN_CURSOR *jc;
	int i, ret, nslots;

	COMPQUIET(nslots, 0);

	PANIC_CHECK(primary->dbenv);

	if ((ret = CDB___db_joinchk(primary, flags)) != 0)
		return (ret);

	/* A join needs at least one secondary cursor. */
	if (curslist == NULL || curslist[0] == NULL)
		return (EINVAL);

	dbc = NULL;
	jc = NULL;

	if ((ret = CDB___os_calloc(1, sizeof(DBC), &dbc)) != 0)
		goto err;

	if ((ret = CDB___os_calloc(1, sizeof(JOIN_CURSOR), &jc)) != 0)
		goto err;

	/*
	 * j_key is a scratch DBT handed to the secondary cursors' c_get;
	 * it starts at 256 bytes and is grown on ENOMEM by the get path.
	 */
	if ((ret = CDB___os_malloc(256, NULL, &jc->j_key.data)) != 0)
		goto err;
	jc->j_key.ulen = 256;
	F_SET(&jc->j_key, DB_DBT_USERMEM);

	/* Walk to the terminating NULL to count the cursors. */
	for (jc->j_curslist = curslist;
	    *jc->j_curslist != NULL; jc->j_curslist++)
		;

	/*
	 * The number of cursor slots we allocate is one greater than
	 * the number of cursors involved in the join, because the
	 * list is NULL-terminated.
	 */
	nslots = jc->j_curslist - curslist + 1;

	/*
	 * !!! -- A note on the various lists hanging off jc.
	 *
	 * j_curslist is the initial NULL-terminated list of cursors passed
	 * into CDB___db_join.  The original cursors are not modified; pristine
	 * copies are required because, in databases with unsorted dups, we
	 * must reset all of the secondary cursors after the first each
	 * time the first one is incremented, or else we will lose data
	 * which happen to be sorted differently in two different cursors.
	 *
	 * j_workcurs is where we put those copies that we're planning to
	 * work with.  They're lazily c_dup'ed from j_curslist as we need
	 * them, and closed when the join cursor is closed or when we need
	 * to reset them to their original values (in which case we just
	 * c_dup afresh).
	 *
	 * j_fdupcurs is an array of cursors which point to the first
	 * duplicate in the duplicate set that contains the data value
	 * we're currently interested in.  We need this to make
	 * CDB___db_join_get correctly return duplicate duplicates; i.e., if a
	 * given data value occurs twice in the set belonging to cursor #2,
	 * and thrice in the set belonging to cursor #3, and once in all
	 * the other cursors, successive calls to CDB___db_join_get need to
	 * return that data item six times.  To make this happen, each time
	 * cursor N is allowed to advance to a new datum, all cursors M
	 * such that M > N have to be reset to the first duplicate with
	 * that datum, so CDB___db_join_get will return all the dup-dups again.
	 * We could just reset them to the original cursor from j_curslist,
	 * but that would be a bit slower in the unsorted case and a LOT
	 * slower in the sorted one.
	 *
	 * j_exhausted is a list of boolean values which represent
	 * whether or not their corresponding cursors are "exhausted",
	 * i.e. whether the datum under the corresponding cursor has
	 * been found not to exist in any unreturned combinations of
	 * later secondary cursors, in which case they are ready to be
	 * incremented.
	 */

	/*
	 * We don't want to free regions whose callocs have failed.
	 * (j_curslist in particular was just used as a scan pointer into
	 * the caller's array and must not reach the err path as-is.)
	 */
	jc->j_curslist = NULL;
	jc->j_workcurs = NULL;
	jc->j_fdupcurs = NULL;
	jc->j_exhausted = NULL;

	if ((ret = CDB___os_calloc(nslots, sizeof(DBC *),
	    &jc->j_curslist)) != 0)
		goto err;
	if ((ret = CDB___os_calloc(nslots, sizeof(DBC *),
	    &jc->j_workcurs)) != 0)
		goto err;
	if ((ret = CDB___os_calloc(nslots, sizeof(DBC *),
	    &jc->j_fdupcurs)) != 0)
		goto err;
	if ((ret = CDB___os_calloc(nslots, sizeof(u_int8_t),
	    &jc->j_exhausted)) != 0)
		goto err;

	for (i = 0; curslist[i] != NULL; i++) {
		jc->j_curslist[i] = curslist[i];
		jc->j_workcurs[i] = NULL;
		jc->j_fdupcurs[i] = NULL;
		jc->j_exhausted[i] = 0;
	}

	/*
	 * We never need to reset the 0th cursor, so there's no
	 * solid reason to use workcurs[0] rather than curslist[0] in
	 * join_get.  Nonetheless, it feels cleaner to do it for symmetry,
	 * and this is the most logical place to copy it.
	 *
	 * The slot is left NULL for c_dup to fill in, exactly as
	 * CDB___db_join_get does for the other work cursors.  Allocating
	 * a DBC into the slot first would only have c_dup overwrite the
	 * pointer, losing that allocation.
	 *
	 * !!!
	 * There's no need to close the new cursor if we goto err only
	 * because this is the last thing that can fail.  Modifier of this
	 * function beware!
	 */
	if ((ret = jc->j_curslist[0]->c_dup(jc->j_curslist[0], jc->j_workcurs,
	    DB_POSITIONI)) != 0)
		goto err;

	/* Wire up the join cursor's methods and hand it back. */
	dbc->c_close = CDB___db_join_close;
	dbc->c_del = CDB___db_join_del;
	dbc->c_get = CDB___db_join_get;
	dbc->c_put = CDB___db_join_put;
	dbc->internal = jc;
	dbc->dbp = primary;
	jc->j_primary = primary;

	*dbcp = dbc;

	return (0);

err:	if (jc != NULL) {
		if (jc->j_curslist != NULL)
			CDB___os_free(jc->j_curslist, nslots * sizeof(DBC *));
		if (jc->j_workcurs != NULL)
			CDB___os_free(jc->j_workcurs, nslots * sizeof(DBC *));
		if (jc->j_fdupcurs != NULL)
			CDB___os_free(jc->j_fdupcurs, nslots * sizeof(DBC *));
		if (jc->j_exhausted != NULL)
			CDB___os_free(jc->j_exhausted,
			    nslots * sizeof(u_int8_t));
		/* The scratch key buffer belongs to jc; release it too. */
		if (jc->j_key.data != NULL)
			CDB___os_free(jc->j_key.data, 256);
		CDB___os_free(jc, sizeof(JOIN_CURSOR));
	}
	if (dbc != NULL)
		CDB___os_free(dbc, sizeof(DBC));
	return (ret);
}
/*
 * CDB___db_join_put --
 *	c_put method installed on join cursors.  Storing through a join
 *	cursor is not supported; after the panic check this always
 *	returns EINVAL.
 */
static int
CDB___db_join_put(dbc, key, data, flags)
	DBC *dbc;
	DBT *key, *data;
	u_int32_t flags;
{
	PANIC_CHECK(dbc->dbp->dbenv);

	/* None of the remaining arguments are used; quiet the compiler. */
	COMPQUIET(flags, 0);
	COMPQUIET(data, NULL);
	COMPQUIET(key, NULL);

	return (EINVAL);
}
/*
 * CDB___db_join_del --
 *	c_del method installed on join cursors.  Deleting through a join
 *	cursor is not supported; after the panic check this always
 *	returns EINVAL.
 */
static int
CDB___db_join_del(dbc, flags)
	DBC *dbc;
	u_int32_t flags;
{
	PANIC_CHECK(dbc->dbp->dbenv);

	/* flags is unused; quiet the compiler. */
	COMPQUIET(flags, 0);

	return (EINVAL);
}
/*
 * CDB___db_join_get --
 *	c_get method installed on join cursors: produce the next item
 *	of the join.
 *
 *	The only operation flag accepted is DB_JOIN_ITEM (or none), and
 *	the only other flag accepted is DB_RMW.  DB_DBT_PARTIAL on either
 *	DBT is rejected with EINVAL.
 *
 *	On success "key" holds the datum common to all secondary cursors.
 *	With DB_JOIN_ITEM that is all that is returned; otherwise the
 *	datum is looked up as a key in the primary and the result of that
 *	get is returned.  When the first secondary cursor runs out of
 *	duplicates the c_get error (DB_NOTFOUND) is passed back.
 */
static int
CDB___db_join_get(dbc, key, data, flags)
	DBC *dbc;
	DBT *key, *data;
	u_int32_t flags;
{
	DBT currkey;
	DB *dbp;
	DBC *cp;
	JOIN_CURSOR *jc;
	int ret, i, j, full_reset;
	u_int32_t operation;

	dbp = dbc->dbp;

	memset(&currkey, 0, sizeof(currkey));

	PANIC_CHECK(dbp->dbenv);

	operation = LF_ISSET(DB_OPFLAGS_MASK);
	if (operation != 0 && operation != DB_JOIN_ITEM)
		return (CDB___db_ferr(dbp->dbenv, "DBcursor->c_get", 0));

	LF_CLR(DB_OPFLAGS_MASK);
	if ((ret =
	    CDB___db_fchk(dbp->dbenv, "DBcursor->c_get", flags, DB_RMW)) != 0)
		return (ret);

	/*
	 * Partial gets on join cursors don't make much sense, and the
	 * DBT_PARTIAL flag is liable to produce some rather strange
	 * results given the weird way the DBTs are used ("key" is used as
	 * the datum in all the secondary cursors), so we simply
	 * disallow it.
	 */
	if (F_ISSET(key, DB_DBT_PARTIAL) || F_ISSET(data, DB_DBT_PARTIAL))
		return (EINVAL);

	jc = (JOIN_CURSOR *)dbc->internal;

retry:
	/*
	 * Fetch from the first secondary cursor: the current datum if we
	 * are not done with it, otherwise the next duplicate.
	 */
	ret = jc->j_workcurs[0]->c_get(jc->j_workcurs[0],
	    &jc->j_key, key, jc->j_exhausted[0] ? DB_NEXT_DUP : DB_CURRENT);

	if (ret == ENOMEM) {
		/* Scratch key buffer too small: double it and try again. */
		jc->j_key.ulen <<= 1;
		if ((ret =
		    CDB___os_realloc(jc->j_key.ulen, NULL, &jc->j_key.data)) != 0)
			goto err;
		goto retry;
	}

	/*
	 * If ret == DB_NOTFOUND, we're out of elements of the first
	 * secondary cursor.  This is how we finally finish the join
	 * if all goes well.
	 */
	if (ret != 0)
		goto err;

	/*
	 * Copy key into currkey; this is the current duplicate data
	 * value that we're interested in, which we will use for comparison
	 * purposes with c_gets on all the other secondary cursors.
	 */
	if ((ret = CDB___os_realloc(key->size, NULL, &currkey.data)) != 0)
		goto err;
	memcpy(currkey.data, key->data, key->size);
	currkey.size = key->size;

	/*
	 * If jc->j_curslist[1] == NULL, we have only one cursor in the join.
	 * Thus, we can safely increment that one cursor on each call
	 * to CDB___db_join_get, and we signal this by setting
	 * jc->j_exhausted[0] right away.
	 *
	 * Otherwise, reset jc->j_exhausted[0] to 0, so that we don't
	 * increment it until we know we're ready to.
	 */
	if (jc->j_curslist[1] == NULL)
		jc->j_exhausted[0] = 1;
	else
		jc->j_exhausted[0] = 0;

	/* We have the first element; now look for it in the other cursors. */
	for (i = 1; jc->j_curslist[i] != NULL; i++) {
		if (jc->j_workcurs[i] == NULL)
			/* If this is NULL, we need to dup curslist into it. */
			if ((ret = jc->j_curslist[i]->c_dup(
			    jc->j_curslist[i], jc->j_workcurs + i,
			    DB_POSITIONI)) != 0)
				goto err;
retry2:
		cp = jc->j_workcurs[i];

		if ((ret = CDB___db_join_getnext(cp, &jc->j_key, key, &currkey,
		    jc->j_exhausted[i])) == DB_NOTFOUND) {
			/*
			 * jc->j_workcurs[i] has no more of the datum we're
			 * interested in.  Go back one cursor and get
			 * a new dup.  We can't just move to a new
			 * element of the outer relation, because that way
			 * we might miss duplicate duplicates in cursor i-1.
			 *
			 * If this takes us back to the first cursor,
			 * -then- we can move to a new element of the outer
			 * relation.
			 */
			--i;
			jc->j_exhausted[i] = 1;

			if (i == 0) {
				for (j = 1; jc->j_workcurs[j] != NULL; j++) {
					/*
					 * We're moving to a new element of
					 * the first secondary cursor.  If
					 * that cursor is sorted, then any
					 * other sorted cursors can be safely
					 * reset to the first duplicate
					 * duplicate in the current set if we
					 * have a pointer to it (we can't just
					 * leave them be, or we'll miss
					 * duplicate duplicates in the outer
					 * relation).
					 *
					 * If the first cursor is unsorted, or
					 * if cursor j is unsorted, we can
					 * make no assumptions about what
					 * we're looking for next or where it
					 * will be, so we reset to the very
					 * beginning (setting workcurs NULL
					 * will achieve this next go-round).
					 *
					 * XXX: This is likely to break
					 * horribly if any two cursors are
					 * both sorted, but have different
					 * specified sort functions.  For
					 * now, we dismiss this as pathology
					 * and let strange things happen--we
					 * can't make rope childproof.
					 */

					/*
					 * Decide which kind of reset is
					 * needed while workcurs[j] is still
					 * open; it must not be examined once
					 * it has been closed.
					 */
					full_reset =
					    jc->j_workcurs[0]->dbp->dup_compare
					    == NULL ||
					    jc->j_workcurs[j]->dbp->dup_compare
					    == NULL ||
					    jc->j_fdupcurs[j] == NULL;

					if ((ret = jc->j_workcurs[j]->c_close(
					    jc->j_workcurs[j])) != 0)
						goto err;

					/*
					 * The slot no longer refers to a live
					 * cursor.  Clearing it is the full
					 * reset; it also keeps a failed c_dup
					 * below from leaving a closed cursor
					 * behind for CDB___db_join_close.
					 */
					jc->j_workcurs[j] = NULL;

					/*
					 * Partial reset suffices when it is
					 * safe.  The result must land in ret,
					 * which otherwise still holds the
					 * DB_NOTFOUND that brought us here.
					 */
					if (!full_reset &&
					    (ret = jc->j_fdupcurs[j]->c_dup(
					    jc->j_fdupcurs[j],
					    &jc->j_workcurs[j],
					    DB_POSITIONI)) != 0)
						goto err;

					jc->j_exhausted[j] = 0;
				}
				goto retry;
				/* NOTREACHED */
			}

			/*
			 * We're about to advance the cursor and need to
			 * reset all of the workcurs[j] where j>i, so that
			 * we don't miss any duplicate duplicates.
			 */
			for (j = i + 1;
			    jc->j_workcurs[j] != NULL;
			    j++) {
				if ((ret = jc->j_workcurs[j]->c_close(
				    jc->j_workcurs[j])) != 0)
					goto err;

				/* Closed; never leave the stale pointer. */
				jc->j_workcurs[j] = NULL;

				if (jc->j_fdupcurs[j] != NULL) {
					if ((ret = jc->j_fdupcurs[j]->c_dup(
					    jc->j_fdupcurs[j],
					    &jc->j_workcurs[j],
					    DB_POSITIONI)) != 0)
						goto err;
					jc->j_exhausted[j] = 0;
				}
			}
			goto retry2;
			/* NOTREACHED */
		}

		if (ret == ENOMEM) {
			/* Scratch key buffer too small: double and retry. */
			jc->j_key.ulen <<= 1;
			if ((ret = CDB___os_realloc(jc->j_key.ulen,
			    NULL, &jc->j_key.data)) != 0)
				goto err;
			goto retry2;
		}

		if (ret != 0)
			goto err;

		/*
		 * If we made it this far, we've found a matching
		 * datum in cursor i.  Mark the current cursor
		 * unexhausted, so we don't miss any duplicate
		 * duplicates the next go-round--unless this is the
		 * very last cursor, in which case there are none to
		 * miss, and we'll need that exhausted flag to finally
		 * get a DB_NOTFOUND and move on to the next datum in
		 * the outermost cursor.
		 */
		if (jc->j_curslist[i + 1] != NULL)
			jc->j_exhausted[i] = 0;
		else
			jc->j_exhausted[i] = 1;

		/*
		 * If jc->j_fdupcurs[i] is NULL, this is the first
		 * time we've gotten this far since the original
		 * CDB___db_join.  If jc->j_exhausted[0] == 1, it's the
		 * first time we're here since advancing cursor 0.  In
		 * either case, we have a new datum of interest, and
		 * we set jc->j_fdupcurs[i], which stores the first
		 * duplicate duplicate of the current datum.
		 */
		if (jc->j_exhausted[0] == 1 || jc->j_fdupcurs[i] == NULL) {
			if (jc->j_fdupcurs[i] != NULL)
				if ((ret = jc->j_fdupcurs[i]->c_close(
				    jc->j_fdupcurs[i])) != 0)
					goto err;
			if ((ret = cp->c_dup(cp, &jc->j_fdupcurs[i],
			    DB_POSITIONI)) != 0)
				goto err;
		}
	}

err:
	/*
	 * We're done with this; free it now, before
	 * both error and regular returns.
	 */
	if (currkey.data != NULL)
		CDB___os_free(currkey.data, 0);

	if (ret != 0)
		return (ret);

	/*
	 * ret == 0; we have a key to return.  If DB_JOIN_ITEM is
	 * set, we return it; otherwise we do the lookup in the
	 * primary and then return.
	 */
	if (operation == DB_JOIN_ITEM)
		return (0);
	else
		return ((jc->j_primary->get)(jc->j_primary,
		    jc->j_curslist[0]->txn, key, data, 0));
}
/*
 * CDB___db_join_close --
 *	c_close method installed on join cursors.  Closes every scratch
 *	cursor the join opened, then frees the join's bookkeeping arrays,
 *	its scratch key buffer, the JOIN_CURSOR and the DBC itself.  The
 *	caller's original cursors (the entries of j_curslist) are not
 *	closed here.
 *
 *	Returns 0, or the error from the last scratch cursor whose close
 *	failed; cleanup continues regardless.
 */
static int
CDB___db_join_close(dbc)
	DBC *dbc;
{
	JOIN_CURSOR *jc;
	int i, ret, t_ret;

	PANIC_CHECK(dbc->dbp->dbenv);

	jc = (JOIN_CURSOR *)dbc->internal;
	ret = t_ret = 0;

	/*
	 * Close any open scratch cursors.  j_curslist is NULL-terminated
	 * and j_workcurs/j_fdupcurs were allocated with the same number of
	 * slots, so it bounds the walk.  We cannot stop at the first NULL
	 * in j_workcurs: CDB___db_join_get may clear one slot for a full
	 * reset while a later slot still holds an open cursor.
	 *
	 * If one of them fails, there's no reason not to close everything
	 * else; we'll just return the error code of the last one to fail.
	 * There's not much the caller can do anyway, since this cursor only
	 * exists hanging off a db-internal data structure that they
	 * shouldn't be mucking with.
	 */
	for (i = 0; jc->j_curslist[i] != NULL; i++)
		if (jc->j_workcurs[i] != NULL && (t_ret =
		    jc->j_workcurs[i]->c_close(jc->j_workcurs[i])) != 0)
			ret = t_ret;

	for (i = 0; jc->j_curslist[i] != NULL; i++)
		if (jc->j_fdupcurs[i] != NULL && (t_ret =
		    jc->j_fdupcurs[i]->c_close(jc->j_fdupcurs[i])) != 0)
			ret = t_ret;

	/* Release all four per-join arrays, not just two of them. */
	CDB___os_free(jc->j_exhausted, 0);
	CDB___os_free(jc->j_curslist, 0);
	CDB___os_free(jc->j_workcurs, 0);
	CDB___os_free(jc->j_fdupcurs, 0);
	CDB___os_free(jc->j_key.data, jc->j_key.ulen);
	CDB___os_free(jc, sizeof(JOIN_CURSOR));
	CDB___os_free(dbc, sizeof(DBC));

	return (ret);
}
/*
* CDB___db_join_getnext--
* This function replaces the DBC_CONTINUE and DBC_KEYSET
* functionality inside the various cursor get routines.
*
* If exhausted == 0, we're not done with the current datum;
* return it if it matches "matching", otherwise search
* using DBC_CONTINUE (which is faster than iteratively doing
* DB_NEXT_DUP) forward until we find one that does.
*
* If exhausted == 1, we are done with the current datum, so just
* leap forward to searching NEXT_DUPs.
*
* If no matching datum exists, returns DB_NOTFOUND, else 0.
*/
static int
CDB___db_join_getnext(dbc, key, data, matching, exhausted)
	DBC *dbc;
	DBT *key, *data, *matching;
	u_int32_t exhausted;
{
	int (*compare) __P((const DBT *, const DBT *));
	int status;

	/*
	 * Compare with the database's own duplicate comparison function
	 * when one is set, and with CDB___bam_defcmp otherwise.
	 */
	compare = dbc->dbp->dup_compare;
	if (compare == NULL)
		compare = CDB___bam_defcmp;

	/* Only the two boolean states are meaningful. */
	if (exhausted != 0 && exhausted != 1)
		return (EINVAL);

	if (exhausted == 0) {
		/* Not done with the current datum: see whether it matches. */
		if ((status = dbc->c_get(dbc, key, data, DB_CURRENT)) != 0)
			return (status);
		if (compare(matching, data) == 0)
			return (0);

		/*
		 * No match, so we go on to search the later duplicates.
		 * The get above overwrote "data", so point it back at the
		 * value being searched for.  Only the pointer and size
		 * are restored; the original notes that the c_get below
		 * takes care of the data itself.
		 */
		data->data = matching->data;
		data->size = matching->size;
	}

	/* Search forward from the current position for the matching dup. */
	F_SET(dbc, DBC_CONTINUE);
	status = dbc->c_get(dbc, key, data, DB_GET_BOTH);
	F_CLR(dbc, DBC_CONTINUE);

	return (status);
}