/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1998, 1999
 *	Sleepycat Software.  All rights reserved.
 */

#include "db_config.h"

#ifndef lint
static const char sccsid[] = "@(#)db_join.c 11.6 (Sleepycat) 10/19/99";
#endif /* not lint */

#ifndef NO_SYSTEM_INCLUDES
/*
 * System headers needed by this file: <sys/types.h> for u_int32_t and
 * friends, <errno.h> for EINVAL/ENOMEM, <string.h> for memset/memcpy.
 */
#include <sys/types.h>

#include <errno.h>
#include <string.h>
#endif

#include "db_int.h"
#include "db_page.h"
#include "db_join.h"
#include "db_am.h"
#include "btree.h"

/* Methods installed on the join cursor, plus the duplicate-search helper. */
static int CDB___db_join_close __P((DBC *));
static int CDB___db_join_del __P((DBC *, u_int32_t));
static int CDB___db_join_get __P((DBC *, DBT *, DBT *, u_int32_t));
static int CDB___db_join_getnext __P((DBC *, DBT *, DBT *, DBT *, u_int32_t));
static int CDB___db_join_put __P((DBC *, DBT *, DBT *, u_int32_t));

/*
 * This is the duplicate-assisted join functionality.  Right now we're
 * going to write it such that we return one item at a time, although
 * I think we may need to optimize it to return them all at once.
 * It should be easier to get it working this way, and I believe that
 * changing it should be fairly straightforward.
 *
 * XXX
 * Right now we do not maintain the number of duplicates so we do
 * not optimize the join.  If the caller does, then best performance
 * will be achieved by putting the cursor with the smallest cardinality
 * first.
 *
 * The first cursor moves sequentially through the duplicate set while
 * the others search explicitly for the duplicate in question.
 *
 */

/*
 * CDB___db_join --
 *	This is the interface to the duplicate-assisted join functionality.
 * In the same way that cursors mark a position in a database, a cursor
 * can mark a position in a join.  While most cursors are created by the
 * cursor method of a DB, join cursors are created through an explicit
 * call to DB->join.
 *
 * The curslist is an array of existing, initialized cursors and primary
 * is the DB of the primary file.  The data item that joins all the
 * cursors in the curslist is used as the key into the primary and that
 * key and data are returned.
When no more items are left in the join * set, the c_next operation off the join cursor will return DB_NOTFOUND. * * PUBLIC: int CDB___db_join __P((DB *, DBC **, DBC **, u_int32_t)); */ int CDB___db_join(primary, curslist, dbcp, flags) DB *primary; DBC **curslist, **dbcp; u_int32_t flags; { DBC *dbc; JOIN_CURSOR *jc; int i, ret, nslots; COMPQUIET(nslots, 0); PANIC_CHECK(primary->dbenv); if ((ret = CDB___db_joinchk(primary, flags)) != 0) return (ret); if (curslist == NULL || curslist[0] == NULL) return (EINVAL); dbc = NULL; jc = NULL; if ((ret = CDB___os_calloc(1, sizeof(DBC), &dbc)) != 0) goto err; if ((ret = CDB___os_calloc(1, sizeof(JOIN_CURSOR), &jc)) != 0) goto err; if ((ret = CDB___os_malloc(256, NULL, &jc->j_key.data)) != 0) goto err; jc->j_key.ulen = 256; F_SET(&jc->j_key, DB_DBT_USERMEM); for (jc->j_curslist = curslist; *jc->j_curslist != NULL; jc->j_curslist++) ; /* * The number of cursor slots we allocate is one greater than * the number of cursors involved in the join, because the * list is NULL-terminated. */ nslots = jc->j_curslist - curslist + 1; /* * !!! -- A note on the various lists hanging off jc. * * j_curslist is the initial NULL-terminated list of cursors passed * into CDB___db_join. The original cursors are not modified; pristine * copies are required because, in databases with unsorted dups, we * must reset all of the secondary cursors after the first each * time the first one is incremented, or else we will lose data * which happen to be sorted differently in two different cursors. * * j_workcurs is where we put those copies that we're planning to * work with. They're lazily c_dup'ed from j_curslist as we need * them, and closed when the join cursor is closed or when we need * to reset them to their original values (in which case we just * c_dup afresh). * * j_fdupcurs is an array of cursors which point to the first * duplicate in the duplicate set that contains the data value * we're currently interested in. 
We need this to make * CDB___db_join_get correctly return duplicate duplicates; i.e., if a * given data value occurs twice in the set belonging to cursor #2, * and thrice in the set belonging to cursor #3, and once in all * the other cursors, successive calls to CDB___db_join_get need to * return that data item six times. To make this happen, each time * cursor N is allowed to advance to a new datum, all cursors M * such that M > N have to be reset to the first duplicate with * that datum, so CDB___db_join_get will return all the dup-dups again. * We could just reset them to the original cursor from j_curslist, * but that would be a bit slower in the unsorted case and a LOT * slower in the sorted one. * * j_exhausted is a list of boolean values which represent * whether or not their corresponding cursors are "exhausted", * i.e. whether the datum under the corresponding cursor has * been found not to exist in any unreturned combinations of * later secondary cursors, in which case they are ready to be * incremented. */ /* We don't want to free regions whose callocs have failed. */ jc->j_curslist = NULL; jc->j_workcurs = NULL; jc->j_fdupcurs = NULL; jc->j_exhausted = NULL; if ((ret = CDB___os_calloc(nslots, sizeof(DBC *), &jc->j_curslist)) != 0) goto err; if ((ret = CDB___os_calloc(nslots, sizeof(DBC *), &jc->j_workcurs)) != 0) goto err; if ((ret = CDB___os_calloc(nslots, sizeof(DBC *), &jc->j_fdupcurs)) != 0) goto err; if ((ret = CDB___os_calloc(nslots, sizeof(u_int8_t), &jc->j_exhausted)) != 0) goto err; for (i = 0; curslist[i] != NULL; i++) { jc->j_curslist[i] = curslist[i]; jc->j_workcurs[i] = NULL; jc->j_fdupcurs[i] = NULL; jc->j_exhausted[i] = 0; } /* * We never need to reset the 0th cursor, so there's no * solid reason to use workcurs[0] rather than curslist[0] in * join_get. Nonetheless, it feels cleaner to do it for symmetry, * and this is the most logical place to copy it. * * !!! 
* There's no need to close the new cursor if we goto err only * because this is the last thing that can fail. Modifier of this * function beware! */ if ((ret = CDB___os_malloc(sizeof(DBC), NULL, jc->j_workcurs)) != 0) goto err; if ((ret = jc->j_curslist[0]->c_dup(jc->j_curslist[0], jc->j_workcurs, DB_POSITIONI)) != 0) goto err; dbc->c_close = CDB___db_join_close; dbc->c_del = CDB___db_join_del; dbc->c_get = CDB___db_join_get; dbc->c_put = CDB___db_join_put; dbc->internal = jc; dbc->dbp = primary; jc->j_primary = primary; *dbcp = dbc; return (0); err: if (jc != NULL) { if (jc->j_curslist != NULL) CDB___os_free(jc->j_curslist, nslots * sizeof(DBC *)); if (jc->j_workcurs != NULL) { if (jc->j_workcurs[0] != NULL) CDB___os_free(jc->j_workcurs[0], sizeof(DBC)); CDB___os_free(jc->j_workcurs, nslots * sizeof(DBC *)); } if (jc->j_fdupcurs != NULL) CDB___os_free(jc->j_fdupcurs, nslots * sizeof(DBC *)); if (jc->j_exhausted != NULL) CDB___os_free(jc->j_exhausted, nslots * sizeof(u_int8_t)); CDB___os_free(jc, sizeof(JOIN_CURSOR)); } if (dbc != NULL) CDB___os_free(dbc, sizeof(DBC)); return (ret); } static int CDB___db_join_put(dbc, key, data, flags) DBC *dbc; DBT *key; DBT *data; u_int32_t flags; { PANIC_CHECK(dbc->dbp->dbenv); COMPQUIET(key, NULL); COMPQUIET(data, NULL); COMPQUIET(flags, 0); return (EINVAL); } static int CDB___db_join_del(dbc, flags) DBC *dbc; u_int32_t flags; { PANIC_CHECK(dbc->dbp->dbenv); COMPQUIET(flags, 0); return (EINVAL); } static int CDB___db_join_get(dbc, key, data, flags) DBC *dbc; DBT *key, *data; u_int32_t flags; { DBT currkey; DB *dbp; DBC *cp; JOIN_CURSOR *jc; int ret, i, j; u_int32_t operation; dbp = dbc->dbp; memset(&currkey, 0, sizeof(currkey)); PANIC_CHECK(dbp->dbenv); operation = LF_ISSET(DB_OPFLAGS_MASK); if (operation != 0 && operation != DB_JOIN_ITEM) return (CDB___db_ferr(dbp->dbenv, "DBcursor->c_get", 0)); LF_CLR(DB_OPFLAGS_MASK); if ((ret = CDB___db_fchk(dbp->dbenv, "DBcursor->c_get", flags, DB_RMW)) != 0) return (ret); /* * Partial 
gets on join cursors don't make much sense, and the * DBT_PARTIAL flag is liable to produce some rather strange * results given the weird way the DBTs are used ("key" is used as * the datum in all the secondary cursors), so we simply * disallow it. */ if (F_ISSET(key, DB_DBT_PARTIAL) || F_ISSET(data, DB_DBT_PARTIAL)) return (EINVAL); jc = (JOIN_CURSOR *)dbc->internal; retry: ret = jc->j_workcurs[0]->c_get(jc->j_workcurs[0], &jc->j_key, key, jc->j_exhausted[0] ? DB_NEXT_DUP : DB_CURRENT); if (ret == ENOMEM) { jc->j_key.ulen <<= 1; if ((ret = CDB___os_realloc(jc->j_key.ulen, NULL, &jc->j_key.data)) != 0) goto err; goto retry; } /* * If ret == DB_NOTFOUND, we're out of elements of the first * secondary cursor. This is how we finally finish the join * if all goes well. */ if (ret != 0) goto err; /* * Copy key into currkey; this is the current duplicate data * value that we're interested in, which we will use for comparison * purposes with c_gets on all the other secondary cursors. */ if ((ret = CDB___os_realloc(key->size, NULL, &currkey.data)) != 0) goto err; memcpy(currkey.data, key->data, key->size); currkey.size = key->size; /* * If jc->j_curslist[1] == NULL, we have only one cursor in the join. * Thus, we can safely increment that one cursor on each call * to CDB___db_join_get, and we signal this by setting jc->j_exhausted[0] * right away. * * Otherwise, reset jc->j_exhausted[0] to 0, so that we don't * increment it until we know we're ready to. */ if (jc->j_curslist[1] == NULL) jc->j_exhausted[0] = 1; else jc->j_exhausted[0] = 0; /* We have the first element; now look for it in the other cursors. */ for (i = 1; jc->j_curslist[i] != NULL; i++) { if (jc->j_workcurs[i] == NULL) /* If this is NULL, we need to dup curslist into it. 
*/ if ((ret = jc->j_curslist[i]->c_dup( jc->j_curslist[i], jc->j_workcurs + i, DB_POSITIONI)) != 0) goto err; retry2: cp = jc->j_workcurs[i]; if ((ret = CDB___db_join_getnext(cp, &jc->j_key, key, &currkey, jc->j_exhausted[i])) == DB_NOTFOUND) { /* * jc->j_workcurs[i] has no more of the datum we're * interested in. Go back one cursor and get * a new dup. We can't just move to a new * element of the outer relation, because that way * we might miss duplicate duplicates in cursor i-1. * * If this takes us back to the first cursor, * -then- we can move to a new element of the outer * relation. */ --i; jc->j_exhausted[i] = 1; if (i == 0) { for (j = 1; jc->j_workcurs[j] != NULL; j++) { /* * We're moving to a new element of * the first secondary cursor. If * that cursor is sorted, then any * other sorted cursors can be safely * reset to the first duplicate * duplicate in the current set if we * have a pointer to it (we can't just * leave them be, or we'll miss * duplicate duplicates in the outer * relation). * * If the first cursor is unsorted, or * if cursor j is unsorted, we can * make no assumptions about what * we're looking for next or where it * will be, so we reset to the very * beginning (setting workcurs NULL * will achieve this next go-round). * * XXX: This is likely to break * horribly if any two cursors are * both sorted, but have different * specified sort functions. For, * now, we dismiss this as pathology * and let strange things happen--we * can't make rope childproof. */ if ((ret = jc->j_workcurs[j]->c_close( jc->j_workcurs[j])) != 0) goto err; if ((jc->j_workcurs[0]->dbp->dup_compare == NULL) || (jc->j_workcurs[j]->dbp->dup_compare == NULL) || jc->j_fdupcurs[j] == NULL) /* * Unsafe conditions; * reset fully. */ jc->j_workcurs[j] = NULL; else /* Partial reset suffices. 
*/ if ((jc->j_fdupcurs[j]->c_dup( jc->j_fdupcurs[j], &jc->j_workcurs[j], DB_POSITIONI)) != 0) goto err; jc->j_exhausted[j] = 0; } goto retry; /* NOTREACHED */ } /* * We're about to advance the cursor and need to * reset all of the workcurs[j] where j>i, so that * we don't miss any duplicate duplicates. */ for (j = i + 1; jc->j_workcurs[j] != NULL; j++) { if ((ret = jc->j_workcurs[j]->c_close( jc->j_workcurs[j])) != 0) goto err; if (jc->j_fdupcurs[j] != NULL) { if ((ret = jc->j_fdupcurs[j]->c_dup( jc->j_fdupcurs[j], &jc->j_workcurs[j], DB_POSITIONI)) != 0) goto err; jc->j_exhausted[j] = 0; } else jc->j_workcurs[j] = NULL; } goto retry2; /* NOTREACHED */ } if (ret == ENOMEM) { jc->j_key.ulen <<= 1; if ((ret = CDB___os_realloc(jc->j_key.ulen, NULL, &jc->j_key.data)) != 0) goto err; goto retry2; } if (ret != 0) goto err; /* * If we made it this far, we've found a matching * datum in cursor i. Mark the current cursor * unexhausted, so we don't miss any duplicate * duplicates the next go-round--unless this is the * very last cursor, in which case there are none to * miss, and we'll need that exhausted flag to finally * get a DB_NOTFOUND and move on to the next datum in * the outermost cursor. */ if (jc->j_curslist[i + 1] != NULL) jc->j_exhausted[i] = 0; else jc->j_exhausted[i] = 1; /* * If jc->j_fdupcurs[i] is NULL, this is the first * time we've gotten this far since the original * CDB___db_join. If jc->j_exhausted[0] == 1, it's the * first time we're here since advancing cursor 0. In * either case, we have a new datum of interest, and * we set jc->j_fdupcurs[i], which stores the first * duplicate duplicate of the current datum. */ if (jc->j_exhausted[0] == 1 || jc->j_fdupcurs[i] == NULL) { if (jc->j_fdupcurs[i] != NULL) if ((ret = jc->j_fdupcurs[i]->c_close( jc->j_fdupcurs[i])) != 0) goto err; if ((ret = cp->c_dup(cp, &jc->j_fdupcurs[i], DB_POSITIONI)) != 0) goto err; } } err: /* * We're done with this; free it now, before * both error and regular returns. 
*/ if (currkey.data != NULL) CDB___os_free(currkey.data, 0); if (ret != 0) return (ret); /* * ret == 0; we have a key to return. If DB_JOIN_ITEM is * set, we return it; otherwise we do the lookup in the * primary and then return. */ if (operation == DB_JOIN_ITEM) return (0); else return ((jc->j_primary->get)(jc->j_primary, jc->j_curslist[0]->txn, key, data, 0)); } static int CDB___db_join_close(dbc) DBC *dbc; { JOIN_CURSOR *jc; int i, ret, t_ret; PANIC_CHECK(dbc->dbp->dbenv); jc = (JOIN_CURSOR *)dbc->internal; ret = t_ret = 0; /* * Close any open scratch cursors. In each case, there may * not be as many outstanding as there are cursors in * curslist, but the first NULL we hit will be after the last * of whatever's there. If one of them fails, there's no * reason not to close everything else; we'll just return the * error code of the last one to fail. There's not much the * caller can do anyway, since this cursor only exists hanging * off a db-internal data structure that they shouldn't be * mucking with. */ for (i = 0; jc->j_workcurs[i] != NULL; i++) if((t_ret = jc->j_workcurs[i]->c_close(jc->j_workcurs[i])) != 0) ret = t_ret; for (i = 0; jc->j_fdupcurs[i] != NULL; i++) if((t_ret = jc->j_fdupcurs[i]->c_close(jc->j_fdupcurs[i])) != 0) ret = t_ret; CDB___os_free(jc->j_exhausted, 0); CDB___os_free(jc->j_curslist, 0); CDB___os_free(jc->j_key.data, jc->j_key.ulen); CDB___os_free(jc, sizeof(JOIN_CURSOR)); CDB___os_free(dbc, sizeof(DBC)); return (ret); } /* * CDB___db_join_getnext-- * This function replaces the DBC_CONTINUE and DBC_KEYSET * functionality inside the various cursor get routines. * * If exhausted == 0, we're not done with the current datum; * return it if it matches "matching", otherwise search * using DBC_CONTINUE (which is faster than iteratively doing * DB_NEXT_DUP) forward until we find one that does. * * If exhausted == 1, we are done with the current datum, so just * leap forward to searching NEXT_DUPs. 
* * If no matching datum exists, returns DB_NOTFOUND, else 0. */ static int CDB___db_join_getnext(dbc, key, data, matching, exhausted) DBC *dbc; DBT *key, *data, *matching; u_int32_t exhausted; { int ret, cmp; DB *dbp; int (*func) __P((const DBT *, const DBT *)); dbp = dbc->dbp; func = (dbp->dup_compare == NULL) ? CDB___bam_defcmp : dbp->dup_compare; switch (exhausted) { case 0: if ((ret = dbc->c_get(dbc, key, data, DB_CURRENT)) != 0) break; cmp = func(matching, data); if (cmp == 0) return (0); /* * Didn't match--we want to fall through and search future * dups. But we've just stepped on the value of data, * so we copy matching back into it. * * We don't have to copy the data itself, because * the ensuing c_get call will take care of things for us. */ data->data = matching->data; data->size = matching->size; /* FALLTHROUGH */ case 1: F_SET(dbc, DBC_CONTINUE); ret = dbc->c_get(dbc, key, data, DB_GET_BOTH); F_CLR(dbc, DBC_CONTINUE); break; default: ret = EINVAL; break; } return (ret); }