/*- * See the file LICENSE for redistribution information. * * Copyright (c) 1996, 1997, 1998, 1999 * Sleepycat Software. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994, 1995, 1996 * Keith Bostic. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include "db_config.h" #ifndef lint static const char sccsid[] = "@(#)db.c 11.31 (Sleepycat) 11/12/99"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include #include #include #include #include #endif #include "db_int.h" #include "db_page.h" #include "db_shash.h" #include "db_swap.h" #include "btree.h" #include "db_am.h" #include "hash.h" #include "lock.h" #include "log.h" #include "mp.h" #include "qam.h" static int CDB___db_dbopen __P((DB *, const char *, u_int32_t, int, db_pgno_t)); static int CDB___db_dbenv_setup __P((DB *, const char *, u_int32_t)); static int CDB___db_file_setup __P((DB *, const char *, u_int32_t, int, db_pgno_t, int *)); static int CDB___db_master_open __P((DB_ENV *, DB_TXN *, const char *, u_int32_t, int, DB **)); static int CDB___db_master_update __P((DB *, const char *, u_int32_t, db_pgno_t *, int, u_int32_t)); static int CDB___db_metabegin __P((DB *, DB_LOCK *)); static int CDB___db_metaend __P((DB *, DB_LOCK *, int, int (*)(DB *, void *), void *)); static int CDB___db_refresh __P((DB *)); static int CDB___db_remove_callback __P((DB *, void *)); static int CDB___db_set_pgsize __P((DB *, DB_FH *, char *)); static int CDB___db_subdb_remove __P((DB *, const char *, const char *)); #if CONFIG_TEST static void __db_makecopy __P((const char *, const char *)); #endif /* * CDB___db_open -- * Main library interface to the DB access methods. * * PUBLIC: int CDB___db_open __P((DB *, * PUBLIC: const char *, const char *, DBTYPE, u_int32_t, int)); */ int CDB___db_open(dbp, name, subdb, type, flags, mode) DB *dbp; const char *name, *subdb; DBTYPE type; u_int32_t flags; int mode; { DB_ENV *dbenv; DB_LOCK open_lock; DB *mdbp; db_pgno_t meta_pgno; u_int32_t ok_flags; int ret, t_ret; dbenv = dbp->dbenv; mdbp = NULL; /* Validate arguments. 
*/ #define OKFLAGS \ (DB_CREATE | DB_EXCL | DB_FCNTL_LOCKING | \ DB_NOMMAP | DB_RDONLY | DB_THREAD | DB_TRUNCATE | DB_COMPRESS) if ((ret = CDB___db_fchk(dbenv, "DB->open", flags, OKFLAGS)) != 0) return (ret); if (LF_ISSET(DB_EXCL) && !LF_ISSET(DB_CREATE)) return (CDB___db_ferr(dbenv, "DB->open", 1)); if (LF_ISSET(DB_RDONLY) && LF_ISSET(DB_CREATE)) return (CDB___db_ferr(dbenv, "DB->open", 1)); /* * Transparent I/O compression does not work on mmap'd files. */ if(LF_ISSET(DB_COMPRESS)) LF_SET(DB_NOMMAP); switch (type) { case DB_UNKNOWN: ok_flags = 0; break; case DB_BTREE: ok_flags = DB_OK_BTREE; break; case DB_HASH: ok_flags = DB_OK_HASH; break; case DB_QUEUE: ok_flags = DB_OK_QUEUE; break; case DB_RECNO: ok_flags = DB_OK_RECNO; break; default: CDB___db_err(dbp->dbenv, "unknown type: %lu", type); return (EINVAL); } if (ok_flags) DB_ILLEGAL_METHOD(dbp, ok_flags); /* The environment may have been created, but never opened. */ if (!F_ISSET(dbenv, DB_ENV_DBLOCAL | DB_ENV_OPEN_CALLED)) { CDB___db_err(dbenv, "environment not yet opened"); return (EINVAL); } /* * Historically, you could pass in an environment that didn't have a * mpool, and DB would create a private one behind the scenes. This * no longer works. */ if (!F_ISSET(dbenv, DB_ENV_DBLOCAL) && dbenv->mp_handle == NULL) { CDB___db_err(dbenv, "environment did not include a memory pool."); return (EINVAL); } /* * You can't specify threads during DB->open if subsystems in the * environment weren't configured with them. */ if (LF_ISSET(DB_THREAD) && !F_ISSET(dbenv, DB_ENV_DBLOCAL | DB_ENV_THREAD)) { CDB___db_err(dbenv, "environment not created using DB_THREAD"); return (EINVAL); } /* DB_TRUNCATE is not transaction recoverable. */ if (LF_ISSET(DB_TRUNCATE) && F_ISSET(dbenv, DB_ENV_TXN)) { CDB___db_err(dbenv, "DB_TRUNCATE illegal in a transaction protected environment"); return (EINVAL); } /* Subdatabase checks. */ if (subdb != NULL) { /* Subdatabases must be created in named files. 
*/ if (name == NULL) { CDB___db_err(dbenv, "subdatabases cannot be created in temporary files"); return (EINVAL); } /* QAM can't be done as a subdatabase. */ if (type == DB_QUEUE) { CDB___db_err(dbenv, "subdatabases cannot be queue files"); return (EINVAL); } } /* Convert any DB->open flags. */ if (LF_ISSET(DB_RDONLY)) F_SET(dbp, DB_AM_RDONLY); if (LF_ISSET(DB_COMPRESS)) F_SET(dbp, DB_AM_CMPR); /* Fill in the type. */ dbp->type = type; /* * If we're potentially creating a database, wrap the open inside of * a transaction. */ if (F_ISSET(dbenv, DB_ENV_TXN) && LF_ISSET(DB_CREATE)) if ((ret = CDB___db_metabegin(dbp, &open_lock)) != 0) return (ret); /* * If we're opening a subdatabase, we have to open (and potentially * create) the main database, and then get (and potentially store) * our base page number in that database. Then, we can finally open * the subdatabase. */ if (subdb == NULL) meta_pgno = PGNO_BASE_MD; else { /* * Open the master database, optionally updating it, and * retrieving the metadata page number. */ if ((ret = CDB___db_master_open(dbp->dbenv, dbp->open_txn, name, flags, mode, &mdbp)) != 0) goto err; /* Copy the page size and file id from the master. */ dbp->pgsize = mdbp->pgsize; F_SET(dbp, DB_AM_SUBDB); memcpy(dbp->fileid, mdbp->fileid, DB_FILE_ID_LEN); if ((ret = CDB___db_master_update(mdbp, subdb, type, &meta_pgno, 0, flags)) != 0) goto err; /* * Clear the exclusive open and truncation flags, they only * apply to the open of the master database. */ LF_CLR(DB_EXCL | DB_TRUNCATE); } ret = CDB___db_dbopen(dbp, name, flags, mode, meta_pgno); /* * You can open the database that describes the subdatabases in the * rest of the file read-only. The content of each key's data is * unspecified and applications should never be adding new records * or updating existing records. However, during recovery, we need * to open these databases R/W so we can redo/undo changes in them. 
*/ if (subdb == NULL && (dbenv->lg_handle == NULL || !F_ISSET((DB_LOG *)(dbenv->lg_handle), DBC_RECOVER)) && !LF_ISSET(DB_RDONLY) && F_ISSET(dbp, DB_AM_SUBDB)) { CDB___db_err(dbenv, "databases containing subdatabase lists may only be opened read-only"); ret = EINVAL; goto err; } err: /* * End any transaction, committing if we were successful, aborting * otherwise. */ if (F_ISSET(dbenv, DB_ENV_TXN) && LF_ISSET(DB_CREATE)) if ((t_ret = CDB___db_metaend(dbp, &open_lock, ret == 0, NULL, NULL)) != 0 && ret == 0) ret = t_ret; /* If we were successful, don't discard the file on close. */ if (ret == 0) F_CLR(dbp, DB_AM_DISCARD); /* If we were unsuccessful, destroy the DB handle. */ if (ret != 0) CDB___db_refresh(dbp); if (mdbp != NULL) { /* If we were successful, don't discard the file on close. */ if (ret == 0) F_CLR(mdbp, DB_AM_DISCARD); if ((t_ret = mdbp->close(mdbp, 0)) != 0 && ret == 0) ret = t_ret; } return (ret); } /* * CDB___db_dbopen -- * Open a database. */ static int CDB___db_dbopen(dbp, name, flags, mode, meta_pgno) DB *dbp; const char *name; u_int32_t flags; int mode; db_pgno_t meta_pgno; { DB_ENV *dbenv; int ret; int zero_length; dbenv = dbp->dbenv; /* Set up the underlying file. */ if ((ret = CDB___db_file_setup(dbp, name, flags, mode, meta_pgno, &zero_length)) != 0) return (ret); /* Set up the underlying environment. */ if ((ret = CDB___db_dbenv_setup(dbp, name, flags)) != 0) return (ret); /* * Do access method specific initialization. * * !!! * Set the open flag. (The underlying access method open functions * may want to do things like acquire cursors, so the open flag has * to be set before calling them.) 
*/ F_SET(dbp, DB_OPEN_CALLED); if (zero_length) return (0); switch (dbp->type) { case DB_BTREE: ret = CDB___bam_open(dbp, name, meta_pgno); break; case DB_HASH: ret = CDB___ham_open(dbp, name, meta_pgno); break; case DB_RECNO: ret = CDB___ram_open(dbp, name, meta_pgno); break; case DB_QUEUE: ret = CDB___qam_open(dbp, name, meta_pgno); break; case DB_UNKNOWN: ret = EINVAL; /* Shouldn't be possible. */ break; } return (ret); } /* * CDB___db_master_open -- * Open up a handle on a master database. */ static int CDB___db_master_open(dbenv, txn, name, flags, mode, dbpp) DB_ENV *dbenv; DB_TXN *txn; const char *name; u_int32_t flags; int mode; DB **dbpp; { DB *dbp; int ret; /* * Open up a handle on the main database. */ if ((ret = CDB_db_create( &dbp, F_ISSET(dbenv, DB_ENV_DBLOCAL) ? NULL : dbenv, 0)) != 0) return (ret); dbp->open_txn = txn; /* * It's always a btree; flag that we're creating a database with * subdatabases. */ dbp->type = DB_BTREE; F_SET(dbp, DB_AM_SUBDB); ret = CDB___db_dbopen(dbp, name, flags, mode, PGNO_BASE_MD); *dbpp = dbp; return (ret); } /* * CDB___db_master_update -- * Add/Remove a subdatabase from a master database. */ static int CDB___db_master_update(mdbp, subdb, type, meta_pgnop, is_remove, flags) DB *mdbp; const char *subdb; u_int32_t type; db_pgno_t *meta_pgnop; /* !NULL if creating/reading. */ int is_remove; u_int32_t flags; { DBC *dbc; DBT key, data; PAGE *p; int ret, t_ret; dbc = NULL; p = NULL; /* Open up a cursor. */ if ((ret = mdbp->cursor(mdbp, mdbp->open_txn, &dbc, 0)) != 0) goto err; /* * Try to point the cursor at the record. * * If we're removing or potentially creating an entry, lock the page * with DB_RMW. * * !!! * We don't include the name's nul termination in the database. */ memset(&key, 0, sizeof(key)); memset(&data, 0, sizeof(data)); key.data = (char *)subdb; key.size = strlen(subdb); ret = dbc->c_get(dbc, &key, &data, DB_SET | (meta_pgnop == NULL || (F_ISSET( mdbp->dbenv, DB_ENV_LOCKING) && LF_ISSET(DB_CREATE)) ? 
DB_RMW : 0)); if (is_remove) { /* We should have found something if we're removing it. */ if (ret != 0) goto err; memcpy(meta_pgnop, data.data, sizeof(db_pgno_t)); /* Delete the subdatabase entry. */ if ((ret = dbc->c_del(dbc, 0)) != 0) goto err; if ((ret = CDB_memp_fget(mdbp->mpf, meta_pgnop, 0, &p)) != 0) goto err; /* Free and put the page. */ if ((ret = CDB___db_free(dbc, p)) != 0) goto err; p = NULL; } else { /* * Get the subdatabase information. If it already exists, * copy out the page number and we're done. */ switch (ret) { case 0: memcpy(meta_pgnop, data.data, sizeof(db_pgno_t)); goto done; case DB_NOTFOUND: if (LF_ISSET(DB_CREATE)) break; ret = ENOENT; goto err; default: goto err; } if ((ret = CDB___db_new(dbc, type == DB_HASH ? P_HASHMETA : P_BTREEMETA, &p)) != 0) goto err; data.data = &PGNO(p); data.size = sizeof(db_pgno_t); if ((ret = dbc->c_put(dbc, &key, &data, DB_KEYLAST)) != 0) goto err; *meta_pgnop = PGNO(p); } err: done: /* * If we allocated a page: if we're successful, mark the page dirty * and return it to the cache, otherwise, discard/free it. */ if (p != NULL) { if (ret == 0) { if ((t_ret = CDB_memp_fput(mdbp->mpf, p, DB_MPOOL_DIRTY)) != 0) ret = t_ret; /* * Since we cannot close this file until after * transaction commit, we need to sync the dirty * pages, because we'll read these directly from * disk to open. */ if ((t_ret = mdbp->sync(mdbp, 0)) != 0 && ret == 0) ret = t_ret; } else (void)CDB___db_free(dbc, p); } /* Discard the cursor. */ if (dbc != NULL && (t_ret = dbc->c_close(dbc)) != 0 && ret == 0) ret = t_ret; return (ret); } /* * CDB___db_dbenv_setup -- * Set up the underlying environment during a db_open. */ static int CDB___db_dbenv_setup(dbp, name, flags) DB *dbp; const char *name; u_int32_t flags; { DB_ENV *dbenv; DBT pgcookie; DB_MPOOL_FINFO finfo; DB_PGINFO pginfo; int ret; dbenv = dbp->dbenv; /* If the environment is local, it's time to create it. 
*/ if (F_ISSET(dbenv, DB_ENV_DBLOCAL)) { /* Make sure we have at least DB_MINCACHE pages in our cache. */ if (dbenv->mp_gbytes == 0 && dbenv->mp_bytes < dbp->pgsize * DB_MINPAGECACHE && (ret = dbenv->set_cachesize( dbenv, 0, dbp->pgsize * DB_MINPAGECACHE, 0)) != 0) return (ret); if ((ret = dbenv->open(dbenv, NULL, NULL, DB_CREATE | DB_INIT_MPOOL | DB_PRIVATE | LF_ISSET(DB_THREAD), 0)) != 0) return (ret); } /* Register DB's pgin/pgout functions. */ if ((ret = CDB_memp_register(dbenv, DB_FTYPE_SET, CDB___db_pgin, CDB___db_pgout)) != 0) return (ret); /* * Open a backing file in the memory pool. * * If we need to pre- or post-process a file's pages on I/O, set the * file type. If it's a hash file, always call the pgin and pgout * routines. This means that hash files can never be mapped into * process memory. If it's a btree file and requires swapping, we * need to page the file in and out. This has to be right -- we can't * mmap files that are being paged in and out. */ memset(&finfo, 0, sizeof(finfo)); switch (dbp->type) { case DB_BTREE: case DB_RECNO: finfo.ftype = F_ISSET(dbp, DB_AM_SWAP) ? DB_FTYPE_SET : DB_FTYPE_NOTSET; finfo.clear_len = DB_PAGE_DB_LEN; break; case DB_HASH: finfo.ftype = DB_FTYPE_SET; finfo.clear_len = DB_PAGE_DB_LEN; break; case DB_QUEUE: finfo.ftype = F_ISSET(dbp, DB_AM_SWAP) ? DB_FTYPE_SET : DB_FTYPE_NOTSET; finfo.clear_len = DB_PAGE_QUEUE_LEN; break; case DB_UNKNOWN: return (EINVAL); /* Shouldn't be possible. */ } /* * Better compression is achieved if the page does not contain random data. */ if(F_ISSET(dbp, DB_AM_CMPR)) finfo.clear_len = 0; finfo.pgcookie = &pgcookie; finfo.fileid = dbp->fileid; finfo.lsn_offset = 0; pginfo.db_pagesize = dbp->pgsize; /* * Forbiding byte swap when compression is enabled * makes things simpler for the compression. 
*/ if(F_ISSET(dbp, DB_AM_SWAP) && LF_ISSET(DB_COMPRESS)) return (EINVAL); pginfo.needswap = F_ISSET(dbp, DB_AM_SWAP); pgcookie.data = &pginfo; pgcookie.size = sizeof(DB_PGINFO); if ((ret = CDB_memp_fopen(dbenv, name, LF_ISSET(DB_RDONLY | DB_NOMMAP | DB_COMPRESS), 0666, dbp->pgsize, &finfo, &dbp->mpf)) != 0) return (ret); /* * We may need a per-thread mutex. Allocate it from the environment * region, there's supposed to be extra space there for that purpose. */ if (LF_ISSET(DB_THREAD)) { if ((ret = CDB___db_mutex_alloc( dbenv, dbenv->reginfo, (MUTEX **)&dbp->mutexp)) != 0) return (ret); if ((ret = __db_mutex_init( dbenv, dbp->mutexp, 0, MUTEX_THREAD)) != 0) return (ret); } /* Get a log file id. */ if (F_ISSET(dbenv, DB_ENV_LOGGING) && #if !defined(DEBUG_ROP) !F_ISSET(dbp, DB_AM_RDONLY) && #endif (ret = CDB_log_register(dbenv, dbp, name, &dbp->log_fileid)) != 0) return (ret); return (0); } /* * CDB___db_file_setup -- * Setup the file or in-memory data. * Read the database metadata and resolve it with our arguments. */ static int CDB___db_file_setup(dbp, name, flags, mode, meta_pgno, zerop) DB *dbp; const char *name; u_int32_t flags; int mode; db_pgno_t meta_pgno; int *zerop; { DBT namedbt; DB_ENV *dbenv; DB_FH *fhp, fh; DB_LSN lsn; DB_TXN *txn; ssize_t nr; u_int32_t magic, oflags; int ret, retry_cnt, t_ret; char *real_name, mbuf[256]; #define IS_SUBDB_SETUP (meta_pgno != PGNO_BASE_MD) dbenv = dbp->dbenv; txn = NULL; *zerop = 0; /* * If we open a file handle and our caller is doing fcntl(2) locking, * we can't close it because that would discard the caller's lock. * Save it until we close the DB handle. */ if (LF_ISSET(DB_FCNTL_LOCKING)) { if ((ret = CDB___os_malloc(sizeof(*fhp), NULL, &fhp)) != 0) return (ret); } else fhp = &fh; F_CLR(fhp, DB_FH_VALID); /* * If the file is in-memory, set up is simple. Otherwise, do the * hard work of opening and reading the file. 
* * If we have a file name, try and read the first page, figure out * what type of file it is, and initialize everything we can based * on that file's meta-data page. * * !!! * There's a reason we don't push this code down into the buffer cache. * The problem is that there's no information external to the file that * we can use as a unique ID. UNIX has dev/inode pairs, but they are * not necessarily unique after reboot, if the file was mounted via NFS. * Windows has similar problems, as the FAT filesystem doesn't maintain * dev/inode numbers across reboot. So, we must get something from the * file we can use to ensure that, even after a reboot, the file we're * joining in the cache is the right file for us to join. The solution * we use is to maintain a file ID that's stored in the database, and * that's why we have to open and read the file before calling into the * buffer cache. * * The secondary reason is that there's additional information that * we want to have before instantiating a file in the buffer cache: * the page size, file type (btree/hash), if swapping is required, * and flags (DB_RDONLY, DB_CREATE, DB_TRUNCATE). We could handle * needing this information by allowing it to be set for a file in * the buffer cache even after the file has been opened, and, of * course, supporting the ability to flush a file from the cache as * necessary, e.g., if we guessed wrongly about the page size. Given * that we have to read the file anyway to get the file ID, we might * as well get the rest, too. * * Get the real file name. */ if (name == NULL) { F_SET(dbp, DB_AM_INMEM); if (dbp->type == DB_UNKNOWN) { CDB___db_err(dbenv, "DBTYPE of unknown without existing file"); return (EINVAL); } real_name = NULL; /* * If the file is a temporary file and we're doing locking, * then we have to create a unique file ID. 
We can't use our * normal dev/inode pair (or whatever this OS uses in place of * dev/inode pairs) because no backing file will be created * until the mpool cache is filled forcing the buffers to disk. * Grab a random locker ID to use as a file ID. The created * ID must never match a potential real file ID -- we know it * won't because real file IDs contain a time stamp after the * dev/inode pair, and we're simply storing a 4-byte value. * * !!! * Store the locker in the file id structure -- we can get it * from there as necessary, and it saves having two copies. */ if (F_ISSET(dbenv, DB_ENV_LOCKING | DB_ENV_CDB) && (ret = CDB_lock_id(dbenv, (u_int32_t *)dbp->fileid)) != 0) return (ret); return (0); } /* Get the real backing file name. */ if ((ret = CDB___db_appname(dbenv, DB_APP_DATA, NULL, name, 0, NULL, &real_name)) != 0) return (ret); /* * Open the backing file. We need to make sure that multiple processes * attempting to create the file at the same time are properly ordered * so that only one of them creates the "unique" file ID, so we open it * O_EXCL and O_CREAT so two simultaneous attempts to create the region * will return failure in one of the attempts. If we're the one that * fails, simply retry without the O_CREAT flag, which will require the * meta-data page exist. */ /* Fill in the default file mode. */ if (mode == 0) mode = CDB___db_omode("rwrw--"); oflags = 0; if (LF_ISSET(DB_RDONLY)) oflags |= DB_OSO_RDONLY; if (LF_ISSET(DB_TRUNCATE)) oflags |= DB_OSO_TRUNC; retry_cnt = 0; open_retry: *zerop = 0; ret = 0; if (LF_ISSET(DB_CREATE)) { if (dbp->open_txn != NULL) { /* * Start a child transaction to wrap this individual * create. 
*/ if ((ret = CDB_txn_begin(dbenv, dbp->open_txn, &txn, 0)) != 0) goto err_msg; memset(&namedbt, 0, sizeof(namedbt)); namedbt.data = (char *)name; namedbt.size = strlen(name) + 1; if ((ret = CDB___crdel_fileopen_log(dbenv, txn, &lsn, DB_FLUSH, &namedbt, mode)) != 0) goto err_msg; } DB_TEST_RECOVERY(dbp, DB_TEST_PREOPEN, ret, name); if ((ret = CDB___os_open(real_name, oflags | DB_OSO_CREATE | DB_OSO_EXCL, mode, fhp)) == 0) { DB_TEST_RECOVERY(dbp, DB_TEST_POSTOPEN, ret, name); /* Commit the file create. */ if (dbp->open_txn != NULL) { if ((ret = CDB_txn_commit(txn, DB_TXN_SYNC)) != 0) goto err_msg; txn = NULL; } /* * We created the file. This means that if we later * fail, we need to delete the file and if we're going * to do that, we need to trash any pages in the * memory pool. Since we only know here that we * created the file, we're going to set the flag here * and clear it later if we commit successfully. */ F_SET(dbp, DB_AM_DISCARD); } else { /* * Abort the file create. If the abort fails, report * the error returned by CDB_txn_abort(), rather than the * open error, for no particular reason. */ if (dbp->open_txn != NULL) { if ((t_ret = CDB_txn_abort(txn)) != 0) { ret = t_ret; goto err_msg; } txn = NULL; } /* * If we were not doing an exclusive open, try again * without the create flag. */ if (ret == EEXIST && !LF_ISSET(DB_EXCL)) { LF_CLR(DB_CREATE); DB_TEST_RECOVERY(dbp, DB_TEST_POSTOPEN, ret, name); goto open_retry; } } } else ret = CDB___os_open(real_name, oflags, mode, fhp); /* * Be quiet if we couldn't open the file because it didn't exist, * the customers don't like those messages appearing in the logs. * Otherwise, complain loudly. */ if (ret != 0) { if (ret == ENOENT) goto err; goto err_msg; } /* Set the page size if we don't have one yet. 
*/ if (dbp->pgsize == 0 && (ret = CDB___db_set_pgsize(dbp, fhp, real_name)) != 0) goto err; /* * Seek to the metadata offset; if it's a master database open or a * database without subdatabases, we're seeking to 0, but that's OK. */ if ((ret = CDB___os_seek(fhp, dbp->pgsize, meta_pgno, 0, 0, DB_OS_SEEK_SET)) != 0) goto err_msg; /* * Read the metadata page. We read 256 bytes, which is larger than * any access method's metadata page and smaller than any disk sector. */ if ((ret = CDB___os_read(fhp, mbuf, sizeof(mbuf), &nr)) != 0) goto err_msg; if (nr == sizeof(mbuf)) { /* * Figure out what access method we're dealing with, and then * call access method specific code to check error conditions * based on conflicts between the found file and application * arguments. A found file overrides some user information -- * we don't consider it an error, for example, if the user set * an expected byte order and the found file doesn't match it. */ F_CLR(dbp, DB_AM_SWAP); magic = ((DBMETA *)mbuf)->magic; swap_retry: switch (magic) { case DB_BTREEMAGIC: if ((ret = CDB___bam_metachk(dbp, name, (BTMETA *)mbuf)) != 0) goto err; break; case DB_HASHMAGIC: if ((ret = CDB___ham_metachk(dbp, name, (HMETA *)mbuf)) != 0) goto err; break; case DB_QAMMAGIC: if ((ret = CDB___qam_metachk(dbp, name, (QMETA *)mbuf)) != 0) goto err; break; case 0: /* * There are two ways we can get a 0 magic number. * If we're creating a subdatabase, then the magic * number will be 0. We allocate a page as part of * finding out what the base page number will be for * the new subdatabase, but it's not initialized in * any way. * * The second case happens if we are in recovery * and we are going to recreate a database, it's * possible that it's page was created (on systems * where pages must be created explicitly to avoid * holes in files) but is still 0. 
*/ if (IS_SUBDB_SETUP) /* Case 1 */ goto empty; if (!LF_ISSET(DB_CREATE | DB_TRUNCATE)) { /* Case 2 */ *zerop = 1; goto empty; } goto bad_format; default: if (F_ISSET(dbp, DB_AM_SWAP)) goto bad_format; M_32_SWAP(magic); F_SET(dbp, DB_AM_SWAP); goto swap_retry; } } else { /* * Only newly created files are permitted to fail magic * number tests. */ if (nr != 0 || IS_SUBDB_SETUP) goto bad_format; /* Let the caller know that we had a 0-length file. */ if (!LF_ISSET(DB_CREATE | DB_TRUNCATE)) *zerop = 1; /* * The only way we can reach here with the DB_CREATE flag set * is if we created the file. If that's not the case, then * either (a) someone else created the file but has not yet * written out the metadata page, or (b) we truncated the file * (DB_TRUNCATE) leaving it zero-length. In the case of (a), * we want to sleep and give the file creator time to write * the metadata page. In the case of (b), we want to continue. * * !!! * There's a race in the case of two processes opening the file * with the DB_TRUNCATE flag set at roughly the same time, and * they could theoretically hurt each other. Sure hope that's * unlikely. */ if (!LF_ISSET(DB_CREATE | DB_TRUNCATE) && (dbenv->lg_handle == NULL || !F_ISSET((DB_LOG *)dbenv->lg_handle, DBC_RECOVER))) { if (retry_cnt++ < 3) { CDB___os_sleep(1, 0); goto open_retry; } bad_format: CDB___db_err(dbenv, "%s: unexpected file type or format", name); ret = EINVAL; goto err; } if (dbp->type == DB_UNKNOWN) { CDB___db_err(dbenv, "%s: DB_UNKNOWN type specified with empty file", name); ret = EINVAL; goto err; } empty: /* * The file is empty, and that's OK. If it's not a subdatabase, * though, we do need to generate a unique file ID for it. The * unique file ID includes a timestampe so that we can't collide * with any other files, even when the file IDs (dev/inode pair) * are reused. 
*/ if (*zerop == 1) memset(dbp->fileid, 0, DB_FILE_ID_LEN); else if (!IS_SUBDB_SETUP && (ret = CDB___os_fileid(dbenv, real_name, 1, dbp->fileid)) != 0) goto err_msg; } if (0) { err_msg: CDB___db_err(dbenv, "%s: %s", name, CDB_db_strerror(ret)); } /* * Abort any running transaction -- it can only exist if something * went wrong. */ err: if (txn != NULL) (void)CDB_txn_abort(txn); DB_TEST_RECOVERY_LABEL /* * If we opened a file handle and our caller is doing fcntl(2) locking, * then we can't close it because that would discard the caller's lock. * Otherwise, close the handle. */ if (F_ISSET(fhp, DB_FH_VALID)) { if (ret == 0 && LF_ISSET(DB_FCNTL_LOCKING)) dbp->saved_open_fhp = fhp; else if ((t_ret = CDB___os_closehandle(fhp)) != 0 && ret == 0) ret = t_ret; } if (real_name != NULL) CDB___os_freestr(real_name); return (ret); } /* * CDB___db_set_pgsize -- * Set the page size based on file information. */ static int CDB___db_set_pgsize(dbp, fhp, name) DB *dbp; DB_FH *fhp; char *name; { DB_ENV *dbenv; u_int32_t iopsize; int ret; dbenv = dbp->dbenv; /* * Use the filesystem's optimum I/O size as the pagesize if a pagesize * not specified. Some filesystems have 64K as their optimum I/O size, * but as that results in fairly large default caches, we limit the * default pagesize to 16K. */ if ((ret = CDB___os_ioinfo(name, fhp, NULL, NULL, &iopsize)) != 0) { CDB___db_err(dbenv, "%s: %s", name, CDB_db_strerror(ret)); return (ret); } if (iopsize < 512) iopsize = 512; if (iopsize > 16 * 1024) iopsize = 16 * 1024; /* * If compression is on, the minimum page size must be multiplied * by the compression factor. */ #ifdef HAVE_LIBZ if(F_ISSET(dbp, DB_AM_CMPR)) { if(iopsize < DB_CMPR_MULTIPLY(dbenv, DB_MIN_PGSIZE)) iopsize = DB_CMPR_MULTIPLY(dbenv, DB_MIN_PGSIZE); } #endif /* HAVE_LIBZ */ /* * Sheer paranoia, but we don't want anything that's not a power-of-2 * (we rely on that for alignment of various types on the pages), and * we want a multiple of the sector size as well. 
*/ OS_ROUNDOFF(iopsize, 512); dbp->pgsize = iopsize; F_SET(dbp, DB_AM_PGDEF); return (0); } /* * CDB___db_close -- * DB destructor. * * PUBLIC: int CDB___db_close __P((DB *, u_int32_t)); */ int CDB___db_close(dbp, flags) DB *dbp; u_int32_t flags; { DB_ENV *dbenv; DBC *dbc; int ret, t_ret; ret = 0; PANIC_CHECK(dbp->dbenv); /* Validate arguments. */ if ((ret = CDB___db_closechk(dbp, flags)) != 0) return (ret); /* If never opened, or not currently open, it's easy. */ if (!F_ISSET((dbp), DB_OPEN_CALLED)) goto never_opened; /* Sync the underlying access method. */ if (!LF_ISSET(DB_NOSYNC) && !F_ISSET(dbp, DB_AM_DISCARD) && (t_ret = dbp->sync(dbp, 0)) != 0 && ret == 0) ret = t_ret; /* * Go through the active cursors and call the cursor recycle routine, * which resolves pending operations and moves the cursors onto the * free list. Then, walk the free list and call the cursor destroy * routine. */ while ((dbc = TAILQ_FIRST(&dbp->active_queue)) != NULL) if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) ret = t_ret; while ((dbc = TAILQ_FIRST(&dbp->free_queue)) != NULL) if ((t_ret = CDB___db_c_destroy(dbc)) != 0 && ret == 0) ret = t_ret; /* Sync the memory pool. */ if (!LF_ISSET(DB_NOSYNC) && !F_ISSET(dbp, DB_AM_DISCARD) && (t_ret = CDB_memp_fsync(dbp->mpf)) != 0 && t_ret != DB_INCOMPLETE && ret == 0) ret = t_ret; /* Close any handle we've been holding since the open. */ if (dbp->saved_open_fhp != NULL && F_ISSET(dbp->saved_open_fhp, DB_FH_VALID) && (t_ret = CDB___os_closehandle(dbp->saved_open_fhp)) != 0 && ret == 0) ret = t_ret; never_opened: /* * Call the access specific close function. * * !!! * Because of where the function is called in the close process, * these routines can't do anything that would dirty pages or * otherwise affect closing down the database. 
*/ if ((t_ret = CDB___ham_db_close(dbp)) != 0 && ret == 0) ret = t_ret; if ((t_ret = CDB___bam_db_close(dbp)) != 0 && ret == 0) ret = t_ret; if ((t_ret = CDB___qam_db_close(dbp)) != 0 && ret == 0) ret = t_ret; /* Refresh the structure and close any local environment. */ dbenv = dbp->dbenv; if ((t_ret = CDB___db_refresh(dbp)) != 0 && ret == 0) ret = t_ret; if (F_ISSET(dbenv, DB_ENV_DBLOCAL) && (t_ret = dbenv->close(dbenv, 0)) != 0 && ret == 0) ret = t_ret; memset(dbp, CLEAR_BYTE, sizeof(*dbp)); CDB___os_free(dbp, sizeof(*dbp)); return (ret); } /* * CDB___db_refresh -- * Refresh the DB structure, releasing any allocated resources. */ static int CDB___db_refresh(dbp) DB *dbp; { DB_ENV *dbenv; int ret, t_ret; ret = 0; dbenv = dbp->dbenv; dbp->type = 0; /* Close the memory pool file handle. */ if (dbp->mpf != NULL) { if (F_ISSET(dbp, DB_AM_DISCARD)) (void)CDB___memp_fremove(dbp->mpf); if ((t_ret = CDB_memp_fclose(dbp->mpf)) != 0 && ret == 0) ret = t_ret; dbp->mpf = NULL; } /* Discard the thread mutex. */ if (dbp->mutexp != NULL) { CDB___db_mutex_free(dbenv, dbenv->reginfo, dbp->mutexp); dbp->mutexp = NULL; } /* Discard the log file id. */ if (dbp->log_fileid != DB_LOGFILEID_INVALID) { (void)CDB_log_unregister(dbenv, dbp->log_fileid); dbp->log_fileid = DB_LOGFILEID_INVALID; } TAILQ_INIT(&dbp->free_queue); TAILQ_INIT(&dbp->active_queue); F_CLR(dbp, DB_AM_DISCARD); F_CLR(dbp, DB_AM_INMEM); F_CLR(dbp, DB_AM_RDONLY); F_CLR(dbp, DB_AM_SWAP); F_CLR(dbp, DB_DBM_ERROR); F_CLR(dbp, DB_OPEN_CALLED); return (ret); } /* * CDB___db_remove * Remove method for DB. 
* * PUBLIC: int CDB___db_remove __P((DB *, const char *, const char *, u_int32_t)); */ int CDB___db_remove(dbp, name, subdb, flags) DB *dbp; const char *name, *subdb; u_int32_t flags; { DBT namedbt; DB_ENV *dbenv; DB_LOCK remove_lock; DB_LSN newlsn; int ret, t_ret; char *backup, *real_back, *real_name; dbenv = dbp->dbenv; ret = 0; backup = real_back = real_name = NULL; PANIC_CHECK(dbenv); DB_ILLEGAL_AFTER_OPEN(dbp, "remove"); /* Validate arguments. */ if ((ret = CDB___db_removechk(dbp, flags)) != 0) return (ret); /* * Subdatabases. */ if (subdb != NULL) { /* Subdatabases must be created in named files. */ if (name == NULL) { CDB___db_err(dbenv, "subdatabases cannot be created in temporary files"); return (EINVAL); } return (CDB___db_subdb_remove(dbp, name, subdb)); } /* Start the transaction and log the delete. */ if (F_ISSET(dbenv, DB_ENV_TXN)) { if ((ret = CDB___db_metabegin(dbp, &remove_lock)) != 0) return (ret); memset(&namedbt, 0, sizeof(namedbt)); namedbt.data = (char *)name; namedbt.size = strlen(name) + 1; if ((ret = CDB___crdel_delete_log(dbenv, dbp->open_txn, &newlsn, DB_FLUSH, &namedbt)) != 0) { CDB___db_err(dbenv, "%s: %s", name, CDB_db_strerror(ret)); goto err; } } /* * XXX * We need to open the file and call CDB___memp_fremove on the mpf. I'm * not sure that we need to do this. Is it our responsibility or the * application's responsibility to make sure someone else isn't busily * deleting pages behind our backs? */ /* Find the real name of the file. */ if ((ret = CDB___db_appname(dbenv, DB_APP_DATA, NULL, name, 0, NULL, &real_name)) != 0) goto err; /* Create name for backup file. 
*/ if ((ret = CDB___db_backup_name(name, &backup, &newlsn)) != 0) goto err; if ((ret = CDB___db_appname(dbenv, DB_APP_DATA, NULL, backup, 0, NULL, &real_back)) != 0) goto err; DB_TEST_RECOVERY(dbp, DB_TEST_PRERENAME, ret, name); ret = CDB___os_rename(real_name, real_back); DB_TEST_RECOVERY(dbp, DB_TEST_POSTRENAME, ret, name); err: DB_TEST_RECOVERY_LABEL /* * End the transaction, committing the transaction if we were * successful, aborting otherwise. */ if (dbp->open_txn != NULL && (t_ret = CDB___db_metaend(dbp, &remove_lock, ret == 0, CDB___db_remove_callback, real_back)) != 0 && ret == 0) ret = t_ret; if (real_name != NULL) CDB___os_freestr(real_name); if (backup != NULL) CDB___os_freestr(backup); return (ret); } /* * CDB___db_subdb_remove -- * Remove a subdatabase. */ static int CDB___db_subdb_remove(dbp, name, subdb) DB *dbp; const char *name, *subdb; { DB *mdbp; DBC *dbc; DB_ENV *dbenv; DB_LOCK remove_lock; db_pgno_t meta_pgno; int ret, t_ret; mdbp = NULL; dbc = NULL; dbenv = dbp->dbenv; /* Start the transaction. */ if (F_ISSET(dbenv, DB_ENV_TXN) && (ret = CDB___db_metabegin(dbp, &remove_lock)) != 0) return (ret); /* * Open the subdatabase. We can use the user's DB handle for this * purpose, I think. */ if ((ret = CDB___db_open(dbp, name, subdb, DB_UNKNOWN, 0, 0)) != 0) goto err; /* Free up the pages in the subdatabase. */ switch (dbp->type) { case DB_BTREE: case DB_RECNO: if ((ret = CDB___bam_reclaim(dbp, dbp->open_txn)) != 0) goto err; break; case DB_HASH: if ((ret = CDB___ham_reclaim(dbp, dbp->open_txn)) != 0) goto err; break; default: ret = EINVAL; /* Shouldn't be possible. */ goto err; } /* * Remove the entry from the main database and free the subdatabase * metadata page. 
*/ if ((ret = CDB___db_master_open(dbp->dbenv, dbp->open_txn, name, 0, 0, &mdbp)) != 0) goto err; if ((ret = CDB___db_master_update(mdbp, subdb, dbp->type, &meta_pgno, 1, 0)) != 0) goto err; err: /* * End the transaction, committing the transaction if we were * successful, aborting otherwise. */ if (dbp->open_txn != NULL && (t_ret = CDB___db_metaend(dbp, &remove_lock, ret == 0, NULL, NULL)) != 0 && ret == 0) ret = t_ret; /* * Close the user's DB handle -- do this LAST to avoid smashing the * the transaction information. */ if ((t_ret = dbp->close(dbp, 0)) != 0 && ret == 0) ret = t_ret; if (mdbp != NULL && (t_ret = mdbp->close(mdbp, 0)) != 0 && ret == 0) ret = t_ret; return (ret); } /* * CDB___db_metabegin -- * * Begin a meta-data operation. This involves doing any required locking, * potentially beginning a transaction and then telling the caller if you * did or did not begin the transaction. * * The writing flag indicates if the caller is actually allowing creates * or doing deletes (i.e., if the caller is opening and not creating, then * we don't need to do any of this). */ static int CDB___db_metabegin(dbp, lockp) DB *dbp; DB_LOCK *lockp; { DB_ENV *dbenv; DBT dbplock; u_int32_t locker, lockval; int ret; dbenv = dbp->dbenv; lockp->off = LOCK_INVALID; /* * There is no single place where we can know that we are or are not * going to be creating any files and/or subdatabases, so we will * always begin a tranasaction when we start creating one. If we later * discover that this was unnecessary, we will abort the transaction. * Recovery is written so that if we log a file create, but then * discover that we didn't have to do it, we recover correctly. The * file recovery design document has details. * * We need to single thread all create and delete operations, so if we * are running with locking, we must obtain a lock. We use CDB_lock_id to * generate a unique locker id and use a handcrafted DBT as the object * on which we are locking. 
*/ if (F_ISSET(dbenv, DB_ENV_LOCKING | DB_ENV_CDB)) { if ((ret = CDB_lock_id(dbenv, &locker)) != 0) return (ret); lockval = 0; dbplock.data = &lockval; dbplock.size = sizeof(lockval); if ((ret = CDB_lock_get(dbenv, locker, 0, &dbplock, DB_LOCK_WRITE, lockp)) != 0) return(ret); } return (CDB_txn_begin(dbenv, NULL, &dbp->open_txn, 0)); } /* * CDB___db_metaend -- * End a meta-data operation. */ static int CDB___db_metaend(dbp, lockp, commit, callback, cookie) DB *dbp; DB_LOCK *lockp; int commit, (*callback) __P((DB *, void *)); void *cookie; { DB_ENV *dbenv; int ret, t_ret; dbenv = dbp->dbenv; /* End the transaction. */ if (commit) { if ((ret = CDB_txn_commit(dbp->open_txn, DB_TXN_SYNC)) == 0) { /* * Unlink any underlying file, we've committed the * transaction. */ if (callback != NULL) ret = callback(dbp, cookie); } } else ret = CDB_txn_abort(dbp->open_txn); /* Release our lock. */ if (lockp->off != LOCK_INVALID && (t_ret = CDB_lock_put(dbenv, lockp)) != 0 && ret == 0) ret = t_ret; return (ret); } /* * CDB___db_backup_name * Create the backup file name for a given file. * * PUBLIC: int CDB___db_backup_name __P((const char *, char **, DB_LSN *)); */ #undef BACKUP_PREFIX #define BACKUP_PREFIX "__db." #undef MAX_LSN_TO_TEXT #define MAX_LSN_TO_TEXT 21 int CDB___db_backup_name(name, backup, lsn) const char *name; char **backup; DB_LSN *lsn; { size_t len; int ret; char *retp; len = strlen(name) + strlen(BACKUP_PREFIX) + MAX_LSN_TO_TEXT + 1; if ((ret = CDB___os_malloc(len, NULL, &retp)) != 0) return (ret); /* * Create the name. Backup file names are of the form: * * __db.name.0x[lsn-file].0x[lsn-offset] * * which guarantees uniqueness. */ snprintf(retp, len, "%s%s.0x%x0x%x", BACKUP_PREFIX, name, lsn->file, lsn->offset); *backup = retp; return (0); } /* * CDB___db_remove_callback -- * Callback function -- on file remove commit, it unlinks the backing * file. 
*/
static int
CDB___db_remove_callback(dbp, cookie)
	DB *dbp;
	void *cookie;
{
	/* dbp is unused; cookie is the path of the file to unlink. */
	COMPQUIET(dbp, NULL);

	return (CDB___os_unlink(cookie));
}

#if CONFIG_TEST
/*
 * __db_testcopy
 *	Create a copy of all backup files and our "main" DB.
 *
 *	Each copy is written next to the original with ".afterop" appended to
 *	its name.  Returns 0 on success, otherwise the first error from the
 *	name lookup, allocation or directory listing calls; failures while
 *	copying an individual file are not reported.
 *
 * PUBLIC: int __db_testcopy __P((DB *, const char *));
 */
int
__db_testcopy(dbp, name)
	DB *dbp;
	const char *name;
{
	size_t len, prefix_len;
	int dircnt, i, ret;
	char **namesp, *backup, *copy, *dir, *p, *real_name;

	backup = copy = real_name = NULL;

	/* Get the real backing file name. */
	if ((ret = CDB___db_appname(dbp->dbenv,
	    DB_APP_DATA, NULL, name, 0, NULL, &real_name)) != 0)
		return (ret);

	/*
	 * Maximum size of file, including adding a ".afterop".
	 */
	len = strlen(real_name) + strlen(BACKUP_PREFIX) + MAX_LSN_TO_TEXT + 9;

	if ((ret = CDB___os_malloc(len, NULL, &copy)) != 0)
		goto out;

	if ((ret = CDB___os_malloc(len, NULL, &backup)) != 0)
		goto out;

	/*
	 * First copy the file itself.
	 */
	snprintf(copy, len, "%s.afterop", real_name);
	__db_makecopy(real_name, copy);

	if ((ret = CDB___os_strdup(real_name, &dir)) != 0)
		goto out;
	CDB___os_freestr(real_name);
	real_name = NULL;

	/*
	 * Create the name.  Backup file names are of the form:
	 *
	 *	__db.name.0x[lsn-file]0x[lsn-offset]
	 *
	 * which guarantees uniqueness.  We want to look for the
	 * backup name, followed by a '.0x' (so that if they have
	 * files named, say, 'a' and 'abc' we won't match 'abc' when
	 * looking for 'a').
	 */
	snprintf(backup, len, "%s%s.0x", BACKUP_PREFIX, name);
	prefix_len = strlen(backup);

	/*
	 * We need the directory path to do the CDB___os_dirlist.
	 */
	p = CDB___db_rpath(dir);
	if (p != NULL)
		*p = '\0';
	ret = CDB___os_dirlist(dir, &namesp, &dircnt);
#if DIAGNOSTIC
	/*
	 * XXX
	 * To get the memory guard code to work because
	 * it uses strlen and we just moved the end of the
	 * string somewhere sooner.  This causes the guard
	 * code to fail as it looks at one byte past the end
	 * of the string.  Only restore the byte if we actually
	 * overwrote one above.
	 * XXX
	 */
	if (p != NULL)
		*p = '/';
#endif
	CDB___os_freestr(dir);
	if (ret != 0)
		goto out;

	/*
	 * NOTE(review): the list returned in namesp is not released anywhere
	 * in this function; the matching release routine is not visible from
	 * here, so it is left as it was -- confirm and free it if one exists.
	 */
	for (i = 0; i < dircnt; i++) {
		/*
		 * Need to check if it is a backup file for this.
		 * No idea what namesp[i] may be or how long, so
		 * must use strncmp and not memcmp.  We don't want
		 * to use strcmp either because we are only matching
		 * the first part of the real file's name.  We don't
		 * know its LSN's.
		 */
		if (strncmp(namesp[i], backup, prefix_len) == 0) {
			if ((ret = CDB___db_appname(dbp->dbenv, DB_APP_DATA,
			    NULL, namesp[i], 0, NULL, &real_name)) != 0)
				goto out;

			/*
			 * This should not happen.  Check that old
			 * .afterop files aren't around.
			 * If so, just move on.
			 */
			if (strstr(real_name, ".afterop") != NULL) {
				CDB___os_freestr(real_name);
				real_name = NULL;
				continue;
			}
			snprintf(copy, len, "%s.afterop", real_name);
			__db_makecopy(real_name, copy);
			CDB___os_freestr(real_name);
			real_name = NULL;
		}
	}

out:
	/* Both scratch buffers were allocated with size len. */
	if (backup != NULL)
		CDB___os_free(backup, len);
	if (copy != NULL)
		CDB___os_free(copy, len);
	if (real_name != NULL)
		CDB___os_freestr(real_name);
	return (ret);
}

/*
 * __db_makecopy --
 *	Copy the contents of file src to file dest, 1024 bytes at a time.
 *
 *	dest is created if necessary and truncated.  Errors are not reported
 *	to the caller: on any failure the copy simply stops and whatever was
 *	opened or allocated is released.
 */
static void
__db_makecopy(src, dest)
	const char *src, *dest;
{
	DB_FH rfh, wfh;
	ssize_t rcnt, wcnt;
	char *buf;

	/* Zero the handles so the DB_FH_VALID tests below are meaningful. */
	memset(&rfh, 0, sizeof(rfh));
	memset(&wfh, 0, sizeof(wfh));

	if (CDB___os_malloc(1024, NULL, &buf) != 0)
		return;

	if (CDB___os_open(src,
	    DB_OSO_RDONLY, CDB___db_omode("rw----"), &rfh) != 0)
		goto err;
	if (CDB___os_open(dest,
	    DB_OSO_CREATE | DB_OSO_TRUNC, CDB___db_omode("rw----"), &wfh) != 0)
		goto err;

	/*
	 * Stop on a read failure, end of file, a write failure or a short
	 * write.
	 * NOTE(review): the other CDB___os_* calls in this file are tested
	 * with "!= 0"; confirm that "< 0" is the intended failure test for
	 * CDB___os_read and CDB___os_write.
	 */
	for (;;)
		if (CDB___os_read(&rfh, buf, 1024, &rcnt) < 0 || rcnt == 0 ||
		    CDB___os_write(&wfh, buf, rcnt, &wcnt) < 0 || wcnt != rcnt)
			break;

err:	CDB___os_free(buf, 1024);
	if (F_ISSET(&rfh, DB_FH_VALID))
		CDB___os_closehandle(&rfh);
	if (F_ISSET(&wfh, DB_FH_VALID))
		CDB___os_closehandle(&wfh);
}
#endif