-
Applications using Berkeley DB's Concurrent Data Store product with the
DB_CDB_ALLDB flag set could hang if they opened databases while also
holding open cursors.
*** fileops/fop_util.c 8 Jan 2003 05:01:56 -0000 1.57
--- fileops/fop_util.c 12 Jan 2003 19:44:29 -0000 1.58
***************
*** 40,46 ****
u_int32_t __lockval; \
\
if (LOCKING_ON((ENV))) { \
! __lockval = 0; \
__dbt.data = &__lockval; \
__dbt.size = sizeof(__lockval); \
if ((ret = (ENV)->lock_get((ENV), (ID), \
--- 40,46 ----
u_int32_t __lockval; \
\
if (LOCKING_ON((ENV))) { \
! __lockval = 1; \
__dbt.data = &__lockval; \
__dbt.size = sizeof(__lockval); \
if ((ret = (ENV)->lock_get((ENV), (ID), \
-
This patch addresses the following issues:
- Applications with largely dirty caches could see performance problems
in the cache allocation code.
- Environment recovery could fail after the failure of a database
open.
- Catastrophic environment recovery could fail if it followed a normal
recovery that had been performed while sections of the database
environment log contained only database open/close pairs.
*** dbinc/mp.h.orig 2004-02-02 10:24:53.000000000 -0800
--- dbinc/mp.h 2004-02-02 10:26:27.000000000 -0800
***************
*** 149,154 ****
--- 149,161 ----
* region lock).
*/
DB_MPOOL_STAT stat; /* Per-cache mpool statistics. */
+
+ /*
+ * We track page puts so that we can decide when allocation is never
+ * going to succeed. We don't lock the field, all we care about is
+ * if it changes.
+ */
+ u_int32_t put_counter; /* Count of page put calls. */
};
struct __db_mpool_hash {
*** mp/mp_fput.c.orig 2002-08-13 06:26:41.000000000 -0700
--- mp/mp_fput.c 2004-02-02 10:22:35.000000000 -0800
***************
*** 19,24 ****
--- 19,26 ----
#include "dbinc/db_shash.h"
#include "dbinc/mp.h"
+ static void __memp_reset_lru __P((DB_ENV *, REGINFO *));
+
/*
* __memp_fput --
* Mpool file put function.
***************
*** 198,202 ****
--- 200,255 ----
MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+ /*
+ * On every buffer put we update the buffer generation number and check
+ * for wraparound.
+ */
+ if (++c_mp->lru_count == UINT32_T_MAX)
+ __memp_reset_lru(dbenv, dbmp->reginfo);
+
return (0);
}
+
+ /*
+ * __memp_reset_lru --
+ * Reset the cache LRU counter.
+ */
+ static void
+ __memp_reset_lru(dbenv, memreg)
+ DB_ENV *dbenv;
+ REGINFO *memreg;
+ {
+ BH *bhp;
+ DB_MPOOL_HASH *hp;
+ MPOOL *c_mp;
+ int bucket;
+
+ c_mp = memreg->primary;
+
+ /*
+ * Update the counter so all future allocations will start at the
+ * bottom.
+ */
+ c_mp->lru_count -= MPOOL_BASE_DECREMENT;
+
+ /* Adjust the priority of every buffer in the system. */
+ for (hp = R_ADDR(memreg, c_mp->htab),
+ bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
+ /*
+ * Skip empty buckets.
+ *
+ * We can check for empty buckets before locking as we
+ * only care if the pointer is zero or non-zero.
+ */
+ if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
+ continue;
+
+ MUTEX_LOCK(dbenv, &hp->hash_mutex);
+ for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+ bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
+ if (bhp->priority != UINT32_T_MAX &&
+ bhp->priority > MPOOL_BASE_DECREMENT)
+ bhp->priority -= MPOOL_BASE_DECREMENT;
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+ }
+ }
*** mp/mp_alloc.c.orig 2002-08-17 07:23:25.000000000 -0700
--- mp/mp_alloc.c 2004-02-02 10:28:15.000000000 -0800
***************
*** 25,31 ****
} HS;
static void __memp_bad_buffer __P((DB_MPOOL_HASH *));
- static void __memp_reset_lru __P((DB_ENV *, REGINFO *, MPOOL *));
/*
* __memp_alloc --
--- 25,30 ----
***************
*** 50,57 ****
MPOOL *c_mp;
MPOOLFILE *bh_mfp;
size_t freed_space;
! u_int32_t buckets, buffers, high_priority, max_na, priority;
! int aggressive, ret;
void *p;
dbenv = dbmp->dbenv;
--- 49,57 ----
MPOOL *c_mp;
MPOOLFILE *bh_mfp;
size_t freed_space;
! u_int32_t buckets, buffers, high_priority, priority, put_counter;
! u_int32_t total_buckets;
! int aggressive, giveup, ret;
void *p;
dbenv = dbmp->dbenv;
***************
*** 59,76 ****
dbht = R_ADDR(memreg, c_mp->htab);
hp_end = &dbht[c_mp->htab_buckets];
! buckets = buffers = 0;
! aggressive = 0;
c_mp->stat.st_alloc++;
/*
- * Get aggressive if we've tried to flush the number of pages as are
- * in the system without finding space.
- */
- max_na = 5 * c_mp->htab_buckets;
-
- /*
* If we're allocating a buffer, and the one we're discarding is the
* same size, we don't want to waste the time to re-integrate it into
* the shared memory free list. If the DB_MPOOLFILE argument isn't
--- 59,71 ----
dbht = R_ADDR(memreg, c_mp->htab);
hp_end = &dbht[c_mp->htab_buckets];
! buckets = buffers = put_counter = total_buckets = 0;
! aggressive = giveup = 0;
! hp_tmp = NULL;
c_mp->stat.st_alloc++;
/*
* If we're allocating a buffer, and the one we're discarding is the
* same size, we don't want to waste the time to re-integrate it into
* the shared memory free list. If the DB_MPOOLFILE argument isn't
***************
*** 81,99 ****
len = (sizeof(BH) - sizeof(u_int8_t)) + mfp->stat.st_pagesize;
R_LOCK(dbenv, memreg);
-
- /*
- * On every buffer allocation we update the buffer generation number
- * and check for wraparound.
- */
- if (++c_mp->lru_count == UINT32_T_MAX)
- __memp_reset_lru(dbenv, memreg, c_mp);
-
/*
* Anything newer than 1/10th of the buffer pool is ignored during
* allocation (unless allocation starts failing).
*/
- DB_ASSERT(c_mp->lru_count > c_mp->stat.st_pages / 10);
high_priority = c_mp->lru_count - c_mp->stat.st_pages / 10;
/*
--- 76,85 ----
***************
*** 120,129 ****
* We're not holding the region locked here, these statistics
* can't be trusted.
*/
! if (buckets != 0) {
! if (buckets > c_mp->stat.st_alloc_max_buckets)
! c_mp->stat.st_alloc_max_buckets = buckets;
! c_mp->stat.st_alloc_buckets += buckets;
}
if (buffers != 0) {
if (buffers > c_mp->stat.st_alloc_max_pages)
--- 106,116 ----
* We're not holding the region locked here, these statistics
* can't be trusted.
*/
! total_buckets += buckets;
! if (total_buckets != 0) {
! if (total_buckets > c_mp->stat.st_alloc_max_buckets)
! c_mp->stat.st_alloc_max_buckets = total_buckets;
! c_mp->stat.st_alloc_buckets += total_buckets;
}
if (buffers != 0) {
if (buffers > c_mp->stat.st_alloc_max_pages)
***************
*** 131,136 ****
--- 118,129 ----
c_mp->stat.st_alloc_pages += buffers;
}
return (0);
+ } else if (giveup || c_mp->stat.st_pages == 0) {
+ R_UNLOCK(dbenv, memreg);
+
+ __db_err(dbenv,
+ "unable to allocate space from the buffer cache");
+ return (ret);
}
/*
***************
*** 138,163 ****
* we need. Reset our free-space counter.
*/
freed_space = 0;
/*
* Walk the hash buckets and find the next two with potentially useful
* buffers. Free the buffer with the lowest priority from the buckets'
* chains.
*/
! for (hp_tmp = NULL;;) {
/* Check for wrap around. */
hp = &dbht[c_mp->last_checked++];
if (hp >= hp_end) {
c_mp->last_checked = 0;
!
! /*
! * If we've gone through all of the hash buckets, try
! * an allocation. If the cache is small, the old page
! * size is small, and the new page size is large, we
! * might have freed enough memory (but not 3 times the
! * memory).
! */
! goto alloc;
}
/*
--- 131,154 ----
* we need. Reset our free-space counter.
*/
freed_space = 0;
+ total_buckets += buckets;
+ buckets = 0;
/*
* Walk the hash buckets and find the next two with potentially useful
* buffers. Free the buffer with the lowest priority from the buckets'
* chains.
*/
! for (;;) {
! /* All pages have been freed, make one last try */
! if (c_mp->stat.st_pages == 0)
! goto alloc;
!
/* Check for wrap around. */
hp = &dbht[c_mp->last_checked++];
if (hp >= hp_end) {
c_mp->last_checked = 0;
! hp = &dbht[c_mp->last_checked++];
}
/*
***************
*** 172,210 ****
/*
* The failure mode is when there are too many buffers we can't
* write or there's not enough memory in the system. We don't
! * have a metric for deciding if allocation has no possible way
! * to succeed, so we don't ever fail, we assume memory will be
! * available if we wait long enough.
*
! * Get aggressive if we've tried to flush 5 times the number of
! * hash buckets as are in the system -- it's possible we have
! * been repeatedly trying to flush the same buffers, although
! * it's unlikely. Aggressive means:
*
* a: set a flag to attempt to flush high priority buffers as
* well as other buffers.
* b: sync the mpool to force out queue extent pages. While we
* might not have enough space for what we want and flushing
* is expensive, why not?
! * c: sleep for a second -- hopefully someone else will run and
! * free up some memory. Try to allocate memory too, in case
! * the other thread returns its memory to the region.
! * d: look at a buffer in every hash bucket rather than choose
* the more preferable of two.
*
* !!!
* This test ignores pathological cases like no buffers in the
* system -- that shouldn't be possible.
*/
! if ((++buckets % max_na) == 0) {
! aggressive = 1;
!
R_UNLOCK(dbenv, memreg);
! (void)__memp_sync_int(
! dbenv, NULL, 0, DB_SYNC_ALLOC, NULL);
!
! (void)__os_sleep(dbenv, 1, 0);
R_LOCK(dbenv, memreg);
goto alloc;
--- 163,221 ----
/*
* The failure mode is when there are too many buffers we can't
* write or there's not enough memory in the system. We don't
! * have a way to know that allocation has no way to succeed.
! * We fail if there were no pages returned to the cache after
! * we've been trying for a relatively long time.
*
! * Get aggressive if we've tried to flush the number of hash
! * buckets as are in the system and have not found any more
! * space. Aggressive means:
*
* a: set a flag to attempt to flush high priority buffers as
* well as other buffers.
* b: sync the mpool to force out queue extent pages. While we
* might not have enough space for what we want and flushing
* is expensive, why not?
! * c: look at a buffer in every hash bucket rather than choose
* the more preferable of two.
+ * d: start to think about giving up.
+ *
+ * If we get here twice, sleep for a second, hopefully someone
+ * else will run and free up some memory.
+ *
+ * Always try to allocate memory too, in case some other thread
+ * returns its memory to the region.
*
* !!!
* This test ignores pathological cases like no buffers in the
* system -- that shouldn't be possible.
*/
! if ((++buckets % c_mp->htab_buckets) == 0) {
! if (freed_space > 0)
! goto alloc;
R_UNLOCK(dbenv, memreg);
! switch (++aggressive) {
! case 1:
! break;
! case 2:
! put_counter = c_mp->put_counter;
! /* FALLTHROUGH */
! case 3:
! case 4:
! case 5:
! case 6:
! (void)__memp_sync_int(
! dbenv, NULL, 0, DB_SYNC_ALLOC, NULL);
!
! (void)__os_sleep(dbenv, 1, 0);
! break;
! default:
! aggressive = 1;
! if (put_counter == c_mp->put_counter)
! giveup = 1;
! break;
! }
R_LOCK(dbenv, memreg);
goto alloc;
***************
*** 277,283 ****
* thread may have acquired this buffer and incremented the ref
* count after we wrote it, in which case we can't have it.
*
! * If there's a write error, avoid selecting this buffer again
* by making it the bucket's least-desirable buffer.
*/
if (ret != 0 || bhp->ref != 0) {
--- 288,295 ----
* thread may have acquired this buffer and incremented the ref
* count after we wrote it, in which case we can't have it.
*
! * If there's a write error and we're having problems finding
! * something to allocate, avoid selecting this buffer again
* by making it the bucket's least-desirable buffer.
*/
if (ret != 0 || bhp->ref != 0) {
***************
*** 301,306 ****
--- 313,320 ----
freed_space += __db_shsizeof(bhp);
__memp_bhfree(dbmp, hp, bhp, 1);
+ if (aggressive > 1)
+ aggressive = 1;
/*
* Unlock this hash bucket and re-acquire the region lock. If
***************
*** 362,415 ****
hp->hash_priority = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
}
- /*
- * __memp_reset_lru --
- * Reset the cache LRU counter.
- */
- static void
- __memp_reset_lru(dbenv, memreg, c_mp)
- DB_ENV *dbenv;
- REGINFO *memreg;
- MPOOL *c_mp;
- {
- BH *bhp;
- DB_MPOOL_HASH *hp;
- int bucket;
-
- /*
- * Update the counter so all future allocations will start at the
- * bottom.
- */
- c_mp->lru_count -= MPOOL_BASE_DECREMENT;
-
- /* Release the region lock. */
- R_UNLOCK(dbenv, memreg);
-
- /* Adjust the priority of every buffer in the system. */
- for (hp = R_ADDR(memreg, c_mp->htab),
- bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
- /*
- * Skip empty buckets.
- *
- * We can check for empty buckets before locking as we
- * only care if the pointer is zero or non-zero.
- */
- if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
- continue;
-
- MUTEX_LOCK(dbenv, &hp->hash_mutex);
- for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
- bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
- if (bhp->priority != UINT32_T_MAX &&
- bhp->priority > MPOOL_BASE_DECREMENT)
- bhp->priority -= MPOOL_BASE_DECREMENT;
- MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
- }
-
- /* Reacquire the region lock. */
- R_LOCK(dbenv, memreg);
- }
-
#ifdef DIAGNOSTIC
/*
* __memp_check_order --
--- 376,381 ----
*** dbreg/dbreg_rec.c.orig 2002-08-17 07:22:52.000000000 -0700
--- dbreg/dbreg_rec.c 2003-11-08 10:59:19.000000000 -0800
***************
*** 174,192 ****
* Typically, closes should match an open which means
* that if this is a close, there should be a valid
* entry in the dbentry table when we get here,
! * however there is an exception. If this is an
* OPENFILES pass, then we may have started from
* a log file other than the first, and the
* corresponding open appears in an earlier file.
! * We can ignore that case, but all others are errors.
*/
dbe = &dblp->dbentry[argp->fileid];
if (dbe->dbp == NULL && !dbe->deleted) {
/* No valid entry here. */
! if ((argp->opcode != LOG_CLOSE &&
! argp->opcode != LOG_RCLOSE) ||
! (op != DB_TXN_OPENFILES &&
! op !=DB_TXN_POPENFILES)) {
__db_err(dbenv,
"Improper file close at %lu/%lu",
(u_long)lsnp->file,
--- 174,193 ----
* Typically, closes should match an open which means
* that if this is a close, there should be a valid
* entry in the dbentry table when we get here,
! * however there are exceptions. 1. If this is an
* OPENFILES pass, then we may have started from
* a log file other than the first, and the
* corresponding open appears in an earlier file.
! * 2. If we are undoing an open on an abort or
! * recovery, it's possible that we failed after
! * the log record, but before we actually entered
! * a handle here.
*/
dbe = &dblp->dbentry[argp->fileid];
if (dbe->dbp == NULL && !dbe->deleted) {
/* No valid entry here. */
! if (DB_REDO(op) ||
! argp->opcode == LOG_CHECKPOINT) {
__db_err(dbenv,
"Improper file close at %lu/%lu",
(u_long)lsnp->file,
*** env/env_recover.c.orig.1 2002-08-22 14:52:51.000000000 -0700
--- env/env_recover.c 2003-11-15 08:20:59.000000000 -0800
***************
*** 232,243 ****
* we'll still need to do a vtruncate based on information we haven't
* yet collected.
*/
! if (ret == DB_NOTFOUND) {
ret = 0;
! if (max_lsn == NULL)
! goto done;
! }
! if (ret != 0)
goto err;
hi_txn = txnid;
--- 232,240 ----
* we'll still need to do a vtruncate based on information we haven't
* yet collected.
*/
! if (ret == DB_NOTFOUND)
ret = 0;
! else if (ret != 0)
goto err;
hi_txn = txnid;
***************
*** 331,337 ****
/* Find a low txnid. */
ret = 0;
! do {
/* txnid is after rectype, which is a u_int32. */
memcpy(&txnid,
(u_int8_t *)data.data + sizeof(u_int32_t), sizeof(txnid));
--- 328,334 ----
/* Find a low txnid. */
ret = 0;
! if (hi_txn != 0) do {
/* txnid is after rectype, which is a u_int32. */
memcpy(&txnid,
(u_int8_t *)data.data + sizeof(u_int32_t), sizeof(txnid));
***************
*** 344,354 ****
* There are no transactions and we're not recovering to an LSN (see
* above), so there is nothing to do.
*/
! if (ret == DB_NOTFOUND) {
ret = 0;
- if (max_lsn == NULL)
- goto done;
- }
/* Reset to the first lsn. */
if (ret != 0 || (ret = logc->get(logc, &first_lsn, &data, DB_SET)) != 0)
--- 341,348 ----
* There are no transactions and we're not recovering to an LSN (see
* above), so there is nothing to do.
*/
! if (ret == DB_NOTFOUND)
ret = 0;
/* Reset to the first lsn. */
if (ret != 0 || (ret = logc->get(logc, &first_lsn, &data, DB_SET)) != 0)
***************
*** 367,372 ****
--- 361,370 ----
txninfo, &data, &first_lsn, &last_lsn, nfiles, 1)) != 0)
goto err;
+ /* If there were no transactions, then we can bail out early. */
+ if (hi_txn == 0 && max_lsn == NULL)
+ goto done;
+
/*
* Pass #2.
*
***************
*** 483,488 ****
--- 481,487 ----
if ((ret = __dbreg_close_files(dbenv)) != 0)
goto err;
+ done:
if (max_lsn != NULL) {
region->last_ckp = ((DB_TXNHEAD *)txninfo)->ckplsn;
***************
*** 538,544 ****
__db_err(dbenv, "Recovery complete at %.24s", ctime(&now));
__db_err(dbenv, "%s %lx %s [%lu][%lu]",
"Maximum transaction ID",
! ((DB_TXNHEAD *)txninfo)->maxid,
"Recovery checkpoint",
(u_long)region->last_ckp.file,
(u_long)region->last_ckp.offset);
--- 537,544 ----
__db_err(dbenv, "Recovery complete at %.24s", ctime(&now));
__db_err(dbenv, "%s %lx %s [%lu][%lu]",
"Maximum transaction ID",
! txninfo == NULL ? TXN_MINIMUM :
! ((DB_TXNHEAD *)txninfo)->maxid,
"Recovery checkpoint",
(u_long)region->last_ckp.file,
(u_long)region->last_ckp.offset);
***************
*** 550,556 ****
(u_long)lsn.file, (u_long)lsn.offset, pass);
}
- done:
err: if (lockid != DB_LOCK_INVALIDID) {
if ((t_ret = __rep_unlockpages(dbenv, lockid)) != 0 && ret == 0)
ret = t_ret;
--- 550,555 ----
-
Fix a bug where cache buffer retrieval could race with a checkpoint call,
potentially causing database environment recovery to fail. [#14657]
*** mp/mp_fget.c.orig 2002-08-07 08:23:01.000000000 -0700
--- mp/mp_fget.c 2006-05-30 20:32:20.000000000 -0700
***************
*** 506,513 ****
*/
if (state != SECOND_MISS && bhp->ref == 1) {
bhp->priority = UINT32_T_MAX;
! SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
! SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
hp->hash_priority =
SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
}
--- 506,517 ----
*/
if (state != SECOND_MISS && bhp->ref == 1) {
bhp->priority = UINT32_T_MAX;
! /* Move the buffer if there are others in the bucket. */
! if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) != bhp
! || SH_TAILQ_NEXT(bhp, hq, __bh) != NULL) {
! SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
! SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
! }
hp->hash_priority =
SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
}
*** mp/mp_fput.c.orig 2002-08-13 06:26:41.000000000 -0700
--- mp/mp_fput.c 2006-05-30 20:55:11.000000000 -0700
***************
*** 166,171 ****
--- 166,176 ----
* to the correct position in the list.
*/
argbhp = bhp;
+ /* Only reposition the buffer if there are others in the bucket. */
+ if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == bhp
+ && SH_TAILQ_NEXT(bhp, hq, __bh) == NULL)
+ goto done;
+
SH_TAILQ_REMOVE(&hp->hash_bucket, argbhp, hq, __bh);
prev = NULL;
***************
*** 178,183 ****
--- 183,189 ----
else
SH_TAILQ_INSERT_AFTER(&hp->hash_bucket, prev, argbhp, hq, __bh);
+ done:
/* Reset the hash bucket's priority. */
hp->hash_priority = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;