List:Commits« Previous MessageNext Message »
From:Kevin Lewis Date:December 5 2008 7:26pm
Subject:bzr commit into mysql-6.0-falcon-team branch (klewis:2927) Bug#34893,
Bug#36700
View as plain text  
#At file:///C:/Work/bzr/Chg-08/mysql-6.0-falcon-team/

 2927 Kevin Lewis	2008-12-05
      Bug#36700 & Bug#34893  Improved Scavenger.
      
      This patch is the first collection of the changes needed 
      to improve the scavenger.  They include:
      
      1) Separate the two tasks involved in scavenging into
         A. Pruning old invisible record versions.
            1. As part of this task, a survey is taken of records
               in age groups so that the next step can be done in 
               one pass.
         B. Retiring least-recently-used records from the record cache
      
      2) Watch how much record space is added to the record cache 
         and start a new age group at regular intervals. 
      
      3) Put the scavenger into a separate thread that can be 
         signaled when needed.
      
      Future and known issues (need input):
      1) Adjust the default scavenge threshold and floor to a higher 
         percentage of the record cache.
      
      2) Adjust the current dependency upon the memory manager to 
         report how much memory is allocated at an address.  
         Each record should know this.
      
      3) During the scavenge, the current age group is not 
         incremented since the memory is being both scavenged 
         and added to by client threads.  So those age groups 
         can have 2 to 3 times more records in them.
         
modified:
  storage/falcon/Database.cpp
  storage/falcon/Database.h
  storage/falcon/MemMgr.cpp
  storage/falcon/MemMgr.h
  storage/falcon/Record.cpp
  storage/falcon/Record.h
  storage/falcon/RecordGroup.cpp
  storage/falcon/RecordGroup.h
  storage/falcon/RecordLeaf.cpp
  storage/falcon/RecordLeaf.h
  storage/falcon/RecordScavenge.cpp
  storage/falcon/RecordScavenge.h
  storage/falcon/RecordSection.h
  storage/falcon/RecordVersion.cpp
  storage/falcon/RecordVersion.h
  storage/falcon/Table.cpp
  storage/falcon/Table.h
  storage/falcon/Transaction.cpp
  storage/falcon/Transaction.h

=== modified file 'storage/falcon/Database.cpp'
--- a/storage/falcon/Database.cpp	2008-11-20 17:05:50 +0000
+++ b/storage/falcon/Database.cpp	2008-12-05 19:27:45 +0000
@@ -420,7 +420,10 @@ Database::Database(const char *dbName, C
 	recordMemoryMax = configuration->recordMemoryMax;
 	recordScavengeFloor = configuration->recordScavengeFloor;
 	recordScavengeThreshold = configuration->recordScavengeThreshold;
-	lastRecordMemory = 0;
+	recordScavengeMaxGroupSize = recordMemoryMax / AGE_GROUPS_IN_CACHE;
+	recordPoolAllocCount = 0;
+	lastGenerationMemory = 0;
+	lastActiveMemoryChecked = 0;
 	utf8 = false;
 	stepNumber = 0;
 	shuttingDown = false;
@@ -460,6 +463,9 @@ Database::Database(const char *dbName, C
 	tableSpaceManager = NULL;
 	timestamp = time (NULL);
 	tickerThread = NULL;
+	scavengerThread = NULL;
+	scavengerThreadSleeping = 0;
+	scavengerThreadSignaled = 0;
 	serialLog = NULL;
 	pageWriter = NULL;
 	zombieTables = NULL;
@@ -480,8 +486,8 @@ Database::Database(const char *dbName, C
 	syncResultSets.setName("Database::syncResultSets");
 	syncConnectionStatements.setName("Database::syncConnectionStatements");
 	syncScavenge.setName("Database::syncScavenge");
-	IO::deleteFile(BACKLOG_FILE);
 	syncSysDDL.setName("Database::syncSysDDL");
+	IO::deleteFile(BACKLOG_FILE);
 }
 
 
@@ -521,6 +527,7 @@ void Database::start()
 	filterSetManager = new FilterSetManager(this);
 	timestamp = time(NULL);
 	tickerThread = threads->start("Database::Database", &Database::ticker, this);
+	scavengerThread = threads->start("Database::scavengerThreadMain", &Database::scavengerThreadMain, this);
 	internalScheduler->addEvent(scavenger);
 	internalScheduler->addEvent(garbageCollector);
 	internalScheduler->addEvent(serialLog);
@@ -1735,9 +1742,10 @@ void Database::scavenge()
 	
 	syncStmt.unlock();
 
-	// Next, scavenge tables
+	transactionManager->purgeTransactions();
 
-	retireRecords(false);				// age group based scavenger
+	// Scavenge the record cache
+	scavengeRecords();
 
 	// Scavenge expired licenses
 	
@@ -1766,108 +1774,110 @@ void Database::scavenge()
 }
 
 
-void Database::retireRecords(bool forced)
+void Database::scavengeRecords(void)
 {
-	int cycle = scavengeCycle;
-	
-	Sync syncScavenger(&syncScavenge, "Database::retireRecords(1)");
-	syncScavenger.lock(Exclusive);
-
-	if (forced && scavengeCycle > cycle)
-		return;
-	
 	// Commit pending system transactions before proceeding
-	
-	if (!forced && systemConnection->transaction)
+
+	if (systemConnection->transaction)
 		commitSystemTransaction();
 
-	if (forced)
-		Log::log("Forced record scavenge cycle\n");
-	
-	transactionManager->purgeTransactions();
+	Sync syncScavenger(&syncScavenge, "Database::scavengeRecords(Scavenge)");
+	syncScavenger.lock(Exclusive);
+
+	// Create an object to track this record scavenge cycle.
+
 	TransId oldestActiveTransaction = transactionManager->findOldestActive();
-	uint64 total = recordDataPool->activeMemory;
-	RecordScavenge recordScavenge(this, oldestActiveTransaction, forced);
-	
-	// If we passed the upper limit, scavenge.  If we didn't pick up
-	// a significant amount of memory since the last cycle, don't bother
-	// bumping the age group.
+	uint64 lastScavengeMemory = recordDataPool->activeMemory;
+	RecordScavenge recordScavenge(this, scavengeCycle++, oldestActiveTransaction, lastScavengeMemory);
+	recordScavenge.startingActiveMemory = recordDataPool->activeMemory;
+	recordScavenge.scavengeStart = deltaTime;
+
+	// Take inventory of the record cache and prune invisible record versions
+
+	pruneRecords(&recordScavenge);
+	recordScavenge.prunedActiveMemory = recordDataPool->activeMemory;
+	recordScavenge.pruneStop = deltaTime;
+	syncScavenger.unlock();  // take a breath!
+
+	// Retire visible records with no dependencies in the oldest age groups
 
-	if (forced || total >  recordScavengeThreshold)
+	syncScavenger.lock(Exclusive);
+	retireRecords(&recordScavenge);
+	recordScavenge.retiredActiveMemory = recordDataPool->activeMemory;
+	recordScavenge.retireStop = deltaTime;
+
+	// Check for low memory 
+
+	if (recordScavenge.spaceRemaining > recordScavengeFloor)
+		setLowMemory();
+
+	recordScavenge.print();
+	// Log::log(analyze(analyzeRecordLeafs));
+
+	// Start a new generation now that scavenging is done and the 
+	// active memory has been adjusted.
+
+	Sync syncMem(&syncMemory, "Database::checkRecordScavenge");
+	syncMem.lock(Exclusive);
+
+//	INTERLOCKED_INCREMENT (currentGeneration);
+	lastActiveMemoryChecked = lastGenerationMemory = recordDataPool->activeMemory;
+}
+
+// Take inventory of the record cache and prune invisible record versions
+
+void Database::pruneRecords(RecordScavenge *recordScavenge)
+{
+	//Log::log(analyze(analyzeRecordLeafs));
+	//LogStream stream;
+	//recordDataPool->analyze(0, &stream, NULL, NULL);
+
+	Sync syncTbl(&syncTables, "Database::pruneRecords(tables)");
+	syncTbl.lock(Shared);
+
+	for (Table *table = tableList; table; table = table->next)
 		{
-		//LogStream stream;
-		//recordDataPool->analyze(0, &stream, NULL, NULL);
-		
-		Sync syncTbl(&syncTables, "Database::retireRecords(2)");
-		syncTbl.lock(Shared);
-		
-		Table *table;
-		time_t scavengeStart = deltaTime;
-		
-		if (!forced)
-			for (table = tableList; table; table = table->next)
-				table->inventoryRecords(&recordScavenge);
-		
-		recordScavenge.computeThreshold(recordScavengeFloor);
-		recordScavenge.printRecordMemory();	
-		int count = 0;
-		int skipped = 0;
-		
-		for (table = tableList; table; table = table->next)
+		try
 			{
-			try
-				{
-				int n = table->retireRecords(&recordScavenge);
-				
-				if (n >= 0)
-					count += n;
-				else
-					++skipped;
-				}
-			catch (SQLException &exception)
-				{
-				//syncTbl.unlock();
-				Log::debug ("Exception during scavenge of table %s.%s: %s\n",
-						table->schemaName, table->name, exception.getText());
-				}
+			table->pruneRecords(recordScavenge);
+			}
+		catch (SQLException &exception)
+			{
+			Log::debug ("Exception during pruning of table %s.%s: %s\n",
+					table->schemaName, table->name, exception.getText());
 			}
-
-		syncTbl.unlock();
-		
-		
-		// Check for low memory 
-		
-		if (recordScavenge.spaceRemaining > recordScavengeFloor)
-			setLowMemory();
-		/***
-		else
-			lowMemory = false;
-		***/
-			
-		Log::log(LogScavenge, "%d: Scavenged %d records, " I64FORMAT " bytes in %d seconds\n", 
-					deltaTime, recordScavenge.recordsReclaimed, recordScavenge.spaceReclaimed, deltaTime - scavengeStart);
-			
-		total = recordScavenge.spaceRemaining;
 		}
-	else if ((total - lastRecordMemory) < recordScavengeThreshold / AGE_GROUPS)
-		{
-		recordScavenge.scavengeGeneration = UNDEFINED;
-		cleanupRecords (&recordScavenge);
+}
 
-		++scavengeCycle;
-				
+
+void Database::retireRecords(RecordScavenge *recordScavenge)
+{
+	// If we passed the upper limit, scavenge.
+
+	if (recordDataPool->activeMemory < recordScavengeThreshold)
 		return;
-		}
-	else
+
+	//LogStream stream;
+	//recordDataPool->analyze(0, &stream, NULL, NULL);
+
+	Sync syncTbl(&syncTables, "Database::retireRecords(2)");
+	syncTbl.lock(Shared);
+
+	uint64 spaceToRetire = recordDataPool->activeMemory - recordScavengeFloor;
+	recordScavenge->computeThreshold(spaceToRetire);
+
+	for (Table *table = tableList; table; table = table->next)
 		{
-		recordScavenge.scavengeGeneration = UNDEFINED;
-		cleanupRecords (&recordScavenge);
+		try
+			{
+			table->retireRecords(recordScavenge);
+			}
+		catch (SQLException &exception)
+			{
+			Log::debug ("Exception during scavenge of table %s.%s: %s\n",
+				table->schemaName, table->name, exception.getText());
+			}
 		}
-
-	++scavengeCycle;
-	
-	lastRecordMemory = recordDataPool->activeMemory;
-	INTERLOCKED_INCREMENT (currentGeneration);
 }
 
 void Database::ticker(void * database)
@@ -1892,6 +1902,42 @@ void Database::ticker()
 		}
 }
 
+void Database::scavengerThreadMain(void * database)
+{
+	((Database*) database)->scavengerThreadMain();
+}
+
+void Database::scavengerThreadMain(void)
+{
+	Thread *thread = Thread::getThread("Database::scavengerThreadMain");
+
+	thread->sleep(1000);
+	scavengerThreadSleeping = 0;
+	scavengerThreadSignaled = 0;
+	while (!thread->shutdownInProgress)
+		{
+		scavenge();
+		if (recordDataPool->activeMemory < recordScavengeThreshold)
+			{
+			INTERLOCKED_INCREMENT(scavengerThreadSleeping);
+			thread->sleep();
+			scavengerThreadSignaled = 0;
+			INTERLOCKED_DECREMENT(scavengerThreadSleeping);
+			}
+		}
+}
+
+void Database::scavengerThreadWakeup(void)
+{
+	if (scavengerThread)
+		{
+		scavengerThread->wake();
+		}
+		else
+		{
+		}
+}
+
 int Database::createSequence(int64 initialValue)
 {
 	Transaction *transaction = getSystemTransaction();
@@ -2169,25 +2215,6 @@ int Database::getMemorySize(const char *
 }
 
 
-void Database::cleanupRecords(RecordScavenge *recordScavenge)
-{
-	Sync sync (&syncTables, "Database::cleanupRecords");
-	sync.lock (Shared);
-
-	for (Table *table = tableList; table; table = table->next)
-		{
-		try
-			{
-			table->cleanupRecords(recordScavenge);
-			}
-		catch (SQLException &exception)
-			{
-			Log::debug ("Exception during cleanupRecords of table %s.%s: %s\n",
-					table->schemaName, table->name, exception.getText());
-			}
-		}
-}
-
 void Database::licenseCheck()
 {
 #ifdef LICENSE
@@ -2506,9 +2533,52 @@ void Database::setRecordScavengeFloor(in
 		}
 }
 
+void Database::checkRecordScavenge(void)
+{
+	// Signal a load-based scavenge if we are over the threshold
+	if (scavengerThreadSleeping && !scavengerThreadSignaled)
+		{
+		Sync syncMem(&syncMemory, "Database::checkRecordScavenge");
+		syncMem.lock(Exclusive);
+
+		if (!scavengerThreadSignaled && (recordDataPool->activeMemory > lastActiveMemoryChecked))
+			{
+			if ((recordDataPool->activeMemory - lastGenerationMemory) > recordScavengeMaxGroupSize)
+				{
+				// Start a new age generation regularly, except during a scavenge.
+				// Let the scavenger run to prune records.  
+				// It will retire records if above the threshold
+
+				INTERLOCKED_INCREMENT (currentGeneration);
+				lastGenerationMemory = recordDataPool->activeMemory;
+
+				INTERLOCKED_INCREMENT(scavengerThreadSignaled);
+				scavengerThreadWakeup();
+				}
+
+			else if (recordDataPool->activeMemory >= recordScavengeThreshold)
+				{
+				INTERLOCKED_INCREMENT(scavengerThreadSignaled);
+				scavengerThreadWakeup();
+				}
+
+			lastActiveMemoryChecked = recordDataPool->activeMemory;
+			}
+		}
+}
+
 void Database::forceRecordScavenge(void)
 {
-	retireRecords(true);
+	// This code needs to use scavengerThreadSignaled
+
+	Sync syncMem(&syncMemory, "Database::checkRecordScavenge");
+	syncMem.lock(Exclusive);
+
+	if (scavengerThreadSleeping && !scavengerThreadSignaled)
+		{
+		INTERLOCKED_INCREMENT(scavengerThreadSignaled);
+		scavengerThreadWakeup();
+		}
 }
 
 void Database::debugTrace(void)

=== modified file 'storage/falcon/Database.h'
--- a/storage/falcon/Database.h	2008-10-16 01:04:03 +0000
+++ b/storage/falcon/Database.h	2008-12-05 19:27:45 +0000
@@ -127,9 +127,10 @@ public:
 	Repository*		findRepository(const char *schema, const char *name);
 	const char*		fetchTemplate (JString applicationName, JString templateName, TemplateContext *context);
 	void			licenseCheck();
-	void			cleanupRecords (RecordScavenge *recordScavenge);
 	void			serverOperation (int op, Parameters *parameters);
-	void			retireRecords(bool forced);
+	void			scavengeRecords(void);
+	void			pruneRecords(RecordScavenge* recordScavenge);
+	void			retireRecords(RecordScavenge* recordScavenge);
 	int				getMemorySize (const char *string);
 	JString			analyze(int mask);
 	void			upgradeSystemTables();
@@ -148,6 +149,9 @@ public:
 	int				createSequence(int64 initialValue);
 	void			ticker();
 	static void		ticker (void *database);
+	static void		scavengerThreadMain(void * database);
+	void			scavengerThreadMain(void);
+	void			scavengerThreadWakeup(void);
 	void			scavenge();
 	void			validate (int optionMask);
 	Role*			findRole(const char *schemaName, const char * roleName);
@@ -220,6 +224,7 @@ public:
 	void			setRecordMemoryMax(uint64 value);
 	void			setRecordScavengeThreshold(int value);
 	void			setRecordScavengeFloor(int value);
+	void			checkRecordScavenge(void);
 	void			forceRecordScavenge(void);
 	void			debugTrace(void);
 	void			pageCacheFlushed(int64 flushArg);
@@ -270,6 +275,7 @@ public:
 	SyncObject			syncConnectionStatements;
 	SyncObject			syncScavenge;
 	SyncObject			syncSysDDL;
+	Mutex				syncMemory;
 	PriorityScheduler	*ioScheduler;
 	Threads				*threads;
 	Scheduler			*scheduler;
@@ -290,6 +296,9 @@ public:
 	SyncHandler			*syncHandler;
 	SearchWords			*searchWords;
 	Thread				*tickerThread;
+	Thread				*scavengerThread;
+	volatile INTERLOCK_TYPE	scavengerThreadSleeping;
+	volatile INTERLOCK_TYPE	scavengerThreadSignaled;
 	PageWriter			*pageWriter;
 	PreparedStatement	*updateCardinality;
 	MemMgr				*recordDataPool;
@@ -310,8 +319,11 @@ public:
 	volatile INTERLOCK_TYPE	currentGeneration;
 	uint64				recordMemoryMax;
 	uint64				recordScavengeThreshold;
+	uint64				recordScavengeMaxGroupSize;
 	uint64				recordScavengeFloor;
-	int64				lastRecordMemory;
+	uint64				recordPoolAllocCount;
+	uint64				lastGenerationMemory;
+	uint64				lastActiveMemoryChecked;
 	time_t				creationTime;
 	volatile time_t		lastScavenge;
 };

=== modified file 'storage/falcon/MemMgr.cpp'
--- a/storage/falcon/MemMgr.cpp	2008-11-03 00:34:05 +0000
+++ b/storage/falcon/MemMgr.cpp	2008-12-05 19:27:45 +0000
@@ -1209,3 +1209,10 @@ void MemMgr::validateBlock(MemBlock *blo
 			corrupt ("guard bytes overwritten");
 #endif
 }
+
+int MemMgr::blockSize(void *object)
+{
+	MemBlock *block = (MemBlock*) ((UCHAR*) object - OFFSET(MemBlock*, body));
+
+	return ABS(block->length);
+}

=== modified file 'storage/falcon/MemMgr.h'
--- a/storage/falcon/MemMgr.h	2008-10-31 15:42:42 +0000
+++ b/storage/falcon/MemMgr.h	2008-12-05 19:27:45 +0000
@@ -163,7 +163,7 @@ public:
 	static void		release(void* block);
 	static void		validate(void *object);
 	static void		validateBlock (void *object);
-	
+	static int		blockSize(void *object);
 };
 
 #endif

=== modified file 'storage/falcon/Record.cpp'
--- a/storage/falcon/Record.cpp	2008-10-29 23:25:13 +0000
+++ b/storage/falcon/Record.cpp	2008-12-05 19:27:45 +0000
@@ -22,6 +22,7 @@
 #include "Engine.h"
 #include "Record.h"
 #include "RecordVersion.h"
+#include "RecordScavenge.h"
 #include "Value.h"
 #include "Transaction.h"
 #include "Format.h"
@@ -36,6 +37,7 @@
 #include "EncodedRecord.h"
 #include "Field.h"
 #include "Serialize.h"
+#include "MemMgr.h"
 
 #undef new
 
@@ -484,10 +486,23 @@ bool Record::isVersion()
 	return false;
 }
 
-
-bool Record::scavenge(RecordScavenge *recordScavenge, LockType lockType)
+bool Record::retire(RecordScavenge *recordScavenge)
 {
-	return true;
+	if (generation <= recordScavenge->scavengeGeneration)
+		{
+		recordScavenge->spaceRetired += getMemUsage();
+		++recordScavenge->recordsRetired;
+#ifdef CHECK_RECORD_ACTIVITY
+		active = false;
+#endif
+		release();
+		return true;
+		}
+
+	++recordScavenge->recordsRemaining;
+	recordScavenge->spaceRemaining += getMemUsage();
+
+	return false;
 }
 
 void Record::scavenge(TransId targetTransactionId, int oldestActiveSavePointId)
@@ -918,6 +933,10 @@ char* Record::allocRecordData(int length
 	for (int n = 0;; ++n)
 		try
 			{
+			if (format && format->table)
+				if (++format->table->database->recordPoolAllocCount & 0xFF)
+					format->table->database->checkRecordScavenge();
+
 			return POOL_NEW(format->table->database->recordDataPool) char[length];
 			}
 		catch (SQLException& exception)
@@ -960,6 +979,18 @@ int Record::getSize(void)
 	return sizeof(*this);
 }
 
+int Record::getDataMemUsage(void)
+{
+	return (data.record == NULL ? 0 : MemMgr::blockSize(data.record));
+}
+
+int Record::getMemUsage(void)
+{
+	int objectSize = MemMgr::blockSize(this);
+	return objectSize + getDataMemUsage();
+}
+
+
 SyncObject* Record::getSyncPrior(void)
 {
 	return format->table->getSyncPrior(this);

=== modified file 'storage/falcon/Record.h'
--- a/storage/falcon/Record.h	2008-10-29 23:25:13 +0000
+++ b/storage/falcon/Record.h	2008-12-05 19:27:45 +0000
@@ -77,7 +77,7 @@ public:
 	virtual void	setSuperceded (bool flag);
 	virtual Record*	fetchVersion (Transaction * transaction);
 	virtual Record*	fetchVersionRecursive (Transaction *transaction);
-	virtual bool	scavenge(RecordScavenge *recordScavenge, LockType lockType);
+	virtual bool	retire(RecordScavenge *recordScavenge);
 	virtual void	scavenge(TransId targetTransactionId, int oldestActiveSavePointId);
 	virtual bool	isVersion();
 	virtual bool	isSuperceded();
@@ -115,6 +115,8 @@ public:
 	void			validateData(void);
 	char*			allocRecordData(int length);
 	void			expungeRecord(void);
+	int				getDataMemUsage(void);
+	int				getMemUsage(void);
 	
 	Record (Table *table, Format *recordFormat);
 	Record (Table *table, int32 recordNumber, Stream *stream);

=== modified file 'storage/falcon/RecordGroup.cpp'
--- a/storage/falcon/RecordGroup.cpp	2008-10-16 02:40:08 +0000
+++ b/storage/falcon/RecordGroup.cpp	2008-12-05 19:27:45 +0000
@@ -110,9 +110,8 @@ bool RecordGroup::store(Record * record,
 	return section->store(record, prior, id % base, NULL);
 }
 
-int RecordGroup::retireRecords(Table *table, int base, RecordScavenge *recordScavenge)
+void RecordGroup::pruneRecords(Table *table, int base, RecordScavenge *recordScavenge)
 {
-	int count = 0;
 	int recordNumber = base * RECORD_SLOTS;
 
 	for (RecordSection **ptr = records, **end = records + RECORD_SLOTS; ptr < end; ++ptr, ++recordNumber)
@@ -120,23 +119,22 @@ int RecordGroup::retireRecords(Table *ta
 		RecordSection *section = *ptr;
 		
 		if (section)
-			{
-			int n = section->retireRecords(table, recordNumber, recordScavenge);
-			count += n;
-			
-			/***
-			if (n)
-				count += n;
-			else
-				{
-				delete section;
-				*ptr = NULL;
-				}
-			***/
-			}
+			section->pruneRecords(table, recordNumber, recordScavenge);
 		}
+}
 
-	return count;
+void RecordGroup::retireRecords(Table *table, int base, RecordScavenge *recordScavenge)
+{
+	int count = 0;
+	int recordNumber = base * RECORD_SLOTS;
+
+	for (RecordSection **ptr = records, **end = records + RECORD_SLOTS; ptr < end; ++ptr, ++recordNumber)
+		{
+		RecordSection *section = *ptr;
+		
+		if (section)
+			section->retireRecords(table, recordNumber, recordScavenge);
+		}
 }
 
 int RecordGroup::countActiveRecords()

=== modified file 'storage/falcon/RecordGroup.h'
--- a/storage/falcon/RecordGroup.h	2008-10-16 02:40:08 +0000
+++ b/storage/falcon/RecordGroup.h	2008-12-05 19:27:45 +0000
@@ -35,11 +35,12 @@ public:
 
 	virtual int     countActiveRecords();
 	virtual bool    anyActiveRecords();
-	virtual	int    chartActiveRecords(int *chart);
+	virtual int     chartActiveRecords(int *chart);
 	virtual bool    store (Record *record, Record *prior, int32 id, RecordSection **parentPtr);
 	virtual void    inventoryRecords(RecordScavenge* recordScavenge);
 	virtual Record* fetch (int32 id);
-	virtual int     retireRecords(Table *table, int base, RecordScavenge *recordScavenge);
+	virtual void    pruneRecords(Table *table, int base, RecordScavenge *recordScavenge);
+	virtual void    retireRecords(Table *table, int base, RecordScavenge *recordScavenge);
 	virtual bool    retireSections(Table * table, int id);
 	virtual bool    inactive();
 	

=== modified file 'storage/falcon/RecordLeaf.cpp'
--- a/storage/falcon/RecordLeaf.cpp	2008-10-16 02:40:08 +0000
+++ b/storage/falcon/RecordLeaf.cpp	2008-12-05 19:27:45 +0000
@@ -114,115 +114,101 @@ bool RecordLeaf::store(Record *record, R
 	return true;
 }
 
-int RecordLeaf::retireRecords (Table *table, int base, RecordScavenge *recordScavenge)
+// Prune old invisible record versions from the end of record chains.
+// The visible versions at the front of the list are kept.
+
+void RecordLeaf::pruneRecords (Table *table, int base, RecordScavenge *recordScavenge)
 {
-	int count = 0;
 	Record **ptr, **end;
-	Sync sync(&syncObject, "RecordLeaf::retireRecords(syncObject)");
+
+	// Get a shared lock since we are just traversing the tree.  
+	// pruneRecords does not empty any slots in a record leaf.
+
+	Sync sync(&syncObject, "RecordLeaf::pruneRecords(syncObject)");
 	sync.lock(Shared);
-	
-	// Get a shared lock to find at least one record to scavenge
-	// If scavengeGeneration == UNDEFINED then just count the records in the leaf.
-	
+
+	// Inventory each record slot on this leaf.
+
 	for (ptr = records, end = records + RECORD_SLOTS; ptr < end; ++ptr)
 		{
 		Record *record = *ptr;
 		
 		if (record)
 			{
-			if (recordScavenge->scavengeGeneration == UNDEFINED)
-				++count;
-			else if (record->isVersion())
+			Record* visible = recordScavenge->inventoryRecord(record);
+
+			// Prune invisible records.
+
+			if (visible)
 				{
-				Sync syncPrior(record->getSyncPrior(), "RecordLeaf::retireRecords(prior)");
-				syncPrior.lock(Shared);
-	
-				if (record->scavenge(recordScavenge, Shared))
-					break;
-				else
-					++count;
+				Record *prior = visible->clearPriorVersion();
+
+				for (Record *prune = prior; prune; prune = prune->getPriorVersion())
+					{
+					recordScavenge->recordsPruned++;
+					recordScavenge->spacePruned += prune->getMemUsage();
+					}
+
+				if (prior)
+					{
+#ifdef CHECK_RECORD_ACTIVITY
+					prior->active = false;
+#endif
+					table->garbageCollect(prior, record, NULL, false);
+					prior->release();
+					}
 				}
-			else if (   record->generation <= recordScavenge->scavengeGeneration
-			         && record->useCount == 1)
-				break;
-			else
-				++count;
 			}
 		}
+}
+
+void RecordLeaf::retireRecords (Table *table, int base, RecordScavenge *recordScavenge)
+{
+	int count = 0;
+	Record **ptr, **end;
+
+	Sync sync(&syncObject, "RecordLeaf::retireRecords(syncObject)");
+	sync.lock(Shared);
+
+	// Get a shared lock to find at least one record to scavenge
+
+	for (ptr = records, end = records + RECORD_SLOTS; ptr < end; ++ptr)
+		{
+		Record *record = *ptr;
+
+		if (record && recordScavenge->canBeRetired(record))
+			break;
+		}
 
 	if (ptr >= end)
-		return count;
-	
-	// Get an exclusive lock and do the actual scavenging
-	
+		return;
+
+	// We can retire at least one record from this leaf;
+	// Get an exclusive lock and retire as many as possible.
+
 	sync.unlock();
 	sync.lock(Exclusive);
-	count = 0;
-	
+
 	for (ptr = records; ptr < end; ++ptr)
 		{
 		Record *record = *ptr;
 		
-		if (record)
+		if (record && recordScavenge->canBeRetired(record))
 			{
-			if (record->isVersion())
-				{
-				Sync syncPrior(record->getSyncPrior(), "RecordLeaf::retireRecords(3)");
-				syncPrior.lock(Exclusive);
-				
-				if (record->scavenge(recordScavenge, Exclusive))
-					{
-					*ptr = NULL;
-					recordScavenge->spaceReclaimed += record->size;
-					++recordScavenge->recordsReclaimed;
-#ifdef CHECK_RECORD_ACTIVITY
-					record->active = false;
-#endif
-					if (record->state == recDeleted)
-						record->expungeRecord();
-
-					record->release();
-					}
-				else
-					{
-					++recordScavenge->recordsRemaining;
-					recordScavenge->spaceRemaining += record->size;
-					++count;
-					}
-				}
-			else if (   (record->generation <= recordScavenge->scavengeGeneration)
-			         && (record->useCount == 1))
-				{
+			if (record->retire(recordScavenge))
 				*ptr = NULL;
-				recordScavenge->spaceReclaimed += record->size;
-				++recordScavenge->recordsReclaimed;
-#ifdef CHECK_RECORD_ACTIVITY
-				record->active = false;
-#endif
-				record->release();
-				}
 			else
-				{
-				++recordScavenge->recordsRemaining;
-				recordScavenge->spaceRemaining += record->size;
-				++count;
-				
-				for (Record *prior = record->getPriorVersion(); prior; prior = prior->getPriorVersion())
-					{
-					++recordScavenge->versionsRemaining;
-					recordScavenge->spaceRemaining += prior->size;
-					}
-				}
+				count++;
 			}
 		}
-		
+
 	// If this node is empty, store the base record number for use as an
 	// identifier when the leaf node is scavenged later.
-	
+
 	if (!count && table->emptySections)
 		table->emptySections->set(base);
 
-	return count;
+	return;
 }
 
 bool RecordLeaf::retireSections(Table * table, int id)

=== modified file 'storage/falcon/RecordLeaf.h'
--- a/storage/falcon/RecordLeaf.h	2008-10-16 02:40:08 +0000
+++ b/storage/falcon/RecordLeaf.h	2008-12-05 19:27:45 +0000
@@ -34,7 +34,8 @@ public:
 	virtual int  countActiveRecords();
 	virtual bool anyActiveRecords();
 	virtual int  chartActiveRecords(int *chart);
-	virtual int  retireRecords(Table *table, int id, RecordScavenge *recordScavenge);
+	virtual void pruneRecords(Table *table, int id, RecordScavenge *recordScavenge);
+	virtual void retireRecords(Table *table, int id, RecordScavenge *recordScavenge);
 	virtual bool retireSections(Table * table, int id);
 	virtual bool inactive();
 	virtual bool store(Record *record, Record *prior, int32 id,RecordSection **parentPtr);

=== modified file 'storage/falcon/RecordScavenge.cpp'
--- a/storage/falcon/RecordScavenge.cpp	2008-10-29 23:25:13 +0000
+++ b/storage/falcon/RecordScavenge.cpp	2008-12-05 19:27:45 +0000
@@ -18,90 +18,275 @@
 #include "RecordScavenge.h"
 #include "Database.h"
 #include "Record.h"
+#include "RecordVersion.h"
 #include "Log.h"
 #include "MemMgr.h"
 #include "Sync.h"
 
-
-RecordScavenge::RecordScavenge(Database *db, TransId oldestTransaction, bool wasForced)
+RecordScavenge::RecordScavenge(Database *db, uint64 whichCycle, TransId oldestTransaction, uint64 activeMemory)
 {
 	database = db;
-	transactionId = oldestTransaction;
-	forced = wasForced;
-	baseGeneration = database->currentGeneration;
+	cycle = database->scavengeCycle;
+	oldestActive = oldestTransaction;
+
 	memset(ageGroups, 0, sizeof(ageGroups));
-	recordsReclaimed = 0;
+	veryOldRecords = 0;
+	veryOldRecordSpace = 0;
+
+	startingActiveMemory = db->recordDataPool->activeMemory;
+	prunedActiveMemory = 0;
+	retiredActiveMemory = 0;
+
+	pruneStop = 0;
+	retireStop = 0;
+
+	baseGeneration = database->currentGeneration;
+	scavengeGeneration = 0;
+
+	// Results of Scavenging
+	recordsPruned = 0;
+	spacePruned = 0;
+	recordsRetired = 0;
+	spaceRetired = 0;
 	recordsRemaining = 0;
-	versionsRemaining = 0;
-	spaceReclaimed = 0;
 	spaceRemaining = 0;
-	overflowSpace = 0;
-	numberRecords = 0;
-	recordSpace = 0;
+
+	// Results of the inventory
+	totalRecords = 0;
+	totalRecordSpace = 0;
+	pruneableRecords = 0;
+	pruneableSpace = 0;
+	retireableRecords = 0;
+	retireableSpace = 0;
+	unScavengeableRecords = 0;
+	unScavengeableSpace = 0;
 }
 
 RecordScavenge::~RecordScavenge(void)
 {
 }
 
-void RecordScavenge::inventoryRecord(Record* record)
+bool RecordScavenge::canBeRetired(Record* record)
 {
+	// Check if this record can be retired
+
+	if (record->generation <= scavengeGeneration)
+		{
+		// Record objects are read from pages
+
+		if (!record->isVersion())
+			return true;
+
+		RecordVersion * recVer = (RecordVersion *) record;
+		ASSERT(!recVer->superceded);  // Must be the base record
+
+		// This record version may be retired if 
+		// it is currently not pointed to by a transaction
+
+		if (!recVer->transaction)
+			return true;
+		}
+
+	return false;
+}
+
+// Take an inventory of every record in this record chain.
+// If there are any old invisible records at the end of the chain,
+// return a pointer to the oldest visible record.
+
+Record* RecordScavenge::inventoryRecord(Record* record)
+{
+	Record *oldestVisibleRec = NULL;
+
 	Sync syncPrior(record->getSyncPrior(), "RecordScavenge::inventoryRecord");
 	syncPrior.lock(Shared);
 
 	for (Record *rec = record; rec; rec = rec->getPriorVersion())
 		{
-		++numberRecords;
-		recordSpace += record->size;
-		uint64 age = baseGeneration - record->generation;
-		int size = record->size + sizeof(MemBigHeader);
-		
-		if (record->hasRecord(false) || (record->state == recChilled))
-			size += sizeof(MemBigHeader);
+		int scavengeType = CANNOT_SCAVENGE;  // Initial value
+
+		++totalRecords;
+		int size = rec->getMemUsage();
+		totalRecordSpace += size;
+
+		// Check if this record can be scavenged somehow
+
+		if (rec->isVersion())
+			{
+			RecordVersion * recVer = (RecordVersion *) rec;
+
+			// This record may be retired if 
+			// 1) it is the base record
+			// 2) it is currently not pointed to by a transaction
+			// Note: Other pointers can come and go to a base record between now
+			// and when scavenge tries to retire it.  But let's just inventory 
+			// how much we can retire right now.
+
+			if (recVer == record && !recVer->transaction)
+				scavengeType = CAN_BE_RETIRED;
 			
-		if (age != UNDEFINED && age < AGE_GROUPS)
-			ageGroups[age] += size;
-		else if (age >= AGE_GROUPS)
-			overflowSpace += size;
+			// Look for the oldest visible record, which is the record that
+			// was committed when the oldest active transaction started.
+			// Use transaction == NULL to tell if the old trans has no dependencies.
+
+			if (oldestVisibleRec)
+				scavengeType = CAN_BE_PRUNED;
+			else if (  !recVer->transaction 
+				     && recVer->transactionId < oldestActive)
+				oldestVisibleRec = rec;
+			}
+		else if (oldestVisibleRec)
+			scavengeType = CAN_BE_PRUNED;
 		else
-			ageGroups[0] = size;
+			scavengeType = CAN_BE_RETIRED;
+
+		// Add up the scavengeable space.
+
+		switch (scavengeType)
+			{
+			case CAN_BE_PRUNED:
+				pruneableRecords++;
+				pruneableSpace += size;
+				break;
+
+			case CAN_BE_RETIRED:
+				retireableRecords++;
+				retireableSpace += size;
+				break;
+
+			default:
+				unScavengeableRecords++;
+				unScavengeableSpace += size;
+			}
+
+		// Only base records can be retired
+
+		if (rec == record)
+			{
+			int64 age = (int64) baseGeneration - (int64) rec->generation;
+
+			if (age < 0)
+				ageGroups[0] += size;
+			else if (age < 1)
+				ageGroups[0] += size;
+			else if (age < AGE_GROUPS)
+				ageGroups[age] += size;
+			else	// age >= AGE_GROUPS
+				{
+				veryOldRecords++;
+				veryOldRecordSpace += size;
+				}
+			}
+
 		}
+
+	return oldestVisibleRec;
 }
 
-uint64 RecordScavenge::computeThreshold(uint64 target)
+uint64 RecordScavenge::computeThreshold(uint64 spaceToRetire)
 {
-	totalSpace = 0;
+	uint64 totalSpace = veryOldRecordSpace;
 	scavengeGeneration = 0;
-	
-	for (uint64 n = 0; n < AGE_GROUPS; ++n)
+
+	// The baseGeneration is the currentGeneration when the scavenge started
+	// It is in ageGroups[0].  Next oldest in ageGroups[1], etc.
+	// Find the youngest generation to start scavenging.
+	// Scavenge that scavengeGeneration and older.
+
+	for (int n = AGE_GROUPS - 1; n && !scavengeGeneration; n--)
 		{
 		totalSpace += ageGroups[n];
-		
-		if (totalSpace >= target && scavengeGeneration == 0)
+
+		if (totalSpace >= spaceToRetire)
 			scavengeGeneration = baseGeneration - n;
 		}
 
-	totalSpace += overflowSpace;
-
-	if (forced || (scavengeGeneration == 0 && totalSpace > target))
-		scavengeGeneration = baseGeneration + AGE_GROUPS;
-	
 	return scavengeGeneration;
 }
 
-void RecordScavenge::printRecordMemory(void)
+void RecordScavenge::print(void)
 {
-	Log::debug ("Record Memory usage for %s:\n", (const char*) database->name);
+	Log::log(LogScavenge, "=== Scavenge Cycle " I64FORMAT " - %s - %d seconds\n",
+		cycle, (const char*) database->name, (int) (retireStop - scavengeStart));	// time_t difference narrowed to match %d
+
+	if (!recordsPruned && !recordsRetired)
+		return;
+
 	uint64 max;
 
+	// Find the maximum age group represented
+
 	for (max = AGE_GROUPS - 1; max > 0; --max)
 		if (ageGroups[max])
 			break;
 
+	Log::log (LogScavenge,"Cycle=" I64FORMAT 
+		"  Base Generation=" I64FORMAT 
+		"  Scavenge Generation=" I64FORMAT "\n", 
+		cycle, baseGeneration, scavengeGeneration);
+	Log::log (LogScavenge,"Cycle=" I64FORMAT 
+		"  Oldest Active Transaction=%d\n", 
+		// Threshold and Floor are reported by the following statement
+		cycle, oldestActive);
+	Log::log (LogScavenge,"Cycle=" I64FORMAT 
+		"  Threshold=" I64FORMAT 
+		"  Floor=" I64FORMAT 
+		"  Now=" I64FORMAT "\n", 
+		cycle, database->recordScavengeThreshold, 
+		database->recordScavengeFloor,
+		retiredActiveMemory );
 	for (uint64 n = 0; n <= max; ++n)
 		if (ageGroups [n])
-			Log::debug ("  %d. %d\n", baseGeneration - n, ageGroups[n]);
+			Log::log (LogScavenge,"Cycle=" I64FORMAT 
+				"  Age=" I64FORMAT "  Size=" I64FORMAT "\n", 
+				cycle, baseGeneration - n, ageGroups[n]);
+	Log::log (LogScavenge,"Cycle=" I64FORMAT 
+		"  Very Old Records=" I64FORMAT " Size=" I64FORMAT "\n", 
+		cycle, veryOldRecords, veryOldRecordSpace);
+
+	// Results of the inventory
+
+	Log::log (LogScavenge,"Cycle=" I64FORMAT 
+		"  Inventory; Total records=" I64FORMAT " containing " I64FORMAT " bytes\n", 
+		cycle, totalRecords, totalRecordSpace);
+	Log::log (LogScavenge,"Cycle=" I64FORMAT 
+		"  Inventory; Pruneable records=" I64FORMAT " containing " I64FORMAT " bytes\n", 
+		cycle, pruneableRecords, pruneableSpace);
+	Log::log (LogScavenge,"Cycle=" I64FORMAT 
+		"  Inventory; Retireable records=" I64FORMAT " containing " I64FORMAT " bytes\n", 
+		cycle, retireableRecords, retireableSpace);
+	Log::log (LogScavenge,"Cycle=" I64FORMAT 
+		"  Inventory; unScavengeable records=" I64FORMAT " containing " I64FORMAT " bytes\n", 
+		cycle, unScavengeableRecords, unScavengeableSpace);
+
+	// Results of the Scavenge Cycle;
+
+	Log::log(LogScavenge, "Cycle=" I64FORMAT 
+		"  Results; Pruned " I64FORMAT " records, " I64FORMAT 
+		" bytes in %d seconds\n", 
+		cycle, recordsPruned, spacePruned, (int) (pruneStop - scavengeStart));	// time_t difference narrowed to match %d
+	Log::log(LogScavenge, "Cycle=" I64FORMAT 
+		"  Results; Retired " I64FORMAT " records, " I64FORMAT 
+		" bytes in %d seconds\n", 
+		cycle, recordsRetired, spaceRetired, (int) (retireStop - pruneStop));	// time_t difference narrowed to match %d
+
+	if (!recordsRetired)
+		{
+		recordsRemaining = totalRecords - recordsPruned;
+		spaceRemaining = totalRecordSpace - spacePruned;
+		}
 
-	Log::log(LogScavenge, " total: " I64FORMAT ", threshold %d%s\n", totalSpace, scavengeGeneration,
-				(scavengeGeneration > 0) ? " -- scavenge" : "");
+	Log::log(LogScavenge, "Cycle=" I64FORMAT 
+		"  Results; Remaining " I64FORMAT 
+		" Records, " I64FORMAT " remaining bytes\n", 
+		cycle, recordsRemaining, spaceRemaining);
+	Log::log (LogScavenge,"Cycle=" I64FORMAT 
+		"  Results; Active memory at Scavenge Start=" I64FORMAT "\n", 
+		cycle, startingActiveMemory);
+	Log::log (LogScavenge,"Cycle=" I64FORMAT 
+		"  Results; Active memory after Pruning Records=" I64FORMAT "\n", 
+		cycle, prunedActiveMemory);
+	Log::log (LogScavenge,"Cycle=" I64FORMAT 
+		"  Results; Active memory after Retiring Records=" I64FORMAT "\n", 
+		cycle, retiredActiveMemory );
 }

=== modified file 'storage/falcon/RecordScavenge.h'
--- a/storage/falcon/RecordScavenge.h	2008-05-09 19:58:50 +0000
+++ b/storage/falcon/RecordScavenge.h	2008-12-05 19:27:45 +0000
@@ -16,37 +16,62 @@
 #ifndef _RECORD_SCAVENGE_H_
 #define _RECORD_SCAVENGE_H_
 
-static const uint64 AGE_GROUPS = 20;
+static const uint64 AGE_GROUPS = 100;
+static const uint64 AGE_GROUPS_IN_CACHE = 20;
 static const uint64 UNDEFINED = -1;
 
+static const int CANNOT_SCAVENGE = 1;
+static const int CAN_BE_RETIRED  = 2;
+static const int CAN_BE_PRUNED   = 3;
+
 class Database;
 class Record;
 
 class RecordScavenge
 {
 public:
+	RecordScavenge(Database *db, uint64 whichCycle, TransId oldestTransaction, uint64 activeMemory);
+	~RecordScavenge(void);
+
+	bool     canBeRetired(Record* record);
+	Record*  inventoryRecord(Record* record);
+	uint64   computeThreshold(uint64 spaceToRetire);
+	void     print(void);
+
 	Database	*database;
-	TransId		transactionId;
-	uint64		scavengeGeneration;
+	TransId		oldestActive;
+	uint64		cycle;
+	uint64		startingActiveMemory;
+	uint64		prunedActiveMemory;
+	uint64		retiredActiveMemory;
+
+	time_t		scavengeStart;
+	time_t		pruneStop;
+	time_t		retireStop;
+
 	uint64		baseGeneration;
-	uint		recordsReclaimed;
-	uint		recordsRemaining;
-	uint		numberRecords;
-	uint		versionsRemaining;
-	uint64		spaceReclaimed;
+	uint64		scavengeGeneration;
+
+	// Results of Scavenging
+	uint64		recordsPruned;
+	uint64		spacePruned;
+	uint64		recordsRetired;
+	uint64		spaceRetired;
+	uint64		recordsRemaining;
 	uint64		spaceRemaining;
-	uint64		ageGroups[AGE_GROUPS];
-	uint64		overflowSpace;
-	uint64		totalSpace;
-	uint64		recordSpace;
-	bool		forced;
-	
-	RecordScavenge(Database *db, TransId oldestTransaction, bool wasForced);
-	~RecordScavenge(void);
 
-	void		inventoryRecord(Record* record);
-	uint64		computeThreshold(uint64 target);
-	void		printRecordMemory(void);
+	// Results of the inventory
+	uint64		totalRecords;
+	uint64		totalRecordSpace;
+	uint64		pruneableRecords;
+	uint64		pruneableSpace;
+	uint64		retireableRecords;
+	uint64		retireableSpace;
+	uint64		unScavengeableRecords;
+	uint64		unScavengeableSpace;
+	uint64		ageGroups[AGE_GROUPS];
+	uint64		veryOldRecords;
+	uint64		veryOldRecordSpace;
 };
 
 #endif

=== modified file 'storage/falcon/RecordSection.h'
--- a/storage/falcon/RecordSection.h	2008-10-16 04:48:40 +0000
+++ b/storage/falcon/RecordSection.h	2008-12-05 19:27:45 +0000
@@ -36,15 +36,16 @@ class RecordSection  
 public:
 	virtual bool retireSections(Table * table, int id) = 0;
 	virtual bool inactive() = 0;
-	virtual		~RecordSection();
+	virtual      ~RecordSection();
 	
-	virtual	Record* fetch (int32 id) = 0;
-	virtual	bool    store (Record *record, Record *prior, int32 id, RecordSection **parentPtr) = 0;
-	virtual	int     retireRecords(Table *table, int base, RecordScavenge *recordScavenge) = 0;
-	virtual	void    inventoryRecords(RecordScavenge* recordScavenge) = 0;
-	virtual	int     countActiveRecords() = 0;
-	virtual	bool    anyActiveRecords() = 0;
-	virtual	int     chartActiveRecords(int *chart) = 0;
+	virtual Record* fetch (int32 id) = 0;
+	virtual bool    store (Record *record, Record *prior, int32 id, RecordSection **parentPtr) = 0;
+	virtual void    pruneRecords (Table *table, int base, RecordScavenge *recordScavenge) = 0;
+	virtual void    retireRecords(Table *table, int base, RecordScavenge *recordScavenge) = 0;
+	virtual void    inventoryRecords(RecordScavenge* recordScavenge) = 0;
+	virtual int     countActiveRecords() = 0;
+	virtual bool    anyActiveRecords() = 0;
+	virtual int     chartActiveRecords(int *chart) = 0;
 
 	int32			base;
 };

=== modified file 'storage/falcon/RecordVersion.cpp'
--- a/storage/falcon/RecordVersion.cpp	2008-11-07 01:09:04 +0000
+++ b/storage/falcon/RecordVersion.cpp	2008-12-05 19:27:45 +0000
@@ -180,66 +180,37 @@ void RecordVersion::commit()
 	transaction = NULL;
 }
 
-// Scavenge record versions by the scavenger thread.  Return true if the
-// record or any prior version of the record is a scavenge candidate.
-
-bool RecordVersion::scavenge(RecordScavenge *recordScavenge, LockType lockType)
+bool RecordVersion::retire(RecordScavenge *recordScavenge)
 {
-	// Scavenge criteria:
-	// 
-	// 1. Use count == 1 AND
-	// 2. Record Version is older than the record version that was visible
-	//    to the oldest active transaction AND
-	// 3. Either the record generation is older than the current generation
-	//    OR the scavenge is forced
-	//    OR there is no record data associated with the record version.
-
-	if (	useCount == 1
+	if (   generation <= recordScavenge->scavengeGeneration
+		&& useCount == 1
+		&& !priorVersion
 		&& !transaction
-		&& transactionId < recordScavenge->transactionId
-		&& (!hasRecord(false)
-			|| generation <= recordScavenge->scavengeGeneration
-			|| recordScavenge->forced))
+		&& transactionId < recordScavenge->oldestActive)
 		{
-		
-		// Expunge all record versions prior to this
+		recordScavenge->recordsRetired++;
+		recordScavenge->spaceRetired += getMemUsage();
+#ifdef CHECK_RECORD_ACTIVITY
+		active = false;
+#endif
+		if (state == recDeleted)
+			expungeRecord();
 
-		if (priorVersion && lockType == Exclusive)
-			format->table->expungeRecordVersions(this, recordScavenge);
-			
+		release();
 		return true;
 		}
-	else
-		{
-		 // Signal Table::cleanupRecords() that there is work to do
-		 
-		format->table->activeVersions = true;
 
-		// Scavenge criteria not met for this base record, so check prior versions.
-		
-		if (priorVersion && (recordScavenge->forced || recordScavenge->scavengeGeneration != UNDEFINED))
-			{
-			
-			// Scavenge prior record versions only if we have an exclusive lock on
-			// the record leaf. Return 'false' because the base record is not scavengable. 
-			
-			if (lockType == Exclusive)
-				priorVersion->scavenge(recordScavenge, lockType);
-			else
-
-				// Scan the prior record versions and return 'true' if a scavenge
-				// candidate is found.
-				
-				for (Record *rec = priorVersion; rec; rec = rec->getPriorVersion())
-					if (	rec->useCount == 1
-						&& !rec->getTransaction()
-						&& rec->getTransactionId() < recordScavenge->transactionId
-						&& (!rec->hasRecord(false)
-							|| rec->generation <= recordScavenge->scavengeGeneration))
-						return true;
-			}
+	// Cannot retire this record.  Add up remaining space.
+
+	recordScavenge->recordsRemaining++;
+	recordScavenge->spaceRemaining += getMemUsage();
+
+	for (Record *prior = getPriorVersion(); prior; prior = prior->getPriorVersion())
+		{
+		++recordScavenge->recordsRemaining;
+		recordScavenge->spaceRemaining += prior->getMemUsage();
 		}
-		return false;
+	return false;
 }
 
 // Scavenge record versions replaced within a savepoint.
@@ -320,6 +291,9 @@ bool RecordVersion::isSuperceded()
 
 Record* RecordVersion::clearPriorVersion(void)
 {
+	Sync syncPrior(getSyncPrior(), "RecordVersion::clearPriorVersion");
+	syncPrior.lock(Exclusive);
+
 	Record * prior = priorVersion;
 	priorVersion = NULL;
 	return prior;
@@ -367,7 +341,7 @@ int RecordVersion::thaw()
 	// Nothing to do if the record is no longer chilled
 	
 	if (state != recChilled)
-		return size;
+		return getDataMemUsage();
 		
 	// First, try to thaw from the serial log. If transaction->writePending is 
 	// true, then the record data can be restored from the serial log. If writePending

=== modified file 'storage/falcon/RecordVersion.h'
--- a/storage/falcon/RecordVersion.h	2008-10-29 23:25:13 +0000
+++ b/storage/falcon/RecordVersion.h	2008-12-05 19:27:45 +0000
@@ -42,7 +42,7 @@ public:
 	virtual void		setSuperceded (bool flag);
 	virtual Record*		getPriorVersion();
 	virtual Record*		getGCPriorVersion(void);
-	virtual bool		scavenge(RecordScavenge *recordScavenge, LockType lockType);
+	virtual bool		retire(RecordScavenge *recordScavenge);
 	virtual void		scavenge(TransId targetTransactionId, int oldestActiveSavePoint);
 	virtual bool		isVersion();
 	virtual void		rollback(Transaction *transaction);

=== modified file 'storage/falcon/Table.cpp'
--- a/storage/falcon/Table.cpp	2008-11-20 17:05:50 +0000
+++ b/storage/falcon/Table.cpp	2008-12-05 19:27:45 +0000
@@ -474,8 +474,6 @@ void Table::reformat()
 
 void Table::updateRecord (RecordVersion * record)
 {
-	activeVersions = true;
-	
 	FOR_OBJECTS (TableAttachment*, attachment, &attachments)
 		if (attachment->mask & POST_COMMIT)
 			attachment->postCommit(this, record);
@@ -841,7 +839,6 @@ void Table::init(int id, const char *sch
 	highWater = 0;
 	eof = false;
 	markedForDelete = false;
-	activeVersions = false;
 	primaryKey = NULL;
 	formats = NEW Format* [FORMAT_HASH_SIZE];
 	triggers = NULL;
@@ -857,7 +854,6 @@ void Table::init(int id, const char *sch
 	alterIsActive = false;
 	syncObject.setName("Table::syncObject");
 	syncTriggers.setName("Table::syncTriggers");
-	syncScavenge.setName("Table::syncScavenge");
 	syncAlter.setName("Table::syncAlter");
 	
 	for (int n = 0; n < SYNC_VERSIONS_SIZE; n++)
@@ -1256,8 +1252,6 @@ void Table::update(Transaction * transac
 	RecordVersion *record = NULL;
 	bool updated = false;
 	int recordNumber = oldRecord->recordNumber;
-	Sync scavenge(&syncScavenge, "Table::update(1)");
-	//scavenge.lock(Shared);
 	
 	try
 		{
@@ -1311,8 +1305,6 @@ void Table::update(Transaction * transac
 		
 		// Make insert/update atomic, then check for unique index duplicats
 
-
-		scavenge.lock(Shared);
 		validateAndInsert(transaction, record);
 		transaction->addRecord(record);
 		updated = true;
@@ -1476,7 +1468,6 @@ ForeignKey* Table::findForeignKey(Foreig
 void Table::deleteRecord(Transaction * transaction, Record * orgRecord)
 {
 	database->preUpdate();
-	Sync scavenge(&syncScavenge, "Table::deleteRecord");
 
 	// syncPrior is not needed here.  It is handled in fetchVersion()
 	Record *candidate = fetch(orgRecord->recordNumber);
@@ -1533,8 +1524,6 @@ void Table::deleteRecord(Transaction * t
 			attachment->preDelete(this, record);
 	END_FOR;
 
-	scavenge.lock(Shared);
-
 	if (wasLock)
 		{
 		record->state = recDeleted;
@@ -1860,58 +1849,61 @@ void Table::setView(View *viewObject)
 	view = viewObject;
 }
 
+// Prune old invisible records from this table and inventory the rest.
 
-int Table::retireRecords(RecordScavenge *recordScavenge)
+void Table::pruneRecords(RecordScavenge *recordScavenge)
 {
 	if (!records)
-		return 0;
+		return;
 
 	Sync syncObj(&syncObject, "Table::retireRecords");
 	syncObj.lock(Shared);
 
+	if (records)
+		records->pruneRecords(this, 0, recordScavenge);
+}
+
+void Table::retireRecords(RecordScavenge *recordScavenge)
+{
 	if (!records)
-		return 0;
-	
-	activeVersions = false;
+		return;
+
+	Sync syncObj(&syncObject, "Table::retireRecords");
+	syncObj.lock(Shared);
+
+	if (!records)
+		return;
+
 	emptySections->clear();
-	int count = records->retireRecords(this, 0, recordScavenge);
+	records->retireRecords(this, 0, recordScavenge);
+	syncObj.unlock();
+
+	// Get an exclusive lock only if there are empty leaf nodes. Find and
+	// delete the empty nodes using the stored record numbers as identifiers.
 
-	if (count == 0)
+	if (emptySections->count > 0)
 		{
-		syncObj.unlock();
 		syncObj.lock(Exclusive);
 
-		// Confirm that tree is still empty
+		// Delete these newly emptied RecordLeaf sections
 
-		count = records->countActiveRecords();
+		for (int sectionNumber = 0; (sectionNumber = emptySections->nextSet(0)) >= 0;)
+			{
+			int recordNumber = sectionNumber * RECORD_SLOTS;
+			records->retireSections(this, recordNumber);
+			emptySections->clear(sectionNumber);
+			}
 
-		if (count == 0)
+		// Check if there are any sections/active records left in this table.
+
+		if (!records->anyActiveRecords())
 			{
 			delete records;
 			records = NULL;
 			}
 		}
-	else
-		{
-		// Get an exclusive lock only if there are empty leaf nodes. Find and
-		// delete the empty nodes using the stored record numbers as identifiers.
-
-		if (emptySections->count > 0)
-			{
-			syncObj.unlock();
-			syncObj.lock(Exclusive);
 
-			for (int sectionNumber = 0; (sectionNumber = emptySections->nextSet(0)) >= 0;)
-				{
-				int recordNumber = sectionNumber * RECORD_SLOTS;
-				records->retireSections(this, recordNumber);
-				emptySections->clear(sectionNumber);
-				}
-				
-			}
-		}
-	
-	return count;
+	return;
 }
 
 void Table::inventoryRecords(RecordScavenge* recordScavenge)
@@ -1984,28 +1976,6 @@ bool Table::insert(Record * record, Reco
 	return false;
 }
 
-void Table::expungeRecordVersions(RecordVersion *record, RecordScavenge *recordScavenge)
-{
-	ASSERT(record->state != recLock);
-
-	Record *prior = record->clearPriorVersion();
-	
-	if (recordScavenge)
-		for (Record *rec = prior; rec; rec = rec->getPriorVersion())
-			{
-			++recordScavenge->recordsReclaimed;
-			recordScavenge->spaceReclaimed += record->size;
-			}
-			
-#ifdef CHECK_RECORD_ACTIVITY
-	for (Record *rec = prior; rec; rec = rec->getPriorVersion())
-		rec->active = false;
-#endif
-			
-	garbageCollect(prior, record, NULL, false);
-	prior->release();
-}
-
 bool Table::duplicateBlob(Value * blob, int fieldId, Record * recordChain)
 {
 	bool isDuplicate = false;
@@ -2553,9 +2523,9 @@ bool Table::checkUniqueRecordVersion(int
 
 		if (!dup->hasRecord())
 			{
-			// If the record is a lock record, keep looking for a dup.
+			// If the record is locked or being unlocked keep looking for a dup.
 
-			if (dup->state == recLock)
+			if ((dup->state == recLock) || (dup->state == recUnlocked))
 				continue;  // Next record version.
 
 			// The record has been deleted.
@@ -2793,12 +2763,6 @@ void Table::rebuildIndex(Index *index, T
 }
 
 
-void Table::cleanupRecords(RecordScavenge *recordScavenge)
-{
-	if (activeVersions)
-		retireRecords(recordScavenge);
-}
-
 void Table::validateBlobs(int optionMask)
 {
 	Field *field;
@@ -3109,8 +3073,6 @@ void Table::update(Transaction * transac
 
 	RecordVersion *record = NULL;
 	bool updated = false;
-	Sync scavenge(&syncScavenge, "Table::update(2)");
-	//scavenge.lock(Shared);
 	
 	if (candidate->state == recLock && candidate->getTransaction() == transaction)
 		{
@@ -3159,7 +3121,6 @@ void Table::update(Transaction * transac
 		END_FOR;
 
 		//updateInversion(record, transaction);
-		scavenge.lock(Shared);
 		
 		if (record->state == recLock)
 			record->state = recData;
@@ -3654,36 +3615,48 @@ bool Table::setAlter(void)
 RecordVersion* Table::allocRecordVersion(Format* format, Transaction* transaction, Record* priorVersion)
 {
 	for (int n = 0;; ++n)
+		{
 		try
 			{
+			if ((++database->recordPoolAllocCount & 0xFF) == 0)	// check the scavenger only once per 256 allocations
+				database->checkRecordScavenge();
+
 			return POOL_NEW(database->recordDataPool) RecordVersion(this, format, transaction, priorVersion);
 			}
+
 		catch (SQLException& exception)
 			{
 			if (n > 2 || exception.getSqlcode() != OUT_OF_RECORD_MEMORY_ERROR)
 				throw;
-			
+
 			database->forceRecordScavenge();
 			}
-	
+		}
+
 	return NULL;
 }
 
 Record* Table::allocRecord(int recordNumber, Stream* stream)
 {
 	for (int n = 0;; ++n)
+		{
 		try
 			{
+			if ((++database->recordPoolAllocCount & 0xFF) == 0)	// check the scavenger only once per 256 allocations
+				database->checkRecordScavenge();
+
 			return POOL_NEW(database->recordDataPool) Record (this, recordNumber, stream);
 			}
+
 		catch (SQLException& exception)
 			{
 			if (n > 2 || exception.getSqlcode() != OUT_OF_RECORD_MEMORY_ERROR)
 				throw;
-			
+
 			database->forceRecordScavenge();
 			}
-	
+		}
+
 	return NULL;
 }
 

=== modified file 'storage/falcon/Table.h'
--- a/storage/falcon/Table.h	2008-11-20 05:32:18 +0000
+++ b/storage/falcon/Table.h	2008-12-05 19:27:45 +0000
@@ -96,9 +96,9 @@ public:
 	void		rebuildIndexes (Transaction *transaction, bool force = false);
 	void		collationChanged (Field *field);
 	void		validateBlobs (int optionMask);
-	void		cleanupRecords(RecordScavenge *recordScavenge);
 	void		rebuildIndex (Index *index, Transaction *transaction);
-	int			retireRecords (RecordScavenge *recordScavenge);
+	void		pruneRecords (RecordScavenge *recordScavenge);
+	void			retireRecords (RecordScavenge *recordScavenge);
 	int			countActiveRecords();
 	int			chartActiveRecords(int *chart);
 	bool		foreignKeyMember (ForeignKey *key);
@@ -139,7 +139,6 @@ public:
 	void		expungeBlob (Value *blob);
 	bool		duplicateBlob (Value *blob, int fieldId, Record *recordChain);
 	void		expungeRecord(int32 recordNumber);
-	void		expungeRecordVersions (RecordVersion *record, RecordScavenge *recordScavenge);
 	void		setView (View *view);
 	Index*		findIndex (const char *indexName);
 	virtual		PrivObject getPrivilegeType();
@@ -231,7 +230,6 @@ public:
 	Dbb				*dbb;
 	SyncObject		syncObject;
 	SyncObject		syncTriggers;
-	SyncObject		syncScavenge;
 	SyncObject		syncAlter;				// prevent concurrent Alter statements.
 	SyncObject		syncPriorVersions[SYNC_VERSIONS_SIZE];
 	SyncObject		syncThaw[SYNC_THAW_SIZE];
@@ -267,7 +265,6 @@ public:
 	bool			changed;
 	bool			eof;
 	bool			markedForDelete;
-	bool			activeVersions;
 	bool			alterIsActive;
 	bool			deleting;					// dropping or truncating.
 	int32			highWater;

=== modified file 'storage/falcon/Transaction.cpp'
--- a/storage/falcon/Transaction.cpp	2008-11-20 17:05:50 +0000
+++ b/storage/falcon/Transaction.cpp	2008-12-05 19:27:45 +0000
@@ -567,7 +567,7 @@ int Transaction::thaw(RecordVersion * re
 	// Nothing to do if record is no longer chilled
 	
 	if (record->state != recChilled)
-		return record->size;
+		return record->getDataMemUsage();
 		
 	// Get pointer to record data in serial log
 

=== modified file 'storage/falcon/Transaction.h'
--- a/storage/falcon/Transaction.h	2008-10-02 23:20:47 +0000
+++ b/storage/falcon/Transaction.h	2008-12-05 19:27:45 +0000
@@ -113,7 +113,6 @@ public:
 	void		releaseSavepoint(int savepointId);
 	void		releaseSavepoints(void);
 	void		rollbackSavepoint (int savepointId);
-	void		scavengeRecords(int ageGroup);
 	void		add(DeferredIndex* deferredIndex);
 	void		initialize(Connection* cnct, TransId seq);
 	bool		isXidEqual(int testLength, const UCHAR* test);

Thread
bzr commit into mysql-6.0-falcon-team branch (klewis:2927) Bug#34893,Bug#36700Kevin Lewis5 Dec