From: Frazer Clement Date: October 30 2012 1:32am Subject: bzr push into mysql-5.1-telco-6.3 branch (frazer.clement:3491 to 3492) Bug#14828998 List-Archive: http://lists.mysql.com/commits/145309 X-Bug: 14828998 Message-Id: <201210300132.q9U1WglV015231@acsmt358.oracle.com> MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit 3492 Frazer Clement 2012-10-29 Bug #14828998 NDB : SLOW FILESYSTEM CAN CAUSE DIH FILE PAGE EXHAUSTION Limit number of concurrent table definition updates that DIH can issue. This avoids a slow filesystem exerting pressure on DIH File page buffers which can lead to a crash if they're exhausted. Add a testcase showing the crash behaviour using error insert and showing that it fixes the problem. @ storage/ndb/test/ndbapi/testLimits.cpp 'Backport' of testLimits from 7.0. added: storage/ndb/src/kernel/vm/CountingSemaphore.hpp storage/ndb/test/ndbapi/testLimits.cpp modified: storage/ndb/include/kernel/signaldata/DumpStateOrd.hpp storage/ndb/src/kernel/blocks/ERROR_codes.txt storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp storage/ndb/src/kernel/blocks/dbdih/DbdihInit.cpp storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp storage/ndb/test/ndbapi/Makefile.am storage/ndb/test/run-test/daily-basic-tests.txt 3491 Martin Skold 2012-10-22 Ignore C4090 warnings (Windows) modified: support-files/compiler_warnings.supp === modified file 'storage/ndb/include/kernel/signaldata/DumpStateOrd.hpp' --- a/storage/ndb/include/kernel/signaldata/DumpStateOrd.hpp 2012-09-12 13:24:49 +0000 +++ b/storage/ndb/include/kernel/signaldata/DumpStateOrd.hpp 2012-10-29 18:34:05 +0000 @@ -156,6 +156,7 @@ public: // 7019 // 7020 // 7021 + DihDumpPageRecInfo = 7032, EnableUndoDelayDataWrite = 7080, // DIH+ACC+TUP DihSetTimeBetweenGcp = 7090, DihStartLcpImmediately = 7099, === modified file 'storage/ndb/src/kernel/blocks/ERROR_codes.txt' --- a/storage/ndb/src/kernel/blocks/ERROR_codes.txt 2011-10-28 13:49:12 +0000 +++ b/storage/ndb/src/kernel/blocks/ERROR_codes.txt 2012-10-29 18:34:05 +0000 @@ -20,7 +20,7 @@ Next DBACC 3002 Next DBTUP 4032 Next DBLQH 5074 Next DBDICT 6026 -Next DBDIH 7229 +Next DBDIH 7236 Next DBTC 8090 Next CMVMI 9000 Next BACKUP 10041 === modified file 'storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp' --- a/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp 2011-02-01 21:05:11 +0000 +++ b/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp 2012-10-29 18:34:05 +0000 @@ -28,6 +28,7 @@ #include #include #include +#include #ifdef DBDIH_C @@ -99,6 +100,8 @@ /* SIZES */ /*#########*/ #define ZPAGEREC 100 +#define MAX_CONCURRENT_LCP_TAB_DEF_FLUSHES 4 +#define MAX_CONCURRENT_DIH_TAB_DEF_OPS (MAX_CONCURRENT_LCP_TAB_DEF_FLUSHES + 2) #define ZCREATE_REPLICA_FILE_SIZE 4 #define ZPROXY_MASTER_FILE_SIZE 10 #define ZPROXY_FILE_SIZE 10 @@ -442,6 +445,7 @@ public: enum UpdateState { US_IDLE, US_LOCAL_CHECKPOINT, + US_LOCAL_CHECKPOINT_QUEUED, US_REMOVE_NODE, US_COPY_TAB_REQ, US_ADD_TABLE_MASTER, @@ -1485,6 +1489,11 @@ private: Uint32 c_set_initial_start_flag; Uint64 c_current_time; // Updated approx. every 10ms + /* Limit the number of concurrent table definition writes during LCP + * This avoids exhausting the DIH page pool + */ + CountingSemaphore c_lcpTabDefWritesControl; + public: enum LcpMasterTakeOverState { LMTOS_IDLE = 0, === modified file 'storage/ndb/src/kernel/blocks/dbdih/DbdihInit.cpp' --- a/storage/ndb/src/kernel/blocks/dbdih/DbdihInit.cpp 2011-02-01 21:05:11 +0000 +++ b/storage/ndb/src/kernel/blocks/dbdih/DbdihInit.cpp 2012-10-29 18:34:05 +0000 @@ -70,6 +70,8 @@ void Dbdih::initData() cntrlblockref = RNIL; c_set_initial_start_flag = FALSE; c_sr_wait_to = false; + + c_lcpTabDefWritesControl.init(MAX_CONCURRENT_LCP_TAB_DEF_FLUSHES); }//Dbdih::initData() void Dbdih::initRecords() === modified file 'storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp' --- a/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2011-10-28 13:49:12 +0000 +++ b/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2012-10-29 18:34:05 +0000 @@ -1169,6 +1169,14 @@ void Dbdih::execFSWRITECONF(Signal* sign break; case FileRecord::TABLE_WRITE: jam(); + if (ERROR_INSERTED(7235)) + { + jam(); + filePtr.p->reqStatus = status; + /* Suspend processing of WRITECONFs */ + sendSignalWithDelay(reference(), GSN_FSWRITECONF, signal, 1000, signal->getLength()); + return; + } tableWriteLab(signal, filePtr); break; default: @@ -11872,10 +11880,26 @@ void Dbdih::execLCP_FRAG_REP(Signal* sig */ tabPtr.p->tabLcpStatus = TabRecord::TLS_WRITING_TO_FILE; tabPtr.p->tabCopyStatus = TabRecord::CS_LCP_READ_TABLE; - tabPtr.p->tabUpdateState = TabRecord::US_LOCAL_CHECKPOINT; - signal->theData[0] = DihContinueB::ZPACK_TABLE_INTO_PAGES; - signal->theData[1] = tabPtr.i; - sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB); + + /** + * Check whether we should write immediately, or queue... + */ + if (c_lcpTabDefWritesControl.requestMustQueue()) + { + jam(); + //ndbout_c("DIH : Queueing tab def flush op on table %u", tabPtr.i); + /* Mark as queued - will be started when an already running op completes */ + tabPtr.p->tabUpdateState = TabRecord::US_LOCAL_CHECKPOINT_QUEUED; + } + else + { + /* Run immediately */ + jam(); + tabPtr.p->tabUpdateState = TabRecord::US_LOCAL_CHECKPOINT; + signal->theData[0] = DihContinueB::ZPACK_TABLE_INTO_PAGES; + signal->theData[1] = tabPtr.i; + sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB); + } bool ret = checkLcpAllTablesDoneInLqh(__LINE__); if (ret && ERROR_INSERTED(7209)) @@ -12597,12 +12621,48 @@ void Dbdih::tableCloseLab(Signal* signal case TabRecord::US_LOCAL_CHECKPOINT: jam(); releaseTabPages(tabPtr.i); - signal->theData[0] = DihContinueB::ZCHECK_LCP_COMPLETED; - sendSignal(reference(), GSN_CONTINUEB, signal, 1, JBB); tabPtr.p->tabCopyStatus = TabRecord::CS_IDLE; tabPtr.p->tabUpdateState = TabRecord::US_IDLE; tabPtr.p->tabLcpStatus = TabRecord::TLS_COMPLETED; + + /* Check whether there's some queued table definition flush op to start */ + if (c_lcpTabDefWritesControl.releaseMustStartQueued()) + { + jam(); + /* Some table write is queued - let's kick it off */ + /* First find it... + * By using the tabUpdateState to 'queue' operations, we lose + * the original flush request order, which shouldn't matter. + * In any case, the checkpoint proceeds by table id, as does this + * search, so a similar order should result + */ + TabRecordPtr tabPtr; + for (tabPtr.i = 0; tabPtr.i < ctabFileSize; tabPtr.i++) + { + ptrAss(tabPtr, tabRecord); + if (tabPtr.p->tabUpdateState == TabRecord::US_LOCAL_CHECKPOINT_QUEUED) + { + jam(); + //ndbout_c("DIH : Starting queued table def flush op on table %u", tabPtr.i); + tabPtr.p->tabUpdateState = TabRecord::US_LOCAL_CHECKPOINT; + signal->theData[0] = DihContinueB::ZPACK_TABLE_INTO_PAGES; + signal->theData[1] = tabPtr.i; + sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB); + return; + } + } + /* No queued table write found - error */ + ndbout_c("DIH : Error in queued table writes : inUse %u queued %u total %u", + c_lcpTabDefWritesControl.inUse, + c_lcpTabDefWritesControl.queuedRequests, + c_lcpTabDefWritesControl.totalResources); + ndbrequire(false); + } + jam(); + signal->theData[0] = DihContinueB::ZCHECK_LCP_COMPLETED; + sendSignal(reference(), GSN_CONTINUEB, signal, 1, JBB); + return; break; case TabRecord::US_REMOVE_NODE: @@ -16253,6 +16313,39 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal signal->theData[1] = ptr.i; } + if (arg == DumpStateOrd::DihDumpPageRecInfo) + { + jam(); + ndbout_c("MAX_CONCURRENT_LCP_TAB_DEF_FLUSHES %u", MAX_CONCURRENT_LCP_TAB_DEF_FLUSHES); + ndbout_c("MAX_CONCURRENT_DIH_TAB_DEF_OPS %u", MAX_CONCURRENT_DIH_TAB_DEF_OPS); + ndbout_c("MAX_CRASHED_REPLICAS %u", MAX_CRASHED_REPLICAS); + ndbout_c("MAX_LCP_STORED %u", MAX_LCP_STORED); + ndbout_c("MAX_REPLICAS %u", MAX_REPLICAS); + ndbout_c("MAX_NDB_PARTITIONS %u", MAX_NDB_PARTITIONS); +// ndbout_c("PACK_REPLICAS_WORDS %u", PACK_REPLICAS_WORDS); +// ndbout_c("PACK_FRAGMENT_WORDS %u", PACK_FRAGMENT_WORDS); +// ndbout_c("PACK_TABLE_WORDS %u", PACK_TABLE_WORDS); +// ndbout_c("PACK_TABLE_PAGE_WORDS %u", PACK_TABLE_PAGE_WORDS); +// ndbout_c("PACK_TABLE_PAGES %u", PACK_TABLE_PAGES); + ndbout_c("ZPAGEREC %u", ZPAGEREC); + ndbout_c("Total bytes : %lu", ZPAGEREC * sizeof(PageRecord)); + ndbout_c("LCP Tab def write ops inUse %u queued %u", + c_lcpTabDefWritesControl.inUse, + c_lcpTabDefWritesControl.queuedRequests); + Uint32 freeCount = 0; + PageRecordPtr tmp; + tmp.i = cfirstfreepage; + while (tmp.i != RNIL) + { + jam(); + ptrCheckGuard(tmp, cpageFileSize, pageRecord); + freeCount++; + tmp.i = tmp.p->nextfreepage; + }; + ndbout_c("Pages in use %u/%u", cpageFileSize - freeCount, cpageFileSize); + return; + } + DECLARE_DUMP0(DBDIH, 7213, "Set error 7213 with extra arg") { SET_ERROR_INSERT_VALUE2(7213, signal->theData[1]); === added file 'storage/ndb/src/kernel/vm/CountingSemaphore.hpp' --- a/storage/ndb/src/kernel/vm/CountingSemaphore.hpp 1970-01-01 00:00:00 +0000 +++ b/storage/ndb/src/kernel/vm/CountingSemaphore.hpp 2012-10-29 18:34:05 +0000 @@ -0,0 +1,161 @@ +/* + Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#ifndef COUNTING_SEMAPHORE_HPP +#define COUNTING_SEMAPHORE_HPP + +/** + * CountingSemaphore + * + * Helper for limiting concurrency on some resources. + * The Semaphore is created with some maximum concurrency level + * Up to this many resources may be concurrently used. + * When more than this number of resources are used concurrently, + * further requests must queue until a resource is released. + * + * This structure does not manage queueing and restarting of + * resource allocation requests, just monitors the number of + * resources in use, and the number of resource requests + * queued up. + * + * To be useful, some external request queueing and dequeuing + * mechanism is required. + */ + class CountingSemaphore + { + public: + CountingSemaphore(): + inUse(0), + queuedRequests(0), + totalResources(1) + {}; + + ~CountingSemaphore() {}; + + /** + * init + * Initialise the totalResources + */ + void init(Uint32 _totalResources) + { + assert(inUse == 0); + totalResources = _totalResources; + } + + /** + * requestMustQueue + * + * Part of semaphore P()/acquire()/down() implementation + * + * Called to request a resource. + * Returns whether the request must be queued, or + * can be satisfied immediately. + * + * true - no resource available, queue request. + * false - resource available, proceed. + * + * e.g. if (.requestMustQueue()) { + * queue_request; + * return; + * } + * + * proceed; + */ + bool requestMustQueue() + { + assert(inUse <= totalResources); + if (inUse == totalResources) + { + queuedRequests++; + return true; + } + else + { + assert(queuedRequests == 0); + inUse++; + return false; + } + } + + /** + * releaseMustStartQueued + * + * Part of semaphore V()/release()/up() + * + * Called to release a resource. + * Returns whether some queued resource request + * must be restarted. + * + * true - a queued request exists and must be started. + * false - no queued request exists, proceed. + * + * e.g. + * if (.releaseMustStartQueued()) { + * dequeue_request; + * begin_request_processing; + * } + * + * proceed; + */ + bool releaseMustStartQueued() + { + assert(inUse > 0); + if (queuedRequests > 0) + { + assert(inUse == totalResources); + queuedRequests--; + return true; + } + + inUse--; + return false; + } + + /** + * getTotalRequests + * + * Returns the sum of the inuse resources and queued requests. + * e.g. the offered concurrency on the resource. + */ + Uint32 getTotalRequests() const + { + return inUse + queuedRequests; + } + + /** + * getResourcesAvailable() + * + * Returns the number of resources available currently + */ + Uint32 getResourcesAvailable() const + { + assert(inUse <= totalResources); + return (totalResources - inUse); + } + + + /* inUse - number resources currently in use */ + Uint32 inUse; + + /* queuedRequests - number requests waiting 'outside' */ + Uint32 queuedRequests; + + /* totalResources - the maximum resources in use at one time */ + Uint32 totalResources; + }; /* CountingSemaphore */ + +#endif === modified file 'storage/ndb/test/ndbapi/Makefile.am' --- a/storage/ndb/test/ndbapi/Makefile.am 2012-08-13 11:03:25 +0000 +++ b/storage/ndb/test/ndbapi/Makefile.am 2012-10-29 18:34:05 +0000 @@ -32,6 +32,7 @@ testBlobs \ testDataBuffers \ testDict \ testIndex \ +testLimits \ testMgm \ testNdbApi \ testNodeRestart \ @@ -87,6 +88,7 @@ testBlobs_SOURCES = testBlobs.cpp testDataBuffers_SOURCES = testDataBuffers.cpp testDict_SOURCES = testDict.cpp testIndex_SOURCES = testIndex.cpp +testLimits_SOURCES = testLimits.cpp testMgm_SOURCES = testMgm.cpp testNdbApi_SOURCES = testNdbApi.cpp testNodeRestart_SOURCES = testNodeRestart.cpp === added file 'storage/ndb/test/ndbapi/testLimits.cpp' --- a/storage/ndb/test/ndbapi/testLimits.cpp 1970-01-01 00:00:00 +0000 +++ b/storage/ndb/test/ndbapi/testLimits.cpp 2012-10-29 18:34:05 +0000 @@ -0,0 +1,361 @@ +/* Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + +#include +#include +#include + +int create100Tables(NDBT_Context* ctx, NDBT_Step* step) +{ + Ndb* pNdb = GETNDB(step); + const NdbDictionary::Table* pTab= ctx->getTab(); + + /* Run as a 'T1' testcase - do nothing for other tables */ + if (strcmp(pTab->getName(), "T1") != 0) + return NDBT_OK; + + for (Uint32 t=0; t < 100; t++) + { + char tabnameBuff[10]; + snprintf(tabnameBuff, sizeof(tabnameBuff), "TAB%u", t); + + NdbDictionary::Table tab; + tab.setName(tabnameBuff); + NdbDictionary::Column pk; + pk.setName("PK"); + pk.setType(NdbDictionary::Column::Varchar); + pk.setLength(20); + pk.setNullable(false); + pk.setPrimaryKey(true); + tab.addColumn(pk); + + pNdb->getDictionary()->dropTable(tab.getName()); + if(pNdb->getDictionary()->createTable(tab) != 0) + { + ndbout << "Create table failed with error : " + << pNdb->getDictionary()->getNdbError().code + << " " + << pNdb->getDictionary()->getNdbError().message + << endl; + return NDBT_FAILED; + } + + ndbout << "Created table " << tabnameBuff << endl; + } + + return NDBT_OK; +} + +int drop100Tables(NDBT_Context* ctx, NDBT_Step* step) +{ + Ndb* pNdb = GETNDB(step); + const NdbDictionary::Table* pTab= ctx->getTab(); + + /* Run as a 'T1' testcase - do nothing for other tables */ + if (strcmp(pTab->getName(), "T1") != 0) + return NDBT_OK; + + for (Uint32 t=0; t < 100; t++) + { + char tabnameBuff[10]; + snprintf(tabnameBuff, sizeof(tabnameBuff), "TAB%u", t); + + if (pNdb->getDictionary()->dropTable(tabnameBuff) != 0) + { + ndbout << "Drop table failed with error : " + << pNdb->getDictionary()->getNdbError().code + << " " + << pNdb->getDictionary()->getNdbError().message + << endl; + } + else + { + ndbout << "Dropped table " << tabnameBuff << endl; + } + } + + return NDBT_OK; +} + +int dropTable(NDBT_Context* ctx, NDBT_Step* step, Uint32 num) +{ + Ndb* pNdb = GETNDB(step); + const NdbDictionary::Table* pTab= ctx->getTab(); + + /* Run as a 'T1' testcase - do nothing for other tables */ + if (strcmp(pTab->getName(), "T1") != 0) + return NDBT_OK; + + char tabnameBuff[10]; + snprintf(tabnameBuff, sizeof(tabnameBuff), "TAB%u", num); + + if (pNdb->getDictionary()->dropTable(tabnameBuff) != 0) + { + ndbout << "Drop table failed with error : " + << pNdb->getDictionary()->getNdbError().code + << " " + << pNdb->getDictionary()->getNdbError().message + << endl; + } + else + { + ndbout << "Dropped table " << tabnameBuff << endl; + } + + return NDBT_OK; +} + + +enum Scenarios +{ +// NORMAL, // Commented to save some time. + DROP_TABLE, + RESTART_MASTER, + RESTART_SLAVE, + NUM_SCENARIOS +}; + + +enum Tasks +{ + WAIT = 0, + DROP_TABLE_REQ = 1, + MASTER_RESTART_REQ = 2, + SLAVE_RESTART_REQ = 3 +}; + +int testWorker(NDBT_Context* ctx, NDBT_Step* step) +{ + /* Run as a 'T1' testcase - do nothing for other tables */ + if (strcmp(ctx->getTab()->getName(), "T1") != 0) + return NDBT_OK; + + /* Worker step to run in a separate thread for + * blocking activities + * Generally the blocking of the DIH table definition flush + * blocks the completion of the drop table/node restarts, + * so this must be done in a separate thread to avoid + * deadlocks. + */ + + while (!ctx->isTestStopped()) + { + ndbout_c("Worker : waiting for request..."); + ctx->getPropertyWait("DIHWritesRequest", 1); + + if (!ctx->isTestStopped()) + { + Uint32 req = ctx->getProperty("DIHWritesRequestType", (Uint32)0); + + switch ((Tasks) req) + { + case DROP_TABLE_REQ: + { + /* Drop table */ + ndbout_c("Worker : dropping table"); + if (dropTable(ctx, step, 2) != NDBT_OK) + { + return NDBT_FAILED; + } + ndbout_c("Worker : table dropped."); + break; + } + case MASTER_RESTART_REQ: + { + ndbout_c("Worker : restarting Master"); + + NdbRestarter restarter; + int master_nodeid = restarter.getMasterNodeId(); + ndbout_c("Worker : Restarting Master (%d)...", master_nodeid); + if (restarter.restartOneDbNode2(master_nodeid, + NdbRestarter::NRRF_NOSTART | +// NdbRestarter::NRRF_FORCE | + NdbRestarter::NRRF_ABORT) || + restarter.waitNodesNoStart(&master_nodeid, 1) || + restarter.startAll()) + { + ndbout_c("Worker : Error restarting Master."); + return NDBT_FAILED; + } + ndbout_c("Worker : Waiting for master to recover..."); + if (restarter.waitNodesStarted(&master_nodeid, 1)) + { + ndbout_c("Worker : Error waiting for Master restart"); + return NDBT_FAILED; + } + ndbout_c("Worker : Master recovered."); + break; + } + case SLAVE_RESTART_REQ: + { + NdbRestarter restarter; + int slave_nodeid = restarter.getRandomNotMasterNodeId(rand()); + ndbout_c("Worker : Restarting non-master (%d)...", slave_nodeid); + if (restarter.restartOneDbNode2(slave_nodeid, + NdbRestarter::NRRF_NOSTART | +// NdbRestarter::NRRF_FORCE | + NdbRestarter::NRRF_ABORT) || + restarter.waitNodesNoStart(&slave_nodeid, 1) || + restarter.startAll()) + { + ndbout_c("Worker : Error restarting Slave."); + return NDBT_FAILED; + } + ndbout_c("Worker : Waiting for slave to recover..."); + if (restarter.waitNodesStarted(&slave_nodeid, 1)) + { + ndbout_c("Worker : Error waiting for Slave restart"); + return NDBT_FAILED; + } + ndbout_c("Worker : Slave recovered."); + break; + } + default: + { + break; + } + } + } + ctx->setProperty("DIHWritesRequestType", (Uint32) 0); + ctx->setProperty("DIHWritesRequest", (Uint32) 2); + } + + ndbout_c("Worker, done."); + return NDBT_OK; +} + +int testSlowDihFileWrites(NDBT_Context* ctx, NDBT_Step* step) +{ + /* Testcase checks behaviour with slow flushing of DIH table definitions + * This caused problems in the past by exhausting the DIH page pool + * Now there's a concurrent operations limit. + * Check that it behaves with many queued ops, parallel drop/node restarts + */ + + /* Run as a 'T1' testcase - do nothing for other tables */ + if (strcmp(ctx->getTab()->getName(), "T1") != 0) + return NDBT_OK; + + /* 1. Activate slow write error insert + * 2. Trigger LCP + * 3. Wait some time, periodically producing info on + * the internal state + * 4. Perform some parallel action (drop table/node restarts) + * 5. Wait some time, periodically producing info on + * the internal state + * 6. Clear the error insert + * 7. Wait a little longer + * 8. Done. + */ + NdbRestarter restarter; + + for (Uint32 scenario = 0; scenario < NUM_SCENARIOS; scenario++) + { + ndbout_c("Inserting error 7235"); + restarter.insertErrorInAllNodes(7235); + + ndbout_c("Triggering LCP"); + int dumpArg = 7099; + restarter.dumpStateAllNodes(&dumpArg, 1); + + const Uint32 periodSeconds = 10; + Uint32 waitPeriods = 6; + dumpArg = 7032; + + for (Uint32 p=0; psetProperty("DIHWritesRequestType", (Uint32) DROP_TABLE_REQ); + ctx->setProperty("DIHWritesRequest", (Uint32) 1); + break; + } + case RESTART_MASTER: + { + ndbout_c("Requesting Master restart"); + ctx->setProperty("DIHWritesRequestType", (Uint32) MASTER_RESTART_REQ); + ctx->setProperty("DIHWritesRequest", (Uint32) 1); + + break; + } + case RESTART_SLAVE: + { + ndbout_c("Requesting Slave restart"); + ctx->setProperty("DIHWritesRequestType", (Uint32) SLAVE_RESTART_REQ); + ctx->setProperty("DIHWritesRequest", (Uint32) 1); + + break; + } + default: + break; + } + } + + ndbout_c("Dumping DIH page info to ndbd stdout"); + restarter.dumpStateAllNodes(&dumpArg, 1); + NdbSleep_MilliSleep(periodSeconds * 1000); + } + + ndbout_c("Clearing error insert..."); + restarter.insertErrorInAllNodes(0); + + waitPeriods = 2; + for (Uint32 p=0; pgetPropertyWait("DIHWritesRequest", 2); + + if (ctx->isTestStopped()) + return NDBT_OK; + + ndbout_c("Done."); + } + + /* Finish up */ + ctx->stopTest(); + + return NDBT_OK; +} + + + +NDBT_TESTSUITE(testLimits); + +TESTCASE("SlowDihFileWrites", + "Test behaviour of slow Dih table file writes") +{ + INITIALIZER(create100Tables); + STEP(testWorker); + STEP(testSlowDihFileWrites); + FINALIZER(drop100Tables); +} + +NDBT_TESTSUITE_END(testLimits); + +int main(int argc, const char** argv){ + ndb_init(); + return testLimits.execute(argc, argv); +} === modified file 'storage/ndb/test/run-test/daily-basic-tests.txt' --- a/storage/ndb/test/run-test/daily-basic-tests.txt 2012-09-12 13:24:49 +0000 +++ b/storage/ndb/test/run-test/daily-basic-tests.txt 2012-10-29 18:34:05 +0000 @@ -1553,3 +1553,7 @@ max-time : 300 cmd: testScan args: -n ScanKeyInfoExhaust T1 +max-time : 1200 +cmd: testLimits +args: -n SlowDihFileWrites T1 + No bundle (reason: useless for push emails).