From: Date: April 24 2008 11:29am Subject: bk commit into 5.1 tree (jonas:1.2202) BUG#36276 List-Archive: http://lists.mysql.com/commits/45936 X-Bug: 36276 Message-Id: <20080424092934.C6ADC1E467@perch.localdomain> Below is the list of changes that have just been committed into a local 5.1 repository of jonas. When jonas does a push these changes will be propagated to the main repository and, within 24 hours after the push, to the public repository. For information on how to access the public repository see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html ChangeSet@stripped, 2008-04-24 11:29:32+02:00, jonas@stripped +7 -0 ndb - (drop6) fix for bug#36199, bug#36246, bug#36247, bug#36276 all related to cascading master failure storage/ndb/src/kernel/blocks/ERROR_codes.txt@stripped, 2008-04-24 11:29:30+02:00, jonas@stripped +3 -3 ndb - fix for bug#36199, bug#36246, bug#36247, bug#36276 all related to cascading master failure storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp@stripped, 2008-04-24 11:29:30+02:00, jonas@stripped +3 -0 ndb - fix for bug#36199, bug#36246, bug#36247, bug#36276 all related to cascading master failure storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp@stripped, 2008-04-24 11:29:30+02:00, jonas@stripped +163 -49 ndb - fix for bug#36199, bug#36246, bug#36247, bug#36276 all related to cascading master failure storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp@stripped, 2008-04-24 11:29:31+02:00, jonas@stripped +30 -0 ndb - fix for bug#36199, bug#36246, bug#36247, bug#36276 all related to cascading master failure storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp@stripped, 2008-04-24 11:29:31+02:00, jonas@stripped +60 -25 ndb - fix for bug#36199, bug#36246, bug#36247, bug#36276 all related to cascading master failure storage/ndb/test/ndbapi/testNodeRestart.cpp@stripped, 2008-04-24 11:29:31+02:00, jonas@stripped +498 -0 ndb - fix for bug#36199, bug#36246, bug#36247, bug#36276 all related to cascading master failure 
storage/ndb/test/run-test/daily-basic-tests.txt@stripped, 2008-04-24 11:29:31+02:00, jonas@stripped +22 -0 ndb - fix for bug#36199, bug#36246, bug#36247, bug#36276 all related to cascading master failure # This is a BitKeeper patch. What follows are the unified diffs for the # set of deltas contained in the patch. The rest of the patch, the part # that BitKeeper cares about, is below these diffs. # User: jonas # Host: perch.ndb.mysql.com # Root: /home/jonas/src/drop6 --- 1.64/storage/ndb/test/run-test/daily-basic-tests.txt 2008-04-24 11:29:34 +02:00 +++ 1.65/storage/ndb/test/run-test/daily-basic-tests.txt 2008-04-24 11:29:34 +02:00 @@ -819,3 +819,25 @@ cmd: test_event args: -n Bug34853 T1 +# 2008-04-22 +max-time: 1500 +cmd: testNodeRestart +args: -n MNF T1 + +max-time: 300 +cmd: testNodeRestart +args: -n Bug36199 T1 + +max-time: 300 +cmd: testNodeRestart +args: -n Bug36246 T1 + +max-time: 300 +cmd: testNodeRestart +args: -n Bug36247 T1 + +max-time: 300 +cmd: testNodeRestart +args: -n Bug36276 T1 + +# EOF --- 1.31/storage/ndb/src/kernel/blocks/ERROR_codes.txt 2008-04-24 11:29:34 +02:00 +++ 1.32/storage/ndb/src/kernel/blocks/ERROR_codes.txt 2008-04-24 11:29:34 +02:00 @@ -3,10 +3,10 @@ Next NDBFS 2000 Next DBACC 3002 Next DBTUP 4013 -Next DBLQH 5047 +Next DBLQH 5051 Next DBDICT 6007 -Next DBDIH 7195 -Next DBTC 8057 +Next DBDIH 7211 +Next DBTC 8063 Next CMVMI 9000 Next BACKUP 10022 Next DBUTIL 11002 --- 1.22/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp 2008-04-24 11:29:34 +02:00 +++ 1.23/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp 2008-04-24 11:29:34 +02:00 @@ -1347,6 +1347,9 @@ Uint32 m_masterLcpDihRef; bool m_MASTER_LCPREQ_Received; Uint32 m_MASTER_LCPREQ_FailedNodeId; + + Uint32 m_lastLCP_COMPLETE_REP_id; + Uint32 m_lastLCP_COMPLETE_REP_ref; } c_lcpState; /*------------------------------------------------------------------------*/ --- 1.66/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2008-04-24 11:29:34 +02:00 +++ 
1.67/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2008-04-24 11:29:34 +02:00 @@ -4601,37 +4601,47 @@ c_lcpState.m_participatingDIH.clear(failedNodePtr.i); c_lcpState.m_participatingLQH.clear(failedNodePtr.i); - if(c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.isWaitingFor(failedNodePtr.i)){ + bool wf = c_MASTER_LCPREQ_Counter.isWaitingFor(failedNodePtr.i); + + if(c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.isWaitingFor(failedNodePtr.i)) + { jam(); LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend(); rep->nodeId = failedNodePtr.i; rep->lcpId = SYSFILE->latestLCP_ID; rep->blockNo = DBDIH; sendSignal(reference(), GSN_LCP_COMPLETE_REP, signal, - LcpCompleteRep::SignalLength, JBB); + LcpCompleteRep::SignalLength, JBB); } - - /** - * Check if we'r waiting for the failed node's LQH to complete - * - * Note that this is ran "before" LCP master take over - */ - if(c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(nodeId)){ + + bool lcp_complete_rep = false; + if (!wf) + { jam(); - - LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend(); - rep->nodeId = nodeId; - rep->lcpId = SYSFILE->latestLCP_ID; - rep->blockNo = DBLQH; - sendSignal(reference(), GSN_LCP_COMPLETE_REP, signal, - LcpCompleteRep::SignalLength, JBB); - - if(c_lcpState.m_LAST_LCP_FRAG_ORD.isWaitingFor(nodeId)){ + + /** + * Check if we'r waiting for the failed node's LQH to complete + * + * Note that this is ran "before" LCP master take over + */ + if(c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(nodeId)){ jam(); - /** - * Make sure we're ready to accept it - */ - c_lcpState.m_LAST_LCP_FRAG_ORD.clearWaitingFor(nodeId); + + lcp_complete_rep = true; + LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend(); + rep->nodeId = nodeId; + rep->lcpId = SYSFILE->latestLCP_ID; + rep->blockNo = DBLQH; + sendSignal(reference(), GSN_LCP_COMPLETE_REP, signal, + LcpCompleteRep::SignalLength, JBB); + + if(c_lcpState.m_LAST_LCP_FRAG_ORD.isWaitingFor(nodeId)){ + jam(); + /** + * Make 
sure we're ready to accept it + */ + c_lcpState.m_LAST_LCP_FRAG_ORD.clearWaitingFor(nodeId); + } } } @@ -4657,7 +4667,9 @@ StartLcpConf::SignalLength, JBB); }//if - if (c_EMPTY_LCP_REQ_Counter.isWaitingFor(failedNodePtr.i)) { +dosend: + if (c_EMPTY_LCP_REQ_Counter.isWaitingFor(failedNodePtr.i)) + { jam(); EmptyLcpConf * const rep = (EmptyLcpConf *)&signal->theData[0]; rep->senderNodeId = failedNodePtr.i; @@ -4668,8 +4680,14 @@ rep->idle = true; sendSignal(reference(), GSN_EMPTY_LCP_CONF, signal, EmptyLcpConf::SignalLength, JBB); - }//if - + } + else if (!c_EMPTY_LCP_REQ_Counter.done() && lcp_complete_rep) + { + jam(); + c_EMPTY_LCP_REQ_Counter.setWaitingFor(failedNodePtr.i); + goto dosend; + } + if (c_MASTER_LCPREQ_Counter.isWaitingFor(failedNodePtr.i)) { jam(); MasterLCPRef * const ref = (MasterLCPRef *)&signal->theData[0]; @@ -4737,19 +4755,37 @@ c_lcpMasterTakeOverState.set(LMTOS_WAIT_EMPTY_LCP, __LINE__); - if(c_EMPTY_LCP_REQ_Counter.done()){ - jam(); - c_lcpState.m_LAST_LCP_FRAG_ORD.clearWaitingFor(); + + EmptyLcpReq* req = (EmptyLcpReq*)signal->getDataPtrSend(); + req->senderRef = reference(); - EmptyLcpReq* req = (EmptyLcpReq*)signal->getDataPtrSend(); - req->senderRef = reference(); - sendLoopMacro(EMPTY_LCP_REQ, sendEMPTY_LCP_REQ); - ndbrequire(!c_EMPTY_LCP_REQ_Counter.done()); - } else { - /** - * Node failure during master take over... 
- */ - ndbout_c("Nodefail during master take over (old: %d)", oldNode); + { + NodeRecordPtr specNodePtr; + specNodePtr.i = cfirstAliveNode; + do { + jam(); + ptrCheckGuard(specNodePtr, MAX_NDB_NODES, nodeRecord); + if (!c_EMPTY_LCP_REQ_Counter.isWaitingFor(specNodePtr.i)) + { + jam(); + c_EMPTY_LCP_REQ_Counter.setWaitingFor(specNodePtr.i); + if (!(ERROR_INSERTED(7209) && specNodePtr.i == getOwnNodeId())) + { + sendEMPTY_LCP_REQ(signal, specNodePtr.i); + } + else + { + ndbout_c("NOT sending EMPTY_LCP_REQ to %u", specNodePtr.i); + } + + if (c_lcpState.m_LAST_LCP_FRAG_ORD.isWaitingFor(specNodePtr.i)) + { + jam(); + c_lcpState.m_LAST_LCP_FRAG_ORD.clearWaitingFor(); + } + } + specNodePtr.i = specNodePtr.p->nextNode; + } while (specNodePtr.i != RNIL); } NodeRecordPtr nodePtr; @@ -5738,6 +5774,9 @@ const EmptyLcpConf * const conf = (EmptyLcpConf *)&signal->theData[0]; Uint32 nodeId = conf->senderNodeId; + CRASH_INSERTION(7206); + + if(!conf->idle){ jam(); if (conf->tableId < c_lcpMasterTakeOverState.minTableId) { @@ -5815,6 +5854,25 @@ jamEntry(); const BlockReference newMasterBlockref = req->masterRef; + CRASH_INSERTION(7205); + + if (ERROR_INSERTED(7207)) + { + jam(); + SET_ERROR_INSERT_VALUE(7208); + sendSignalWithDelay(reference(), GSN_MASTER_LCPREQ, signal, + 500, signal->getLength()); + return; + } + + if (ERROR_INSERTED(7208)) + { + jam(); + signal->theData[0] = 9999; + sendSignal(numberToRef(CMVMI, refToNode(newMasterBlockref)), + GSN_NDB_TAMPER, signal, 1, JBB); + } + if (newMasterBlockref != cmasterdihref) { jam(); @@ -5837,6 +5895,11 @@ jam(); ndbrequire(0); } + + if (ERROR_INSERTED(7209)) + { + SET_ERROR_INSERT_VALUE(7210); + } sendMASTER_LCPCONF(signal); }//Dbdih::execMASTER_LCPREQ() @@ -6175,12 +6238,22 @@ { const MasterLCPRef * const ref = (MasterLCPRef *)&signal->theData[0]; jamEntry(); - receiveLoopMacro(MASTER_LCPREQ, ref->senderNodeId); + + Uint32 senderNodeId = ref->senderNodeId; + Uint32 failedNodeId = ref->failedNodeId; + + if 
(c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(senderNodeId)) + { + jam(); + c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.clearWaitingFor(senderNodeId); + } + + receiveLoopMacro(MASTER_LCPREQ, senderNodeId); /*-------------------------------------------------------------------------*/ // We have now received all responses and are ready to take over the LCP // protocol as master. /*-------------------------------------------------------------------------*/ - MASTER_LCPhandling(signal, ref->failedNodeId); + MASTER_LCPhandling(signal, failedNodeId); }//Dbdih::execMASTER_LCPREF() void Dbdih::MASTER_LCPhandling(Signal* signal, Uint32 failedNodeId) @@ -10293,7 +10366,15 @@ signal->theData[1] = tabPtr.i; sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB); - checkLcpAllTablesDoneInLqh(); + bool ret = checkLcpAllTablesDoneInLqh(); + if (ret && ERROR_INSERTED(7209)) + { + jam(); + + signal->theData[0] = 9999; + sendSignal(numberToRef(CMVMI, cmasterNodeId), + GSN_NDB_TAMPER, signal, 1, JBB); + } } } @@ -10622,12 +10703,30 @@ CRASH_INSERTION2(7027, isMaster()); CRASH_INSERTION2(7018, !isMaster()); - if(c_lcpState.lcpStatus == LCP_TAB_COMPLETED){ + if(c_lcpState.lcpStatus == LCP_TAB_COMPLETED) + { /** * We'r done */ + + if (ERROR_INSERTED(7209)) + { + signal->theData[0] = DihContinueB::ZCHECK_LCP_COMPLETED; + sendSignal(reference(), GSN_CONTINUEB, signal, 1, JBB); + return; + } + c_lcpState.setLcpStatus(LCP_TAB_SAVED, __LINE__); sendLCP_COMPLETE_REP(signal); + + if (ERROR_INSERTED(7210)) + { + CLEAR_ERROR_INSERT_VALUE; + EmptyLcpReq* req = (EmptyLcpReq*)signal->getDataPtr(); + req->senderRef = reference(); + sendEMPTY_LCP_REQ(signal, getOwnNodeId()); + } + return; } @@ -10639,13 +10738,28 @@ void Dbdih::sendLCP_COMPLETE_REP(Signal* signal){ jam(); - LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend(); - rep->nodeId = getOwnNodeId(); - rep->lcpId = SYSFILE->latestLCP_ID; - rep->blockNo = DBDIH; - - sendSignal(c_lcpState.m_masterLcpDihRef, 
GSN_LCP_COMPLETE_REP, signal, - LcpCompleteRep::SignalLength, JBB); + + /** + * Quick and dirty fix for bug#36276: don't send + * LCP_COMPLETE_REP to the same node for the same LCP twice + */ + bool alreadysent = + c_lcpState.m_lastLCP_COMPLETE_REP_id == SYSFILE->latestLCP_ID && + c_lcpState.m_lastLCP_COMPLETE_REP_ref == c_lcpState.m_masterLcpDihRef; + + if (!alreadysent) + { + LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend(); + rep->nodeId = getOwnNodeId(); + rep->lcpId = SYSFILE->latestLCP_ID; + rep->blockNo = DBDIH; + + sendSignal(c_lcpState.m_masterLcpDihRef, GSN_LCP_COMPLETE_REP, signal, + LcpCompleteRep::SignalLength, JBB); + + c_lcpState.m_lastLCP_COMPLETE_REP_id = SYSFILE->latestLCP_ID; + c_lcpState.m_lastLCP_COMPLETE_REP_ref = c_lcpState.m_masterLcpDihRef; + } /** * Say that an initial node restart does not need to be redone @@ -11818,7 +11932,7 @@ c_lcpState.ctimer = 0; c_lcpState.immediateLcpStart = false; c_lcpState.m_MASTER_LCPREQ_Received = false; - + c_lcpState.m_lastLCP_COMPLETE_REP_ref = 0; cmasterdihref = 0; cmasterNodeId = 0; cmasterState = MASTER_IDLE; --- 1.99/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp 2008-04-24 11:29:34 +02:00 +++ 1.100/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp 2008-04-24 11:29:34 +02:00 @@ -6767,6 +6767,22 @@ * * now scan markers */ + if (ERROR_INSERTED(5050)) + { + ndbout_c("send ZSCAN_MARKERS with 5s delay and killing master"); + CLEAR_ERROR_INSERT_VALUE; + signal->theData[0] = ZSCAN_MARKERS; + signal->theData[1] = tcNodeFailptr.i; + signal->theData[2] = 0; + signal->theData[3] = RNIL; + sendSignalWithDelay(cownref, GSN_CONTINUEB, signal, 5000, 4); + + signal->theData[0] = 9999; + sendSignal(numberToRef(CMVMI, + refToNode(tcNodeFailptr.p->newTcBlockref)), + GSN_NDB_TAMPER, signal, 1, JBB); + return; + } scanMarkers(signal, tcNodeFailptr.i, 0, RNIL); return; }//if @@ -6851,6 +6867,20 @@ tcNodeFailPtr.i = tcNodeFail; ptrCheckGuard(tcNodeFailPtr, ctcNodeFailrecFileSize, tcNodeFailRecord); const Uint32 
crashedTcNodeId = tcNodeFailPtr.p->oldNodeId; + + if (tcNodeFailPtr.p->tcFailStatus == TcNodeFailRecord::TC_STATE_BREAK) + { + jam(); + + /* ---------------------------------------------------------------------- + * AN INTERRUPTION TO THIS NODE FAIL HANDLING WAS RECEIVED AND A NEW + * TC HAVE BEEN ASSIGNED TO TAKE OVER THE FAILED TC. PROBABLY THE OLD + * NEW TC HAVE FAILED. + * ---------------------------------------------------------------------- */ + tcNodeFailptr = tcNodeFailPtr; + lqhTransNextLab(signal); + return; + } CommitAckMarkerIterator iter; if(i == RNIL){ --- 1.111/storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp 2008-04-24 11:29:34 +02:00 +++ 1.112/storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp 2008-04-24 11:29:34 +02:00 @@ -7072,21 +7072,10 @@ }//if }//if - if (getOwnNodeId() != tnewMasterId) - { - jam(); - /** - * Only master does takeover currently - */ - hostptr.p->m_nf_bits &= ~HostRecord::NF_TAKEOVER; - } - else - { - jam(); - signal->theData[0] = hostptr.i; - sendSignal(cownref, GSN_TAKE_OVERTCREQ, signal, 1, JBB); - } - + jam(); + signal->theData[0] = hostptr.i; + sendSignal(cownref, GSN_TAKE_OVERTCREQ, signal, 1, JBB); + checkScanActiveInFailedLqh(signal, 0, hostptr.i); checkWaitDropTabFailedLqh(signal, hostptr.i, 0); // nodeid, tableid nodeFailCheckTransactions(signal, 0, hostptr.i); @@ -7123,6 +7112,14 @@ sendSignal(cdihblockref, GSN_NF_COMPLETEREP, signal, NFCompleteRep::SignalLength, JBB); } + + CRASH_INSERTION(8058); + if (ERROR_INSERTED(8059)) + { + signal->theData[0] = 9999; + sendSignalWithDelay(numberToRef(CMVMI, hostptr.i), + GSN_NDB_TAMPER, signal, 100, 1); + } } void Dbtc::checkScanActiveInFailedLqh(Signal* signal, @@ -7191,7 +7188,14 @@ Ptr transPtr; Uint32 TtcTimer = ctcTimer; Uint32 TapplTimeout = c_appl_timeout_value; - for (transPtr.i = transPtrI; transPtr.i < capiConnectFilesize; transPtr.i++) + Uint32 RT_BREAK = 64; + Uint32 endPtrI = transPtrI + RT_BREAK; + if (endPtrI > capiConnectFilesize) + { + endPtrI = 
capiConnectFilesize; + } + + for (transPtr.i = transPtrI; transPtr.i < endPtrI; transPtr.i++) { ptrCheckGuard(transPtr, capiConnectFilesize, apiConnectRecord); if (transPtr.p->m_transaction_nodes.get(failedNodeId)) @@ -7203,18 +7207,25 @@ setApiConTimer(transPtr.i, TtcTimer - 2, __LINE__); timeOutFoundLab(signal, transPtr.i, ZNODEFAIL_BEFORE_COMMIT); c_appl_timeout_value = TapplTimeout; + + transPtr.i++; + break; } - - // Send CONTINUEB to continue later + } + + if (transPtr.i == capiConnectFilesize) + { + jam(); + checkNodeFailComplete(signal, failedNodeId, + HostRecord::NF_CHECK_TRANSACTION); + } + else + { signal->theData[0] = TcContinueB::ZNF_CHECK_TRANSACTIONS; - signal->theData[1] = transPtr.i + 1; // Check next + signal->theData[1] = transPtr.i; signal->theData[2] = failedNodeId; sendSignal(cownref, GSN_CONTINUEB, signal, 3, JBB); - return; } - - checkNodeFailComplete(signal, failedNodeId, - HostRecord::NF_CHECK_TRANSACTION); } @@ -7237,7 +7248,23 @@ if (signal->getSendersBlockRef() != reference()) { jam(); - return; + /** + * Node should be in queue + */ + Uint32 i = 0; + Uint32 end = tcNodeFailptr.p->queueIndex; + for (; iqueueList[i] == hostptr.i) + { + jam(); + break; + } + } + ndbrequire(i != end); + tcNodeFailptr.p->queueList[i] = tcNodeFailptr.p->queueList[end-1]; + tcNodeFailptr.p->queueIndex = end - 1; } checkNodeFailComplete(signal, hostptr.i, HostRecord::NF_TAKEOVER); @@ -7249,7 +7276,9 @@ tfailedNodeId = signal->theData[0]; tcNodeFailptr.i = 0; ptrAss(tcNodeFailptr, tcFailRecord); - if (tcNodeFailptr.p->failStatus != FS_IDLE) { + if (tcNodeFailptr.p->failStatus != FS_IDLE || + cmasterNodeId != getOwnNodeId()) + { jam(); /*------------------------------------------------------------*/ /* WE CAN CURRENTLY ONLY HANDLE ONE TAKE OVER AT A TIME */ @@ -7303,6 +7332,8 @@ jamEntry(); LqhTransConf * const lqhTransConf = (LqhTransConf *)&signal->theData[0]; + CRASH_INSERTION(8060); + tcNodeFailptr.i = lqhTransConf->tcRef; ptrCheckGuard(tcNodeFailptr, 1, 
tcFailRecord); tnodeid = lqhTransConf->lqhNodeId; @@ -7365,6 +7396,8 @@ { Uint32 guard0; + CRASH_INSERTION(8061); + hostptr.i = tnodeid; ptrCheckGuard(hostptr, chostFilesize, hostRecord); hostptr.p->lqhTransStatus = LTS_IDLE; @@ -7472,6 +7505,8 @@ }//if tcNodeFailptr.p->takeOverProcState[TtakeOverInd] = ZTAKE_OVER_IDLE; tcNodeFailptr.p->completedTakeOver++; + + CRASH_INSERTION(8062); if (tcNodeFailptr.p->completedTakeOver == cnoParallelTakeOver) { jam(); --- 1.44/storage/ndb/test/ndbapi/testNodeRestart.cpp 2008-04-24 11:29:34 +02:00 +++ 1.45/storage/ndb/test/ndbapi/testNodeRestart.cpp 2008-04-24 11:29:34 +02:00 @@ -24,6 +24,7 @@ #include #include #include +#include int runLoadTable(NDBT_Context* ctx, NDBT_Step* step){ @@ -1767,6 +1768,480 @@ return NDBT_OK; } +int +runMNF(NDBT_Context* ctx, NDBT_Step* step) +{ + int result = NDBT_OK; + NdbRestarter res; + + if (res.getNumDbNodes() < 2) + { + return NDBT_OK; + } + + Vector part0; + Vector part1; + Bitmask<255> part0mask; + Bitmask<255> part1mask; + Bitmask<255> ngmask; + for (int i = 0; igetNumLoops(); + while (loops-- && !ctx->isTestStopped()) + { + int cnt, *nodes; + int master = res.getMasterNodeId(); + int nextMaster = res.getNextMasterNodeId(master); + + bool cmf = false; + if (part0mask.get(master) && part0mask.get(nextMaster)) + { + cmf = true; + cnt = part0.size(); + nodes = part0.getBase(); + printf("restarting part0"); + } + else if(part1mask.get(master) && part1mask.get(nextMaster)) + { + cmf = true; + cnt = part1.size(); + nodes = part1.getBase(); + printf("restarting part1"); + } + else + { + cmf = false; + if (loops & 1) + { + cnt = part0.size(); + nodes = part0.getBase(); + printf("restarting part0"); + } + else + { + cnt = part1.size(); + nodes = part0.getBase(); + printf("restarting part0"); + } + } + + int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 }; + for (int i = 0; istopTest(); + return NDBT_OK; +} + +int +runBug36199(NDBT_Context* ctx, NDBT_Step* step) +{ + int result = NDBT_OK; 
+ int loops = ctx->getNumLoops(); + NdbRestarter res; + + if (res.getNumDbNodes() < 4) + return NDBT_OK; + + int master = res.getMasterNodeId(); + int nextMaster = res.getNextMasterNodeId(master); + int victim = res.getRandomNodeSameNodeGroup(nextMaster, rand()); + if (victim == master) + { + victim = res.getRandomNodeOtherNodeGroup(nextMaster, rand()); + } + + ndbout_c("master: %u next master: %u victim: %u", + master, nextMaster, victim); + + int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 }; + res.dumpStateOneNode(master, val2, 2); + res.dumpStateOneNode(victim, val2, 2); + + res.insertErrorInNode(victim, 7205); + res.insertErrorInNode(master, 7014); + int lcp = 7099; + res.dumpStateOneNode(master, &lcp, 1); + + int nodes[2]; + nodes[0] = master; + nodes[1] = victim; + if (res.waitNodesNoStart(nodes, 2)) + { + return NDBT_FAILED; + } + + if (res.startNodes(nodes, 2)) + { + return NDBT_FAILED; + } + + if (res.waitClusterStarted()) + return NDBT_FAILED; + + return NDBT_OK; +} + +int +runBug36246(NDBT_Context* ctx, NDBT_Step* step) +{ + int result = NDBT_OK; + int loops = ctx->getNumLoops(); + NdbRestarter res; + Ndb* pNdb = GETNDB(step); + + if (res.getNumDbNodes() < 4) + return NDBT_OK; + + HugoOperations hugoOps(*ctx->getTab()); +restartloop: + int tryloop = 0; + int master = res.getMasterNodeId(); + int nextMaster = res.getNextMasterNodeId(master); + +loop: + if(hugoOps.startTransaction(pNdb) != 0) + return NDBT_FAILED; + + if(hugoOps.pkUpdateRecord(pNdb, 1, 1) != 0) + return NDBT_FAILED; + + if(hugoOps.execute_NoCommit(pNdb) != 0) + return NDBT_FAILED; + + int victim = hugoOps.getTransaction()->getConnectedNodeId(); + printf("master: %u nextMaster: %u victim: %u", + master, nextMaster, victim); + if (victim == master || victim == nextMaster || + res.getNodeGroup(victim) == res.getNodeGroup(master) || + res.getNodeGroup(victim) == res.getNodeGroup(nextMaster)) + { + hugoOps.execute_Rollback(pNdb); + hugoOps.closeTransaction(pNdb); + tryloop++; + if 
(tryloop == 10) + { + ndbout_c(" -> restarting next master: %u", nextMaster); + res.restartOneDbNode(nextMaster, + /** initial */ false, + /** nostart */ true, + /** abort */ true); + + res.waitNodesNoStart(&nextMaster, 1); + res.startNodes(&nextMaster, 1); + if (res.waitClusterStarted()) + return NDBT_FAILED; + goto restartloop; + } + else + { + ndbout_c(" -> loop"); + goto loop; + } + } + ndbout_c(" -> go go gadget skates"); + + int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 }; + res.dumpStateOneNode(master, val2, 2); + res.dumpStateOneNode(victim, val2, 2); + + res.insertErrorInNode(master, 8060); + res.insertErrorInNode(victim, 9999); + + int nodes[2]; + nodes[0] = master; + nodes[1] = victim; + if (res.waitNodesNoStart(nodes, 2)) + { + return NDBT_FAILED; + } + + if (res.startNodes(nodes, 2)) + { + return NDBT_FAILED; + } + + if (res.waitClusterStarted()) + return NDBT_FAILED; + + hugoOps.execute_Rollback(pNdb); + hugoOps.closeTransaction(pNdb); + + return NDBT_OK; +} + +int +runBug36247(NDBT_Context* ctx, NDBT_Step* step) +{ + int result = NDBT_OK; + int loops = ctx->getNumLoops(); + NdbRestarter res; + Ndb* pNdb = GETNDB(step); + + if (res.getNumDbNodes() < 4) + return NDBT_OK; + + HugoOperations hugoOps(*ctx->getTab()); + +restartloop: + int tryloop = 0; + int master = res.getMasterNodeId(); + int nextMaster = res.getNextMasterNodeId(master); + +loop: + if(hugoOps.startTransaction(pNdb) != 0) + return NDBT_FAILED; + + if(hugoOps.pkUpdateRecord(pNdb, 1, 100) != 0) + return NDBT_FAILED; + + if(hugoOps.execute_NoCommit(pNdb) != 0) + return NDBT_FAILED; + + int victim = hugoOps.getTransaction()->getConnectedNodeId(); + printf("master: %u nextMaster: %u victim: %u", + master, nextMaster, victim); + if (victim == master || victim == nextMaster || + res.getNodeGroup(victim) == res.getNodeGroup(master) || + res.getNodeGroup(victim) == res.getNodeGroup(nextMaster)) + { + hugoOps.execute_Rollback(pNdb); + hugoOps.closeTransaction(pNdb); + tryloop++; + 
if (tryloop == 10) + { + ndbout_c(" -> restarting next master: %u", nextMaster); + res.restartOneDbNode(nextMaster, + /** initial */ false, + /** nostart */ true, + /** abort */ true); + + res.waitNodesNoStart(&nextMaster, 1); + res.startNodes(&nextMaster, 1); + if (res.waitClusterStarted()) + return NDBT_FAILED; + goto restartloop; + } + else + { + ndbout_c(" -> loop"); + goto loop; + } + } + ndbout_c(" -> go go gadget skates"); + + int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 }; + res.dumpStateOneNode(master, val2, 2); + res.dumpStateOneNode(victim, val2, 2); + + for (int i = 0; igetNumLoops(); + NdbRestarter res; + Ndb* pNdb = GETNDB(step); + + if (res.getNumDbNodes() < 4) + return NDBT_OK; + + int master = res.getMasterNodeId(); + int nextMaster = res.getNextMasterNodeId(master); + int victim = res.getRandomNodeSameNodeGroup(nextMaster, rand()); + if (victim == master) + { + victim = res.getRandomNodeOtherNodeGroup(nextMaster, rand()); + } + + ndbout_c("master: %u nextMaster: %u victim: %u", + master, nextMaster, victim); + + int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 }; + res.dumpStateOneNode(master, val2, 2); + res.insertErrorInNode(victim, 7209); + + int lcp = 7099; + res.dumpStateOneNode(master, &lcp, 1); + + if (res.waitNodesNoStart(&master, 1)) + { + return NDBT_FAILED; + } + + if (res.startNodes(&master, 1)) + { + return NDBT_FAILED; + } + + if (res.waitClusterStarted()) + return NDBT_FAILED; + + return NDBT_OK; +} + NDBT_TESTSUITE(testNodeRestart); TESTCASE("NoLoad", "Test that one node at a time can be stopped and then restarted "\ @@ -2127,6 +2602,29 @@ } TESTCASE("Bug32160", ""){ INITIALIZER(runBug32160); +} +TESTCASE("MNF", ""){ + INITIALIZER(runLoadTable); + STEP(runMNF); + STEP(runScanUpdateUntilStopped); +} +TESTCASE("Bug36199", ""){ + INITIALIZER(runBug36199); +} +TESTCASE("Bug36246", ""){ + INITIALIZER(runLoadTable); + STEP(runBug36246); + VERIFIER(runClearTable); +} +TESTCASE("Bug36247", ""){ + 
INITIALIZER(runLoadTable); + STEP(runBug36247); + VERIFIER(runClearTable); +} +TESTCASE("Bug36276", ""){ + INITIALIZER(runLoadTable); + STEP(runBug36276); + VERIFIER(runClearTable); } NDBT_TESTSUITE_END(testNodeRestart);