From: Date: April 23 2008 4:08pm Subject: bk commit into 5.0 tree (jonas:1.2600) BUG#36199 List-Archive: http://lists.mysql.com/commits/45894 X-Bug: 36199 Message-Id: <20080423140841.9A4981E45C@perch.localdomain> Below is the list of changes that have just been committed into a local 5.0 repository of jonas. When jonas does a push these changes will be propagated to the main repository and, within 24 hours after the push, to the public repository. For information on how to access the public repository see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html ChangeSet@stripped, 2008-04-23 16:08:38+02:00, jonas@stripped +7 -0 ndb - fix for bug#36199, bug#36246, bug#36247, bug#36276 all related to cascading master failure ndb/src/kernel/blocks/ERROR_codes.txt@stripped, 2008-04-23 16:08:36+02:00, jonas@stripped +3 -3 update error codes ndb/src/kernel/blocks/dbdih/Dbdih.hpp@stripped, 2008-04-23 16:08:36+02:00, jonas@stripped +3 -0 fix for bug#36199, bug#36246, bug#36247, bug#36276 ndb/src/kernel/blocks/dbdih/DbdihMain.cpp@stripped, 2008-04-23 16:08:36+02:00, jonas@stripped +163 -50 fix for bug#36199, bug#36246, bug#36247, bug#36276 ndb/src/kernel/blocks/dblqh/DblqhMain.cpp@stripped, 2008-04-23 16:08:36+02:00, jonas@stripped +30 -0 fix for bug#36199, bug#36246, bug#36247, bug#36276 ndb/src/kernel/blocks/dbtc/DbtcMain.cpp@stripped, 2008-04-23 16:08:36+02:00, jonas@stripped +60 -25 fix for bug#36199, bug#36246, bug#36247, bug#36276 ndb/test/ndbapi/testNodeRestart.cpp@stripped, 2008-04-23 16:08:36+02:00, jonas@stripped +498 -0 fix for bug#36199, bug#36246, bug#36247, bug#36276 ndb/test/run-test/daily-basic-tests.txt@stripped, 2008-04-23 16:08:36+02:00, jonas@stripped +22 -0 fix for bug#36199, bug#36246, bug#36247, bug#36276 diff -Nrup a/ndb/src/kernel/blocks/ERROR_codes.txt b/ndb/src/kernel/blocks/ERROR_codes.txt --- a/ndb/src/kernel/blocks/ERROR_codes.txt 2007-11-07 20:57:19 +01:00 +++ b/ndb/src/kernel/blocks/ERROR_codes.txt 2008-04-23 16:08:36 +02:00 @@ -3,10 
+3,10 @@ Next NDBCNTR 1002 Next NDBFS 2000 Next DBACC 3002 Next DBTUP 4014 -Next DBLQH 5043 +Next DBLQH 5051 Next DBDICT 6007 -Next DBDIH 7195 -Next DBTC 8052 +Next DBDIH 7211 +Next DBTC 8063 Next CMVMI 9000 Next BACKUP 10022 Next DBUTIL 11002 diff -Nrup a/ndb/src/kernel/blocks/dbdih/Dbdih.hpp b/ndb/src/kernel/blocks/dbdih/Dbdih.hpp --- a/ndb/src/kernel/blocks/dbdih/Dbdih.hpp 2007-11-07 20:57:19 +01:00 +++ b/ndb/src/kernel/blocks/dbdih/Dbdih.hpp 2008-04-23 16:08:36 +02:00 @@ -1342,6 +1342,9 @@ private: Uint32 m_masterLcpDihRef; bool m_MASTER_LCPREQ_Received; Uint32 m_MASTER_LCPREQ_FailedNodeId; + + Uint32 m_lastLCP_COMPLETE_REP_id; + Uint32 m_lastLCP_COMPLETE_REP_ref; } c_lcpState; /*------------------------------------------------------------------------*/ diff -Nrup a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp --- a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2007-11-07 20:57:19 +01:00 +++ b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2008-04-23 16:08:36 +02:00 @@ -4501,37 +4501,47 @@ void Dbdih::failedNodeLcpHandling(Signal c_lcpState.m_participatingDIH.clear(failedNodePtr.i); c_lcpState.m_participatingLQH.clear(failedNodePtr.i); - if(c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.isWaitingFor(failedNodePtr.i)){ + bool wf = c_MASTER_LCPREQ_Counter.isWaitingFor(failedNodePtr.i); + + if(c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.isWaitingFor(failedNodePtr.i)) + { jam(); LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend(); rep->nodeId = failedNodePtr.i; rep->lcpId = SYSFILE->latestLCP_ID; rep->blockNo = DBDIH; sendSignal(reference(), GSN_LCP_COMPLETE_REP, signal, - LcpCompleteRep::SignalLength, JBB); + LcpCompleteRep::SignalLength, JBB); } - - /** - * Check if we'r waiting for the failed node's LQH to complete - * - * Note that this is ran "before" LCP master take over - */ - if(c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(nodeId)){ + + bool lcp_complete_rep = false; + if (!wf) + { jam(); - - LcpCompleteRep * 
rep = (LcpCompleteRep*)signal->getDataPtrSend(); - rep->nodeId = nodeId; - rep->lcpId = SYSFILE->latestLCP_ID; - rep->blockNo = DBLQH; - sendSignal(reference(), GSN_LCP_COMPLETE_REP, signal, - LcpCompleteRep::SignalLength, JBB); - - if(c_lcpState.m_LAST_LCP_FRAG_ORD.isWaitingFor(nodeId)){ + + /** + * Check if we'r waiting for the failed node's LQH to complete + * + * Note that this is ran "before" LCP master take over + */ + if(c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(nodeId)){ jam(); - /** - * Make sure we're ready to accept it - */ - c_lcpState.m_LAST_LCP_FRAG_ORD.clearWaitingFor(nodeId); + + lcp_complete_rep = true; + LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend(); + rep->nodeId = nodeId; + rep->lcpId = SYSFILE->latestLCP_ID; + rep->blockNo = DBLQH; + sendSignal(reference(), GSN_LCP_COMPLETE_REP, signal, + LcpCompleteRep::SignalLength, JBB); + + if(c_lcpState.m_LAST_LCP_FRAG_ORD.isWaitingFor(nodeId)){ + jam(); + /** + * Make sure we're ready to accept it + */ + c_lcpState.m_LAST_LCP_FRAG_ORD.clearWaitingFor(nodeId); + } } } @@ -4557,7 +4567,9 @@ void Dbdih::failedNodeLcpHandling(Signal StartLcpConf::SignalLength, JBB); }//if - if (c_EMPTY_LCP_REQ_Counter.isWaitingFor(failedNodePtr.i)) { +dosend: + if (c_EMPTY_LCP_REQ_Counter.isWaitingFor(failedNodePtr.i)) + { jam(); EmptyLcpConf * const rep = (EmptyLcpConf *)&signal->theData[0]; rep->senderNodeId = failedNodePtr.i; @@ -4568,8 +4580,14 @@ void Dbdih::failedNodeLcpHandling(Signal rep->idle = true; sendSignal(reference(), GSN_EMPTY_LCP_CONF, signal, EmptyLcpConf::SignalLength, JBB); - }//if - + } + else if (!c_EMPTY_LCP_REQ_Counter.done() && lcp_complete_rep) + { + jam(); + c_EMPTY_LCP_REQ_Counter.setWaitingFor(failedNodePtr.i); + goto dosend; + } + if (c_MASTER_LCPREQ_Counter.isWaitingFor(failedNodePtr.i)) { jam(); MasterLCPRef * const ref = (MasterLCPRef *)&signal->theData[0]; @@ -4637,19 +4655,36 @@ Dbdih::startLcpMasterTakeOver(Signal* si 
c_lcpMasterTakeOverState.set(LMTOS_WAIT_EMPTY_LCP, __LINE__); - if(c_EMPTY_LCP_REQ_Counter.done()){ - jam(); - c_lcpState.m_LAST_LCP_FRAG_ORD.clearWaitingFor(); - - EmptyLcpReq* req = (EmptyLcpReq*)signal->getDataPtrSend(); - req->senderRef = reference(); - sendLoopMacro(EMPTY_LCP_REQ, sendEMPTY_LCP_REQ); - ndbrequire(!c_EMPTY_LCP_REQ_Counter.done()); - } else { - /** - * Node failure during master take over... - */ - g_eventLogger.info("Nodefail during master take over (old: %d)", oldNode); + + EmptyLcpReq* req = (EmptyLcpReq*)signal->getDataPtrSend(); + req->senderRef = reference(); + { + NodeRecordPtr specNodePtr; + specNodePtr.i = cfirstAliveNode; + do { + jam(); + ptrCheckGuard(specNodePtr, MAX_NDB_NODES, nodeRecord); + if (!c_EMPTY_LCP_REQ_Counter.isWaitingFor(specNodePtr.i)) + { + jam(); + c_EMPTY_LCP_REQ_Counter.setWaitingFor(specNodePtr.i); + if (!(ERROR_INSERTED(7209) && specNodePtr.i == getOwnNodeId())) + { + sendEMPTY_LCP_REQ(signal, specNodePtr.i); + } + else + { + ndbout_c("NOT sending EMPTY_LCP_REQ to %u", specNodePtr.i); + } + + if (c_lcpState.m_LAST_LCP_FRAG_ORD.isWaitingFor(specNodePtr.i)) + { + jam(); + c_lcpState.m_LAST_LCP_FRAG_ORD.clearWaitingFor(); + } + } + specNodePtr.i = specNodePtr.p->nextNode; + } while (specNodePtr.i != RNIL); } NodeRecordPtr nodePtr; @@ -5639,6 +5674,9 @@ void Dbdih::execEMPTY_LCP_CONF(Signal* s const EmptyLcpConf * const conf = (EmptyLcpConf *)&signal->theData[0]; Uint32 nodeId = conf->senderNodeId; + CRASH_INSERTION(7206); + + if(!conf->idle){ jam(); if (conf->tableId < c_lcpMasterTakeOverState.minTableId) { @@ -5716,6 +5754,25 @@ void Dbdih::execMASTER_LCPREQ(Signal* si jamEntry(); const BlockReference newMasterBlockref = req->masterRef; + CRASH_INSERTION(7205); + + if (ERROR_INSERTED(7207)) + { + jam(); + SET_ERROR_INSERT_VALUE(7208); + sendSignalWithDelay(reference(), GSN_MASTER_LCPREQ, signal, + 500, signal->getLength()); + return; + } + + if (ERROR_INSERTED(7208)) + { + jam(); + signal->theData[0] = 9999; + 
sendSignal(numberToRef(CMVMI, refToNode(newMasterBlockref)), + GSN_NDB_TAMPER, signal, 1, JBB); + } + if (newMasterBlockref != cmasterdihref) { jam(); @@ -5738,6 +5795,11 @@ void Dbdih::execMASTER_LCPREQ(Signal* si jam(); ndbrequire(0); } + + if (ERROR_INSERTED(7209)) + { + SET_ERROR_INSERT_VALUE(7210); + } sendMASTER_LCPCONF(signal); }//Dbdih::execMASTER_LCPREQ() @@ -6081,12 +6143,22 @@ void Dbdih::execMASTER_LCPREF(Signal* si { const MasterLCPRef * const ref = (MasterLCPRef *)&signal->theData[0]; jamEntry(); - receiveLoopMacro(MASTER_LCPREQ, ref->senderNodeId); + + Uint32 senderNodeId = ref->senderNodeId; + Uint32 failedNodeId = ref->failedNodeId; + + if (c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(senderNodeId)) + { + jam(); + c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.clearWaitingFor(senderNodeId); + } + + receiveLoopMacro(MASTER_LCPREQ, senderNodeId); /*-------------------------------------------------------------------------*/ // We have now received all responses and are ready to take over the LCP // protocol as master. 
/*-------------------------------------------------------------------------*/ - MASTER_LCPhandling(signal, ref->failedNodeId); + MASTER_LCPhandling(signal, failedNodeId); }//Dbdih::execMASTER_LCPREF() void Dbdih::MASTER_LCPhandling(Signal* signal, Uint32 failedNodeId) @@ -10053,7 +10125,15 @@ void Dbdih::execLCP_FRAG_REP(Signal* sig signal->theData[1] = tabPtr.i; sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB); - checkLcpAllTablesDoneInLqh(); + bool ret = checkLcpAllTablesDoneInLqh(); + if (ret && ERROR_INSERTED(7209)) + { + jam(); + + signal->theData[0] = 9999; + sendSignal(numberToRef(CMVMI, cmasterNodeId), + GSN_NDB_TAMPER, signal, 1, JBB); + } } } @@ -10348,12 +10428,30 @@ void Dbdih::checkLcpCompletedLab(Signal* CRASH_INSERTION2(7027, isMaster()); CRASH_INSERTION2(7018, !isMaster()); - if(c_lcpState.lcpStatus == LCP_TAB_COMPLETED){ + if(c_lcpState.lcpStatus == LCP_TAB_COMPLETED) + { /** * We'r done */ + + if (ERROR_INSERTED(7209)) + { + signal->theData[0] = DihContinueB::ZCHECK_LCP_COMPLETED; + sendSignal(reference(), GSN_CONTINUEB, signal, 1, JBB); + return; + } + c_lcpState.setLcpStatus(LCP_TAB_SAVED, __LINE__); sendLCP_COMPLETE_REP(signal); + + if (ERROR_INSERTED(7210)) + { + CLEAR_ERROR_INSERT_VALUE; + EmptyLcpReq* req = (EmptyLcpReq*)signal->getDataPtr(); + req->senderRef = reference(); + sendEMPTY_LCP_REQ(signal, getOwnNodeId()); + } + return; } @@ -10365,13 +10463,28 @@ void Dbdih::checkLcpCompletedLab(Signal* void Dbdih::sendLCP_COMPLETE_REP(Signal* signal){ jam(); - LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend(); - rep->nodeId = getOwnNodeId(); - rep->lcpId = SYSFILE->latestLCP_ID; - rep->blockNo = DBDIH; - - sendSignal(c_lcpState.m_masterLcpDihRef, GSN_LCP_COMPLETE_REP, signal, - LcpCompleteRep::SignalLength, JBB); + + /** + * Quick and dirty fix for bug#36276: don't send + * LCP_COMPLETE_REP to the same node for the same LCP twice + */ + bool alreadysent = + c_lcpState.m_lastLCP_COMPLETE_REP_id == SYSFILE->latestLCP_ID && + 
c_lcpState.m_lastLCP_COMPLETE_REP_ref == c_lcpState.m_masterLcpDihRef; + + if (!alreadysent) + { + LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend(); + rep->nodeId = getOwnNodeId(); + rep->lcpId = SYSFILE->latestLCP_ID; + rep->blockNo = DBDIH; + + sendSignal(c_lcpState.m_masterLcpDihRef, GSN_LCP_COMPLETE_REP, signal, + LcpCompleteRep::SignalLength, JBB); + + c_lcpState.m_lastLCP_COMPLETE_REP_id = SYSFILE->latestLCP_ID; + c_lcpState.m_lastLCP_COMPLETE_REP_ref = c_lcpState.m_masterLcpDihRef; + } /** * Say that an initial node restart does not need to be redone @@ -11426,7 +11539,7 @@ void Dbdih::initCommonData() c_lcpState.ctimer = 0; c_lcpState.immediateLcpStart = false; c_lcpState.m_MASTER_LCPREQ_Received = false; - + c_lcpState.m_lastLCP_COMPLETE_REP_ref = 0; cmasterdihref = 0; cmasterNodeId = 0; cmasterState = MASTER_IDLE; diff -Nrup a/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp b/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp --- a/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp 2007-07-04 22:42:28 +02:00 +++ b/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp 2008-04-23 16:08:36 +02:00 @@ -6810,6 +6810,22 @@ void Dblqh::lqhTransNextLab(Signal* sign * * now scan markers */ + if (ERROR_INSERTED(5050)) + { + ndbout_c("send ZSCAN_MARKERS with 5s delay and killing master"); + CLEAR_ERROR_INSERT_VALUE; + signal->theData[0] = ZSCAN_MARKERS; + signal->theData[1] = tcNodeFailptr.i; + signal->theData[2] = 0; + signal->theData[3] = RNIL; + sendSignalWithDelay(cownref, GSN_CONTINUEB, signal, 5000, 4); + + signal->theData[0] = 9999; + sendSignal(numberToRef(CMVMI, + refToNode(tcNodeFailptr.p->newTcBlockref)), + GSN_NDB_TAMPER, signal, 1, JBB); + return; + } scanMarkers(signal, tcNodeFailptr.i, 0, RNIL); return; }//if @@ -6894,6 +6910,20 @@ Dblqh::scanMarkers(Signal* signal, tcNodeFailPtr.i = tcNodeFail; ptrCheckGuard(tcNodeFailPtr, ctcNodeFailrecFileSize, tcNodeFailRecord); const Uint32 crashedTcNodeId = tcNodeFailPtr.p->oldNodeId; + + if (tcNodeFailPtr.p->tcFailStatus == 
TcNodeFailRecord::TC_STATE_BREAK) + { + jam(); + + /* ---------------------------------------------------------------------- + * AN INTERRUPTION TO THIS NODE FAIL HANDLING WAS RECEIVED AND A NEW + * TC HAVE BEEN ASSIGNED TO TAKE OVER THE FAILED TC. PROBABLY THE OLD + * NEW TC HAVE FAILED. + * ---------------------------------------------------------------------- */ + tcNodeFailptr = tcNodeFailPtr; + lqhTransNextLab(signal); + return; + } CommitAckMarkerIterator iter; if(i == RNIL){ diff -Nrup a/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp b/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp --- a/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp 2007-12-13 21:31:36 +01:00 +++ b/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp 2008-04-23 16:08:36 +02:00 @@ -7154,21 +7154,10 @@ void Dbtc::execNODE_FAILREP(Signal* sign }//if }//if - if (getOwnNodeId() != tnewMasterId) - { - jam(); - /** - * Only master does takeover currently - */ - hostptr.p->m_nf_bits &= ~HostRecord::NF_TAKEOVER; - } - else - { - jam(); - signal->theData[0] = hostptr.i; - sendSignal(cownref, GSN_TAKE_OVERTCREQ, signal, 1, JBB); - } - + jam(); + signal->theData[0] = hostptr.i; + sendSignal(cownref, GSN_TAKE_OVERTCREQ, signal, 1, JBB); + checkScanActiveInFailedLqh(signal, 0, hostptr.i); checkWaitDropTabFailedLqh(signal, hostptr.i, 0); // nodeid, tableid nodeFailCheckTransactions(signal, 0, hostptr.i); @@ -7205,6 +7194,14 @@ Dbtc::checkNodeFailComplete(Signal* sign sendSignal(cdihblockref, GSN_NF_COMPLETEREP, signal, NFCompleteRep::SignalLength, JBB); } + + CRASH_INSERTION(8058); + if (ERROR_INSERTED(8059)) + { + signal->theData[0] = 9999; + sendSignalWithDelay(numberToRef(CMVMI, hostptr.i), + GSN_NDB_TAMPER, signal, 100, 1); + } } void Dbtc::checkScanActiveInFailedLqh(Signal* signal, @@ -7273,7 +7270,14 @@ Dbtc::nodeFailCheckTransactions(Signal* Ptr transPtr; Uint32 TtcTimer = ctcTimer; Uint32 TapplTimeout = c_appl_timeout_value; - for (transPtr.i = transPtrI; transPtr.i < capiConnectFilesize; transPtr.i++) + Uint32 RT_BREAK = 64; + 
Uint32 endPtrI = transPtrI + RT_BREAK; + if (endPtrI > capiConnectFilesize) + { + endPtrI = capiConnectFilesize; + } + + for (transPtr.i = transPtrI; transPtr.i < endPtrI; transPtr.i++) { ptrCheckGuard(transPtr, capiConnectFilesize, apiConnectRecord); if (transPtr.p->m_transaction_nodes.get(failedNodeId)) @@ -7285,18 +7289,25 @@ Dbtc::nodeFailCheckTransactions(Signal* setApiConTimer(transPtr.i, TtcTimer - 2, __LINE__); timeOutFoundLab(signal, transPtr.i, ZNODEFAIL_BEFORE_COMMIT); c_appl_timeout_value = TapplTimeout; + + transPtr.i++; + break; } - - // Send CONTINUEB to continue later + } + + if (transPtr.i == capiConnectFilesize) + { + jam(); + checkNodeFailComplete(signal, failedNodeId, + HostRecord::NF_CHECK_TRANSACTION); + } + else + { signal->theData[0] = TcContinueB::ZNF_CHECK_TRANSACTIONS; - signal->theData[1] = transPtr.i + 1; // Check next + signal->theData[1] = transPtr.i; signal->theData[2] = failedNodeId; sendSignal(cownref, GSN_CONTINUEB, signal, 3, JBB); - return; } - - checkNodeFailComplete(signal, failedNodeId, - HostRecord::NF_CHECK_TRANSACTION); } @@ -7319,7 +7330,23 @@ void Dbtc::execTAKE_OVERTCCONF(Signal* s if (signal->getSendersBlockRef() != reference()) { jam(); - return; + /** + * Node should be in queue + */ + Uint32 i = 0; + Uint32 end = tcNodeFailptr.p->queueIndex; + for (; iqueueList[i] == hostptr.i) + { + jam(); + break; + } + } + ndbrequire(i != end); + tcNodeFailptr.p->queueList[i] = tcNodeFailptr.p->queueList[end-1]; + tcNodeFailptr.p->queueIndex = end - 1; } checkNodeFailComplete(signal, hostptr.i, HostRecord::NF_TAKEOVER); @@ -7331,7 +7358,9 @@ void Dbtc::execTAKE_OVERTCREQ(Signal* si tfailedNodeId = signal->theData[0]; tcNodeFailptr.i = 0; ptrAss(tcNodeFailptr, tcFailRecord); - if (tcNodeFailptr.p->failStatus != FS_IDLE) { + if (tcNodeFailptr.p->failStatus != FS_IDLE || + cmasterNodeId != getOwnNodeId()) + { jam(); /*------------------------------------------------------------*/ /* WE CAN CURRENTLY ONLY HANDLE ONE TAKE OVER AT A 
TIME */ @@ -7385,6 +7414,8 @@ void Dbtc::execLQH_TRANSCONF(Signal* sig jamEntry(); LqhTransConf * const lqhTransConf = (LqhTransConf *)&signal->theData[0]; + CRASH_INSERTION(8060); + tcNodeFailptr.i = lqhTransConf->tcRef; ptrCheckGuard(tcNodeFailptr, 1, tcFailRecord); tnodeid = lqhTransConf->lqhNodeId; @@ -7447,6 +7478,8 @@ void Dbtc::nodeTakeOverCompletedLab(Sign { Uint32 guard0; + CRASH_INSERTION(8061); + hostptr.i = tnodeid; ptrCheckGuard(hostptr, chostFilesize, hostRecord); hostptr.p->lqhTransStatus = LTS_IDLE; @@ -7554,6 +7587,8 @@ void Dbtc::completeTransAtTakeOverDoLast }//if tcNodeFailptr.p->takeOverProcState[TtakeOverInd] = ZTAKE_OVER_IDLE; tcNodeFailptr.p->completedTakeOver++; + + CRASH_INSERTION(8062); if (tcNodeFailptr.p->completedTakeOver == cnoParallelTakeOver) { jam(); diff -Nrup a/ndb/test/ndbapi/testNodeRestart.cpp b/ndb/test/ndbapi/testNodeRestart.cpp --- a/ndb/test/ndbapi/testNodeRestart.cpp 2007-11-07 20:57:19 +01:00 +++ b/ndb/test/ndbapi/testNodeRestart.cpp 2008-04-23 16:08:36 +02:00 @@ -23,6 +23,7 @@ #include #include #include +#include int runLoadTable(NDBT_Context* ctx, NDBT_Step* step){ @@ -1392,6 +1393,480 @@ runBug32160(NDBT_Context* ctx, NDBT_Step return NDBT_OK; } +int +runMNF(NDBT_Context* ctx, NDBT_Step* step) +{ + int result = NDBT_OK; + NdbRestarter res; + + if (res.getNumDbNodes() < 2) + { + return NDBT_OK; + } + + Vector part0; + Vector part1; + Bitmask<255> part0mask; + Bitmask<255> part1mask; + Bitmask<255> ngmask; + for (int i = 0; igetNumLoops(); + while (loops-- && !ctx->isTestStopped()) + { + int cnt, *nodes; + int master = res.getMasterNodeId(); + int nextMaster = res.getNextMasterNodeId(master); + + bool cmf = false; + if (part0mask.get(master) && part0mask.get(nextMaster)) + { + cmf = true; + cnt = part0.size(); + nodes = part0.getBase(); + printf("restarting part0"); + } + else if(part1mask.get(master) && part1mask.get(nextMaster)) + { + cmf = true; + cnt = part1.size(); + nodes = part1.getBase(); + printf("restarting 
part1"); + } + else + { + cmf = false; + if (loops & 1) + { + cnt = part0.size(); + nodes = part0.getBase(); + printf("restarting part0"); + } + else + { + cnt = part1.size(); + nodes = part0.getBase(); + printf("restarting part0"); + } + } + + int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 }; + for (int i = 0; istopTest(); + return NDBT_OK; +} + +int +runBug36199(NDBT_Context* ctx, NDBT_Step* step) +{ + int result = NDBT_OK; + int loops = ctx->getNumLoops(); + NdbRestarter res; + + if (res.getNumDbNodes() < 4) + return NDBT_OK; + + int master = res.getMasterNodeId(); + int nextMaster = res.getNextMasterNodeId(master); + int victim = res.getRandomNodeSameNodeGroup(nextMaster, rand()); + if (victim == master) + { + victim = res.getRandomNodeOtherNodeGroup(nextMaster, rand()); + } + + ndbout_c("master: %u next master: %u victim: %u", + master, nextMaster, victim); + + int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 }; + res.dumpStateOneNode(master, val2, 2); + res.dumpStateOneNode(victim, val2, 2); + + res.insertErrorInNode(victim, 7205); + res.insertErrorInNode(master, 7014); + int lcp = 7099; + res.dumpStateOneNode(master, &lcp, 1); + + int nodes[2]; + nodes[0] = master; + nodes[1] = victim; + if (res.waitNodesNoStart(nodes, 2)) + { + return NDBT_FAILED; + } + + if (res.startNodes(nodes, 2)) + { + return NDBT_FAILED; + } + + if (res.waitClusterStarted()) + return NDBT_FAILED; + + return NDBT_OK; +} + +int +runBug36246(NDBT_Context* ctx, NDBT_Step* step) +{ + int result = NDBT_OK; + int loops = ctx->getNumLoops(); + NdbRestarter res; + Ndb* pNdb = GETNDB(step); + + if (res.getNumDbNodes() < 4) + return NDBT_OK; + + HugoOperations hugoOps(*ctx->getTab()); +restartloop: + int tryloop = 0; + int master = res.getMasterNodeId(); + int nextMaster = res.getNextMasterNodeId(master); + +loop: + if(hugoOps.startTransaction(pNdb) != 0) + return NDBT_FAILED; + + if(hugoOps.pkUpdateRecord(pNdb, 1, 1) != 0) + return NDBT_FAILED; + + 
if(hugoOps.execute_NoCommit(pNdb) != 0) + return NDBT_FAILED; + + int victim = hugoOps.getTransaction()->getConnectedNodeId(); + printf("master: %u nextMaster: %u victim: %u", + master, nextMaster, victim); + if (victim == master || victim == nextMaster || + res.getNodeGroup(victim) == res.getNodeGroup(master) || + res.getNodeGroup(victim) == res.getNodeGroup(nextMaster)) + { + hugoOps.execute_Rollback(pNdb); + hugoOps.closeTransaction(pNdb); + tryloop++; + if (tryloop == 10) + { + ndbout_c(" -> restarting next master: %u", nextMaster); + res.restartOneDbNode(nextMaster, + /** initial */ false, + /** nostart */ true, + /** abort */ true); + + res.waitNodesNoStart(&nextMaster, 1); + res.startNodes(&nextMaster, 1); + if (res.waitClusterStarted()) + return NDBT_FAILED; + goto restartloop; + } + else + { + ndbout_c(" -> loop"); + goto loop; + } + } + ndbout_c(" -> go go gadget skates"); + + int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 }; + res.dumpStateOneNode(master, val2, 2); + res.dumpStateOneNode(victim, val2, 2); + + res.insertErrorInNode(master, 8060); + res.insertErrorInNode(victim, 9999); + + int nodes[2]; + nodes[0] = master; + nodes[1] = victim; + if (res.waitNodesNoStart(nodes, 2)) + { + return NDBT_FAILED; + } + + if (res.startNodes(nodes, 2)) + { + return NDBT_FAILED; + } + + if (res.waitClusterStarted()) + return NDBT_FAILED; + + hugoOps.execute_Rollback(pNdb); + hugoOps.closeTransaction(pNdb); + + return NDBT_OK; +} + +int +runBug36247(NDBT_Context* ctx, NDBT_Step* step) +{ + int result = NDBT_OK; + int loops = ctx->getNumLoops(); + NdbRestarter res; + Ndb* pNdb = GETNDB(step); + + if (res.getNumDbNodes() < 4) + return NDBT_OK; + + HugoOperations hugoOps(*ctx->getTab()); + +restartloop: + int tryloop = 0; + int master = res.getMasterNodeId(); + int nextMaster = res.getNextMasterNodeId(master); + +loop: + if(hugoOps.startTransaction(pNdb) != 0) + return NDBT_FAILED; + + if(hugoOps.pkUpdateRecord(pNdb, 1, 100) != 0) + return NDBT_FAILED; + 
+ if(hugoOps.execute_NoCommit(pNdb) != 0) + return NDBT_FAILED; + + int victim = hugoOps.getTransaction()->getConnectedNodeId(); + printf("master: %u nextMaster: %u victim: %u", + master, nextMaster, victim); + if (victim == master || victim == nextMaster || + res.getNodeGroup(victim) == res.getNodeGroup(master) || + res.getNodeGroup(victim) == res.getNodeGroup(nextMaster)) + { + hugoOps.execute_Rollback(pNdb); + hugoOps.closeTransaction(pNdb); + tryloop++; + if (tryloop == 10) + { + ndbout_c(" -> restarting next master: %u", nextMaster); + res.restartOneDbNode(nextMaster, + /** initial */ false, + /** nostart */ true, + /** abort */ true); + + res.waitNodesNoStart(&nextMaster, 1); + res.startNodes(&nextMaster, 1); + if (res.waitClusterStarted()) + return NDBT_FAILED; + goto restartloop; + } + else + { + ndbout_c(" -> loop"); + goto loop; + } + } + ndbout_c(" -> go go gadget skates"); + + int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 }; + res.dumpStateOneNode(master, val2, 2); + res.dumpStateOneNode(victim, val2, 2); + + for (int i = 0; igetNumLoops(); + NdbRestarter res; + Ndb* pNdb = GETNDB(step); + + if (res.getNumDbNodes() < 4) + return NDBT_OK; + + int master = res.getMasterNodeId(); + int nextMaster = res.getNextMasterNodeId(master); + int victim = res.getRandomNodeSameNodeGroup(nextMaster, rand()); + if (victim == master) + { + victim = res.getRandomNodeOtherNodeGroup(nextMaster, rand()); + } + + ndbout_c("master: %u nextMaster: %u victim: %u", + master, nextMaster, victim); + + int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 }; + res.dumpStateOneNode(master, val2, 2); + res.insertErrorInNode(victim, 7209); + + int lcp = 7099; + res.dumpStateOneNode(master, &lcp, 1); + + if (res.waitNodesNoStart(&master, 1)) + { + return NDBT_FAILED; + } + + if (res.startNodes(&master, 1)) + { + return NDBT_FAILED; + } + + if (res.waitClusterStarted()) + return NDBT_FAILED; + + return NDBT_OK; +} + NDBT_TESTSUITE(testNodeRestart); 
TESTCASE("NoLoad", "Test that one node at a time can be stopped and then restarted "\ @@ -1733,6 +2208,29 @@ TESTCASE("Bug29364", ""){ } TESTCASE("Bug32160", ""){ INITIALIZER(runBug32160); +} +TESTCASE("MNF", ""){ + INITIALIZER(runLoadTable); + STEP(runMNF); + STEP(runScanUpdateUntilStopped); +} +TESTCASE("Bug36199", ""){ + INITIALIZER(runBug36199); +} +TESTCASE("Bug36246", ""){ + INITIALIZER(runLoadTable); + STEP(runBug36246); + VERIFIER(runClearTable); +} +TESTCASE("Bug36247", ""){ + INITIALIZER(runLoadTable); + STEP(runBug36247); + VERIFIER(runClearTable); +} +TESTCASE("Bug36276", ""){ + INITIALIZER(runLoadTable); + STEP(runBug36276); + VERIFIER(runClearTable); } NDBT_TESTSUITE_END(testNodeRestart); diff -Nrup a/ndb/test/run-test/daily-basic-tests.txt b/ndb/test/run-test/daily-basic-tests.txt --- a/ndb/test/run-test/daily-basic-tests.txt 2008-01-31 23:14:21 +01:00 +++ b/ndb/test/run-test/daily-basic-tests.txt 2008-04-23 16:08:36 +02:00 @@ -791,3 +791,25 @@ max-time: 180 cmd: testIndex args: -n Bug28804_ATTRINFO T1 T3 +# 2008-04-22 +max-time: 1500 +cmd: testNodeRestart +args: -n MNF T1 + +max-time: 300 +cmd: testNodeRestart +args: -n Bug36199 T1 + +max-time: 300 +cmd: testNodeRestart +args: -n Bug36246 T1 + +max-time: 300 +cmd: testNodeRestart +args: -n Bug36247 T1 + +max-time: 300 +cmd: testNodeRestart +args: -n Bug36276 T1 + +# EOF