Below is the list of changes that have just been committed into jonas's
local 5.0 repository. When jonas does a push, these changes
will be propagated to the main repository and, within 24 hours of the
push, to the public repository.
For information on how to access the public repository,
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html
ChangeSet@stripped, 2008-04-23 16:08:38+02:00, jonas@stripped +7 -0
ndb -
fix for bug#36199, bug#36246, bug#36247, bug#36276
all related to cascading master failure
ndb/src/kernel/blocks/ERROR_codes.txt@stripped, 2008-04-23 16:08:36+02:00,
jonas@stripped +3 -3
update error codes
ndb/src/kernel/blocks/dbdih/Dbdih.hpp@stripped, 2008-04-23 16:08:36+02:00,
jonas@stripped +3 -0
fix for bug#36199, bug#36246, bug#36247, bug#36276
ndb/src/kernel/blocks/dbdih/DbdihMain.cpp@stripped, 2008-04-23 16:08:36+02:00,
jonas@stripped +163 -50
fix for bug#36199, bug#36246, bug#36247, bug#36276
ndb/src/kernel/blocks/dblqh/DblqhMain.cpp@stripped, 2008-04-23 16:08:36+02:00,
jonas@stripped +30 -0
fix for bug#36199, bug#36246, bug#36247, bug#36276
ndb/src/kernel/blocks/dbtc/DbtcMain.cpp@stripped, 2008-04-23 16:08:36+02:00,
jonas@stripped +60 -25
fix for bug#36199, bug#36246, bug#36247, bug#36276
ndb/test/ndbapi/testNodeRestart.cpp@stripped, 2008-04-23 16:08:36+02:00,
jonas@stripped +498 -0
fix for bug#36199, bug#36246, bug#36247, bug#36276
ndb/test/run-test/daily-basic-tests.txt@stripped, 2008-04-23 16:08:36+02:00,
jonas@stripped +22 -0
fix for bug#36199, bug#36246, bug#36247, bug#36276
diff -Nrup a/ndb/src/kernel/blocks/ERROR_codes.txt b/ndb/src/kernel/blocks/ERROR_codes.txt
--- a/ndb/src/kernel/blocks/ERROR_codes.txt 2007-11-07 20:57:19 +01:00
+++ b/ndb/src/kernel/blocks/ERROR_codes.txt 2008-04-23 16:08:36 +02:00
@@ -3,10 +3,10 @@ Next NDBCNTR 1002
Next NDBFS 2000
Next DBACC 3002
Next DBTUP 4014
-Next DBLQH 5043
+Next DBLQH 5051
Next DBDICT 6007
-Next DBDIH 7195
-Next DBTC 8052
+Next DBDIH 7211
+Next DBTC 8063
Next CMVMI 9000
Next BACKUP 10022
Next DBUTIL 11002
diff -Nrup a/ndb/src/kernel/blocks/dbdih/Dbdih.hpp b/ndb/src/kernel/blocks/dbdih/Dbdih.hpp
--- a/ndb/src/kernel/blocks/dbdih/Dbdih.hpp 2007-11-07 20:57:19 +01:00
+++ b/ndb/src/kernel/blocks/dbdih/Dbdih.hpp 2008-04-23 16:08:36 +02:00
@@ -1342,6 +1342,9 @@ private:
Uint32 m_masterLcpDihRef;
bool m_MASTER_LCPREQ_Received;
Uint32 m_MASTER_LCPREQ_FailedNodeId;
+
+ Uint32 m_lastLCP_COMPLETE_REP_id;
+ Uint32 m_lastLCP_COMPLETE_REP_ref;
} c_lcpState;
/*------------------------------------------------------------------------*/
diff -Nrup a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp
--- a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2007-11-07 20:57:19 +01:00
+++ b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2008-04-23 16:08:36 +02:00
@@ -4501,37 +4501,47 @@ void Dbdih::failedNodeLcpHandling(Signal
c_lcpState.m_participatingDIH.clear(failedNodePtr.i);
c_lcpState.m_participatingLQH.clear(failedNodePtr.i);
- if(c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.isWaitingFor(failedNodePtr.i)){
+ bool wf = c_MASTER_LCPREQ_Counter.isWaitingFor(failedNodePtr.i);
+
+ if(c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.isWaitingFor(failedNodePtr.i))
+ {
jam();
LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend();
rep->nodeId = failedNodePtr.i;
rep->lcpId = SYSFILE->latestLCP_ID;
rep->blockNo = DBDIH;
sendSignal(reference(), GSN_LCP_COMPLETE_REP, signal,
- LcpCompleteRep::SignalLength, JBB);
+ LcpCompleteRep::SignalLength, JBB);
}
-
- /**
- * Check if we'r waiting for the failed node's LQH to complete
- *
- * Note that this is ran "before" LCP master take over
- */
- if(c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(nodeId)){
+
+ bool lcp_complete_rep = false;
+ if (!wf)
+ {
jam();
-
- LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend();
- rep->nodeId = nodeId;
- rep->lcpId = SYSFILE->latestLCP_ID;
- rep->blockNo = DBLQH;
- sendSignal(reference(), GSN_LCP_COMPLETE_REP, signal,
- LcpCompleteRep::SignalLength, JBB);
-
- if(c_lcpState.m_LAST_LCP_FRAG_ORD.isWaitingFor(nodeId)){
+
+ /**
+ * Check if we'r waiting for the failed node's LQH to complete
+ *
+ * Note that this is ran "before" LCP master take over
+ */
+ if(c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(nodeId)){
jam();
- /**
- * Make sure we're ready to accept it
- */
- c_lcpState.m_LAST_LCP_FRAG_ORD.clearWaitingFor(nodeId);
+
+ lcp_complete_rep = true;
+ LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend();
+ rep->nodeId = nodeId;
+ rep->lcpId = SYSFILE->latestLCP_ID;
+ rep->blockNo = DBLQH;
+ sendSignal(reference(), GSN_LCP_COMPLETE_REP, signal,
+ LcpCompleteRep::SignalLength, JBB);
+
+ if(c_lcpState.m_LAST_LCP_FRAG_ORD.isWaitingFor(nodeId)){
+ jam();
+ /**
+ * Make sure we're ready to accept it
+ */
+ c_lcpState.m_LAST_LCP_FRAG_ORD.clearWaitingFor(nodeId);
+ }
}
}
@@ -4557,7 +4567,9 @@ void Dbdih::failedNodeLcpHandling(Signal
StartLcpConf::SignalLength, JBB);
}//if
- if (c_EMPTY_LCP_REQ_Counter.isWaitingFor(failedNodePtr.i)) {
+dosend:
+ if (c_EMPTY_LCP_REQ_Counter.isWaitingFor(failedNodePtr.i))
+ {
jam();
EmptyLcpConf * const rep = (EmptyLcpConf *)&signal->theData[0];
rep->senderNodeId = failedNodePtr.i;
@@ -4568,8 +4580,14 @@ void Dbdih::failedNodeLcpHandling(Signal
rep->idle = true;
sendSignal(reference(), GSN_EMPTY_LCP_CONF, signal,
EmptyLcpConf::SignalLength, JBB);
- }//if
-
+ }
+ else if (!c_EMPTY_LCP_REQ_Counter.done() && lcp_complete_rep)
+ {
+ jam();
+ c_EMPTY_LCP_REQ_Counter.setWaitingFor(failedNodePtr.i);
+ goto dosend;
+ }
+
if (c_MASTER_LCPREQ_Counter.isWaitingFor(failedNodePtr.i)) {
jam();
MasterLCPRef * const ref = (MasterLCPRef *)&signal->theData[0];
@@ -4637,19 +4655,36 @@ Dbdih::startLcpMasterTakeOver(Signal* si
c_lcpMasterTakeOverState.set(LMTOS_WAIT_EMPTY_LCP, __LINE__);
- if(c_EMPTY_LCP_REQ_Counter.done()){
- jam();
- c_lcpState.m_LAST_LCP_FRAG_ORD.clearWaitingFor();
-
- EmptyLcpReq* req = (EmptyLcpReq*)signal->getDataPtrSend();
- req->senderRef = reference();
- sendLoopMacro(EMPTY_LCP_REQ, sendEMPTY_LCP_REQ);
- ndbrequire(!c_EMPTY_LCP_REQ_Counter.done());
- } else {
- /**
- * Node failure during master take over...
- */
- g_eventLogger.info("Nodefail during master take over (old: %d)", oldNode);
+
+ EmptyLcpReq* req = (EmptyLcpReq*)signal->getDataPtrSend();
+ req->senderRef = reference();
+ {
+ NodeRecordPtr specNodePtr;
+ specNodePtr.i = cfirstAliveNode;
+ do {
+ jam();
+ ptrCheckGuard(specNodePtr, MAX_NDB_NODES, nodeRecord);
+ if (!c_EMPTY_LCP_REQ_Counter.isWaitingFor(specNodePtr.i))
+ {
+ jam();
+ c_EMPTY_LCP_REQ_Counter.setWaitingFor(specNodePtr.i);
+ if (!(ERROR_INSERTED(7209) && specNodePtr.i == getOwnNodeId()))
+ {
+ sendEMPTY_LCP_REQ(signal, specNodePtr.i);
+ }
+ else
+ {
+ ndbout_c("NOT sending EMPTY_LCP_REQ to %u", specNodePtr.i);
+ }
+
+ if (c_lcpState.m_LAST_LCP_FRAG_ORD.isWaitingFor(specNodePtr.i))
+ {
+ jam();
+ c_lcpState.m_LAST_LCP_FRAG_ORD.clearWaitingFor();
+ }
+ }
+ specNodePtr.i = specNodePtr.p->nextNode;
+ } while (specNodePtr.i != RNIL);
}
NodeRecordPtr nodePtr;
@@ -5639,6 +5674,9 @@ void Dbdih::execEMPTY_LCP_CONF(Signal* s
const EmptyLcpConf * const conf = (EmptyLcpConf *)&signal->theData[0];
Uint32 nodeId = conf->senderNodeId;
+ CRASH_INSERTION(7206);
+
+
if(!conf->idle){
jam();
if (conf->tableId < c_lcpMasterTakeOverState.minTableId) {
@@ -5716,6 +5754,25 @@ void Dbdih::execMASTER_LCPREQ(Signal* si
jamEntry();
const BlockReference newMasterBlockref = req->masterRef;
+ CRASH_INSERTION(7205);
+
+ if (ERROR_INSERTED(7207))
+ {
+ jam();
+ SET_ERROR_INSERT_VALUE(7208);
+ sendSignalWithDelay(reference(), GSN_MASTER_LCPREQ, signal,
+ 500, signal->getLength());
+ return;
+ }
+
+ if (ERROR_INSERTED(7208))
+ {
+ jam();
+ signal->theData[0] = 9999;
+ sendSignal(numberToRef(CMVMI, refToNode(newMasterBlockref)),
+ GSN_NDB_TAMPER, signal, 1, JBB);
+ }
+
if (newMasterBlockref != cmasterdihref)
{
jam();
@@ -5738,6 +5795,11 @@ void Dbdih::execMASTER_LCPREQ(Signal* si
jam();
ndbrequire(0);
}
+
+ if (ERROR_INSERTED(7209))
+ {
+ SET_ERROR_INSERT_VALUE(7210);
+ }
sendMASTER_LCPCONF(signal);
}//Dbdih::execMASTER_LCPREQ()
@@ -6081,12 +6143,22 @@ void Dbdih::execMASTER_LCPREF(Signal* si
{
const MasterLCPRef * const ref = (MasterLCPRef *)&signal->theData[0];
jamEntry();
- receiveLoopMacro(MASTER_LCPREQ, ref->senderNodeId);
+
+ Uint32 senderNodeId = ref->senderNodeId;
+ Uint32 failedNodeId = ref->failedNodeId;
+
+ if (c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(senderNodeId))
+ {
+ jam();
+ c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.clearWaitingFor(senderNodeId);
+ }
+
+ receiveLoopMacro(MASTER_LCPREQ, senderNodeId);
/*-------------------------------------------------------------------------*/
// We have now received all responses and are ready to take over the LCP
// protocol as master.
/*-------------------------------------------------------------------------*/
- MASTER_LCPhandling(signal, ref->failedNodeId);
+ MASTER_LCPhandling(signal, failedNodeId);
}//Dbdih::execMASTER_LCPREF()
void Dbdih::MASTER_LCPhandling(Signal* signal, Uint32 failedNodeId)
@@ -10053,7 +10125,15 @@ void Dbdih::execLCP_FRAG_REP(Signal* sig
signal->theData[1] = tabPtr.i;
sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
- checkLcpAllTablesDoneInLqh();
+ bool ret = checkLcpAllTablesDoneInLqh();
+ if (ret && ERROR_INSERTED(7209))
+ {
+ jam();
+
+ signal->theData[0] = 9999;
+ sendSignal(numberToRef(CMVMI, cmasterNodeId),
+ GSN_NDB_TAMPER, signal, 1, JBB);
+ }
}
}
@@ -10348,12 +10428,30 @@ void Dbdih::checkLcpCompletedLab(Signal*
CRASH_INSERTION2(7027, isMaster());
CRASH_INSERTION2(7018, !isMaster());
- if(c_lcpState.lcpStatus == LCP_TAB_COMPLETED){
+ if(c_lcpState.lcpStatus == LCP_TAB_COMPLETED)
+ {
/**
* We'r done
*/
+
+ if (ERROR_INSERTED(7209))
+ {
+ signal->theData[0] = DihContinueB::ZCHECK_LCP_COMPLETED;
+ sendSignal(reference(), GSN_CONTINUEB, signal, 1, JBB);
+ return;
+ }
+
c_lcpState.setLcpStatus(LCP_TAB_SAVED, __LINE__);
sendLCP_COMPLETE_REP(signal);
+
+ if (ERROR_INSERTED(7210))
+ {
+ CLEAR_ERROR_INSERT_VALUE;
+ EmptyLcpReq* req = (EmptyLcpReq*)signal->getDataPtr();
+ req->senderRef = reference();
+ sendEMPTY_LCP_REQ(signal, getOwnNodeId());
+ }
+
return;
}
@@ -10365,13 +10463,28 @@ void Dbdih::checkLcpCompletedLab(Signal*
void
Dbdih::sendLCP_COMPLETE_REP(Signal* signal){
jam();
- LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend();
- rep->nodeId = getOwnNodeId();
- rep->lcpId = SYSFILE->latestLCP_ID;
- rep->blockNo = DBDIH;
-
- sendSignal(c_lcpState.m_masterLcpDihRef, GSN_LCP_COMPLETE_REP, signal,
- LcpCompleteRep::SignalLength, JBB);
+
+ /**
+ * Quick and dirty fix for bug#36276 dont save
+ * LCP_COMPLETE_REP to same node same LCP twice
+ */
+ bool alreadysent =
+ c_lcpState.m_lastLCP_COMPLETE_REP_id == SYSFILE->latestLCP_ID &&
+ c_lcpState.m_lastLCP_COMPLETE_REP_ref == c_lcpState.m_masterLcpDihRef;
+
+ if (!alreadysent)
+ {
+ LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend();
+ rep->nodeId = getOwnNodeId();
+ rep->lcpId = SYSFILE->latestLCP_ID;
+ rep->blockNo = DBDIH;
+
+ sendSignal(c_lcpState.m_masterLcpDihRef, GSN_LCP_COMPLETE_REP, signal,
+ LcpCompleteRep::SignalLength, JBB);
+
+ c_lcpState.m_lastLCP_COMPLETE_REP_id = SYSFILE->latestLCP_ID;
+ c_lcpState.m_lastLCP_COMPLETE_REP_ref = c_lcpState.m_masterLcpDihRef;
+ }
/**
* Say that an initial node restart does not need to be redone
@@ -11426,7 +11539,7 @@ void Dbdih::initCommonData()
c_lcpState.ctimer = 0;
c_lcpState.immediateLcpStart = false;
c_lcpState.m_MASTER_LCPREQ_Received = false;
-
+ c_lcpState.m_lastLCP_COMPLETE_REP_ref = 0;
cmasterdihref = 0;
cmasterNodeId = 0;
cmasterState = MASTER_IDLE;
diff -Nrup a/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp b/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp
--- a/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp 2007-07-04 22:42:28 +02:00
+++ b/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp 2008-04-23 16:08:36 +02:00
@@ -6810,6 +6810,22 @@ void Dblqh::lqhTransNextLab(Signal* sign
*
* now scan markers
*/
+ if (ERROR_INSERTED(5050))
+ {
+ ndbout_c("send ZSCAN_MARKERS with 5s delay and killing master");
+ CLEAR_ERROR_INSERT_VALUE;
+ signal->theData[0] = ZSCAN_MARKERS;
+ signal->theData[1] = tcNodeFailptr.i;
+ signal->theData[2] = 0;
+ signal->theData[3] = RNIL;
+ sendSignalWithDelay(cownref, GSN_CONTINUEB, signal, 5000, 4);
+
+ signal->theData[0] = 9999;
+ sendSignal(numberToRef(CMVMI,
+ refToNode(tcNodeFailptr.p->newTcBlockref)),
+ GSN_NDB_TAMPER, signal, 1, JBB);
+ return;
+ }
scanMarkers(signal, tcNodeFailptr.i, 0, RNIL);
return;
}//if
@@ -6894,6 +6910,20 @@ Dblqh::scanMarkers(Signal* signal,
tcNodeFailPtr.i = tcNodeFail;
ptrCheckGuard(tcNodeFailPtr, ctcNodeFailrecFileSize, tcNodeFailRecord);
const Uint32 crashedTcNodeId = tcNodeFailPtr.p->oldNodeId;
+
+ if (tcNodeFailPtr.p->tcFailStatus == TcNodeFailRecord::TC_STATE_BREAK)
+ {
+ jam();
+
+ /* ----------------------------------------------------------------------
+ * AN INTERRUPTION TO THIS NODE FAIL HANDLING WAS RECEIVED AND A NEW
+ * TC HAVE BEEN ASSIGNED TO TAKE OVER THE FAILED TC. PROBABLY THE OLD
+ * NEW TC HAVE FAILED.
+ * ---------------------------------------------------------------------- */
+ tcNodeFailptr = tcNodeFailPtr;
+ lqhTransNextLab(signal);
+ return;
+ }
CommitAckMarkerIterator iter;
if(i == RNIL){
diff -Nrup a/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp b/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp
--- a/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp 2007-12-13 21:31:36 +01:00
+++ b/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp 2008-04-23 16:08:36 +02:00
@@ -7154,21 +7154,10 @@ void Dbtc::execNODE_FAILREP(Signal* sign
}//if
}//if
- if (getOwnNodeId() != tnewMasterId)
- {
- jam();
- /**
- * Only master does takeover currently
- */
- hostptr.p->m_nf_bits &= ~HostRecord::NF_TAKEOVER;
- }
- else
- {
- jam();
- signal->theData[0] = hostptr.i;
- sendSignal(cownref, GSN_TAKE_OVERTCREQ, signal, 1, JBB);
- }
-
+ jam();
+ signal->theData[0] = hostptr.i;
+ sendSignal(cownref, GSN_TAKE_OVERTCREQ, signal, 1, JBB);
+
checkScanActiveInFailedLqh(signal, 0, hostptr.i);
checkWaitDropTabFailedLqh(signal, hostptr.i, 0); // nodeid, tableid
nodeFailCheckTransactions(signal, 0, hostptr.i);
@@ -7205,6 +7194,14 @@ Dbtc::checkNodeFailComplete(Signal* sign
sendSignal(cdihblockref, GSN_NF_COMPLETEREP, signal,
NFCompleteRep::SignalLength, JBB);
}
+
+ CRASH_INSERTION(8058);
+ if (ERROR_INSERTED(8059))
+ {
+ signal->theData[0] = 9999;
+ sendSignalWithDelay(numberToRef(CMVMI, hostptr.i),
+ GSN_NDB_TAMPER, signal, 100, 1);
+ }
}
void Dbtc::checkScanActiveInFailedLqh(Signal* signal,
@@ -7273,7 +7270,14 @@ Dbtc::nodeFailCheckTransactions(Signal*
Ptr<ApiConnectRecord> transPtr;
Uint32 TtcTimer = ctcTimer;
Uint32 TapplTimeout = c_appl_timeout_value;
- for (transPtr.i = transPtrI; transPtr.i < capiConnectFilesize; transPtr.i++)
+ Uint32 RT_BREAK = 64;
+ Uint32 endPtrI = transPtrI + RT_BREAK;
+ if (endPtrI > capiConnectFilesize)
+ {
+ endPtrI = capiConnectFilesize;
+ }
+
+ for (transPtr.i = transPtrI; transPtr.i < endPtrI; transPtr.i++)
{
ptrCheckGuard(transPtr, capiConnectFilesize, apiConnectRecord);
if (transPtr.p->m_transaction_nodes.get(failedNodeId))
@@ -7285,18 +7289,25 @@ Dbtc::nodeFailCheckTransactions(Signal*
setApiConTimer(transPtr.i, TtcTimer - 2, __LINE__);
timeOutFoundLab(signal, transPtr.i, ZNODEFAIL_BEFORE_COMMIT);
c_appl_timeout_value = TapplTimeout;
+
+ transPtr.i++;
+ break;
}
-
- // Send CONTINUEB to continue later
+ }
+
+ if (transPtr.i == capiConnectFilesize)
+ {
+ jam();
+ checkNodeFailComplete(signal, failedNodeId,
+ HostRecord::NF_CHECK_TRANSACTION);
+ }
+ else
+ {
signal->theData[0] = TcContinueB::ZNF_CHECK_TRANSACTIONS;
- signal->theData[1] = transPtr.i + 1; // Check next
+ signal->theData[1] = transPtr.i;
signal->theData[2] = failedNodeId;
sendSignal(cownref, GSN_CONTINUEB, signal, 3, JBB);
- return;
}
-
- checkNodeFailComplete(signal, failedNodeId,
- HostRecord::NF_CHECK_TRANSACTION);
}
@@ -7319,7 +7330,23 @@ void Dbtc::execTAKE_OVERTCCONF(Signal* s
if (signal->getSendersBlockRef() != reference())
{
jam();
- return;
+ /**
+ * Node should be in queue
+ */
+ Uint32 i = 0;
+ Uint32 end = tcNodeFailptr.p->queueIndex;
+ for (; i<end; i++)
+ {
+ jam();
+ if (tcNodeFailptr.p->queueList[i] == hostptr.i)
+ {
+ jam();
+ break;
+ }
+ }
+ ndbrequire(i != end);
+ tcNodeFailptr.p->queueList[i] = tcNodeFailptr.p->queueList[end-1];
+ tcNodeFailptr.p->queueIndex = end - 1;
}
checkNodeFailComplete(signal, hostptr.i, HostRecord::NF_TAKEOVER);
@@ -7331,7 +7358,9 @@ void Dbtc::execTAKE_OVERTCREQ(Signal* si
tfailedNodeId = signal->theData[0];
tcNodeFailptr.i = 0;
ptrAss(tcNodeFailptr, tcFailRecord);
- if (tcNodeFailptr.p->failStatus != FS_IDLE) {
+ if (tcNodeFailptr.p->failStatus != FS_IDLE ||
+ cmasterNodeId != getOwnNodeId())
+ {
jam();
/*------------------------------------------------------------*/
/* WE CAN CURRENTLY ONLY HANDLE ONE TAKE OVER AT A TIME */
@@ -7385,6 +7414,8 @@ void Dbtc::execLQH_TRANSCONF(Signal* sig
jamEntry();
LqhTransConf * const lqhTransConf = (LqhTransConf *)&signal->theData[0];
+ CRASH_INSERTION(8060);
+
tcNodeFailptr.i = lqhTransConf->tcRef;
ptrCheckGuard(tcNodeFailptr, 1, tcFailRecord);
tnodeid = lqhTransConf->lqhNodeId;
@@ -7447,6 +7478,8 @@ void Dbtc::nodeTakeOverCompletedLab(Sign
{
Uint32 guard0;
+ CRASH_INSERTION(8061);
+
hostptr.i = tnodeid;
ptrCheckGuard(hostptr, chostFilesize, hostRecord);
hostptr.p->lqhTransStatus = LTS_IDLE;
@@ -7554,6 +7587,8 @@ void Dbtc::completeTransAtTakeOverDoLast
}//if
tcNodeFailptr.p->takeOverProcState[TtakeOverInd] = ZTAKE_OVER_IDLE;
tcNodeFailptr.p->completedTakeOver++;
+
+ CRASH_INSERTION(8062);
if (tcNodeFailptr.p->completedTakeOver == cnoParallelTakeOver) {
jam();
diff -Nrup a/ndb/test/ndbapi/testNodeRestart.cpp b/ndb/test/ndbapi/testNodeRestart.cpp
--- a/ndb/test/ndbapi/testNodeRestart.cpp 2007-11-07 20:57:19 +01:00
+++ b/ndb/test/ndbapi/testNodeRestart.cpp 2008-04-23 16:08:36 +02:00
@@ -23,6 +23,7 @@
#include <signaldata/DumpStateOrd.hpp>
#include <Bitmask.hpp>
#include <RefConvert.hpp>
+#include <NdbEnv.h>
int runLoadTable(NDBT_Context* ctx, NDBT_Step* step){
@@ -1392,6 +1393,480 @@ runBug32160(NDBT_Context* ctx, NDBT_Step
return NDBT_OK;
}
+int
+runMNF(NDBT_Context* ctx, NDBT_Step* step)
+{
+ int result = NDBT_OK;
+ NdbRestarter res;
+
+ if (res.getNumDbNodes() < 2)
+ {
+ return NDBT_OK;
+ }
+
+ Vector<int> part0;
+ Vector<int> part1;
+ Bitmask<255> part0mask;
+ Bitmask<255> part1mask;
+ Bitmask<255> ngmask;
+ for (int i = 0; i<res.getNumDbNodes(); i++)
+ {
+ int nodeId = res.getDbNodeId(i);
+ int ng = res.getNodeGroup(nodeId);
+ if (ngmask.get(ng))
+ {
+ part1.push_back(nodeId);
+ part1mask.set(nodeId);
+ }
+ else
+ {
+ ngmask.set(ng);
+ part0.push_back(nodeId);
+ part0mask.set(nodeId);
+ }
+ }
+
+ printf("part0: ");
+ for (size_t i = 0; i<part0.size(); i++)
+ printf("%u ", part0[i]);
+ printf("\n");
+
+ printf("part1: ");
+ for (size_t i = 0; i<part1.size(); i++)
+ printf("%u ", part1[i]);
+ printf("\n");
+
+ int loops = ctx->getNumLoops();
+ while (loops-- && !ctx->isTestStopped())
+ {
+ int cnt, *nodes;
+ int master = res.getMasterNodeId();
+ int nextMaster = res.getNextMasterNodeId(master);
+
+ bool cmf = false;
+ if (part0mask.get(master) && part0mask.get(nextMaster))
+ {
+ cmf = true;
+ cnt = part0.size();
+ nodes = part0.getBase();
+ printf("restarting part0");
+ }
+ else if(part1mask.get(master) && part1mask.get(nextMaster))
+ {
+ cmf = true;
+ cnt = part1.size();
+ nodes = part1.getBase();
+ printf("restarting part1");
+ }
+ else
+ {
+ cmf = false;
+ if (loops & 1)
+ {
+ cnt = part0.size();
+ nodes = part0.getBase();
+ printf("restarting part0");
+ }
+ else
+ {
+ cnt = part1.size();
+ nodes = part0.getBase();
+ printf("restarting part0");
+ }
+ }
+
+ int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
+ for (int i = 0; i<cnt; i++)
+ if (res.dumpStateOneNode(nodes[i], val2, 2))
+ return NDBT_FAILED;
+
+ int type = loops;
+ char buf[100];
+ if (NdbEnv_GetEnv("MNF", buf, sizeof(buf)))
+ {
+ type = atoi(buf);
+ }
+ if (cmf)
+ {
+ type = type % 7;
+ }
+ else
+ {
+ type = type % 4;
+ }
+ ndbout_c(" type: %u (cmf: %u)", type, cmf);
+ switch(type){
+ case 0:
+ for (int i = 0; i<cnt; i++)
+ {
+ if (res.restartOneDbNode(nodes[i],
+ /** initial */ false,
+ /** nostart */ true,
+ /** abort */ true))
+ return NDBT_FAILED;
+
+ NdbSleep_MilliSleep(10);
+ }
+ break;
+ case 1:
+ for (int i = 0; i<cnt; i++)
+ {
+ if (res.restartOneDbNode(nodes[i],
+ /** initial */ false,
+ /** nostart */ true,
+ /** abort */ true))
+ return NDBT_FAILED;
+
+ }
+ break;
+ case 2:
+ for (int i = 0; i<cnt; i++)
+ {
+ res.insertErrorInNode(nodes[i], 8058);
+ }
+ res.restartOneDbNode(nodes[0],
+ /** initial */ false,
+ /** nostart */ true,
+ /** abort */ true);
+ break;
+ case 3:
+ for (int i = 0; i<cnt; i++)
+ {
+ res.insertErrorInNode(nodes[i], 8059);
+ }
+ res.restartOneDbNode(nodes[0],
+ /** initial */ false,
+ /** nostart */ true,
+ /** abort */ true);
+ break;
+ case 4:
+ {
+ for (int i = 0; i<cnt; i++)
+ {
+ res.insertErrorInNode(nodes[i], 7180);
+ }
+
+ int lcp = 7099;
+ res.insertErrorInNode(master, 7193);
+ res.dumpStateOneNode(master, &lcp, 1);
+ break;
+ }
+ case 5:
+ {
+ for (int i = 0; i<cnt; i++)
+ {
+ res.insertErrorInNode(nodes[i], 7206);
+ }
+
+ int lcp = 7099;
+ res.insertErrorInNode(master, 7193);
+ res.dumpStateOneNode(master, &lcp, 1);
+ break;
+ }
+ case 6:
+ {
+ for (int i = 0; i<cnt; i++)
+ {
+ res.insertErrorInNode(nodes[i], 5008);
+ }
+
+ int lcp = 7099;
+ res.insertErrorInNode(master, 7193);
+ res.dumpStateOneNode(master, &lcp, 1);
+ break;
+ }
+ }
+
+ if (res.waitNodesNoStart(nodes, cnt))
+ return NDBT_FAILED;
+
+ if (res.startNodes(nodes, cnt))
+ return NDBT_FAILED;
+
+ if (res.waitClusterStarted())
+ return NDBT_FAILED;
+ }
+
+ ctx->stopTest();
+ return NDBT_OK;
+}
+
+int
+runBug36199(NDBT_Context* ctx, NDBT_Step* step)
+{
+ int result = NDBT_OK;
+ int loops = ctx->getNumLoops();
+ NdbRestarter res;
+
+ if (res.getNumDbNodes() < 4)
+ return NDBT_OK;
+
+ int master = res.getMasterNodeId();
+ int nextMaster = res.getNextMasterNodeId(master);
+ int victim = res.getRandomNodeSameNodeGroup(nextMaster, rand());
+ if (victim == master)
+ {
+ victim = res.getRandomNodeOtherNodeGroup(nextMaster, rand());
+ }
+
+ ndbout_c("master: %u next master: %u victim: %u",
+ master, nextMaster, victim);
+
+ int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
+ res.dumpStateOneNode(master, val2, 2);
+ res.dumpStateOneNode(victim, val2, 2);
+
+ res.insertErrorInNode(victim, 7205);
+ res.insertErrorInNode(master, 7014);
+ int lcp = 7099;
+ res.dumpStateOneNode(master, &lcp, 1);
+
+ int nodes[2];
+ nodes[0] = master;
+ nodes[1] = victim;
+ if (res.waitNodesNoStart(nodes, 2))
+ {
+ return NDBT_FAILED;
+ }
+
+ if (res.startNodes(nodes, 2))
+ {
+ return NDBT_FAILED;
+ }
+
+ if (res.waitClusterStarted())
+ return NDBT_FAILED;
+
+ return NDBT_OK;
+}
+
+int
+runBug36246(NDBT_Context* ctx, NDBT_Step* step)
+{
+ int result = NDBT_OK;
+ int loops = ctx->getNumLoops();
+ NdbRestarter res;
+ Ndb* pNdb = GETNDB(step);
+
+ if (res.getNumDbNodes() < 4)
+ return NDBT_OK;
+
+ HugoOperations hugoOps(*ctx->getTab());
+restartloop:
+ int tryloop = 0;
+ int master = res.getMasterNodeId();
+ int nextMaster = res.getNextMasterNodeId(master);
+
+loop:
+ if(hugoOps.startTransaction(pNdb) != 0)
+ return NDBT_FAILED;
+
+ if(hugoOps.pkUpdateRecord(pNdb, 1, 1) != 0)
+ return NDBT_FAILED;
+
+ if(hugoOps.execute_NoCommit(pNdb) != 0)
+ return NDBT_FAILED;
+
+ int victim = hugoOps.getTransaction()->getConnectedNodeId();
+ printf("master: %u nextMaster: %u victim: %u",
+ master, nextMaster, victim);
+ if (victim == master || victim == nextMaster ||
+ res.getNodeGroup(victim) == res.getNodeGroup(master) ||
+ res.getNodeGroup(victim) == res.getNodeGroup(nextMaster))
+ {
+ hugoOps.execute_Rollback(pNdb);
+ hugoOps.closeTransaction(pNdb);
+ tryloop++;
+ if (tryloop == 10)
+ {
+ ndbout_c(" -> restarting next master: %u", nextMaster);
+ res.restartOneDbNode(nextMaster,
+ /** initial */ false,
+ /** nostart */ true,
+ /** abort */ true);
+
+ res.waitNodesNoStart(&nextMaster, 1);
+ res.startNodes(&nextMaster, 1);
+ if (res.waitClusterStarted())
+ return NDBT_FAILED;
+ goto restartloop;
+ }
+ else
+ {
+ ndbout_c(" -> loop");
+ goto loop;
+ }
+ }
+ ndbout_c(" -> go go gadget skates");
+
+ int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
+ res.dumpStateOneNode(master, val2, 2);
+ res.dumpStateOneNode(victim, val2, 2);
+
+ res.insertErrorInNode(master, 8060);
+ res.insertErrorInNode(victim, 9999);
+
+ int nodes[2];
+ nodes[0] = master;
+ nodes[1] = victim;
+ if (res.waitNodesNoStart(nodes, 2))
+ {
+ return NDBT_FAILED;
+ }
+
+ if (res.startNodes(nodes, 2))
+ {
+ return NDBT_FAILED;
+ }
+
+ if (res.waitClusterStarted())
+ return NDBT_FAILED;
+
+ hugoOps.execute_Rollback(pNdb);
+ hugoOps.closeTransaction(pNdb);
+
+ return NDBT_OK;
+}
+
+int
+runBug36247(NDBT_Context* ctx, NDBT_Step* step)
+{
+ int result = NDBT_OK;
+ int loops = ctx->getNumLoops();
+ NdbRestarter res;
+ Ndb* pNdb = GETNDB(step);
+
+ if (res.getNumDbNodes() < 4)
+ return NDBT_OK;
+
+ HugoOperations hugoOps(*ctx->getTab());
+
+restartloop:
+ int tryloop = 0;
+ int master = res.getMasterNodeId();
+ int nextMaster = res.getNextMasterNodeId(master);
+
+loop:
+ if(hugoOps.startTransaction(pNdb) != 0)
+ return NDBT_FAILED;
+
+ if(hugoOps.pkUpdateRecord(pNdb, 1, 100) != 0)
+ return NDBT_FAILED;
+
+ if(hugoOps.execute_NoCommit(pNdb) != 0)
+ return NDBT_FAILED;
+
+ int victim = hugoOps.getTransaction()->getConnectedNodeId();
+ printf("master: %u nextMaster: %u victim: %u",
+ master, nextMaster, victim);
+ if (victim == master || victim == nextMaster ||
+ res.getNodeGroup(victim) == res.getNodeGroup(master) ||
+ res.getNodeGroup(victim) == res.getNodeGroup(nextMaster))
+ {
+ hugoOps.execute_Rollback(pNdb);
+ hugoOps.closeTransaction(pNdb);
+ tryloop++;
+ if (tryloop == 10)
+ {
+ ndbout_c(" -> restarting next master: %u", nextMaster);
+ res.restartOneDbNode(nextMaster,
+ /** initial */ false,
+ /** nostart */ true,
+ /** abort */ true);
+
+ res.waitNodesNoStart(&nextMaster, 1);
+ res.startNodes(&nextMaster, 1);
+ if (res.waitClusterStarted())
+ return NDBT_FAILED;
+ goto restartloop;
+ }
+ else
+ {
+ ndbout_c(" -> loop");
+ goto loop;
+ }
+ }
+ ndbout_c(" -> go go gadget skates");
+
+ int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
+ res.dumpStateOneNode(master, val2, 2);
+ res.dumpStateOneNode(victim, val2, 2);
+
+ for (int i = 0; i<res.getNumDbNodes(); i++)
+ {
+ int nodeId = res.getDbNodeId(i);
+ res.insertErrorInNode(nodeId, 5050);
+ }
+
+ res.insertErrorInNode(victim, 9999);
+
+ int nodes[2];
+ nodes[0] = master;
+ nodes[1] = victim;
+ if (res.waitNodesNoStart(nodes, 2))
+ {
+ return NDBT_FAILED;
+ }
+
+ if (res.startNodes(nodes, 2))
+ {
+ return NDBT_FAILED;
+ }
+
+ if (res.waitClusterStarted())
+ return NDBT_FAILED;
+
+ hugoOps.execute_Rollback(pNdb);
+ hugoOps.closeTransaction(pNdb);
+
+ return NDBT_OK;
+}
+
+int
+runBug36276(NDBT_Context* ctx, NDBT_Step* step)
+{
+ int result = NDBT_OK;
+ int loops = ctx->getNumLoops();
+ NdbRestarter res;
+ Ndb* pNdb = GETNDB(step);
+
+ if (res.getNumDbNodes() < 4)
+ return NDBT_OK;
+
+ int master = res.getMasterNodeId();
+ int nextMaster = res.getNextMasterNodeId(master);
+ int victim = res.getRandomNodeSameNodeGroup(nextMaster, rand());
+ if (victim == master)
+ {
+ victim = res.getRandomNodeOtherNodeGroup(nextMaster, rand());
+ }
+
+ ndbout_c("master: %u nextMaster: %u victim: %u",
+ master, nextMaster, victim);
+
+ int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
+ res.dumpStateOneNode(master, val2, 2);
+ res.insertErrorInNode(victim, 7209);
+
+ int lcp = 7099;
+ res.dumpStateOneNode(master, &lcp, 1);
+
+ if (res.waitNodesNoStart(&master, 1))
+ {
+ return NDBT_FAILED;
+ }
+
+ if (res.startNodes(&master, 1))
+ {
+ return NDBT_FAILED;
+ }
+
+ if (res.waitClusterStarted())
+ return NDBT_FAILED;
+
+ return NDBT_OK;
+}
+
NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad",
"Test that one node at a time can be stopped and then restarted "\
@@ -1733,6 +2208,29 @@ TESTCASE("Bug29364", ""){
}
TESTCASE("Bug32160", ""){
INITIALIZER(runBug32160);
+}
+TESTCASE("MNF", ""){
+ INITIALIZER(runLoadTable);
+ STEP(runMNF);
+ STEP(runScanUpdateUntilStopped);
+}
+TESTCASE("Bug36199", ""){
+ INITIALIZER(runBug36199);
+}
+TESTCASE("Bug36246", ""){
+ INITIALIZER(runLoadTable);
+ STEP(runBug36246);
+ VERIFIER(runClearTable);
+}
+TESTCASE("Bug36247", ""){
+ INITIALIZER(runLoadTable);
+ STEP(runBug36247);
+ VERIFIER(runClearTable);
+}
+TESTCASE("Bug36276", ""){
+ INITIALIZER(runLoadTable);
+ STEP(runBug36276);
+ VERIFIER(runClearTable);
}
NDBT_TESTSUITE_END(testNodeRestart);
diff -Nrup a/ndb/test/run-test/daily-basic-tests.txt b/ndb/test/run-test/daily-basic-tests.txt
--- a/ndb/test/run-test/daily-basic-tests.txt 2008-01-31 23:14:21 +01:00
+++ b/ndb/test/run-test/daily-basic-tests.txt 2008-04-23 16:08:36 +02:00
@@ -791,3 +791,25 @@ max-time: 180
cmd: testIndex
args: -n Bug28804_ATTRINFO T1 T3
+# 2008-04-22
+max-time: 1500
+cmd: testNodeRestart
+args: -n MNF T1
+
+max-time: 300
+cmd: testNodeRestart
+args: -n Bug36199 T1
+
+max-time: 300
+cmd: testNodeRestart
+args: -n Bug36246 T1
+
+max-time: 300
+cmd: testNodeRestart
+args: -n Bug36247 T1
+
+max-time: 300
+cmd: testNodeRestart
+args: -n Bug36276 T1
+
+# EOF
| Thread | Author | Date |
|---|---|---|
| • bk commit into 5.0 tree (jonas:1.2600) BUG#36199 | jonas | 23 Apr 2008 |