Below is the list of changes that have just been committed into a local
5.1 repository of jonas. When jonas does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html
ChangeSet@stripped, 2008-04-24 11:29:32+02:00, jonas@stripped +7 -0
ndb - (drop6)
fix for bug#36199, bug#36246, bug#36247, bug#36276
all related to cascading master failure
storage/ndb/src/kernel/blocks/ERROR_codes.txt@stripped, 2008-04-24 11:29:30+02:00,
jonas@stripped +3 -3
ndb -
fix for bug#36199, bug#36246, bug#36247, bug#36276
all related to cascading master failure
storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp@stripped, 2008-04-24 11:29:30+02:00,
jonas@stripped +3 -0
ndb -
fix for bug#36199, bug#36246, bug#36247, bug#36276
all related to cascading master failure
storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp@stripped, 2008-04-24 11:29:30+02:00,
jonas@stripped +163 -49
ndb -
fix for bug#36199, bug#36246, bug#36247, bug#36276
all related to cascading master failure
storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp@stripped, 2008-04-24 11:29:31+02:00,
jonas@stripped +30 -0
ndb -
fix for bug#36199, bug#36246, bug#36247, bug#36276
all related to cascading master failure
storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp@stripped, 2008-04-24 11:29:31+02:00,
jonas@stripped +60 -25
ndb -
fix for bug#36199, bug#36246, bug#36247, bug#36276
all related to cascading master failure
storage/ndb/test/ndbapi/testNodeRestart.cpp@stripped, 2008-04-24 11:29:31+02:00,
jonas@stripped +498 -0
ndb -
fix for bug#36199, bug#36246, bug#36247, bug#36276
all related to cascading master failure
storage/ndb/test/run-test/daily-basic-tests.txt@stripped, 2008-04-24 11:29:31+02:00,
jonas@stripped +22 -0
ndb -
fix for bug#36199, bug#36246, bug#36247, bug#36276
all related to cascading master failure
# This is a BitKeeper patch. What follows are the unified diffs for the
# set of deltas contained in the patch. The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User: jonas
# Host: perch.ndb.mysql.com
# Root: /home/jonas/src/drop6
--- 1.64/storage/ndb/test/run-test/daily-basic-tests.txt 2008-04-24 11:29:34 +02:00
+++ 1.65/storage/ndb/test/run-test/daily-basic-tests.txt 2008-04-24 11:29:34 +02:00
@@ -819,3 +819,25 @@
cmd: test_event
args: -n Bug34853 T1
+# 2008-04-22
+max-time: 1500
+cmd: testNodeRestart
+args: -n MNF T1
+
+max-time: 300
+cmd: testNodeRestart
+args: -n Bug36199 T1
+
+max-time: 300
+cmd: testNodeRestart
+args: -n Bug36246 T1
+
+max-time: 300
+cmd: testNodeRestart
+args: -n Bug36247 T1
+
+max-time: 300
+cmd: testNodeRestart
+args: -n Bug36276 T1
+
+# EOF
--- 1.31/storage/ndb/src/kernel/blocks/ERROR_codes.txt 2008-04-24 11:29:34 +02:00
+++ 1.32/storage/ndb/src/kernel/blocks/ERROR_codes.txt 2008-04-24 11:29:34 +02:00
@@ -3,10 +3,10 @@
Next NDBFS 2000
Next DBACC 3002
Next DBTUP 4013
-Next DBLQH 5047
+Next DBLQH 5051
Next DBDICT 6007
-Next DBDIH 7195
-Next DBTC 8057
+Next DBDIH 7211
+Next DBTC 8063
Next CMVMI 9000
Next BACKUP 10022
Next DBUTIL 11002
--- 1.22/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp 2008-04-24 11:29:34 +02:00
+++ 1.23/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp 2008-04-24 11:29:34 +02:00
@@ -1347,6 +1347,9 @@
Uint32 m_masterLcpDihRef;
bool m_MASTER_LCPREQ_Received;
Uint32 m_MASTER_LCPREQ_FailedNodeId;
+
+ Uint32 m_lastLCP_COMPLETE_REP_id;
+ Uint32 m_lastLCP_COMPLETE_REP_ref;
} c_lcpState;
/*------------------------------------------------------------------------*/
--- 1.66/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2008-04-24 11:29:34 +02:00
+++ 1.67/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2008-04-24 11:29:34 +02:00
@@ -4601,37 +4601,47 @@
c_lcpState.m_participatingDIH.clear(failedNodePtr.i);
c_lcpState.m_participatingLQH.clear(failedNodePtr.i);
- if(c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.isWaitingFor(failedNodePtr.i)){
+ bool wf = c_MASTER_LCPREQ_Counter.isWaitingFor(failedNodePtr.i);
+
+ if(c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.isWaitingFor(failedNodePtr.i))
+ {
jam();
LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend();
rep->nodeId = failedNodePtr.i;
rep->lcpId = SYSFILE->latestLCP_ID;
rep->blockNo = DBDIH;
sendSignal(reference(), GSN_LCP_COMPLETE_REP, signal,
- LcpCompleteRep::SignalLength, JBB);
+ LcpCompleteRep::SignalLength, JBB);
}
-
- /**
- * Check if we'r waiting for the failed node's LQH to complete
- *
- * Note that this is ran "before" LCP master take over
- */
- if(c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(nodeId)){
+
+ bool lcp_complete_rep = false;
+ if (!wf)
+ {
jam();
-
- LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend();
- rep->nodeId = nodeId;
- rep->lcpId = SYSFILE->latestLCP_ID;
- rep->blockNo = DBLQH;
- sendSignal(reference(), GSN_LCP_COMPLETE_REP, signal,
- LcpCompleteRep::SignalLength, JBB);
-
- if(c_lcpState.m_LAST_LCP_FRAG_ORD.isWaitingFor(nodeId)){
+
+ /**
+ * Check if we'r waiting for the failed node's LQH to complete
+ *
+ * Note that this is ran "before" LCP master take over
+ */
+ if(c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(nodeId)){
jam();
- /**
- * Make sure we're ready to accept it
- */
- c_lcpState.m_LAST_LCP_FRAG_ORD.clearWaitingFor(nodeId);
+
+ lcp_complete_rep = true;
+ LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend();
+ rep->nodeId = nodeId;
+ rep->lcpId = SYSFILE->latestLCP_ID;
+ rep->blockNo = DBLQH;
+ sendSignal(reference(), GSN_LCP_COMPLETE_REP, signal,
+ LcpCompleteRep::SignalLength, JBB);
+
+ if(c_lcpState.m_LAST_LCP_FRAG_ORD.isWaitingFor(nodeId)){
+ jam();
+ /**
+ * Make sure we're ready to accept it
+ */
+ c_lcpState.m_LAST_LCP_FRAG_ORD.clearWaitingFor(nodeId);
+ }
}
}
@@ -4657,7 +4667,9 @@
StartLcpConf::SignalLength, JBB);
}//if
- if (c_EMPTY_LCP_REQ_Counter.isWaitingFor(failedNodePtr.i)) {
+dosend:
+ if (c_EMPTY_LCP_REQ_Counter.isWaitingFor(failedNodePtr.i))
+ {
jam();
EmptyLcpConf * const rep = (EmptyLcpConf *)&signal->theData[0];
rep->senderNodeId = failedNodePtr.i;
@@ -4668,8 +4680,14 @@
rep->idle = true;
sendSignal(reference(), GSN_EMPTY_LCP_CONF, signal,
EmptyLcpConf::SignalLength, JBB);
- }//if
-
+ }
+ else if (!c_EMPTY_LCP_REQ_Counter.done() && lcp_complete_rep)
+ {
+ jam();
+ c_EMPTY_LCP_REQ_Counter.setWaitingFor(failedNodePtr.i);
+ goto dosend;
+ }
+
if (c_MASTER_LCPREQ_Counter.isWaitingFor(failedNodePtr.i)) {
jam();
MasterLCPRef * const ref = (MasterLCPRef *)&signal->theData[0];
@@ -4737,19 +4755,37 @@
c_lcpMasterTakeOverState.set(LMTOS_WAIT_EMPTY_LCP, __LINE__);
- if(c_EMPTY_LCP_REQ_Counter.done()){
- jam();
- c_lcpState.m_LAST_LCP_FRAG_ORD.clearWaitingFor();
+
+ EmptyLcpReq* req = (EmptyLcpReq*)signal->getDataPtrSend();
+ req->senderRef = reference();
- EmptyLcpReq* req = (EmptyLcpReq*)signal->getDataPtrSend();
- req->senderRef = reference();
- sendLoopMacro(EMPTY_LCP_REQ, sendEMPTY_LCP_REQ);
- ndbrequire(!c_EMPTY_LCP_REQ_Counter.done());
- } else {
- /**
- * Node failure during master take over...
- */
- ndbout_c("Nodefail during master take over (old: %d)", oldNode);
+ {
+ NodeRecordPtr specNodePtr;
+ specNodePtr.i = cfirstAliveNode;
+ do {
+ jam();
+ ptrCheckGuard(specNodePtr, MAX_NDB_NODES, nodeRecord);
+ if (!c_EMPTY_LCP_REQ_Counter.isWaitingFor(specNodePtr.i))
+ {
+ jam();
+ c_EMPTY_LCP_REQ_Counter.setWaitingFor(specNodePtr.i);
+ if (!(ERROR_INSERTED(7209) && specNodePtr.i == getOwnNodeId()))
+ {
+ sendEMPTY_LCP_REQ(signal, specNodePtr.i);
+ }
+ else
+ {
+ ndbout_c("NOT sending EMPTY_LCP_REQ to %u", specNodePtr.i);
+ }
+
+ if (c_lcpState.m_LAST_LCP_FRAG_ORD.isWaitingFor(specNodePtr.i))
+ {
+ jam();
+ c_lcpState.m_LAST_LCP_FRAG_ORD.clearWaitingFor();
+ }
+ }
+ specNodePtr.i = specNodePtr.p->nextNode;
+ } while (specNodePtr.i != RNIL);
}
NodeRecordPtr nodePtr;
@@ -5738,6 +5774,9 @@
const EmptyLcpConf * const conf = (EmptyLcpConf *)&signal->theData[0];
Uint32 nodeId = conf->senderNodeId;
+ CRASH_INSERTION(7206);
+
+
if(!conf->idle){
jam();
if (conf->tableId < c_lcpMasterTakeOverState.minTableId) {
@@ -5815,6 +5854,25 @@
jamEntry();
const BlockReference newMasterBlockref = req->masterRef;
+ CRASH_INSERTION(7205);
+
+ if (ERROR_INSERTED(7207))
+ {
+ jam();
+ SET_ERROR_INSERT_VALUE(7208);
+ sendSignalWithDelay(reference(), GSN_MASTER_LCPREQ, signal,
+ 500, signal->getLength());
+ return;
+ }
+
+ if (ERROR_INSERTED(7208))
+ {
+ jam();
+ signal->theData[0] = 9999;
+ sendSignal(numberToRef(CMVMI, refToNode(newMasterBlockref)),
+ GSN_NDB_TAMPER, signal, 1, JBB);
+ }
+
if (newMasterBlockref != cmasterdihref)
{
jam();
@@ -5837,6 +5895,11 @@
jam();
ndbrequire(0);
}
+
+ if (ERROR_INSERTED(7209))
+ {
+ SET_ERROR_INSERT_VALUE(7210);
+ }
sendMASTER_LCPCONF(signal);
}//Dbdih::execMASTER_LCPREQ()
@@ -6175,12 +6238,22 @@
{
const MasterLCPRef * const ref = (MasterLCPRef *)&signal->theData[0];
jamEntry();
- receiveLoopMacro(MASTER_LCPREQ, ref->senderNodeId);
+
+ Uint32 senderNodeId = ref->senderNodeId;
+ Uint32 failedNodeId = ref->failedNodeId;
+
+ if (c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(senderNodeId))
+ {
+ jam();
+ c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.clearWaitingFor(senderNodeId);
+ }
+
+ receiveLoopMacro(MASTER_LCPREQ, senderNodeId);
/*-------------------------------------------------------------------------*/
// We have now received all responses and are ready to take over the LCP
// protocol as master.
/*-------------------------------------------------------------------------*/
- MASTER_LCPhandling(signal, ref->failedNodeId);
+ MASTER_LCPhandling(signal, failedNodeId);
}//Dbdih::execMASTER_LCPREF()
void Dbdih::MASTER_LCPhandling(Signal* signal, Uint32 failedNodeId)
@@ -10293,7 +10366,15 @@
signal->theData[1] = tabPtr.i;
sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
- checkLcpAllTablesDoneInLqh();
+ bool ret = checkLcpAllTablesDoneInLqh();
+ if (ret && ERROR_INSERTED(7209))
+ {
+ jam();
+
+ signal->theData[0] = 9999;
+ sendSignal(numberToRef(CMVMI, cmasterNodeId),
+ GSN_NDB_TAMPER, signal, 1, JBB);
+ }
}
}
@@ -10622,12 +10703,30 @@
CRASH_INSERTION2(7027, isMaster());
CRASH_INSERTION2(7018, !isMaster());
- if(c_lcpState.lcpStatus == LCP_TAB_COMPLETED){
+ if(c_lcpState.lcpStatus == LCP_TAB_COMPLETED)
+ {
/**
* We'r done
*/
+
+ if (ERROR_INSERTED(7209))
+ {
+ signal->theData[0] = DihContinueB::ZCHECK_LCP_COMPLETED;
+ sendSignal(reference(), GSN_CONTINUEB, signal, 1, JBB);
+ return;
+ }
+
c_lcpState.setLcpStatus(LCP_TAB_SAVED, __LINE__);
sendLCP_COMPLETE_REP(signal);
+
+ if (ERROR_INSERTED(7210))
+ {
+ CLEAR_ERROR_INSERT_VALUE;
+ EmptyLcpReq* req = (EmptyLcpReq*)signal->getDataPtr();
+ req->senderRef = reference();
+ sendEMPTY_LCP_REQ(signal, getOwnNodeId());
+ }
+
return;
}
@@ -10639,13 +10738,28 @@
void
Dbdih::sendLCP_COMPLETE_REP(Signal* signal){
jam();
- LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend();
- rep->nodeId = getOwnNodeId();
- rep->lcpId = SYSFILE->latestLCP_ID;
- rep->blockNo = DBDIH;
-
- sendSignal(c_lcpState.m_masterLcpDihRef, GSN_LCP_COMPLETE_REP, signal,
- LcpCompleteRep::SignalLength, JBB);
+
+ /**
+ * Quick and dirty fix for bug#36276: don't send
+ * LCP_COMPLETE_REP to the same node for the same LCP twice
+ */
+ bool alreadysent =
+ c_lcpState.m_lastLCP_COMPLETE_REP_id == SYSFILE->latestLCP_ID &&
+ c_lcpState.m_lastLCP_COMPLETE_REP_ref == c_lcpState.m_masterLcpDihRef;
+
+ if (!alreadysent)
+ {
+ LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend();
+ rep->nodeId = getOwnNodeId();
+ rep->lcpId = SYSFILE->latestLCP_ID;
+ rep->blockNo = DBDIH;
+
+ sendSignal(c_lcpState.m_masterLcpDihRef, GSN_LCP_COMPLETE_REP, signal,
+ LcpCompleteRep::SignalLength, JBB);
+
+ c_lcpState.m_lastLCP_COMPLETE_REP_id = SYSFILE->latestLCP_ID;
+ c_lcpState.m_lastLCP_COMPLETE_REP_ref = c_lcpState.m_masterLcpDihRef;
+ }
/**
* Say that an initial node restart does not need to be redone
@@ -11818,7 +11932,7 @@
c_lcpState.ctimer = 0;
c_lcpState.immediateLcpStart = false;
c_lcpState.m_MASTER_LCPREQ_Received = false;
-
+ c_lcpState.m_lastLCP_COMPLETE_REP_ref = 0;
cmasterdihref = 0;
cmasterNodeId = 0;
cmasterState = MASTER_IDLE;
--- 1.99/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp 2008-04-24 11:29:34 +02:00
+++ 1.100/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp 2008-04-24 11:29:34 +02:00
@@ -6767,6 +6767,22 @@
*
* now scan markers
*/
+ if (ERROR_INSERTED(5050))
+ {
+ ndbout_c("send ZSCAN_MARKERS with 5s delay and killing master");
+ CLEAR_ERROR_INSERT_VALUE;
+ signal->theData[0] = ZSCAN_MARKERS;
+ signal->theData[1] = tcNodeFailptr.i;
+ signal->theData[2] = 0;
+ signal->theData[3] = RNIL;
+ sendSignalWithDelay(cownref, GSN_CONTINUEB, signal, 5000, 4);
+
+ signal->theData[0] = 9999;
+ sendSignal(numberToRef(CMVMI,
+ refToNode(tcNodeFailptr.p->newTcBlockref)),
+ GSN_NDB_TAMPER, signal, 1, JBB);
+ return;
+ }
scanMarkers(signal, tcNodeFailptr.i, 0, RNIL);
return;
}//if
@@ -6851,6 +6867,20 @@
tcNodeFailPtr.i = tcNodeFail;
ptrCheckGuard(tcNodeFailPtr, ctcNodeFailrecFileSize, tcNodeFailRecord);
const Uint32 crashedTcNodeId = tcNodeFailPtr.p->oldNodeId;
+
+ if (tcNodeFailPtr.p->tcFailStatus == TcNodeFailRecord::TC_STATE_BREAK)
+ {
+ jam();
+
+ /* ----------------------------------------------------------------------
+ * AN INTERRUPTION TO THIS NODE FAIL HANDLING WAS RECEIVED AND A NEW
+ * TC HAVE BEEN ASSIGNED TO TAKE OVER THE FAILED TC. PROBABLY THE OLD
+ * NEW TC HAVE FAILED.
+ * ---------------------------------------------------------------------- */
+ tcNodeFailptr = tcNodeFailPtr;
+ lqhTransNextLab(signal);
+ return;
+ }
CommitAckMarkerIterator iter;
if(i == RNIL){
--- 1.111/storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp 2008-04-24 11:29:34 +02:00
+++ 1.112/storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp 2008-04-24 11:29:34 +02:00
@@ -7072,21 +7072,10 @@
}//if
}//if
- if (getOwnNodeId() != tnewMasterId)
- {
- jam();
- /**
- * Only master does takeover currently
- */
- hostptr.p->m_nf_bits &= ~HostRecord::NF_TAKEOVER;
- }
- else
- {
- jam();
- signal->theData[0] = hostptr.i;
- sendSignal(cownref, GSN_TAKE_OVERTCREQ, signal, 1, JBB);
- }
-
+ jam();
+ signal->theData[0] = hostptr.i;
+ sendSignal(cownref, GSN_TAKE_OVERTCREQ, signal, 1, JBB);
+
checkScanActiveInFailedLqh(signal, 0, hostptr.i);
checkWaitDropTabFailedLqh(signal, hostptr.i, 0); // nodeid, tableid
nodeFailCheckTransactions(signal, 0, hostptr.i);
@@ -7123,6 +7112,14 @@
sendSignal(cdihblockref, GSN_NF_COMPLETEREP, signal,
NFCompleteRep::SignalLength, JBB);
}
+
+ CRASH_INSERTION(8058);
+ if (ERROR_INSERTED(8059))
+ {
+ signal->theData[0] = 9999;
+ sendSignalWithDelay(numberToRef(CMVMI, hostptr.i),
+ GSN_NDB_TAMPER, signal, 100, 1);
+ }
}
void Dbtc::checkScanActiveInFailedLqh(Signal* signal,
@@ -7191,7 +7188,14 @@
Ptr<ApiConnectRecord> transPtr;
Uint32 TtcTimer = ctcTimer;
Uint32 TapplTimeout = c_appl_timeout_value;
- for (transPtr.i = transPtrI; transPtr.i < capiConnectFilesize; transPtr.i++)
+ Uint32 RT_BREAK = 64;
+ Uint32 endPtrI = transPtrI + RT_BREAK;
+ if (endPtrI > capiConnectFilesize)
+ {
+ endPtrI = capiConnectFilesize;
+ }
+
+ for (transPtr.i = transPtrI; transPtr.i < endPtrI; transPtr.i++)
{
ptrCheckGuard(transPtr, capiConnectFilesize, apiConnectRecord);
if (transPtr.p->m_transaction_nodes.get(failedNodeId))
@@ -7203,18 +7207,25 @@
setApiConTimer(transPtr.i, TtcTimer - 2, __LINE__);
timeOutFoundLab(signal, transPtr.i, ZNODEFAIL_BEFORE_COMMIT);
c_appl_timeout_value = TapplTimeout;
+
+ transPtr.i++;
+ break;
}
-
- // Send CONTINUEB to continue later
+ }
+
+ if (transPtr.i == capiConnectFilesize)
+ {
+ jam();
+ checkNodeFailComplete(signal, failedNodeId,
+ HostRecord::NF_CHECK_TRANSACTION);
+ }
+ else
+ {
signal->theData[0] = TcContinueB::ZNF_CHECK_TRANSACTIONS;
- signal->theData[1] = transPtr.i + 1; // Check next
+ signal->theData[1] = transPtr.i;
signal->theData[2] = failedNodeId;
sendSignal(cownref, GSN_CONTINUEB, signal, 3, JBB);
- return;
}
-
- checkNodeFailComplete(signal, failedNodeId,
- HostRecord::NF_CHECK_TRANSACTION);
}
@@ -7237,7 +7248,23 @@
if (signal->getSendersBlockRef() != reference())
{
jam();
- return;
+ /**
+ * Node should be in queue
+ */
+ Uint32 i = 0;
+ Uint32 end = tcNodeFailptr.p->queueIndex;
+ for (; i<end; i++)
+ {
+ jam();
+ if (tcNodeFailptr.p->queueList[i] == hostptr.i)
+ {
+ jam();
+ break;
+ }
+ }
+ ndbrequire(i != end);
+ tcNodeFailptr.p->queueList[i] = tcNodeFailptr.p->queueList[end-1];
+ tcNodeFailptr.p->queueIndex = end - 1;
}
checkNodeFailComplete(signal, hostptr.i, HostRecord::NF_TAKEOVER);
@@ -7249,7 +7276,9 @@
tfailedNodeId = signal->theData[0];
tcNodeFailptr.i = 0;
ptrAss(tcNodeFailptr, tcFailRecord);
- if (tcNodeFailptr.p->failStatus != FS_IDLE) {
+ if (tcNodeFailptr.p->failStatus != FS_IDLE ||
+ cmasterNodeId != getOwnNodeId())
+ {
jam();
/*------------------------------------------------------------*/
/* WE CAN CURRENTLY ONLY HANDLE ONE TAKE OVER AT A TIME */
@@ -7303,6 +7332,8 @@
jamEntry();
LqhTransConf * const lqhTransConf = (LqhTransConf *)&signal->theData[0];
+ CRASH_INSERTION(8060);
+
tcNodeFailptr.i = lqhTransConf->tcRef;
ptrCheckGuard(tcNodeFailptr, 1, tcFailRecord);
tnodeid = lqhTransConf->lqhNodeId;
@@ -7365,6 +7396,8 @@
{
Uint32 guard0;
+ CRASH_INSERTION(8061);
+
hostptr.i = tnodeid;
ptrCheckGuard(hostptr, chostFilesize, hostRecord);
hostptr.p->lqhTransStatus = LTS_IDLE;
@@ -7472,6 +7505,8 @@
}//if
tcNodeFailptr.p->takeOverProcState[TtakeOverInd] = ZTAKE_OVER_IDLE;
tcNodeFailptr.p->completedTakeOver++;
+
+ CRASH_INSERTION(8062);
if (tcNodeFailptr.p->completedTakeOver == cnoParallelTakeOver) {
jam();
--- 1.44/storage/ndb/test/ndbapi/testNodeRestart.cpp 2008-04-24 11:29:34 +02:00
+++ 1.45/storage/ndb/test/ndbapi/testNodeRestart.cpp 2008-04-24 11:29:34 +02:00
@@ -24,6 +24,7 @@
#include <signaldata/DumpStateOrd.hpp>
#include <Bitmask.hpp>
#include <RefConvert.hpp>
+#include <NdbEnv.h>
int runLoadTable(NDBT_Context* ctx, NDBT_Step* step){
@@ -1767,6 +1768,480 @@
return NDBT_OK;
}
+/**
+ * Multiple-node-failure test step.
+ *
+ * The data nodes are split into two halves: the first node seen in each
+ * node group goes into part0, every further node of that group goes into
+ * part1.  Each loop iteration picks one half, takes all of its nodes down
+ * using one of several failure scenarios, waits until they are in the
+ * "no start" state, starts them again and waits for the cluster.
+ *
+ * If the current master and the next master are in the same half, that
+ * half is chosen (cmf == true, i.e. a cascading master failure) and the
+ * scenarios 0..6 are available; otherwise the halves alternate on the
+ * parity of the remaining loop count and only scenarios 0..3 are used.
+ * The scenario number defaults to the remaining loop count and can be
+ * overridden through the MNF environment variable.
+ *
+ * @return NDBT_OK on success (or when fewer than 2 data nodes exist),
+ *         NDBT_FAILED if a checked restarter call reports failure.
+ *         The test is stopped (ctx->stopTest()) before a normal return.
+ */
+int
+runMNF(NDBT_Context* ctx, NDBT_Step* step)
+{
+  NdbRestarter res;
+
+  if (res.getNumDbNodes() < 2)
+  {
+    return NDBT_OK;
+  }
+
+  Vector<int> part0;
+  Vector<int> part1;
+  Bitmask<255> part0mask;
+  Bitmask<255> part1mask;
+  Bitmask<255> ngmask;
+  for (int i = 0; i<res.getNumDbNodes(); i++)
+  {
+    int nodeId = res.getDbNodeId(i);
+    int ng = res.getNodeGroup(nodeId);
+    if (ngmask.get(ng))
+    {
+      // Node group already represented in part0
+      part1.push_back(nodeId);
+      part1mask.set(nodeId);
+    }
+    else
+    {
+      ngmask.set(ng);
+      part0.push_back(nodeId);
+      part0mask.set(nodeId);
+    }
+  }
+
+  printf("part0: ");
+  for (size_t i = 0; i<part0.size(); i++)
+    printf("%u ", part0[i]);
+  printf("\n");
+
+  printf("part1: ");
+  for (size_t i = 0; i<part1.size(); i++)
+    printf("%u ", part1[i]);
+  printf("\n");
+
+  int loops = ctx->getNumLoops();
+  while (loops-- && !ctx->isTestStopped())
+  {
+    int cnt = 0;
+    int *nodes = 0;
+    int master = res.getMasterNodeId();
+    int nextMaster = res.getNextMasterNodeId(master);
+
+    bool cmf = false;
+    if (part0mask.get(master) && part0mask.get(nextMaster))
+    {
+      cmf = true;
+      cnt = part0.size();
+      nodes = part0.getBase();
+      printf("restarting part0");
+    }
+    else if(part1mask.get(master) && part1mask.get(nextMaster))
+    {
+      cmf = true;
+      cnt = part1.size();
+      nodes = part1.getBase();
+      printf("restarting part1");
+    }
+    else
+    {
+      cmf = false;
+      /**
+       * Alternate between the halves.  part1 is only used when it is
+       * non-empty, since scenarios 2 and 3 dereference nodes[0].
+       */
+      if ((loops & 1) || part1.size() == 0)
+      {
+        cnt = part0.size();
+        nodes = part0.getBase();
+        printf("restarting part0");
+      }
+      else
+      {
+        // Was: part1.size() combined with part0.getBase()
+        cnt = part1.size();
+        nodes = part1.getBase();
+        printf("restarting part1");
+      }
+    }
+
+    int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
+    for (int i = 0; i<cnt; i++)
+      if (res.dumpStateOneNode(nodes[i], val2, 2))
+        return NDBT_FAILED;
+
+    int type = loops;
+    char buf[100];
+    if (NdbEnv_GetEnv("MNF", buf, sizeof(buf)))
+    {
+      type = atoi(buf);
+    }
+    if (cmf)
+    {
+      type = type % 7;
+    }
+    else
+    {
+      type = type % 4;
+    }
+    ndbout_c(" type: %u (cmf: %u)", type, cmf);
+    switch(type){
+    case 0:
+      // Abort-restart every node, 10 ms apart
+      for (int i = 0; i<cnt; i++)
+      {
+        if (res.restartOneDbNode(nodes[i],
+                                 /** initial */ false,
+                                 /** nostart */ true,
+                                 /** abort   */ true))
+          return NDBT_FAILED;
+
+        NdbSleep_MilliSleep(10);
+      }
+      break;
+    case 1:
+      // Abort-restart every node back to back
+      for (int i = 0; i<cnt; i++)
+      {
+        if (res.restartOneDbNode(nodes[i],
+                                 /** initial */ false,
+                                 /** nostart */ true,
+                                 /** abort   */ true))
+          return NDBT_FAILED;
+
+      }
+      break;
+    case 2:
+      // Arm error 8058 everywhere, then take down the first node
+      for (int i = 0; i<cnt; i++)
+      {
+        res.insertErrorInNode(nodes[i], 8058);
+      }
+      res.restartOneDbNode(nodes[0],
+                           /** initial */ false,
+                           /** nostart */ true,
+                           /** abort   */ true);
+      break;
+    case 3:
+      // Arm error 8059 everywhere, then take down the first node
+      for (int i = 0; i<cnt; i++)
+      {
+        res.insertErrorInNode(nodes[i], 8059);
+      }
+      res.restartOneDbNode(nodes[0],
+                           /** initial */ false,
+                           /** nostart */ true,
+                           /** abort   */ true);
+      break;
+    case 4:
+    {
+      for (int i = 0; i<cnt; i++)
+      {
+        res.insertErrorInNode(nodes[i], 7180);
+      }
+
+      // NOTE(review): dump code 7099 presumably triggers an LCP - confirm
+      int lcp = 7099;
+      res.insertErrorInNode(master, 7193);
+      res.dumpStateOneNode(master, &lcp, 1);
+      break;
+    }
+    case 5:
+    {
+      for (int i = 0; i<cnt; i++)
+      {
+        res.insertErrorInNode(nodes[i], 7206);
+      }
+
+      int lcp = 7099;
+      res.insertErrorInNode(master, 7193);
+      res.dumpStateOneNode(master, &lcp, 1);
+      break;
+    }
+    case 6:
+    {
+      for (int i = 0; i<cnt; i++)
+      {
+        res.insertErrorInNode(nodes[i], 5008);
+      }
+
+      int lcp = 7099;
+      res.insertErrorInNode(master, 7193);
+      res.dumpStateOneNode(master, &lcp, 1);
+      break;
+    }
+    }
+
+    if (res.waitNodesNoStart(nodes, cnt))
+      return NDBT_FAILED;
+
+    if (res.startNodes(nodes, cnt))
+      return NDBT_FAILED;
+
+    if (res.waitClusterStarted())
+      return NDBT_FAILED;
+  }
+
+  ctx->stopTest();
+  return NDBT_OK;
+}
+
+/**
+ * Regression test for bug#36199.
+ *
+ * Picks a victim from the next master's node group (or from another
+ * node group if that pick is the master itself), arms error 7205 in
+ * the victim and error 7014 in the master, issues dump code 7099 on
+ * the master and then expects both master and victim to end up in the
+ * "no start" state.  Both are started again and the cluster must come
+ * back up.  Skipped (NDBT_OK) with fewer than 4 data nodes.
+ */
+int
+runBug36199(NDBT_Context* ctx, NDBT_Step* step)
+{
+  const int loops = ctx->getNumLoops();
+  (void)loops;
+  NdbRestarter res;
+
+  if (res.getNumDbNodes() < 4)
+  {
+    return NDBT_OK;
+  }
+
+  const int masterId = res.getMasterNodeId();
+  const int successorId = res.getNextMasterNodeId(masterId);
+  int victimId = res.getRandomNodeSameNodeGroup(successorId, rand());
+  if (victimId == masterId)
+  {
+    victimId = res.getRandomNodeOtherNodeGroup(successorId, rand());
+  }
+
+  ndbout_c("master: %u next master: %u victim: %u",
+           masterId, successorId, victimId);
+
+  // Make error inserts restart the node instead of stopping it
+  int restartOnInsert[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
+  res.dumpStateOneNode(masterId, restartOnInsert, 2);
+  res.dumpStateOneNode(victimId, restartOnInsert, 2);
+
+  res.insertErrorInNode(victimId, 7205);
+  res.insertErrorInNode(masterId, 7014);
+
+  int dumpCode = 7099;
+  res.dumpStateOneNode(masterId, &dumpCode, 1);
+
+  // Both nodes are expected to go down; bring them back afterwards
+  int downNodes[2] = { masterId, victimId };
+  if (res.waitNodesNoStart(downNodes, 2) ||
+      res.startNodes(downNodes, 2) ||
+      res.waitClusterStarted())
+  {
+    return NDBT_FAILED;
+  }
+
+  return NDBT_OK;
+}
+
+/**
+ * Regression test for bug#36246.
+ *
+ * Opens an uncommitted update so that the node the transaction is
+ * connected to can serve as victim.  The victim must be neither the
+ * master nor the next master and must not share a node group with
+ * either of them.  After 10 unsuitable picks the next master is
+ * restarted and the selection starts over with fresh master info.
+ *
+ * With a suitable victim, error 8060 is armed in the master and error
+ * 9999 in the victim; both nodes are then expected in "no start",
+ * are started again, and the open transaction is rolled back.
+ * Skipped (NDBT_OK) with fewer than 4 data nodes.
+ */
+int
+runBug36246(NDBT_Context* ctx, NDBT_Step* step)
+{
+  const int loops = ctx->getNumLoops();
+  (void)loops;
+  NdbRestarter res;
+  Ndb* pNdb = GETNDB(step);
+
+  if (res.getNumDbNodes() < 4)
+  {
+    return NDBT_OK;
+  }
+
+  HugoOperations hugoOps(*ctx->getTab());
+
+  int master = 0;
+  int nextMaster = 0;
+  int victim = 0;
+  bool victimFound = false;
+  while (!victimFound)
+  {
+    master = res.getMasterNodeId();
+    nextMaster = res.getNextMasterNodeId(master);
+
+    for (int attempt = 0; attempt < 10; )
+    {
+      if (hugoOps.startTransaction(pNdb) != 0 ||
+          hugoOps.pkUpdateRecord(pNdb, 1, 1) != 0 ||
+          hugoOps.execute_NoCommit(pNdb) != 0)
+      {
+        return NDBT_FAILED;
+      }
+
+      victim = hugoOps.getTransaction()->getConnectedNodeId();
+      printf("master: %u nextMaster: %u victim: %u",
+             master, nextMaster, victim);
+
+      const bool unusable =
+        victim == master || victim == nextMaster ||
+        res.getNodeGroup(victim) == res.getNodeGroup(master) ||
+        res.getNodeGroup(victim) == res.getNodeGroup(nextMaster);
+      if (!unusable)
+      {
+        victimFound = true;
+        break;
+      }
+
+      hugoOps.execute_Rollback(pNdb);
+      hugoOps.closeTransaction(pNdb);
+      attempt++;
+      if (attempt == 10)
+      {
+        // Give up on this master pair: bounce the next master, re-pick
+        ndbout_c(" -> restarting next master: %u", nextMaster);
+        res.restartOneDbNode(nextMaster,
+                             /** initial */ false,
+                             /** nostart */ true,
+                             /** abort   */ true);
+
+        res.waitNodesNoStart(&nextMaster, 1);
+        res.startNodes(&nextMaster, 1);
+        if (res.waitClusterStarted())
+        {
+          return NDBT_FAILED;
+        }
+      }
+      else
+      {
+        ndbout_c(" -> loop");
+      }
+    }
+  }
+  ndbout_c(" -> go go gadget skates");
+
+  int restartOnInsert[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
+  res.dumpStateOneNode(master, restartOnInsert, 2);
+  res.dumpStateOneNode(victim, restartOnInsert, 2);
+
+  res.insertErrorInNode(master, 8060);
+  res.insertErrorInNode(victim, 9999);
+
+  int downNodes[2] = { master, victim };
+  if (res.waitNodesNoStart(downNodes, 2) ||
+      res.startNodes(downNodes, 2) ||
+      res.waitClusterStarted())
+  {
+    return NDBT_FAILED;
+  }
+
+  hugoOps.execute_Rollback(pNdb);
+  hugoOps.closeTransaction(pNdb);
+
+  return NDBT_OK;
+}
+
+/**
+ * Regression test for bug#36247.
+ *
+ * Victim selection works as follows: an uncommitted update is opened
+ * and the node it is connected to is used as victim, provided it is
+ * neither master nor next master and shares no node group with them.
+ * After 10 unsuitable picks the next master is restarted and the
+ * selection starts over.
+ *
+ * With a suitable victim, error 5050 is armed in every data node and
+ * error 9999 in the victim; master and victim are then expected in
+ * "no start", are started again, and the open transaction is rolled
+ * back.  Skipped (NDBT_OK) with fewer than 4 data nodes.
+ */
+int
+runBug36247(NDBT_Context* ctx, NDBT_Step* step)
+{
+  const int loops = ctx->getNumLoops();
+  (void)loops;
+  NdbRestarter res;
+  Ndb* pNdb = GETNDB(step);
+
+  if (res.getNumDbNodes() < 4)
+  {
+    return NDBT_OK;
+  }
+
+  HugoOperations hugoOps(*ctx->getTab());
+
+  int master = 0;
+  int nextMaster = 0;
+  int victim = 0;
+  bool victimFound = false;
+  while (!victimFound)
+  {
+    master = res.getMasterNodeId();
+    nextMaster = res.getNextMasterNodeId(master);
+
+    for (int attempt = 0; attempt < 10; )
+    {
+      if (hugoOps.startTransaction(pNdb) != 0 ||
+          hugoOps.pkUpdateRecord(pNdb, 1, 100) != 0 ||
+          hugoOps.execute_NoCommit(pNdb) != 0)
+      {
+        return NDBT_FAILED;
+      }
+
+      victim = hugoOps.getTransaction()->getConnectedNodeId();
+      printf("master: %u nextMaster: %u victim: %u",
+             master, nextMaster, victim);
+
+      const bool unusable =
+        victim == master || victim == nextMaster ||
+        res.getNodeGroup(victim) == res.getNodeGroup(master) ||
+        res.getNodeGroup(victim) == res.getNodeGroup(nextMaster);
+      if (!unusable)
+      {
+        victimFound = true;
+        break;
+      }
+
+      hugoOps.execute_Rollback(pNdb);
+      hugoOps.closeTransaction(pNdb);
+      attempt++;
+      if (attempt == 10)
+      {
+        // Give up on this master pair: bounce the next master, re-pick
+        ndbout_c(" -> restarting next master: %u", nextMaster);
+        res.restartOneDbNode(nextMaster,
+                             /** initial */ false,
+                             /** nostart */ true,
+                             /** abort   */ true);
+
+        res.waitNodesNoStart(&nextMaster, 1);
+        res.startNodes(&nextMaster, 1);
+        if (res.waitClusterStarted())
+        {
+          return NDBT_FAILED;
+        }
+      }
+      else
+      {
+        ndbout_c(" -> loop");
+      }
+    }
+  }
+  ndbout_c(" -> go go gadget skates");
+
+  int restartOnInsert[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
+  res.dumpStateOneNode(master, restartOnInsert, 2);
+  res.dumpStateOneNode(victim, restartOnInsert, 2);
+
+  // Arm 5050 in every data node before taking the victim down
+  for (int idx = 0; idx < res.getNumDbNodes(); idx++)
+  {
+    res.insertErrorInNode(res.getDbNodeId(idx), 5050);
+  }
+
+  res.insertErrorInNode(victim, 9999);
+
+  int downNodes[2] = { master, victim };
+  if (res.waitNodesNoStart(downNodes, 2) ||
+      res.startNodes(downNodes, 2) ||
+      res.waitClusterStarted())
+  {
+    return NDBT_FAILED;
+  }
+
+  hugoOps.execute_Rollback(pNdb);
+  hugoOps.closeTransaction(pNdb);
+
+  return NDBT_OK;
+}
+
+/**
+ * Regression test for bug#36276.
+ *
+ * Picks a victim from the next master's node group (or from another
+ * node group if that pick is the master itself), arms error 7209 in
+ * the victim and issues dump code 7099 on the master.  Only the master
+ * is expected to reach "no start"; it is then started again and the
+ * cluster must come back up.  Skipped (NDBT_OK) with fewer than 4
+ * data nodes.
+ */
+int
+runBug36276(NDBT_Context* ctx, NDBT_Step* step)
+{
+  const int loops = ctx->getNumLoops();
+  (void)loops;
+  NdbRestarter res;
+  Ndb* pNdb = GETNDB(step);
+  (void)pNdb;
+
+  if (res.getNumDbNodes() < 4)
+  {
+    return NDBT_OK;
+  }
+
+  int masterId = res.getMasterNodeId();
+  const int successorId = res.getNextMasterNodeId(masterId);
+  int victimId = res.getRandomNodeSameNodeGroup(successorId, rand());
+  if (victimId == masterId)
+  {
+    victimId = res.getRandomNodeOtherNodeGroup(successorId, rand());
+  }
+
+  ndbout_c("master: %u nextMaster: %u victim: %u",
+           masterId, successorId, victimId);
+
+  // Only the master is switched to restart-on-error-insert here
+  int restartOnInsert[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
+  res.dumpStateOneNode(masterId, restartOnInsert, 2);
+  res.insertErrorInNode(victimId, 7209);
+
+  int dumpCode = 7099;
+  res.dumpStateOneNode(masterId, &dumpCode, 1);
+
+  if (res.waitNodesNoStart(&masterId, 1) ||
+      res.startNodes(&masterId, 1) ||
+      res.waitClusterStarted())
+  {
+    return NDBT_FAILED;
+  }
+
+  return NDBT_OK;
+}
+
NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad",
"Test that one node at a time can be stopped and then restarted "\
@@ -2127,6 +2602,29 @@
}
TESTCASE("Bug32160", ""){
INITIALIZER(runBug32160);
+}
+TESTCASE("MNF", ""){
+ INITIALIZER(runLoadTable);
+ STEP(runMNF);
+ STEP(runScanUpdateUntilStopped);
+}
+TESTCASE("Bug36199", ""){
+ INITIALIZER(runBug36199);
+}
+TESTCASE("Bug36246", ""){
+ INITIALIZER(runLoadTable);
+ STEP(runBug36246);
+ VERIFIER(runClearTable);
+}
+TESTCASE("Bug36247", ""){
+ INITIALIZER(runLoadTable);
+ STEP(runBug36247);
+ VERIFIER(runClearTable);
+}
+TESTCASE("Bug36276", ""){
+ INITIALIZER(runLoadTable);
+ STEP(runBug36276);
+ VERIFIER(runClearTable);
}
NDBT_TESTSUITE_END(testNodeRestart);
| Thread |
|---|
| • bk commit into 5.1 tree (jonas:1.2202) BUG#36276 | jonas | 24 Apr |