Below is the list of changes that have just been committed into a local
5.0 repository of jonas. When jonas does a push, these changes
will be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html
ChangeSet@stripped, 2008-04-25 08:36:45+02:00, jonas@stripped +6 -0
ndb - bug#36245
NF_COMPLETEREP can get lost on cascading master failure
causing *big* pain and misery
ndb/src/kernel/blocks/ERROR_codes.txt@stripped, 2008-04-25 08:36:43+02:00,
jonas@stripped +2 -2
new error codes
ndb/src/kernel/blocks/dbtc/DbtcMain.cpp@stripped, 2008-04-25 08:36:43+02:00,
jonas@stripped +3 -3
new error codes
ndb/src/kernel/blocks/qmgr/QmgrMain.cpp@stripped, 2008-04-25 08:36:43+02:00,
jonas@stripped +39 -22
- new error codes
- fix by sending NF_COMPLETEREP from all nodes
ndb/src/ndbapi/ClusterMgr.cpp@stripped, 2008-04-25 08:36:43+02:00, jonas@stripped
+5 -2
only signal NF_COMPLETEREP once to TransportFacade
ndb/test/ndbapi/testNodeRestart.cpp@stripped, 2008-04-25 08:36:43+02:00,
jonas@stripped +101 -0
testcase
ndb/test/run-test/daily-basic-tests.txt@stripped, 2008-04-25 08:36:43+02:00,
jonas@stripped +6 -1
testcase
diff -Nrup a/ndb/src/kernel/blocks/ERROR_codes.txt b/ndb/src/kernel/blocks/ERROR_codes.txt
--- a/ndb/src/kernel/blocks/ERROR_codes.txt 2008-04-23 16:08:36 +02:00
+++ b/ndb/src/kernel/blocks/ERROR_codes.txt 2008-04-25 08:36:43 +02:00
@@ -1,4 +1,4 @@
-Next QMGR 1
+Next QMGR 937
Next NDBCNTR 1002
Next NDBFS 2000
Next DBACC 3002
@@ -6,7 +6,7 @@ Next DBTUP 4014
Next DBLQH 5051
Next DBDICT 6007
Next DBDIH 7211
-Next DBTC 8063
+Next DBTC 8064
Next CMVMI 9000
Next BACKUP 10022
Next DBUTIL 11002
diff -Nrup a/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp
b/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp
--- a/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp 2008-04-23 16:08:36 +02:00
+++ b/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp 2008-04-25 08:36:43 +02:00
@@ -2723,6 +2723,8 @@ void Dbtc::execTCKEYREQ(Signal* signal)
if (seizeCacheRecord(signal) != 0) {
return;
}//if
+
+ CRASH_INSERTION(8063);
TcConnectRecord * const regTcPtr = tcConnectptr.p;
CacheRecord * const regCachePtr = cachePtr.p;
@@ -4583,9 +4585,7 @@ void Dbtc::execCOMMITTED(Signal* signal)
CLEAR_ERROR_INSERT_VALUE;
return;
}//if
- if (ERROR_INSERTED(8030)) {
- systemErrorLab(signal, __LINE__);
- }//if
+ CRASH_INSERTION(8030);
if (ERROR_INSERTED(8025)) {
SET_ERROR_INSERT_VALUE(8026);
return;
diff -Nrup a/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
b/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
--- a/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp 2007-05-15 08:34:36 +02:00
+++ b/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp 2008-04-25 08:36:43 +02:00
@@ -2474,28 +2474,37 @@ void Qmgr::execNDB_FAILCONF(Signal* sign
progError(__LINE__, 0, buf);
systemErrorLab(signal, __LINE__);
}//if
- if (cpresident == getOwnNodeId()) {
+
+ if (cpresident == getOwnNodeId())
+ {
jam();
- /**
- * Prepare a NFCompleteRep and send to all connected API's
- * They can then abort all transaction waiting for response from
- * the failed node
- */
- NFCompleteRep * const nfComp = (NFCompleteRep *)&signal->theData[0];
- nfComp->blockNo = QMGR_REF;
- nfComp->nodeId = getOwnNodeId();
- nfComp->failedNodeId = failedNodePtr.i;
+
+ CRASH_INSERTION(936);
+ }
- for (nodePtr.i = 1; nodePtr.i < MAX_NODES; nodePtr.i++) {
+ /**
+ * Prepare a NFCompleteRep and send to all connected API's
+ * They can then abort all transaction waiting for response from
+ * the failed node
+ *
+ * NOTE: This is sent from all nodes, as otherwise we would need
+ * take-over if cpresident dies before sending this
+ */
+ NFCompleteRep * const nfComp = (NFCompleteRep *)&signal->theData[0];
+ nfComp->blockNo = QMGR_REF;
+ nfComp->nodeId = getOwnNodeId();
+ nfComp->failedNodeId = failedNodePtr.i;
+
+ for (nodePtr.i = 1; nodePtr.i < MAX_NODES; nodePtr.i++)
+ {
+ jam();
+ ptrAss(nodePtr, nodeRec);
+ if (nodePtr.p->phase == ZAPI_ACTIVE){
jam();
- ptrAss(nodePtr, nodeRec);
- if (nodePtr.p->phase == ZAPI_ACTIVE){
- jam();
- sendSignal(nodePtr.p->blockRef, GSN_NF_COMPLETEREP, signal,
- NFCompleteRep::SignalLength, JBA);
- }//if
- }//for
- }
+ sendSignal(nodePtr.p->blockRef, GSN_NF_COMPLETEREP, signal,
+ NFCompleteRep::SignalLength, JBA);
+ }//if
+ }//for
return;
}//Qmgr::execNDB_FAILCONF()
@@ -3332,9 +3341,17 @@ void Qmgr::execCOMMIT_FAILREQ(Signal* si
jam();
NodeBitmask::set(nodeFail->theNodes, ccommitFailedNodes[i]);
}//if
- sendSignal(NDBCNTR_REF, GSN_NODE_FAILREP, signal,
- NodeFailRep::SignalLength, JBB);
-
+
+ if (ERROR_INSERTED(936))
+ {
+ sendSignalWithDelay(NDBCNTR_REF, GSN_NODE_FAILREP, signal,
+ 200, NodeFailRep::SignalLength);
+ }
+ else
+ {
+ sendSignal(NDBCNTR_REF, GSN_NODE_FAILREP, signal,
+ NodeFailRep::SignalLength, JBB);
+ }
guard0 = cnoCommitFailedNodes - 1;
arrGuard(guard0, MAX_NDB_NODES);
/**--------------------------------------------------------------------
diff -Nrup a/ndb/src/ndbapi/ClusterMgr.cpp b/ndb/src/ndbapi/ClusterMgr.cpp
--- a/ndb/src/ndbapi/ClusterMgr.cpp 2007-05-09 15:02:59 +02:00
+++ b/ndb/src/ndbapi/ClusterMgr.cpp 2008-04-25 08:36:43 +02:00
@@ -472,8 +472,11 @@ ClusterMgr::execNF_COMPLETEREP(const Uin
const NodeId nodeId = nfComp->failedNodeId;
assert(nodeId > 0 && nodeId < MAX_NODES);
- theFacade.ReportNodeFailureComplete(nodeId);
- theNodes[nodeId].nfCompleteRep = true;
+ if (theNodes[nodeId].nfCompleteRep == false)
+ {
+ theFacade.ReportNodeFailureComplete(nodeId);
+ theNodes[nodeId].nfCompleteRep = true;
+ }
}
void
diff -Nrup a/ndb/test/ndbapi/testNodeRestart.cpp b/ndb/test/ndbapi/testNodeRestart.cpp
--- a/ndb/test/ndbapi/testNodeRestart.cpp 2008-04-23 16:08:36 +02:00
+++ b/ndb/test/ndbapi/testNodeRestart.cpp 2008-04-25 08:36:43 +02:00
@@ -1867,6 +1867,102 @@ runBug36276(NDBT_Context* ctx, NDBT_Step
return NDBT_OK;
}
+int
+runBug36245(NDBT_Context* ctx, NDBT_Step* step) // Test step for bug#36245 (cascading master failure); returns NDBT_OK or NDBT_FAILED
+{
+ int result = NDBT_OK; // NOTE(review): never read in this function
+ int loops = ctx->getNumLoops(); // NOTE(review): never read in this function
+ NdbRestarter res;
+ Ndb* pNdb = GETNDB(step);
+
+ if (res.getNumDbNodes() < 4) // fewer than 4 data nodes: skip the scenario and report OK
+ return NDBT_OK;
+
+ /**
+ * Make sure master and nextMaster are in different node groups
+ */
+loop1: // re-evaluated after every restart of the next master
+ int master = res.getMasterNodeId();
+ int nextMaster = res.getNextMasterNodeId(master);
+
+ printf("master: %u nextMaster: %u", master, nextMaster); // no newline here; one of the ndbout_c calls below finishes the line
+ if (res.getNodeGroup(master) == res.getNodeGroup(nextMaster))
+ {
+ ndbout_c(" -> restarting next master: %u", nextMaster);
+ res.restartOneDbNode(nextMaster, // return value is not checked
+ /** initial */ false,
+ /** nostart */ true,
+ /** abort */ true);
+
+ res.waitNodesNoStart(&nextMaster, 1); // return value is not checked
+ res.startNodes(&nextMaster, 1); // return value is not checked
+ if (res.waitClusterStarted()) // non-zero result is treated as failure
+ {
+ ndbout_c("cluster didnt restart!!");
+ return NDBT_FAILED;
+ }
+ goto loop1; // pick master/nextMaster again and re-check their node groups
+ }
+ ndbout_c(" -> go go gadget skates");
+
+ int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 }; // presumably makes a node restart instead of dying on an error insert -- TODO confirm
+ res.dumpStateOneNode(master, val2, 2);
+ res.dumpStateOneNode(nextMaster, val2, 2);
+
+ res.insertErrorInNode(master, 8063); // 8063: CRASH_INSERTION in Dbtc::execTCKEYREQ
+ res.insertErrorInNode(nextMaster, 936); // 936: CRASH_INSERTION in Qmgr::execNDB_FAILCONF when the node is president; also delays NODE_FAILREP in execCOMMIT_FAILREQ
+
+
+ int err = 0;
+ HugoOperations hugoOps(*ctx->getTab());
+loop2: // retry until the transaction is connected to the master node
+ if((err = hugoOps.startTransaction(pNdb)) != 0)
+ {
+ ndbout_c("failed to start transaction: %u", err);
+ return NDBT_FAILED;
+ }
+
+ int victim = hugoOps.getTransaction()->getConnectedNodeId(); // node this transaction is connected to
+ if (victim != master)
+ {
+ ndbout_c("transnode: %u != master: %u -> loop",
+ victim, master);
+ hugoOps.closeTransaction(pNdb);
+ goto loop2;
+ }
+
+ if((err = hugoOps.pkUpdateRecord(pNdb, 1)) != 0)
+ {
+ ndbout_c("failed to update: %u", err);
+ return NDBT_FAILED; // NOTE(review): returns without closeTransaction()
+ }
+
+ if((err = hugoOps.execute_Commit(pNdb)) != 4010) // only 4010 is accepted; presumably the node-failure abort error -- TODO confirm
+ {
+ ndbout_c("incorrect error code: %u", err);
+ return NDBT_FAILED; // NOTE(review): returns without closeTransaction()
+ }
+ hugoOps.closeTransaction(pNdb);
+
+ int nodes[2]; // the two nodes that received error inserts
+ nodes[0] = master;
+ nodes[1] = nextMaster;
+ if (res.waitNodesNoStart(nodes, 2)) // both nodes must reach the no-start state
+ {
+ return NDBT_FAILED;
+ }
+
+ if (res.startNodes(nodes, 2)) // start both nodes again
+ {
+ return NDBT_FAILED;
+ }
+
+ if (res.waitClusterStarted()) // non-zero result is treated as failure
+ return NDBT_FAILED;
+
+ return NDBT_OK;
+}
+
NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad",
"Test that one node at a time can be stopped and then restarted "\
@@ -2230,6 +2326,11 @@ TESTCASE("Bug36247", ""){
TESTCASE("Bug36276", ""){
INITIALIZER(runLoadTable);
STEP(runBug36276);
+ VERIFIER(runClearTable);
+}
+TESTCASE("Bug36245", ""){
+ INITIALIZER(runLoadTable);
+ STEP(runBug36245);
VERIFIER(runClearTable);
}
NDBT_TESTSUITE_END(testNodeRestart);
diff -Nrup a/ndb/test/run-test/daily-basic-tests.txt
b/ndb/test/run-test/daily-basic-tests.txt
--- a/ndb/test/run-test/daily-basic-tests.txt 2008-04-23 16:20:37 +02:00
+++ b/ndb/test/run-test/daily-basic-tests.txt 2008-04-25 08:36:43 +02:00
@@ -812,4 +812,9 @@ max-time: 300
cmd: testNodeRestart
args: -n Bug36276 T1
-# EOF
+# 2008-04-25
+max-time: 300
+cmd: testNodeRestart
+args: -n Bug36245 T1
+
+# EOF 2008-04-25
| Thread |
|---|
| • bk commit into 5.0 tree (jonas:1.2602) BUG#36245 | jonas | 25 Apr |