Below is the list of changes that have just been committed into a local
4.1 repository of jonas. When jonas does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html
ChangeSet
1.2471 06/03/17 10:55:02 jonas@stripped +3 -0
ndb - bug#16772
don't allow a node to join the cluster until all nodes have completed failure handling
ndb/test/run-test/daily-basic-tests.txt
1.25 06/03/17 10:55:00 jonas@stripped +4 -0
Run test in basic suite
ndb/test/ndbapi/testNodeRestart.cpp
1.14 06/03/17 10:55:00 jonas@stripped +50 -0
testcase for bug#16772
ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
1.16 06/03/17 10:55:00 jonas@stripped +87 -14
When getting CM_ADD for a node for which I haven't completed failure handling, do _not_
just override.
Instead set state... and send CM_ACK_ADD on execCONNECT_REP (much... later)
# This is a BitKeeper patch. What follows are the unified diffs for the
# set of deltas contained in the patch. The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User: jonas
# Host: perch.ndb.mysql.com
# Root: /home/jonas/src/41-work
--- 1.24/ndb/test/run-test/daily-basic-tests.txt 2005-12-12 17:19:02 +01:00
+++ 1.25/ndb/test/run-test/daily-basic-tests.txt 2006-03-17 10:55:00 +01:00
@@ -446,6 +446,10 @@
cmd: testNodeRestart
args: -n Bug15685 T1
+max-time: 500
+cmd: testNodeRestart
+args: -n Bug16772 T1
+
# OLD FLEX
max-time: 500
cmd: flexBench
--- 1.15/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp 2005-09-21 16:41:41 +02:00
+++ 1.16/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp 2006-03-17 10:55:00 +01:00
@@ -257,6 +257,7 @@
void Qmgr::execCONNECT_REP(Signal* signal)
{
+ jamEntry();
const Uint32 nodeId = signal->theData[0];
c_connectedNodes.set(nodeId);
NodeRecPtr nodePtr;
@@ -264,9 +265,13 @@
ptrCheckGuard(nodePtr, MAX_NODES, nodeRec);
switch(nodePtr.p->phase){
case ZSTARTING:
+ case ZRUNNING:
jam();
+ if(!c_start.m_nodes.isWaitingFor(nodeId)){
+ jam();
+ return;
+ }
break;
- case ZRUNNING:
case ZPREPARE_FAIL:
case ZFAIL_CLOSING:
jam();
@@ -277,21 +282,28 @@
case ZAPI_INACTIVE:
return;
}
-
- if(!c_start.m_nodes.isWaitingFor(nodeId)){
- jam();
- return;
- }
-
+
switch(c_start.m_gsn){
case GSN_CM_REGREQ:
jam();
sendCmRegReq(signal, nodeId);
return;
- case GSN_CM_NODEINFOREQ:{
+ case GSN_CM_NODEINFOREQ:
jam();
sendCmNodeInfoReq(signal, nodeId, nodePtr.p);
return;
+ case GSN_CM_ADD:{
+ jam();
+
+ ndbrequire(getOwnNodeId() != cpresident);
+ c_start.m_nodes.clearWaitingFor(nodeId);
+ c_start.m_gsn = RNIL;
+
+ NodeRecPtr addNodePtr;
+ addNodePtr.i = nodeId;
+ ptrCheckGuard(addNodePtr, MAX_NDB_NODES, nodeRec);
+ cmAddPrepare(signal, addNodePtr, nodePtr.p);
+ return;
}
default:
return;
@@ -924,15 +936,27 @@
return;
case ZFAIL_CLOSING:
jam();
-#ifdef VM_TRACE
- ndbout_c("Enabling communication to CM_ADD node state=%d",
- nodePtr.p->phase);
-#endif
+
+#if 1
+ warningEvent("Recieved request to incorperate node %u, "
+ "while error handling has not yet completed",
+ nodePtr.i);
+
+ ndbrequire(getOwnNodeId() != cpresident);
+ ndbrequire(signal->header.theVerId_signalNumber == GSN_CM_ADD);
+ c_start.m_nodes.clearWaitingFor();
+ c_start.m_nodes.setWaitingFor(nodePtr.i);
+ c_start.m_gsn = GSN_CM_ADD;
+#else
+ warningEvent("Enabling communication to CM_ADD node %u state=%d",
+ nodePtr.i,
+ nodePtr.p->phase);
nodePtr.p->phase = ZSTARTING;
nodePtr.p->failState = NORMAL;
signal->theData[0] = 0;
signal->theData[1] = nodePtr.i;
sendSignal(CMVMI_REF, GSN_OPEN_COMREQ, signal, 2, JBA);
+#endif
return;
case ZSTARTING:
break;
@@ -1766,11 +1790,27 @@
jamEntry();
failedNodePtr.i = signal->theData[0];
+
+ if (ERROR_INSERTED(930))
+ {
+ CLEAR_ERROR_INSERT_VALUE;
+ infoEvent("Discarding NDB_FAILCONF for %u", failedNodePtr.i);
+ return;
+ }
+
ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
if (failedNodePtr.p->failState == WAITING_FOR_NDB_FAILCONF){
failedNodePtr.p->failState = NORMAL;
} else {
jam();
+
+ char buf[100];
+ BaseString::snprintf(buf, 100,
+ "Received NDB_FAILCONF for node %u with state: %d %d",
+ failedNodePtr.i,
+ failedNodePtr.p->phase,
+ failedNodePtr.p->failState);
+ progError(__LINE__, 0, buf);
systemErrorLab(signal, __LINE__);
}//if
if (cpresident == getOwnNodeId()) {
@@ -2077,10 +2117,42 @@
ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
if (failedNodePtr.i == getOwnNodeId()) {
jam();
- systemErrorLab(signal, __LINE__);
+ // The failed node is this node: map the failure cause to text and report it via progError.
+ const char * msg = 0;
+ switch(aFailCause){
+ case FailRep::ZOWN_FAILURE:
+ msg = "Own failure";
+ break;
+ case FailRep::ZOTHER_NODE_WHEN_WE_START:
+ case FailRep::ZOTHERNODE_FAILED_DURING_START:
+ msg = "Other node died during start";
+ break;
+ case FailRep::ZIN_PREP_FAIL_REQ:
+ msg = "Prep fail";
+ break;
+ case FailRep::ZSTART_IN_REGREQ:
+ msg = "Start timeout";
+ break;
+ case FailRep::ZHEARTBEAT_FAILURE:
+ msg = "Hearbeat failure";
+ break;
+ case FailRep::ZLINK_FAILURE:
+ msg = "Connection failure";
+ break;
+ }
+ // Arguments must follow the specifier order: %u own id, %u sender node, %s cause text, %u cause code.
+ char buf[100];
+ BaseString::snprintf(buf, 100,
+ "We(%u) have been declared dead by %u reason: %s(%u)",
+ getOwnNodeId(),
+ refToNode(signal->getSendersBlockRef()),
+ msg ? msg : "<Unknown>", // msg stays 0 for causes not listed in the switch
+ aFailCause);
+
+ progError(__LINE__, 0, buf);
return;
}//if
-
+
myNodePtr.i = getOwnNodeId();
ptrCheckGuard(myNodePtr, MAX_NDB_NODES, nodeRec);
if (myNodePtr.p->phase != ZRUNNING) {
@@ -2791,6 +2863,7 @@
cfailureNr = cprepareFailureNr;
ctoFailureNr = 0;
ctoStatus = Q_ACTIVE;
+ c_start.reset(); // Don't take over nodes being started
if (cnoCommitFailedNodes > 0) {
jam();
/**-----------------------------------------------------------------
--- 1.13/ndb/test/ndbapi/testNodeRestart.cpp 2005-12-12 17:19:02 +01:00
+++ 1.14/ndb/test/ndbapi/testNodeRestart.cpp 2006-03-17 10:55:00 +01:00
@@ -535,6 +535,52 @@
return NDBT_FAILED;
}
+int
+runBug16772(NDBT_Context* ctx, NDBT_Step* step){
+ // Regression test for bug#16772: restarting a node before node-failure handling is complete.
+ NdbRestarter restarter;
+ if (restarter.getNumDbNodes() < 2)
+ {
+ ctx->stopTest();
+ return NDBT_OK;
+ }
+ // Pick a non-master node that stays up, then a different node to restart.
+ int aliveNodeId = restarter.getRandomNotMasterNodeId(rand());
+ int deadNodeId = aliveNodeId;
+ while (deadNodeId == aliveNodeId)
+ deadNodeId = restarter.getDbNodeId(rand() % restarter.getNumDbNodes());
+ // Error insert 930 makes the surviving node discard an incoming NDB_FAILCONF (see QmgrMain.cpp).
+ if (restarter.insertErrorInNode(aliveNodeId, 930))
+ return NDBT_FAILED;
+ // Abort-restart the other node (not initial), leaving it in the no-start state.
+ if (restarter.restartOneDbNode(deadNodeId,
+ /** initial */ false,
+ /** nostart */ true,
+ /** abort */ true))
+ return NDBT_FAILED;
+ // Any nonzero result from the restarter calls below is treated as a test failure.
+ if (restarter.waitNodesNoStart(&deadNodeId, 1))
+ return NDBT_FAILED;
+
+ if (restarter.startNodes(&deadNodeId, 1))
+ return NDBT_FAILED;
+
+ // The start should now hang, since the surviving node threw away NDB_FAILCONF
+ int ret = restarter.waitNodesStartPhase(&deadNodeId, 1, 3, 10);
+ // So this wait should fail, i.e. start phase 3 must not be reached (10 is presumably a timeout - TODO confirm)
+
+ // Now send an NDB_FAILCONF for deadNodeId via a dump (meaning of 7020/323/252 not visible here - TODO confirm)
+ int dump[] = { 7020, 323, 252, 0 };
+ dump[3] = deadNodeId;
+ if (restarter.dumpStateOneNode(aliveNodeId, dump, 4))
+ return NDBT_FAILED;
+ // After the dump the restarted node is expected to finish starting.
+ if (restarter.waitNodesStarted(&deadNodeId, 1))
+ return NDBT_FAILED;
+ // Pass only if the earlier start-phase wait returned nonzero, i.e. the node was held back.
+ return ret ? NDBT_OK : NDBT_FAILED;
+}
+
NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad",
@@ -819,6 +865,10 @@
"Test bug with NF during abort"){
STEP(runBug15685);
FINALIZER(runClearTable);
+}
+TESTCASE("Bug16772",
+ "Test bug with restarting before NF handling is complete"){
+ STEP(runBug16772);
}
NDBT_TESTSUITE_END(testNodeRestart);
| Thread |
|---|
| • bk commit into 4.1 tree (jonas:1.2471) BUG#16772 | jonas | 17 Mar |