From: Date: March 17 2006 10:55am Subject: bk commit into 4.1 tree (jonas:1.2471) BUG#16772 List-Archive: http://lists.mysql.com/commits/3917 X-Bug: 16772 Message-Id: <20060317095506.33A452E7FBE@perch.ndb.mysql.com> Below is the list of changes that have just been committed into a local 4.1 repository of jonas. When jonas does a push these changes will be propagated to the main repository and, within 24 hours after the push, to the public repository. For information on how to access the public repository see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html ChangeSet 1.2471 06/03/17 10:55:02 jonas@stripped +3 -0 ndb - bug#16772 don't allow a node to join the cluster until all nodes have completed failure handling ndb/test/run-test/daily-basic-tests.txt 1.25 06/03/17 10:55:00 jonas@stripped +4 -0 Run test in basic suite ndb/test/ndbapi/testNodeRestart.cpp 1.14 06/03/17 10:55:00 jonas@stripped +50 -0 testcase for bug#16772 ndb/src/kernel/blocks/qmgr/QmgrMain.cpp 1.16 06/03/17 10:55:00 jonas@stripped +87 -14 When getting CM_ADD for a node that I haven't completed failure handling for, do _not_ just override. Instead, set state and send CM_ACK_ADD on execCONNECT_REP (much later). # This is a BitKeeper patch. What follows are the unified diffs for the # set of deltas contained in the patch. The rest of the patch, the part # that BitKeeper cares about, is below these diffs. 
# User: jonas # Host: perch.ndb.mysql.com # Root: /home/jonas/src/41-work --- 1.24/ndb/test/run-test/daily-basic-tests.txt 2005-12-12 17:19:02 +01:00 +++ 1.25/ndb/test/run-test/daily-basic-tests.txt 2006-03-17 10:55:00 +01:00 @@ -446,6 +446,10 @@ cmd: testNodeRestart args: -n Bug15685 T1 +max-time: 500 +cmd: testNodeRestart +args: -n Bug16772 T1 + # OLD FLEX max-time: 500 cmd: flexBench --- 1.15/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp 2005-09-21 16:41:41 +02:00 +++ 1.16/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp 2006-03-17 10:55:00 +01:00 @@ -257,6 +257,7 @@ void Qmgr::execCONNECT_REP(Signal* signal) { + jamEntry(); const Uint32 nodeId = signal->theData[0]; c_connectedNodes.set(nodeId); NodeRecPtr nodePtr; @@ -264,9 +265,13 @@ ptrCheckGuard(nodePtr, MAX_NODES, nodeRec); switch(nodePtr.p->phase){ case ZSTARTING: + case ZRUNNING: jam(); + if(!c_start.m_nodes.isWaitingFor(nodeId)){ + jam(); + return; + } break; - case ZRUNNING: case ZPREPARE_FAIL: case ZFAIL_CLOSING: jam(); @@ -277,21 +282,28 @@ case ZAPI_INACTIVE: return; } - - if(!c_start.m_nodes.isWaitingFor(nodeId)){ - jam(); - return; - } - + switch(c_start.m_gsn){ case GSN_CM_REGREQ: jam(); sendCmRegReq(signal, nodeId); return; - case GSN_CM_NODEINFOREQ:{ + case GSN_CM_NODEINFOREQ: jam(); sendCmNodeInfoReq(signal, nodeId, nodePtr.p); return; + case GSN_CM_ADD:{ + jam(); + + ndbrequire(getOwnNodeId() != cpresident); + c_start.m_nodes.clearWaitingFor(nodeId); + c_start.m_gsn = RNIL; + + NodeRecPtr addNodePtr; + addNodePtr.i = nodeId; + ptrCheckGuard(addNodePtr, MAX_NDB_NODES, nodeRec); + cmAddPrepare(signal, addNodePtr, nodePtr.p); + return; } default: return; @@ -924,15 +936,27 @@ return; case ZFAIL_CLOSING: jam(); -#ifdef VM_TRACE - ndbout_c("Enabling communication to CM_ADD node state=%d", - nodePtr.p->phase); -#endif + +#if 1 + warningEvent("Recieved request to incorperate node %u, " + "while error handling has not yet completed", + nodePtr.i); + + ndbrequire(getOwnNodeId() != cpresident); + 
ndbrequire(signal->header.theVerId_signalNumber == GSN_CM_ADD); + c_start.m_nodes.clearWaitingFor(); + c_start.m_nodes.setWaitingFor(nodePtr.i); + c_start.m_gsn = GSN_CM_ADD; +#else + warningEvent("Enabling communication to CM_ADD node %u state=%d", + nodePtr.i, + nodePtr.p->phase); nodePtr.p->phase = ZSTARTING; nodePtr.p->failState = NORMAL; signal->theData[0] = 0; signal->theData[1] = nodePtr.i; sendSignal(CMVMI_REF, GSN_OPEN_COMREQ, signal, 2, JBA); +#endif return; case ZSTARTING: break; @@ -1766,11 +1790,27 @@ jamEntry(); failedNodePtr.i = signal->theData[0]; + + if (ERROR_INSERTED(930)) + { + CLEAR_ERROR_INSERT_VALUE; + infoEvent("Discarding NDB_FAILCONF for %u", failedNodePtr.i); + return; + } + ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec); if (failedNodePtr.p->failState == WAITING_FOR_NDB_FAILCONF){ failedNodePtr.p->failState = NORMAL; } else { jam(); + + char buf[100]; + BaseString::snprintf(buf, 100, + "Received NDB_FAILCONF for node %u with state: %d %d", + failedNodePtr.i, + failedNodePtr.p->phase, + failedNodePtr.p->failState); + progError(__LINE__, 0, buf); systemErrorLab(signal, __LINE__); }//if if (cpresident == getOwnNodeId()) { @@ -2077,10 +2117,42 @@ ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec); if (failedNodePtr.i == getOwnNodeId()) { jam(); - systemErrorLab(signal, __LINE__); + + const char * msg = 0; + switch(aFailCause){ + case FailRep::ZOWN_FAILURE: + msg = "Own failure"; + break; + case FailRep::ZOTHER_NODE_WHEN_WE_START: + case FailRep::ZOTHERNODE_FAILED_DURING_START: + msg = "Other node died during start"; + break; + case FailRep::ZIN_PREP_FAIL_REQ: + msg = "Prep fail"; + break; + case FailRep::ZSTART_IN_REGREQ: + msg = "Start timeout"; + break; + case FailRep::ZHEARTBEAT_FAILURE: + msg = "Hearbeat failure"; + break; + case FailRep::ZLINK_FAILURE: + msg = "Connection failure"; + break; + } + + char buf[100]; + BaseString::snprintf(buf, 100, + "We(%u) have been declared dead by %u reason: %s(%u)", + getOwnNodeId(), + 
refToNode(signal->getSendersBlockRef()), + aFailCause, + msg ? msg : ""); + + progError(__LINE__, 0, buf); return; }//if - + myNodePtr.i = getOwnNodeId(); ptrCheckGuard(myNodePtr, MAX_NDB_NODES, nodeRec); if (myNodePtr.p->phase != ZRUNNING) { @@ -2791,6 +2863,7 @@ cfailureNr = cprepareFailureNr; ctoFailureNr = 0; ctoStatus = Q_ACTIVE; + c_start.reset(); // Don't take over nodes being started if (cnoCommitFailedNodes > 0) { jam(); /**----------------------------------------------------------------- --- 1.13/ndb/test/ndbapi/testNodeRestart.cpp 2005-12-12 17:19:02 +01:00 +++ 1.14/ndb/test/ndbapi/testNodeRestart.cpp 2006-03-17 10:55:00 +01:00 @@ -535,6 +535,52 @@ return NDBT_FAILED; } +int +runBug16772(NDBT_Context* ctx, NDBT_Step* step){ + + NdbRestarter restarter; + if (restarter.getNumDbNodes() < 2) + { + ctx->stopTest(); + return NDBT_OK; + } + + int aliveNodeId = restarter.getRandomNotMasterNodeId(rand()); + int deadNodeId = aliveNodeId; + while (deadNodeId == aliveNodeId) + deadNodeId = restarter.getDbNodeId(rand() % restarter.getNumDbNodes()); + + if (restarter.insertErrorInNode(aliveNodeId, 930)) + return NDBT_FAILED; + + if (restarter.restartOneDbNode(deadNodeId, + /** initial */ false, + /** nostart */ true, + /** abort */ true)) + return NDBT_FAILED; + + if (restarter.waitNodesNoStart(&deadNodeId, 1)) + return NDBT_FAILED; + + if (restarter.startNodes(&deadNodeId, 1)) + return NDBT_FAILED; + + // It should now be hanging since we throw away NDB_FAILCONF + int ret = restarter.waitNodesStartPhase(&deadNodeId, 1, 3, 10); + // So this should fail...i.e it should not reach startphase 3 + + // Now send a NDB_FAILCONF for deadNo + int dump[] = { 7020, 323, 252, 0 }; + dump[3] = deadNodeId; + if (restarter.dumpStateOneNode(aliveNodeId, dump, 4)) + return NDBT_FAILED; + + if (restarter.waitNodesStarted(&deadNodeId, 1)) + return NDBT_FAILED; + + return ret ? 
NDBT_OK : NDBT_FAILED; +} + NDBT_TESTSUITE(testNodeRestart); TESTCASE("NoLoad", @@ -819,6 +865,10 @@ "Test bug with NF during abort"){ STEP(runBug15685); FINALIZER(runClearTable); +} +TESTCASE("Bug16772", + "Test bug with restarting before NF handling is complete"){ + STEP(runBug16772); } NDBT_TESTSUITE_END(testNodeRestart);