List:Commits« Previous MessageNext Message »
From:jonas Date:March 17 2006 10:55am
Subject:bk commit into 4.1 tree (jonas:1.2471) BUG#16772
View as plain text  
Below is the list of changes that have just been committed into a local
4.1 repository of jonas. When jonas does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet
  1.2471 06/03/17 10:55:02 jonas@stripped +3 -0
  ndb - bug#16772
    don't allow a node to join the cluster until all nodes have completed failure handling

  ndb/test/run-test/daily-basic-tests.txt
    1.25 06/03/17 10:55:00 jonas@stripped +4 -0
    Run test in basic suite

  ndb/test/ndbapi/testNodeRestart.cpp
    1.14 06/03/17 10:55:00 jonas@stripped +50 -0
    testcase for bug#16772

  ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
    1.16 06/03/17 10:55:00 jonas@stripped +87 -14
    When receiving CM_ADD for a node for which I haven't completed failure handling, do _not_ just override.
    Instead, set state and send CM_ACK_ADD on execCONNECT_REP (much later).

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User:	jonas
# Host:	perch.ndb.mysql.com
# Root:	/home/jonas/src/41-work

--- 1.24/ndb/test/run-test/daily-basic-tests.txt	2005-12-12 17:19:02 +01:00
+++ 1.25/ndb/test/run-test/daily-basic-tests.txt	2006-03-17 10:55:00 +01:00
@@ -446,6 +446,10 @@
 cmd: testNodeRestart
 args: -n Bug15685 T1
 
+max-time: 500
+cmd: testNodeRestart
+args: -n Bug16772 T1
+
 # OLD FLEX
 max-time: 500
 cmd: flexBench

--- 1.15/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp	2005-09-21 16:41:41 +02:00
+++ 1.16/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp	2006-03-17 10:55:00 +01:00
@@ -257,6 +257,7 @@
 
 void Qmgr::execCONNECT_REP(Signal* signal)
 {
+  jamEntry();
   const Uint32 nodeId = signal->theData[0];
   c_connectedNodes.set(nodeId);
   NodeRecPtr nodePtr;
@@ -264,9 +265,13 @@
   ptrCheckGuard(nodePtr, MAX_NODES, nodeRec);
   switch(nodePtr.p->phase){
   case ZSTARTING:
+  case ZRUNNING:
     jam();
+    if(!c_start.m_nodes.isWaitingFor(nodeId)){
+      jam();
+      return;
+    }
     break;
-  case ZRUNNING:
   case ZPREPARE_FAIL:
   case ZFAIL_CLOSING:
     jam();
@@ -277,21 +282,28 @@
   case ZAPI_INACTIVE:
     return;
   }
-
-  if(!c_start.m_nodes.isWaitingFor(nodeId)){
-    jam();
-    return;
-  }
-
+  
   switch(c_start.m_gsn){
   case GSN_CM_REGREQ:
     jam();
     sendCmRegReq(signal, nodeId);
     return;
-  case GSN_CM_NODEINFOREQ:{
+  case GSN_CM_NODEINFOREQ:
     jam();
     sendCmNodeInfoReq(signal, nodeId, nodePtr.p);
     return;
+  case GSN_CM_ADD:{
+    jam();
+
+    ndbrequire(getOwnNodeId() != cpresident);
+    c_start.m_nodes.clearWaitingFor(nodeId);
+    c_start.m_gsn = RNIL;
+    
+    NodeRecPtr addNodePtr;
+    addNodePtr.i = nodeId;
+    ptrCheckGuard(addNodePtr, MAX_NDB_NODES, nodeRec);
+    cmAddPrepare(signal, addNodePtr, nodePtr.p);
+    return;
   }
   default:
     return;
@@ -924,15 +936,27 @@
     return;
   case ZFAIL_CLOSING:
     jam();
-#ifdef VM_TRACE
-    ndbout_c("Enabling communication to CM_ADD node state=%d", 
-	     nodePtr.p->phase);
-#endif
+    
+#if 1
+    warningEvent("Recieved request to incorperate node %u, "
+		 "while error handling has not yet completed",
+		 nodePtr.i);
+    
+    ndbrequire(getOwnNodeId() != cpresident);
+    ndbrequire(signal->header.theVerId_signalNumber == GSN_CM_ADD);
+    c_start.m_nodes.clearWaitingFor();
+    c_start.m_nodes.setWaitingFor(nodePtr.i);
+    c_start.m_gsn = GSN_CM_ADD;
+#else
+    warningEvent("Enabling communication to CM_ADD node %u state=%d", 
+		 nodePtr.i,
+		 nodePtr.p->phase);
     nodePtr.p->phase = ZSTARTING;
     nodePtr.p->failState = NORMAL;
     signal->theData[0] = 0;
     signal->theData[1] = nodePtr.i;
     sendSignal(CMVMI_REF, GSN_OPEN_COMREQ, signal, 2, JBA);
+#endif
     return;
   case ZSTARTING:
     break;
@@ -1766,11 +1790,27 @@
 
   jamEntry();
   failedNodePtr.i = signal->theData[0];  
+
+  if (ERROR_INSERTED(930))
+  {
+    CLEAR_ERROR_INSERT_VALUE;
+    infoEvent("Discarding NDB_FAILCONF for %u", failedNodePtr.i);
+    return;
+  }
+  
   ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
   if (failedNodePtr.p->failState == WAITING_FOR_NDB_FAILCONF){
     failedNodePtr.p->failState = NORMAL;
   } else {
     jam();
+
+    char buf[100];
+    BaseString::snprintf(buf, 100, 
+			 "Received NDB_FAILCONF for node %u with state: %d %d",
+			 failedNodePtr.i,
+			 failedNodePtr.p->phase,
+			 failedNodePtr.p->failState);
+    progError(__LINE__, 0, buf);
     systemErrorLab(signal, __LINE__);
   }//if
   if (cpresident == getOwnNodeId()) {
@@ -2077,10 +2117,42 @@
   ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
   if (failedNodePtr.i == getOwnNodeId()) {
     jam();
-    systemErrorLab(signal, __LINE__);
+
+    const char * msg = 0;
+    switch(aFailCause){
+    case FailRep::ZOWN_FAILURE: 
+      msg = "Own failure"; 
+      break;
+    case FailRep::ZOTHER_NODE_WHEN_WE_START: 
+    case FailRep::ZOTHERNODE_FAILED_DURING_START:
+      msg = "Other node died during start"; 
+      break;
+    case FailRep::ZIN_PREP_FAIL_REQ:
+      msg = "Prep fail";
+      break;
+    case FailRep::ZSTART_IN_REGREQ:
+      msg = "Start timeout";
+      break;
+    case FailRep::ZHEARTBEAT_FAILURE:
+      msg = "Hearbeat failure";
+      break;
+    case FailRep::ZLINK_FAILURE:
+      msg = "Connection failure";
+      break;
+    }
+    
+    char buf[100]; // crash message; arguments below must match the format's %u %u %s %u order
+    BaseString::snprintf(buf, 100, 
+			 "We(%u) have been declared dead by %u reason: %s(%u)",
+			 getOwnNodeId(),
+			 refToNode(signal->getSendersBlockRef()),
+			 msg ? msg : "<Unknown>", // %s: textual reason, with fallback when no case matched
+			 aFailCause);             // %u: numeric fail cause
+
+    progError(__LINE__, 0, buf);
     return;
   }//if
-
+  
   myNodePtr.i = getOwnNodeId();
   ptrCheckGuard(myNodePtr, MAX_NDB_NODES, nodeRec);
   if (myNodePtr.p->phase != ZRUNNING) {
@@ -2791,6 +2863,7 @@
         cfailureNr = cprepareFailureNr;
         ctoFailureNr = 0;
         ctoStatus = Q_ACTIVE;
+	c_start.reset(); // Don't take over nodes being started
         if (cnoCommitFailedNodes > 0) {
           jam();
 	  /**-----------------------------------------------------------------

--- 1.13/ndb/test/ndbapi/testNodeRestart.cpp	2005-12-12 17:19:02 +01:00
+++ 1.14/ndb/test/ndbapi/testNodeRestart.cpp	2006-03-17 10:55:00 +01:00
@@ -535,6 +535,52 @@
   return NDBT_FAILED;
 }
 
+int 
+runBug16772(NDBT_Context* ctx, NDBT_Step* step){
+  // Bug#16772 test: a restarting node must not reach start phase 3 while NDB_FAILCONF is withheld
+  NdbRestarter restarter;
+  if (restarter.getNumDbNodes() < 2)
+  {
+    ctx->stopTest();
+    return NDBT_OK;
+  }
+
+  int aliveNodeId = restarter.getRandomNotMasterNodeId(rand());
+  int deadNodeId = aliveNodeId;
+  while (deadNodeId == aliveNodeId)
+    deadNodeId = restarter.getDbNodeId(rand() % restarter.getNumDbNodes());
+  
+  if (restarter.insertErrorInNode(aliveNodeId, 930)) // 930: Qmgr discards one NDB_FAILCONF
+    return NDBT_FAILED;
+
+  if (restarter.restartOneDbNode(deadNodeId,
+				 /** initial */ false, 
+				 /** nostart */ true,
+				 /** abort   */ true))
+    return NDBT_FAILED;
+  
+  if (restarter.waitNodesNoStart(&deadNodeId, 1))
+    return NDBT_FAILED;
+
+  if (restarter.startNodes(&deadNodeId, 1))
+    return NDBT_FAILED;
+
+  // The restart should now hang, because the alive node throws away NDB_FAILCONF (error 930)
+  int ret = restarter.waitNodesStartPhase(&deadNodeId, 1, 3, 10);
+  // So this wait should fail, i.e. the node should not reach start phase 3
+
+  // Now send an NDB_FAILCONF for deadNodeId to the alive node through a dump command
+  int dump[] = { 7020, 323, 252, 0 };
+  dump[3] = deadNodeId;
+  if (restarter.dumpStateOneNode(aliveNodeId, dump, 4))
+    return NDBT_FAILED;
+  
+  if (restarter.waitNodesStarted(&deadNodeId, 1))
+    return NDBT_FAILED;
+  // Pass only if the start-phase wait above failed (ret != 0)
+  return ret ? NDBT_OK : NDBT_FAILED;
+}
+
 
 NDBT_TESTSUITE(testNodeRestart);
 TESTCASE("NoLoad", 
@@ -819,6 +865,10 @@
 	 "Test bug with NF during abort"){
   STEP(runBug15685);
   FINALIZER(runClearTable);
+}
+TESTCASE("Bug16772",
+	 "Test bug with restarting before NF handling is complete"){
+  STEP(runBug16772);
 }
 NDBT_TESTSUITE_END(testNodeRestart);
 
Thread
bk commit into 4.1 tree (jonas:1.2471) BUG#16772jonas17 Mar