List:Commits« Previous MessageNext Message »
From:jonas Date:April 3 2006 9:26am
Subject:bk commit into 4.1 tree (jonas:1.2488) BUG#18612
View as plain text  
Below is the list of changes that have just been committed into a local
4.1 repository of jonas. When jonas does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet
  1.2488 06/04/03 11:26:29 jonas@stripped +5 -0
  ndb - bug#18612
    post weekend fixes :-)
    change impl. to use READ_NODESREQ to query the state of the other qmgr (partition),
      as this has no (current) side effects, so that it's possible to kill only the
      starting cluster (if one is started and one is starting)

  ndb/test/ndbapi/testNodeRestart.cpp
    1.19 06/04/03 11:26:27 jonas@stripped +2 -2
    Require that only starting cluster dies

  ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
    1.20 06/04/03 11:26:27 jonas@stripped +105 -121
    Use READ_NODESREQ to query state of other QMGR (instead of CM_REGREQ)

  ndb/src/kernel/blocks/qmgr/QmgrInit.cpp
    1.6 06/04/03 11:26:27 jonas@stripped +3 -0
    Use READ_NODESREQ to query state of other QMGR (instead of CM_REGREQ)

  ndb/src/kernel/blocks/qmgr/Qmgr.hpp
    1.7 06/04/03 11:26:27 jonas@stripped +5 -2
    Use READ_NODESREQ to query state of other QMGR (instead of CM_REGREQ)

  ndb/include/kernel/signaldata/FailRep.hpp
    1.3 06/04/03 11:26:27 jonas@stripped +9 -2
    Add partitioned FAIL_REP

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User:	jonas
# Host:	perch.ndb.mysql.com
# Root:	/home/jonas/src/41-work

--- 1.2/ndb/include/kernel/signaldata/FailRep.hpp	2006-03-31 16:46:25 +02:00
+++ 1.3/ndb/include/kernel/signaldata/FailRep.hpp	2006-04-03 11:26:27 +02:00
@@ -36,7 +36,8 @@
 
 public:
   STATIC_CONST( SignalLength = 2 );
-
+  STATIC_CONST( ExtraLength = 1 + NdbNodeBitmask::Size );
+  
   enum FailCause {
     ZOWN_FAILURE=0,
     ZOTHER_NODE_WHEN_WE_START=1,
@@ -45,13 +46,19 @@
     ZHEARTBEAT_FAILURE=4,
     ZLINK_FAILURE=5,
     ZOTHERNODE_FAILED_DURING_START=6,
-    ZMULTI_NODE_SHUTDOWN = 7
+    ZMULTI_NODE_SHUTDOWN = 7,
+    ZPARTITIONED_CLUSTER = 8
   };
   
 private:
   
   Uint32 failNodeId;
   Uint32 failCause;
+  /**
+   * Used when failCause == ZPARTITIONED_CLUSTER
+   */
+  Uint32 president;
+  Uint32 partition[NdbNodeBitmask::Size];
 };
 
 

--- 1.6/ndb/src/kernel/blocks/qmgr/Qmgr.hpp	2006-03-31 16:46:26 +02:00
+++ 1.7/ndb/src/kernel/blocks/qmgr/Qmgr.hpp	2006-04-03 11:26:27 +02:00
@@ -124,7 +124,7 @@
    *
    * i.e. nodes that connect to use, when we already have elected president
    */
-  NdbNodeBitmask c_cmregreq_nodes;
+  NdbNodeBitmask c_readnodes_nodes;
   
   Uint32 c_maxDynamicId;
   
@@ -233,6 +233,8 @@
   void execREAD_NODESREQ(Signal* signal);
   void execSET_VAR_REQ(Signal* signal);
 
+  void execREAD_NODESREF(Signal* signal);
+  void execREAD_NODESCONF(Signal* signal);
 
   void execAPI_VERSION_REQ(Signal* signal);
   void execAPI_BROADCAST_REP(Signal* signal);
@@ -249,6 +251,8 @@
   void execARBIT_STOPREP(Signal* signal);
 
   // Statement blocks
+  void check_readnodes_reply(Signal* signal, Uint32 nodeId, Uint32 gsn);
+
   void node_failed(Signal* signal, Uint16 aFailedNode);
   void checkStartInterface(Signal* signal);
   void failReport(Signal* signal,
@@ -268,7 +272,6 @@
   void startphase1(Signal* signal);
   void electionWon(Signal* signal);
   void cmInfoconf010Lab(Signal* signal);
-  bool check_cmregreq_reply(Signal* signal, Uint32 nodeId, Uint32 gsn);
   
   void apiHbHandlingLab(Signal* signal);
   void timerHandlingLab(Signal* signal);

--- 1.5/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp	2006-03-31 16:46:26 +02:00
+++ 1.6/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp	2006-04-03 11:26:27 +02:00
@@ -94,6 +94,9 @@
   addRecSignal(GSN_ARBIT_CHOOSEREF, &Qmgr::execARBIT_CHOOSEREF);
   addRecSignal(GSN_ARBIT_STOPREP, &Qmgr::execARBIT_STOPREP);
 
+  addRecSignal(GSN_READ_NODESREF, &Qmgr::execREAD_NODESREF);
+  addRecSignal(GSN_READ_NODESCONF, &Qmgr::execREAD_NODESCONF);
+  
   initData();
 }//Qmgr::Qmgr()
 

--- 1.19/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp	2006-03-31 16:46:26 +02:00
+++ 1.20/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp	2006-04-03 11:26:27 +02:00
@@ -369,13 +369,29 @@
   }
   
   ndbrequire(!c_start.m_nodes.isWaitingFor(nodeId));
-  ndbrequire(!c_cmregreq_nodes.get(nodeId));
-  c_cmregreq_nodes.set(nodeId);
-  sendCmRegReq(signal, nodeId);  
-  c_regReqReqSent--;
+  ndbrequire(!c_readnodes_nodes.get(nodeId));
+  c_readnodes_nodes.set(nodeId);
+  signal->theData[0] = reference();
+  sendSignal(calcQmgrBlockRef(nodeId), GSN_READ_NODESREQ, signal, 1, JBA);
   return;
 }//Qmgr::execCONNECT_REP()
 
+void
+Qmgr::execREAD_NODESCONF(Signal* signal)
+{
+  check_readnodes_reply(signal, 
+			refToNode(signal->getSendersBlockRef()),
+			GSN_READ_NODESCONF);
+}
+
+void
+Qmgr::execREAD_NODESREF(Signal* signal)
+{
+  check_readnodes_reply(signal, 
+			refToNode(signal->getSendersBlockRef()),
+			GSN_READ_NODESREF);
+}
+
 /*******************************/
 /* CM_INFOCONF                */
 /*******************************/
@@ -668,12 +684,6 @@
   const CmRegConf * const cmRegConf = (CmRegConf *)&signal->theData[0];
   Uint32 presidentNodeId = cmRegConf->presidentNodeId;
 
-  if (check_cmregreq_reply(signal, presidentNodeId, GSN_CM_REGCONF))
-  {
-    jam();
-    return;
-  }
-
   if (!ndbCompatible_ndb_ndb(NDB_VERSION, cmRegConf->presidentVersion)) {
     jam();
     char buf[128];
@@ -731,8 +741,8 @@
   return;
 }//Qmgr::execCM_REGCONF()
 
-bool
-Qmgr::check_cmregreq_reply(Signal* signal, Uint32 nodeId, Uint32 gsn)
+void
+Qmgr::check_readnodes_reply(Signal* signal, Uint32 nodeId, Uint32 gsn)
 {
   NodeRecPtr myNodePtr;
   myNodePtr.i = getOwnNodeId();
@@ -741,117 +751,65 @@
   NodeRecPtr nodePtr;
   nodePtr.i = nodeId;
   ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRec);
-  
-  /**
-   * Try to decide if replying node
-   *   knows who is president
-   */
-  Uint32 president_reply = RNIL;
-  switch(gsn){
-  case GSN_CM_REGREF:{
-    jam();
-    CmRegRef* ref = (CmRegRef*)signal->getDataPtr();
-    switch(ref->errorCode){
-    case CmRegRef::ZBUSY:
-    case CmRegRef::ZBUSY_PRESIDENT:
-    case CmRegRef::ZBUSY_TO_PRES:
-      jam();
-      /**
-       * Only president replies this
-       */
-      ndbrequire(nodeId == ref->presidentCandidate);
-      president_reply = nodeId;
-      break;
-    case CmRegRef::ZNOT_PRESIDENT:
-      jam();
-      president_reply = ref->presidentCandidate;
-      break;
-    case CmRegRef::ZNOT_IN_CFG:
-    case CmRegRef::ZNOT_DEAD:
-    case CmRegRef::ZELECTION:
-      // Neither of these replies give certain president knowledge
-      jam();
-    }
-    break;
-  }
-  case GSN_CM_REGCONF:
-    jam();
-    president_reply = nodeId;
-    break;
-  }
-  
-  char buf[256];
-  switch(c_start.m_gsn){
-  case GSN_CM_REGREQ:
-    jam();
-    ndbrequire(c_start.m_nodes.isWaitingFor(nodeId));
-    ndbrequire(c_cmregreq_nodes.isclear());    
-    ndbrequire(myNodePtr.p->phase == ZSTARTING);
-    return false;
-  case GSN_CM_NODEINFOREQ:
-    jam();
 
-    ndbrequire(myNodePtr.p->phase == ZSTARTING);
-    if (c_start.m_nodes.isWaitingFor(nodeId))
-    {
-      jam();
-      /**
-       * We're waiting for CM_NODEINFO
-       */
-      if (gsn == GSN_CM_REGREF)
-      {
-	jam();
-	return false;
-      }
-      
-      jam();
-      BaseString::snprintf(buf, sizeof(buf), 
-			   "Partitioned cluster! check StartPartialTimeout, "
-			   " received CM_REGCONF from %d"
-			   " while waiting for GSN_CM_NODEINFOCONF."
-			   " president=%d", 
-			   nodeId, cpresident);
-      goto die_direct;
-    }
-    
-    goto check_reply;
-  default:
-  case GSN_CM_NODEINFOCONF:
+  ndbrequire(c_readnodes_nodes.get(nodeId));
+  ReadNodesConf* conf = (ReadNodesConf*)signal->getDataPtr();
+  if (gsn == GSN_READ_NODESREF)
+  {
     jam();
-    ndbrequire(myNodePtr.p->phase == ZRUNNING);
-    goto check_reply;
+retry:
+    signal->theData[0] = reference();
+    sendSignal(calcQmgrBlockRef(nodeId), GSN_READ_NODESREQ, signal, 1, JBA);
+    return;
   }
   
-check_reply:
-  jam();
-  c_cmregreq_nodes.clear(nodeId);
-  
-  if (gsn == GSN_CM_REGCONF)
+  if (conf->masterNodeId == ZNIL)
   {
     jam();
-    BaseString::snprintf(buf, sizeof(buf),
-			 "Partitioned cluster! check StartPartialTimeout, "
-			 " received CM_REGCONF"
-			 " from %d I think president: %d",
-			 nodeId, cpresident);
-    goto die_direct;
+    goto retry;
   }
   
-  if (president_reply != RNIL && president_reply != cpresident)
+  Uint32 president = conf->masterNodeId;
+  if (president == cpresident)
   {
     jam();
-    BaseString::snprintf(buf, sizeof(buf),
-			 "Partitioned cluster! check StartPartialTimeout, "
-			 " received CM_REGREF from %d specifying president as"
-			 " %d, president: %d",
-			 nodeId, president_reply, cpresident);
-    goto die_direct;
+    c_readnodes_nodes.clear(nodeId);
+    return;
   }
-  
-  return true;
 
-die_direct:
+  char buf[255];
+  BaseString::snprintf(buf, sizeof(buf),
+		       "Partitioned cluster! check StartPartialTimeout, "
+		       " node %d thinks %d is president, "
+		       " I think president is: %d",
+		       nodeId, president, cpresident);
+
   ndbout_c(buf);
+  CRASH_INSERTION(933);
+
+  if (getNodeState().startLevel == NodeState::SL_STARTED)
+  {
+    jam();
+    NdbNodeBitmask part;
+    part.assign(NdbNodeBitmask::Size, conf->clusterNodes);
+    FailRep* rep = (FailRep*)signal->getDataPtrSend();
+    rep->failCause = FailRep::ZPARTITIONED_CLUSTER;
+    rep->president = cpresident;
+    c_clusterNodes.copyto(NdbNodeBitmask::Size, rep->partition);
+    Uint32 ref = calcQmgrBlockRef(nodeId);
+    Uint32 i = 0;
+    while((i = part.find(i + 1)) != NdbNodeBitmask::NotFound)
+    {
+      if (i == nodeId)
+	continue;
+      rep->failNodeId = i;
+      sendSignal(ref, GSN_FAIL_REP, signal, FailRep::SignalLength, JBA);
+    }
+    rep->failNodeId = nodeId;
+    sendSignal(ref, GSN_FAIL_REP, signal, FailRep::SignalLength, JBB);
+    return;
+  }
+  
   CRASH_INSERTION(932);
   
   progError(__LINE__, 
@@ -899,12 +857,6 @@
   Uint32 candidate = signal->theData[3];
   DEBUG_START3(signal, TrefuseReason);
 
-  if (check_cmregreq_reply(signal, TaddNodeno, GSN_CM_REGREF))
-  {
-    jam();
-    return;
-  }
-
   c_regReqReqRecv++;
 
   // Ignore block reference in data[0]
@@ -2069,7 +2021,7 @@
   const DisconnectRep * const rep = (DisconnectRep *)&signal->theData[0];
   const Uint32 nodeId = rep->nodeId;
   c_connectedNodes.clear(nodeId);
-  c_cmregreq_nodes.clear(nodeId);
+  c_readnodes_nodes.clear(nodeId);
   
   NodeRecPtr nodePtr;
   nodePtr.i = getOwnNodeId();
@@ -2342,13 +2294,16 @@
 
   failedNodePtr.i = aFailedNode;
   ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
+  FailRep* rep = (FailRep*)signal->getDataPtr();
 
   check_multi_node_shutdown(signal);
   
   if (failedNodePtr.i == getOwnNodeId()) {
     jam();
 
+    Uint32 code = 0;
     const char * msg = 0;
+    char extra[100];
     switch(aFailCause){
     case FailRep::ZOWN_FAILURE: 
       msg = "Own failure"; 
@@ -2369,17 +2324,46 @@
     case FailRep::ZLINK_FAILURE:
       msg = "Connection failure";
       break;
+    case FailRep::ZPARTITIONED_CLUSTER:
+    {
+      code = ERR_ARBIT_SHUTDOWN;
+      char buf1[100], buf2[100];
+      c_clusterNodes.getText(buf1);
+      if (signal->getLength()== FailRep::SignalLength + FailRep::ExtraLength &&
+	  signal->header.theVerId_signalNumber == GSN_FAIL_REP)
+      {
+	jam();
+	NdbNodeBitmask part;
+	part.assign(NdbNodeBitmask::Size, rep->partition);
+	part.getText(buf2);
+	BaseString::snprintf(extra, sizeof(extra),
+			     "Partitioned cluster!"
+			     " Our cluster: %s other cluster: %s",
+			     buf1, buf2);
+      }
+      else
+      {
+	jam();
+	BaseString::snprintf(extra, sizeof(extra),
+			     "Partitioned cluster!"
+			     " Our cluster: %s ", buf1);
+      }
+      msg = extra;
+      break;
+    }
     }
     
-    char buf[100];
-    BaseString::snprintf(buf, 100, 
+    CRASH_INSERTION(932);
+
+    char buf[255];
+    BaseString::snprintf(buf, sizeof(buf), 
 			 "We(%u) have been declared dead by %u reason: %s(%u)",
 			 getOwnNodeId(),
 			 refToNode(signal->getSendersBlockRef()),
 			 aFailCause,
 			 msg ? msg : "<Unknown>");
-
-    progError(__LINE__, 0, buf);
+    
+    progError(__LINE__, code, buf);
     return;
   }//if
   

--- 1.18/ndb/test/ndbapi/testNodeRestart.cpp	2006-03-31 16:46:26 +02:00
+++ 1.19/ndb/test/ndbapi/testNodeRestart.cpp	2006-04-03 11:26:27 +02:00
@@ -753,13 +753,13 @@
       if (restarter.dumpStateAllNodes(dump, 2))
 	return NDBT_FAILED;
 
-    if (restarter.waitClusterNoStart())
+    if (restarter.waitNodesNoStart(partition0, cnt/2))
       return NDBT_FAILED;
     
     for (Uint32 i = 0; i<cnt/2; i++)
       if (restarter.restartOneDbNode(partition0[i], true, true, true))
 	return NDBT_FAILED;
-
+    
     if (restarter.waitNodesNoStart(partition0, cnt/2))
       return NDBT_FAILED;
     
Thread
bk commit into 4.1 tree (jonas:1.2488) BUG#18612jonas3 Apr