List:Commits« Previous MessageNext Message »
From:jonas Date:March 31 2006 4:46pm
Subject:bk commit into 4.1 tree (jonas:1.2486) BUG#18612
View as plain text  
Below is the list of changes that have just been committed into a local
4.1 repository of jonas. When jonas does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet
  1.2486 06/03/31 16:46:28 jonas@stripped +10 -0
  ndb - bug#18612 (detection of partitioned cluster)
    This also implements GCP-safe multi-node shutdown:
    1) block GCP
    2) wait for any ongoing GCP to complete
    3) inform all stopping QMGRs (so that they don't start with error handler)
    4) wait for all QMGRs to reply
    5) broadcast FAIL_REP for the stopping nodes
    6) (if the master did not die) unblock GCP
  
    

  ndb/test/ndbapi/testNodeRestart.cpp
    1.18 06/03/31 16:46:26 jonas@stripped +111 -1
    test program for bug#18612

  ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
    1.19 06/03/31 16:46:26 jonas@stripped +45 -1
    Impl. GCP safe multi node shutdown in order to test bug#18612

  ndb/src/kernel/blocks/qmgr/QmgrInit.cpp
    1.5 06/03/31 16:46:26 jonas@stripped +2 -0
    Impl. GCP safe multi node shutdown in order to test bug#18612

  ndb/src/kernel/blocks/qmgr/Qmgr.hpp
    1.6 06/03/31 16:46:26 jonas@stripped +5 -1
    Impl. GCP safe multi node shutdown in order to test bug#18612

  ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp
    1.16 06/03/31 16:46:26 jonas@stripped +301 -50
    Impl. GCP safe multi node shutdown in order to test bug#18612

  ndb/src/kernel/blocks/ndbcntr/NdbcntrInit.cpp
    1.7 06/03/31 16:46:26 jonas@stripped +1 -0
    Impl. GCP safe multi node shutdown in order to test bug#18612

  ndb/src/kernel/blocks/ndbcntr/Ndbcntr.hpp
    1.4 06/03/31 16:46:26 jonas@stripped +11 -0
    Impl. GCP safe multi node shutdown in order to test bug#18612

  ndb/include/kernel/signaldata/StopReq.hpp
    1.3 06/03/31 16:46:25 jonas@stripped +22 -16
    Impl. GCP safe multi node shutdown in order to test bug#18612

  ndb/include/kernel/signaldata/FailRep.hpp
    1.2 06/03/31 16:46:25 jonas@stripped +4 -2
    Impl. GCP safe multi node shutdown in order to test bug#18612

  ndb/include/kernel/signaldata/DumpStateOrd.hpp
    1.8 06/03/31 16:46:25 jonas@stripped +1 -0
    Impl. GCP safe multi node shutdown in order to test bug#18612

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User:	jonas
# Host:	perch.ndb.mysql.com
# Root:	/home/jonas/src/41-work

--- 1.7/ndb/include/kernel/signaldata/DumpStateOrd.hpp	2006-03-21 15:13:39 +01:00
+++ 1.8/ndb/include/kernel/signaldata/DumpStateOrd.hpp	2006-03-31 16:46:25 +02:00
@@ -64,6 +64,7 @@
     // 19 NDBFS Fipple with O_SYNC, O_CREATE etc.
     // 20-24 BACKUP
     NdbcntrTestStopOnError = 25,
+    NdbcntrStopNodes = 70,
     // 100-105 TUP and ACC  
     // 200-240 UTIL
     // 300-305 TRIX

--- 1.1/ndb/include/kernel/signaldata/FailRep.hpp	2004-04-14 10:23:54 +02:00
+++ 1.2/ndb/include/kernel/signaldata/FailRep.hpp	2006-03-31 16:46:25 +02:00
@@ -27,6 +27,7 @@
    * Sender(s) & Reciver(s)
    */
   friend class Qmgr;
+  friend class Ndbcntr;
   
   /**
    * For printing
@@ -43,9 +44,10 @@
     ZSTART_IN_REGREQ=3,
     ZHEARTBEAT_FAILURE=4,
     ZLINK_FAILURE=5,
-    ZOTHERNODE_FAILED_DURING_START=6
+    ZOTHERNODE_FAILED_DURING_START=6,
+    ZMULTI_NODE_SHUTDOWN = 7
   };
-
+  
 private:
   
   Uint32 failNodeId;

--- 1.2/ndb/include/kernel/signaldata/StopReq.hpp	2005-09-12 16:08:12 +02:00
+++ 1.3/ndb/include/kernel/signaldata/StopReq.hpp	2006-03-31 16:46:25 +02:00
@@ -32,7 +32,7 @@
   friend class MgmtSrvr;
 
 public:
-  STATIC_CONST( SignalLength = 9 );
+  STATIC_CONST( SignalLength = 9 + NdbNodeBitmask::Size);
   
 public:
   Uint32 senderRef;
@@ -49,29 +49,34 @@
   Int32 readOperationTimeout; // Timeout before read operations are aborted
   Int32 operationTimeout;     // Timeout before all operations are aborted
 
+  Uint32 nodes[NdbNodeBitmask::Size];
+
   static void setSystemStop(Uint32 & requestInfo, bool value);
   static void setPerformRestart(Uint32 & requestInfo, bool value);
   static void setNoStart(Uint32 & requestInfo, bool value);
   static void setInitialStart(Uint32 & requestInfo, bool value);
-  static void setEscalateOnNodeFail(Uint32 & requestInfo, bool value);
   /**
    * Don't perform "graceful" shutdown/restart...
    */
   static void setStopAbort(Uint32 & requestInfo, bool value);
+  static void setStopNodes(Uint32 & requestInfo, bool value);
 
   static bool getSystemStop(const Uint32 & requestInfo);
   static bool getPerformRestart(const Uint32 & requestInfo);
   static bool getNoStart(const Uint32 & requestInfo);
   static bool getInitialStart(const Uint32 & requestInfo);
-  static bool getEscalateOnNodeFail(const Uint32 & requestInfo);
   static bool getStopAbort(const Uint32 & requestInfo);
+  static bool getStopNodes(const Uint32 & requestInfo);
 };
 
 struct StopConf
 {
   STATIC_CONST( SignalLength = 2 );
   Uint32 senderData;
-  Uint32 nodeState;
+  union {
+    Uint32 nodeState;
+    Uint32 nodeId;
+  };
 };
 
 class StopRef 
@@ -94,7 +99,9 @@
     NodeShutdownInProgress = 1,
     SystemShutdownInProgress = 2,
     NodeShutdownWouldCauseSystemCrash = 3,
-    TransactionAbortFailed = 4
+    TransactionAbortFailed = 4,
+    UnsupportedNodeShutdown = 5,
+    MultiNodeShutdownNotMaster = 6
   };
   
 public:
@@ -132,16 +139,16 @@
 
 inline
 bool
-StopReq::getEscalateOnNodeFail(const Uint32 & requestInfo)
+StopReq::getStopAbort(const Uint32 & requestInfo)
 {
-  return requestInfo & 16;
+  return requestInfo & 32;
 }
 
 inline
 bool
-StopReq::getStopAbort(const Uint32 & requestInfo)
+StopReq::getStopNodes(const Uint32 & requestInfo)
 {
-  return requestInfo & 32;
+  return requestInfo & 64;
 }
 
 
@@ -187,24 +194,23 @@
 
 inline
 void
-StopReq::setEscalateOnNodeFail(Uint32 & requestInfo, bool value)
+StopReq::setStopAbort(Uint32 & requestInfo, bool value)
 {
   if(value)
-    requestInfo |= 16;
+    requestInfo |= 32;
   else
-    requestInfo &= ~16;
+    requestInfo &= ~32;
 }
 
 inline
 void
-StopReq::setStopAbort(Uint32 & requestInfo, bool value)
+StopReq::setStopNodes(Uint32 & requestInfo, bool value)
 {
   if(value)
-    requestInfo |= 32;
+    requestInfo |= 64;
   else
-    requestInfo &= ~32;
+    requestInfo &= ~64;
 }
-
 
 #endif
 

--- 1.3/ndb/src/kernel/blocks/ndbcntr/Ndbcntr.hpp	2005-08-31 16:14:59 +02:00
+++ 1.4/ndb/src/kernel/blocks/ndbcntr/Ndbcntr.hpp	2006-03-31 16:46:26 +02:00
@@ -202,6 +202,7 @@
   void execWAIT_GCP_CONF(Signal* signal);
 
   void execSTOP_REQ(Signal* signal);
+  void execSTOP_CONF(Signal* signal);
   void execRESUME_REQ(Signal* signal);
 
   void execCHANGE_NODE_STATE_CONF(Signal* signal);
@@ -337,6 +338,16 @@
     void progError(int line, int cause, const char * extra) { 
       cntr.progError(line, cause, extra); 
     }
+
+    enum StopNodesStep {
+      SR_BLOCK_GCP_START_GCP = 0,
+      SR_WAIT_COMPLETE_GCP = 1,
+      SR_UNBLOCK_GCP_START_GCP = 2,
+      SR_QMGR_STOP_REQ = 3,
+      SR_WAIT_NODE_FAILURES = 4,
+      SR_CLUSTER_SHUTDOWN = 12
+    } m_state;
+    SignalCounter m_stop_req_counter;
   };
 private:
   StopRecord c_stopRec;

--- 1.6/ndb/src/kernel/blocks/ndbcntr/NdbcntrInit.cpp	2005-08-31 16:14:59 +02:00
+++ 1.7/ndb/src/kernel/blocks/ndbcntr/NdbcntrInit.cpp	2006-03-31 16:46:26 +02:00
@@ -86,6 +86,7 @@
   addRecSignal(GSN_STOP_ME_CONF, &Ndbcntr::execSTOP_ME_CONF);
 
   addRecSignal(GSN_STOP_REQ, &Ndbcntr::execSTOP_REQ);
+  addRecSignal(GSN_STOP_CONF, &Ndbcntr::execSTOP_CONF);
   addRecSignal(GSN_RESUME_REQ, &Ndbcntr::execRESUME_REQ);
 
   addRecSignal(GSN_WAIT_GCP_REF, &Ndbcntr::execWAIT_GCP_REF);

--- 1.15/ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp	2005-09-12 16:08:12 +02:00
+++ 1.16/ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp	2006-03-31 16:46:26 +02:00
@@ -42,6 +42,8 @@
 #include <signaldata/FsRemoveReq.hpp>
 #include <signaldata/ReadConfig.hpp>
 
+#include <signaldata/FailRep.hpp>
+
 #include <AttributeHeader.hpp>
 #include <Configuration.hpp>
 #include <DebuggerNames.hpp>
@@ -1454,13 +1456,74 @@
   sendSignal(SUMA_REF, GSN_NODE_FAILREP, signal,
 	     NodeFailRep::SignalLength, JBB);
 
+  if (c_stopRec.stopReq.senderRef)
+  {
+    jam();
+    switch(c_stopRec.m_state){
+    case StopRecord::SR_WAIT_NODE_FAILURES:
+    {
+      jam();
+      NdbNodeBitmask tmp;
+      tmp.assign(NdbNodeBitmask::Size, c_stopRec.stopReq.nodes);
+      tmp.bitANDC(allFailed);      
+      tmp.copyto(NdbNodeBitmask::Size, c_stopRec.stopReq.nodes);
+      
+      if (tmp.isclear())
+      {
+	jam();
+	if (c_stopRec.stopReq.senderRef != RNIL)
+	{
+	  jam();
+	  StopConf * const stopConf = (StopConf *)&signal->theData[0];
+	  stopConf->senderData = c_stopRec.stopReq.senderData;
+	  stopConf->nodeState  = (Uint32) NodeState::SL_SINGLEUSER;
+	  sendSignal(c_stopRec.stopReq.senderRef, GSN_STOP_CONF, signal, 
+		     StopConf::SignalLength, JBB);
+	}
+
+	c_stopRec.stopReq.senderRef = 0;
+	WaitGCPReq * req = (WaitGCPReq*)&signal->theData[0];
+	req->senderRef = reference();
+	req->senderData = StopRecord::SR_UNBLOCK_GCP_START_GCP;
+	req->requestType = WaitGCPReq::UnblockStartGcp;
+	sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal, 
+		   WaitGCPReq::SignalLength, JBA);
+      }
+      break;
+    }
+    case StopRecord::SR_QMGR_STOP_REQ:
+    {
+      NdbNodeBitmask tmp;
+      tmp.assign(NdbNodeBitmask::Size, c_stopRec.stopReq.nodes);
+      tmp.bitANDC(allFailed);      
+
+      if (tmp.isclear())
+      {
+	Uint32 nodeId = allFailed.find(0);
+	tmp.set(nodeId);
+
+	StopConf* conf = (StopConf*)signal->getDataPtrSend();
+	conf->senderData = c_stopRec.stopReq.senderData;
+	conf->nodeId = nodeId;
+	sendSignal(reference(), 
+		   GSN_STOP_CONF, signal, StopConf::SignalLength, JBB);
+      }
+
+      tmp.copyto(NdbNodeBitmask::Size, c_stopRec.stopReq.nodes);
+      
+      break;
+    }
+    }
+  }
+  
+  signal->theData[0] = EventReport::NODE_FAILREP;
+  signal->theData[2] = 0;
+  
   Uint32 nodeId = 0;
   while(!allFailed.isclear()){
     nodeId = allFailed.find(nodeId + 1);
     allFailed.clear(nodeId);
-    signal->theData[0] = EventReport::NODE_FAILREP;
     signal->theData[1] = nodeId;
-    signal->theData[2] = 0;
     sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
   }//for
 
@@ -1908,13 +1971,15 @@
 Ndbcntr::execDUMP_STATE_ORD(Signal* signal)
 {
   DumpStateOrd * const & dumpState = (DumpStateOrd *)&signal->theData[0];
-  if(signal->theData[0] == 13){
+  Uint32 arg = dumpState->args[0];
+
+  if(arg == 13){
     infoEvent("Cntr: cstartPhase = %d, cinternalStartphase = %d, block = %d", 
 	      cstartPhase, cinternalStartphase, cndbBlocksCount);
     infoEvent("Cntr: cmasterNodeId = %d", cmasterNodeId);
   }
 
-  if (dumpState->args[0] == DumpStateOrd::NdbcntrTestStopOnError){
+  if (arg == DumpStateOrd::NdbcntrTestStopOnError){
     if (theConfiguration.stopOnError() == true)
       ((Configuration&)theConfiguration).stopOnError(false);
     
@@ -1927,6 +1992,28 @@
 	       SystemError::SignalLength, JBA);
   }
 
+  if (arg == DumpStateOrd::NdbcntrStopNodes)
+  {
+    NdbNodeBitmask mask;
+    for(Uint32 i = 1; i<signal->getLength(); i++)
+      mask.set(signal->theData[i]);
+
+    StopReq* req = (StopReq*)signal->getDataPtrSend();
+    req->senderRef = RNIL;
+    req->senderData = 123;
+    req->requestInfo = 0;
+    req->singleuser = 0;
+    req->singleUserApi = 0;
+    mask.copyto(NdbNodeBitmask::Size, req->nodes);
+    StopReq::setPerformRestart(req->requestInfo, 1);
+    StopReq::setNoStart(req->requestInfo, 1);
+    StopReq::setStopNodes(req->requestInfo, 1);
+    StopReq::setStopAbort(req->requestInfo, 1);
+    
+    sendSignal(reference(), GSN_STOP_REQ, signal,
+	       StopReq::SignalLength, JBB);
+    return;
+  }
 
 }//Ndbcntr::execDUMP_STATE_ORD()
 
@@ -1987,9 +2074,12 @@
   Uint32 senderData = req->senderData;
   BlockReference senderRef = req->senderRef;
   bool abort = StopReq::getStopAbort(req->requestInfo);
+  bool stopnodes = StopReq::getStopNodes(req->requestInfo);
 
-  if(getNodeState().startLevel < NodeState::SL_STARTED || 
-     abort && !singleuser){
+  if(!singleuser && 
+     (getNodeState().startLevel < NodeState::SL_STARTED || 
+      (abort && !stopnodes)))
+  {
     /**
      * Node is not started yet
      *
@@ -2028,21 +2118,71 @@
     else
       ref->errorCode = StopRef::NodeShutdownInProgress;
     ref->senderData = senderData;
-    sendSignal(senderRef, GSN_STOP_REF, signal, StopRef::SignalLength, JBB);
+    
+    if (senderRef != RNIL)
+      sendSignal(senderRef, GSN_STOP_REF, signal, StopRef::SignalLength, JBB);
+    return;
+  }
+
+  if (stopnodes && !abort)
+  {
+    jam();
+    ref->errorCode = StopRef::UnsupportedNodeShutdown;
+    ref->senderData = senderData;
+    if (senderRef != RNIL)
+      sendSignal(senderRef, GSN_STOP_REF, signal, StopRef::SignalLength, JBB);
+    return;
+  }
+
+  if (stopnodes && cmasterNodeId != getOwnNodeId())
+  {
+    jam();
+    ref->errorCode = StopRef::MultiNodeShutdownNotMaster;
+    ref->senderData = senderData;
+    if (senderRef != RNIL)
+      sendSignal(senderRef, GSN_STOP_REF, signal, StopRef::SignalLength, JBB);
     return;
   }
   
   c_stopRec.stopReq = * req;
   c_stopRec.stopInitiatedTime = NdbTick_CurrentMillisecond();
   
-  if(!singleuser) {
-    if(StopReq::getSystemStop(c_stopRec.stopReq.requestInfo)) {
+  if (stopnodes)
+  {
+    jam();
+
+    if(!c_stopRec.checkNodeFail(signal))
+    {
       jam();
-      if(StopReq::getPerformRestart(c_stopRec.stopReq.requestInfo)){
+      return;
+    }
+
+    char buf[100];
+    NdbNodeBitmask mask;
+    mask.assign(NdbNodeBitmask::Size, c_stopRec.stopReq.nodes);
+    infoEvent("Initiating shutdown abort of %s", mask.getText(buf));
+    ndbout_c("Initiating shutdown abort of %s", mask.getText(buf));    
+
+    WaitGCPReq * req = (WaitGCPReq*)&signal->theData[0];
+    req->senderRef = reference();
+    req->senderData = StopRecord::SR_BLOCK_GCP_START_GCP;
+    req->requestType = WaitGCPReq::BlockStartGcp;
+    sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal, 
+	       WaitGCPReq::SignalLength, JBB);
+    return;
+  }
+  else if(!singleuser) 
+  {
+    if(StopReq::getSystemStop(c_stopRec.stopReq.requestInfo)) 
+    {
+      jam();
+      if(StopReq::getPerformRestart(c_stopRec.stopReq.requestInfo))
+      {
 	((Configuration&)theConfiguration).stopOnError(false);
       }
     }
-    if(!c_stopRec.checkNodeFail(signal)){
+    if(!c_stopRec.checkNodeFail(signal))
+    {
       jam();
       return;
     }
@@ -2112,7 +2252,17 @@
    */
   NodeBitmask ndbMask; 
   ndbMask.assign(cntr.c_startedNodes);
-  ndbMask.clear(cntr.getOwnNodeId());
+
+  if (StopReq::getStopNodes(stopReq.requestInfo))
+  {
+    NdbNodeBitmask tmp;
+    tmp.assign(NdbNodeBitmask::Size, stopReq.nodes);
+    ndbMask.bitANDC(tmp);
+  }
+  else
+  {
+    ndbMask.clear(cntr.getOwnNodeId());
+  }
   
   CheckNodeGroups* sd = (CheckNodeGroups*)&signal->theData[0];
   sd->blockRef = cntr.reference();
@@ -2134,7 +2284,8 @@
   ref->errorCode = StopRef::NodeShutdownWouldCauseSystemCrash;
   
   const BlockReference bref = stopReq.senderRef;
-  cntr.sendSignal(bref, GSN_STOP_REF, signal, StopRef::SignalLength, JBB);
+  if (bref != RNIL)
+    cntr.sendSignal(bref, GSN_STOP_REF, signal, StopRef::SignalLength, JBB);
   
   stopReq.senderRef = 0;
 
@@ -2184,23 +2335,23 @@
     if(stopReq.getSystemStop(stopReq.requestInfo)  || stopReq.singleuser){
       jam();
       if(stopReq.singleuser) 
-	{
-	  jam();
-	   AbortAllReq * req = (AbortAllReq*)&signal->theData[0];
-	   req->senderRef = cntr.reference();
-	   req->senderData = 12;
-	   cntr.sendSignal(DBTC_REF, GSN_ABORT_ALL_REQ, signal, 
-		      AbortAllReq::SignalLength, JBB);
-	} 
+      {
+	jam();
+	AbortAllReq * req = (AbortAllReq*)&signal->theData[0];
+	req->senderRef = cntr.reference();
+	req->senderData = 12;
+	cntr.sendSignal(DBTC_REF, GSN_ABORT_ALL_REQ, signal, 
+			AbortAllReq::SignalLength, JBB);
+      } 
       else
-	{
-	  WaitGCPReq * req = (WaitGCPReq*)&signal->theData[0];
-	  req->senderRef = cntr.reference();
-	  req->senderData = 12;
-	  req->requestType = WaitGCPReq::CompleteForceStart;
-	  cntr.sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal, 
-			  WaitGCPReq::SignalLength, JBB);
-	}
+      {
+	WaitGCPReq * req = (WaitGCPReq*)&signal->theData[0];
+	req->senderRef = cntr.reference();
+	req->senderData = StopRecord::SR_CLUSTER_SHUTDOWN;
+	req->requestType = WaitGCPReq::CompleteForceStart;
+	cntr.sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal, 
+			WaitGCPReq::SignalLength, JBB);
+      }
     } else {
       jam();
       StopPermReq * req = (StopPermReq*)&signal->theData[0];
@@ -2362,7 +2513,7 @@
 
   WaitGCPReq * req = (WaitGCPReq*)&signal->theData[0];
   req->senderRef = reference();
-  req->senderData = 12;
+  req->senderData = StopRecord::SR_CLUSTER_SHUTDOWN;
   req->requestType = WaitGCPReq::CompleteForceStart;
   sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal, 
 	     WaitGCPReq::SignalLength, JBB);
@@ -2371,29 +2522,129 @@
 void Ndbcntr::execWAIT_GCP_CONF(Signal* signal){
   jamEntry();
 
-  ndbrequire(StopReq::getSystemStop(c_stopRec.stopReq.requestInfo));
-  NodeState newState(NodeState::SL_STOPPING_3, true); 
+  WaitGCPConf* conf = (WaitGCPConf*)signal->getDataPtr();
 
-  /**
-   * Inform QMGR so that arbitrator won't kill us
-   */
-  NodeStateRep * rep = (NodeStateRep *)&signal->theData[0];
-  rep->nodeState = newState;
-  rep->nodeState.masterNodeId = cmasterNodeId;
-  rep->nodeState.setNodeGroup(c_nodeGroup);
-  EXECUTE_DIRECT(QMGR, GSN_NODE_STATE_REP, signal, NodeStateRep::SignalLength);
-
-  if(StopReq::getPerformRestart(c_stopRec.stopReq.requestInfo)){
-    jam();
-    StartOrd * startOrd = (StartOrd *)&signal->theData[0];
-    startOrd->restartInfo = c_stopRec.stopReq.requestInfo;
-    sendSignalWithDelay(CMVMI_REF, GSN_START_ORD, signal, 500, 
-			StartOrd::SignalLength);
-  } else {
+  switch(conf->senderData){
+  case StopRecord::SR_BLOCK_GCP_START_GCP:
+  {
+    jam();
+    /**
+     * 
+     */
+    if(!c_stopRec.checkNodeFail(signal))
+    {
+      jam();
+      goto unblock;
+    }
+    
+    WaitGCPReq * req = (WaitGCPReq*)&signal->theData[0];
+    req->senderRef = reference();
+    req->senderData = StopRecord::SR_WAIT_COMPLETE_GCP;
+    req->requestType = WaitGCPReq::CompleteIfRunning;
+
+    sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal, 
+	       WaitGCPReq::SignalLength, JBB);
+    return;
+  }
+  case StopRecord::SR_UNBLOCK_GCP_START_GCP:
+  {
     jam();
-    sendSignalWithDelay(CMVMI_REF, GSN_STOP_ORD, signal, 500, 1);
+    return;
+  }
+  case StopRecord::SR_WAIT_COMPLETE_GCP:
+  {
+    jam();
+    if(!c_stopRec.checkNodeFail(signal))
+    {
+      jam();
+      goto unblock;
+    }
+
+    NdbNodeBitmask tmp;
+    tmp.assign(NdbNodeBitmask::Size, c_stopRec.stopReq.nodes);
+    c_stopRec.m_stop_req_counter = tmp;
+    NodeReceiverGroup rg(QMGR, tmp);
+    StopReq * stopReq = (StopReq *)&signal->theData[0];
+    * stopReq = c_stopRec.stopReq;
+    stopReq->senderRef = reference();
+    sendSignal(rg, GSN_STOP_REQ, signal, StopReq::SignalLength, JBA);
+    c_stopRec.m_state = StopRecord::SR_QMGR_STOP_REQ; 
+    return;
+  }
+  case StopRecord::SR_CLUSTER_SHUTDOWN:
+  {
+    jam();
+    break;
+  }
+  }
+  
+  {  
+    ndbrequire(StopReq::getSystemStop(c_stopRec.stopReq.requestInfo));
+    NodeState newState(NodeState::SL_STOPPING_3, true); 
+    
+    /**
+     * Inform QMGR so that arbitrator won't kill us
+     */
+    NodeStateRep * rep = (NodeStateRep *)&signal->theData[0];
+    rep->nodeState = newState;
+    rep->nodeState.masterNodeId = cmasterNodeId;
+    rep->nodeState.setNodeGroup(c_nodeGroup);
+    EXECUTE_DIRECT(QMGR, GSN_NODE_STATE_REP, signal, 
+		   NodeStateRep::SignalLength);
+    
+    if(StopReq::getPerformRestart(c_stopRec.stopReq.requestInfo)){
+      jam();
+      StartOrd * startOrd = (StartOrd *)&signal->theData[0];
+      startOrd->restartInfo = c_stopRec.stopReq.requestInfo;
+      sendSignalWithDelay(CMVMI_REF, GSN_START_ORD, signal, 500, 
+			  StartOrd::SignalLength);
+    } else {
+      jam();
+      sendSignalWithDelay(CMVMI_REF, GSN_STOP_ORD, signal, 500, 1);
+    }
+    return;
+  }
+  
+unblock:
+  WaitGCPReq * req = (WaitGCPReq*)&signal->theData[0];
+  req->senderRef = reference();
+  req->senderData = StopRecord::SR_UNBLOCK_GCP_START_GCP;
+  req->requestType = WaitGCPReq::UnblockStartGcp;
+  sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal, 
+	     WaitGCPReq::SignalLength, JBB);
+}
+
+void
+Ndbcntr::execSTOP_CONF(Signal* signal)
+{
+  jamEntry();
+  StopConf *conf = (StopConf*)signal->getDataPtr();
+  ndbrequire(c_stopRec.m_state == StopRecord::SR_QMGR_STOP_REQ);
+  c_stopRec.m_stop_req_counter.clearWaitingFor(conf->nodeId);
+  if (c_stopRec.m_stop_req_counter.done())
+  {
+    char buf[100];
+    NdbNodeBitmask mask;
+    mask.assign(NdbNodeBitmask::Size, c_stopRec.stopReq.nodes);
+    infoEvent("Stopping of %s", mask.getText(buf));
+    ndbout_c("Stopping of %s", mask.getText(buf));    
+
+    /**
+     * Kill any node...
+     */
+    FailRep * const failRep = (FailRep *)&signal->theData[0];
+    failRep->failCause = FailRep::ZMULTI_NODE_SHUTDOWN;
+    NodeReceiverGroup rg(QMGR, c_clusterNodes);
+    Uint32 nodeId = 0;
+    while ((nodeId = NdbNodeBitmask::find(c_stopRec.stopReq.nodes, nodeId+1))
+	   != NdbNodeBitmask::NotFound)
+    {
+      failRep->failNodeId = nodeId;
+      sendSignal(rg, GSN_FAIL_REP, signal, FailRep::SignalLength, JBA);
+    }
+    c_stopRec.m_state = StopRecord::SR_WAIT_NODE_FAILURES;
+    return;
   }
-  return;
 }
 
 void Ndbcntr::execSTTORRY(Signal* signal){

--- 1.5/ndb/src/kernel/blocks/qmgr/Qmgr.hpp	2006-03-30 14:20:52 +02:00
+++ 1.6/ndb/src/kernel/blocks/qmgr/Qmgr.hpp	2006-03-31 16:46:26 +02:00
@@ -29,6 +29,7 @@
 #include <signaldata/CmRegSignalData.hpp>
 #include <signaldata/ApiRegSignalData.hpp>
 #include <signaldata/FailRep.hpp>
+#include <signaldata/StopReq.hpp>
 
 #include "timer.hpp"
 
@@ -218,6 +219,7 @@
   void execPRES_TOCONF(Signal* signal);
   void execDISCONNECT_REP(Signal* signal);
   void execSYSTEM_ERROR(Signal* signal);
+  void execSTOP_REQ(Signal* signal);
 
   // Received signals
   void execDUMP_STATE_ORD(Signal* signal);
@@ -402,7 +404,9 @@
   Uint16 cfailedNodes[MAX_NDB_NODES];
   Uint16 cprepFailedNodes[MAX_NDB_NODES];
   Uint16 ccommitFailedNodes[MAX_NDB_NODES];
-
+  
+  StopReq c_stopReq;
+  void check_multi_node_shutdown(Signal* signal);
 };
 
 #endif

--- 1.4/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp	2005-08-18 14:02:20 +02:00
+++ 1.5/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp	2006-03-31 16:46:26 +02:00
@@ -35,6 +35,7 @@
 
   Uint32 hbDBAPI = 500;
   setHbApiDelay(hbDBAPI);
+  c_stopReq.senderRef = 0;
 }//Qmgr::initData()
 
 void Qmgr::initRecords() 
@@ -49,6 +50,7 @@
 
   // Transit signals
   addRecSignal(GSN_DUMP_STATE_ORD, &Qmgr::execDUMP_STATE_ORD);
+  addRecSignal(GSN_STOP_REQ, &Qmgr::execSTOP_REQ);
   addRecSignal(GSN_DEBUG_SIG, &Qmgr::execDEBUG_SIG);
   addRecSignal(GSN_CONTINUEB, &Qmgr::execCONTINUEB);
   addRecSignal(GSN_CM_HEARTBEAT, &Qmgr::execCM_HEARTBEAT);

--- 1.18/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp	2006-03-31 11:39:34 +02:00
+++ 1.19/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp	2006-03-31 16:46:26 +02:00
@@ -2342,6 +2342,9 @@
 
   failedNodePtr.i = aFailedNode;
   ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
+
+  check_multi_node_shutdown(signal);
+  
   if (failedNodePtr.i == getOwnNodeId()) {
     jam();
 
@@ -2433,7 +2436,9 @@
 {
   NodeRecPtr myNodePtr;
   jamEntry();
-
+  
+  check_multi_node_shutdown(signal);
+  
   PrepFailReqRef * const prepFail = (PrepFailReqRef *)&signal->theData[0];
 
   BlockReference Tblockref  = prepFail->xxxBlockRef;
@@ -4085,6 +4090,8 @@
   if (! (arbitRec.getTimediff() > getArbitTimeout()))
     return;
 #endif
+  CRASH_INSERTION(932);
+
   progError(__LINE__, ERR_ARBIT_SHUTDOWN, "Arbitrator decided to shutdown this node");
 }
 
@@ -4244,4 +4251,41 @@
   
   NodeReceiverGroup rg(API_CLUSTERMGR, mask);
   sendSignal(rg, api.gsn, signal, len, JBB); // forward sections
+}
+
+void
+Qmgr::execSTOP_REQ(Signal* signal)
+{
+  jamEntry();
+  c_stopReq = * (StopReq*)signal->getDataPtr();
+
+  if (c_stopReq.senderRef)
+  {
+    ndbrequire(NdbNodeBitmask::get(c_stopReq.nodes, getOwnNodeId()));
+    
+    StopConf *conf = (StopConf*)signal->getDataPtrSend();
+    conf->senderData = c_stopReq.senderData;
+    conf->nodeState = getOwnNodeId();
+    sendSignal(c_stopReq.senderRef, 
+	       GSN_STOP_CONF, signal, StopConf::SignalLength, JBA);
+  }
+}
+
+void
+Qmgr::check_multi_node_shutdown(Signal* signal)
+{
+  if (c_stopReq.senderRef && 
+      NdbNodeBitmask::get(c_stopReq.nodes, getOwnNodeId()))
+  {
+    jam();
+    if(StopReq::getPerformRestart(c_stopReq.requestInfo))
+    {
+      jam();
+      StartOrd * startOrd = (StartOrd *)&signal->theData[0];
+      startOrd->restartInfo = c_stopReq.requestInfo;
+      EXECUTE_DIRECT(CMVMI, GSN_START_ORD, signal, 2);
+    } else {
+      EXECUTE_DIRECT(CMVMI, GSN_STOP_ORD, signal, 1);
+    }
+  }
 }

--- 1.17/ndb/test/ndbapi/testNodeRestart.cpp	2006-03-27 10:18:46 +02:00
+++ 1.18/ndb/test/ndbapi/testNodeRestart.cpp	2006-03-31 16:46:26 +02:00
@@ -22,7 +22,7 @@
 #include <NdbRestarts.hpp>
 #include <Vector.hpp>
 #include <signaldata/DumpStateOrd.hpp>
-
+#include <Bitmask.hpp>
 
 int runLoadTable(NDBT_Context* ctx, NDBT_Step* step){
 
@@ -669,6 +669,110 @@
   return NDBT_FAILED;    
 }
 
+int 
+runBug18612(NDBT_Context* ctx, NDBT_Step* step){
+
+  // Assume two replicas
+  NdbRestarter restarter;
+  if (restarter.getNumDbNodes() < 2)
+  {
+    ctx->stopTest();
+    return NDBT_OK;
+  }
+
+  Uint32 cnt = restarter.getNumDbNodes();
+
+  for(int loop = 0; loop < ctx->getNumLoops(); loop++)
+  {
+    int partition0[256];
+    int partition1[256];
+    bzero(partition0, sizeof(partition0));
+    bzero(partition1, sizeof(partition1));
+    Bitmask<4> nodesmask;
+    
+    Uint32 node1 = restarter.getDbNodeId(rand()%cnt);
+    for (Uint32 i = 0; i<cnt/2; i++)
+    {
+      do { 
+	node1 = restarter.getRandomNodeOtherNodeGroup(node1, rand());
+      } while(nodesmask.get(node1));
+      
+      partition0[i] = node1;
+      partition1[i] = restarter.getRandomNodeSameNodeGroup(node1, rand());
+      
+      ndbout_c("nodes %d %d", node1, partition1[i]);
+      
+      assert(!nodesmask.get(node1));
+      assert(!nodesmask.get(partition1[i]));
+      nodesmask.set(node1);
+      nodesmask.set(partition1[i]);
+    } 
+    
+    ndbout_c("done");
+
+    int dump[255];
+    dump[0] = DumpStateOrd::NdbcntrStopNodes;
+    memcpy(dump + 1, partition0, sizeof(int)*cnt/2);
+    
+    Uint32 master = restarter.getMasterNodeId();
+    
+    if (restarter.dumpStateOneNode(master, dump, 1+cnt/2))
+      return NDBT_FAILED;
+    
+    if (restarter.waitNodesNoStart(partition0, cnt/2))
+      return NDBT_FAILED;
+
+    int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
+    
+    if (restarter.dumpStateAllNodes(val2, 2))
+      return NDBT_FAILED;
+    
+    if (restarter.insertErrorInAllNodes(932))
+      return NDBT_FAILED;
+
+    dump[0] = 9000;
+    memcpy(dump + 1, partition0, sizeof(int)*cnt/2);    
+    for (Uint32 i = 0; i<cnt/2; i++)
+      if (restarter.dumpStateOneNode(partition1[i], dump, 1+cnt/2))
+	return NDBT_FAILED;
+
+    dump[0] = 9000;
+    memcpy(dump + 1, partition1, sizeof(int)*cnt/2);    
+    for (Uint32 i = 0; i<cnt/2; i++)
+      if (restarter.dumpStateOneNode(partition0[i], dump, 1+cnt/2))
+	return NDBT_FAILED;
+    
+    if (restarter.startNodes(partition0, cnt/2))
+      return NDBT_FAILED;
+    
+    if (restarter.waitNodesStartPhase(partition0, cnt/2, 2))
+      return NDBT_FAILED;
+    
+    dump[0] = 9001;
+    for (Uint32 i = 0; i<cnt/2; i++)
+      if (restarter.dumpStateAllNodes(dump, 2))
+	return NDBT_FAILED;
+
+    if (restarter.waitClusterNoStart())
+      return NDBT_FAILED;
+    
+    for (Uint32 i = 0; i<cnt/2; i++)
+      if (restarter.restartOneDbNode(partition0[i], true, true, true))
+	return NDBT_FAILED;
+
+    if (restarter.waitNodesNoStart(partition0, cnt/2))
+      return NDBT_FAILED;
+    
+    if (restarter.startAll())
+      return NDBT_FAILED;
+
+    if (restarter.waitClusterStarted())
+      return NDBT_FAILED;
+  }
+  return NDBT_OK;
+}
+
+
 NDBT_TESTSUITE(testNodeRestart);
 TESTCASE("NoLoad", 
 	 "Test that one node at a time can be stopped and then restarted "\
@@ -961,6 +1065,12 @@
 	 "Test bug with NF during NR"){
   INITIALIZER(runLoadTable);
   STEP(runBug18414);
+  FINALIZER(runClearTable);
+}
+TESTCASE("Bug18612",
+	 "Test bug with partitioned clusters"){
+  INITIALIZER(runLoadTable);
+  STEP(runBug18612);
   FINALIZER(runClearTable);
 }
 NDBT_TESTSUITE_END(testNodeRestart);
Thread
bk commit into 4.1 tree (jonas:1.2486) BUG#18612jonas31 Mar