List: Commits « Previous Message | Next Message »
From: jonas  Date: May 30 2007 8:01pm
Subject: bk commit into 5.1 tree (jonas:1.2150) BUG#28445
View as plain text  
Below is the list of changes that have just been committed into a local
5.1 repository of jonas. When jonas does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet@stripped, 2007-05-30 22:01:18+02:00, jonas@stripped +4 -0
  ndb - bug#28445
    start hb already on connect, not on first received hb

  storage/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp@stripped, 2007-05-30 22:01:17+02:00, jonas@stripped +20 -28
    make sure qmgr is "fully" informed about connections so that it can handle hb correctly

  storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp@stripped, 2007-05-30 22:01:17+02:00, jonas@stripped +3 -0
    move api failure handling into own method
    add START_ORD so that hb checking can start really early

  storage/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp@stripped, 2007-05-30 22:01:17+02:00, jonas@stripped +1 -0
    move api failure handling into own method
    add START_ORD so that hb checking can start really early

  storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp@stripped, 2007-05-30 22:01:17+02:00, jonas@stripped +131 -111
    start hb handling directly on connect rep
    (instead of first hb)

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User:	jonas
# Host:	perch.ndb.mysql.com
# Root:	/home/jonas/src/drop5

--- 1.32/storage/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp	2007-05-30 22:01:22 +02:00
+++ 1.33/storage/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp	2007-05-30 22:01:22 +02:00
@@ -398,9 +398,10 @@
 //  Uint32 noOfNodes = closeCom->noOfNodes;
   
   jamEntry();
-  for (unsigned i = 0; i < MAX_NODES; i++){
-    if(NodeBitmask::get(closeCom->theNodes, i)){
-    
+  for (unsigned i = 0; i < MAX_NODES; i++)
+  {
+    if(NodeBitmask::get(closeCom->theNodes, i))
+    {
       jam();
 
       //-----------------------------------------------------
@@ -414,7 +415,9 @@
       globalTransporterRegistry.do_disconnect(i);
     }
   }
-  if (failNo != 0) {
+
+  if (failNo != 0) 
+  {
     jam();
     signal->theData[0] = userRef;
     signal->theData[1] = failNo;
@@ -433,8 +436,8 @@
   jamEntry();
 
   const Uint32 len = signal->getLength();
-  if(len == 2){
-
+  if(len == 2)
+  {
 #ifdef ERROR_INSERT
     if (! ((ERROR_INSERTED(9000) || ERROR_INSERTED(9002)) 
 	   && c_error_9000_nodes_mask.get(tStartingNode)))
@@ -452,9 +455,11 @@
       //-----------------------------------------------------
     }
   } else {
-    for(unsigned int i = 1; i < MAX_NODES; i++ ) {
+    for(unsigned int i = 1; i < MAX_NODES; i++ ) 
+    {
       jam();
-      if (i != getOwnNodeId() && getNodeInfo(i).m_type == tData2){
+      if (i != getOwnNodeId() && getNodeInfo(i).m_type == tData2)
+      {
 	jam();
 
 #ifdef ERROR_INSERT
@@ -513,24 +518,10 @@
   setNodeInfo(hostId).m_connectCount++;
   const NodeInfo::NodeType type = getNodeInfo(hostId).getType();
   ndbrequire(type != NodeInfo::INVALID);
-  
-  if(type == NodeInfo::DB || globalData.theStartLevel == NodeState::SL_STARTED){
-    jam();
-    DisconnectRep * const rep = (DisconnectRep *)&signal->theData[0];
-    rep->nodeId = hostId;
-    rep->err = errNo;
-    sendSignal(QMGR_REF, GSN_DISCONNECT_REP, signal, 
-	       DisconnectRep::SignalLength, JBA);
-  } else if((globalData.theStartLevel == NodeState::SL_CMVMI ||
-	     globalData.theStartLevel == NodeState::SL_STARTING)
-	    && type == NodeInfo::MGM) {
-    /**
-     * Someone disconnected during cmvmi period
-     */
-    jam();
-    globalTransporterRegistry.do_connect(hostId);
-  }
 
+  sendSignal(QMGR_REF, GSN_DISCONNECT_REP, signal, 
+             DisconnectRep::SignalLength, JBA);
+  
   cancelSubscription(hostId);
 
   signal->theData[0] = NDB_LE_Disconnected;
@@ -564,6 +555,8 @@
      */
     if(type == NodeInfo::MGM){
       jam();
+      signal->theData[0] = hostId;
+      sendSignal(QMGR_REF, GSN_CONNECT_REP, signal, 1, JBA);
     } else {
       /**
        * Dont allow api nodes to connect
@@ -795,6 +788,8 @@
         }
       }
     }
+
+    EXECUTE_DIRECT(QMGR, GSN_START_ORD, signal, 1);
     return ;
   }
   
@@ -822,9 +817,6 @@
      *
      * Do Restart
      */
-
-    globalScheduler.clear();
-    globalTimeQueue.clear();
     
     // Disconnect all nodes as part of the system restart. 
     // We need to ensure that we are starting up

--- 1.17/storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp	2007-05-30 22:01:22 +02:00
+++ 1.18/storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp	2007-05-30 22:01:22 +02:00
@@ -265,6 +265,8 @@
   void execALLOC_NODEID_CONF(Signal *);
   void execALLOC_NODEID_REF(Signal *);
   void completeAllocNodeIdReq(Signal *);
+  
+  void execSTART_ORD(Signal*);
 
   // Arbitration signals
   void execARBIT_CFG(Signal* signal);
@@ -281,6 +283,7 @@
   void check_readnodes_reply(Signal* signal, Uint32 nodeId, Uint32 gsn);
   Uint32 check_startup(Signal* signal);
 
+  void api_failed(Signal* signal, Uint32 aFailedNode);
   void node_failed(Signal* signal, Uint16 aFailedNode);
   void checkStartInterface(Signal* signal);
   void failReport(Signal* signal,

--- 1.12/storage/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp	2007-05-30 22:01:22 +02:00
+++ 1.13/storage/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp	2007-05-30 22:01:22 +02:00
@@ -115,6 +115,7 @@
   addRecSignal(GSN_DIH_RESTARTREF, &Qmgr::execDIH_RESTARTREF);
   addRecSignal(GSN_DIH_RESTARTCONF, &Qmgr::execDIH_RESTARTCONF);
   addRecSignal(GSN_NODE_VERSION_REP, &Qmgr::execNODE_VERSION_REP);
+  addRecSignal(GSN_START_ORD, &Qmgr::execSTART_ORD);
   
   initData();
 }//Qmgr::Qmgr()

--- 1.40/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp	2007-05-30 22:01:22 +02:00
+++ 1.41/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp	2007-05-30 22:01:22 +02:00
@@ -238,6 +238,16 @@
 	     ReadConfigConf::SignalLength, JBB);
 }
 
+void
+Qmgr::execSTART_ORD(Signal* signal)
+{
+  /**
+   * Start timer handling 
+   */
+  signal->theData[0] = ZTIMER_HANDLING;
+  sendSignal(QMGR_REF, GSN_CONTINUEB, signal, 1, JBB);
+}
+
 /*
 4.2  ADD NODE MODULE*/
 /*##########################################################################*/
@@ -1165,12 +1175,6 @@
   {
     jam();
     electionWon(signal);
-    
-    /**
-     * Start timer handling 
-     */
-    signal->theData[0] = ZTIMER_HANDLING;
-    sendSignal(QMGR_REF, GSN_CONTINUEB, signal, 10, JBB);
   }
   
   return;
@@ -1809,12 +1813,6 @@
   
   sendSttorryLab(signal);
   
-  /**
-   * Start timer handling 
-   */
-  signal->theData[0] = ZTIMER_HANDLING;
-  sendSignal(QMGR_REF, GSN_CONTINUEB, signal, 10, JBB);
-  
   sendCmAckAdd(signal, getOwnNodeId(), CmAdd::CommitNew);
 }
 
@@ -2191,20 +2189,22 @@
       hb_check_timer.reset();
     }
   }
-
+  
   if (interface_check_timer.check(TcurrentTime)) {
     jam();
     interface_check_timer.reset();
     checkStartInterface(signal);
   }
 
+  if (hb_api_timer.check(TcurrentTime)) 
+  {
+    jam();
+    hb_api_timer.reset();
+    apiHbHandlingLab(signal);
+  }
+
   if (cactivateApiCheck != 0) {
     jam();
-    if (hb_api_timer.check(TcurrentTime)) {
-      jam();
-      hb_api_timer.reset();
-      apiHbHandlingLab(signal);
-    }//if
     if (clatestTransactionCheck == 0) {
       //-------------------------------------------------------------
       // Initialise the Transaction check timer.
@@ -2321,18 +2321,21 @@
     if(type == NodeInfo::INVALID)
       continue;
 
-    if (TnodePtr.p->phase == ZAPI_ACTIVE){
+    if (c_connectedNodes.get(nodeId))
+    {
       jam();
       setNodeInfo(TnodePtr.i).m_heartbeat_cnt++;
       
-      if(getNodeInfo(TnodePtr.i).m_heartbeat_cnt > 2){
+      if(getNodeInfo(TnodePtr.i).m_heartbeat_cnt > 2)
+      {
 	signal->theData[0] = NDB_LE_MissedHeartbeat;
 	signal->theData[1] = nodeId;
 	signal->theData[2] = getNodeInfo(TnodePtr.i).m_heartbeat_cnt - 1;
 	sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
       }
       
-      if (getNodeInfo(TnodePtr.i).m_heartbeat_cnt > 4) {
+      if (getNodeInfo(TnodePtr.i).m_heartbeat_cnt > 4) 
+      {
         jam();
 	/*------------------------------------------------------------------*/
 	/* THE API NODE HAS NOT SENT ANY HEARTBEAT FOR THREE SECONDS. 
@@ -2344,8 +2347,8 @@
 	signal->theData[0] = NDB_LE_DeadDueToHeartbeat;
 	signal->theData[1] = nodeId;
 	sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);
-
-        node_failed(signal, nodeId);
+        
+        api_failed(signal, nodeId);
       }//if
     }//if
   }//for
@@ -2435,26 +2438,6 @@
   sendSignal(DBTC_REF, GSN_API_FAILREQ, signal, 2, JBA);
   sendSignal(DBDICT_REF, GSN_API_FAILREQ, signal, 2, JBA);
   sendSignal(SUMA_REF, GSN_API_FAILREQ, signal, 2, JBA);
-
-  /**-------------------------------------------------------------------------
-   * THE OTHER NODE WAS AN API NODE. THE COMMUNICATION LINK IS ALREADY 
-   * BROKEN AND THUS NO ACTION IS NEEDED TO BREAK THE CONNECTION. 
-   * WE ONLY NEED TO SET PARAMETERS TO ENABLE A NEW CONNECTION IN A FEW 
-   * SECONDS. 
-   *-------------------------------------------------------------------------*/
-  setNodeInfo(failedNodePtr.i).m_heartbeat_cnt= 0;
-  setNodeInfo(failedNodePtr.i).m_version = 0;
-  recompute_version_info(getNodeInfo(failedNodePtr.i).m_type);
-  
-  CloseComReqConf * const closeCom = (CloseComReqConf *)&signal->theData[0];
-
-  closeCom->xxxBlockRef = reference();
-  closeCom->failNo      = 0;
-  closeCom->noOfNodes   = 1;
-  NodeBitmask::clear(closeCom->theNodes);
-  NodeBitmask::set(closeCom->theNodes, failedNodePtr.i);
-  sendSignal(CMVMI_REF, GSN_CLOSE_COMREQ, signal, 
-	     CloseComReqConf::SignalLength, JBA);
 }//Qmgr::sendApiFailReq()
 
 void Qmgr::execAPI_FAILREQ(Signal* signal)
@@ -2467,20 +2450,7 @@
   
   ndbrequire(getNodeInfo(failedNodePtr.i).getType() != NodeInfo::DB);
 
-  // ignore if api not active
-  if (failedNodePtr.p->phase != ZAPI_ACTIVE)
-  {
-    jam();
-    // But send to SUMA anyway...
-    sendSignal(SUMA_REF, GSN_API_FAILREQ, signal, 2, JBA);
-    return;
-  }
-
-  signal->theData[0] = NDB_LE_Disconnected;
-  signal->theData[1] = failedNodePtr.i;
-  sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);
-
-  node_failed(signal, failedNodePtr.i);
+  api_failed(signal, signal->theData[0]);
 }
 
 void Qmgr::execAPI_FAILCONF(Signal* signal) 
@@ -2604,6 +2574,13 @@
     ndbrequire(false);
   }
   
+  if (getNodeInfo(nodeId).getType() != NodeInfo::DB)
+  {
+    jam();
+    api_failed(signal, nodeId);
+    return;
+  }
+
   switch(nodePtr.p->phase){
   case ZRUNNING:
     jam();
@@ -2640,66 +2617,109 @@
   failedNodePtr.i = aFailedNode;
   ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
 
-  if (getNodeInfo(failedNodePtr.i).getType() == NodeInfo::DB){
+  ndbrequire(getNodeInfo(failedNodePtr.i).getType() == NodeInfo::DB);
+  
+  /**---------------------------------------------------------------------
+   *   THE OTHER NODE IS AN NDB NODE, WE HANDLE IT AS IF A HEARTBEAT 
+   *   FAILURE WAS DISCOVERED.
+   *---------------------------------------------------------------------*/
+  switch(failedNodePtr.p->phase){
+  case ZRUNNING:
     jam();
-    /**---------------------------------------------------------------------
-     *   THE OTHER NODE IS AN NDB NODE, WE HANDLE IT AS IF A HEARTBEAT 
-     *   FAILURE WAS DISCOVERED.
-     *---------------------------------------------------------------------*/
-    switch(failedNodePtr.p->phase){
-    case ZRUNNING:
-      jam();
-      failReportLab(signal, aFailedNode, FailRep::ZLINK_FAILURE);
-      return;
-    case ZFAIL_CLOSING:
-      jam();
-      return;
-    case ZSTARTING:
-      c_start.reset();
-      // Fall-through
-    default:
-      jam();
-      /*---------------------------------------------------------------------*/
-      // The other node is still not in the cluster but disconnected. 
-      // We must restart communication in three seconds.
-      /*---------------------------------------------------------------------*/
-      failedNodePtr.p->failState = NORMAL;
-      failedNodePtr.p->phase = ZFAIL_CLOSING;
-      setNodeInfo(failedNodePtr.i).m_heartbeat_cnt= 0;
-
-      CloseComReqConf * const closeCom = 
-	(CloseComReqConf *)&signal->theData[0];
-
-      closeCom->xxxBlockRef = reference();
-      closeCom->failNo      = 0;
-      closeCom->noOfNodes   = 1;
-      NodeBitmask::clear(closeCom->theNodes);
-      NodeBitmask::set(closeCom->theNodes, failedNodePtr.i);
-      sendSignal(CMVMI_REF, GSN_CLOSE_COMREQ, signal, 
-		 CloseComReqConf::SignalLength, JBA);
-    }//if
+    failReportLab(signal, aFailedNode, FailRep::ZLINK_FAILURE);
     return;
-  }
-
-  /**
-   * API code
-   */
-  jam();
-  if (failedNodePtr.p->phase != ZFAIL_CLOSING){
+  case ZFAIL_CLOSING:
     jam();
-    //-------------------------------------------------------------------------
-    // The API was active and has now failed. We need to initiate API failure
-    // handling. If the API had already failed then we can ignore this
-    // discovery.
-    //-------------------------------------------------------------------------
+    return;
+  case ZSTARTING:
+    c_start.reset();
+    // Fall-through
+  default:
+    jam();
+    /*---------------------------------------------------------------------*/
+    // The other node is still not in the cluster but disconnected. 
+    // We must restart communication in three seconds.
+    /*---------------------------------------------------------------------*/
+    failedNodePtr.p->failState = NORMAL;
     failedNodePtr.p->phase = ZFAIL_CLOSING;
-    
-    sendApiFailReq(signal, aFailedNode);
-    arbitRec.code = ArbitCode::ApiFail;
-    handleArbitApiFail(signal, aFailedNode);
+    setNodeInfo(failedNodePtr.i).m_heartbeat_cnt= 0;
+
+    CloseComReqConf * const closeCom = 
+      (CloseComReqConf *)&signal->theData[0];
+
+    closeCom->xxxBlockRef = reference();
+    closeCom->failNo      = 0;
+    closeCom->noOfNodes   = 1;
+    NodeBitmask::clear(closeCom->theNodes);
+    NodeBitmask::set(closeCom->theNodes, failedNodePtr.i);
+    sendSignal(CMVMI_REF, GSN_CLOSE_COMREQ, signal, 
+               CloseComReqConf::SignalLength, JBA);
   }//if
   return;
-}//Qmgr::node_failed()
+}
+
+void
+Qmgr::api_failed(Signal* signal, Uint32 nodeId)
+{
+  NodeRecPtr failedNodePtr;
+  /**------------------------------------------------------------------------
+   *   A COMMUNICATION LINK HAS BEEN DISCONNECTED. WE MUST TAKE SOME ACTION
+   *   DUE TO THIS.
+   *-----------------------------------------------------------------------*/
+  failedNodePtr.i = nodeId;
+  ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
+  
+  if (failedNodePtr.p->phase == ZFAIL_CLOSING)
+  {
+    /**
+     * Failure handling already in progress
+     */
+    jam();
+    return;
+  }
+
+  if (failedNodePtr.p->phase == ZAPI_ACTIVE)
+  {
+    jam();
+    sendApiFailReq(signal, nodeId);
+    arbitRec.code = ArbitCode::ApiFail;
+    handleArbitApiFail(signal, nodeId);
+  }
+  else
+  {
+    /**
+     * Always inform SUMA
+     */
+    jam();
+    signal->theData[0] = nodeId;
+    signal->theData[1] = QMGR_REF;
+    sendSignal(SUMA_REF, GSN_API_FAILREQ, signal, 2, JBA);
+    failedNodePtr.p->failState = NORMAL;
+  }
+
+  failedNodePtr.p->phase = ZFAIL_CLOSING;
+  setNodeInfo(failedNodePtr.i).m_heartbeat_cnt= 0;
+  setNodeInfo(failedNodePtr.i).m_version = 0;
+  recompute_version_info(getNodeInfo(failedNodePtr.i).m_type);
+  
+  CloseComReqConf * const closeCom = (CloseComReqConf *)&signal->theData[0];
+  closeCom->xxxBlockRef = reference();
+  closeCom->failNo      = 0;
+  closeCom->noOfNodes   = 1;
+  NodeBitmask::clear(closeCom->theNodes);
+  NodeBitmask::set(closeCom->theNodes, failedNodePtr.i);
+  sendSignal(CMVMI_REF, GSN_CLOSE_COMREQ, signal, 
+             CloseComReqConf::SignalLength, JBA);
+
+  if (getNodeInfo(failedNodePtr.i).getType() == NodeInfo::MGM)
+  {
+    /**
+     * Allow MGM do reconnect "directly"
+     */
+    jam();
+    setNodeInfo(failedNodePtr.i).m_heartbeat_cnt = 3;
+  }
+}
 
 /**--------------------------------------------------------------------------
  * AN API NODE IS REGISTERING. IF FOR THE FIRST TIME WE WILL ENABLE 
Thread
bk commit into 5.1 tree (jonas:1.2150) BUG#28445 - jonas - 30 May