List:Internals« Previous MessageNext Message »
From:Stewart Smith Date:July 22 2005 10:29am
Subject:bk commit into 5.0 tree (stewart:1.1882)
View as plain text  
Below is the list of changes that have just been committed into a local
5.0 repository of stewart. When stewart does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet
  1.1882 05/07/22 20:29:25 stewart@stripped +10 -0
  WL#2347 - Load independent heartbeats
  
  Reset missed heartbeat count on receipt of signal from node.
  
  This fixes a bug where, under high network load, heartbeat packets could be
  delayed, causing the appearance of node failure (due to lost heartbeats).

  ndb/src/ndbapi/TransporterFacade.hpp
    1.24 05/07/22 20:29:16 stewart@stripped +9 -0
    Add hb_received(nodeId)

  ndb/src/ndbapi/TransporterFacade.cpp
    1.38 05/07/22 20:29:16 stewart@stripped +4 -0
    Implement transporter_recv_from for ndbapi, which resets the missed
    heartbeat count (formerly hbSent) via TransporterFacade::hb_received

  ndb/src/ndbapi/ClusterMgr.hpp
    1.6 05/07/22 20:29:15 stewart@stripped +8 -2
    Use NodeInfo::m_heartbeat_cnt instead of ClusterMgr::Node::hbSent for missed
    heartbeat count.
    
    We now use the same storage for API and Kernel heartbeats.
    
    Add ClusterMgr::hb_received(nodeId) to reset the missed heartbeat count
    (formerly hbSent), as if we had received a heartbeat, but callable from
    elsewhere - e.g. when a signal is received.

  ndb/src/ndbapi/ClusterMgr.cpp
    1.21 05/07/22 20:29:15 stewart@stripped +5 -5
    Use NodeInfo::m_heartbeat_cnt for missed heartbeat count

  ndb/src/kernel/vm/TransporterCallback.cpp
    1.8 05/07/22 20:29:15 stewart@stripped +6 -0
    add transporter_recv_from(), which is called on receipt of signals.
    It resets missed heartbeat count for that node.

  ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
    1.15 05/07/22 20:29:15 stewart@stripped +27 -26
    Use NodeInfo::m_heartbeat_cnt for missed heartbeat count

  ndb/src/kernel/blocks/qmgr/Qmgr.hpp
    1.5 05/07/22 20:29:15 stewart@stripped +1 -2
    remove NodeRec::alarmCount. missed heartbeat count now kept in NodeInfo

  ndb/src/common/transporter/TransporterRegistry.cpp
    1.58 05/07/22 20:29:15 stewart@stripped +4 -0
    Add calls to transporter_recv_from when data is received (before unpack)

  ndb/include/transporter/TransporterCallback.hpp
    1.2 05/07/22 20:29:14 stewart@stripped +3 -0
    add prototype for transporter_recv_from()
    
    Called on receipt from a node.

  ndb/include/kernel/NodeInfo.hpp
    1.3 05/07/22 20:29:14 stewart@stripped +2 -0
    Add m_heartbeat_cnt to track missed heartbeats

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User:	stewart
# Host:	kennedy.(none)
# Root:	/home/stewart/Documents/MySQL/5.0/wl2347

--- 1.2/ndb/include/kernel/NodeInfo.hpp	2004-06-24 09:53:13 +10:00
+++ 1.3/ndb/include/kernel/NodeInfo.hpp	2005-07-22 20:29:14 +10:00
@@ -41,6 +41,7 @@
   Uint32 m_type;          ///< Node type
   Uint32 m_connectCount;  ///< No of times connected
   bool   m_connected;     ///< Node is connected
+  Uint32 m_heartbeat_cnt; ///< Missed heartbeats
   
   friend NdbOut & operator<<(NdbOut&, const NodeInfo&); 
 };
@@ -52,6 +53,7 @@
   m_signalVersion = 0;
   m_type = INVALID;
   m_connectCount = 0;
+  m_heartbeat_cnt= 0;
 }
 
 inline

--- 1.1/ndb/include/transporter/TransporterCallback.hpp	2004-04-14 18:23:57 +10:00
+++ 1.2/ndb/include/transporter/TransporterCallback.hpp	2005-07-22 20:29:14 +10:00
@@ -341,5 +341,8 @@
  */ 
 void 
 reportError(void * callbackObj, NodeId nodeId, TransporterError errorCode); 
+
+void
+transporter_recv_from(void* callbackObj, NodeId node);
  
 #endif   

--- 1.57/ndb/src/common/transporter/TransporterRegistry.cpp	2005-07-15 22:00:02 +10:00
+++ 1.58/ndb/src/common/transporter/TransporterRegistry.cpp	2005-07-22 20:29:15 +10:00
@@ -918,6 +918,7 @@
       NodeId remoteNodeId;
       Uint32 * readPtr;
       Uint32 sz = theOSEReceiver->getReceiveData(&remoteNodeId, &readPtr);
+      transporter_recv_from(callbackObj, remoteNodeId);
       Uint32 szUsed = unpack(readPtr,
 			     sz,
 			     remoteNodeId,
@@ -953,6 +954,7 @@
 	  {
 	    Uint32 * ptr;
 	    Uint32 sz = t->getReceiveData(&ptr);
+	    transporter_recv_from(callbackObj, nodeId);
 	    Uint32 szUsed = unpack(ptr, sz, nodeId, ioStates[nodeId]);
 	    t->updateReceiveDataPtr(szUsed);
           }
@@ -976,6 +978,7 @@
       {
 	Uint32 * readPtr, * eodPtr;
 	t->getReceivePtr(&readPtr, &eodPtr);
+	transporter_recv_from(callbackObj, nodeId);
 	Uint32 *newPtr = unpack(readPtr, eodPtr, nodeId, ioStates[nodeId]);
 	t->updateReceivePtr(newPtr);
       }
@@ -993,6 +996,7 @@
       {
 	Uint32 * readPtr, * eodPtr;
 	t->getReceivePtr(&readPtr, &eodPtr);
+	transporter_recv_from(callbackObj, nodeId);
 	Uint32 *newPtr = unpack(readPtr, eodPtr, nodeId, ioStates[nodeId]);
 	t->updateReceivePtr(newPtr);
       }

--- 1.4/ndb/src/kernel/blocks/qmgr/Qmgr.hpp	2005-07-15 03:52:40 +10:00
+++ 1.5/ndb/src/kernel/blocks/qmgr/Qmgr.hpp	2005-07-22 20:29:15 +10:00
@@ -118,8 +118,7 @@
   struct NodeRec {
     UintR ndynamicId;
     Phase phase;
-    UintR alarmCount;
-    
+
     QmgrState sendPrepFailReqStatus;
     QmgrState sendCommitFailReqStatus;
     QmgrState sendPresToStatus;

--- 1.14/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp	2005-07-15 03:52:41 +10:00
+++ 1.15/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp	2005-07-22 20:29:15 +10:00
@@ -66,7 +66,7 @@
   jamEntry();
   hbNodePtr.i = signal->theData[0];
   ptrCheckGuard(hbNodePtr, MAX_NDB_NODES, nodeRec);
-  hbNodePtr.p->alarmCount = 0;
+  setNodeInfo(hbNodePtr.i).m_heartbeat_cnt= 0;
   return;
 }//Qmgr::execCM_HEARTBEAT()
 
@@ -1040,7 +1040,7 @@
     jam();
     ndbrequire(addNodePtr.p->phase == ZSTARTING);
     addNodePtr.p->phase = ZRUNNING;
-    addNodePtr.p->alarmCount = 0;
+    setNodeInfo(addNodePtr.i).m_heartbeat_cnt= 0;
     c_clusterNodes.set(addNodePtr.i);
     findNeighbours(signal);
 
@@ -1078,7 +1078,7 @@
    * NODES IN THE CLUSTER.
    */
   nodePtr.p->phase = ZRUNNING;
-  nodePtr.p->alarmCount = 0;
+  setNodeInfo(nodePtr.i).m_heartbeat_cnt= 0;
   findNeighbours(signal);
   c_clusterNodes.set(nodePtr.i);
   c_start.reset();
@@ -1299,7 +1299,7 @@
        *---------------------------------------------------------------------*/
       fnNodePtr.i = cneighbourl;
       ptrCheckGuard(fnNodePtr, MAX_NDB_NODES, nodeRec);
-      fnNodePtr.p->alarmCount = 0;
+      setNodeInfo(fnNodePtr.i).m_heartbeat_cnt= 0;
     }//if
   }//if
 
@@ -1347,8 +1347,8 @@
     } else {
       nodePtr.p->phase = ZAPI_INACTIVE;
     }
-    
-    nodePtr.p->alarmCount = 0;
+
+    setNodeInfo(nodePtr.i).m_heartbeat_cnt= 0;
     nodePtr.p->sendPrepFailReqStatus = Q_NOT_ACTIVE;
     nodePtr.p->sendCommitFailReqStatus = Q_NOT_ACTIVE;
     nodePtr.p->sendPresToStatus = Q_NOT_ACTIVE;
@@ -1550,18 +1550,18 @@
   }//if
   ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRec);
   
-  nodePtr.p->alarmCount ++;
+  setNodeInfo(nodePtr.i).m_heartbeat_cnt++;
   ndbrequire(nodePtr.p->phase == ZRUNNING);
   ndbrequire(getNodeInfo(nodePtr.i).m_type == NodeInfo::DB);
 
-  if(nodePtr.p->alarmCount > 2){
+  if(getNodeInfo(nodePtr.i).m_heartbeat_cnt > 2){
     signal->theData[0] = NDB_LE_MissedHeartbeat;
     signal->theData[1] = nodePtr.i;
-    signal->theData[2] = nodePtr.p->alarmCount - 1;
+    signal->theData[2] = getNodeInfo(nodePtr.i).m_heartbeat_cnt - 1;
     sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
   }
 
-  if (nodePtr.p->alarmCount > 4) {
+  if (getNodeInfo(nodePtr.i).m_heartbeat_cnt > 4) {
     jam();
     /**----------------------------------------------------------------------
      * OUR LEFT NEIGHBOUR HAVE KEPT QUIET FOR THREE CONSECUTIVE HEARTBEAT 
@@ -1593,16 +1593,16 @@
 
     if (TnodePtr.p->phase == ZAPI_ACTIVE){
       jam();
-      TnodePtr.p->alarmCount ++;
+      setNodeInfo(TnodePtr.i).m_heartbeat_cnt++;
       
-      if(TnodePtr.p->alarmCount > 2){
+      if(getNodeInfo(TnodePtr.i).m_heartbeat_cnt > 2){
 	signal->theData[0] = NDB_LE_MissedHeartbeat;
 	signal->theData[1] = nodeId;
-	signal->theData[2] = TnodePtr.p->alarmCount - 1;
+	signal->theData[2] = getNodeInfo(TnodePtr.i).m_heartbeat_cnt - 1;
 	sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
       }
       
-      if (TnodePtr.p->alarmCount > 4) {
+      if (getNodeInfo(TnodePtr.i).m_heartbeat_cnt > 4) {
         jam();
 	/*------------------------------------------------------------------*/
 	/* THE API NODE HAS NOT SENT ANY HEARTBEAT FOR THREE SECONDS. 
@@ -1634,16 +1634,17 @@
     ptrAss(nodePtr, nodeRec);
     if (nodePtr.p->phase == ZFAIL_CLOSING) {
       jam();
-      nodePtr.p->alarmCount = nodePtr.p->alarmCount + 1;
+      setNodeInfo(nodePtr.i).m_heartbeat_cnt++;
       if (c_connectedNodes.get(nodePtr.i)){
         jam();
 	/*-------------------------------------------------------------------*/
 	// We need to ensure that the connection is not restored until it has 
 	// been disconnected for at least three seconds.
 	/*-------------------------------------------------------------------*/
-        nodePtr.p->alarmCount = 0;
+        setNodeInfo(nodePtr.i).m_heartbeat_cnt= 0;
       }//if
-      if ((nodePtr.p->alarmCount > 3) && (nodePtr.p->failState == NORMAL)) {
+      if ((getNodeInfo(nodePtr.i).m_heartbeat_cnt > 3)
+	  && (nodePtr.p->failState == NORMAL)) {
 	/**------------------------------------------------------------------
 	 * WE HAVE DISCONNECTED THREE SECONDS AGO. WE ARE NOW READY TO 
 	 * CONNECT AGAIN AND ACCEPT NEW REGISTRATIONS FROM THIS NODE. 
@@ -1659,18 +1660,18 @@
           nodePtr.p->phase = ZINIT;
         }//if
 
-        nodePtr.p->alarmCount = 0;
+        setNodeInfo(nodePtr.i).m_heartbeat_cnt= 0;
         signal->theData[0] = 0;
         signal->theData[1] = nodePtr.i;
         sendSignal(CMVMI_REF, GSN_OPEN_COMREQ, signal, 2, JBA);
       } else {
-	if(((nodePtr.p->alarmCount + 1) % 60) == 0){
+	if(((getNodeInfo(nodePtr.i).m_heartbeat_cnt + 1) % 60) == 0){
 	  char buf[100];
 	  BaseString::snprintf(buf, sizeof(buf), 
 		   "Failure handling of node %d has not completed in %d min."
 		   " - state = %d",
 		   nodePtr.i, 
-		   (nodePtr.p->alarmCount + 1)/60,
+		   (getNodeInfo(nodePtr.i).m_heartbeat_cnt + 1)/60,
 		   nodePtr.p->failState);
 	  warningEvent(buf);
 	}
@@ -1718,7 +1719,7 @@
    * WE ONLY NEED TO SET PARAMETERS TO ENABLE A NEW CONNECTION IN A FEW 
    * SECONDS. 
    *-------------------------------------------------------------------------*/
-  failedNodePtr.p->alarmCount = 0;
+  setNodeInfo(failedNodePtr.i).m_heartbeat_cnt= 0;
 
   CloseComReqConf * const closeCom = (CloseComReqConf *)&signal->theData[0];
 
@@ -1871,7 +1872,7 @@
       /*---------------------------------------------------------------------*/
       failedNodePtr.p->failState = NORMAL;
       failedNodePtr.p->phase = ZFAIL_CLOSING;
-      failedNodePtr.p->alarmCount = 0;
+      setNodeInfo(failedNodePtr.i).m_heartbeat_cnt= 0;
 
       CloseComReqConf * const closeCom = 
 	(CloseComReqConf *)&signal->theData[0];
@@ -1965,8 +1966,8 @@
   }
 
   setNodeInfo(apiNodePtr.i).m_version = version;
-   
-  apiNodePtr.p->alarmCount = 0;
+
+  setNodeInfo(apiNodePtr.i).m_heartbeat_cnt= 0;
 
   ApiRegConf * const apiRegConf = (ApiRegConf *)&signal->theData[0];
   apiRegConf->qmgrRef = reference();
@@ -2484,7 +2485,7 @@
       ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRec);
       nodePtr.p->phase = ZFAIL_CLOSING;
       nodePtr.p->failState = WAITING_FOR_NDB_FAILCONF;
-      nodePtr.p->alarmCount = 0;
+      setNodeInfo(nodePtr.i).m_heartbeat_cnt= 0;
       c_clusterNodes.clear(nodePtr.i);
     }//for
     /*----------------------------------------------------------------------*/
@@ -2742,7 +2743,7 @@
     failedNodePtr.p->sendPrepFailReqStatus = Q_NOT_ACTIVE;
     failedNodePtr.p->sendCommitFailReqStatus = Q_NOT_ACTIVE;
     failedNodePtr.p->sendPresToStatus = Q_NOT_ACTIVE;
-    failedNodePtr.p->alarmCount = 0;
+    setNodeInfo(failedNodePtr.i).m_heartbeat_cnt= 0;
     if (aSendFailRep == ZTRUE) {
       jam();
       if (failedNodePtr.i != getOwnNodeId()) {

--- 1.7/ndb/src/kernel/vm/TransporterCallback.cpp	2005-01-12 09:25:41 +11:00
+++ 1.8/ndb/src/kernel/vm/TransporterCallback.cpp	2005-07-22 20:29:15 +10:00
@@ -33,6 +33,7 @@
 #include <NdbOut.hpp>
 #include "DataBuffer.hpp"
 
+
 /**
  * The instance
  */
@@ -452,3 +453,8 @@
     putc('\n', output);
 }
 
+void
+transporter_recv_from(void * callbackObj, NodeId nodeId){
+  globalData.m_nodeInfo[nodeId].m_heartbeat_cnt= 0;
+  return;
+}

--- 1.20/ndb/src/ndbapi/ClusterMgr.cpp	2005-07-15 03:52:41 +10:00
+++ 1.21/ndb/src/ndbapi/ClusterMgr.cpp	2005-07-22 20:29:15 +10:00
@@ -214,7 +214,7 @@
 	 * It is now time to send a new Heartbeat
 	 */
 	if (theNode.hbCounter >= theNode.hbFrequency) {
-	  theNode.hbSent++;
+	  theNode.m_info.m_heartbeat_cnt++;
 	  theNode.hbCounter = 0;
 	}
 
@@ -231,7 +231,7 @@
 	theFacade.sendSignalUnCond(&signal, nodeId);
       }//if
       
-      if (theNode.hbSent == 4 && theNode.hbFrequency > 0){
+      if (theNode.m_info.m_heartbeat_cnt == 4 && theNode.hbFrequency > 0){
 	reportNodeFailed(i);
       }//if
     }
@@ -337,7 +337,7 @@
       node.compatible = ndbCompatible_api_ndb(NDB_VERSION,
 					      node.m_info.m_version);
   }
-  
+
   node.m_state = apiRegConf->nodeState;
   if (node.compatible && (node.m_state.startLevel == NodeState::SL_STARTED  ||
 			  node.m_state.startLevel == NodeState::SL_SINGLEUSER)){
@@ -345,7 +345,7 @@
   } else {
     set_node_alive(node, false);
   }//if
-  node.hbSent = 0;
+  node.m_info.m_heartbeat_cnt = 0;
   node.hbCounter = 0;
   if (node.m_info.m_type != NodeInfo::REP) {
     node.hbFrequency = (apiRegConf->apiHeartbeatFrequency * 10) - 50;
@@ -414,7 +414,7 @@
 
   Node & theNode = theNodes[nodeId];
   theNode.connected = true;
-  theNode.hbSent = 0;
+  theNode.m_info.m_heartbeat_cnt = 0;
   theNode.hbCounter = 0;
 
   /**

--- 1.5/ndb/src/ndbapi/ClusterMgr.hpp	2005-05-05 02:40:52 +10:00
+++ 1.6/ndb/src/ndbapi/ClusterMgr.hpp	2005-07-22 20:29:15 +10:00
@@ -73,12 +73,12 @@
      */
     Uint32 hbFrequency; // Heartbeat frequence 
     Uint32 hbCounter;   // # milliseconds passed since last hb sent
-    Uint32 hbSent;      // # heartbeats sent (without answer)
   };
   
   const Node &  getNodeInfo(NodeId) const;
   Uint32        getNoOfConnectedNodes() const;
-  
+  void          hb_received(NodeId);
+
 private:
   Uint32        noOfAliveNodes;
   Uint32        noOfConnectedNodes;
@@ -126,6 +126,12 @@
 Uint32
 ClusterMgr::getNoOfConnectedNodes() const {
   return noOfConnectedNodes;
+}
+
+inline
+void
+ClusterMgr::hb_received(NodeId nodeId) {
+  theNodes[nodeId].m_info.m_heartbeat_cnt= 0;
 }
 
 /*****************************************************************************/

--- 1.37/ndb/src/ndbapi/TransporterFacade.cpp	2005-07-15 09:41:53 +10:00
+++ 1.38/ndb/src/ndbapi/TransporterFacade.cpp	2005-07-22 20:29:16 +10:00
@@ -126,6 +126,10 @@
   //TransporterFacade::instance()->reportDisconnected(nodeId);
 }
 
+void
+transporter_recv_from(void * callbackObj, NodeId nodeId){
+  ((TransporterFacade*)(callbackObj))->hb_received(nodeId);
+}
 
 /****************************************************************************
  * 

--- 1.23/ndb/src/ndbapi/TransporterFacade.hpp	2005-02-17 06:37:01 +11:00
+++ 1.24/ndb/src/ndbapi/TransporterFacade.hpp	2005-07-22 20:29:16 +10:00
@@ -114,6 +114,9 @@
 
   TransporterRegistry* get_registry() { return theTransporterRegistry;};
 
+  // heart beat received from a node (e.g. a signal came)
+  void hb_received(NodeId n);
+
 private:
   /**
    * Send a signal unconditional of node status (used by ClusterMgr)
@@ -293,6 +296,12 @@
 
   const ClusterMgr::Node & node = theClusterMgr->getNodeInfo(n);
   return node.m_alive;
+}
+
+inline
+void
+TransporterFacade::hb_received(NodeId n) {
+  theClusterMgr->hb_received(n);
 }
 
 inline
Thread
bk commit into 5.0 tree (stewart:1.1882)Stewart Smith22 Jul