Below is the list of changes that have just been committed into a local
5.0 repository of stewart. When stewart does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html
ChangeSet
1.1882 05/07/22 20:29:25 stewart@stripped +10 -0
WL#2347 - Load independent heartbeats
Reset missed heartbeat count on receipt of signal from node.
This fixes a bug where, under high network load, heartbeat packets could be
delayed, causing the appearance of node failure (due to missed heartbeats).
ndb/src/ndbapi/TransporterFacade.hpp
1.24 05/07/22 20:29:16 stewart@stripped +9 -0
Add hb_received(nodeId)
ndb/src/ndbapi/TransporterFacade.cpp
1.38 05/07/22 20:29:16 stewart@stripped +4 -0
Implement transporter_recv_from for ndbapi - which resets the missed heartbeat count (formerly hbSent)
ndb/src/ndbapi/ClusterMgr.hpp
1.6 05/07/22 20:29:15 stewart@stripped +8 -2
Use NodeInfo::m_heartbeat_cnt instead of ClusterMgr::Node::hbSent for missed
heartbeat count.
We now use the same storage for API and Kernel heartbeats.
Add ClusterMgr::hb_received(nodeId) to reset the missed heartbeat count (as if we had
received a heartbeat, but callable from elsewhere - e.g. when a signal is received)
ndb/src/ndbapi/ClusterMgr.cpp
1.21 05/07/22 20:29:15 stewart@stripped +5 -5
Use NodeInfo::m_heartbeat_cnt for missed heartbeat count
ndb/src/kernel/vm/TransporterCallback.cpp
1.8 05/07/22 20:29:15 stewart@stripped +6 -0
add transporter_recv_from(), which is called on receipt of signals.
It resets missed heartbeat count for that node.
ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
1.15 05/07/22 20:29:15 stewart@stripped +27 -26
Use NodeInfo::m_heartbeat_cnt for missed heartbeat count
ndb/src/kernel/blocks/qmgr/Qmgr.hpp
1.5 05/07/22 20:29:15 stewart@stripped +1 -2
remove NodeRec::alarmCount. missed heartbeat count now kept in NodeInfo
ndb/src/common/transporter/TransporterRegistry.cpp
1.58 05/07/22 20:29:15 stewart@stripped +4 -0
Add calls to transporter_recv_from when data is received (before unpack)
ndb/include/transporter/TransporterCallback.hpp
1.2 05/07/22 20:29:14 stewart@stripped +3 -0
add prototype for transporter_recv_from()
Called on receipt from a node.
ndb/include/kernel/NodeInfo.hpp
1.3 05/07/22 20:29:14 stewart@stripped +2 -0
Add m_heartbeat_cnt to track missed heartbeats
# This is a BitKeeper patch. What follows are the unified diffs for the
# set of deltas contained in the patch. The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User: stewart
# Host: kennedy.(none)
# Root: /home/stewart/Documents/MySQL/5.0/wl2347
--- 1.2/ndb/include/kernel/NodeInfo.hpp 2004-06-24 09:53:13 +10:00
+++ 1.3/ndb/include/kernel/NodeInfo.hpp 2005-07-22 20:29:14 +10:00
@@ -41,6 +41,7 @@
Uint32 m_type; ///< Node type
Uint32 m_connectCount; ///< No of times connected
bool m_connected; ///< Node is connected
+ Uint32 m_heartbeat_cnt; ///< Missed heartbeats
friend NdbOut & operator<<(NdbOut&, const NodeInfo&);
};
@@ -52,6 +53,7 @@
m_signalVersion = 0;
m_type = INVALID;
m_connectCount = 0;
+ m_heartbeat_cnt= 0;
}
inline
--- 1.1/ndb/include/transporter/TransporterCallback.hpp 2004-04-14 18:23:57 +10:00
+++ 1.2/ndb/include/transporter/TransporterCallback.hpp 2005-07-22 20:29:14 +10:00
@@ -341,5 +341,8 @@
*/
void
reportError(void * callbackObj, NodeId nodeId, TransporterError errorCode);
+
+void
+transporter_recv_from(void* callbackObj, NodeId node);
#endif
--- 1.57/ndb/src/common/transporter/TransporterRegistry.cpp 2005-07-15 22:00:02 +10:00
+++ 1.58/ndb/src/common/transporter/TransporterRegistry.cpp 2005-07-22 20:29:15 +10:00
@@ -918,6 +918,7 @@
NodeId remoteNodeId;
Uint32 * readPtr;
Uint32 sz = theOSEReceiver->getReceiveData(&remoteNodeId, &readPtr);
+ transporter_recv_from(callbackObj, remoteNodeId);
Uint32 szUsed = unpack(readPtr,
sz,
remoteNodeId,
@@ -953,6 +954,7 @@
{
Uint32 * ptr;
Uint32 sz = t->getReceiveData(&ptr);
+ transporter_recv_from(callbackObj, nodeId);
Uint32 szUsed = unpack(ptr, sz, nodeId, ioStates[nodeId]);
t->updateReceiveDataPtr(szUsed);
}
@@ -976,6 +978,7 @@
{
Uint32 * readPtr, * eodPtr;
t->getReceivePtr(&readPtr, &eodPtr);
+ transporter_recv_from(callbackObj, nodeId);
Uint32 *newPtr = unpack(readPtr, eodPtr, nodeId, ioStates[nodeId]);
t->updateReceivePtr(newPtr);
}
@@ -993,6 +996,7 @@
{
Uint32 * readPtr, * eodPtr;
t->getReceivePtr(&readPtr, &eodPtr);
+ transporter_recv_from(callbackObj, nodeId);
Uint32 *newPtr = unpack(readPtr, eodPtr, nodeId, ioStates[nodeId]);
t->updateReceivePtr(newPtr);
}
--- 1.4/ndb/src/kernel/blocks/qmgr/Qmgr.hpp 2005-07-15 03:52:40 +10:00
+++ 1.5/ndb/src/kernel/blocks/qmgr/Qmgr.hpp 2005-07-22 20:29:15 +10:00
@@ -118,8 +118,7 @@
struct NodeRec {
UintR ndynamicId;
Phase phase;
- UintR alarmCount;
-
+
QmgrState sendPrepFailReqStatus;
QmgrState sendCommitFailReqStatus;
QmgrState sendPresToStatus;
--- 1.14/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp 2005-07-15 03:52:41 +10:00
+++ 1.15/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp 2005-07-22 20:29:15 +10:00
@@ -66,7 +66,7 @@
jamEntry();
hbNodePtr.i = signal->theData[0];
ptrCheckGuard(hbNodePtr, MAX_NDB_NODES, nodeRec);
- hbNodePtr.p->alarmCount = 0;
+ setNodeInfo(hbNodePtr.i).m_heartbeat_cnt= 0;
return;
}//Qmgr::execCM_HEARTBEAT()
@@ -1040,7 +1040,7 @@
jam();
ndbrequire(addNodePtr.p->phase == ZSTARTING);
addNodePtr.p->phase = ZRUNNING;
- addNodePtr.p->alarmCount = 0;
+ setNodeInfo(addNodePtr.i).m_heartbeat_cnt= 0;
c_clusterNodes.set(addNodePtr.i);
findNeighbours(signal);
@@ -1078,7 +1078,7 @@
* NODES IN THE CLUSTER.
*/
nodePtr.p->phase = ZRUNNING;
- nodePtr.p->alarmCount = 0;
+ setNodeInfo(nodePtr.i).m_heartbeat_cnt= 0;
findNeighbours(signal);
c_clusterNodes.set(nodePtr.i);
c_start.reset();
@@ -1299,7 +1299,7 @@
*---------------------------------------------------------------------*/
fnNodePtr.i = cneighbourl;
ptrCheckGuard(fnNodePtr, MAX_NDB_NODES, nodeRec);
- fnNodePtr.p->alarmCount = 0;
+ setNodeInfo(fnNodePtr.i).m_heartbeat_cnt= 0;
}//if
}//if
@@ -1347,8 +1347,8 @@
} else {
nodePtr.p->phase = ZAPI_INACTIVE;
}
-
- nodePtr.p->alarmCount = 0;
+
+ setNodeInfo(nodePtr.i).m_heartbeat_cnt= 0;
nodePtr.p->sendPrepFailReqStatus = Q_NOT_ACTIVE;
nodePtr.p->sendCommitFailReqStatus = Q_NOT_ACTIVE;
nodePtr.p->sendPresToStatus = Q_NOT_ACTIVE;
@@ -1550,18 +1550,18 @@
}//if
ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRec);
- nodePtr.p->alarmCount ++;
+ setNodeInfo(nodePtr.i).m_heartbeat_cnt++;
ndbrequire(nodePtr.p->phase == ZRUNNING);
ndbrequire(getNodeInfo(nodePtr.i).m_type == NodeInfo::DB);
- if(nodePtr.p->alarmCount > 2){
+ if(getNodeInfo(nodePtr.i).m_heartbeat_cnt > 2){
signal->theData[0] = NDB_LE_MissedHeartbeat;
signal->theData[1] = nodePtr.i;
- signal->theData[2] = nodePtr.p->alarmCount - 1;
+ signal->theData[2] = getNodeInfo(nodePtr.i).m_heartbeat_cnt - 1;
sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
}
- if (nodePtr.p->alarmCount > 4) {
+ if (getNodeInfo(nodePtr.i).m_heartbeat_cnt > 4) {
jam();
/**----------------------------------------------------------------------
* OUR LEFT NEIGHBOUR HAVE KEPT QUIET FOR THREE CONSECUTIVE HEARTBEAT
@@ -1593,16 +1593,16 @@
if (TnodePtr.p->phase == ZAPI_ACTIVE){
jam();
- TnodePtr.p->alarmCount ++;
+ setNodeInfo(TnodePtr.i).m_heartbeat_cnt++;
- if(TnodePtr.p->alarmCount > 2){
+ if(getNodeInfo(TnodePtr.i).m_heartbeat_cnt > 2){
signal->theData[0] = NDB_LE_MissedHeartbeat;
signal->theData[1] = nodeId;
- signal->theData[2] = TnodePtr.p->alarmCount - 1;
+ signal->theData[2] = getNodeInfo(TnodePtr.i).m_heartbeat_cnt - 1;
sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
}
- if (TnodePtr.p->alarmCount > 4) {
+ if (getNodeInfo(TnodePtr.i).m_heartbeat_cnt > 4) {
jam();
/*------------------------------------------------------------------*/
/* THE API NODE HAS NOT SENT ANY HEARTBEAT FOR THREE SECONDS.
@@ -1634,16 +1634,17 @@
ptrAss(nodePtr, nodeRec);
if (nodePtr.p->phase == ZFAIL_CLOSING) {
jam();
- nodePtr.p->alarmCount = nodePtr.p->alarmCount + 1;
+ setNodeInfo(nodePtr.i).m_heartbeat_cnt++;
if (c_connectedNodes.get(nodePtr.i)){
jam();
/*-------------------------------------------------------------------*/
// We need to ensure that the connection is not restored until it has
// been disconnected for at least three seconds.
/*-------------------------------------------------------------------*/
- nodePtr.p->alarmCount = 0;
+ setNodeInfo(nodePtr.i).m_heartbeat_cnt= 0;
}//if
- if ((nodePtr.p->alarmCount > 3) && (nodePtr.p->failState == NORMAL)) {
+ if ((getNodeInfo(nodePtr.i).m_heartbeat_cnt > 3)
+ && (nodePtr.p->failState == NORMAL)) {
/**------------------------------------------------------------------
* WE HAVE DISCONNECTED THREE SECONDS AGO. WE ARE NOW READY TO
* CONNECT AGAIN AND ACCEPT NEW REGISTRATIONS FROM THIS NODE.
@@ -1659,18 +1660,18 @@
nodePtr.p->phase = ZINIT;
}//if
- nodePtr.p->alarmCount = 0;
+ setNodeInfo(nodePtr.i).m_heartbeat_cnt= 0;
signal->theData[0] = 0;
signal->theData[1] = nodePtr.i;
sendSignal(CMVMI_REF, GSN_OPEN_COMREQ, signal, 2, JBA);
} else {
- if(((nodePtr.p->alarmCount + 1) % 60) == 0){
+ if(((getNodeInfo(nodePtr.i).m_heartbeat_cnt + 1) % 60) == 0){
char buf[100];
BaseString::snprintf(buf, sizeof(buf),
"Failure handling of node %d has not completed in %d min."
" - state = %d",
nodePtr.i,
- (nodePtr.p->alarmCount + 1)/60,
+ (getNodeInfo(nodePtr.i).m_heartbeat_cnt + 1)/60,
nodePtr.p->failState);
warningEvent(buf);
}
@@ -1718,7 +1719,7 @@
* WE ONLY NEED TO SET PARAMETERS TO ENABLE A NEW CONNECTION IN A FEW
* SECONDS.
*-------------------------------------------------------------------------*/
- failedNodePtr.p->alarmCount = 0;
+ setNodeInfo(failedNodePtr.i).m_heartbeat_cnt= 0;
CloseComReqConf * const closeCom = (CloseComReqConf *)&signal->theData[0];
@@ -1871,7 +1872,7 @@
/*---------------------------------------------------------------------*/
failedNodePtr.p->failState = NORMAL;
failedNodePtr.p->phase = ZFAIL_CLOSING;
- failedNodePtr.p->alarmCount = 0;
+ setNodeInfo(failedNodePtr.i).m_heartbeat_cnt= 0;
CloseComReqConf * const closeCom =
(CloseComReqConf *)&signal->theData[0];
@@ -1965,8 +1966,8 @@
}
setNodeInfo(apiNodePtr.i).m_version = version;
-
- apiNodePtr.p->alarmCount = 0;
+
+ setNodeInfo(apiNodePtr.i).m_heartbeat_cnt= 0;
ApiRegConf * const apiRegConf = (ApiRegConf *)&signal->theData[0];
apiRegConf->qmgrRef = reference();
@@ -2484,7 +2485,7 @@
ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRec);
nodePtr.p->phase = ZFAIL_CLOSING;
nodePtr.p->failState = WAITING_FOR_NDB_FAILCONF;
- nodePtr.p->alarmCount = 0;
+ setNodeInfo(nodePtr.i).m_heartbeat_cnt= 0;
c_clusterNodes.clear(nodePtr.i);
}//for
/*----------------------------------------------------------------------*/
@@ -2742,7 +2743,7 @@
failedNodePtr.p->sendPrepFailReqStatus = Q_NOT_ACTIVE;
failedNodePtr.p->sendCommitFailReqStatus = Q_NOT_ACTIVE;
failedNodePtr.p->sendPresToStatus = Q_NOT_ACTIVE;
- failedNodePtr.p->alarmCount = 0;
+ setNodeInfo(failedNodePtr.i).m_heartbeat_cnt= 0;
if (aSendFailRep == ZTRUE) {
jam();
if (failedNodePtr.i != getOwnNodeId()) {
--- 1.7/ndb/src/kernel/vm/TransporterCallback.cpp 2005-01-12 09:25:41 +11:00
+++ 1.8/ndb/src/kernel/vm/TransporterCallback.cpp 2005-07-22 20:29:15 +10:00
@@ -33,6 +33,7 @@
#include <NdbOut.hpp>
#include "DataBuffer.hpp"
+
/**
* The instance
*/
@@ -452,3 +453,8 @@
putc('\n', output);
}
+void
+transporter_recv_from(void * callbackObj, NodeId nodeId){
+ globalData.m_nodeInfo[nodeId].m_heartbeat_cnt= 0;
+ return;
+}
--- 1.20/ndb/src/ndbapi/ClusterMgr.cpp 2005-07-15 03:52:41 +10:00
+++ 1.21/ndb/src/ndbapi/ClusterMgr.cpp 2005-07-22 20:29:15 +10:00
@@ -214,7 +214,7 @@
* It is now time to send a new Heartbeat
*/
if (theNode.hbCounter >= theNode.hbFrequency) {
- theNode.hbSent++;
+ theNode.m_info.m_heartbeat_cnt++;
theNode.hbCounter = 0;
}
@@ -231,7 +231,7 @@
theFacade.sendSignalUnCond(&signal, nodeId);
}//if
- if (theNode.hbSent == 4 && theNode.hbFrequency > 0){
+ if (theNode.m_info.m_heartbeat_cnt == 4 && theNode.hbFrequency > 0){
reportNodeFailed(i);
}//if
}
@@ -337,7 +337,7 @@
node.compatible = ndbCompatible_api_ndb(NDB_VERSION,
node.m_info.m_version);
}
-
+
node.m_state = apiRegConf->nodeState;
if (node.compatible && (node.m_state.startLevel == NodeState::SL_STARTED ||
node.m_state.startLevel == NodeState::SL_SINGLEUSER)){
@@ -345,7 +345,7 @@
} else {
set_node_alive(node, false);
}//if
- node.hbSent = 0;
+ node.m_info.m_heartbeat_cnt = 0;
node.hbCounter = 0;
if (node.m_info.m_type != NodeInfo::REP) {
node.hbFrequency = (apiRegConf->apiHeartbeatFrequency * 10) - 50;
@@ -414,7 +414,7 @@
Node & theNode = theNodes[nodeId];
theNode.connected = true;
- theNode.hbSent = 0;
+ theNode.m_info.m_heartbeat_cnt = 0;
theNode.hbCounter = 0;
/**
--- 1.5/ndb/src/ndbapi/ClusterMgr.hpp 2005-05-05 02:40:52 +10:00
+++ 1.6/ndb/src/ndbapi/ClusterMgr.hpp 2005-07-22 20:29:15 +10:00
@@ -73,12 +73,12 @@
*/
Uint32 hbFrequency; // Heartbeat frequence
Uint32 hbCounter; // # milliseconds passed since last hb sent
- Uint32 hbSent; // # heartbeats sent (without answer)
};
const Node & getNodeInfo(NodeId) const;
Uint32 getNoOfConnectedNodes() const;
-
+ void hb_received(NodeId);
+
private:
Uint32 noOfAliveNodes;
Uint32 noOfConnectedNodes;
@@ -126,6 +126,12 @@
Uint32
ClusterMgr::getNoOfConnectedNodes() const {
return noOfConnectedNodes;
+}
+
+inline
+void
+ClusterMgr::hb_received(NodeId nodeId) {
+ theNodes[nodeId].m_info.m_heartbeat_cnt= 0;
}
/*****************************************************************************/
--- 1.37/ndb/src/ndbapi/TransporterFacade.cpp 2005-07-15 09:41:53 +10:00
+++ 1.38/ndb/src/ndbapi/TransporterFacade.cpp 2005-07-22 20:29:16 +10:00
@@ -126,6 +126,10 @@
//TransporterFacade::instance()->reportDisconnected(nodeId);
}
+void
+transporter_recv_from(void * callbackObj, NodeId nodeId){
+ ((TransporterFacade*)(callbackObj))->hb_received(nodeId);
+}
/****************************************************************************
*
--- 1.23/ndb/src/ndbapi/TransporterFacade.hpp 2005-02-17 06:37:01 +11:00
+++ 1.24/ndb/src/ndbapi/TransporterFacade.hpp 2005-07-22 20:29:16 +10:00
@@ -114,6 +114,9 @@
TransporterRegistry* get_registry() { return theTransporterRegistry;};
+ // heart beat received from a node (e.g. a signal came)
+ void hb_received(NodeId n);
+
private:
/**
* Send a signal unconditional of node status (used by ClusterMgr)
@@ -293,6 +296,12 @@
const ClusterMgr::Node & node = theClusterMgr->getNodeInfo(n);
return node.m_alive;
+}
+
+inline
+void
+TransporterFacade::hb_received(NodeId n) {
+ theClusterMgr->hb_received(n);
}
inline
| Thread | Author | Date |
|---|---|---|
| • bk commit into 5.0 tree (stewart:1.1882) | Stewart Smith | 22 Jul |