Below is the list of changes that have just been committed into a local
5.1 repository of jonas. When jonas does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html
ChangeSet@stripped, 2007-05-30 22:01:18+02:00, jonas@stripped +4 -0
ndb - bug#28445
start heartbeat (hb) checking already on connect, not on the first received heartbeat
storage/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp@stripped, 2007-05-30 22:01:17+02:00,
jonas@stripped +20 -28
make sure qmgr is "fully" informed about connects and disconnects so that it
can handle heartbeats (hb) correctly
storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp@stripped, 2007-05-30 22:01:17+02:00,
jonas@stripped +3 -0
move api failure handling into own method
add START_ORD so that hb checking can start really early
storage/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp@stripped, 2007-05-30 22:01:17+02:00,
jonas@stripped +1 -0
move api failure handling into own method
add START_ORD so that hb checking can start really early
storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp@stripped, 2007-05-30 22:01:17+02:00,
jonas@stripped +131 -111
start heartbeat (hb) handling directly on CONNECT_REP
(instead of on the first received heartbeat)
# This is a BitKeeper patch. What follows are the unified diffs for the
# set of deltas contained in the patch. The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User: jonas
# Host: perch.ndb.mysql.com
# Root: /home/jonas/src/drop5
--- 1.32/storage/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp 2007-05-30 22:01:22 +02:00
+++ 1.33/storage/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp 2007-05-30 22:01:22 +02:00
@@ -398,9 +398,10 @@
// Uint32 noOfNodes = closeCom->noOfNodes;
jamEntry();
- for (unsigned i = 0; i < MAX_NODES; i++){
- if(NodeBitmask::get(closeCom->theNodes, i)){
-
+ for (unsigned i = 0; i < MAX_NODES; i++)
+ {
+ if(NodeBitmask::get(closeCom->theNodes, i))
+ {
jam();
//-----------------------------------------------------
@@ -414,7 +415,9 @@
globalTransporterRegistry.do_disconnect(i);
}
}
- if (failNo != 0) {
+
+ if (failNo != 0)
+ {
jam();
signal->theData[0] = userRef;
signal->theData[1] = failNo;
@@ -433,8 +436,8 @@
jamEntry();
const Uint32 len = signal->getLength();
- if(len == 2){
-
+ if(len == 2)
+ {
#ifdef ERROR_INSERT
if (! ((ERROR_INSERTED(9000) || ERROR_INSERTED(9002))
&& c_error_9000_nodes_mask.get(tStartingNode)))
@@ -452,9 +455,11 @@
//-----------------------------------------------------
}
} else {
- for(unsigned int i = 1; i < MAX_NODES; i++ ) {
+ for(unsigned int i = 1; i < MAX_NODES; i++ )
+ {
jam();
- if (i != getOwnNodeId() && getNodeInfo(i).m_type == tData2){
+ if (i != getOwnNodeId() && getNodeInfo(i).m_type == tData2)
+ {
jam();
#ifdef ERROR_INSERT
@@ -513,24 +518,10 @@
setNodeInfo(hostId).m_connectCount++;
const NodeInfo::NodeType type = getNodeInfo(hostId).getType();
ndbrequire(type != NodeInfo::INVALID);
-
- if(type == NodeInfo::DB || globalData.theStartLevel == NodeState::SL_STARTED){
- jam();
- DisconnectRep * const rep = (DisconnectRep *)&signal->theData[0];
- rep->nodeId = hostId;
- rep->err = errNo;
- sendSignal(QMGR_REF, GSN_DISCONNECT_REP, signal,
- DisconnectRep::SignalLength, JBA);
- } else if((globalData.theStartLevel == NodeState::SL_CMVMI ||
- globalData.theStartLevel == NodeState::SL_STARTING)
- && type == NodeInfo::MGM) {
- /**
- * Someone disconnected during cmvmi period
- */
- jam();
- globalTransporterRegistry.do_connect(hostId);
- }
+ sendSignal(QMGR_REF, GSN_DISCONNECT_REP, signal,
+ DisconnectRep::SignalLength, JBA);
+
cancelSubscription(hostId);
signal->theData[0] = NDB_LE_Disconnected;
@@ -564,6 +555,8 @@
*/
if(type == NodeInfo::MGM){
jam();
+ signal->theData[0] = hostId;
+ sendSignal(QMGR_REF, GSN_CONNECT_REP, signal, 1, JBA);
} else {
/**
* Dont allow api nodes to connect
@@ -795,6 +788,8 @@
}
}
}
+
+ EXECUTE_DIRECT(QMGR, GSN_START_ORD, signal, 1);
return ;
}
@@ -822,9 +817,6 @@
*
* Do Restart
*/
-
- globalScheduler.clear();
- globalTimeQueue.clear();
// Disconnect all nodes as part of the system restart.
// We need to ensure that we are starting up
--- 1.17/storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp 2007-05-30 22:01:22 +02:00
+++ 1.18/storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp 2007-05-30 22:01:22 +02:00
@@ -265,6 +265,8 @@
void execALLOC_NODEID_CONF(Signal *);
void execALLOC_NODEID_REF(Signal *);
void completeAllocNodeIdReq(Signal *);
+
+ void execSTART_ORD(Signal*);
// Arbitration signals
void execARBIT_CFG(Signal* signal);
@@ -281,6 +283,7 @@
void check_readnodes_reply(Signal* signal, Uint32 nodeId, Uint32 gsn);
Uint32 check_startup(Signal* signal);
+ void api_failed(Signal* signal, Uint32 aFailedNode);
void node_failed(Signal* signal, Uint16 aFailedNode);
void checkStartInterface(Signal* signal);
void failReport(Signal* signal,
--- 1.12/storage/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp 2007-05-30 22:01:22 +02:00
+++ 1.13/storage/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp 2007-05-30 22:01:22 +02:00
@@ -115,6 +115,7 @@
addRecSignal(GSN_DIH_RESTARTREF, &Qmgr::execDIH_RESTARTREF);
addRecSignal(GSN_DIH_RESTARTCONF, &Qmgr::execDIH_RESTARTCONF);
addRecSignal(GSN_NODE_VERSION_REP, &Qmgr::execNODE_VERSION_REP);
+ addRecSignal(GSN_START_ORD, &Qmgr::execSTART_ORD);
initData();
}//Qmgr::Qmgr()
--- 1.40/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp 2007-05-30 22:01:22 +02:00
+++ 1.41/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp 2007-05-30 22:01:22 +02:00
@@ -238,6 +238,16 @@
ReadConfigConf::SignalLength, JBB);
}
+void
+Qmgr::execSTART_ORD(Signal* signal)
+{
+ /**
+ * Start timer handling
+ */
+ signal->theData[0] = ZTIMER_HANDLING;
+ sendSignal(QMGR_REF, GSN_CONTINUEB, signal, 1, JBB);
+}
+
/*
4.2 ADD NODE MODULE*/
/*##########################################################################*/
@@ -1165,12 +1175,6 @@
{
jam();
electionWon(signal);
-
- /**
- * Start timer handling
- */
- signal->theData[0] = ZTIMER_HANDLING;
- sendSignal(QMGR_REF, GSN_CONTINUEB, signal, 10, JBB);
}
return;
@@ -1809,12 +1813,6 @@
sendSttorryLab(signal);
- /**
- * Start timer handling
- */
- signal->theData[0] = ZTIMER_HANDLING;
- sendSignal(QMGR_REF, GSN_CONTINUEB, signal, 10, JBB);
-
sendCmAckAdd(signal, getOwnNodeId(), CmAdd::CommitNew);
}
@@ -2191,20 +2189,22 @@
hb_check_timer.reset();
}
}
-
+
if (interface_check_timer.check(TcurrentTime)) {
jam();
interface_check_timer.reset();
checkStartInterface(signal);
}
+ if (hb_api_timer.check(TcurrentTime))
+ {
+ jam();
+ hb_api_timer.reset();
+ apiHbHandlingLab(signal);
+ }
+
if (cactivateApiCheck != 0) {
jam();
- if (hb_api_timer.check(TcurrentTime)) {
- jam();
- hb_api_timer.reset();
- apiHbHandlingLab(signal);
- }//if
if (clatestTransactionCheck == 0) {
//-------------------------------------------------------------
// Initialise the Transaction check timer.
@@ -2321,18 +2321,21 @@
if(type == NodeInfo::INVALID)
continue;
- if (TnodePtr.p->phase == ZAPI_ACTIVE){
+ if (c_connectedNodes.get(nodeId))
+ {
jam();
setNodeInfo(TnodePtr.i).m_heartbeat_cnt++;
- if(getNodeInfo(TnodePtr.i).m_heartbeat_cnt > 2){
+ if(getNodeInfo(TnodePtr.i).m_heartbeat_cnt > 2)
+ {
signal->theData[0] = NDB_LE_MissedHeartbeat;
signal->theData[1] = nodeId;
signal->theData[2] = getNodeInfo(TnodePtr.i).m_heartbeat_cnt - 1;
sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
}
- if (getNodeInfo(TnodePtr.i).m_heartbeat_cnt > 4) {
+ if (getNodeInfo(TnodePtr.i).m_heartbeat_cnt > 4)
+ {
jam();
/*------------------------------------------------------------------*/
/* THE API NODE HAS NOT SENT ANY HEARTBEAT FOR THREE SECONDS.
@@ -2344,8 +2347,8 @@
signal->theData[0] = NDB_LE_DeadDueToHeartbeat;
signal->theData[1] = nodeId;
sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);
-
- node_failed(signal, nodeId);
+
+ api_failed(signal, nodeId);
}//if
}//if
}//for
@@ -2435,26 +2438,6 @@
sendSignal(DBTC_REF, GSN_API_FAILREQ, signal, 2, JBA);
sendSignal(DBDICT_REF, GSN_API_FAILREQ, signal, 2, JBA);
sendSignal(SUMA_REF, GSN_API_FAILREQ, signal, 2, JBA);
-
- /**-------------------------------------------------------------------------
- * THE OTHER NODE WAS AN API NODE. THE COMMUNICATION LINK IS ALREADY
- * BROKEN AND THUS NO ACTION IS NEEDED TO BREAK THE CONNECTION.
- * WE ONLY NEED TO SET PARAMETERS TO ENABLE A NEW CONNECTION IN A FEW
- * SECONDS.
- *-------------------------------------------------------------------------*/
- setNodeInfo(failedNodePtr.i).m_heartbeat_cnt= 0;
- setNodeInfo(failedNodePtr.i).m_version = 0;
- recompute_version_info(getNodeInfo(failedNodePtr.i).m_type);
-
- CloseComReqConf * const closeCom = (CloseComReqConf *)&signal->theData[0];
-
- closeCom->xxxBlockRef = reference();
- closeCom->failNo = 0;
- closeCom->noOfNodes = 1;
- NodeBitmask::clear(closeCom->theNodes);
- NodeBitmask::set(closeCom->theNodes, failedNodePtr.i);
- sendSignal(CMVMI_REF, GSN_CLOSE_COMREQ, signal,
- CloseComReqConf::SignalLength, JBA);
}//Qmgr::sendApiFailReq()
void Qmgr::execAPI_FAILREQ(Signal* signal)
@@ -2467,20 +2450,7 @@
ndbrequire(getNodeInfo(failedNodePtr.i).getType() != NodeInfo::DB);
- // ignore if api not active
- if (failedNodePtr.p->phase != ZAPI_ACTIVE)
- {
- jam();
- // But send to SUMA anyway...
- sendSignal(SUMA_REF, GSN_API_FAILREQ, signal, 2, JBA);
- return;
- }
-
- signal->theData[0] = NDB_LE_Disconnected;
- signal->theData[1] = failedNodePtr.i;
- sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);
-
- node_failed(signal, failedNodePtr.i);
+ api_failed(signal, signal->theData[0]);
}
void Qmgr::execAPI_FAILCONF(Signal* signal)
@@ -2604,6 +2574,13 @@
ndbrequire(false);
}
+ if (getNodeInfo(nodeId).getType() != NodeInfo::DB)
+ {
+ jam();
+ api_failed(signal, nodeId);
+ return;
+ }
+
switch(nodePtr.p->phase){
case ZRUNNING:
jam();
@@ -2640,66 +2617,109 @@
failedNodePtr.i = aFailedNode;
ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
- if (getNodeInfo(failedNodePtr.i).getType() == NodeInfo::DB){
+ ndbrequire(getNodeInfo(failedNodePtr.i).getType() == NodeInfo::DB);
+
+ /**---------------------------------------------------------------------
+ * THE OTHER NODE IS AN NDB NODE, WE HANDLE IT AS IF A HEARTBEAT
+ * FAILURE WAS DISCOVERED.
+ *---------------------------------------------------------------------*/
+ switch(failedNodePtr.p->phase){
+ case ZRUNNING:
jam();
- /**---------------------------------------------------------------------
- * THE OTHER NODE IS AN NDB NODE, WE HANDLE IT AS IF A HEARTBEAT
- * FAILURE WAS DISCOVERED.
- *---------------------------------------------------------------------*/
- switch(failedNodePtr.p->phase){
- case ZRUNNING:
- jam();
- failReportLab(signal, aFailedNode, FailRep::ZLINK_FAILURE);
- return;
- case ZFAIL_CLOSING:
- jam();
- return;
- case ZSTARTING:
- c_start.reset();
- // Fall-through
- default:
- jam();
- /*---------------------------------------------------------------------*/
- // The other node is still not in the cluster but disconnected.
- // We must restart communication in three seconds.
- /*---------------------------------------------------------------------*/
- failedNodePtr.p->failState = NORMAL;
- failedNodePtr.p->phase = ZFAIL_CLOSING;
- setNodeInfo(failedNodePtr.i).m_heartbeat_cnt= 0;
-
- CloseComReqConf * const closeCom =
- (CloseComReqConf *)&signal->theData[0];
-
- closeCom->xxxBlockRef = reference();
- closeCom->failNo = 0;
- closeCom->noOfNodes = 1;
- NodeBitmask::clear(closeCom->theNodes);
- NodeBitmask::set(closeCom->theNodes, failedNodePtr.i);
- sendSignal(CMVMI_REF, GSN_CLOSE_COMREQ, signal,
- CloseComReqConf::SignalLength, JBA);
- }//if
+ failReportLab(signal, aFailedNode, FailRep::ZLINK_FAILURE);
return;
- }
-
- /**
- * API code
- */
- jam();
- if (failedNodePtr.p->phase != ZFAIL_CLOSING){
+ case ZFAIL_CLOSING:
jam();
- //-------------------------------------------------------------------------
- // The API was active and has now failed. We need to initiate API failure
- // handling. If the API had already failed then we can ignore this
- // discovery.
- //-------------------------------------------------------------------------
+ return;
+ case ZSTARTING:
+ c_start.reset();
+ // Fall-through
+ default:
+ jam();
+ /*---------------------------------------------------------------------*/
+ // The other node is still not in the cluster but disconnected.
+ // We must restart communication in three seconds.
+ /*---------------------------------------------------------------------*/
+ failedNodePtr.p->failState = NORMAL;
failedNodePtr.p->phase = ZFAIL_CLOSING;
-
- sendApiFailReq(signal, aFailedNode);
- arbitRec.code = ArbitCode::ApiFail;
- handleArbitApiFail(signal, aFailedNode);
+ setNodeInfo(failedNodePtr.i).m_heartbeat_cnt= 0;
+
+ CloseComReqConf * const closeCom =
+ (CloseComReqConf *)&signal->theData[0];
+
+ closeCom->xxxBlockRef = reference();
+ closeCom->failNo = 0;
+ closeCom->noOfNodes = 1;
+ NodeBitmask::clear(closeCom->theNodes);
+ NodeBitmask::set(closeCom->theNodes, failedNodePtr.i);
+ sendSignal(CMVMI_REF, GSN_CLOSE_COMREQ, signal,
+ CloseComReqConf::SignalLength, JBA);
}//if
return;
-}//Qmgr::node_failed()
+}
+
+void
+Qmgr::api_failed(Signal* signal, Uint32 nodeId)
+{
+ NodeRecPtr failedNodePtr;
+ /**------------------------------------------------------------------------
+ * A COMMUNICATION LINK HAS BEEN DISCONNECTED. WE MUST TAKE SOME ACTION
+ * DUE TO THIS.
+ *-----------------------------------------------------------------------*/
+ failedNodePtr.i = nodeId;
+ ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
+
+ if (failedNodePtr.p->phase == ZFAIL_CLOSING)
+ {
+ /**
+ * Failure handling already in progress
+ */
+ jam();
+ return;
+ }
+
+ if (failedNodePtr.p->phase == ZAPI_ACTIVE)
+ {
+ jam();
+ sendApiFailReq(signal, nodeId);
+ arbitRec.code = ArbitCode::ApiFail;
+ handleArbitApiFail(signal, nodeId);
+ }
+ else
+ {
+ /**
+ * Always inform SUMA
+ */
+ jam();
+ signal->theData[0] = nodeId;
+ signal->theData[1] = QMGR_REF;
+ sendSignal(SUMA_REF, GSN_API_FAILREQ, signal, 2, JBA);
+ failedNodePtr.p->failState = NORMAL;
+ }
+
+ failedNodePtr.p->phase = ZFAIL_CLOSING;
+ setNodeInfo(failedNodePtr.i).m_heartbeat_cnt= 0;
+ setNodeInfo(failedNodePtr.i).m_version = 0;
+ recompute_version_info(getNodeInfo(failedNodePtr.i).m_type);
+
+ CloseComReqConf * const closeCom = (CloseComReqConf *)&signal->theData[0];
+ closeCom->xxxBlockRef = reference();
+ closeCom->failNo = 0;
+ closeCom->noOfNodes = 1;
+ NodeBitmask::clear(closeCom->theNodes);
+ NodeBitmask::set(closeCom->theNodes, failedNodePtr.i);
+ sendSignal(CMVMI_REF, GSN_CLOSE_COMREQ, signal,
+ CloseComReqConf::SignalLength, JBA);
+
+ if (getNodeInfo(failedNodePtr.i).getType() == NodeInfo::MGM)
+ {
+ /**
+ * Allow MGM do reconnect "directly"
+ */
+ jam();
+ setNodeInfo(failedNodePtr.i).m_heartbeat_cnt = 3;
+ }
+}
/**--------------------------------------------------------------------------
* AN API NODE IS REGISTERING. IF FOR THE FIRST TIME WE WILL ENABLE
| Thread |
|---|
| • bk commit into 5.1 tree (jonas:1.2150) BUG#28445 | jonas | 30 May |