#At file:///home/frazer/bzr/mysql-5.1-telco-6.3/ based on revid:frazer@stripped
3363 Frazer Clement 2010-12-13
Bug#58904 Ndb : FAIL_REP signal does not include source node id
Source node id should be included to aid debugging and enable more
intelligent failure report handling.
modified:
storage/ndb/include/kernel/signaldata/FailRep.hpp
storage/ndb/include/ndb_version.h.in
storage/ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp
storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp
storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
=== modified file 'storage/ndb/include/kernel/signaldata/FailRep.hpp'
--- a/storage/ndb/include/kernel/signaldata/FailRep.hpp 2009-05-26 18:53:34 +0000
+++ b/storage/ndb/include/kernel/signaldata/FailRep.hpp 2010-12-13 15:24:10 +0000
@@ -38,8 +38,10 @@ class FailRep {
friend bool printFAIL_REP(FILE *, const Uint32 *, Uint32, Uint16);
public:
- STATIC_CONST( SignalLength = 2 );
- STATIC_CONST( ExtraLength = 1 + NdbNodeBitmask::Size );
+ STATIC_CONST( OrigSignalLength = 2 );
+ STATIC_CONST( PartitionedExtraLength = 1 + NdbNodeBitmask::Size );
+ STATIC_CONST( SourceExtraLength = 1 );
+ STATIC_CONST( SignalLength = OrigSignalLength + SourceExtraLength );
enum FailCause {
ZOWN_FAILURE=0,
@@ -52,7 +54,27 @@ public:
ZMULTI_NODE_SHUTDOWN = 7,
ZPARTITIONED_CLUSTER = 8
};
-
+
+ Uint32 getFailSourceNodeId(Uint32 sigLen) const
+ {
+   /* Return the id of the node that originated this failure report, or 0
+    * when a signal of sigLen words carries none (e.g. sent by an old node).
+    * With partition info present the id is partitionFailSourceNodeId;
+    * in a signal of exactly SignalLength words it is failSourceNodeId for
+    * any failCause, as ZPARTITIONED_CLUSTER reports can be forwarded short.
+    */
+   if (failCause == ZPARTITIONED_CLUSTER &&
+       sigLen == (SignalLength + PartitionedExtraLength))
+   {
+     return partitionFailSourceNodeId;
+   }
+
+   if (sigLen == SignalLength)
+     return failSourceNodeId;
+
+   return 0;
+ }
+
private:
Uint32 failNodeId;
@@ -60,8 +82,15 @@ private:
/**
* Used when failCause == ZPARTITIONED_CLUSTER
*/
- Uint32 president;
- Uint32 partition[NdbNodeBitmask::Size];
+ union {
+ struct
+ {
+ Uint32 president;
+ Uint32 partition[NdbNodeBitmask::Size];
+ Uint32 partitionFailSourceNodeId;
+ };
+ Uint32 failSourceNodeId;
+ };
};
=== modified file 'storage/ndb/include/ndb_version.h.in'
--- a/storage/ndb/include/ndb_version.h.in 2010-10-19 18:26:17 +0000
+++ b/storage/ndb/include/ndb_version.h.in 2010-12-13 15:24:10 +0000
@@ -386,4 +386,29 @@ ndbd_dih_sub_gcp_complete_ack(Uint32 x)
}
}
+#define NDBD_FAIL_REP_SOURCE_NODE_63 NDB_MAKE_VERSION(6,3,40)
+#define NDBD_FAIL_REP_SOURCE_NODE_70 NDB_MAKE_VERSION(7,0,21)
+#define NDBD_FAIL_REP_SOURCE_NODE_71 NDB_MAKE_VERSION(7,1,10)
+
+/* Tell whether a node running NDB version x accepts the source node id
+ * word that may be appended to FAIL_REP.  The first version with support
+ * differs per release series, so pick the threshold by major/minor. */
+static inline int
+ndbd_fail_rep_source_node(Uint32 x)
+{
+  const Uint32 major = (x >> 16) & 0xFF;
+  const Uint32 minor = (x >> 8) & 0xFF;
+  Uint32 firstVersion;
+
+  if (major == 6)
+    firstVersion = NDBD_FAIL_REP_SOURCE_NODE_63;
+  else if (major == 7 && minor == 0)
+    firstVersion = NDBD_FAIL_REP_SOURCE_NODE_70;
+  else
+    firstVersion = NDBD_FAIL_REP_SOURCE_NODE_71;
+
+  /* Non-zero (supported) from the series' first supporting version on */
+  return x >= firstVersion;
+}
+
#endif
=== modified file 'storage/ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp'
--- a/storage/ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp 2010-08-26 09:00:51 +0000
+++ b/storage/ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp 2010-12-13 15:24:10 +0000
@@ -3251,6 +3251,7 @@ Ndbcntr::execSTOP_CONF(Signal* signal)
*/
FailRep * const failRep = (FailRep *)&signal->theData[0];
failRep->failCause = FailRep::ZMULTI_NODE_SHUTDOWN;
+ failRep->failSourceNodeId = getOwnNodeId();
NodeReceiverGroup rg(QMGR, c_clusterNodes);
Uint32 nodeId = 0;
while ((nodeId = NdbNodeBitmask::find(c_stopRec.stopReq.nodes, nodeId+1))
=== modified file 'storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp'
--- a/storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp 2010-09-06 08:14:08 +0000
+++ b/storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp 2010-12-13 15:24:10 +0000
@@ -312,7 +312,8 @@ private:
void failReport(Signal* signal,
Uint16 aFailedNode,
UintR aSendFailRep,
- FailRep::FailCause failCause);
+ FailRep::FailCause failCause,
+ Uint16 sourceNode);
void findNeighbours(Signal* signal, Uint32 from);
Uint16 translateDynamicIdToNodeId(Signal* signal, UintR TdynamicId);
@@ -341,7 +342,8 @@ private:
void commitFailReqLab(Signal* signal);
void commitFailConfLab(Signal* signal);
void failReportLab(Signal* signal, Uint16 aFailedNode,
- FailRep::FailCause aFailCause);
+ FailRep::FailCause aFailCause,
+ Uint16 sourceNode);
void sendCommitFailReq(Signal* signal);
void presToConfLab(Signal* signal);
void sendSttorryLab(Signal* signal);
=== modified file 'storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp'
--- a/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp 2010-12-13 14:48:26 +0000
+++ b/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp 2010-12-13 15:24:10 +0000
@@ -135,7 +135,7 @@ void Qmgr::execCONTINUEB(Signal* signal)
return;
}//if
//regreqMasterTimeLimitLab(signal);
- failReportLab(signal, c_start.m_startNode, FailRep::ZSTART_IN_REGREQ);
+ failReportLab(signal, c_start.m_startNode, FailRep::ZSTART_IN_REGREQ, getOwnNodeId());
return;
break;
case ZTIMER_HANDLING:
@@ -201,9 +201,15 @@ void Qmgr::execFAIL_REP(Signal* signal)
const FailRep * const failRep = (FailRep *)&signal->theData[0];
const NodeId failNodeId = failRep->failNodeId;
const FailRep::FailCause failCause = (FailRep::FailCause)failRep->failCause;
-
+ Uint32 failSource = failRep->getFailSourceNodeId(signal->length());
+ if (!failSource)
+ {
+ /* Failure source not included, use sender of signal as 'source' */
+ failSource = refToNode(signal->getSendersBlockRef());
+ }
+
jamEntry();
- failReportLab(signal, failNodeId, failCause);
+ failReportLab(signal, failNodeId, failCause, failSource);
return;
}//Qmgr::execFAIL_REP()
@@ -1123,17 +1129,27 @@ retry:
rep->failCause = FailRep::ZPARTITIONED_CLUSTER;
rep->president = cpresident;
c_clusterNodes.copyto(NdbNodeBitmask::Size, rep->partition);
+ rep->partitionFailSourceNodeId = getOwnNodeId();
Uint32 ref = calcQmgrBlockRef(nodeId);
Uint32 i = 0;
+ /* Send source of event info if a node supports it */
+ Uint32 length = FailRep::OrigSignalLength + FailRep::PartitionedExtraLength;
while((i = part.find(i + 1)) != NdbNodeBitmask::NotFound)
{
if (i == nodeId)
continue;
rep->failNodeId = i;
- sendSignal(ref, GSN_FAIL_REP, signal, FailRep::SignalLength, JBA);
+ bool sendSourceId = ndbd_fail_rep_source_node((getNodeInfo(i)).m_version);
+ sendSignal(ref, GSN_FAIL_REP, signal,
+ length + (sendSourceId ? FailRep::SourceExtraLength : 0),
+ JBA);
}
rep->failNodeId = nodeId;
- sendSignal(ref, GSN_FAIL_REP, signal, FailRep::SignalLength, JBB);
+ bool sendSourceId = ndbd_fail_rep_source_node((getNodeInfo(nodeId)).m_version);
+
+ sendSignal(ref, GSN_FAIL_REP, signal,
+ length + (sendSourceId ? FailRep::SourceExtraLength : 0),
+ JBB);
return;
}
@@ -2549,7 +2565,7 @@ void Qmgr::checkHeartbeat(Signal* signal
signal->theData[1] = nodePtr.i;
sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);
- failReportLab(signal, nodePtr.i, FailRep::ZHEARTBEAT_FAILURE);
+ failReportLab(signal, nodePtr.i, FailRep::ZHEARTBEAT_FAILURE, getOwnNodeId());
return;
}//if
}//Qmgr::checkHeartbeat()
@@ -2972,7 +2988,7 @@ void Qmgr::node_failed(Signal* signal, U
switch(failedNodePtr.p->phase){
case ZRUNNING:
jam();
- failReportLab(signal, aFailedNode, FailRep::ZLINK_FAILURE);
+ failReportLab(signal, aFailedNode, FailRep::ZLINK_FAILURE, getOwnNodeId());
return;
case ZFAIL_CLOSING:
jam();
@@ -2983,7 +2999,7 @@ void Qmgr::node_failed(Signal* signal, U
* Force "real" failure handling
*/
failedNodePtr.p->phase = ZRUNNING;
- failReportLab(signal, aFailedNode, FailRep::ZLINK_FAILURE);
+ failReportLab(signal, aFailedNode, FailRep::ZLINK_FAILURE, getOwnNodeId());
return;
// Fall-through
default:
@@ -3387,7 +3403,8 @@ Qmgr::sendApiRegRef(Signal* signal, Uint
* OF A FAILED PRESIDENT THEN WE WILL TAKE FURTHER ACTION.
*---------------------------------------------------------------------------*/
void Qmgr::failReportLab(Signal* signal, Uint16 aFailedNode,
- FailRep::FailCause aFailCause)
+ FailRep::FailCause aFailCause,
+ Uint16 sourceNode)
{
NodeRecPtr nodePtr;
NodeRecPtr failedNodePtr;
@@ -3435,8 +3452,9 @@ void Qmgr::failReportLab(Signal* signal,
code = NDBD_EXIT_PARTITIONED_SHUTDOWN;
char buf1[100], buf2[100];
c_clusterNodes.getText(buf1);
- if (signal->getLength()== FailRep::SignalLength + FailRep::ExtraLength &&
- signal->header.theVerId_signalNumber == GSN_FAIL_REP)
+ if (((signal->getLength()== FailRep::OrigSignalLength + FailRep::PartitionedExtraLength) ||
+ (signal->getLength()== FailRep::SignalLength + FailRep::PartitionedExtraLength)) &&
+ signal->header.theVerId_signalNumber == GSN_FAIL_REP)
{
jam();
NdbNodeBitmask part;
@@ -3466,8 +3484,9 @@ void Qmgr::failReportLab(Signal* signal,
char buf[255];
BaseString::snprintf(buf, sizeof(buf),
- "We(%u) have been declared dead by %u reason: %s(%u)",
+ "We(%u) have been declared dead by %u (via %u) reason: %s(%u)",
getOwnNodeId(),
+ sourceNode,
refToNode(signal->getSendersBlockRef()),
msg ? msg : "<Unknown>",
aFailCause);
@@ -3495,7 +3514,7 @@ void Qmgr::failReportLab(Signal* signal,
}
TnoFailedNodes = cnoFailedNodes;
- failReport(signal, failedNodePtr.i, (UintR)ZTRUE, aFailCause);
+ failReport(signal, failedNodePtr.i, (UintR)ZTRUE, aFailCause, sourceNode);
if (cpresident == getOwnNodeId()) {
jam();
if (ctoStatus == Q_NOT_ACTIVE) {
@@ -3599,7 +3618,8 @@ void Qmgr::execPREP_FAILREQ(Signal* sign
failReport(signal,
cprepFailedNodes[Tindex],
(UintR)ZFALSE,
- FailRep::ZIN_PREP_FAIL_REQ);
+ FailRep::ZIN_PREP_FAIL_REQ,
+ 0); /* Source node not required (or known) here */
}//for
sendCloseComReq(signal, Tblockref, TfailureNr);
cnoCommitFailedNodes = 0;
@@ -4262,7 +4282,7 @@ void Qmgr::systemErrorBecauseOtherNodeFa
jam();
// Broadcast that this node is failing to other nodes
- failReport(signal, getOwnNodeId(), (UintR)ZTRUE, FailRep::ZOWN_FAILURE);
+ failReport(signal, getOwnNodeId(), (UintR)ZTRUE, FailRep::ZOWN_FAILURE, getOwnNodeId());
char buf[100];
BaseString::snprintf(buf, 100,
@@ -4277,7 +4297,7 @@ void Qmgr::systemErrorLab(Signal* signal
{
jam();
// Broadcast that this node is failing to other nodes
- failReport(signal, getOwnNodeId(), (UintR)ZTRUE, FailRep::ZOWN_FAILURE);
+ failReport(signal, getOwnNodeId(), (UintR)ZTRUE, FailRep::ZOWN_FAILURE, getOwnNodeId());
// If it's known why shutdown occured
// an error message has been passed to this function
@@ -4294,7 +4314,8 @@ void Qmgr::systemErrorLab(Signal* signal
void Qmgr::failReport(Signal* signal,
Uint16 aFailedNode,
UintR aSendFailRep,
- FailRep::FailCause aFailCause)
+ FailRep::FailCause aFailCause,
+ Uint16 sourceNode)
{
UintR tfrMinDynamicId;
NodeRecPtr failedNodePtr;
@@ -4302,6 +4323,8 @@ void Qmgr::failReport(Signal* signal,
NodeRecPtr presidentNodePtr;
+ ndbassert((! aSendFailRep) || (sourceNode != 0));
+
failedNodePtr.i = aFailedNode;
ptrCheckGuard(failedNodePtr, MAX_NDB_NODES, nodeRec);
if (failedNodePtr.p->phase == ZRUNNING) {
@@ -4333,6 +4356,7 @@ void Qmgr::failReport(Signal* signal,
FailRep * const failRep = (FailRep *)&signal->theData[0];
failRep->failNodeId = failedNodePtr.i;
failRep->failCause = aFailCause;
+ failRep->failSourceNodeId = sourceNode;
sendSignal(failedNodePtr.p->blockRef, GSN_FAIL_REP, signal,
FailRep::SignalLength, JBA);
}//if
@@ -4344,6 +4368,7 @@ void Qmgr::failReport(Signal* signal,
FailRep * const failRep = (FailRep *)&signal->theData[0];
failRep->failNodeId = failedNodePtr.i;
failRep->failCause = aFailCause;
+ failRep->failSourceNodeId = sourceNode;
sendSignal(nodePtr.p->blockRef, GSN_FAIL_REP, signal,
FailRep::SignalLength, JBA);
}//if
Attachment: [text/bzr-bundle] bzr/frazer@mysql.com-20101213152410-ozb61hffqx17fkex.bundle
| Thread |
|---|
| • bzr commit into mysql-5.1-telco-6.3 branch (frazer:3363) Bug#58904 | Frazer Clement | 13 Dec |