List: Commits « Previous Message | Next Message »
From: Frazer Clement  Date: December 13 2010 3:24pm
Subject: bzr commit into mysql-5.1-telco-6.3 branch (frazer:3363) Bug#58904
View as plain text  
#At file:///home/frazer/bzr/mysql-5.1-telco-6.3/ based on revid:frazer@stripped

 3363 Frazer Clement	2010-12-13
      Bug#58904 Ndb : FAIL_REP signal does not include source node id
      
      Source node id should be included to aid debugging and enable more 
      intelligent failure report handling.

    modified:
      storage/ndb/include/kernel/signaldata/FailRep.hpp
      storage/ndb/include/ndb_version.h.in
      storage/ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp
      storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp
      storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
=== modified file 'storage/ndb/include/kernel/signaldata/FailRep.hpp'
--- a/storage/ndb/include/kernel/signaldata/FailRep.hpp	2009-05-26 18:53:34 +0000
+++ b/storage/ndb/include/kernel/signaldata/FailRep.hpp	2010-12-13 15:24:10 +0000
@@ -38,8 +38,10 @@ class FailRep {
   friend bool printFAIL_REP(FILE *, const Uint32 *, Uint32, Uint16);
 
 public:
-  STATIC_CONST( SignalLength = 2 );
-  STATIC_CONST( ExtraLength = 1 + NdbNodeBitmask::Size );
+  STATIC_CONST( OrigSignalLength = 2 );
+  STATIC_CONST( PartitionedExtraLength = 1 + NdbNodeBitmask::Size );
+  STATIC_CONST( SourceExtraLength = 1 );
+  STATIC_CONST( SignalLength = OrigSignalLength + SourceExtraLength );
   
   enum FailCause {
     ZOWN_FAILURE=0,
@@ -52,7 +54,27 @@ public:
     ZMULTI_NODE_SHUTDOWN = 7,
     ZPARTITIONED_CLUSTER = 8
   };
-  
+
+  Uint32 getFailSourceNodeId(Uint32 sigLen) const
+  {
+    /* Get failSourceNodeId from signal given its length (0 if not included).
+     * 2 sender cases, each combined with 2 failCause cases :
+     *   1) Old node, no source id
+     *   2) New node, source id
+     *   a) ZPARTITIONED_CLUSTER, extra info
+     *   b) Other error, no extra info
+     */
+    if (failCause == ZPARTITIONED_CLUSTER)
+    {
+      return (sigLen == (SignalLength + PartitionedExtraLength)) ?
+        partitionFailSourceNodeId : 
+        0;
+    }
+
+    return (sigLen == SignalLength) ? failSourceNodeId :
+      0;
+  }
+
 private:
   
   Uint32 failNodeId;
@@ -60,8 +82,15 @@ private:
   /**
    * Used when failCause == ZPARTITIONED_CLUSTER
    */
-  Uint32 president;
-  Uint32 partition[NdbNodeBitmask::Size];
+  union {
+    struct
+    {
+      Uint32 president;
+      Uint32 partition[NdbNodeBitmask::Size];
+      Uint32 partitionFailSourceNodeId;
+    };
+    Uint32 failSourceNodeId;
+  };
 };
 
 

=== modified file 'storage/ndb/include/ndb_version.h.in'
--- a/storage/ndb/include/ndb_version.h.in	2010-10-19 18:26:17 +0000
+++ b/storage/ndb/include/ndb_version.h.in	2010-12-13 15:24:10 +0000
@@ -386,4 +386,29 @@ ndbd_dih_sub_gcp_complete_ack(Uint32 x)
   }
 }
 
+#define NDBD_FAIL_REP_SOURCE_NODE_63 NDB_MAKE_VERSION(6,3,40)
+#define NDBD_FAIL_REP_SOURCE_NODE_70 NDB_MAKE_VERSION(7,0,21)
+#define NDBD_FAIL_REP_SOURCE_NODE_71 NDB_MAKE_VERSION(7,1,10)
+
+static 
+inline
+int
+ndbd_fail_rep_source_node(Uint32 x)
+{
+  {
+    const Uint32 major = (x >> 16) & 0xFF;
+    const Uint32 minor = (x >>  8) & 0xFF;
+
+    if (major == 6)
+    {
+      return x >= NDBD_FAIL_REP_SOURCE_NODE_63;
+    }
+    if (major == 7 && minor == 0)
+    {
+      return x >= NDBD_FAIL_REP_SOURCE_NODE_70;
+    }
+    return x >= NDBD_FAIL_REP_SOURCE_NODE_71;
+  }
+}
+
 #endif

=== modified file 'storage/ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp'
--- a/storage/ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp	2010-08-26 09:00:51 +0000
+++ b/storage/ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp	2010-12-13 15:24:10 +0000
@@ -3251,6 +3251,7 @@ Ndbcntr::execSTOP_CONF(Signal* signal)
      */
     FailRep * const failRep = (FailRep *)&signal->theData[0];
     failRep->failCause = FailRep::ZMULTI_NODE_SHUTDOWN;
+    failRep->failSourceNodeId = getOwnNodeId();
     NodeReceiverGroup rg(QMGR, c_clusterNodes);
     Uint32 nodeId = 0;
     while ((nodeId = NdbNodeBitmask::find(c_stopRec.stopReq.nodes, nodeId+1))

=== modified file 'storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp'
--- a/storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp	2010-09-06 08:14:08 +0000
+++ b/storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp	2010-12-13 15:24:10 +0000
@@ -312,7 +312,8 @@ private:
   void failReport(Signal* signal,
                   Uint16 aFailedNode,
                   UintR aSendFailRep,
-                  FailRep::FailCause failCause);
+                  FailRep::FailCause failCause,
+                  Uint16 sourceNode);
   void findNeighbours(Signal* signal, Uint32 from);
   Uint16 translateDynamicIdToNodeId(Signal* signal, UintR TdynamicId);
 
@@ -341,7 +342,8 @@ private:
   void commitFailReqLab(Signal* signal);
   void commitFailConfLab(Signal* signal);
   void failReportLab(Signal* signal, Uint16 aFailedNode, 
-		     FailRep::FailCause aFailCause);
+		     FailRep::FailCause aFailCause,
+                     Uint16 sourceNode);
   void sendCommitFailReq(Signal* signal);
   void presToConfLab(Signal* signal);
   void sendSttorryLab(Signal* signal);

=== modified file 'storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp'
--- a/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp	2010-12-13 14:48:26 +0000
+++ b/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp	2010-12-13 15:24:10 +0000
@@ -135,7 +135,7 @@ void Qmgr::execCONTINUEB(Signal* signal)
       return;
     }//if
     //regreqMasterTimeLimitLab(signal);
-    failReportLab(signal, c_start.m_startNode, FailRep::ZSTART_IN_REGREQ);
+    failReportLab(signal, c_start.m_startNode, FailRep::ZSTART_IN_REGREQ, getOwnNodeId());
     return;
     break;
   case ZTIMER_HANDLING:
@@ -201,9 +201,15 @@ void Qmgr::execFAIL_REP(Signal* signal) 
   const FailRep * const failRep = (FailRep *)&signal->theData[0];
   const NodeId failNodeId = failRep->failNodeId;
   const FailRep::FailCause failCause = (FailRep::FailCause)failRep->failCause; 
-  
+  Uint32 failSource = failRep->getFailSourceNodeId(signal->length());
+  if (!failSource)
+  {
+    /* Failure source not included, use sender of signal as 'source' */
+    failSource = refToNode(signal->getSendersBlockRef());
+  }
+
   jamEntry();
-  failReportLab(signal, failNodeId, failCause);
+  failReportLab(signal, failNodeId, failCause, failSource);
   return;
 }//Qmgr::execFAIL_REP()
 
@@ -1123,17 +1129,27 @@ retry:
     rep->failCause = FailRep::ZPARTITIONED_CLUSTER;
     rep->president = cpresident;
     c_clusterNodes.copyto(NdbNodeBitmask::Size, rep->partition);
+    rep->partitionFailSourceNodeId = getOwnNodeId();
     Uint32 ref = calcQmgrBlockRef(nodeId);
     Uint32 i = 0;
+    /* Send source of event info if a node supports it */
+    Uint32 length = FailRep::OrigSignalLength + FailRep::PartitionedExtraLength;    
     while((i = part.find(i + 1)) != NdbNodeBitmask::NotFound)
     {
       if (i == nodeId)
 	continue;
       rep->failNodeId = i;
-      sendSignal(ref, GSN_FAIL_REP, signal, FailRep::SignalLength, JBA);
+      bool sendSourceId = ndbd_fail_rep_source_node((getNodeInfo(i)).m_version);
+      sendSignal(ref, GSN_FAIL_REP, signal, 
+                 length + (sendSourceId ? FailRep::SourceExtraLength : 0), 
+                 JBA);
     }
     rep->failNodeId = nodeId;
-    sendSignal(ref, GSN_FAIL_REP, signal, FailRep::SignalLength, JBB);
+    bool sendSourceId = ndbd_fail_rep_source_node((getNodeInfo(nodeId)).m_version);
+    
+    sendSignal(ref, GSN_FAIL_REP, signal,
+               length + (sendSourceId ? FailRep::SourceExtraLength : 0), 
+               JBB);
     return;
   }
   
@@ -2549,7 +2565,7 @@ void Qmgr::checkHeartbeat(Signal* signal
     signal->theData[1] = nodePtr.i;
     sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);
 
-    failReportLab(signal, nodePtr.i, FailRep::ZHEARTBEAT_FAILURE);
+    failReportLab(signal, nodePtr.i, FailRep::ZHEARTBEAT_FAILURE, getOwnNodeId());
     return;
   }//if
 }//Qmgr::checkHeartbeat()
@@ -2972,7 +2988,7 @@ void Qmgr::node_failed(Signal* signal, U
   switch(failedNodePtr.p->phase){
   case ZRUNNING:
     jam();
-    failReportLab(signal, aFailedNode, FailRep::ZLINK_FAILURE);
+    failReportLab(signal, aFailedNode, FailRep::ZLINK_FAILURE, getOwnNodeId());
     return;
   case ZFAIL_CLOSING:
     jam();
@@ -2983,7 +2999,7 @@ void Qmgr::node_failed(Signal* signal, U
      *   Force "real" failure handling
      */
     failedNodePtr.p->phase = ZRUNNING;
-    failReportLab(signal, aFailedNode, FailRep::ZLINK_FAILURE);
+    failReportLab(signal, aFailedNode, FailRep::ZLINK_FAILURE, getOwnNodeId());
     return;
     // Fall-through
   default:
@@ -3387,7 +3403,8 @@ Qmgr::sendApiRegRef(Signal* signal, Uint
  * OF A FAILED PRESIDENT THEN WE WILL TAKE FURTHER ACTION. 
  *---------------------------------------------------------------------------*/
 void Qmgr::failReportLab(Signal* signal, Uint16 aFailedNode,
-			 FailRep::FailCause aFailCause) 
+			 FailRep::FailCause aFailCause,
+                         Uint16 sourceNode) 
 {
   NodeRecPtr nodePtr;
   NodeRecPtr failedNodePtr;
@@ -3435,8 +3452,9 @@ void Qmgr::failReportLab(Signal* signal,
       code = NDBD_EXIT_PARTITIONED_SHUTDOWN;
       char buf1[100], buf2[100];
       c_clusterNodes.getText(buf1);
-      if (signal->getLength()== FailRep::SignalLength + FailRep::ExtraLength &&
-	  signal->header.theVerId_signalNumber == GSN_FAIL_REP)
+      if (((signal->getLength()== FailRep::OrigSignalLength + FailRep::PartitionedExtraLength) ||
+           (signal->getLength()== FailRep::SignalLength + FailRep::PartitionedExtraLength)) &&
+          signal->header.theVerId_signalNumber == GSN_FAIL_REP)
       {
 	jam();
 	NdbNodeBitmask part;
@@ -3466,8 +3484,9 @@ void Qmgr::failReportLab(Signal* signal,
 
     char buf[255];
     BaseString::snprintf(buf, sizeof(buf), 
-			 "We(%u) have been declared dead by %u reason: %s(%u)",
+			 "We(%u) have been declared dead by %u (via %u) reason: %s(%u)",
 			 getOwnNodeId(),
+                         sourceNode,
 			 refToNode(signal->getSendersBlockRef()),
 			 msg ? msg : "<Unknown>",
 			 aFailCause);
@@ -3495,7 +3514,7 @@ void Qmgr::failReportLab(Signal* signal,
   }
 
   TnoFailedNodes = cnoFailedNodes;
-  failReport(signal, failedNodePtr.i, (UintR)ZTRUE, aFailCause);
+  failReport(signal, failedNodePtr.i, (UintR)ZTRUE, aFailCause, sourceNode);
   if (cpresident == getOwnNodeId()) {
     jam();
     if (ctoStatus == Q_NOT_ACTIVE) {
@@ -3599,7 +3618,8 @@ void Qmgr::execPREP_FAILREQ(Signal* sign
     failReport(signal,
                cprepFailedNodes[Tindex],
                (UintR)ZFALSE,
-               FailRep::ZIN_PREP_FAIL_REQ);
+               FailRep::ZIN_PREP_FAIL_REQ,
+               0); /* Source node not required (or known) here */
   }//for
   sendCloseComReq(signal, Tblockref, TfailureNr);
   cnoCommitFailedNodes = 0;
@@ -4262,7 +4282,7 @@ void Qmgr::systemErrorBecauseOtherNodeFa
   jam();
 
   // Broadcast that this node is failing to other nodes
-  failReport(signal, getOwnNodeId(), (UintR)ZTRUE, FailRep::ZOWN_FAILURE);
+  failReport(signal, getOwnNodeId(), (UintR)ZTRUE, FailRep::ZOWN_FAILURE, getOwnNodeId());
 
   char buf[100];
   BaseString::snprintf(buf, 100, 
@@ -4277,7 +4297,7 @@ void Qmgr::systemErrorLab(Signal* signal
 {
   jam();
   // Broadcast that this node is failing to other nodes
-  failReport(signal, getOwnNodeId(), (UintR)ZTRUE, FailRep::ZOWN_FAILURE);
+  failReport(signal, getOwnNodeId(), (UintR)ZTRUE, FailRep::ZOWN_FAILURE, getOwnNodeId());
 
   // If it's known why shutdown occured
   // an error message has been passed to this function
@@ -4294,7 +4314,8 @@ void Qmgr::systemErrorLab(Signal* signal
 void Qmgr::failReport(Signal* signal,
                       Uint16 aFailedNode,
                       UintR aSendFailRep,
-                      FailRep::FailCause aFailCause) 
+                      FailRep::FailCause aFailCause,
+                      Uint16 sourceNode) 
 {
   UintR tfrMinDynamicId;
   NodeRecPtr failedNodePtr;
@@ -4302,6 +4323,8 @@ void Qmgr::failReport(Signal* signal,
   NodeRecPtr presidentNodePtr;
 
 
+  ndbassert((! aSendFailRep) || (sourceNode != 0));
+
   failedNodePtr.i = aFailedNode;
   ptrCheckGuard(failedNodePtr, MAX_NDB_NODES, nodeRec);
   if (failedNodePtr.p->phase == ZRUNNING) {
@@ -4333,6 +4356,7 @@ void Qmgr::failReport(Signal* signal,
 	FailRep * const failRep = (FailRep *)&signal->theData[0];
         failRep->failNodeId = failedNodePtr.i;
         failRep->failCause = aFailCause;
+        failRep->failSourceNodeId = sourceNode;
         sendSignal(failedNodePtr.p->blockRef, GSN_FAIL_REP, signal, 
 		   FailRep::SignalLength, JBA);
       }//if
@@ -4344,6 +4368,7 @@ void Qmgr::failReport(Signal* signal,
 	  FailRep * const failRep = (FailRep *)&signal->theData[0];
 	  failRep->failNodeId = failedNodePtr.i;
 	  failRep->failCause = aFailCause;
+          failRep->failSourceNodeId = sourceNode;
           sendSignal(nodePtr.p->blockRef, GSN_FAIL_REP, signal, 
 		     FailRep::SignalLength, JBA);
         }//if


Attachment: [text/bzr-bundle] bzr/frazer@mysql.com-20101213152410-ozb61hffqx17fkex.bundle
Thread
bzr commit into mysql-5.1-telco-6.3 branch (frazer:3363) Bug#58904 | Frazer Clement | 13 Dec