List: Commits | « Previous Message | Next Message »
From: Jonas Oreland | Date: May 30 2011 11:52am
Subject: bzr commit into mysql-5.1-telco-6.3 branch (jonas:3439) Bug#12589691
View as plain text  
#At file:///home/jonas/src/telco-6.3/ based on revid:jonas@stripped

 3439 Jonas Oreland	2011-05-30
      ndb - bug#12589691 - improve error reporting when api-fail-req seems to happen

    modified:
      storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp
      storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp
      storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
      storage/ndb/src/kernel/blocks/suma/Suma.cpp
      storage/ndb/src/kernel/blocks/suma/Suma.hpp
=== modified file 'storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp'
--- a/storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp	2011-03-31 12:31:43 +0000
+++ b/storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp	2011-05-30 11:52:38 +0000
@@ -1260,7 +1260,7 @@ void Dbtc::handleApiFailState(Signal* si
   if (capiConnectClosing[TfailedApiNode] == 0) {
     jam();
     signal->theData[0] = TfailedApiNode;
-    signal->theData[1] = cownref;
+    signal->theData[1] = reference();
     sendSignal(capiFailRef, GSN_API_FAILCONF, signal, 2, JBB);
   }//if
 }//Dbtc::handleApiFailState()
@@ -7661,7 +7661,7 @@ Dbtc::apiFailBlockCleanupCallback(Signal
   jamEntry();
   
   signal->theData[0] = failedNodeId;
-  signal->theData[1] = cownref;
+  signal->theData[1] = reference();
   sendSignal(capiFailRef, GSN_API_FAILCONF, signal, 2, JBB);
 }
 
@@ -11897,6 +11897,21 @@ Dbtc::execDUMP_STATE_ORD(Signal* signal)
     return;
   }
 #endif
+
+  if (arg == 7019 && signal->getLength() == 2)
+  {
+    jam();
+    Uint32 nodeId = signal->theData[1];
+    if (nodeId < MAX_NODES && nodeId < NDB_ARRAY_SIZE(capiConnectClosing))
+    {
+      warningEvent(" DBTC: capiConnectClosing[%u]: %u",
+                   nodeId, capiConnectClosing[nodeId]);
+    }
+    else
+    {
+      warningEvent(" DBTC: dump-7019 to unknown node: %u", nodeId);
+    }
+  }
 }//Dbtc::execDUMP_STATE_ORD()
 
 bool

=== modified file 'storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp'
--- a/storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp	2011-02-01 21:05:11 +0000
+++ b/storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp	2011-05-30 11:52:38 +0000
@@ -79,6 +79,7 @@
 
 #endif
 
+#define QMGR_MAX_FAIL_STATE_BLOCKS 4
 
 class Qmgr : public SimulatedBlock {
 public:
@@ -92,9 +93,7 @@ public:
     NORMAL = 0,
     WAITING_FOR_CLOSECOMCONF_ACTIVE = 1,     /* Node had phase ZAPI_ACTIVE */
     WAITING_FOR_CLOSECOMCONF_NOTACTIVE = 2,  /* Node had phase != ZAPI_ACTIVE */
-    WAITING_FOR_FAILCONF1 = 3,
-    WAITING_FOR_FAILCONF2 = 4,
-    WAITING_FOR_FAILCONF3 = 5,
+    WAITING_FOR_API_FAILCONF = 3,
     WAITING_FOR_NDB_FAILCONF = 6
   };
 
@@ -174,8 +173,9 @@ public:
     BlockReference blockRef;
     Uint64 m_secret;
     Uint64 m_alloc_timeout;
+    Uint16 m_failconf_blocks[QMGR_MAX_FAIL_STATE_BLOCKS];
 
-    NodeRec() { }
+    NodeRec() { bzero(m_failconf_blocks, sizeof(m_failconf_blocks)); }
   }; /* p2c: size = 52 bytes */
   
   typedef Ptr<NodeRec> NodeRecPtr;
@@ -408,7 +408,9 @@ private:
 			  const NodeId theNodes[]);
 
   void handleApiCloseComConf(Signal* signal);
-
+  void add_failconf_block(NodeRecPtr, Uint32 block);
+  bool remove_failconf_block(NodeRecPtr, Uint32 block);
+  bool is_empty_failconf_block(NodeRecPtr) const;
   
   /* Wait this time until we try to join the       */
   /* cluster again                                 */

=== modified file 'storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp'
--- a/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp	2011-05-24 14:51:54 +0000
+++ b/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp	2011-05-30 11:52:38 +0000
@@ -2679,23 +2679,70 @@ void Qmgr::checkStartInterface(Signal* s
         if(((getNodeInfo(nodePtr.i).m_heartbeat_cnt + 1) % 60) == 0)
         {
           jam();
-	  char buf[100];
-	  BaseString::snprintf(buf, sizeof(buf), 
-                               "Failure handling of node %d has not completed"
-                               " in %d min - state = %d",
-                               nodePtr.i, 
-                               (getNodeInfo(nodePtr.i).m_heartbeat_cnt + 1)/60,
-                               nodePtr.p->failState);
-	  warningEvent(buf);
-          if (((getNodeInfo(nodePtr.i).m_heartbeat_cnt + 1) % 300) == 0)
+	  char buf[256];
+          if (getNodeInfo(nodePtr.i).m_type == NodeInfo::DB)
           {
             jam();
-            /**
-             * Also dump DIH nf-state
-             */
-            signal->theData[0] = 7019;
-            signal->theData[1] = nodePtr.i;
-            sendSignal(DBDIH_REF, GSN_DUMP_STATE_ORD, signal, 2, JBB);
+            BaseString::snprintf(buf, sizeof(buf),
+                                 "Failure handling of node %d has not completed"
+                                 " in %d min - state = %d",
+                                 nodePtr.i,
+                                 (getNodeInfo(nodePtr.i).m_heartbeat_cnt+1)/60,
+                                 nodePtr.p->failState);
+            warningEvent("%s", buf);
+            if (((getNodeInfo(nodePtr.i).m_heartbeat_cnt + 1) % 300) == 0)
+            {
+              jam();
+              /**
+               * Also dump DIH nf-state
+               */
+              signal->theData[0] = 7019;
+              signal->theData[1] = nodePtr.i;
+              sendSignal(DBDIH_REF, GSN_DUMP_STATE_ORD, signal, 2, JBB);
+            }
+          }
+          else
+          {
+            jam();
+            BaseString::snprintf(buf, sizeof(buf),
+                                 "Failure handling of api %u has not completed"
+                                 " in %d min - state = %d",
+                                 nodePtr.i,
+                                 (getNodeInfo(nodePtr.i).m_heartbeat_cnt+1)/60,
+                                 nodePtr.p->failState);
+            warningEvent("%s", buf);
+            if (nodePtr.p->failState == WAITING_FOR_API_FAILCONF)
+            {
+              jam();
+              compile_time_assert(NDB_ARRAY_SIZE(nodePtr.p->m_failconf_blocks) == 4);
+              BaseString::snprintf(buf, sizeof(buf),
+                                   "  Waiting for blocks: %u %u %u %u",
+                                   nodePtr.p->m_failconf_blocks[0],
+                                   nodePtr.p->m_failconf_blocks[1],
+                                   nodePtr.p->m_failconf_blocks[2],
+                                   nodePtr.p->m_failconf_blocks[3]);
+              warningEvent("%s", buf);
+
+              for (Uint32 i = 0; i<NDB_ARRAY_SIZE(nodePtr.p->m_failconf_blocks);
+                   i++)
+              {
+                jam();
+                if (nodePtr.p->m_failconf_blocks[i] != 0)
+                {
+                  jam();
+                  signal->theData[0] = 7019;
+                  signal->theData[1] = nodePtr.i;
+                  sendSignal(numberToRef(nodePtr.p->m_failconf_blocks[i],
+                                         getOwnNodeId()),
+                             GSN_DUMP_STATE_ORD, signal, 2, JBB);
+                }
+                else
+                {
+                  jam();
+                  break;
+                }
+              }
+            }
           }
 	}
       }
@@ -2726,14 +2773,18 @@ void Qmgr::sendApiFailReq(Signal* signal
   signal->theData[1] = QMGR_REF; 
 
   ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
-  
-  failedNodePtr.p->failState = WAITING_FOR_FAILCONF1;
+  failedNodePtr.p->failState = WAITING_FOR_API_FAILCONF;
 
   /* JBB used to ensure delivery *after* any pending
    * signals
    */
+  add_failconf_block(failedNodePtr, DBTC);
   sendSignal(DBTC_REF, GSN_API_FAILREQ, signal, 2, JBB);
+
+  add_failconf_block(failedNodePtr, DBDICT);
   sendSignal(DBDICT_REF, GSN_API_FAILREQ, signal, 2, JBB);
+
+  add_failconf_block(failedNodePtr, SUMA);
   sendSignal(SUMA_REF, GSN_API_FAILREQ, signal, 2, JBB);
 }//Qmgr::sendApiFailReq()
 
@@ -2758,30 +2809,118 @@ void Qmgr::execAPI_FAILCONF(Signal* sign
   failedNodePtr.i = signal->theData[0];  
   ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
 
-  if (failedNodePtr.p->failState == WAITING_FOR_FAILCONF1)
+  Uint32 block = refToMain(signal->theData[1]);
+  if (failedNodePtr.p->failState != WAITING_FOR_API_FAILCONF ||
+      !remove_failconf_block(failedNodePtr, block))
+  {
+    jam();
+    ndbout << "execAPI_FAILCONF from " << block
+           << " failedNodePtr.p->failState = "
+	   << (Uint32)(failedNodePtr.p->failState)
+           << " blocks: ";
+    for (Uint32 i = 0;i<NDB_ARRAY_SIZE(failedNodePtr.p->m_failconf_blocks);i++)
+    {
+      printf("%u ", failedNodePtr.p->m_failconf_blocks[i]);
+    }
+    ndbout << endl;
+    systemErrorLab(signal, __LINE__);
+  }//if
+
+  if (is_empty_failconf_block(failedNodePtr))
   {
     jam();
-    failedNodePtr.p->failState = WAITING_FOR_FAILCONF2;
+    failedNodePtr.p->failState = NORMAL;
+
+    /**
+     * When we set this state, connection will later be opened
+     *   in checkStartInterface
+     */
   }
-  else if (failedNodePtr.p->failState == WAITING_FOR_FAILCONF2)
+  return;
+}//Qmgr::execAPI_FAILCONF()
+
+void
+Qmgr::add_failconf_block(NodeRecPtr nodePtr, Uint32 block)
+{
+  // Record 'block' in the node's zero-terminated m_failconf_blocks list.
+  Uint32 pos = 0;
+  for (; pos < NDB_ARRAY_SIZE(nodePtr.p->m_failconf_blocks); pos++)
   {
     jam();
-    failedNodePtr.p->failState = WAITING_FOR_FAILCONF3;
+    if (nodePtr.p->m_failconf_blocks[pos] == 0)
+    {
+      jam();
+      break;
+    }
+    else if (nodePtr.p->m_failconf_blocks[pos] == block)
+    {
+      jam();
+      break;
+    }
   }
-  else if (failedNodePtr.p->failState == WAITING_FOR_FAILCONF3)
+
+  ndbrequire(pos != NDB_ARRAY_SIZE(nodePtr.p->m_failconf_blocks));
+  ndbassert(nodePtr.p->m_failconf_blocks[pos] != block);
+  if (nodePtr.p->m_failconf_blocks[pos] == block)
   {
     jam();
-    failedNodePtr.p->failState = NORMAL;
+    /**
+     * Already in list: keep the single entry (ERROR_INSERT builds abort here).
+     */
+#ifdef ERROR_INSERT
+    ndbrequire(false);
+#endif
+    return;
   }
-  else
+  ndbrequire(nodePtr.p->m_failconf_blocks[pos] == 0);
+  nodePtr.p->m_failconf_blocks[pos] = block;
+}
+
+bool
+Qmgr::remove_failconf_block(NodeRecPtr nodePtr, Uint32 block)
+{
+  // Remove 'block' from the pending list; returns false if it is not listed.
+  Uint32 pos = 0;
+  for (; pos < NDB_ARRAY_SIZE(nodePtr.p->m_failconf_blocks); pos++)
   {
     jam();
-    ndbout << "failedNodePtr.p->failState = "
-	   << (Uint32)(failedNodePtr.p->failState) << endl;
-    systemErrorLab(signal, __LINE__);
-  }//if
-  return;
-}//Qmgr::execAPI_FAILCONF()
+    if (nodePtr.p->m_failconf_blocks[pos] == 0)
+    {
+      jam();
+      break;
+    }
+    else if (nodePtr.p->m_failconf_blocks[pos] == block)
+    {
+      jam();
+      break;
+    }
+  }
+
+  if (pos == NDB_ARRAY_SIZE(nodePtr.p->m_failconf_blocks) ||
+      nodePtr.p->m_failconf_blocks[pos] != block)
+  {
+    jam();
+    /**
+     * Not found: hit the 0 terminator or scanned every slot without a match.
+     */
+    return false;
+  }
+
+  // Close the gap by shifting later entries left; the loop ends at pos == size.
+  for (pos++; pos < NDB_ARRAY_SIZE(nodePtr.p->m_failconf_blocks); pos++)
+  {
+    jam();
+    nodePtr.p->m_failconf_blocks[pos - 1] = nodePtr.p->m_failconf_blocks[pos];
+  }
+  nodePtr.p->m_failconf_blocks[pos - 1] = 0; // no stale copy when list was full
+
+  return true;
+}
+
+bool
+Qmgr::is_empty_failconf_block(NodeRecPtr nodePtr) const
+{ // add/remove keep entries packed from slot 0, so slot 0 == 0 means empty
+  return nodePtr.p->m_failconf_blocks[0] == 0;
+}
 
 void Qmgr::execNDB_FAILCONF(Signal* signal) 
 {
@@ -3661,8 +3800,9 @@ void Qmgr::handleApiCloseComConf(Signal*
         jam();
         signal->theData[0] = nodeId;
         signal->theData[1] = QMGR_REF;
+        add_failconf_block(failedNodePtr, SUMA);
         sendSignal(SUMA_REF, GSN_API_FAILREQ, signal, 2, JBB);
-        failedNodePtr.p->failState = WAITING_FOR_FAILCONF3;
+        failedNodePtr.p->failState = WAITING_FOR_API_FAILCONF;
       }
       
       if (getNodeInfo(failedNodePtr.i).getType() == NodeInfo::MGM)

=== modified file 'storage/ndb/src/kernel/blocks/suma/Suma.cpp'
--- a/storage/ndb/src/kernel/blocks/suma/Suma.cpp	2011-02-01 21:05:11 +0000
+++ b/storage/ndb/src/kernel/blocks/suma/Suma.cpp	2011-05-30 11:52:38 +0000
@@ -751,6 +751,7 @@ void Suma::execAPI_FAILREQ(Signal* signa
   c_failedApiNodes.set(failedApiNode);
   c_subscriber_nodes.clear(failedApiNode);
   c_subscriber_per_node[failedApiNode] = 0;
+  c_failedApiNodesState[failedApiNode] = __LINE__;
   
   check_start_handover(signal);
 
@@ -770,6 +771,8 @@ CONF:
   signal->theData[1] = reference();
   sendSignal(QMGR_REF, GSN_API_FAILCONF, signal, 2, JBB);
 
+  c_failedApiNodesState[failedApiNode] = 0;
+
   DBUG_VOID_RETURN;
 }//execAPI_FAILREQ()
 
@@ -791,6 +794,7 @@ Suma::api_fail_block_cleanup_callback(Si
   signal->theData[1] = reference();
   sendSignal(QMGR_REF, GSN_API_FAILCONF, signal, 2, JBB);
   c_failedApiNodes.clear(failedNodeId);
+  c_failedApiNodesState[failedNodeId] = 0;
 }
 
 void
@@ -798,9 +802,11 @@ Suma::api_fail_block_cleanup(Signal* sig
 {
   jam();
 
+  c_failedApiNodesState[failedNode] = __LINE__;
+
   Callback cb = {safe_cast(&Suma::api_fail_block_cleanup_callback),
                  failedNode};
-  
+
   simBlockNodeFailure(signal, failedNode, cb);
 }
 
@@ -829,6 +835,7 @@ Suma::api_fail_gci_list(Signal* signal,
 
       c_gcp_list.release(gcp);
 
+      c_failedApiNodesState[nodeId] = __LINE__;
       signal->theData[0] = SumaContinueB::API_FAIL_GCI_LIST;
       signal->theData[1] = nodeId;
       sendSignal(SUMA_REF, GSN_CONTINUEB, signal, 2, JBB);
@@ -851,11 +858,13 @@ Suma::api_fail_gci_list(Signal* signal,
   Ptr<SubOpRecord> subOpPtr;
   if (c_subOpPool.seize(subOpPtr))
   {
+    c_failedApiNodesState[nodeId] = __LINE__;
     signal->theData[2] = subOpPtr.i;
     sendSignal(SUMA_REF, GSN_CONTINUEB, signal, 6, JBB);
   }
   else
   {
+    c_failedApiNodesState[nodeId] = __LINE__;
     sendSignal(SUMA_REF, GSN_CONTINUEB, signal, 3, JBB);
   }
 
@@ -878,6 +887,7 @@ Suma::api_fail_subscriber_list(Signal* s
     {
       jam();
       sendSignal(SUMA_REF, GSN_CONTINUEB, signal, 3, JBB);
+      c_failedApiNodesState[nodeId] = __LINE__;
       return;
     }
   }
@@ -896,6 +906,7 @@ Suma::api_fail_subscriber_list(Signal* s
   {
     jam();
     c_subscriptions.first(iter);
+    c_failedApiNodesState[nodeId] = __LINE__;
   }
   else
   {
@@ -911,6 +922,7 @@ Suma::api_fail_subscriber_list(Signal* s
        * We restart from this bucket :-(
        */
       c_subscriptions.next(bucket, iter);
+      c_failedApiNodesState[nodeId] = __LINE__;
     }
     else
     {
@@ -922,6 +934,7 @@ Suma::api_fail_subscriber_list(Signal* s
   {
     jam();
     api_fail_block_cleanup(signal, nodeId);
+    c_failedApiNodesState[nodeId] = __LINE__;
     return;
   }
 
@@ -936,11 +949,18 @@ Suma::api_fail_subscriber_list(Signal* s
 
   if (empty)
   {
+    jam();
+    c_failedApiNodesState[nodeId] = __LINE__;
     signal->theData[0] = SumaContinueB::API_FAIL_SUBSCRIPTION;
     signal->theData[1] = subOpPtr.i;
     signal->theData[2] = RNIL;
     sendSignal(SUMA_REF, GSN_CONTINUEB, signal, 3, JBB);
   }
+  else
+  {
+    jam();
+    c_failedApiNodesState[nodeId] = __LINE__;
+  }
 }
 
 void
@@ -1001,6 +1021,7 @@ Suma::api_fail_subscription(Signal* sign
   if (!ptr.isNull())
   {
     jam();
+    c_failedApiNodesState[nodeId] = __LINE__;
     signal->theData[0] = SumaContinueB::API_FAIL_SUBSCRIPTION;
     signal->theData[1] = subOpPtr.i;
     signal->theData[2] = ptr.i;
@@ -1019,6 +1040,8 @@ Suma::api_fail_subscription(Signal* sign
 
   if (c_subscriptions.next(iter))
   {
+    jam();
+    c_failedApiNodesState[nodeId] = __LINE__;
     signal->theData[0] = SumaContinueB::API_FAIL_SUBSCRIBER_LIST;
     signal->theData[1] = nodeId;
     signal->theData[2] = subOpPtr.i;
@@ -1560,6 +1583,29 @@ Suma::execDUMP_STATE_ORD(Signal* signal)
     sendSignalWithDelay(reference(), GSN_DUMP_STATE_ORD, signal, 100, 2);
     return;
   }
+
+  if (tCase == 7019 && signal->getLength() == 2)
+  {
+    jam();
+    Uint32 nodeId = signal->theData[1];
+    if (nodeId < MAX_NODES)
+    {
+      warningEvent(" Suma 7019 %u line: %u", nodeId,
+                   c_failedApiNodesState[nodeId]);
+      warningEvent("   c_connected_nodes.get(): %u",
+                   c_connected_nodes.get(nodeId));
+      warningEvent("   c_failedApiNodes.get(): %u",
+                   c_failedApiNodes.get(nodeId));
+      warningEvent("   c_subscriber_nodes.get(): %u",
+                   c_subscriber_nodes.get(nodeId));
+      warningEvent(" c_subscriber_per_node[%u]: %u",
+                   nodeId, c_subscriber_per_node[nodeId]);
+    }
+    else
+    {
+      warningEvent(" SUMP: dump-7019 to unknown node: %u", nodeId);
+    }
+  }
 }
 
 /*************************************************************

=== modified file 'storage/ndb/src/kernel/blocks/suma/Suma.hpp'
--- a/storage/ndb/src/kernel/blocks/suma/Suma.hpp	2011-02-01 21:05:11 +0000
+++ b/storage/ndb/src/kernel/blocks/suma/Suma.hpp	2011-05-30 11:52:38 +0000
@@ -360,7 +360,8 @@ public:
   Uint32 c_maxBufferedEpochs;
 
   NodeBitmask c_failedApiNodes;
-  
+  Uint32 c_failedApiNodesState[MAX_NODES];
+
   /**
    * Functions
    */


Attachment: [text/bzr-bundle] bzr/jonas@mysql.com-20110530115238-ly2l52yz6temi631.bundle
Thread
bzr commit into mysql-5.1-telco-6.3 branch (jonas:3439) Bug#12589691 | Jonas Oreland | 31 May