List: Commits  « Previous Message | Next Message »
From: Tomas Ulin  Date: December 16 2008 9:51pm
Subject:bzr commit into mysql-5.1 branch (tomas.ulin:2774) Bug#41462
View as plain text  
#At file:///home/tomas/mysql_src/mysql-5.1-telco-6.2-merge/

 2774 Tomas Ulin	2008-12-16
      Bug #41462  Mysqld/ndbapi disconnects too aggressively during node restart
modified:
  storage/ndb/src/ndbapi/ClusterMgr.cpp
  storage/ndb/src/ndbapi/ClusterMgr.hpp
  storage/ndb/src/ndbapi/NdbEventOperationImpl.cpp
  storage/ndb/src/ndbapi/NdbEventOperationImpl.hpp
  storage/ndb/src/ndbapi/Ndbif.cpp

=== modified file 'storage/ndb/src/ndbapi/ClusterMgr.cpp'
--- a/storage/ndb/src/ndbapi/ClusterMgr.cpp	2008-04-25 07:14:29 +0000
+++ b/storage/ndb/src/ndbapi/ClusterMgr.cpp	2008-12-16 20:51:49 +0000
@@ -556,8 +556,7 @@ ClusterMgr::reportNodeFailed(NodeId node
     theFacade.ReportNodeDead(nodeId);
   }
   
-  theNode.nfCompleteRep = false;
-  if(noOfAliveNodes == 0)
+  if (noOfConnectedNodes == 0)
   {
     if (!global_flag_skip_invalidate_cache &&
         theFacade.m_globalDictCache)
@@ -568,6 +567,10 @@ ClusterMgr::reportNodeFailed(NodeId node
       m_connect_count ++;
       m_cluster_state = CS_waiting_for_clean_cache;
     }
+  }
+  theNode.nfCompleteRep = false;
+  if(noOfAliveNodes == 0)
+  {
     NFCompleteRep rep;
     for(Uint32 i = 1; i<MAX_NDB_NODES; i++){
       if(theNodes[i].defined && theNodes[i].nfCompleteRep == false){

=== modified file 'storage/ndb/src/ndbapi/ClusterMgr.hpp'
--- a/storage/ndb/src/ndbapi/ClusterMgr.hpp	2007-05-09 14:31:16 +0000
+++ b/storage/ndb/src/ndbapi/ClusterMgr.hpp	2008-12-16 20:51:49 +0000
@@ -85,7 +85,6 @@ public:
   
   const Node &  getNodeInfo(NodeId) const;
   Uint32        getNoOfConnectedNodes() const;
-  bool          isClusterAlive() const;
   void          hb_received(NodeId);
 
   Uint32        m_connect_count;
@@ -144,11 +143,6 @@ ClusterMgr::getNoOfConnectedNodes() cons
   return noOfConnectedNodes;
 }
 
-inline
-bool
-ClusterMgr::isClusterAlive() const {
-  return noOfAliveNodes != 0;
-}
 inline
 void
 ClusterMgr::hb_received(NodeId nodeId) {

=== modified file 'storage/ndb/src/ndbapi/NdbEventOperationImpl.cpp'
--- a/storage/ndb/src/ndbapi/NdbEventOperationImpl.cpp	2008-09-24 12:27:11 +0000
+++ b/storage/ndb/src/ndbapi/NdbEventOperationImpl.cpp	2008-12-16 20:51:49 +0000
@@ -1080,6 +1080,8 @@ NdbEventBuffer::NdbEventBuffer(Ndb *ndb)
   // initialize lists
   bzero(&g_empty_gci_container, sizeof(Gci_container));
   init_gci_containers();
+
+  m_alive_node_bit_mask.clear();
 }
 
 NdbEventBuffer::~NdbEventBuffer()
@@ -1836,11 +1838,16 @@ NdbEventBuffer::complete_bucket(Gci_cont
 
 void
 NdbEventBuffer::execSUB_GCP_COMPLETE_REP(const SubGcpCompleteRep * const rep,
-                                         Uint32 len)
+                                         Uint32 len, int complete_cluster_failure)
 {
-  if (unlikely(m_active_op_count == 0))
+  if (!complete_cluster_failure)
   {
-    return;
+    m_alive_node_bit_mask.set(refToNode(rep->senderRef));
+
+    if (unlikely(m_active_op_count == 0))
+    {
+      return;
+    }
   }
   
   DBUG_ENTER_EVENT("NdbEventBuffer::execSUB_GCP_COMPLETE_REP");
@@ -2089,13 +2096,15 @@ NdbEventBuffer::report_node_connected(Ui
 }
 
 void
-NdbEventBuffer::report_node_failure(Uint32 node_id)
+NdbEventBuffer::report_node_failure_completed(Uint32 node_id)
 {
+  m_alive_node_bit_mask.clear(node_id);
+
   NdbEventOperation* op= m_ndb->getEventOperation(0);
   if (op == 0)
     return;
 
-  DBUG_ENTER("NdbEventBuffer::report_node_failure");
+  DBUG_ENTER("NdbEventBuffer::report_node_failure_completed");
   SubTableData data;
   LinearSectionPtr ptr[3];
   bzero(&data, sizeof(data));
@@ -2110,7 +2119,7 @@ NdbEventBuffer::report_node_failure(Uint
   data.flags = SubTableData::LOG;
 
   Uint64 gci = Uint64((m_latestGCI >> 32) + 1) << 32;
-  find_max_known_gci(&gci);
+  bool found = find_max_known_gci(&gci);
 
   data.gci_hi = Uint32(gci >> 32);
   data.gci_lo = Uint32(gci);
@@ -2120,21 +2129,15 @@ NdbEventBuffer::report_node_failure(Uint
    */
   // no need to lock()/unlock(), receive thread calls this
   insert_event(&op->m_impl, data, ptr, data.senderData);
-  DBUG_VOID_RETURN;
-}
-
-void
-NdbEventBuffer::completeClusterFailed()
-{
-  NdbEventOperation* op= m_ndb->getEventOperation(0);
-  if (op == 0)
-    return;
 
-  DBUG_ENTER("NdbEventBuffer::completeClusterFailed");
+  if (!m_alive_node_bit_mask.isclear())
+    DBUG_VOID_RETURN;
 
+  /*
+   * Cluster failure
+   */
 
-  Uint64 gci = Uint64((m_latestGCI >> 32) + 1) << 32;
-  bool found = find_max_known_gci(&gci);
+  DBUG_PRINT("info", ("Cluster failure"));
 
   Uint64 * array = m_known_gci.getBase();
   Uint32 mask = m_known_gci.size() - 1;
@@ -2169,18 +2172,10 @@ NdbEventBuffer::completeClusterFailed()
   /**
    * Inject new event
    */
-  SubTableData data;
-  LinearSectionPtr ptr[3];
-  bzero(&data, sizeof(data));
-  bzero(ptr, sizeof(ptr));
-
   data.tableId = ~0;
   data.requestInfo = 0;
   SubTableData::setOperation(data.requestInfo,
 			     NdbDictionary::Event::_TE_CLUSTER_FAILURE);
-  data.flags = SubTableData::LOG;
-  data.gci_hi = Uint32(gci >> 32);
-  data.gci_lo = Uint32(gci);
 
   /**
    * Insert this event for each operation
@@ -2212,7 +2207,7 @@ NdbEventBuffer::completeClusterFailed()
   rep.gci_lo= gci & 0xFFFFFFFF;
   rep.gcp_complete_rep_count= cnt;
   rep.flags = 0;
-  execSUB_GCP_COMPLETE_REP(&rep, SubGcpCompleteRep::SignalLength);
+  execSUB_GCP_COMPLETE_REP(&rep, SubGcpCompleteRep::SignalLength, 1);
 
   DBUG_VOID_RETURN;
 }

=== modified file 'storage/ndb/src/ndbapi/NdbEventOperationImpl.hpp'
--- a/storage/ndb/src/ndbapi/NdbEventOperationImpl.hpp	2008-02-11 13:24:17 +0000
+++ b/storage/ndb/src/ndbapi/NdbEventOperationImpl.hpp	2008-12-16 20:51:49 +0000
@@ -424,8 +424,7 @@ public:
     and added to all event ops listed as active or pending delete
     in m_dropped_ev_op using insertDataL, includeing the blob
     event ops referenced by a regular event op.
-    - NdbEventBuffer::report_node_failure
-    - NdbEventBuffer::completeClusterFailed
+    - NdbEventBuffer::report_node_failure_completed
 
     TE_ACTIVE is sent from the kernel on initial execute/start of the
     event op, but is also internally generetad on node connect like
@@ -528,12 +527,12 @@ public:
   int insertDataL(NdbEventOperationImpl *op,
 		  const SubTableData * const sdata, Uint32 len,
 		  LinearSectionPtr ptr[3]);
-  void execSUB_GCP_COMPLETE_REP(const SubGcpCompleteRep * const, Uint32 len);
+  void execSUB_GCP_COMPLETE_REP(const SubGcpCompleteRep * const, Uint32 len,
+                                int complete_cluster_failure= 0);
   void complete_outof_order_gcis();
   
   void report_node_connected(Uint32 node_id);
-  void report_node_failure(Uint32 node_id);
-  void completeClusterFailed();
+  void report_node_failure_completed(Uint32 node_id);
 
   // used by user thread 
   Uint64 getLatestGCI();
@@ -664,6 +663,8 @@ private:
   void complete_bucket(Gci_container*);
   bool find_max_known_gci(Uint64 * res) const;
   void resize_known_gci();
+
+  Bitmask<(unsigned int)_NDB_NODE_BITMASK_SIZE> m_alive_node_bit_mask;
 };
 
 inline

=== modified file 'storage/ndb/src/ndbapi/Ndbif.cpp'
--- a/storage/ndb/src/ndbapi/Ndbif.cpp	2008-11-13 13:15:56 +0000
+++ b/storage/ndb/src/ndbapi/Ndbif.cpp	2008-12-16 20:51:49 +0000
@@ -269,13 +269,7 @@ Ndb::report_node_failure_completed(Uint3
   {
     // node failed
     // eventOperations in the ndb object should be notified
-    theEventBuffer->report_node_failure(node_id);
-    if(!theImpl->m_transporter_facade->theClusterMgr->isClusterAlive())
-    {
-      // cluster is unavailable, 
-      // eventOperations in the ndb object should be notified
-      theEventBuffer->completeClusterFailed();
-    }
+    theEventBuffer->report_node_failure_completed(node_id);
   }
   
   abortTransactionsAfterNodeFailure(node_id);

Thread
bzr commit into mysql-5.1 branch (tomas.ulin:2774) Bug#41462 | Tomas Ulin | 19 Dec