List:Commits« Previous MessageNext Message »
From:tomas Date:February 14 2007 5:05am
Subject:bk commit into 5.0 tree (tomas:1.2388) BUG#26293
View as plain text  
Below is the list of changes that have just been committed into a local
5.0 repository of tomas. When tomas does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet
  1.2388 07/02/14 11:05:38 tomas@stripped +7 -0
  Bug#26293 cluster mgmt node sometimes doesn't receive events from all nodes on restart
  - signals were sometimes sent too early when setting up subscriptions

  ndb/src/ndbapi/SignalSender.hpp
    1.3 07/02/14 11:05:31 tomas@stripped +1 -1
    make method const

  ndb/src/ndbapi/SignalSender.cpp
    1.9 07/02/14 11:05:31 tomas@stripped +2 -0
    assert to see that node is sendable when signal is sent

  ndb/src/ndbapi/ClusterMgr.hpp
    1.14 07/02/14 11:05:31 tomas@stripped +1 -0
    added status variable m_api_reg_conf in cluster manager to correctly be able to
determine if a node is sendable

  ndb/src/ndbapi/ClusterMgr.cpp
    1.32 07/02/14 11:05:31 tomas@stripped +4 -1
    added status variable m_api_reg_conf in cluster manager to correctly be able to
determine if a node is sendable

  ndb/src/mgmsrv/MgmtSrvr.cpp
    1.112 07/02/14 11:05:31 tomas@stripped +74 -20
    bug in that signals were sent before api reg conf had arrived, causing signals
to be thrown away and subsequent hangs in the mgmt server
    also add retry if a node is connected but its api reg conf has not yet been received

  ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp
    1.31 07/02/14 11:05:31 tomas@stripped +19 -1
    added dump for active subscriptions in cmvmi

  ndb/include/kernel/signaldata/DumpStateOrd.hpp
    1.11 07/02/14 11:05:31 tomas@stripped +4 -0
    added dump for active subscriptions in cmvmi

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User:	tomas
# Host:	poseidon.mysql.com
# Root:	/home/tomas/mysql-5.0-telco-gca

--- 1.10/ndb/include/kernel/signaldata/DumpStateOrd.hpp	2007-01-12 03:12:09 +07:00
+++ 1.11/ndb/include/kernel/signaldata/DumpStateOrd.hpp	2007-02-14 11:05:31 +07:00
@@ -107,6 +107,10 @@ public:
     CmvmiDumpLongSignalMemory = 2601,
     CmvmiSetRestartOnErrorInsert = 2602,
     CmvmiTestLongSigWithDelay = 2603,
+    CmvmiDumpSubscriptions = 2604, /* note: done to respective outfile
+                                      to be able to debug if events
+                                      for some reason does not end up
+                                      in clusterlog */
     // 7000 DIH
     // 7001 DIH
     // 7002 DIH

--- 1.30/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp	2007-01-18 03:18:46 +07:00
+++ 1.31/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp	2007-02-14 11:05:31 +07:00
@@ -897,7 +897,7 @@ void Cmvmi::execSET_VAR_REQ(Signal* sign
   case TimeToWaitAlive:
 
     // QMGR
-  case HeartbeatIntervalDbDb: // TODO ev till Ndbcnt också
+  case HeartbeatIntervalDbDb: // TODO possibly Ndbcnt too
   case HeartbeatIntervalDbApi:
   case ArbitTimeout:
     sendSignal(QMGR_REF, GSN_SET_VAR_REQ, signal, 3, JBB);
@@ -1105,6 +1105,24 @@ Cmvmi::execDUMP_STATE_ORD(Signal* signal
     }
   }
   
+  if (arg == DumpStateOrd::CmvmiDumpSubscriptions)
+  {
+    SubscriberPtr ptr;
+    subscribers.first(ptr);  
+    g_eventLogger.info("List subscriptions:");
+    while(ptr.i != RNIL)
+    {
+      g_eventLogger.info("Subscription: %u, nodeId: %u, ref: 0x%x",
+                         ptr.i,  refToNode(ptr.p->blockRef), ptr.p->blockRef);
+      for(Uint32 i = 0; i < LogLevel::LOGLEVEL_CATEGORIES; i++)
+      {
+        Uint32 level = ptr.p->logLevel.getLogLevel((LogLevel::EventCategory)i);
+        g_eventLogger.info("Category %u Level %u", i, level);
+      }
+      subscribers.next(ptr);
+    }
+  }
+
   if (arg == DumpStateOrd::CmvmiDumpLongSignalMemory){
     infoEvent("Cmvmi: g_sectionSegmentPool size: %d free: %d",
 	      g_sectionSegmentPool.getSize(),

--- 1.111/ndb/src/mgmsrv/MgmtSrvr.cpp	2007-01-23 11:44:36 +07:00
+++ 1.112/ndb/src/mgmsrv/MgmtSrvr.cpp	2007-02-14 11:05:31 +07:00
@@ -704,7 +704,7 @@ int MgmtSrvr::okToSendTo(NodeId nodeId, 
     return WRONG_PROCESS_TYPE;
   // Check if we have contact with it
   if(unCond){
-    if(theFacade->theClusterMgr->getNodeInfo(nodeId).connected)
+    if(theFacade->theClusterMgr->getNodeInfo(nodeId).m_api_reg_conf)
       return 0;
   }
   else if (theFacade->get_node_alive(nodeId) == true)
@@ -1562,32 +1562,85 @@ MgmtSrvr::status(int nodeId, 
 }
 
 int 
-MgmtSrvr::setEventReportingLevelImpl(int nodeId, 
+MgmtSrvr::setEventReportingLevelImpl(int nodeId_arg, 
 				     const EventSubscribeReq& ll)
 {
   SignalSender ss(theFacade);
-  ss.lock();
-
-  SimpleSignal ssig;
-  EventSubscribeReq * dst = 
-    CAST_PTR(EventSubscribeReq, ssig.getDataPtrSend());
-  ssig.set(ss,TestOrd::TraceAPI, CMVMI, GSN_EVENT_SUBSCRIBE_REQ,
-	   EventSubscribeReq::SignalLength);
-  *dst = ll;
-
-  NodeBitmask nodes;
+  NdbNodeBitmask nodes;
+  int retries = 30;
   nodes.clear();
-  Uint32 max = (nodeId == 0) ? (nodeId = 1, MAX_NDB_NODES) : nodeId;
-  for(; (Uint32) nodeId <= max; nodeId++)
+  while (1)
   {
-    if (nodeTypes[nodeId] != NODE_TYPE_DB)
-      continue;
-    if (okToSendTo(nodeId, true))
-      continue;
-    if (ss.sendSignal(nodeId, &ssig) == SEND_OK)
+    Uint32 nodeId, max;
+    ss.lock();
+    SimpleSignal ssig;
+    EventSubscribeReq * dst = 
+      CAST_PTR(EventSubscribeReq, ssig.getDataPtrSend());
+    ssig.set(ss,TestOrd::TraceAPI, CMVMI, GSN_EVENT_SUBSCRIBE_REQ,
+             EventSubscribeReq::SignalLength);
+    *dst = ll;
+
+    if (nodeId_arg == 0)
+    {
+      // all nodes
+      nodeId = 1;
+      max = MAX_NDB_NODES;
+    }
+    else
     {
-      nodes.set(nodeId);
+      // only one node
+      max = nodeId = nodeId_arg;
+    }
+    // first make sure nodes are sendable
+    for(; nodeId <= max; nodeId++)
+    {
+      if (nodeTypes[nodeId] != NODE_TYPE_DB)
+        continue;
+      if (okToSendTo(nodeId, true))
+      {
+        if (theFacade->theClusterMgr->getNodeInfo(nodeId).connected  == false)
+        {
+          // node not connected we can safely skip this one
+          continue;
+        }
+        // api_reg_conf not recevied yet, need to retry
+        break;
+      }
     }
+    if (nodeId <= max)
+    {
+      if (--retries)
+      {
+        ss.unlock();
+        NdbSleep_MilliSleep(100);  
+        continue;
+      }
+      return SEND_OR_RECEIVE_FAILED;
+    }
+
+    if (nodeId_arg == 0)
+    {
+      // all nodes
+      nodeId = 1;
+      max = MAX_NDB_NODES;
+    }
+    else
+    {
+      // only one node
+      max = nodeId = nodeId_arg;
+    }
+    // now send to all sendable nodes nodes
+    // note, lock is held, so states have not changed
+    for(; (Uint32) nodeId <= max; nodeId++)
+    {
+      if (nodeTypes[nodeId] != NODE_TYPE_DB)
+        continue;
+      if (theFacade->theClusterMgr->getNodeInfo(nodeId).connected  == false)
+        continue; // node is not connected, skip
+      if (ss.sendSignal(nodeId, &ssig) == SEND_OK)
+        nodes.set(nodeId);
+    }
+    break;
   }
 
   if (nodes.isclear())
@@ -1598,6 +1651,7 @@ MgmtSrvr::setEventReportingLevelImpl(int
   int error = 0;
   while (!nodes.isclear())
   {
+    Uint32 nodeId;
     SimpleSignal *signal = ss.waitFor();
     int gsn = signal->readSignalNumber();
     nodeId = refToNode(signal->header.theSendersBlockRef);

--- 1.31/ndb/src/ndbapi/ClusterMgr.cpp	2007-01-23 11:44:36 +07:00
+++ 1.32/ndb/src/ndbapi/ClusterMgr.cpp	2007-02-14 11:05:31 +07:00
@@ -327,7 +327,7 @@ ClusterMgr::showState(NodeId nodeId){
 ClusterMgr::Node::Node()
   : m_state(NodeState::SL_NOTHING) { 
   compatible = nfCompleteRep = true;
-  connected = defined = m_alive = false; 
+  connected = defined = m_alive = m_api_reg_conf = false; 
   m_state.m_connected_nodes.clear();
 }
 
@@ -401,6 +401,8 @@ ClusterMgr::execAPI_REGCONF(const Uint32
 					      node.m_info.m_version);
   }
 
+  node.m_api_reg_conf = true;
+
   node.m_state = apiRegConf->nodeState;
   if (node.compatible && (node.m_state.startLevel == NodeState::SL_STARTED  ||
 			  node.m_state.startLevel == NodeState::SL_SINGLEUSER)){
@@ -519,6 +521,7 @@ ClusterMgr::reportDisconnected(NodeId no
 
   noOfConnectedNodes--;
   theNodes[nodeId].connected = false;
+  theNodes[nodeId].m_api_reg_conf = false;
   theNodes[nodeId].m_state.m_connected_nodes.clear();
 
   reportNodeFailed(nodeId, true);

--- 1.13/ndb/src/ndbapi/ClusterMgr.hpp	2007-01-23 11:44:36 +07:00
+++ 1.14/ndb/src/ndbapi/ClusterMgr.hpp	2007-02-14 11:05:31 +07:00
@@ -65,6 +65,7 @@ public:
     bool compatible;    // Version is compatible
     bool nfCompleteRep; // NF Complete Rep has arrived
     bool m_alive;       // Node is alive
+    bool m_api_reg_conf;// API_REGCONF has arrived
     
     NodeInfo  m_info;
     NodeState m_state;

--- 1.8/ndb/src/ndbapi/SignalSender.cpp	2007-01-23 11:44:36 +07:00
+++ 1.9/ndb/src/ndbapi/SignalSender.cpp	2007-02-14 11:05:31 +07:00
@@ -140,6 +140,8 @@ SignalSender::getNoOfConnectedNodes() co
 
 SendStatus
 SignalSender::sendSignal(Uint16 nodeId, const SimpleSignal * s){
+  assert(getNodeInfo(nodeId).m_api_reg_conf == true ||
+         s->readSignalNumber() == GSN_API_REGREQ);
   return theFacade->theTransporterRegistry->prepareSend(&s->header,
 							1, // JBB
 							&s->theData[0],

--- 1.2/ndb/src/ndbapi/SignalSender.hpp	2006-12-24 02:04:18 +07:00
+++ 1.3/ndb/src/ndbapi/SignalSender.hpp	2007-02-14 11:05:31 +07:00
@@ -32,7 +32,7 @@ public:
   Uint32 theData[25];
   LinearSectionPtr ptr[3];
 
-  int readSignalNumber() {return header.theVerId_signalNumber; }
+  int readSignalNumber() const {return header.theVerId_signalNumber; }
   Uint32 *getDataPtrSend() { return theData; }
   const Uint32 *getDataPtr() const { return theData; }
 
Thread
bk commit into 5.0 tree (tomas:1.2388) BUG#26293tomas14 Feb