List: Commits    « Previous Message | Next Message »
From: Jan Wedvik    Date: October 15 2012 11:47am
Subject: bzr push into mysql-5.5-cluster-7.2 branch (jan.wedvik:4030 to 4031)
Bug#14647210
View as plain text  
 4031 Jan Wedvik	2012-10-15
      This commit fixes Bug#14647210 "CAN CRASH ALL NODES EASILY WHEN 
      RESTARTING MORE THAN 6 NODES SIMULTANEOUSLY":
      
      * It implements Dbdict::execGET_TABINFOREF(). This function will resend 
      GET_TABINFOREQ if the DICT master was busy. 
      
      * It removes local queueing of GET_TABINFOREQ signals (via delayed 
      signals) for signals coming from another DICT block.
      
      * It implements a regression test for this bug.
      
      * It adds a version check, such that GET_TABINFOREF will not be sent
      to DICT blocks where the software is too old to handle that signal.

    modified:
      storage/ndb/include/ndb_version.h.in
      storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp
      storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp
      storage/ndb/test/ndbapi/testDict.cpp
      storage/ndb/test/run-test/daily-basic-tests.txt
 4030 John David Duncan	2012-10-12 [merge]
      merge

    added:
      mysql-test/suite/ndb_memcache/include/misc_tables.inc
    modified:
      mysql-test/lib/My/Memcache.pm
      mysql-test/suite/ndb_memcache/include/datatypes_tables.inc
      mysql-test/suite/ndb_memcache/r/external_values.result
      mysql-test/suite/ndb_memcache/t/external_values.test
      mysql-test/suite/ndb_memcache/t/math3.test
      mysql-test/suite/ndb_memcache/t/mpart_key.test
      mysql-test/suite/ndb_memcache/t/type_char.test
      mysql-test/suite/ndb_memcache/t/unique_idx.test
=== modified file 'storage/ndb/include/ndb_version.h.in'
--- a/storage/ndb/include/ndb_version.h.in	2012-08-24 11:53:18 +0000
+++ b/storage/ndb/include/ndb_version.h.in	2012-10-15 11:46:54 +0000
@@ -809,4 +809,17 @@ ndb_join_pushdown(Uint32 x)
   return x >= NDBD_JOIN_PUSHDOWN;
 }
 
+/**
+ * Before this version, DICT had no handler for GET_TABINFOREF signals
+ * (see Bug#14647210).
+ */
+#define NDBD_DICT_GET_TABINFOREF_IMPLEMENTED NDB_MAKE_VERSION(7, 2, 9)
+
+inline
+int
+ndbd_dict_get_tabinforef_implemented(Uint32 x)
+{
+  return x >= NDBD_DICT_GET_TABINFOREF_IMPLEMENTED;
+}
+
 #endif

=== modified file 'storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp'
--- a/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp	2012-09-21 12:41:59 +0000
+++ b/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp	2012-10-15 11:46:54 +0000
@@ -431,6 +431,13 @@ void Dbdict::execCONTINUEB(Signal* signa
     jam();
     trans_commit_wait_gci(signal);
     break;
+  case ZGET_TABINFO_RETRY:
+    // We have waited a while. Now we send a new request to the master.
+    memmove(signal->theData, signal->theData+1, 
+            GetTabInfoReq::SignalLength * sizeof *signal->theData);
+    sendSignal(calcDictBlockRef(c_masterNodeId), GSN_GET_TABINFOREQ, signal,
+               GetTabInfoReq::SignalLength, JBB);
+    break;
   default :
     ndbrequire(false);
     break;
@@ -1974,6 +1981,7 @@ Dbdict::Dbdict(Block_context& ctx):
   addRecSignal(GSN_DUMP_STATE_ORD, &Dbdict::execDUMP_STATE_ORD);
   addRecSignal(GSN_GET_TABINFOREQ, &Dbdict::execGET_TABINFOREQ);
   addRecSignal(GSN_GET_TABLEID_REQ, &Dbdict::execGET_TABLEDID_REQ);
+  addRecSignal(GSN_GET_TABINFOREF, &Dbdict::execGET_TABINFOREF);
   addRecSignal(GSN_GET_TABINFO_CONF, &Dbdict::execGET_TABINFO_CONF);
   addRecSignal(GSN_CONTINUEB, &Dbdict::execCONTINUEB);
 
@@ -4144,6 +4152,52 @@ Dbdict::restart_fromWriteSchemaFile(Sign
 }
 
 void
+Dbdict::execGET_TABINFOREF(Signal* signal){
+  jamEntry();
+  /** 
+   * Make copy of 'ref' such that we can build 'req' without overwriting 
+   * source.
+   */
+  const GetTabInfoRef ref_copy =
+    *reinterpret_cast<const GetTabInfoRef*>(signal->getDataPtr());
+
+  if (ref_copy.errorCode == GetTabInfoRef::Busy)
+  {
+    jam();
+
+    /**
+     * Master is busy. Send delayed CONTINUEB to self to add some delay, then
+     * send new GET_TABINFOREQ to master.
+     */
+    signal->getDataPtrSend()[0] = ZGET_TABINFO_RETRY;
+
+    GetTabInfoReq* const req =
+      reinterpret_cast<GetTabInfoReq*>(signal->getDataPtrSend()+1);
+    memset(req, 0, sizeof *req);
+    req->senderRef = reference();
+    req->senderData = ref_copy.senderData;
+    req->requestType =
+      GetTabInfoReq::RequestById | GetTabInfoReq::LongSignalConf;
+    req->tableId = ref_copy.tableId;
+    req->schemaTransId = ref_copy.schemaTransId;
+    // Add a random 5-10ms delay.
+    sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 5 + random()%6, 
+                        GetTabInfoReq::SignalLength+1);
+  }
+  else
+  {
+    // Other error. Restart node.
+    char msg[250];
+    BaseString::snprintf(msg, sizeof(msg),
+                         "Got GET_TABINFOREF from node %u with unexpected "
+                         " error code %u", 
+                         refToNode(signal->getSendersBlockRef()),
+                         ref_copy.errorCode);
+    progError(__LINE__, NDBD_EXIT_RESTORE_SCHEMA, msg);
+  }
+} // Dbdict::execGET_TABINFOREF()
+
+void
 Dbdict::execGET_TABINFO_CONF(Signal* signal)
 {
   jamEntry();
@@ -9993,12 +10047,31 @@ void Dbdict::execGET_TABINFOREQ(Signal* 
     return;
   }//if
 
-  const Uint32 MAX_WAITERS = 5;
+  const Uint32 MAX_WAITERS = (MAX_NDB_NODES*3)/2;
 
-  if(c_retrieveRecord.busyState && fromTimeQueue == false)
+  // Test sending GET_TABINFOREF to DICT (Bug#14647210).
+  const bool testRef = refToMain(signal->senderBlockRef()) == DBDICT &&
+    ERROR_INSERTED_CLEAR(6026);
+  
+  if ((c_retrieveRecord.busyState || testRef) && fromTimeQueue == false)
   {
     jam();
-    if(c_retrieveRecord.noOfWaiters < MAX_WAITERS){
+
+    const Uint32 senderVersion = 
+      getNodeInfo(refToNode(signal->senderBlockRef())).m_version;
+
+    /**
+     * DBDICT may possibly generate large numbers of signals if many nodes
+     * are started at the same time, so we do not want to queue those using
+     * sendSignalWithDelay(). See also Bug#14647210. Signals from other 
+     * blocks we do queue locally, since these blocks may not retry on
+     * GET_TABINFOREF with error==busy, and since they also should not 
+     * generate large bursts of GET_TABINFOREQ.
+     */
+    if (c_retrieveRecord.noOfWaiters < MAX_WAITERS &&
+        (refToMain(signal->senderBlockRef()) != DBDICT ||
+         !ndbd_dict_get_tabinforef_implemented(senderVersion)))
+    {
       jam();
       c_retrieveRecord.noOfWaiters++;
 
@@ -10007,6 +10080,12 @@ void Dbdict::execGET_TABINFOREQ(Signal* 
 			  &handle);
       return;
     }
+
+    if (!c_retrieveRecord.busyState)
+    {
+      ndbout << "Sending extra TABINFOREF to node"
+             << refToNode(signal->senderBlockRef()) << endl;
+    }
     releaseSections(handle);
     sendGET_TABINFOREF(signal, req, GetTabInfoRef::Busy, __LINE__);
     return;

=== modified file 'storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp'
--- a/storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp	2012-09-19 07:09:57 +0000
+++ b/storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp	2012-10-15 11:46:54 +0000
@@ -98,6 +98,7 @@
 
 #define ZCOMMIT_WAIT_GCI   6
 #define ZINDEX_STAT_BG_PROCESS 7
+#define ZGET_TABINFO_RETRY 8
 
 /*--------------------------------------------------------------*/
 // Other constants in alphabetical order
@@ -856,7 +857,7 @@ private:
 
   void execGET_TABINFOREQ(Signal* signal);
   void execGET_TABLEDID_REQ(Signal* signal);
-  void execGET_TABINFO_REF(Signal* signal);
+  void execGET_TABINFOREF(Signal* signal);
   void execGET_TABINFO_CONF(Signal* signal);
   void execCONTINUEB(Signal* signal);
 

=== modified file 'storage/ndb/test/ndbapi/testDict.cpp'
--- a/storage/ndb/test/ndbapi/testDict.cpp	2012-09-21 12:34:28 +0000
+++ b/storage/ndb/test/ndbapi/testDict.cpp	2012-10-15 11:46:54 +0000
@@ -8965,6 +8965,56 @@ runBug57057(NDBT_Context* ctx, NDBT_Step
   return result;
 }
 
+/**
+ * This is a regression test for Bug #14647210 "CAN CRASH ALL NODES EASILY 
+ * WHEN RESTARTING MORE THAN 6 NODES SIMULTANEOUSLY". The cause of this bug
+ * was that DICT did not handle GET_TABINFOREF signals.
+ */
+static int
+runGetTabInfoRef(NDBT_Context* ctx, NDBT_Step* step)
+{
+  NdbRestarter restarter;
+  if (restarter.getNumDbNodes() == 1)
+  {
+    g_info << "Cannot do this test with just one datanode." << endl;
+    return NDBT_OK;
+  }
+
+  /**
+   * This error insert makes DICT respond with GET_TABINFOREF where
+   * error==busy when receiving the next GET_TABINFOREQ signal.
+   */
+  require(restarter.insertErrorInAllNodes(6026) == 0);
+
+  int nodeSet[MAX_NDB_NODES];
+  for (int i = 0; i < restarter.getNumDbNodes() - 1; i++)
+  {
+    nodeSet[i] = restarter.getDbNodeId(i);
+    g_info << "Node " << nodeSet[i] << " will be stopped." << endl;
+  }
+
+  require(restarter.restartNodes(nodeSet, restarter.getNumDbNodes() - 1,
+                                 NdbRestarter::NRRF_NOSTART |
+                                 NdbRestarter::NRRF_ABORT) == 0);
+
+  g_info << "Waiting for nodes to stop." << endl;
+  require(restarter.waitNodesNoStart(nodeSet, restarter.getNumDbNodes() - 1)
+          == 0);
+
+  require(restarter.startNodes(nodeSet, restarter.getNumDbNodes() - 1) == 0);
+
+  g_info << "Waiting for nodes to start again." << endl;
+  if (restarter.waitClusterStarted() != 0)
+  {
+    g_err << "Failed to restart cluster " << endl;
+    require(restarter.insertErrorInAllNodes(0) == 0);
+    return NDBT_FAILED;
+  }
+
+  require(restarter.insertErrorInAllNodes(0) == 0);
+  return NDBT_OK;
+} // runGetTabInfoRef()
+
 int
 runBug13416603(NDBT_Context* ctx, NDBT_Step* step)
 {
@@ -9655,6 +9705,12 @@ TESTCASE("Bug57057",
   TC_PROPERTY("SubSteps", 1);
   STEP(runBug58277scan);
 }
+TESTCASE("GetTabInfoRef", "Regression test for bug #14647210 'CAN CRASH ALL "
+         "NODES EASILY WHEN RESTARTING MORE THAN 6 NODES SIMULTANEOUSLY'"
+         " (missing handling of GET_TABINFOREF signal).")
+{
+  INITIALIZER(runGetTabInfoRef);
+}
 TESTCASE("Bug13416603", "")
 {
   INITIALIZER(runCreateTheTable);

=== modified file 'storage/ndb/test/run-test/daily-basic-tests.txt'
--- a/storage/ndb/test/run-test/daily-basic-tests.txt	2012-09-21 12:41:59 +0000
+++ b/storage/ndb/test/run-test/daily-basic-tests.txt	2012-10-15 11:46:54 +0000
@@ -1876,6 +1876,10 @@ cmd: testDict
 args: -n DropWithTakeover T1
 
 max-time: 300
+cmd: testDict
+args: -n GetTabInfoRef T1
+
+max-time: 300
 cmd: testBasic
 args: -n LeakApiConnectObjects T1
 

No bundle (reason: useless for push emails).
Thread
bzr push into mysql-5.5-cluster-7.2 branch (jan.wedvik:4030 to 4031) Bug#14647210 | Jan Wedvik | 15 Oct