List: Commits
From: Jan Wedvik  Date: August 22 2012 11:16am
Subject: bzr push into mysql-5.5-cluster-7.2 branch (jan.wedvik:3978 to 3979)
 3979 Jan Wedvik	2012-08-22 [merge]
      Merged 7.1 -> 7.2

    modified:
      storage/ndb/include/kernel/signaldata/SchemaTrans.hpp
      storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp
      storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp
      storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp
      storage/ndb/test/ndbapi/testDict.cpp
      storage/ndb/test/run-test/daily-basic-tests.txt
 3978 Martin Skold	2012-08-17
      Bump version to 7.2.9

    modified:
      VERSION
      storage/ndb/VERSION
=== modified file 'storage/ndb/include/kernel/signaldata/SchemaTrans.hpp'
--- a/storage/ndb/include/kernel/signaldata/SchemaTrans.hpp	2011-10-07 18:15:59 +0000
+++ b/storage/ndb/include/kernel/signaldata/SchemaTrans.hpp	2012-08-22 11:15:43 +0000
@@ -81,6 +81,7 @@ struct SchemaTransEndConf {
 struct SchemaTransEndRef {
   enum ErrorCode {
     NoError = 0,
+    Busy = 701,
     NotMaster = 702,
     InvalidTransKey = 781,
     InvalidTransId = 782,

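The new Busy (701) code is what DBDICT will now return to a client that tries to begin or end a schema transaction while a DICT master takeover is still in progress (see the Dbdict.cpp hunks below). A minimal client-side sketch of how such a temporary error might be handled by retrying; the retryCreateTable helper, the 100 ms back-off and the retry cap are illustrative assumptions, not part of this patch:

#include <NdbApi.hpp>
#include <NdbSleep.h>   // NdbSleep_MilliSleep(), as used by the NDB test programs

/* Hypothetical helper: retry createTable while the data nodes report a
 * temporary "busy" condition, such as Busy (701) during a master takeover. */
static int
retryCreateTable(Ndb* ndb, const NdbDictionary::Table& tab, int maxRetries)
{
  NdbDictionary::Dictionary* dict = ndb->getDictionary();
  for (int attempt = 0; attempt < maxRetries; attempt++)
  {
    if (dict->createTable(tab) == 0)
      return 0;                                       // success
    const NdbError& err = dict->getNdbError();
    if (err.code != 701 && err.status != NdbError::TemporaryError)
      return -1;                                      // permanent error, give up
    NdbSleep_MilliSleep(100);                         // back off, then try again
  }
  return -1;                                          // still busy after maxRetries
}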
=== modified file 'storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp'
--- a/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp	2012-06-21 15:24:52 +0000
+++ b/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp	2012-08-22 11:15:43 +0000
@@ -2225,6 +2225,7 @@ void Dbdict::initCommonData()
   c_systemRestart = false;
   c_initialNodeRestart = false;
   c_nodeRestart = false;
+  c_takeOverInProgress = false;
 
   c_outstanding_sub_startstop = 0;
   c_sub_startstop_lock.clear();
@@ -4619,6 +4620,7 @@ void Dbdict::handle_master_takeover(Sign
   }
   DictTakeoverReq* req = (DictTakeoverReq*)signal->getDataPtrSend();
   req->senderRef = reference();
+  c_takeOverInProgress = true;
   sendSignal(rg, GSN_DICT_TAKEOVER_REQ, signal,
                DictTakeoverReq::SignalLength, JBB);
 }
@@ -7827,8 +7829,8 @@ Dbdict::dropTable_complete_done(Signal*
     sendSignal(SUMA_REF, GSN_DROP_TAB_CONF, signal,
                DropTabConf::SignalLength, JBB);
   }
-
-  sendTransConf(signal, trans_ptr);
+  ndbassert(op_ptr.p->m_state == SchemaOp::OS_COMPLETING);
+  sendTransConf(signal, op_ptr);
 }
 
 // DropTable: ABORT
@@ -19745,12 +19747,16 @@ Dbdict::execDICT_TAKEOVER_REQ(Signal* si
        case SchemaOp::OS_COMMITTING:
        case SchemaOp::OS_COMPLETING:
        {
+         /**
+          * Wait 100ms and check again. This delay is there to save CPU cycles
+          * and to avoid filling the jam trace buffer.
+          */
          jam();
          Uint32* data = &signal->theData[0];
-         data[0] = ZDICT_TAKEOVER_REQ;
          memmove(&data[1], &data[0], DictTakeoverReq::SignalLength << 2);
-         sendSignal(reference(), GSN_CONTINUEB, signal,
-                    1 + DictTakeoverReq::SignalLength, JBB);
+         data[0] = ZDICT_TAKEOVER_REQ;
+         sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
+                             100, 1 + DictTakeoverReq::SignalLength);
          return;
        }
        default:
@@ -19873,6 +19879,7 @@ Dbdict::execDICT_TAKEOVER_REF(Signal* si
       return;
     }
   }
+  c_takeOverInProgress = false;
   check_takeover_replies(signal);
 }
 
@@ -19918,6 +19925,7 @@ Dbdict::execDICT_TAKEOVER_CONF(Signal* s
       return;
     }
   }
+  c_takeOverInProgress = false;
   check_takeover_replies(signal);
 }
 
@@ -24813,6 +24821,18 @@ Dbdict::execSCHEMA_TRANS_BEGIN_REQ(Signa
       break;
     }
 
+    if (c_takeOverInProgress)
+    {
+      /**
+       * There is a dict takeover in progress. There may thus be another
+       * transaction that should be rolled backward or forward before we
+       * can allow another transaction to start.
+       */
+      jam();
+      setError(error, SchemaTransBeginRef::Busy, __LINE__);
+      break;
+    }
+
     if (!check_ndb_versions() && !localTrans)
     {
       jam();
@@ -25010,6 +25030,18 @@ Dbdict::execSCHEMA_TRANS_END_REQ(Signal*
       setError(error, SchemaTransEndRef::NotMaster, __LINE__);
       break;
     }
+
+    if (c_takeOverInProgress)
+    {
+      /**
+       * There is a dict takeover in progress, and the transaction may thus
+       * be in an inconsistent state. Therefore we cannot process this request
+       * now.
+       */
+      jam();
+      setError(error, SchemaTransEndRef::Busy, __LINE__);
+      break;
+    }
 #ifdef MARTIN
     ndbout_c("Dbdict::execSCHEMA_TRANS_END_REQ: trans %u, state %u", trans_ptr.i, trans_ptr.p->m_state);
 #endif
@@ -25222,6 +25254,17 @@ Dbdict::execSCHEMA_TRANS_IMPL_CONF(Signa
   jamEntry();
   ndbrequire(signal->getNoOfSections() == 0);
 
+  if (c_takeOverInProgress)
+  {
+    /**
+     * The new master will rebuild the transaction state from the
+     * DICT_TAKEOVER_CONF signals. Therefore we ignore this signal during 
+     * takeover.
+     */
+    jam();
+    return;
+  }
+
   const SchemaTransImplConf* conf =
     (const SchemaTransImplConf*)signal->getDataPtr();
 
@@ -25248,6 +25291,17 @@ Dbdict::execSCHEMA_TRANS_IMPL_REF(Signal
   jamEntry();
   ndbrequire(signal->getNoOfSections() == 0);
 
+  if (c_takeOverInProgress)
+  {
+    /**
+     * The new master will rebuild the transaction state from the
+     * DICT_TAKEOVER_CONF signals. Therefore we ignore this signal during 
+     * takeover.
+     */
+    jam();
+    return;
+  }
+
   SchemaTransImplRef refCopy =
     *(SchemaTransImplRef*)signal->getDataPtr();
   SchemaTransImplRef * ref = &refCopy;

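One detail in the execDICT_TAKEOVER_REQ hunk above is easy to miss: besides switching from sendSignal to sendSignalWithDelay, the patch moves the assignment of data[0] to after the memmove. With the old ordering the CONTINUEB tag was written into data[0] first and then shifted along with the rest of the payload, so the first word of the saved DictTakeoverReq was overwritten. A self-contained sketch of the two orderings, using a plain array and made-up values in place of the real Signal buffer:

#include <cstring>
#include <cstdio>

int main()
{
  const unsigned TAG = 41;                // stands in for ZDICT_TAKEOVER_REQ
  const unsigned LEN = 3;                 // stands in for DictTakeoverReq::SignalLength

  unsigned oldOrder[8] = {100, 200, 300};               // pretend request payload
  oldOrder[0] = TAG;                                     // old code: tag written first ...
  std::memmove(&oldOrder[1], &oldOrder[0], LEN * 4);     // ... so the shift copies the tag; word 100 is gone

  unsigned newOrder[8] = {100, 200, 300};
  std::memmove(&newOrder[1], &newOrder[0], LEN * 4);     // new code: shift the payload first
  newOrder[0] = TAG;                                     // then prepend the tag

  std::printf("old: %u %u %u %u\n", oldOrder[0], oldOrder[1], oldOrder[2], oldOrder[3]); // 41 41 200 300
  std::printf("new: %u %u %u %u\n", newOrder[0], newOrder[1], newOrder[2], newOrder[3]); // 41 100 200 300
  return 0;
}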
=== modified file 'storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp'
--- a/storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp	2012-01-23 08:29:37 +0000
+++ b/storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp	2012-08-22 11:15:43 +0000
@@ -1319,6 +1319,13 @@ private:
   bool   c_nodeRestart;
   bool   c_initialNodeRestart;
   Uint32 c_tabinfoReceived;
+  /**
+   * This flag indicates that a dict takeover is in progress, specifically 
+   * that the new master has outstanding DICT_TAKEOVER_REQ messages. The flag
+   * is used to prevent clients from starting (or ending) transactions during
+   * takeover.
+   */
+  bool   c_takeOverInProgress;
 
   /**
    * Temporary structure used when parsing table info

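Taken together with the Dbdict.cpp hunks above, the lifecycle of the new flag can be summarised as follows (a recap of this patch, not new code):

  initCommonData()                         c_takeOverInProgress = false;
  handle_master_takeover()                 c_takeOverInProgress = true;   (set just before DICT_TAKEOVER_REQ is sent)
  execDICT_TAKEOVER_REF()/_CONF()          c_takeOverInProgress = false;  (cleared just before check_takeover_replies())
  execSCHEMA_TRANS_BEGIN_REQ()/_END_REQ()  rejected with Busy while the flag is set
  execSCHEMA_TRANS_IMPL_CONF()/_REF()      ignored while the flag is set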
=== modified file 'storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp'
--- a/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp	2012-08-13 15:19:55 +0000
+++ b/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp	2012-08-22 11:15:43 +0000
@@ -2513,7 +2513,31 @@ Dblqh::dropTab_wait_usage(Signal* signal
 void
 Dblqh::execDROP_TAB_REQ(Signal* signal){
   jamEntry();
-
+  if (ERROR_INSERTED(5076))
+  {
+    /**
+     * This error insert simulates a situation where it takes a long time
+     * to execute DROP_TAB_REQ, such that we can crash the (dict) master
+     * while there is an outstanding DROP_TAB_REQ.
+     */
+    jam();
+    sendSignalWithDelay(reference(), GSN_DROP_TAB_REQ, signal, 1000,
+                        signal->getLength());
+    return;
+  }
+  if (ERROR_INSERTED(5077))
+  {
+    jam();
+    CLEAR_ERROR_INSERT_VALUE;
+    /** 
+     * Kill this node 2 seconds from now. We wait for two seconds to make sure
+     * that DROP_TAB_REQ messages have reached other nodes before this one
+     * dies.
+     */
+    signal->theData[0] = 9999;
+    sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 2000, 1);
+    return;
+  }
   DropTabReq reqCopy = * (DropTabReq*)signal->getDataPtr();
   DropTabReq* req = &reqCopy;
   
@@ -23709,7 +23733,7 @@ void Dblqh::execDBINFO_SCANREQ(Signal *s
       row.write_uint32(getOwnNodeId());
       row.write_uint32(0);              // log type, 0 = REDO
       row.write_uint32(0);              // log id, always 0 in LQH
-      row.write_uint32(logpart);        // log part
+      row.write_uint32(logPartPtr.p->logPartNo); // log part
 
       row.write_uint64(total*1024*1024);          // total allocated
       row.write_uint64((total-mb)*1024*1024);     // currently in use

=== modified file 'storage/ndb/test/ndbapi/testDict.cpp'
--- a/storage/ndb/test/ndbapi/testDict.cpp	2011-12-15 08:28:12 +0000
+++ b/storage/ndb/test/ndbapi/testDict.cpp	2012-08-22 10:49:42 +0000
@@ -776,6 +776,144 @@ int runUseTableUntilStopped3(NDBT_Contex
   return NDBT_OK;
 }
 
+/**
+ * This is a regression test for bug 14190114 
+ * "CLUSTER CRASH DUE TO NDBREQUIRE IN ./LOCALPROXY.HPP DBLQH (LINE: 234)".
+ * This bug occurs if there is a takeover (i.e. the master node crashes) 
+ * while an LQH block is executing a DROP_TAB_REQ signal. It only affects
+ * multi-threaded ndb.
+ */
+static int
+runDropTakeoverTest(NDBT_Context* ctx, NDBT_Step* step)
+{
+  NdbRestarter restarter;
+  if (restarter.getNumDbNodes() == 1)
+  {
+    g_info << "Cannot do this test with just one datanode." << endl;
+    return NDBT_OK;
+  }
+
+  Ndb* const ndb = GETNDB(step);
+  NdbDictionary::Dictionary* const dict = ndb->getDictionary();
+
+  // First we create a table that is a copy of ctx->getTab().
+  NdbDictionary::Table copyTab(*ctx->getTab());
+  const char* copyName = "copyTab";
+
+  copyTab.setName(copyName);
+  if (dict->createTable(copyTab) != 0)
+  {
+    g_err << "Failed to create table " << copyName << endl
+          << dict->getNdbError() << endl;
+    return NDBT_FAILED;
+  }
+
+  /**
+   * Find the node id of the master node and another data node that is not 
+   * the master.
+   */
+  const int masterNodeId = restarter.getMasterNodeId();
+  const int nonMasterNodeId =
+    masterNodeId == restarter.getDbNodeId(0) ?
+    restarter.getDbNodeId(1) : 
+    restarter.getDbNodeId(0);
+
+  /**
+   * This error insert makes LQH resend the DROP_TAB_REQ to itself (with a
+   * delay) rather than executing it, until the error insert is reset.
+   * This makes it appear as though the LQH block spends a long time
+   * executing the DROP_TAB_REQ signal.
+   */
+  g_info << "Insert error 5076 in node " << nonMasterNodeId << endl;
+  require(restarter.insertErrorInNode(nonMasterNodeId, 5076) == 0);
+  /**
+   * This error insert makes the master node crash when one of its LQH 
+   * blocks tries to execute a DROP_TAB_REQ signal. This will then trigger
+   * a takeover.
+   */
+  g_info << "Insert error 5077 in node " << masterNodeId << endl;
+  require(restarter.insertErrorInNode(masterNodeId, 5077) == 0);
+
+  // This dropTable should fail, since the master node dies.
+  g_info << "Trying to drop table " << copyName << endl;
+  if (dict->dropTable(copyName) == 0)
+  {
+    g_err << "Unexpectedly managed to drop table " << copyName << endl;
+    return NDBT_FAILED;
+  }
+
+  /** 
+   * Check that only old master is dead. Bug 14190114 would cause other nodes
+   * to die as well.
+   */
+  const int deadNodeId = restarter.checkClusterAlive(&masterNodeId, 1);
+  if (deadNodeId != 0)
+  {
+    g_err << "NodeId " << deadNodeId << " is down." << endl;
+    return NDBT_FAILED;
+  }
+  
+  // Reset error insert.
+  g_info << "insert error 0 in node " << nonMasterNodeId << endl;
+  require(restarter.insertErrorInNode(nonMasterNodeId, 0) == 0);
+
+  // Verify that old master comes back up, and that no other node crashed.
+  g_info << "Waiting for all nodes to be up." << endl;
+  if (restarter.waitClusterStarted() != 0)
+  {
+    g_err << "One or more cluster nodes are not up." << endl;
+    return NDBT_FAILED;
+  }
+
+  /**
+   * The 'drop table' operation should have been rolled forward, since the
+   * node crash happened in the complete phase. Verify that the table is 
+   * gone.
+   */
+  g_info << "Verifying that table " << copyName << " was deleted." << endl;
+  if (dict->getTable(copyName) == NULL)
+  {
+    if (dict->getNdbError().code != 723) // 723 = no such table existed.
+    {
+      g_err << "dict->getTable() for " << copyName 
+            << " failed in unexpected way:" << endl
+            << dict->getNdbError() << endl;
+      return NDBT_FAILED;
+    }
+  }
+  else
+  {
+    g_err << "Transaction dropping " << copyName << " was not rolled forward"
+          << endl;
+    return NDBT_FAILED;
+  }
+  
+  /** 
+   * Do another dictionary transaction, to verify that the cluster allows that.
+   */
+  NdbDictionary::Table extraTab(*ctx->getTab());
+  const char* extraName = "extraTab";
+
+  extraTab.setName(extraName);
+  g_info << "Trying to create table " << extraName << endl;
+  if (dict->createTable(extraTab) != 0)
+  {
+    g_err << "Failed to create table " << extraName << endl
+          << dict->getNdbError() << endl;
+    return NDBT_FAILED;
+  }
+
+  // Clean up by dropping extraTab.
+  g_info << "Trying to drop table " << extraName << endl;
+  if (dict->dropTable(extraName) != 0)
+  {
+    g_err << "Failed to drop table " << extraName << endl
+          << dict->getNdbError() << endl;
+    return NDBT_FAILED;
+  }
+  
+  return NDBT_OK;
+}
 
 int
 runCreateMaxTables(NDBT_Context* ctx, NDBT_Step* step)
@@ -9076,6 +9214,9 @@ TESTCASE("CreateAndDropDuring",
   STEP(runCreateAndDropDuring);
   STEP(runUseTableUntilStopped);
 }
+TESTCASE("DropWithTakeover","bug 14190114"){
+  INITIALIZER(runDropTakeoverTest);
+}
 TESTCASE("CreateInvalidTables", 
 	 "Try to create the invalid tables we have defined\n"){ 
   INITIALIZER(runCreateInvalidTables);

=== modified file 'storage/ndb/test/run-test/daily-basic-tests.txt'
--- a/storage/ndb/test/run-test/daily-basic-tests.txt	2012-08-13 14:03:42 +0000
+++ b/storage/ndb/test/run-test/daily-basic-tests.txt	2012-08-22 11:15:43 +0000
@@ -1868,6 +1868,10 @@ cmd: testDict
 args: -n IndexStatCreate T1
 
 max-time: 300
+cmd: testDict
+args: -n DropWithTakeover T1
+
+max-time: 300
 cmd: testBasic
 args: -n LeakApiConnectObjects T1
 

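For a manual run outside the autotest harness, the new case would presumably be invoked like the other testDict cases listed here, e.g.

  testDict -n DropWithTakeover T1

against a cluster with at least two data nodes (with a single data node runDropTakeoverTest returns NDBT_OK without doing anything).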
No bundle (reason: useless for push emails).