List: Commits « Previous Message | Next Message »
From: Jonas Oreland  Date: April 1 2009 11:25am
Subject: bzr commit into mysql-5.1-telco-6.2 branch (jonas:2894) Bug#43888
View as plain text  
#At file:///home/jonas/src/telco-6.2/

 2894 Jonas Oreland	2009-04-01
      ndb - bug#43888 - fix race condition with an ndb node dying during restart, when it is just about to be included into GCP
modified:
  storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp
  storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp
  storage/ndb/test/ndbapi/testNodeRestart.cpp
  storage/ndb/test/run-test/daily-basic-tests.txt

=== modified file 'storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp'
--- a/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp	2008-04-23 14:29:01 +0000
+++ b/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp	2009-04-01 11:25:38 +0000
@@ -1479,7 +1479,7 @@ private:
     Uint32 failNr;
     bool activeState;
     bool blockLcp;
-    bool blockGcp;
+    Uint32 blockGcp; // 0, 1=ordered, 2=effective
     Uint32 startInfoErrorCode;
     Uint32 m_outstandingGsn;
   };

=== modified file 'storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp'
--- a/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp	2009-02-04 12:35:22 +0000
+++ b/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp	2009-04-01 11:25:38 +0000
@@ -2217,7 +2217,7 @@ void Dbdih::nodeDictStartConfLab(Signal*
   // THE DICTIONARY AGAIN.
   /*-------------------------------------------------------------------------*/
   c_nodeStartMaster.wait = ZFALSE;
-  c_nodeStartMaster.blockGcp = true;
+  c_nodeStartMaster.blockGcp = 1;
 
   return;
 }//Dbdih::nodeDictStartConfLab()
@@ -2350,7 +2350,7 @@ void Dbdih::execINCL_NODECONF(Signal* si
   // various blocks we are ready to start the global checkpoint protocol
   /*------------------------------------------------------------------------*/
   c_nodeStartMaster.wait = 11;
-  c_nodeStartMaster.blockGcp = false;
+  c_nodeStartMaster.blockGcp = 0;
 
   /**
    * Restart GCP
@@ -4416,7 +4416,7 @@ void Dbdih::execNODE_FAILREP(Signal* sig
   Uint32 newMasterId = nodeFail->masterNodeId;
   const Uint32 noOfFailedNodes = nodeFail->noOfNodes;
 
-  if (ERROR_INSERTED(7179))
+  if (ERROR_INSERTED(7179) || ERROR_INSERTED(7217))
   {
     CLEAR_ERROR_INSERT_VALUE;
   }
@@ -4426,6 +4426,8 @@ void Dbdih::execNODE_FAILREP(Signal* sig
     SET_ERROR_INSERT_VALUE(7000);
   }
 
+
+
   /*-------------------------------------------------------------------------*/
   // The first step is to convert from a bit mask to an array of failed nodes.
   /*-------------------------------------------------------------------------*/
@@ -8246,7 +8248,7 @@ Dbdih::execUPGRADE_PROTOCOL_ORD(Signal* 
 void
 Dbdih::startGcpLab(Signal* signal, Uint32 aWaitTime) 
 {
-  if (c_nodeStartMaster.blockGcp == true &&
+  if (c_nodeStartMaster.blockGcp != 0 &&
       m_gcp_save.m_master.m_state == GcpSave::GCP_SAVE_IDLE)
   {
     jam();
@@ -8255,9 +8257,27 @@ Dbdih::startGcpLab(Signal* signal, Uint3
     /*  A NEW NODE WANTS IN AND WE MUST ALLOW IT TO COME IN NOW SINCE THE */
     /*       GCP IS COMPLETED.                                            */
     /* ------------------------------------------------------------------ */
-    gcpBlockedLab(signal);
-    return;
-  }//if
+
+    if (ERROR_INSERTED(7217))
+    {
+      jam();
+      
+      signal->theData[0] = 9999;
+      sendSignal(numberToRef(CMVMI, refToNode(c_nodeStartMaster.startNode)),
+                 GSN_NDB_TAMPER, signal, 1, JBB);
+
+      m_micro_gcp.m_master.m_start_time = 0; // Force start
+      // fall through
+    }
+    else
+    {
+      jam();
+      ndbrequire(c_nodeStartMaster.blockGcp == 1); // Ordered...
+      c_nodeStartMaster.blockGcp = 2; // effective
+      gcpBlockedLab(signal);
+      return;
+    }
+  }
 
   if ((cgcpOrderBlocked == 1) ||
       (cfirstVerifyQueue != RNIL)) {
@@ -12212,7 +12232,7 @@ void Dbdih::crashSystemAtGcpStop(Signal*
   if (local)
     goto dolocal;
 
-  if (c_nodeStartMaster.blockGcp)
+  if (c_nodeStartMaster.blockGcp == 2)
   {
     jam();
     /**
@@ -13104,7 +13124,7 @@ void Dbdih::initCommonData()
   cverifyQueueCounter = 0;
   cwaitLcpSr = false;
   c_nextLogPart = 0;
-  c_nodeStartMaster.blockGcp = false;
+  c_nodeStartMaster.blockGcp = 0;
 
   nodeResetStart(0);
   c_nodeStartMaster.wait = ZFALSE;
@@ -13900,17 +13920,17 @@ void Dbdih::newCrashedReplica(Uint32 nod
 void Dbdih::nodeResetStart(Signal *signal)
 {
   jam();
-  bool startGCP = c_nodeStartMaster.blockGcp;
+  Uint32 startGCP = c_nodeStartMaster.blockGcp;
 
   c_nodeStartSlave.nodeId = 0;
   c_nodeStartMaster.startNode = RNIL;
   c_nodeStartMaster.failNr = cfailurenr;
   c_nodeStartMaster.activeState = false;
-  c_nodeStartMaster.blockGcp = false;
+  c_nodeStartMaster.blockGcp = 0;
   c_nodeStartMaster.blockLcp = false;
   c_nodeStartMaster.m_outstandingGsn = 0;
 
-  if (startGCP)
+  if (startGCP == 2) // effective
   {
     jam();
     ndbrequire(isMaster());

=== modified file 'storage/ndb/test/ndbapi/testNodeRestart.cpp'
--- a/storage/ndb/test/ndbapi/testNodeRestart.cpp	2009-01-30 10:41:42 +0000
+++ b/storage/ndb/test/ndbapi/testNodeRestart.cpp	2009-04-01 11:25:38 +0000
@@ -3219,6 +3219,60 @@ runBug42422(NDBT_Context* ctx, NDBT_Step
   return NDBT_OK;
 }
 
+int
+runBug43888(NDBT_Context* ctx, NDBT_Step* step) // Bug#43888 regression: hit a starting node with error insert 7217 on the master, then check that the cluster recovers
+{
+  NdbRestarter res;
+  // A non-master data node is needed as restart target; with fewer than two data nodes, end the test and report OK.
+  if (res.getNumDbNodes() < 2)
+  {
+    ctx->stopTest();
+    return NDBT_OK;
+  }
+  // Repeat the scenario once per configured loop.
+  int loops = ctx->getNumLoops();
+  while (--loops >= 0)
+  {
+    int master = res.getMasterNodeId();
+    ndbout_c("master: %u", master);
+    int nodeId = master;
+    do { // re-draw until the random pick is not the master
+      nodeId = res.getNode(NdbRestarter::NS_RANDOM);
+    } while (nodeId == master);
+
+    ndbout_c("target: %u", nodeId);
+    // Abort-restart the target (non-initial) and leave it in the not-started state.
+    res.restartOneDbNode(nodeId,
+                         /** initial */ false, 
+                         /** nostart */ true,
+                         /** abort   */ true);
+    // NOTE(review): return values of the restart/wait/insert/start calls in this loop are not checked.
+    res.waitNodesNoStart(&nodeId, 1);
+    // Dump CmvmiSetRestartOnErrorInsert=1 to the target; presumably makes an error-insert hit restart it into not-started rather than exit - confirm.
+    int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
+    if (res.dumpStateOneNode(nodeId, val2, 2))
+      return NDBT_FAILED;
+    // 7217 on the master: per the DbdihMain.cpp hunk of this commit, startGcpLab then sends NDB_TAMPER(9999) to the starting node and goes on to start the GCP instead of blocking it.
+    res.insertErrorInNode(master, 7217);
+    res.startNodes(&nodeId, 1);
+    NdbSleep_SecSleep(3);
+    ndbout_c("%u : waiting for %u to not get not-started", __LINE__, nodeId);
+    res.waitNodesNoStart(&nodeId, 1);
+    // The target is expected to have dropped back to not-started; start it again without the error insert.
+    ndbout_c("%u : starting %u", __LINE__, nodeId);
+    res.startNodes(&nodeId, 1);
+    // The second start must bring the whole cluster up, otherwise the test fails.
+    ndbout_c("%u : waiting for cluster started", __LINE__);
+    if (res.waitClusterStarted())
+    {
+      return NDBT_FAILED;
+    }
+  }
+
+  ctx->stopTest();
+  return NDBT_OK;
+}
+
 NDBT_TESTSUITE(testNodeRestart);
 TESTCASE("NoLoad", 
 	 "Test that one node at a time can be stopped and then restarted "\
@@ -3675,6 +3729,9 @@ TESTCASE("Bug41469", ""){
 TESTCASE("Bug42422", ""){
   INITIALIZER(runBug42422);
 }
+TESTCASE("Bug43888", ""){
+  INITIALIZER(runBug43888);
+}
 NDBT_TESTSUITE_END(testNodeRestart);
 
 int main(int argc, const char** argv){

=== modified file 'storage/ndb/test/run-test/daily-basic-tests.txt'
--- a/storage/ndb/test/run-test/daily-basic-tests.txt	2009-03-30 12:03:52 +0000
+++ b/storage/ndb/test/run-test/daily-basic-tests.txt	2009-04-01 11:25:38 +0000
@@ -1272,3 +1272,6 @@ max-time: 300
 cmd: testScan
 args: -n Bug42545 -l 1 T1
 
+max-time: 1200
+cmd: testNodeRestart
+args: -n Bug43888 T1

Thread
bzr commit into mysql-5.1-telco-6.2 branch (jonas:2894) Bug#43888 - Jonas Oreland, 1 Apr