List:Commits« Previous MessageNext Message »
From:jonas Date:February 2 2007 5:07pm
Subject:bk commit into 5.1 tree (jonas:1.2419) BUG#25984
View as plain text  
Below is the list of changes that have just been committed into a local
5.1 repository of jonas. When jonas does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet@stripped, 2007-02-02 17:07:15+01:00, jonas@eel.(none) +3 -0
  ndb - bug#25984 - more than 7 failed node restarts can cause cluster failure
  new behaviour is as follows:
  1) the node is refused to start, and should fail with a message in the error log that it must be
restarted with --initial
  2) if the cluster fails in this situation, the node must also be restarted with --initial;
     if not, SR will fail with this message

  storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp@stripped, 2007-02-02 17:07:12+01:00,
jonas@eel.(none) +27 -2
    Prevent node from starting _at all_ if it has performed more than 6 failed
      node restarts.
    

  storage/ndb/test/ndbapi/testNodeRestart.cpp@stripped, 2007-02-02 17:07:13+01:00,
jonas@eel.(none) +98 -0
    test prg 25984

  storage/ndb/test/run-test/daily-basic-tests.txt@stripped, 2007-02-02 17:07:13+01:00,
jonas@eel.(none) +4 -0
    testcase

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User:	jonas
# Host:	eel.(none)
# Root:	/home/jonas/src/51-work

--- 1.64/storage/ndb/test/run-test/daily-basic-tests.txt	2007-02-02 17:07:23 +01:00
+++ 1.65/storage/ndb/test/run-test/daily-basic-tests.txt	2007-02-02 17:07:23 +01:00
@@ -525,6 +525,10 @@
 cmd: testNodeRestart
 args: -n Bug25554 T1
 
+max-time: 1000
+cmd: testNodeRestart
+args: -n Bug25984
+
 #
 # DICT TESTS
 max-time: 1500

--- 1.100/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp	2007-02-02 17:07:23 +01:00
+++ 1.101/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp	2007-02-02 17:07:23 +01:00
@@ -1525,10 +1525,26 @@
        */
       SYSFILE->lastCompletedGCI[nodePtr.i] = 0;
       ndbrequire(nodePtr.p->nodeStatus != NodeRecord::ALIVE);
-      warningEvent("Making filesystem for node %d unusable",
+      warningEvent("Making filesystem for node %d unusable (need --initial)",
 		   nodePtr.i);
     }
+    else if (nodePtr.p->nodeStatus == NodeRecord::ALIVE &&
+	     SYSFILE->lastCompletedGCI[nodePtr.i] == 0)
+    {
+      jam();
+      CRASH_INSERTION(7170);
+      char buf[255];
+      BaseString::snprintf(buf, sizeof(buf), 
+			   "Cluster requires this node to be started "
+			   " with --initial as partial start has been performed"
+			   " and this filesystem is unusable");
+      progError(__LINE__, 
+		NDBD_EXIT_SR_RESTARTCONFLICT,
+		buf);
+      ndbrequire(false);
+    }
   }
+
   /**
    * This set which GCI we will try to restart to
    */
@@ -12515,14 +12531,23 @@
   /*       THAT THE NEW REPLICA IS NOT STARTED YET AND REPLICA_LAST_GCI IS*/
   /*       SET TO -1 TO INDICATE THAT IT IS NOT DEAD YET.                 */
   /*----------------------------------------------------------------------*/
+  Uint32 lastGCI = SYSFILE->lastCompletedGCI[nodeId];
   arrGuardErr(ncrReplicaPtr.p->noCrashedReplicas + 1, 8,
               NDBD_EXIT_MAX_CRASHED_REPLICAS);
   ncrReplicaPtr.p->replicaLastGci[ncrReplicaPtr.p->noCrashedReplicas] = 
-    SYSFILE->lastCompletedGCI[nodeId];
+    lastGCI;
   ncrReplicaPtr.p->noCrashedReplicas = ncrReplicaPtr.p->noCrashedReplicas + 1;
   ncrReplicaPtr.p->createGci[ncrReplicaPtr.p->noCrashedReplicas] = 0;
   ncrReplicaPtr.p->replicaLastGci[ncrReplicaPtr.p->noCrashedReplicas] = 
     (Uint32)-1;
+
+  if (ncrReplicaPtr.p->noCrashedReplicas == 7 && lastGCI)
+  {
+    jam();
+    SYSFILE->lastCompletedGCI[nodeId] = 0;
+    warningEvent("Making filesystem for node %d unusable (need --initial)",
+		 nodeId);
+  }
 }//Dbdih::newCrashedReplica()
 
 /*************************************************************************/

--- 1.38/storage/ndb/test/ndbapi/testNodeRestart.cpp	2007-02-02 17:07:23 +01:00
+++ 1.39/storage/ndb/test/ndbapi/testNodeRestart.cpp	2007-02-02 17:07:23 +01:00
@@ -1178,6 +1178,101 @@
   return NDBT_OK;
 }
 
+int runBug25984(NDBT_Context* ctx, NDBT_Step* step){
+  
+  int result = NDBT_OK;
+  int loops = ctx->getNumLoops();
+  int records = ctx->getNumRecords();
+  NdbRestarter restarter;
+
+  if (restarter.getNumDbNodes() < 2)
+    return NDBT_OK;
+
+  if (restarter.restartAll(true, true, true))
+    return NDBT_FAILED;
+
+  if (restarter.waitClusterNoStart())
+    return NDBT_FAILED;
+
+  if (restarter.startAll())
+    return NDBT_FAILED;
+
+  if (restarter.waitClusterStarted())
+    return NDBT_FAILED;
+
+  int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };    
+  int master = restarter.getMasterNodeId();
+  int victim = restarter.getRandomNodeOtherNodeGroup(master, rand());
+  if (victim == -1)
+    victim = restarter.getRandomNodeSameNodeGroup(master, rand());
+
+  restarter.restartOneDbNode(victim, false, true, true);
+
+  for (Uint32 i = 0; i<6; i++)
+  {
+    ndbout_c("Loop: %d", i);
+    if (restarter.waitNodesNoStart(&victim, 1))
+      return NDBT_FAILED;
+    
+    if (restarter.dumpStateOneNode(victim, val2, 2))
+      return NDBT_FAILED;
+    
+    if (restarter.insertErrorInNode(victim, 7016))
+      return NDBT_FAILED;
+    
+    if (restarter.startNodes(&victim, 1))
+      return NDBT_FAILED;
+
+    if (restarter.waitNodesStartPhase(&victim, 1, 2))
+      return NDBT_FAILED;
+  }
+
+  if (restarter.waitNodesNoStart(&victim, 1))
+    return NDBT_FAILED;
+
+  if (restarter.dumpStateOneNode(victim, val2, 2))
+    return NDBT_FAILED;
+  
+  if (restarter.insertErrorInNode(victim, 7170))
+    return NDBT_FAILED;
+
+  if (restarter.startNodes(&victim, 1))
+    return NDBT_FAILED;
+
+  if (restarter.waitNodesNoStart(&victim, 1))
+    return NDBT_FAILED;
+  
+  if (restarter.restartAll(false, true, true))
+    return NDBT_FAILED;
+
+  if (restarter.insertErrorInAllNodes(932))
+    return NDBT_FAILED;
+
+  if (restarter.insertErrorInNode(master, 7170))
+    return NDBT_FAILED;
+
+  if (restarter.dumpStateAllNodes(val2, 2))
+    return NDBT_FAILED;
+  
+  restarter.startNodes(&master, 1);
+  NdbSleep_MilliSleep(3000);
+  restarter.startAll();
+
+  if (restarter.waitClusterNoStart())
+    return NDBT_FAILED;
+
+  if (restarter.restartOneDbNode(victim, true, true, true))
+    return NDBT_FAILED;
+
+  if (restarter.startAll())
+    return NDBT_FAILED;
+
+  if (restarter.waitClusterStarted())
+    return NDBT_FAILED;
+
+  return NDBT_OK;
+}
+
 
 NDBT_TESTSUITE(testNodeRestart);
 TESTCASE("NoLoad", 
@@ -1513,6 +1608,9 @@
 }
 TESTCASE("Bug25554", ""){
   INITIALIZER(runBug25554);
+}
+TESTCASE("Bug25984", ""){
+  INITIALIZER(runBug25984);
 }
 NDBT_TESTSUITE_END(testNodeRestart);
 
Thread
bk commit into 5.1 tree (jonas:1.2419) BUG#25984jonas2 Feb