MySQL Lists are EOL. Please join:

List: Commits « Previous Message | Next Message »
From: jonas    Date: March 21 2006 1:47pm
Subject: bk commit into 4.1 tree (jonas:1.2473) BUG#18385
View as plain text  
Below is the list of changes that have just been committed into a local
4.1 repository of jonas. When jonas does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet
  1.2473 06/03/21 14:47:10 jonas@stripped +9 -0
  ndb - bug#18385
    Partial system restart: a node cannot try to start with a higher GCI than
    its own, even if it knows about a higher number

  ndb/test/src/NdbRestarter.cpp
    1.11 06/03/21 14:47:08 jonas@stripped +33 -0
    Add new method for selecting random node

  ndb/test/run-test/daily-basic-tests.txt
    1.27 06/03/21 14:47:08 jonas@stripped +4 -0
    Run test in daily-basic

  ndb/test/ndbapi/testSystemRestart.cpp
    1.9 06/03/21 14:47:08 jonas@stripped +53 -0
    Add new testcase for bug#18385

  ndb/test/include/NdbRestarter.hpp
    1.5 06/03/21 14:47:08 jonas@stripped +1 -0
    Add new method for selecting random node

  ndb/src/kernel/blocks/dbdih/DbdihMain.cpp
    1.33 06/03/21 14:47:08 jonas@stripped +77 -22
    Fix so that we don't try to restart to a too new GCI when doing a partial start
    Add new error code when this node later tries to join

  ndb/src/kernel/blocks/dbdih/Dbdih.hpp
    1.10 06/03/21 14:47:08 jonas@stripped +0 -1
    Move error codes into StartPerm + Add new error code

  ndb/src/kernel/blocks/ERROR_codes.txt
    1.14 06/03/21 14:47:08 jonas@stripped +2 -0
    Add new error insert

  ndb/include/kernel/signaldata/StartPerm.hpp
    1.2 06/03/21 14:47:08 jonas@stripped +6 -0
    Move error codes into StartPerm + Add new error code

  ndb/include/kernel/signaldata/DumpStateOrd.hpp
    1.6 06/03/21 14:47:08 jonas@stripped +1 -0
    Add new dump for setting time between gcp

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User:	jonas
# Host:	perch.ndb.mysql.com
# Root:	/home/jonas/src/41-work

--- 1.26/ndb/test/run-test/daily-basic-tests.txt	2006-03-20 14:49:44 +01:00
+++ 1.27/ndb/test/run-test/daily-basic-tests.txt	2006-03-21 14:47:08 +01:00
@@ -454,6 +454,10 @@
 cmd: testNodeRestart
 args: -n Bug16772 T1
 
+max-time: 500
+cmd: testSystemRestart
+args: -n Bug18385 T1
+
 # OLD FLEX
 max-time: 500
 cmd: flexBench

--- 1.5/ndb/include/kernel/signaldata/DumpStateOrd.hpp	2005-12-08 15:28:13 +01:00
+++ 1.6/ndb/include/kernel/signaldata/DumpStateOrd.hpp	2006-03-21 14:47:08 +01:00
@@ -127,6 +127,7 @@
     DihMinTimeBetweenLCP = 7017,
     DihMaxTimeBetweenLCP = 7018,
     EnableUndoDelayDataWrite = 7080, // DIH+ACC+TUP
+    DihSetTimeBetweenGcp = 7090,
     DihStartLcpImmediately = 7099,
     // 8000 Suma
     // 12000 Tux

--- 1.1/ndb/include/kernel/signaldata/StartPerm.hpp	2004-04-14 10:23:55 +02:00
+++ 1.2/ndb/include/kernel/signaldata/StartPerm.hpp	2006-03-21 14:47:08 +01:00
@@ -64,5 +64,11 @@
   
   Uint32 startingNodeId;
   Uint32 errorCode;  
+
+  enum ErrorCode
+  {
+    ZNODE_ALREADY_STARTING_ERROR = 305,
+    InitialStartRequired = 320
+  };
 };
 #endif

--- 1.13/ndb/src/kernel/blocks/ERROR_codes.txt	2005-12-21 16:31:56 +01:00
+++ 1.14/ndb/src/kernel/blocks/ERROR_codes.txt	2006-03-21 14:47:08 +01:00
@@ -303,6 +303,8 @@
 7131: Crash when receiving START_COPYREQ in master node
 7132: Crash when receiving START_COPYCONF in starting node
 
+7170: Crash when receiving START_PERMREF (InitialStartRequired)
+
 DICT:
 6000  Crash during NR when receiving DICTSTARTREQ
 6001  Crash during NR when receiving SCHEMA_INFO

--- 1.9/ndb/src/kernel/blocks/dbdih/Dbdih.hpp	2006-03-17 10:09:33 +01:00
+++ 1.10/ndb/src/kernel/blocks/dbdih/Dbdih.hpp	2006-03-21 14:47:08 +01:00
@@ -81,7 +81,6 @@
 #define ZWRONG_FAILURE_NUMBER_ERROR 302
 #define ZWRONG_START_NODE_ERROR 303
 #define ZNO_REPLICA_FOUND_ERROR 304
-#define ZNODE_ALREADY_STARTING_ERROR 305
 #define ZNODE_START_DISALLOWED_ERROR 309
 
 // --------------------------------------

--- 1.32/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp	2006-03-17 10:09:33 +01:00
+++ 1.33/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp	2006-03-21 14:47:08 +01:00
@@ -1420,6 +1420,33 @@
     return;
   }
   
+  NodeRecordPtr nodePtr;
+  Uint32 gci = SYSFILE->lastCompletedGCI[getOwnNodeId()];
+  for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) 
+  {
+    jam();
+    ptrAss(nodePtr, nodeRecord);
+    if (SYSFILE->lastCompletedGCI[nodePtr.i] > gci) 
+    {
+      jam();
+      /**
+       * Since we're starting (as master) and there
+       *   are other nodes with a higher GCI,
+       *   their GCIs must be invalidated
+       *   and they _must_ do an initial start;
+       *   indicate this by setting lastCompletedGCI = 0
+       */
+      SYSFILE->lastCompletedGCI[nodePtr.i] = 0;
+      ndbrequire(nodePtr.p->nodeStatus != NodeRecord::ALIVE);
+      warningEvent("Making filesystem for node %d unusable",
+		   nodePtr.i);
+    }
+  }
+  /**
+   * This sets which GCI we will try to restart to
+   */
+  SYSFILE->newestRestorableGCI = gci;
+  
   ndbrequire(isMaster());
   copyGciLab(signal, CopyGCIReq::RESTART); // We have already read the file!
 }//Dbdih::ndbStartReqLab()
@@ -1557,7 +1584,7 @@
 {
   jamEntry();
   Uint32 errorCode = signal->theData[1];
-  if (errorCode == ZNODE_ALREADY_STARTING_ERROR) {
+  if (errorCode == StartPermRef::ZNODE_ALREADY_STARTING_ERROR) {
     jam();
     /*-----------------------------------------------------------------------*/
     // The master was busy adding another node. We will wait for a second and
@@ -1567,6 +1594,20 @@
     sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 3000, 1);
     return;
   }//if
+
+  if (errorCode == StartPermRef::InitialStartRequired)
+  {
+    CRASH_INSERTION(7170);
+    char buf[255];
+    BaseString::snprintf(buf, sizeof(buf), 
+			 "Cluster requires this node to be started "
+			 " with --initial as partial start has been performed"
+			 " and this filesystem is unusable");
+    progError(__LINE__, 
+	      ERR_SR_RESTARTCONFLICT,
+	      buf);
+    ndbrequire(false);
+  }
   /*------------------------------------------------------------------------*/
   // Some node process in another node involving our node was still active. We
   // will recover from this by crashing here. 
@@ -1657,7 +1698,7 @@
       (c_nodeStartMaster.wait != ZFALSE)) {
     jam();
     signal->theData[0] = nodeId;
-    signal->theData[1] = ZNODE_ALREADY_STARTING_ERROR;
+    signal->theData[1] = StartPermRef::ZNODE_ALREADY_STARTING_ERROR;
     sendSignal(retRef, GSN_START_PERMREF, signal, 2, JBB);
     return;
   }//if
@@ -1667,6 +1708,16 @@
     ndbrequire(false);
   }//if
 
+  if (SYSFILE->lastCompletedGCI[nodeId] == 0 &&
+      typeStart != NodeState::ST_INITIAL_NODE_RESTART)
+  {
+    jam();
+    signal->theData[0] = nodeId;
+    signal->theData[1] = StartPermRef::InitialStartRequired;
+    sendSignal(retRef, GSN_START_PERMREF, signal, 2, JBB);
+    return;
+  }
+
   /*----------------------------------------------------------------------
    * WE START THE INCLUSION PROCEDURE 
    * ---------------------------------------------------------------------*/
@@ -3515,24 +3566,12 @@
 /* ------------------------------------------------------------------------- */
 void Dbdih::selectMasterCandidateAndSend(Signal* signal)
 {
-  Uint32 gci = 0;
-  Uint32 masterCandidateId = 0;
-  NodeRecordPtr nodePtr;
-  for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
-    jam();
-    ptrAss(nodePtr, nodeRecord);
-    if (SYSFILE->lastCompletedGCI[nodePtr.i] > gci) {
-      jam();
-      masterCandidateId = nodePtr.i;
-      gci = SYSFILE->lastCompletedGCI[nodePtr.i];
-    }//if
-  }//for
-  ndbrequire(masterCandidateId != 0);
   setNodeGroups();
-  signal->theData[0] = masterCandidateId;
-  signal->theData[1] = gci;
+  signal->theData[0] = getOwnNodeId();
+  signal->theData[1] = SYSFILE->lastCompletedGCI[getOwnNodeId()];
   sendSignal(cntrlblockref, GSN_DIH_RESTARTCONF, signal, 2, JBB);
-
+  
+  NodeRecordPtr nodePtr;
   Uint32 node_groups[MAX_NDB_NODES];
   memset(node_groups, 0, sizeof(node_groups));
   for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
@@ -3550,10 +3589,10 @@
     if(count != 0 && count != cnoReplicas){
       char buf[255];
       BaseString::snprintf(buf, sizeof(buf), 
-	       "Illegal configuration change."
-	       " Initial start needs to be performed "
-	       " when changing no of replicas (%d != %d)", 
-	       node_groups[nodePtr.i], cnoReplicas);
+			   "Illegal configuration change."
+			   " Initial start needs to be performed "
+			   " when changing no of replicas (%d != %d)", 
+			   node_groups[nodePtr.i], cnoReplicas);
       progError(__LINE__, 
 		ERR_INVALID_CONFIG,
 		buf);
@@ -13358,6 +13397,22 @@
   if(dumpState->args[0] == DumpStateOrd::DihStartLcpImmediately){
     c_lcpState.ctimer += (1 << c_lcpState.clcpDelay);
     return;
+  }
+
+  if (dumpState->args[0] == DumpStateOrd::DihSetTimeBetweenGcp)
+  {
+    if (signal->getLength() == 1)
+    {
+      const ndb_mgm_configuration_iterator * p = 
+	theConfiguration.getOwnConfigIterator();
+      ndbrequire(p != 0);
+      ndb_mgm_get_int_parameter(p, CFG_DB_GCP_INTERVAL, &cgcpDelay);
+    }
+    else
+    {
+      cgcpDelay = signal->theData[1];
+    }
+    ndbout_c("Setting time between gcp : %d", cgcpDelay);
   }
 }//Dbdih::execDUMP_STATE_ORD()
 

--- 1.4/ndb/test/include/NdbRestarter.hpp	2004-11-22 11:47:53 +01:00
+++ 1.5/ndb/test/include/NdbRestarter.hpp	2006-03-21 14:47:08 +01:00
@@ -62,6 +62,7 @@
   int dumpStateAllNodes(int * _args, int _num_args);
 
   int getMasterNodeId();
+  int getRandomNodeSameNodeGroup(int nodeId, int randomNumber);
   int getRandomNodeOtherNodeGroup(int nodeId, int randomNumber);
   int getRandomNotMasterNodeId(int randomNumber);
   

--- 1.8/ndb/test/ndbapi/testSystemRestart.cpp	2004-11-08 13:58:37 +01:00
+++ 1.9/ndb/test/ndbapi/testSystemRestart.cpp	2006-03-21 14:47:08 +01:00
@@ -1051,6 +1051,52 @@
   return result;
 }
 
+int runBug18385(NDBT_Context* ctx, NDBT_Step* step){
+  NdbRestarter restarter;
+  const Uint32 nodeCount = restarter.getNumDbNodes();
+  if(nodeCount < 2){
+    g_info << "Bug18385 - Needs atleast 2 nodes to test" << endl;
+    return NDBT_OK;
+  }
+
+  int node1 = restarter.getDbNodeId(rand() % nodeCount);
+  int node2 = restarter.getRandomNodeSameNodeGroup(node1, rand());
+
+  if (node1 == -1 || node2 == -1)
+    return NDBT_OK;
+  
+  int dump[] = { DumpStateOrd::DihSetTimeBetweenGcp, 300 };
+  
+  int result = NDBT_OK;
+  do {
+    CHECK(restarter.dumpStateAllNodes(dump, 2) == 0);
+    CHECK(restarter.restartOneDbNode(node1, false, true, false) == 0);
+    NdbSleep_SecSleep(3);
+    CHECK(restarter.restartAll(false, true, false) == 0);
+    
+    Uint32 cnt = 0;
+    int nodes[128];
+    for(Uint32 i = 0; i<nodeCount; i++)
+      if ((nodes[cnt] = restarter.getDbNodeId(i)) != node2)
+	cnt++;
+    
+    assert(cnt == nodeCount - 1);
+    
+    CHECK(restarter.startNodes(nodes, cnt) == 0);
+    CHECK(restarter.waitNodesStarted(nodes, cnt, 300) == 0);
+    
+    CHECK(restarter.insertErrorInNode(node2, 7170) == 0);
+    CHECK(restarter.waitNodesNoStart(&node2, 1) == 0);
+    CHECK(restarter.restartOneDbNode(node2, true, false, true) == 0);
+    CHECK(restarter.waitNodesStarted(&node2, 1) == 0);
+
+  } while(0);
+  
+  g_info << "Bug18385 finished" << endl;  
+  
+  return result;
+}
+
 int runWaitStarted(NDBT_Context* ctx, NDBT_Step* step){
 
   NdbRestarter restarter;
@@ -1232,6 +1278,13 @@
   INITIALIZER(runWaitStarted);
   INITIALIZER(runClearTable);
   STEP(runSystemRestart9);
+  FINALIZER(runClearTable);
+}
+TESTCASE("Bug18385", 
+	 "Perform partition system restart with other nodes with higher GCI"){
+  INITIALIZER(runWaitStarted);
+  INITIALIZER(runClearTable);
+  STEP(runBug18385);
   FINALIZER(runClearTable);
 }
 NDBT_TESTSUITE_END(testSystemRestart);

--- 1.10/ndb/test/src/NdbRestarter.cpp	2004-12-17 10:36:11 +01:00
+++ 1.11/ndb/test/src/NdbRestarter.cpp	2006-03-21 14:47:08 +01:00
@@ -174,6 +174,39 @@
   return -1;
 }
 
+int
+NdbRestarter::getRandomNodeSameNodeGroup(int nodeId, int rand){
+  if (!isConnected())
+    return -1;
+  
+  if (getStatus() != 0)
+    return -1;
+  
+  int node_group = -1;
+  for(size_t i = 0; i < ndbNodes.size(); i++){
+    if(ndbNodes[i].node_id == nodeId){
+      node_group = ndbNodes[i].node_group;
+      break;
+    }
+  }
+  if(node_group == -1){
+    return -1;
+  }
+
+  Uint32 counter = 0;
+  rand = rand % ndbNodes.size();
+  while(counter++ < ndbNodes.size() && 
+	(ndbNodes[rand].node_id == nodeId || 
+	 ndbNodes[rand].node_group != node_group))
+    rand = (rand + 1) % ndbNodes.size();
+  
+  if(ndbNodes[rand].node_group == node_group &&
+     ndbNodes[rand].node_id != nodeId)
+    return ndbNodes[rand].node_id;
+  
+  return -1;
+}
+
 int 
 NdbRestarter::waitClusterStarted(unsigned int _timeout){
   return waitClusterState(NDB_MGM_NODE_STATUS_STARTED, _timeout);
Thread
bk commit into 4.1 tree (jonas:1.2473) BUG#18385 | jonas | 21 Mar