MySQL Lists are EOL. Please join:

List:Commits« Previous MessageNext Message »
From:jonas Date:January 10 2007 7:50pm
Subject:bk commit into 5.1 tree (jonas:1.2362) BUG#25468
View as plain text  
Below is the list of changes that have just been committed into a local
5.1 repository of jonas. When jonas does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet@stripped, 2007-01-10 20:50:19+01:00, jonas@stripped +4 -0
  ndb - bug#25468
    handle partially transferred LCP_FRAG_REP after node failure
    recommit to 51-work

  storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp@stripped, 2007-01-10 20:50:17+01:00, jonas@stripped +1 -0
    handle partially transferred LCP_FRAG_REP after node failure

  storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp@stripped, 2007-01-10 20:50:17+01:00, jonas@stripped +74 -5
    handle partially transferred LCP_FRAG_REP after node failure

  storage/ndb/test/ndbapi/testNodeRestart.cpp@stripped, 2007-01-10 20:50:17+01:00, jonas@stripped +60 -0
    testcase

  storage/ndb/test/run-test/daily-basic-tests.txt@stripped, 2007-01-10 20:50:17+01:00, jonas@stripped +4 -0
    testcase

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User:	jonas
# Host:	perch.ndb.mysql.com
# Root:	/home/jonas/src/51-work

--- 1.60/storage/ndb/test/run-test/daily-basic-tests.txt	2007-01-10 20:50:23 +01:00
+++ 1.61/storage/ndb/test/run-test/daily-basic-tests.txt	2007-01-10 20:50:23 +01:00
@@ -768,6 +768,10 @@
 cmd: testSystemRestart
 args: -n Bug24664
 
+max-time: 1000
+cmd: testNodeRestart
+args: -n Bug25468 T1
+
 # OLD FLEX
 max-time: 500
 cmd: flexBench

--- 1.23/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp	2007-01-10 20:50:23 +01:00
+++ 1.24/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp	2007-01-10 20:50:23 +01:00
@@ -637,6 +637,7 @@
   void execTCGETOPSIZECONF(Signal *);
   void execTC_CLOPSIZECONF(Signal *);
   
+  int handle_invalid_lcp_no(const class LcpFragRep*, ReplicaRecordPtr);
   void execLCP_FRAG_REP(Signal *);
   void execLCP_COMPLETE_REP(Signal *);
   void execSTART_LCP_REQ(Signal *);

--- 1.94/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp	2007-01-10 20:50:23 +01:00
+++ 1.95/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp	2007-01-10 20:50:24 +01:00
@@ -4046,6 +4046,11 @@
   Uint32 newMasterId = nodeFail->masterNodeId;
   const Uint32 noOfFailedNodes = nodeFail->noOfNodes;
 
+  if (ERROR_INSERTED(7179))
+  {
+    CLEAR_ERROR_INSERT_VALUE;
+  }
+
   /*-------------------------------------------------------------------------*/
   // The first step is to convert from a bit mask to an array of failed nodes.
   /*-------------------------------------------------------------------------*/
@@ -10256,12 +10261,42 @@
   Uint32 fragId = lcpReport->fragId;
   
   jamEntry();
+
+  if (ERROR_INSERTED(7178) && nodeId != getOwnNodeId())
+  {
+    jam();
+    Uint32 owng =Sysfile::getNodeGroup(getOwnNodeId(), SYSFILE->nodeGroups);
+    Uint32 nodeg = Sysfile::getNodeGroup(nodeId, SYSFILE->nodeGroups);
+    if (owng == nodeg)
+    {
+      jam();
+      ndbout_c("throwing away LCP_FRAG_REP from  (and killing) %d", nodeId);
+      SET_ERROR_INSERT_VALUE(7179);
+      signal->theData[0] = 9999;
+      sendSignal(numberToRef(CMVMI, nodeId), 
+		 GSN_NDB_TAMPER, signal, 1, JBA);  
+      return;
+    }
+  }
  
+  if (ERROR_INSERTED(7179) && nodeId != getOwnNodeId())
+  {
+    jam();
+    Uint32 owng =Sysfile::getNodeGroup(getOwnNodeId(), SYSFILE->nodeGroups);
+    Uint32 nodeg = Sysfile::getNodeGroup(nodeId, SYSFILE->nodeGroups);
+    if (owng == nodeg)
+    {
+      jam();
+      ndbout_c("throwing away LCP_FRAG_REP from %d", nodeId);
+      return;
+    }
+  }    
+
   CRASH_INSERTION2(7025, isMaster());
   CRASH_INSERTION2(7016, !isMaster());
-
+  
   bool fromTimeQueue = (signal->senderBlockRef() == reference());
-
+  
   TabRecordPtr tabPtr;
   tabPtr.i = tableId;
   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
@@ -10463,6 +10498,37 @@
   ndbrequire(false);
 }//Dbdih::findReplica()
 
+
+int
+Dbdih::handle_invalid_lcp_no(const LcpFragRep* rep, 
+			     ReplicaRecordPtr replicaPtr)
+{
+  ndbrequire(!isMaster());
+  Uint32 lcpNo = rep->lcpNo;
+  Uint32 lcpId = rep->lcpId;
+  Uint32 replicaLcpNo = replicaPtr.p->nextLcp;
+  Uint32 prevReplicaLcpNo = prevLcpNo(replicaLcpNo);
+
+  warningEvent("Detected previous node failure of %d during lcp",
+	       rep->nodeId);
+  replicaPtr.p->nextLcp = lcpNo;
+  replicaPtr.p->lcpId[lcpNo] = 0;
+  replicaPtr.p->lcpStatus[lcpNo] = ZINVALID;
+  
+  for (Uint32 i = lcpNo; i != lcpNo; i = nextLcpNo(i))
+  {
+    jam();
+    if (replicaPtr.p->lcpStatus[i] == ZVALID &&
+	replicaPtr.p->lcpId[i] >= lcpId)
+    {
+      ndbout_c("i: %d lcpId: %d", i, replicaPtr.p->lcpId[i]);
+      ndbrequire(false);
+    }
+  }
+
+  return 0;
+}
+
 /**
  * Return true  if table is all fragment replicas have been checkpointed
  *                 to disk (in all LQHs)
@@ -10491,9 +10557,12 @@
   
   ndbrequire(replicaPtr.p->lcpOngoingFlag == true);
   if(lcpNo != replicaPtr.p->nextLcp){
-    ndbout_c("lcpNo = %d replicaPtr.p->nextLcp = %d", 
-	     lcpNo, replicaPtr.p->nextLcp);
-    ndbrequire(false);
+    if (handle_invalid_lcp_no(lcpReport, replicaPtr))
+    {
+      ndbout_c("lcpNo = %d replicaPtr.p->nextLcp = %d", 
+	       lcpNo, replicaPtr.p->nextLcp);
+      ndbrequire(false);
+    }
   }
   ndbrequire(lcpNo == replicaPtr.p->nextLcp);
   ndbrequire(lcpNo < MAX_LCP_STORED);

--- 1.36/storage/ndb/test/ndbapi/testNodeRestart.cpp	2007-01-10 20:50:24 +01:00
+++ 1.37/storage/ndb/test/ndbapi/testNodeRestart.cpp	2007-01-10 20:50:24 +01:00
@@ -1073,6 +1073,63 @@
   return NDBT_OK;
 }
 
+int runBug25468(NDBT_Context* ctx, NDBT_Step* step){
+  
+  int result = NDBT_OK;
+  int loops = ctx->getNumLoops();
+  int records = ctx->getNumRecords();
+  NdbRestarter restarter;
+  
+  for (int i = 0; i<loops; i++)
+  {
+    int master = restarter.getMasterNodeId();
+    int node1, node2;
+    switch(i % 5){
+    case 0:
+      node1 = master;
+      node2 = restarter.getRandomNodeSameNodeGroup(master, rand());
+      break;
+    case 1:
+      node1 = restarter.getRandomNodeSameNodeGroup(master, rand());
+      node2 = master;
+      break;
+    case 2:
+    case 3:
+    case 4:
+      node1 = restarter.getRandomNodeOtherNodeGroup(master, rand());
+      if (node1 == -1)
+	node1 = master;
+      node2 = restarter.getRandomNodeSameNodeGroup(node1, rand());
+      break;
+    }
+
+    ndbout_c("node1: %d node2: %d master: %d", node1, node2, master);
+
+    int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
+  
+    if (restarter.dumpStateOneNode(node2, val2, 2))
+      return NDBT_FAILED;
+
+    if (restarter.insertErrorInNode(node1, 7178))
+      return NDBT_FAILED;
+
+    int val1 = 7099;
+    if (restarter.dumpStateOneNode(master, &val1, 1))
+      return NDBT_FAILED;
+
+    if (restarter.waitNodesNoStart(&node2, 1))
+      return NDBT_FAILED;
+
+    if (restarter.startAll())
+      return NDBT_FAILED;
+
+    if (restarter.waitClusterStarted())
+      return NDBT_FAILED;
+  }    
+
+  return NDBT_OK;
+}
+
 
 NDBT_TESTSUITE(testNodeRestart);
 TESTCASE("NoLoad", 
@@ -1402,6 +1459,9 @@
 }
 TESTCASE("Bug25364", ""){
   INITIALIZER(runBug25364);
+}
+TESTCASE("Bug25468", ""){
+  INITIALIZER(runBug25468);
 }
 NDBT_TESTSUITE_END(testNodeRestart);
 
Thread
bk commit into 5.1 tree (jonas:1.2362) BUG#25468 | jonas | 10 Jan