List: Commits    « Previous Message | Next Message »
From: jonas    Date: April 25 2008 8:36am
Subject:bk commit into 5.0 tree (jonas:1.2602) BUG#36245
View as plain text  
Below is the list of changes that have just been committed into a local
5.0 repository of jonas.  When jonas does a push these changes
will be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet@stripped, 2008-04-25 08:36:45+02:00, jonas@stripped +6 -0
  ndb - bug#36245
    NF_COMPLETEREP can get lost on cascading master failure
    causing *big* pain and misery

  ndb/src/kernel/blocks/ERROR_codes.txt@stripped, 2008-04-25 08:36:43+02:00, jonas@stripped +2 -2
    new error codes

  ndb/src/kernel/blocks/dbtc/DbtcMain.cpp@stripped, 2008-04-25 08:36:43+02:00, jonas@stripped +3 -3
    new error codes

  ndb/src/kernel/blocks/qmgr/QmgrMain.cpp@stripped, 2008-04-25 08:36:43+02:00, jonas@stripped +39 -22
    - new error codes
    - fix by sending NF_COMPLETEREP from all nodes

  ndb/src/ndbapi/ClusterMgr.cpp@stripped, 2008-04-25 08:36:43+02:00, jonas@stripped +5 -2
    only signal NF_COMPLETEREP once to TransportFacade

  ndb/test/ndbapi/testNodeRestart.cpp@stripped, 2008-04-25 08:36:43+02:00, jonas@stripped +101 -0
    testcase

  ndb/test/run-test/daily-basic-tests.txt@stripped, 2008-04-25 08:36:43+02:00, jonas@stripped +6 -1
    testcase

diff -Nrup a/ndb/src/kernel/blocks/ERROR_codes.txt b/ndb/src/kernel/blocks/ERROR_codes.txt
--- a/ndb/src/kernel/blocks/ERROR_codes.txt	2008-04-23 16:08:36 +02:00
+++ b/ndb/src/kernel/blocks/ERROR_codes.txt	2008-04-25 08:36:43 +02:00
@@ -1,4 +1,4 @@
-Next QMGR 1
+Next QMGR 937
 Next NDBCNTR 1002
 Next NDBFS 2000
 Next DBACC 3002
@@ -6,7 +6,7 @@ Next DBTUP 4014
 Next DBLQH 5051
 Next DBDICT 6007
 Next DBDIH 7211
-Next DBTC 8063
+Next DBTC 8064
 Next CMVMI 9000
 Next BACKUP 10022
 Next DBUTIL 11002
diff -Nrup a/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp b/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp
--- a/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp	2008-04-23 16:08:36 +02:00
+++ b/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp	2008-04-25 08:36:43 +02:00
@@ -2723,6 +2723,8 @@ void Dbtc::execTCKEYREQ(Signal* signal) 
   if (seizeCacheRecord(signal) != 0) {
     return;
   }//if
+
+  CRASH_INSERTION(8063);
   
   TcConnectRecord * const regTcPtr = tcConnectptr.p;
   CacheRecord * const regCachePtr = cachePtr.p;
@@ -4583,9 +4585,7 @@ void Dbtc::execCOMMITTED(Signal* signal)
     CLEAR_ERROR_INSERT_VALUE;
     return;
   }//if
-  if (ERROR_INSERTED(8030)) {
-    systemErrorLab(signal, __LINE__);
-  }//if
+  CRASH_INSERTION(8030);
   if (ERROR_INSERTED(8025)) {
     SET_ERROR_INSERT_VALUE(8026);
     return;
diff -Nrup a/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp b/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
--- a/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp	2007-05-15 08:34:36 +02:00
+++ b/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp	2008-04-25 08:36:43 +02:00
@@ -2474,28 +2474,37 @@ void Qmgr::execNDB_FAILCONF(Signal* sign
     progError(__LINE__, 0, buf);
     systemErrorLab(signal, __LINE__);
   }//if
-  if (cpresident == getOwnNodeId()) {
+
+  if (cpresident == getOwnNodeId()) 
+  {
     jam();
-    /** 
-     * Prepare a NFCompleteRep and send to all connected API's
-     * They can then abort all transaction waiting for response from 
-     * the failed node
-     */
-    NFCompleteRep * const nfComp = (NFCompleteRep *)&signal->theData[0];
-    nfComp->blockNo = QMGR_REF;
-    nfComp->nodeId = getOwnNodeId();
-    nfComp->failedNodeId = failedNodePtr.i;
+    
+    CRASH_INSERTION(936);
+  }
 
-    for (nodePtr.i = 1; nodePtr.i < MAX_NODES; nodePtr.i++) {
+  /** 
+   * Prepare a NFCompleteRep and send to all connected API's
+   * They can then abort all transaction waiting for response from 
+   * the failed node
+   *
+   * NOTE: This is sent from all nodes, as otherwise we would need
+   *       take-over if cpresident dies before sending this
+   */
+  NFCompleteRep * const nfComp = (NFCompleteRep *)&signal->theData[0];
+  nfComp->blockNo = QMGR_REF;
+  nfComp->nodeId = getOwnNodeId();
+  nfComp->failedNodeId = failedNodePtr.i;
+  
+  for (nodePtr.i = 1; nodePtr.i < MAX_NODES; nodePtr.i++) 
+  {
+    jam();
+    ptrAss(nodePtr, nodeRec);
+    if (nodePtr.p->phase == ZAPI_ACTIVE){
       jam();
-      ptrAss(nodePtr, nodeRec);
-      if (nodePtr.p->phase == ZAPI_ACTIVE){
-        jam();
-        sendSignal(nodePtr.p->blockRef, GSN_NF_COMPLETEREP, signal, 
-                   NFCompleteRep::SignalLength, JBA);
-      }//if
-    }//for
-  }
+      sendSignal(nodePtr.p->blockRef, GSN_NF_COMPLETEREP, signal, 
+                 NFCompleteRep::SignalLength, JBA);
+    }//if
+  }//for
   return;
 }//Qmgr::execNDB_FAILCONF()
 
@@ -3332,9 +3341,17 @@ void Qmgr::execCOMMIT_FAILREQ(Signal* si
       jam();
       NodeBitmask::set(nodeFail->theNodes, ccommitFailedNodes[i]);
     }//if	
-    sendSignal(NDBCNTR_REF, GSN_NODE_FAILREP, signal, 
-	       NodeFailRep::SignalLength, JBB);
-
+    
+    if (ERROR_INSERTED(936))
+    {
+      sendSignalWithDelay(NDBCNTR_REF, GSN_NODE_FAILREP, signal, 
+                          200, NodeFailRep::SignalLength);
+    }
+    else
+    {
+      sendSignal(NDBCNTR_REF, GSN_NODE_FAILREP, signal, 
+                 NodeFailRep::SignalLength, JBB);
+    }
     guard0 = cnoCommitFailedNodes - 1;
     arrGuard(guard0, MAX_NDB_NODES);
     /**--------------------------------------------------------------------
diff -Nrup a/ndb/src/ndbapi/ClusterMgr.cpp b/ndb/src/ndbapi/ClusterMgr.cpp
--- a/ndb/src/ndbapi/ClusterMgr.cpp	2007-05-09 15:02:59 +02:00
+++ b/ndb/src/ndbapi/ClusterMgr.cpp	2008-04-25 08:36:43 +02:00
@@ -472,8 +472,11 @@ ClusterMgr::execNF_COMPLETEREP(const Uin
   const NodeId nodeId = nfComp->failedNodeId;
   assert(nodeId > 0 && nodeId < MAX_NODES);
   
-  theFacade.ReportNodeFailureComplete(nodeId);
-  theNodes[nodeId].nfCompleteRep = true;
+  if (theNodes[nodeId].nfCompleteRep == false)
+  {
+    theFacade.ReportNodeFailureComplete(nodeId);
+    theNodes[nodeId].nfCompleteRep = true;
+  }
 }
 
 void
diff -Nrup a/ndb/test/ndbapi/testNodeRestart.cpp b/ndb/test/ndbapi/testNodeRestart.cpp
--- a/ndb/test/ndbapi/testNodeRestart.cpp	2008-04-23 16:08:36 +02:00
+++ b/ndb/test/ndbapi/testNodeRestart.cpp	2008-04-25 08:36:43 +02:00
@@ -1867,6 +1867,102 @@ runBug36276(NDBT_Context* ctx, NDBT_Step
   return NDBT_OK;
 }
 
+int 
+runBug36245(NDBT_Context* ctx, NDBT_Step* step)
+{ 
+  int result = NDBT_OK;
+  int loops = ctx->getNumLoops();
+  NdbRestarter res;
+  Ndb* pNdb = GETNDB(step);
+
+  if (res.getNumDbNodes() < 4)
+    return NDBT_OK;
+
+  /**
+   * Make sure master and nextMaster is in different node groups
+   */
+loop1:
+  int master = res.getMasterNodeId();
+  int nextMaster = res.getNextMasterNodeId(master);
+  
+  printf("master: %u nextMaster: %u", master, nextMaster);
+  if (res.getNodeGroup(master) == res.getNodeGroup(nextMaster))
+  {
+    ndbout_c(" -> restarting next master: %u", nextMaster);
+    res.restartOneDbNode(nextMaster,
+                         /** initial */ false, 
+                         /** nostart */ true,
+                         /** abort   */ true);
+    
+    res.waitNodesNoStart(&nextMaster, 1);
+    res.startNodes(&nextMaster, 1);
+    if (res.waitClusterStarted())
+    {
+      ndbout_c("cluster didnt restart!!");
+      return NDBT_FAILED;
+    }
+    goto loop1;
+  }
+  ndbout_c(" -> go go gadget skates");
+
+  int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };    
+  res.dumpStateOneNode(master, val2, 2);
+  res.dumpStateOneNode(nextMaster, val2, 2);
+
+  res.insertErrorInNode(master, 8063);
+  res.insertErrorInNode(nextMaster, 936);
+
+
+  int err = 0;
+  HugoOperations hugoOps(*ctx->getTab());
+loop2:
+  if((err = hugoOps.startTransaction(pNdb)) != 0)
+  {
+    ndbout_c("failed to start transaction: %u", err);
+    return NDBT_FAILED;
+  }
+  
+  int victim = hugoOps.getTransaction()->getConnectedNodeId();
+  if (victim != master)
+  {
+    ndbout_c("transnode: %u != master: %u -> loop",
+             victim, master);
+    hugoOps.closeTransaction(pNdb);
+    goto loop2;
+  }
+
+  if((err = hugoOps.pkUpdateRecord(pNdb, 1)) != 0)
+  {
+    ndbout_c("failed to update: %u", err);
+    return NDBT_FAILED;
+  }
+  
+  if((err = hugoOps.execute_Commit(pNdb)) != 4010)
+  {
+    ndbout_c("incorrect error code: %u", err);
+    return NDBT_FAILED;
+  }
+  hugoOps.closeTransaction(pNdb);
+  
+  int nodes[2];
+  nodes[0] = master;
+  nodes[1] = nextMaster;
+  if (res.waitNodesNoStart(nodes, 2))
+  {
+    return NDBT_FAILED;
+  }
+  
+  if (res.startNodes(nodes, 2))
+  {
+    return NDBT_FAILED;
+  }
+  
+  if (res.waitClusterStarted())
+    return NDBT_FAILED;
+  
+  return NDBT_OK;
+}
+
 NDBT_TESTSUITE(testNodeRestart);
 TESTCASE("NoLoad", 
 	 "Test that one node at a time can be stopped and then restarted "\
@@ -2230,6 +2326,11 @@ TESTCASE("Bug36247", ""){
 TESTCASE("Bug36276", ""){
   INITIALIZER(runLoadTable);
   STEP(runBug36276);
+  VERIFIER(runClearTable);
+}
+TESTCASE("Bug36245", ""){
+  INITIALIZER(runLoadTable);
+  STEP(runBug36245);
   VERIFIER(runClearTable);
 }
 NDBT_TESTSUITE_END(testNodeRestart);
diff -Nrup a/ndb/test/run-test/daily-basic-tests.txt b/ndb/test/run-test/daily-basic-tests.txt
--- a/ndb/test/run-test/daily-basic-tests.txt	2008-04-23 16:20:37 +02:00
+++ b/ndb/test/run-test/daily-basic-tests.txt	2008-04-25 08:36:43 +02:00
@@ -812,4 +812,9 @@ max-time: 300
 cmd: testNodeRestart
 args: -n Bug36276 T1
 
-# EOF
+# 2008-04-25
+max-time: 300
+cmd: testNodeRestart
+args: -n Bug36245 T1
+
+# EOF 2008-04-25
Thread
bk commit into 5.0 tree (jonas:1.2602) BUG#36245 | jonas | 25 Apr