List:Commits« Previous MessageNext Message »
From:jonas Date:April 23 2008 4:08pm
Subject:bk commit into 5.0 tree (jonas:1.2600) BUG#36199
View as plain text  
Below is the list of changes that have just been committed into a local
5.0 repository of jonas.  When jonas does a push these changes
will be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet@stripped, 2008-04-23 16:08:38+02:00, jonas@stripped +7 -0
  ndb -
    fix for bug#36199, bug#36246, bug#36247, bug#36276
    all related to cascading master failure

  ndb/src/kernel/blocks/ERROR_codes.txt@stripped, 2008-04-23 16:08:36+02:00,
jonas@stripped +3 -3
    update error codes

  ndb/src/kernel/blocks/dbdih/Dbdih.hpp@stripped, 2008-04-23 16:08:36+02:00,
jonas@stripped +3 -0
    fix for bug#36199, bug#36246, bug#36247, bug#36276

  ndb/src/kernel/blocks/dbdih/DbdihMain.cpp@stripped, 2008-04-23 16:08:36+02:00,
jonas@stripped +163 -50
    fix for bug#36199, bug#36246, bug#36247, bug#36276

  ndb/src/kernel/blocks/dblqh/DblqhMain.cpp@stripped, 2008-04-23 16:08:36+02:00,
jonas@stripped +30 -0
    fix for bug#36199, bug#36246, bug#36247, bug#36276

  ndb/src/kernel/blocks/dbtc/DbtcMain.cpp@stripped, 2008-04-23 16:08:36+02:00,
jonas@stripped +60 -25
    fix for bug#36199, bug#36246, bug#36247, bug#36276

  ndb/test/ndbapi/testNodeRestart.cpp@stripped, 2008-04-23 16:08:36+02:00,
jonas@stripped +498 -0
    fix for bug#36199, bug#36246, bug#36247, bug#36276

  ndb/test/run-test/daily-basic-tests.txt@stripped, 2008-04-23 16:08:36+02:00,
jonas@stripped +22 -0
    fix for bug#36199, bug#36246, bug#36247, bug#36276

diff -Nrup a/ndb/src/kernel/blocks/ERROR_codes.txt b/ndb/src/kernel/blocks/ERROR_codes.txt
--- a/ndb/src/kernel/blocks/ERROR_codes.txt	2007-11-07 20:57:19 +01:00
+++ b/ndb/src/kernel/blocks/ERROR_codes.txt	2008-04-23 16:08:36 +02:00
@@ -3,10 +3,10 @@ Next NDBCNTR 1002
 Next NDBFS 2000
 Next DBACC 3002
 Next DBTUP 4014
-Next DBLQH 5043
+Next DBLQH 5051
 Next DBDICT 6007
-Next DBDIH 7195
-Next DBTC 8052
+Next DBDIH 7211
+Next DBTC 8063
 Next CMVMI 9000
 Next BACKUP 10022
 Next DBUTIL 11002
diff -Nrup a/ndb/src/kernel/blocks/dbdih/Dbdih.hpp b/ndb/src/kernel/blocks/dbdih/Dbdih.hpp
--- a/ndb/src/kernel/blocks/dbdih/Dbdih.hpp	2007-11-07 20:57:19 +01:00
+++ b/ndb/src/kernel/blocks/dbdih/Dbdih.hpp	2008-04-23 16:08:36 +02:00
@@ -1342,6 +1342,9 @@ private:
     Uint32 m_masterLcpDihRef;
     bool   m_MASTER_LCPREQ_Received;
     Uint32 m_MASTER_LCPREQ_FailedNodeId;
+
+    Uint32 m_lastLCP_COMPLETE_REP_id;
+    Uint32 m_lastLCP_COMPLETE_REP_ref;
   } c_lcpState;
   
   /*------------------------------------------------------------------------*/
diff -Nrup a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp
b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp
--- a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp	2007-11-07 20:57:19 +01:00
+++ b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp	2008-04-23 16:08:36 +02:00
@@ -4501,37 +4501,47 @@ void Dbdih::failedNodeLcpHandling(Signal
   c_lcpState.m_participatingDIH.clear(failedNodePtr.i);
   c_lcpState.m_participatingLQH.clear(failedNodePtr.i);
 
-  if(c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.isWaitingFor(failedNodePtr.i)){
+  bool wf = c_MASTER_LCPREQ_Counter.isWaitingFor(failedNodePtr.i);
+
+  if(c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.isWaitingFor(failedNodePtr.i))
+  {
     jam();
     LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend();
     rep->nodeId = failedNodePtr.i;
     rep->lcpId = SYSFILE->latestLCP_ID;
     rep->blockNo = DBDIH;
     sendSignal(reference(), GSN_LCP_COMPLETE_REP, signal, 
-	       LcpCompleteRep::SignalLength, JBB);
+               LcpCompleteRep::SignalLength, JBB);
   }
-
-  /**
-   * Check if we'r waiting for the failed node's LQH to complete
-   *
-   * Note that this is ran "before" LCP master take over
-   */
-  if(c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(nodeId)){
+   
+  bool lcp_complete_rep = false;
+  if (!wf)
+  {
     jam();
-
-    LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend();
-    rep->nodeId  = nodeId;
-    rep->lcpId   = SYSFILE->latestLCP_ID;
-    rep->blockNo = DBLQH;
-    sendSignal(reference(), GSN_LCP_COMPLETE_REP, signal, 
-	       LcpCompleteRep::SignalLength, JBB);
-
-    if(c_lcpState.m_LAST_LCP_FRAG_ORD.isWaitingFor(nodeId)){
+ 
+    /**
+     * Check if we're waiting for the failed node's LQH to complete
+     *
+     * Note that this is run "before" LCP master take over
+     */
+    if(c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(nodeId)){
       jam();
-      /**
-       * Make sure we're ready to accept it
-       */
-      c_lcpState.m_LAST_LCP_FRAG_ORD.clearWaitingFor(nodeId);
+      
+      lcp_complete_rep = true;
+      LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend();
+      rep->nodeId  = nodeId;
+      rep->lcpId   = SYSFILE->latestLCP_ID;
+      rep->blockNo = DBLQH;
+      sendSignal(reference(), GSN_LCP_COMPLETE_REP, signal, 
+                 LcpCompleteRep::SignalLength, JBB);
+      
+      if(c_lcpState.m_LAST_LCP_FRAG_ORD.isWaitingFor(nodeId)){
+        jam();
+        /**
+         * Make sure we're ready to accept it
+         */
+        c_lcpState.m_LAST_LCP_FRAG_ORD.clearWaitingFor(nodeId);
+      }
     }
   }
   
@@ -4557,7 +4567,9 @@ void Dbdih::failedNodeLcpHandling(Signal
 	       StartLcpConf::SignalLength, JBB);
   }//if
   
-  if (c_EMPTY_LCP_REQ_Counter.isWaitingFor(failedNodePtr.i)) {
+dosend:
+  if (c_EMPTY_LCP_REQ_Counter.isWaitingFor(failedNodePtr.i))
+  {
     jam();
     EmptyLcpConf * const rep = (EmptyLcpConf *)&signal->theData[0];
     rep->senderNodeId = failedNodePtr.i;
@@ -4568,8 +4580,14 @@ void Dbdih::failedNodeLcpHandling(Signal
     rep->idle = true;
     sendSignal(reference(), GSN_EMPTY_LCP_CONF, signal, 
 	       EmptyLcpConf::SignalLength, JBB);
-  }//if
-
+  }
+  else if (!c_EMPTY_LCP_REQ_Counter.done() && lcp_complete_rep)
+  {
+    jam();
+    c_EMPTY_LCP_REQ_Counter.setWaitingFor(failedNodePtr.i);
+    goto dosend;
+  }
+  
   if (c_MASTER_LCPREQ_Counter.isWaitingFor(failedNodePtr.i)) {
     jam();
     MasterLCPRef * const ref = (MasterLCPRef *)&signal->theData[0];
@@ -4637,19 +4655,36 @@ Dbdih::startLcpMasterTakeOver(Signal* si
   
   c_lcpMasterTakeOverState.set(LMTOS_WAIT_EMPTY_LCP, __LINE__);
   
-  if(c_EMPTY_LCP_REQ_Counter.done()){
-    jam();
-    c_lcpState.m_LAST_LCP_FRAG_ORD.clearWaitingFor();
-
-    EmptyLcpReq* req = (EmptyLcpReq*)signal->getDataPtrSend();
-    req->senderRef = reference();
-    sendLoopMacro(EMPTY_LCP_REQ, sendEMPTY_LCP_REQ);
-    ndbrequire(!c_EMPTY_LCP_REQ_Counter.done());
-  } else {
-    /**
-     * Node failure during master take over...
-     */
-    g_eventLogger.info("Nodefail during master take over (old: %d)", oldNode);
+   
+  EmptyLcpReq* req = (EmptyLcpReq*)signal->getDataPtrSend();
+  req->senderRef = reference();
+  {
+    NodeRecordPtr specNodePtr;
+    specNodePtr.i = cfirstAliveNode;
+    do {
+      jam();
+      ptrCheckGuard(specNodePtr, MAX_NDB_NODES, nodeRecord);
+      if (!c_EMPTY_LCP_REQ_Counter.isWaitingFor(specNodePtr.i))
+      {
+        jam();
+        c_EMPTY_LCP_REQ_Counter.setWaitingFor(specNodePtr.i);
+        if (!(ERROR_INSERTED(7209) && specNodePtr.i == getOwnNodeId()))
+        {
+          sendEMPTY_LCP_REQ(signal, specNodePtr.i);
+        }
+        else
+        {
+          ndbout_c("NOT sending EMPTY_LCP_REQ to %u", specNodePtr.i);
+        }
+        
+        if (c_lcpState.m_LAST_LCP_FRAG_ORD.isWaitingFor(specNodePtr.i))
+        {
+          jam();
+          c_lcpState.m_LAST_LCP_FRAG_ORD.clearWaitingFor();
+        }
+      }
+      specNodePtr.i = specNodePtr.p->nextNode;
+    } while (specNodePtr.i != RNIL);
   }
   
   NodeRecordPtr nodePtr;
@@ -5639,6 +5674,9 @@ void Dbdih::execEMPTY_LCP_CONF(Signal* s
   const EmptyLcpConf * const conf = (EmptyLcpConf *)&signal->theData[0];
   Uint32 nodeId = conf->senderNodeId;
 
+  CRASH_INSERTION(7206);
+
+
   if(!conf->idle){
     jam();
     if (conf->tableId < c_lcpMasterTakeOverState.minTableId) {
@@ -5716,6 +5754,25 @@ void Dbdih::execMASTER_LCPREQ(Signal* si
   jamEntry();
   const BlockReference newMasterBlockref = req->masterRef;
 
+  CRASH_INSERTION(7205);
+
+  if (ERROR_INSERTED(7207))
+  {
+    jam();
+    SET_ERROR_INSERT_VALUE(7208);
+    sendSignalWithDelay(reference(), GSN_MASTER_LCPREQ, signal,
+			500, signal->getLength());
+    return;
+  }
+  
+  if (ERROR_INSERTED(7208))
+  {
+    jam();
+    signal->theData[0] = 9999;
+    sendSignal(numberToRef(CMVMI, refToNode(newMasterBlockref)), 
+               GSN_NDB_TAMPER, signal, 1, JBB);
+  }
+  
   if (newMasterBlockref != cmasterdihref)
   {
     jam();
@@ -5738,6 +5795,11 @@ void Dbdih::execMASTER_LCPREQ(Signal* si
     jam();
     ndbrequire(0);
   }
+
+  if (ERROR_INSERTED(7209))
+  {
+    SET_ERROR_INSERT_VALUE(7210);
+  }
   
   sendMASTER_LCPCONF(signal);
 }//Dbdih::execMASTER_LCPREQ()
@@ -6081,12 +6143,22 @@ void Dbdih::execMASTER_LCPREF(Signal* si
 {
   const MasterLCPRef * const ref = (MasterLCPRef *)&signal->theData[0];
   jamEntry();
-  receiveLoopMacro(MASTER_LCPREQ, ref->senderNodeId);
+
+  Uint32 senderNodeId = ref->senderNodeId;
+  Uint32 failedNodeId = ref->failedNodeId;
+  
+  if (c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(senderNodeId))
+  {
+    jam();
+    c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.clearWaitingFor(senderNodeId);
+  }
+
+  receiveLoopMacro(MASTER_LCPREQ, senderNodeId);
   /*-------------------------------------------------------------------------*/
   // We have now received all responses and are ready to take over the LCP
   // protocol as master.
   /*-------------------------------------------------------------------------*/
-  MASTER_LCPhandling(signal, ref->failedNodeId);
+  MASTER_LCPhandling(signal, failedNodeId);
 }//Dbdih::execMASTER_LCPREF()
 
 void Dbdih::MASTER_LCPhandling(Signal* signal, Uint32 failedNodeId) 
@@ -10053,7 +10125,15 @@ void Dbdih::execLCP_FRAG_REP(Signal* sig
       signal->theData[1] = tabPtr.i;
       sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
       
-      checkLcpAllTablesDoneInLqh();
+      bool ret = checkLcpAllTablesDoneInLqh();
+      if (ret && ERROR_INSERTED(7209))
+      {
+        jam();
+        
+        signal->theData[0] = 9999;
+        sendSignal(numberToRef(CMVMI, cmasterNodeId), 
+                   GSN_NDB_TAMPER, signal, 1, JBB);
+      }
     }
   }
 
@@ -10348,12 +10428,30 @@ void Dbdih::checkLcpCompletedLab(Signal*
   CRASH_INSERTION2(7027, isMaster());
   CRASH_INSERTION2(7018, !isMaster());
 
-  if(c_lcpState.lcpStatus == LCP_TAB_COMPLETED){
+  if(c_lcpState.lcpStatus == LCP_TAB_COMPLETED)
+  {
     /**
      * We'r done
      */
+
+    if (ERROR_INSERTED(7209))
+    {
+      signal->theData[0] = DihContinueB::ZCHECK_LCP_COMPLETED;
+      sendSignal(reference(), GSN_CONTINUEB, signal, 1, JBB);
+      return;
+    }
+    
     c_lcpState.setLcpStatus(LCP_TAB_SAVED, __LINE__);
     sendLCP_COMPLETE_REP(signal);
+
+    if (ERROR_INSERTED(7210))
+    {
+      CLEAR_ERROR_INSERT_VALUE;
+      EmptyLcpReq* req = (EmptyLcpReq*)signal->getDataPtr();
+      req->senderRef = reference();
+      sendEMPTY_LCP_REQ(signal, getOwnNodeId());
+    }
+    
     return;
   }
 
@@ -10365,13 +10463,28 @@ void Dbdih::checkLcpCompletedLab(Signal*
 void
 Dbdih::sendLCP_COMPLETE_REP(Signal* signal){
   jam();
-  LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend();
-  rep->nodeId = getOwnNodeId();
-  rep->lcpId = SYSFILE->latestLCP_ID;
-  rep->blockNo = DBDIH;
-  
-  sendSignal(c_lcpState.m_masterLcpDihRef, GSN_LCP_COMPLETE_REP, signal, 
-	     LcpCompleteRep::SignalLength, JBB);
+
+  /**
+   * Quick and dirty fix for bug#36276: don't send
+   * LCP_COMPLETE_REP to the same node for the same LCP twice
+   */
+  bool alreadysent = 
+    c_lcpState.m_lastLCP_COMPLETE_REP_id == SYSFILE->latestLCP_ID &&
+    c_lcpState.m_lastLCP_COMPLETE_REP_ref == c_lcpState.m_masterLcpDihRef;
+
+  if (!alreadysent)
+  {
+    LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend();
+    rep->nodeId = getOwnNodeId();
+    rep->lcpId = SYSFILE->latestLCP_ID;
+    rep->blockNo = DBDIH;
+    
+    sendSignal(c_lcpState.m_masterLcpDihRef, GSN_LCP_COMPLETE_REP, signal, 
+               LcpCompleteRep::SignalLength, JBB);
+
+    c_lcpState.m_lastLCP_COMPLETE_REP_id = SYSFILE->latestLCP_ID;
+    c_lcpState.m_lastLCP_COMPLETE_REP_ref = c_lcpState.m_masterLcpDihRef;
+  }
 
   /**
    * Say that an initial node restart does not need to be redone
@@ -11426,7 +11539,7 @@ void Dbdih::initCommonData()
   c_lcpState.ctimer = 0;
   c_lcpState.immediateLcpStart = false;
   c_lcpState.m_MASTER_LCPREQ_Received = false;
-    
+  c_lcpState.m_lastLCP_COMPLETE_REP_ref = 0;
   cmasterdihref = 0;
   cmasterNodeId = 0;
   cmasterState = MASTER_IDLE;
diff -Nrup a/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp
b/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp
--- a/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp	2007-07-04 22:42:28 +02:00
+++ b/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp	2008-04-23 16:08:36 +02:00
@@ -6810,6 +6810,22 @@ void Dblqh::lqhTransNextLab(Signal* sign
        *
        * now scan markers
        */
+      if (ERROR_INSERTED(5050))
+      {
+        ndbout_c("send ZSCAN_MARKERS with 5s delay and killing master");
+        CLEAR_ERROR_INSERT_VALUE;
+        signal->theData[0] = ZSCAN_MARKERS;
+        signal->theData[1] = tcNodeFailptr.i;
+        signal->theData[2] = 0;
+        signal->theData[3] = RNIL;
+        sendSignalWithDelay(cownref, GSN_CONTINUEB, signal, 5000, 4);
+        
+        signal->theData[0] = 9999;
+        sendSignal(numberToRef(CMVMI, 
+                               refToNode(tcNodeFailptr.p->newTcBlockref)), 
+                   GSN_NDB_TAMPER, signal, 1, JBB);
+        return;
+      }
       scanMarkers(signal, tcNodeFailptr.i, 0, RNIL);
       return;
     }//if
@@ -6894,6 +6910,20 @@ Dblqh::scanMarkers(Signal* signal, 
   tcNodeFailPtr.i = tcNodeFail;
   ptrCheckGuard(tcNodeFailPtr, ctcNodeFailrecFileSize, tcNodeFailRecord);
   const Uint32 crashedTcNodeId = tcNodeFailPtr.p->oldNodeId;
+
+  if (tcNodeFailPtr.p->tcFailStatus == TcNodeFailRecord::TC_STATE_BREAK)
+  {
+    jam();
+    
+    /* ----------------------------------------------------------------------
+     *  AN INTERRUPTION TO THIS NODE FAIL HANDLING WAS RECEIVED AND A NEW 
+     *  TC HAS BEEN ASSIGNED TO TAKE OVER THE FAILED TC. PROBABLY THE OLD 
+     *  NEW TC HAS FAILED.
+     * ---------------------------------------------------------------------- */
+    tcNodeFailptr = tcNodeFailPtr;
+    lqhTransNextLab(signal);
+    return;
+  }
   
   CommitAckMarkerIterator iter;
   if(i == RNIL){
diff -Nrup a/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp
b/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp
--- a/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp	2007-12-13 21:31:36 +01:00
+++ b/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp	2008-04-23 16:08:36 +02:00
@@ -7154,21 +7154,10 @@ void Dbtc::execNODE_FAILREP(Signal* sign
       }//if
     }//if
     
-    if (getOwnNodeId() != tnewMasterId)
-    {
-      jam();
-      /**
-       * Only master does takeover currently
-       */
-      hostptr.p->m_nf_bits &= ~HostRecord::NF_TAKEOVER;
-    }
-    else
-    {
-      jam();
-      signal->theData[0] = hostptr.i;
-      sendSignal(cownref, GSN_TAKE_OVERTCREQ, signal, 1, JBB);
-    }
-
+    jam();
+    signal->theData[0] = hostptr.i;
+    sendSignal(cownref, GSN_TAKE_OVERTCREQ, signal, 1, JBB);
+    
     checkScanActiveInFailedLqh(signal, 0, hostptr.i);
     checkWaitDropTabFailedLqh(signal, hostptr.i, 0); // nodeid, tableid
     nodeFailCheckTransactions(signal, 0, hostptr.i);
@@ -7205,6 +7194,14 @@ Dbtc::checkNodeFailComplete(Signal* sign
     sendSignal(cdihblockref, GSN_NF_COMPLETEREP, signal, 
 	       NFCompleteRep::SignalLength, JBB);
   }
+
+  CRASH_INSERTION(8058);
+  if (ERROR_INSERTED(8059))
+  {
+    signal->theData[0] = 9999;
+    sendSignalWithDelay(numberToRef(CMVMI, hostptr.i), 
+                        GSN_NDB_TAMPER, signal, 100, 1);
+  }
 }
 
 void Dbtc::checkScanActiveInFailedLqh(Signal* signal, 
@@ -7273,7 +7270,14 @@ Dbtc::nodeFailCheckTransactions(Signal* 
   Ptr<ApiConnectRecord> transPtr;
   Uint32 TtcTimer = ctcTimer;
   Uint32 TapplTimeout = c_appl_timeout_value;
-  for (transPtr.i = transPtrI; transPtr.i < capiConnectFilesize; transPtr.i++)
+  Uint32 RT_BREAK = 64;
+  Uint32 endPtrI = transPtrI + RT_BREAK;
+  if (endPtrI > capiConnectFilesize)
+  {
+    endPtrI = capiConnectFilesize;
+  }
+
+  for (transPtr.i = transPtrI; transPtr.i < endPtrI; transPtr.i++)
   {
     ptrCheckGuard(transPtr, capiConnectFilesize, apiConnectRecord); 
     if (transPtr.p->m_transaction_nodes.get(failedNodeId))
@@ -7285,18 +7289,25 @@ Dbtc::nodeFailCheckTransactions(Signal* 
       setApiConTimer(transPtr.i, TtcTimer - 2, __LINE__);
       timeOutFoundLab(signal, transPtr.i, ZNODEFAIL_BEFORE_COMMIT);
       c_appl_timeout_value = TapplTimeout;
+      
+      transPtr.i++;
+      break;
     }
-    
-    // Send CONTINUEB to continue later
+  }
+  
+  if (transPtr.i == capiConnectFilesize)
+  {
+    jam();
+    checkNodeFailComplete(signal, failedNodeId, 
+                          HostRecord::NF_CHECK_TRANSACTION);
+  }
+  else
+  {
     signal->theData[0] = TcContinueB::ZNF_CHECK_TRANSACTIONS;
-    signal->theData[1] = transPtr.i + 1; // Check next
+    signal->theData[1] = transPtr.i;
     signal->theData[2] = failedNodeId;
     sendSignal(cownref, GSN_CONTINUEB, signal, 3, JBB);
-    return;
   }
-
-  checkNodeFailComplete(signal, failedNodeId, 
-			HostRecord::NF_CHECK_TRANSACTION);
 }
 
 
@@ -7319,7 +7330,23 @@ void Dbtc::execTAKE_OVERTCCONF(Signal* s
   if (signal->getSendersBlockRef() != reference())
   {
     jam();
-    return;
+    /**
+     * Node should be in queue
+     */
+    Uint32 i = 0;
+    Uint32 end = tcNodeFailptr.p->queueIndex;
+    for (; i<end; i++)
+    {
+      jam();
+      if (tcNodeFailptr.p->queueList[i] == hostptr.i)
+      {
+        jam();
+        break;
+      }
+    }
+    ndbrequire(i != end);
+    tcNodeFailptr.p->queueList[i] = tcNodeFailptr.p->queueList[end-1];
+    tcNodeFailptr.p->queueIndex = end - 1;
   }
   
   checkNodeFailComplete(signal, hostptr.i, HostRecord::NF_TAKEOVER);
@@ -7331,7 +7358,9 @@ void Dbtc::execTAKE_OVERTCREQ(Signal* si
   tfailedNodeId = signal->theData[0];
   tcNodeFailptr.i = 0;
   ptrAss(tcNodeFailptr, tcFailRecord);
-  if (tcNodeFailptr.p->failStatus != FS_IDLE) {
+  if (tcNodeFailptr.p->failStatus != FS_IDLE ||
+      cmasterNodeId != getOwnNodeId())
+  {
     jam();
     /*------------------------------------------------------------*/
     /*       WE CAN CURRENTLY ONLY HANDLE ONE TAKE OVER AT A TIME */
@@ -7385,6 +7414,8 @@ void Dbtc::execLQH_TRANSCONF(Signal* sig
   jamEntry();
   LqhTransConf * const lqhTransConf = (LqhTransConf *)&signal->theData[0];
   
+  CRASH_INSERTION(8060);
+
   tcNodeFailptr.i = lqhTransConf->tcRef;
   ptrCheckGuard(tcNodeFailptr, 1, tcFailRecord);
   tnodeid = lqhTransConf->lqhNodeId;
@@ -7447,6 +7478,8 @@ void Dbtc::nodeTakeOverCompletedLab(Sign
 {
   Uint32 guard0;
 
+  CRASH_INSERTION(8061);
+
   hostptr.i = tnodeid;
   ptrCheckGuard(hostptr, chostFilesize, hostRecord);
   hostptr.p->lqhTransStatus = LTS_IDLE;
@@ -7554,6 +7587,8 @@ void Dbtc::completeTransAtTakeOverDoLast
   }//if
   tcNodeFailptr.p->takeOverProcState[TtakeOverInd] = ZTAKE_OVER_IDLE;
   tcNodeFailptr.p->completedTakeOver++;
+
+  CRASH_INSERTION(8062);
 
   if (tcNodeFailptr.p->completedTakeOver == cnoParallelTakeOver) {
     jam();
diff -Nrup a/ndb/test/ndbapi/testNodeRestart.cpp b/ndb/test/ndbapi/testNodeRestart.cpp
--- a/ndb/test/ndbapi/testNodeRestart.cpp	2007-11-07 20:57:19 +01:00
+++ b/ndb/test/ndbapi/testNodeRestart.cpp	2008-04-23 16:08:36 +02:00
@@ -23,6 +23,7 @@
 #include <signaldata/DumpStateOrd.hpp>
 #include <Bitmask.hpp>
 #include <RefConvert.hpp>
+#include <NdbEnv.h>
 
 int runLoadTable(NDBT_Context* ctx, NDBT_Step* step){
 
@@ -1392,6 +1393,480 @@ runBug32160(NDBT_Context* ctx, NDBT_Step
   return NDBT_OK;
 }
 
+int
+runMNF(NDBT_Context* ctx, NDBT_Step* step) // multi-node-failure loop; stops the test when done
+{
+  int result = NDBT_OK; // NOTE(review): never read in this function
+  NdbRestarter res;
+  
+  if (res.getNumDbNodes() < 2)
+  {
+    return NDBT_OK; // nothing to do with fewer than two data nodes
+  }
+
+  Vector<int> part0; // first node seen in each node group
+  Vector<int> part1; // every further node of an already-seen node group
+  Bitmask<255> part0mask;
+  Bitmask<255> part1mask;
+  Bitmask<255> ngmask; // node groups already represented in part0
+  for (int i = 0; i<res.getNumDbNodes(); i++)
+  {
+    int nodeId = res.getDbNodeId(i);
+    int ng = res.getNodeGroup(nodeId);
+    if (ngmask.get(ng))
+    {
+      part1.push_back(nodeId);
+      part1mask.set(nodeId);
+    }
+    else
+    {
+      ngmask.set(ng);
+      part0.push_back(nodeId);
+      part0mask.set(nodeId);
+    }
+  }
+
+  printf("part0: ");
+  for (size_t i = 0; i<part0.size(); i++)
+    printf("%u ", part0[i]);
+  printf("\n");
+
+  printf("part1: ");
+  for (size_t i = 0; i<part1.size(); i++)
+    printf("%u ", part1[i]);
+  printf("\n");
+
+  int loops = ctx->getNumLoops();
+  while (loops-- && !ctx->isTestStopped())
+  {
+    int cnt, *nodes;
+    int master = res.getMasterNodeId();
+    int nextMaster = res.getNextMasterNodeId(master);
+
+    bool cmf = false; // true when master and next master sit in the same part
+    if (part0mask.get(master) && part0mask.get(nextMaster))
+    {
+      cmf = true;
+      cnt = part0.size();
+      nodes = part0.getBase();
+      printf("restarting part0");
+    }
+    else if(part1mask.get(master) && part1mask.get(nextMaster))
+    {
+      cmf = true;
+      cnt = part1.size();
+      nodes = part1.getBase();
+      printf("restarting part1");
+    }
+    else
+    {
+      cmf = false;
+      if (loops & 1) // alternate between the two parts on successive loops
+      {
+        cnt = part0.size();
+        nodes = part0.getBase();
+        printf("restarting part0");
+      } 
+      else 
+      {
+        cnt = part1.size();
+        nodes = part1.getBase(); // fix: was part0.getBase() paired with part1's count
+        printf("restarting part1"); // fix: message now names the part actually restarted
+      }
+    }
+    
+    int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };    
+    for (int i = 0; i<cnt; i++)
+      if (res.dumpStateOneNode(nodes[i], val2, 2))
+        return NDBT_FAILED;
+    
+    int type = loops; // failure scenario; overridable through the MNF env variable
+    char buf[100];
+    if (NdbEnv_GetEnv("MNF", buf, sizeof(buf)))
+    {
+      type = atoi(buf);
+    }
+    if (cmf)
+    {
+      type = type % 7; // all seven scenarios available
+    }
+    else
+    {
+      type = type % 4; // only scenarios 0-3 when master/next master are in different parts
+    }
+    ndbout_c(" type: %u (cmf: %u)", type, cmf);
+    switch(type){
+    case 0: // abort-restart each node, 10 ms apart
+      for (int i = 0; i<cnt; i++)
+      {
+        if (res.restartOneDbNode(nodes[i],
+                                 /** initial */ false, 
+                                 /** nostart */ true,
+                                 /** abort   */ true))
+          return NDBT_FAILED;
+        
+        NdbSleep_MilliSleep(10);
+      }
+      break;
+    case 1: // abort-restart each node back to back
+      for (int i = 0; i<cnt; i++)
+      {
+        if (res.restartOneDbNode(nodes[i],
+                                 /** initial */ false, 
+                                 /** nostart */ true,
+                                 /** abort   */ true))
+          return NDBT_FAILED;
+        
+      }
+      break;
+    case 2: // error insert 8058 on all nodes, then abort-restart the first one
+      for (int i = 0; i<cnt; i++)
+      {
+        res.insertErrorInNode(nodes[i], 8058);
+      }
+      res.restartOneDbNode(nodes[0],
+                           /** initial */ false, 
+                           /** nostart */ true,
+                           /** abort   */ true);
+      break;
+    case 3: // error insert 8059 on all nodes, then abort-restart the first one
+      for (int i = 0; i<cnt; i++)
+      {
+        res.insertErrorInNode(nodes[i], 8059);
+      }
+      res.restartOneDbNode(nodes[0],
+                           /** initial */ false, 
+                           /** nostart */ true,
+                           /** abort   */ true);
+      break;
+    case 4: // error insert 7180 on all nodes, 7193 on master, then dump 7099 on master
+    {
+      for (int i = 0; i<cnt; i++)
+      {
+        res.insertErrorInNode(nodes[i], 7180);
+      }
+
+      int lcp = 7099; // presumably the "start LCP now" dump code - TODO confirm
+      res.insertErrorInNode(master, 7193);
+      res.dumpStateOneNode(master, &lcp, 1);
+      break;
+    }
+    case 5: // as case 4 but with error insert 7206 on the selected nodes
+    {
+      for (int i = 0; i<cnt; i++)
+      {
+        res.insertErrorInNode(nodes[i], 7206);
+      }
+
+      int lcp = 7099;
+      res.insertErrorInNode(master, 7193);
+      res.dumpStateOneNode(master, &lcp, 1);
+      break;
+    }
+    case 6: // as case 4 but with error insert 5008 on the selected nodes
+    {
+      for (int i = 0; i<cnt; i++)
+      {
+        res.insertErrorInNode(nodes[i], 5008);
+      }
+      
+      int lcp = 7099;
+      res.insertErrorInNode(master, 7193);
+      res.dumpStateOneNode(master, &lcp, 1);
+      break;
+    }
+    }
+    
+    if (res.waitNodesNoStart(nodes, cnt)) // selected nodes must reach "no start"
+      return NDBT_FAILED;
+    
+    if (res.startNodes(nodes, cnt))
+      return NDBT_FAILED;
+    
+    if (res.waitClusterStarted())
+      return NDBT_FAILED; 
+  }
+
+  ctx->stopTest(); // ends the test for the other steps of the test case
+  return NDBT_OK;
+}
+
+int 
+runBug36199(NDBT_Context* ctx, NDBT_Step* step) // regression test for bug#36199
+{
+  int result = NDBT_OK; // NOTE(review): never read in this function
+  int loops = ctx->getNumLoops(); // NOTE(review): never read in this function
+  NdbRestarter res;
+
+  if (res.getNumDbNodes() < 4)
+    return NDBT_OK; // test is a no-op with fewer than four data nodes
+
+  int master = res.getMasterNodeId();
+  int nextMaster = res.getNextMasterNodeId(master);
+  int victim = res.getRandomNodeSameNodeGroup(nextMaster, rand());
+  if (victim == master)
+  {
+    victim = res.getRandomNodeOtherNodeGroup(nextMaster, rand()); // pick again so victim differs from master
+  }
+
+  ndbout_c("master: %u next master: %u victim: %u",
+           master, nextMaster, victim);
+
+  int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };    
+  res.dumpStateOneNode(master, val2, 2);
+  res.dumpStateOneNode(victim, val2, 2);
+
+  res.insertErrorInNode(victim, 7205); // 7205 is the CRASH_INSERTION in Dbdih::execMASTER_LCPREQ
+  res.insertErrorInNode(master, 7014); // presumably crashes the master during LCP - TODO confirm
+  int lcp = 7099; // presumably the "start LCP now" dump code - TODO confirm
+  res.dumpStateOneNode(master, &lcp, 1);
+  
+  int nodes[2]; // the two nodes expected to go down
+  nodes[0] = master;
+  nodes[1] = victim;
+  if (res.waitNodesNoStart(nodes, 2))
+  {
+    return NDBT_FAILED;
+  }
+
+  if (res.startNodes(nodes, 2))
+  {
+    return NDBT_FAILED;
+  }
+  
+  if (res.waitClusterStarted())
+    return NDBT_FAILED;
+
+  return NDBT_OK;
+}
+
+int 
+runBug36246(NDBT_Context* ctx, NDBT_Step* step) // regression test for bug#36246
+{ 
+  int result = NDBT_OK; // NOTE(review): never read in this function
+  int loops = ctx->getNumLoops(); // NOTE(review): never read in this function
+  NdbRestarter res;
+  Ndb* pNdb = GETNDB(step);
+
+  if (res.getNumDbNodes() < 4)
+    return NDBT_OK; // test is a no-op with fewer than four data nodes
+
+  HugoOperations hugoOps(*ctx->getTab());
+restartloop: // re-read master/next master after the next master was restarted
+  int tryloop = 0;
+  int master = res.getMasterNodeId();
+  int nextMaster = res.getNextMasterNodeId(master);
+
+loop: // open a transaction and see which node it is connected to
+  if(hugoOps.startTransaction(pNdb) != 0)
+    return NDBT_FAILED;
+      
+  if(hugoOps.pkUpdateRecord(pNdb, 1, 1) != 0)
+    return NDBT_FAILED;
+  
+  if(hugoOps.execute_NoCommit(pNdb) != 0)
+    return NDBT_FAILED;
+  
+  int victim = hugoOps.getTransaction()->getConnectedNodeId(); // node the open transaction is connected to
+  printf("master: %u nextMaster: %u victim: %u",
+         master, nextMaster, victim);
+  if (victim == master || victim == nextMaster ||
+      res.getNodeGroup(victim) == res.getNodeGroup(master) ||
+      res.getNodeGroup(victim) == res.getNodeGroup(nextMaster))
+  {
+    hugoOps.execute_Rollback(pNdb); // unsuitable victim: drop this transaction and retry
+    hugoOps.closeTransaction(pNdb);
+    tryloop++;
+    if (tryloop == 10)
+    {
+      ndbout_c(" -> restarting next master: %u", nextMaster);
+      res.restartOneDbNode(nextMaster,
+                           /** initial */ false, 
+                           /** nostart */ true,
+                           /** abort   */ true);
+    
+      res.waitNodesNoStart(&nextMaster, 1); // NOTE(review): return value ignored
+      res.startNodes(&nextMaster, 1); // NOTE(review): return value ignored
+      if (res.waitClusterStarted())
+        return NDBT_FAILED;
+      goto restartloop;
+    }
+    else
+    {
+      ndbout_c(" -> loop");
+      goto loop;
+    }
+  }
+  ndbout_c(" -> go go gadget skates");
+
+  int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };    
+  res.dumpStateOneNode(master, val2, 2);
+  res.dumpStateOneNode(victim, val2, 2);
+
+  res.insertErrorInNode(master, 8060); // 8060 is the CRASH_INSERTION in Dbtc::execLQH_TRANSCONF
+  res.insertErrorInNode(victim, 9999); // presumably crashes the victim immediately - TODO confirm
+  
+  int nodes[2]; // the two nodes expected to go down
+  nodes[0] = master;
+  nodes[1] = victim;
+  if (res.waitNodesNoStart(nodes, 2))
+  {
+    return NDBT_FAILED;
+  }
+  
+  if (res.startNodes(nodes, 2))
+  {
+    return NDBT_FAILED;
+  }
+  
+  if (res.waitClusterStarted())
+    return NDBT_FAILED;
+
+  hugoOps.execute_Rollback(pNdb); // clean up the transaction opened before the failures
+  hugoOps.closeTransaction(pNdb);
+
+  return NDBT_OK;
+}
+
+int 
+runBug36247(NDBT_Context* ctx, NDBT_Step* step) // regression test for bug#36247
+{ 
+  int result = NDBT_OK; // NOTE(review): never read in this function
+  int loops = ctx->getNumLoops(); // NOTE(review): never read in this function
+  NdbRestarter res;
+  Ndb* pNdb = GETNDB(step);
+
+  if (res.getNumDbNodes() < 4)
+    return NDBT_OK; // test is a no-op with fewer than four data nodes
+
+  HugoOperations hugoOps(*ctx->getTab());
+
+restartloop: // re-read master/next master after the next master was restarted
+  int tryloop = 0;
+  int master = res.getMasterNodeId();
+  int nextMaster = res.getNextMasterNodeId(master);
+
+loop: // open a transaction and see which node it is connected to
+  if(hugoOps.startTransaction(pNdb) != 0)
+    return NDBT_FAILED;
+      
+  if(hugoOps.pkUpdateRecord(pNdb, 1, 100) != 0)
+    return NDBT_FAILED;
+  
+  if(hugoOps.execute_NoCommit(pNdb) != 0)
+    return NDBT_FAILED;
+  
+  int victim = hugoOps.getTransaction()->getConnectedNodeId(); // node the open transaction is connected to
+  printf("master: %u nextMaster: %u victim: %u",
+         master, nextMaster, victim);
+  if (victim == master || victim == nextMaster ||
+      res.getNodeGroup(victim) == res.getNodeGroup(master) ||
+      res.getNodeGroup(victim) == res.getNodeGroup(nextMaster))
+  {
+    hugoOps.execute_Rollback(pNdb); // unsuitable victim: drop this transaction and retry
+    hugoOps.closeTransaction(pNdb);
+    tryloop++;
+    if (tryloop == 10)
+    {
+      ndbout_c(" -> restarting next master: %u", nextMaster);
+      res.restartOneDbNode(nextMaster,
+                           /** initial */ false, 
+                           /** nostart */ true,
+                           /** abort   */ true);
+      
+      res.waitNodesNoStart(&nextMaster, 1); // NOTE(review): return value ignored
+      res.startNodes(&nextMaster, 1); // NOTE(review): return value ignored
+      if (res.waitClusterStarted())
+        return NDBT_FAILED;
+      goto restartloop;
+    }
+    else
+    {
+      ndbout_c(" -> loop");
+      goto loop;
+    }
+  }
+  ndbout_c(" -> go go gadget skates");
+  
+  int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };    
+  res.dumpStateOneNode(master, val2, 2);
+  res.dumpStateOneNode(victim, val2, 2);
+  
+  for (int i = 0; i<res.getNumDbNodes(); i++) // arm error insert 5050 on every data node
+  {
+    int nodeId = res.getDbNodeId(i);
+    res.insertErrorInNode(nodeId, 5050); // 5050: Dblqh delays ZSCAN_MARKERS 5s and tampers the new TC node
+  }
+  
+  res.insertErrorInNode(victim, 9999); // presumably crashes the victim immediately - TODO confirm
+  
+  int nodes[2]; // the two nodes expected to go down
+  nodes[0] = master;
+  nodes[1] = victim;
+  if (res.waitNodesNoStart(nodes, 2))
+  {
+    return NDBT_FAILED;
+  }
+  
+  if (res.startNodes(nodes, 2))
+  {
+    return NDBT_FAILED;
+  }
+  
+  if (res.waitClusterStarted())
+    return NDBT_FAILED;
+  
+  hugoOps.execute_Rollback(pNdb); // clean up the transaction opened before the failures
+  hugoOps.closeTransaction(pNdb);
+  
+  return NDBT_OK;
+}
+
+int 
+runBug36276(NDBT_Context* ctx, NDBT_Step* step) // regression test for bug#36276
+{ 
+  int result = NDBT_OK; // NOTE(review): never read in this function
+  int loops = ctx->getNumLoops(); // NOTE(review): never read in this function
+  NdbRestarter res;
+  Ndb* pNdb = GETNDB(step); // NOTE(review): never read in this function
+  
+  if (res.getNumDbNodes() < 4)
+    return NDBT_OK; // test is a no-op with fewer than four data nodes
+  
+  int master = res.getMasterNodeId();
+  int nextMaster = res.getNextMasterNodeId(master);
+  int victim = res.getRandomNodeSameNodeGroup(nextMaster, rand());
+  if (victim == master)
+  {
+    victim = res.getRandomNodeOtherNodeGroup(nextMaster, rand()); // pick again so victim differs from master
+  }
+
+  ndbout_c("master: %u nextMaster: %u victim: %u",
+           master, nextMaster, victim);
+
+  int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };    
+  res.dumpStateOneNode(master, val2, 2);
+  res.insertErrorInNode(victim, 7209); // 7209: Dbdih tampers the master (NDB_TAMPER 9999) once its LCP tables are done
+
+  int lcp = 7099; // presumably the "start LCP now" dump code - TODO confirm
+  res.dumpStateOneNode(master, &lcp, 1);
+  
+  if (res.waitNodesNoStart(&master, 1)) // only the master is expected to go down
+  {
+    return NDBT_FAILED;
+  }
+  
+  if (res.startNodes(&master, 1))
+  {
+    return NDBT_FAILED;
+  }
+  
+  if (res.waitClusterStarted())
+    return NDBT_FAILED;
+  
+  return NDBT_OK;
+}
+
 NDBT_TESTSUITE(testNodeRestart);
 TESTCASE("NoLoad", 
 	 "Test that one node at a time can be stopped and then restarted "\
@@ -1733,6 +2208,29 @@ TESTCASE("Bug29364", ""){
 }
 TESTCASE("Bug32160", ""){
   INITIALIZER(runBug32160);
+}
+TESTCASE("MNF", ""){
+  INITIALIZER(runLoadTable);
+  STEP(runMNF);
+  STEP(runScanUpdateUntilStopped);
+}
+TESTCASE("Bug36199", ""){
+  INITIALIZER(runBug36199);
+}
+TESTCASE("Bug36246", ""){
+  INITIALIZER(runLoadTable);
+  STEP(runBug36246);
+  VERIFIER(runClearTable);
+}
+TESTCASE("Bug36247", ""){
+  INITIALIZER(runLoadTable);
+  STEP(runBug36247);
+  VERIFIER(runClearTable);
+}
+TESTCASE("Bug36276", ""){
+  INITIALIZER(runLoadTable);
+  STEP(runBug36276);
+  VERIFIER(runClearTable);
 }
 NDBT_TESTSUITE_END(testNodeRestart);
 
diff -Nrup a/ndb/test/run-test/daily-basic-tests.txt
b/ndb/test/run-test/daily-basic-tests.txt
--- a/ndb/test/run-test/daily-basic-tests.txt	2008-01-31 23:14:21 +01:00
+++ b/ndb/test/run-test/daily-basic-tests.txt	2008-04-23 16:08:36 +02:00
@@ -791,3 +791,25 @@ max-time: 180
 cmd: testIndex
 args: -n Bug28804_ATTRINFO T1 T3
 
+# 2008-04-22
+max-time: 1500
+cmd: testNodeRestart
+args: -n MNF T1
+
+max-time: 300
+cmd: testNodeRestart
+args:  -n Bug36199 T1
+
+max-time: 300
+cmd: testNodeRestart
+args: -n Bug36246 T1
+
+max-time: 300
+cmd: testNodeRestart
+args: -n Bug36247 T1
+
+max-time: 300
+cmd: testNodeRestart
+args: -n Bug36276 T1
+
+# EOF
Thread
bk commit into 5.0 tree (jonas:1.2600) BUG#36199jonas23 Apr 2008