List: Commits « Previous Message | Next Message »
From: tomas  Date: March 24 2006 5:13pm
Subject: bk commit into 5.1 tree (tomas:1.1995) BUG#18118
View as plain text  
Below is the list of changes that have just been committed into a local
5.1 repository of tomas. When tomas does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet
  1.1995 06/03/24 17:12:03 tomas@stripped +25 -0
  Bug #18385 partial restart failure after create table
  Bug #16772 Starting node joins cluster too early
  Bug #18352 Rollback + NF can lead to stray locks
  Bug #18298 8 Node restart with table/index wo/ logging cause crash
  Bug #18118 Master node shutdown caused full cluster crash

  storage/ndb/tools/desc.cpp
    1.21 06/03/24 17:11:40 tomas@stripped +78 -0
    Bug #18385 partial restart failure after create table
    Bug #16772 Starting node joins cluster too early
    Bug #18352 Rollback + NF can lead to stray locks
    Bug #18298 8 Node restart with table/index wo/ logging cause crash
    Bug #18118 Master node shutdown caused full cluster crash

  storage/ndb/test/src/NdbRestarter.cpp
    1.13 06/03/24 17:11:40 tomas@stripped +33 -0
    Bug #18385 partial restart failure after create table
    Bug #16772 Starting node joins cluster too early
    Bug #18352 Rollback + NF can lead to stray locks
    Bug #18298 8 Node restart with table/index wo/ logging cause crash
    Bug #18118 Master node shutdown caused full cluster crash

  storage/ndb/test/run-test/ndb-autotest.sh
    1.10 06/03/24 17:11:39 tomas@stripped +8 -4
    Bug #18385 partial restart failure after create table
    Bug #16772 Starting node joins cluster too early
    Bug #18352 Rollback + NF can lead to stray locks
    Bug #18298 8 Node restart with table/index wo/ logging cause crash
    Bug #18118 Master node shutdown caused full cluster crash

  storage/ndb/test/run-test/conf-daily-devel-ndbmaster.txt
    1.5 06/03/24 17:11:39 tomas@stripped +3 -0
    Bug #18385 partial restart failure after create table
    Bug #16772 Starting node joins cluster too early
    Bug #18352 Rollback + NF can lead to stray locks
    Bug #18298 8 Node restart with table/index wo/ logging cause crash
    Bug #18118 Master node shutdown caused full cluster crash

  storage/ndb/test/ndbapi/testTimeout.cpp
    1.14 06/03/24 17:11:39 tomas@stripped +101 -0
    Bug #18385 partial restart failure after create table
    Bug #16772 Starting node joins cluster too early
    Bug #18352 Rollback + NF can lead to stray locks
    Bug #18298 8 Node restart with table/index wo/ logging cause crash
    Bug #18118 Master node shutdown caused full cluster crash

  storage/ndb/test/ndbapi/testSystemRestart.cpp
    1.10 06/03/24 17:11:39 tomas@stripped +53 -0
    Bug #18385 partial restart failure after create table
    Bug #16772 Starting node joins cluster too early
    Bug #18352 Rollback + NF can lead to stray locks
    Bug #18298 8 Node restart with table/index wo/ logging cause crash
    Bug #18118 Master node shutdown caused full cluster crash

  storage/ndb/test/ndbapi/testNodeRestart.cpp
    1.21 06/03/24 17:11:39 tomas@stripped +280 -0
    copied from 5.1-new

  storage/ndb/test/include/NdbRestarter.hpp
    1.6 06/03/24 17:11:39 tomas@stripped +1 -0
    Bug #18385 partial restart failure after create table
    Bug #16772 Starting node joins cluster too early
    Bug #18352 Rollback + NF can lead to stray locks
    Bug #18298 8 Node restart with table/index wo/ logging cause crash
    Bug #18118 Master node shutdown caused full cluster crash

  storage/ndb/src/ndbapi/TransporterFacade.hpp
    1.29 06/03/24 17:11:39 tomas@stripped +1 -0
    Bug #18385 partial restart failure after create table
    Bug #16772 Starting node joins cluster too early
    Bug #18352 Rollback + NF can lead to stray locks
    Bug #18298 8 Node restart with table/index wo/ logging cause crash
    Bug #18118 Master node shutdown caused full cluster crash

  storage/ndb/src/ndbapi/TransporterFacade.cpp
    1.45 06/03/24 17:11:39 tomas@stripped +13 -0
    Bug #18385 partial restart failure after create table
    Bug #16772 Starting node joins cluster too early
    Bug #18352 Rollback + NF can lead to stray locks
    Bug #18298 8 Node restart with table/index wo/ logging cause crash
    Bug #18118 Master node shutdown caused full cluster crash

  storage/ndb/src/ndbapi/Ndbif.cpp
    1.35 06/03/24 17:11:39 tomas@stripped +7 -5
    Bug #18385 partial restart failure after create table
    Bug #16772 Starting node joins cluster too early
    Bug #18352 Rollback + NF can lead to stray locks
    Bug #18298 8 Node restart with table/index wo/ logging cause crash
    Bug #18118 Master node shutdown caused full cluster crash

  storage/ndb/src/ndbapi/NdbTransaction.cpp
    1.49 06/03/24 17:11:39 tomas@stripped +2 -2
    Bug #18385 partial restart failure after create table
    Bug #16772 Starting node joins cluster too early
    Bug #18352 Rollback + NF can lead to stray locks
    Bug #18298 8 Node restart with table/index wo/ logging cause crash
    Bug #18118 Master node shutdown caused full cluster crash

  storage/ndb/src/kernel/vm/TransporterCallback.cpp
    1.13 06/03/24 17:11:38 tomas@stripped +2 -1
    Bug #18385 partial restart failure after create table
    Bug #16772 Starting node joins cluster too early
    Bug #18352 Rollback + NF can lead to stray locks
    Bug #18298 8 Node restart with table/index wo/ logging cause crash
    Bug #18118 Master node shutdown caused full cluster crash

  storage/ndb/src/kernel/error/ndbd_exit_codes.c
    1.7 06/03/24 17:11:38 tomas@stripped +3 -0
    Bug #18385 partial restart failure after create table
    Bug #16772 Starting node joins cluster too early
    Bug #18352 Rollback + NF can lead to stray locks
    Bug #18298 8 Node restart with table/index wo/ logging cause crash
    Bug #18118 Master node shutdown caused full cluster crash

  storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
    1.27 06/03/24 17:11:37 tomas@stripped +87 -14
    Bug #18385 partial restart failure after create table
    Bug #16772 Starting node joins cluster too early
    Bug #18352 Rollback + NF can lead to stray locks
    Bug #18298 8 Node restart with table/index wo/ logging cause crash
    Bug #18118 Master node shutdown caused full cluster crash

  storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp
    1.97 06/03/24 17:11:37 tomas@stripped +221 -192
    Bug #18385 partial restart failure after create table
    Bug #16772 Starting node joins cluster too early
    Bug #18352 Rollback + NF can lead to stray locks
    Bug #18298 8 Node restart with table/index wo/ logging cause crash
    Bug #18118 Master node shutdown caused full cluster crash

  storage/ndb/src/kernel/blocks/dbtc/Dbtc.hpp
    1.34 06/03/24 17:11:37 tomas@stripped +17 -10
    Bug #18385 partial restart failure after create table
    Bug #16772 Starting node joins cluster too early
    Bug #18352 Rollback + NF can lead to stray locks
    Bug #18298 8 Node restart with table/index wo/ logging cause crash
    Bug #18118 Master node shutdown caused full cluster crash

  storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp
    1.83 06/03/24 17:11:37 tomas@stripped +174 -0
    Bug #18385 partial restart failure after create table
    Bug #16772 Starting node joins cluster too early
    Bug #18352 Rollback + NF can lead to stray locks
    Bug #18298 8 Node restart with table/index wo/ logging cause crash
    Bug #18118 Master node shutdown caused full cluster crash

  storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp
    1.43 06/03/24 17:11:36 tomas@stripped +162 -79
    Bug #18385 partial restart failure after create table
    Bug #16772 Starting node joins cluster too early
    Bug #18352 Rollback + NF can lead to stray locks
    Bug #18298 8 Node restart with table/index wo/ logging cause crash
    Bug #18118 Master node shutdown caused full cluster crash

  storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp
    1.13 06/03/24 17:11:35 tomas@stripped +2 -2
    Bug #18385 partial restart failure after create table
    Bug #16772 Starting node joins cluster too early
    Bug #18352 Rollback + NF can lead to stray locks
    Bug #18298 8 Node restart with table/index wo/ logging cause crash
    Bug #18118 Master node shutdown caused full cluster crash

  storage/ndb/src/kernel/blocks/ERROR_codes.txt
    1.14 06/03/24 17:11:35 tomas@stripped +6 -0
    Bug #18385 partial restart failure after create table
    Bug #16772 Starting node joins cluster too early
    Bug #18352 Rollback + NF can lead to stray locks
    Bug #18298 8 Node restart with table/index wo/ logging cause crash
    Bug #18118 Master node shutdown caused full cluster crash

  storage/ndb/include/mgmapi/ndbd_exit_codes.h
    1.9 06/03/24 17:11:35 tomas@stripped +0 -2
    Bug #18385 partial restart failure after create table
    Bug #16772 Starting node joins cluster too early
    Bug #18352 Rollback + NF can lead to stray locks
    Bug #18298 8 Node restart with table/index wo/ logging cause crash
    Bug #18118 Master node shutdown caused full cluster crash

  storage/ndb/include/kernel/signaldata/TcContinueB.hpp
    1.4 06/03/24 17:11:35 tomas@stripped +2 -1
    Bug #18385 partial restart failure after create table
    Bug #16772 Starting node joins cluster too early
    Bug #18352 Rollback + NF can lead to stray locks
    Bug #18298 8 Node restart with table/index wo/ logging cause crash
    Bug #18118 Master node shutdown caused full cluster crash

  storage/ndb/include/kernel/signaldata/StartPerm.hpp
    1.3 06/03/24 17:11:35 tomas@stripped +6 -0
    Bug #18385 partial restart failure after create table
    Bug #16772 Starting node joins cluster too early
    Bug #18352 Rollback + NF can lead to stray locks
    Bug #18298 8 Node restart with table/index wo/ logging cause crash
    Bug #18118 Master node shutdown caused full cluster crash

  storage/ndb/include/kernel/signaldata/DumpStateOrd.hpp
    1.6 06/03/24 17:11:35 tomas@stripped +4 -0
    Bug #18385 partial restart failure after create table
    Bug #16772 Starting node joins cluster too early
    Bug #18352 Rollback + NF can lead to stray locks
    Bug #18298 8 Node restart with table/index wo/ logging cause crash
    Bug #18118 Master node shutdown caused full cluster crash

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User:	tomas
# Host:	poseidon.ndb.mysql.com
# Root:	/home/tomas/wl2325-alcatel

--- 1.4/storage/ndb/test/run-test/conf-daily-devel-ndbmaster.txt	2005-05-13 10:51:53 +02:00
+++ 1.5/storage/ndb/test/run-test/conf-daily-devel-ndbmaster.txt	2006-03-24 17:11:39 +01:00
@@ -17,3 +17,6 @@
 PortNumber: 16000
 ArbitrationRank: 1
 DataDir: .
+
+[TCP DEFAULT]
+SendBufferMemory: 2M

--- 1.9/storage/ndb/test/run-test/ndb-autotest.sh	2005-07-22 15:27:09 +02:00
+++ 1.10/storage/ndb/test/run-test/ndb-autotest.sh	2006-03-24 17:11:39 +01:00
@@ -13,7 +13,7 @@
 VERSION="ndb-autotest.sh version 1.04"
 
 DATE=`date '+%Y-%m-%d'`
-HOST=`hostname`
+HOST=`hostname -s`
 export DATE HOST
 
 set -e
@@ -35,6 +35,7 @@
 clone=5.0-ndb
 RUN="daily-basic daily-devel"
 conf=autotest.conf
+LOCK=$HOME/.autotest-lock
 
 ############################
 # Read command line entries#
@@ -66,7 +67,7 @@
 
 if [ -f $conf ]
 then
-	. ./$conf
+	. $conf
 else
 	echo "Can't find config file: $conf"
 	exit
@@ -105,7 +106,6 @@
 # Setup the clone source location  #
 ####################################
 
-LOCK=$HOME/.autotest-lock
 src_clone=$src_clone_base-$clone
 
 #######################################
@@ -299,9 +299,12 @@
     elif [ -f $test_dir/conf-$1.txt ]
     then
 	echo "$test_dir/conf-$1.txt"
+    elif [ -f $test_dir/conf-$HOST.txt ]
+	echo "$test_dir/conf-$HOST.txt"
     else
 	echo "Unable to find conf file looked for" 1>&2
 	echo "$test_dir/conf-$1-$HOST.txt and" 1>&2
+	echo "$test_dir/conf-$HOST.txt" 1>&2
 	echo "$test_dir/conf-$1.txt" 1>&2
 	exit
     fi
@@ -386,7 +389,8 @@
                        awk '{for(i=1;i<='$count';i++)print $i;}'`
 	    echo $run_hosts >> /tmp/filter_hosts.$$	
 	
-	    choose $conf $run_hosts > d.tmp
+	    choose $conf $run_hosts > d.tmp.$$
+            sed -e s,CHOOSE_dir,"$install_dir",g < d.tmp.$$ > d.tmp
 	    $mkconfig d.tmp
 	fi
 	

--- 1.5/storage/ndb/include/kernel/signaldata/DumpStateOrd.hpp	2006-01-18 12:00:49 +01:00
+++ 1.6/storage/ndb/include/kernel/signaldata/DumpStateOrd.hpp	2006-03-24 17:11:35 +01:00
@@ -124,7 +124,11 @@
     DihAllAllowNodeStart = 7016,
     DihMinTimeBetweenLCP = 7017,
     DihMaxTimeBetweenLCP = 7018,
+    // 7019
+    // 7020
+    // 7021
     EnableUndoDelayDataWrite = 7080, // DIH+ACC+TUP
+    DihSetTimeBetweenGcp = 7090,
     DihStartLcpImmediately = 7099,
     // 8000 Suma
     // 12000 Tux

--- 1.2/storage/ndb/include/kernel/signaldata/StartPerm.hpp	2005-04-08 02:43:51 +02:00
+++ 1.3/storage/ndb/include/kernel/signaldata/StartPerm.hpp	2006-03-24 17:11:35 +01:00
@@ -64,5 +64,11 @@
   
   Uint32 startingNodeId;
   Uint32 errorCode;  
+
+  enum ErrorCode
+  {
+    ZNODE_ALREADY_STARTING_ERROR = 305,
+    InitialStartRequired = 320
+  };
 };
 #endif

--- 1.3/storage/ndb/include/kernel/signaldata/TcContinueB.hpp	2005-04-08 02:43:52 +02:00
+++ 1.4/storage/ndb/include/kernel/signaldata/TcContinueB.hpp	2006-03-24 17:11:35 +01:00
@@ -44,7 +44,8 @@
     CHECK_WAIT_DROP_TAB_FAILED_LQH         = 16,
     TRIGGER_PENDING                        = 17,
     
-    DelayTCKEYCONF = 18
+    DelayTCKEYCONF = 18,
+    ZNF_CHECK_TRANSACTIONS = 19
   };
 };
 

--- 1.13/storage/ndb/src/kernel/blocks/ERROR_codes.txt	2005-09-16 06:52:32 +02:00
+++ 1.14/storage/ndb/src/kernel/blocks/ERROR_codes.txt	2006-03-24 17:11:35 +01:00
@@ -219,6 +219,8 @@
 8045: (ABORTCONF only as part of take-over)
 Delay execution of ABORTCONF signal 2 seconds to generate time-out.
 
+8050: Send ZABORT_TIMEOUT_BREAK delayed
+
 ERROR CODES FOR TESTING TIME-OUT HANDLING IN DBTC
 -------------------------------------------------
 
@@ -296,6 +298,8 @@
 7131: Crash when receiving START_COPYREQ in master node
 7132: Crash when receiving START_COPYCONF in starting node
 
+7170: Crash when receiving START_PERMREF (InitialStartRequired)
+
 DICT:
 6000  Crash during NR when receiving DICTSTARTREQ
 6001  Crash during NR when receiving SCHEMA_INFO
@@ -304,6 +308,8 @@
 LQH:
 5026  Crash when receiving COPY_ACTIVEREQ
 5027  Crash when receiving STAT_RECREQ
+
+5043  Crash starting node, when scan is finished on primary replica
 
 Test Crashes in handling take over
 ----------------------------------

--- 1.12/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp	2006-02-16 14:29:28 +01:00
+++ 1.13/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp	2006-03-24 17:11:35 +01:00
@@ -82,7 +82,6 @@
 #define ZWRONG_FAILURE_NUMBER_ERROR 302
 #define ZWRONG_START_NODE_ERROR 303
 #define ZNO_REPLICA_FOUND_ERROR 304
-#define ZNODE_ALREADY_STARTING_ERROR 305
 #define ZNODE_START_DISALLOWED_ERROR 309
 
 // --------------------------------------
@@ -1041,7 +1040,8 @@
   void prepareReplicas(FragmentstorePtr regFragptr);
   void removeNodeFromStored(Uint32 nodeId,
                             FragmentstorePtr regFragptr,
-                            ReplicaRecordPtr replicaPtr);
+                            ReplicaRecordPtr replicaPtr,
+			    bool temporary);
   void removeOldStoredReplica(FragmentstorePtr regFragptr,
                               ReplicaRecordPtr replicaPtr);
   void removeStoredReplica(FragmentstorePtr regFragptr,

--- 1.42/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp	2006-03-20 12:25:54 +01:00
+++ 1.43/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp	2006-03-24 17:11:36 +01:00
@@ -1428,6 +1428,33 @@
     return;
   }
   
+  NodeRecordPtr nodePtr;
+  Uint32 gci = SYSFILE->lastCompletedGCI[getOwnNodeId()];
+  for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) 
+  {
+    jam();
+    ptrAss(nodePtr, nodeRecord);
+    if (SYSFILE->lastCompletedGCI[nodePtr.i] > gci) 
+    {
+      jam();
+      /**
+       * Since we're starting(is master) and there 
+       *   there are other nodes with higher GCI...
+       *   there gci's must be invalidated...
+       *   and they _must_ do an initial start
+       *   indicate this by setting lastCompletedGCI = 0
+       */
+      SYSFILE->lastCompletedGCI[nodePtr.i] = 0;
+      ndbrequire(nodePtr.p->nodeStatus != NodeRecord::ALIVE);
+      warningEvent("Making filesystem for node %d unusable",
+		   nodePtr.i);
+    }
+  }
+  /**
+   * This set which GCI we will try to restart to
+   */
+  SYSFILE->newestRestorableGCI = gci;
+  
   ndbrequire(isMaster());
   copyGciLab(signal, CopyGCIReq::RESTART); // We have already read the file!
 }//Dbdih::ndbStartReqLab()
@@ -1563,7 +1590,7 @@
 {
   jamEntry();
   Uint32 errorCode = signal->theData[1];
-  if (errorCode == ZNODE_ALREADY_STARTING_ERROR) {
+  if (errorCode == StartPermRef::ZNODE_ALREADY_STARTING_ERROR) {
     jam();
     /*-----------------------------------------------------------------------*/
     // The master was busy adding another node. We will wait for a second and
@@ -1573,6 +1600,20 @@
     sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 3000, 1);
     return;
   }//if
+
+  if (errorCode == StartPermRef::InitialStartRequired)
+  {
+    CRASH_INSERTION(7170);
+    char buf[255];
+    BaseString::snprintf(buf, sizeof(buf), 
+			 "Cluster requires this node to be started "
+			 " with --initial as partial start has been performed"
+			 " and this filesystem is unusable");
+    progError(__LINE__, 
+	      NDBD_EXIT_SR_RESTARTCONFLICT,
+	      buf);
+    ndbrequire(false);
+  }
   /*------------------------------------------------------------------------*/
   // Some node process in another node involving our node was still active. We
   // will recover from this by crashing here. 
@@ -1663,7 +1704,7 @@
       (c_nodeStartMaster.wait != ZFALSE)) {
     jam();
     signal->theData[0] = nodeId;
-    signal->theData[1] = ZNODE_ALREADY_STARTING_ERROR;
+    signal->theData[1] = StartPermRef::ZNODE_ALREADY_STARTING_ERROR;
     sendSignal(retRef, GSN_START_PERMREF, signal, 2, JBB);
     return;
   }//if
@@ -1673,6 +1714,16 @@
     ndbrequire(false);
   }//if
 
+  if (SYSFILE->lastCompletedGCI[nodeId] == 0 &&
+      typeStart != NodeState::ST_INITIAL_NODE_RESTART)
+  {
+    jam();
+    signal->theData[0] = nodeId;
+    signal->theData[1] = StartPermRef::InitialStartRequired;
+    sendSignal(retRef, GSN_START_PERMREF, signal, 2, JBB);
+    return;
+  }
+
   /*----------------------------------------------------------------------
    * WE START THE INCLUSION PROCEDURE 
    * ---------------------------------------------------------------------*/
@@ -3508,24 +3559,12 @@
 /* ------------------------------------------------------------------------- */
 void Dbdih::selectMasterCandidateAndSend(Signal* signal)
 {
-  Uint32 gci = 0;
-  Uint32 masterCandidateId = 0;
-  NodeRecordPtr nodePtr;
-  for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
-    jam();
-    ptrAss(nodePtr, nodeRecord);
-    if (SYSFILE->lastCompletedGCI[nodePtr.i] > gci) {
-      jam();
-      masterCandidateId = nodePtr.i;
-      gci = SYSFILE->lastCompletedGCI[nodePtr.i];
-    }//if
-  }//for
-  ndbrequire(masterCandidateId != 0);
   setNodeGroups();
-  signal->theData[0] = masterCandidateId;
-  signal->theData[1] = gci;
+  signal->theData[0] = getOwnNodeId();
+  signal->theData[1] = SYSFILE->lastCompletedGCI[getOwnNodeId()];
   sendSignal(cntrlblockref, GSN_DIH_RESTARTCONF, signal, 2, JBB);
-
+  
+  NodeRecordPtr nodePtr;
   Uint32 node_groups[MAX_NDB_NODES];
   memset(node_groups, 0, sizeof(node_groups));
   for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
@@ -3543,10 +3582,10 @@
     if(count != 0 && count != cnoReplicas){
       char buf[255];
       BaseString::snprintf(buf, sizeof(buf), 
-	       "Illegal configuration change."
-	       " Initial start needs to be performed "
-	       " when changing no of replicas (%d != %d)", 
-	       node_groups[nodePtr.i], cnoReplicas);
+			   "Illegal configuration change."
+			   " Initial start needs to be performed "
+			   " when changing no of replicas (%d != %d)", 
+			   node_groups[nodePtr.i], cnoReplicas);
       progError(__LINE__, NDBD_EXIT_INVALID_CONFIG, buf);
     }
   }
@@ -5193,6 +5232,7 @@
 
   //const Uint32 lcpId = SYSFILE->latestLCP_ID;
   const bool lcpOngoingFlag = (tabPtr.p->tabLcpStatus== TabRecord::TLS_ACTIVE);
+  const bool temporary = !tabPtr.p->storedTable;
   
   FragmentstorePtr fragPtr;
   for(Uint32 fragNo = 0; fragNo < tabPtr.p->totalfragments; fragNo++){
@@ -5213,7 +5253,7 @@
         jam();
         found = true;
 	noOfRemovedReplicas++;
-	removeNodeFromStored(nodeId, fragPtr, replicaPtr);
+	removeNodeFromStored(nodeId, fragPtr, replicaPtr, temporary);
 	if(replicaPtr.p->lcpOngoingFlag){
 	  jam();
 	  /**
@@ -5918,9 +5958,6 @@
   signal->theData[0] = 7012;
   execDUMP_STATE_ORD(signal);
 
-  signal->theData[0] = 7015;
-  execDUMP_STATE_ORD(signal);
-
   c_lcpMasterTakeOverState.set(LMTOS_IDLE, __LINE__);
 
   checkLocalNodefailComplete(signal, failedNodePtr.i, NF_LCP_TAKE_OVER);
@@ -12256,9 +12293,18 @@
 /*---------------------------------------------------------------*/
 void Dbdih::removeNodeFromStored(Uint32 nodeId,
                                  FragmentstorePtr fragPtr,
-                                 ReplicaRecordPtr replicatePtr)
+                                 ReplicaRecordPtr replicatePtr,
+				 bool temporary)
 {
-  newCrashedReplica(nodeId, replicatePtr);
+  if (!temporary)
+  {
+    jam();
+    newCrashedReplica(nodeId, replicatePtr);
+  }
+  else
+  {
+    jam();
+  }
   removeStoredReplica(fragPtr, replicatePtr);
   linkOldStoredReplica(fragPtr, replicatePtr);
   ndbrequire(fragPtr.p->storedReplicas != RNIL);
@@ -13192,7 +13238,8 @@
 Dbdih::execDUMP_STATE_ORD(Signal* signal)
 {
   DumpStateOrd * const & dumpState = (DumpStateOrd *)&signal->theData[0];
-  if (dumpState->args[0] == DumpStateOrd::DihDumpNodeRestartInfo) {
+  Uint32 arg = dumpState->args[0];
+  if (arg == DumpStateOrd::DihDumpNodeRestartInfo) {
     infoEvent("c_nodeStartMaster.blockLcp = %d, c_nodeStartMaster.blockGcp = %d, c_nodeStartMaster.wait = %d",
 	      c_nodeStartMaster.blockLcp, c_nodeStartMaster.blockGcp, c_nodeStartMaster.wait);
     infoEvent("cstartGcpNow = %d, cgcpStatus = %d",
@@ -13202,7 +13249,7 @@
     infoEvent("cgcpOrderBlocked = %d, cgcpStartCounter = %d",
               cgcpOrderBlocked, cgcpStartCounter);
   }//if  
-  if (dumpState->args[0] == DumpStateOrd::DihDumpNodeStatusInfo) {
+  if (arg == DumpStateOrd::DihDumpNodeStatusInfo) {
     NodeRecordPtr localNodePtr;
     infoEvent("Printing nodeStatus of all nodes");
     for (localNodePtr.i = 1; localNodePtr.i < MAX_NDB_NODES; localNodePtr.i++) {
@@ -13214,7 +13261,7 @@
     }//for
   }//if
   
-  if (dumpState->args[0] == DumpStateOrd::DihPrintFragmentation){
+  if (arg == DumpStateOrd::DihPrintFragmentation){
     infoEvent("Printing fragmentation of all tables --");
     for(Uint32 i = 0; i<ctabFileSize; i++){
       TabRecordPtr tabPtr;
@@ -13389,7 +13436,7 @@
     }
   }
 
-  if(dumpState->args[0] == 7019 && signal->getLength() == 2)
+  if(arg == 7019 && signal->getLength() == 2)
   {
     char buf2[8+1];
     NodeRecordPtr nodePtr;
@@ -13407,7 +13454,7 @@
 	      nodePtr.p->m_nodefailSteps.getText(buf2));
   }
   
-  if(dumpState->args[0] == 7020 && signal->getLength() > 3)
+  if(arg == 7020 && signal->getLength() > 3)
   {
     Uint32 gsn= signal->theData[1];
     Uint32 block= signal->theData[2];
@@ -13431,7 +13478,7 @@
 			  gsn, getBlockName(block, "UNKNOWN"), length, buf);
   }
   
-  if(dumpState->args[0] == DumpStateOrd::DihDumpLCPState){
+  if(arg == DumpStateOrd::DihDumpLCPState){
     infoEvent("-- Node %d LCP STATE --", getOwnNodeId());
     infoEvent("lcpStatus = %d (update place = %d) ",
 	      c_lcpState.lcpStatus, c_lcpState.lcpStatusUpdatedPlace);
@@ -13447,7 +13494,7 @@
     infoEvent("-- Node %d LCP STATE --", getOwnNodeId());
   }
 
-  if(dumpState->args[0] == DumpStateOrd::DihDumpLCPMasterTakeOver){
+  if(arg == DumpStateOrd::DihDumpLCPMasterTakeOver){
     infoEvent("-- Node %d LCP MASTER TAKE OVER STATE --", getOwnNodeId());
     infoEvent
       ("c_lcpMasterTakeOverState.state = %d updatePlace = %d failedNodeId = %d",
@@ -13462,52 +13509,25 @@
     infoEvent("-- Node %d LCP MASTER TAKE OVER STATE --", getOwnNodeId());
   }
 
-  if (signal->theData[0] == 7015){
-    for(Uint32 i = 0; i<ctabFileSize; i++){
-      TabRecordPtr tabPtr;
-      tabPtr.i = i;
-      ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
-      
-      if(tabPtr.p->tabStatus != TabRecord::TS_ACTIVE)
-	continue;
-      
-      infoEvent
-	("Table %d: TabCopyStatus: %d TabUpdateStatus: %d TabLcpStatus: %d",
-	 tabPtr.i, 
-	 tabPtr.p->tabCopyStatus, 
-	 tabPtr.p->tabUpdateState,
-	 tabPtr.p->tabLcpStatus);
+  if (signal->theData[0] == 7015)
+  {
+    if (signal->getLength() == 1)
+    {
+      signal->theData[1] = 0;
+    }
 
-      FragmentstorePtr fragPtr;
-      for (Uint32 fid = 0; fid < tabPtr.p->totalfragments; fid++) {
-	jam();
-	getFragstore(tabPtr.p, fid, fragPtr);
-	
-	char buf[100], buf2[100];
-	BaseString::snprintf(buf, sizeof(buf), " Fragment %d: noLcpReplicas==%d ", 
-		 fid, fragPtr.p->noLcpReplicas);
-	
-	Uint32 num=0;
-	ReplicaRecordPtr replicaPtr;
-	replicaPtr.i = fragPtr.p->storedReplicas;
-	do {
-	  ptrCheckGuard(replicaPtr, creplicaFileSize, replicaRecord);
-	  BaseString::snprintf(buf2, sizeof(buf2), "%s %d(on %d)=%d(%s)",
-		   buf, num, 
-		   replicaPtr.p->procNode, 
-		   replicaPtr.p->lcpIdStarted,
-		   replicaPtr.p->lcpOngoingFlag ? "Ongoing" : "Idle");
-	  BaseString::snprintf(buf, sizeof(buf), "%s", buf2);
-	  
-	  num++;
-	  replicaPtr.i = replicaPtr.p->nextReplica;
-	} while (replicaPtr.i != RNIL);
-	infoEvent(buf);
-      }
+    Uint32 tableId = signal->theData[1];
+    if (tableId < ctabFileSize)
+    {
+      signal->theData[0] = 7021;
+      execDUMP_STATE_ORD(signal);
+      signal->theData[0] = 7015;
+      signal->theData[1] = tableId + 1;
+      sendSignal(reference(), GSN_DUMP_STATE_ORD, signal, 2, JBB);
     }
   }
 
-  if(dumpState->args[0] == DumpStateOrd::EnableUndoDelayDataWrite){
+  if(arg == DumpStateOrd::EnableUndoDelayDataWrite){
     ndbout << "Dbdih:: delay write of datapages for table = " 
 	   << dumpState->args[1]<< endl;
     // Send this dump to ACC and TUP
@@ -13537,7 +13557,7 @@
     return;
   }
   
-  if(dumpState->args[0] == 7098){
+  if(arg == 7098){
     if(signal->length() == 3){
       jam();
       infoEvent("startLcpRoundLoopLab(tabel=%d, fragment=%d)",
@@ -13550,9 +13570,72 @@
     }
   }
 
-  if(dumpState->args[0] == DumpStateOrd::DihStartLcpImmediately){
+  if(arg == DumpStateOrd::DihStartLcpImmediately){
     c_lcpState.ctimer += (1 << c_lcpState.clcpDelay);
     return;
+  }
+
+  if (arg == DumpStateOrd::DihSetTimeBetweenGcp)
+  {
+    if (signal->getLength() == 1)
+    {
+      const ndb_mgm_configuration_iterator * p = 
+	theConfiguration.getOwnConfigIterator();
+      ndbrequire(p != 0);
+      ndb_mgm_get_int_parameter(p, CFG_DB_GCP_INTERVAL, &cgcpDelay);
+    }
+    else
+    {
+      cgcpDelay = signal->theData[1];
+    }
+    ndbout_c("Setting time between gcp : %d", cgcpDelay);
+  }
+
+  if (arg == 7021 && signal->getLength() == 2)
+  {
+    TabRecordPtr tabPtr;
+    tabPtr.i = signal->theData[1];
+    if (tabPtr.i >= ctabFileSize)
+      return;
+
+    ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
+    
+    if(tabPtr.p->tabStatus != TabRecord::TS_ACTIVE)
+      return;
+    
+    infoEvent
+      ("Table %d: TabCopyStatus: %d TabUpdateStatus: %d TabLcpStatus: %d",
+       tabPtr.i, 
+       tabPtr.p->tabCopyStatus, 
+       tabPtr.p->tabUpdateState,
+       tabPtr.p->tabLcpStatus);
+    
+    FragmentstorePtr fragPtr;
+    for (Uint32 fid = 0; fid < tabPtr.p->totalfragments; fid++) {
+      jam();
+      getFragstore(tabPtr.p, fid, fragPtr);
+      
+      char buf[100], buf2[100];
+      BaseString::snprintf(buf, sizeof(buf), " Fragment %d: noLcpReplicas==%d ", 
+			   fid, fragPtr.p->noLcpReplicas);
+      
+      Uint32 num=0;
+      ReplicaRecordPtr replicaPtr;
+      replicaPtr.i = fragPtr.p->storedReplicas;
+      do {
+	ptrCheckGuard(replicaPtr, creplicaFileSize, replicaRecord);
+	BaseString::snprintf(buf2, sizeof(buf2), "%s %d(on %d)=%d(%s)",
+			     buf, num, 
+			     replicaPtr.p->procNode, 
+			     replicaPtr.p->lcpIdStarted,
+			     replicaPtr.p->lcpOngoingFlag ? "Ongoing" : "Idle");
+	BaseString::snprintf(buf, sizeof(buf), "%s", buf2);
+	
+	num++;
+	replicaPtr.i = replicaPtr.p->nextReplica;
+      } while (replicaPtr.i != RNIL);
+      infoEvent(buf);
+    }
   }
 }//Dbdih::execDUMP_STATE_ORD()
 

--- 1.82/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp	2006-02-16 14:28:49 +01:00
+++ 1.83/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp	2006-03-24 17:11:37 +01:00
@@ -9267,6 +9267,14 @@
 /*---------------------------------------------------------------------------*/
     scanptr.p->scanCompletedStatus = ZTRUE;
     scanptr.p->scanState = ScanRecord::WAIT_LQHKEY_COPY;
+    if (ERROR_INSERTED(5043))
+    {
+      CLEAR_ERROR_INSERT_VALUE;
+      tcConnectptr.p->copyCountWords = ~0;
+      signal->theData[0] = 9999;
+      sendSignal(numberToRef(CMVMI, scanptr.p->scanNodeId),
+		 GSN_NDB_TAMPER, signal, 1, JBA);
+    }
     return;
   }//if
 
@@ -18543,6 +18551,172 @@
       
     }
   }
+
+  TcConnectionrec *regTcConnectionrec = tcConnectionrec;
+  Uint32 ttcConnectrecFileSize = ctcConnectrecFileSize;
+  if(arg == 2306)
+  {
+    for(Uint32 i = 0; i<1024; i++)
+    {
+      TcConnectionrecPtr tcRec;
+      tcRec.i = ctransidHash[i];
+      while(tcRec.i != RNIL)
+      {
+	ptrCheckGuard(tcRec, ttcConnectrecFileSize, regTcConnectionrec);
+	ndbout << "TcConnectionrec " << tcRec.i;
+	signal->theData[0] = 2307;
+	signal->theData[1] = tcRec.i;
+	execDUMP_STATE_ORD(signal);
+	tcRec.i = tcRec.p->nextHashRec;
+      }
+    }
+  }
+
+  if(arg == 2307 || arg == 2308)
+  {
+    TcConnectionrecPtr tcRec;
+    tcRec.i = signal->theData[1];
+    ptrCheckGuard(tcRec, ttcConnectrecFileSize, regTcConnectionrec);
+    
+    ndbout << " transactionState = " << tcRec.p->transactionState<<endl;
+    ndbout << " operation = " << tcRec.p->operation<<endl;
+    ndbout << " tcNodeFailrec = " << tcRec.p->tcNodeFailrec
+	   << " seqNoReplica = " << tcRec.p->seqNoReplica
+	   << " simpleRead = " << tcRec.p->simpleRead
+	   << endl;
+    ndbout << " replicaType = " << tcRec.p->replicaType
+	   << " reclenAiLqhkey = " << tcRec.p->reclenAiLqhkey
+	   << " opExec = " << tcRec.p->opExec
+	   << endl;
+    ndbout << " opSimple = " << tcRec.p->opSimple
+	   << " nextSeqNoReplica = " << tcRec.p->nextSeqNoReplica
+	   << " lockType = " << tcRec.p->lockType
+	   << endl;
+    ndbout << " lastReplicaNo = " << tcRec.p->lastReplicaNo
+	   << " indTakeOver = " << tcRec.p->indTakeOver
+	   << " dirtyOp = " << tcRec.p->dirtyOp
+	   << endl;
+    ndbout << " activeCreat = " << tcRec.p->activeCreat
+	   << " tcBlockref = " << hex << tcRec.p->tcBlockref
+	   << " reqBlockref = " << hex << tcRec.p->reqBlockref
+	   << " primKeyLen = " << tcRec.p->primKeyLen
+	   << endl;
+    ndbout << " nextReplica = " << tcRec.p->nextReplica
+	   << " tcBlockref = " << hex << tcRec.p->tcBlockref
+	   << " reqBlockref = " << hex << tcRec.p->reqBlockref
+	   << " primKeyLen = " << tcRec.p->primKeyLen
+	   << endl;
+    ndbout << " logStopPageNo = " << tcRec.p->logStopPageNo
+	   << " logStartPageNo = " << tcRec.p->logStartPageNo
+	   << " logStartPageIndex = " << tcRec.p->logStartPageIndex
+	   << endl;
+    ndbout << " errorCode = " << tcRec.p->errorCode
+	   << " clientBlockref = " << hex << tcRec.p->clientBlockref
+	   << " applRef = " << hex << tcRec.p->applRef
+	   << " totSendlenAi = " << tcRec.p->totSendlenAi
+	   << endl;
+    ndbout << " totReclenAi = " << tcRec.p->totReclenAi
+	   << " tcScanRec = " << tcRec.p->tcScanRec
+	   << " tcScanInfo = " << tcRec.p->tcScanInfo
+	   << " tcOprec = " << hex << tcRec.p->tcOprec
+	   << endl;
+    ndbout << " tableref = " << tcRec.p->tableref
+	   << " simpleTcConnect = " << tcRec.p->simpleTcConnect
+	   << " storedProcId = " << tcRec.p->storedProcId
+	   << " schemaVersion = " << tcRec.p->schemaVersion
+	   << endl;
+    ndbout << " reqinfo = " << tcRec.p->reqinfo
+	   << " reqRef = " << tcRec.p->reqRef
+	   << " readlenAi = " << tcRec.p->readlenAi
+	   << " prevTc = " << tcRec.p->prevTc
+	   << endl;
+    ndbout << " prevLogTcrec = " << tcRec.p->prevLogTcrec
+	   << " prevHashRec = " << tcRec.p->prevHashRec
+	   << " nodeAfterNext0 = " << tcRec.p->nodeAfterNext[0]
+	   << " nodeAfterNext1 = " << tcRec.p->nodeAfterNext[1]
+	   << endl;
+    ndbout << " nextTcConnectrec = " << tcRec.p->nextTcConnectrec
+	   << " nextTc = " << tcRec.p->nextTc
+	   << " nextTcLogQueue = " << tcRec.p->nextTcLogQueue
+	   << " nextLogTcrec = " << tcRec.p->nextLogTcrec
+	   << endl;
+    ndbout << " nextHashRec = " << tcRec.p->nextHashRec
+	   << " logWriteState = " << tcRec.p->logWriteState
+	   << " logStartFileNo = " << tcRec.p->logStartFileNo
+	   << " listState = " << tcRec.p->listState
+	   << endl;
+    ndbout << " lastAttrinbuf = " << tcRec.p->lastAttrinbuf
+	   << " lastTupkeybuf = " << tcRec.p->lastTupkeybuf
+	   << " hashValue = " << tcRec.p->hashValue
+	   << endl;
+    ndbout << " gci = " << tcRec.p->gci
+	   << " fragmentptr = " << tcRec.p->fragmentptr
+	   << " fragmentid = " << tcRec.p->fragmentid
+	   << " firstTupkeybuf = " << tcRec.p->firstTupkeybuf
+	   << endl;
+    ndbout << " firstAttrinbuf = " << tcRec.p->firstAttrinbuf
+	   << " currTupAiLen = " << tcRec.p->currTupAiLen
+	   << " currReclenAi = " << tcRec.p->currReclenAi
+	   << endl;
+    ndbout << " tcTimer = " << tcRec.p->tcTimer
+	   << " clientConnectrec = " << tcRec.p->clientConnectrec
+	   << " applOprec = " << hex << tcRec.p->applOprec
+	   << " abortState = " << tcRec.p->abortState
+	   << endl;
+    ndbout << " transid0 = " << hex << tcRec.p->transid[0]
+	   << " transid1 = " << hex << tcRec.p->transid[1]
+	   << " tupkeyData0 = " << tcRec.p->tupkeyData[0]
+	   << " tupkeyData1 = " << tcRec.p->tupkeyData[1]
+	   << endl;
+    ndbout << " tupkeyData2 = " << tcRec.p->tupkeyData[2]
+	   << " tupkeyData3 = " << tcRec.p->tupkeyData[3]
+	   << endl;
+    switch (tcRec.p->transactionState) {
+	
+    case TcConnectionrec::SCAN_STATE_USED:
+      if (tcRec.p->tcScanRec < cscanrecFileSize){
+	ScanRecordPtr TscanPtr;
+	c_scanRecordPool.getPtr(TscanPtr, tcRec.p->tcScanRec);
+	ndbout << " scanState = " << TscanPtr.p->scanState << endl;
+	//TscanPtr.p->scanLocalref[2];
+	ndbout << " copyPtr="<<TscanPtr.p->copyPtr
+	       << " scanAccPtr="<<TscanPtr.p->scanAccPtr
+	       << " scanAiLength="<<TscanPtr.p->scanAiLength
+	       << endl;
+	ndbout << " m_curr_batch_size_rows="<<
+	  TscanPtr.p->m_curr_batch_size_rows
+	       << " m_max_batch_size_rows="<<
+	  TscanPtr.p->m_max_batch_size_rows
+	       << " scanErrorCounter="<<TscanPtr.p->scanErrorCounter
+	       << endl;
+	ndbout << " scanSchemaVersion="<<TscanPtr.p->scanSchemaVersion
+	       << "  scanStoredProcId="<<TscanPtr.p->scanStoredProcId
+	       << "  scanTcrec="<<TscanPtr.p->scanTcrec
+	       << endl;
+	ndbout << "  scanType="<<TscanPtr.p->scanType
+	       << "  scanApiBlockref="<<TscanPtr.p->scanApiBlockref
+	       << "  scanNodeId="<<TscanPtr.p->scanNodeId
+	       << "  scanCompletedStatus="<<TscanPtr.p->scanCompletedStatus
+	       << endl;
+	ndbout << "  scanFlag="<<TscanPtr.p->scanFlag
+	       << "  scanLockHold="<<TscanPtr.p->scanLockHold
+	       << "  scanLockMode="<<TscanPtr.p->scanLockMode
+	       << "  scanNumber="<<TscanPtr.p->scanNumber
+	       << endl;
+	ndbout << "  scanReleaseCounter="<<TscanPtr.p->scanReleaseCounter
+	       << "  scanTcWaiting="<<TscanPtr.p->scanTcWaiting
+	       << "  scanKeyinfoFlag="<<TscanPtr.p->scanKeyinfoFlag
+	       << endl;
+      } else{
+	ndbout << "No connected scan record found" << endl;
+      }
+      break;
+    default:
+      break;
+    }
+    ndbrequire(arg != 2308);
+  }
+  
 }//Dblqh::execDUMP_STATE_ORD()
 
 void Dblqh::execSET_VAR_REQ(Signal* signal) 

--- 1.33/storage/ndb/src/kernel/blocks/dbtc/Dbtc.hpp	2006-01-18 11:05:56 +01:00
+++ 1.34/storage/ndb/src/kernel/blocks/dbtc/Dbtc.hpp	2006-03-24 17:11:37 +01:00
@@ -213,14 +213,6 @@
     LTS_ACTIVE = 1
   };
 
-  enum TakeOverState {
-    TOS_NOT_DEFINED = 0,
-    TOS_IDLE = 1,
-    TOS_ACTIVE = 2,
-    TOS_COMPLETED = 3,
-    TOS_NODE_FAILED = 4
-  };
-
   enum FailState {
     FS_IDLE = 0,
     FS_LISTENING = 1,
@@ -645,6 +637,7 @@
     ConnectionState apiConnectstate;
     UintR transid[2];
     UintR firstTcConnect;
+    NdbNodeBitmask m_transaction_nodes; 
     
     //---------------------------------------------------
     // Second 16 byte cache line. Hot variables.
@@ -941,7 +934,6 @@
   struct HostRecord {
     HostState hostStatus;
     LqhTransState lqhTransStatus;
-    TakeOverState takeOverStatus;
     bool  inPackedList;
     UintR noOfPackedWordsLqh;
     UintR packedWordsLqh[26];
@@ -950,6 +942,17 @@
     UintR noOfWordsTCINDXCONF;
     UintR packedWordsTCINDXCONF[30];
     BlockReference hostLqhBlockRef;
+
+    enum NodeFailBits // flags stored in m_nf_bits: node-failure handling steps still outstanding for this host
+    {
+      NF_TAKEOVER          = 0x1, // cleared in execNODE_FAILREP when this node is not the new master, else on TAKE_OVERTCCONF sent by this block
+      NF_CHECK_SCAN        = 0x2, // cleared when checkScanActiveInFailedLqh runs to its end
+      NF_CHECK_TRANSACTION = 0x4, // cleared when nodeFailCheckTransactions has passed the last ApiConnectRecord
+      NF_CHECK_DROP_TAB    = 0x8, // cleared on the "Finished" path of the drop-table wait check (presumably checkWaitDropTabFailedLqh - confirm)
+      NF_NODE_FAIL_BITS    = 0xF // All bits; value assigned to m_nf_bits in execNODE_FAILREP
+    };
+    Uint32 m_nf_bits;
+    NdbNodeBitmask m_lqh_trans_conf;
   }; /* p2c: size = 128 bytes */
   
   typedef Ptr<HostRecord> HostRecordPtr;
@@ -1596,7 +1599,7 @@
   void wrongSchemaVersionErrorLab(Signal* signal);
   void noFreeConnectionErrorLab(Signal* signal);
   void tckeyreq050Lab(Signal* signal);
-  void timeOutFoundLab(Signal* signal, UintR anAdd);
+  void timeOutFoundLab(Signal* signal, UintR anAdd, Uint32 errCode);
   void completeTransAtTakeOverLab(Signal* signal, UintR TtakeOverInd);
   void completeTransAtTakeOverDoLast(Signal* signal, UintR TtakeOverInd);
   void completeTransAtTakeOverDoOne(Signal* signal, UintR TtakeOverInd);
@@ -1618,6 +1621,9 @@
   void checkScanFragList(Signal*, Uint32 failedNodeId, ScanRecord * scanP, 
 			 LocalDLList<ScanFragRec>::Head&);
 
+  void nodeFailCheckTransactions(Signal*,Uint32 transPtrI,Uint32 failedNodeId);
+  void checkNodeFailComplete(Signal* signal, Uint32 failedNodeId, Uint32 bit);
+  
   // Initialisation
   void initData();
   void initRecords();
@@ -1644,6 +1650,7 @@
   HostRecord *hostRecord;
   HostRecordPtr hostptr;
   UintR chostFilesize;
+  NdbNodeBitmask c_alive_nodes;
 
   GcpRecord *gcpRecord;
   GcpRecordPtr gcpPtr;

--- 1.96/storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp	2006-01-18 12:00:49 +01:00
+++ 1.97/storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp	2006-03-24 17:11:37 +01:00
@@ -266,6 +266,10 @@
     jam();
     checkScanActiveInFailedLqh(signal, Tdata0, Tdata1);
     return;
+  case TcContinueB::ZNF_CHECK_TRANSACTIONS:
+    jam();
+    nodeFailCheckTransactions(signal, Tdata0, Tdata1);
+    return;
   case TcContinueB::CHECK_WAIT_DROP_TAB_FAILED_LQH:
     jam();
     checkWaitDropTabFailedLqh(signal, Tdata0, Tdata1);
@@ -303,8 +307,8 @@
   hostptr.i = signal->theData[1];
   ptrCheckGuard(hostptr, chostFilesize, hostRecord);
   hostptr.p->hostStatus = HS_ALIVE;
-  hostptr.p->takeOverStatus = TOS_IDLE;
   signal->theData[0] = cownref;
+  c_alive_nodes.set(hostptr.i);
   sendSignal(tblockref, GSN_INCL_NODECONF, signal, 1, JBB);
 }
 
@@ -503,6 +507,7 @@
      * Finished
      */
     jam();
+    checkNodeFailComplete(signal, nodeId, HostRecord::NF_CHECK_DROP_TAB);
     return;
   }
   
@@ -868,8 +873,6 @@
       hostptr.i = i;
       ptrCheckGuard(hostptr, chostFilesize, hostRecord);
 
-      hostptr.p->takeOverStatus = TOS_IDLE;
-      
       if (NodeBitmask::get(readNodes->inactiveNodes, i)) {
         jam();
         hostptr.p->hostStatus = HS_DEAD;
@@ -877,6 +880,7 @@
         jam();
         con_lineNodes++;
         hostptr.p->hostStatus = HS_ALIVE;
+	c_alive_nodes.set(i);
       }//if
     }//if
   }//for
@@ -2378,6 +2382,7 @@
   regApiPtr->commitAckMarker = RNIL;
   regApiPtr->buddyPtr = RNIL;
   regApiPtr->currSavePointId = 0;
+  regApiPtr->m_transaction_nodes.clear();
   // Trigger data
   releaseFiredTriggerData(&regApiPtr->theFiredTriggers),
   // Index data
@@ -2986,6 +2991,10 @@
   signal->theData[0] = TdihConnectptr;
   signal->theData[1] = Ttableref;
   signal->theData[2] = TdistrHashValue;
+  signal->theData[3] = 0;
+  signal->theData[4] = 0;
+  signal->theData[5] = 0;
+  signal->theData[6] = 0;
 
   /*-------------------------------------------------------------*/
   /* FOR EFFICIENCY REASONS WE AVOID THE SIGNAL SENDING HERE AND */
@@ -3165,6 +3174,7 @@
   TcConnectRecord * const regTcPtr = tcConnectptr.p;
   ApiConnectRecord * const regApiPtr = apiConnectptr.p;
   CacheRecord * const regCachePtr = cachePtr.p;
+  UintR sig0, sig1, sig2, sig3, sig4, sig5, sig6;
 #ifdef ERROR_INSERT
   if (ERROR_INSERTED(8002)) {
     systemErrorLab(signal, __LINE__);
@@ -3202,6 +3212,9 @@
   LqhKeyReq::setScanTakeOverFlag(tslrAttrLen, regCachePtr->scanTakeOverInd);
 
   Tdata10 = 0;
+  sig0 = regCachePtr->opSimple;
+  sig1 = regTcPtr->operation;
+  bool simpleRead = (sig1 == ZREAD && sig0 == ZTRUE);
   LqhKeyReq::setKeyLen(Tdata10, regCachePtr->keylen);
   LqhKeyReq::setLastReplicaNo(Tdata10, regTcPtr->lastReplicaNo);
   LqhKeyReq::setLockType(Tdata10, regCachePtr->opLock);
@@ -3211,8 +3224,8 @@
   LqhKeyReq::setApplicationAddressFlag(Tdata10, 1);
   LqhKeyReq::setDirtyFlag(Tdata10, regTcPtr->dirtyOp);
   LqhKeyReq::setInterpretedFlag(Tdata10, regCachePtr->opExec);
-  LqhKeyReq::setSimpleFlag(Tdata10, regCachePtr->opSimple);
-  LqhKeyReq::setOperation(Tdata10, regTcPtr->operation);
+  LqhKeyReq::setSimpleFlag(Tdata10, sig0);
+  LqhKeyReq::setOperation(Tdata10, sig1);
   /* ----------------------------------------------------------------------- 
    * Sequential Number of first LQH = 0, bit 22-23                           
    * IF ATTRIBUTE INFORMATION IS SENT IN TCKEYREQ,
@@ -3225,18 +3238,16 @@
    * ----------------------------------------------------------------------- */
   //LqhKeyReq::setAPIVersion(Tdata10, regCachePtr->apiVersionNo);
   Uint32 commitAckMarker = regTcPtr->commitAckMarker;
+  const Uint32 noOfLqhs = regTcPtr->noOfNodes;
   if(commitAckMarker != RNIL){
     jam();
-    
     LqhKeyReq::setMarkerFlag(Tdata10, 1);
 
-    CommitAckMarker * tmp;
-    tmp = m_commitAckMarkerHash.getPtr(commitAckMarker);
+    CommitAckMarker * tmp = m_commitAckMarkerHash.getPtr(commitAckMarker);
     
     /**
      * Populate LQH array
      */
-    const Uint32 noOfLqhs = regTcPtr->noOfNodes;
     tmp->noOfLqhs = noOfLqhs;
     for(Uint32 i = 0; i<noOfLqhs; i++){
       tmp->lqhNodeId[i] = regTcPtr->tcNodedata[i];
@@ -3247,7 +3258,6 @@
   /* NO READ LENGTH SENT FROM TC. SEQUENTIAL NUMBER IS 1 AND IT    */
   /* IS SENT TO A PRIMARY NODE.                                    */
   /* ************************************************************> */
-  UintR sig0, sig1, sig2, sig3, sig4, sig5, sig6;
 
   LqhKeyReq * const lqhKeyReq = (LqhKeyReq *)signal->getDataPtrSend();
 
@@ -3271,6 +3281,14 @@
   sig5 = regTcPtr->clientData;
   sig6 = regCachePtr->scanInfo;
 
+  if (! simpleRead)
+  {
+    regApiPtr->m_transaction_nodes.set(regTcPtr->tcNodedata[0]);
+    regApiPtr->m_transaction_nodes.set(regTcPtr->tcNodedata[1]);
+    regApiPtr->m_transaction_nodes.set(regTcPtr->tcNodedata[2]);
+    regApiPtr->m_transaction_nodes.set(regTcPtr->tcNodedata[3]);  
+  }
+  
   lqhKeyReq->tableSchemaVersion = sig0;
   lqhKeyReq->fragmentData = sig1;
   lqhKeyReq->transId1 = sig2;
@@ -4656,6 +4674,7 @@
   UintR TgcpPointer = regTmpApiPtr->gcpPointer;
   UintR TgcpFilesize = cgcpFilesize;
   UintR TcommitAckMarker = regTmpApiPtr->commitAckMarker;
+  NdbNodeBitmask Tnodes = regTmpApiPtr->m_transaction_nodes;
   GcpRecord *localGcpRecord = gcpRecord;
 
   regApiPtr->ndbapiBlockref = regTmpApiPtr->ndbapiBlockref;
@@ -4666,6 +4685,7 @@
   regApiPtr->transid[1] = Ttransid2;
   regApiPtr->lqhkeyconfrec = Tlqhkeyconfrec;
   regApiPtr->commitAckMarker = TcommitAckMarker;
+  regApiPtr->m_transaction_nodes = Tnodes;
 
   gcpPtr.i = TgcpPointer;
   ptrCheckGuard(gcpPtr, TgcpFilesize, localGcpRecord);
@@ -4676,6 +4696,7 @@
   regTmpApiPtr->commitAckMarker = RNIL;
   regTmpApiPtr->firstTcConnect = RNIL;
   regTmpApiPtr->lastTcConnect = RNIL;
+  regTmpApiPtr->m_transaction_nodes.clear();
   releaseAllSeizedIndexOperations(regTmpApiPtr);
 }//Dbtc::copyApi()
 
@@ -4934,7 +4955,7 @@
   TcConnectRecordPtr localTcConnectptr;
   UintR TtcConnectFilesize = ctcConnectFilesize;
   TcConnectRecord *localTcConnectRecord = tcConnectRecord;
-
+  apiConnectptr.p->m_transaction_nodes.clear();
   localTcConnectptr.i = apiConnectptr.p->firstTcConnect;
   do {
     jam();
@@ -5352,7 +5373,8 @@
       break;
     case CS_ABORTING:
       jam();
-      errorCode = ZABORTINPROGRESS;
+      errorCode = regApiPtr->returncode ? 
+	regApiPtr->returncode : ZABORTINPROGRESS;
       break;
     case CS_START_SCAN:
       jam();
@@ -5891,9 +5913,9 @@
 
   if (transP->firstTcConnect == RNIL) {
     jam();
-    /*-----------------------------------------------------------------------*/
-    /*    WE HAVE NO PARTICIPANTS IN THE TRANSACTION.                        */
-    /*-----------------------------------------------------------------------*/
+    /*--------------------------------------------------------------------*/
+    /* WE HAVE NO PARTICIPANTS IN THE TRANSACTION.                        */
+    /*--------------------------------------------------------------------*/
     releaseAbortResources(signal);
     return;
   }//if
@@ -6170,10 +6192,12 @@
     if (api_timer != 0) {
       time_out_value= time_out_param + (api_con_ptr & mask_value);
       time_passed= tc_timer - api_timer;
-      if (time_passed > time_out_value) {
+      if (time_passed > time_out_value) 
+      {
         jam();
-        timeOutFoundLab(signal, api_con_ptr);
-        return;
+        timeOutFoundLab(signal, api_con_ptr, ZTIME_OUT_ERROR);
+	api_con_ptr++;
+	break;
       }
     }
   }
@@ -6193,10 +6217,8 @@
   return;
 }//Dbtc::timeOutLoopStartLab()
 
-void Dbtc::timeOutFoundLab(Signal* signal, Uint32 TapiConPtr) 
+void Dbtc::timeOutFoundLab(Signal* signal, Uint32 TapiConPtr, Uint32 errCode) 
 {
-  sendContinueTimeOutControl(signal, TapiConPtr + 1);
-  
   apiConnectptr.i = TapiConPtr;
   ptrCheckGuard(apiConnectptr, capiConnectFilesize, apiConnectRecord);
   /*------------------------------------------------------------------*/
@@ -6209,7 +6231,8 @@
 	<< "Time-out in state = " << apiConnectptr.p->apiConnectstate
 	<< " apiConnectptr.i = " << apiConnectptr.i 
 	<< " - exec: " << apiConnectptr.p->m_exec_flag
-	<< " - place: " << c_apiConTimer_line[apiConnectptr.i]);
+	<< " - place: " << c_apiConTimer_line[apiConnectptr.i]
+	<< " code: " << errCode);
   switch (apiConnectptr.p->apiConnectstate) {
   case CS_STARTED:
     ndbrequire(c_apiConTimer_line[apiConnectptr.i] != 3615);
@@ -6227,7 +6250,7 @@
       }//if
     }
     apiConnectptr.p->returnsignal = RS_TCROLLBACKREP;      
-    apiConnectptr.p->returncode = ZTIME_OUT_ERROR;
+    apiConnectptr.p->returncode = errCode;
     abort010Lab(signal);
     return;
   case CS_RECEIVING:
@@ -6240,7 +6263,7 @@
     /*       START ABORTING THE TRANSACTION. ALSO START CHECKING THE    */
     /*       REMAINING TRANSACTIONS.                                    */
     /*------------------------------------------------------------------*/
-    terrorCode = ZTIME_OUT_ERROR;
+    terrorCode = errCode;
     abortErrorLab(signal);
     return;
   case CS_COMMITTING:
@@ -6447,6 +6470,7 @@
     return;
   }
   
+  bool found = false;
   OperationState tmp[16];
   
   Uint32 TloopCount = 0;
@@ -6454,7 +6478,31 @@
     jam();
     if (tcConnectptr.i == RNIL) {
       jam();
-      if (Tcheck == 0) {
+
+#ifdef VM_TRACE
+      ndbout_c("found: %d Tcheck: %d apiConnectptr.p->counter: %d",
+	       found, Tcheck, apiConnectptr.p->counter);
+#endif
+      if (found || apiConnectptr.p->counter)
+      {
+	jam();
+	/**
+	 * We sent at least one ABORT/ABORTED
+	 *   or ZABORT_TIMEOUT_BREAK is in job buffer
+	 *   wait for reception...
+	 */
+	return;
+      }
+      
+      if (Tcheck == 1)
+      {
+	jam();
+	releaseAbortResources(signal);
+	return;
+      }
+      
+      if (Tcheck == 0)
+      {
         jam();
 	/*------------------------------------------------------------------
 	 * All nodes had already reported ABORTED for all tcConnect records.
@@ -6463,9 +6511,11 @@
 	 *------------------------------------------------------------------*/
 	char buf[96]; buf[0] = 0;
 	char buf2[96];
-	BaseString::snprintf(buf, sizeof(buf), "TC %d: %d ops:",
-		 __LINE__, apiConnectptr.i);
-	for(Uint32 i = 0; i<TloopCount; i++){
+	BaseString::snprintf(buf, sizeof(buf), "TC %d: %d counter: %d ops:",
+			     __LINE__, apiConnectptr.i,
+			     apiConnectptr.p->counter);
+	for(Uint32 i = 0; i<TloopCount; i++)
+	{
 	  BaseString::snprintf(buf2, sizeof(buf2), "%s %d", buf, tmp[i]);
 	  BaseString::snprintf(buf, sizeof(buf), buf2);
 	}
@@ -6488,7 +6538,16 @@
       signal->theData[0] = TcContinueB::ZABORT_TIMEOUT_BREAK;
       signal->theData[1] = tcConnectptr.i;
       signal->theData[2] = apiConnectptr.i;      
-      sendSignal(cownref, GSN_CONTINUEB, signal, 3, JBB);
+      if (ERROR_INSERTED(8050))
+      {
+	ndbout_c("sending ZABORT_TIMEOUT_BREAK delayed (%d %d)", 
+		 Tcheck, apiConnectptr.p->counter);
+	sendSignalWithDelay(cownref, GSN_CONTINUEB, signal, 2000, 3);
+      }
+      else
+      {
+	sendSignal(cownref, GSN_CONTINUEB, signal, 3, JBB);
+      }
       return;
     }//if
     ptrCheckGuard(tcConnectptr, ctcConnectFilesize, tcConnectRecord);
@@ -6511,7 +6570,7 @@
         jam();
         if (tcConnectptr.p->tcNodedata[Ti] != 0) {
           TloopCount += 31;
-          Tcheck = 1;
+	  found = true;
           hostptr.i = tcConnectptr.p->tcNodedata[Ti];
           ptrCheckGuard(hostptr, chostFilesize, hostRecord);
           if (hostptr.p->hostStatus == HS_ALIVE) {
@@ -6884,58 +6943,44 @@
   const Uint32 tnewMasterId = nodeFail->masterNodeId;
   
   arrGuard(tnoOfNodes, MAX_NDB_NODES);
+  Uint32 i;
   int index = 0;
-  for (unsigned i = 1; i< MAX_NDB_NODES; i++) {
-    if(NodeBitmask::get(nodeFail->theNodes, i)){
+  for (i = 1; i< MAX_NDB_NODES; i++) 
+  {
+    if(NodeBitmask::get(nodeFail->theNodes, i))
+    {
       cdata[index] = i;
       index++;
     }//if
   }//for
 
+  cmasterNodeId = tnewMasterId;
+  
   tcNodeFailptr.i = 0;
   ptrAss(tcNodeFailptr, tcFailRecord);
-  Uint32 tindex;
-  for (tindex = 0; tindex < tnoOfNodes; tindex++) {
+  for (i = 0; i < tnoOfNodes; i++) 
+  {
     jam();
-    hostptr.i = cdata[tindex];
+    hostptr.i = cdata[i];
     ptrCheckGuard(hostptr, chostFilesize, hostRecord);
+    
     /*------------------------------------------------------------*/
     /*       SET STATUS OF THE FAILED NODE TO DEAD SINCE IT HAS   */
     /*       FAILED.                                              */
     /*------------------------------------------------------------*/
     hostptr.p->hostStatus = HS_DEAD;
+    hostptr.p->m_nf_bits = HostRecord::NF_NODE_FAIL_BITS;
+    c_alive_nodes.clear(hostptr.i);
 
-    if (hostptr.p->takeOverStatus == TOS_COMPLETED) {
-      jam();
-      /*------------------------------------------------------------*/
-      /*       A VERY UNUSUAL SITUATION. THE TAKE OVER WAS COMPLETED*/
-      /*       EVEN BEFORE WE HEARD ABOUT THE NODE FAILURE REPORT.  */
-      /*       HOWEVER UNUSUAL THIS SITUATION IS POSSIBLE.          */
-      /*------------------------------------------------------------*/
-      /*       RELEASE THE CURRENTLY UNUSED LQH CONNECTIONS. THE    */
-      /*       REMAINING WILL BE RELEASED WHEN THE TRANSACTION THAT */
-      /*       USED THEM IS COMPLETED.                              */
-      /*------------------------------------------------------------*/
-      {
-	NFCompleteRep * const nfRep = (NFCompleteRep *)&signal->theData[0];
-	nfRep->blockNo      = DBTC;
-	nfRep->nodeId       = cownNodeid;
-	nfRep->failedNodeId = hostptr.i;
-      }
-      sendSignal(cdihblockref, GSN_NF_COMPLETEREP, signal, 
-		 NFCompleteRep::SignalLength, JBB);
-    } else {
-      ndbrequire(hostptr.p->takeOverStatus == TOS_IDLE);
-      hostptr.p->takeOverStatus = TOS_NODE_FAILED;
-    }//if
-    
-    if (tcNodeFailptr.p->failStatus == FS_LISTENING) {
+    if (tcNodeFailptr.p->failStatus == FS_LISTENING) 
+    {
       jam();
       /*------------------------------------------------------------*/
       /*       THE CURRENT TAKE OVER CAN BE AFFECTED BY THIS NODE   */
       /*       FAILURE.                                             */
       /*------------------------------------------------------------*/
-      if (hostptr.p->lqhTransStatus == LTS_ACTIVE) {
+      if (hostptr.p->lqhTransStatus == LTS_ACTIVE) 
+      {
 	jam();
 	/*------------------------------------------------------------*/
 	/*       WE WERE WAITING FOR THE FAILED NODE IN THE TAKE OVER */
@@ -6947,86 +6992,46 @@
       }//if
     }//if
     
-  }//for
-
-  const bool masterFailed = (cmasterNodeId != tnewMasterId);
-  cmasterNodeId = tnewMasterId;
-
-  if(getOwnNodeId() == cmasterNodeId && masterFailed){
-    /**
-     * Master has failed and I'm the new master
-     */
-    jam();
-    
-    for (hostptr.i = 1; hostptr.i < MAX_NDB_NODES; hostptr.i++) {
+    if (getOwnNodeId() != tnewMasterId)
+    {
       jam();
-      ptrAss(hostptr, hostRecord);
-      if (hostptr.p->hostStatus != HS_ALIVE) {
-	jam();
-	if (hostptr.p->takeOverStatus == TOS_COMPLETED) {
-	  jam();
-	  /*------------------------------------------------------------*/
-	  /*       SEND TAKE OVER CONFIRMATION TO ALL ALIVE NODES IF    */
-	  /*       TAKE OVER IS COMPLETED. THIS IS PERFORMED TO ENSURE  */
-	  /*       THAT ALL NODES AGREE ON THE IDLE STATE OF THE TAKE   */
-	  /*       OVER. THIS MIGHT BE MISSED IN AN ERROR SITUATION IF  */
-	  /*       MASTER FAILS AFTER SENDING CONFIRMATION TO NEW       */
-	  /*       MASTER BUT FAILING BEFORE SENDING TO ANOTHER NODE    */
-	  /*       WHICH WAS NOT MASTER. IF THIS NODE LATER BECOMES     */
-	  /*       MASTER IT MIGHT START A NEW TAKE OVER EVEN AFTER THE */
-	  /*       CRASHED NODE HAVE ALREADY RECOVERED.                 */
-	  /*------------------------------------------------------------*/
-	  for(tmpHostptr.i = 1; tmpHostptr.i < MAX_NDB_NODES;tmpHostptr.i++) {
-	    jam();
-	    ptrAss(tmpHostptr, hostRecord);
-	    if (tmpHostptr.p->hostStatus == HS_ALIVE) {
-	      jam();
-	      tblockref = calcTcBlockRef(tmpHostptr.i);
-	      signal->theData[0] = hostptr.i;
-	      sendSignal(tblockref, GSN_TAKE_OVERTCCONF, signal, 1, JBB);
-	    }//if
-	  }//for
-	}//if
-      }//if
-    }//for
-  }
-
-  if(getOwnNodeId() == cmasterNodeId){
-    jam();
-    for (hostptr.i = 1; hostptr.i < MAX_NDB_NODES; hostptr.i++) {
+      /**
+       * Only master does takeover currently
+       */
+      hostptr.p->m_nf_bits &= ~HostRecord::NF_TAKEOVER;
+    }
+    else
+    {
       jam();
-      ptrAss(hostptr, hostRecord);
-      if (hostptr.p->hostStatus != HS_ALIVE) {
-        jam();
-        if (hostptr.p->takeOverStatus == TOS_NODE_FAILED) {
-          jam();
-	  /*------------------------------------------------------------*/
-	  /*       CONCLUDE ALL ACTIVITIES THE FAILED TC DID CONTROL    */
-	  /*       SINCE WE ARE THE MASTER. THIS COULD HAVE BEEN STARTED*/
-	  /*       BY A PREVIOUS MASTER BUT HAVE NOT BEEN CONCLUDED YET.*/
-	  /*------------------------------------------------------------*/
-          hostptr.p->takeOverStatus = TOS_ACTIVE;
-          signal->theData[0] = hostptr.i;
-          sendSignal(cownref, GSN_TAKE_OVERTCREQ, signal, 1, JBB);
-        }//if
-      }//if
-    }//for
-  }//if
-  for (tindex = 0; tindex < tnoOfNodes; tindex++) {
-    jam();
-    hostptr.i = cdata[tindex];
-    ptrCheckGuard(hostptr, chostFilesize, hostRecord);
-    /*------------------------------------------------------------*/
-    /*       LOOP THROUGH AND ABORT ALL SCANS THAT WHERE          */
-    /*       CONTROLLED BY THIS TC AND ACTIVE IN THE FAILED       */
-    /*       NODE'S LQH                                           */
-    /*------------------------------------------------------------*/
+      signal->theData[0] = hostptr.i;
+      sendSignal(cownref, GSN_TAKE_OVERTCREQ, signal, 1, JBB);
+    }
+
     checkScanActiveInFailedLqh(signal, 0, hostptr.i);
     checkWaitDropTabFailedLqh(signal, hostptr.i, 0); // nodeid, tableid
-  }//for
-
+    nodeFailCheckTransactions(signal, 0, hostptr.i);
+  }
 }//Dbtc::execNODE_FAILREP()
 
+void
+Dbtc::checkNodeFailComplete(Signal* signal, // signal whose theData is overwritten with the NFCompleteRep when one is sent
+			    Uint32 failedNodeId, // index into hostRecord of the failed node
+			    Uint32 bit) // HostRecord::NodeFailBits value(s) to mark as done
+{ // Marks one node-failure step done for failedNodeId; when no steps remain, reports NF_COMPLETEREP to cdihblockref.
+  hostptr.i = failedNodeId; // NOTE: repoints the shared member hostptr, not a local
+  ptrCheckGuard(hostptr, chostFilesize, hostRecord);
+  hostptr.p->m_nf_bits &= ~bit; // clear the finished step(s)
+  if (hostptr.p->m_nf_bits == 0) // every step set in m_nf_bits has been cleared
+  {
+    NFCompleteRep * const nfRep = (NFCompleteRep *)&signal->theData[0];
+    nfRep->blockNo      = DBTC;
+    nfRep->nodeId       = cownNodeid;
+    nfRep->failedNodeId = hostptr.i;
+    sendSignal(cdihblockref, GSN_NF_COMPLETEREP, signal, 
+	       NFCompleteRep::SignalLength, JBB);
+  }
+}//Dbtc::checkNodeFailComplete()
+
 void Dbtc::checkScanActiveInFailedLqh(Signal* signal, 
 				      Uint32 scanPtrI, 
 				      Uint32 failedNodeId){
@@ -7068,8 +7073,44 @@
     sendSignal(cownref, GSN_CONTINUEB, signal, 3, JBB);
     return;
   }//for
+
+  checkNodeFailComplete(signal, failedNodeId, HostRecord::NF_CHECK_SCAN);
+}
+
+void
+Dbtc::nodeFailCheckTransactions(Signal* signal, 
+				Uint32 transPtrI, // index of the ApiConnectRecord to examine in this invocation
+				Uint32 failedNodeId) // node id tested against each record's m_transaction_nodes
+{ // Examines the ApiConnectRecord at transPtrI, then posts CONTINUEB(ZNF_CHECK_TRANSACTIONS) for transPtrI + 1; past the last record it clears NF_CHECK_TRANSACTION.
+  jam();
+  Ptr<ApiConnectRecord> transPtr;
+  for (transPtr.i = transPtrI; transPtr.i < capiConnectFilesize; transPtr.i++) // body always returns, so at most one record is handled per call
+  {
+    ptrCheckGuard(transPtr, capiConnectFilesize, apiConnectRecord); 
+    if (transPtr.p->m_transaction_nodes.get(failedNodeId))
+    {
+      jam();
+      // Force timeout regardless of state: c_appl_timeout_value is set to 1 around the call and restored afterwards
+      Uint32 save = c_appl_timeout_value;
+      c_appl_timeout_value = 1;
+      setApiConTimer(transPtr.i, 0, __LINE__);
+      timeOutFoundLab(signal, transPtr.i, ZNODEFAIL_BEFORE_COMMIT); // error code passed on instead of ZTIME_OUT_ERROR
+      c_appl_timeout_value = save;
+    }
+    
+    // Send CONTINUEB to continue later
+    signal->theData[0] = TcContinueB::ZNF_CHECK_TRANSACTIONS;
+    signal->theData[1] = transPtr.i + 1; // Check next
+    signal->theData[2] = failedNodeId;
+    sendSignal(cownref, GSN_CONTINUEB, signal, 3, JBB);
+    return;
+  }
+
+  checkNodeFailComplete(signal, failedNodeId, // reached only when transPtrI >= capiConnectFilesize
+			HostRecord::NF_CHECK_TRANSACTION);
 }
 
+
 void
 Dbtc::checkScanFragList(Signal* signal,
 			Uint32 failedNodeId,
@@ -7085,54 +7126,14 @@
   tfailedNodeId = signal->theData[0];
   hostptr.i = tfailedNodeId;
   ptrCheckGuard(hostptr, chostFilesize, hostRecord);
-  switch (hostptr.p->takeOverStatus) {
-  case TOS_IDLE:
-    jam();
-    /*------------------------------------------------------------*/
-    /*       THIS MESSAGE ARRIVED EVEN BEFORE THE NODE_FAILREP    */
-    /*       MESSAGE. THIS IS POSSIBLE IN EXTREME SITUATIONS.     */
-    /*       WE SET THE STATE TO TAKE_OVER_COMPLETED AND WAIT     */
-    /*       FOR THE NODE_FAILREP MESSAGE.                        */
-    /*------------------------------------------------------------*/
-    hostptr.p->takeOverStatus = TOS_COMPLETED;
-    break;
-  case TOS_NODE_FAILED:
-  case TOS_ACTIVE:
-    jam();
-    /*------------------------------------------------------------*/
-    /*       WE ARE NOT MASTER AND THE TAKE OVER IS ACTIVE OR WE  */
-    /*       ARE MASTER AND THE TAKE OVER IS ACTIVE. IN BOTH      */
-    /*       WE SET THE STATE TO TAKE_OVER_COMPLETED.             */
-    /*------------------------------------------------------------*/
-    /*       RELEASE THE CURRENTLY UNUSED LQH CONNECTIONS. THE    */
-    /*       REMAINING WILL BE RELEASED WHEN THE TRANSACTION THAT */
-    /*       USED THEM IS COMPLETED.                              */
-    /*------------------------------------------------------------*/
-    hostptr.p->takeOverStatus = TOS_COMPLETED;
-    {
-      NFCompleteRep * const nfRep = (NFCompleteRep *)&signal->theData[0];
-      nfRep->blockNo      = DBTC;
-      nfRep->nodeId       = cownNodeid;
-      nfRep->failedNodeId = hostptr.i;
-    }
-    sendSignal(cdihblockref, GSN_NF_COMPLETEREP, signal, 
-               NFCompleteRep::SignalLength, JBB);
-    break;
-  case TOS_COMPLETED:
-    jam();
-    /*------------------------------------------------------------*/
-    /*       WE HAVE ALREADY RECEIVED THE CONF SIGNAL. IT IS MOST */
-    /*       LIKELY SENT FROM A NEW MASTER WHICH WASN'T SURE IF   */
-    /*       THIS NODE HEARD THE CONF SIGNAL FROM THE OLD MASTER. */
-    /*       WE SIMPLY IGNORE THE MESSAGE.                        */
-    /*------------------------------------------------------------*/
-    /*empty*/;
-    break;
-  default:
+
+  if (signal->getSendersBlockRef() != reference())
+  {
     jam();
-    systemErrorLab(signal, __LINE__);
     return;
-  }//switch
+  }
+  
+  checkNodeFailComplete(signal, hostptr.i, HostRecord::NF_TAKEOVER);
 }//Dbtc::execTAKE_OVERTCCONF()
 
 void Dbtc::execTAKE_OVERTCREQ(Signal* signal) 
@@ -7372,16 +7373,10 @@
     /*       TO REPORT THE COMPLETION OF THE TAKE OVER TO ALL     */
     /*       NODES THAT ARE ALIVE.                                */
     /*------------------------------------------------------------*/
-    for (hostptr.i = 1; hostptr.i < MAX_NDB_NODES; hostptr.i++) {
-      jam();
-      ptrAss(hostptr, hostRecord);
-      if (hostptr.p->hostStatus == HS_ALIVE) {
-        jam();
-        tblockref = calcTcBlockRef(hostptr.i);
-        signal->theData[0] = tcNodeFailptr.p->takeOverNode;
-        sendSignal(tblockref, GSN_TAKE_OVERTCCONF, signal, 1, JBB);
-      }//if
-    }//for
+    NodeReceiverGroup rg(DBTC, c_alive_nodes);
+    signal->theData[0] = tcNodeFailptr.p->takeOverNode;
+    sendSignal(rg, GSN_TAKE_OVERTCCONF, signal, 1, JBB);
+    
     if (tcNodeFailptr.p->queueIndex > 0) {
       jam();
       /*------------------------------------------------------------*/
@@ -8063,6 +8058,7 @@
   apiConnectptr.p->ndbapiBlockref = 0;
   apiConnectptr.p->ndbapiConnect = 0;
   apiConnectptr.p->buddyPtr = RNIL;
+  apiConnectptr.p->m_transaction_nodes.clear();
   setApiConTimer(apiConnectptr.i, 0, __LINE__);
   switch(ttransStatus){
   case LqhTransConf::Committed:
@@ -9890,6 +9886,7 @@
     apiConnectptr.p->executingIndexOp = RNIL;
     apiConnectptr.p->buddyPtr = RNIL;
     apiConnectptr.p->currSavePointId = 0;
+    apiConnectptr.p->m_transaction_nodes.clear();
   }//for
   apiConnectptr.i = tiacTmp - 1;
   ptrCheckGuard(apiConnectptr, capiConnectFilesize, apiConnectRecord);
@@ -9917,6 +9914,7 @@
       apiConnectptr.p->executingIndexOp = RNIL;
       apiConnectptr.p->buddyPtr = RNIL;
       apiConnectptr.p->currSavePointId = 0;
+      apiConnectptr.p->m_transaction_nodes.clear();
     }//for
   apiConnectptr.i = (2 * tiacTmp) - 1;
   ptrCheckGuard(apiConnectptr, capiConnectFilesize, apiConnectRecord);
@@ -9944,6 +9942,7 @@
     apiConnectptr.p->executingIndexOp = RNIL;
     apiConnectptr.p->buddyPtr = RNIL;
     apiConnectptr.p->currSavePointId = 0;
+    apiConnectptr.p->m_transaction_nodes.clear();
   }//for
   apiConnectptr.i = (3 * tiacTmp) - 1;
   ptrCheckGuard(apiConnectptr, capiConnectFilesize, apiConnectRecord);
@@ -10004,13 +10003,13 @@
     ptrAss(hostptr, hostRecord);
     hostptr.p->hostStatus = HS_DEAD;
     hostptr.p->inPackedList = false;
-    hostptr.p->takeOverStatus = TOS_NOT_DEFINED;
     hostptr.p->lqhTransStatus = LTS_IDLE;
     hostptr.p->noOfWordsTCKEYCONF = 0;
     hostptr.p->noOfWordsTCINDXCONF = 0;
     hostptr.p->noOfPackedWordsLqh = 0;
     hostptr.p->hostLqhBlockRef = calcLqhBlockRef(hostptr.i);
   }//for
+  c_alive_nodes.clear();
 }//Dbtc::inithost()
 
 void Dbtc::initialiseRecordsLab(Signal* signal, UintR Tdata0, 
@@ -10263,6 +10262,7 @@
   }//while
   apiConnectptr.p->firstTcConnect = RNIL;
   apiConnectptr.p->lastTcConnect = RNIL;
+  apiConnectptr.p->m_transaction_nodes.clear();
 
   // MASV let state be CS_ABORTING until all 
   // signals in the "air" have been received. Reset to CS_CONNECTED
@@ -10336,6 +10336,7 @@
   cfirstfreeApiConnect = TlocalApiConnectptr.i;
   setApiConTimer(TlocalApiConnectptr.i, 0, __LINE__);
   TlocalApiConnectptr.p->apiConnectstate = CS_DISCONNECTED;
+  ndbassert(TlocalApiConnectptr.p->m_transaction_nodes.isclear());
   ndbassert(TlocalApiConnectptr.p->apiScanRec == RNIL);
   TlocalApiConnectptr.p->ndbapiBlockref = 0;
 }//Dbtc::releaseApiCon()
@@ -10870,6 +10871,34 @@
     infoEvent("IndexOpCount: pool: %d free: %d", 
 	      c_theIndexOperationPool.getSize(),
 	      c_theIndexOperationPool.getNoOfFree());
+  }
+
+  if (dumpState->args[0] == 2514)
+  {
+    if (signal->getLength() == 2)
+    {
+      dumpState->args[0] = DumpStateOrd::TcDumpOneApiConnectRec;
+      execDUMP_STATE_ORD(signal);
+    }
+
+    NodeReceiverGroup rg(CMVMI, c_alive_nodes);
+    dumpState->args[0] = 15;
+    sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBB);
+
+    signal->theData[0] = 2515;
+    sendSignalWithDelay(cownref, GSN_DUMP_STATE_ORD, signal, 1000, 1);    
+    return;
+  }
+
+  if (dumpState->args[0] == 2515)
+  {
+    NdbNodeBitmask mask = c_alive_nodes;
+    mask.clear(getOwnNodeId());
+    NodeReceiverGroup rg(NDBCNTR, mask);
+    
+    sendSignal(rg, GSN_SYSTEM_ERROR, signal, 1, JBB);
+    sendSignalWithDelay(cownref, GSN_SYSTEM_ERROR, signal, 300, 1);    
+    return;
   }
 }//Dbtc::execDUMP_STATE_ORD()
 

--- 1.26/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp	2006-01-27 18:17:33 +01:00
+++ 1.27/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp	2006-03-24 17:11:37 +01:00
@@ -278,6 +278,7 @@
 
 void Qmgr::execCONNECT_REP(Signal* signal)
 {
+  jamEntry();
   const Uint32 nodeId = signal->theData[0];
   c_connectedNodes.set(nodeId);
   NodeRecPtr nodePtr;
@@ -285,9 +286,13 @@
   ptrCheckGuard(nodePtr, MAX_NODES, nodeRec);
   switch(nodePtr.p->phase){
   case ZSTARTING:
+  case ZRUNNING:
     jam();
+    if(!c_start.m_nodes.isWaitingFor(nodeId)){
+      jam();
+      return;
+    }
     break;
-  case ZRUNNING:
   case ZPREPARE_FAIL:
   case ZFAIL_CLOSING:
     jam();
@@ -298,21 +303,28 @@
   case ZAPI_INACTIVE:
     return;
   }
-
-  if(!c_start.m_nodes.isWaitingFor(nodeId)){
-    jam();
-    return;
-  }
-
+  
   switch(c_start.m_gsn){
   case GSN_CM_REGREQ:
     jam();
     sendCmRegReq(signal, nodeId);
     return;
-  case GSN_CM_NODEINFOREQ:{
+  case GSN_CM_NODEINFOREQ:
     jam();
     sendCmNodeInfoReq(signal, nodeId, nodePtr.p);
     return;
+  case GSN_CM_ADD:{
+    jam();
+
+    ndbrequire(getOwnNodeId() != cpresident);
+    c_start.m_nodes.clearWaitingFor(nodeId);
+    c_start.m_gsn = RNIL;
+    
+    NodeRecPtr addNodePtr;
+    addNodePtr.i = nodeId;
+    ptrCheckGuard(addNodePtr, MAX_NDB_NODES, nodeRec);
+    cmAddPrepare(signal, addNodePtr, nodePtr.p);
+    return;
   }
   default:
     return;
@@ -945,15 +957,27 @@
     return;
   case ZFAIL_CLOSING:
     jam();
-//#ifdef VM_TRACE
-    ndbout_c("Enabling communication to CM_ADD node state=%d", 
-	     nodePtr.p->phase);
-//#endif
+    
+#if 1
+    warningEvent("Recieved request to incorperate node %u, "
+ 		 "while error handling has not yet completed",
+ 		 nodePtr.i);
+    
+    ndbrequire(getOwnNodeId() != cpresident);
+    ndbrequire(signal->header.theVerId_signalNumber == GSN_CM_ADD);
+    c_start.m_nodes.clearWaitingFor();
+    c_start.m_nodes.setWaitingFor(nodePtr.i);
+    c_start.m_gsn = GSN_CM_ADD;
+#else
+    warningEvent("Enabling communication to CM_ADD node %u state=%d", 
+ 		 nodePtr.i,
+ 		 nodePtr.p->phase);
     nodePtr.p->phase = ZSTARTING;
     nodePtr.p->failState = NORMAL;
     signal->theData[0] = 0;
     signal->theData[1] = nodePtr.i;
     sendSignal(CMVMI_REF, GSN_OPEN_COMREQ, signal, 2, JBA);
+#endif
     return;
   case ZSTARTING:
     break;
@@ -1788,11 +1812,27 @@
 
   jamEntry();
   failedNodePtr.i = signal->theData[0];  
+
+  if (ERROR_INSERTED(930))
+  {
+    CLEAR_ERROR_INSERT_VALUE;
+    infoEvent("Discarding NDB_FAILCONF for %u", failedNodePtr.i);
+    return;
+  }
+  
   ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
   if (failedNodePtr.p->failState == WAITING_FOR_NDB_FAILCONF){
     failedNodePtr.p->failState = NORMAL;
   } else {
     jam();
+
+    char buf[100];
+    BaseString::snprintf(buf, 100, 
+			 "Received NDB_FAILCONF for node %u with state: %d %d",
+			 failedNodePtr.i,
+			 failedNodePtr.p->phase,
+			 failedNodePtr.p->failState);
+    progError(__LINE__, 0, buf);
     systemErrorLab(signal, __LINE__);
   }//if
   if (cpresident == getOwnNodeId()) {
@@ -2114,10 +2154,42 @@
   ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
   if (failedNodePtr.i == getOwnNodeId()) {
     jam();
-    systemErrorLab(signal, __LINE__);
+
+    const char * msg = 0;
+    switch(aFailCause){
+    case FailRep::ZOWN_FAILURE: 
+      msg = "Own failure"; 
+      break;
+    case FailRep::ZOTHER_NODE_WHEN_WE_START: 
+    case FailRep::ZOTHERNODE_FAILED_DURING_START:
+      msg = "Other node died during start"; 
+      break;
+    case FailRep::ZIN_PREP_FAIL_REQ:
+      msg = "Prep fail";
+      break;
+    case FailRep::ZSTART_IN_REGREQ:
+      msg = "Start timeout";
+      break;
+    case FailRep::ZHEARTBEAT_FAILURE:
+      msg = "Hearbeat failure";
+      break;
+    case FailRep::ZLINK_FAILURE:
+      msg = "Connection failure";
+      break;
+    }
+    
+    // Argument order must follow the format: %u, %u, %s (text), %u (code).
+    char buf[100];
+    BaseString::snprintf(buf, sizeof(buf), 
+			 "We(%u) have been declared dead by %u reason: %s(%u)",
+			 getOwnNodeId(),
+			 refToNode(signal->getSendersBlockRef()),
+			 msg ? msg : "<Unknown>",
+			 aFailCause);
+    progError(__LINE__, 0, buf);
     return;
   }//if
-
+  
   myNodePtr.i = getOwnNodeId();
   ptrCheckGuard(myNodePtr, MAX_NDB_NODES, nodeRec);
   if (myNodePtr.p->phase != ZRUNNING) {
@@ -2828,6 +2900,7 @@
         cfailureNr = cprepareFailureNr;
         ctoFailureNr = 0;
         ctoStatus = Q_ACTIVE;
+	c_start.reset(); // Don't take over nodes being started
         if (cnoCommitFailedNodes > 0) {
           jam();
 	  /**-----------------------------------------------------------------

--- 1.8/storage/ndb/include/mgmapi/ndbd_exit_codes.h	2005-10-11 17:05:50 +02:00
+++ 1.9/storage/ndb/include/mgmapi/ndbd_exit_codes.h	2006-03-24 17:11:35 +01:00
@@ -74,9 +74,7 @@
 #define NDBD_EXIT_SR_OTHERNODEFAILED          2308
 #define NDBD_EXIT_NODE_NOT_DEAD               2309
 #define NDBD_EXIT_SR_REDOLOG                  2310
-/*
 #define NDBD_EXIT_SR_RESTARTCONFLICT          2311
-*/
 #define NDBD_EXIT_NO_MORE_UNDOLOG             2312 
 #define NDBD_EXIT_SR_UNDOLOG                  2313 
 #define NDBD_EXIT_MEMALLOC                    2327

--- 1.6/storage/ndb/src/kernel/error/ndbd_exit_codes.c	2005-10-11 17:05:51 +02:00
+++ 1.7/storage/ndb/src/kernel/error/ndbd_exit_codes.c	2006-03-24 17:11:38 +01:00
@@ -85,6 +85,9 @@
    */
    {NDBD_EXIT_OS_SIGNAL_RECEIVED, XIE, "Error OS signal received"},
 
+   {NDBD_EXIT_SR_RESTARTCONFLICT, XRE,
+    "Partial system restart causing conflicting file systems"},
+   
    /* VM */
    {NDBD_EXIT_OUT_OF_LONG_SIGNAL_MEMORY,    XCR,
     "Signal lost, out of long signal memory, please increase LongMessageBuffer"},

--- 1.12/storage/ndb/src/kernel/vm/TransporterCallback.cpp	2006-01-26 09:48:42 +01:00
+++ 1.13/storage/ndb/src/kernel/vm/TransporterCallback.cpp	2006-03-24 17:11:38 +01:00
@@ -55,7 +55,8 @@
 const char *lookupConnectionError(Uint32 err)
 {
   int i= 0;
-  while ((Uint32)connectionError[i].err != err && (Uint32)connectionError[i].err != -1)
+  while ((Uint32)connectionError[i].err != err &&
+         (Uint32)connectionError[i].err != -1)
     i++;
   return connectionError[i].text;
 }

--- 1.48/storage/ndb/src/ndbapi/NdbTransaction.cpp	2005-09-09 15:26:43 +02:00
+++ 1.49/storage/ndb/src/ndbapi/NdbTransaction.cpp	2006-03-24 17:11:39 +01:00
@@ -434,12 +434,12 @@
 //------------------------------------------------------------------------
   Ndb* tNdb = theNdb;
 
+  Uint32 timeout = TransporterFacade::instance()->m_waitfor_timeout;
   m_waitForReply = false;
   executeAsynchPrepare(aTypeOfExec, NULL, NULL, abortOption);
   if (m_waitForReply){
     while (1) {
-      int noOfComp = tNdb->sendPollNdb((3 * WAITFOR_RESPONSE_TIMEOUT),
-                                       1, forceSend);
+      int noOfComp = tNdb->sendPollNdb(3 * timeout, 1, forceSend);
       if (noOfComp == 0) {
         /** 
          * This timeout situation can occur if NDB crashes.

--- 1.34/storage/ndb/src/ndbapi/Ndbif.cpp	2005-10-07 10:58:07 +02:00
+++ 1.35/storage/ndb/src/ndbapi/Ndbif.cpp	2006-03-24 17:11:39 +01:00
@@ -1034,23 +1034,25 @@
 void
 Ndb::check_send_timeout()
 {
+  Uint32 timeout = TransporterFacade::instance()->m_waitfor_timeout;
   NDB_TICKS current_time = NdbTick_CurrentMillisecond();
   if (current_time - the_last_check_time > 1000) {
     the_last_check_time = current_time;
     Uint32 no_of_sent = theNoOfSentTransactions;
     for (Uint32 i = 0; i < no_of_sent; i++) {
       NdbTransaction* a_con = theSentTransactionsArray[i];
-      if ((current_time - a_con->theStartTransTime) >
-          WAITFOR_RESPONSE_TIMEOUT) {
+      if ((current_time - a_con->theStartTransTime) > timeout)
+      {
 #ifdef VM_TRACE
         a_con->printState();
 	Uint32 t1 = a_con->theTransactionId;
 	Uint32 t2 = a_con->theTransactionId >> 32;
-	ndbout_c("[%.8x %.8x]", t1, t2);
-	abort();
+	ndbout_c("4012 [%.8x %.8x]", t1, t2);
+	//abort();
 #endif
+        a_con->theReleaseOnClose = true;
         a_con->setOperationErrorCodeAbort(4012);
-        a_con->theCommitStatus = NdbTransaction::Aborted;
+	a_con->theCommitStatus = NdbTransaction::NeedAbort;
         a_con->theCompletionStatus = NdbTransaction::CompletedFailure;
         a_con->handleExecuteCompletion();
         remove_sent_list(i);

--- 1.44/storage/ndb/src/ndbapi/TransporterFacade.cpp	2005-10-14 11:55:29 +02:00
+++ 1.45/storage/ndb/src/ndbapi/TransporterFacade.cpp	2006-03-24 17:11:39 +01:00
@@ -724,6 +724,19 @@
     m_batch_size= batch_size;
   }
   
+  Uint32 timeout = 120000;
+  iter.first();
+  for (iter.first(); iter.valid(); iter.next())
+  {
+    Uint32 tmp1 = 0, tmp2 = 0;
+    iter.get(CFG_DB_TRANSACTION_CHECK_INTERVAL, &tmp1);
+    iter.get(CFG_DB_TRANSACTION_DEADLOCK_TIMEOUT, &tmp2);
+    tmp1 += tmp2;
+    if (tmp1 > timeout)
+      timeout = tmp1;
+  }
+  m_waitfor_timeout = timeout;
+  
   if (!theTransporterRegistry->start_service(m_socket_server)){
     ndbout_c("Unable to start theTransporterRegistry->start_service");
     DBUG_RETURN(false);

--- 1.28/storage/ndb/src/ndbapi/TransporterFacade.hpp	2005-09-15 12:30:51 +02:00
+++ 1.29/storage/ndb/src/ndbapi/TransporterFacade.hpp	2006-03-24 17:11:39 +01:00
@@ -122,6 +122,7 @@
   Uint32 get_scan_batch_size();
   Uint32 get_batch_byte_size();
   Uint32 get_batch_size();
+  Uint32 m_waitfor_timeout; // in milli seconds...
 
   TransporterRegistry* get_registry() { return theTransporterRegistry;};
 

--- 1.5/storage/ndb/test/include/NdbRestarter.hpp	2005-04-08 02:44:22 +02:00
+++ 1.6/storage/ndb/test/include/NdbRestarter.hpp	2006-03-24 17:11:39 +01:00
@@ -62,6 +62,7 @@
   int dumpStateAllNodes(int * _args, int _num_args);
 
   int getMasterNodeId();
+  int getRandomNodeSameNodeGroup(int nodeId, int randomNumber);
   int getRandomNodeOtherNodeGroup(int nodeId, int randomNumber);
   int getRandomNotMasterNodeId(int randomNumber);
   

--- 1.20/storage/ndb/test/ndbapi/testNodeRestart.cpp	2005-06-13 15:16:12 +02:00
+++ 1.21/storage/ndb/test/ndbapi/testNodeRestart.cpp	2006-03-24 17:11:39 +01:00
@@ -21,6 +21,7 @@
 #include <NdbRestarter.hpp>
 #include <NdbRestarts.hpp>
 #include <Vector.hpp>
+#include <signaldata/DumpStateOrd.hpp>
 
 
 int runLoadTable(NDBT_Context* ctx, NDBT_Step* step){
@@ -409,6 +410,253 @@
   return NDBT_OK;
 }
 
+int runBug15587(NDBT_Context* ctx, NDBT_Step* step){
+  // Restart one data node with DumpStateOrd::LqhErrorInsert5042 armed for
+  // the test table, wait for it to fall back to "no start", then start it
+  // again.  Returns NDBT_OK on success, NDBT_FAILED on the first failing step.
+  NdbRestarter restarter;
+  
+  Uint32 tableId = ctx->getTab()->getTableId();
+  int dump[2] = { DumpStateOrd::LqhErrorInsert5042, 0 };
+  dump[1] = tableId;
+
+  int nodeId = restarter.getDbNodeId(1);
+
+  ndbout << "Restart node " << nodeId << endl; 
+  
+  if (restarter.restartOneDbNode(nodeId,
+				 /** initial */ false, 
+				 /** nostart */ true,
+				 /** abort   */ true))
+    return NDBT_FAILED;
+  
+  if (restarter.waitNodesNoStart(&nodeId, 1))
+    return NDBT_FAILED; 
+   
+  if (restarter.dumpStateOneNode(nodeId, dump, 2))
+    return NDBT_FAILED;
+
+  if (restarter.startNodes(&nodeId, 1))
+    return NDBT_FAILED;
+  // Return value is not checked here; the no-start wait below decides.
+  restarter.waitNodesStartPhase(&nodeId, 1, 3);
+  
+  if (restarter.waitNodesNoStart(&nodeId, 1))
+    return NDBT_FAILED; 
+   
+  if (restarter.startNodes(&nodeId, 1))
+    return NDBT_FAILED;
+  
+  if (restarter.waitNodesStarted(&nodeId, 1))
+    return NDBT_FAILED;
+  
+  ctx->stopTest();
+  return NDBT_OK;
+}
+
+int runBug15632(NDBT_Context* ctx, NDBT_Step* step){
+  // Restart one data node twice from "no start": first with error insert
+  // 7165, then with error insert 7171.  Both restarts must reach "started".
+  // Returns NDBT_OK on success, NDBT_FAILED on the first failing step.
+  NdbRestarter restarter;
+  
+  int nodeId = restarter.getDbNodeId(1);
+
+  ndbout << "Restart node " << nodeId << endl; 
+  
+  if (restarter.restartOneDbNode(nodeId,
+				 /** initial */ false, 
+				 /** nostart */ true,
+				 /** abort   */ true))
+    return NDBT_FAILED;
+  
+  if (restarter.waitNodesNoStart(&nodeId, 1))
+    return NDBT_FAILED; 
+   
+  if (restarter.insertErrorInNode(nodeId, 7165))
+    return NDBT_FAILED;
+  
+  if (restarter.startNodes(&nodeId, 1))
+    return NDBT_FAILED;
+
+  if (restarter.waitNodesStarted(&nodeId, 1))
+    return NDBT_FAILED;
+
+  if (restarter.restartOneDbNode(nodeId,
+				 /** initial */ false, 
+				 /** nostart */ true,
+				 /** abort   */ true))
+    return NDBT_FAILED;
+  
+  if (restarter.waitNodesNoStart(&nodeId, 1))
+    return NDBT_FAILED; 
+   
+  if (restarter.insertErrorInNode(nodeId, 7171))
+    return NDBT_FAILED;
+  
+  if (restarter.startNodes(&nodeId, 1))
+    return NDBT_FAILED;
+  
+  if (restarter.waitNodesStarted(&nodeId, 1))
+    return NDBT_FAILED;
+  
+  ctx->stopTest();
+  return NDBT_OK;
+}
+
+int runBug15685(NDBT_Context* ctx, NDBT_Step* step){
+  // Roll back an open update while every node has error 5100 inserted, then
+  // wait for the cluster to be started again and clear the error insert.
+  Ndb* pNdb = GETNDB(step);
+  HugoOperations hugoOps(*ctx->getTab());
+  NdbRestarter restarter;
+  HugoTransactions hugoTrans(*ctx->getTab());
+  if (hugoTrans.loadTable(GETNDB(step), 10) != 0){
+    goto err;
+  }
+
+  if(hugoOps.startTransaction(pNdb) != 0)
+    goto err;
+  
+  if(hugoOps.pkUpdateRecord(pNdb, 0, 1, rand()) != 0)
+    goto err;
+
+  if(hugoOps.execute_NoCommit(pNdb) != 0)
+    goto err;
+
+  if (restarter.insertErrorInAllNodes(5100))
+    goto err;
+  // The rollback result is not checked; the cluster wait below decides.
+  hugoOps.execute_Rollback(pNdb);
+
+  if (restarter.waitClusterStarted() != 0)
+    goto err;
+
+  if (restarter.insertErrorInAllNodes(0))
+    goto err;
+  
+  ctx->stopTest();
+  return NDBT_OK;
+  
+err:
+  ctx->stopTest();
+  return NDBT_FAILED;
+}
+
+int 
+runBug16772(NDBT_Context* ctx, NDBT_Step* step){
+  // Checks that a restarting node stalls until NDB_FAILCONF is delivered.
+  NdbRestarter restarter;
+  if (restarter.getNumDbNodes() < 2)
+  {
+    ctx->stopTest();
+    return NDBT_OK;
+  }
+  // Pick a non-master node to keep alive and a different node to restart.
+  int aliveNodeId = restarter.getRandomNotMasterNodeId(rand());
+  int deadNodeId = aliveNodeId;
+  while (deadNodeId == aliveNodeId)
+    deadNodeId = restarter.getDbNodeId(rand() % restarter.getNumDbNodes());
+  // Error insert 930 is what makes the alive node throw away NDB_FAILCONF.
+  if (restarter.insertErrorInNode(aliveNodeId, 930))
+    return NDBT_FAILED;
+
+  if (restarter.restartOneDbNode(deadNodeId,
+				 /** initial */ false, 
+				 /** nostart */ true,
+				 /** abort   */ true))
+    return NDBT_FAILED;
+  
+  if (restarter.waitNodesNoStart(&deadNodeId, 1))
+    return NDBT_FAILED;
+
+  if (restarter.startNodes(&deadNodeId, 1))
+    return NDBT_FAILED;
+
+  // The start should now hang because NDB_FAILCONF has been thrown away ...
+  int ret = restarter.waitNodesStartPhase(&deadNodeId, 1, 3, 10);
+  // ... so this wait is expected to fail: start phase 3 must not be reached.
+  // (The last argument is presumably a timeout -- TODO confirm.)
+  // Now send an NDB_FAILCONF for deadNodeId through a dump on the alive node.
+  int dump[] = { 7020, 323, 252, 0 };
+  dump[3] = deadNodeId;
+  if (restarter.dumpStateOneNode(aliveNodeId, dump, 4))
+    return NDBT_FAILED;
+  
+  if (restarter.waitNodesStarted(&deadNodeId, 1))
+    return NDBT_FAILED;
+  // A non-zero ret means the phase-3 wait failed, which is the expected case.
+  return ret ? NDBT_OK : NDBT_FAILED;
+}
+
+int 
+runBug18414(NDBT_Context* ctx, NDBT_Step* step){
+  // Up to five rounds: roll back an open update with error 5003 inserted on a
+  // node of the connected node's group, restart it, then scan-update the table.
+  NdbRestarter restarter;
+  if (restarter.getNumDbNodes() < 2)
+  {
+    ctx->stopTest();
+    return NDBT_OK;
+  }
+
+  Ndb* pNdb = GETNDB(step);
+  HugoOperations hugoOps(*ctx->getTab());
+  HugoTransactions hugoTrans(*ctx->getTab());
+  int loop = 0;
+  do 
+  {
+    if(hugoOps.startTransaction(pNdb) != 0)
+      goto err;
+    
+    if(hugoOps.pkUpdateRecord(pNdb, 0, 128, rand()) != 0)
+      goto err;
+    
+    if(hugoOps.execute_NoCommit(pNdb) != 0)
+      goto err;
+
+    int node1 = hugoOps.getTransaction()->getConnectedNodeId();
+    int node2 = restarter.getRandomNodeSameNodeGroup(node1, rand());
+    if (node1 == -1 || node2 == -1)
+    {
+      // No usable node pair; close the open transaction before leaving.
+      hugoOps.closeTransaction(pNdb);
+      break;
+    }
+    if (loop & 1)
+    {
+      if (restarter.insertErrorInNode(node1, 8050))
+        goto err;
+    }
+    if (restarter.insertErrorInNode(node2, 5003))
+      goto err;
+    
+    // The rollback result is not checked; the no-start wait below decides.
+    hugoOps.execute_Rollback(pNdb);
+    if (restarter.waitNodesNoStart(&node2, 1) != 0)
+      goto err;
+    
+    if (restarter.insertErrorInAllNodes(0))
+      goto err;
+    
+    if (restarter.startNodes(&node2, 1) != 0)
+      goto err;
+    
+    if (restarter.waitClusterStarted() != 0)
+      goto err;
+    
+    if (hugoTrans.scanUpdateRecords(pNdb, 128) != 0)
+      goto err;
+
+    hugoOps.closeTransaction(pNdb);
+  } while(++loop < 5);
+  
+  return NDBT_OK;
+err:
+  hugoOps.closeTransaction(pNdb);
+  return NDBT_FAILED;
+}
+
 NDBT_TESTSUITE(testNodeRestart);
 TESTCASE("NoLoad", 
 	 "Test that one node at a time can be stopped and then restarted "\
@@ -558,6 +806,8 @@
   INITIALIZER(runCheckAllNodesStarted);
   INITIALIZER(runLoadTable);
   STEP(runRestarts);
+  STEP(runPkUpdateUntilStopped);
+  STEP(runScanUpdateUntilStopped);
   FINALIZER(runScanReadVerify);
   FINALIZER(runClearTable);
 }
@@ -647,6 +897,8 @@
   INITIALIZER(runCheckAllNodesStarted);
   INITIALIZER(runLoadTable);
   STEP(runRestarts);
+  STEP(runPkUpdateUntilStopped);
+  STEP(runScanUpdateUntilStopped);
   FINALIZER(runScanReadVerify);
   FINALIZER(runClearTable);
 }
@@ -669,6 +921,34 @@
 	 "Test commit after node failure"){
   INITIALIZER(runLoadTable);
   STEP(runLateCommit);