List:Commits« Previous MessageNext Message »
From:jonas Date:October 6 2006 4:05pm
Subject:bk commit into 4.1 tree (jonas:1.2550) BUG#22893
View as plain text  
Below is the list of changes that have just been committed into a local
4.1 repository of jonas. When jonas does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet@stripped, 2006-10-06 16:05:46+02:00, jonas@stripped +2 -0
  ndb - bug#22893
    Check for REDO earlier during SR (system restart)
        so take-over of the node can be performed
        if it can't be restarted using logs
        (which btw is really weird...as it _should_ be able to use the logs of the other node in
the node group)
  
    Otherwise the cluster could be started even though 1 fragment on one node could not be
restored,
    making the cluster inconsistent, VERY BAD

  ndb/src/kernel/blocks/dbdih/Dbdih.hpp@stripped, 2006-10-06 16:05:44+02:00,
jonas@stripped +2 -0
    Break out the method which searches for REDO for a fragment, so it can be used earlier
during SR

  ndb/src/kernel/blocks/dbdih/DbdihMain.cpp@stripped, 2006-10-06 16:05:44+02:00,
jonas@stripped +90 -71
    Check for REDO earlier during SR (system restart)
      so take-over of the node can be performed
      if it can't be restarted using logs
      (which btw is really weird...as it _should_ be able to use the logs of the other node in
node group)
    

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User:	jonas
# Host:	perch.ndb.mysql.com
# Root:	/home/jonas/src/41-work

--- 1.10/ndb/src/kernel/blocks/dbdih/Dbdih.hpp	2006-10-06 16:05:49 +02:00
+++ 1.11/ndb/src/kernel/blocks/dbdih/Dbdih.hpp	2006-10-06 16:05:49 +02:00
@@ -1044,6 +1044,8 @@
   void removeStoredReplica(FragmentstorePtr regFragptr,
                            ReplicaRecordPtr replicaPtr);
   void searchStoredReplicas(FragmentstorePtr regFragptr);
+  bool setup_create_replica(FragmentstorePtr, CreateReplicaRecord*,
+			    ConstPtr<ReplicaRecord>);
   void updateNodeInfo(FragmentstorePtr regFragptr);
 
 //------------------------------------

--- 1.41/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp	2006-10-06 16:05:49 +02:00
+++ 1.42/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp	2006-10-06 16:05:49 +02:00
@@ -8344,14 +8344,30 @@
 
 	  resetReplicaLcp(replicaPtr.p, newestRestorableGCI);
 
-	  /* -----------------------------------------------------------------
-	   *   LINK THE REPLICA INTO THE STORED REPLICA LIST. WE WILL USE THIS
-	   *   NODE AS A STORED REPLICA.                                      
-	   *   WE MUST FIRST LINK IT OUT OF THE LIST OF OLD STORED REPLICAS.  
-	   * --------------------------------------------------------------- */
-	  removeOldStoredReplica(fragPtr, replicaPtr);
-	  linkStoredReplica(fragPtr, replicaPtr);
-
+	  /**
+	   * Make sure we can also find REDO for restoring replica...
+	   */
+	  {
+	    CreateReplicaRecord createReplica;
+	    ConstPtr<ReplicaRecord> constReplicaPtr;
+	    constReplicaPtr.i = replicaPtr.i;
+	    constReplicaPtr.p = replicaPtr.p;
+	    if (setup_create_replica(fragPtr,
+				     &createReplica, constReplicaPtr))
+	    {
+	      removeOldStoredReplica(fragPtr, replicaPtr);
+	      linkStoredReplica(fragPtr, replicaPtr);
+	    }
+	    else
+	    {
+	      infoEvent("Forcing take-over of node %d due to unsufficient REDO"
+			" for table %d fragment: %d",
+			nodePtr.i, tabPtr.i, i);
+	      
+	      setNodeActiveStatus(nodePtr.i, 
+				  Sysfile::NS_NotActive_NotTakenOver);
+	    }
+	  }
 	}
         default:
 	  jam();
@@ -12282,16 +12298,75 @@
 /*               CHECKPOINT WITHOUT NEEDING ANY EXTRA LOGGING FACILITIES.*/
 /*               A MAXIMUM OF FOUR NODES IS RETRIEVED.                   */
 /*************************************************************************/
+bool
+Dbdih::setup_create_replica(FragmentstorePtr fragPtr,
+			    CreateReplicaRecord* createReplicaPtrP,
+			    ConstPtr<ReplicaRecord> replicaPtr)
+{
+  createReplicaPtrP->dataNodeId = replicaPtr.p->procNode;
+  createReplicaPtrP->replicaRec = replicaPtr.i;
+
+  /* ----------------------------------------------------------------- */
+  /*   WE NEED TO SEARCH FOR A PROPER LOCAL CHECKPOINT TO USE FOR THE  */
+  /*   SYSTEM RESTART.                                                 */
+  /* ----------------------------------------------------------------- */
+  Uint32 startGci;
+  Uint32 startLcpNo;
+  Uint32 stopGci = SYSFILE->newestRestorableGCI;
+  bool result = findStartGci(replicaPtr,
+			     stopGci,
+			     startGci,
+			     startLcpNo);
+  if (!result) 
+  {
+    jam();
+    /* --------------------------------------------------------------- */
+    /* WE COULD NOT FIND ANY LOCAL CHECKPOINT. THE FRAGMENT THUS DO NOT*/
+    /* CONTAIN ANY VALID LOCAL CHECKPOINT. IT DOES HOWEVER CONTAIN A   */
+    /* VALID FRAGMENT LOG. THUS BY FIRST CREATING THE FRAGMENT AND THEN*/
+    /* EXECUTING THE FRAGMENT LOG WE CAN CREATE THE FRAGMENT AS        */
+    /* DESIRED. THIS SHOULD ONLY OCCUR AFTER CREATING A FRAGMENT.      */
+    /*                                                                 */
+    /* TO INDICATE THAT NO LOCAL CHECKPOINT IS TO BE USED WE SET THE   */
+    /* LOCAL CHECKPOINT TO ZNIL.                                       */
+    /* --------------------------------------------------------------- */
+    createReplicaPtrP->lcpNo = ZNIL;
+  } 
+  else 
+  {
+    jam();
+    /* --------------------------------------------------------------- */
+    /* WE FOUND A PROPER LOCAL CHECKPOINT TO RESTART FROM.             */
+    /* SET LOCAL CHECKPOINT ID AND LOCAL CHECKPOINT NUMBER.            */
+    /* --------------------------------------------------------------- */
+    createReplicaPtrP->lcpNo = startLcpNo;
+    arrGuard(startLcpNo, MAX_LCP_STORED);
+    createReplicaPtrP->createLcpId = replicaPtr.p->lcpId[startLcpNo];
+  }//if
+  
+  
+  /* ----------------------------------------------------------------- */
+  /*   WE HAVE EITHER FOUND A LOCAL CHECKPOINT OR WE ARE PLANNING TO   */
+  /*   EXECUTE THE LOG FROM THE INITIAL CREATION OF THE TABLE. IN BOTH */
+  /*   CASES WE NEED TO FIND A SET OF LOGS THAT CAN EXECUTE SUCH THAT  */
+  /*   WE RECOVER TO THE SYSTEM RESTART GLOBAL CHECKPOINT.             */
+  /* -_--------------------------------------------------------------- */
+  return findLogNodes(createReplicaPtrP, fragPtr, startGci, stopGci);
+}			    
+
 void Dbdih::searchStoredReplicas(FragmentstorePtr fragPtr) 
 {
   Uint32 nextReplicaPtrI;
-  ConstPtr<ReplicaRecord> replicaPtr;
+  Ptr<ReplicaRecord> replicaPtr;
 
   replicaPtr.i = fragPtr.p->storedReplicas;
   while (replicaPtr.i != RNIL) {
     jam();
     ptrCheckGuard(replicaPtr, creplicaFileSize, replicaRecord);
     nextReplicaPtrI = replicaPtr.p->nextReplica;
+    ConstPtr<ReplicaRecord> constReplicaPtr;
+    constReplicaPtr.i = replicaPtr.i;
+    constReplicaPtr.p = replicaPtr.p;
     NodeRecordPtr nodePtr;
     nodePtr.i = replicaPtr.p->procNode;
     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
@@ -12311,69 +12386,13 @@
 	createReplicaPtr.i = cnoOfCreateReplicas;
 	ptrCheckGuard(createReplicaPtr, 4, createReplicaRecord);
 	cnoOfCreateReplicas++;
-	createReplicaPtr.p->dataNodeId = replicaPtr.p->procNode;
-	createReplicaPtr.p->replicaRec = replicaPtr.i;
-	/* ----------------------------------------------------------------- */
-	/*   WE NEED TO SEARCH FOR A PROPER LOCAL CHECKPOINT TO USE FOR THE  */
-	/*   SYSTEM RESTART.                                                 */
-	/* ----------------------------------------------------------------- */
-	Uint32 startGci;
-	Uint32 startLcpNo;
-	Uint32 stopGci = SYSFILE->newestRestorableGCI;
-	bool result = findStartGci(replicaPtr,
-				   stopGci,
-				   startGci,
-				   startLcpNo);
-	if (!result) {
-	  jam();
-	  /* --------------------------------------------------------------- */
-	  /* WE COULD NOT FIND ANY LOCAL CHECKPOINT. THE FRAGMENT THUS DO NOT*/
-	  /* CONTAIN ANY VALID LOCAL CHECKPOINT. IT DOES HOWEVER CONTAIN A   */
-	  /* VALID FRAGMENT LOG. THUS BY FIRST CREATING THE FRAGMENT AND THEN*/
-	  /* EXECUTING THE FRAGMENT LOG WE CAN CREATE THE FRAGMENT AS        */
-	  /* DESIRED. THIS SHOULD ONLY OCCUR AFTER CREATING A FRAGMENT.      */
-	  /*                                                                 */
-	  /* TO INDICATE THAT NO LOCAL CHECKPOINT IS TO BE USED WE SET THE   */
-	  /* LOCAL CHECKPOINT TO ZNIL.                                       */
-	  /* --------------------------------------------------------------- */
-	  createReplicaPtr.p->lcpNo = ZNIL;
-	} else {
-	  jam();
-	  /* --------------------------------------------------------------- */
-	  /* WE FOUND A PROPER LOCAL CHECKPOINT TO RESTART FROM.             */
-	  /* SET LOCAL CHECKPOINT ID AND LOCAL CHECKPOINT NUMBER.            */
-	  /* --------------------------------------------------------------- */
-	  createReplicaPtr.p->lcpNo = startLcpNo;
-	  arrGuard(startLcpNo, MAX_LCP_STORED);
-	  createReplicaPtr.p->createLcpId = replicaPtr.p->lcpId[startLcpNo];
-	}//if
-
-	if(ERROR_INSERTED(7073) || ERROR_INSERTED(7074)){
-	  jam();
-	  nodePtr.p->nodeStatus = NodeRecord::DEAD;
-	}
-
-	/* ----------------------------------------------------------------- */
-	/*   WE HAVE EITHER FOUND A LOCAL CHECKPOINT OR WE ARE PLANNING TO   */
-	/*   EXECUTE THE LOG FROM THE INITIAL CREATION OF THE TABLE. IN BOTH */
-	/*   CASES WE NEED TO FIND A SET OF LOGS THAT CAN EXECUTE SUCH THAT  */
-	/*   WE RECOVER TO THE SYSTEM RESTART GLOBAL CHECKPOINT.             */
-	/* -_--------------------------------------------------------------- */
-	if (!findLogNodes(createReplicaPtr.p, fragPtr, startGci, stopGci)) {
-	  jam();
-	  /* --------------------------------------------------------------- */
-	  /* WE WERE NOT ABLE TO FIND ANY WAY OF RESTORING THIS REPLICA.     */
-	  /* THIS IS A POTENTIAL SYSTEM ERROR.                               */
-	  /* --------------------------------------------------------------- */
-	  cnoOfCreateReplicas--;
-	  return;
-	}//if
-	
-	if(ERROR_INSERTED(7073) || ERROR_INSERTED(7074)){
-	  jam();
-	  nodePtr.p->nodeStatus = NodeRecord::ALIVE;
-	}
 	
+	/**
+	 * Should have been checked in resetReplicaSr
+	 */
+	ndbrequire(setup_create_replica(fragPtr,
+					createReplicaPtr.p, 
+					constReplicaPtr));
 	break;
       }
       default:
Thread
bk commit into 4.1 tree (jonas:1.2550) BUG#22893jonas6 Oct