List: Commits « Previous Message | Next Message »
From: jonas Date: June 26 2007 3:06pm
Subject: bk commit into 5.1 tree (jonas:1.2156) BUG#29331
View as plain text  
Below is the list of changes that have just been committed into a local
5.1 repository of jonas. When jonas does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet@stripped, 2007-06-26 15:06:12+02:00, jonas@stripped +3 -0
  ndb - bug#29331 (wl2325-5.0)
    Add better handling of GCP Stop
    Only kill "offending" node

  storage/ndb/src/kernel/blocks/ERROR_codes.txt@stripped, 2007-06-26 15:06:11+02:00, jonas@stripped +4 -0
    Add new error codes

  storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp@stripped, 2007-06-26 15:06:11+02:00, jonas@stripped +1 -1
    Add better handling of GCP stop

  storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp@stripped, 2007-06-26 15:06:11+02:00, jonas@stripped +154 -27
    Add better handling of GCP stop

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User:	jonas
# Host:	perch.ndb.mysql.com
# Root:	/home/jonas/src/drop5

--- 1.25/storage/ndb/src/kernel/blocks/ERROR_codes.txt	2007-06-26 15:06:15 +02:00
+++ 1.26/storage/ndb/src/kernel/blocks/ERROR_codes.txt	2007-06-26 15:06:15 +02:00
@@ -76,6 +76,10 @@
 
 7183: Crash when receiving COPY_GCIREQ
 
+7184: Crash before starting next GCP after a node failure
+
+7185: Dont reply to COPY_GCI_REQ where reason == GCP
+
 ERROR CODES FOR TESTING NODE FAILURE, LOCAL CHECKPOINT HANDLING:
 -----------------------------------------------------------------
 

--- 1.19/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp	2007-06-26 15:06:15 +02:00
+++ 1.20/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp	2007-06-26 15:06:15 +02:00
@@ -897,7 +897,7 @@
   void ndbsttorry10Lab(Signal *, Uint32 _line);
   void createMutexes(Signal* signal, Uint32 no);
   void createMutex_done(Signal* signal, Uint32 no, Uint32 retVal);
-  void crashSystemAtGcpStop(Signal *);
+  void crashSystemAtGcpStop(Signal *, bool);
   void sendFirstDictfragsreq(Signal *, TabRecordPtr regTabPtr);
   void addtabrefuseLab(Signal *, ConnectRecordPtr regConnectPtr, Uint32 errorCode);
   void GCP_SAVEhandling(Signal *, Uint32 nodeId);

--- 1.67/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp	2007-06-26 15:06:15 +02:00
+++ 1.68/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp	2007-06-26 15:06:15 +02:00
@@ -737,6 +737,12 @@
   ndbrequire(ok);
   
   CRASH_INSERTION(7183);
+
+  if (ERROR_INSERTED(7185) && reason==CopyGCIReq::GLOBAL_CHECKPOINT)
+  {
+    jam();
+    return;
+  }
   
   /* ----------------------------------------------------------------------- */
   /*     WE START BY TRYING TO OPEN THE FIRST RESTORABLE GCI FILE.           */
@@ -3964,6 +3970,11 @@
     CLEAR_ERROR_INSERT_VALUE;
   }
 
+  if (ERROR_INSERTED(7184))
+  {
+    SET_ERROR_INSERT_VALUE(7000);
+  }
+
   /*-------------------------------------------------------------------------*/
   // The first step is to convert from a bit mask to an array of failed nodes.
   /*-------------------------------------------------------------------------*/
@@ -7568,7 +7579,7 @@
           ndbout << "System crash due to GCP Stop in state = ";
           ndbout << (Uint32) cgcpStatus << endl;
 #endif
-          crashSystemAtGcpStop(signal);
+          crashSystemAtGcpStop(signal, false);
           return;
         }//if
       } else {
@@ -7582,7 +7593,7 @@
             ndbout << "System crash due to GCP Stop in state = ";
             ndbout << (Uint32) cgcpStatus << endl;
 #endif
-	    crashSystemAtGcpStop(signal);
+	    crashSystemAtGcpStop(signal, false);
             return;
           }//if
         } else {
@@ -10916,31 +10927,125 @@
  * GCP stop detected, 
  * send SYSTEM_ERROR to all other alive nodes
  */
-void Dbdih::crashSystemAtGcpStop(Signal* signal)
+void Dbdih::crashSystemAtGcpStop(Signal* signal, bool local)
 {
+  if (local)
+    goto dolocal;
+
   switch(cgcpStatus){
+  case GCP_PREPARE_SENT:
+  {
+    jam();
+    /**
+     * We're waiting for a GCP PREPARE CONF
+     */
+    infoEvent("Detected GCP stop(%d)...sending kill to %s", 
+              cgcpStatus, c_GCP_PREPARE_Counter.getText());
+    ndbout_c("Detected GCP stop(%d)...sending kill to %s", 
+             cgcpStatus, c_GCP_PREPARE_Counter.getText());
+    
+    {
+      NodeReceiverGroup rg(DBDIH, c_GCP_PREPARE_Counter);
+      signal->theData[0] = 7022;
+      sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA);
+    }
+    
+    {
+      NodeReceiverGroup rg(NDBCNTR, c_GCP_PREPARE_Counter);
+      SystemError * const sysErr = (SystemError*)&signal->theData[0];
+      sysErr->errorCode = SystemError::GCPStopDetected;
+      sysErr->errorRef = reference();
+      sysErr->data1 = cgcpStatus;
+      sysErr->data2 = cgcpOrderBlocked;
+      sendSignal(rg, GSN_SYSTEM_ERROR, signal, 
+                 SystemError::SignalLength, JBA);
+    }
+    ndbrequire(!c_GCP_PREPARE_Counter.done());
+    return;
+  }
+  case GCP_COMMIT_SENT:
+  {
+    jam();
+    /**
+     * We're waiting for a GCP_NODEFINISH
+     */
+    infoEvent("Detected GCP stop(%d)...sending kill to %s", 
+	      cgcpStatus, c_GCP_COMMIT_Counter.getText());
+    ndbout_c("Detected GCP stop(%d)...sending kill to %s", 
+	     cgcpStatus, c_GCP_COMMIT_Counter.getText());
+    
+    {
+      NodeReceiverGroup rg(DBDIH, c_GCP_COMMIT_Counter);
+      signal->theData[0] = 7022;
+      sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA);
+    }
+
+    {
+      NodeReceiverGroup rg(NDBCNTR, c_GCP_COMMIT_Counter);
+      SystemError * const sysErr = (SystemError*)&signal->theData[0];
+      sysErr->errorCode = SystemError::GCPStopDetected;
+      sysErr->errorRef = reference();
+      sysErr->data1 = cgcpStatus;
+      sysErr->data2 = cgcpOrderBlocked;
+      sendSignal(rg, GSN_SYSTEM_ERROR, signal, 
+                 SystemError::SignalLength, JBA);
+    }
+    ndbrequire(!c_GCP_COMMIT_Counter.done());
+    return;
+  }
   case GCP_NODE_FINISHED:
   {
+    jam();
     /**
      * We're waiting for a GCP save conf
      */
-    ndbrequire(!c_GCP_SAVEREQ_Counter.done());
     NodeReceiverGroup rg(DBLQH, c_GCP_SAVEREQ_Counter);
     signal->theData[0] = 2305;
     sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBB);
     
-    infoEvent("Detected GCP stop...sending kill to %s", 
-	      c_GCP_SAVEREQ_Counter.getText());
-    ndbout_c("Detected GCP stop...sending kill to %s", 
-	     c_GCP_SAVEREQ_Counter.getText());
+    infoEvent("Detected GCP stop(%d)...sending kill to %s", 
+              cgcpStatus, c_GCP_SAVEREQ_Counter.getText());
+    ndbout_c("Detected GCP stop(%d)...sending kill to %s", 
+	     cgcpStatus, c_GCP_SAVEREQ_Counter.getText());
+    ndbrequire(!c_GCP_SAVEREQ_Counter.done());
     return;
   }
   case GCP_SAVE_LQH_FINISHED:
-    ndbout_c("m_copyReason: %d m_waiting: %d",
-	     c_copyGCIMaster.m_copyReason,
-	     c_copyGCIMaster.m_waiting);
-    break;
+  {
+    jam();
+    /**
+     * We're waiting for a COPY_GCICONF
+     */
+    infoEvent("Detected GCP stop(%d)...sending kill to %s", 
+	      cgcpStatus, c_COPY_GCIREQ_Counter.getText());
+    ndbout_c("Detected GCP stop(%d)...sending kill to %s", 
+	     cgcpStatus, c_COPY_GCIREQ_Counter.getText());
+
+    {
+      NodeReceiverGroup rg(DBDIH, c_COPY_GCIREQ_Counter);
+      signal->theData[0] = 7022;
+      sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA);
+    }
+    
+    {
+      NodeReceiverGroup rg(NDBCNTR, c_COPY_GCIREQ_Counter);
+      SystemError * const sysErr = (SystemError*)&signal->theData[0];
+      sysErr->errorCode = SystemError::GCPStopDetected;
+      sysErr->errorRef = reference();
+      sysErr->data1 = cgcpStatus;
+      sysErr->data2 = cgcpOrderBlocked;
+      sendSignal(rg, GSN_SYSTEM_ERROR, signal, 
+                 SystemError::SignalLength, JBA);
+    }
+    ndbrequire(!c_COPY_GCIREQ_Counter.done());
+    return;
+  }
   }
+
+dolocal:  
+  ndbout_c("m_copyReason: %d m_waiting: %d",
+           c_copyGCIMaster.m_copyReason,
+           c_copyGCIMaster.m_waiting);
   
   ndbout_c("c_copyGCISlave: sender{Data, Ref} %d %x reason: %d nextWord: %d",
 	   c_copyGCISlave.m_senderData,
@@ -10997,23 +11102,39 @@
 	   c_TCGETOPSIZEREQ_Counter.getText());
   ndbout_c("c_UPDATE_TOREQ_Counter = %s", c_UPDATE_TOREQ_Counter.getText());
 
-  NodeRecordPtr nodePtr;
-  for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
+  if (local == false)
+  {
     jam();
-    ptrAss(nodePtr, nodeRecord);
-    if (nodePtr.p->nodeStatus == NodeRecord::ALIVE) {
+    NodeRecordPtr nodePtr;
+    for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
       jam();
-      const BlockReference ref = 
-	numberToRef(refToBlock(cntrlblockref), nodePtr.i);
-      SystemError * const sysErr = (SystemError*)&signal->theData[0];
-      sysErr->errorCode = SystemError::GCPStopDetected;
-      sysErr->errorRef = reference();
-      sysErr->data1 = cgcpStatus;
-      sysErr->data2 = cgcpOrderBlocked;
-      sendSignal(ref, GSN_SYSTEM_ERROR, signal, 
-		 SystemError::SignalLength, JBA);
-    }//if
-  }//for
+      ptrAss(nodePtr, nodeRecord);
+      if (nodePtr.p->nodeStatus == NodeRecord::ALIVE) {
+        jam();
+        const BlockReference ref = 
+          numberToRef(refToBlock(cntrlblockref), nodePtr.i);
+        SystemError * const sysErr = (SystemError*)&signal->theData[0];
+        sysErr->errorCode = SystemError::GCPStopDetected;
+        sysErr->errorRef = reference();
+        sysErr->data1 = cgcpStatus;
+        sysErr->data2 = cgcpOrderBlocked;
+        sendSignal(ref, GSN_SYSTEM_ERROR, signal, 
+                   SystemError::SignalLength, JBA);
+      }//if
+    }//for
+  }
+  else
+  {
+    jam();
+    SystemError * const sysErr = (SystemError*)&signal->theData[0];
+    sysErr->errorCode = SystemError::GCPStopDetected;
+    sysErr->errorRef = reference();
+    sysErr->data1 = cgcpStatus;
+    sysErr->data2 = cgcpOrderBlocked;
+    EXECUTE_DIRECT(NDBCNTR, GSN_SYSTEM_ERROR, 
+                   signal, SystemError::SignalLength);
+    ndbrequire(false);
+  }
   return;
 }//Dbdih::crashSystemAtGcpStop()
 
@@ -14096,6 +14217,12 @@
       } while (replicaPtr.i != RNIL);
       infoEvent(buf);
     }
+  }
+
+  if (arg == 7022)
+  {
+    jam();
+    crashSystemAtGcpStop(signal, true);
   }
 }//Dbdih::execDUMP_STATE_ORD()
 
Thread
bk commit into 5.1 tree (jonas:1.2156) BUG#29331 | jonas | 26 Jun