List: Commits    « Previous Message | Next Message »
From: jonas
Date: June 26 2007 3:19pm
Subject: bk commit into 5.1 tree (jonas:1.2499) BUG#29331
View as plain text  
Below is the list of changes that have just been committed into a local
5.1 repository of jonas. When jonas does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet@stripped, 2007-06-26 15:19:42+02:00, jonas@stripped +3 -0
  ndb - bug#29331 (51)
      Add better handling of GCP Stop
      Only kill "offending" node

  storage/ndb/src/kernel/blocks/ERROR_codes.txt@stripped, 2007-06-26 15:19:40+02:00,
jonas@stripped +5 -1
    add new error codes

  storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp@stripped, 2007-06-26 15:19:40+02:00,
jonas@stripped +1 -1
    add better GCP stop handling

  storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp@stripped, 2007-06-26 15:19:40+02:00,
jonas@stripped +161 -36
    add better GCP stop handling

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User:	jonas
# Host:	perch.ndb.mysql.com
# Root:	/home/jonas/src/51-telco-gca

--- 1.37/storage/ndb/src/kernel/blocks/ERROR_codes.txt	2007-06-26 15:19:48 +02:00
+++ 1.38/storage/ndb/src/kernel/blocks/ERROR_codes.txt	2007-06-26 15:19:48 +02:00
@@ -5,7 +5,7 @@
 Next DBTUP 4029
 Next DBLQH 5045
 Next DBDICT 6007
-Next DBDIH 7183
+Next DBDIH 7186
 Next DBTC 8040
 Next CMVMI 9000
 Next BACKUP 10038
@@ -74,6 +74,10 @@
 7177: Delay copying of sysfileData in execCOPY_GCIREQ
 
 7180: Crash master during master-take-over in execMASTER_LCPCONF
+
+7184: Crash before starting next GCP after a node failure
+
+7185: Dont reply to COPY_GCI_REQ where reason == GCP
 
 ERROR CODES FOR TESTING NODE FAILURE, LOCAL CHECKPOINT HANDLING:
 -----------------------------------------------------------------

--- 1.28/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp	2007-06-26 15:19:48 +02:00
+++ 1.29/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp	2007-06-26 15:19:48 +02:00
@@ -899,7 +899,7 @@
   void ndbsttorry10Lab(Signal *, Uint32 _line);
   void createMutexes(Signal* signal, Uint32 no);
   void createMutex_done(Signal* signal, Uint32 no, Uint32 retVal);
-  void crashSystemAtGcpStop(Signal *);
+  void crashSystemAtGcpStop(Signal *, bool);
   void sendFirstDictfragsreq(Signal *, TabRecordPtr regTabPtr);
   void addtabrefuseLab(Signal *, ConnectRecordPtr regConnectPtr, Uint32 errorCode);
   void GCP_SAVEhandling(Signal *, Uint32 nodeId);

--- 1.110/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp	2007-06-26 15:19:48 +02:00
+++ 1.111/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp	2007-06-26 15:19:48 +02:00
@@ -747,6 +747,13 @@
   }
   ndbrequire(ok);
   
+ 
+  if (ERROR_INSERTED(7185) && reason==CopyGCIReq::GLOBAL_CHECKPOINT)
+  {
+    jam();
+    return;
+  }
+
   /* ----------------------------------------------------------------------- */
   /*     WE START BY TRYING TO OPEN THE FIRST RESTORABLE GCI FILE.           */
   /* ----------------------------------------------------------------------- */
@@ -4071,6 +4078,11 @@
     CLEAR_ERROR_INSERT_VALUE;
   }
 
+  if (ERROR_INSERTED(7184))
+  {
+    SET_ERROR_INSERT_VALUE(7000);
+  }
+
   /*-------------------------------------------------------------------------*/
   // The first step is to convert from a bit mask to an array of failed nodes.
   /*-------------------------------------------------------------------------*/
@@ -7745,7 +7757,7 @@
           g_eventLogger.error("System crash due to GCP Stop in state = %u",
                               (Uint32) cgcpStatus);
 #endif
-          crashSystemAtGcpStop(signal);
+          crashSystemAtGcpStop(signal, false);
           return;
         }//if
       } else {
@@ -7759,7 +7771,7 @@
             g_eventLogger.error("System crash due to GCP Stop in state = %u",
                                 (Uint32) cgcpStatus);
 #endif
-	    crashSystemAtGcpStop(signal);
+	    crashSystemAtGcpStop(signal, false);
             return;
           }//if
         } else {
@@ -11117,41 +11129,132 @@
  * GCP stop detected, 
  * send SYSTEM_ERROR to all other alive nodes
  */
-void Dbdih::crashSystemAtGcpStop(Signal* signal)
+void Dbdih::crashSystemAtGcpStop(Signal* signal, bool local)
 {
+  if (local)
+    goto dolocal;
+
   switch(cgcpStatus){
+  case GCP_PREPARE_SENT:
+  {
+    jam();
+    /**
+     * We're waiting for a GCP PREPARE CONF
+     */
+    infoEvent("Detected GCP stop(%d)...sending kill to %s", 
+              cgcpStatus, c_GCP_PREPARE_Counter.getText());
+    ndbout_c("Detected GCP stop(%d)...sending kill to %s", 
+             cgcpStatus, c_GCP_PREPARE_Counter.getText());
+    
+    {
+      NodeReceiverGroup rg(DBDIH, c_GCP_PREPARE_Counter);
+      signal->theData[0] = 7022;
+      sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA);
+    }
+    
+    {
+      NodeReceiverGroup rg(NDBCNTR, c_GCP_PREPARE_Counter);
+      SystemError * const sysErr = (SystemError*)&signal->theData[0];
+      sysErr->errorCode = SystemError::GCPStopDetected;
+      sysErr->errorRef = reference();
+      sysErr->data1 = cgcpStatus;
+      sysErr->data2 = cgcpOrderBlocked;
+      sendSignal(rg, GSN_SYSTEM_ERROR, signal, 
+                 SystemError::SignalLength, JBA);
+    }
+    ndbrequire(!c_GCP_PREPARE_Counter.done());
+    return;
+  }
+  case GCP_COMMIT_SENT:
+  {
+    jam();
+    /**
+     * We're waiting for a GCP_NODEFINISH
+     */
+    infoEvent("Detected GCP stop(%d)...sending kill to %s", 
+	      cgcpStatus, c_GCP_COMMIT_Counter.getText());
+    ndbout_c("Detected GCP stop(%d)...sending kill to %s", 
+	     cgcpStatus, c_GCP_COMMIT_Counter.getText());
+    
+    {
+      NodeReceiverGroup rg(DBDIH, c_GCP_COMMIT_Counter);
+      signal->theData[0] = 7022;
+      sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA);
+    }
+
+    {
+      NodeReceiverGroup rg(NDBCNTR, c_GCP_COMMIT_Counter);
+      SystemError * const sysErr = (SystemError*)&signal->theData[0];
+      sysErr->errorCode = SystemError::GCPStopDetected;
+      sysErr->errorRef = reference();
+      sysErr->data1 = cgcpStatus;
+      sysErr->data2 = cgcpOrderBlocked;
+      sendSignal(rg, GSN_SYSTEM_ERROR, signal, 
+                 SystemError::SignalLength, JBA);
+    }
+    ndbrequire(!c_GCP_COMMIT_Counter.done());
+    return;
+  }
   case GCP_NODE_FINISHED:
   {
+    jam();
     /**
      * We're waiting for a GCP save conf
      */
-    ndbrequire(!c_GCP_SAVEREQ_Counter.done());
     NodeReceiverGroup rg(DBLQH, c_GCP_SAVEREQ_Counter);
     signal->theData[0] = 2305;
     sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBB);
     
-    infoEvent("Detected GCP stop...sending kill to %s", 
-	      c_GCP_SAVEREQ_Counter.getText());
-    g_eventLogger.error("Detected GCP stop...sending kill to %s", 
-                        c_GCP_SAVEREQ_Counter.getText());
+    infoEvent("Detected GCP stop(%d)...sending kill to %s", 
+              cgcpStatus, c_GCP_SAVEREQ_Counter.getText());
+    ndbout_c("Detected GCP stop(%d)...sending kill to %s", 
+	     cgcpStatus, c_GCP_SAVEREQ_Counter.getText());
+    ndbrequire(!c_GCP_SAVEREQ_Counter.done());
     return;
   }
   case GCP_SAVE_LQH_FINISHED:
-    g_eventLogger.error("m_copyReason: %d m_waiting: %d",
-                        c_copyGCIMaster.m_copyReason,
-                        c_copyGCIMaster.m_waiting);
-    break;
-  case GCP_READY: // shut up lint
-  case GCP_PREPARE_SENT:
-  case GCP_COMMIT_SENT:
-    break;
+  {
+    jam();
+    /**
+     * We're waiting for a COPY_GCICONF
+     */
+    infoEvent("Detected GCP stop(%d)...sending kill to %s", 
+	      cgcpStatus, c_COPY_GCIREQ_Counter.getText());
+    ndbout_c("Detected GCP stop(%d)...sending kill to %s", 
+	     cgcpStatus, c_COPY_GCIREQ_Counter.getText());
+
+    {
+      NodeReceiverGroup rg(DBDIH, c_COPY_GCIREQ_Counter);
+      signal->theData[0] = 7022;
+      sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA);
+    }
+    
+    {
+      NodeReceiverGroup rg(NDBCNTR, c_COPY_GCIREQ_Counter);
+      SystemError * const sysErr = (SystemError*)&signal->theData[0];
+      sysErr->errorCode = SystemError::GCPStopDetected;
+      sysErr->errorRef = reference();
+      sysErr->data1 = cgcpStatus;
+      sysErr->data2 = cgcpOrderBlocked;
+      sendSignal(rg, GSN_SYSTEM_ERROR, signal, 
+                 SystemError::SignalLength, JBA);
+    }
+    ndbrequire(!c_COPY_GCIREQ_Counter.done());
+    return;
+  }
+  case GCP_READY: (void)1;
   }
+
+dolocal:  
+  ndbout_c("m_copyReason: %d m_waiting: %d",
+           c_copyGCIMaster.m_copyReason,
+           c_copyGCIMaster.m_waiting);
   
-  g_eventLogger.error("c_copyGCISlave: sender{Data, Ref} %d %x reason: %d nextWord: %d",
-                      c_copyGCISlave.m_senderData,
-                      c_copyGCISlave.m_senderRef,
-                      c_copyGCISlave.m_copyReason,
-                      c_copyGCISlave.m_expectedNextWord);
+  ndbout_c("c_copyGCISlave: sender{Data, Ref} %d %x reason: %d nextWord: %d",
+	   c_copyGCISlave.m_senderData,
+	   c_copyGCISlave.m_senderRef,
+	   c_copyGCISlave.m_copyReason,
+	   c_copyGCISlave.m_expectedNextWord);
 
   FileRecordPtr file0Ptr;
   file0Ptr.i = crestartInfoFile[0];
@@ -11202,23 +11305,39 @@
 	   c_TCGETOPSIZEREQ_Counter.getText());
   ndbout_c("c_UPDATE_TOREQ_Counter = %s", c_UPDATE_TOREQ_Counter.getText());
 
-  NodeRecordPtr nodePtr;
-  for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
+  if (local == false)
+  {
     jam();
-    ptrAss(nodePtr, nodeRecord);
-    if (nodePtr.p->nodeStatus == NodeRecord::ALIVE) {
+    NodeRecordPtr nodePtr;
+    for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
       jam();
-      const BlockReference ref = 
-	numberToRef(refToBlock(cntrlblockref), nodePtr.i);
-      SystemError * const sysErr = (SystemError*)&signal->theData[0];
-      sysErr->errorCode = SystemError::GCPStopDetected;
-      sysErr->errorRef = reference();
-      sysErr->data1 = cgcpStatus;
-      sysErr->data2 = cgcpOrderBlocked;
-      sendSignal(ref, GSN_SYSTEM_ERROR, signal, 
-		 SystemError::SignalLength, JBA);
-    }//if
-  }//for
+      ptrAss(nodePtr, nodeRecord);
+      if (nodePtr.p->nodeStatus == NodeRecord::ALIVE) {
+        jam();
+        const BlockReference ref = 
+          numberToRef(refToBlock(cntrlblockref), nodePtr.i);
+        SystemError * const sysErr = (SystemError*)&signal->theData[0];
+        sysErr->errorCode = SystemError::GCPStopDetected;
+        sysErr->errorRef = reference();
+        sysErr->data1 = cgcpStatus;
+        sysErr->data2 = cgcpOrderBlocked;
+        sendSignal(ref, GSN_SYSTEM_ERROR, signal, 
+                   SystemError::SignalLength, JBA);
+      }//if
+    }//for
+  }
+  else
+  {
+    jam();
+    SystemError * const sysErr = (SystemError*)&signal->theData[0];
+    sysErr->errorCode = SystemError::GCPStopDetected;
+    sysErr->errorRef = reference();
+    sysErr->data1 = cgcpStatus;
+    sysErr->data2 = cgcpOrderBlocked;
+    EXECUTE_DIRECT(NDBCNTR, GSN_SYSTEM_ERROR, 
+                   signal, SystemError::SignalLength);
+    ndbrequire(false);
+  }
   return;
 }//Dbdih::crashSystemAtGcpStop()
 
@@ -14303,6 +14422,12 @@
       } while (replicaPtr.i != RNIL);
       infoEvent(buf);
     }
+  }
+
+  if (arg == 7022)
+  {
+    jam();
+    crashSystemAtGcpStop(signal, true);
   }
 }//Dbdih::execDUMP_STATE_ORD()
 
Thread
bk commit into 5.1 tree (jonas:1.2499) BUG#29331 | jonas | 26 Jun