List:Commits« Previous MessageNext Message »
From:Jonas Oreland Date:May 24 2009 6:29pm
Subject:bzr commit into mysql-5.1-telco-6.2 branch (jonas:2933) Bug#44915
View as plain text  
#At file:///home/jonas/src/telco-6.2/ based on revid:jonas@stripped

 2933 Jonas Oreland	2009-05-24
      ndb - bug#44915 - fix bug with crash during restart, where a mbyte incorrectly could be skipped, leading to "end of log wo/ finding gci"

    M  storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp
    M  storage/ndb/test/ndbapi/testNodeRestart.cpp
    M  storage/ndb/test/run-test/daily-basic-tests.txt
=== modified file 'storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp'
--- a/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp	2009-04-05 20:56:43 +0000
+++ b/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp	2009-05-24 18:29:37 +0000
@@ -12452,9 +12452,18 @@ void Dblqh::execGCP_SAVEREQ(Signal* sign
     saveRef->errorCode = GCPSaveRef::NodeRestartInProgress;
     sendSignal(dihBlockRef, GSN_GCP_SAVEREF, signal, 
 	       GCPSaveRef::SignalLength, JBB);
+
+    if (ERROR_INSERTED(5052))
+    {
+      jam();
+      signal->theData[0] = 9999;
+      sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 300, 1);
+    }
     return;
   }
 
+  CRASH_INSERTION(5052);
+
 #ifdef GCP_TIMER_HACK
   NdbTick_getMicroTimer(&globalData.gcp_timer_save[0]);
 #endif
@@ -13943,6 +13952,14 @@ void Dblqh::writeFileDescriptor(Signal* 
 /*       START BY WRITING TO LOG FILE RECORD          */
 /* -------------------------------------------------- */
   arrGuard(logFilePtr.p->currentMbyte, clogFileSize);
+  if (DEBUG_REDO)
+  {
+    ndbout_c("part: %u file: %u setting logMaxGciCompleted[%u] = %u",
+             logPartPtr.i,
+             logFilePtr.p->fileNo,
+             logFilePtr.p->currentMbyte,
+             logPartPtr.p->logPartNewestCompletedGCI);
+  }
   logFilePtr.p->logMaxGciCompleted[logFilePtr.p->currentMbyte] = 
     logPartPtr.p->logPartNewestCompletedGCI;
   logFilePtr.p->logMaxGciStarted[logFilePtr.p->currentMbyte] = cnewestGci;
@@ -14193,10 +14210,16 @@ void Dblqh::readSrLastFileLab(Signal* si
 
 void Dblqh::readSrLastMbyteLab(Signal* signal) 
 {
-  if (logPartPtr.p->lastMbyte == ZNIL) {
+  if (logPartPtr.p->lastMbyte == ZNIL)
+  {
     if (logPagePtr.p->logPageWord[ZPOS_LOG_LAP] < logPartPtr.p->logLap) {
       jam();
       logPartPtr.p->lastMbyte = logFilePtr.p->currentMbyte - 1;
+      if (DEBUG_REDO)
+      {
+        ndbout_c("readSrLastMbyteLab part: %u lastMbyte: %u",
+                 logPartPtr.i, logPartPtr.p->lastMbyte);
+      }
     }//if
   }//if
   arrGuard(logFilePtr.p->currentMbyte, clogFileSize);
@@ -15036,15 +15059,6 @@ void Dblqh::srGciLimits(Signal* signal) 
     }//if
   }
 
-  for(Uint32 i = 1; i<4; i++)
-  {
-    LogPartRecordPtr tmp;
-    tmp.i = i;
-    ptrAss(tmp, logPartRecord);
-    tmp.p->logStartGci = logPartPtr.p->logStartGci;
-    tmp.p->logLastGci = logPartPtr.p->logLastGci;
-  }
-
   if (logPartPtr.p->logStartGci == (UintR)-1) {
     jam();
       /* --------------------------------------------------------------------
@@ -15055,6 +15069,15 @@ void Dblqh::srGciLimits(Signal* signal) 
     logPartPtr.p->logStartGci = logPartPtr.p->logLastGci;
   }//if
   
+  for(Uint32 i = 1; i<4; i++)
+  {
+    LogPartRecordPtr tmp;
+    tmp.i = i;
+    ptrAss(tmp, logPartRecord);
+    tmp.p->logStartGci = logPartPtr.p->logStartGci;
+    tmp.p->logLastGci = logPartPtr.p->logLastGci;
+  }
+
   for (logPartPtr.i = 0; logPartPtr.i < 4; logPartPtr.i++) {
     jam();
     ptrAss(logPartPtr, logPartRecord);
@@ -15091,7 +15114,7 @@ void Dblqh::srLogLimits(Signal* signal) 
   while(true) {
     ndbrequire(tmbyte < clogFileSize);
     if (logPartPtr.p->logExecState == LogPartRecord::LES_SEARCH_STOP) {
-      if (logFilePtr.p->logMaxGciCompleted[tmbyte] < logPartPtr.p->logLastGci) {
+      if (logFilePtr.p->logMaxGciCompleted[tmbyte] <= logPartPtr.p->logLastGci) {
         jam();
       /* --------------------------------------------------------------------
        *  WE ARE STEPPING BACKWARDS FROM MBYTE TO MBYTE. THIS IS THE FIRST 
@@ -15103,6 +15126,15 @@ void Dblqh::srLogLimits(Signal* signal) 
         logPartPtr.p->stopMbyte = tmbyte;
         logPartPtr.p->logExecState = LogPartRecord::LES_SEARCH_START;
       }//if
+      else if (DEBUG_REDO)
+      {
+        ndbout_c("SKIP part: %u file: %u mb: %u logMaxGciCompleted: %u >= %u",
+                 logPartPtr.i,
+                 logFilePtr.p->fileNo,
+                 tmbyte,
+                 logFilePtr.p->logMaxGciCompleted[tmbyte],
+                 logPartPtr.p->logLastGci);
+      }
     }//if
   /* ------------------------------------------------------------------------
    *  WHEN WE HAVEN'T FOUND THE STOP MBYTE IT IS NOT NECESSARY TO LOOK FOR THE
@@ -15161,8 +15193,10 @@ void Dblqh::srLogLimits(Signal* signal) 
     LogFileRecordPtr tmp;
     tmp.i = logPartPtr.p->stopLogfile;
     ptrCheckGuard(tmp, clogFileFileSize, logFileRecord);
-    ndbout_c("srLogLimits part: %u start file: %u mb: %u stop file: %u mb: %u",
+    ndbout_c("srLogLimits part: %u gci: %u-%u start file: %u mb: %u stop file: %u mb: %u",
              logPartPtr.i,
+             logPartPtr.p->logStartGci,
+             logPartPtr.p->logLastGci,
              tlastPrepRef >> 16,
              tlastPrepRef & 65535,
              tmp.p->fileNo,
@@ -15665,8 +15699,9 @@ crash:
   char buf[255];
   BaseString::snprintf(buf, sizeof(buf), 
 		       "Error while reading REDO log. from %d\n"
-		       "D=%d, F=%d Mb=%d FP=%d W1=%d W2=%d : %s gci: %u",
+		       "part: %u D=%d, F=%d Mb=%d FP=%d W1=%d W2=%d : %s gci: %u",
 		       signal->theData[8],
+                       logPartPtr.i,
 		       signal->theData[2], 
 		       signal->theData[3], 
 		       signal->theData[4],
@@ -18768,6 +18803,12 @@ void Dblqh::writeCompletedGciLog(Signal*
     changeMbyte(signal);
   }//if
 
+  if (ERROR_INSERTED(5051) && (logFilePtr.p->currentFilepage > 0) &&
+      (logFilePtr.p->currentFilepage % 32) == 0)
+  {
+    SET_ERROR_INSERT_VALUE(5000);
+  }
+
   logFilePtr.p->remainingWordsInMbyte = 
     logFilePtr.p->remainingWordsInMbyte - ZCOMPLETED_GCI_LOG_SIZE;
 

=== modified file 'storage/ndb/test/ndbapi/testNodeRestart.cpp'
--- a/storage/ndb/test/ndbapi/testNodeRestart.cpp	2009-04-01 11:25:38 +0000
+++ b/storage/ndb/test/ndbapi/testNodeRestart.cpp	2009-05-24 18:29:37 +0000
@@ -3273,6 +3273,78 @@ runBug43888(NDBT_Context* ctx, NDBT_Step
   return NDBT_OK;
 }
 
+#define CHECK(b, m) { int _xx = b; if (!(_xx)) { \
+  ndbout << "ERR: "<< m \
+           << "   " << "File: " << __FILE__ \
+           << " (Line: " << __LINE__ << ")" << "- " << _xx << endl; \
+  return NDBT_FAILED; } }
+
+int
+runBug44952(NDBT_Context* ctx, NDBT_Step* step)
+{
+  NdbRestarter res;
+  NdbDictionary::Dictionary* pDict = GETNDB(step)->getDictionary();
+
+  const int codes [] = {
+    5051, 5052, 5053, 0
+  };
+
+  int randomId = myRandom48(res.getNumDbNodes());
+  int nodeId = res.getDbNodeId(randomId);
+
+  int loops = ctx->getNumLoops();
+  const int val[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 } ;
+  for (int l = 0; l < loops; l++)
+  {
+    int randomId = myRandom48(res.getNumDbNodes());
+    int nodeId = res.getDbNodeId(randomId);
+
+    ndbout_c("killing node %u error 5051 loop %u/%u", nodeId, l+1, loops);
+    CHECK(res.dumpStateOneNode(nodeId, val, 2) == 0,
+          "failed to set RestartOnErrorInsert");
+
+    CHECK(res.insertErrorInNode(nodeId, 5051) == 0,
+          "failed to insert error 5051");
+
+    while (res.waitNodesNoStart(&nodeId, 1, 1 /* seconds */) != 0)
+    {
+      pDict->forceGCPWait();
+    }
+
+    ndbout_c("killing node %u during restart error 5052", nodeId);
+    for (int j = 0; j < 3; j++)
+    {
+      ndbout_c("loop: %d - killing node %u during restart error 5052",
+               j, nodeId);
+      int val[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 } ;
+      CHECK(res.dumpStateOneNode(nodeId, val, 2) == 0,
+            "failed to set RestartOnErrorInsert");
+
+      CHECK(res.insertErrorInNode(nodeId, 5052) == 0,
+            "failed to set error insert");
+
+      NdbSleep_SecSleep(3); // ...
+
+      CHECK(res.startNodes(&nodeId, 1) == 0,
+            "failed to start node");
+
+      NdbSleep_SecSleep(3);
+
+      CHECK(res.waitNodesNoStart(&nodeId, 1) == 0,
+            "waitNodesNoStart failed");
+    }
+
+    CHECK(res.startNodes(&nodeId, 1) == 0,
+          "failed to start node");
+
+    CHECK(res.waitNodesStarted(&nodeId, 1) == 0,
+          "waitNodesStarted failed");
+  }
+
+  ctx->stopTest();
+  return NDBT_OK;
+}
+
 NDBT_TESTSUITE(testNodeRestart);
 TESTCASE("NoLoad", 
 	 "Test that one node at a time can be stopped and then restarted "\
@@ -3732,6 +3804,17 @@ TESTCASE("Bug42422", ""){
 TESTCASE("Bug43888", ""){
   INITIALIZER(runBug43888);
 }
+TESTCASE("Bug44952",
+	 "Test that we can execute the restart RestartNFDuringNR loop\n" \
+	 "number of times"){
+  INITIALIZER(runCheckAllNodesStarted);
+  INITIALIZER(runLoadTable);
+  STEP(runBug44952);
+  STEP(runPkUpdateUntilStopped);
+  STEP(runScanUpdateUntilStopped);
+  FINALIZER(runScanReadVerify);
+  FINALIZER(runClearTable);
+}
 NDBT_TESTSUITE_END(testNodeRestart);
 
 int main(int argc, const char** argv){

=== modified file 'storage/ndb/test/run-test/daily-basic-tests.txt'
--- a/storage/ndb/test/run-test/daily-basic-tests.txt	2009-05-17 07:03:37 +0000
+++ b/storage/ndb/test/run-test/daily-basic-tests.txt	2009-05-24 18:29:37 +0000
@@ -1284,3 +1284,7 @@ max-time: 1200
 cmd: test_event
 args: -n Bug44915 T1
 
+max-time: 3600
+cmd: testNodeRestart
+args: -n Bug44952 T1
+


Attachment: [text/bzr-bundle] bzr/jonas@mysql.com-20090524182937-woj16eknnx8bxerj.bundle
Thread
bzr commit into mysql-5.1-telco-6.2 branch (jonas:2933) Bug#44915Jonas Oreland24 May