#At file:///home/jonas/src/telco-6.2/ based on revid:jonas@stripped
2933 Jonas Oreland 2009-05-24
ndb - bug#44915 - fix bug with crash during restart, where a mbyte incorrectly could be skipped, leading to "end of log wo/ finding gci"
M storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp
M storage/ndb/test/ndbapi/testNodeRestart.cpp
M storage/ndb/test/run-test/daily-basic-tests.txt
=== modified file 'storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp'
--- a/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp 2009-04-05 20:56:43 +0000
+++ b/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp 2009-05-24 18:29:37 +0000
@@ -12452,9 +12452,18 @@ void Dblqh::execGCP_SAVEREQ(Signal* sign
saveRef->errorCode = GCPSaveRef::NodeRestartInProgress;
sendSignal(dihBlockRef, GSN_GCP_SAVEREF, signal,
GCPSaveRef::SignalLength, JBB);
+
+ if (ERROR_INSERTED(5052))
+ {
+ jam();
+ signal->theData[0] = 9999;
+ sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 300, 1);
+ }
return;
}
+ CRASH_INSERTION(5052);
+
#ifdef GCP_TIMER_HACK
NdbTick_getMicroTimer(&globalData.gcp_timer_save[0]);
#endif
@@ -13943,6 +13952,14 @@ void Dblqh::writeFileDescriptor(Signal*
/* START BY WRITING TO LOG FILE RECORD */
/* -------------------------------------------------- */
arrGuard(logFilePtr.p->currentMbyte, clogFileSize);
+ if (DEBUG_REDO)
+ {
+ ndbout_c("part: %u file: %u setting logMaxGciCompleted[%u] = %u",
+ logPartPtr.i,
+ logFilePtr.p->fileNo,
+ logFilePtr.p->currentMbyte,
+ logPartPtr.p->logPartNewestCompletedGCI);
+ }
logFilePtr.p->logMaxGciCompleted[logFilePtr.p->currentMbyte] =
logPartPtr.p->logPartNewestCompletedGCI;
logFilePtr.p->logMaxGciStarted[logFilePtr.p->currentMbyte] = cnewestGci;
@@ -14193,10 +14210,16 @@ void Dblqh::readSrLastFileLab(Signal* si
void Dblqh::readSrLastMbyteLab(Signal* signal)
{
- if (logPartPtr.p->lastMbyte == ZNIL) {
+ if (logPartPtr.p->lastMbyte == ZNIL)
+ {
if (logPagePtr.p->logPageWord[ZPOS_LOG_LAP] < logPartPtr.p->logLap) {
jam();
logPartPtr.p->lastMbyte = logFilePtr.p->currentMbyte - 1;
+ if (DEBUG_REDO)
+ {
+ ndbout_c("readSrLastMbyteLab part: %u lastMbyte: %u",
+ logPartPtr.i, logPartPtr.p->lastMbyte);
+ }
}//if
}//if
arrGuard(logFilePtr.p->currentMbyte, clogFileSize);
@@ -15036,15 +15059,6 @@ void Dblqh::srGciLimits(Signal* signal)
}//if
}
- for(Uint32 i = 1; i<4; i++)
- {
- LogPartRecordPtr tmp;
- tmp.i = i;
- ptrAss(tmp, logPartRecord);
- tmp.p->logStartGci = logPartPtr.p->logStartGci;
- tmp.p->logLastGci = logPartPtr.p->logLastGci;
- }
-
if (logPartPtr.p->logStartGci == (UintR)-1) {
jam();
/* --------------------------------------------------------------------
@@ -15055,6 +15069,15 @@ void Dblqh::srGciLimits(Signal* signal)
logPartPtr.p->logStartGci = logPartPtr.p->logLastGci;
}//if
+ for(Uint32 i = 1; i<4; i++)
+ {
+ LogPartRecordPtr tmp;
+ tmp.i = i;
+ ptrAss(tmp, logPartRecord);
+ tmp.p->logStartGci = logPartPtr.p->logStartGci;
+ tmp.p->logLastGci = logPartPtr.p->logLastGci;
+ }
+
for (logPartPtr.i = 0; logPartPtr.i < 4; logPartPtr.i++) {
jam();
ptrAss(logPartPtr, logPartRecord);
@@ -15091,7 +15114,7 @@ void Dblqh::srLogLimits(Signal* signal)
while(true) {
ndbrequire(tmbyte < clogFileSize);
if (logPartPtr.p->logExecState == LogPartRecord::LES_SEARCH_STOP) {
- if (logFilePtr.p->logMaxGciCompleted[tmbyte] < logPartPtr.p->logLastGci) {
+ if (logFilePtr.p->logMaxGciCompleted[tmbyte] <= logPartPtr.p->logLastGci) {
jam();
/* --------------------------------------------------------------------
* WE ARE STEPPING BACKWARDS FROM MBYTE TO MBYTE. THIS IS THE FIRST
@@ -15103,6 +15126,15 @@ void Dblqh::srLogLimits(Signal* signal)
logPartPtr.p->stopMbyte = tmbyte;
logPartPtr.p->logExecState = LogPartRecord::LES_SEARCH_START;
}//if
+ else if (DEBUG_REDO)
+ {
+ ndbout_c("SKIP part: %u file: %u mb: %u logMaxGciCompleted: %u >= %u",
+ logPartPtr.i,
+ logFilePtr.p->fileNo,
+ tmbyte,
+ logFilePtr.p->logMaxGciCompleted[tmbyte],
+ logPartPtr.p->logLastGci);
+ }
}//if
/* ------------------------------------------------------------------------
* WHEN WE HAVEN'T FOUND THE STOP MBYTE IT IS NOT NECESSARY TO LOOK FOR THE
@@ -15161,8 +15193,10 @@ void Dblqh::srLogLimits(Signal* signal)
LogFileRecordPtr tmp;
tmp.i = logPartPtr.p->stopLogfile;
ptrCheckGuard(tmp, clogFileFileSize, logFileRecord);
- ndbout_c("srLogLimits part: %u start file: %u mb: %u stop file: %u mb: %u",
+ ndbout_c("srLogLimits part: %u gci: %u-%u start file: %u mb: %u stop file: %u mb: %u",
logPartPtr.i,
+ logPartPtr.p->logStartGci,
+ logPartPtr.p->logLastGci,
tlastPrepRef >> 16,
tlastPrepRef & 65535,
tmp.p->fileNo,
@@ -15665,8 +15699,9 @@ crash:
char buf[255];
BaseString::snprintf(buf, sizeof(buf),
"Error while reading REDO log. from %d\n"
- "D=%d, F=%d Mb=%d FP=%d W1=%d W2=%d : %s gci: %u",
+ "part: %u D=%d, F=%d Mb=%d FP=%d W1=%d W2=%d : %s gci: %u",
signal->theData[8],
+ logPartPtr.i,
signal->theData[2],
signal->theData[3],
signal->theData[4],
@@ -18768,6 +18803,12 @@ void Dblqh::writeCompletedGciLog(Signal*
changeMbyte(signal);
}//if
+ if (ERROR_INSERTED(5051) && (logFilePtr.p->currentFilepage > 0) &&
+ (logFilePtr.p->currentFilepage % 32) == 0)
+ {
+ SET_ERROR_INSERT_VALUE(5000);
+ }
+
logFilePtr.p->remainingWordsInMbyte =
logFilePtr.p->remainingWordsInMbyte - ZCOMPLETED_GCI_LOG_SIZE;
=== modified file 'storage/ndb/test/ndbapi/testNodeRestart.cpp'
--- a/storage/ndb/test/ndbapi/testNodeRestart.cpp 2009-04-01 11:25:38 +0000
+++ b/storage/ndb/test/ndbapi/testNodeRestart.cpp 2009-05-24 18:29:37 +0000
@@ -3273,6 +3273,78 @@ runBug43888(NDBT_Context* ctx, NDBT_Step
return NDBT_OK;
}
+#define CHECK(b, m) { int _xx = b; if (!(_xx)) { \
+ ndbout << "ERR: "<< m \
+ << " " << "File: " << __FILE__ \
+ << " (Line: " << __LINE__ << ")" << "- " << _xx << endl; \
+ return NDBT_FAILED; } }
+
+int
+runBug44952(NDBT_Context* ctx, NDBT_Step* step)
+{
+ NdbRestarter res;
+ NdbDictionary::Dictionary* pDict = GETNDB(step)->getDictionary();
+
+ const int codes [] = {
+ 5051, 5052, 5053, 0
+ };
+
+ int randomId = myRandom48(res.getNumDbNodes());
+ int nodeId = res.getDbNodeId(randomId);
+
+ int loops = ctx->getNumLoops();
+ const int val[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 } ;
+ for (int l = 0; l < loops; l++)
+ {
+ int randomId = myRandom48(res.getNumDbNodes());
+ int nodeId = res.getDbNodeId(randomId);
+
+ ndbout_c("killing node %u error 5051 loop %u/%u", nodeId, l+1, loops);
+ CHECK(res.dumpStateOneNode(nodeId, val, 2) == 0,
+ "failed to set RestartOnErrorInsert");
+
+ CHECK(res.insertErrorInNode(nodeId, 5051) == 0,
+ "failed to insert error 5051");
+
+ while (res.waitNodesNoStart(&nodeId, 1, 1 /* seconds */) != 0)
+ {
+ pDict->forceGCPWait();
+ }
+
+ ndbout_c("killing node %u during restart error 5052", nodeId);
+ for (int j = 0; j < 3; j++)
+ {
+ ndbout_c("loop: %d - killing node %u during restart error 5052",
+ j, nodeId);
+ int val[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 } ;
+ CHECK(res.dumpStateOneNode(nodeId, val, 2) == 0,
+ "failed to set RestartOnErrorInsert");
+
+ CHECK(res.insertErrorInNode(nodeId, 5052) == 0,
+ "failed to set error insert");
+
+ NdbSleep_SecSleep(3); // ...
+
+ CHECK(res.startNodes(&nodeId, 1) == 0,
+ "failed to start node");
+
+ NdbSleep_SecSleep(3);
+
+ CHECK(res.waitNodesNoStart(&nodeId, 1) == 0,
+ "waitNodesNoStart failed");
+ }
+
+ CHECK(res.startNodes(&nodeId, 1) == 0,
+ "failed to start node");
+
+ CHECK(res.waitNodesStarted(&nodeId, 1) == 0,
+ "waitNodesStarted failed");
+ }
+
+ ctx->stopTest();
+ return NDBT_OK;
+}
+
NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad",
"Test that one node at a time can be stopped and then restarted "\
@@ -3732,6 +3804,17 @@ TESTCASE("Bug42422", ""){
TESTCASE("Bug43888", ""){
INITIALIZER(runBug43888);
}
+TESTCASE("Bug44952",
+ "Test that we can execute the restart RestartNFDuringNR loop\n" \
+ "number of times"){
+ INITIALIZER(runCheckAllNodesStarted);
+ INITIALIZER(runLoadTable);
+ STEP(runBug44952);
+ STEP(runPkUpdateUntilStopped);
+ STEP(runScanUpdateUntilStopped);
+ FINALIZER(runScanReadVerify);
+ FINALIZER(runClearTable);
+}
NDBT_TESTSUITE_END(testNodeRestart);
int main(int argc, const char** argv){
=== modified file 'storage/ndb/test/run-test/daily-basic-tests.txt'
--- a/storage/ndb/test/run-test/daily-basic-tests.txt 2009-05-17 07:03:37 +0000
+++ b/storage/ndb/test/run-test/daily-basic-tests.txt 2009-05-24 18:29:37 +0000
@@ -1284,3 +1284,7 @@ max-time: 1200
cmd: test_event
args: -n Bug44915 T1
+max-time: 3600
+cmd: testNodeRestart
+args: -n Bug44952 T1
+
Attachment: [text/bzr-bundle] bzr/jonas@mysql.com-20090524182937-woj16eknnx8bxerj.bundle
| Thread |
|---|
| • bzr commit into mysql-5.1-telco-6.2 branch (jonas:2933) Bug#44915 | Jonas Oreland | 24 May |