3459 Frazer Clement 2011-10-28
bug#13323589 Ndb : Multi node failure during slow LCP causes crash on node recovery
Fix for a problem that occurred when nodes were allowed to start before the DIH Master takeover had completed.
modified:
storage/ndb/src/kernel/blocks/ERROR_codes.txt
storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp
storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp
storage/ndb/test/ndbapi/testNodeRestart.cpp
storage/ndb/test/run-test/daily-basic-tests.txt
3458 jonas oreland 2011-10-21
ndb - bug#62856, find_stack_direction fails with gcc-4.6 -O3
modified:
config/ac-macros/misc.m4
=== modified file 'storage/ndb/src/kernel/blocks/ERROR_codes.txt'
--- a/storage/ndb/src/kernel/blocks/ERROR_codes.txt 2011-01-30 20:42:21 +0000
+++ b/storage/ndb/src/kernel/blocks/ERROR_codes.txt 2011-10-28 13:49:12 +0000
@@ -18,7 +18,7 @@ Next NDBCNTR 1002
Next NDBFS 2000
Next DBACC 3002
Next DBTUP 4032
-Next DBLQH 5064
+Next DBLQH 5074
Next DBDICT 6026
Next DBDIH 7229
Next DBTC 8090
=== modified file 'storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp'
--- a/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2011-08-19 08:16:25 +0000
+++ b/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2011-10-28 13:49:12 +0000
@@ -2182,6 +2182,17 @@ void Dbdih::execSTART_PERMREQ(Signal* si
CRASH_INSERTION(7122);
ndbrequire(isMaster());
ndbrequire(refToNode(retRef) == nodeId);
+ if (c_lcpMasterTakeOverState.state != LMTOS_IDLE)
+ {
+ jam();
+ infoEvent("DIH : Denied request for start permission from %u "
+ "while LCP Master takeover in progress.",
+ nodeId);
+ signal->theData[0] = nodeId;
+ signal->theData[1] = StartPermRef::ZNODE_START_DISALLOWED_ERROR;
+ sendSignal(retRef, GSN_START_PERMREF, signal, 2, JBB);
+ return;
+ }
if ((c_nodeStartMaster.activeState) ||
(c_nodeStartMaster.wait != ZFALSE) ||
ERROR_INSERTED_CLEAR(7175)) {
=== modified file 'storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp'
--- a/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp 2011-07-04 12:36:04 +0000
+++ b/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp 2011-10-28 13:49:12 +0000
@@ -12389,6 +12389,15 @@ void Dblqh::execBACKUP_FRAGMENT_REF(Sign
void Dblqh::execBACKUP_FRAGMENT_CONF(Signal* signal)
{
jamEntry();
+
+ if (ERROR_INSERTED(5073))
+ {
+ ndbout_c("Delaying BACKUP_FRAGMENT_CONF");
+ sendSignalWithDelay(reference(), GSN_BACKUP_FRAGMENT_CONF, signal, 500,
+ signal->getLength());
+ return;
+ }
+
//BackupFragmentConf* conf= (BackupFragmentConf*)signal->getDataPtr();
lcpPtr.i = 0;
=== modified file 'storage/ndb/test/ndbapi/testNodeRestart.cpp'
--- a/storage/ndb/test/ndbapi/testNodeRestart.cpp 2011-06-30 15:55:35 +0000
+++ b/storage/ndb/test/ndbapi/testNodeRestart.cpp 2011-10-28 13:49:12 +0000
@@ -4029,6 +4029,124 @@ runBug58453(NDBT_Context* ctx, NDBT_Step
return NDBT_OK;
}
+/*
+ * Regression test motivated by bug#13323589.
+ *
+ * Sets error insert 5073 on one node, asks the master to start an LCP,
+ * restarts the master and one node from another node group, clears the
+ * error insert again and finally waits for the cluster to be started.
+ *
+ * ctx, step : not used by this test.
+ *
+ * Returns NDBT_OK on success, or immediately when fewer than 4 DB nodes are
+ * available; returns NDBT_FAILED as soon as any NdbRestarter call used below
+ * returns non-zero.
+ */
+int
+runMasterFailSlowLCP(NDBT_Context* ctx, NDBT_Step* step)
+{
+ /* Motivated by bug# 13323589 */
+ NdbRestarter res;
+
+ /* Four distinct roles are needed below: master, other victim, next master and slow node */
+ if (res.getNumDbNodes() < 4)
+ {
+ return NDBT_OK;
+ }
+
+ int master = res.getMasterNodeId();
+ int otherVictim = res.getRandomNodeOtherNodeGroup(master, rand());
+ int nextMaster = res.getNextMasterNodeId(master);
+ /* If the master's successor is the other victim (which is restarted too), use the victim's successor instead */
+ nextMaster = (nextMaster == otherVictim) ? res.getNextMasterNodeId(otherVictim) :
+ nextMaster;
+ /* Sanity checks only; presumably the standard assert macro, i.e. compiled out under NDEBUG -- TODO confirm */
+ assert(nextMaster != master);
+ assert(nextMaster != otherVictim);
+
+ /* Pick a random node which is not the current master, the next master or the other victim */
+ int slowNode= nextMaster;
+ while ((slowNode == nextMaster) ||
+ (slowNode == otherVictim) ||
+ (slowNode == master))
+ {
+ slowNode = res.getRandomNotMasterNodeId(rand());
+ }
+
+ ndbout_c("master: %d otherVictim : %d nextMaster: %d slowNode: %d",
+ master,
+ otherVictim,
+ nextMaster,
+ slowNode);
+
+ /* Steps :
+ * 1. Insert slow LCP frag error in slowNode
+ * 2. Start LCP
+ * 3. Wait for LCP to start
+ * 4. Kill at least two nodes including Master
+ * 5. Wait for killed nodes to attempt to rejoin
+ * 6. Remove slow LCP error
+ * 7. Allow system to stabilise + check no errors
+ */
+ // 5073 = Delay on handling BACKUP_FRAGMENT_CONF in LQH
+ // NOTE(review): the early NDBT_FAILED returns between here and step 6 leave
+ // error insert 5073 set on slowNode -- confirm whether that is acceptable.
+ if (res.insertErrorInNode(slowNode, 5073))
+ {
+ return NDBT_FAILED;
+ }
+
+ /* Step 2 : send the DihStartLcpImmediately dump code to the master */
+ {
+ int req[1] = {DumpStateOrd::DihStartLcpImmediately};
+ if (res.dumpStateOneNode(master, req, 1))
+ {
+ return NDBT_FAILED;
+ }
+ }
+
+ ndbout_c("Giving LCP time to start...");
+
+ /* Step 3 : fixed 10 second wait; LCP start is not checked explicitly */
+ NdbSleep_SecSleep(10);
+
+ ndbout_c("Killing other victim node (%u)...", otherVictim);
+
+ /* Step 4 : NOTE(review): the (false, false, true) flags are presumably
+ * (initial, nostart, abort) -- confirm against NdbRestarter::restartOneDbNode */
+ if (res.restartOneDbNode(otherVictim, false, false, true))
+ {
+ return NDBT_FAILED;
+ }
+
+ ndbout_c("Killing Master node (%u)...", master);
+
+ if (res.restartOneDbNode(master, false, false, true))
+ {
+ return NDBT_FAILED;
+ }
+
+ /* Disabled alternative: explicitly wait for the old master to reach NoStart and then start it */
+ /*
+ ndbout_c("Waiting for old Master node to enter NoStart state...");
+ if (res.waitNodesNoStart(&master, 1, 10))
+ return NDBT_FAILED;
+
+ ndbout_c("Starting old Master...");
+ if (res.startNodes(&master, 1))
+ return NDBT_FAILED;
+
+ */
+ /* Step 5 : fixed 15 second wait while the restarted nodes try to rejoin */
+ ndbout_c("Waiting for some progress on old Master and other victim restart");
+ NdbSleep_SecSleep(15);
+
+ ndbout_c("Now removing error insert on slow node (%u)", slowNode);
+
+ /* Step 6 : error code 0 clears the insert on slowNode */
+ if (res.insertErrorInNode(slowNode, 0))
+ {
+ return NDBT_FAILED;
+ }
+
+ /* Step 7 : fixed 30 second settle time before checking the cluster */
+ ndbout_c("Now wait a while to check stability...");
+ NdbSleep_SecSleep(30);
+
+ /* If the old master is still reported as NOT_STARTED, start it explicitly */
+ if (res.getNodeStatus(master) == NDB_MGM_NODE_STATUS_NOT_STARTED)
+ {
+ ndbout_c("Old Master needs kick to restart");
+ if (res.startNodes(&master, 1))
+ {
+ return NDBT_FAILED;
+ }
+ }
+
+ ndbout_c("Wait for cluster recovery...");
+ if (res.waitClusterStarted())
+ {
+ return NDBT_FAILED;
+ }
+
+
+ ndbout_c("Done");
+ return NDBT_OK;
+}
+
NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad",
"Test that one node at a time can be stopped and then restarted "\
@@ -4550,6 +4668,11 @@ TESTCASE("Bug57522", "")
{
INITIALIZER(runBug57522);
}
+TESTCASE("MasterFailSlowLCP",
+ "DIH Master failure during a slow LCP can cause a crash.")
+{
+ INITIALIZER(runMasterFailSlowLCP);
+}
NDBT_TESTSUITE_END(testNodeRestart);
int main(int argc, const char** argv){
=== modified file 'storage/ndb/test/run-test/daily-basic-tests.txt'
--- a/storage/ndb/test/run-test/daily-basic-tests.txt 2011-06-28 08:40:56 +0000
+++ b/storage/ndb/test/run-test/daily-basic-tests.txt 2011-10-28 13:49:12 +0000
@@ -1529,3 +1529,8 @@ max-time: 300
cmd: testDict
args: -n Bug57057 T1
+max-time: 300
+cmd: testNodeRestart
+args: -nMasterFailSlowLCP T1
+
+
No bundle (reason: useless for push emails).
| Thread |
|---|
| • bzr push into mysql-5.1-telco-6.3 branch (frazer.clement:3458 to 3459) Bug#13323589 | Frazer Clement | 1 Nov |