4629 Frazer Clement 2011-10-28 [merge]
Merge 6.3->7.0
modified:
storage/ndb/src/kernel/blocks/ERROR_codes.txt
storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp
storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp
storage/ndb/test/ndbapi/testNodeRestart.cpp
storage/ndb/test/run-test/daily-basic-tests.txt
4628 Ole John Aske 2011-10-28
Fix for handling errors during execution and retrieval of SPJ results
from a pushed lookup query:
If the caller failed to catch an error returned by
NdbTransaction::execute(), or available through
NdbQuery::getErrorCode(), a later ::fetchNext()
would succeed without returning an error code. It could also return an
incorrect result consisting of a partial result set from those NdbOperations
which did return something before the failure was received.
This fix reuses, for a lookup query, the same error handling mechanism
that is already used by a scan query.
This puts the query into a persistent 'failed' state in which any
further operation on that query (except ::close()) returns
the error:
"ERROR: 4816 A previous query operation failed, which you missed to catch."
modified:
storage/ndb/src/ndbapi/NdbQueryOperation.cpp
=== modified file 'storage/ndb/src/kernel/blocks/ERROR_codes.txt'
--- a/storage/ndb/src/kernel/blocks/ERROR_codes.txt 2011-05-25 15:03:11 +0000
+++ b/storage/ndb/src/kernel/blocks/ERROR_codes.txt 2011-10-28 14:17:25 +0000
@@ -18,7 +18,7 @@ Next NDBCNTR 1002
Next NDBFS 2000
Next DBACC 3002
Next DBTUP 4035
-Next DBLQH 5072
+Next DBLQH 5074
Next DBDICT 6026
Next DBDIH 7229
Next DBTC 8092
=== modified file 'storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp'
--- a/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2011-10-23 08:34:49 +0000
+++ b/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2011-10-28 14:17:25 +0000
@@ -2372,6 +2372,17 @@ void Dbdih::execSTART_PERMREQ(Signal* si
CRASH_INSERTION(7122);
ndbrequire(isMaster());
ndbrequire(refToNode(retRef) == nodeId);
+ if (c_lcpMasterTakeOverState.state != LMTOS_IDLE)
+ {
+ jam();
+ infoEvent("DIH : Denied request for start permission from %u "
+ "while LCP Master takeover in progress.",
+ nodeId);
+ signal->theData[0] = nodeId;
+ signal->theData[1] = StartPermRef::ZNODE_START_DISALLOWED_ERROR;
+ sendSignal(retRef, GSN_START_PERMREF, signal, 2, JBB);
+ return;
+ }
if ((c_nodeStartMaster.activeState) ||
(c_nodeStartMaster.wait != ZFALSE) ||
ERROR_INSERTED_CLEAR(7175)) {
=== modified file 'storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp'
--- a/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp 2011-10-17 13:32:49 +0000
+++ b/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp 2011-10-28 14:17:25 +0000
@@ -13788,6 +13788,15 @@ void Dblqh::execBACKUP_FRAGMENT_REF(Sign
void Dblqh::execBACKUP_FRAGMENT_CONF(Signal* signal)
{
jamEntry();
+
+ if (ERROR_INSERTED(5073))
+ {
+ ndbout_c("Delaying BACKUP_FRAGMENT_CONF");
+ sendSignalWithDelay(reference(), GSN_BACKUP_FRAGMENT_CONF, signal, 500,
+ signal->getLength());
+ return;
+ }
+
//BackupFragmentConf* conf= (BackupFragmentConf*)signal->getDataPtr();
lcpPtr.i = 0;
=== modified file 'storage/ndb/test/ndbapi/testNodeRestart.cpp'
--- a/storage/ndb/test/ndbapi/testNodeRestart.cpp 2011-10-17 13:54:09 +0000
+++ b/storage/ndb/test/ndbapi/testNodeRestart.cpp 2011-10-28 14:17:25 +0000
@@ -4757,6 +4757,125 @@ int runSplitLatency25PctFail(NDBT_Contex
return NDBT_OK;
}
+int
+runMasterFailSlowLCP(NDBT_Context* ctx, NDBT_Step* step)
+{
+ /* Motivated by bug# 13323589 : DIH Master failure during a slow LCP */
+ NdbRestarter res;
+
+ if (res.getNumDbNodes() < 4) // test picks four distinct nodes below
+ {
+ return NDBT_OK; // too few data nodes : nothing is tested, reported as OK
+ }
+
+ int master = res.getMasterNodeId();
+ int otherVictim = res.getRandomNodeOtherNodeGroup(master, rand());
+ int nextMaster = res.getNextMasterNodeId(master);
+ nextMaster = (nextMaster == otherVictim) ? res.getNextMasterNodeId(otherVictim) :
+ nextMaster; // otherVictim is killed too, so use the node after it
+ assert(nextMaster != master);
+ assert(nextMaster != otherVictim);
+
+ /* Get a node which is not the current master, next master or other victim */
+ int slowNode= nextMaster;
+ while ((slowNode == nextMaster) ||
+ (slowNode == otherVictim) ||
+ (slowNode == master))
+ {
+ slowNode = res.getRandomNotMasterNodeId(rand());
+ }
+
+ ndbout_c("master: %d otherVictim : %d nextMaster: %d slowNode: %d",
+ master,
+ otherVictim,
+ nextMaster,
+ slowNode);
+
+ /* Steps :
+ * 1. Insert slow LCP frag error in slowNode
+ * 2. Start LCP
+ * 3. Wait for LCP to start
+ * 4. Kill at least two nodes including Master
+ * 5. Wait for killed nodes to attempt to rejoin
+ * 6. Remove slow LCP error
+ * 7. Allow system to stabilise + check no errors
+ */
+ // 5073 = Delay on handling BACKUP_FRAGMENT_CONF in LQH (signal is re-sent to self with a delay)
+ if (res.insertErrorInNode(slowNode, 5073))
+ {
+ return NDBT_FAILED;
+ }
+
+ {
+ int req[1] = {DumpStateOrd::DihStartLcpImmediately};
+ if (res.dumpStateOneNode(master, req, 1)) // step 2 : dump sent to the current master only
+ {
+ return NDBT_FAILED;
+ }
+ }
+
+ ndbout_c("Giving LCP time to start...");
+
+ NdbSleep_SecSleep(10); // step 3 : fixed wait, LCP start is not verified
+
+ ndbout_c("Killing other victim node (%u)...", otherVictim);
+
+ if (res.restartOneDbNode(otherVictim, false, false, true))
+ {
+ return NDBT_FAILED;
+ }
+
+ ndbout_c("Killing Master node (%u)...", master);
+
+ if (res.restartOneDbNode(master, false, false, true))
+ {
+ return NDBT_FAILED;
+ }
+
+ /* Disabled : explicit wait-for-NoStart and start of the old Master
+ ndbout_c("Waiting for old Master node to enter NoStart state...");
+ if (res.waitNodesNoStart(&master, 1, 10))
+ return NDBT_FAILED;
+
+ ndbout_c("Starting old Master...");
+ if (res.startNodes(&master, 1))
+ return NDBT_FAILED;
+
+ */
+ ndbout_c("Waiting for some progress on old Master and other victim restart");
+ NdbSleep_SecSleep(15); // step 5 : fixed wait
+
+ ndbout_c("Now removing error insert on slow node (%u)", slowNode);
+
+ if (res.insertErrorInNode(slowNode, 0)) // step 6 : error code 0 replaces the 5073 insert
+ {
+ return NDBT_FAILED;
+ }
+
+ ndbout_c("Now wait a while to check stability...");
+ NdbSleep_SecSleep(30); // step 7 : fixed wait
+
+ if (res.getNodeStatus(master) == NDB_MGM_NODE_STATUS_NOT_STARTED)
+ {
+ ndbout_c("Old Master needs kick to restart");
+ if (res.startNodes(&master, 1))
+ {
+ return NDBT_FAILED;
+ }
+ }
+
+ ndbout_c("Wait for cluster recovery...");
+ if (res.waitClusterStarted())
+ {
+ return NDBT_FAILED;
+ }
+
+
+ ndbout_c("Done");
+ return NDBT_OK;
+}
+
+
NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad",
"Test that one node at a time can be stopped and then restarted "\
@@ -5288,6 +5407,11 @@ TESTCASE("Bug57522", "")
{
INITIALIZER(runBug57522);
}
+TESTCASE("MasterFailSlowLCP",
+ "DIH Master failure during a slow LCP can cause a crash.")
+{
+ INITIALIZER(runMasterFailSlowLCP);
+}
TESTCASE("ForceStopAndRestart", "Test restart and stop -with force flag")
{
STEP(runForceStopAndRestart);
=== modified file 'storage/ndb/test/run-test/daily-basic-tests.txt'
--- a/storage/ndb/test/run-test/daily-basic-tests.txt 2011-10-14 13:24:26 +0000
+++ b/storage/ndb/test/run-test/daily-basic-tests.txt 2011-10-28 14:17:25 +0000
@@ -1835,3 +1835,8 @@ max-time 1800
cmd: testNdbApi
args: -n TestFragmentedSend T1
+max-time: 300
+cmd: testNodeRestart
+args: -nMasterFailSlowLCP T1
+
+
No bundle (reason: useless for push emails).
| Thread |
|---|
| • bzr push into mysql-5.1-telco-7.0 branch (frazer.clement:4628 to 4629) | Frazer Clement | 1 Nov |