4031 Jan Wedvik 2012-10-15
This commit fixes Bug#14647210 "CAN CRASH ALL NODES EASILY WHEN
RESTARTING MORE THAN 6 NODES SIMULTANEOUSLY":
* It implements Dbdict::execGET_TABINFOREF(). This function will resend
GET_TABINFOREQ if the DICT master was busy.
* It removes local queueing of GET_TABINFOREQ signals (via delayed
signals) for signals coming from another DICT block.
* It implements a regression test for this bug.
* It adds a version check, such that GET_TABINFOREF will not be sent
to DICT blocks where the software is too old to handle that signal.
modified:
storage/ndb/include/ndb_version.h.in
storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp
storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp
storage/ndb/test/ndbapi/testDict.cpp
storage/ndb/test/run-test/daily-basic-tests.txt
4030 John David Duncan 2012-10-12 [merge]
merge
added:
mysql-test/suite/ndb_memcache/include/misc_tables.inc
modified:
mysql-test/lib/My/Memcache.pm
mysql-test/suite/ndb_memcache/include/datatypes_tables.inc
mysql-test/suite/ndb_memcache/r/external_values.result
mysql-test/suite/ndb_memcache/t/external_values.test
mysql-test/suite/ndb_memcache/t/math3.test
mysql-test/suite/ndb_memcache/t/mpart_key.test
mysql-test/suite/ndb_memcache/t/type_char.test
mysql-test/suite/ndb_memcache/t/unique_idx.test
=== modified file 'storage/ndb/include/ndb_version.h.in'
--- a/storage/ndb/include/ndb_version.h.in 2012-08-24 11:53:18 +0000
+++ b/storage/ndb/include/ndb_version.h.in 2012-10-15 11:46:54 +0000
@@ -809,4 +809,17 @@ ndb_join_pushdown(Uint32 x)
return x >= NDBD_JOIN_PUSHDOWN;
}
+/**
+ * First version in which the DICT block has a function for receiving
+ * GET_TABINFOREF signals (see Bug#14647210). DICT blocks running older
+ * software did not have one.
+ */
+#define NDBD_DICT_GET_TABINFOREF_IMPLEMENTED NDB_MAKE_VERSION(7, 2, 9)
+
+/* Returns 1 if version x is at least that version, 0 if it is older. */
+inline
+int
+ndbd_dict_get_tabinforef_implemented(Uint32 x)
+{
+  return (x < NDBD_DICT_GET_TABINFOREF_IMPLEMENTED) ? 0 : 1;
+}
+
#endif
=== modified file 'storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp'
--- a/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp 2012-09-21 12:41:59 +0000
+++ b/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp 2012-10-15 11:46:54 +0000
@@ -431,6 +431,13 @@ void Dbdict::execCONTINUEB(Signal* signa
jam();
trans_commit_wait_gci(signal);
break;
+ case ZGET_TABINFO_RETRY:
+ // We have waited a while. Now we send a new request to the master.
+ memmove(signal->theData, signal->theData+1,
+ GetTabInfoReq::SignalLength * sizeof *signal->theData);
+ sendSignal(calcDictBlockRef(c_masterNodeId), GSN_GET_TABINFOREQ, signal,
+ GetTabInfoReq::SignalLength, JBB);
+ break;
default :
ndbrequire(false);
break;
@@ -1974,6 +1981,7 @@ Dbdict::Dbdict(Block_context& ctx):
addRecSignal(GSN_DUMP_STATE_ORD, &Dbdict::execDUMP_STATE_ORD);
addRecSignal(GSN_GET_TABINFOREQ, &Dbdict::execGET_TABINFOREQ);
addRecSignal(GSN_GET_TABLEID_REQ, &Dbdict::execGET_TABLEDID_REQ);
+ addRecSignal(GSN_GET_TABINFOREF, &Dbdict::execGET_TABINFOREF);
addRecSignal(GSN_GET_TABINFO_CONF, &Dbdict::execGET_TABINFO_CONF);
addRecSignal(GSN_CONTINUEB, &Dbdict::execCONTINUEB);
@@ -4144,6 +4152,52 @@ Dbdict::restart_fromWriteSchemaFile(Sign
}
+/**
+ * Handle a GET_TABINFOREF signal received by this DICT block.
+ *
+ * If the error code is GetTabInfoRef::Busy, a new GET_TABINFOREQ is built
+ * from the fields of the REF and wrapped in a delayed CONTINUEB
+ * (ZGET_TABINFO_RETRY) to self; word 0 holds the CONTINUEB type and the
+ * request follows from word 1. Any other error code ends in progError().
+ *
+ * @param signal  The incoming GET_TABINFOREF; its data area is reused to
+ *                build the outgoing CONTINUEB.
+ */
void
+Dbdict::execGET_TABINFOREF(Signal* signal){
+  jamEntry();
+  /**
+   * Make copy of 'ref' such that we can build 'req' without overwriting
+   * source.
+   */
+  const GetTabInfoRef ref_copy =
+    *reinterpret_cast<const GetTabInfoRef*>(signal->getDataPtr());
+
+  if (ref_copy.errorCode == GetTabInfoRef::Busy)
+  {
+    jam();
+
+    /**
+     * Master is busy. Send delayed CONTINUEB to self to add some delay, then
+     * send new GET_TABINFOREQ to master.
+     */
+    signal->getDataPtrSend()[0] = ZGET_TABINFO_RETRY;
+
+    // The request is placed right after the CONTINUEB type word.
+    GetTabInfoReq* const req =
+      reinterpret_cast<GetTabInfoReq*>(signal->getDataPtrSend()+1);
+    memset(req, 0, sizeof *req);
+    req->senderRef = reference();
+    req->senderData = ref_copy.senderData;
+    // The REF carries the table id, so the retry is always issued by id.
+    req->requestType =
+      GetTabInfoReq::RequestById | GetTabInfoReq::LongSignalConf;
+    req->tableId = ref_copy.tableId;
+    req->schemaTransId = ref_copy.schemaTransId;
+    // Add a random 5-10ms delay.
+    sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 5 + random()%6,
+                        GetTabInfoReq::SignalLength+1);
+  }
+  else
+  {
+    // Other error. Restart node.
+    char msg[250];
+    // The two literals are concatenated; only the first ends with a space,
+    // so the message has a single space before "error code".
+    BaseString::snprintf(msg, sizeof(msg),
+                         "Got GET_TABINFOREF from node %u with unexpected "
+                         "error code %u",
+                         refToNode(signal->getSendersBlockRef()),
+                         ref_copy.errorCode);
+    progError(__LINE__, NDBD_EXIT_RESTORE_SCHEMA, msg);
+  }
+} // Dbdict::execGET_TABINFOREF()
+
+void
Dbdict::execGET_TABINFO_CONF(Signal* signal)
{
jamEntry();
@@ -9993,12 +10047,31 @@ void Dbdict::execGET_TABINFOREQ(Signal*
return;
}//if
- const Uint32 MAX_WAITERS = 5;
+ const Uint32 MAX_WAITERS = (MAX_NDB_NODES*3)/2;
- if(c_retrieveRecord.busyState && fromTimeQueue == false)
+ // Test sending GET_TABINFOREF to DICT (Bug#14647210).
+ const bool testRef = refToMain(signal->senderBlockRef()) == DBDICT &&
+ ERROR_INSERTED_CLEAR(6026);
+
+ if ((c_retrieveRecord.busyState || testRef) && fromTimeQueue == false)
{
jam();
- if(c_retrieveRecord.noOfWaiters < MAX_WAITERS){
+
+ const Uint32 senderVersion =
+ getNodeInfo(refToNode(signal->senderBlockRef())).m_version;
+
+ /**
+ * DBDICT may possibly generate large numbers of signals if many nodes
+ * are started at the same time, so we do not want to queue those using
+ * sendSignalWithDelay(). See also Bug#14647210. Signals from other
+ * blocks we do queue locally, since these blocks may not retry on
+ * GET_TABINFOREF with error==busy, and since they also should not
+ * generate large bursts of GET_TABINFOREQ.
+ */
+ if (c_retrieveRecord.noOfWaiters < MAX_WAITERS &&
+ (refToMain(signal->senderBlockRef()) != DBDICT ||
+ !ndbd_dict_get_tabinforef_implemented(senderVersion)))
+ {
jam();
c_retrieveRecord.noOfWaiters++;
@@ -10007,6 +10080,12 @@ void Dbdict::execGET_TABINFOREQ(Signal*
&handle);
return;
}
+
+ if (!c_retrieveRecord.busyState)
+ {
+ ndbout << "Sending extra TABINFOREF to node"
+ << refToNode(signal->senderBlockRef()) << endl;
+ }
releaseSections(handle);
sendGET_TABINFOREF(signal, req, GetTabInfoRef::Busy, __LINE__);
return;
=== modified file 'storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp'
--- a/storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp 2012-09-19 07:09:57 +0000
+++ b/storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp 2012-10-15 11:46:54 +0000
@@ -98,6 +98,7 @@
#define ZCOMMIT_WAIT_GCI 6
#define ZINDEX_STAT_BG_PROCESS 7
+#define ZGET_TABINFO_RETRY 8
/*--------------------------------------------------------------*/
// Other constants in alphabetical order
@@ -856,7 +857,7 @@ private:
void execGET_TABINFOREQ(Signal* signal);
void execGET_TABLEDID_REQ(Signal* signal);
- void execGET_TABINFO_REF(Signal* signal);
+ void execGET_TABINFOREF(Signal* signal);
void execGET_TABINFO_CONF(Signal* signal);
void execCONTINUEB(Signal* signal);
=== modified file 'storage/ndb/test/ndbapi/testDict.cpp'
--- a/storage/ndb/test/ndbapi/testDict.cpp 2012-09-21 12:34:28 +0000
+++ b/storage/ndb/test/ndbapi/testDict.cpp 2012-10-15 11:46:54 +0000
@@ -8965,6 +8965,56 @@ runBug57057(NDBT_Context* ctx, NDBT_Step
return result;
}
+/**
+ * This is a regression test for Bug #14647210 "CAN CRASH ALL NODES EASILY
+ * WHEN RESTARTING MORE THAN 6 NODES SIMULTANEOUSLY". The cause of this bug
+ * was that DICT did not handle GET_TABINFOREF signals.
+ *
+ * The test arms error insert 6026 in all data nodes, restarts every data
+ * node except the last one listed by the restarter, and waits for the
+ * cluster to start again. The error insert is cleared before returning.
+ *
+ * @return NDBT_OK on success, or when the test is skipped because there is
+ *         only one data node; NDBT_FAILED if the number of data nodes cannot
+ *         be obtained or the cluster does not start again.
+ */
+static int
+runGetTabInfoRef(NDBT_Context* ctx, NDBT_Step* step)
+{
+  NdbRestarter restarter;
+
+  // Read the node count once; it decides whether the test can run and is
+  // the basis for the loop bound and every restart call below.
+  const int numDbNodes = restarter.getNumDbNodes();
+  if (numDbNodes < 1)
+  {
+    // Without this guard a zero or negative count would reach
+    // restartNodes() with a negative node count and an unfilled nodeSet.
+    g_err << "Failed to get the number of data nodes." << endl;
+    return NDBT_FAILED;
+  }
+  if (numDbNodes == 1)
+  {
+    g_info << "Cannot do this test with just one datanode." << endl;
+    return NDBT_OK;
+  }
+
+  // All data nodes but one are restarted.
+  const int numToRestart = numDbNodes - 1;
+
+  /**
+   * This error insert makes DICT respond with GET_TABINFOREF where
+   * error==busy when receiving the next GET_TABINFOREQ signal.
+   */
+  require(restarter.insertErrorInAllNodes(6026) == 0);
+
+  int nodeSet[MAX_NDB_NODES];
+  for (int i = 0; i < numToRestart; i++)
+  {
+    nodeSet[i] = restarter.getDbNodeId(i);
+    g_info << "Node " << nodeSet[i] << " will be stopped." << endl;
+  }
+
+  require(restarter.restartNodes(nodeSet, numToRestart,
+                                 NdbRestarter::NRRF_NOSTART |
+                                 NdbRestarter::NRRF_ABORT) == 0);
+
+  g_info << "Waiting for nodes to stop." << endl;
+  require(restarter.waitNodesNoStart(nodeSet, numToRestart) == 0);
+
+  require(restarter.startNodes(nodeSet, numToRestart) == 0);
+
+  g_info << "Waiting for nodes to start again." << endl;
+  const bool clusterStarted = (restarter.waitClusterStarted() == 0);
+  if (!clusterStarted)
+  {
+    g_err << "Failed to restart cluster " << endl;
+  }
+
+  // Clear the error insert on both the success and the failure path.
+  require(restarter.insertErrorInAllNodes(0) == 0);
+  return clusterStarted ? NDBT_OK : NDBT_FAILED;
+} // runGetTabInfoRef()
+
int
runBug13416603(NDBT_Context* ctx, NDBT_Step* step)
{
@@ -9655,6 +9705,12 @@ TESTCASE("Bug57057",
TC_PROPERTY("SubSteps", 1);
STEP(runBug58277scan);
}
+TESTCASE("GetTabInfoRef", "Regression test for bug #14647210 'CAN CRASH ALL "
+ "NODES EASILY WHEN RESTARTING MORE THAN 6 NODES SIMULTANEOUSLY'"
+ " (missing handling of GET_TABINFOREF signal).")
+{
+ INITIALIZER(runGetTabInfoRef);
+}
TESTCASE("Bug13416603", "")
{
INITIALIZER(runCreateTheTable);
=== modified file 'storage/ndb/test/run-test/daily-basic-tests.txt'
--- a/storage/ndb/test/run-test/daily-basic-tests.txt 2012-09-21 12:41:59 +0000
+++ b/storage/ndb/test/run-test/daily-basic-tests.txt 2012-10-15 11:46:54 +0000
@@ -1876,6 +1876,10 @@ cmd: testDict
args: -n DropWithTakeover T1
max-time: 300
+cmd: testDict
+args: -n GetTabInfoRef T1
+
+max-time: 300
cmd: testBasic
args: -n LeakApiConnectObjects T1
No bundle (reason: useless for push emails).
| Thread |
|---|
| • bzr push into mysql-5.5-cluster-7.2 branch (jan.wedvik:4030 to 4031)Bug#14647210 | Jan Wedvik | 15 Oct |