3966 Mikael Ronstrom 2012-08-23 [merge]
merge
modified:
storage/ndb/include/kernel/signaldata/SchemaTrans.hpp
storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp
storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp
storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp
storage/ndb/test/ndbapi/testDict.cpp
storage/ndb/test/run-test/daily-basic-tests.txt
3965 Mikael Ronstrom 2012-08-23
Fixes of indentation and such
modified:
storage/ndb/src/ndbapi/TransporterFacade.cpp
=== modified file 'storage/ndb/include/kernel/signaldata/SchemaTrans.hpp'
--- a/storage/ndb/include/kernel/signaldata/SchemaTrans.hpp revid:mikael.ronstrom@stripped
+++ b/storage/ndb/include/kernel/signaldata/SchemaTrans.hpp revid:mikael.ronstrom@stripped
@@ -81,6 +81,7 @@ struct SchemaTransEndConf {
struct SchemaTransEndRef {
enum ErrorCode {
NoError = 0,
+ Busy = 701,
NotMaster = 702,
InvalidTransKey = 781,
InvalidTransId = 782,
=== modified file 'storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp'
--- a/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp revid:mikael.ronstrom@stripped
+++ b/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp revid:mikael.ronstrom@stripped
@@ -2225,6 +2225,7 @@ void Dbdict::initCommonData()
c_systemRestart = false;
c_initialNodeRestart = false;
c_nodeRestart = false;
+ c_takeOverInProgress = false;
c_outstanding_sub_startstop = 0;
c_sub_startstop_lock.clear();
@@ -4619,6 +4620,7 @@ void Dbdict::handle_master_takeover(Sign
}
DictTakeoverReq* req = (DictTakeoverReq*)signal->getDataPtrSend();
req->senderRef = reference();
+ c_takeOverInProgress = true;
sendSignal(rg, GSN_DICT_TAKEOVER_REQ, signal,
DictTakeoverReq::SignalLength, JBB);
}
@@ -7827,8 +7829,8 @@ Dbdict::dropTable_complete_done(Signal*
sendSignal(SUMA_REF, GSN_DROP_TAB_CONF, signal,
DropTabConf::SignalLength, JBB);
}
-
- sendTransConf(signal, trans_ptr);
+ ndbassert(op_ptr.p->m_state == SchemaOp::OS_COMPLETING);
+ sendTransConf(signal, op_ptr);
}
// DropTable: ABORT
@@ -19745,12 +19747,16 @@ Dbdict::execDICT_TAKEOVER_REQ(Signal* si
case SchemaOp::OS_COMMITTING:
case SchemaOp::OS_COMPLETING:
{
+ /**
+ * Wait 100ms and check again. This delay is there to save CPU cycles
+ * and to avoid filling the jam trace buffer.
+ */
jam();
Uint32* data = &signal->theData[0];
- data[0] = ZDICT_TAKEOVER_REQ;
memmove(&data[1], &data[0], DictTakeoverReq::SignalLength << 2);
- sendSignal(reference(), GSN_CONTINUEB, signal,
- 1 + DictTakeoverReq::SignalLength, JBB);
+ data[0] = ZDICT_TAKEOVER_REQ;
+ sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
+ 100, 1 + DictTakeoverReq::SignalLength);
return;
}
default:
@@ -19873,6 +19879,7 @@ Dbdict::execDICT_TAKEOVER_REF(Signal* si
return;
}
}
+ c_takeOverInProgress = false;
check_takeover_replies(signal);
}
@@ -19918,6 +19925,7 @@ Dbdict::execDICT_TAKEOVER_CONF(Signal* s
return;
}
}
+ c_takeOverInProgress = false;
check_takeover_replies(signal);
}
@@ -24813,6 +24821,18 @@ Dbdict::execSCHEMA_TRANS_BEGIN_REQ(Signa
break;
}
+ if (c_takeOverInProgress)
+ {
+ /**
+ * There is a dict takeover in progress. There may thus another
+ * transaction that should be rolled backward or forward before we
+ * can allow another transaction to start.
+ */
+ jam();
+ setError(error, SchemaTransBeginRef::Busy, __LINE__);
+ break;
+ }
+
if (!check_ndb_versions() && !localTrans)
{
jam();
@@ -25010,6 +25030,18 @@ Dbdict::execSCHEMA_TRANS_END_REQ(Signal*
setError(error, SchemaTransEndRef::NotMaster, __LINE__);
break;
}
+
+ if (c_takeOverInProgress)
+ {
+ /**
+ * There is a dict takeover in progress, and the transaction may thus
+ * be in an inconsistent state. Therefore we cannot process this request
+ * now.
+ */
+ jam();
+ setError(error, SchemaTransEndRef::Busy, __LINE__);
+ break;
+ }
#ifdef MARTIN
ndbout_c("Dbdict::execSCHEMA_TRANS_END_REQ: trans %u, state %u", trans_ptr.i, trans_ptr.p->m_state);
#endif
@@ -25222,6 +25254,17 @@ Dbdict::execSCHEMA_TRANS_IMPL_CONF(Signa
jamEntry();
ndbrequire(signal->getNoOfSections() == 0);
+ if (c_takeOverInProgress)
+ {
+ /**
+ * The new master will rebuild the transaction state from the
+ * DICT_TAKEOVER_CONF signals. Therefore we ignore this signal during
+ * takeover.
+ */
+ jam();
+ return;
+ }
+
const SchemaTransImplConf* conf =
(const SchemaTransImplConf*)signal->getDataPtr();
@@ -25248,6 +25291,17 @@ Dbdict::execSCHEMA_TRANS_IMPL_REF(Signal
jamEntry();
ndbrequire(signal->getNoOfSections() == 0);
+ if (c_takeOverInProgress)
+ {
+ /**
+ * The new master will rebuild the transaction state from the
+ * DICT_TAKEOVER_CONF signals. Therefore we ignore this signal during
+ * takeover.
+ */
+ jam();
+ return;
+ }
+
SchemaTransImplRef refCopy =
*(SchemaTransImplRef*)signal->getDataPtr();
SchemaTransImplRef * ref = &refCopy;
=== modified file 'storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp'
--- a/storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp revid:mikael.ronstrom@stripped
+++ b/storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp revid:mikael.ronstrom@stripped
@@ -1319,6 +1319,13 @@ private:
bool c_nodeRestart;
bool c_initialNodeRestart;
Uint32 c_tabinfoReceived;
+ /**
+ * This flag indicates that a dict takeover is in progress, specifically
+ * that the new master has outstanding DICT_TAKEOVER_REQ messages. The flag
+ * is used to prevent client from starting (or ending) transactions during
+ * takeover.
+ */
+ bool c_takeOverInProgress;
/**
* Temporary structure used when parsing table info
=== modified file 'storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp'
--- a/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp revid:mikael.ronstrom@stripped
+++ b/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp revid:mikael.ronstrom@stripped
@@ -2513,7 +2513,31 @@ Dblqh::dropTab_wait_usage(Signal* signal
void
Dblqh::execDROP_TAB_REQ(Signal* signal){
jamEntry();
-
+ if (ERROR_INSERTED(5076))
+ {
+ /**
+ * This error insert simulates a situation where it takes a long time
+ * to execute DROP_TAB_REQ, such that we can crash the (dict) master
+ * while there is an outstanding DROP_TAB_REQ.
+ */
+ jam();
+ sendSignalWithDelay(reference(), GSN_DROP_TAB_REQ, signal, 1000,
+ signal->getLength());
+ return;
+ }
+ if (ERROR_INSERTED(5077))
+ {
+ jam();
+ CLEAR_ERROR_INSERT_VALUE;
+ /**
+ * Kill this node 2 seconds from now. We wait for two seconds to make sure
+ * that DROP_TAB_REQ messages have reached other nodes before this one
+ * dies.
+ */
+ signal->theData[0] = 9999;
+ sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 2000, 1);
+ return;
+ }
DropTabReq reqCopy = * (DropTabReq*)signal->getDataPtr();
DropTabReq* req = &reqCopy;
@@ -23709,7 +23733,7 @@ void Dblqh::execDBINFO_SCANREQ(Signal *s
row.write_uint32(getOwnNodeId());
row.write_uint32(0); // log type, 0 = REDO
row.write_uint32(0); // log id, always 0 in LQH
- row.write_uint32(logpart); // log part
+ row.write_uint32(logPartPtr.p->logPartNo); // log part
row.write_uint64(total*1024*1024); // total allocated
row.write_uint64((total-mb)*1024*1024); // currently in use
=== modified file 'storage/ndb/test/ndbapi/testDict.cpp'
--- a/storage/ndb/test/ndbapi/testDict.cpp revid:mikael.ronstrom@stripped
+++ b/storage/ndb/test/ndbapi/testDict.cpp revid:mikael.ronstrom@stripped
@@ -776,6 +776,144 @@ int runUseTableUntilStopped3(NDBT_Contex
return NDBT_OK;
}
+/**
+ * This is a regression test for bug 14190114
+ * "CLUSTER CRASH DUE TO NDBREQUIRE IN ./LOCALPROXY.HPP DBLQH (LINE: 234)".
+ * This bug occurs if there is a takeover (i.e. the master node crashes)
+ * while an LQH block is executing a DROP_TAB_REQ signal. It only affects
+ * multi-threaded ndb.
+ */
+static int
+runDropTakeoverTest(NDBT_Context* ctx, NDBT_Step* step)
+{
+ NdbRestarter restarter;
+ if (restarter.getNumDbNodes() == 1)
+ {
+ g_info << "Cannot do this test with just one datanode." << endl;
+ return NDBT_OK;
+ }
+
+ Ndb* const ndb = GETNDB(step);
+ NdbDictionary::Dictionary* const dict = ndb->getDictionary();
+
+ // First we create a table that is a copy of ctx->getTab().
+ NdbDictionary::Table copyTab(*ctx->getTab());
+ const char* copyName = "copyTab";
+
+ copyTab.setName(copyName);
+ if (dict->createTable(copyTab) != 0)
+ {
+ g_err << "Failed to create table " << copyName << endl
+ << dict->getNdbError() << endl;
+ return NDBT_FAILED;
+ }
+
+ /**
+ * Find the node id of the master node and another data node that is not
+ * the master.
+ */
+ const int masterNodeId = restarter.getMasterNodeId();
+ const int nonMasterNodeId =
+ masterNodeId == restarter.getDbNodeId(0) ?
+ restarter.getDbNodeId(1) :
+ restarter.getDbNodeId(0);
+
+ /**
+ * This error insert makes LQH resend the DROP_TAB_REQ to itself (with a
+ * delay) rather than executing it, until the error insert is reset.
+ * This makes it appear as if though the LQH block spends a long time
+ * executing the DROP_TAB_REQ signal.
+ */
+ g_info << "Insert error 5076 in node " << nonMasterNodeId << endl;
+ require(restarter.insertErrorInNode(nonMasterNodeId, 5076) == 0);
+ /**
+ * This error insert makes the master node crash when one of its LQH
+ * blocks tries to execute a DROP_TAB_REQ signal. This will then trigger
+ * a takeover.
+ */
+ g_info << "Insert error 5077 in node " << masterNodeId << endl;
+ require(restarter.insertErrorInNode(masterNodeId, 5077) == 0);
+
+ // This dropTable should fail, since the master node dies.
+ g_info << "Trying to drop table " << copyName << endl;
+ if (dict->dropTable(copyName) == 0)
+ {
+ g_err << "Unexpectedly managed to drop table " << copyName << endl;
+ return NDBT_FAILED;
+ }
+
+ /**
+ * Check that only old master is dead. Bug 14190114 would cause other nodes
+ * to die as well.
+ */
+ const int deadNodeId = restarter.checkClusterAlive(&masterNodeId, 1);
+ if (deadNodeId != 0)
+ {
+ g_err << "NodeId " << deadNodeId << " is down." << endl;
+ return NDBT_FAILED;
+ }
+
+ // Reset error insert.
+ g_info << "insert error 0 in node " << nonMasterNodeId << endl;
+ require(restarter.insertErrorInNode(nonMasterNodeId, 0) == 0);
+
+ // Verify that old master comes back up, and that no other node crashed.
+ g_info << "Waiting for all nodes to be up." << endl;
+ if (restarter.waitClusterStarted() != 0)
+ {
+ g_err << "One or more cluster nodes are not up." << endl;
+ return NDBT_FAILED;
+ }
+
+ /**
+ * The 'drop table' operation should have been rolled forward, since the
+ * node crash happened in the complete phase. Verify that the table is
+ * gone.
+ */
+ g_info << "Verifying that table " << copyName << " was deleted." << endl;
+ if (dict->getTable(copyName) == NULL)
+ {
+ if (dict->getNdbError().code != 723) // 723 = no such table existed.
+ {
+ g_err << "dict->getTable() for " << copyName
+ << " failed in unexpedted way:" << endl
+ << dict->getNdbError() << endl;
+ return NDBT_FAILED;
+ }
+ }
+ else
+ {
+ g_err << "Transaction dropping " << copyName << " was not rolled forward"
+ << endl;
+ return NDBT_FAILED;
+ }
+
+ /**
+ * Do another dictionary transaction, to verify that the cluster allows that.
+ */
+ NdbDictionary::Table extraTab(*ctx->getTab());
+ const char* extraName = "extraTab";
+
+ extraTab.setName(extraName);
+ g_info << "Trying to create table " << extraName << endl;
+ if (dict->createTable(extraTab) != 0)
+ {
+ g_err << "Failed to create table " << extraName << endl
+ << dict->getNdbError() << endl;
+ return NDBT_FAILED;
+ }
+
+ // Clean up by dropping extraTab.
+ g_info << "Trying to drop table " << extraName << endl;
+ if (dict->dropTable(extraName) != 0)
+ {
+ g_err << "Failed to drop table " << extraName << endl
+ << dict->getNdbError() << endl;
+ return NDBT_FAILED;
+ }
+
+ return NDBT_OK;
+}
int
runCreateMaxTables(NDBT_Context* ctx, NDBT_Step* step)
@@ -9076,6 +9214,9 @@ TESTCASE("CreateAndDropDuring",
STEP(runCreateAndDropDuring);
STEP(runUseTableUntilStopped);
}
+TESTCASE("DropWithTakeover","bug 14190114"){
+ INITIALIZER(runDropTakeoverTest);
+}
TESTCASE("CreateInvalidTables",
"Try to create the invalid tables we have defined\n"){
INITIALIZER(runCreateInvalidTables);
=== modified file 'storage/ndb/test/run-test/daily-basic-tests.txt'
--- a/storage/ndb/test/run-test/daily-basic-tests.txt revid:mikael.ronstrom@stripped
+++ b/storage/ndb/test/run-test/daily-basic-tests.txt revid:mikael.ronstrom@stripped
@@ -1868,6 +1868,10 @@ cmd: testDict
args: -n IndexStatCreate T1
max-time: 300
+cmd: testDict
+args: -n DropWithTakeover T1
+
+max-time: 300
cmd: testBasic
args: -n LeakApiConnectObjects T1
No bundle (reason: useless for push emails).
| Thread |
|---|
| • bzr push into mysql-5.5-cluster-7.2 branch (mikael.ronstrom:3965 to 3966) | Mikael Ronstrom | 23 Aug |