From: Jan Wedvik Date: August 22 2012 11:05am Subject: bzr push into mysql-5.1-telco-7.1 branch (jan.wedvik:4594 to 4595) List-Archive: http://lists.mysql.com/commits/144593 Message-Id: <20120822110511.22802.20397.4595@atum17.no.oracle.com> MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit 4595 Jan Wedvik 2012-08-22 [merge] Merged 7.0->7.1 modified: storage/ndb/include/kernel/signaldata/SchemaTrans.hpp storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp storage/ndb/test/ndbapi/testDict.cpp storage/ndb/test/run-test/daily-basic-tests.txt 4594 Martin Skold 2012-08-21 ndb - bump version to 7.1.25 modified: configure.in storage/ndb/ndb_configure.m4 === modified file 'storage/ndb/include/kernel/signaldata/SchemaTrans.hpp' --- a/storage/ndb/include/kernel/signaldata/SchemaTrans.hpp 2011-10-07 13:15:08 +0000 +++ b/storage/ndb/include/kernel/signaldata/SchemaTrans.hpp 2012-08-22 10:49:42 +0000 @@ -81,6 +81,7 @@ struct SchemaTransEndConf { struct SchemaTransEndRef { enum ErrorCode { NoError = 0, + Busy = 701, NotMaster = 702, InvalidTransKey = 781, InvalidTransId = 782, === modified file 'storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp' --- a/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp 2012-06-21 15:17:40 +0000 +++ b/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp 2012-08-22 11:04:43 +0000 @@ -2225,6 +2225,7 @@ void Dbdict::initCommonData() c_systemRestart = false; c_initialNodeRestart = false; c_nodeRestart = false; + c_takeOverInProgress = false; c_outstanding_sub_startstop = 0; c_sub_startstop_lock.clear(); @@ -4619,6 +4620,7 @@ void Dbdict::handle_master_takeover(Sign } DictTakeoverReq* req = (DictTakeoverReq*)signal->getDataPtrSend(); req->senderRef = reference(); + c_takeOverInProgress = true; sendSignal(rg, GSN_DICT_TAKEOVER_REQ, signal, DictTakeoverReq::SignalLength, JBB); } @@ -7827,8 +7829,8 @@ Dbdict::dropTable_complete_done(Signal* sendSignal(SUMA_REF, GSN_DROP_TAB_CONF, signal, DropTabConf::SignalLength, JBB); } - - sendTransConf(signal, trans_ptr); + ndbassert(op_ptr.p->m_state == SchemaOp::OS_COMPLETING); + sendTransConf(signal, op_ptr); } // DropTable: ABORT @@ -19745,12 +19747,16 @@ Dbdict::execDICT_TAKEOVER_REQ(Signal* si case SchemaOp::OS_COMMITTING: case SchemaOp::OS_COMPLETING: { + /** + * Wait 100ms and check again. This delay is there to save CPU cycles + * and to avoid filling the jam trace buffer. + */ jam(); Uint32* data = &signal->theData[0]; - data[0] = ZDICT_TAKEOVER_REQ; memmove(&data[1], &data[0], DictTakeoverReq::SignalLength << 2); - sendSignal(reference(), GSN_CONTINUEB, signal, - 1 + DictTakeoverReq::SignalLength, JBB); + data[0] = ZDICT_TAKEOVER_REQ; + sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, + 100, 1 + DictTakeoverReq::SignalLength); return; } default: @@ -19873,6 +19879,7 @@ Dbdict::execDICT_TAKEOVER_REF(Signal* si return; } } + c_takeOverInProgress = false; check_takeover_replies(signal); } @@ -19918,6 +19925,7 @@ Dbdict::execDICT_TAKEOVER_CONF(Signal* s return; } } + c_takeOverInProgress = false; check_takeover_replies(signal); } @@ -24813,6 +24821,18 @@ Dbdict::execSCHEMA_TRANS_BEGIN_REQ(Signa break; } + if (c_takeOverInProgress) + { + /** + * There is a dict takeover in progress. There may thus another + * transaction that should be rolled backward or forward before we + * can allow another transaction to start. + */ + jam(); + setError(error, SchemaTransBeginRef::Busy, __LINE__); + break; + } + if (!check_ndb_versions() && !localTrans) { jam(); @@ -25010,6 +25030,18 @@ Dbdict::execSCHEMA_TRANS_END_REQ(Signal* setError(error, SchemaTransEndRef::NotMaster, __LINE__); break; } + + if (c_takeOverInProgress) + { + /** + * There is a dict takeover in progress, and the transaction may thus + * be in an inconsistent state. Therefore we cannot process this request + * now. + */ + jam(); + setError(error, SchemaTransEndRef::Busy, __LINE__); + break; + } #ifdef MARTIN ndbout_c("Dbdict::execSCHEMA_TRANS_END_REQ: trans %u, state %u", trans_ptr.i, trans_ptr.p->m_state); #endif @@ -25222,6 +25254,17 @@ Dbdict::execSCHEMA_TRANS_IMPL_CONF(Signa jamEntry(); ndbrequire(signal->getNoOfSections() == 0); + if (c_takeOverInProgress) + { + /** + * The new master will rebuild the transaction state from the + * DICT_TAKEOVER_CONF signals. Therefore we ignore this signal during + * takeover. + */ + jam(); + return; + } + const SchemaTransImplConf* conf = (const SchemaTransImplConf*)signal->getDataPtr(); @@ -25248,6 +25291,17 @@ Dbdict::execSCHEMA_TRANS_IMPL_REF(Signal jamEntry(); ndbrequire(signal->getNoOfSections() == 0); + if (c_takeOverInProgress) + { + /** + * The new master will rebuild the transaction state from the + * DICT_TAKEOVER_CONF signals. Therefore we ignore this signal during + * takeover. + */ + jam(); + return; + } + SchemaTransImplRef refCopy = *(SchemaTransImplRef*)signal->getDataPtr(); SchemaTransImplRef * ref = &refCopy; === modified file 'storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp' --- a/storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp 2012-01-23 08:00:45 +0000 +++ b/storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp 2012-08-22 10:49:42 +0000 @@ -1319,6 +1319,13 @@ private: bool c_nodeRestart; bool c_initialNodeRestart; Uint32 c_tabinfoReceived; + /** + * This flag indicates that a dict takeover is in progress, specifically + * that the new master has outstanding DICT_TAKEOVER_REQ messages. The flag + * is used to prevent client from starting (or ending) transactions during + * takeover. + */ + bool c_takeOverInProgress; /** * Temporary structure used when parsing table info === modified file 'storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp' --- a/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp 2012-08-21 07:33:17 +0000 +++ b/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp 2012-08-22 11:04:43 +0000 @@ -2513,7 +2513,31 @@ Dblqh::dropTab_wait_usage(Signal* signal void Dblqh::execDROP_TAB_REQ(Signal* signal){ jamEntry(); - + if (ERROR_INSERTED(5076)) + { + /** + * This error insert simulates a situation where it takes a long time + * to execute DROP_TAB_REQ, such that we can crash the (dict) master + * while there is an outstanding DROP_TAB_REQ. + */ + jam(); + sendSignalWithDelay(reference(), GSN_DROP_TAB_REQ, signal, 1000, + signal->getLength()); + return; + } + if (ERROR_INSERTED(5077)) + { + jam(); + CLEAR_ERROR_INSERT_VALUE; + /** + * Kill this node 2 seconds from now. We wait for two seconds to make sure + * that DROP_TAB_REQ messages have reached other nodes before this one + * dies. + */ + signal->theData[0] = 9999; + sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 2000, 1); + return; + } DropTabReq reqCopy = * (DropTabReq*)signal->getDataPtr(); DropTabReq* req = &reqCopy; === modified file 'storage/ndb/test/ndbapi/testDict.cpp' --- a/storage/ndb/test/ndbapi/testDict.cpp 2011-12-15 08:28:12 +0000 +++ b/storage/ndb/test/ndbapi/testDict.cpp 2012-08-22 10:49:42 +0000 @@ -776,6 +776,144 @@ int runUseTableUntilStopped3(NDBT_Contex return NDBT_OK; } +/** + * This is a regression test for bug 14190114 + * "CLUSTER CRASH DUE TO NDBREQUIRE IN ./LOCALPROXY.HPP DBLQH (LINE: 234)". + * This bug occurs if there is a takeover (i.e. the master node crashes) + * while an LQH block is executing a DROP_TAB_REQ signal. It only affects + * multi-threaded ndb. + */ +static int +runDropTakeoverTest(NDBT_Context* ctx, NDBT_Step* step) +{ + NdbRestarter restarter; + if (restarter.getNumDbNodes() == 1) + { + g_info << "Cannot do this test with just one datanode." << endl; + return NDBT_OK; + } + + Ndb* const ndb = GETNDB(step); + NdbDictionary::Dictionary* const dict = ndb->getDictionary(); + + // First we create a table that is a copy of ctx->getTab(). + NdbDictionary::Table copyTab(*ctx->getTab()); + const char* copyName = "copyTab"; + + copyTab.setName(copyName); + if (dict->createTable(copyTab) != 0) + { + g_err << "Failed to create table " << copyName << endl + << dict->getNdbError() << endl; + return NDBT_FAILED; + } + + /** + * Find the node id of the master node and another data node that is not + * the master. + */ + const int masterNodeId = restarter.getMasterNodeId(); + const int nonMasterNodeId = + masterNodeId == restarter.getDbNodeId(0) ? + restarter.getDbNodeId(1) : + restarter.getDbNodeId(0); + + /** + * This error insert makes LQH resend the DROP_TAB_REQ to itself (with a + * delay) rather than executing it, until the error insert is reset. + * This makes it appear as if though the LQH block spends a long time + * executing the DROP_TAB_REQ signal. + */ + g_info << "Insert error 5076 in node " << nonMasterNodeId << endl; + require(restarter.insertErrorInNode(nonMasterNodeId, 5076) == 0); + /** + * This error insert makes the master node crash when one of its LQH + * blocks tries to execute a DROP_TAB_REQ signal. This will then trigger + * a takeover. + */ + g_info << "Insert error 5077 in node " << masterNodeId << endl; + require(restarter.insertErrorInNode(masterNodeId, 5077) == 0); + + // This dropTable should fail, since the master node dies. + g_info << "Trying to drop table " << copyName << endl; + if (dict->dropTable(copyName) == 0) + { + g_err << "Unexpectedly managed to drop table " << copyName << endl; + return NDBT_FAILED; + } + + /** + * Check that only old master is dead. Bug 14190114 would cause other nodes + * to die as well. + */ + const int deadNodeId = restarter.checkClusterAlive(&masterNodeId, 1); + if (deadNodeId != 0) + { + g_err << "NodeId " << deadNodeId << " is down." << endl; + return NDBT_FAILED; + } + + // Reset error insert. + g_info << "insert error 0 in node " << nonMasterNodeId << endl; + require(restarter.insertErrorInNode(nonMasterNodeId, 0) == 0); + + // Verify that old master comes back up, and that no other node crashed. + g_info << "Waiting for all nodes to be up." << endl; + if (restarter.waitClusterStarted() != 0) + { + g_err << "One or more cluster nodes are not up." << endl; + return NDBT_FAILED; + } + + /** + * The 'drop table' operation should have been rolled forward, since the + * node crash happened in the complete phase. Verify that the table is + * gone. + */ + g_info << "Verifying that table " << copyName << " was deleted." << endl; + if (dict->getTable(copyName) == NULL) + { + if (dict->getNdbError().code != 723) // 723 = no such table existed. + { + g_err << "dict->getTable() for " << copyName + << " failed in unexpedted way:" << endl + << dict->getNdbError() << endl; + return NDBT_FAILED; + } + } + else + { + g_err << "Transaction dropping " << copyName << " was not rolled forward" + << endl; + return NDBT_FAILED; + } + + /** + * Do another dictionary transaction, to verify that the cluster allows that. + */ + NdbDictionary::Table extraTab(*ctx->getTab()); + const char* extraName = "extraTab"; + + extraTab.setName(extraName); + g_info << "Trying to create table " << extraName << endl; + if (dict->createTable(extraTab) != 0) + { + g_err << "Failed to create table " << extraName << endl + << dict->getNdbError() << endl; + return NDBT_FAILED; + } + + // Clean up by dropping extraTab. + g_info << "Trying to drop table " << extraName << endl; + if (dict->dropTable(extraName) != 0) + { + g_err << "Failed to drop table " << extraName << endl + << dict->getNdbError() << endl; + return NDBT_FAILED; + } + + return NDBT_OK; +} int runCreateMaxTables(NDBT_Context* ctx, NDBT_Step* step) @@ -9076,6 +9214,9 @@ TESTCASE("CreateAndDropDuring", STEP(runCreateAndDropDuring); STEP(runUseTableUntilStopped); } +TESTCASE("DropWithTakeover","bug 14190114"){ + INITIALIZER(runDropTakeoverTest); +} TESTCASE("CreateInvalidTables", "Try to create the invalid tables we have defined\n"){ INITIALIZER(runCreateInvalidTables); === modified file 'storage/ndb/test/run-test/daily-basic-tests.txt' --- a/storage/ndb/test/run-test/daily-basic-tests.txt 2012-08-13 13:03:29 +0000 +++ b/storage/ndb/test/run-test/daily-basic-tests.txt 2012-08-22 10:49:42 +0000 @@ -1868,6 +1868,10 @@ cmd: testDict args: -n IndexStatCreate T1 max-time: 300 +cmd: testDict +args: -n DropWithTakeover T1 + +max-time: 300 cmd: testBasic args: -n LeakApiConnectObjects T1 No bundle (reason: useless for push emails).