3510 Frazer Clement 2012-11-20
Bug #14787522 PROD: NDBD NODE HANGING IN PHASE 3 DUE TO DICT LOCK WITH UNKNOWN HOLDER.
The various distributed protocols involved in dropping a table in 6.3 were
not well protected against node failures.
This could result in the drop table operation waiting indefinitely for a
reply from some node which had failed.
The indefinite wait caused the originating DDL operation to block until
NdbApi timed out.
In addition, the 'DICT' (cluster internal dictionary) lock taken by
the drop table operation was held indefinitely, and the logical
'Global schema lock' taken by the MySQLD originating the drop table
operation was held until the NdbApi operation timed out.
This is fixed by adding node failure handling to the drop table protocols
in 6.3, for DICT Slave node failures.
A new testcase is added to testDict, and the daily-basic suite.
Debugging / visibility enhancements :
A new dump code, 1228 (DictDumpLockQueue) is added to dump the contents
of the DICT lock queue.
A delayed DICT lock grant is now logged.
The DICT_TRACE mechanism is extended with DICT LOCK debugging.
modified:
storage/ndb/include/kernel/signaldata/DropTab.hpp
storage/ndb/include/kernel/signaldata/DumpStateOrd.hpp
storage/ndb/include/kernel/signaldata/PrepDropTab.hpp
storage/ndb/src/kernel/blocks/ERROR_codes.txt
storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp
storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp
storage/ndb/src/kernel/vm/LockQueue.cpp
storage/ndb/src/kernel/vm/LockQueue.hpp
storage/ndb/test/ndbapi/testDict.cpp
storage/ndb/test/run-test/daily-basic-tests.txt
3509 magnus.blaudd@stripped 2012-11-20
ndb
- backport fix for ndb_backup_rate.test also to 6.3
modified:
mysql-test/suite/ndb/t/ndb_backup_rate.test
=== modified file 'storage/ndb/include/kernel/signaldata/DropTab.hpp'
--- a/storage/ndb/include/kernel/signaldata/DropTab.hpp 2011-06-30 15:55:35 +0000
+++ b/storage/ndb/include/kernel/signaldata/DropTab.hpp 2012-11-20 23:37:20 +0000
@@ -21,7 +21,7 @@
#include "SignalData.hpp"
-class DropTabReq {
+struct DropTabReq {
/**
* Sender(s)
*/
@@ -53,7 +53,7 @@ private:
Uint32 requestType;
};
-class DropTabConf {
+struct DropTabConf {
/**
* Sender(s)
*/
@@ -80,7 +80,7 @@ private:
Uint32 tableId;
};
-class DropTabRef {
+struct DropTabRef {
/**
* Sender(s)
*/
@@ -95,6 +95,7 @@ class DropTabRef {
* Receiver(s)
*/
friend class Dbdict;
+ friend class SafeCounter;
friend bool printDROP_TAB_REF(FILE *, const Uint32 *, Uint32, Uint16);
public:
@@ -109,7 +110,6 @@ public:
InvalidTableState = 6
};
-private:
Uint32 senderRef;
Uint32 senderData;
Uint32 tableId;
=== modified file 'storage/ndb/include/kernel/signaldata/DumpStateOrd.hpp'
--- a/storage/ndb/include/kernel/signaldata/DumpStateOrd.hpp 2012-10-31 09:17:12 +0000
+++ b/storage/ndb/include/kernel/signaldata/DumpStateOrd.hpp 2012-11-20 23:37:20 +0000
@@ -92,6 +92,7 @@ public:
CmvmiMaintLockCPU = 505,
CmvmiSchedulerSpinTimer = 506,
// 1222-1225 DICT
+ DictDumpLockQueue = 1228,
LqhDumpAllDefinedTabs = 1332,
LqhDumpNoLogPages = 1333,
LqhDumpOneScanRec = 2300,
=== modified file 'storage/ndb/include/kernel/signaldata/PrepDropTab.hpp'
--- a/storage/ndb/include/kernel/signaldata/PrepDropTab.hpp 2011-06-30 15:55:35 +0000
+++ b/storage/ndb/include/kernel/signaldata/PrepDropTab.hpp 2012-11-20 23:37:20 +0000
@@ -21,7 +21,7 @@
#include "SignalData.hpp"
-class PrepDropTabReq {
+struct PrepDropTabReq {
/**
* Sender(s)
*/
@@ -45,7 +45,7 @@ private:
Uint32 requestType; // @see DropTabReq::RequestType
};
-class PrepDropTabConf {
+struct PrepDropTabConf {
/**
* Sender(s)
*/
@@ -68,7 +68,7 @@ private:
Uint32 tableId;
};
-class PrepDropTabRef {
+struct PrepDropTabRef {
/**
* Sender(s)
*/
@@ -80,6 +80,7 @@ class PrepDropTabRef {
* Receiver(s)
*/
friend class Dbdict;
+ friend class SafeCounter;
friend bool printPREP_DROP_TAB_REF(FILE *, const Uint32 *, Uint32, Uint16);
public:
@@ -94,7 +95,6 @@ public:
NF_FakeErrorREF = 5
};
-private:
Uint32 senderRef;
Uint32 senderData;
Uint32 tableId;
=== modified file 'storage/ndb/src/kernel/blocks/ERROR_codes.txt'
--- a/storage/ndb/src/kernel/blocks/ERROR_codes.txt 2012-10-29 18:34:05 +0000
+++ b/storage/ndb/src/kernel/blocks/ERROR_codes.txt 2012-11-20 23:37:20 +0000
@@ -19,7 +19,7 @@ Next NDBFS 2000
Next DBACC 3002
Next DBTUP 4032
Next DBLQH 5074
-Next DBDICT 6026
+Next DBDICT 6029
Next DBDIH 7236
Next DBTC 8090
Next CMVMI 9000
@@ -582,6 +582,9 @@ Dbdict:
6202 Set error code before CREATE_FRAGMENTATION in master
6203 Set error code before CREATE_FRAGMENTATION in master (index)
+6027 Delay execution of DROP_TAB_REQ.
+6028 Delay execution of PREP_DROP_TAB_REQ.
+
Dbtup:
4014 - handleInsert - Out of undo buffer
4015 - handleInsert - Out of log space
=== modified file 'storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp'
--- a/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp 2011-06-30 15:55:35 +0000
+++ b/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp 2012-11-20 23:37:20 +0000
@@ -307,6 +307,17 @@ Dbdict::execDUMP_STATE_ORD(Signal* signa
ndbout_c("%s m_ref_count: %d", buf, iter.curr.p->m_ref_count);
}
}
+ if (signal->theData[0] == DumpStateOrd::DictDumpLockQueue)
+ {
+ jam();
+ m_dict_lock.dump_queue(m_dict_lock_pool, this);
+
+ /* Space for hex form of enough words for node bitmask + \0 */
+ char buf[(((MAX_NDB_NODES + 31)/32) * 8) + 1 ];
+ infoEvent("DICT : c_sub_startstop _outstanding %u _lock %s",
+ c_outstanding_sub_startstop,
+ c_sub_startstop_lock.getText(buf));
+ }
if (signal->theData[0] == 8004)
{
@@ -4070,7 +4081,15 @@ void Dbdict::execNODE_FAILREP(Signal* si
lockReq.lockId = 0;
lockReq.requestInfo = UtilLockReq::SharedLock;
lockReq.extra = DictLockReq::NodeFailureLock;
- m_dict_lock.lock(m_dict_lock_pool, &lockReq, 0);
+
+ Uint32 rc = m_dict_lock.lock(m_dict_lock_pool, &lockReq, 0);
+ debugLockInfo(signal,
+ "NODE_FAILREP Shared lock claim",
+ rc);
+ if (rc != UtilLockRef::OK)
+ {
+ m_dict_lock.dump_queue(m_dict_lock_pool, this);
+ }
}
}
@@ -4196,6 +4215,9 @@ Dbdict::execCREATE_TABLE_REQ(Signal* sig
lockReq.userRef = reference();
lockReq.lockType = DictLockReq::CreateTableLock;
parseRecord.errorCode = dict_lock_trylock(&lockReq);
+ debugLockInfo(signal,
+ "CREATE_TABLE_REQ trylock",
+ parseRecord.errorCode);
if (parseRecord.errorCode)
{
jam();
@@ -4226,7 +4248,10 @@ Dbdict::execCREATE_TABLE_REQ(Signal* sig
if(parseRecord.errorCode != 0){
jam();
- dict_lock_unlock(0, &lockReq);
+ Uint32 rc = dict_lock_unlock(0, &lockReq);
+ debugLockInfo(signal,
+ "CREATE_TABLE_REQ unlock on parse error",
+ rc);
c_opCreateTable.release(createTabPtr);
if (!parseRecord.tablePtr.isNull())
{
@@ -4300,7 +4325,10 @@ Dbdict::execCREATE_TABLE_REQ(Signal* sig
{
jam();
parseRecord.errorCode= signal->theData[0];
- dict_lock_unlock(0, &lockReq);
+ Uint32 rc = dict_lock_unlock(0, &lockReq);
+ debugLockInfo(signal,
+ "CREATE_TABLE_REQ unlock on create frag error",
+ rc);
c_opCreateTable.release(createTabPtr);
releaseTableObject(parseRecord.tablePtr.i, true);
break;
@@ -4469,6 +4497,9 @@ Dbdict::execALTER_TABLE_REQ(Signal* sign
lockReq.userRef = reference();
lockReq.lockType = DictLockReq::AlterTableLock;
aParseRecord.errorCode = dict_lock_trylock(&lockReq);
+ debugLockInfo(signal,
+ "ALTER_TABLE_REQ trylock",
+ aParseRecord.errorCode);
if (aParseRecord.errorCode)
{
jam();
@@ -4501,7 +4532,10 @@ Dbdict::execALTER_TABLE_REQ(Signal* sign
if(aParseRecord.errorCode != 0)
{
jam();
- dict_lock_unlock(0, &lockReq);
+ Uint32 rc = dict_lock_unlock(0, &lockReq);
+ debugLockInfo(signal,
+ "ALTER_TABLE_REQ unlock on parse error",
+ rc);
c_opCreateTable.release(alterTabPtr);
break;
}
@@ -4597,7 +4631,10 @@ Dbdict::alterTable_backup_mutex_locked(S
lockReq.userPtr = alterTabPtr.i;
lockReq.userRef = reference();
lockReq.lockType = DictLockReq::AlterTableLock;
- dict_lock_unlock(signal, &lockReq);
+ Uint32 rc = dict_lock_unlock(signal, &lockReq);
+ debugLockInfo(signal,
+ "ALTER_TABLE_REQ unlock on backup ongoing",
+ rc);
c_opCreateTable.release(alterTabPtr);
releaseSections(handle);
@@ -5246,8 +5283,11 @@ Dbdict::execALTER_TAB_CONF(Signal * sign
lockReq.userRef = reference();
lockReq.userPtr = alterTabPtr.i;
lockReq.lockType = DictLockReq::AlterTableLock;
- dict_lock_unlock(signal, &lockReq);
-
+ Uint32 rc = dict_lock_unlock(signal, &lockReq);
+ debugLockInfo(signal,
+ "ALTER_TAB_CONF unlock on completion",
+ rc);
+
TableRecordPtr tabPtr;
c_tableRecordPool.getPtr(tabPtr, alterTabPtr.p->m_tablePtrI);
releaseTableObject(tabPtr.i, false);
@@ -5739,7 +5779,10 @@ Dbdict::createTab_reply(Signal* signal,
lockReq.userPtr = createTabPtr.i;
lockReq.userRef = reference();
lockReq.lockType = DictLockReq::CreateTableLock;
- dict_lock_unlock(signal, &lockReq);
+ Uint32 rc = dict_lock_unlock(signal, &lockReq);
+ debugLockInfo(signal,
+ "CREATE_TABLE_REQ unlock on CreateTableDrop",
+ rc);
releaseCreateTableOp(signal,createTabPtr);
@@ -5805,7 +5848,10 @@ Dbdict::createTab_startLcpMutex_unlocked
lockReq.userPtr = createTabPtr.i;
lockReq.userRef = reference();
lockReq.lockType = DictLockReq::CreateTableLock;
- dict_lock_unlock(signal, &lockReq);
+ Uint32 rc = dict_lock_unlock(signal, &lockReq);
+ debugLockInfo(signal,
+ "CREATE_TABLE_REQ commit unlock",
+ rc);
releaseCreateTableOp(signal,createTabPtr);
return;
@@ -7303,6 +7349,9 @@ Dbdict::execDROP_TABLE_REQ(Signal* signa
lockReq.userRef = reference();
lockReq.lockType = DictLockReq::DropTableLock;
Uint32 err = dict_lock_trylock(&lockReq);
+ debugLockInfo(signal,
+ "DROP_TABLE_REQ trylock",
+ err);
if (err)
{
jam();
@@ -7357,7 +7406,10 @@ Dbdict::dropTable_backup_mutex_locked(Si
lockReq.userPtr = dropTabPtr.i;
lockReq.userRef = reference();
lockReq.lockType = DictLockReq::DropTableLock;
- dict_lock_unlock(signal, &lockReq);
+ Uint32 rc = dict_lock_unlock(signal, &lockReq);
+ debugLockInfo(signal,
+ "DROP_TABLE_REQ unlock on backup ongoing",
+ rc);
c_opDropTable.release(dropTabPtr);
}
@@ -7429,8 +7481,12 @@ Dbdict::prepDropTab_nextStep(Signal* sig
prep->tableId = dropTabPtr.p->m_request.tableId;
prep->requestType = dropTabPtr.p->m_requestType;
- dropTabPtr.p->m_coordinatorData.m_signalCounter = c_aliveNodes;
NodeReceiverGroup rg(block, c_aliveNodes);
+ {
+ SafeCounter safeCounter(c_counterMgr,
+ dropTabPtr.p->m_coordinatorData.m_counter);
+ safeCounter.init<PrepDropTabRef>(rg, GSN_PREP_DROP_TAB_REF, dropTabPtr.p->key);
+ }
sendSignal(rg, GSN_PREP_DROP_TAB_REQ, signal,
PrepDropTabReq::SignalLength, JBB);
@@ -7459,12 +7515,14 @@ Dbdict::execPREP_DROP_TAB_CONF(Signal *
ndbrequire(dropTabPtr.p->m_request.tableId == prep->tableId);
ndbrequire(dropTabPtr.p->m_coordinatorData.m_gsn == GSN_PREP_DROP_TAB_REQ);
- Uint32 nodeId = refToNode(prep->senderRef);
- dropTabPtr.p->m_coordinatorData.m_signalCounter.clearWaitingFor(nodeId);
-
- if(!dropTabPtr.p->m_coordinatorData.m_signalCounter.done()){
- jam();
- return;
+ {
+ Uint32 nodeId = refToNode(prep->senderRef);
+ SafeCounter safeCounter(c_counterMgr, dropTabPtr.p->m_coordinatorData.m_counter);
+ if(!safeCounter.clearWaitingFor(nodeId))
+ {
+ jam();
+ return;
+ }
}
prepDropTab_nextStep(signal, dropTabPtr);
}
@@ -7479,12 +7537,10 @@ Dbdict::execPREP_DROP_TAB_REF(Signal* si
ndbrequire(c_opDropTable.find(dropTabPtr, prep->senderData));
ndbrequire(dropTabPtr.p->m_coordinatorRef == reference());
- ndbrequire(dropTabPtr.p->m_request.tableId == prep->tableId);
+ ndbrequire((dropTabPtr.p->m_request.tableId == prep->tableId) ||
+ (prep->errorCode == PrepDropTabRef::NF_FakeErrorREF));
ndbrequire(dropTabPtr.p->m_coordinatorData.m_gsn == GSN_PREP_DROP_TAB_REQ);
- Uint32 nodeId = refToNode(prep->senderRef);
- dropTabPtr.p->m_coordinatorData.m_signalCounter.clearWaitingFor(nodeId);
-
Uint32 block = refToBlock(prep->senderRef);
if((prep->errorCode == PrepDropTabRef::NoSuchTable && block == DBLQH) ||
(prep->errorCode == PrepDropTabRef::NF_FakeErrorREF)){
@@ -7498,9 +7554,14 @@ Dbdict::execPREP_DROP_TAB_REF(Signal* si
dropTabPtr.p->setErrorCode((Uint32)prep->errorCode);
}
- if(!dropTabPtr.p->m_coordinatorData.m_signalCounter.done()){
- jam();
- return;
+ {
+ Uint32 nodeId = refToNode(prep->senderRef);
+ SafeCounter safeCounter(c_counterMgr, dropTabPtr.p->m_coordinatorData.m_counter);
+ if(!safeCounter.clearWaitingFor(nodeId))
+ {
+ jam();
+ return;
+ }
}
prepDropTab_nextStep(signal, dropTabPtr);
}
@@ -7568,8 +7629,11 @@ Dbdict::dropTableWaitGci(Signal* signal)
req->tableId = dropTabPtr.p->m_request.tableId;
req->requestType = dropTabPtr.p->m_requestType;
- dropTabPtr.p->m_coordinatorData.m_signalCounter = c_aliveNodes;
+
NodeReceiverGroup rg(DBDICT, c_aliveNodes);
+ SafeCounter safeCounter(c_counterMgr,
+ dropTabPtr.p->m_coordinatorData.m_counter);
+ safeCounter.init<DropTabRef>(rg, GSN_DROP_TAB_REF, dropTabPtr.p->key);
sendSignal(rg, GSN_DROP_TAB_REQ, signal,
DropTabReq::SignalLength, JBB);
}
@@ -7591,7 +7655,24 @@ Dbdict::execDROP_TAB_REF(Signal* signal)
dropTab_localDROP_TAB_CONF(signal);
return;
}
- ndbrequire(false);
+ else
+ {
+ jam();
+ ndbrequire(req->errorCode == DropTabRef::NF_FakeErrorREF);
+ DropTableRecordPtr dropTabPtr;
+ ndbrequire(c_opDropTable.find(dropTabPtr, req->senderData));
+ ndbrequire(dropTabPtr.p->m_coordinatorRef == reference());
+ ndbrequire(dropTabPtr.p->m_coordinatorData.m_gsn == GSN_DROP_TAB_REQ);
+
+ /* Extract tableid, and process as CONF */
+ Uint32 tableId = dropTabPtr.p->m_request.tableId;
+ DropTabConf* conf = (DropTabConf*) signal->getDataPtrSend();
+ conf->senderRef = req->senderRef;
+ conf->senderData = req->senderData;
+ conf->tableId = tableId;
+ signal->header.theLength = DropTabConf::SignalLength;
+ execDROP_TAB_CONF(signal);
+ }
}
void
@@ -7615,9 +7696,10 @@ Dbdict::execDROP_TAB_CONF(Signal* signal
ndbrequire(dropTabPtr.p->m_coordinatorData.m_gsn == GSN_DROP_TAB_REQ);
Uint32 nodeId = refToNode(req->senderRef);
- dropTabPtr.p->m_coordinatorData.m_signalCounter.clearWaitingFor(nodeId);
-
- if(!dropTabPtr.p->m_coordinatorData.m_signalCounter.done()){
+ SafeCounter safeCounter(c_counterMgr,
+ dropTabPtr.p->m_coordinatorData.m_counter);
+ if (!safeCounter.clearWaitingFor(nodeId))
+ {
jam();
return;
}
@@ -7635,7 +7717,10 @@ Dbdict::execDROP_TAB_CONF(Signal* signal
lockReq.userPtr = dropTabPtr.i;
lockReq.userRef = reference();
lockReq.lockType = DictLockReq::DropTableLock;
- dict_lock_unlock(signal, &lockReq);
+ Uint32 rc = dict_lock_unlock(signal, &lockReq);
+ debugLockInfo(signal,
+ "DROP_TABLE_CONF unlock",
+ rc);
c_opDropTable.release(dropTabPtr);
}
@@ -7646,6 +7731,16 @@ Dbdict::execDROP_TAB_CONF(Signal* signal
void
Dbdict::execPREP_DROP_TAB_REQ(Signal* signal){
jamEntry();
+
+ if (ERROR_INSERTED(6028))
+ {
+ jam();
+ /* Defer */
+ sendSignalWithDelay(reference(), GSN_PREP_DROP_TAB_REQ, signal,
+ 1000, signal->length());
+ return;
+ }
+
PrepDropTabReq * prep = (PrepDropTabReq*)signal->getDataPtrSend();
DropTableRecordPtr dropTabPtr;
@@ -7740,6 +7835,15 @@ void
Dbdict::execDROP_TAB_REQ(Signal* signal){
jamEntry();
DropTabReq * req = (DropTabReq*)signal->getDataPtrSend();
+
+ if (ERROR_INSERTED(6027))
+ {
+ jam();
+ /* Defer */
+ sendSignalWithDelay(reference(), GSN_DROP_TAB_REQ, signal,
+ 1000, signal->length());
+ return;
+ }
DropTableRecordPtr dropTabPtr;
ndbrequire(c_opDropTable.find(dropTabPtr, req->senderData));
@@ -8850,10 +8954,17 @@ Dbdict::execCREATE_INDX_REQ(Signal* sign
lockReq.lockType = DictLockReq::CreateIndexLock;
tmperr = (CreateIndxRef::ErrorCode)dict_lock_trylock(&lockReq);
+ debugLockInfo(signal,
+ "CREATE_INDEX_REQ trylock",
+ (Uint32) tmperr);
+
if (tmperr == 0)
{
jam();
- dict_lock_unlock(0, &lockReq);
+ Uint32 rc = dict_lock_unlock(0, &lockReq);
+ debugLockInfo(signal,
+ "CREATE_INDEX_REQ immediate unlock",
+ rc);
}
}
@@ -9520,11 +9631,16 @@ Dbdict::execDROP_INDX_REQ(Signal* signal
lockReq.userRef = reference();
lockReq.lockType = DictLockReq::DropIndexLock;
tmperr = (DropIndxRef::ErrorCode)dict_lock_trylock(&lockReq);
-
+ debugLockInfo(signal,
+ "DROP_INDEX_REQ trylock",
+ (Uint32) tmperr);
if (tmperr == 0)
{
jam();
- dict_lock_unlock(0, &lockReq);
+ Uint32 rc = dict_lock_unlock(0, &lockReq);
+ debugLockInfo(signal,
+ "DROP_INDEX_REQ immediate unlock",
+ rc);
}
}
@@ -15015,6 +15131,53 @@ Dbdict::getDictLockType(Uint32 lockType)
}
void
+Dbdict::debugLockInfo(Signal* signal,
+                      const char* text,
+                      Uint32 rc)
+{
+  /* Trace aid: when g_trace is set, log 'text', the result code 'rc' and a
+   * short description of rc through infoEvent(); otherwise do nothing.
+   * NOTE(review): unlock callers pass dict_lock_unlock() results, which
+   * are UtilUnlockRef codes - confirm they line up with the cases below. */
+  if (!g_trace)
+    return;
+  /* Must not be static: a static pointer kept the text selected by an
+   * earlier call, so an unmatched rc was logged with a stale description. */
+  const char* rctext = "Unknown result";
+  switch(rc)
+  {
+  case UtilLockRef::OK:
+    rctext = "Success";
+    break;
+  case UtilLockRef::NoSuchLock:
+    rctext = "No such lock";
+    break;
+  case UtilLockRef::OutOfLockRecords:
+    rctext = "Out of records";
+    break;
+  case UtilLockRef::DistributedLockNotSupported:
+    rctext = "Distributed lock not supported";
+    break;
+  case UtilLockRef::LockAlreadyHeld:
+    rctext = "Already held";
+    break;
+  case UtilLockRef::InLockQueue:
+    rctext = "Queued";
+    break;
+  /* try returns these... */
+  case CreateTableRef::Busy:
+    rctext = "CreateTableRef::Busy";
+    break;
+  case CreateTableRef::BusyWithNR:
+    rctext = "CreateTableRef::BusyWithNR";
+    break;
+  default:
+    break;
+  }
+  infoEvent("DICT : %s %u %s", text, rc, rctext);
+}
+
+void
Dbdict::sendDictLockInfoEvent(Signal*, const UtilLockReq* req, const char* text)
{
const Dbdict::DictLockType* lt = getDictLockType(req->extra);
@@ -15054,7 +15217,7 @@ Dbdict::execDICT_LOCK_REQ(Signal* signal
c_sub_startstop_lock.set(refToNode(req.userRef));
- g_eventLogger->info("granting dict lock to %u", refToNode(req.userRef));
+ g_eventLogger->info("granting SumaStartMe dict lock to %u", refToNode(req.userRef));
DictLockConf* conf = (DictLockConf*)signal->getDataPtrSend();
conf->userPtr = req.userPtr;
conf->lockType = req.lockType;
@@ -15102,6 +15265,10 @@ Dbdict::execDICT_LOCK_REQ(Signal* signal
}
res = m_dict_lock.lock(m_dict_lock_pool, &lockReq, 0);
+ debugLockInfo(signal,
+ "DICT_LOCK_REQ lock",
+ res);
+
switch(res){
case 0:
jam();
@@ -15115,7 +15282,8 @@ Dbdict::execDICT_LOCK_REQ(Signal* signal
break;
default:
jam();
- sendDictLockInfoEvent(signal, &lockReq, "lock request by node");
+ sendDictLockInfoEvent(signal, &lockReq, "lock request by node");
+ m_dict_lock.dump_queue(m_dict_lock_pool, this);
break;
}
return;
@@ -15166,7 +15334,7 @@ Dbdict::execDICT_UNLOCK_ORD(Signal* sign
if (ord->lockType == DictLockReq::SumaStartMe)
{
jam();
- g_eventLogger->info("clearing dict lock for %u", refToNode(ord->senderRef));
+ g_eventLogger->info("clearing SumaStartMe dict lock for %u", refToNode(ord->senderRef));
c_sub_startstop_lock.clear(refToNode(ord->senderRef));
return;
}
@@ -15174,8 +15342,12 @@ Dbdict::execDICT_UNLOCK_ORD(Signal* sign
UtilLockReq lockReq;
lockReq.senderData = req.userPtr;
lockReq.senderRef = req.userRef;
- lockReq.extra = DictLockReq::NodeRestartLock; // Should check...
- Uint32 res = dict_lock_unlock(signal, &req);
+ DictLockReq::LockType lockType = DictLockReq::NodeRestartLock;
+ Uint32 res = dict_lock_unlock(signal, &req, &lockType);
+ debugLockInfo(signal,
+ "DICT_UNLOCK_ORD unlock",
+ res);
+ lockReq.extra = lockType;
switch(res){
case UtilUnlockRef::OK:
jam();
@@ -15261,17 +15433,22 @@ Dbdict::dict_lock_trylock(const DictLock
break;
}
+ if (g_trace)
+ m_dict_lock.dump_queue(m_dict_lock_pool, this);
+
return CreateTableRef::Busy;
}
Uint32
-Dbdict::dict_lock_unlock(Signal* signal, const DictLockReq* _req)
+Dbdict::dict_lock_unlock(Signal* signal, const DictLockReq* _req,
+ DictLockReq::LockType* type)
{
UtilUnlockReq req;
req.senderData = _req->userPtr;
req.senderRef = _req->userRef;
- Uint32 res = m_dict_lock.unlock(m_dict_lock_pool, &req);
+ UtilLockReq orig_lock_req;
+ Uint32 res = m_dict_lock.unlock(m_dict_lock_pool, &req, &orig_lock_req);
switch(res){
case UtilUnlockRef::OK:
case UtilUnlockRef::NotLockOwner:
@@ -15281,6 +15458,11 @@ Dbdict::dict_lock_unlock(Signal* signal,
return res;
}
+ if (type)
+ {
+ *type = (DictLockReq::LockType) orig_lock_req.extra;
+ }
+
UtilLockReq lockReq;
LockQueue::Iterator iter;
if (m_dict_lock.first(m_dict_lock_pool, iter))
@@ -15301,6 +15483,8 @@ Dbdict::dict_lock_unlock(Signal* signal,
conf->lockType = lockReq.extra;
sendSignal(lockReq.senderRef, GSN_DICT_LOCK_CONF, signal,
DictLockConf::SignalLength, JBB);
+
+ sendDictLockInfoEvent(signal, &lockReq, "queued lock request granted for node");
}
if (!m_dict_lock.next(iter))
@@ -15476,7 +15660,11 @@ Dbdict::execCREATE_FILE_REQ(Signal* sign
lockReq.userPtr = trans_ptr.i;
lockReq.userRef = reference();
lockReq.lockType = DictLockReq::CreateFileLock;
- if ((ref->errorCode = dict_lock_trylock(&lockReq)))
+ ref->errorCode = dict_lock_trylock(&lockReq);
+ debugLockInfo(signal,
+ "CREATE_FILE_REQ trylock",
+ ref->errorCode);
+ if (ref->errorCode != 0)
{
jam();
ref->errorLine = __LINE__;
@@ -15507,7 +15695,10 @@ Dbdict::execCREATE_FILE_REQ(Signal* sign
ref->status = 0;
ref->errorKey = 0;
ref->errorLine = __LINE__;
- dict_lock_unlock(0, &lockReq);
+ Uint32 rc = dict_lock_unlock(0, &lockReq);
+ debugLockInfo(signal,
+ "CREATE_FILE_REQ unlock at error 1",
+ rc);
c_Trans.release(trans_ptr);
break;
}
@@ -15598,7 +15789,12 @@ Dbdict::execCREATE_FILEGROUP_REQ(Signal*
lockReq.userPtr = trans_ptr.i;
lockReq.userRef = reference();
lockReq.lockType = DictLockReq::CreateFilegroupLock;
- if ((ref->errorCode = dict_lock_trylock(&lockReq)))
+ ref->errorCode = dict_lock_trylock(&lockReq);
+ debugLockInfo(signal,
+ "CREATE_FILEGROUP_REQ trylock",
+ ref->errorCode);
+
+ if (ref->errorCode != 0)
{
jam();
ref->errorLine = __LINE__;
@@ -15628,7 +15824,10 @@ Dbdict::execCREATE_FILEGROUP_REQ(Signal*
ref->status = 0;
ref->errorKey = 0;
ref->errorLine = __LINE__;
- dict_lock_unlock(0, &lockReq);
+ Uint32 rc = dict_lock_unlock(0, &lockReq);
+ debugLockInfo(signal,
+ "CREATE_FILEGROUP_REQ no free obj unlock",
+ rc);
c_Trans.release(trans_ptr);
break;
}
@@ -15733,7 +15932,12 @@ Dbdict::execDROP_FILE_REQ(Signal* signal
lockReq.userPtr = trans_ptr.i;
lockReq.userRef = reference();
lockReq.lockType = DictLockReq::DropFileLock;
- if ((ref->errorCode = dict_lock_trylock(&lockReq)))
+ ref->errorCode = dict_lock_trylock(&lockReq);
+ debugLockInfo(signal,
+ "DROP_FILE_REQ trylock",
+ ref->errorCode);
+
+ if (ref->errorCode != 0)
{
jam();
ref->errorLine = __LINE__;
@@ -15849,7 +16053,11 @@ Dbdict::execDROP_FILEGROUP_REQ(Signal* s
lockReq.userPtr = trans_ptr.i;
lockReq.userRef = reference();
lockReq.lockType = DictLockReq::DropFilegroupLock;
- if ((ref->errorCode = dict_lock_trylock(&lockReq)))
+ ref->errorCode = dict_lock_trylock(&lockReq);
+ debugLockInfo(signal,
+ "DROP_FILEGROUP trylock",
+ ref->errorCode);
+ if (ref->errorCode != 0)
{
jam();
ref->errorLine = __LINE__;
@@ -16134,7 +16342,10 @@ Dbdict::trans_commit_complete_done(Signa
ndbrequire(false);
}
- dict_lock_unlock(signal, &lockReq);
+ Uint32 rc = dict_lock_unlock(signal, &lockReq);
+ debugLockInfo(signal,
+ "FILE/FILEGROUP CREATE/DROP completed unlock",
+ rc);
c_Trans.release(trans_ptr);
return;
}
@@ -16258,7 +16469,10 @@ Dbdict::trans_abort_complete_done(Signal
ndbrequire(false);
}
- dict_lock_unlock(signal, &lockReq);
+ Uint32 rc = dict_lock_unlock(signal, &lockReq);
+ debugLockInfo(signal,
+ "FILE/FILEGROUP CREATE/DROP aborted unlock",
+ rc);
c_Trans.release(trans_ptr);
return;
}
=== modified file 'storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp'
--- a/storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp 2011-02-01 21:05:11 +0000
+++ b/storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp 2012-11-20 23:37:20 +0000
@@ -1190,7 +1190,7 @@ private:
struct CoordinatorData {
Uint32 m_gsn;
Uint32 m_block;
- SignalCounter m_signalCounter;
+ SafeCounterHandle m_counter;
} m_coordinatorData;
struct ParticipantData {
@@ -2618,11 +2618,15 @@ public:
};
static const DictLockType* getDictLockType(Uint32 lockType);
void sendDictLockInfoEvent(Signal*, const UtilLockReq*, const char* text);
+ void debugLockInfo(Signal* signal,
+ const char* text,
+ Uint32 rc);
void removeStaleDictLocks(Signal* signal, const Uint32* theFailedNodes);
Uint32 dict_lock_trylock(const DictLockReq* req);
- Uint32 dict_lock_unlock(Signal* signal, const DictLockReq* req);
+ Uint32 dict_lock_unlock(Signal* signal, const DictLockReq* req,
+ DictLockReq::LockType* type=0);
LockQueue::Pool m_dict_lock_pool;
LockQueue m_dict_lock;
=== modified file 'storage/ndb/src/kernel/vm/LockQueue.cpp'
--- a/storage/ndb/src/kernel/vm/LockQueue.cpp 2011-02-01 21:05:11 +0000
+++ b/storage/ndb/src/kernel/vm/LockQueue.cpp 2012-11-20 23:37:20 +0000
@@ -92,7 +92,8 @@ LockQueue::lock(Pool & thePool,
Uint32
LockQueue::unlock(Pool & thePool,
- const UtilUnlockReq* req)
+ const UtilUnlockReq* req,
+ UtilLockReq* orig_req)
{
const Uint32 senderRef = req->senderRef;
const Uint32 senderData = req->senderData;
@@ -119,6 +120,11 @@ LockQueue::unlock(Pool & thePool,
jam();
res = UtilUnlockRef::NotLockOwner;
}
+
+ /* Copy out orig request if ptr supplied */
+ if (orig_req)
+ *orig_req = lockEPtr.p->m_req;
+
queue.release(lockEPtr);
return res;
}
=== modified file 'storage/ndb/src/kernel/vm/LockQueue.hpp'
--- a/storage/ndb/src/kernel/vm/LockQueue.hpp 2011-02-01 21:05:11 +0000
+++ b/storage/ndb/src/kernel/vm/LockQueue.hpp 2012-11-20 23:37:20 +0000
@@ -47,7 +47,7 @@ public:
typedef ArrayPool<LockQueueElement> Pool;
Uint32 lock(Pool&, const UtilLockReq * req, const UtilLockReq** lockOwner= 0);
- Uint32 unlock(Pool&, const UtilUnlockReq* req);
+ Uint32 unlock(Pool&, const UtilUnlockReq* req, UtilLockReq* orig_req= 0);
/**
* After unlock
=== modified file 'storage/ndb/test/ndbapi/testDict.cpp'
--- a/storage/ndb/test/ndbapi/testDict.cpp 2011-01-30 20:42:21 +0000
+++ b/storage/ndb/test/ndbapi/testDict.cpp 2012-11-20 23:37:20 +0000
@@ -1211,6 +1211,7 @@ NF_codes[] = {
int
runNF1(NDBT_Context* ctx, NDBT_Step* step){
+
NdbRestarter restarter;
if(restarter.getNumDbNodes() < 2)
return NDBT_OK;
@@ -4788,6 +4789,124 @@ runBug57057(NDBT_Context* ctx, NDBT_Step
}
+static const char* control = "DropTabWorkerState"; /* ctx property name shared by runDropTabWorker and runDropTabNF */
+enum WorkerStates /* values stored in the 'control' property */
+{
+ WS_INIT, /* not referenced by the code visible in this change */
+ WS_IDLE, /* set by runDropTabWorker before it waits for a request */
+ WS_ACTIVE /* set by runDropTabNF to request a table drop */
+};
+
+int
+runDropTabWorker(NDBT_Context* ctx, NDBT_Step* step)
+{
+  /* Worker step : advertise WS_IDLE, wait to be set WS_ACTIVE, drop the
+     test table and log the result; repeat until the test is stopped. */
+  for (;;)
+  {
+    if (ctx->isTestStopped()) break;
+    ctx->setProperty(control, WS_IDLE);
+    ctx->getPropertyWait(control, WS_ACTIVE);
+    if (ctx->isTestStopped()) break; /* stopped while waiting : no drop */
+    Ndb* const ndb = GETNDB(step);
+    const char* const droppedName = ctx->getTab()->getName();
+    ndbout_c("Dropping table %s", droppedName);
+    const int dropResult = ndb->getDictionary()->dropTable(droppedName);
+    ndbout_c("Table drop return code : %d", dropResult);
+  }
+  return NDBT_OK; /* the drop result is only logged, never checked */
+}
+
+struct DropTabNFScenario /* one error-insert / victim combination for runDropTabNF */
+{
+ Uint32 errorCode; /* error insert code passed to insertErrorInNode() for the victim */
+ bool masterVictim; /* true: victim is the master node; false: a random non-master node */
+};
+
+static const DropTabNFScenario DropTabNFScenarios[] = /* run in this order, once per loop, by runDropTabNF */
+{
+ { 6028, false } /* Kill slave at top of PREP_DROP_TAB_REQ */
+ ,{ 6027, false } /* Kill slave at top of DROP_TAB_REQ */
+// ,{ 6028, true } /* Kill master at top of PREP_DROP_TAB_REQ */
+ ,{ 6027, true } /* Kill master at top of DROP_TAB_REQ */
+
+};
+
+int
+runDropTabNF(NDBT_Context* ctx, NDBT_Step* step)
+{
+  /* For every loop and scenario : create the table, insert the scenario's
+     error code on the victim (master or slave), ask the worker step to
+     drop the table, restart the victim, then wait for the drop and for the
+     restart to complete.  Returns NDBT_FAILED as soon as table creation
+     or a restarter call fails. */
+  NdbRestarter restarter;
+  if (restarter.getNumDbNodes() < 2)
+  {
+    /* Same guard as runNF1; stop the test so the worker step can exit */
+    ctx->stopTest();
+    return NDBT_OK;
+  }
+  const Uint32 numScenarios = sizeof(DropTabNFScenarios) / sizeof(DropTabNFScenario);
+  const int numLoops = ctx->getNumLoops();
+  for (int r=0; r < numLoops; r++)
+  {
+    ndbout_c("**** loop %d ****", r);
+    for (Uint32 n=0; n < numScenarios; n++) /* unsigned, like numScenarios */
+    {
+      ndbout_c("Creating table");
+      if (runCreateTheTable(ctx, step) != NDBT_OK)
+        return NDBT_FAILED;
+      const Uint32 errorCode = DropTabNFScenarios[n].errorCode;
+      const bool killMaster = DropTabNFScenarios[n].masterVictim;
+      int victimNode = killMaster ? restarter.getMasterNodeId()
+                                  : restarter.getRandomNotMasterNodeId(rand());
+      if (victimNode <= 0)
+      {
+        ndbout_c("Failed to choose a victim node (%d)", victimNode);
+        return NDBT_FAILED;
+      }
+      ndbout_c("Chosen victim node : %u (%s)", victimNode, killMaster?"M":"S");
+
+      if (restarter.insertErrorInNode(victimNode, errorCode) != 0)
+      {
+        ndbout_c("Failed to insert error %u in node %d", errorCode, victimNode);
+        return NDBT_FAILED;
+      }
+      ndbout_c("Inserted error %u in node %u", errorCode, victimNode);
+
+      ndbout_c("Requesting drop tab");
+      ctx->getPropertyWait(control, WS_IDLE);
+      ctx->setProperty(control, WS_ACTIVE);
+
+      ndbout_c("Restarting node %u", victimNode);
+      if (restarter.restartOneDbNode(victimNode) != 0)
+      {
+        ndbout_c("Failed to restart node %d", victimNode);
+        return NDBT_FAILED;
+      }
+      ndbout_c("Node restarting....");
+
+      ndbout_c("Waiting for drop table to complete...");
+      ctx->getPropertyWait(control, WS_IDLE);
+      ndbout_c("Drop table completed");
+
+      ndbout_c("Waiting for node to recover");
+      if (restarter.waitNodesStarted(&victimNode, 1) != 0)
+      {
+        ndbout_c("Node %d did not start", victimNode);
+        return NDBT_FAILED;
+      }
+      ndbout_c("Node started");
+    }
+  }
+
+  ndbout_c("**** stop ****");
+  ctx->stopTest();
+  return NDBT_OK;
+}
+
+
NDBT_TESTSUITE(testDict);
TESTCASE("testDropDDObjects",
"* 1. start cluster\n"
@@ -5051,6 +5170,13 @@ TESTCASE("Bug57057",
TC_PROPERTY("SubSteps", 1);
STEP(runBug58277scan);
}
+TESTCASE("DropTabNF",
+ "Drop table and node failure causes hang")
+{
+ STEP(runDropTabWorker); /* drops the test table each time it is set WS_ACTIVE */
+ STEP(runDropTabNF); /* runs the scenarios, restarts the victim node and stops the test */
+}
+
NDBT_TESTSUITE_END(testDict);
int main(int argc, const char** argv){
=== modified file 'storage/ndb/test/run-test/daily-basic-tests.txt'
--- a/storage/ndb/test/run-test/daily-basic-tests.txt 2012-10-29 18:34:05 +0000
+++ b/storage/ndb/test/run-test/daily-basic-tests.txt 2012-11-20 23:37:20 +0000
@@ -1557,3 +1557,7 @@ max-time : 1200
cmd: testLimits
args: -n SlowDihFileWrites T1
+max-time: 600
+cmd: testDict
+args: -n DropTabNF -l 3 T1
+
No bundle (reason: useless for push emails).
| Thread |
|---|
| • bzr push into mysql-5.1-telco-6.3 branch (frazer.clement:3509 to 3510)Bug#14787522 | Frazer Clement | 21 Nov |