List:Commits« Previous MessageNext Message »
From:Frazer Clement Date:November 21 2012 12:38am
Subject:bzr push into mysql-5.1-telco-6.3 branch (frazer.clement:3509 to 3510)
Bug#14787522
View as plain text  
 3510 Frazer Clement	2012-11-20
      Bug #14787522 	PROD: NDBD NODE HANGING IN PHASE 3 DUE TO DICT LOCK WITH UNKNOWN HOLDER.
      
      The various distributed protocols involved in dropping a table in 6.3 were
      not well protected against node failures.
      This could result in the drop table operation waiting indefinitely for a
      reply from some node which had failed.
      The indefinite wait caused the originating DDL operation to block until
      NdbApi timed out.
      Additionally, the 'DICT' (cluster internal dictionary) lock taken by
      the drop table operation was held indefinitely.
      Additionally, the logical 'Global schema lock' taken by the
      MySQLD originating the drop table operation was held until the NdbApi
      operation timed out.
      
      This is fixed by adding node failure handling to the drop table protocols
      in 6.3, for DICT Slave node failures.
      
      A new testcase is added to testDict, and the daily-basic suite.
      
      Debugging / visibility enhancements : 
      
      A new dump code, 1228 (DictDumpLockQueue), is added to dump the contents
      of the DICT lock queue.
      
      A delayed DICT lock grant is now logged.
      
      The DICT_TRACE mechanism is extended with DICT LOCK debugging.

    modified:
      storage/ndb/include/kernel/signaldata/DropTab.hpp
      storage/ndb/include/kernel/signaldata/DumpStateOrd.hpp
      storage/ndb/include/kernel/signaldata/PrepDropTab.hpp
      storage/ndb/src/kernel/blocks/ERROR_codes.txt
      storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp
      storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp
      storage/ndb/src/kernel/vm/LockQueue.cpp
      storage/ndb/src/kernel/vm/LockQueue.hpp
      storage/ndb/test/ndbapi/testDict.cpp
      storage/ndb/test/run-test/daily-basic-tests.txt
 3509 magnus.blaudd@stripped	2012-11-20
      ndb
       - backport fix for ndb_backup_rate.test also to 6.3

    modified:
      mysql-test/suite/ndb/t/ndb_backup_rate.test
=== modified file 'storage/ndb/include/kernel/signaldata/DropTab.hpp'
--- a/storage/ndb/include/kernel/signaldata/DropTab.hpp	2011-06-30 15:55:35 +0000
+++ b/storage/ndb/include/kernel/signaldata/DropTab.hpp	2012-11-20 23:37:20 +0000
@@ -21,7 +21,7 @@
 
 #include "SignalData.hpp"
 
-class DropTabReq {
+struct DropTabReq {
   /**
    * Sender(s)
    */
@@ -53,7 +53,7 @@ private:
   Uint32 requestType;
 };
 
-class DropTabConf {
+struct DropTabConf {
   /**
    * Sender(s)
    */
@@ -80,7 +80,7 @@ private:
   Uint32 tableId;
 };
 
-class DropTabRef {
+struct DropTabRef {
   /**
    * Sender(s)
    */
@@ -95,6 +95,7 @@ class DropTabRef {
    * Receiver(s)
    */
   friend class Dbdict;
+  friend class SafeCounter;
 
   friend bool printDROP_TAB_REF(FILE *, const Uint32 *, Uint32, Uint16);
 public:
@@ -109,7 +110,6 @@ public:
     InvalidTableState = 6
   };
   
-private:
   Uint32 senderRef;
   Uint32 senderData;
   Uint32 tableId;

=== modified file 'storage/ndb/include/kernel/signaldata/DumpStateOrd.hpp'
--- a/storage/ndb/include/kernel/signaldata/DumpStateOrd.hpp	2012-10-31 09:17:12 +0000
+++ b/storage/ndb/include/kernel/signaldata/DumpStateOrd.hpp	2012-11-20 23:37:20 +0000
@@ -92,6 +92,7 @@ public:
     CmvmiMaintLockCPU = 505,
     CmvmiSchedulerSpinTimer = 506,
     // 1222-1225 DICT
+    DictDumpLockQueue = 1228,
     LqhDumpAllDefinedTabs = 1332,
     LqhDumpNoLogPages = 1333,
     LqhDumpOneScanRec = 2300,

=== modified file 'storage/ndb/include/kernel/signaldata/PrepDropTab.hpp'
--- a/storage/ndb/include/kernel/signaldata/PrepDropTab.hpp	2011-06-30 15:55:35 +0000
+++ b/storage/ndb/include/kernel/signaldata/PrepDropTab.hpp	2012-11-20 23:37:20 +0000
@@ -21,7 +21,7 @@
 
 #include "SignalData.hpp"
 
-class PrepDropTabReq {
+struct PrepDropTabReq {
   /**
    * Sender(s)
    */
@@ -45,7 +45,7 @@ private:
   Uint32 requestType; // @see DropTabReq::RequestType
 };
 
-class PrepDropTabConf {
+struct PrepDropTabConf {
   /**
    * Sender(s)
    */
@@ -68,7 +68,7 @@ private:
   Uint32 tableId;
 };
 
-class PrepDropTabRef {
+struct PrepDropTabRef {
   /**
    * Sender(s)
    */
@@ -80,6 +80,7 @@ class PrepDropTabRef {
    * Receiver(s)
    */
   friend class Dbdict;
+  friend class SafeCounter;
 
   friend bool printPREP_DROP_TAB_REF(FILE *, const Uint32 *, Uint32, Uint16);
 public:
@@ -94,7 +95,6 @@ public:
     NF_FakeErrorREF = 5
   };
   
-private:
   Uint32 senderRef;
   Uint32 senderData;
   Uint32 tableId;

=== modified file 'storage/ndb/src/kernel/blocks/ERROR_codes.txt'
--- a/storage/ndb/src/kernel/blocks/ERROR_codes.txt	2012-10-29 18:34:05 +0000
+++ b/storage/ndb/src/kernel/blocks/ERROR_codes.txt	2012-11-20 23:37:20 +0000
@@ -19,7 +19,7 @@ Next NDBFS 2000
 Next DBACC 3002
 Next DBTUP 4032
 Next DBLQH 5074
-Next DBDICT 6026
+Next DBDICT 6029
 Next DBDIH 7236
 Next DBTC 8090
 Next CMVMI 9000
@@ -582,6 +582,9 @@ Dbdict:
 6202 Set error code before CREATE_FRAGMENTATION in master
 6203 Set error code before CREATE_FRAGMENTATION in master (index)
 
+6027 Delay execution of DROP_TAB_REQ.
+6028 Delay execution of PREP_DROP_TAB_REQ.
+
 Dbtup:
 4014 - handleInsert - Out of undo buffer
 4015 - handleInsert - Out of log space

=== modified file 'storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp'
--- a/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp	2011-06-30 15:55:35 +0000
+++ b/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp	2012-11-20 23:37:20 +0000
@@ -307,6 +307,17 @@ Dbdict::execDUMP_STATE_ORD(Signal* signa
       ndbout_c("%s m_ref_count: %d", buf, iter.curr.p->m_ref_count); 
     }
   }    
+  if (signal->theData[0] == DumpStateOrd::DictDumpLockQueue)
+  {
+    jam();
+    m_dict_lock.dump_queue(m_dict_lock_pool, this);
+    
+    /* Space for hex form of enough words for node bitmask + \0 */
+    char buf[(((MAX_NDB_NODES + 31)/32) * 8) + 1 ];
+    infoEvent("DICT : c_sub_startstop _outstanding %u _lock %s",
+              c_outstanding_sub_startstop,
+              c_sub_startstop_lock.getText(buf));
+  }
   
   if (signal->theData[0] == 8004)
   {
@@ -4070,7 +4081,15 @@ void Dbdict::execNODE_FAILREP(Signal* si
       lockReq.lockId = 0;
       lockReq.requestInfo = UtilLockReq::SharedLock;
       lockReq.extra = DictLockReq::NodeFailureLock;
-      m_dict_lock.lock(m_dict_lock_pool, &lockReq, 0);
+      
+      Uint32 rc = m_dict_lock.lock(m_dict_lock_pool, &lockReq, 0);
+      debugLockInfo(signal,
+                    "NODE_FAILREP Shared lock claim",
+                    rc);
+      if (rc != UtilLockRef::OK)
+      {
+        m_dict_lock.dump_queue(m_dict_lock_pool, this);
+      }
     }
   }
   
@@ -4196,6 +4215,9 @@ Dbdict::execCREATE_TABLE_REQ(Signal* sig
     lockReq.userRef = reference();
     lockReq.lockType = DictLockReq::CreateTableLock;
     parseRecord.errorCode = dict_lock_trylock(&lockReq);
+    debugLockInfo(signal,
+                  "CREATE_TABLE_REQ trylock",
+                  parseRecord.errorCode);
     if (parseRecord.errorCode)
     {
       jam();
@@ -4226,7 +4248,10 @@ Dbdict::execCREATE_TABLE_REQ(Signal* sig
     
     if(parseRecord.errorCode != 0){
       jam();
-      dict_lock_unlock(0, &lockReq);
+      Uint32 rc = dict_lock_unlock(0, &lockReq);
+      debugLockInfo(signal,
+                    "CREATE_TABLE_REQ unlock on parse error",
+                    rc);
       c_opCreateTable.release(createTabPtr);
       if (!parseRecord.tablePtr.isNull())
       {
@@ -4300,7 +4325,10 @@ Dbdict::execCREATE_TABLE_REQ(Signal* sig
     {
       jam();
       parseRecord.errorCode= signal->theData[0];
-      dict_lock_unlock(0, &lockReq);
+      Uint32 rc = dict_lock_unlock(0, &lockReq);
+      debugLockInfo(signal,
+                    "CREATE_TABLE_REQ unlock on create frag error",
+                    rc);
       c_opCreateTable.release(createTabPtr);
       releaseTableObject(parseRecord.tablePtr.i, true);
       break;
@@ -4469,6 +4497,9 @@ Dbdict::execALTER_TABLE_REQ(Signal* sign
     lockReq.userRef = reference();
     lockReq.lockType = DictLockReq::AlterTableLock;
     aParseRecord.errorCode = dict_lock_trylock(&lockReq);
+    debugLockInfo(signal,
+                  "ALTER_TABLE_REQ trylock",
+                  aParseRecord.errorCode);
     if (aParseRecord.errorCode)
     {
       jam();
@@ -4501,7 +4532,10 @@ Dbdict::execALTER_TABLE_REQ(Signal* sign
     if(aParseRecord.errorCode != 0)
     {
       jam();
-      dict_lock_unlock(0, &lockReq);
+      Uint32 rc = dict_lock_unlock(0, &lockReq);
+      debugLockInfo(signal,
+                    "ALTER_TABLE_REQ unlock on parse error",
+                    rc);
       c_opCreateTable.release(alterTabPtr);
       break;
     }
@@ -4597,7 +4631,10 @@ Dbdict::alterTable_backup_mutex_locked(S
     lockReq.userPtr = alterTabPtr.i;
     lockReq.userRef = reference();
     lockReq.lockType = DictLockReq::AlterTableLock;
-    dict_lock_unlock(signal, &lockReq);
+    Uint32 rc = dict_lock_unlock(signal, &lockReq);
+    debugLockInfo(signal,
+                  "ALTER_TABLE_REQ unlock on backup ongoing",
+                  rc);
 
     c_opCreateTable.release(alterTabPtr);
     releaseSections(handle);
@@ -5246,8 +5283,11 @@ Dbdict::execALTER_TAB_CONF(Signal * sign
       lockReq.userRef = reference();
       lockReq.userPtr = alterTabPtr.i;
       lockReq.lockType = DictLockReq::AlterTableLock;
-      dict_lock_unlock(signal, &lockReq);
-
+      Uint32 rc = dict_lock_unlock(signal, &lockReq);
+      debugLockInfo(signal,
+                    "ALTER_TAB_CONF unlock on completion",
+                    rc);
+      
       TableRecordPtr tabPtr;
       c_tableRecordPool.getPtr(tabPtr, alterTabPtr.p->m_tablePtrI);  
       releaseTableObject(tabPtr.i, false);
@@ -5739,7 +5779,10 @@ Dbdict::createTab_reply(Signal* signal,
     lockReq.userPtr = createTabPtr.i;
     lockReq.userRef = reference();
     lockReq.lockType = DictLockReq::CreateTableLock;
-    dict_lock_unlock(signal, &lockReq);
+    Uint32 rc = dict_lock_unlock(signal, &lockReq);
+    debugLockInfo(signal,
+                  "CREATE_TABLE_REQ unlock on CreateTableDrop",
+                  rc);
 
     releaseCreateTableOp(signal,createTabPtr);
     
@@ -5805,7 +5848,10 @@ Dbdict::createTab_startLcpMutex_unlocked
   lockReq.userPtr = createTabPtr.i;
   lockReq.userRef = reference();
   lockReq.lockType = DictLockReq::CreateTableLock;
-  dict_lock_unlock(signal, &lockReq);
+  Uint32 rc = dict_lock_unlock(signal, &lockReq);
+  debugLockInfo(signal,
+                "CREATE_TABLE_REQ commit unlock",
+                rc);
 
   releaseCreateTableOp(signal,createTabPtr);
   return;
@@ -7303,6 +7349,9 @@ Dbdict::execDROP_TABLE_REQ(Signal* signa
   lockReq.userRef = reference();
   lockReq.lockType = DictLockReq::DropTableLock;
   Uint32 err = dict_lock_trylock(&lockReq);
+  debugLockInfo(signal,
+                "DROP_TABLE_REQ trylock",
+                err);
   if (err)
   {
     jam();
@@ -7357,7 +7406,10 @@ Dbdict::dropTable_backup_mutex_locked(Si
     lockReq.userPtr = dropTabPtr.i;
     lockReq.userRef = reference();
     lockReq.lockType = DictLockReq::DropTableLock;
-    dict_lock_unlock(signal, &lockReq);
+    Uint32 rc = dict_lock_unlock(signal, &lockReq);
+    debugLockInfo(signal,
+                  "DROP_TABLE_REQ unlock on backup ongoing",
+                  rc);
 
     c_opDropTable.release(dropTabPtr);
   }
@@ -7429,8 +7481,12 @@ Dbdict::prepDropTab_nextStep(Signal* sig
   prep->tableId = dropTabPtr.p->m_request.tableId;
   prep->requestType = dropTabPtr.p->m_requestType;
   
-  dropTabPtr.p->m_coordinatorData.m_signalCounter = c_aliveNodes;
   NodeReceiverGroup rg(block, c_aliveNodes);
+  {
+    SafeCounter safeCounter(c_counterMgr,
+                          dropTabPtr.p->m_coordinatorData.m_counter);
+    safeCounter.init<PrepDropTabRef>(rg, GSN_PREP_DROP_TAB_REF, dropTabPtr.p->key);
+  }
   sendSignal(rg, GSN_PREP_DROP_TAB_REQ, signal, 
 	     PrepDropTabReq::SignalLength, JBB);
   
@@ -7459,12 +7515,14 @@ Dbdict::execPREP_DROP_TAB_CONF(Signal *
   ndbrequire(dropTabPtr.p->m_request.tableId == prep->tableId);
   ndbrequire(dropTabPtr.p->m_coordinatorData.m_gsn == GSN_PREP_DROP_TAB_REQ);
   
-  Uint32 nodeId = refToNode(prep->senderRef);
-  dropTabPtr.p->m_coordinatorData.m_signalCounter.clearWaitingFor(nodeId);
-  
-  if(!dropTabPtr.p->m_coordinatorData.m_signalCounter.done()){
-    jam();
-    return;
+  {
+    Uint32 nodeId = refToNode(prep->senderRef);
+    SafeCounter safeCounter(c_counterMgr, dropTabPtr.p->m_coordinatorData.m_counter);
+    if(!safeCounter.clearWaitingFor(nodeId))
+    {
+      jam();
+      return;
+    }
   }
   prepDropTab_nextStep(signal, dropTabPtr);
 }
@@ -7479,12 +7537,10 @@ Dbdict::execPREP_DROP_TAB_REF(Signal* si
   ndbrequire(c_opDropTable.find(dropTabPtr, prep->senderData));
   
   ndbrequire(dropTabPtr.p->m_coordinatorRef == reference());
-  ndbrequire(dropTabPtr.p->m_request.tableId == prep->tableId);
+  ndbrequire((dropTabPtr.p->m_request.tableId == prep->tableId) ||
+             (prep->errorCode == PrepDropTabRef::NF_FakeErrorREF));
   ndbrequire(dropTabPtr.p->m_coordinatorData.m_gsn == GSN_PREP_DROP_TAB_REQ);
   
-  Uint32 nodeId = refToNode(prep->senderRef);
-  dropTabPtr.p->m_coordinatorData.m_signalCounter.clearWaitingFor(nodeId);
-  
   Uint32 block = refToBlock(prep->senderRef);
   if((prep->errorCode == PrepDropTabRef::NoSuchTable && block == DBLQH) ||
      (prep->errorCode == PrepDropTabRef::NF_FakeErrorREF)){
@@ -7498,9 +7554,14 @@ Dbdict::execPREP_DROP_TAB_REF(Signal* si
     dropTabPtr.p->setErrorCode((Uint32)prep->errorCode);
   }
   
-  if(!dropTabPtr.p->m_coordinatorData.m_signalCounter.done()){
-    jam();
-    return;
+  {
+    Uint32 nodeId = refToNode(prep->senderRef);
+    SafeCounter safeCounter(c_counterMgr, dropTabPtr.p->m_coordinatorData.m_counter);
+    if(!safeCounter.clearWaitingFor(nodeId))
+    {
+      jam();
+      return;
+    }
   }
   prepDropTab_nextStep(signal, dropTabPtr);
 }
@@ -7568,8 +7629,11 @@ Dbdict::dropTableWaitGci(Signal* signal)
   req->tableId = dropTabPtr.p->m_request.tableId;
   req->requestType = dropTabPtr.p->m_requestType;
 
-  dropTabPtr.p->m_coordinatorData.m_signalCounter = c_aliveNodes;
+  
   NodeReceiverGroup rg(DBDICT, c_aliveNodes);
+  SafeCounter safeCounter(c_counterMgr,
+                          dropTabPtr.p->m_coordinatorData.m_counter);
+  safeCounter.init<DropTabRef>(rg, GSN_DROP_TAB_REF, dropTabPtr.p->key);
   sendSignal(rg, GSN_DROP_TAB_REQ, signal, 
 	     DropTabReq::SignalLength, JBB);
 }
@@ -7591,7 +7655,24 @@ Dbdict::execDROP_TAB_REF(Signal* signal)
     dropTab_localDROP_TAB_CONF(signal);
     return;
   }
-  ndbrequire(false);
+  else
+  {
+    jam();
+    ndbrequire(req->errorCode == DropTabRef::NF_FakeErrorREF);
+    DropTableRecordPtr dropTabPtr;  
+    ndbrequire(c_opDropTable.find(dropTabPtr, req->senderData));
+    ndbrequire(dropTabPtr.p->m_coordinatorRef == reference());
+    ndbrequire(dropTabPtr.p->m_coordinatorData.m_gsn == GSN_DROP_TAB_REQ);
+    
+    /* Extract tableid, and process as CONF */
+    Uint32 tableId = dropTabPtr.p->m_request.tableId;
+    DropTabConf* conf = (DropTabConf*) signal->getDataPtrSend();
+    conf->senderRef = req->senderRef;
+    conf->senderData = req->senderData;
+    conf->tableId = tableId;
+    signal->header.theLength = DropTabConf::SignalLength;
+    execDROP_TAB_CONF(signal);
+  }
 }
 
 void
@@ -7615,9 +7696,10 @@ Dbdict::execDROP_TAB_CONF(Signal* signal
   ndbrequire(dropTabPtr.p->m_coordinatorData.m_gsn == GSN_DROP_TAB_REQ);
 
   Uint32 nodeId = refToNode(req->senderRef);
-  dropTabPtr.p->m_coordinatorData.m_signalCounter.clearWaitingFor(nodeId);
-  
-  if(!dropTabPtr.p->m_coordinatorData.m_signalCounter.done()){
+  SafeCounter safeCounter(c_counterMgr,
+                          dropTabPtr.p->m_coordinatorData.m_counter);
+  if (!safeCounter.clearWaitingFor(nodeId))
+  {
     jam();
     return;
   }
@@ -7635,7 +7717,10 @@ Dbdict::execDROP_TAB_CONF(Signal* signal
   lockReq.userPtr = dropTabPtr.i;
   lockReq.userRef = reference();
   lockReq.lockType = DictLockReq::DropTableLock;
-  dict_lock_unlock(signal, &lockReq);
+  Uint32 rc = dict_lock_unlock(signal, &lockReq);
+  debugLockInfo(signal,
+                "DROP_TABLE_CONF unlock",
+                rc);
   
   c_opDropTable.release(dropTabPtr);
 }
@@ -7646,6 +7731,16 @@ Dbdict::execDROP_TAB_CONF(Signal* signal
 void
 Dbdict::execPREP_DROP_TAB_REQ(Signal* signal){
   jamEntry();
+
+  if (ERROR_INSERTED(6028))
+  {
+    jam();
+    /* Defer */
+    sendSignalWithDelay(reference(), GSN_PREP_DROP_TAB_REQ, signal,
+                        1000, signal->length());
+    return;
+  }
+
   PrepDropTabReq * prep = (PrepDropTabReq*)signal->getDataPtrSend();  
 
   DropTableRecordPtr dropTabPtr;  
@@ -7740,6 +7835,15 @@ void
 Dbdict::execDROP_TAB_REQ(Signal* signal){
   jamEntry();
   DropTabReq * req = (DropTabReq*)signal->getDataPtrSend();  
+ 
+  if (ERROR_INSERTED(6027))
+  {
+    jam();
+    /* Defer */
+    sendSignalWithDelay(reference(), GSN_DROP_TAB_REQ, signal,
+                        1000, signal->length());
+    return;
+  }
 
   DropTableRecordPtr dropTabPtr;  
   ndbrequire(c_opDropTable.find(dropTabPtr, req->senderData));
@@ -8850,10 +8954,17 @@ Dbdict::execCREATE_INDX_REQ(Signal* sign
         lockReq.lockType = DictLockReq::CreateIndexLock;
         tmperr = (CreateIndxRef::ErrorCode)dict_lock_trylock(&lockReq);
 
+        debugLockInfo(signal,
+                      "CREATE_INDEX_REQ trylock",
+                      (Uint32) tmperr);
+
         if (tmperr == 0)
         {
           jam();
-          dict_lock_unlock(0, &lockReq);
+          Uint32 rc = dict_lock_unlock(0, &lockReq);
+          debugLockInfo(signal,
+                        "CREATE_INDEX_REQ immediate unlock",
+                        rc);
         }
       }
 
@@ -9520,11 +9631,16 @@ Dbdict::execDROP_INDX_REQ(Signal* signal
         lockReq.userRef = reference();
         lockReq.lockType = DictLockReq::DropIndexLock;
         tmperr = (DropIndxRef::ErrorCode)dict_lock_trylock(&lockReq);
-
+        debugLockInfo(signal,
+                      "DROP_INDEX_REQ trylock",
+                      (Uint32) tmperr);
         if (tmperr == 0)
         {
           jam();
-          dict_lock_unlock(0, &lockReq);
+          Uint32 rc = dict_lock_unlock(0, &lockReq);
+          debugLockInfo(signal,
+                        "DROP_INDEX_REQ immediate unlock",
+                        rc);
         }
       }
 
@@ -15015,6 +15131,53 @@ Dbdict::getDictLockType(Uint32 lockType)
 }
 
 void
+Dbdict::debugLockInfo(Signal* signal, 
+                      const char* text,
+                      Uint32 rc)
+{
+  if (!g_trace)
+    return;
+  /* Trace aid: logs text, the numeric rc and a readable name for rc. */
+  const char* rctext = "Unknown result"; /* reset per call: no stale text */
+  
+  switch(rc)
+  {
+  case UtilLockRef::OK:
+    rctext = "Success";
+    break;
+  case UtilLockRef::NoSuchLock:
+    rctext = "No such lock";
+    break;
+  case UtilLockRef::OutOfLockRecords:
+    rctext = "Out of records";
+    break;
+  case UtilLockRef::DistributedLockNotSupported:
+    rctext = "Distributed lock not supported";
+    break;
+  case UtilLockRef::LockAlreadyHeld:
+    rctext = "Already held";
+    break;
+  case UtilLockRef::InLockQueue:
+    rctext = "Queued";
+    break;
+    /* try returns these... */
+  case CreateTableRef::Busy:
+    rctext = "CreateTableRef::Busy";
+    break;
+  case CreateTableRef::BusyWithNR:
+    rctext = "CreateTableRef::BusyWithNR";
+    break;
+  default:
+    break; /* unrecognised code: keep the "Unknown result" default */
+  }
+  
+  infoEvent("DICT : %s %u %s",
+            text,
+            rc,
+            rctext);
+}
+
+void
 Dbdict::sendDictLockInfoEvent(Signal*, const UtilLockReq* req, const char* text)
 {
   const Dbdict::DictLockType* lt = getDictLockType(req->extra);
@@ -15054,7 +15217,7 @@ Dbdict::execDICT_LOCK_REQ(Signal* signal
     
     c_sub_startstop_lock.set(refToNode(req.userRef));
     
-    g_eventLogger->info("granting dict lock to %u", refToNode(req.userRef));
+    g_eventLogger->info("granting SumaStartMe dict lock to %u", refToNode(req.userRef));
     DictLockConf* conf = (DictLockConf*)signal->getDataPtrSend();
     conf->userPtr = req.userPtr;
     conf->lockType = req.lockType;
@@ -15102,6 +15265,10 @@ Dbdict::execDICT_LOCK_REQ(Signal* signal
   }
   
   res = m_dict_lock.lock(m_dict_lock_pool, &lockReq, 0);
+  debugLockInfo(signal,
+                "DICT_LOCK_REQ lock",
+                res);
+
   switch(res){
   case 0:
     jam();
@@ -15115,7 +15282,8 @@ Dbdict::execDICT_LOCK_REQ(Signal* signal
     break;
   default:
     jam();
-    sendDictLockInfoEvent(signal, &lockReq, "lock request by node");    
+    sendDictLockInfoEvent(signal, &lockReq, "lock request by node");  
+    m_dict_lock.dump_queue(m_dict_lock_pool, this);
     break;
   }
   return;
@@ -15166,7 +15334,7 @@ Dbdict::execDICT_UNLOCK_ORD(Signal* sign
   if (ord->lockType ==  DictLockReq::SumaStartMe)
   {
     jam();
-    g_eventLogger->info("clearing dict lock for %u", refToNode(ord->senderRef));
+    g_eventLogger->info("clearing SumaStartMe dict lock for %u", refToNode(ord->senderRef));
     c_sub_startstop_lock.clear(refToNode(ord->senderRef));
     return;
   }
@@ -15174,8 +15342,12 @@ Dbdict::execDICT_UNLOCK_ORD(Signal* sign
   UtilLockReq lockReq;
   lockReq.senderData = req.userPtr;
   lockReq.senderRef = req.userRef;
-  lockReq.extra = DictLockReq::NodeRestartLock; // Should check...
-  Uint32 res = dict_lock_unlock(signal, &req);
+  DictLockReq::LockType lockType = DictLockReq::NodeRestartLock;
+  Uint32 res = dict_lock_unlock(signal, &req, &lockType);
+  debugLockInfo(signal,
+                "DICT_UNLOCK_ORD unlock",
+                res);
+  lockReq.extra = lockType;
   switch(res){
   case UtilUnlockRef::OK:
     jam();
@@ -15261,17 +15433,22 @@ Dbdict::dict_lock_trylock(const DictLock
     break;
   }
   
+  if (g_trace)
+    m_dict_lock.dump_queue(m_dict_lock_pool, this);
+  
   return CreateTableRef::Busy;
 }
 
 Uint32
-Dbdict::dict_lock_unlock(Signal* signal, const DictLockReq* _req)
+Dbdict::dict_lock_unlock(Signal* signal, const DictLockReq* _req,
+                         DictLockReq::LockType* type)
 {
   UtilUnlockReq req;
   req.senderData = _req->userPtr;
   req.senderRef = _req->userRef;
   
-  Uint32 res = m_dict_lock.unlock(m_dict_lock_pool, &req);
+  UtilLockReq orig_lock_req;
+  Uint32 res = m_dict_lock.unlock(m_dict_lock_pool, &req, &orig_lock_req);
   switch(res){
   case UtilUnlockRef::OK:
   case UtilUnlockRef::NotLockOwner:
@@ -15281,6 +15458,11 @@ Dbdict::dict_lock_unlock(Signal* signal,
     return res;
   }
 
+  if (type)
+  {
+    *type = (DictLockReq::LockType) orig_lock_req.extra;
+  }
+  
   UtilLockReq lockReq;
   LockQueue::Iterator iter;
   if (m_dict_lock.first(m_dict_lock_pool, iter))
@@ -15301,6 +15483,8 @@ Dbdict::dict_lock_unlock(Signal* signal,
         conf->lockType = lockReq.extra;
         sendSignal(lockReq.senderRef, GSN_DICT_LOCK_CONF, signal,
                    DictLockConf::SignalLength, JBB);
+        
+        sendDictLockInfoEvent(signal, &lockReq, "queued lock request granted for node");
       }        
       
       if (!m_dict_lock.next(iter))
@@ -15476,7 +15660,11 @@ Dbdict::execCREATE_FILE_REQ(Signal* sign
     lockReq.userPtr = trans_ptr.i;
     lockReq.userRef = reference();
     lockReq.lockType = DictLockReq::CreateFileLock;
-    if ((ref->errorCode = dict_lock_trylock(&lockReq)))
+    ref->errorCode = dict_lock_trylock(&lockReq);
+    debugLockInfo(signal,
+                  "CREATE_FILE_REQ trylock",
+                  ref->errorCode);
+    if (ref->errorCode != 0)
     {
       jam();
       ref->errorLine = __LINE__;      
@@ -15507,7 +15695,10 @@ Dbdict::execCREATE_FILE_REQ(Signal* sign
         ref->status    = 0;
         ref->errorKey  = 0;
         ref->errorLine = __LINE__;
-        dict_lock_unlock(0, &lockReq);
+        Uint32 rc = dict_lock_unlock(0, &lockReq);
+        debugLockInfo(signal,
+                      "CREATE_FILE_REQ unlock at error 1",
+                      rc);
         c_Trans.release(trans_ptr);
         break;
       }
@@ -15598,7 +15789,12 @@ Dbdict::execCREATE_FILEGROUP_REQ(Signal*
     lockReq.userPtr = trans_ptr.i;
     lockReq.userRef = reference();
     lockReq.lockType = DictLockReq::CreateFilegroupLock;
-    if ((ref->errorCode = dict_lock_trylock(&lockReq)))
+    ref->errorCode = dict_lock_trylock(&lockReq);
+    debugLockInfo(signal,
+                  "CREATE_FILEGROUP_REQ trylock",
+                  ref->errorCode);
+
+    if (ref->errorCode != 0)
     {
       jam();
       ref->errorLine = __LINE__;      
@@ -15628,7 +15824,10 @@ Dbdict::execCREATE_FILEGROUP_REQ(Signal*
         ref->status    = 0;
         ref->errorKey  = 0;
         ref->errorLine = __LINE__;
-        dict_lock_unlock(0, &lockReq);
+        Uint32 rc = dict_lock_unlock(0, &lockReq);
+        debugLockInfo(signal,
+                      "CREATE_FILEGROUP_REQ no free obj unlock",
+                      rc);
         c_Trans.release(trans_ptr);
         break;
       }
@@ -15733,7 +15932,12 @@ Dbdict::execDROP_FILE_REQ(Signal* signal
     lockReq.userPtr = trans_ptr.i;
     lockReq.userRef = reference();
     lockReq.lockType = DictLockReq::DropFileLock;
-    if ((ref->errorCode = dict_lock_trylock(&lockReq)))
+    ref->errorCode = dict_lock_trylock(&lockReq);
+    debugLockInfo(signal,
+                  "DROP_FILE_REQ trylock",
+                  ref->errorCode);
+
+    if (ref->errorCode != 0)
     {
       jam();
       ref->errorLine = __LINE__;      
@@ -15849,7 +16053,11 @@ Dbdict::execDROP_FILEGROUP_REQ(Signal* s
     lockReq.userPtr = trans_ptr.i;
     lockReq.userRef = reference();
     lockReq.lockType = DictLockReq::DropFilegroupLock;
-    if ((ref->errorCode = dict_lock_trylock(&lockReq)))
+    ref->errorCode = dict_lock_trylock(&lockReq);
+    debugLockInfo(signal,
+                  "DROP_FILEGROUP trylock",
+                  ref->errorCode);
+    if (ref->errorCode != 0)
     {
       jam();
       ref->errorLine = __LINE__;      
@@ -16134,7 +16342,10 @@ Dbdict::trans_commit_complete_done(Signa
     ndbrequire(false);
   }
   
-  dict_lock_unlock(signal, &lockReq);
+  Uint32 rc = dict_lock_unlock(signal, &lockReq);
+  debugLockInfo(signal,
+                "FILE/FILEGROUP CREATE/DROP completed unlock",
+                rc);
   c_Trans.release(trans_ptr);
   return;
 }
@@ -16258,7 +16469,10 @@ Dbdict::trans_abort_complete_done(Signal
     ndbrequire(false);
   }
   
-  dict_lock_unlock(signal, &lockReq);
+  Uint32 rc = dict_lock_unlock(signal, &lockReq);
+  debugLockInfo(signal,
+                "FILE/FILEGROUP CREATE/DROP aborted unlock",
+                rc);
   c_Trans.release(trans_ptr);
   return;
 }

=== modified file 'storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp'
--- a/storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp	2011-02-01 21:05:11 +0000
+++ b/storage/ndb/src/kernel/blocks/dbdict/Dbdict.hpp	2012-11-20 23:37:20 +0000
@@ -1190,7 +1190,7 @@ private:
     struct CoordinatorData {
       Uint32 m_gsn;
       Uint32 m_block;
-      SignalCounter m_signalCounter;
+      SafeCounterHandle m_counter;
     } m_coordinatorData;
 
     struct ParticipantData {
@@ -2618,11 +2618,15 @@ public:
   };
   static const DictLockType* getDictLockType(Uint32 lockType);
   void sendDictLockInfoEvent(Signal*, const UtilLockReq*, const char* text);
+  void debugLockInfo(Signal* signal, 
+                     const char* text,
+                     Uint32 rc);
   void removeStaleDictLocks(Signal* signal, const Uint32* theFailedNodes);
 
 
   Uint32 dict_lock_trylock(const DictLockReq* req);
-  Uint32 dict_lock_unlock(Signal* signal, const DictLockReq* req);
+  Uint32 dict_lock_unlock(Signal* signal, const DictLockReq* req, 
+                          DictLockReq::LockType* type=0);
   
   LockQueue::Pool m_dict_lock_pool;
   LockQueue m_dict_lock;

=== modified file 'storage/ndb/src/kernel/vm/LockQueue.cpp'
--- a/storage/ndb/src/kernel/vm/LockQueue.cpp	2011-02-01 21:05:11 +0000
+++ b/storage/ndb/src/kernel/vm/LockQueue.cpp	2012-11-20 23:37:20 +0000
@@ -92,7 +92,8 @@ LockQueue::lock(Pool & thePool,
 
 Uint32
 LockQueue::unlock(Pool & thePool, 
-                  const UtilUnlockReq* req)
+                  const UtilUnlockReq* req,
+                  UtilLockReq* orig_req)
 {
   const Uint32 senderRef = req->senderRef;
   const Uint32 senderData = req->senderData;
@@ -119,6 +120,11 @@ LockQueue::unlock(Pool & thePool,
         jam();
         res = UtilUnlockRef::NotLockOwner;
       }
+      
+      /* Copy out orig request if ptr supplied */
+      if (orig_req)
+        *orig_req = lockEPtr.p->m_req;
+      
       queue.release(lockEPtr);
       return res;
     }

=== modified file 'storage/ndb/src/kernel/vm/LockQueue.hpp'
--- a/storage/ndb/src/kernel/vm/LockQueue.hpp	2011-02-01 21:05:11 +0000
+++ b/storage/ndb/src/kernel/vm/LockQueue.hpp	2012-11-20 23:37:20 +0000
@@ -47,7 +47,7 @@ public:
   typedef ArrayPool<LockQueueElement> Pool;
   
   Uint32 lock(Pool&, const UtilLockReq * req, const UtilLockReq** lockOwner= 0);
-  Uint32 unlock(Pool&, const UtilUnlockReq* req);
+  Uint32 unlock(Pool&, const UtilUnlockReq* req, UtilLockReq* orig_req= 0);
   
   /**
    * After unlock

=== modified file 'storage/ndb/test/ndbapi/testDict.cpp'
--- a/storage/ndb/test/ndbapi/testDict.cpp	2011-01-30 20:42:21 +0000
+++ b/storage/ndb/test/ndbapi/testDict.cpp	2012-11-20 23:37:20 +0000
@@ -1211,6 +1211,7 @@ NF_codes[] = {
 
 int
 runNF1(NDBT_Context* ctx, NDBT_Step* step){
+
   NdbRestarter restarter;
   if(restarter.getNumDbNodes() < 2)
     return NDBT_OK;
@@ -4788,6 +4789,124 @@ runBug57057(NDBT_Context* ctx, NDBT_Step
 }
 
 
+static const char* control = "DropTabWorkerState";
+enum WorkerStates
+{
+  WS_INIT,
+  WS_IDLE,
+  WS_ACTIVE
+};
+
+int
+runDropTabWorker(NDBT_Context* ctx, NDBT_Step* step)
+{
+  while (!ctx->isTestStopped()) /* one pass per drop request, until the test is stopped */
+  {
+    ctx->setProperty(control, WS_IDLE); /* tell runDropTabNF this worker is ready / finished */
+    ctx->getPropertyWait(control, WS_ACTIVE); /* presumably blocks until runDropTabNF sets WS_ACTIVE - confirm */
+    if (ctx->isTestStopped()) /* the test may have been stopped while waiting */
+      return NDBT_OK;
+
+    Ndb* pNdb = GETNDB(step);
+    const char* tabName = ctx->getTab()->getName();
+    ndbout_c("Dropping table %s", tabName);
+    int rc = pNdb->getDictionary()->dropTable(tabName); /* result is only logged, never checked */
+    ndbout_c("Table drop return code : %d",
+             rc);
+  }
+  return NDBT_OK;
+}
+
+struct DropTabNFScenario /* one variant exercised by runDropTabNF */
+{
+  Uint32 errorCode;   /* error-insert code set in the victim node before the drop */
+  bool masterVictim;  /* true: victim is the master node; false: a random non-master */
+};
+
+static const DropTabNFScenario DropTabNFScenarios[] =
+{
+  { 6028, false }     /* Kill slave at top of PREP_DROP_TAB_REQ */
+  ,{ 6027, false }      /* Kill slave at top of DROP_TAB_REQ */
+//  ,{ 6028, true }      /* Kill master at top of PREP_DROP_TAB_REQ */
+  ,{ 6027, true }      /* Kill master at top of DROP_TAB_REQ */
+
+};
+
+int
+runDropTabNF(NDBT_Context* ctx, NDBT_Step* step)
+{
+  /* Coordinator step; for every scenario in DropTabNFScenarios:
+     1. Create table
+     2. Insert error(s) on victim node
+     3. Drop table (performed by the runDropTabWorker step)
+     4. Restart victim node
+     5. Wait for drop to complete
+     6. Wait for restart to complete
+
+     Variants
+     1.  Insert on slave/master
+     2.  Error code types
+  */
+  
+  NdbRestarter restarter;
+  const Uint32 numScenarios = sizeof(DropTabNFScenarios) / sizeof(DropTabNFScenario);
+  int numLoops = ctx->getNumLoops();
+
+  for (int r=0; r < numLoops; r++)
+  {
+    ndbout_c("**** loop %d ****", r);
+    for (Uint32 n=0; n < numScenarios; n++) /* unsigned, same type as numScenarios */
+    {
+      ndbout_c("Creating table");
+      if (runCreateTheTable(ctx, step) != NDBT_OK)
+      {
+        return NDBT_FAILED;
+      }
+      
+      Uint32 errorCode = DropTabNFScenarios[n].errorCode;
+      int victimNode = 0; /* int: waitNodesStarted() below is passed its address */
+      const char* role;
+      if (DropTabNFScenarios[n].masterVictim)
+      {
+        victimNode = restarter.getMasterNodeId();
+        role = "M";
+      }
+      else
+      {
+        victimNode = restarter.getRandomNotMasterNodeId(rand());
+        role = "S";
+      }
+      ndbout_c("Chosen victim node : %d (%s)", victimNode, role);
+      
+      restarter.insertErrorInNode(victimNode, errorCode);
+      
+      ndbout_c("Inserted error %u in node %d", errorCode, victimNode);
+      
+      ndbout_c("Requesting drop tab");
+      ctx->getPropertyWait(control, WS_IDLE);
+      ctx->setProperty(control, WS_ACTIVE);
+      
+      ndbout_c("Restarting node %d", victimNode);
+      restarter.restartOneDbNode(victimNode);
+      ndbout_c("Node restarting....");
+      
+      ndbout_c("Waiting for drop table to complete...");
+      ctx->getPropertyWait(control, WS_IDLE);
+      ndbout_c("Drop table completed");
+      
+      ndbout_c("Waiting for node to recover");
+      restarter.waitNodesStarted(&victimNode, 1);
+      ndbout_c("Node started");
+    }
+  }
+
+  ndbout_c("**** stop ****");
+  ctx->stopTest(); /* lets the runDropTabWorker loop terminate */
+
+  return NDBT_OK;
+}
+
+
 NDBT_TESTSUITE(testDict);
 TESTCASE("testDropDDObjects",
          "* 1. start cluster\n"
@@ -5051,6 +5170,13 @@ TESTCASE("Bug57057",
   TC_PROPERTY("SubSteps", 1);
   STEP(runBug58277scan);
 }
+TESTCASE("DropTabNF",
+         "Drop table and node failure causes hang")
+{
+  STEP(runDropTabWorker);
+  STEP(runDropTabNF);
+}
+
 NDBT_TESTSUITE_END(testDict);
 
 int main(int argc, const char** argv){

=== modified file 'storage/ndb/test/run-test/daily-basic-tests.txt'
--- a/storage/ndb/test/run-test/daily-basic-tests.txt	2012-10-29 18:34:05 +0000
+++ b/storage/ndb/test/run-test/daily-basic-tests.txt	2012-11-20 23:37:20 +0000
@@ -1557,3 +1557,7 @@ max-time : 1200
 cmd: testLimits
 args: -n SlowDihFileWrites T1
 
+max-time: 600
+cmd: testDict
+args: -n DropTabNF -l 3 T1
+

No bundle (reason: useless for push emails).
Thread
bzr push into mysql-5.1-telco-6.3 branch (frazer.clement:3509 to 3510)Bug#14787522Frazer Clement21 Nov