List:Commits« Previous MessageNext Message »
From:Frazer Clement Date:May 31 2012 3:31pm
Subject:bzr push into mysql-5.1-telco-7.0 branch (frazer.clement:4930 to 4931)
Bug#14083116
View as plain text  
 4931 Frazer Clement	2012-05-31
      Bug#14083116 (Bad error handling in LQH corrupts transid hash)
      
      Early exits when processing LQHKEYREQ signals could result in 
      corruption of the LQH transid hash, due to the wrong error handling
      path being used.
      
      This patch fixes the early exits, and adds assertions to the transid
      hash code to help catch such problems quicker in future.
      
      Testcase to be added on merge to cluster-7.2.

    modified:
      storage/ndb/src/kernel/blocks/dblqh/Dblqh.hpp
      storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp
 4930 Frazer Clement	2012-05-29
      Bug #14075825 LCP Watchdog : Fragment scan check - use specific error
      
      This patch extends the previous patch implementing the 
      LCP Fragment scan watchdog.
      
      This patch adds a specific ndbd/mtd process return code for the
      LCP Fragment scan watchdog failure scenario.
      
      This can allow external monitoring frameworks to detect this scenario
      at the process execution level and take specific steps. 

    modified:
      storage/ndb/include/mgmapi/ndbd_exit_codes.h
      storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp
      storage/ndb/src/kernel/error/ndbd_exit_codes.c
=== modified file 'storage/ndb/src/kernel/blocks/dblqh/Dblqh.hpp'
--- a/storage/ndb/src/kernel/blocks/dblqh/Dblqh.hpp	2012-05-21 22:27:28 +0000
+++ b/storage/ndb/src/kernel/blocks/dblqh/Dblqh.hpp	2012-05-31 14:42:04 +0000
@@ -2632,9 +2632,9 @@ private:
   bool checkTransporterOverloaded(Signal* signal,
                                   const NodeBitmask& all,
                                   const class LqhKeyReq* req);
-  void noFreeRecordLab(Signal* signal, 
-		       const class LqhKeyReq * lqhKeyReq, 
-		       Uint32 errorCode);
+  void earlyKeyReqAbort(Signal* signal, 
+                        const class LqhKeyReq * lqhKeyReq, 
+                        Uint32 errorCode);
   void logLqhkeyrefLab(Signal* signal);
   void closeCopyLab(Signal* signal);
   void commitReplyLab(Signal* signal);

=== modified file 'storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp'
--- a/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp	2012-05-29 17:04:36 +0000
+++ b/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp	2012-05-31 14:42:04 +0000
@@ -2973,22 +2973,53 @@ void Dblqh::execTIME_SIGNAL(Signal* sign
 /* INVOLVE COMMUNICATION WITH ACC AND TUP.                                   */
 /* ######################################################################### */
 
-void Dblqh::noFreeRecordLab(Signal* signal, 
-			    const LqhKeyReq * lqhKeyReq,
-			    Uint32 errCode) 
+/**
+ * earlyKeyReqAbort
+ *
+ * Exit early from handling an LQHKEYREQ request.
+ * Method determines which resources (if any) need freed, then
+ * signals requestor with error response.
+ * * Verify all required resources are freed if adding new callers *
+ */
+void Dblqh::earlyKeyReqAbort(Signal* signal, 
+                             const LqhKeyReq * lqhKeyReq,
+                             Uint32 errCode) 
 {
   jamEntry();
   const Uint32 transid1  = lqhKeyReq->transId1;
   const Uint32 transid2  = lqhKeyReq->transId2;
   const Uint32 reqInfo   = lqhKeyReq->requestInfo;
   
-  if(errCode == ZNO_FREE_MARKER_RECORDS_ERROR ||
-     errCode == ZNODE_SHUTDOWN_IN_PROGESS ||
-     errCode == ZNODE_FAILURE_ERROR){
+  bool tcConnectRecAllocated = (tcConnectptr.i != RNIL);
+
+  if (tcConnectRecAllocated)
+  {
     jam();
+    
+    /* Could have a commit-ack marker allocated. */
+    remove_commit_marker(tcConnectptr.p);
+    
+    /* Could have long key/attr sections linked */
+    ndbrequire(tcConnectptr.p->m_dealloc == 0);
+    releaseOprec(signal);
+    
+    
+    /* 
+     * Free the TcConnectRecord, ensuring that the
+     * table reference counts have not been incremented and
+     * so will not be decremented.
+     * Also verify that we're not present in the transid 
+     * hash
+     */
+    ndbrequire(tcConnectptr.p->tableref == RNIL);
+    /* Following is not 100% check, but a reasonable guard */
+    ndbrequire(tcConnectptr.p->nextHashRec == RNIL);
+    ndbrequire(tcConnectptr.p->prevHashRec == RNIL);
     releaseTcrec(signal, tcConnectptr);
   }
 
+  /* Now perform signalling */
+
   if (LqhKeyReq::getDirtyFlag(reqInfo) && 
       LqhKeyReq::getOperation(reqInfo) == ZREAD &&
       !LqhKeyReq::getNormalProtocolFlag(reqInfo)){
@@ -3030,7 +3061,7 @@ void Dblqh::noFreeRecordLab(Signal* sign
 	       LqhKeyRef::SignalLength, JBB);
   }//if
   return;
-}//Dblqh::noFreeRecordLab()
+}//Dblqh::earlyKeyReqAbort()
 
 Uint32
 Dblqh::get_table_state_error(Ptr<Tablerec> tabPtr) const
@@ -4349,10 +4380,11 @@ void Dblqh::execSIGNAL_DROPPED_REP(Signa
      * We will notify the client that their LQHKEYREQ
      * failed
      */
+    tcConnectptr.i = RNIL;
     const LqhKeyReq * const truncatedLqhKeyReq = 
       (LqhKeyReq *) &rep->originalData[0];
     
-    noFreeRecordLab(signal, truncatedLqhKeyReq, ZGET_DATAREC_ERROR);
+    earlyKeyReqAbort(signal, truncatedLqhKeyReq, ZGET_DATAREC_ERROR);
 
     break;
   }
@@ -4407,6 +4439,7 @@ void Dblqh::execLQHKEYREQ(Signal* signal
 
   const LqhKeyReq * const lqhKeyReq = (LqhKeyReq *)signal->getDataPtr();
   SectionHandle handle(this, signal);
+  tcConnectptr.i = RNIL;
 
   {
     const NodeBitmask& all = globalTransporterRegistry.get_status_overloaded();
@@ -4419,7 +4452,7 @@ void Dblqh::execLQHKEYREQ(Signal* signal
          */
         jam();
         releaseSections(handle);
-        noFreeRecordLab(signal, lqhKeyReq, ZTRANSPORTER_OVERLOADED_ERROR);
+        earlyKeyReqAbort(signal, lqhKeyReq, ZTRANSPORTER_OVERLOADED_ERROR);
         return;
       }
     }
@@ -4429,7 +4462,7 @@ void Dblqh::execLQHKEYREQ(Signal* signal
   {
     jam();
     releaseSections(handle);
-    noFreeRecordLab(signal, lqhKeyReq, ZTRANSPORTER_OVERLOADED_ERROR);
+    earlyKeyReqAbort(signal, lqhKeyReq, ZTRANSPORTER_OVERLOADED_ERROR);
     return;
   }
 
@@ -4442,7 +4475,7 @@ void Dblqh::execLQHKEYREQ(Signal* signal
 /* NO FREE TC RECORD AVAILABLE, THUS WE CANNOT HANDLE THE REQUEST.           */
 /* ------------------------------------------------------------------------- */
     releaseSections(handle);
-    noFreeRecordLab(signal, lqhKeyReq, ZNO_TC_CONNECT_ERROR);
+    earlyKeyReqAbort(signal, lqhKeyReq, ZNO_TC_CONNECT_ERROR);
     return;
   }//if
 
@@ -4488,14 +4521,14 @@ void Dblqh::execLQHKEYREQ(Signal* signal
   const Uint8 op = LqhKeyReq::getOperation(Treqinfo);
   if ((op == ZREAD || op == ZREAD_EX) && !getAllowRead()){
     releaseSections(handle);
-    noFreeRecordLab(signal, lqhKeyReq, ZNODE_SHUTDOWN_IN_PROGESS);
+    earlyKeyReqAbort(signal, lqhKeyReq, ZNODE_SHUTDOWN_IN_PROGESS);
     return;
   }
 
   if (unlikely(get_node_status(refToNode(sig5)) != ZNODE_UP))
   {
     releaseSections(handle);
-    noFreeRecordLab(signal, lqhKeyReq, ZNODE_FAILURE_ERROR);
+    earlyKeyReqAbort(signal, lqhKeyReq, ZNODE_FAILURE_ERROR);
     return;
   }
   
@@ -4553,7 +4586,7 @@ void Dblqh::execLQHKEYREQ(Signal* signal
       if (markerPtr.i == RNIL)
       {
         releaseSections(handle);
-        noFreeRecordLab(signal, lqhKeyReq, ZNO_FREE_MARKER_RECORDS_ERROR);
+        earlyKeyReqAbort(signal, lqhKeyReq, ZNO_FREE_MARKER_RECORDS_ERROR);
         return;
       }
       markerPtr.p->transid1 = sig1;
@@ -4708,8 +4741,7 @@ void Dblqh::execLQHKEYREQ(Signal* signal
     if (unlikely(!ok))
     {
       jam();
-      terrorCode= ZGET_DATAREC_ERROR;
-      abortErrorLab(signal);
+      earlyKeyReqAbort(signal, lqhKeyReq, ZGET_DATAREC_ERROR);
       return;
     }
 
@@ -4725,8 +4757,9 @@ void Dblqh::execLQHKEYREQ(Signal* signal
     {
       jam();
       ndbassert(! LqhKeyReq::getNrCopyFlag(Treqinfo));
-      terrorCode = ZNO_TUPLE_FOUND;
-      abortErrorLab(signal);
+      
+      /* Reply with NO_TUPLE_FOUND */
+      earlyKeyReqAbort(signal, lqhKeyReq, ZNO_TUPLE_FOUND);
       return;
     }
 
@@ -4784,6 +4817,9 @@ void Dblqh::execLQHKEYREQ(Signal* signal
       return;
     }//if
   }//if
+  /* Check that no equal element exists */
+  ndbassert(findTransaction(regTcPtr->transid[0], regTcPtr->transid[1], 
+                            regTcPtr->tcOprec, 0) == ZNOT_FOUND);
   TcConnectionrecPtr localNextTcConnectptr;
   Uint32 hashIndex = (regTcPtr->transid[0] ^ regTcPtr->tcOprec) & 1023;
   localNextTcConnectptr.i = ctransidHash[hashIndex];
@@ -4797,6 +4833,7 @@ void Dblqh::execLQHKEYREQ(Signal* signal
     ptrCheckGuard(localNextTcConnectptr, 
                   ctcConnectrecFileSize, tcConnectionrec);
     jam();
+    ndbassert(localNextTcConnectptr.p->prevHashRec == RNIL);
     localNextTcConnectptr.p->prevHashRec = tcConnectptr.i;
   }//if
   if (tabptr.i >= ctabrecFileSize) {
@@ -7244,9 +7281,12 @@ void Dblqh::deleteTransidHash(Signal* si
 
   prevHashptr.i = regTcPtr->prevHashRec;
   nextHashptr.i = regTcPtr->nextHashRec;
+  /* prevHashptr and nextHashptr may be RNIL when the bucket has 1 element */
+
   if (prevHashptr.i != RNIL) {
     jam();
     ptrCheckGuard(prevHashptr, ctcConnectrecFileSize, tcConnectionrec);
+    ndbassert(prevHashptr.p->nextHashRec == tcConnectptr.i);
     prevHashptr.p->nextHashRec = nextHashptr.i;
   } else {
     jam();
@@ -7255,11 +7295,13 @@ void Dblqh::deleteTransidHash(Signal* si
 /* A NEW LEADER OF THE LIST.                                                 */
 /* ------------------------------------------------------------------------- */
     Uint32 hashIndex = (regTcPtr->transid[0] ^ regTcPtr->tcOprec) & 1023;
+    ndbassert(ctransidHash[hashIndex] == tcConnectptr.i);
     ctransidHash[hashIndex] = nextHashptr.i;
   }//if
   if (nextHashptr.i != RNIL) {
     jam();
     ptrCheckGuard(nextHashptr, ctcConnectrecFileSize, tcConnectionrec);
+    ndbassert(nextHashptr.p->prevHashRec == tcConnectptr.i);
     nextHashptr.p->prevHashRec = prevHashptr.i;
   }//if
 
@@ -10448,6 +10490,11 @@ void Dblqh::execSCAN_FRAGREQ(Signal* sig
   }//if
   cbookedAccOps += max_rows;
 
+  /* Check that no equal element already exists */
+  ndbassert(findTransaction(tcConnectptr.p->transid[0],
+                            tcConnectptr.p->transid[1],
+                            tcConnectptr.p->tcOprec,
+                            senderHi) == ZNOT_FOUND);
   hashIndex = (tcConnectptr.p->transid[0] ^ tcConnectptr.p->tcOprec) & 1023;
   nextHashptr.i = ctransidHash[hashIndex];
   ctransidHash[hashIndex] = tcConnectptr.i;
@@ -10460,6 +10507,7 @@ void Dblqh::execSCAN_FRAGREQ(Signal* sig
      *   IF IT EXISTS
      * --------------------------------------------------------------------- */
     ptrCheckGuard(nextHashptr, ctcConnectrecFileSize, tcConnectionrec);
+    ndbassert(nextHashptr.p->prevHashRec == RNIL);
     nextHashptr.p->prevHashRec = tcConnectptr.i;
   }//if
   if ((! isLongReq ) && 
@@ -20800,6 +20848,8 @@ void Dblqh::initialiseTcrec(Signal* sign
       tcConnectptr.p->attrInfoIVal = RNIL;
       tcConnectptr.p->m_flags= 0;
       tcConnectptr.p->tcTimer = 0;
+      tcConnectptr.p->nextHashRec = RNIL;
+      tcConnectptr.p->prevHashRec = RNIL;
       tcConnectptr.p->nextTcConnectrec = tcConnectptr.i + 1;
     }//for
     tcConnectptr.i = ctcConnectrecFileSize - 1;
@@ -23007,10 +23057,12 @@ Dblqh::execDUMP_STATE_ORD(Signal* signal
   Uint32 ttcConnectrecFileSize = ctcConnectrecFileSize;
   if(arg == 2306)
   {
+    Uint32 bucketLen[1024];
     for(Uint32 i = 0; i<1024; i++)
     {
       TcConnectionrecPtr tcRec;
       tcRec.i = ctransidHash[i];
+      bucketLen[i] = 0;
       while(tcRec.i != RNIL)
       {
 	ptrCheckGuard(tcRec, ttcConnectrecFileSize, regTcConnectionrec);
@@ -23019,8 +23071,18 @@ Dblqh::execDUMP_STATE_ORD(Signal* signal
 	signal->theData[1] = tcRec.i;
 	execDUMP_STATE_ORD(signal);
 	tcRec.i = tcRec.p->nextHashRec;
+        bucketLen[i]++;
+      }
+    }
+    ndbout << "LQH transid hash bucket lengths : " << endl;
+    for (Uint32 i = 0; i < 1024; i++)
+    {
+      if (bucketLen[i] > 0)
+      {
+        ndbout << " bucket " << i << " len " << bucketLen[i] << endl;
       }
     }
+    ndbout << "Done." << endl;
   }
 
   if(arg == 2307 || arg == 2308)

No bundle (reason: useless for push emails).
Thread
bzr push into mysql-5.1-telco-7.0 branch (frazer.clement:4930 to 4931)Bug#14083116Frazer Clement31 May