From: Frazer Clement Date: May 31 2012 3:31pm Subject: bzr push into mysql-5.1-telco-7.0 branch (frazer.clement:4930 to 4931) Bug#14083116 List-Archive: http://lists.mysql.com/commits/144058 X-Bug: 14083116 Message-Id: <201205311531.q4VFVmYs023598@acsmt356.oracle.com> MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit 4931 Frazer Clement 2012-05-31 Bug#14083116 (Bad error handling in LQH corrupts transid hash) Early exits when processing LQHKEYREQ signals could result in corruption of the LQH transid hash, due to the wrong error handling path being used. This patch fixes the early exits, and adds assertions to the transid hash code to help catch such problems quicker in future. Testcase to be added on merge to cluster-7.2. modified: storage/ndb/src/kernel/blocks/dblqh/Dblqh.hpp storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp 4930 Frazer Clement 2012-05-29 Bug #14075825 LCP Watchdog : Fragment scan check - use specific error This patch extends the previous patch implementing the LCP Fragment scan watchdog. This patch adds a specific ndbd/mtd process return code for the LCP Fragment scan watchdog failure scenario. This can allow external monitoring frameworks to detect this scenario at the process execution level and take specific steps. modified: storage/ndb/include/mgmapi/ndbd_exit_codes.h storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp storage/ndb/src/kernel/error/ndbd_exit_codes.c === modified file 'storage/ndb/src/kernel/blocks/dblqh/Dblqh.hpp' --- a/storage/ndb/src/kernel/blocks/dblqh/Dblqh.hpp 2012-05-21 22:27:28 +0000 +++ b/storage/ndb/src/kernel/blocks/dblqh/Dblqh.hpp 2012-05-31 14:42:04 +0000 @@ -2632,9 +2632,9 @@ private: bool checkTransporterOverloaded(Signal* signal, const NodeBitmask& all, const class LqhKeyReq* req); - void noFreeRecordLab(Signal* signal, - const class LqhKeyReq * lqhKeyReq, - Uint32 errorCode); + void earlyKeyReqAbort(Signal* signal, + const class LqhKeyReq * lqhKeyReq, + Uint32 errorCode); void logLqhkeyrefLab(Signal* signal); void closeCopyLab(Signal* signal); void commitReplyLab(Signal* signal); === modified file 'storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp' --- a/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp 2012-05-29 17:04:36 +0000 +++ b/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp 2012-05-31 14:42:04 +0000 @@ -2973,22 +2973,53 @@ void Dblqh::execTIME_SIGNAL(Signal* sign /* INVOLVE COMMUNICATION WITH ACC AND TUP. */ /* ######################################################################### */ -void Dblqh::noFreeRecordLab(Signal* signal, - const LqhKeyReq * lqhKeyReq, - Uint32 errCode) +/** + * earlyKeyReqAbort + * + * Exit early from handling an LQHKEYREQ request. + * Method determines which resources (if any) need freed, then + * signals requestor with error response. + * * Verify all required resources are freed if adding new callers * + */ +void Dblqh::earlyKeyReqAbort(Signal* signal, + const LqhKeyReq * lqhKeyReq, + Uint32 errCode) { jamEntry(); const Uint32 transid1 = lqhKeyReq->transId1; const Uint32 transid2 = lqhKeyReq->transId2; const Uint32 reqInfo = lqhKeyReq->requestInfo; - if(errCode == ZNO_FREE_MARKER_RECORDS_ERROR || - errCode == ZNODE_SHUTDOWN_IN_PROGESS || - errCode == ZNODE_FAILURE_ERROR){ + bool tcConnectRecAllocated = (tcConnectptr.i != RNIL); + + if (tcConnectRecAllocated) + { jam(); + + /* Could have a commit-ack marker allocated. */ + remove_commit_marker(tcConnectptr.p); + + /* Could have long key/attr sections linked */ + ndbrequire(tcConnectptr.p->m_dealloc == 0); + releaseOprec(signal); + + + /* + * Free the TcConnectRecord, ensuring that the + * table reference counts have not been incremented and + * so will not be decremented. + * Also verify that we're not present in the transid + * hash + */ + ndbrequire(tcConnectptr.p->tableref == RNIL); + /* Following is not 100% check, but a reasonable guard */ + ndbrequire(tcConnectptr.p->nextHashRec == RNIL); + ndbrequire(tcConnectptr.p->prevHashRec == RNIL); releaseTcrec(signal, tcConnectptr); } + /* Now perform signalling */ + if (LqhKeyReq::getDirtyFlag(reqInfo) && LqhKeyReq::getOperation(reqInfo) == ZREAD && !LqhKeyReq::getNormalProtocolFlag(reqInfo)){ @@ -3030,7 +3061,7 @@ void Dblqh::noFreeRecordLab(Signal* sign LqhKeyRef::SignalLength, JBB); }//if return; -}//Dblqh::noFreeRecordLab() +}//Dblqh::earlyKeyReqAbort() Uint32 Dblqh::get_table_state_error(Ptr tabPtr) const @@ -4349,10 +4380,11 @@ void Dblqh::execSIGNAL_DROPPED_REP(Signa * We will notify the client that their LQHKEYREQ * failed */ + tcConnectptr.i = RNIL; const LqhKeyReq * const truncatedLqhKeyReq = (LqhKeyReq *) &rep->originalData[0]; - noFreeRecordLab(signal, truncatedLqhKeyReq, ZGET_DATAREC_ERROR); + earlyKeyReqAbort(signal, truncatedLqhKeyReq, ZGET_DATAREC_ERROR); break; } @@ -4407,6 +4439,7 @@ void Dblqh::execLQHKEYREQ(Signal* signal const LqhKeyReq * const lqhKeyReq = (LqhKeyReq *)signal->getDataPtr(); SectionHandle handle(this, signal); + tcConnectptr.i = RNIL; { const NodeBitmask& all = globalTransporterRegistry.get_status_overloaded(); @@ -4419,7 +4452,7 @@ void Dblqh::execLQHKEYREQ(Signal* signal */ jam(); releaseSections(handle); - noFreeRecordLab(signal, lqhKeyReq, ZTRANSPORTER_OVERLOADED_ERROR); + earlyKeyReqAbort(signal, lqhKeyReq, ZTRANSPORTER_OVERLOADED_ERROR); return; } } @@ -4429,7 +4462,7 @@ void Dblqh::execLQHKEYREQ(Signal* signal { jam(); releaseSections(handle); - noFreeRecordLab(signal, lqhKeyReq, ZTRANSPORTER_OVERLOADED_ERROR); + earlyKeyReqAbort(signal, lqhKeyReq, ZTRANSPORTER_OVERLOADED_ERROR); return; } @@ -4442,7 +4475,7 @@ void Dblqh::execLQHKEYREQ(Signal* signal /* NO FREE TC RECORD AVAILABLE, THUS WE CANNOT HANDLE THE REQUEST. */ /* ------------------------------------------------------------------------- */ releaseSections(handle); - noFreeRecordLab(signal, lqhKeyReq, ZNO_TC_CONNECT_ERROR); + earlyKeyReqAbort(signal, lqhKeyReq, ZNO_TC_CONNECT_ERROR); return; }//if @@ -4488,14 +4521,14 @@ void Dblqh::execLQHKEYREQ(Signal* signal const Uint8 op = LqhKeyReq::getOperation(Treqinfo); if ((op == ZREAD || op == ZREAD_EX) && !getAllowRead()){ releaseSections(handle); - noFreeRecordLab(signal, lqhKeyReq, ZNODE_SHUTDOWN_IN_PROGESS); + earlyKeyReqAbort(signal, lqhKeyReq, ZNODE_SHUTDOWN_IN_PROGESS); return; } if (unlikely(get_node_status(refToNode(sig5)) != ZNODE_UP)) { releaseSections(handle); - noFreeRecordLab(signal, lqhKeyReq, ZNODE_FAILURE_ERROR); + earlyKeyReqAbort(signal, lqhKeyReq, ZNODE_FAILURE_ERROR); return; } @@ -4553,7 +4586,7 @@ void Dblqh::execLQHKEYREQ(Signal* signal if (markerPtr.i == RNIL) { releaseSections(handle); - noFreeRecordLab(signal, lqhKeyReq, ZNO_FREE_MARKER_RECORDS_ERROR); + earlyKeyReqAbort(signal, lqhKeyReq, ZNO_FREE_MARKER_RECORDS_ERROR); return; } markerPtr.p->transid1 = sig1; @@ -4708,8 +4741,7 @@ void Dblqh::execLQHKEYREQ(Signal* signal if (unlikely(!ok)) { jam(); - terrorCode= ZGET_DATAREC_ERROR; - abortErrorLab(signal); + earlyKeyReqAbort(signal, lqhKeyReq, ZGET_DATAREC_ERROR); return; } @@ -4725,8 +4757,9 @@ void Dblqh::execLQHKEYREQ(Signal* signal { jam(); ndbassert(! LqhKeyReq::getNrCopyFlag(Treqinfo)); - terrorCode = ZNO_TUPLE_FOUND; - abortErrorLab(signal); + + /* Reply with NO_TUPLE_FOUND */ + earlyKeyReqAbort(signal, lqhKeyReq, ZNO_TUPLE_FOUND); return; } @@ -4784,6 +4817,9 @@ void Dblqh::execLQHKEYREQ(Signal* signal return; }//if }//if + /* Check that no equal element exists */ + ndbassert(findTransaction(regTcPtr->transid[0], regTcPtr->transid[1], + regTcPtr->tcOprec, 0) == ZNOT_FOUND); TcConnectionrecPtr localNextTcConnectptr; Uint32 hashIndex = (regTcPtr->transid[0] ^ regTcPtr->tcOprec) & 1023; localNextTcConnectptr.i = ctransidHash[hashIndex]; @@ -4797,6 +4833,7 @@ void Dblqh::execLQHKEYREQ(Signal* signal ptrCheckGuard(localNextTcConnectptr, ctcConnectrecFileSize, tcConnectionrec); jam(); + ndbassert(localNextTcConnectptr.p->prevHashRec == RNIL); localNextTcConnectptr.p->prevHashRec = tcConnectptr.i; }//if if (tabptr.i >= ctabrecFileSize) { @@ -7244,9 +7281,12 @@ void Dblqh::deleteTransidHash(Signal* si prevHashptr.i = regTcPtr->prevHashRec; nextHashptr.i = regTcPtr->nextHashRec; + /* prevHashptr and nextHashptr may be RNIL when the bucket has 1 element */ + if (prevHashptr.i != RNIL) { jam(); ptrCheckGuard(prevHashptr, ctcConnectrecFileSize, tcConnectionrec); + ndbassert(prevHashptr.p->nextHashRec == tcConnectptr.i); prevHashptr.p->nextHashRec = nextHashptr.i; } else { jam(); @@ -7255,11 +7295,13 @@ void Dblqh::deleteTransidHash(Signal* si /* A NEW LEADER OF THE LIST. */ /* ------------------------------------------------------------------------- */ Uint32 hashIndex = (regTcPtr->transid[0] ^ regTcPtr->tcOprec) & 1023; + ndbassert(ctransidHash[hashIndex] == tcConnectptr.i); ctransidHash[hashIndex] = nextHashptr.i; }//if if (nextHashptr.i != RNIL) { jam(); ptrCheckGuard(nextHashptr, ctcConnectrecFileSize, tcConnectionrec); + ndbassert(nextHashptr.p->prevHashRec == tcConnectptr.i); nextHashptr.p->prevHashRec = prevHashptr.i; }//if @@ -10448,6 +10490,11 @@ void Dblqh::execSCAN_FRAGREQ(Signal* sig }//if cbookedAccOps += max_rows; + /* Check that no equal element already exists */ + ndbassert(findTransaction(tcConnectptr.p->transid[0], + tcConnectptr.p->transid[1], + tcConnectptr.p->tcOprec, + senderHi) == ZNOT_FOUND); hashIndex = (tcConnectptr.p->transid[0] ^ tcConnectptr.p->tcOprec) & 1023; nextHashptr.i = ctransidHash[hashIndex]; ctransidHash[hashIndex] = tcConnectptr.i; @@ -10460,6 +10507,7 @@ void Dblqh::execSCAN_FRAGREQ(Signal* sig * IF IT EXISTS * --------------------------------------------------------------------- */ ptrCheckGuard(nextHashptr, ctcConnectrecFileSize, tcConnectionrec); + ndbassert(nextHashptr.p->prevHashRec == RNIL); nextHashptr.p->prevHashRec = tcConnectptr.i; }//if if ((! isLongReq ) && @@ -20800,6 +20848,8 @@ void Dblqh::initialiseTcrec(Signal* sign tcConnectptr.p->attrInfoIVal = RNIL; tcConnectptr.p->m_flags= 0; tcConnectptr.p->tcTimer = 0; + tcConnectptr.p->nextHashRec = RNIL; + tcConnectptr.p->prevHashRec = RNIL; tcConnectptr.p->nextTcConnectrec = tcConnectptr.i + 1; }//for tcConnectptr.i = ctcConnectrecFileSize - 1; @@ -23007,10 +23057,12 @@ Dblqh::execDUMP_STATE_ORD(Signal* signal Uint32 ttcConnectrecFileSize = ctcConnectrecFileSize; if(arg == 2306) { + Uint32 bucketLen[1024]; for(Uint32 i = 0; i<1024; i++) { TcConnectionrecPtr tcRec; tcRec.i = ctransidHash[i]; + bucketLen[i] = 0; while(tcRec.i != RNIL) { ptrCheckGuard(tcRec, ttcConnectrecFileSize, regTcConnectionrec); @@ -23019,8 +23071,18 @@ Dblqh::execDUMP_STATE_ORD(Signal* signal signal->theData[1] = tcRec.i; execDUMP_STATE_ORD(signal); tcRec.i = tcRec.p->nextHashRec; + bucketLen[i]++; + } + } + ndbout << "LQH transid hash bucket lengths : " << endl; + for (Uint32 i = 0; i < 1024; i++) + { + if (bucketLen[i] > 0) + { + ndbout << " bucket " << i << " len " << bucketLen[i] << endl; } } + ndbout << "Done." << endl; } if(arg == 2307 || arg == 2308) No bundle (reason: useless for push emails).