From: Date: April 14 2005 1:43pm Subject: bk commit into 4.1 tree (joreland:1.2178) BUG#9891 List-Archive: http://lists.mysql.com/internals/24017 X-Bug: 9891 Message-Id: <20050414114313.6F83D4BF4D@eel.ndb.mysql.com.ndb.mysql.com.ndb.mysql.com.ndb.mysql.com> Below is the list of changes that have just been committed into a local 4.1 repository of jonas. When jonas does a push these changes will be propagated to the main repository and, within 24 hours after the push, to the public repository. For information on how to access the public repository see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html ChangeSet 1.2178 05/04/14 13:43:07 joreland@stripped +3 -0 BUG#9891 - ndb lcp Crash if ACC_CONTOPREQ was sent while ACC_LCPCONF was in job buffer if ACC_LCPCONF would have arrived eariler (before TUP_LCPSTARTED) operations could lockup. But would be restarted on next LCP -- LQH 1) Better check for LCP started that will also return true if ACC or TUP already has completed 2) Remove incorrect if statement that prevented operations to be started if ACC has completed -- ACC Make sure all ACC_CONTOPCONF are sent before releasing lcp record i.e. use noOfLcpConf == 4 (2 ACC_LCPCONF + 2 ACC_CONTOPCONF) Check for == 4 also when sending ACC_CONTOPCONF ndb/src/kernel/blocks/dblqh/DblqhMain.cpp 1.50 05/04/14 13:43:05 joreland@stripped +25 -41 1) Better check for LCP started that will also return true if ACC or TUP already has completed 2) Remove incorrect if statement that prevented operations to be started if ACC has completed ndb/src/kernel/blocks/dblqh/Dblqh.hpp 1.26 05/04/14 13:43:05 joreland@stripped +1 -2 Remove LCP_STARTED state ndb/src/kernel/blocks/dbacc/DbaccMain.cpp 1.30 05/04/14 13:43:05 joreland@stripped +18 -2 Make sure all ACC_CONTOPCONF are sent before releasing lcp record i.e. use noOfLcpConf == 4 (2 ACC_LCPCONF + 2 ACC_CONTOPCONF) Check for == 4 also when sending ACC_CONTOPCONF # This is a BitKeeper patch. What follows are the unified diffs for the # set of deltas contained in the patch. The rest of the patch, the part # that BitKeeper cares about, is below these diffs. # User: joreland # Host: eel.ndb.mysql.com.ndb.mysql.com.ndb.mysql.com.ndb.mysql.com # Root: /home/jonas/src/mysql-4.1 --- 1.29/ndb/src/kernel/blocks/dbacc/DbaccMain.cpp Wed Apr 13 09:54:37 2005 +++ 1.30/ndb/src/kernel/blocks/dbacc/DbaccMain.cpp Thu Apr 14 13:43:05 2005 @@ -8486,7 +8486,7 @@ break; }//switch lcpConnectptr.p->noOfLcpConf++; - ndbrequire(lcpConnectptr.p->noOfLcpConf <= 2); + ndbrequire(lcpConnectptr.p->noOfLcpConf <= 4); fragrecptr.p->fragState = ACTIVEFRAG; rlpPageptr.i = fragrecptr.p->zeroPagePtr; ptrCheckGuard(rlpPageptr, cpagesize, page8); @@ -8504,7 +8504,7 @@ }//for signal->theData[0] = fragrecptr.p->lcpLqhPtr; sendSignal(lcpConnectptr.p->lcpUserblockref, GSN_ACC_LCPCONF, signal, 1, JBB); - if (lcpConnectptr.p->noOfLcpConf == 2) { + if (lcpConnectptr.p->noOfLcpConf == 4) { jam(); releaseLcpConnectRec(signal); rootfragrecptr.i = fragrecptr.p->myroot; @@ -8535,6 +8535,13 @@ /* LOCAL FRAG ID */ tresult = 0; ptrCheckGuard(lcpConnectptr, clcpConnectsize, lcpConnectrec); + if(ERROR_INSERTED(3002) && lcpConnectptr.p->noOfLcpConf < 2) + { + sendSignalWithDelay(cownBlockref, GSN_ACC_CONTOPREQ, signal, 300, + signal->getLength()); + return; + } + ndbrequire(lcpConnectptr.p->lcpstate == LCP_ACTIVE); rootfragrecptr.i = lcpConnectptr.p->rootrecptr; ptrCheckGuard(rootfragrecptr, crootfragmentsize, rootfragmentrec); @@ -8568,6 +8575,15 @@ }//while signal->theData[0] = fragrecptr.p->lcpLqhPtr; sendSignal(lcpConnectptr.p->lcpUserblockref, GSN_ACC_CONTOPCONF, signal, 1, JBA); + + lcpConnectptr.p->noOfLcpConf++; + if (lcpConnectptr.p->noOfLcpConf == 4) { + jam(); + releaseLcpConnectRec(signal); + rootfragrecptr.i = fragrecptr.p->myroot; + ptrCheckGuard(rootfragrecptr, crootfragmentsize, rootfragmentrec); + rootfragrecptr.p->rootState = ACTIVEROOT; + }//if return; /* ALL QUEUED OPERATION ARE RESTARTED IF NEEDED. */ }//Dbacc::execACC_CONTOPREQ() --- 1.25/ndb/src/kernel/blocks/dblqh/Dblqh.hpp Wed Nov 24 16:47:06 2004 +++ 1.26/ndb/src/kernel/blocks/dblqh/Dblqh.hpp Thu Apr 14 13:43:05 2005 @@ -968,7 +968,6 @@ enum LcpState { LCP_IDLE = 0, - LCP_STARTED = 1, LCP_COMPLETED = 2, LCP_WAIT_FRAGID = 3, LCP_WAIT_TUP_PREPLCP = 4, @@ -2266,7 +2265,7 @@ void sendCopyActiveConf(Signal* signal,Uint32 tableId); void checkLcpCompleted(Signal* signal); void checkLcpHoldop(Signal* signal); - void checkLcpStarted(Signal* signal); + bool checkLcpStarted(Signal* signal); void checkLcpTupprep(Signal* signal); void getNextFragForLcp(Signal* signal); void initLcpLocAcc(Signal* signal, Uint32 fragId); --- 1.49/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp Sun Feb 6 10:00:26 2005 +++ 1.50/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp Thu Apr 14 13:43:05 2005 @@ -10351,8 +10351,8 @@ void Dblqh::lcpStartedLab(Signal* signal) { - checkLcpStarted(signal); - if (lcpPtr.p->lcpState == LcpRecord::LCP_STARTED) { + if (checkLcpStarted(signal)) + { jam(); /* ---------------------------------------------------------------------- * THE LOCAL CHECKPOINT HAS BEEN STARTED. IT IS NOW TIME TO @@ -10432,26 +10432,7 @@ lcpPtr.i = signal->theData[1]; ptrCheckGuard(lcpPtr, clcpFileSize, lcpRecord); ndbrequire(fragptr.p->fragStatus == Fragrecord::BLOCKED); - if (lcpPtr.p->lcpState == LcpRecord::LCP_STARTED) { - jam(); - /***********************************************************************/ - /* THIS SIGNAL CAN ONLY BE RECEIVED WHEN FRAGMENT IS BLOCKED AND - * THE LOCAL CHECKPOINT HAS BEEN STARTED. THE BLOCKING WILL BE - * REMOVED AS SOON AS ALL OPERATIONS HAVE BEEN STARTED. - ***********************************************************************/ - restartOperationsLab(signal); - } else if (lcpPtr.p->lcpState == LcpRecord::LCP_BLOCKED_COMP) { - jam(); - /*******************************************************************> - * THE CHECKPOINT IS COMPLETED BUT HAS NOT YET STARTED UP - * ALL OPERATIONS AGAIN. - * WE PERFORM THIS START-UP BEFORE CONTINUING WITH THE NEXT - * FRAGMENT OF THE LOCAL CHECKPOINT TO AVOID ANY STRANGE ERRORS. - *******************************************************************> */ - restartOperationsLab(signal); - } else { - ndbrequire(false); - } + restartOperationsLab(signal); }//Dblqh::execLQH_RESTART_OP() void Dblqh::restartOperationsLab(Signal* signal) @@ -11000,7 +10981,8 @@ * * SUBROUTINE SHORT NAME = CLS * ========================================================================== */ -void Dblqh::checkLcpStarted(Signal* signal) +bool +Dblqh::checkLcpStarted(Signal* signal) { LcpLocRecordPtr clsLcpLocptr; @@ -11010,7 +10992,7 @@ do { ptrCheckGuard(clsLcpLocptr, clcpLocrecFileSize, lcpLocRecord); if (clsLcpLocptr.p->lcpLocstate == LcpLocRecord::ACC_WAIT_STARTED){ - return; + return false; }//if clsLcpLocptr.i = clsLcpLocptr.p->nextLcpLoc; i++; @@ -11021,12 +11003,13 @@ do { ptrCheckGuard(clsLcpLocptr, clcpLocrecFileSize, lcpLocRecord); if (clsLcpLocptr.p->lcpLocstate == LcpLocRecord::TUP_WAIT_STARTED){ - return; + return false; }//if clsLcpLocptr.i = clsLcpLocptr.p->nextLcpLoc; i++; } while (clsLcpLocptr.i != RNIL); - lcpPtr.p->lcpState = LcpRecord::LCP_STARTED; + + return true; }//Dblqh::checkLcpStarted() /* ========================================================================== @@ -11187,20 +11170,12 @@ do { ptrCheckGuard(sacLcpLocptr, clcpLocrecFileSize, lcpLocRecord); sacLcpLocptr.p->accContCounter = 0; - if(sacLcpLocptr.p->lcpLocstate == LcpLocRecord::ACC_STARTED){ - /* ------------------------------------------------------------------- */ - /*SEND START OPERATIONS TO ACC AGAIN */ - /* ------------------------------------------------------------------- */ - signal->theData[0] = lcpPtr.p->lcpAccptr; - signal->theData[1] = sacLcpLocptr.p->locFragid; - sendSignal(fragptr.p->accBlockref, GSN_ACC_CONTOPREQ, signal, 2, JBA); - count++; - } else if(sacLcpLocptr.p->lcpLocstate == LcpLocRecord::ACC_COMPLETED){ - signal->theData[0] = sacLcpLocptr.i; - sendSignal(reference(), GSN_ACC_CONTOPCONF, signal, 1, JBB); - } else { - ndbrequire(false); - } + /* ------------------------------------------------------------------- */ + /*SEND START OPERATIONS TO ACC AGAIN */ + /* ------------------------------------------------------------------- */ + signal->theData[0] = lcpPtr.p->lcpAccptr; + signal->theData[1] = sacLcpLocptr.p->locFragid; + sendSignal(fragptr.p->accBlockref, GSN_ACC_CONTOPREQ, signal, 2, JBA); sacLcpLocptr.i = sacLcpLocptr.p->nextLcpLoc; } while (sacLcpLocptr.i != RNIL); @@ -11236,9 +11211,18 @@ signal->theData[0] = stlLcpLocptr.i; signal->theData[1] = cownref; signal->theData[2] = stlLcpLocptr.p->tupRef; - sendSignal(fragptr.p->tupBlockref, GSN_TUP_LCPREQ, signal, 3, JBA); + if(ERROR_INSERTED(5077)) + sendSignalWithDelay(fragptr.p->tupBlockref, GSN_TUP_LCPREQ, + signal, 5000, 3); + else + sendSignal(fragptr.p->tupBlockref, GSN_TUP_LCPREQ, signal, 3, JBA); stlLcpLocptr.i = stlLcpLocptr.p->nextLcpLoc; } while (stlLcpLocptr.i != RNIL); + + if(ERROR_INSERTED(5077)) + { + ndbout_c("Delayed TUP_LCPREQ with 5 sec"); + } }//Dblqh::sendStartLcp() /* ------------------------------------------------------------------------- */