From:
Date: November 7 2007 8:57pm
Subject: bk commit into 5.0 tree (jonas:1.2476) BUG#32160
List-Archive: http://lists.mysql.com/commits/37291
X-Bug: 32160
Message-Id: <20071107195725.E47C27D6C20@perch.ndb.mysql.com>

Below is the list of changes that have just been committed into a local
5.0 repository of jonas. When jonas does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet@stripped, 2007-11-07 20:57:21+01:00, jonas@stripped +5 -0
  ndb - bug#32160
    (recommit to 5.0)
    fix lcp master take over bug

  ndb/src/kernel/blocks/ERROR_codes.txt@stripped, 2007-11-07 20:57:19+01:00, jonas@stripped +6 -1
    new error codes

  ndb/src/kernel/blocks/dbdih/Dbdih.hpp@stripped, 2007-11-07 20:57:19+01:00, jonas@stripped +10 -0
    add debug code

  ndb/src/kernel/blocks/dbdih/DbdihMain.cpp@stripped, 2007-11-07 20:57:19+01:00, jonas@stripped +54 -6
    fix master lcp bug
    add 2 new error codes

  ndb/test/ndbapi/testNodeRestart.cpp@stripped, 2007-11-07 20:57:19+01:00, jonas@stripped +48 -0
    testcase

  ndb/test/run-test/daily-basic-tests.txt@stripped, 2007-11-07 20:57:19+01:00, jonas@stripped +4 -0
    testcase

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User: jonas # Host: perch.ndb.mysql.com # Root: /home/jonas/src/mysql-5.0-ndb --- 1.56/ndb/test/run-test/daily-basic-tests.txt 2007-11-07 20:57:25 +01:00 +++ 1.57/ndb/test/run-test/daily-basic-tests.txt 2007-11-07 20:57:25 +01:00 @@ -497,6 +497,10 @@ cmd: testNodeRestart args: -n Bug26481 T1 +max-time: 300 +cmd: testNodeRestart +args: -n Bug32160 T1 + # OLD FLEX max-time: 500 cmd: flexBench --- 1.30/ndb/src/kernel/blocks/ERROR_codes.txt 2007-11-07 20:57:25 +01:00 +++ 1.31/ndb/src/kernel/blocks/ERROR_codes.txt 2007-11-07 20:57:25 +01:00 @@ -5,7 +5,7 @@ Next DBTUP 4014 Next DBLQH 5043 Next DBDICT 6007 -Next DBDIH 7183 +Next DBDIH 7195 Next DBTC 8052 Next CMVMI 9000 Next BACKUP 10022 @@ -72,6 +72,11 @@ 7177: Delay copying of sysfileData in execCOPY_GCIREQ 7180: Crash master during master-take-over in execMASTER_LCPCONF + +7193: Dont send LCP_FRAG_ORD to self, and crash when sending first + LCP_FRAG_ORD(last) + +7194: Force removeNodeFromStored to complete in the middle of MASTER_LCPCONF ERROR CODES FOR TESTING NODE FAILURE, LOCAL CHECKPOINT HANDLING: ----------------------------------------------------------------- --- 1.20/ndb/src/kernel/blocks/dbdih/Dbdih.hpp 2007-11-07 20:57:25 +01:00 +++ 1.21/ndb/src/kernel/blocks/dbdih/Dbdih.hpp 2007-11-07 20:57:25 +01:00 @@ -1291,7 +1291,17 @@ LcpStatus lcpStatus; Uint32 lcpStatusUpdatedPlace; + struct Save { + LcpStatus m_status; + Uint32 m_place; + } m_saveState[10]; + void setLcpStatus(LcpStatus status, Uint32 line){ + for (Uint32 i = 9; i > 0; i--) + m_saveState[i] = m_saveState[i-1]; + m_saveState[0].m_status = lcpStatus; + m_saveState[0].m_place = lcpStatusUpdatedPlace; + lcpStatus = status; lcpStatusUpdatedPlace = line; } --- 1.73/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2007-11-07 20:57:25 +01:00 +++ 1.74/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2007-11-07 20:57:25 +01:00 @@ -4764,11 +4764,19 @@ } jam(); - signal->theData[0] = DihContinueB::ZREMOVE_NODE_FROM_TABLE; - signal->theData[1] = failedNodePtr.i; - 
signal->theData[2] = 0; // Tab id - sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB); - + + if (!ERROR_INSERTED(7194)) + { + signal->theData[0] = DihContinueB::ZREMOVE_NODE_FROM_TABLE; + signal->theData[1] = failedNodePtr.i; + signal->theData[2] = 0; // Tab id + sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB); + } + else + { + ndbout_c("7194 Not starting ZREMOVE_NODE_FROM_TABLE"); + } + setLocalNodefailHandling(signal, failedNodePtr.i, NF_REMOVE_NODE_FROM_TABLE); }//Dbdih::startRemoveFailedNode() @@ -5676,12 +5684,22 @@ signal->theData[0] = 7012; execDUMP_STATE_ORD(signal); + + if (ERROR_INSERTED(7194)) + { + ndbout_c("7194 starting ZREMOVE_NODE_FROM_TABLE"); + signal->theData[0] = DihContinueB::ZREMOVE_NODE_FROM_TABLE; + signal->theData[1] = c_lcpMasterTakeOverState.failedNodeId; + signal->theData[2] = 0; // Tab id + sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB); + } c_lcpMasterTakeOverState.set(LMTOS_INITIAL, __LINE__); MasterLCPReq * const req = (MasterLCPReq *)&signal->theData[0]; req->masterRef = reference(); req->failedNodeId = c_lcpMasterTakeOverState.failedNodeId; sendLoopMacro(MASTER_LCPREQ, sendMASTER_LCPREQ); + } else { sendMASTER_LCPCONF(signal); } @@ -5998,6 +6016,15 @@ { const MasterLCPConf * const conf = (MasterLCPConf *)&signal->theData[0]; jamEntry(); + + if (ERROR_INSERTED(7194)) + { + ndbout_c("delaying MASTER_LCPCONF due to error 7194"); + sendSignalWithDelay(reference(), GSN_MASTER_LCPCONF, signal, + 300, signal->getLength()); + return; + } + Uint32 senderNodeId = conf->senderNodeId; MasterLCPConf::State lcpState = (MasterLCPConf::State)conf->lcpState; const Uint32 failedNodeId = conf->failedNodeId; @@ -6132,7 +6159,6 @@ #endif c_lcpState.keepGci = SYSFILE->keepGCI; - c_lcpState.setLcpStatus(LCP_START_LCP_ROUND, __LINE__); startLcpRoundLoopLab(signal, 0, 0); break; } @@ -9924,6 +9950,8 @@ if(ERROR_INSERTED(7075)){ continue; } + + CRASH_INSERTION(7193); BlockReference ref = calcLqhBlockRef(nodePtr.i); sendSignal(ref, 
GSN_LCP_FRAG_ORD, signal,LcpFragOrd::SignalLength, JBB); } @@ -10121,6 +10149,13 @@ CRASH_INSERTION2(7017, !isMaster()); c_lcpState.setLcpStatus(LCP_TAB_COMPLETED, __LINE__); + + if (ERROR_INSERTED(7194)) + { + ndbout_c("CLEARING 7194"); + CLEAR_ERROR_INSERT_VALUE; + } + return true; } @@ -10276,6 +10311,11 @@ BlockReference ref = calcLqhBlockRef(replicaPtr.p->procNode); + if (ERROR_INSERTED(7193) && replicaPtr.p->procNode == getOwnNodeId()) + { + return; + } + LcpFragOrd * const lcpFragOrd = (LcpFragOrd *)&signal->theData[0]; lcpFragOrd->tableId = info.tableId; lcpFragOrd->fragmentId = info.fragId; @@ -13686,6 +13726,14 @@ ("immediateLcpStart = %d masterLcpNodeId = %d", c_lcpState.immediateLcpStart, refToNode(c_lcpState.m_masterLcpDihRef)); + + for (Uint32 i = 0; i<10; i++) + { + infoEvent("%u : status: %u place: %u", i, + c_lcpState.m_saveState[i].m_status, + c_lcpState.m_saveState[i].m_place); + } + infoEvent("-- Node %d LCP STATE --", getOwnNodeId()); } --- 1.32/ndb/test/ndbapi/testNodeRestart.cpp 2007-11-07 20:57:25 +01:00 +++ 1.33/ndb/test/ndbapi/testNodeRestart.cpp 2007-11-07 20:57:25 +01:00 @@ -1347,6 +1347,51 @@ return NDBT_OK; } +int +runBug32160(NDBT_Context* ctx, NDBT_Step* step) +{ + int result = NDBT_OK; + int loops = ctx->getNumLoops(); + int records = ctx->getNumRecords(); + Ndb* pNdb = GETNDB(step); + NdbRestarter res; + + if (res.getNumDbNodes() < 2) + { + return NDBT_OK; + } + + int master = res.getMasterNodeId(); + int next = res.getNextMasterNodeId(master); + + if (res.insertErrorInNode(next, 7194)) + { + return NDBT_FAILED; + } + + int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 }; + if (res.dumpStateOneNode(master, val2, 2)) + return NDBT_FAILED; + + if (res.insertErrorInNode(master, 7193)) + return NDBT_FAILED; + + int val3[] = { 7099 }; + if (res.dumpStateOneNode(master, val3, 1)) + return NDBT_FAILED; + + if (res.waitNodesNoStart(&master, 1)) + return NDBT_FAILED; + + if (res.startNodes(&master, 1)) + return NDBT_FAILED; + + 
if (res.waitClusterStarted()) + return NDBT_FAILED; + + return NDBT_OK; +} + NDBT_TESTSUITE(testNodeRestart); TESTCASE("NoLoad", "Test that one node at a time can be stopped and then restarted "\ @@ -1685,6 +1730,9 @@ } TESTCASE("Bug29364", ""){ INITIALIZER(runBug29364); +} +TESTCASE("Bug32160", ""){ + INITIALIZER(runBug32160); } NDBT_TESTSUITE_END(testNodeRestart);