From: Ole John Aske Date: October 31 2011 10:02am Subject: bzr push into mysql-5.1-telco-7.0-spj-scan-vs-scan branch (ole.john.aske:3577 to 3578) List-Archive: http://lists.mysql.com/commits/141671 Message-Id: <20111031100232.78B54233@fimafeng09.norway.sun.com> MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit 3578 Ole John Aske 2011-10-31 [merge] Merge telco-7.0 -> telco-7.0-spj-scan-scan modified: storage/ndb/src/kernel/blocks/ERROR_codes.txt storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp storage/ndb/src/kernel/blocks/dbspj/DbspjMain.cpp storage/ndb/test/ndbapi/testNodeRestart.cpp storage/ndb/test/run-test/daily-basic-tests.txt 3577 Ole John Aske 2011-10-28 [merge] merge telco-7.0 -> 7.0-spj-scan-scan added: storage/ndb/include/kernel/statedesc.hpp storage/ndb/src/kernel/blocks/dblqh/DblqhStateDesc.cpp storage/ndb/src/kernel/blocks/dbtc/DbtcStateDesc.cpp modified: mysql-test/suite/ndb/r/ndbinfo.result mysql-test/suite/ndb/t/ndbinfo.test sql/ha_ndbcluster_connection.cc storage/ndb/src/kernel/blocks/dblqh/Dblqh.hpp storage/ndb/src/kernel/blocks/dbtc/Dbtc.hpp storage/ndb/src/ndbapi/NdbQueryOperation.cpp storage/ndb/tools/CMakeLists.txt storage/ndb/tools/Makefile.am storage/ndb/tools/ndbinfo_sql.cpp === modified file 'storage/ndb/src/kernel/blocks/ERROR_codes.txt' --- a/storage/ndb/src/kernel/blocks/ERROR_codes.txt 2011-06-07 12:19:47 +0000 +++ b/storage/ndb/src/kernel/blocks/ERROR_codes.txt 2011-10-31 10:01:23 +0000 @@ -18,7 +18,7 @@ Next NDBCNTR 1002 Next NDBFS 2000 Next DBACC 3002 Next DBTUP 4035 -Next DBLQH 5072 +Next DBLQH 5074 Next DBDICT 6026 Next DBDIH 7229 Next DBTC 8092 === modified file 'storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp' --- a/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2011-10-28 09:04:10 +0000 +++ b/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2011-10-31 10:01:23 +0000 @@ -2372,6 +2372,17 @@ void Dbdih::execSTART_PERMREQ(Signal* si CRASH_INSERTION(7122); ndbrequire(isMaster()); ndbrequire(refToNode(retRef) == nodeId); + if (c_lcpMasterTakeOverState.state != LMTOS_IDLE) + { + jam(); + infoEvent("DIH : Denied request for start permission from %u " + "while LCP Master takeover in progress.", + nodeId); + signal->theData[0] = nodeId; + signal->theData[1] = StartPermRef::ZNODE_START_DISALLOWED_ERROR; + sendSignal(retRef, GSN_START_PERMREF, signal, 2, JBB); + return; + } if ((c_nodeStartMaster.activeState) || (c_nodeStartMaster.wait != ZFALSE) || ERROR_INSERTED_CLEAR(7175)) { === modified file 'storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp' --- a/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp 2011-10-20 12:51:03 +0000 +++ b/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp 2011-10-31 10:01:23 +0000 @@ -13788,6 +13788,15 @@ void Dblqh::execBACKUP_FRAGMENT_REF(Sign void Dblqh::execBACKUP_FRAGMENT_CONF(Signal* signal) { jamEntry(); + + if (ERROR_INSERTED(5073)) + { + ndbout_c("Delaying BACKUP_FRAGMENT_CONF"); + sendSignalWithDelay(reference(), GSN_BACKUP_FRAGMENT_CONF, signal, 500, + signal->getLength()); + return; + } + //BackupFragmentConf* conf= (BackupFragmentConf*)signal->getDataPtr(); lcpPtr.i = 0; === modified file 'storage/ndb/src/kernel/blocks/dbspj/DbspjMain.cpp' --- a/storage/ndb/src/kernel/blocks/dbspj/DbspjMain.cpp 2011-10-28 09:04:10 +0000 +++ b/storage/ndb/src/kernel/blocks/dbspj/DbspjMain.cpp 2011-10-31 10:01:23 +0000 @@ -2705,6 +2705,23 @@ Dbspj::lookup_build(Build_context& ctx, const QN_LookupParameters * param = (const QN_LookupParameters*)qp; do { + err = DbspjErr::InvalidTreeNodeSpecification; + if (unlikely(node->len < QN_LookupNode::NodeSize)) + { + jam(); + DEBUG_CRASH(); + break; + } + + err = DbspjErr::InvalidTreeParametersSpecification; + DEBUG("param len: " << param->len); + if (unlikely(param->len < QN_LookupParameters::NodeSize)) + { + jam(); + DEBUG_CRASH(); + break; + } + err = createNode(ctx, requestPtr, treeNodePtr); if (unlikely(err != 0)) { @@ -2756,13 +2773,6 @@ Dbspj::lookup_build(Build_context& ctx, dst->requestInfo = requestInfo; } - err = DbspjErr::InvalidTreeNodeSpecification; - if (unlikely(node->len < QN_LookupNode::NodeSize)) - { - DEBUG_CRASH(); - break; - } - if (treeBits & QN_LookupNode::L_UNIQUE_INDEX) { jam(); @@ -2775,14 +2785,6 @@ Dbspj::lookup_build(Build_context& ctx, Uint32 tableSchemaVersion = tableId + ((schemaVersion << 16) & 0xFFFF0000); dst->tableSchemaVersion = tableSchemaVersion; - err = DbspjErr::InvalidTreeParametersSpecification; - DEBUG("param len: " << param->len); - if (unlikely(param->len < QN_LookupParameters::NodeSize)) - { - DEBUG_CRASH(); - break; - } - ctx.m_resultData = param->resultData; treeNodePtr.p->m_lookup_data.m_api_resultRef = ctx.m_resultRef; treeNodePtr.p->m_lookup_data.m_api_resultData = param->resultData; @@ -3765,6 +3767,24 @@ Dbspj::scanFrag_build(Build_context& ctx do { + err = DbspjErr::InvalidTreeNodeSpecification; + DEBUG("scanFrag_build: len=" << node->len); + if (unlikely(node->len < QN_ScanFragNode::NodeSize)) + { + jam(); + DEBUG_CRASH(); + break; + } + + err = DbspjErr::InvalidTreeParametersSpecification; + DEBUG("param len: " << param->len); + if (unlikely(param->len < QN_ScanFragParameters::NodeSize)) + { + jam(); + DEBUG_CRASH(); + break; + } + err = createNode(ctx, requestPtr, treeNodePtr); if (unlikely(err != 0)) break; @@ -3810,24 +3830,9 @@ Dbspj::scanFrag_build(Build_context& ctx (treeBits & DABits::NI_LINKED_DISK) == 0 && (paramBits & DABits::PI_DISK_ATTR) == 0); dst->requestInfo = requestInfo; - - err = DbspjErr::InvalidTreeNodeSpecification; - DEBUG("scanFrag_build: len=" << node->len); - if (unlikely(node->len < QN_ScanFragNode::NodeSize)) - break; - dst->tableId = node->tableId; dst->schemaVersion = node->tableVersion; - err = DbspjErr::InvalidTreeParametersSpecification; - DEBUG("param len: " << param->len); - if (unlikely(param->len < QN_ScanFragParameters::NodeSize)) - { - jam(); - DEBUG_CRASH(); - break; - } - ctx.m_resultData = param->resultData; /** @@ -4315,6 +4320,24 @@ Dbspj::scanIndex_build(Build_context& ct do { + err = DbspjErr::InvalidTreeNodeSpecification; + DEBUG("scanIndex_build: len=" << node->len); + if (unlikely(node->len < QN_ScanIndexNode::NodeSize)) + { + jam(); + DEBUG_CRASH(); + break; + } + + err = DbspjErr::InvalidTreeParametersSpecification; + DEBUG("param len: " << param->len); + if (unlikely(param->len < QN_ScanIndexParameters::NodeSize)) + { + jam(); + DEBUG_CRASH(); + break; + } + err = createNode(ctx, requestPtr, treeNodePtr); if (unlikely(err != 0)) break; @@ -4355,24 +4378,9 @@ Dbspj::scanIndex_build(Build_context& ct (paramBits & DABits::PI_DISK_ATTR) == 0); ScanFragReq::setCorrFactorFlag(requestInfo, 1); dst->requestInfo = requestInfo; - - err = DbspjErr::InvalidTreeNodeSpecification; - DEBUG("scanIndex_build: len=" << node->len); - if (unlikely(node->len < QN_ScanIndexNode::NodeSize)) - break; - dst->tableId = node->tableId; dst->schemaVersion = node->tableVersion; - err = DbspjErr::InvalidTreeParametersSpecification; - DEBUG("param len: " << param->len); - if (unlikely(param->len < QN_ScanIndexParameters::NodeSize)) - { - jam(); - DEBUG_CRASH(); - break; - } - ctx.m_resultData = param->resultData; /** @@ -5735,7 +5743,7 @@ Dbspj::scanIndex_execSCAN_NEXTREQ(Signal DEBUG("scanIndex_execSCAN_NEXTREQ to: " << hex << treeNodePtr.p->m_send.m_ref - << ", m_node_no=" << treeNodePtr.p->m_node_no + << ", m_node_no=" << treeNodePtr.p->m_node_no << ", senderData: " << req->senderData); #ifdef DEBUG_SCAN_FRAGREQ === modified file 'storage/ndb/test/ndbapi/testNodeRestart.cpp' --- a/storage/ndb/test/ndbapi/testNodeRestart.cpp 2011-10-17 13:54:09 +0000 +++ b/storage/ndb/test/ndbapi/testNodeRestart.cpp 2011-10-28 14:17:25 +0000 @@ -4757,6 +4757,125 @@ int runSplitLatency25PctFail(NDBT_Contex return NDBT_OK; } +int +runMasterFailSlowLCP(NDBT_Context* ctx, NDBT_Step* step) +{ + /* Motivated by bug# 13323589 */ + NdbRestarter res; + + if (res.getNumDbNodes() < 4) + { + return NDBT_OK; + } + + int master = res.getMasterNodeId(); + int otherVictim = res.getRandomNodeOtherNodeGroup(master, rand()); + int nextMaster = res.getNextMasterNodeId(master); + nextMaster = (nextMaster == otherVictim) ? res.getNextMasterNodeId(otherVictim) : + nextMaster; + assert(nextMaster != master); + assert(nextMaster != otherVictim); + + /* Get a node which is not current or next master */ + int slowNode= nextMaster; + while ((slowNode == nextMaster) || + (slowNode == otherVictim) || + (slowNode == master)) + { + slowNode = res.getRandomNotMasterNodeId(rand()); + } + + ndbout_c("master: %d otherVictim : %d nextMaster: %d slowNode: %d", + master, + otherVictim, + nextMaster, + slowNode); + + /* Steps : + * 1. Insert slow LCP frag error in slowNode + * 2. Start LCP + * 3. Wait for LCP to start + * 4. Kill at least two nodes including Master + * 5. Wait for killed nodes to attempt to rejoin + * 6. Remove slow LCP error + * 7. Allow system to stabilise + check no errors + */ + // 5073 = Delay on handling BACKUP_FRAGMENT_CONF in LQH + if (res.insertErrorInNode(slowNode, 5073)) + { + return NDBT_FAILED; + } + + { + int req[1] = {DumpStateOrd::DihStartLcpImmediately}; + if (res.dumpStateOneNode(master, req, 1)) + { + return NDBT_FAILED; + } + } + + ndbout_c("Giving LCP time to start..."); + + NdbSleep_SecSleep(10); + + ndbout_c("Killing other victim node (%u)...", otherVictim); + + if (res.restartOneDbNode(otherVictim, false, false, true)) + { + return NDBT_FAILED; + } + + ndbout_c("Killing Master node (%u)...", master); + + if (res.restartOneDbNode(master, false, false, true)) + { + return NDBT_FAILED; + } + + /* + ndbout_c("Waiting for old Master node to enter NoStart state..."); + if (res.waitNodesNoStart(&master, 1, 10)) + return NDBT_FAILED; + + ndbout_c("Starting old Master..."); + if (res.startNodes(&master, 1)) + return NDBT_FAILED; + + */ + ndbout_c("Waiting for some progress on old Master and other victim restart"); + NdbSleep_SecSleep(15); + + ndbout_c("Now removing error insert on slow node (%u)", slowNode); + + if (res.insertErrorInNode(slowNode, 0)) + { + return NDBT_FAILED; + } + + ndbout_c("Now wait a while to check stability..."); + NdbSleep_SecSleep(30); + + if (res.getNodeStatus(master) == NDB_MGM_NODE_STATUS_NOT_STARTED) + { + ndbout_c("Old Master needs kick to restart"); + if (res.startNodes(&master, 1)) + { + return NDBT_FAILED; + } + } + + ndbout_c("Wait for cluster recovery..."); + if (res.waitClusterStarted()) + { + return NDBT_FAILED; + } + + + ndbout_c("Done"); + return NDBT_OK; +} + + NDBT_TESTSUITE(testNodeRestart); TESTCASE("NoLoad", "Test that one node at a time can be stopped and then restarted "\ @@ -5288,6 +5407,11 @@ TESTCASE("Bug57522", "") { INITIALIZER(runBug57522); } +TESTCASE("MasterFailSlowLCP", + "DIH Master failure during a slow LCP can cause a crash.") +{ + INITIALIZER(runMasterFailSlowLCP); +} TESTCASE("ForceStopAndRestart", "Test restart and stop -with force flag") { STEP(runForceStopAndRestart); === modified file 'storage/ndb/test/run-test/daily-basic-tests.txt' --- a/storage/ndb/test/run-test/daily-basic-tests.txt 2011-10-20 12:51:03 +0000 +++ b/storage/ndb/test/run-test/daily-basic-tests.txt 2011-10-31 10:01:23 +0000 @@ -1835,3 +1835,8 @@ max-time 1800 cmd: testNdbApi args: -n TestFragmentedSend T1 +max-time: 300 +cmd: testNodeRestart +args: -nMasterFailSlowLCP T1 + + No bundle (reason: useless for push emails).