List:Commits« Previous MessageNext Message »
From:Frazer Clement Date:October 28 2011 2:19pm
Subject:bzr push into mysql-5.1-telco-7.0 branch (frazer.clement:4628 to 4629)
View as plain text  
 4629 Frazer Clement	2011-10-28 [merge]
      Merge 6.3->7.0

    modified:
      storage/ndb/src/kernel/blocks/ERROR_codes.txt
      storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp
      storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp
      storage/ndb/test/ndbapi/testNodeRestart.cpp
      storage/ndb/test/run-test/daily-basic-tests.txt
 4628 Ole John Aske	2011-10-28
      
      Fix for handling errors during execute and retrieve of SPJ results
      from a pushed lookup query:
      
      If we missed to catch an error returned by 
      NdbTransaction::execute() or available through
      NdbQuery::getErrorCode() a later ::fetchNext()
      will succeed without returning an error code. It could also return an
      incorrect result consisting of a partial results set from those NdbOperation
      which did return something before the failure was received
      
      This fix will reuse the same error handling mechanism for a lookup
      which is already used by a scan query.
      
      This will put the query into a persistent 'failed' state where any
      further operations (except ::close()) on that query will return
      the error :
      
      "ERROR: 4816 A previous query operation failed, which you missed to catch." 

    modified:
      storage/ndb/src/ndbapi/NdbQueryOperation.cpp
=== modified file 'storage/ndb/src/kernel/blocks/ERROR_codes.txt'
--- a/storage/ndb/src/kernel/blocks/ERROR_codes.txt	2011-05-25 15:03:11 +0000
+++ b/storage/ndb/src/kernel/blocks/ERROR_codes.txt	2011-10-28 14:17:25 +0000
@@ -18,7 +18,7 @@ Next NDBCNTR 1002
 Next NDBFS 2000
 Next DBACC 3002
 Next DBTUP 4035
-Next DBLQH 5072
+Next DBLQH 5074
 Next DBDICT 6026
 Next DBDIH 7229
 Next DBTC 8092

=== modified file 'storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp'
--- a/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp	2011-10-23 08:34:49 +0000
+++ b/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp	2011-10-28 14:17:25 +0000
@@ -2372,6 +2372,17 @@ void Dbdih::execSTART_PERMREQ(Signal* si
   CRASH_INSERTION(7122);
   ndbrequire(isMaster());
   ndbrequire(refToNode(retRef) == nodeId);
+  if (c_lcpMasterTakeOverState.state != LMTOS_IDLE)
+  {
+    jam();
+    infoEvent("DIH : Denied request for start permission from %u "
+              "while LCP Master takeover in progress.",
+              nodeId);
+    signal->theData[0] = nodeId;
+    signal->theData[1] = StartPermRef::ZNODE_START_DISALLOWED_ERROR;
+    sendSignal(retRef, GSN_START_PERMREF, signal, 2, JBB);
+    return;
+  }
   if ((c_nodeStartMaster.activeState) ||
       (c_nodeStartMaster.wait != ZFALSE) ||
       ERROR_INSERTED_CLEAR(7175)) {

=== modified file 'storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp'
--- a/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp	2011-10-17 13:32:49 +0000
+++ b/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp	2011-10-28 14:17:25 +0000
@@ -13788,6 +13788,15 @@ void Dblqh::execBACKUP_FRAGMENT_REF(Sign
 void Dblqh::execBACKUP_FRAGMENT_CONF(Signal* signal) 
 {
   jamEntry();
+
+  if (ERROR_INSERTED(5073))
+  {
+    ndbout_c("Delaying BACKUP_FRAGMENT_CONF");
+    sendSignalWithDelay(reference(), GSN_BACKUP_FRAGMENT_CONF, signal, 500,
+                        signal->getLength());
+    return;
+  }
+
   //BackupFragmentConf* conf= (BackupFragmentConf*)signal->getDataPtr();
 
   lcpPtr.i = 0;

=== modified file 'storage/ndb/test/ndbapi/testNodeRestart.cpp'
--- a/storage/ndb/test/ndbapi/testNodeRestart.cpp	2011-10-17 13:54:09 +0000
+++ b/storage/ndb/test/ndbapi/testNodeRestart.cpp	2011-10-28 14:17:25 +0000
@@ -4757,6 +4757,125 @@ int runSplitLatency25PctFail(NDBT_Contex
   return NDBT_OK;
 }
 
+int
+runMasterFailSlowLCP(NDBT_Context* ctx, NDBT_Step* step)
+{
+  /* Motivated by bug# 13323589 */
+  NdbRestarter res;
+
+  if (res.getNumDbNodes() < 4)
+  {
+    return NDBT_OK;
+  }
+
+  int master = res.getMasterNodeId();
+  int otherVictim = res.getRandomNodeOtherNodeGroup(master, rand());
+  int nextMaster = res.getNextMasterNodeId(master);
+  nextMaster = (nextMaster == otherVictim) ? res.getNextMasterNodeId(otherVictim) :
+    nextMaster;
+  assert(nextMaster != master);
+  assert(nextMaster != otherVictim);
+
+  /* Get a node which is not current or next master */
+  int slowNode= nextMaster;
+  while ((slowNode == nextMaster) ||
+         (slowNode == otherVictim) ||
+         (slowNode == master))
+  {
+    slowNode = res.getRandomNotMasterNodeId(rand());
+  }
+
+  ndbout_c("master: %d otherVictim : %d nextMaster: %d slowNode: %d",
+           master,
+           otherVictim,
+           nextMaster,
+           slowNode);
+
+  /* Steps :
+   * 1. Insert slow LCP frag error in slowNode
+   * 2. Start LCP
+   * 3. Wait for LCP to start
+   * 4. Kill at least two nodes including Master
+   * 5. Wait for killed nodes to attempt to rejoin
+   * 6. Remove slow LCP error
+   * 7. Allow system to stabilise + check no errors
+   */
+  // 5073 = Delay on handling BACKUP_FRAGMENT_CONF in LQH
+  if (res.insertErrorInNode(slowNode, 5073))
+  {
+    return NDBT_FAILED;
+  }
+
+  {
+    int req[1] = {DumpStateOrd::DihStartLcpImmediately};
+    if (res.dumpStateOneNode(master, req, 1))
+    {
+      return NDBT_FAILED;
+    }
+  }
+
+  ndbout_c("Giving LCP time to start...");
+
+  NdbSleep_SecSleep(10);
+
+  ndbout_c("Killing other victim node (%u)...", otherVictim);
+
+  if (res.restartOneDbNode(otherVictim, false, false, true))
+  {
+    return NDBT_FAILED;
+  }
+
+  ndbout_c("Killing Master node (%u)...", master);
+
+  if (res.restartOneDbNode(master, false, false, true))
+  {
+    return NDBT_FAILED;
+  }
+
+  /*
+     ndbout_c("Waiting for old Master node to enter NoStart state...");
+     if (res.waitNodesNoStart(&master, 1, 10))
+     return NDBT_FAILED;
+
+     ndbout_c("Starting old Master...");
+     if (res.startNodes(&master, 1))
+     return NDBT_FAILED;
+
+  */
+  ndbout_c("Waiting for some progress on old Master and other victim restart");
+  NdbSleep_SecSleep(15);
+
+  ndbout_c("Now removing error insert on slow node (%u)", slowNode);
+
+  if (res.insertErrorInNode(slowNode, 0))
+  {
+    return NDBT_FAILED;
+  }
+
+  ndbout_c("Now wait a while to check stability...");
+  NdbSleep_SecSleep(30);
+
+  if (res.getNodeStatus(master) == NDB_MGM_NODE_STATUS_NOT_STARTED)
+  {
+    ndbout_c("Old Master needs kick to restart");
+    if (res.startNodes(&master, 1))
+    {
+      return NDBT_FAILED;
+    }
+  }
+
+  ndbout_c("Wait for cluster recovery...");
+  if (res.waitClusterStarted())
+  {
+    return NDBT_FAILED;
+  }
+
+
+  ndbout_c("Done");
+  return NDBT_OK;
+}
+
+
 NDBT_TESTSUITE(testNodeRestart);
 TESTCASE("NoLoad", 
 	 "Test that one node at a time can be stopped and then restarted "\
@@ -5288,6 +5407,11 @@ TESTCASE("Bug57522", "")
 {
   INITIALIZER(runBug57522);
 }
+TESTCASE("MasterFailSlowLCP",
+         "DIH Master failure during a slow LCP can cause a crash.")
+{
+  INITIALIZER(runMasterFailSlowLCP);
+}
 TESTCASE("ForceStopAndRestart", "Test restart and stop -with force flag")
 {
   STEP(runForceStopAndRestart);

=== modified file 'storage/ndb/test/run-test/daily-basic-tests.txt'
--- a/storage/ndb/test/run-test/daily-basic-tests.txt	2011-10-14 13:24:26 +0000
+++ b/storage/ndb/test/run-test/daily-basic-tests.txt	2011-10-28 14:17:25 +0000
@@ -1835,3 +1835,8 @@ max-time 1800
 cmd: testNdbApi
 args: -n TestFragmentedSend T1
 
+max-time: 300
+cmd: testNodeRestart
+args: -nMasterFailSlowLCP T1
+
+

No bundle (reason: useless for push emails).
Thread
bzr push into mysql-5.1-telco-7.0 branch (frazer.clement:4628 to 4629) Frazer Clement1 Nov