List: Commits    « Previous Message | Next Message »
From: jonas    Date: November 7 2007 7:43pm
Subject: bk commit into 5.1 tree (jonas:1.2172) BUG#32160
View as plain text  
Below is the list of changes that have just been committed into a local
5.1 repository of jonas. When jonas does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet@stripped, 2007-11-07 20:43:34+01:00, jonas@stripped +5 -0
  ndb - bug#32160
    lcp master take over bug
    - fix bug
    - add extra debugging

  storage/ndb/src/kernel/blocks/ERROR_codes.txt@stripped, 2007-11-07 20:43:33+01:00, jonas@stripped +6 -1
    new error code

  storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp@stripped, 2007-11-07 20:43:33+01:00, jonas@stripped +10 -0
    save the last 10 updates...

  storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp@stripped, 2007-11-07 20:43:33+01:00, jonas@stripped +54 -6
    1) fix lcp master take over bug
    2) add new error code 7193 & 7194

  storage/ndb/test/ndbapi/testNodeRestart.cpp@stripped, 2007-11-07 20:43:33+01:00, jonas@stripped +48 -0
    add test prg

  storage/ndb/test/run-test/daily-basic-tests.txt@stripped, 2007-11-07 20:43:33+01:00, jonas@stripped +4 -0
    new testcase

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User:	jonas
# Host:	perch.ndb.mysql.com
# Root:	/home/jonas/src/drop6

--- 1.61/storage/ndb/test/run-test/daily-basic-tests.txt	2007-11-07 20:43:38 +01:00
+++ 1.62/storage/ndb/test/run-test/daily-basic-tests.txt	2007-11-07 20:43:38 +01:00
@@ -513,6 +513,10 @@
 cmd: testNodeRestart
 args: -n Bug31980 T1
 
+max-time: 300
+cmd: testNodeRestart
+args: -n Bug32160 T1
+
 # OLD FLEX
 max-time: 500
 cmd: flexBench

--- 1.29/storage/ndb/src/kernel/blocks/ERROR_codes.txt	2007-11-07 20:43:38 +01:00
+++ 1.30/storage/ndb/src/kernel/blocks/ERROR_codes.txt	2007-11-07 20:43:38 +01:00
@@ -5,7 +5,7 @@
 Next DBTUP 4013
 Next DBLQH 5047
 Next DBDICT 6007
-Next DBDIH 7193
+Next DBDIH 7195
 Next DBTC 8057
 Next CMVMI 9000
 Next BACKUP 10022
@@ -158,6 +158,11 @@
 
 7191: Crash when receiving LCP_COMPLETE_REP
 7192: Crash in setLcpActiveStatusStart - when dead node missed to LCP's
+
+7193: Dont send LCP_FRAG_ORD to self, and crash when sending first
+      LCP_FRAG_ORD(last)
+
+7194: Force removeNodeFromStored to complete in the middle of MASTER_LCPCONF
 
 ERROR CODES FOR TESTING NODE FAILURE, FAILURE IN COPY FRAGMENT PROCESS:
 -----------------------------------------------------------------------

--- 1.20/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp	2007-11-07 20:43:38 +01:00
+++ 1.21/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp	2007-11-07 20:43:38 +01:00
@@ -1295,7 +1295,17 @@
     LcpStatus lcpStatus;
     Uint32 lcpStatusUpdatedPlace;
 
+    struct Save {
+      LcpStatus m_status;
+      Uint32 m_place;
+    } m_saveState[10];
+
     void setLcpStatus(LcpStatus status, Uint32 line){
+      for (Uint32 i = 9; i > 0; i--)
+        m_saveState[i] = m_saveState[i-1];
+      m_saveState[0].m_status = lcpStatus;
+      m_saveState[0].m_place = lcpStatusUpdatedPlace;
+
       lcpStatus = status;
       lcpStatusUpdatedPlace = line;
     }

--- 1.63/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp	2007-11-07 20:43:38 +01:00
+++ 1.64/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp	2007-11-07 20:43:38 +01:00
@@ -4864,11 +4864,19 @@
   }
   
   jam();
-  signal->theData[0] = DihContinueB::ZREMOVE_NODE_FROM_TABLE;
-  signal->theData[1] = failedNodePtr.i;
-  signal->theData[2] = 0; // Tab id
-  sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
-  
+
+  if (!ERROR_INSERTED(7194))
+  {
+    signal->theData[0] = DihContinueB::ZREMOVE_NODE_FROM_TABLE;
+    signal->theData[1] = failedNodePtr.i;
+    signal->theData[2] = 0; // Tab id
+    sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
+  }    
+  else
+  {
+    ndbout_c("7194 Not starting ZREMOVE_NODE_FROM_TABLE");
+  }
+
   setLocalNodefailHandling(signal, failedNodePtr.i, NF_REMOVE_NODE_FROM_TABLE);
 }//Dbdih::startRemoveFailedNode()
 
@@ -5775,12 +5783,22 @@
     
     signal->theData[0] = 7012;
     execDUMP_STATE_ORD(signal);
+
+    if (ERROR_INSERTED(7194))
+    {
+      ndbout_c("7194 starting ZREMOVE_NODE_FROM_TABLE");
+      signal->theData[0] = DihContinueB::ZREMOVE_NODE_FROM_TABLE;
+      signal->theData[1] = c_lcpMasterTakeOverState.failedNodeId;
+      signal->theData[2] = 0; // Tab id
+      sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
+    }
     
     c_lcpMasterTakeOverState.set(LMTOS_INITIAL, __LINE__);
     MasterLCPReq * const req = (MasterLCPReq *)&signal->theData[0];
     req->masterRef = reference();
     req->failedNodeId = c_lcpMasterTakeOverState.failedNodeId;
     sendLoopMacro(MASTER_LCPREQ, sendMASTER_LCPREQ);
+
   } else {
     sendMASTER_LCPCONF(signal);
   }
@@ -6092,6 +6110,15 @@
 {
   const MasterLCPConf * const conf = (MasterLCPConf *)&signal->theData[0];
   jamEntry();
+
+  if (ERROR_INSERTED(7194))
+  {
+    ndbout_c("delaying MASTER_LCPCONF due to error 7194");
+    sendSignalWithDelay(reference(), GSN_MASTER_LCPCONF, signal, 
+                        300, signal->getLength());
+    return;
+  }
+
   Uint32 senderNodeId = conf->senderNodeId;
   MasterLCPConf::State lcpState = (MasterLCPConf::State)conf->lcpState;
   const Uint32 failedNodeId = conf->failedNodeId;
@@ -6226,7 +6253,6 @@
 #endif
     
       c_lcpState.keepGci = SYSFILE->keepGCI;
-      c_lcpState.setLcpStatus(LCP_START_LCP_ROUND, __LINE__);
       startLcpRoundLoopLab(signal, 0, 0);
       break;
     }
@@ -10133,6 +10159,8 @@
       if(ERROR_INSERTED(7075)){
 	continue;
       }
+
+      CRASH_INSERTION(7193);
       BlockReference ref = calcLqhBlockRef(nodePtr.i);
       sendSignal(ref, GSN_LCP_FRAG_ORD, signal,LcpFragOrd::SignalLength, JBB);
     }
@@ -10361,6 +10389,13 @@
   CRASH_INSERTION2(7017, !isMaster());
   
   c_lcpState.setLcpStatus(LCP_TAB_COMPLETED, __LINE__);
+
+  if (ERROR_INSERTED(7194))
+  {
+    ndbout_c("CLEARING 7194");
+    CLEAR_ERROR_INSERT_VALUE;
+  }
+  
   return true;
 }
 
@@ -10550,6 +10585,11 @@
   
   BlockReference ref = calcLqhBlockRef(replicaPtr.p->procNode);
   
+  if (ERROR_INSERTED(7193) && replicaPtr.p->procNode == getOwnNodeId())
+  {
+    return;
+  }
+  
   LcpFragOrd * const lcpFragOrd = (LcpFragOrd *)&signal->theData[0];
   lcpFragOrd->tableId    = info.tableId;
   lcpFragOrd->fragmentId = info.fragId;
@@ -14076,6 +14116,14 @@
       ("immediateLcpStart = %d masterLcpNodeId = %d",
        c_lcpState.immediateLcpStart,
        refToNode(c_lcpState.m_masterLcpDihRef));
+
+    for (Uint32 i = 0; i<10; i++)
+    {
+      infoEvent("%u : status: %u place: %u", i, 
+                c_lcpState.m_saveState[i].m_status,
+                c_lcpState.m_saveState[i].m_place);
+    }
+    
     infoEvent("-- Node %d LCP STATE --", getOwnNodeId());
   }
 

--- 1.43/storage/ndb/test/ndbapi/testNodeRestart.cpp	2007-11-07 20:43:38 +01:00
+++ 1.44/storage/ndb/test/ndbapi/testNodeRestart.cpp	2007-11-07 20:43:38 +01:00
@@ -1722,6 +1722,51 @@
   return NDBT_OK;
 }
 
+int
+runBug32160(NDBT_Context* ctx, NDBT_Step* step)
+{
+  int result = NDBT_OK;
+  int loops = ctx->getNumLoops();
+  int records = ctx->getNumRecords();
+  Ndb* pNdb = GETNDB(step);
+  NdbRestarter res;
+
+  if (res.getNumDbNodes() < 2)
+  {
+    return NDBT_OK;
+  }
+
+  int master = res.getMasterNodeId();
+  int next = res.getNextMasterNodeId(master);
+
+  if (res.insertErrorInNode(next, 7194))
+  {
+    return NDBT_FAILED;
+  }
+
+  int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };    
+  if (res.dumpStateOneNode(master, val2, 2))
+    return NDBT_FAILED;
+
+  if (res.insertErrorInNode(master, 7193))
+    return NDBT_FAILED;
+
+  int val3[] = { 7099 };
+  if (res.dumpStateOneNode(master, val3, 1))
+    return NDBT_FAILED;
+
+  if (res.waitNodesNoStart(&master, 1))
+    return NDBT_FAILED;
+
+  if (res.startNodes(&master, 1))
+    return NDBT_FAILED;
+
+  if (res.waitClusterStarted())
+    return NDBT_FAILED;
+  
+  return NDBT_OK;
+}
+
 NDBT_TESTSUITE(testNodeRestart);
 TESTCASE("NoLoad", 
 	 "Test that one node at a time can be stopped and then restarted "\
@@ -2079,6 +2124,9 @@
 }
 TESTCASE("Bug31980", ""){
   INITIALIZER(runBug31980);
+}
+TESTCASE("Bug32160", ""){
+  INITIALIZER(runBug32160);
 }
 NDBT_TESTSUITE_END(testNodeRestart);
 
Thread:
bk commit into 5.1 tree (jonas:1.2172) BUG#32160 — jonas, 7 Nov