Below is the list of changes that have just been committed into a local
4.1 repository of jonas. When jonas does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html
ChangeSet
1.2473 06/03/21 14:47:10 jonas@stripped +9 -0
ndb - bug#18385
Partial system restart: a node must not try to start with a higher GCI than its own,
even if it knows about a higher number
ndb/test/src/NdbRestarter.cpp
1.11 06/03/21 14:47:08 jonas@stripped +33 -0
Add new method for selecting random node
ndb/test/run-test/daily-basic-tests.txt
1.27 06/03/21 14:47:08 jonas@stripped +4 -0
Run test in daily-basic
ndb/test/ndbapi/testSystemRestart.cpp
1.9 06/03/21 14:47:08 jonas@stripped +53 -0
Add new testcase for bug#18385
ndb/test/include/NdbRestarter.hpp
1.5 06/03/21 14:47:08 jonas@stripped +1 -0
Add new method for selecting random node
ndb/src/kernel/blocks/dbdih/DbdihMain.cpp
1.33 06/03/21 14:47:08 jonas@stripped +77 -22
Fix so that we don't try to restart to a too new GCI when doing a partial start
Add new error code when this node later tries to join
ndb/src/kernel/blocks/dbdih/Dbdih.hpp
1.10 06/03/21 14:47:08 jonas@stripped +0 -1
Move error codes into StartPerm + Add new error code
ndb/src/kernel/blocks/ERROR_codes.txt
1.14 06/03/21 14:47:08 jonas@stripped +2 -0
Add new error insert
ndb/include/kernel/signaldata/StartPerm.hpp
1.2 06/03/21 14:47:08 jonas@stripped +6 -0
Move error codes into StartPerm + Add new error code
ndb/include/kernel/signaldata/DumpStateOrd.hpp
1.6 06/03/21 14:47:08 jonas@stripped +1 -0
Add new dump for setting time between gcp
# This is a BitKeeper patch. What follows are the unified diffs for the
# set of deltas contained in the patch. The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User: jonas
# Host: perch.ndb.mysql.com
# Root: /home/jonas/src/41-work
--- 1.26/ndb/test/run-test/daily-basic-tests.txt 2006-03-20 14:49:44 +01:00
+++ 1.27/ndb/test/run-test/daily-basic-tests.txt 2006-03-21 14:47:08 +01:00
@@ -454,6 +454,10 @@
cmd: testNodeRestart
args: -n Bug16772 T1
+max-time: 500
+cmd: testSystemRestart
+args: -n Bug18385 T1
+
# OLD FLEX
max-time: 500
cmd: flexBench
--- 1.5/ndb/include/kernel/signaldata/DumpStateOrd.hpp 2005-12-08 15:28:13 +01:00
+++ 1.6/ndb/include/kernel/signaldata/DumpStateOrd.hpp 2006-03-21 14:47:08 +01:00
@@ -127,6 +127,7 @@
DihMinTimeBetweenLCP = 7017,
DihMaxTimeBetweenLCP = 7018,
EnableUndoDelayDataWrite = 7080, // DIH+ACC+TUP
+ DihSetTimeBetweenGcp = 7090,
DihStartLcpImmediately = 7099,
// 8000 Suma
// 12000 Tux
--- 1.1/ndb/include/kernel/signaldata/StartPerm.hpp 2004-04-14 10:23:55 +02:00
+++ 1.2/ndb/include/kernel/signaldata/StartPerm.hpp 2006-03-21 14:47:08 +01:00
@@ -64,5 +64,11 @@
Uint32 startingNodeId;
Uint32 errorCode;
+
+ enum ErrorCode // error codes carried in START_PERMREF (errorCode field)
+ {
+ ZNODE_ALREADY_STARTING_ERROR = 305, // master is busy adding another node; DIH in the starting node waits and retries
+ InitialStartRequired = 320 // master has lastCompletedGCI == 0 for this node and the start is not an initial node restart
+ };
};
#endif
--- 1.13/ndb/src/kernel/blocks/ERROR_codes.txt 2005-12-21 16:31:56 +01:00
+++ 1.14/ndb/src/kernel/blocks/ERROR_codes.txt 2006-03-21 14:47:08 +01:00
@@ -303,6 +303,8 @@
7131: Crash when receiving START_COPYREQ in master node
7132: Crash when receiving START_COPYCONF in starting node
+7170: Crash when receiving START_PERMREF (InitialStartRequired)
+
DICT:
6000 Crash during NR when receiving DICTSTARTREQ
6001 Crash during NR when receiving SCHEMA_INFO
--- 1.9/ndb/src/kernel/blocks/dbdih/Dbdih.hpp 2006-03-17 10:09:33 +01:00
+++ 1.10/ndb/src/kernel/blocks/dbdih/Dbdih.hpp 2006-03-21 14:47:08 +01:00
@@ -81,7 +81,6 @@
#define ZWRONG_FAILURE_NUMBER_ERROR 302
#define ZWRONG_START_NODE_ERROR 303
#define ZNO_REPLICA_FOUND_ERROR 304
-#define ZNODE_ALREADY_STARTING_ERROR 305
#define ZNODE_START_DISALLOWED_ERROR 309
// --------------------------------------
--- 1.32/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2006-03-17 10:09:33 +01:00
+++ 1.33/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2006-03-21 14:47:08 +01:00
@@ -1420,6 +1420,33 @@
return;
}
+ NodeRecordPtr nodePtr;
+ Uint32 gci = SYSFILE->lastCompletedGCI[getOwnNodeId()];
+ for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++)
+ {
+ jam();
+ ptrAss(nodePtr, nodeRecord);
+ if (SYSFILE->lastCompletedGCI[nodePtr.i] > gci)
+ {
+ jam();
+ /**
+ * Since we're starting (is master) and
+ * there are other nodes with higher GCI...
+ * their GCIs must be invalidated...
+ * and they _must_ do an initial start
+ * indicate this by setting lastCompletedGCI = 0
+ */
+ SYSFILE->lastCompletedGCI[nodePtr.i] = 0;
+ ndbrequire(nodePtr.p->nodeStatus != NodeRecord::ALIVE);
+ warningEvent("Making filesystem for node %d unusable",
+ nodePtr.i);
+ }
+ }
+ /**
+ * This sets which GCI we will try to restart to
+ */
+ SYSFILE->newestRestorableGCI = gci;
+
ndbrequire(isMaster());
copyGciLab(signal, CopyGCIReq::RESTART); // We have already read the file!
}//Dbdih::ndbStartReqLab()
@@ -1557,7 +1584,7 @@
{
jamEntry();
Uint32 errorCode = signal->theData[1];
- if (errorCode == ZNODE_ALREADY_STARTING_ERROR) {
+ if (errorCode == StartPermRef::ZNODE_ALREADY_STARTING_ERROR) {
jam();
/*-----------------------------------------------------------------------*/
// The master was busy adding another node. We will wait for a second and
@@ -1567,6 +1594,20 @@
sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 3000, 1);
return;
}//if
+
+ if (errorCode == StartPermRef::InitialStartRequired)
+ {
+ CRASH_INSERTION(7170);
+ char buf[255];
+ BaseString::snprintf(buf, sizeof(buf),
+ "Cluster requires this node to be started "
+ " with --initial as partial start has been performed"
+ " and this filesystem is unusable");
+ progError(__LINE__,
+ ERR_SR_RESTARTCONFLICT,
+ buf);
+ ndbrequire(false);
+ }
/*------------------------------------------------------------------------*/
// Some node process in another node involving our node was still active. We
// will recover from this by crashing here.
@@ -1657,7 +1698,7 @@
(c_nodeStartMaster.wait != ZFALSE)) {
jam();
signal->theData[0] = nodeId;
- signal->theData[1] = ZNODE_ALREADY_STARTING_ERROR;
+ signal->theData[1] = StartPermRef::ZNODE_ALREADY_STARTING_ERROR;
sendSignal(retRef, GSN_START_PERMREF, signal, 2, JBB);
return;
}//if
@@ -1667,6 +1708,16 @@
ndbrequire(false);
}//if
+ if (SYSFILE->lastCompletedGCI[nodeId] == 0 &&
+ typeStart != NodeState::ST_INITIAL_NODE_RESTART)
+ {
+ jam();
+ signal->theData[0] = nodeId;
+ signal->theData[1] = StartPermRef::InitialStartRequired;
+ sendSignal(retRef, GSN_START_PERMREF, signal, 2, JBB);
+ return;
+ }
+
/*----------------------------------------------------------------------
* WE START THE INCLUSION PROCEDURE
* ---------------------------------------------------------------------*/
@@ -3515,24 +3566,12 @@
/* ------------------------------------------------------------------------- */
void Dbdih::selectMasterCandidateAndSend(Signal* signal)
{
- Uint32 gci = 0;
- Uint32 masterCandidateId = 0;
- NodeRecordPtr nodePtr;
- for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
- jam();
- ptrAss(nodePtr, nodeRecord);
- if (SYSFILE->lastCompletedGCI[nodePtr.i] > gci) {
- jam();
- masterCandidateId = nodePtr.i;
- gci = SYSFILE->lastCompletedGCI[nodePtr.i];
- }//if
- }//for
- ndbrequire(masterCandidateId != 0);
setNodeGroups();
- signal->theData[0] = masterCandidateId;
- signal->theData[1] = gci;
+ signal->theData[0] = getOwnNodeId();
+ signal->theData[1] = SYSFILE->lastCompletedGCI[getOwnNodeId()];
sendSignal(cntrlblockref, GSN_DIH_RESTARTCONF, signal, 2, JBB);
-
+
+ NodeRecordPtr nodePtr;
Uint32 node_groups[MAX_NDB_NODES];
memset(node_groups, 0, sizeof(node_groups));
for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
@@ -3550,10 +3589,10 @@
if(count != 0 && count != cnoReplicas){
char buf[255];
BaseString::snprintf(buf, sizeof(buf),
- "Illegal configuration change."
- " Initial start needs to be performed "
- " when changing no of replicas (%d != %d)",
- node_groups[nodePtr.i], cnoReplicas);
+ "Illegal configuration change."
+ " Initial start needs to be performed "
+ " when changing no of replicas (%d != %d)",
+ node_groups[nodePtr.i], cnoReplicas);
progError(__LINE__,
ERR_INVALID_CONFIG,
buf);
@@ -13358,6 +13397,22 @@
if(dumpState->args[0] == DumpStateOrd::DihStartLcpImmediately){
c_lcpState.ctimer += (1 << c_lcpState.clcpDelay);
return;
+ }
+
+ if (dumpState->args[0] == DumpStateOrd::DihSetTimeBetweenGcp)
+ {
+ if (signal->getLength() == 1)
+ {
+ const ndb_mgm_configuration_iterator * p =
+ theConfiguration.getOwnConfigIterator();
+ ndbrequire(p != 0);
+ ndb_mgm_get_int_parameter(p, CFG_DB_GCP_INTERVAL, &cgcpDelay);
+ }
+ else
+ {
+ cgcpDelay = signal->theData[1];
+ }
+ ndbout_c("Setting time between gcp : %d", cgcpDelay);
}
}//Dbdih::execDUMP_STATE_ORD()
--- 1.4/ndb/test/include/NdbRestarter.hpp 2004-11-22 11:47:53 +01:00
+++ 1.5/ndb/test/include/NdbRestarter.hpp 2006-03-21 14:47:08 +01:00
@@ -62,6 +62,7 @@
int dumpStateAllNodes(int * _args, int _num_args);
int getMasterNodeId();
+ int getRandomNodeSameNodeGroup(int nodeId, int randomNumber);
int getRandomNodeOtherNodeGroup(int nodeId, int randomNumber);
int getRandomNotMasterNodeId(int randomNumber);
--- 1.8/ndb/test/ndbapi/testSystemRestart.cpp 2004-11-08 13:58:37 +01:00
+++ 1.9/ndb/test/ndbapi/testSystemRestart.cpp 2006-03-21 14:47:08 +01:00
@@ -1051,6 +1051,52 @@
return result;
}
+int runBug18385(NDBT_Context* ctx, NDBT_Step* step){ // test step for bug#18385 (partial system restart); ctx and step are not used in this body
+ NdbRestarter restarter;
+ const Uint32 nodeCount = restarter.getNumDbNodes();
+ if(nodeCount < 2){ // scenario needs two data nodes; with fewer the step reports NDBT_OK without testing
+ g_info << "Bug18385 - Needs atleast 2 nodes to test" << endl;
+ return NDBT_OK;
+ }
+
+ int node1 = restarter.getDbNodeId(rand() % nodeCount); // randomly chosen data node
+ int node2 = restarter.getRandomNodeSameNodeGroup(node1, rand()); // another node from node1's node group, or -1
+
+ if (node1 == -1 || node2 == -1) // no usable node pair: return NDBT_OK without running the scenario
+ return NDBT_OK;
+
+ int dump[] = { DumpStateOrd::DihSetTimeBetweenGcp, 300 }; // DIH dump: set time between GCPs to 300 (unit not visible here)
+
+ int result = NDBT_OK;
+ do { // CHECK is defined elsewhere; presumably sets result to failed and breaks out of this loop -- confirm
+ CHECK(restarter.dumpStateAllNodes(dump, 2) == 0);
+ CHECK(restarter.restartOneDbNode(node1, false, true, false) == 0); // presumably (initial=false, nostart=true, abort=false) -- confirm against NdbRestarter.hpp
+ NdbSleep_SecSleep(3); // pause 3 seconds before restarting everything; presumably lets the remaining nodes complete newer GCIs than node1 -- confirm
+ CHECK(restarter.restartAll(false, true, false) == 0); // same flags as above, applied to all nodes
+
+ Uint32 cnt = 0;
+ int nodes[128];
+ for(Uint32 i = 0; i<nodeCount; i++) // collect every data node id except node2
+ if ((nodes[cnt] = restarter.getDbNodeId(i)) != node2) // slot is overwritten next iteration when the id equals node2
+ cnt++;
+
+ assert(cnt == nodeCount - 1); // exactly one node (node2) must have been left out
+
+ CHECK(restarter.startNodes(nodes, cnt) == 0); // partial start: everything but node2
+ CHECK(restarter.waitNodesStarted(nodes, cnt, 300) == 0);
+
+ CHECK(restarter.insertErrorInNode(node2, 7170) == 0); // 7170: crash when receiving START_PERMREF (InitialStartRequired), per ERROR_codes.txt
+ CHECK(restarter.waitNodesNoStart(&node2, 1) == 0);
+ CHECK(restarter.restartOneDbNode(node2, true, false, true) == 0); // presumably an initial restart (first flag true) -- confirm against NdbRestarter.hpp
+ CHECK(restarter.waitNodesStarted(&node2, 1) == 0);
+
+ } while(0);
+
+ g_info << "Bug18385 finished" << endl;
+
+ return result;
+}
+
int runWaitStarted(NDBT_Context* ctx, NDBT_Step* step){
NdbRestarter restarter;
@@ -1232,6 +1278,13 @@
INITIALIZER(runWaitStarted);
INITIALIZER(runClearTable);
STEP(runSystemRestart9);
+ FINALIZER(runClearTable);
+}
+TESTCASE("Bug18385",
+ "Perform partition system restart with other nodes with higher GCI"){
+ INITIALIZER(runWaitStarted);
+ INITIALIZER(runClearTable);
+ STEP(runBug18385);
FINALIZER(runClearTable);
}
NDBT_TESTSUITE_END(testSystemRestart);
--- 1.10/ndb/test/src/NdbRestarter.cpp 2004-12-17 10:36:11 +01:00
+++ 1.11/ndb/test/src/NdbRestarter.cpp 2006-03-21 14:47:08 +01:00
@@ -174,6 +174,39 @@
return -1;
}
+int
+NdbRestarter::getRandomNodeSameNodeGroup(int nodeId, int rand){ // returns the id of a node other than nodeId in nodeId's node group, or -1; rand selects where the search starts
+ if (!isConnected())
+ return -1;
+
+ if (getStatus() != 0) // non-zero status from getStatus() is treated as failure
+ return -1;
+
+ int node_group = -1;
+ for(size_t i = 0; i < ndbNodes.size(); i++){ // find the node group that nodeId belongs to
+ if(ndbNodes[i].node_id == nodeId){
+ node_group = ndbNodes[i].node_group;
+ break;
+ }
+ }
+ if(node_group == -1){ // nodeId not present in ndbNodes (or its group is -1)
+ return -1;
+ }
+
+ Uint32 counter = 0;
+ rand = rand % ndbNodes.size(); // reduce the caller's number to a starting index into ndbNodes
+ while(counter++ < ndbNodes.size() && // probe at most size() entries, wrapping around
+ (ndbNodes[rand].node_id == nodeId || // skip nodeId itself
+ ndbNodes[rand].node_group != node_group)) // skip nodes from other node groups
+ rand = (rand + 1) % ndbNodes.size();
+
+ if(ndbNodes[rand].node_group == node_group && // re-check: the loop can also end by exhausting counter
+ ndbNodes[rand].node_id != nodeId)
+ return ndbNodes[rand].node_id;
+
+ return -1; // no other node shares nodeId's node group
+}
+
int
NdbRestarter::waitClusterStarted(unsigned int _timeout){
return waitClusterState(NDB_MGM_NODE_STATUS_STARTED, _timeout);
| Thread |
|---|
| • bk commit into 4.1 tree (jonas:1.2473) BUG#18385 | jonas | 21 Mar |