From: Date: March 27 2007 2:17pm Subject: bk commit into 5.1 tree (jonas:1.2131) BUG#27466 List-Archive: http://lists.mysql.com/commits/22999 X-Bug: 27466 Message-Id: <20070327121747.2ACED47BAE2@perch.ndb.mysql.com> Below is the list of changes that have just been committed into a local 5.1 repository of jonas. When jonas does a push these changes will be propagated to the main repository and, within 24 hours after the push, to the public repository. For information on how to access the public repository see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html ChangeSet@stripped, 2007-03-27 14:17:43+02:00, jonas@stripped +8 -0 ndb - bug#27466 nf during nr can leave cluster in inconsistent state Fix race condition between NODE_FAILREP and local INCL_NODEREQ loop Also retry on ZNODE_START_DISALLOWED_ERROR storage/ndb/include/kernel/signaldata/StartPerm.hpp@stripped, 2007-03-27 14:17:41+02:00, jonas@stripped +1 -0 Move error code storage/ndb/src/kernel/blocks/ERROR_codes.txt@stripped, 2007-03-27 14:17:41+02:00, jonas@stripped +3 -1 new error code storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp@stripped, 2007-03-27 14:17:41+02:00, jonas@stripped +0 -1 Move error code storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp@stripped, 2007-03-27 14:17:41+02:00, jonas@stripped +46 -45 1) retry also on ZNODE_START_DISALLOWED_ERROR 2) Change if() else in INCL_NODECONF to for-loop instead 3) (last but not least) fix bug, that could cause different block within same node to have different opinion about node status solution is to check if node is still alive before sending next local INCL_NODEREQ storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp@stripped, 2007-03-27 14:17:41+02:00, jonas@stripped +13 -0 Add error insert to allow node to die during INCL_NODEREQ storage/ndb/src/kernel/blocks/suma/Suma.cpp@stripped, 2007-03-27 14:17:42+02:00, jonas@stripped +10 -4 1) let suma be well behaved (i.e. reply to INCL_NODEREQ) 2) Add dump to print c_connected_nodes/c_subscriber_nodes 
(8010) storage/ndb/test/ndbapi/testNodeRestart.cpp@stripped, 2007-03-27 14:17:42+02:00, jonas@stripped +53 -0 add testcase storage/ndb/test/run-test/daily-basic-tests.txt@stripped, 2007-03-27 14:17:42+02:00, jonas@stripped +4 -0 add testcase # This is a BitKeeper patch. What follows are the unified diffs for the # set of deltas contained in the patch. The rest of the patch, the part # that BitKeeper cares about, is below these diffs. # User: jonas # Host: perch.ndb.mysql.com # Root: /home/jonas/src/drop5 --- 1.51/storage/ndb/test/run-test/daily-basic-tests.txt 2007-03-27 14:17:46 +02:00 +++ 1.52/storage/ndb/test/run-test/daily-basic-tests.txt 2007-03-27 14:17:46 +02:00 @@ -486,6 +486,10 @@ args: -n Bug26481 T1 max-time: 1000 +cmd: testNodeRestart +args: -n Bug27466 T1 + +max-time: 1000 cmd: test_event args: -l 10 -n Bug27169 T1 --- 1.3/storage/ndb/include/kernel/signaldata/StartPerm.hpp 2007-03-27 14:17:47 +02:00 +++ 1.4/storage/ndb/include/kernel/signaldata/StartPerm.hpp 2007-03-27 14:17:47 +02:00 @@ -68,6 +68,7 @@ enum ErrorCode { ZNODE_ALREADY_STARTING_ERROR = 305, + ZNODE_START_DISALLOWED_ERROR = 309, InitialStartRequired = 320 }; }; --- 1.21/storage/ndb/src/kernel/blocks/ERROR_codes.txt 2007-03-27 14:17:47 +02:00 +++ 1.22/storage/ndb/src/kernel/blocks/ERROR_codes.txt 2007-03-27 14:17:47 +02:00 @@ -6,7 +6,7 @@ Next DBLQH 5042 Next DBDICT 6007 Next DBDIH 7183 -Next DBTC 8039 +Next DBTC 8040 Next CMVMI 9000 Next BACKUP 10022 Next DBUTIL 11002 @@ -316,6 +316,8 @@ 7132: Crash when receiving START_COPYCONF in starting node 7170: Crash when receiving START_PERMREF (InitialStartRequired) + +8039: DBTC delay INCL_NODECONF and kill starting node DICT: 6000 Crash during NR when receiving DICTSTARTREQ --- 1.17/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp 2007-03-27 14:17:47 +02:00 +++ 1.18/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp 2007-03-27 14:17:47 +02:00 @@ -82,7 +82,6 @@ #define ZWRONG_FAILURE_NUMBER_ERROR 302 #define ZWRONG_START_NODE_ERROR 303 #define 
ZNO_REPLICA_FOUND_ERROR 304 -#define ZNODE_START_DISALLOWED_ERROR 309 // -------------------------------------- // Codes from LQH --- 1.61/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2007-03-27 14:17:47 +02:00 +++ 1.62/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2007-03-27 14:17:47 +02:00 @@ -1690,7 +1690,8 @@ { jamEntry(); Uint32 errorCode = signal->theData[1]; - if (errorCode == StartPermRef::ZNODE_ALREADY_STARTING_ERROR) { + if (errorCode == StartPermRef::ZNODE_ALREADY_STARTING_ERROR || + errorCode == StartPermRef::ZNODE_START_DISALLOWED_ERROR) { jam(); /*-----------------------------------------------------------------------*/ // The master was busy adding another node. We will wait for a second and @@ -2032,49 +2033,49 @@ TstartNode_or_blockref = signal->theData[0]; TsendNodeId = signal->theData[1]; - if (TstartNode_or_blockref == clocallqhblockref) { - jam(); - /*-----------------------------------------------------------------------*/ - // THIS SIGNAL CAME FROM THE LOCAL LQH BLOCK. - // WE WILL NOW SEND INCLUDE TO THE TC BLOCK. - /*-----------------------------------------------------------------------*/ - signal->theData[0] = reference(); - signal->theData[1] = c_nodeStartSlave.nodeId; - sendSignal(clocaltcblockref, GSN_INCL_NODEREQ, signal, 2, JBB); - return; - }//if - if (TstartNode_or_blockref == clocaltcblockref) { - jam(); - /*----------------------------------------------------------------------*/ - // THIS SIGNAL CAME FROM THE LOCAL LQH BLOCK. - // WE WILL NOW SEND INCLUDE TO THE DICT BLOCK. - /*----------------------------------------------------------------------*/ - signal->theData[0] = reference(); - signal->theData[1] = c_nodeStartSlave.nodeId; - sendSignal(cdictblockref, GSN_INCL_NODEREQ, signal, 2, JBB); - return; - }//if - if (TstartNode_or_blockref == cdictblockref) { - jam(); - /*-----------------------------------------------------------------------*/ - // THIS SIGNAL CAME FROM THE LOCAL DICT BLOCK. 
WE WILL NOW SEND CONF TO THE - // BACKUP. - /*-----------------------------------------------------------------------*/ - signal->theData[0] = reference(); - signal->theData[1] = c_nodeStartSlave.nodeId; - sendSignal(BACKUP_REF, GSN_INCL_NODEREQ, signal, 2, JBB); - - // Suma will not send response to this for now, later... - sendSignal(SUMA_REF, GSN_INCL_NODEREQ, signal, 2, JBB); - return; - }//if - if (TstartNode_or_blockref == numberToRef(BACKUP, getOwnNodeId())){ - jam(); - signal->theData[0] = c_nodeStartSlave.nodeId; - signal->theData[1] = cownNodeId; - sendSignal(cmasterdihref, GSN_INCL_NODECONF, signal, 2, JBB); - c_nodeStartSlave.nodeId = 0; - return; + static Uint32 blocklist[] = { + clocallqhblockref, + clocaltcblockref, + cdictblockref, + 0, + 0, + 0 + }; + blocklist[3] = numberToRef(BACKUP, getOwnNodeId()); + blocklist[4] = numberToRef(SUMA, getOwnNodeId()); + + Uint32 i = 0; + for (Uint32 i = 0; blocklist[i] != 0; i++) + { + if (TstartNode_or_blockref == blocklist[i]) + { + jam(); + if (getNodeStatus(c_nodeStartSlave.nodeId) == NodeRecord::ALIVE && + blocklist[i+1] != 0) + { + /** + * Send to next in block list + */ + jam(); + signal->theData[0] = reference(); + signal->theData[1] = c_nodeStartSlave.nodeId; + sendSignal(blocklist[i+1], GSN_INCL_NODEREQ, signal, 2, JBB); + return; + } + else + { + /** + * All done, reply to master + */ + jam(); + signal->theData[0] = c_nodeStartSlave.nodeId; + signal->theData[1] = cownNodeId; + sendSignal(cmasterdihref, GSN_INCL_NODECONF, signal, 2, JBB); + + c_nodeStartSlave.nodeId = 0; + return; + } + } } ndbrequire(cmasterdihref = reference()); @@ -2193,7 +2194,7 @@ StartInfoRef *const ref =(StartInfoRef*)&signal->theData[0]; ref->startingNodeId = startNode; ref->sendingNodeId = cownNodeId; - ref->errorCode = ZNODE_START_DISALLOWED_ERROR; + ref->errorCode = StartPermRef::ZNODE_START_DISALLOWED_ERROR; sendSignal(cmasterdihref, GSN_START_INFOREF, signal, StartInfoRef::SignalLength, JBB); return; --- 
1.105/storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp 2007-03-27 14:17:47 +02:00 +++ 1.106/storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp 2007-03-27 14:17:47 +02:00 @@ -309,6 +309,19 @@ hostptr.p->hostStatus = HS_ALIVE; signal->theData[0] = cownref; c_alive_nodes.set(hostptr.i); + + if (ERROR_INSERTED(8039)) + { + CLEAR_ERROR_INSERT_VALUE; + Uint32 save = signal->theData[0]; + signal->theData[0] = 9999; + sendSignal(numberToRef(CMVMI, hostptr.i), + GSN_NDB_TAMPER, signal, 1, JBB); + signal->theData[0] = save; + sendSignalWithDelay(tblockref, GSN_INCL_NODECONF, signal, 5000, 1); + return; + } + sendSignal(tblockref, GSN_INCL_NODECONF, signal, 1, JBB); } --- 1.37/storage/ndb/src/kernel/blocks/suma/Suma.cpp 2007-03-27 14:17:47 +02:00 +++ 1.38/storage/ndb/src/kernel/blocks/suma/Suma.cpp 2007-03-27 14:17:47 +02:00 @@ -780,17 +780,14 @@ Suma::execINCL_NODEREQ(Signal* signal){ jamEntry(); - //const Uint32 senderRef = signal->theData[0]; + const Uint32 senderRef = signal->theData[0]; const Uint32 nodeId = signal->theData[1]; ndbrequire(!c_alive_nodes.get(nodeId)); c_alive_nodes.set(nodeId); -#if 0 // if we include this DIH's got to be prepared, later if needed... 
signal->theData[0] = reference(); - sendSignal(senderRef, GSN_INCL_NODECONF, signal, 1, JBB); -#endif } void @@ -918,6 +915,15 @@ if (tCase == 8008) { CLEAR_ERROR_INSERT_VALUE; + } + + if (tCase == 8010) + { + char buf1[255], buf2[255]; + c_subscriber_nodes.getText(buf1); + c_connected_nodes.getText(buf2); + infoEvent("c_subscriber_nodes: %s", buf1); + infoEvent("c_connected_nodes: %s", buf2); } } --- 1.38/storage/ndb/test/ndbapi/testNodeRestart.cpp 2007-03-27 14:17:47 +02:00 +++ 1.39/storage/ndb/test/ndbapi/testNodeRestart.cpp 2007-03-27 14:17:47 +02:00 @@ -1418,6 +1418,56 @@ return NDBT_OK; } +int +runBug27466(NDBT_Context* ctx, NDBT_Step* step) +{ + int result = NDBT_OK; + int loops = ctx->getNumLoops(); + int records = ctx->getNumRecords(); + NdbRestarter res; + + if (res.getNumDbNodes() < 2) + { + return NDBT_OK; + } + + Uint32 pos = 0; + for (Uint32 i = 0; i