From: Frazer Clement Date: October 18 2011 8:40am Subject: bzr push into mysql-5.1-telco-7.1 branch (frazer.clement:4021 to 4022) List-Archive: http://lists.mysql.com/commits/141490 Message-Id: <201110180840.p9I8eqlk032338@acsmt358.oracle.com> MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit 4022 Frazer Clement 2010-12-15 Commit of Connect Check functionality Default for ConnectCheckIntervalDelay set to 0. Modify CMakelists.txt for Windows added: storage/ndb/include/kernel/signaldata/NodePing.hpp storage/ndb/src/common/debugger/signaldata/NodePing.cpp modified: storage/ndb/include/kernel/GlobalSignalNumbers.h storage/ndb/include/kernel/signaldata/FailRep.hpp storage/ndb/include/kernel/signaldata/SignalData.hpp storage/ndb/include/mgmapi/mgmapi_config_parameters.h storage/ndb/include/mgmapi/ndb_logevent.h storage/ndb/include/transporter/TransporterRegistry.hpp storage/ndb/src/common/debugger/EventLogger.cpp storage/ndb/src/common/debugger/signaldata/CMakeLists.txt storage/ndb/src/common/debugger/signaldata/Makefile.am storage/ndb/src/common/debugger/signaldata/SignalDataPrint.cpp storage/ndb/src/common/debugger/signaldata/SignalNames.cpp storage/ndb/src/common/transporter/TransporterRegistry.cpp storage/ndb/src/kernel/blocks/ERROR_codes.txt storage/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp storage/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp storage/ndb/src/mgmsrv/ConfigInfo.cpp storage/ndb/test/ndbapi/testNodeRestart.cpp 4021 Craig L Russell 2010-12-14 Fix bug in varchar silently truncating data too long to fit in column modified: storage/ndb/clusterj/clusterj-tie/src/main/java/com/mysql/clusterj/tie/Utility.java storage/ndb/clusterj/clusterj-tie/src/main/resources/com/mysql/clusterj/tie/Bundle.properties === modified file 'storage/ndb/include/kernel/GlobalSignalNumbers.h' --- a/storage/ndb/include/kernel/GlobalSignalNumbers.h 2010-10-20 07:12:58 +0000 +++ b/storage/ndb/include/kernel/GlobalSignalNumbers.h 2010-12-15 13:50:17 +0000 @@ -25,7 +25,7 @@ * * When adding a new signal, remember to update MAX_GSN and SignalNames.cpp */ -const GlobalSignalNumber MAX_GSN = 768; +const GlobalSignalNumber MAX_GSN = 770; struct GsnName { GlobalSignalNumber gsn; @@ -1075,4 +1075,8 @@ extern const GlobalSignalNumber NO_OF_SI #define GSN_RELEASE_PAGES_REQ 680 #define GSN_RELEASE_PAGES_CONF 681 +/* NODE_PING signals */ +#define GSN_NODE_PING_REQ 769 /* distr. */ +#define GSN_NODE_PING_CONF 770 /* distr. */ + #endif === modified file 'storage/ndb/include/kernel/signaldata/FailRep.hpp' --- a/storage/ndb/include/kernel/signaldata/FailRep.hpp 2010-12-13 16:43:31 +0000 +++ b/storage/ndb/include/kernel/signaldata/FailRep.hpp 2010-12-15 13:50:17 +0000 @@ -52,7 +52,8 @@ public: ZLINK_FAILURE=5, ZOTHERNODE_FAILED_DURING_START=6, ZMULTI_NODE_SHUTDOWN = 7, - ZPARTITIONED_CLUSTER = 8 + ZPARTITIONED_CLUSTER = 8, + ZCONNECT_CHECK_FAILURE = 9 }; Uint32 getFailSourceNodeId(Uint32 sigLen) const === added file 'storage/ndb/include/kernel/signaldata/NodePing.hpp' --- a/storage/ndb/include/kernel/signaldata/NodePing.hpp 1970-01-01 00:00:00 +0000 +++ b/storage/ndb/include/kernel/signaldata/NodePing.hpp 2010-12-15 13:50:17 +0000 @@ -0,0 +1,76 @@ +/* + Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#ifndef NODEPING_HPP +#define NODEPING_HPP + +#include "SignalData.hpp" + + +/* + * NodePingReq/Conf is sent between QMGR nodes to help determine the + * available connectivity in a cluster experiencing heartbeat problems + * + * When a node detects that it has not received a heartbeat from a + * connected node for the heartbeat period, it initiates a global + * connectivity check protocol by sending a NODE_PING_REQ signal to all + * nodes considered to be running. + * + * On receiving this signal, a node will respond with NODE_PING_CONF to + * the sender, and begin its own connectivity check, if it is not + * already involved in one. + * + * In this way, all nodes reachable within some latency n will begin + * a connectivity check. If they do not receive a NODE_PING_CONF from a + * peer node within some further latency m, then they consider it to + * be suspect, and after a further latency p they consider it failed. + * + * In environments where latency between nodes fluctuates, but + * connectivity is maintained (for example where TCP connections observe + * latency due to underlying IP re-routing/failover), the connectivity + * check allows nodes to arm themselves in preparation for the potential + * race of FAIL_REP signals that can arise in these situations, by marking + * connections experiencing latency as SUSPECT. Once a node is marked as + * SUSPECT, FAIL_REP signals originating from it may not be trusted or + * acted upon. + */ + +class NodePingReq { + /** + * Sender(s) / Receiver(s) + */ + friend class Qmgr; +public: + STATIC_CONST( SignalLength = 2 ); + + Uint32 senderData; + Uint32 senderRef; +}; + +class NodePingConf { + /** + * Sender(s) / Receiver(s) + */ + friend class Qmgr; +public: + STATIC_CONST( SignalLength = 2 ); + + Uint32 senderData; + Uint32 senderRef; +}; + +#endif === modified file 'storage/ndb/include/kernel/signaldata/SignalData.hpp' --- a/storage/ndb/include/kernel/signaldata/SignalData.hpp 2010-08-17 09:54:53 +0000 +++ b/storage/ndb/include/kernel/signaldata/SignalData.hpp 2010-12-15 13:50:17 +0000 @@ -305,4 +305,7 @@ GSN_PRINT_SIGNATURE(printLOCAL_ROUTE_ORD GSN_PRINT_SIGNATURE(printDBINFO_SCAN); GSN_PRINT_SIGNATURE(printDBINFO_SCAN_REF); +GSN_PRINT_SIGNATURE(printNODE_PING_REQ); +GSN_PRINT_SIGNATURE(printNODE_PING_CONF); + #endif === modified file 'storage/ndb/include/mgmapi/mgmapi_config_parameters.h' --- a/storage/ndb/include/mgmapi/mgmapi_config_parameters.h 2010-12-02 11:02:29 +0000 +++ b/storage/ndb/include/mgmapi/mgmapi_config_parameters.h 2010-12-15 13:50:17 +0000 @@ -179,6 +179,8 @@ #define CFG_DB_EVENTLOG_BUFFER_SIZE 613 #define CFG_DB_NUMA 614 +#define CFG_DB_CONNECT_CHECK_DELAY 615 + #define CFG_NODE_ARBIT_RANK 200 #define CFG_NODE_ARBIT_DELAY 201 #define CFG_RESERVED_SEND_BUFFER_MEMORY 202 === modified file 'storage/ndb/include/mgmapi/ndb_logevent.h' --- a/storage/ndb/include/mgmapi/ndb_logevent.h 2010-10-13 12:22:51 +0000 +++ b/storage/ndb/include/mgmapi/ndb_logevent.h 2010-12-15 13:50:17 +0000 @@ -118,6 +118,12 @@ extern "C" { NDB_LE_LCP_TakeoverStarted = 33, /** NDB_MGM_EVENT_CATEGORY_NODE_RESTART */ NDB_LE_LCP_TakeoverCompleted = 34, + /** NDB_MGM_EVENT_CATEGORY_NODE_RESTART */ + NDB_LE_ConnectCheckStarted = 82, + /** NDB_MGM_EVENT_CATEGORY_NODE_RESTART */ + NDB_LE_ConnectCheckCompleted = 83, + /** NDB_MGM_EVENT_CATEGORY_NODE_RESTART */ + NDB_LE_NodeFailRejected = 84, /** NDB_MGM_EVENT_CATEGORY_STATISTIC */ NDB_LE_TransReportCounters = 35, @@ -478,6 +484,21 @@ extern "C" { struct ndb_logevent_LCP_TakeoverCompleted { unsigned state; }; + struct ndb_logevent_ConnectCheckStarted { + unsigned other_node_count; + unsigned reason; + unsigned causing_node; + }; + struct ndb_logevent_ConnectCheckCompleted { + unsigned nodes_checked; + unsigned nodes_suspect; + unsigned nodes_failed; + }; + struct ndb_logevent_NodeFailRejected { + unsigned reason; + unsigned failed_node; + unsigned source_node; + }; /* STATISTIC */ struct ndb_logevent_TransReportCounters { @@ -834,6 +855,9 @@ extern "C" { struct ndb_logevent_GCP_TakeoverCompleted GCP_TakeoverCompleted; struct ndb_logevent_LCP_TakeoverStarted LCP_TakeoverStarted; struct ndb_logevent_LCP_TakeoverCompleted LCP_TakeoverCompleted; + struct ndb_logevent_ConnectCheckStarted ConnectCheckStarted; + struct ndb_logevent_ConnectCheckCompleted ConnectCheckCompleted; + struct ndb_logevent_NodeFailRejected NodeFailRejected; /* STATISTIC */ struct ndb_logevent_TransReportCounters TransReportCounters; === modified file 'storage/ndb/include/transporter/TransporterRegistry.hpp' --- a/storage/ndb/include/transporter/TransporterRegistry.hpp 2010-11-09 20:40:03 +0000 +++ b/storage/ndb/include/transporter/TransporterRegistry.hpp 2010-12-15 13:50:17 +0000 @@ -325,6 +325,13 @@ public: #ifdef DEBUG_TRANSPORTER void printState(); #endif + +#ifdef ERROR_INSERT + /* Utils for testing latency issues */ + bool isBlocked(NodeId nodeId); + void blockReceive(NodeId nodeId); + void unblockReceive(NodeId nodeId); +#endif class Transporter_interface { public: @@ -355,6 +362,13 @@ private: int nSCITransporters; int nSHMTransporters; +#ifdef ERROR_INSERT + Bitmask m_blocked; + Bitmask m_blocked_with_data; + Bitmask m_blocked_disconnected; + int m_disconnect_errors[MAX_NTRANSPORTERS]; +#endif + /** * Bitmask of transporters that has data "carried over" since * last performReceive === modified file 'storage/ndb/src/common/debugger/EventLogger.cpp' --- a/storage/ndb/src/common/debugger/EventLogger.cpp 2010-10-13 12:22:51 +0000 +++ b/storage/ndb/src/common/debugger/EventLogger.cpp 2010-12-15 13:50:17 +0000 @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -1181,6 +1182,141 @@ void getTextSavedEvent(QQQQ) abort(); } +void getTextConnectCheckStarted(QQQQ) +{ + /* EventReport format : + * 1 : other_node_count + * 2 : reason (FailRep causes or 0) + * 3 : causing_node (if from FailRep) + * 4 : bitmask wordsize + * 5 : bitmask[2] + */ + Uint32 other_node_count = theData[1]; + Uint32 reason = theData[2]; + Uint32 causing_node = theData[3]; + Uint32 bitmaskSz = theData[4]; + char otherNodeMask[100]; + char suspectNodeMask[100]; + BitmaskImpl::getText(bitmaskSz, theData + 5 + (0 * bitmaskSz), otherNodeMask); + BitmaskImpl::getText(bitmaskSz, theData + 5 + (1 * bitmaskSz), suspectNodeMask); + Uint32 suspectCount = BitmaskImpl::count(bitmaskSz, theData + 5 + (1 * bitmaskSz)); + + if (reason) + { + /* Connect check started for specific reason */ + const char * reasonText = NULL; + switch (reason) + { + case FailRep::ZHEARTBEAT_FAILURE: + reasonText = "Heartbeat failure"; + break; + case FailRep::ZCONNECT_CHECK_FAILURE: + reasonText = "Connectivity check request"; + break; + default: + reasonText = "UNKNOWN"; + break; + } + + BaseString::snprintf(m_text, m_text_len, + "Connectivity Check of %u other nodes (%s) started due to %s from node %u.", + other_node_count, + otherNodeMask, + reasonText, + causing_node); + } + else + { + /* Connect check restarted due to suspect nodes */ + BaseString::snprintf(m_text, m_text_len, + "Connectivity Check of %u nodes (%s) restarting due to %u suspect nodes (%s).", + other_node_count, + otherNodeMask, + suspectCount, + suspectNodeMask); + } +} + +void getTextConnectCheckCompleted(QQQQ) +{ + /* EventReport format + * 1 : Nodes checked + * 2 : Suspect nodes + * 3 : Failed nodes + */ + + Uint32 nodes_checked = theData[1]; + Uint32 suspect_nodes = theData[2]; + Uint32 failed_nodes = theData[3]; + + if ((failed_nodes + suspect_nodes) == 0) + { + /* All connectivity ok */ + BaseString::snprintf(m_text, m_text_len, + "Connectivity Check completed on %u nodes, connectivity ok", + nodes_checked); + } + else + { + if (failed_nodes > 0) + { + if (suspect_nodes > 0) + { + BaseString::snprintf(m_text, m_text_len, + "Connectivity Check completed on %u nodes. %u nodes failed. " + "%u nodes still suspect, repeating check.", + nodes_checked, + failed_nodes, + suspect_nodes); + } + else + { + BaseString::snprintf(m_text, m_text_len, + "Connectivity Check completed on %u nodes. %u nodes failed. " + "Connectivity now OK", + nodes_checked, + failed_nodes); + } + } + else + { + /* Just suspect nodes */ + BaseString::snprintf(m_text, m_text_len, + "Connectivity Check completed on %u nodes. %u nodes still suspect, " + "repeating check.", + nodes_checked, + suspect_nodes); + } + } +} + +void getTextNodeFailRejected(QQQQ) +{ + Uint32 reason = theData[1]; + Uint32 failed_node = theData[2]; + Uint32 source_node = theData[3]; + + const char* reasonText = "Unknown"; + switch (reason) + { + case FailRep::ZCONNECT_CHECK_FAILURE: + reasonText = "Connect Check Failure"; + break; + case FailRep::ZLINK_FAILURE: + reasonText = "Link Failure"; + break; + } + + BaseString::snprintf(m_text, m_text_len, + "Received FAIL_REP (%s (%u)) for node %u sourced by suspect node %u. " + "Rejecting as failure of node %u.", + reasonText, + reason, + failed_node, + source_node, + source_node); +} + #if 0 BaseString::snprintf(m_text, m_text_len, @@ -1254,6 +1390,10 @@ const EventLoggerBase::EventRepLogLevelM ROW(LCP_TakeoverStarted, LogLevel::llNodeRestart, 7, Logger::LL_INFO ), ROW(LCP_TakeoverCompleted, LogLevel::llNodeRestart, 7, Logger::LL_INFO ), + ROW(ConnectCheckStarted, LogLevel::llNodeRestart, 6, Logger::LL_INFO ), + ROW(ConnectCheckCompleted, LogLevel::llNodeRestart, 6, Logger::LL_INFO ), + ROW(NodeFailRejected, LogLevel::llNodeRestart, 6, Logger::LL_ALERT ), + // STATISTIC ROW(TransReportCounters, LogLevel::llStatistic, 8, Logger::LL_INFO ), ROW(OperationReportCounters, LogLevel::llStatistic, 8, Logger::LL_INFO ), === modified file 'storage/ndb/src/common/debugger/signaldata/CMakeLists.txt' --- a/storage/ndb/src/common/debugger/signaldata/CMakeLists.txt 2010-11-17 11:35:04 +0000 +++ b/storage/ndb/src/common/debugger/signaldata/CMakeLists.txt 2010-12-15 13:50:17 +0000 @@ -44,5 +44,5 @@ ADD_LIBRARY(ndbsignaldata STATIC LqhTrans.cpp ReadNodesConf.cpp CntrStart.cpp ScanFrag.cpp ApiVersion.cpp LocalRouteOrd.cpp - DbinfoScan.cpp) + DbinfoScan.cpp NodePing.cpp) === modified file 'storage/ndb/src/common/debugger/signaldata/Makefile.am' --- a/storage/ndb/src/common/debugger/signaldata/Makefile.am 2010-08-06 08:19:19 +0000 +++ b/storage/ndb/src/common/debugger/signaldata/Makefile.am 2010-12-15 13:50:17 +0000 @@ -47,7 +47,7 @@ libsignaldataprint_la_SOURCES = \ CreateTrigImpl.cpp DropTrigImpl.cpp \ CreateIndxImpl.cpp DropIndxImpl.cpp AlterIndxImpl.cpp \ BuildIndx.cpp BuildIndxImpl.cpp ApiVersion.cpp \ - LocalRouteOrd.cpp DbinfoScan.cpp + LocalRouteOrd.cpp DbinfoScan.cpp NodePing.cpp include $(top_srcdir)/storage/ndb/config/common.mk.am include $(top_srcdir)/storage/ndb/config/type_ndbapi.mk.am === added file 'storage/ndb/src/common/debugger/signaldata/NodePing.cpp' --- a/storage/ndb/src/common/debugger/signaldata/NodePing.cpp 1970-01-01 00:00:00 +0000 +++ b/storage/ndb/src/common/debugger/signaldata/NodePing.cpp 2010-12-15 13:50:17 +0000 @@ -0,0 +1,38 @@ +/* + Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include + +bool +printNODE_PING_REQ(FILE * output, const Uint32 * theData, + Uint32 len, Uint16 receiverBlockNo) { + const NodePingReq * const sig = CAST_CONSTPTR(NodePingReq, theData); + fprintf(output, " senderRef : %x round : %u\n", + sig->senderRef, + sig->senderData); + return true; +} + +bool +printNODE_PING_CONF(FILE * output, const Uint32 * theData, + Uint32 len, Uint16 receiverBlockNo) { + const NodePingConf * const sig = CAST_CONSTPTR(NodePingConf, theData); + fprintf(output, " senderRef : %x round : %u\n", + sig->senderRef, + sig->senderData); + return true; +} === modified file 'storage/ndb/src/common/debugger/signaldata/SignalDataPrint.cpp' --- a/storage/ndb/src/common/debugger/signaldata/SignalDataPrint.cpp 2010-08-27 17:07:19 +0000 +++ b/storage/ndb/src/common/debugger/signaldata/SignalDataPrint.cpp 2010-12-15 13:50:17 +0000 @@ -257,6 +257,9 @@ SignalDataPrintFunctions[] = { ,{ GSN_DBINFO_SCANCONF, printDBINFO_SCAN } ,{ GSN_DBINFO_SCANREF, printDBINFO_SCAN_REF } + ,{ GSN_NODE_PING_REQ, printNODE_PING_REQ } + ,{ GSN_NODE_PING_CONF, printNODE_PING_CONF } + ,{ 0, 0 } }; === modified file 'storage/ndb/src/common/debugger/signaldata/SignalNames.cpp' --- a/storage/ndb/src/common/debugger/signaldata/SignalNames.cpp 2010-10-20 07:12:58 +0000 +++ b/storage/ndb/src/common/debugger/signaldata/SignalNames.cpp 2010-12-15 13:50:17 +0000 @@ -761,5 +761,8 @@ const GsnName SignalNames [] = { ,{ GSN_SYNC_PATH_REQ, "SYNC_PATH_REQ" } ,{ GSN_SYNC_PATH_CONF, "SYNC_PATH_CONF" } + + ,{ GSN_NODE_PING_REQ, "NODE_PING_REQ" } + ,{ GSN_NODE_PING_CONF, "NODE_PING_CONF" } }; const unsigned short NO_OF_SIGNAL_NAMES = sizeof(SignalNames)/sizeof(GsnName); === modified file 'storage/ndb/src/common/transporter/TransporterRegistry.cpp' --- a/storage/ndb/src/common/transporter/TransporterRegistry.cpp 2010-11-09 20:40:03 +0000 +++ b/storage/ndb/src/common/transporter/TransporterRegistry.cpp 2010-12-15 13:50:17 +0000 @@ -128,6 +128,11 @@ TransporterRegistry::TransporterRegistry } #endif +#ifdef ERROR_INSERT + m_blocked.clear(); + m_blocked_with_data.clear(); + m_blocked_disconnected.clear(); +#endif // Initialize member variables nTransporters = 0; nTCPTransporters = 0; @@ -665,6 +670,16 @@ TransporterRegistry::prepareSend(Transpo return SEND_MESSAGE_TOO_BIG; } } else { +#ifdef ERROR_INSERT + if (m_blocked.get(nodeId)) + { + /* Looks like it disconnected while blocked. We'll pretend + * not to notice for now + */ + WARNING("Signal to " << nodeId << " discarded as node blocked + disconnected"); + return SEND_OK; + } +#endif DEBUG("Signal to " << nodeId << " lost(disconnect) "); return SEND_DISCONNECTED; } @@ -738,6 +753,16 @@ TransporterRegistry::prepareSend(Transpo return SEND_MESSAGE_TOO_BIG; } } else { +#ifdef ERROR_INSERT + if (m_blocked.get(nodeId)) + { + /* Looks like it disconnected while blocked. We'll pretend + * not to notice for now + */ + WARNING("Signal to " << nodeId << " discarded as node blocked + disconnected"); + return SEND_OK; + } +#endif DEBUG("Signal to " << nodeId << " lost(disconnect) "); return SEND_DISCONNECTED; } @@ -951,6 +976,15 @@ TransporterRegistry::pollReceive(Uint32 { for (int i = 0; i < num_socket_events; i++) { + Uint32 trpid = m_epoll_events[i].data.u32; +#ifdef ERROR_INSERT + if (m_blocked.get(trpid)) + { + /* Don't pull from socket now, wait till unblocked */ + m_blocked_with_data.set(trpid); + continue; + } +#endif mask.set(m_epoll_events[i].data.u32); } } @@ -1090,6 +1124,14 @@ TransporterRegistry::poll_TCP(Uint32 tim if (idx[i] != MAX_NODES + 1) { Uint32 node_id = t->getRemoteNodeId(); +#ifdef ERROR_INSERT + if (m_blocked.get(i)) + { + /* Don't pull from socket now, wait till unblocked */ + m_blocked_with_data.set(i); + continue; + } +#endif if (m_socket_poller.has_read(idx[i])) mask.set(node_id); } @@ -1170,6 +1212,19 @@ TransporterRegistry::performReceive() consume_extra_sockets(); } +#ifdef ERROR_INSERT + if (!m_blocked.isclear()) + { + if (m_has_data_transporters.isclear()) + { + /* poll sees data, but we want to ignore for now + * sleep a little to avoid busy loop + */ + NdbSleep_MilliSleep(1); + } + } +#endif + #ifdef NDB_TCP_TRANSPORTER Uint32 id = 0; while ((id = m_has_data_transporters.find(id + 1)) != BitmaskImpl::NotFound) @@ -1360,6 +1415,59 @@ TransporterRegistry::printState(){ } #endif +#ifdef ERROR_INSERT +bool +TransporterRegistry::isBlocked(NodeId nodeId) +{ + return m_blocked.get(nodeId); +} + +void +TransporterRegistry::blockReceive(NodeId nodeId) +{ + /* Check that node is not already blocked? + * Stop pulling from its socket (but track received data etc) + */ + /* Shouldn't already be blocked with data */ + assert(!m_blocked.get(nodeId)); + + m_blocked.set(nodeId); + + if (m_has_data_transporters.get(nodeId)) + { + assert(!m_blocked_with_data.get(nodeId)); + m_blocked_with_data.set(nodeId); + m_has_data_transporters.clear(nodeId); + } +} + +void +TransporterRegistry::unblockReceive(NodeId nodeId) +{ + /* Check that node is blocked? + * Resume pulling from its socket + * Ensure in-flight data is processed if there was some + */ + assert(m_blocked.get(nodeId)); + assert(!m_has_data_transporters.get(nodeId)); + + m_blocked.clear(nodeId); + + if (m_blocked_with_data.get(nodeId)) + { + m_has_data_transporters.set(nodeId); + } + + if (m_blocked_disconnected.get(nodeId)) + { + /* Process disconnect notification/handling now */ + m_blocked_disconnected.clear(nodeId); + + report_disconnect(nodeId, m_disconnect_errors[nodeId]); + } +} +#endif + IOState TransporterRegistry::ioState(NodeId nodeId) { return ioStates[nodeId]; @@ -1479,6 +1587,19 @@ TransporterRegistry::report_disconnect(N { DBUG_ENTER("TransporterRegistry::report_disconnect"); DBUG_PRINT("info",("performStates[%d]=DISCONNECTED",node_id)); + +#ifdef ERROR_INSERT + if (m_blocked.get(node_id)) + { + /* We are simulating real latency, so control events experience + * it too + */ + m_blocked_disconnected.set(node_id); + m_disconnect_errors[node_id] = errnum; + DBUG_VOID_RETURN; + } +#endif + performStates[node_id] = DISCONNECTED; m_has_data_transporters.clear(node_id); callbackObj->reportDisconnect(node_id, errnum); === modified file 'storage/ndb/src/kernel/blocks/ERROR_codes.txt' --- a/storage/ndb/src/kernel/blocks/ERROR_codes.txt 2010-12-04 11:20:36 +0000 +++ b/storage/ndb/src/kernel/blocks/ERROR_codes.txt 2010-12-15 13:50:17 +0000 @@ -28,6 +28,8 @@ Crash president when he starts to run in 935 : Crash master on node failure (delayed) and skip sending GSN_COMMIT_FAILREQ to specified node +938 : Resume communications (DUMP 9991) when > 25% nodes failed + ERROR CODES FOR TESTING NODE FAILURE, GLOBAL CHECKPOINT HANDLING: ----------------------------------------------------------------- === modified file 'storage/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp' --- a/storage/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp 2010-11-10 10:14:04 +0000 +++ b/storage/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp 2010-12-15 13:50:17 +0000 @@ -1781,6 +1781,108 @@ Cmvmi::execDUMP_STATE_ORD(Signal* signal #endif #endif +#ifdef ERROR_INSERT + /* dump 9992 + * On Target NodeId, block receiving signals from NodeId list + * + * dump 9993 + * On Target NodeId, resume receiving signals from NodeId list + * + * dump 9991 + * On Target NodeId, resume receiving signals from any blocked node + * + * + * See also code in QMGR for blocking receive from nodes based + * on HB roles. + * + */ + if((arg == 9993) || /* Unblock recv from nodeid */ + (arg == 9992)) /* Block recv from nodeid */ + { + bool block = (arg == 9992); + for (Uint32 n = 1; n < signal->getLength(); n++) + { + Uint32 nodeId = signal->theData[n]; + + if ((nodeId > 0) && + (nodeId < MAX_NODES)) + { + if (block) + { + ndbout_c("CMVMI : Blocking receive from node %u", nodeId); + + globalTransporterRegistry.blockReceive(nodeId); + } + else + { + ndbout_c("CMVMI : Unblocking receive from node %u", nodeId); + + globalTransporterRegistry.unblockReceive(nodeId); + } + } + else + { + ndbout_c("CMVMI : Ignoring dump %u for node %u", + arg, nodeId); + } + } + } + if (arg == 9990) /* Block recv from all ndbd matching pattern */ + { + Uint32 pattern = 0; + if (signal->getLength() > 1) + { + pattern = signal->theData[1]; + ndbout_c("CMVMI : Blocking receive from all ndbds matching pattern -%s-", + ((pattern == 1)? "Other side":"Unknown")); + } + + for (Uint32 node = 1; node < MAX_NDB_NODES; node++) + { + if (globalTransporterRegistry.is_connected(node)) + { + if (getNodeInfo(node).m_type == NodeInfo::DB) + { + if (!globalTransporterRegistry.isBlocked(node)) + { + switch (pattern) + { + case 1: + { + /* Match if given node is on 'other side' of + * 2-replica cluster + */ + if ((getOwnNodeId() & 1) != (node & 1)) + { + /* Node is on the 'other side', match */ + break; + } + /* Node is on 'my side', don't match */ + continue; + } + default: + break; + } + ndbout_c("CMVMI : Blocking receive from node %u", node); + globalTransporterRegistry.blockReceive(node); + } + } + } + } + } + if (arg == 9991) /* Unblock recv from all blocked */ + { + for (Uint32 node = 0; node < MAX_NODES; node++) + { + if (globalTransporterRegistry.isBlocked(node)) + { + ndbout_c("CMVMI : Unblocking receive from node %u", node); + globalTransporterRegistry.unblockReceive(node); + } + } + } +#endif + if (arg == 9999) { Uint32 delay = 1000; === modified file 'storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp' --- a/storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp 2010-12-13 15:34:50 +0000 +++ b/storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp 2010-12-15 13:50:17 +0000 @@ -147,7 +147,38 @@ public: NdbNodeBitmask c_readnodes_nodes; Uint32 c_maxDynamicId; - + + struct ConnectCheckRec + { + bool m_active; // Connectivity check underway? + Timer m_timer; // Check timer object + Uint32 m_currentRound; // Last round started + Uint32 m_tick; // Periods elapsed in current check + NdbNodeBitmask m_nodesPinged; // Nodes sent a NodePingReq in round + NdbNodeBitmask m_nodesWaiting; // Nodes which have not sent a response + NdbNodeBitmask m_nodesFailedDuring; // Nodes which failed during check + NdbNodeBitmask m_nodesSuspect; // Nodes with suspect connectivity + + ConnectCheckRec() + { + m_active = false; + m_currentRound = 0; + m_tick = 0; + m_nodesPinged.clear(); + m_nodesWaiting.clear(); + m_nodesFailedDuring.clear(); + m_nodesSuspect.clear(); + } + + void reportNodeConnect(Uint32 nodeId); + /* reportNodeFailure. + * return code true means the connect check is completed + */ + bool reportNodeFailure(Uint32 nodeId); + }; + + ConnectCheckRec m_connectivity_check; + // Records struct NodeRec { /* @@ -319,6 +350,10 @@ private: void execUPGRADE_PROTOCOL_ORD(Signal*); + // Connectivity check signals + void execNODE_PINGREQ(Signal* signal); + void execNODE_PINGCONF(Signal* signal); + // Statement blocks void check_readnodes_reply(Signal* signal, Uint32 nodeId, Uint32 gsn); Uint32 check_startup(Signal* signal); @@ -377,6 +412,7 @@ private: void setHbDelay(UintR aHbDelay); void setHbApiDelay(UintR aHbApiDelay); void setArbitTimeout(UintR aArbitTimeout); + void setCCDelay(UintR aCCDelay); // Interface to arbitration module void handleArbitStart(Signal* signal); @@ -400,6 +436,17 @@ private: void computeArbitNdbMask(NdbNodeBitmaskPOD& aMask); void reportArbitEvent(Signal* signal, Ndb_logevent_type type, const NodeBitmask mask = NodeBitmask()); + + // Interface to Connectivity Check + void startConnectivityCheck(Signal* signal, Uint32 reason, Uint32 node); + void checkConnectivityTimeSignal(Signal* signal); + void connectivityCheckCompleted(Signal* signal); + bool isNodeConnectivitySuspect(Uint32 nodeId) const; + void handleFailFromSuspect(Signal* signal, + Uint32 reason, + Uint16 aFailedNode, + Uint16 sourceNode); + // Initialisation void initData(); void initRecords(); @@ -511,6 +558,10 @@ private: // user-defined hbOrder must set all values non-zero and distinct int check_hb_order_config(); bool m_hb_order_config_used; + +#ifdef ERROR_INSERT + Uint32 nodeFailCount; +#endif }; #endif === modified file 'storage/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp' --- a/storage/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp 2010-06-16 16:56:34 +0000 +++ b/storage/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp 2010-12-15 13:50:17 +0000 @@ -70,6 +70,10 @@ void Qmgr::initData() ndb_mgm_get_int_parameter(p, CFG_DB_API_HEARTBEAT_INTERVAL, &hbDBAPI); setHbApiDelay(hbDBAPI); + +#ifdef ERROR_INSERT + nodeFailCount = 0; +#endif }//Qmgr::initData() void Qmgr::initRecords() @@ -146,6 +150,10 @@ Qmgr::Qmgr(Block_context& ctx) addRecSignal(GSN_UPGRADE_PROTOCOL_ORD, &Qmgr::execUPGRADE_PROTOCOL_ORD); + // Connectivity check signals + addRecSignal(GSN_NODE_PING_REQ, &Qmgr::execNODE_PINGREQ); + addRecSignal(GSN_NODE_PING_CONF, &Qmgr::execNODE_PINGCONF); + initData(); }//Qmgr::Qmgr() === modified file 'storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp' --- a/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp 2010-12-13 16:52:27 +0000 +++ b/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp 2010-12-15 13:50:17 +0000 @@ -39,6 +39,7 @@ #include #include #include +#include #include @@ -437,6 +438,21 @@ void Qmgr::setArbitTimeout(UintR aArbitT arbitRec.timeout = (aArbitTimeout < 10 ? 10 : aArbitTimeout); } +void Qmgr::setCCDelay(UintR aCCDelay) +{ + NDB_TICKS now = NdbTick_CurrentMillisecond(); + if (aCCDelay == 0) + { + /* Connectivity check disabled */ + m_connectivity_check.m_timer.setDelay(0); + } + else + { + m_connectivity_check.m_timer.setDelay(aCCDelay < 10 ? 10 : aCCDelay); + m_connectivity_check.m_timer.reset(now); + } +} + void Qmgr::execCONNECT_REP(Signal* signal) { jamEntry(); @@ -1991,6 +2007,7 @@ void Qmgr::execCM_ADD(Signal* signal) jam(); ndbrequire(addNodePtr.p->phase == ZSTARTING); addNodePtr.p->phase = ZRUNNING; + m_connectivity_check.reportNodeConnect(addNodePtr.i); setNodeInfo(addNodePtr.i).m_heartbeat_cnt= 0; c_clusterNodes.set(addNodePtr.i); findNeighbours(signal, __LINE__); @@ -2389,6 +2406,7 @@ void Qmgr::initData(Signal* signal) Uint32 hbDBDB = 1500; Uint32 arbitTimeout = 1000; Uint32 arbitMethod = ARBIT_METHOD_DEFAULT; + Uint32 ccInterval = 0; c_restartPartialTimeout = 30000; c_restartPartionedTimeout = 60000; c_restartFailureTimeout = ~0; @@ -2401,6 +2419,8 @@ void Qmgr::initData(Signal* signal) &c_restartPartionedTimeout); ndb_mgm_get_int_parameter(p, CFG_DB_START_FAILURE_TIMEOUT, &c_restartFailureTimeout); + ndb_mgm_get_int_parameter(p, CFG_DB_CONNECT_CHECK_DELAY, + &ccInterval); if(c_restartPartialTimeout == 0) { @@ -2418,6 +2438,7 @@ void Qmgr::initData(Signal* signal) } setHbDelay(hbDBDB); + setCCDelay(ccInterval); setArbitTimeout(arbitTimeout); arbitRec.method = (ArbitRec::Method)arbitMethod; @@ -2536,10 +2557,22 @@ void Qmgr::timerHandlingLab(Signal* sign sendHeartbeat(signal); hb_send_timer.reset(TcurrentTime); } - if (hb_check_timer.check(TcurrentTime)) { - jam(); - checkHeartbeat(signal); - hb_check_timer.reset(TcurrentTime); + if (likely(! m_connectivity_check.m_active)) + { + if (hb_check_timer.check(TcurrentTime)) { + jam(); + checkHeartbeat(signal); + hb_check_timer.reset(TcurrentTime); + } + } + else + { + /* Connectivity check */ + if (m_connectivity_check.m_timer.check(TcurrentTime)) { + jam(); + checkConnectivityTimeSignal(signal); + m_connectivity_check.m_timer.reset(TcurrentTime); + } } } @@ -2644,16 +2677,26 @@ void Qmgr::checkHeartbeat(Signal* signal if (getNodeInfo(nodePtr.i).m_heartbeat_cnt > 4) { jam(); - /**---------------------------------------------------------------------- - * OUR LEFT NEIGHBOUR HAVE KEPT QUIET FOR THREE CONSECUTIVE HEARTBEAT - * PERIODS. THUS WE DECLARE HIM DOWN. - *----------------------------------------------------------------------*/ - signal->theData[0] = NDB_LE_DeadDueToHeartbeat; - signal->theData[1] = nodePtr.i; - sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB); - - failReportLab(signal, nodePtr.i, FailRep::ZHEARTBEAT_FAILURE, getOwnNodeId()); - return; + if (m_connectivity_check.m_timer.getDelay() > 0) + { + jam(); + /* Start connectivity check, indicating the cause */ + startConnectivityCheck(signal, FailRep::ZHEARTBEAT_FAILURE, nodePtr.i); + return; + } + else + { + /**---------------------------------------------------------------------- + * OUR LEFT NEIGHBOUR HAVE KEPT QUIET FOR THREE CONSECUTIVE HEARTBEAT + * PERIODS. THUS WE DECLARE HIM DOWN. + *----------------------------------------------------------------------*/ + signal->theData[0] = NDB_LE_DeadDueToHeartbeat; + signal->theData[1] = nodePtr.i; + sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB); + + failReportLab(signal, nodePtr.i, FailRep::ZHEARTBEAT_FAILURE, getOwnNodeId()); + return; + } }//if }//Qmgr::checkHeartbeat() @@ -3574,6 +3617,24 @@ void Qmgr::failReportLab(Signal* signal, jam(); return; } + + if (isNodeConnectivitySuspect(sourceNode) && + // (! isNodeConnectivitySuspect(aFailedNode)) && // TODO : Required? + ((aFailCause == FailRep::ZCONNECT_CHECK_FAILURE) || + (aFailCause == FailRep::ZLINK_FAILURE))) + { + jam(); + /* Connectivity related failure report from a node with suspect + * connectivity, handle differently + */ + ndbrequire(sourceNode != getOwnNodeId()); + + handleFailFromSuspect(signal, + aFailCause, + aFailedNode, + sourceNode); + return; + } if (failedNodePtr.i == getOwnNodeId()) { jam(); @@ -3630,6 +3691,9 @@ void Qmgr::failReportLab(Signal* signal, case FailRep::ZMULTI_NODE_SHUTDOWN: msg = "Multi node shutdown"; break; + case FailRep::ZCONNECT_CHECK_FAILURE: + msg = "Connectivity check failure"; + break; default: msg = ""; } @@ -4485,6 +4549,34 @@ void Qmgr::failReport(Signal* signal, ptrCheckGuard(failedNodePtr, MAX_NDB_NODES, nodeRec); if (failedNodePtr.p->phase == ZRUNNING) { jam(); + +#ifdef ERROR_INSERT + if (ERROR_INSERTED(938)) + { + nodeFailCount++; + ndbout_c("QMGR : execFAIL_REP : %u nodes have failed", nodeFailCount); + /* Count DB nodes */ + Uint32 nodeCount = 0; + for (Uint32 i = 1; i < MAX_NDB_NODES; i++) + { + if (getNodeInfo(i).getType() == NODE_TYPE_DB) + nodeCount++; + } + + /* When > 25% of cluster has failed, resume communications */ + if (nodeFailCount > (nodeCount / 4)) + { + ndbout_c("QMGR : execFAIL_REP > 25%% nodes failed, resuming comms"); + Signal save = *signal; + signal->theData[0] = 9991; + sendSignal(CMVMI_REF, GSN_DUMP_STATE_ORD, signal, 1, JBB); + *signal = save; + nodeFailCount = 0; + CLEAR_ERROR_INSERT_VALUE; + } + } +#endif + /* WE ALSO NEED TO ADD HERE SOME CODE THAT GETS OUR NEW NEIGHBOURS. */ if (cpresident == getOwnNodeId()) { jam(); @@ -4534,6 +4626,13 @@ void Qmgr::failReport(Signal* signal, jam(); return; }//if + + if (unlikely(m_connectivity_check.reportNodeFailure(failedNodePtr.i))) + { + jam(); + connectivityCheckCompleted(signal); + } + failedNodePtr.p->ndynamicId = 0; findNeighbours(signal, __LINE__); if (failedNodePtr.i == cpresident) { @@ -5810,6 +5909,55 @@ Qmgr::execDUMP_STATE_ORD(Signal* signal) } ndbout << buf << endl; } + +#ifdef ERROR_INSERT + Uint32 dumpCode = signal->theData[0]; + if ((dumpCode == 9992) || + (dumpCode == 9993)) + { + if (signal->getLength() == 2) + { + Uint32 nodeId = signal->theData[1]; + Uint32& newNodeId = signal->theData[1]; + Uint32 length = 2; + assert(257 > MAX_NODES); + if (nodeId > MAX_NODES) + { + const char* type = "None"; + switch (nodeId) + { + case 257: + { + /* Left (lower) neighbour */ + newNodeId = cneighbourl; + type = "Left neighbour"; + break; + } + case 258: + { + /* Right (higher) neighbour */ + newNodeId = cneighbourh; + type = "Right neighbour"; + break; + } + case 259: + { + /* President */ + newNodeId = cpresident; + type = "President"; + break; + } + } + ndbout_c("QMGR : Mapping request on node id %u to node id %u (%s)", + nodeId, newNodeId, type); + if (newNodeId != nodeId) + { + sendSignal(CMVMI_REF, GSN_DUMP_STATE_ORD, signal, length, JBB); + } + } + } + } +#endif }//Qmgr::execDUMP_STATE_ORD() @@ -6206,3 +6354,491 @@ Qmgr::check_hb_order_config() m_hb_order_config_used = true; return 0; } + +static const Uint32 CC_SuspectTicks = 1; +static const Uint32 CC_FailedTicks = 2; + +void +Qmgr::startConnectivityCheck(Signal* signal, Uint32 reason, Uint32 causingNode) +{ + jam(); + ndbrequire(m_connectivity_check.m_timer.getDelay() > 0); + + if (m_connectivity_check.m_active) + { + jam(); + /* Connectivity check underway already + * do nothing + */ + return; + } + + + m_connectivity_check.m_nodesPinged.clear(); + + /* Send NODE_PINGREQ signal to all other running nodes, and + * initialise connectivity check bitmasks. + * Note that nodes may already be considered suspect due to + * a previous connectivity check round. + */ + Uint32 ownId = getOwnNodeId(); + NodePingReq* pingReq = CAST_PTR(NodePingReq, &signal->theData[0]); + pingReq->senderData = ++m_connectivity_check.m_currentRound; + pingReq->senderRef = reference(); + + for (Uint32 i=1; i < MAX_NDB_NODES; i++) + { + if (i != ownId) + { + NodeRec& node = nodeRec[i]; + if (node.phase == ZRUNNING) + { + /* If connection was considered ok, treat as unknown, + * If it was considered slow, continue to treat + * as slow + */ + sendSignal(node.blockRef, + GSN_NODE_PING_REQ, + signal, + NodePingReq::SignalLength, + JBA); + + m_connectivity_check.m_nodesPinged.set(i); + } + } + } + + /* Initialise result bitmasks */ + m_connectivity_check.m_nodesWaiting.assign(m_connectivity_check.m_nodesPinged); + m_connectivity_check.m_nodesFailedDuring.clear(); + + /* Ensure only live nodes are considered suspect */ + m_connectivity_check.m_nodesSuspect.bitAND(m_connectivity_check.m_nodesPinged); + + const char* reasonText = "Unknown"; + bool firstTime = true; + + switch(reason) + { + case FailRep::ZHEARTBEAT_FAILURE: + reasonText = "Heartbeat failure"; + break; + case FailRep::ZCONNECT_CHECK_FAILURE: + reasonText = "Connectivity check request"; + break; + default: + firstTime = false; + ndbrequire(m_connectivity_check.m_nodesSuspect.count() > 0); + break; + } + + if (!m_connectivity_check.m_nodesPinged.isclear()) + { + jam(); + { + char buff[100]; + m_connectivity_check.m_nodesPinged.getText(buff); + if (firstTime) + { + g_eventLogger->info("QMGR : Starting connectivity check of %u other nodes (%s) due to %s from node %u.", + m_connectivity_check.m_nodesPinged.count(), + buff, + reasonText, + causingNode); + } + else + { + char buff2[100]; + m_connectivity_check.m_nodesSuspect.getText(buff2); + g_eventLogger->info("QMGR : Restarting connectivity check of %u other nodes (%s) due to %u syspect nodes (%s)", + m_connectivity_check.m_nodesPinged.count(), + buff, + m_connectivity_check.m_nodesSuspect.count(), + buff2); + } + } + + /* Generate cluster log event */ + Uint32 bitmaskSz = NdbNodeBitmask::Size; + signal->theData[0] = NDB_LE_ConnectCheckStarted; + signal->theData[1] = m_connectivity_check.m_nodesPinged.count(); + signal->theData[2] = reason; + signal->theData[3] = causingNode; + signal->theData[4] = bitmaskSz; + Uint32* sigPtr = &signal->theData[5]; + m_connectivity_check.m_nodesPinged.copyto(bitmaskSz, sigPtr); sigPtr+= bitmaskSz; + m_connectivity_check.m_nodesSuspect.copyto(bitmaskSz, sigPtr); + sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 5 + (2 * bitmaskSz), JBB); + + m_connectivity_check.m_active = true; + m_connectivity_check.m_tick = 0; + NDB_TICKS now = NdbTick_CurrentMillisecond(); + m_connectivity_check.m_timer.reset(now); + } + else + { + g_eventLogger->info("QMGR : Connectivity check requested due to %s (from %u) not started as no other running nodes.", + reasonText, + causingNode); + } +} + +void +Qmgr::execNODE_PINGREQ(Signal* signal) +{ + jamEntry(); + Uint32 ownId = getOwnNodeId(); + const NodePingReq* pingReq = CAST_CONSTPTR(NodePingReq, &signal->theData[0]); + Uint32 sendersRef = signal->getSendersBlockRef(); + Uint32 sendersNodeId = refToNode(sendersRef); + Uint32 senderData = pingReq->senderData; + + ndbrequire(sendersNodeId != ownId); + + /* We will start our own connectivity check if necessary + * before responding with PING_CONF to the requestor. + * This means that the sending node will receive our PING_REQ + * before our PING_CONF, which should avoid them starting an + * unnecessary extra connectivity check round in some cases. + */ + if (likely(m_connectivity_check.m_timer.getDelay() > 0)) + { + jam(); + /* We have connectivity checking configured */ + if (! m_connectivity_check.m_active) + { + jam(); + + { + /* Don't start a new connectivity check if the requesting + * node has failed from our point of view + */ + NodeRecPtr nodePtr; + nodePtr.i = sendersNodeId; + ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRec); + if (unlikely(nodePtr.p->phase != ZRUNNING)) + { + jam(); + + g_eventLogger->warning("QMGR : Discarding NODE_PINGREQ from non-running node %u (%u)", + sendersNodeId, nodePtr.p->phase); + return; + } + } + + /* Start our own Connectivity Check now indicating reason and causing node */ + startConnectivityCheck(signal, FailRep::ZCONNECT_CHECK_FAILURE, sendersNodeId); + } + } + else + { + jam(); + g_eventLogger->warning("QMGR : NODE_PINGREQ received from node %u, but connectivity " + "checking not configured on this node. Ensure all " + "nodes have the same configuration for parameter " + "ConnectCheckIntervalMillis.", + sendersNodeId); + } + + /* Now respond with NODE_PINGCONF */ + NodePingConf* pingConf = CAST_PTR(NodePingConf, &signal->theData[0]); + + pingConf->senderData = senderData; + pingConf->senderRef = reference(); + + sendSignal(sendersRef, + GSN_NODE_PING_CONF, + signal, + NodePingConf::SignalLength, + JBA); +} + +void +Qmgr::ConnectCheckRec::reportNodeConnect(Uint32 nodeId) +{ + /* Clear any suspicion */ + m_nodesSuspect.clear(nodeId); +} + +bool +Qmgr::ConnectCheckRec::reportNodeFailure(Uint32 nodeId) +{ + if (unlikely(m_active)) + { + m_nodesFailedDuring.set(nodeId); + + if (m_nodesWaiting.get(nodeId)) + { + /* We were waiting for a NODE_PING_CONF from this node, + * remove it from the set + */ + m_nodesWaiting.clear(nodeId); + + return m_nodesWaiting.isclear(); + } + } + return false; +} + +void +Qmgr::execNODE_PINGCONF(Signal* signal) +{ + jamEntry(); + + ndbrequire(m_connectivity_check.m_timer.getDelay() > 0); + + const NodePingConf* pingConf = CAST_CONSTPTR(NodePingConf, &signal->theData[0]); + Uint32 sendersBlockRef = signal->getSendersBlockRef(); + Uint32 sendersNodeId = refToNode(sendersBlockRef); + Uint32 roundNumber = pingConf->senderData; + + ndbrequire(sendersNodeId != getOwnNodeId()); + ndbrequire((m_connectivity_check.m_active) || /* Normal */ + (m_connectivity_check.m_nodesWaiting.get(sendersNodeId) || /* We killed last round */ + m_connectivity_check.m_nodesFailedDuring.get(sendersNodeId))); /* Someone killed */ + + if (unlikely((! m_connectivity_check.m_active) || + (roundNumber != m_connectivity_check.m_currentRound))) + { + g_eventLogger->warning("QMGR : Received NODEPING_CONF from node %u for round %u, " + "but we are %sactive on round %u. Discarding.", + sendersNodeId, + roundNumber, + ((m_connectivity_check.m_active)?"":"in"), + m_connectivity_check.m_currentRound); + return; + } + + /* Node must have been pinged, we must be waiting for the response, + * or the node must have already failed + */ + ndbrequire(m_connectivity_check.m_nodesPinged.get(sendersNodeId)); + ndbrequire(m_connectivity_check.m_nodesWaiting.get(sendersNodeId) || + m_connectivity_check.m_nodesFailedDuring.get(sendersNodeId)); + + m_connectivity_check.m_nodesWaiting.clear(sendersNodeId); + + if (likely(m_connectivity_check.m_tick < CC_SuspectTicks)) + { + jam(); + /* Node responded on time, clear any suspicion about it */ + m_connectivity_check.m_nodesSuspect.clear(sendersNodeId); + } + + if (m_connectivity_check.m_nodesWaiting.isclear()) + { + jam(); + /* Connectivity check round is now finished */ + connectivityCheckCompleted(signal); + } +} + +void +Qmgr::connectivityCheckCompleted(Signal* signal) +{ + jam(); + + m_connectivity_check.m_active = false; + + /* Log the following : + * Nodes checked + * Nodes responded ok + * Nodes responded late (now suspect) + * Nodes failed to respond. + * Nodes failed during + */ + char pinged[100]; + char late[100]; + char silent[100]; + char failed[100]; + + /* Any 'waiting' nodes have been killed + * Surviving suspects do not include them. + */ + NdbNodeBitmask survivingSuspects(m_connectivity_check.m_nodesSuspect); + survivingSuspects.bitANDC(m_connectivity_check.m_nodesWaiting); + + /* Nodes that failed during the check are also excluded */ + survivingSuspects.bitANDC(m_connectivity_check.m_nodesFailedDuring); + + m_connectivity_check.m_nodesPinged.getText(pinged); + survivingSuspects.getText(late); + m_connectivity_check.m_nodesWaiting.getText(silent); + m_connectivity_check.m_nodesFailedDuring.getText(failed); + + g_eventLogger->info("QMGR : Connectivity check completed, " + "%u other nodes checked (%s), " + "%u responded on time, " + "%u responded late (%s), " + "%u no response will be failed (%s), " + "%u failed during check (%s)\n", + m_connectivity_check.m_nodesPinged.count(), + pinged, + m_connectivity_check.m_nodesPinged.count() - + m_connectivity_check.m_nodesSuspect.count(), + survivingSuspects.count(), + late, + m_connectivity_check.m_nodesWaiting.count(), + silent, + m_connectivity_check.m_nodesFailedDuring.count(), + failed); + + /* Log in Cluster log */ + signal->theData[0] = NDB_LE_ConnectCheckCompleted; + signal->theData[1] = m_connectivity_check.m_nodesPinged.count(); + signal->theData[2] = survivingSuspects.count(); + signal->theData[3] = m_connectivity_check.m_nodesWaiting.count() + + m_connectivity_check.m_nodesFailedDuring.count(); + + sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 4, JBB); + + if (survivingSuspects.count() > 0) + { + jam(); + /* Still suspect nodes, start another round */ + g_eventLogger->info("QMGR : Starting new connectivity check due to suspect nodes."); + /* Restart connectivity check, no external reason or cause */ + startConnectivityCheck(signal, 0, 0); + } + else + { + jam(); + /* No suspect nodes, stop the protocol now */ + + g_eventLogger->info("QMGR : All other nodes (%u) connectivity ok.", + m_connectivity_check.m_nodesPinged.count() - + (m_connectivity_check.m_nodesWaiting.count() + + m_connectivity_check.m_nodesFailedDuring.count())); + + /* Send a heartbeat to our right neighbour at this point as a gesture + * of goodwill + */ + sendHeartbeat(signal); + hb_send_timer.reset(NdbTick_CurrentMillisecond()); + }; +} + +void +Qmgr::checkConnectivityTimeSignal(Signal* signal) +{ + /* Executed periodically when a connectivity check is + * underway. + * After CC_SuspectTicks have elapsed, any nodes + * which have not responded are considered + * 'Suspect'. + * After CC_FailedTicks have elapsed, any nodes + * which have not responded are considered + * to have failed, and failure handling + * begins. + */ + jam(); + + /* Preconditions, otherwise we shouldn't have been called */ + ndbrequire(m_connectivity_check.m_timer.getDelay() > 0); + ndbrequire(m_connectivity_check.m_active); + ndbrequire(!m_connectivity_check.m_nodesWaiting.isclear()); + + m_connectivity_check.m_tick++; + + switch (m_connectivity_check.m_tick) + { + case CC_SuspectTicks: + { + jam(); + /* Still waiting to hear from some nodes, they are now + * suspect + */ + m_connectivity_check.m_nodesSuspect.bitOR(m_connectivity_check.m_nodesWaiting); + return; + } + case CC_FailedTicks: + { + jam(); + /* Still waiting to hear from some nodes, they will now + * be failed + */ + m_connectivity_check.m_active = false; + Uint32 nodeId = 0; + + while ((nodeId = m_connectivity_check.m_nodesWaiting.find(nodeId)) + != BitmaskImpl::NotFound) + { + jam(); + /* Log failure reason */ + /* Todo : Connectivity Check specific failure log? */ + signal->theData[0] = NDB_LE_DeadDueToHeartbeat; + signal->theData[1] = nodeId; + + sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB); + + /* Fail the node */ + /* TODO : Consider real time break here */ + failReportLab(signal, nodeId, FailRep::ZCONNECT_CHECK_FAILURE, getOwnNodeId()); + nodeId++; + } + + /* Now handle the end of the Connectivity Check */ + connectivityCheckCompleted(signal); + } + } +} + +bool +Qmgr::isNodeConnectivitySuspect(Uint32 nodeId) const +{ + return m_connectivity_check.m_nodesSuspect.get(nodeId); +} + +void +Qmgr::handleFailFromSuspect(Signal* signal, + Uint32 reason, + Uint16 aFailedNode, + Uint16 sourceNode) +{ + jam(); + + const char* reasonText = "Unknown"; + + /* We have received a failure report about some node X from + * some other node that we consider to have suspect connectivity + * which may have caused the report. + * + * We will 'invert' the sense of this, and handle it as + * a failure report of the sender, with the same cause. + */ + switch(reason) + { + case FailRep::ZCONNECT_CHECK_FAILURE: + jam(); + /* Suspect says that connectivity check failed for another node. + * As suspect has bad connectivity from our point of view, we + * blame him. + */ + reasonText = "ZCONNECT_CHECK_FAILURE"; + break; + case FailRep::ZLINK_FAILURE: + jam(); + /* Suspect says that link failed for another node. + * As suspect has bad connectivity from our point of view, we + * blame her. + */ + reasonText = "ZLINK_FAILURE"; + break; + default: + ndbrequire(false); + } + + g_eventLogger->warning("QMGR : Received Connectivity failure notification about " + "%u from suspect node %u with reason %s. " + "Mapping to failure of %u sourced by me.", + aFailedNode, sourceNode, reasonText, sourceNode); + + signal->theData[0] = NDB_LE_NodeFailRejected; + signal->theData[1] = reason; + signal->theData[2] = aFailedNode; + signal->theData[3] = sourceNode; + + sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 4, JBB); + + failReportLab(signal, sourceNode, (FailRep::FailCause) reason, getOwnNodeId()); +} === modified file 'storage/ndb/src/mgmsrv/ConfigInfo.cpp' --- a/storage/ndb/src/mgmsrv/ConfigInfo.cpp 2010-12-02 11:02:29 +0000 +++ b/storage/ndb/src/mgmsrv/ConfigInfo.cpp 2010-12-15 13:50:17 +0000 @@ -987,6 +987,18 @@ const ConfigInfo::ParamInfo ConfigInfo:: STR_VALUE(MAX_INT_RNIL) }, { + CFG_DB_CONNECT_CHECK_DELAY, + "ConnectCheckIntervalDelay", + DB_TOKEN, + "Time between "DB_TOKEN_PRINT" connectivity check stages. "DB_TOKEN_PRINT" considered suspect after 1 and dead after 2 intervals.", + ConfigInfo::CI_USED, + 0, + ConfigInfo::CI_INT, + "0", + "0", + STR_VALUE(MAX_INT_RNIL) }, + + { CFG_DB_API_HEARTBEAT_INTERVAL, "HeartbeatIntervalDbApi", DB_TOKEN, === modified file 'storage/ndb/test/ndbapi/testNodeRestart.cpp' --- a/storage/ndb/test/ndbapi/testNodeRestart.cpp 2010-11-24 12:16:55 +0000 +++ b/storage/ndb/test/ndbapi/testNodeRestart.cpp 2010-12-15 13:50:17 +0000 @@ -4180,6 +4180,513 @@ runBug58453(NDBT_Context* ctx, NDBT_Step return NDBT_OK; } + + +int runRestartToDynamicOrder(NDBT_Context* ctx, NDBT_Step* step) +{ + /* Here we perform node restarts to get the various node's + * dynamic ids in a particular order + * This affects which nodes heartbeat which (low -> high) + * and which is the president (lowest). + * Each restarting node gets a higher dynamic id, so the + * first node restarted will eventually become president + * Note that we're assuming NoOfReplicas == 2 here. + */ + /* TODO : + * Refactor into + * 1) Get current cluster dynorder info + * 2) Choose a dynorder info + * 3) Restart to given dynorder if necessary + */ + Uint32 dynOrder = ctx->getProperty("DynamicOrder", Uint32(0)); + NdbRestarter restarter; + Uint32 numNodes = restarter.getNumDbNodes(); + + Vector currOrder; + Vector newOrder; + Vector odds; + Vector evens; + + if (numNodes == 2) + { + ndbout_c("No Dynamic reordering possible with 2 nodes"); + return NDBT_OK; + } + if (numNodes & 1) + { + ndbout_c("Non multiple-of-2 number of nodes. Not supported"); + return NDBT_FAILED; + } + + Uint32 master = restarter.getMasterNodeId(); + + for (Uint32 n=0; n < numNodes; n++) + { + currOrder.push_back(master); + master = restarter.getNextMasterNodeId(master); + } + + for (Uint32 n=0; n < numNodes; n++) + { + Uint32 nodeId = restarter.getDbNodeId(n); + if (nodeId & 1) + { + odds.push_back(nodeId); + } + else + { + evens.push_back(nodeId); + } + } + + if (odds.size() != evens.size()) + { + ndbout_c("Failed - odds.size() (%u) != evens.size() (%u)", + odds.size(), + evens.size()); + return NDBT_FAILED; + } + + ndbout_c("Current dynamic ordering : "); + for (Uint32 n=0; n4->6->3->5->7 + * 2) Odd first, no interleave, no reverse + * e.g. 3->5->7->2->4->6 + * 3) Even first, interleave, no reverse + * e.g. 2->3->4->5->6->7 + * 9) Even first, no interleave, reverse B + * e.g. 2->4->6->7->5->3 + * + * 'First' node becomes president. + * Which node(s) monitor president affects when + * arbitration may be required + */ + + ndbout_c("Generating ordering with %s president, sides %sinterleaved", + (oddPresident?"odd": "even"), + (interleave?"":"not ")); + if (reverseSideA) + ndbout_c(" %s reversed", (oddPresident?"odds": "evens")); + + if (reverseSideB) + ndbout_c(" %s reversed", (oddPresident?"evens": "odds")); + + Vector* sideA; + Vector* sideB; + + if (oddPresident) + { + sideA = &odds; + sideB = &evens; + } + else + { + sideA = &evens; + sideB = &odds; + } + + if (interleave) + { + for (Uint32 n=0; n < sideA->size(); n++) + { + Uint32 indexA = reverseSideA? (sideA->size() - (n+1)) : n; + newOrder.push_back((*sideA)[indexA]); + Uint32 indexB = reverseSideB? (sideB->size() - (n+1)) : n; + newOrder.push_back((*sideB)[indexB]); + } + } + else + { + for (Uint32 n=0; n < sideA->size(); n++) + { + Uint32 indexA = reverseSideA? (sideA->size() - (n+1)) : n; + newOrder.push_back((*sideA)[indexA]); + } + for (Uint32 n=0; n < sideB->size(); n++) + { + Uint32 indexB = reverseSideB? (sideB->size() - (n+1)) : n; + newOrder.push_back((*sideB)[indexB]); + } + } + + + bool diff = false; + for (Uint32 n=0; n < newOrder.size(); n++) + { + ndbout_c(" %u %s", + newOrder[n], + ((n==0)?"*":" ")); + + diff |= (newOrder[n] != currOrder[n]); + } + + if (!diff) + { + ndbout_c("Cluster already in correct configuration"); + return NDBT_OK; + } + + for (Uint32 n=0; n < newOrder.size(); n++) + { + ndbout_c("Now restarting node %u", newOrder[n]); + if (restarter.restartOneDbNode(newOrder[n], + false, // initial + true, // nostart + true) // abort + != NDBT_OK) + { + ndbout_c("Failed to restart node"); + return NDBT_FAILED; + } + if (restarter.waitNodesNoStart((const int*) &newOrder[n], 1) != NDBT_OK) + { + ndbout_c("Failed waiting for node to enter NOSTART state"); + return NDBT_FAILED; + } + if (restarter.startNodes((const int*) &newOrder[n], 1) != NDBT_OK) + { + ndbout_c("Failed to start node"); + return NDBT_FAILED; + } + if (restarter.waitNodesStarted((const int*) &newOrder[n], 1) != NDBT_OK) + { + ndbout_c("Failed waiting for node to start"); + return NDBT_FAILED; + } + ndbout_c(" Done."); + } + + ndbout_c("All restarts completed. NdbRestarter says master is %u", + restarter.getMasterNodeId()); + if (restarter.getMasterNodeId() != (int) newOrder[0]) + { + ndbout_c(" Should be %u, failing", newOrder[0]); + return NDBT_FAILED; + } + + return NDBT_OK; +} + +struct NodeGroupMembers +{ + Uint32 ngid; + Uint32 membCount; + Uint32 members[4]; +}; + +template class Vector; + +int analyseDynamicOrder(NDBT_Context* ctx, NDBT_Step* step) +{ + NdbRestarter restarter; + Uint32 numNodes = restarter.getNumDbNodes(); + Uint32 master = restarter.getMasterNodeId(); + Vector dynamicOrder; + Vector nodeGroup; + Vector monitorsNode; + Vector monitoredByNode; + Vector monitorsRemote; + Vector remoteMonitored; + Vector sameNGMonitored; + Vector distanceToRemote; + Vector nodeIdToDynamicIndex; + Uint32 maxDistanceToRemoteLink = 0; + + /* TODO : + * Refactor into : + * 1) Determine dynorder from running cluster + * 2) Analyse dynorder in general + * 3) Analyse dynorder from point of view of latency split + * + * 4) Support splits other than odd/even total + * - Partial split + * - Some link failures + */ + + /* Determine dynamic order from running cluster */ + for (Uint32 n=0; n < numNodes; n++) + { + dynamicOrder.push_back(master); + nodeGroup.push_back(restarter.getNodeGroup(master)); + master = restarter.getNextMasterNodeId(master); + Uint32 zero=0; + nodeIdToDynamicIndex.set(n, master, zero); + } + + /* Look at implied HB links */ + for (Uint32 n=0; n < numNodes; n++) + { + Uint32 nodeId = dynamicOrder[n]; + Uint32 monitoredByIndex = (n+1) % numNodes; + Uint32 monitorsIndex = (n+ numNodes - 1) % numNodes; + monitoredByNode.push_back(dynamicOrder[monitoredByIndex]); + monitorsNode.push_back(dynamicOrder[monitorsIndex]); + remoteMonitored.push_back((nodeId & 1) != (monitoredByNode[n] & 1)); + monitorsRemote.push_back((nodeId & 1) != (monitorsNode[n] & 1)); + sameNGMonitored.push_back(nodeGroup[n] == nodeGroup[monitoredByIndex]); + } + + /* Look at split implications */ + for (Uint32 n=0; n < numNodes; n++) + { + Uint32 distanceToRemoteHBLink = 0; + for (Uint32 m=0; m < numNodes; m++) + { + if (remoteMonitored[n+m]) + break; + distanceToRemoteHBLink++; + } + + distanceToRemote.push_back(distanceToRemoteHBLink); + maxDistanceToRemoteLink = MAX(maxDistanceToRemoteLink, distanceToRemoteHBLink); + } + + + ndbout_c("Dynamic order analysis"); + + for (Uint32 n=0; n < numNodes; n++) + { + ndbout_c(" %u %s %u%s%u%s%u \t Monitored by %s nodegroup, Dist to remote link : %u", + dynamicOrder[n], + ((n==0)?"*":" "), + monitorsNode[n], + ((monitorsRemote[n])?" >":"-->"), + dynamicOrder[n], + ((remoteMonitored[n])?" >":"-->"), + monitoredByNode[n], + ((sameNGMonitored[n])?"same":"other"), + distanceToRemote[n]); + } + + ndbout_c("\n"); + + Vector nodeGroupMembers; + + for (Uint32 n=0; n < numNodes; n++) + { + Uint32 ng = nodeGroup[n]; + + bool ngfound = false; + for (Uint32 m = 0; m < nodeGroupMembers.size(); m++) + { + if (nodeGroupMembers[m].ngid == ng) + { + NodeGroupMembers& ngInfo = nodeGroupMembers[m]; + ngInfo.members[ngInfo.membCount++] = dynamicOrder[n]; + ngfound = true; + break; + } + } + + if (!ngfound) + { + NodeGroupMembers newGroupInfo; + newGroupInfo.ngid = ng; + newGroupInfo.membCount = 1; + newGroupInfo.members[0] = dynamicOrder[n]; + nodeGroupMembers.push_back(newGroupInfo); + } + } + + ndbout_c("Nodegroups"); + + for (Uint32 n=0; n < nodeGroupMembers.size(); n++) + { + ndbout << " " << nodeGroupMembers[n].ngid << " ("; + bool allRemoteMonitored = true; + for (Uint32 m=0; m < nodeGroupMembers[n].membCount; m++) + { + Uint32 nodeId = nodeGroupMembers[n].members[m]; + ndbout << nodeId; + if ((m+1) < nodeGroupMembers[n].membCount) + ndbout << ","; + Uint32 dynamicIndex = nodeIdToDynamicIndex[nodeId]; + allRemoteMonitored &= remoteMonitored[dynamicIndex]; + } + ndbout << ") Entirely remote monitored NGs risk : " + << (allRemoteMonitored?"Y":"N") << "\n"; + } + ndbout_c("\n"); + + ndbout_c("Cluster-split latency behaviour"); + + Uint32 oddPresident = dynamicOrder[0]; + Uint32 evenPresident = dynamicOrder[0]; + + for (Uint32 n=0; n <= maxDistanceToRemoteLink; n++) + { + Vector failedNodeGroups; + ndbout << " " << n <<" HB latency period(s), nodes ("; + bool useComma = false; + bool presidentFailed = false; + for (Uint32 m=0; m < numNodes; m++) + { + if (distanceToRemote[m] == n) + { + Uint32 failingNodeId = dynamicOrder[m]; + if (useComma) + ndbout << ","; + + useComma = true; + ndbout << failingNodeId; + + if ((failingNodeId == evenPresident) || + (failingNodeId == oddPresident)) + { + ndbout << "*"; + presidentFailed = true; + } + + { + Uint32 ng = nodeGroup[m]; + for (Uint32 i=0; i< nodeGroupMembers.size(); i++) + { + if (nodeGroupMembers[i].ngid == ng) + { + if ((--nodeGroupMembers[i].membCount) == 0) + { + failedNodeGroups.push_back(ng); + } + } + } + } + } + } + ndbout << ") will be declared failed." << endl; + if (failedNodeGroups.size() != 0) + { + ndbout << " NG failure risk on reconnect for nodegroups : "; + for (Uint32 i=0; i< failedNodeGroups.size(); i++) + { + if (i > 0) + ndbout << ","; + ndbout << failedNodeGroups[i]; + } + ndbout << endl; + } + if (presidentFailed) + { + /* A president (even/odd/both) has failed, we should + * calculate the new president(s) from the p.o.v. + * of both sides + */ + Uint32 newOdd=0; + Uint32 newEven=0; + for (Uint32 i=0; i< numNodes; i++) + { + /* Each side finds either the first node on their + * side, or the first node on the other side which + * is still 'alive' from their point of view + */ + bool candidateIsOdd = dynamicOrder[i] & 1; + + if (!newOdd) + { + if (candidateIsOdd || + (distanceToRemote[i] > n)) + { + newOdd = dynamicOrder[i]; + } + } + if (!newEven) + { + if ((!candidateIsOdd) || + (distanceToRemote[i] > n)) + { + newEven = dynamicOrder[i]; + } + } + } + + bool oddPresidentFailed = (oddPresident != newOdd); + bool evenPresidentFailed = (evenPresident != newEven); + + if (oddPresidentFailed) + { + ndbout_c(" Odd president (%u) failed, new odd president : %u", + oddPresident, newOdd); + oddPresident = newOdd; + } + if (evenPresidentFailed) + { + ndbout_c(" Even president (%u) failed, new even president : %u", + evenPresident, newEven); + evenPresident = newEven; + } + + if (oddPresident != evenPresident) + { + ndbout_c(" President role duplicated, Odd (%u), Even (%u)", + oddPresident, evenPresident); + } + + } + } + + ndbout << endl << endl; + + return NDBT_OK; +} + +int runSplitLatency25PctFail(NDBT_Context* ctx, NDBT_Step* step) +{ + /* Use dump commands to inject artificial inter-node latency + * Use an error insert to cause latency to disappear when + * a node observes > 25% of nodes failed. + * This should trigger a race of FAIL_REQs from both sides + * of the cluster, and can result in cluster failure + */ + NdbRestarter restarter; + + /* First the error insert which will drop latency (QMGR) */ + restarter.insertErrorInAllNodes(938); + + /* Now the dump code which causes the system to experience + * latency along odd/even lines (CMVMI) + */ + int dumpStateArgs[] = {9990, 1}; + restarter.dumpStateAllNodes(dumpStateArgs, 2); + + return NDBT_OK; +} + NDBT_TESTSUITE(testNodeRestart); TESTCASE("NoLoad", "Test that one node at a time can be stopped and then restarted "\ @@ -4715,6 +5222,14 @@ TESTCASE("ForceStopAndRestart", "Test re { STEP(runForceStopAndRestart); } +TESTCASE("ClusterSplitLatency", + "Test behaviour of 2-replica cluster with latency between halves") +{ + TC_PROPERTY("DynamicOrder", Uint32(9)); + INITIALIZER(runRestartToDynamicOrder); + INITIALIZER(analyseDynamicOrder); + INITIALIZER(runSplitLatency25PctFail); +} NDBT_TESTSUITE_END(testNodeRestart); int main(int argc, const char** argv){ No bundle (reason: useless for push emails).