#At file:///home/jonas/src/telco-6.3/ based on revid:jonas@stripped
3439 Jonas Oreland 2011-05-30
ndb - bug#12589691 - improve error reporting when api-fail-req seems to happen
modified:
storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp
storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp
storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
storage/ndb/src/kernel/blocks/suma/Suma.cpp
storage/ndb/src/kernel/blocks/suma/Suma.hpp
=== modified file 'storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp'
--- a/storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp 2011-03-31 12:31:43 +0000
+++ b/storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp 2011-05-30 11:52:38 +0000
@@ -1260,7 +1260,7 @@ void Dbtc::handleApiFailState(Signal* si
if (capiConnectClosing[TfailedApiNode] == 0) {
jam();
signal->theData[0] = TfailedApiNode;
- signal->theData[1] = cownref;
+ signal->theData[1] = reference();
sendSignal(capiFailRef, GSN_API_FAILCONF, signal, 2, JBB);
}//if
}//Dbtc::handleApiFailState()
@@ -7661,7 +7661,7 @@ Dbtc::apiFailBlockCleanupCallback(Signal
jamEntry();
signal->theData[0] = failedNodeId;
- signal->theData[1] = cownref;
+ signal->theData[1] = reference();
sendSignal(capiFailRef, GSN_API_FAILCONF, signal, 2, JBB);
}
@@ -11897,6 +11897,21 @@ Dbtc::execDUMP_STATE_ORD(Signal* signal)
return;
}
#endif
+
+ if (arg == 7019 && signal->getLength() == 2)
+ {
+ jam();
+ Uint32 nodeId = signal->theData[1];
+ if (nodeId < MAX_NODES && nodeId < NDB_ARRAY_SIZE(capiConnectClosing))
+ {
+ warningEvent(" DBTC: capiConnectClosing[%u]: %u",
+ nodeId, capiConnectClosing[nodeId]);
+ }
+ else
+ {
+ warningEvent(" DBTC: dump-7019 to unknown node: %u", nodeId);
+ }
+ }
}//Dbtc::execDUMP_STATE_ORD()
bool
=== modified file 'storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp'
--- a/storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp 2011-02-01 21:05:11 +0000
+++ b/storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp 2011-05-30 11:52:38 +0000
@@ -79,6 +79,7 @@
#endif
+#define QMGR_MAX_FAIL_STATE_BLOCKS 4
class Qmgr : public SimulatedBlock {
public:
@@ -92,9 +93,7 @@ public:
NORMAL = 0,
WAITING_FOR_CLOSECOMCONF_ACTIVE = 1, /* Node had phase ZAPI_ACTIVE */
WAITING_FOR_CLOSECOMCONF_NOTACTIVE = 2, /* Node had phase != ZAPI_ACTIVE */
- WAITING_FOR_FAILCONF1 = 3,
- WAITING_FOR_FAILCONF2 = 4,
- WAITING_FOR_FAILCONF3 = 5,
+ WAITING_FOR_API_FAILCONF = 3,
WAITING_FOR_NDB_FAILCONF = 6
};
@@ -174,8 +173,9 @@ public:
BlockReference blockRef;
Uint64 m_secret;
Uint64 m_alloc_timeout;
+ Uint16 m_failconf_blocks[QMGR_MAX_FAIL_STATE_BLOCKS];
- NodeRec() { }
+ NodeRec() { bzero(m_failconf_blocks, sizeof(m_failconf_blocks)); }
}; /* p2c: size = 52 bytes */
typedef Ptr<NodeRec> NodeRecPtr;
@@ -408,7 +408,9 @@ private:
const NodeId theNodes[]);
void handleApiCloseComConf(Signal* signal);
-
+ void add_failconf_block(NodeRecPtr, Uint32 block);
+ bool remove_failconf_block(NodeRecPtr, Uint32 block);
+ bool is_empty_failconf_block(NodeRecPtr) const;
/* Wait this time until we try to join the */
/* cluster again */
=== modified file 'storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp'
--- a/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp 2011-05-24 14:51:54 +0000
+++ b/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp 2011-05-30 11:52:38 +0000
@@ -2679,23 +2679,70 @@ void Qmgr::checkStartInterface(Signal* s
if(((getNodeInfo(nodePtr.i).m_heartbeat_cnt + 1) % 60) == 0)
{
jam();
- char buf[100];
- BaseString::snprintf(buf, sizeof(buf),
- "Failure handling of node %d has not completed"
- " in %d min - state = %d",
- nodePtr.i,
- (getNodeInfo(nodePtr.i).m_heartbeat_cnt + 1)/60,
- nodePtr.p->failState);
- warningEvent(buf);
- if (((getNodeInfo(nodePtr.i).m_heartbeat_cnt + 1) % 300) == 0)
+ char buf[256];
+ if (getNodeInfo(nodePtr.i).m_type == NodeInfo::DB)
{
jam();
- /**
- * Also dump DIH nf-state
- */
- signal->theData[0] = 7019;
- signal->theData[1] = nodePtr.i;
- sendSignal(DBDIH_REF, GSN_DUMP_STATE_ORD, signal, 2, JBB);
+ BaseString::snprintf(buf, sizeof(buf),
+ "Failure handling of node %d has not completed"
+ " in %d min - state = %d",
+ nodePtr.i,
+ (getNodeInfo(nodePtr.i).m_heartbeat_cnt+1)/60,
+ nodePtr.p->failState);
+ warningEvent("%s", buf);
+ if (((getNodeInfo(nodePtr.i).m_heartbeat_cnt + 1) % 300) == 0)
+ {
+ jam();
+ /**
+ * Also dump DIH nf-state
+ */
+ signal->theData[0] = 7019;
+ signal->theData[1] = nodePtr.i;
+ sendSignal(DBDIH_REF, GSN_DUMP_STATE_ORD, signal, 2, JBB);
+ }
+ }
+ else
+ {
+ jam();
+ BaseString::snprintf(buf, sizeof(buf),
+ "Failure handling of api %u has not completed"
+ " in %d min - state = %d",
+ nodePtr.i,
+ (getNodeInfo(nodePtr.i).m_heartbeat_cnt+1)/60,
+ nodePtr.p->failState);
+ warningEvent("%s", buf);
+ if (nodePtr.p->failState == WAITING_FOR_API_FAILCONF)
+ {
+ jam();
+ compile_time_assert(NDB_ARRAY_SIZE(nodePtr.p->m_failconf_blocks) == 4);
+ BaseString::snprintf(buf, sizeof(buf),
+ " Waiting for blocks: %u %u %u %u",
+ nodePtr.p->m_failconf_blocks[0],
+ nodePtr.p->m_failconf_blocks[1],
+ nodePtr.p->m_failconf_blocks[2],
+ nodePtr.p->m_failconf_blocks[3]);
+ warningEvent("%s", buf);
+
+ for (Uint32 i = 0; i<NDB_ARRAY_SIZE(nodePtr.p->m_failconf_blocks);
+ i++)
+ {
+ jam();
+ if (nodePtr.p->m_failconf_blocks[i] != 0)
+ {
+ jam();
+ signal->theData[0] = 7019;
+ signal->theData[1] = nodePtr.i;
+ sendSignal(numberToRef(nodePtr.p->m_failconf_blocks[i],
+ getOwnNodeId()),
+ GSN_DUMP_STATE_ORD, signal, 2, JBB);
+ }
+ else
+ {
+ jam();
+ break;
+ }
+ }
+ }
}
}
}
@@ -2726,14 +2773,18 @@ void Qmgr::sendApiFailReq(Signal* signal
signal->theData[1] = QMGR_REF;
ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
-
- failedNodePtr.p->failState = WAITING_FOR_FAILCONF1;
+ failedNodePtr.p->failState = WAITING_FOR_API_FAILCONF;
/* JBB used to ensure delivery *after* any pending
* signals
*/
+ add_failconf_block(failedNodePtr, DBTC);
sendSignal(DBTC_REF, GSN_API_FAILREQ, signal, 2, JBB);
+
+ add_failconf_block(failedNodePtr, DBDICT);
sendSignal(DBDICT_REF, GSN_API_FAILREQ, signal, 2, JBB);
+
+ add_failconf_block(failedNodePtr, SUMA);
sendSignal(SUMA_REF, GSN_API_FAILREQ, signal, 2, JBB);
}//Qmgr::sendApiFailReq()
@@ -2758,30 +2809,118 @@ void Qmgr::execAPI_FAILCONF(Signal* sign
failedNodePtr.i = signal->theData[0];
ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
- if (failedNodePtr.p->failState == WAITING_FOR_FAILCONF1)
+ Uint32 block = refToMain(signal->theData[1]);
+ if (failedNodePtr.p->failState != WAITING_FOR_API_FAILCONF ||
+ !remove_failconf_block(failedNodePtr, block))
+ {
+ jam();
+ ndbout << "execAPI_FAILCONF from " << block
+ << " failedNodePtr.p->failState = "
+ << (Uint32)(failedNodePtr.p->failState)
+ << " blocks: ";
+ for (Uint32 i = 0;i<NDB_ARRAY_SIZE(failedNodePtr.p->m_failconf_blocks);i++)
+ {
+ printf("%u ", failedNodePtr.p->m_failconf_blocks[i]);
+ }
+ ndbout << endl;
+ systemErrorLab(signal, __LINE__);
+ }//if
+
+ if (is_empty_failconf_block(failedNodePtr))
{
jam();
- failedNodePtr.p->failState = WAITING_FOR_FAILCONF2;
+ failedNodePtr.p->failState = NORMAL;
+
+ /**
+ * When we set this state, connection will later be opened
+ * in checkStartInterface
+ */
}
- else if (failedNodePtr.p->failState == WAITING_FOR_FAILCONF2)
+ return;
+}//Qmgr::execAPI_FAILCONF()
+
+void
+Qmgr::add_failconf_block(NodeRecPtr nodePtr, Uint32 block)
+{
+ // Check that it does not already exist!!
+ Uint32 pos = 0;
+ for (; pos < NDB_ARRAY_SIZE(nodePtr.p->m_failconf_blocks); pos++)
{
jam();
- failedNodePtr.p->failState = WAITING_FOR_FAILCONF3;
+ if (nodePtr.p->m_failconf_blocks[pos] == 0)
+ {
+ jam();
+ break;
+ }
+ else if (nodePtr.p->m_failconf_blocks[pos] == block)
+ {
+ jam();
+ break;
+ }
}
- else if (failedNodePtr.p->failState == WAITING_FOR_FAILCONF3)
+
+ ndbrequire(pos != NDB_ARRAY_SIZE(nodePtr.p->m_failconf_blocks));
+ ndbassert(nodePtr.p->m_failconf_blocks[pos] != block);
+ if (nodePtr.p->m_failconf_blocks[pos] == block)
{
jam();
- failedNodePtr.p->failState = NORMAL;
+ /**
+ * Already in list!!
+ */
+#ifdef ERROR_INSERT
+ ndbrequire(false);
+#endif
+ return;
}
- else
+ ndbrequire(nodePtr.p->m_failconf_blocks[pos] == 0);
+ nodePtr.p->m_failconf_blocks[pos] = block;
+}
+
+bool
+Qmgr::remove_failconf_block(NodeRecPtr nodePtr, Uint32 block)
+{
+ // Check that it does exist!!
+ Uint32 pos = 0;
+ for (; pos < NDB_ARRAY_SIZE(nodePtr.p->m_failconf_blocks); pos++)
{
jam();
- ndbout << "failedNodePtr.p->failState = "
- << (Uint32)(failedNodePtr.p->failState) << endl;
- systemErrorLab(signal, __LINE__);
- }//if
- return;
-}//Qmgr::execAPI_FAILCONF()
+ if (nodePtr.p->m_failconf_blocks[pos] == 0)
+ {
+ jam();
+ break;
+ }
+ else if (nodePtr.p->m_failconf_blocks[pos] == block)
+ {
+ jam();
+ break;
+ }
+ }
+
+ if (pos == NDB_ARRAY_SIZE(nodePtr.p->m_failconf_blocks) ||
+ nodePtr.p->m_failconf_blocks[pos] != block)
+ {
+ jam();
+ /**
+ * Not found!!
+ */
+ return false;
+ }
+
+ nodePtr.p->m_failconf_blocks[pos] = 0;
+ for (pos++; pos < NDB_ARRAY_SIZE(nodePtr.p->m_failconf_blocks); pos++)
+ {
+ jam();
+ nodePtr.p->m_failconf_blocks[pos - 1] = nodePtr.p->m_failconf_blocks[pos];
+ }
+
+ return true;
+}
+
+bool
+Qmgr::is_empty_failconf_block(NodeRecPtr nodePtr) const
+{
+ return nodePtr.p->m_failconf_blocks[0] == 0;
+}
void Qmgr::execNDB_FAILCONF(Signal* signal)
{
@@ -3661,8 +3800,9 @@ void Qmgr::handleApiCloseComConf(Signal*
jam();
signal->theData[0] = nodeId;
signal->theData[1] = QMGR_REF;
+ add_failconf_block(failedNodePtr, SUMA);
sendSignal(SUMA_REF, GSN_API_FAILREQ, signal, 2, JBB);
- failedNodePtr.p->failState = WAITING_FOR_FAILCONF3;
+ failedNodePtr.p->failState = WAITING_FOR_API_FAILCONF;
}
if (getNodeInfo(failedNodePtr.i).getType() == NodeInfo::MGM)
=== modified file 'storage/ndb/src/kernel/blocks/suma/Suma.cpp'
--- a/storage/ndb/src/kernel/blocks/suma/Suma.cpp 2011-02-01 21:05:11 +0000
+++ b/storage/ndb/src/kernel/blocks/suma/Suma.cpp 2011-05-30 11:52:38 +0000
@@ -751,6 +751,7 @@ void Suma::execAPI_FAILREQ(Signal* signa
c_failedApiNodes.set(failedApiNode);
c_subscriber_nodes.clear(failedApiNode);
c_subscriber_per_node[failedApiNode] = 0;
+ c_failedApiNodesState[failedApiNode] = __LINE__;
check_start_handover(signal);
@@ -770,6 +771,8 @@ CONF:
signal->theData[1] = reference();
sendSignal(QMGR_REF, GSN_API_FAILCONF, signal, 2, JBB);
+ c_failedApiNodesState[failedApiNode] = 0;
+
DBUG_VOID_RETURN;
}//execAPI_FAILREQ()
@@ -791,6 +794,7 @@ Suma::api_fail_block_cleanup_callback(Si
signal->theData[1] = reference();
sendSignal(QMGR_REF, GSN_API_FAILCONF, signal, 2, JBB);
c_failedApiNodes.clear(failedNodeId);
+ c_failedApiNodesState[failedNodeId] = 0;
}
void
@@ -798,9 +802,11 @@ Suma::api_fail_block_cleanup(Signal* sig
{
jam();
+ c_failedApiNodesState[failedNode] = __LINE__;
+
Callback cb = {safe_cast(&Suma::api_fail_block_cleanup_callback),
failedNode};
-
+
simBlockNodeFailure(signal, failedNode, cb);
}
@@ -829,6 +835,7 @@ Suma::api_fail_gci_list(Signal* signal,
c_gcp_list.release(gcp);
+ c_failedApiNodesState[nodeId] = __LINE__;
signal->theData[0] = SumaContinueB::API_FAIL_GCI_LIST;
signal->theData[1] = nodeId;
sendSignal(SUMA_REF, GSN_CONTINUEB, signal, 2, JBB);
@@ -851,11 +858,13 @@ Suma::api_fail_gci_list(Signal* signal,
Ptr<SubOpRecord> subOpPtr;
if (c_subOpPool.seize(subOpPtr))
{
+ c_failedApiNodesState[nodeId] = __LINE__;
signal->theData[2] = subOpPtr.i;
sendSignal(SUMA_REF, GSN_CONTINUEB, signal, 6, JBB);
}
else
{
+ c_failedApiNodesState[nodeId] = __LINE__;
sendSignal(SUMA_REF, GSN_CONTINUEB, signal, 3, JBB);
}
@@ -878,6 +887,7 @@ Suma::api_fail_subscriber_list(Signal* s
{
jam();
sendSignal(SUMA_REF, GSN_CONTINUEB, signal, 3, JBB);
+ c_failedApiNodesState[nodeId] = __LINE__;
return;
}
}
@@ -896,6 +906,7 @@ Suma::api_fail_subscriber_list(Signal* s
{
jam();
c_subscriptions.first(iter);
+ c_failedApiNodesState[nodeId] = __LINE__;
}
else
{
@@ -911,6 +922,7 @@ Suma::api_fail_subscriber_list(Signal* s
* We restart from this bucket :-(
*/
c_subscriptions.next(bucket, iter);
+ c_failedApiNodesState[nodeId] = __LINE__;
}
else
{
@@ -922,6 +934,7 @@ Suma::api_fail_subscriber_list(Signal* s
{
jam();
api_fail_block_cleanup(signal, nodeId);
+ c_failedApiNodesState[nodeId] = __LINE__;
return;
}
@@ -936,11 +949,18 @@ Suma::api_fail_subscriber_list(Signal* s
if (empty)
{
+ jam();
+ c_failedApiNodesState[nodeId] = __LINE__;
signal->theData[0] = SumaContinueB::API_FAIL_SUBSCRIPTION;
signal->theData[1] = subOpPtr.i;
signal->theData[2] = RNIL;
sendSignal(SUMA_REF, GSN_CONTINUEB, signal, 3, JBB);
}
+ else
+ {
+ jam();
+ c_failedApiNodesState[nodeId] = __LINE__;
+ }
}
void
@@ -1001,6 +1021,7 @@ Suma::api_fail_subscription(Signal* sign
if (!ptr.isNull())
{
jam();
+ c_failedApiNodesState[nodeId] = __LINE__;
signal->theData[0] = SumaContinueB::API_FAIL_SUBSCRIPTION;
signal->theData[1] = subOpPtr.i;
signal->theData[2] = ptr.i;
@@ -1019,6 +1040,8 @@ Suma::api_fail_subscription(Signal* sign
if (c_subscriptions.next(iter))
{
+ jam();
+ c_failedApiNodesState[nodeId] = __LINE__;
signal->theData[0] = SumaContinueB::API_FAIL_SUBSCRIBER_LIST;
signal->theData[1] = nodeId;
signal->theData[2] = subOpPtr.i;
@@ -1560,6 +1583,29 @@ Suma::execDUMP_STATE_ORD(Signal* signal)
sendSignalWithDelay(reference(), GSN_DUMP_STATE_ORD, signal, 100, 2);
return;
}
+
+ if (tCase == 7019 && signal->getLength() == 2)
+ {
+ jam();
+ Uint32 nodeId = signal->theData[1];
+ if (nodeId < MAX_NODES)
+ {
+ warningEvent(" Suma 7019 %u line: %u", nodeId,
+ c_failedApiNodesState[nodeId]);
+ warningEvent(" c_connected_nodes.get(): %u",
+ c_connected_nodes.get(nodeId));
+ warningEvent(" c_failedApiNodes.get(): %u",
+ c_failedApiNodes.get(nodeId));
+ warningEvent(" c_subscriber_nodes.get(): %u",
+ c_subscriber_nodes.get(nodeId));
+ warningEvent(" c_subscriber_per_node[%u]: %u",
+ nodeId, c_subscriber_per_node[nodeId]);
+ }
+ else
+ {
+ warningEvent(" SUMP: dump-7019 to unknown node: %u", nodeId);
+ }
+ }
}
/*************************************************************
=== modified file 'storage/ndb/src/kernel/blocks/suma/Suma.hpp'
--- a/storage/ndb/src/kernel/blocks/suma/Suma.hpp 2011-02-01 21:05:11 +0000
+++ b/storage/ndb/src/kernel/blocks/suma/Suma.hpp 2011-05-30 11:52:38 +0000
@@ -360,7 +360,8 @@ public:
Uint32 c_maxBufferedEpochs;
NodeBitmask c_failedApiNodes;
-
+ Uint32 c_failedApiNodesState[MAX_NODES];
+
/**
* Functions
*/
Attachment: [text/bzr-bundle] bzr/jonas@mysql.com-20110530115238-ly2l52yz6temi631.bundle
| Thread |
|---|
| • bzr commit into mysql-5.1-telco-6.3 branch (jonas:3439) Bug#12589691 | Jonas Oreland | 31 May |