After (a heck of a) timeout, improve the error message we display, and
attempt a rollback of the transaction to free resources in the kernel.
Remove abort for this error case too... we can mostly continue just
fine. (only VM_TRACE builds)
I don't think this is perfect yet though... the thread can still
get rather confused until we close the transaction properly at the end...
this could be something to do with how the handler should be doing
things... I'm just not too sure. Thoughts quite welcome!
TAKE2 changes:
- use g_eventLogger
- restore abort()
===== ndb/src/ndbapi/NdbTransaction.cpp 1.59 vs edited =====
Index: ndb-work/ndb/src/ndbapi/NdbTransaction.cpp
===================================================================
--- ndb-work.orig/ndb/src/ndbapi/NdbTransaction.cpp 2007-07-12 12:45:11.957699763 +1000
+++ ndb-work/ndb/src/ndbapi/NdbTransaction.cpp 2007-07-16 15:20:25.350089392 +1000
@@ -481,12 +481,27 @@ NdbTransaction::executeNoBlobs(ExecType
while (1) {
int noOfComp = tNdb->sendPollNdb(3 * timeout, 1, forceSend);
if (noOfComp == 0) {
- /**
- * This timeout situation can occur if NDB crashes.
+ /*
+ * Just for fun, this is only one of two places where
+ * we could hit this error... It's quite possible we
+ * hit it in Ndbif.cpp in Ndb::check_send_timeout()
+ *
+ * We behave rather similarly in both places.
+ * Hitting this is certainly a bug though...
*/
- ndbout << "This timeout should never occur, execute(..)" << endl;
- theError.code = 4012;
- setOperationErrorCodeAbort(4012); // Error code for "Cluster Failure"
+ g_eventLogger.error("WARNING: Timeout in executeNoBlobs() waiting for "
+ "response from NDB data nodes. This should NEVER "
+ "occur. You have likely hit a NDB Bug. Please "
+ "file a bug.");
+ DBUG_PRINT("error",("This timeout should never occure, execute()"));
+ g_eventLogger.error("Forcibly trying to rollback txn (%p"
+ ") to try to clean up data node resources.",
+ this);
+ executeNoBlobs(NdbTransaction::Rollback);
+ theError.code = 4012;
+ theError.status= NdbError::PermanentError;
+ theError.classification= NdbError::TimeoutExpired;
+ setOperationErrorCodeAbort(4012); // ndbd timeout
DBUG_RETURN(-1);
}//if
@@ -550,7 +565,12 @@ NdbTransaction::executeAsynchPrepare( Ex
*/
if (theError.code != 0)
DBUG_PRINT("enter", ("Resetting error %d on execute", theError.code));
- theError.code = 0;
+ /**
+ * for timeout (4012) we want sendROLLBACK to behave differently.
+ * Else, normal behaviour of reset errcode
+ */
+ if (theError.code != 4012)
+ theError.code = 0;
NdbScanOperation* tcOp = m_theFirstScanOperation;
if (tcOp != 0){
// Execute any cursor operations
@@ -873,6 +893,12 @@ NdbTransaction::sendROLLBACK() // S
tSignal.setData(theTCConPtr, 1);
tSignal.setData(tTransId1, 2);
tSignal.setData(tTransId2, 3);
+ if(theError.code == 4012)
+ {
+ g_eventLogger.error("Sending TCROLLBACKREQ with Bad flag");
+ tSignal.setLength(tSignal.getLength() + 1); // + flags
+ tSignal.setData(0x1, 4); // potentially bad data
+ }
tReturnCode = tp->sendSignal(&tSignal,theDBnode);
if (tReturnCode != -1) {
theSendStatus = sendTC_ROLLBACK;
Index: ndb-work/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp
===================================================================
--- ndb-work.orig/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp 2007-07-12 12:56:12.499341886 +1000
+++ ndb-work/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp 2007-07-12 23:00:47.606542436 +1000
@@ -5456,11 +5456,32 @@ void Dbtc::execTC_COMMITREQ(Signal* sign
}
}//Dbtc::execTC_COMMITREQ()
+/**
+ * TCROLLBACKREQ
+ *
+ * Format is:
+ *
+ * thedata[0] = apiconnectptr
+ * thedata[1] = transid[0]
+ * thedata[2] = transid[1]
+ * OPTIONAL thedata[3] = flags
+ *
+ * Flags:
+ * 0x1 = potentiallyBad data from API (try not to assert)
+ */
void Dbtc::execTCROLLBACKREQ(Signal* signal)
{
+ bool potentiallyBad= false;
UintR compare_transid1, compare_transid2;
jamEntry();
+
+ if(unlikely((signal->getLength() >= 4) && (signal->theData[3] &
0x1)))
+ {
+ ndbout_c("Trying to roll back potentially bad txn\n");
+ potentiallyBad= true;
+ }
+
apiConnectptr.i = signal->theData[0];
if (apiConnectptr.i >= capiConnectFilesize) {
goto TC_ROLL_warning;
@@ -5547,12 +5568,14 @@ void Dbtc::execTCROLLBACKREQ(Signal* sig
TC_ROLL_warning:
jam();
- warningHandlerLab(signal, __LINE__);
+ if(likely(potentiallyBad==false))
+ warningHandlerLab(signal, __LINE__);
return;
TC_ROLL_system_error:
jam();
- systemErrorLab(signal, __LINE__);
+ if(likely(potentiallyBad==false))
+ systemErrorLab(signal, __LINE__);
return;
}//Dbtc::execTCROLLBACKREQ()
--
Stewart Smith