From:
Date: June 26 2007 3:06pm
Subject: bk commit into 5.1 tree (jonas:1.2156) BUG#29331
List-Archive: http://lists.mysql.com/commits/29602
X-Bug: 29331
Message-Id: <20070626130616.04E76718841@perch.ndb.mysql.com>

Below is the list of changes that have just been committed into a local
5.1 repository of jonas. When jonas does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet@stripped, 2007-06-26 15:06:12+02:00, jonas@stripped +3 -0
  ndb - bug#29331 (wl2325-5.0)
    Add better handling of GCP Stop
    Only kill "offending" node

  storage/ndb/src/kernel/blocks/ERROR_codes.txt@stripped, 2007-06-26 15:06:11+02:00, jonas@stripped +4 -0
    Add new error codes

  storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp@stripped, 2007-06-26 15:06:11+02:00, jonas@stripped +1 -1
    Add better handling of GCP stop

  storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp@stripped, 2007-06-26 15:06:11+02:00, jonas@stripped +154 -27
    Add better handling of GCP stop

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User: jonas # Host: perch.ndb.mysql.com # Root: /home/jonas/src/drop5 --- 1.25/storage/ndb/src/kernel/blocks/ERROR_codes.txt 2007-06-26 15:06:15 +02:00 +++ 1.26/storage/ndb/src/kernel/blocks/ERROR_codes.txt 2007-06-26 15:06:15 +02:00 @@ -76,6 +76,10 @@ 7183: Crash when receiving COPY_GCIREQ +7184: Crash before starting next GCP after a node failure + +7185: Dont reply to COPY_GCI_REQ where reason == GCP + ERROR CODES FOR TESTING NODE FAILURE, LOCAL CHECKPOINT HANDLING: ----------------------------------------------------------------- --- 1.19/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp 2007-06-26 15:06:15 +02:00 +++ 1.20/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp 2007-06-26 15:06:15 +02:00 @@ -897,7 +897,7 @@ void ndbsttorry10Lab(Signal *, Uint32 _line); void createMutexes(Signal* signal, Uint32 no); void createMutex_done(Signal* signal, Uint32 no, Uint32 retVal); - void crashSystemAtGcpStop(Signal *); + void crashSystemAtGcpStop(Signal *, bool); void sendFirstDictfragsreq(Signal *, TabRecordPtr regTabPtr); void addtabrefuseLab(Signal *, ConnectRecordPtr regConnectPtr, Uint32 errorCode); void GCP_SAVEhandling(Signal *, Uint32 nodeId); --- 1.67/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2007-06-26 15:06:15 +02:00 +++ 1.68/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2007-06-26 15:06:15 +02:00 @@ -737,6 +737,12 @@ ndbrequire(ok); CRASH_INSERTION(7183); + + if (ERROR_INSERTED(7185) && reason==CopyGCIReq::GLOBAL_CHECKPOINT) + { + jam(); + return; + } /* ----------------------------------------------------------------------- */ /* WE START BY TRYING TO OPEN THE FIRST RESTORABLE GCI FILE. */ @@ -3964,6 +3970,11 @@ CLEAR_ERROR_INSERT_VALUE; } + if (ERROR_INSERTED(7184)) + { + SET_ERROR_INSERT_VALUE(7000); + } + /*-------------------------------------------------------------------------*/ // The first step is to convert from a bit mask to an array of failed nodes. 
/*-------------------------------------------------------------------------*/ @@ -7568,7 +7579,7 @@ ndbout << "System crash due to GCP Stop in state = "; ndbout << (Uint32) cgcpStatus << endl; #endif - crashSystemAtGcpStop(signal); + crashSystemAtGcpStop(signal, false); return; }//if } else { @@ -7582,7 +7593,7 @@ ndbout << "System crash due to GCP Stop in state = "; ndbout << (Uint32) cgcpStatus << endl; #endif - crashSystemAtGcpStop(signal); + crashSystemAtGcpStop(signal, false); return; }//if } else { @@ -10916,31 +10927,125 @@ * GCP stop detected, * send SYSTEM_ERROR to all other alive nodes */ -void Dbdih::crashSystemAtGcpStop(Signal* signal) +void Dbdih::crashSystemAtGcpStop(Signal* signal, bool local) { + if (local) + goto dolocal; + switch(cgcpStatus){ + case GCP_PREPARE_SENT: + { + jam(); + /** + * We're waiting for a GCP PREPARE CONF + */ + infoEvent("Detected GCP stop(%d)...sending kill to %s", + cgcpStatus, c_GCP_PREPARE_Counter.getText()); + ndbout_c("Detected GCP stop(%d)...sending kill to %s", + cgcpStatus, c_GCP_PREPARE_Counter.getText()); + + { + NodeReceiverGroup rg(DBDIH, c_GCP_PREPARE_Counter); + signal->theData[0] = 7022; + sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA); + } + + { + NodeReceiverGroup rg(NDBCNTR, c_GCP_PREPARE_Counter); + SystemError * const sysErr = (SystemError*)&signal->theData[0]; + sysErr->errorCode = SystemError::GCPStopDetected; + sysErr->errorRef = reference(); + sysErr->data1 = cgcpStatus; + sysErr->data2 = cgcpOrderBlocked; + sendSignal(rg, GSN_SYSTEM_ERROR, signal, + SystemError::SignalLength, JBA); + } + ndbrequire(!c_GCP_PREPARE_Counter.done()); + return; + } + case GCP_COMMIT_SENT: + { + jam(); + /** + * We're waiting for a GCP_NODEFINISH + */ + infoEvent("Detected GCP stop(%d)...sending kill to %s", + cgcpStatus, c_GCP_COMMIT_Counter.getText()); + ndbout_c("Detected GCP stop(%d)...sending kill to %s", + cgcpStatus, c_GCP_COMMIT_Counter.getText()); + + { + NodeReceiverGroup rg(DBDIH, c_GCP_COMMIT_Counter); + 
signal->theData[0] = 7022; + sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA); + } + + { + NodeReceiverGroup rg(NDBCNTR, c_GCP_COMMIT_Counter); + SystemError * const sysErr = (SystemError*)&signal->theData[0]; + sysErr->errorCode = SystemError::GCPStopDetected; + sysErr->errorRef = reference(); + sysErr->data1 = cgcpStatus; + sysErr->data2 = cgcpOrderBlocked; + sendSignal(rg, GSN_SYSTEM_ERROR, signal, + SystemError::SignalLength, JBA); + } + ndbrequire(!c_GCP_COMMIT_Counter.done()); + return; + } case GCP_NODE_FINISHED: { + jam(); /** * We're waiting for a GCP save conf */ - ndbrequire(!c_GCP_SAVEREQ_Counter.done()); NodeReceiverGroup rg(DBLQH, c_GCP_SAVEREQ_Counter); signal->theData[0] = 2305; sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBB); - infoEvent("Detected GCP stop...sending kill to %s", - c_GCP_SAVEREQ_Counter.getText()); - ndbout_c("Detected GCP stop...sending kill to %s", - c_GCP_SAVEREQ_Counter.getText()); + infoEvent("Detected GCP stop(%d)...sending kill to %s", + cgcpStatus, c_GCP_SAVEREQ_Counter.getText()); + ndbout_c("Detected GCP stop(%d)...sending kill to %s", + cgcpStatus, c_GCP_SAVEREQ_Counter.getText()); + ndbrequire(!c_GCP_SAVEREQ_Counter.done()); return; } case GCP_SAVE_LQH_FINISHED: - ndbout_c("m_copyReason: %d m_waiting: %d", - c_copyGCIMaster.m_copyReason, - c_copyGCIMaster.m_waiting); - break; + { + jam(); + /** + * We're waiting for a COPY_GCICONF + */ + infoEvent("Detected GCP stop(%d)...sending kill to %s", + cgcpStatus, c_COPY_GCIREQ_Counter.getText()); + ndbout_c("Detected GCP stop(%d)...sending kill to %s", + cgcpStatus, c_COPY_GCIREQ_Counter.getText()); + + { + NodeReceiverGroup rg(DBDIH, c_COPY_GCIREQ_Counter); + signal->theData[0] = 7022; + sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA); + } + + { + NodeReceiverGroup rg(NDBCNTR, c_COPY_GCIREQ_Counter); + SystemError * const sysErr = (SystemError*)&signal->theData[0]; + sysErr->errorCode = SystemError::GCPStopDetected; + sysErr->errorRef = reference(); + sysErr->data1 = 
cgcpStatus; + sysErr->data2 = cgcpOrderBlocked; + sendSignal(rg, GSN_SYSTEM_ERROR, signal, + SystemError::SignalLength, JBA); + } + ndbrequire(!c_COPY_GCIREQ_Counter.done()); + return; + } } + +dolocal: + ndbout_c("m_copyReason: %d m_waiting: %d", + c_copyGCIMaster.m_copyReason, + c_copyGCIMaster.m_waiting); ndbout_c("c_copyGCISlave: sender{Data, Ref} %d %x reason: %d nextWord: %d", c_copyGCISlave.m_senderData, @@ -10997,23 +11102,39 @@ c_TCGETOPSIZEREQ_Counter.getText()); ndbout_c("c_UPDATE_TOREQ_Counter = %s", c_UPDATE_TOREQ_Counter.getText()); - NodeRecordPtr nodePtr; - for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) { + if (local == false) + { jam(); - ptrAss(nodePtr, nodeRecord); - if (nodePtr.p->nodeStatus == NodeRecord::ALIVE) { + NodeRecordPtr nodePtr; + for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) { jam(); - const BlockReference ref = - numberToRef(refToBlock(cntrlblockref), nodePtr.i); - SystemError * const sysErr = (SystemError*)&signal->theData[0]; - sysErr->errorCode = SystemError::GCPStopDetected; - sysErr->errorRef = reference(); - sysErr->data1 = cgcpStatus; - sysErr->data2 = cgcpOrderBlocked; - sendSignal(ref, GSN_SYSTEM_ERROR, signal, - SystemError::SignalLength, JBA); - }//if - }//for + ptrAss(nodePtr, nodeRecord); + if (nodePtr.p->nodeStatus == NodeRecord::ALIVE) { + jam(); + const BlockReference ref = + numberToRef(refToBlock(cntrlblockref), nodePtr.i); + SystemError * const sysErr = (SystemError*)&signal->theData[0]; + sysErr->errorCode = SystemError::GCPStopDetected; + sysErr->errorRef = reference(); + sysErr->data1 = cgcpStatus; + sysErr->data2 = cgcpOrderBlocked; + sendSignal(ref, GSN_SYSTEM_ERROR, signal, + SystemError::SignalLength, JBA); + }//if + }//for + } + else + { + jam(); + SystemError * const sysErr = (SystemError*)&signal->theData[0]; + sysErr->errorCode = SystemError::GCPStopDetected; + sysErr->errorRef = reference(); + sysErr->data1 = cgcpStatus; + sysErr->data2 = cgcpOrderBlocked; + 
EXECUTE_DIRECT(NDBCNTR, GSN_SYSTEM_ERROR, + signal, SystemError::SignalLength); + ndbrequire(false); + } return; }//Dbdih::crashSystemAtGcpStop() @@ -14096,6 +14217,12 @@ } while (replicaPtr.i != RNIL); infoEvent(buf); } + } + + if (arg == 7022) + { + jam(); + crashSystemAtGcpStop(signal, true); } }//Dbdih::execDUMP_STATE_ORD()