Below is the list of changes that have just been committed into a local
5.1 repository of jonas. When jonas does a push, these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository,
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html
ChangeSet@stripped, 2007-06-26 15:19:42+02:00, jonas@stripped +3 -0
ndb - bug#29331 (51)
Add better handling of GCP Stop
Only kill "offending" node
storage/ndb/src/kernel/blocks/ERROR_codes.txt@stripped, 2007-06-26 15:19:40+02:00,
jonas@stripped +5 -1
add new error codes
storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp@stripped, 2007-06-26 15:19:40+02:00,
jonas@stripped +1 -1
add better GCP stop handling
storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp@stripped, 2007-06-26 15:19:40+02:00,
jonas@stripped +161 -36
add better GCP stop handling
# This is a BitKeeper patch. What follows are the unified diffs for the
# set of deltas contained in the patch. The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User: jonas
# Host: perch.ndb.mysql.com
# Root: /home/jonas/src/51-telco-gca
--- 1.37/storage/ndb/src/kernel/blocks/ERROR_codes.txt 2007-06-26 15:19:48 +02:00
+++ 1.38/storage/ndb/src/kernel/blocks/ERROR_codes.txt 2007-06-26 15:19:48 +02:00
@@ -5,7 +5,7 @@
Next DBTUP 4029
Next DBLQH 5045
Next DBDICT 6007
-Next DBDIH 7183
+Next DBDIH 7186
Next DBTC 8040
Next CMVMI 9000
Next BACKUP 10038
@@ -74,6 +74,10 @@
7177: Delay copying of sysfileData in execCOPY_GCIREQ
7180: Crash master during master-take-over in execMASTER_LCPCONF
+
+7184: Crash before starting next GCP after a node failure
+
+7185: Don't reply to COPY_GCIREQ where reason == GLOBAL_CHECKPOINT
ERROR CODES FOR TESTING NODE FAILURE, LOCAL CHECKPOINT HANDLING:
-----------------------------------------------------------------
--- 1.28/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp 2007-06-26 15:19:48 +02:00
+++ 1.29/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp 2007-06-26 15:19:48 +02:00
@@ -899,7 +899,7 @@
void ndbsttorry10Lab(Signal *, Uint32 _line);
void createMutexes(Signal* signal, Uint32 no);
void createMutex_done(Signal* signal, Uint32 no, Uint32 retVal);
- void crashSystemAtGcpStop(Signal *);
+ void crashSystemAtGcpStop(Signal *, bool);
void sendFirstDictfragsreq(Signal *, TabRecordPtr regTabPtr);
void addtabrefuseLab(Signal *, ConnectRecordPtr regConnectPtr, Uint32 errorCode);
void GCP_SAVEhandling(Signal *, Uint32 nodeId);
--- 1.110/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2007-06-26 15:19:48 +02:00
+++ 1.111/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2007-06-26 15:19:48 +02:00
@@ -747,6 +747,13 @@
}
ndbrequire(ok);
+
+ if (ERROR_INSERTED(7185) && reason==CopyGCIReq::GLOBAL_CHECKPOINT)
+ {
+ jam();
+ return;
+ }
+
/* ----------------------------------------------------------------------- */
/* WE START BY TRYING TO OPEN THE FIRST RESTORABLE GCI FILE. */
/* ----------------------------------------------------------------------- */
@@ -4071,6 +4078,11 @@
CLEAR_ERROR_INSERT_VALUE;
}
+ if (ERROR_INSERTED(7184))
+ {
+ SET_ERROR_INSERT_VALUE(7000);
+ }
+
/*-------------------------------------------------------------------------*/
// The first step is to convert from a bit mask to an array of failed nodes.
/*-------------------------------------------------------------------------*/
@@ -7745,7 +7757,7 @@
g_eventLogger.error("System crash due to GCP Stop in state = %u",
(Uint32) cgcpStatus);
#endif
- crashSystemAtGcpStop(signal);
+ crashSystemAtGcpStop(signal, false);
return;
}//if
} else {
@@ -7759,7 +7771,7 @@
g_eventLogger.error("System crash due to GCP Stop in state = %u",
(Uint32) cgcpStatus);
#endif
- crashSystemAtGcpStop(signal);
+ crashSystemAtGcpStop(signal, false);
return;
}//if
} else {
@@ -11117,41 +11129,132 @@
* GCP stop detected,
* send SYSTEM_ERROR to all other alive nodes
*/
-void Dbdih::crashSystemAtGcpStop(Signal* signal)
+void Dbdih::crashSystemAtGcpStop(Signal* signal, bool local)
{
+ if (local)
+ goto dolocal;
+
switch(cgcpStatus){
+ case GCP_PREPARE_SENT:
+ {
+ jam();
+ /**
+ * We're waiting for a GCP PREPARE CONF
+ */
+ infoEvent("Detected GCP stop(%d)...sending kill to %s",
+ cgcpStatus, c_GCP_PREPARE_Counter.getText());
+ ndbout_c("Detected GCP stop(%d)...sending kill to %s",
+ cgcpStatus, c_GCP_PREPARE_Counter.getText());
+
+ {
+ NodeReceiverGroup rg(DBDIH, c_GCP_PREPARE_Counter);
+ signal->theData[0] = 7022;
+ sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA);
+ }
+
+ {
+ NodeReceiverGroup rg(NDBCNTR, c_GCP_PREPARE_Counter);
+ SystemError * const sysErr = (SystemError*)&signal->theData[0];
+ sysErr->errorCode = SystemError::GCPStopDetected;
+ sysErr->errorRef = reference();
+ sysErr->data1 = cgcpStatus;
+ sysErr->data2 = cgcpOrderBlocked;
+ sendSignal(rg, GSN_SYSTEM_ERROR, signal,
+ SystemError::SignalLength, JBA);
+ }
+ ndbrequire(!c_GCP_PREPARE_Counter.done());
+ return;
+ }
+ case GCP_COMMIT_SENT:
+ {
+ jam();
+ /**
+ * We're waiting for a GCP_NODEFINISH
+ */
+ infoEvent("Detected GCP stop(%d)...sending kill to %s",
+ cgcpStatus, c_GCP_COMMIT_Counter.getText());
+ ndbout_c("Detected GCP stop(%d)...sending kill to %s",
+ cgcpStatus, c_GCP_COMMIT_Counter.getText());
+
+ {
+ NodeReceiverGroup rg(DBDIH, c_GCP_COMMIT_Counter);
+ signal->theData[0] = 7022;
+ sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA);
+ }
+
+ {
+ NodeReceiverGroup rg(NDBCNTR, c_GCP_COMMIT_Counter);
+ SystemError * const sysErr = (SystemError*)&signal->theData[0];
+ sysErr->errorCode = SystemError::GCPStopDetected;
+ sysErr->errorRef = reference();
+ sysErr->data1 = cgcpStatus;
+ sysErr->data2 = cgcpOrderBlocked;
+ sendSignal(rg, GSN_SYSTEM_ERROR, signal,
+ SystemError::SignalLength, JBA);
+ }
+ ndbrequire(!c_GCP_COMMIT_Counter.done());
+ return;
+ }
case GCP_NODE_FINISHED:
{
+ jam();
/**
* We're waiting for a GCP save conf
*/
- ndbrequire(!c_GCP_SAVEREQ_Counter.done());
NodeReceiverGroup rg(DBLQH, c_GCP_SAVEREQ_Counter);
signal->theData[0] = 2305;
sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBB);
- infoEvent("Detected GCP stop...sending kill to %s",
- c_GCP_SAVEREQ_Counter.getText());
- g_eventLogger.error("Detected GCP stop...sending kill to %s",
- c_GCP_SAVEREQ_Counter.getText());
+ infoEvent("Detected GCP stop(%d)...sending kill to %s",
+ cgcpStatus, c_GCP_SAVEREQ_Counter.getText());
+ ndbout_c("Detected GCP stop(%d)...sending kill to %s",
+ cgcpStatus, c_GCP_SAVEREQ_Counter.getText());
+ ndbrequire(!c_GCP_SAVEREQ_Counter.done());
return;
}
case GCP_SAVE_LQH_FINISHED:
- g_eventLogger.error("m_copyReason: %d m_waiting: %d",
- c_copyGCIMaster.m_copyReason,
- c_copyGCIMaster.m_waiting);
- break;
- case GCP_READY: // shut up lint
- case GCP_PREPARE_SENT:
- case GCP_COMMIT_SENT:
- break;
+ {
+ jam();
+ /**
+ * We're waiting for a COPY_GCICONF
+ */
+ infoEvent("Detected GCP stop(%d)...sending kill to %s",
+ cgcpStatus, c_COPY_GCIREQ_Counter.getText());
+ ndbout_c("Detected GCP stop(%d)...sending kill to %s",
+ cgcpStatus, c_COPY_GCIREQ_Counter.getText());
+
+ {
+ NodeReceiverGroup rg(DBDIH, c_COPY_GCIREQ_Counter);
+ signal->theData[0] = 7022;
+ sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA);
+ }
+
+ {
+ NodeReceiverGroup rg(NDBCNTR, c_COPY_GCIREQ_Counter);
+ SystemError * const sysErr = (SystemError*)&signal->theData[0];
+ sysErr->errorCode = SystemError::GCPStopDetected;
+ sysErr->errorRef = reference();
+ sysErr->data1 = cgcpStatus;
+ sysErr->data2 = cgcpOrderBlocked;
+ sendSignal(rg, GSN_SYSTEM_ERROR, signal,
+ SystemError::SignalLength, JBA);
+ }
+ ndbrequire(!c_COPY_GCIREQ_Counter.done());
+ return;
+ }
+ case GCP_READY: (void)1;
}
+
+dolocal:
+ ndbout_c("m_copyReason: %d m_waiting: %d",
+ c_copyGCIMaster.m_copyReason,
+ c_copyGCIMaster.m_waiting);
- g_eventLogger.error("c_copyGCISlave: sender{Data, Ref} %d %x reason: %d nextWord: %d",
- c_copyGCISlave.m_senderData,
- c_copyGCISlave.m_senderRef,
- c_copyGCISlave.m_copyReason,
- c_copyGCISlave.m_expectedNextWord);
+ ndbout_c("c_copyGCISlave: sender{Data, Ref} %d %x reason: %d nextWord: %d",
+ c_copyGCISlave.m_senderData,
+ c_copyGCISlave.m_senderRef,
+ c_copyGCISlave.m_copyReason,
+ c_copyGCISlave.m_expectedNextWord);
FileRecordPtr file0Ptr;
file0Ptr.i = crestartInfoFile[0];
@@ -11202,23 +11305,39 @@
c_TCGETOPSIZEREQ_Counter.getText());
ndbout_c("c_UPDATE_TOREQ_Counter = %s", c_UPDATE_TOREQ_Counter.getText());
- NodeRecordPtr nodePtr;
- for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
+ if (local == false)
+ {
jam();
- ptrAss(nodePtr, nodeRecord);
- if (nodePtr.p->nodeStatus == NodeRecord::ALIVE) {
+ NodeRecordPtr nodePtr;
+ for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
jam();
- const BlockReference ref =
- numberToRef(refToBlock(cntrlblockref), nodePtr.i);
- SystemError * const sysErr = (SystemError*)&signal->theData[0];
- sysErr->errorCode = SystemError::GCPStopDetected;
- sysErr->errorRef = reference();
- sysErr->data1 = cgcpStatus;
- sysErr->data2 = cgcpOrderBlocked;
- sendSignal(ref, GSN_SYSTEM_ERROR, signal,
- SystemError::SignalLength, JBA);
- }//if
- }//for
+ ptrAss(nodePtr, nodeRecord);
+ if (nodePtr.p->nodeStatus == NodeRecord::ALIVE) {
+ jam();
+ const BlockReference ref =
+ numberToRef(refToBlock(cntrlblockref), nodePtr.i);
+ SystemError * const sysErr = (SystemError*)&signal->theData[0];
+ sysErr->errorCode = SystemError::GCPStopDetected;
+ sysErr->errorRef = reference();
+ sysErr->data1 = cgcpStatus;
+ sysErr->data2 = cgcpOrderBlocked;
+ sendSignal(ref, GSN_SYSTEM_ERROR, signal,
+ SystemError::SignalLength, JBA);
+ }//if
+ }//for
+ }
+ else
+ {
+ jam();
+ SystemError * const sysErr = (SystemError*)&signal->theData[0];
+ sysErr->errorCode = SystemError::GCPStopDetected;
+ sysErr->errorRef = reference();
+ sysErr->data1 = cgcpStatus;
+ sysErr->data2 = cgcpOrderBlocked;
+ EXECUTE_DIRECT(NDBCNTR, GSN_SYSTEM_ERROR,
+ signal, SystemError::SignalLength);
+ ndbrequire(false);
+ }
return;
}//Dbdih::crashSystemAtGcpStop()
@@ -14303,6 +14422,12 @@
} while (replicaPtr.i != RNIL);
infoEvent(buf);
}
+ }
+
+ if (arg == 7022)
+ {
+ jam();
+ crashSystemAtGcpStop(signal, true);
}
}//Dbdih::execDUMP_STATE_ORD()
| Thread | Author | Date |
|---|---|---|
| • bk commit into 5.1 tree (jonas:1.2499) BUG#29331 | jonas | 26 Jun |