Below is the list of changes that have just been committed into a local
5.0 repository of jonas. When jonas does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html
ChangeSet@stripped, 2006-10-12 14:04:20+02:00, jonas@stripped +3 -0
Merge perch.ndb.mysql.com:/home/jonas/src/41-work
into perch.ndb.mysql.com:/home/jonas/src/50-work
MERGE: 1.1616.2144.199
ndb/src/kernel/blocks/ERROR_codes.txt@stripped, 2006-10-12 14:04:19+02:00,
jonas@stripped +0 -1
merge
MERGE: 1.8.1.12
ndb/src/kernel/blocks/dbdih/Dbdih.hpp@stripped, 2006-10-12 14:03:30+02:00,
jonas@stripped +0 -0
Auto merged
MERGE: 1.6.1.5
ndb/src/kernel/blocks/dbdih/DbdihMain.cpp@stripped, 2006-10-12 14:03:31+02:00,
jonas@stripped +0 -0
Auto merged
MERGE: 1.14.1.29
# This is a BitKeeper patch. What follows are the unified diffs for the
# set of deltas contained in the patch. The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User: jonas
# Host: perch.ndb.mysql.com
# Root: /home/jonas/src/50-work/RESYNC
--- 1.22/ndb/src/kernel/blocks/ERROR_codes.txt 2006-10-12 14:04:25 +02:00
+++ 1.23/ndb/src/kernel/blocks/ERROR_codes.txt 2006-10-12 14:04:25 +02:00
@@ -5,7 +5,7 @@
Next DBTUP 4014
Next DBLQH 5043
Next DBDICT 6007
-Next DBDIH 7177
+Next DBDIH 7178
Next DBTC 8038
Next CMVMI 9000
Next BACKUP 10022
@@ -65,6 +65,8 @@
7030: Delay in GCP_PREPARE until node has completed a node failure
7031: Delay in GCP_PREPARE and die 3s later
+
+7177: Delay copying of sysfileData in execCOPY_GCIREQ
ERROR CODES FOR TESTING NODE FAILURE, LOCAL CHECKPOINT HANDLING:
-----------------------------------------------------------------
--- 1.12/ndb/src/kernel/blocks/dbdih/Dbdih.hpp 2006-10-12 14:04:25 +02:00
+++ 1.13/ndb/src/kernel/blocks/dbdih/Dbdih.hpp 2006-10-12 14:04:25 +02:00
@@ -1048,6 +1048,8 @@
void removeStoredReplica(FragmentstorePtr regFragptr,
ReplicaRecordPtr replicaPtr);
void searchStoredReplicas(FragmentstorePtr regFragptr);
+ bool setup_create_replica(FragmentstorePtr, CreateReplicaRecord*,
+ ConstPtr<ReplicaRecord>);
void updateNodeInfo(FragmentstorePtr regFragptr);
//------------------------------------
--- 1.61/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2006-10-12 14:04:25 +02:00
+++ 1.62/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2006-10-12 14:04:25 +02:00
@@ -627,22 +627,48 @@
ndbrequire(c_copyGCISlave.m_copyReason == CopyGCIReq::IDLE);
ndbrequire(c_copyGCISlave.m_expectedNextWord == tstart);
ndbrequire(reason != CopyGCIReq::IDLE);
-
+ bool isdone = (tstart + CopyGCIReq::DATA_SIZE) >= Sysfile::SYSFILE_SIZE32;
+
+ if (ERROR_INSERTED(7177))
+ {
+ jam();
+
+ if (signal->getLength() == 3)
+ {
+ jam();
+ goto done;
+ }
+ }
+
arrGuard(tstart + CopyGCIReq::DATA_SIZE, sizeof(sysfileData)/4);
for(Uint32 i = 0; i<CopyGCIReq::DATA_SIZE; i++)
cdata[tstart+i] = copyGCI->data[i];
- if ((tstart + CopyGCIReq::DATA_SIZE) >= Sysfile::SYSFILE_SIZE32) {
+ if (ERROR_INSERTED(7177) && isMaster() && isdone)
+ {
+ sendSignalWithDelay(reference(), GSN_COPY_GCIREQ, signal, 1000, 3);
+ return;
+ }
+
+done:
+ if (isdone)
+ {
jam();
c_copyGCISlave.m_expectedNextWord = 0;
- } else {
+ }
+ else
+ {
jam();
c_copyGCISlave.m_expectedNextWord += CopyGCIReq::DATA_SIZE;
return;
- }//if
-
- memcpy(sysfileData, cdata, sizeof(sysfileData));
+ }
+ if (cmasterdihref != reference())
+ {
+ jam();
+ memcpy(sysfileData, cdata, sizeof(sysfileData));
+ }
+
c_copyGCISlave.m_copyReason = reason;
c_copyGCISlave.m_senderRef = signal->senderBlockRef();
c_copyGCISlave.m_senderData = copyGCI->anyData;
@@ -8441,14 +8467,30 @@
resetReplicaLcp(replicaPtr.p, newestRestorableGCI);
- /* -----------------------------------------------------------------
- * LINK THE REPLICA INTO THE STORED REPLICA LIST. WE WILL USE THIS
- * NODE AS A STORED REPLICA.
- * WE MUST FIRST LINK IT OUT OF THE LIST OF OLD STORED REPLICAS.
- * --------------------------------------------------------------- */
- removeOldStoredReplica(fragPtr, replicaPtr);
- linkStoredReplica(fragPtr, replicaPtr);
-
+ /**
+ * Make sure we can also find REDO for restoring replica...
+ */
+ {
+ CreateReplicaRecord createReplica;
+ ConstPtr<ReplicaRecord> constReplicaPtr;
+ constReplicaPtr.i = replicaPtr.i;
+ constReplicaPtr.p = replicaPtr.p;
+ if (setup_create_replica(fragPtr,
+ &createReplica, constReplicaPtr))
+ {
+ removeOldStoredReplica(fragPtr, replicaPtr);
+ linkStoredReplica(fragPtr, replicaPtr);
+ }
+ else
+ {
+ infoEvent("Forcing take-over of node %d due to unsufficient REDO"
+ " for table %d fragment: %d",
+ nodePtr.i, tabPtr.i, i);
+
+ setNodeActiveStatus(nodePtr.i,
+ Sysfile::NS_NotActive_NotTakenOver);
+ }
+ }
}
default:
jam();
@@ -9492,6 +9534,7 @@
FragmentstorePtr fragPtr;
getFragstore(tabPtr.p, fragId, fragPtr);
checkKeepGci(tabPtr, fragId, fragPtr.p, fragPtr.p->storedReplicas);
+ checkKeepGci(tabPtr, fragId, fragPtr.p, fragPtr.p->oldStoredReplicas);
fragId++;
if (fragId >= tabPtr.p->totalfragments) {
jam();
@@ -12487,16 +12530,75 @@
/* CHECKPOINT WITHOUT NEEDING ANY EXTRA LOGGING FACILITIES.*/
/* A MAXIMUM OF FOUR NODES IS RETRIEVED. */
/*************************************************************************/
+bool
+Dbdih::setup_create_replica(FragmentstorePtr fragPtr,
+ CreateReplicaRecord* createReplicaPtrP,
+ ConstPtr<ReplicaRecord> replicaPtr)
+{
+ createReplicaPtrP->dataNodeId = replicaPtr.p->procNode;
+ createReplicaPtrP->replicaRec = replicaPtr.i;
+
+ /* ----------------------------------------------------------------- */
+ /* WE NEED TO SEARCH FOR A PROPER LOCAL CHECKPOINT TO USE FOR THE */
+ /* SYSTEM RESTART. */
+ /* ----------------------------------------------------------------- */
+ Uint32 startGci;
+ Uint32 startLcpNo;
+ Uint32 stopGci = SYSFILE->newestRestorableGCI;
+ bool result = findStartGci(replicaPtr,
+ stopGci,
+ startGci,
+ startLcpNo);
+ if (!result)
+ {
+ jam();
+ /* --------------------------------------------------------------- */
+ /* WE COULD NOT FIND ANY LOCAL CHECKPOINT. THE FRAGMENT THUS DO NOT*/
+ /* CONTAIN ANY VALID LOCAL CHECKPOINT. IT DOES HOWEVER CONTAIN A */
+ /* VALID FRAGMENT LOG. THUS BY FIRST CREATING THE FRAGMENT AND THEN*/
+ /* EXECUTING THE FRAGMENT LOG WE CAN CREATE THE FRAGMENT AS */
+ /* DESIRED. THIS SHOULD ONLY OCCUR AFTER CREATING A FRAGMENT. */
+ /* */
+ /* TO INDICATE THAT NO LOCAL CHECKPOINT IS TO BE USED WE SET THE */
+ /* LOCAL CHECKPOINT TO ZNIL. */
+ /* --------------------------------------------------------------- */
+ createReplicaPtrP->lcpNo = ZNIL;
+ }
+ else
+ {
+ jam();
+ /* --------------------------------------------------------------- */
+ /* WE FOUND A PROPER LOCAL CHECKPOINT TO RESTART FROM. */
+ /* SET LOCAL CHECKPOINT ID AND LOCAL CHECKPOINT NUMBER. */
+ /* --------------------------------------------------------------- */
+ createReplicaPtrP->lcpNo = startLcpNo;
+ arrGuard(startLcpNo, MAX_LCP_STORED);
+ createReplicaPtrP->createLcpId = replicaPtr.p->lcpId[startLcpNo];
+ }//if
+
+
+ /* ----------------------------------------------------------------- */
+ /* WE HAVE EITHER FOUND A LOCAL CHECKPOINT OR WE ARE PLANNING TO */
+ /* EXECUTE THE LOG FROM THE INITIAL CREATION OF THE TABLE. IN BOTH */
+ /* CASES WE NEED TO FIND A SET OF LOGS THAT CAN EXECUTE SUCH THAT */
+ /* WE RECOVER TO THE SYSTEM RESTART GLOBAL CHECKPOINT. */
+ /* -_--------------------------------------------------------------- */
+ return findLogNodes(createReplicaPtrP, fragPtr, startGci, stopGci);
+}
+
void Dbdih::searchStoredReplicas(FragmentstorePtr fragPtr)
{
Uint32 nextReplicaPtrI;
- ConstPtr<ReplicaRecord> replicaPtr;
+ Ptr<ReplicaRecord> replicaPtr;
replicaPtr.i = fragPtr.p->storedReplicas;
while (replicaPtr.i != RNIL) {
jam();
ptrCheckGuard(replicaPtr, creplicaFileSize, replicaRecord);
nextReplicaPtrI = replicaPtr.p->nextReplica;
+ ConstPtr<ReplicaRecord> constReplicaPtr;
+ constReplicaPtr.i = replicaPtr.i;
+ constReplicaPtr.p = replicaPtr.p;
NodeRecordPtr nodePtr;
nodePtr.i = replicaPtr.p->procNode;
ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
@@ -12516,69 +12618,13 @@
createReplicaPtr.i = cnoOfCreateReplicas;
ptrCheckGuard(createReplicaPtr, 4, createReplicaRecord);
cnoOfCreateReplicas++;
- createReplicaPtr.p->dataNodeId = replicaPtr.p->procNode;
- createReplicaPtr.p->replicaRec = replicaPtr.i;
- /* ----------------------------------------------------------------- */
- /* WE NEED TO SEARCH FOR A PROPER LOCAL CHECKPOINT TO USE FOR THE */
- /* SYSTEM RESTART. */
- /* ----------------------------------------------------------------- */
- Uint32 startGci;
- Uint32 startLcpNo;
- Uint32 stopGci = SYSFILE->newestRestorableGCI;
- bool result = findStartGci(replicaPtr,
- stopGci,
- startGci,
- startLcpNo);
- if (!result) {
- jam();
- /* --------------------------------------------------------------- */
- /* WE COULD NOT FIND ANY LOCAL CHECKPOINT. THE FRAGMENT THUS DO NOT*/
- /* CONTAIN ANY VALID LOCAL CHECKPOINT. IT DOES HOWEVER CONTAIN A */
- /* VALID FRAGMENT LOG. THUS BY FIRST CREATING THE FRAGMENT AND THEN*/
- /* EXECUTING THE FRAGMENT LOG WE CAN CREATE THE FRAGMENT AS */
- /* DESIRED. THIS SHOULD ONLY OCCUR AFTER CREATING A FRAGMENT. */
- /* */
- /* TO INDICATE THAT NO LOCAL CHECKPOINT IS TO BE USED WE SET THE */
- /* LOCAL CHECKPOINT TO ZNIL. */
- /* --------------------------------------------------------------- */
- createReplicaPtr.p->lcpNo = ZNIL;
- } else {
- jam();
- /* --------------------------------------------------------------- */
- /* WE FOUND A PROPER LOCAL CHECKPOINT TO RESTART FROM. */
- /* SET LOCAL CHECKPOINT ID AND LOCAL CHECKPOINT NUMBER. */
- /* --------------------------------------------------------------- */
- createReplicaPtr.p->lcpNo = startLcpNo;
- arrGuard(startLcpNo, MAX_LCP_STORED);
- createReplicaPtr.p->createLcpId = replicaPtr.p->lcpId[startLcpNo];
- }//if
-
- if(ERROR_INSERTED(7073) || ERROR_INSERTED(7074)){
- jam();
- nodePtr.p->nodeStatus = NodeRecord::DEAD;
- }
-
- /* ----------------------------------------------------------------- */
- /* WE HAVE EITHER FOUND A LOCAL CHECKPOINT OR WE ARE PLANNING TO */
- /* EXECUTE THE LOG FROM THE INITIAL CREATION OF THE TABLE. IN BOTH */
- /* CASES WE NEED TO FIND A SET OF LOGS THAT CAN EXECUTE SUCH THAT */
- /* WE RECOVER TO THE SYSTEM RESTART GLOBAL CHECKPOINT. */
- /* -_--------------------------------------------------------------- */
- if (!findLogNodes(createReplicaPtr.p, fragPtr, startGci, stopGci)) {
- jam();
- /* --------------------------------------------------------------- */
- /* WE WERE NOT ABLE TO FIND ANY WAY OF RESTORING THIS REPLICA. */
- /* THIS IS A POTENTIAL SYSTEM ERROR. */
- /* --------------------------------------------------------------- */
- cnoOfCreateReplicas--;
- return;
- }//if
-
- if(ERROR_INSERTED(7073) || ERROR_INSERTED(7074)){
- jam();
- nodePtr.p->nodeStatus = NodeRecord::ALIVE;
- }
+ /**
+ * Should have been checked in resetReplicaSr
+ */
+ ndbrequire(setup_create_replica(fragPtr,
+ createReplicaPtr.p,
+ constReplicaPtr));
break;
}
default:
| Thread | Author | Date |
|---|---|---|
| • bk commit into 5.0 tree (jonas:1.2257) | jonas | 12 Oct |