From: Date: October 10 2006 10:42am Subject: bk commit into 5.1 tree (jonas:1.2057) BUG#22893 List-Archive: http://lists.mysql.com/commits/13377 X-Bug: 22893 Message-Id: <20061010084230.B27A059BF07@perch.ndb.mysql.com> Below is the list of changes that have just been committed into a local 5.1 repository of jonas. When jonas does a push these changes will be propagated to the main repository and, within 24 hours after the push, to the public repository. For information on how to access the public repository see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html ChangeSet@stripped, 2006-10-10 10:42:26+02:00, jonas@stripped +2 -0 ndb - bug#22893 Add checking of REDO to earlier during SR so take-over of node can be performed if it can't be restarted using logs (which btw is really weird...as it _should_ be able to use logs of other node in node group) recommit in mysql-5.1-wl2325-5.0 storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp@stripped, 2006-10-10 10:42:25+02:00, jonas@stripped +2 -0 Break-out methods which searches for REDO for a fragment, so it can be used earlier during SR storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp@stripped, 2006-10-10 10:42:25+02:00, jonas@stripped +90 -71 Add checking of REDO to earlier during SR so take-over of node can be performed if it can't be restarted using logs (which btw is really weird...as it _should_ be able to use logs of other node in node group) # This is a BitKeeper patch. What follows are the unified diffs for the # set of deltas contained in the patch. The rest of the patch, the part # that BitKeeper cares about, is below these diffs. # User: jonas # Host: perch.ndb.mysql.com # Root: /home/jonas/src/mysql-5.1-wl2325-5.0 --- 1.13/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp 2006-10-10 10:42:30 +02:00 +++ 1.14/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp 2006-10-10 10:42:30 +02:00 @@ -1047,6 +1047,8 @@ void removeStoredReplica(FragmentstorePtr regFragptr, ReplicaRecordPtr replicaPtr); void searchStoredReplicas(FragmentstorePtr regFragptr); + bool setup_create_replica(FragmentstorePtr, CreateReplicaRecord*, + ConstPtr); void updateNodeInfo(FragmentstorePtr regFragptr); //------------------------------------ --- 1.49/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2006-10-10 10:42:30 +02:00 +++ 1.50/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2006-10-10 10:42:30 +02:00 @@ -8402,14 +8402,30 @@ resetReplicaLcp(replicaPtr.p, newestRestorableGCI); - /* ----------------------------------------------------------------- - * LINK THE REPLICA INTO THE STORED REPLICA LIST. WE WILL USE THIS - * NODE AS A STORED REPLICA. - * WE MUST FIRST LINK IT OUT OF THE LIST OF OLD STORED REPLICAS. - * --------------------------------------------------------------- */ - removeOldStoredReplica(fragPtr, replicaPtr); - linkStoredReplica(fragPtr, replicaPtr); - + /** + * Make sure we can also find REDO for restoring replica... + */ + { + CreateReplicaRecord createReplica; + ConstPtr constReplicaPtr; + constReplicaPtr.i = replicaPtr.i; + constReplicaPtr.p = replicaPtr.p; + if (setup_create_replica(fragPtr, + &createReplica, constReplicaPtr)) + { + removeOldStoredReplica(fragPtr, replicaPtr); + linkStoredReplica(fragPtr, replicaPtr); + } + else + { + infoEvent("Forcing take-over of node %d due to unsufficient REDO" + " for table %d fragment: %d", + nodePtr.i, tabPtr.i, i); + + setNodeActiveStatus(nodePtr.i, + Sysfile::NS_NotActive_NotTakenOver); + } + } } default: jam(); @@ -12518,16 +12534,75 @@ /* CHECKPOINT WITHOUT NEEDING ANY EXTRA LOGGING FACILITIES.*/ /* A MAXIMUM OF FOUR NODES IS RETRIEVED. */ /*************************************************************************/ +bool +Dbdih::setup_create_replica(FragmentstorePtr fragPtr, + CreateReplicaRecord* createReplicaPtrP, + ConstPtr replicaPtr) +{ + createReplicaPtrP->dataNodeId = replicaPtr.p->procNode; + createReplicaPtrP->replicaRec = replicaPtr.i; + + /* ----------------------------------------------------------------- */ + /* WE NEED TO SEARCH FOR A PROPER LOCAL CHECKPOINT TO USE FOR THE */ + /* SYSTEM RESTART. */ + /* ----------------------------------------------------------------- */ + Uint32 startGci; + Uint32 startLcpNo; + Uint32 stopGci = SYSFILE->newestRestorableGCI; + bool result = findStartGci(replicaPtr, + stopGci, + startGci, + startLcpNo); + if (!result) + { + jam(); + /* --------------------------------------------------------------- */ + /* WE COULD NOT FIND ANY LOCAL CHECKPOINT. THE FRAGMENT THUS DO NOT*/ + /* CONTAIN ANY VALID LOCAL CHECKPOINT. IT DOES HOWEVER CONTAIN A */ + /* VALID FRAGMENT LOG. THUS BY FIRST CREATING THE FRAGMENT AND THEN*/ + /* EXECUTING THE FRAGMENT LOG WE CAN CREATE THE FRAGMENT AS */ + /* DESIRED. THIS SHOULD ONLY OCCUR AFTER CREATING A FRAGMENT. */ + /* */ + /* TO INDICATE THAT NO LOCAL CHECKPOINT IS TO BE USED WE SET THE */ + /* LOCAL CHECKPOINT TO ZNIL. */ + /* --------------------------------------------------------------- */ + createReplicaPtrP->lcpNo = ZNIL; + } + else + { + jam(); + /* --------------------------------------------------------------- */ + /* WE FOUND A PROPER LOCAL CHECKPOINT TO RESTART FROM. */ + /* SET LOCAL CHECKPOINT ID AND LOCAL CHECKPOINT NUMBER. */ + /* --------------------------------------------------------------- */ + createReplicaPtrP->lcpNo = startLcpNo; + arrGuard(startLcpNo, MAX_LCP_STORED); + createReplicaPtrP->createLcpId = replicaPtr.p->lcpId[startLcpNo]; + }//if + + + /* ----------------------------------------------------------------- */ + /* WE HAVE EITHER FOUND A LOCAL CHECKPOINT OR WE ARE PLANNING TO */ + /* EXECUTE THE LOG FROM THE INITIAL CREATION OF THE TABLE. IN BOTH */ + /* CASES WE NEED TO FIND A SET OF LOGS THAT CAN EXECUTE SUCH THAT */ + /* WE RECOVER TO THE SYSTEM RESTART GLOBAL CHECKPOINT. */ + /* -_--------------------------------------------------------------- */ + return findLogNodes(createReplicaPtrP, fragPtr, startGci, stopGci); +} + void Dbdih::searchStoredReplicas(FragmentstorePtr fragPtr) { Uint32 nextReplicaPtrI; - ConstPtr replicaPtr; + Ptr replicaPtr; replicaPtr.i = fragPtr.p->storedReplicas; while (replicaPtr.i != RNIL) { jam(); ptrCheckGuard(replicaPtr, creplicaFileSize, replicaRecord); nextReplicaPtrI = replicaPtr.p->nextReplica; + ConstPtr constReplicaPtr; + constReplicaPtr.i = replicaPtr.i; + constReplicaPtr.p = replicaPtr.p; NodeRecordPtr nodePtr; nodePtr.i = replicaPtr.p->procNode; ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord); @@ -12547,69 +12622,13 @@ createReplicaPtr.i = cnoOfCreateReplicas; ptrCheckGuard(createReplicaPtr, 4, createReplicaRecord); cnoOfCreateReplicas++; - createReplicaPtr.p->dataNodeId = replicaPtr.p->procNode; - createReplicaPtr.p->replicaRec = replicaPtr.i; - /* ----------------------------------------------------------------- */ - /* WE NEED TO SEARCH FOR A PROPER LOCAL CHECKPOINT TO USE FOR THE */ - /* SYSTEM RESTART. */ - /* ----------------------------------------------------------------- */ - Uint32 startGci; - Uint32 startLcpNo; - Uint32 stopGci = SYSFILE->newestRestorableGCI; - bool result = findStartGci(replicaPtr, - stopGci, - startGci, - startLcpNo); - if (!result) { - jam(); - /* --------------------------------------------------------------- */ - /* WE COULD NOT FIND ANY LOCAL CHECKPOINT. THE FRAGMENT THUS DO NOT*/ - /* CONTAIN ANY VALID LOCAL CHECKPOINT. IT DOES HOWEVER CONTAIN A */ - /* VALID FRAGMENT LOG. THUS BY FIRST CREATING THE FRAGMENT AND THEN*/ - /* EXECUTING THE FRAGMENT LOG WE CAN CREATE THE FRAGMENT AS */ - /* DESIRED. THIS SHOULD ONLY OCCUR AFTER CREATING A FRAGMENT. */ - /* */ - /* TO INDICATE THAT NO LOCAL CHECKPOINT IS TO BE USED WE SET THE */ - /* LOCAL CHECKPOINT TO ZNIL. */ - /* --------------------------------------------------------------- */ - createReplicaPtr.p->lcpNo = ZNIL; - } else { - jam(); - /* --------------------------------------------------------------- */ - /* WE FOUND A PROPER LOCAL CHECKPOINT TO RESTART FROM. */ - /* SET LOCAL CHECKPOINT ID AND LOCAL CHECKPOINT NUMBER. */ - /* --------------------------------------------------------------- */ - createReplicaPtr.p->lcpNo = startLcpNo; - arrGuard(startLcpNo, MAX_LCP_STORED); - createReplicaPtr.p->createLcpId = replicaPtr.p->lcpId[startLcpNo]; - }//if - - if(ERROR_INSERTED(7073) || ERROR_INSERTED(7074)){ - jam(); - nodePtr.p->nodeStatus = NodeRecord::DEAD; - } - - /* ----------------------------------------------------------------- */ - /* WE HAVE EITHER FOUND A LOCAL CHECKPOINT OR WE ARE PLANNING TO */ - /* EXECUTE THE LOG FROM THE INITIAL CREATION OF THE TABLE. IN BOTH */ - /* CASES WE NEED TO FIND A SET OF LOGS THAT CAN EXECUTE SUCH THAT */ - /* WE RECOVER TO THE SYSTEM RESTART GLOBAL CHECKPOINT. */ - /* -_--------------------------------------------------------------- */ - if (!findLogNodes(createReplicaPtr.p, fragPtr, startGci, stopGci)) { - jam(); - /* --------------------------------------------------------------- */ - /* WE WERE NOT ABLE TO FIND ANY WAY OF RESTORING THIS REPLICA. */ - /* THIS IS A POTENTIAL SYSTEM ERROR. */ - /* --------------------------------------------------------------- */ - cnoOfCreateReplicas--; - return; - }//if - - if(ERROR_INSERTED(7073) || ERROR_INSERTED(7074)){ - jam(); - nodePtr.p->nodeStatus = NodeRecord::ALIVE; - } + /** + * Should have been checked in resetReplicaSr + */ + ndbrequire(setup_create_replica(fragPtr, + createReplicaPtr.p, + constReplicaPtr)); break; } default: