From: Date: September 26 2006 1:19pm Subject: bk commit into 4.1 tree (jonas:1.2546) BUG#20895 List-Archive: http://lists.mysql.com/commits/12513 X-Bug: 20895 Message-Id: <20060926111903.1596851D564@perch.ndb.mysql.com> Below is the list of changes that have just been committed into a local 4.1 repository of jonas. When jonas does a push these changes will be propagated to the main repository and, within 24 hours after the push, to the public repository. For information on how to access the public repository see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html ChangeSet@stripped, 2006-09-26 13:19:00+02:00, jonas@stripped +1 -0 ndb - bug#20895 Fix occational LCP hang!!! Make sure only to consider alive nodes in startNextChkpt ndb/src/kernel/blocks/dbdih/DbdihMain.cpp@stripped, 2006-09-26 13:18:59+02:00, jonas@stripped +71 -60 Make sure only to consider alive nodes in startNextChkpt # This is a BitKeeper patch. What follows are the unified diffs for the # set of deltas contained in the patch. The rest of the patch, the part # that BitKeeper cares about, is below these diffs. # User: jonas # Host: perch.ndb.mysql.com # Root: /home/jonas/src/41-work --- 1.38/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2006-09-26 13:19:03 +02:00 +++ 1.39/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 2006-09-26 13:19:03 +02:00 @@ -9561,73 +9561,84 @@ nodePtr.i = replicaPtr.p->procNode; ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord); - if (replicaPtr.p->lcpOngoingFlag && - replicaPtr.p->lcpIdStarted < lcpId) { - jam(); - //------------------------------------------------------------------- - // We have found a replica on a node that performs local checkpoint - // that is alive and that have not yet been started. - //------------------------------------------------------------------- - - if (nodePtr.p->noOfStartedChkpt < 2) { - jam(); - /** - * Send LCP_FRAG_ORD to LQH - */ - - /** - * Mark the replica so with lcpIdStarted == true - */ - replicaPtr.p->lcpIdStarted = lcpId; - - Uint32 i = nodePtr.p->noOfStartedChkpt; - nodePtr.p->startedChkpt[i].tableId = tabPtr.i; - nodePtr.p->startedChkpt[i].fragId = curr.fragmentId; - nodePtr.p->startedChkpt[i].replicaPtr = replicaPtr.i; - nodePtr.p->noOfStartedChkpt = i + 1; - - sendLCP_FRAG_ORD(signal, nodePtr.p->startedChkpt[i]); - } else if (nodePtr.p->noOfQueuedChkpt < 2) { - jam(); - /** - * Put LCP_FRAG_ORD "in queue" - */ - - /** - * Mark the replica so with lcpIdStarted == true - */ - replicaPtr.p->lcpIdStarted = lcpId; + if (c_lcpState.m_participatingLQH.get(nodePtr.i)) + { + if (replicaPtr.p->lcpOngoingFlag && + replicaPtr.p->lcpIdStarted < lcpId) + { + jam(); + //------------------------------------------------------------------- + // We have found a replica on a node that performs local checkpoint + // that is alive and that have not yet been started. + //------------------------------------------------------------------- - Uint32 i = nodePtr.p->noOfQueuedChkpt; - nodePtr.p->queuedChkpt[i].tableId = tabPtr.i; - nodePtr.p->queuedChkpt[i].fragId = curr.fragmentId; - nodePtr.p->queuedChkpt[i].replicaPtr = replicaPtr.i; - nodePtr.p->noOfQueuedChkpt = i + 1; - } else { - jam(); + if (nodePtr.p->noOfStartedChkpt < 2) + { + jam(); + /** + * Send LCP_FRAG_ORD to LQH + */ + + /** + * Mark the replica so with lcpIdStarted == true + */ + replicaPtr.p->lcpIdStarted = lcpId; - if(save){ + Uint32 i = nodePtr.p->noOfStartedChkpt; + nodePtr.p->startedChkpt[i].tableId = tabPtr.i; + nodePtr.p->startedChkpt[i].fragId = curr.fragmentId; + nodePtr.p->startedChkpt[i].replicaPtr = replicaPtr.i; + nodePtr.p->noOfStartedChkpt = i + 1; + + sendLCP_FRAG_ORD(signal, nodePtr.p->startedChkpt[i]); + } + else if (nodePtr.p->noOfQueuedChkpt < 2) + { + jam(); /** - * Stop increasing value on first that was "full" + * Put LCP_FRAG_ORD "in queue" */ - c_lcpState.currentFragment = curr; - save = false; - } - - busyNodes.set(nodePtr.i); - if(busyNodes.count() == lcpNodes){ + /** - * There were no possibility to start the local checkpoint - * and it was not possible to queue it up. In this case we - * stop the start of local checkpoints until the nodes with a - * backlog have performed more checkpoints. We will return and - * will not continue the process of starting any more checkpoints. + * Mark the replica so with lcpIdStarted == true */ - return; + replicaPtr.p->lcpIdStarted = lcpId; + + Uint32 i = nodePtr.p->noOfQueuedChkpt; + nodePtr.p->queuedChkpt[i].tableId = tabPtr.i; + nodePtr.p->queuedChkpt[i].fragId = curr.fragmentId; + nodePtr.p->queuedChkpt[i].replicaPtr = replicaPtr.i; + nodePtr.p->noOfQueuedChkpt = i + 1; + } + else + { + jam(); + + if(save) + { + /** + * Stop increasing value on first that was "full" + */ + c_lcpState.currentFragment = curr; + save = false; + } + + busyNodes.set(nodePtr.i); + if(busyNodes.count() == lcpNodes) + { + /** + * There were no possibility to start the local checkpoint + * and it was not possible to queue it up. In this case we + * stop the start of local checkpoints until the nodes with a + * backlog have performed more checkpoints. We will return and + * will not continue the process of starting any more checkpoints. + */ + return; + }//if }//if - }//if - } - }//while + } + }//while + } curr.fragmentId++; if (curr.fragmentId >= tabPtr.p->totalfragments) { jam();