List: Commits « Previous Message | Next Message »
From: Jonas Oreland  Date: November 2 2010 2:53pm
Subject: bzr commit into mysql-5.1-telco-7.0 branch (jonas:3916) Bug#57650
View as plain text  
#At file:///home/jonas/src/telco-7.0/ based on revid:magnus.blaudd@stripped

 3916 Jonas Oreland	2010-11-02
      ndb - bug#57650 - add retries on transient errors of backup/lcp

    modified:
      storage/ndb/include/kernel/signaldata/ScanFrag.hpp
      storage/ndb/src/kernel/blocks/backup/Backup.cpp
      storage/ndb/src/kernel/blocks/backup/Backup.hpp
      storage/ndb/test/ndbapi/testBackup.cpp
      storage/ndb/test/run-test/daily-basic-tests.txt
=== modified file 'storage/ndb/include/kernel/signaldata/ScanFrag.hpp'
--- a/storage/ndb/include/kernel/signaldata/ScanFrag.hpp	2010-10-04 11:13:22 +0000
+++ b/storage/ndb/include/kernel/signaldata/ScanFrag.hpp	2010-11-02 14:53:26 +0000
@@ -194,6 +194,7 @@ public:
     ZNO_FREE_SCANREC_ERROR = 489,
     ZWRONG_BATCH_SIZE = 1230,
     ZSTANDBY_SCAN_ERROR = 1209,
+    NO_TC_CONNECT_ERROR = 1217,
     ZSCAN_BOOK_ACC_OP_ERROR = 1219,
     ZUNKNOWN_TRANS_ERROR = 1227
   };

=== modified file 'storage/ndb/src/kernel/blocks/backup/Backup.cpp'
--- a/storage/ndb/src/kernel/blocks/backup/Backup.cpp	2010-10-04 11:13:22 +0000
+++ b/storage/ndb/src/kernel/blocks/backup/Backup.cpp	2010-11-02 14:53:26 +0000
@@ -4069,7 +4069,19 @@ Backup::execBACKUP_FRAGMENT_REQ(Signal* 
    */
   fragPtr.p->scanning = 1;
   filePtr.p->fragmentNo = fragPtr.p->fragmentId;
-  
+  filePtr.p->m_retry_count = 0;
+
+  sendScanFragReq(signal, ptr, filePtr, tabPtr, fragPtr, 0);
+}
+
+void
+Backup::sendScanFragReq(Signal* signal,
+                        Ptr<BackupRecord> ptr,
+                        Ptr<BackupFile> filePtr,
+                        Ptr<Table> tabPtr,
+                        Ptr<Fragment> fragPtr,
+                        Uint32 delay)
+{
   /**
    * Start scan
    */
@@ -4079,7 +4091,6 @@ Backup::execBACKUP_FRAGMENT_REQ(Signal* 
     Table & table = * tabPtr.p;
     ScanFragReq * req = (ScanFragReq *)signal->getDataPtrSend();
     const Uint32 parallelism = 16;
-    const Uint32 attrLen = 5 + table.attrInfoLen;
 
     req->senderData = filePtr.i;
     req->resultRef = reference();
@@ -4123,8 +4134,21 @@ Backup::execBACKUP_FRAGMENT_REQ(Signal* 
     LinearSectionPtr ptr[3];
     ptr[0].p = attrInfo;
     ptr[0].sz = 5 + table.attrInfoLen;
-    sendSignal(lqhRef, GSN_SCAN_FRAGREQ, signal,
-               ScanFragReq::SignalLength, JBB, ptr, 1);
+    if (delay == 0)
+    {
+      jam();
+      sendSignal(lqhRef, GSN_SCAN_FRAGREQ, signal,
+                 ScanFragReq::SignalLength, JBB, ptr, 1);
+    }
+    else
+    {
+      jam();
+      SectionHandle handle(this);
+      ndbrequire(import(handle.m_ptr[0], ptr[0].p, ptr[0].sz));
+      handle.m_cnt = 1;
+      sendSignalWithDelay(lqhRef, GSN_SCAN_FRAGREQ, signal,
+                          delay, ScanFragReq::SignalLength, &handle);
+    }
   }
 }
 
@@ -4323,11 +4347,54 @@ Backup::execSCAN_FRAGREF(Signal* signal)
   const Uint32 filePtrI = ref->senderData;
   BackupFilePtr filePtr LINT_SET_PTR;
   c_backupFilePool.getPtr(filePtr, filePtrI);
-  
-  filePtr.p->errorCode = ref->errorCode;
-  filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_SCAN_THREAD;
-  
-  backupFragmentRef(signal, filePtr);
+
+  Uint32 errCode = ref->errorCode;
+  if (filePtr.p->errorCode == 0)
+  {
+    // check for transient errors
+    switch(errCode){
+    case ScanFragRef::ZSCAN_BOOK_ACC_OP_ERROR:
+    case ScanFragRef::NO_TC_CONNECT_ERROR:
+    case ScanFragRef::ZTOO_MANY_ACTIVE_SCAN_ERROR:
+      jam();
+      break;
+    default:
+      jam();
+      filePtr.p->errorCode = errCode;
+    }
+  }
+
+  if (filePtr.p->errorCode == 0)
+  {
+    jam();
+    filePtr.p->m_retry_count++;
+    if (filePtr.p->m_retry_count == 10)
+    {
+      jam();
+      filePtr.p->errorCode = errCode;
+    }
+  }
+
+  if (filePtr.p->errorCode != 0)
+  {
+    jam();
+    filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_SCAN_THREAD;
+    backupFragmentRef(signal, filePtr);
+  }
+  else
+  {
+    jam();
+
+    // retry
+
+    BackupRecordPtr ptr;
+    c_backupPool.getPtr(ptr, filePtr.p->backupPtr);
+    TablePtr tabPtr;
+    ndbrequire(findTable(ptr, tabPtr, filePtr.p->tableId));
+    FragmentPtr fragPtr;
+    tabPtr.p->fragments.getPtr(fragPtr, filePtr.p->fragmentNo);
+    sendScanFragReq(signal, ptr, filePtr, tabPtr, fragPtr, 100);
+  }
 }
 
 void

=== modified file 'storage/ndb/src/kernel/blocks/backup/Backup.hpp'
--- a/storage/ndb/src/kernel/blocks/backup/Backup.hpp	2010-03-02 09:16:27 +0000
+++ b/storage/ndb/src/kernel/blocks/backup/Backup.hpp	2010-11-02 14:53:26 +0000
@@ -318,12 +318,13 @@ public:
    */
   struct BackupFile {
     BackupFile(Backup & backup, ArrayPool<Page32> & pp) 
-      : operation(backup),  pages(pp) {}
+      : operation(backup),  pages(pp) { m_retry_count = 0; }
     
     Uint32 backupPtr; // Pointer to backup record
     Uint32 tableId;
     Uint32 fragmentNo;
     Uint32 filePointer;
+    Uint32 m_retry_count;
     Uint32 errorCode;
     BackupFormat::FileType fileType;
     OperationRecord operation;
@@ -612,6 +613,13 @@ public:
   void sendStartBackup(Signal*, BackupRecordPtr, TablePtr);
   void sendAlterTrig(Signal*, BackupRecordPtr ptr);
 
+  void sendScanFragReq(Signal*,
+                       BackupRecordPtr,
+                       BackupFilePtr,
+                       TablePtr,
+                       FragmentPtr,
+                       Uint32 delay);
+
   void sendDropTrig(Signal*, BackupRecordPtr ptr);
   void sendDropTrig(Signal* signal, BackupRecordPtr ptr, TablePtr tabPtr);
   void dropTrigReply(Signal*, BackupRecordPtr ptr);

=== modified file 'storage/ndb/test/ndbapi/testBackup.cpp'
--- a/storage/ndb/test/ndbapi/testBackup.cpp	2009-05-27 15:21:45 +0000
+++ b/storage/ndb/test/ndbapi/testBackup.cpp	2010-11-02 14:53:26 +0000
@@ -649,6 +649,22 @@ int runVerifyUndoData(NDBT_Context* ctx,
   return NDBT_OK;
 }
 
+int
+runBug57650(NDBT_Context* ctx, NDBT_Step* step)
+{
+  NdbBackup backup(GETNDB(step)->getNodeId()+1);
+  NdbRestarter res;
+
+  int node0 = res.getNode(NdbRestarter::NS_RANDOM);
+  res.insertErrorInNode(node0, 5057);
+
+  unsigned backupId = 0;
+  if (backup.start(backupId) == -1)
+    return NDBT_FAILED;
+
+  return NDBT_OK;
+}
+
 NDBT_TESTSUITE(testBackup);
 TESTCASE("BackupOne", 
 	 "Test that backup and restore works on one table \n"
@@ -778,6 +794,10 @@ TESTCASE("FailSlave", 
   STEP(runFail);
 
 }
+TESTCASE("Bug57650", "")
+{
+  INITIALIZER(runBug57650);
+}
 NDBT_TESTSUITE_END(testBackup);
 
 int main(int argc, const char** argv){

=== modified file 'storage/ndb/test/run-test/daily-basic-tests.txt'
--- a/storage/ndb/test/run-test/daily-basic-tests.txt	2010-11-01 10:11:47 +0000
+++ b/storage/ndb/test/run-test/daily-basic-tests.txt	2010-11-02 14:53:26 +0000
@@ -54,6 +54,10 @@ max-time: 600
 cmd: atrt-testBackup
 args: -n BackupDDL T1
 
+max-time: 600
+cmd: atrt-testBackup
+args: -n Bug57650 T1
+
 # BASIC FUNCTIONALITY
 max-time: 500
 cmd: testBasic


Attachment: [text/bzr-bundle] bzr/jonas@mysql.com-20101102145326-mqsgv1srv7ns52db.bundle
Thread
bzr commit into mysql-5.1-telco-7.0 branch (jonas:3916) Bug#57650 - Jonas Oreland - 2 Nov