#At file:///home/jonas/src/telco-7.0/ based on revid:magnus.blaudd@stripped
3916 Jonas Oreland 2010-11-02
ndb - bug#57650 - add retries on transient errors of backup/lcp
modified:
storage/ndb/include/kernel/signaldata/ScanFrag.hpp
storage/ndb/src/kernel/blocks/backup/Backup.cpp
storage/ndb/src/kernel/blocks/backup/Backup.hpp
storage/ndb/test/ndbapi/testBackup.cpp
storage/ndb/test/run-test/daily-basic-tests.txt
=== modified file 'storage/ndb/include/kernel/signaldata/ScanFrag.hpp'
--- a/storage/ndb/include/kernel/signaldata/ScanFrag.hpp 2010-10-04 11:13:22 +0000
+++ b/storage/ndb/include/kernel/signaldata/ScanFrag.hpp 2010-11-02 14:53:26 +0000
@@ -194,6 +194,7 @@ public:
ZNO_FREE_SCANREC_ERROR = 489,
ZWRONG_BATCH_SIZE = 1230,
ZSTANDBY_SCAN_ERROR = 1209,
+ NO_TC_CONNECT_ERROR = 1217,
ZSCAN_BOOK_ACC_OP_ERROR = 1219,
ZUNKNOWN_TRANS_ERROR = 1227
};
=== modified file 'storage/ndb/src/kernel/blocks/backup/Backup.cpp'
--- a/storage/ndb/src/kernel/blocks/backup/Backup.cpp 2010-10-04 11:13:22 +0000
+++ b/storage/ndb/src/kernel/blocks/backup/Backup.cpp 2010-11-02 14:53:26 +0000
@@ -4069,7 +4069,19 @@ Backup::execBACKUP_FRAGMENT_REQ(Signal*
*/
fragPtr.p->scanning = 1;
filePtr.p->fragmentNo = fragPtr.p->fragmentId;
-
+ filePtr.p->m_retry_count = 0;
+
+ sendScanFragReq(signal, ptr, filePtr, tabPtr, fragPtr, 0);
+}
+
+void
+Backup::sendScanFragReq(Signal* signal,
+ Ptr<BackupRecord> ptr,
+ Ptr<BackupFile> filePtr,
+ Ptr<Table> tabPtr,
+ Ptr<Fragment> fragPtr,
+ Uint32 delay)
+{
/**
* Start scan
*/
@@ -4079,7 +4091,6 @@ Backup::execBACKUP_FRAGMENT_REQ(Signal*
Table & table = * tabPtr.p;
ScanFragReq * req = (ScanFragReq *)signal->getDataPtrSend();
const Uint32 parallelism = 16;
- const Uint32 attrLen = 5 + table.attrInfoLen;
req->senderData = filePtr.i;
req->resultRef = reference();
@@ -4123,8 +4134,21 @@ Backup::execBACKUP_FRAGMENT_REQ(Signal*
LinearSectionPtr ptr[3];
ptr[0].p = attrInfo;
ptr[0].sz = 5 + table.attrInfoLen;
- sendSignal(lqhRef, GSN_SCAN_FRAGREQ, signal,
- ScanFragReq::SignalLength, JBB, ptr, 1);
+ if (delay == 0)
+ {
+ jam();
+ sendSignal(lqhRef, GSN_SCAN_FRAGREQ, signal,
+ ScanFragReq::SignalLength, JBB, ptr, 1);
+ }
+ else
+ {
+ jam();
+ SectionHandle handle(this);
+ ndbrequire(import(handle.m_ptr[0], ptr[0].p, ptr[0].sz));
+ handle.m_cnt = 1;
+ sendSignalWithDelay(lqhRef, GSN_SCAN_FRAGREQ, signal,
+ delay, ScanFragReq::SignalLength, &handle);
+ }
}
}
@@ -4323,11 +4347,54 @@ Backup::execSCAN_FRAGREF(Signal* signal)
const Uint32 filePtrI = ref->senderData;
BackupFilePtr filePtr LINT_SET_PTR;
c_backupFilePool.getPtr(filePtr, filePtrI);
-
- filePtr.p->errorCode = ref->errorCode;
- filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_SCAN_THREAD;
-
- backupFragmentRef(signal, filePtr);
+
+ Uint32 errCode = ref->errorCode;
+ if (filePtr.p->errorCode == 0)
+ {
+ // check for transient errors
+ switch(errCode){
+ case ScanFragRef::ZSCAN_BOOK_ACC_OP_ERROR:
+ case ScanFragRef::NO_TC_CONNECT_ERROR:
+ case ScanFragRef::ZTOO_MANY_ACTIVE_SCAN_ERROR:
+ jam();
+ break;
+ default:
+ jam();
+ filePtr.p->errorCode = errCode;
+ }
+ }
+
+ if (filePtr.p->errorCode == 0)
+ {
+ jam();
+ filePtr.p->m_retry_count++;
+ if (filePtr.p->m_retry_count == 10)
+ {
+ jam();
+ filePtr.p->errorCode = errCode;
+ }
+ }
+
+ if (filePtr.p->errorCode != 0)
+ {
+ jam();
+ filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_SCAN_THREAD;
+ backupFragmentRef(signal, filePtr);
+ }
+ else
+ {
+ jam();
+
+ // retry
+
+ BackupRecordPtr ptr;
+ c_backupPool.getPtr(ptr, filePtr.p->backupPtr);
+ TablePtr tabPtr;
+ ndbrequire(findTable(ptr, tabPtr, filePtr.p->tableId));
+ FragmentPtr fragPtr;
+ tabPtr.p->fragments.getPtr(fragPtr, filePtr.p->fragmentNo);
+ sendScanFragReq(signal, ptr, filePtr, tabPtr, fragPtr, 100);
+ }
}
void
=== modified file 'storage/ndb/src/kernel/blocks/backup/Backup.hpp'
--- a/storage/ndb/src/kernel/blocks/backup/Backup.hpp 2010-03-02 09:16:27 +0000
+++ b/storage/ndb/src/kernel/blocks/backup/Backup.hpp 2010-11-02 14:53:26 +0000
@@ -318,12 +318,13 @@ public:
*/
struct BackupFile {
BackupFile(Backup & backup, ArrayPool<Page32> & pp)
- : operation(backup), pages(pp) {}
+ : operation(backup), pages(pp) { m_retry_count = 0; }
Uint32 backupPtr; // Pointer to backup record
Uint32 tableId;
Uint32 fragmentNo;
Uint32 filePointer;
+ Uint32 m_retry_count;
Uint32 errorCode;
BackupFormat::FileType fileType;
OperationRecord operation;
@@ -612,6 +613,13 @@ public:
void sendStartBackup(Signal*, BackupRecordPtr, TablePtr);
void sendAlterTrig(Signal*, BackupRecordPtr ptr);
+ void sendScanFragReq(Signal*, // Builds and sends GSN_SCAN_FRAGREQ for one fragment of a backup file
+ BackupRecordPtr,
+ BackupFilePtr, // its pool index (filePtr.i) is sent as senderData
+ TablePtr,
+ FragmentPtr,
+ Uint32 delay); // 0: plain sendSignal; non-zero: sendSignalWithDelay(delay) (the retry path passes 100)
+
void sendDropTrig(Signal*, BackupRecordPtr ptr);
void sendDropTrig(Signal* signal, BackupRecordPtr ptr, TablePtr tabPtr);
void dropTrigReply(Signal*, BackupRecordPtr ptr);
=== modified file 'storage/ndb/test/ndbapi/testBackup.cpp'
--- a/storage/ndb/test/ndbapi/testBackup.cpp 2009-05-27 15:21:45 +0000
+++ b/storage/ndb/test/ndbapi/testBackup.cpp 2010-11-02 14:53:26 +0000
@@ -649,6 +649,22 @@ int runVerifyUndoData(NDBT_Context* ctx,
return NDBT_OK;
}
+int
+runBug57650(NDBT_Context* ctx, NDBT_Step* step) // Test for bug#57650: start a backup with error insert 5057 set on one node
+{
+ NdbBackup backup(GETNDB(step)->getNodeId()+1);
+ NdbRestarter res;
+ // Choose a random node and set error insert 5057 on it before starting the backup.
+ int node0 = res.getNode(NdbRestarter::NS_RANDOM);
+ res.insertErrorInNode(node0, 5057); // NOTE(review): return value is not checked, and the error insert is never cleared in this function
+ // NOTE(review): 5057 presumably provokes the transient scan error that the backup now retries - confirm
+ unsigned backupId = 0;
+ if (backup.start(backupId) == -1) // the test fails only when starting the backup reports -1
+ return NDBT_FAILED;
+
+ return NDBT_OK;
+}
+
NDBT_TESTSUITE(testBackup);
TESTCASE("BackupOne",
"Test that backup and restore works on one table \n"
@@ -778,6 +794,10 @@ TESTCASE("FailSlave",
STEP(runFail);
}
+TESTCASE("Bug57650", "") // NOTE(review): description string is empty, unlike e.g. BackupOne above
+{
+ INITIALIZER(runBug57650); // runBug57650 is the only step registered for this test case
+}
NDBT_TESTSUITE_END(testBackup);
int main(int argc, const char** argv){
=== modified file 'storage/ndb/test/run-test/daily-basic-tests.txt'
--- a/storage/ndb/test/run-test/daily-basic-tests.txt 2010-11-01 10:11:47 +0000
+++ b/storage/ndb/test/run-test/daily-basic-tests.txt 2010-11-02 14:53:26 +0000
@@ -54,6 +54,10 @@ max-time: 600
cmd: atrt-testBackup
args: -n BackupDDL T1
+max-time: 600
+cmd: atrt-testBackup
+args: -n Bug57650 T1
+
# BASIC FUNCTIONALITY
max-time: 500
cmd: testBasic
Attachment: [text/bzr-bundle] bzr/jonas@mysql.com-20101102145326-mqsgv1srv7ns52db.bundle
| Thread |
|---|
| • bzr commit into mysql-5.1-telco-7.0 branch (jonas:3916) Bug#57650 | Jonas Oreland | 2 Nov |