From: Date: June 5 2007 5:30pm Subject: bk commit into 5.1 tree (tomas:1.2490) BUG#28751 List-Archive: http://lists.mysql.com/commits/28130 X-Bug: 28751 Message-Id: <20070605153025.BB547501B1@linux.local> Below is the list of changes that have just been committed into a local 5.1 repository of tomas. When tomas does a push these changes will be propagated to the main repository and, within 24 hours after the push, to the public repository. For information on how to access the public repository see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html ChangeSet 1.2490 07/06/05 17:29:50 tomas@stripped +18 -0 Bug #28751 Lots of memory locked in memory causes high kswapd - add odirect option for lcp+backup+redo log to lower CPU/kswapd usage - writing odirect removes need for kernel write buffers avoiding kswapd to kick in storage/ndb/tools/restore/Restore.cpp 1.46 07/06/05 17:29:35 tomas@stripped +24 -5 ndb_restore to skip empty_record alignment padding in backup file storage/ndb/src/mgmsrv/ConfigInfo.cpp 1.99 07/06/05 17:29:35 tomas@stripped +12 -0 new config param for odirect, default false storage/ndb/src/kernel/vm/SimulatedBlock.hpp 1.31 07/06/05 17:29:35 tomas@stripped +1 -0 alligend log buffer allocation for odirect storage/ndb/src/kernel/vm/SimulatedBlock.cpp 1.41 07/06/05 17:29:35 tomas@stripped +22 -3 alligend log buffer allocation for odirect storage/ndb/src/kernel/blocks/restore.cpp 1.14 07/06/05 17:29:35 tomas@stripped +3 -0 restor block to ignore new lcp padding empty_record storage/ndb/src/kernel/blocks/ndbfs/AsyncFile.hpp 1.10 07/06/05 17:29:34 tomas@stripped +4 -0 align + odirect check storage/ndb/src/kernel/blocks/ndbfs/AsyncFile.cpp 1.39 07/06/05 17:29:34 tomas@stripped +114 -37 aligned writing for odirect correct odirect open options with test+fallback if odirect fails storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp 1.142 07/06/05 17:29:34 tomas@stripped +11 -3 read config params and open redo log files with odirect if set storage/ndb/src/kernel/blocks/dblqh/DblqhInit.cpp 1.24 07/06/05 17:29:32 tomas@stripped +9 -5 read odirect config param and align buffers storage/ndb/src/kernel/blocks/dblqh/Dblqh.hpp 1.61 07/06/05 17:29:32 tomas@stripped +2 -3 read odirect config param and align buffers storage/ndb/src/kernel/blocks/backup/FsBuffer.hpp 1.9 07/06/05 17:29:32 tomas@stripped +14 -10 correct debug printouts storage/ndb/src/kernel/blocks/backup/BackupInit.cpp 1.27 07/06/05 17:29:32 tomas@stripped +5 -2 read odirect config and allocate aligned storage/ndb/src/kernel/blocks/backup/BackupFormat.hpp 1.10 07/06/05 17:29:32 tomas@stripped +9 -1 add empty_record in file format storage/ndb/src/kernel/blocks/backup/Backup.hpp 1.28 07/06/05 17:29:32 tomas@stripped +2 -1 odirect and padding options storage/ndb/src/kernel/blocks/backup/Backup.cpp 1.64 07/06/05 17:29:32 tomas@stripped +42 -7 read odirect config param open LCP and Backup datafiles with odirect if specified insert empty padding record if odirect is used allocate buffers aligned to be able to use odirect storage/ndb/include/ndb_global.h.in 1.17 07/06/05 17:29:31 tomas@stripped +2 -0 specify alignment needed for odirect storage/ndb/include/mgmapi/mgmapi_config_parameters.h 1.33 07/06/05 17:29:31 tomas@stripped +2 -0 add new config parameter to choose ODirect mysql-test/ndb/ndb_config_2_node.ini 1.21 07/06/05 17:29:31 tomas@stripped +1 -0 run mysql-test-run using ODirect # This is a BitKeeper patch. What follows are the unified diffs for the # set of deltas contained in the patch. The rest of the patch, the part # that BitKeeper cares about, is below these diffs. # User: tomas # Host: poseidon.mysql.com # Root: /home/tomas/mysql-5.1-telco-gca --- 1.13/storage/ndb/src/kernel/blocks/restore.cpp 2007-03-13 12:38:45 +01:00 +++ 1.14/storage/ndb/src/kernel/blocks/restore.cpp 2007-06-05 17:29:35 +02:00 @@ -559,6 +559,9 @@ Restore::restore_next(Signal* signal, Fi case BackupFormat::GCP_ENTRY: parse_gcp_entry(signal, file_ptr, data, len); break; + case BackupFormat::EMPTY_ENTRY: + // skip + break; case 0x4e444242: // 'NDBB' if (check_file_version(signal, ntohl(* (data+2))) == 0) { --- 1.32/storage/ndb/include/mgmapi/mgmapi_config_parameters.h 2007-06-05 17:06:23 +02:00 +++ 1.33/storage/ndb/include/mgmapi/mgmapi_config_parameters.h 2007-06-05 17:29:31 +02:00 @@ -115,6 +115,8 @@ #define CFG_DB_MEMREPORT_FREQUENCY 166 +#define CFG_DB_O_DIRECT 168 + #define CFG_DB_SGA 198 /* super pool mem */ #define CFG_DB_DATA_MEM_2 199 /* used in special build in 5.1 */ --- 1.16/storage/ndb/include/ndb_global.h.in 2006-12-23 20:20:02 +01:00 +++ 1.17/storage/ndb/include/ndb_global.h.in 2007-06-05 17:29:31 +02:00 @@ -146,4 +146,6 @@ extern "C" { #define MAX(x,y) (((x)>(y))?(x):(y)) #endif +#define NDB_O_DIRECT_WRITE_ALIGNMENT 512 + #endif --- 1.98/storage/ndb/src/mgmsrv/ConfigInfo.cpp 2007-06-05 17:06:24 +02:00 +++ 1.99/storage/ndb/src/mgmsrv/ConfigInfo.cpp 2007-06-05 17:29:35 +02:00 @@ -1313,6 +1313,18 @@ const ConfigInfo::ParamInfo ConfigInfo:: "0", STR_VALUE(MAX_INT_RNIL) }, + { + CFG_DB_O_DIRECT, + "ODirect", + DB_TOKEN, + "Use O_DIRECT file write/read when possible", + ConfigInfo::CI_USED, + true, + ConfigInfo::CI_BOOL, + "false", + "false", + "true"}, + /*************************************************************************** * API ***************************************************************************/ --- 1.63/storage/ndb/src/kernel/blocks/backup/Backup.cpp 2007-05-22 17:53:04 +02:00 +++ 1.64/storage/ndb/src/kernel/blocks/backup/Backup.cpp 2007-06-05 17:29:32 +02:00 @@ -2761,6 +2761,8 @@ Backup::openFiles(Signal* signal, Backup c_backupFilePool.getPtr(filePtr, ptr.p->dataFilePtr); filePtr.p->m_flags |= BackupFile::BF_OPENING; + if (c_defaults.m_o_direct) + req->fileFlags |= FsOpenReq::OM_DIRECT; req->userPointer = filePtr.i; FsOpenReq::setVersion(req->fileNumber, 2); FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_DATA); @@ -3735,12 +3737,31 @@ Backup::OperationRecord::newFragment(Uin } bool -Backup::OperationRecord::fragComplete(Uint32 tableId, Uint32 fragNo) +Backup::OperationRecord::fragComplete(Uint32 tableId, Uint32 fragNo, bool fill_record) { Uint32 * tmp; const Uint32 footSz = sizeof(BackupFormat::DataFile::FragmentFooter) >> 2; + Uint32 sz = footSz + 1; - if(dataBuffer.getWritePtr(&tmp, footSz + 1)) { + if (fill_record) + { + Uint32 * new_tmp; + if (!dataBuffer.getWritePtr(&tmp, sz)) + return false; + new_tmp = tmp + sz; + + if ((UintPtr)new_tmp & (sizeof(Page32)-1)) + { + /* padding is needed to get full write */ + new_tmp += 2 /* to fit empty header minimum 2 words*/; + new_tmp = (Uint32 *)(((UintPtr)new_tmp + sizeof(Page32)-1) & + ~(UintPtr)(sizeof(Page32)-1)); + /* new write sz */ + sz = new_tmp - tmp; + } + } + + if(dataBuffer.getWritePtr(&tmp, sz)) { jam(); * tmp = 0; // Finish record stream tmp++; @@ -3752,7 +3773,17 @@ Backup::OperationRecord::fragComplete(Ui foot->FragmentNo = htonl(fragNo); foot->NoOfRecords = htonl(noOfRecords); foot->Checksum = htonl(0); - dataBuffer.updateWritePtr(footSz + 1); + + if (sz != footSz + 1) + { + tmp += footSz; + memset(tmp, 0, (sz - footSz - 1) * 4); + *tmp = htonl(BackupFormat::EMPTY_ENTRY); + tmp++; + *tmp = htonl(sz - footSz - 1); + } + + dataBuffer.updateWritePtr(sz); return true; }//if return false; @@ -3854,8 +3885,13 @@ Backup::fragmentCompleted(Signal* signal return; }//if + BackupRecordPtr ptr LINT_SET_PTR; + c_backupPool.getPtr(ptr, filePtr.p->backupPtr); + OperationRecord & op = filePtr.p->operation; - if(!op.fragComplete(filePtr.p->tableId, filePtr.p->fragmentNo)) { + if(!op.fragComplete(filePtr.p->tableId, filePtr.p->fragmentNo, + c_defaults.m_o_direct)) + { jam(); signal->theData[0] = BackupContinueB::BUFFER_FULL_FRAG_COMPLETE; signal->theData[1] = filePtr.i; @@ -3865,9 +3901,6 @@ Backup::fragmentCompleted(Signal* signal filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_SCAN_THREAD; - BackupRecordPtr ptr LINT_SET_PTR; - c_backupPool.getPtr(ptr, filePtr.p->backupPtr); - if (ptr.p->is_lcp()) { ptr.p->slaveState.setState(STOPPING); @@ -4905,6 +4938,8 @@ Backup::lcp_open_file(Signal* signal, Ba FsOpenReq::OM_CREATE | FsOpenReq::OM_APPEND | FsOpenReq::OM_AUTOSYNC; + if (c_defaults.m_o_direct) + req->fileFlags |= FsOpenReq::OM_DIRECT; FsOpenReq::v2_setCount(req->fileNumber, 0xFFFFFFFF); req->auto_sync_size = c_defaults.m_disk_synch_size; --- 1.27/storage/ndb/src/kernel/blocks/backup/Backup.hpp 2006-12-23 20:20:15 +01:00 +++ 1.28/storage/ndb/src/kernel/blocks/backup/Backup.hpp 2007-06-05 17:29:32 +02:00 @@ -240,7 +240,7 @@ public: * Once per fragment */ bool newFragment(Uint32 tableId, Uint32 fragNo); - bool fragComplete(Uint32 tableId, Uint32 fragNo); + bool fragComplete(Uint32 tableId, Uint32 fragNo, bool fill_record); /** * Once per scan frag (next) req/conf @@ -534,6 +534,7 @@ public: Uint32 m_disk_write_speed; Uint32 m_disk_synch_size; Uint32 m_diskless; + Uint32 m_o_direct; }; /** --- 1.9/storage/ndb/src/kernel/blocks/backup/BackupFormat.hpp 2006-12-23 20:20:15 +01:00 +++ 1.10/storage/ndb/src/kernel/blocks/backup/BackupFormat.hpp 2007-06-05 17:29:32 +02:00 @@ -32,7 +32,8 @@ struct BackupFormat { TABLE_LIST = 4, TABLE_DESCRIPTION = 5, GCP_ENTRY = 6, - FRAGMENT_INFO = 7 + FRAGMENT_INFO = 7, + EMPTY_ENTRY = 8 }; struct FileHeader { @@ -92,6 +93,13 @@ struct BackupFormat { Uint32 FragmentNo; Uint32 NoOfRecords; Uint32 Checksum; + }; + + /* optional padding for O_DIRECT */ + struct EmptyEntry { + Uint32 SectionType; + Uint32 SectionLength; + /* not used data */ }; }; --- 1.26/storage/ndb/src/kernel/blocks/backup/BackupInit.cpp 2006-12-23 20:20:15 +01:00 +++ 1.27/storage/ndb/src/kernel/blocks/backup/BackupInit.cpp 2007-06-05 17:29:32 +02:00 @@ -148,10 +148,13 @@ Backup::execREAD_CONFIG_REQ(Signal* sign c_defaults.m_disk_write_speed = 10 * (1024 * 1024); c_defaults.m_disk_write_speed_sr = 100 * (1024 * 1024); c_defaults.m_disk_synch_size = 4 * (1024 * 1024); - + c_defaults.m_o_direct = true; + Uint32 noBackups = 0, noTables = 0, noAttribs = 0, noFrags = 0; ndbrequire(!ndb_mgm_get_int_parameter(p, CFG_DB_DISCLESS, &c_defaults.m_diskless)); + ndb_mgm_get_int_parameter(p, CFG_DB_O_DIRECT, + &c_defaults.m_o_direct); ndb_mgm_get_int_parameter(p, CFG_DB_CHECKPOINT_SPEED_SR, &c_defaults.m_disk_write_speed_sr); ndb_mgm_get_int_parameter(p, CFG_DB_CHECKPOINT_SPEED, @@ -204,7 +207,7 @@ Backup::execREAD_CONFIG_REQ(Signal* sign / sizeof(Page32); // We need to allocate an additional of 2 pages. 1 page because of a bug in // ArrayPool and another one for DICTTAINFO. - c_pagePool.setSize(noPages + NO_OF_PAGES_META_FILE + 2); + c_pagePool.setSize(noPages + NO_OF_PAGES_META_FILE + 2, true); { // Init all tables SLList tables(c_tablePool); --- 1.8/storage/ndb/src/kernel/blocks/backup/FsBuffer.hpp 2006-12-23 20:20:15 +01:00 +++ 1.9/storage/ndb/src/kernel/blocks/backup/FsBuffer.hpp 2007-06-05 17:29:32 +02:00 @@ -270,8 +270,8 @@ FsBuffer::getReadPtr(Uint32 ** ptr, Uint * ptr = &Tp[Tr]; - DEBUG(ndbout_c("getReadPtr() Tr: %d Tw: %d Ts: %d Tm: %d sz1: %d -> %d", - Tr, Tw, Ts, Tm, sz1, * sz)); + DEBUG(ndbout_c("getReadPtr() Tr: %d Tmw: %d Ts: %d Tm: %d sz1: %d -> %d", + Tr, Tmw, Ts, Tm, sz1, * sz)); return true; } @@ -279,8 +279,8 @@ FsBuffer::getReadPtr(Uint32 ** ptr, Uint if(!m_eof){ * _eof = false; - DEBUG(ndbout_c("getReadPtr() Tr: %d Tw: %d Ts: %d Tm: %d sz1: %d -> false", - Tr, Tw, Ts, Tm, sz1)); + DEBUG(ndbout_c("getReadPtr() Tr: %d Tmw: %d Ts: %d Tm: %d sz1: %d -> false", + Tr, Tmw, Ts, Tm, sz1)); return false; } @@ -289,8 +289,8 @@ FsBuffer::getReadPtr(Uint32 ** ptr, Uint * _eof = true; * ptr = &Tp[Tr]; - DEBUG(ndbout_c("getReadPtr() Tr: %d Tw: %d Ts: %d Tm: %d sz1: %d -> %d eof", - Tr, Tw, Ts, Tm, sz1, * sz)); + DEBUG(ndbout_c("getReadPtr() Tr: %d Tmw: %d Ts: %d Tm: %d sz1: %d -> %d eof", + Tr, Tmw, Ts, Tm, sz1, * sz)); return false; } @@ -316,13 +316,13 @@ FsBuffer::getWritePtr(Uint32 ** ptr, Uin if(sz1 > sz){ // Note at least 1 word of slack * ptr = &Tp[Tw]; - DEBUG(ndbout_c("getWritePtr(%d) Tr: %d Tw: %d Ts: %d sz1: %d -> true", - sz, Tr, Tw, Ts, sz1)); + DEBUG(ndbout_c("getWritePtr(%d) Tw: %d sz1: %d -> true", + sz, Tw, sz1)); return true; } - DEBUG(ndbout_c("getWritePtr(%d) Tr: %d Tw: %d Ts: %d sz1: %d -> false", - sz, Tr, Tw, Ts, sz1)); + DEBUG(ndbout_c("getWritePtr(%d) Tw: %d sz1: %d -> false", + sz, Tw, sz1)); return false; } @@ -339,11 +339,15 @@ FsBuffer::updateWritePtr(Uint32 sz){ m_free -= sz; if(Tnew < Ts){ m_writeIndex = Tnew; + DEBUG(ndbout_c("updateWritePtr(%d) m_writeIndex: %d", + sz, m_writeIndex)); return; } memcpy(Tp, &Tp[Ts], (Tnew - Ts) << 2); m_writeIndex = Tnew - Ts; + DEBUG(ndbout_c("updateWritePtr(%d) m_writeIndex: %d", + sz, m_writeIndex)); } inline --- 1.45/storage/ndb/tools/restore/Restore.cpp 2007-03-01 02:40:13 +01:00 +++ 1.46/storage/ndb/tools/restore/Restore.cpp 2007-06-05 17:29:35 +02:00 @@ -867,13 +867,32 @@ bool RestoreDataIterator::readFragmentHe debug << "RestoreDataIterator::getNextFragment" << endl; - if (buffer_read(&Header, sizeof(Header), 1) != 1){ + while (1) + { + /* read first part of header */ + if (buffer_read(&Header, 8, 1) != 1) + { + ret = 0; + return false; + } // if + + /* skip if EMPTY_ENTRY */ + Header.SectionType = ntohl(Header.SectionType); + Header.SectionLength = ntohl(Header.SectionLength); + if (Header.SectionType == BackupFormat::EMPTY_ENTRY) + { + void *tmp; + buffer_get_ptr(&tmp, Header.SectionLength*4-8, 1); + continue; + } + break; + } + /* read rest of header */ + if (buffer_read(((char*)&Header)+8, sizeof(Header)-8, 1) != 1) + { ret = 0; return false; - } // if - - Header.SectionType = ntohl(Header.SectionType); - Header.SectionLength = ntohl(Header.SectionLength); + } Header.TableId = ntohl(Header.TableId); Header.FragmentNo = ntohl(Header.FragmentNo); Header.ChecksumType = ntohl(Header.ChecksumType); --- 1.60/storage/ndb/src/kernel/blocks/dblqh/Dblqh.hpp 2007-05-29 07:20:27 +02:00 +++ 1.61/storage/ndb/src/kernel/blocks/dblqh/Dblqh.hpp 2007-06-05 17:29:32 +02:00 @@ -115,9 +115,6 @@ class Dbtup; /* ------------------------------------------------------------------------- */ /* VARIOUS CONSTANTS USED AS FLAGS TO THE FILE MANAGER. */ /* ------------------------------------------------------------------------- */ -#define ZOPEN_READ 0 -#define ZOPEN_WRITE 1 -#define ZOPEN_READ_WRITE 2 #define ZVAR_NO_LOG_PAGE_WORD 1 #define ZLIST_OF_PAIRS 0 #define ZLIST_OF_PAIRS_SYNCH 16 @@ -2686,6 +2683,7 @@ private: UintR clfoFileSize; LogPageRecord *logPageRecord; + void *logPageRecordUnaligned; LogPageRecordPtr logPagePtr; UintR cfirstfreeLogPage; UintR clogPageFileSize; @@ -2889,6 +2887,7 @@ private: UintR ctransidHash[1024]; Uint32 c_diskless; + Uint32 c_o_direct; Uint32 c_error_insert_table_id; public: --- 1.23/storage/ndb/src/kernel/blocks/dblqh/DblqhInit.cpp 2007-05-24 16:40:03 +02:00 +++ 1.24/storage/ndb/src/kernel/blocks/dblqh/DblqhInit.cpp 2007-06-05 17:29:32 +02:00 @@ -49,6 +49,7 @@ void Dblqh::initData() logFileRecord = 0; logFileOperationRecord = 0; logPageRecord = 0; + logPageRecordUnaligned= 0; pageRefRecord = 0; tablerec = 0; tcConnectionrec = 0; @@ -105,10 +106,13 @@ void Dblqh::initRecords() sizeof(LogFileOperationRecord), clfoFileSize); - logPageRecord = (LogPageRecord*)allocRecord("LogPageRecord", - sizeof(LogPageRecord), - clogPageFileSize, - false); + logPageRecord = + (LogPageRecord*)allocRecordAligned("LogPageRecord", + sizeof(LogPageRecord), + clogPageFileSize, + &logPageRecordUnaligned, + NDB_O_DIRECT_WRITE_ALIGNMENT, + false); pageRefRecord = (PageRefRecord*)allocRecord("PageRefRecord", sizeof(PageRefRecord), @@ -378,7 +382,7 @@ Dblqh::~Dblqh() sizeof(LogFileOperationRecord), clfoFileSize); - deallocRecord((void**)&logPageRecord, + deallocRecord((void**)&logPageRecordUnaligned, "LogPageRecord", sizeof(LogPageRecord), clogPageFileSize); --- 1.141/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp 2007-05-29 07:20:27 +02:00 +++ 1.142/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp 2007-06-05 17:29:34 +02:00 @@ -1015,6 +1015,8 @@ void Dblqh::execREAD_CONFIG_REQ(Signal* cmaxAccOps = cscanrecFileSize * MAX_PARALLEL_OP_PER_SCAN; ndbrequire(!ndb_mgm_get_int_parameter(p, CFG_DB_DISCLESS, &c_diskless)); + c_o_direct = true; + ndb_mgm_get_int_parameter(p, CFG_DB_O_DIRECT, &c_o_direct); Uint32 tmp= 0; ndbrequire(!ndb_mgm_get_int_parameter(p, CFG_LQH_FRAG, &tmp)); @@ -13243,7 +13245,9 @@ void Dblqh::openFileRw(Signal* signal, L signal->theData[3] = olfLogFilePtr.p->fileName[1]; signal->theData[4] = olfLogFilePtr.p->fileName[2]; signal->theData[5] = olfLogFilePtr.p->fileName[3]; - signal->theData[6] = ZOPEN_READ_WRITE | FsOpenReq::OM_AUTOSYNC; + signal->theData[6] = FsOpenReq::OM_READWRITE | FsOpenReq::OM_AUTOSYNC; + if (c_o_direct) + signal->theData[6] |= FsOpenReq::OM_DIRECT; req->auto_sync_size = MAX_REDO_PAGES_WITHOUT_SYNCH * sizeof(LogPageRecord); sendSignal(NDBFS_REF, GSN_FSOPENREQ, signal, FsOpenReq::SignalLength, JBA); }//Dblqh::openFileRw() @@ -13263,7 +13267,9 @@ void Dblqh::openLogfileInit(Signal* sign signal->theData[3] = logFilePtr.p->fileName[1]; signal->theData[4] = logFilePtr.p->fileName[2]; signal->theData[5] = logFilePtr.p->fileName[3]; - signal->theData[6] = 0x302 | FsOpenReq::OM_AUTOSYNC; + signal->theData[6] = FsOpenReq::OM_READWRITE | FsOpenReq::OM_TRUNCATE | FsOpenReq::OM_CREATE | FsOpenReq::OM_AUTOSYNC; + if (c_o_direct) + signal->theData[6] |= FsOpenReq::OM_DIRECT; req->auto_sync_size = MAX_REDO_PAGES_WITHOUT_SYNCH * sizeof(LogPageRecord); sendSignal(NDBFS_REF, GSN_FSOPENREQ, signal, FsOpenReq::SignalLength, JBA); }//Dblqh::openLogfileInit() @@ -13299,7 +13305,9 @@ void Dblqh::openNextLogfile(Signal* sign signal->theData[3] = onlLogFilePtr.p->fileName[1]; signal->theData[4] = onlLogFilePtr.p->fileName[2]; signal->theData[5] = onlLogFilePtr.p->fileName[3]; - signal->theData[6] = 2 | FsOpenReq::OM_AUTOSYNC; + signal->theData[6] = FsOpenReq::OM_READWRITE | FsOpenReq::OM_AUTOSYNC; + if (c_o_direct) + signal->theData[6] |= FsOpenReq::OM_DIRECT; req->auto_sync_size = MAX_REDO_PAGES_WITHOUT_SYNCH * sizeof(LogPageRecord); sendSignal(NDBFS_REF, GSN_FSOPENREQ, signal, FsOpenReq::SignalLength, JBA); }//if --- 1.38/storage/ndb/src/kernel/blocks/ndbfs/AsyncFile.cpp 2006-12-23 20:20:18 +01:00 +++ 1.39/storage/ndb/src/kernel/blocks/ndbfs/AsyncFile.cpp 2007-06-05 17:29:34 +02:00 @@ -163,7 +163,12 @@ AsyncFile::run() theStartFlag = true; // Create write buffer for bigger writes theWriteBufferSize = WRITEBUFFERSIZE; - theWriteBuffer = (char *) ndbd_malloc(theWriteBufferSize); + theWriteBufferUnaligned = (char *) ndbd_malloc(theWriteBufferSize + + NDB_O_DIRECT_WRITE_ALIGNMENT-1); + theWriteBuffer = (char *) + (((UintPtr)theWriteBufferUnaligned + NDB_O_DIRECT_WRITE_ALIGNMENT - 1) & + ~(UintPtr)(NDB_O_DIRECT_WRITE_ALIGNMENT - 1)); + NdbMutex_Unlock(theStartMutexPtr); NdbCondition_Signal(theStartConditionPtr); @@ -247,6 +252,78 @@ AsyncFile::run() static char g_odirect_readbuf[2*GLOBAL_PAGE_SIZE -1]; #endif +int +AsyncFile::check_odirect_write(Uint32 flags, int& new_flags, int mode) +{ + assert(new_flags & (O_CREAT | O_TRUNC)); +#ifdef O_DIRECT + int ret; + char * bufptr = (char*)((UintPtr(g_odirect_readbuf)+(GLOBAL_PAGE_SIZE - 1)) & ~(GLOBAL_PAGE_SIZE - 1)); + while (((ret = ::write(theFd, bufptr, GLOBAL_PAGE_SIZE)) == -1) && + (errno == EINTR)); + if (ret == -1) + { + new_flags &= ~O_DIRECT; + ndbout_c("%s Failed to write using O_DIRECT, disabling", + theFileName.c_str()); + } + + close(theFd); + theFd = ::open(theFileName.c_str(), new_flags, mode); + if (theFd == -1) + return errno; +#endif + + return 0; +} + +int +AsyncFile::check_odirect_read(Uint32 flags, int &new_flags, int mode) +{ +#ifdef O_DIRECT + int ret; + char * bufptr = (char*)((UintPtr(g_odirect_readbuf)+(GLOBAL_PAGE_SIZE - 1)) & ~(GLOBAL_PAGE_SIZE - 1)); + while (((ret = ::read(theFd, bufptr, GLOBAL_PAGE_SIZE)) == -1) && + (errno == EINTR)); + if (ret == -1) + { + ndbout_c("%s Failed to read using O_DIRECT, disabling", + theFileName.c_str()); + goto reopen; + } + + if(lseek(theFd, 0, SEEK_SET) != 0) + { + return errno; + } + + if ((flags & FsOpenReq::OM_CHECK_SIZE) == 0) + { + struct stat buf; + if ((fstat(theFd, &buf) == -1)) + { + return errno; + } + else if ((buf.st_size % GLOBAL_PAGE_SIZE) != 0) + { + ndbout_c("%s filesize not a multiple of %d, disabling O_DIRECT", + theFileName.c_str(), GLOBAL_PAGE_SIZE); + goto reopen; + } + } + + return 0; + +reopen: + close(theFd); + new_flags &= ~O_DIRECT; + theFd = ::open(theFileName.c_str(), new_flags, mode); + if (theFd == -1) + return errno; +#endif + return 0; +} + void AsyncFile::openReq(Request* request) { m_auto_sync_freq = 0; @@ -312,7 +389,7 @@ void AsyncFile::openReq(Request* request } #else Uint32 flags = request->par.open.flags; - Uint32 new_flags = 0; + int new_flags = 0; // Convert file open flags from Solaris to Liux if (flags & FsOpenReq::OM_CREATE) @@ -343,10 +420,6 @@ void AsyncFile::openReq(Request* request { new_flags |= O_DIRECT; } -#elif defined O_SYNC - { - flags |= FsOpenReq::OM_SYNC; - } #endif if ((flags & FsOpenReq::OM_SYNC) && ! (flags & FsOpenReq::OM_INIT)) @@ -355,15 +428,19 @@ void AsyncFile::openReq(Request* request new_flags |= O_SYNC; #endif } - + + const char * rw = ""; switch(flags & 0x3){ case FsOpenReq::OM_READONLY: + rw = "r"; new_flags |= O_RDONLY; break; case FsOpenReq::OM_WRITEONLY: + rw = "w"; new_flags |= O_WRONLY; break; case FsOpenReq::OM_READWRITE: + rw = "rw"; new_flags |= O_RDWR; break; default: @@ -404,11 +481,6 @@ no_odirect: if (new_flags & O_DIRECT) { new_flags &= ~O_DIRECT; - flags |= FsOpenReq::OM_SYNC; -#ifdef O_SYNC - if (! (flags & FsOpenReq::OM_INIT)) - new_flags |= O_SYNC; -#endif goto no_odirect; } #endif @@ -421,11 +493,6 @@ no_odirect: else if (new_flags & O_DIRECT) { new_flags &= ~O_DIRECT; - flags |= FsOpenReq::OM_SYNC; -#ifdef O_SYNC - if (! (flags & FsOpenReq::OM_INIT)) - new_flags |= O_SYNC; -#endif goto no_odirect; } #endif @@ -512,7 +579,6 @@ no_odirect: { ndbout_c("error on first write(%d), disable O_DIRECT", err); new_flags &= ~O_DIRECT; - flags |= FsOpenReq::OM_SYNC; close(theFd); theFd = ::open(theFileName.c_str(), new_flags, mode); if (theFd != -1) @@ -532,26 +598,32 @@ no_odirect: else if (flags & FsOpenReq::OM_DIRECT) { #ifdef O_DIRECT - do { - int ret; - char * bufptr = (char*)((UintPtr(g_odirect_readbuf)+(GLOBAL_PAGE_SIZE - 1)) & ~(GLOBAL_PAGE_SIZE - 1)); - while (((ret = ::read(theFd, bufptr, GLOBAL_PAGE_SIZE)) == -1) && (errno == EINTR)); - if (ret == -1) - { - ndbout_c("%s Failed to read using O_DIRECT, disabling", theFileName.c_str()); - flags |= FsOpenReq::OM_SYNC; - flags |= FsOpenReq::OM_INIT; - break; - } - if(lseek(theFd, 0, SEEK_SET) != 0) - { - request->error = errno; - return; - } - } while (0); + if (flags & (FsOpenReq::OM_TRUNCATE | FsOpenReq::OM_CREATE)) + { + request->error = check_odirect_write(flags, new_flags, mode); + } + else + { + request->error = check_odirect_read(flags, new_flags, mode); + } + + if (request->error) + return; #endif } - +#ifdef VM_TRACE + if (flags & FsOpenReq::OM_DIRECT) + { +#ifdef O_DIRECT + ndbout_c("%s %s O_DIRECT: %d", + theFileName.c_str(), rw, + !!(new_flags & O_DIRECT)); +#else + ndbout_c("%s %s O_DIRECT: 0", + theFileName.c_str(), rw); +#endif + } +#endif if ((flags & FsOpenReq::OM_SYNC) && (flags & FsOpenReq::OM_INIT)) { #ifdef O_SYNC @@ -562,6 +634,10 @@ no_odirect: new_flags &= ~(O_CREAT | O_TRUNC); new_flags |= O_SYNC; theFd = ::open(theFileName.c_str(), new_flags, mode); + if (theFd == -1) + { + request->error = errno; + } #endif } #endif @@ -1079,7 +1155,8 @@ AsyncFile::rmrfReq(Request * request, ch void AsyncFile::endReq() { // Thread is ended with return - if (theWriteBuffer) ndbd_free(theWriteBuffer, theWriteBufferSize); + if (theWriteBufferUnaligned) + ndbd_free(theWriteBufferUnaligned, theWriteBufferSize); } --- 1.9/storage/ndb/src/kernel/blocks/ndbfs/AsyncFile.hpp 2006-12-23 20:20:18 +01:00 +++ 1.10/storage/ndb/src/kernel/blocks/ndbfs/AsyncFile.hpp 2007-06-05 17:29:34 +02:00 @@ -232,9 +232,13 @@ private: bool theStartFlag; int theWriteBufferSize; char* theWriteBuffer; + void* theWriteBufferUnaligned; size_t m_write_wo_sync; // Writes wo/ sync size_t m_auto_sync_freq; // Auto sync freq in bytes + + int check_odirect_read(Uint32 flags, int&new_flags, int mode); + int check_odirect_write(Uint32 flags, int&new_flags, int mode); public: SimulatedBlock& m_fs; Ptr m_page_ptr; --- 1.40/storage/ndb/src/kernel/vm/SimulatedBlock.cpp 2007-06-05 17:06:24 +02:00 +++ 1.41/storage/ndb/src/kernel/vm/SimulatedBlock.cpp 2007-06-05 17:29:35 +02:00 @@ -39,6 +39,9 @@ #include #include +#include +extern EventLogger g_eventLogger; + #define ljamEntry() jamEntryLine(30000 + __LINE__) #define ljam() jamLine(30000 + __LINE__) @@ -656,13 +659,19 @@ SimulatedBlock::getBatSize(Uint16 blockN return sb->theBATSize; } +void* SimulatedBlock::allocRecord(const char * type, size_t s, size_t n, bool clear, Uint32 paramId) +{ + return allocRecordAligned(type, s, n, 0, 0, clear, paramId); +} + void* -SimulatedBlock::allocRecord(const char * type, size_t s, size_t n, bool clear, Uint32 paramId) +SimulatedBlock::allocRecordAligned(const char * type, size_t s, size_t n, void **unaligned_buffer, Uint32 align, bool clear, Uint32 paramId) { void * p = NULL; - size_t size = n*s; - Uint64 real_size = (Uint64)((Uint64)n)*((Uint64)s); + Uint32 over_alloc = unaligned_buffer ? (align - 1) : 0; + size_t size = n*s + over_alloc; + Uint64 real_size = (Uint64)((Uint64)n)*((Uint64)s) + over_alloc; refresh_watch_dog(9); if (real_size > 0){ #ifdef VM_TRACE_MEM @@ -704,6 +713,16 @@ SimulatedBlock::allocRecord(const char * } refresh_watch_dog(9); memset(ptr, 0, size); + } + if (unaligned_buffer) + { + *unaligned_buffer = p; + p = (void *)(((UintPtr)p + over_alloc) & ~(UintPtr)(over_alloc)); +#ifdef VM_TRACE + g_eventLogger.info("'%s' (%u) %llu %llu, alignment correction %u bytes", + type, align, (Uint64)p, (Uint64)p+n*s, + (Uint32)((UintPtr)p - (UintPtr)*unaligned_buffer)); +#endif } } return p; --- 1.30/storage/ndb/src/kernel/vm/SimulatedBlock.hpp 2007-06-05 17:06:24 +02:00 +++ 1.31/storage/ndb/src/kernel/vm/SimulatedBlock.hpp 2007-06-05 17:29:35 +02:00 @@ -378,6 +378,7 @@ protected: * */ void* allocRecord(const char * type, size_t s, size_t n, bool clear = true, Uint32 paramId = 0); + void* allocRecordAligned(const char * type, size_t s, size_t n, void **unaligned_buffer, Uint32 align = NDB_O_DIRECT_WRITE_ALIGNMENT, bool clear = true, Uint32 paramId = 0); /** * Deallocate record --- 1.20/mysql-test/ndb/ndb_config_2_node.ini 2006-08-21 07:53:25 +02:00 +++ 1.21/mysql-test/ndb/ndb_config_2_node.ini 2007-06-05 17:29:31 +02:00 @@ -12,6 +12,7 @@ MaxNoOfAttributes= CHOOSE_MaxNoOfAttribu TimeBetweenGlobalCheckpoints= 500 NoOfFragmentLogFiles= 3 DiskPageBufferMemory= CHOOSE_DiskPageBufferMemory +ODirect= 1 # the following parametes just function as a small regression # test that the parameter exists InitialNoOfOpenFiles= 27