List:Commits« Previous MessageNext Message »
From:tomas Date:June 5 2007 5:30pm
Subject:bk commit into 5.1 tree (tomas:1.2490) BUG#28751
View as plain text  
Below is the list of changes that have just been committed into a local
5.1 repository of tomas. When tomas does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet
  1.2490 07/06/05 17:29:50 tomas@stripped +18 -0
  Bug #28751  	Lots of memory locked in memory causes high kswapd
  - add odirect option for lcp+backup+redo log to lower CPU/kswapd usage
  - writing with odirect removes the need for kernel write buffers, which keeps kswapd from kicking in

  storage/ndb/tools/restore/Restore.cpp
    1.46 07/06/05 17:29:35 tomas@stripped +24 -5
    ndb_restore to skip empty_record alignment padding in backup file 

  storage/ndb/src/mgmsrv/ConfigInfo.cpp
    1.99 07/06/05 17:29:35 tomas@stripped +12 -0
    new config param for odirect, default false

  storage/ndb/src/kernel/vm/SimulatedBlock.hpp
    1.31 07/06/05 17:29:35 tomas@stripped +1 -0
    aligned log buffer allocation for odirect

  storage/ndb/src/kernel/vm/SimulatedBlock.cpp
    1.41 07/06/05 17:29:35 tomas@stripped +22 -3
    aligned log buffer allocation for odirect

  storage/ndb/src/kernel/blocks/restore.cpp
    1.14 07/06/05 17:29:35 tomas@stripped +3 -0
    restore block to ignore new lcp padding empty_record

  storage/ndb/src/kernel/blocks/ndbfs/AsyncFile.hpp
    1.10 07/06/05 17:29:34 tomas@stripped +4 -0
    align + odirect check

  storage/ndb/src/kernel/blocks/ndbfs/AsyncFile.cpp
    1.39 07/06/05 17:29:34 tomas@stripped +114 -37
    aligned writing for odirect
    correct odirect open options with test+fallback if odirect fails

  storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp
    1.142 07/06/05 17:29:34 tomas@stripped +11 -3
    read config params and open redo log files with odirect if set

  storage/ndb/src/kernel/blocks/dblqh/DblqhInit.cpp
    1.24 07/06/05 17:29:32 tomas@stripped +9 -5
    read odirect config param and align buffers

  storage/ndb/src/kernel/blocks/dblqh/Dblqh.hpp
    1.61 07/06/05 17:29:32 tomas@stripped +2 -3
    read odirect config param and align buffers

  storage/ndb/src/kernel/blocks/backup/FsBuffer.hpp
    1.9 07/06/05 17:29:32 tomas@stripped +14 -10
    correct debug printouts

  storage/ndb/src/kernel/blocks/backup/BackupInit.cpp
    1.27 07/06/05 17:29:32 tomas@stripped +5 -2
    read odirect config and allocate aligned

  storage/ndb/src/kernel/blocks/backup/BackupFormat.hpp
    1.10 07/06/05 17:29:32 tomas@stripped +9 -1
    add empty_record in file format

  storage/ndb/src/kernel/blocks/backup/Backup.hpp
    1.28 07/06/05 17:29:32 tomas@stripped +2 -1
    odirect and padding options

  storage/ndb/src/kernel/blocks/backup/Backup.cpp
    1.64 07/06/05 17:29:32 tomas@stripped +42 -7
    read odirect config param
    open LCP and Backup datafiles with odirect if specified
    insert empty padding record if odirect is used
    allocate buffers aligned to be able to use odirect

  storage/ndb/include/ndb_global.h.in
    1.17 07/06/05 17:29:31 tomas@stripped +2 -0
    specify alignment needed for odirect

  storage/ndb/include/mgmapi/mgmapi_config_parameters.h
    1.33 07/06/05 17:29:31 tomas@stripped +2 -0
    add new config parameter to choose ODirect

  mysql-test/ndb/ndb_config_2_node.ini
    1.21 07/06/05 17:29:31 tomas@stripped +1 -0
    run mysql-test-run using ODirect

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User:	tomas
# Host:	poseidon.mysql.com
# Root:	/home/tomas/mysql-5.1-telco-gca

--- 1.13/storage/ndb/src/kernel/blocks/restore.cpp	2007-03-13 12:38:45 +01:00
+++ 1.14/storage/ndb/src/kernel/blocks/restore.cpp	2007-06-05 17:29:35 +02:00
@@ -559,6 +559,9 @@ Restore::restore_next(Signal* signal, Fi
       case BackupFormat::GCP_ENTRY:
 	parse_gcp_entry(signal, file_ptr, data, len);
 	break;
+      case BackupFormat::EMPTY_ENTRY:
+        // skip
+        break;
       case 0x4e444242: // 'NDBB'
 	if (check_file_version(signal, ntohl(* (data+2))) == 0)
 	{

--- 1.32/storage/ndb/include/mgmapi/mgmapi_config_parameters.h	2007-06-05 17:06:23 +02:00
+++ 1.33/storage/ndb/include/mgmapi/mgmapi_config_parameters.h	2007-06-05 17:29:31 +02:00
@@ -115,6 +115,8 @@
 
 #define CFG_DB_MEMREPORT_FREQUENCY    166
 
+#define CFG_DB_O_DIRECT               168
+
 #define CFG_DB_SGA                    198 /* super pool mem */
 #define CFG_DB_DATA_MEM_2             199 /* used in special build in 5.1 */
 

--- 1.16/storage/ndb/include/ndb_global.h.in	2006-12-23 20:20:02 +01:00
+++ 1.17/storage/ndb/include/ndb_global.h.in	2007-06-05 17:29:31 +02:00
@@ -146,4 +146,6 @@ extern "C" {
 #define MAX(x,y) (((x)>(y))?(x):(y))
 #endif
 
+#define NDB_O_DIRECT_WRITE_ALIGNMENT 512
+
 #endif

--- 1.98/storage/ndb/src/mgmsrv/ConfigInfo.cpp	2007-06-05 17:06:24 +02:00
+++ 1.99/storage/ndb/src/mgmsrv/ConfigInfo.cpp	2007-06-05 17:29:35 +02:00
@@ -1313,6 +1313,18 @@ const ConfigInfo::ParamInfo ConfigInfo::
     "0",
     STR_VALUE(MAX_INT_RNIL) },
   
+  {
+    CFG_DB_O_DIRECT,
+    "ODirect",
+    DB_TOKEN,
+    "Use O_DIRECT file write/read when possible",
+    ConfigInfo::CI_USED,
+    true,
+    ConfigInfo::CI_BOOL,
+    "false",
+    "false",
+    "true"},
+
   /***************************************************************************
    * API
    ***************************************************************************/

--- 1.63/storage/ndb/src/kernel/blocks/backup/Backup.cpp	2007-05-22 17:53:04 +02:00
+++ 1.64/storage/ndb/src/kernel/blocks/backup/Backup.cpp	2007-06-05 17:29:32 +02:00
@@ -2761,6 +2761,8 @@ Backup::openFiles(Signal* signal, Backup
   c_backupFilePool.getPtr(filePtr, ptr.p->dataFilePtr);
   filePtr.p->m_flags |= BackupFile::BF_OPENING;
 
+  if (c_defaults.m_o_direct)
+    req->fileFlags |= FsOpenReq::OM_DIRECT;
   req->userPointer = filePtr.i;
   FsOpenReq::setVersion(req->fileNumber, 2);
   FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_DATA);
@@ -3735,12 +3737,31 @@ Backup::OperationRecord::newFragment(Uin
 }
 
 bool
-Backup::OperationRecord::fragComplete(Uint32 tableId, Uint32 fragNo)
+Backup::OperationRecord::fragComplete(Uint32 tableId, Uint32 fragNo, bool fill_record)
 {
   Uint32 * tmp;
   const Uint32 footSz = sizeof(BackupFormat::DataFile::FragmentFooter) >> 2;
+  Uint32 sz = footSz + 1;
 
-  if(dataBuffer.getWritePtr(&tmp, footSz + 1)) {
+  if (fill_record)
+  {
+    Uint32 * new_tmp;
+    if (!dataBuffer.getWritePtr(&tmp, sz))
+      return false;
+    new_tmp = tmp + sz;
+
+    if ((UintPtr)new_tmp & (sizeof(Page32)-1))
+    {
+      /* padding is needed to get full write */
+      new_tmp += 2 /* to fit empty header minimum 2 words*/;
+      new_tmp = (Uint32 *)(((UintPtr)new_tmp + sizeof(Page32)-1) &
+                            ~(UintPtr)(sizeof(Page32)-1));
+      /* new write sz */
+      sz = new_tmp - tmp;
+    }
+  }
+
+  if(dataBuffer.getWritePtr(&tmp, sz)) {
     jam();
     * tmp = 0; // Finish record stream
     tmp++;
@@ -3752,7 +3773,17 @@ Backup::OperationRecord::fragComplete(Ui
     foot->FragmentNo    = htonl(fragNo);
     foot->NoOfRecords   = htonl(noOfRecords);
     foot->Checksum      = htonl(0);
-    dataBuffer.updateWritePtr(footSz + 1);
+
+    if (sz != footSz + 1)
+    {
+      tmp += footSz;
+      memset(tmp, 0, (sz - footSz - 1) * 4);
+      *tmp = htonl(BackupFormat::EMPTY_ENTRY);
+      tmp++;
+      *tmp = htonl(sz - footSz - 1);
+    }
+
+    dataBuffer.updateWritePtr(sz);
     return true;
   }//if
   return false;
@@ -3854,8 +3885,13 @@ Backup::fragmentCompleted(Signal* signal
     return;
   }//if
     
+  BackupRecordPtr ptr LINT_SET_PTR;
+  c_backupPool.getPtr(ptr, filePtr.p->backupPtr);
+
   OperationRecord & op = filePtr.p->operation;
-  if(!op.fragComplete(filePtr.p->tableId, filePtr.p->fragmentNo)) {
+  if(!op.fragComplete(filePtr.p->tableId, filePtr.p->fragmentNo,
+                      c_defaults.m_o_direct))
+  {
     jam();
     signal->theData[0] = BackupContinueB::BUFFER_FULL_FRAG_COMPLETE;
     signal->theData[1] = filePtr.i;
@@ -3865,9 +3901,6 @@ Backup::fragmentCompleted(Signal* signal
   
   filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_SCAN_THREAD;
   
-  BackupRecordPtr ptr LINT_SET_PTR;
-  c_backupPool.getPtr(ptr, filePtr.p->backupPtr);
-
   if (ptr.p->is_lcp())
   {
     ptr.p->slaveState.setState(STOPPING);
@@ -4905,6 +4938,8 @@ Backup::lcp_open_file(Signal* signal, Ba
     FsOpenReq::OM_CREATE | 
     FsOpenReq::OM_APPEND |
     FsOpenReq::OM_AUTOSYNC;
+  if (c_defaults.m_o_direct)
+    req->fileFlags |= FsOpenReq::OM_DIRECT;
   FsOpenReq::v2_setCount(req->fileNumber, 0xFFFFFFFF);
   req->auto_sync_size = c_defaults.m_disk_synch_size;
   

--- 1.27/storage/ndb/src/kernel/blocks/backup/Backup.hpp	2006-12-23 20:20:15 +01:00
+++ 1.28/storage/ndb/src/kernel/blocks/backup/Backup.hpp	2007-06-05 17:29:32 +02:00
@@ -240,7 +240,7 @@ public:
      * Once per fragment
      */
     bool newFragment(Uint32 tableId, Uint32 fragNo);
-    bool fragComplete(Uint32 tableId, Uint32 fragNo);
+    bool fragComplete(Uint32 tableId, Uint32 fragNo, bool fill_record);
     
     /**
      * Once per scan frag (next) req/conf
@@ -534,6 +534,7 @@ public:
     Uint32 m_disk_write_speed;
     Uint32 m_disk_synch_size;
     Uint32 m_diskless;
+    Uint32 m_o_direct;
   };
   
   /**

--- 1.9/storage/ndb/src/kernel/blocks/backup/BackupFormat.hpp	2006-12-23 20:20:15 +01:00
+++ 1.10/storage/ndb/src/kernel/blocks/backup/BackupFormat.hpp	2007-06-05 17:29:32 +02:00
@@ -32,7 +32,8 @@ struct BackupFormat {
     TABLE_LIST        = 4,
     TABLE_DESCRIPTION = 5,
     GCP_ENTRY         = 6,
-    FRAGMENT_INFO     = 7
+    FRAGMENT_INFO     = 7,
+    EMPTY_ENTRY       = 8
   };
 
   struct FileHeader {
@@ -92,6 +93,13 @@ struct BackupFormat {
       Uint32 FragmentNo;
       Uint32 NoOfRecords;
       Uint32 Checksum;
+    };
+
+    /* optional padding for O_DIRECT */
+    struct EmptyEntry {
+      Uint32 SectionType;
+      Uint32 SectionLength;
+      /* not used data */
     };
   };
 

--- 1.26/storage/ndb/src/kernel/blocks/backup/BackupInit.cpp	2006-12-23 20:20:15 +01:00
+++ 1.27/storage/ndb/src/kernel/blocks/backup/BackupInit.cpp	2007-06-05 17:29:32 +02:00
@@ -148,10 +148,13 @@ Backup::execREAD_CONFIG_REQ(Signal* sign
   c_defaults.m_disk_write_speed = 10 * (1024 * 1024);
   c_defaults.m_disk_write_speed_sr = 100 * (1024 * 1024);
   c_defaults.m_disk_synch_size = 4 * (1024 * 1024);
-  
+  c_defaults.m_o_direct = true;
+
   Uint32 noBackups = 0, noTables = 0, noAttribs = 0, noFrags = 0;
   ndbrequire(!ndb_mgm_get_int_parameter(p, CFG_DB_DISCLESS, 
 					&c_defaults.m_diskless));
+  ndb_mgm_get_int_parameter(p, CFG_DB_O_DIRECT,
+                            &c_defaults.m_o_direct);
   ndb_mgm_get_int_parameter(p, CFG_DB_CHECKPOINT_SPEED_SR,
 			    &c_defaults.m_disk_write_speed_sr);
   ndb_mgm_get_int_parameter(p, CFG_DB_CHECKPOINT_SPEED,
@@ -204,7 +207,7 @@ Backup::execREAD_CONFIG_REQ(Signal* sign
     / sizeof(Page32);
   // We need to allocate an additional of 2 pages. 1 page because of a bug in
   // ArrayPool and another one for DICTTAINFO.
-  c_pagePool.setSize(noPages + NO_OF_PAGES_META_FILE + 2); 
+  c_pagePool.setSize(noPages + NO_OF_PAGES_META_FILE + 2, true); 
   
   { // Init all tables
     SLList<Table> tables(c_tablePool);

--- 1.8/storage/ndb/src/kernel/blocks/backup/FsBuffer.hpp	2006-12-23 20:20:15 +01:00
+++ 1.9/storage/ndb/src/kernel/blocks/backup/FsBuffer.hpp	2007-06-05 17:29:32 +02:00
@@ -270,8 +270,8 @@ FsBuffer::getReadPtr(Uint32 ** ptr, Uint
     
     * ptr = &Tp[Tr];
 
-    DEBUG(ndbout_c("getReadPtr() Tr: %d Tw: %d Ts: %d Tm: %d sz1: %d -> %d",
-		   Tr, Tw, Ts, Tm, sz1, * sz));
+    DEBUG(ndbout_c("getReadPtr() Tr: %d Tmw: %d Ts: %d Tm: %d sz1: %d -> %d",
+		   Tr, Tmw, Ts, Tm, sz1, * sz));
 
     return true;
   }
@@ -279,8 +279,8 @@ FsBuffer::getReadPtr(Uint32 ** ptr, Uint
   if(!m_eof){
     * _eof = false;
     
-    DEBUG(ndbout_c("getReadPtr() Tr: %d Tw: %d Ts: %d Tm: %d sz1: %d -> false",
-		   Tr, Tw, Ts, Tm, sz1));
+    DEBUG(ndbout_c("getReadPtr() Tr: %d Tmw: %d Ts: %d Tm: %d sz1: %d -> false",
+		   Tr, Tmw, Ts, Tm, sz1));
     
     return false;
   }
@@ -289,8 +289,8 @@ FsBuffer::getReadPtr(Uint32 ** ptr, Uint
   * _eof = true;
   * ptr = &Tp[Tr];
 
-  DEBUG(ndbout_c("getReadPtr() Tr: %d Tw: %d Ts: %d Tm: %d sz1: %d -> %d eof",
-		 Tr, Tw, Ts, Tm, sz1, * sz));
+  DEBUG(ndbout_c("getReadPtr() Tr: %d Tmw: %d Ts: %d Tm: %d sz1: %d -> %d eof",
+		 Tr, Tmw, Ts, Tm, sz1, * sz));
   
   return false;
 }
@@ -316,13 +316,13 @@ FsBuffer::getWritePtr(Uint32 ** ptr, Uin
   if(sz1 > sz){ // Note at least 1 word of slack
     * ptr = &Tp[Tw];
 
-    DEBUG(ndbout_c("getWritePtr(%d) Tr: %d Tw: %d Ts: %d sz1: %d -> true",
-		   sz, Tr, Tw, Ts, sz1));
+    DEBUG(ndbout_c("getWritePtr(%d) Tw: %d sz1: %d -> true",
+		   sz, Tw, sz1));
     return true;
   }
 
-  DEBUG(ndbout_c("getWritePtr(%d) Tr: %d Tw: %d Ts: %d sz1: %d -> false",
-		 sz, Tr, Tw, Ts, sz1));
+  DEBUG(ndbout_c("getWritePtr(%d) Tw: %d sz1: %d -> false",
+		 sz, Tw, sz1));
 
   return false;
 }
@@ -339,11 +339,15 @@ FsBuffer::updateWritePtr(Uint32 sz){
   m_free -= sz;
   if(Tnew < Ts){
     m_writeIndex = Tnew;
+    DEBUG(ndbout_c("updateWritePtr(%d) m_writeIndex: %d",
+                   sz, m_writeIndex));
     return;
   }
 
   memcpy(Tp, &Tp[Ts], (Tnew - Ts) << 2);
   m_writeIndex = Tnew - Ts;
+  DEBUG(ndbout_c("updateWritePtr(%d) m_writeIndex: %d",
+                 sz, m_writeIndex));
 }
 
 inline

--- 1.45/storage/ndb/tools/restore/Restore.cpp	2007-03-01 02:40:13 +01:00
+++ 1.46/storage/ndb/tools/restore/Restore.cpp	2007-06-05 17:29:35 +02:00
@@ -867,13 +867,32 @@ bool RestoreDataIterator::readFragmentHe
   
   debug << "RestoreDataIterator::getNextFragment" << endl;
   
-  if (buffer_read(&Header, sizeof(Header), 1) != 1){
+  while (1)
+  {
+    /* read first part of header */
+    if (buffer_read(&Header, 8, 1) != 1)
+    {
+      ret = 0;
+      return false;
+    } // if
+
+    /* skip if EMPTY_ENTRY */
+    Header.SectionType  = ntohl(Header.SectionType);
+    Header.SectionLength  = ntohl(Header.SectionLength);
+    if (Header.SectionType == BackupFormat::EMPTY_ENTRY)
+    {
+      void *tmp;
+      buffer_get_ptr(&tmp, Header.SectionLength*4-8, 1);
+      continue;
+    }
+    break;
+  }
+  /* read rest of header */
+  if (buffer_read(((char*)&Header)+8, sizeof(Header)-8, 1) != 1)
+  {
     ret = 0;
     return false;
-  } // if
-  
-  Header.SectionType  = ntohl(Header.SectionType);
-  Header.SectionLength  = ntohl(Header.SectionLength);
+  }
   Header.TableId  = ntohl(Header.TableId);
   Header.FragmentNo  = ntohl(Header.FragmentNo);
   Header.ChecksumType  = ntohl(Header.ChecksumType);

--- 1.60/storage/ndb/src/kernel/blocks/dblqh/Dblqh.hpp	2007-05-29 07:20:27 +02:00
+++ 1.61/storage/ndb/src/kernel/blocks/dblqh/Dblqh.hpp	2007-06-05 17:29:32 +02:00
@@ -115,9 +115,6 @@ class Dbtup;
 /* ------------------------------------------------------------------------- */
 /*       VARIOUS CONSTANTS USED AS FLAGS TO THE FILE MANAGER.                */
 /* ------------------------------------------------------------------------- */
-#define ZOPEN_READ 0
-#define ZOPEN_WRITE 1
-#define ZOPEN_READ_WRITE 2
 #define ZVAR_NO_LOG_PAGE_WORD 1
 #define ZLIST_OF_PAIRS 0
 #define ZLIST_OF_PAIRS_SYNCH 16
@@ -2686,6 +2683,7 @@ private:
   UintR clfoFileSize;
 
   LogPageRecord *logPageRecord;
+  void *logPageRecordUnaligned;
   LogPageRecordPtr logPagePtr;
   UintR cfirstfreeLogPage;
   UintR clogPageFileSize;
@@ -2889,6 +2887,7 @@ private:
   UintR ctransidHash[1024];
   
   Uint32 c_diskless;
+  Uint32 c_o_direct;
   Uint32 c_error_insert_table_id;
   
 public:

--- 1.23/storage/ndb/src/kernel/blocks/dblqh/DblqhInit.cpp	2007-05-24 16:40:03 +02:00
+++ 1.24/storage/ndb/src/kernel/blocks/dblqh/DblqhInit.cpp	2007-06-05 17:29:32 +02:00
@@ -49,6 +49,7 @@ void Dblqh::initData() 
   logFileRecord = 0;
   logFileOperationRecord = 0;
   logPageRecord = 0;
+  logPageRecordUnaligned= 0;
   pageRefRecord = 0;
   tablerec = 0;
   tcConnectionrec = 0;
@@ -105,10 +106,13 @@ void Dblqh::initRecords() 
 		sizeof(LogFileOperationRecord), 
 		clfoFileSize);
 
-  logPageRecord = (LogPageRecord*)allocRecord("LogPageRecord",
-					      sizeof(LogPageRecord),
-					      clogPageFileSize,
-					      false);
+  logPageRecord =
+    (LogPageRecord*)allocRecordAligned("LogPageRecord",
+                                       sizeof(LogPageRecord),
+                                       clogPageFileSize,
+                                       &logPageRecordUnaligned,
+                                       NDB_O_DIRECT_WRITE_ALIGNMENT,
+                                       false);
 
   pageRefRecord = (PageRefRecord*)allocRecord("PageRefRecord",
 					      sizeof(PageRefRecord),
@@ -378,7 +382,7 @@ Dblqh::~Dblqh() 
 		sizeof(LogFileOperationRecord), 
 		clfoFileSize);
   
-  deallocRecord((void**)&logPageRecord,
+  deallocRecord((void**)&logPageRecordUnaligned,
 		"LogPageRecord",
 		sizeof(LogPageRecord),
 		clogPageFileSize);

--- 1.141/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp	2007-05-29 07:20:27 +02:00
+++ 1.142/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp	2007-06-05 17:29:34 +02:00
@@ -1015,6 +1015,8 @@ void Dblqh::execREAD_CONFIG_REQ(Signal* 
   cmaxAccOps = cscanrecFileSize * MAX_PARALLEL_OP_PER_SCAN;
 
   ndbrequire(!ndb_mgm_get_int_parameter(p, CFG_DB_DISCLESS, &c_diskless));
+  c_o_direct = true;
+  ndb_mgm_get_int_parameter(p, CFG_DB_O_DIRECT, &c_o_direct);
   
   Uint32 tmp= 0;
   ndbrequire(!ndb_mgm_get_int_parameter(p, CFG_LQH_FRAG, &tmp));
@@ -13243,7 +13245,9 @@ void Dblqh::openFileRw(Signal* signal, L
   signal->theData[3] = olfLogFilePtr.p->fileName[1];
   signal->theData[4] = olfLogFilePtr.p->fileName[2];
   signal->theData[5] = olfLogFilePtr.p->fileName[3];
-  signal->theData[6] = ZOPEN_READ_WRITE | FsOpenReq::OM_AUTOSYNC;
+  signal->theData[6] = FsOpenReq::OM_READWRITE | FsOpenReq::OM_AUTOSYNC;
+  if (c_o_direct)
+    signal->theData[6] |= FsOpenReq::OM_DIRECT;
   req->auto_sync_size = MAX_REDO_PAGES_WITHOUT_SYNCH * sizeof(LogPageRecord);
   sendSignal(NDBFS_REF, GSN_FSOPENREQ, signal, FsOpenReq::SignalLength, JBA);
 }//Dblqh::openFileRw()
@@ -13263,7 +13267,9 @@ void Dblqh::openLogfileInit(Signal* sign
   signal->theData[3] = logFilePtr.p->fileName[1];
   signal->theData[4] = logFilePtr.p->fileName[2];
   signal->theData[5] = logFilePtr.p->fileName[3];
-  signal->theData[6] = 0x302 | FsOpenReq::OM_AUTOSYNC;
+  signal->theData[6] = FsOpenReq::OM_READWRITE | FsOpenReq::OM_TRUNCATE |
FsOpenReq::OM_CREATE | FsOpenReq::OM_AUTOSYNC;
+  if (c_o_direct)
+    signal->theData[6] |= FsOpenReq::OM_DIRECT;
   req->auto_sync_size = MAX_REDO_PAGES_WITHOUT_SYNCH * sizeof(LogPageRecord);
   sendSignal(NDBFS_REF, GSN_FSOPENREQ, signal, FsOpenReq::SignalLength, JBA);
 }//Dblqh::openLogfileInit()
@@ -13299,7 +13305,9 @@ void Dblqh::openNextLogfile(Signal* sign
     signal->theData[3] = onlLogFilePtr.p->fileName[1];
     signal->theData[4] = onlLogFilePtr.p->fileName[2];
     signal->theData[5] = onlLogFilePtr.p->fileName[3];
-    signal->theData[6] = 2 | FsOpenReq::OM_AUTOSYNC;
+    signal->theData[6] = FsOpenReq::OM_READWRITE | FsOpenReq::OM_AUTOSYNC;
+    if (c_o_direct)
+      signal->theData[6] |= FsOpenReq::OM_DIRECT;
     req->auto_sync_size = MAX_REDO_PAGES_WITHOUT_SYNCH * sizeof(LogPageRecord);
     sendSignal(NDBFS_REF, GSN_FSOPENREQ, signal, FsOpenReq::SignalLength, JBA);
   }//if

--- 1.38/storage/ndb/src/kernel/blocks/ndbfs/AsyncFile.cpp	2006-12-23 20:20:18 +01:00
+++ 1.39/storage/ndb/src/kernel/blocks/ndbfs/AsyncFile.cpp	2007-06-05 17:29:34 +02:00
@@ -163,7 +163,12 @@ AsyncFile::run()
   theStartFlag = true;
   // Create write buffer for bigger writes
   theWriteBufferSize = WRITEBUFFERSIZE;
-  theWriteBuffer = (char *) ndbd_malloc(theWriteBufferSize); 
+  theWriteBufferUnaligned = (char *) ndbd_malloc(theWriteBufferSize +
+                                                 NDB_O_DIRECT_WRITE_ALIGNMENT-1);
+  theWriteBuffer = (char *)
+    (((UintPtr)theWriteBufferUnaligned + NDB_O_DIRECT_WRITE_ALIGNMENT - 1) &
+     ~(UintPtr)(NDB_O_DIRECT_WRITE_ALIGNMENT - 1));
+
   NdbMutex_Unlock(theStartMutexPtr);
   NdbCondition_Signal(theStartConditionPtr);
   
@@ -247,6 +252,78 @@ AsyncFile::run()
 static char g_odirect_readbuf[2*GLOBAL_PAGE_SIZE -1];
 #endif
 
+int
+AsyncFile::check_odirect_write(Uint32 flags, int& new_flags, int mode)
+{
+  assert(new_flags & (O_CREAT | O_TRUNC));
+#ifdef O_DIRECT
+  int ret;
+  char * bufptr = (char*)((UintPtr(g_odirect_readbuf)+(GLOBAL_PAGE_SIZE - 1)) &
~(GLOBAL_PAGE_SIZE - 1));
+  while (((ret = ::write(theFd, bufptr, GLOBAL_PAGE_SIZE)) == -1) && 
+         (errno == EINTR));
+  if (ret == -1)
+  {
+    new_flags &= ~O_DIRECT;
+    ndbout_c("%s Failed to write using O_DIRECT, disabling", 
+             theFileName.c_str());
+  }
+  
+  close(theFd);
+  theFd = ::open(theFileName.c_str(), new_flags, mode);
+  if (theFd == -1)
+    return errno;
+#endif
+
+  return 0;
+}
+
+int
+AsyncFile::check_odirect_read(Uint32 flags, int &new_flags, int mode)
+{
+#ifdef O_DIRECT
+  int ret;
+  char * bufptr = (char*)((UintPtr(g_odirect_readbuf)+(GLOBAL_PAGE_SIZE - 1)) &
~(GLOBAL_PAGE_SIZE - 1));
+  while (((ret = ::read(theFd, bufptr, GLOBAL_PAGE_SIZE)) == -1) && 
+         (errno == EINTR));
+  if (ret == -1)
+  {
+    ndbout_c("%s Failed to read using O_DIRECT, disabling", 
+             theFileName.c_str());
+    goto reopen;
+  }
+  
+  if(lseek(theFd, 0, SEEK_SET) != 0)
+  {
+    return errno;
+  }
+  
+  if ((flags & FsOpenReq::OM_CHECK_SIZE) == 0)
+  {
+    struct stat buf;
+    if ((fstat(theFd, &buf) == -1))
+    {
+      return errno;
+    } 
+    else if ((buf.st_size % GLOBAL_PAGE_SIZE) != 0)
+    {
+      ndbout_c("%s filesize not a multiple of %d, disabling O_DIRECT", 
+               theFileName.c_str(), GLOBAL_PAGE_SIZE);
+      goto reopen;
+    }
+  }
+  
+  return 0;
+  
+reopen:
+  close(theFd);
+  new_flags &= ~O_DIRECT;
+  theFd = ::open(theFileName.c_str(), new_flags, mode);
+  if (theFd == -1)
+    return errno;  
+#endif
+  return 0;
+}
+
 void AsyncFile::openReq(Request* request)
 {  
   m_auto_sync_freq = 0;
@@ -312,7 +389,7 @@ void AsyncFile::openReq(Request* request
   }
 #else
   Uint32 flags = request->par.open.flags;
-  Uint32 new_flags = 0;
+  int new_flags = 0;
 
   // Convert file open flags from Solaris to Liux
   if (flags & FsOpenReq::OM_CREATE)
@@ -343,10 +420,6 @@ void AsyncFile::openReq(Request* request
   {
     new_flags |= O_DIRECT;
   }
-#elif defined O_SYNC
-  {
-    flags |= FsOpenReq::OM_SYNC;
-  }
 #endif
   
   if ((flags & FsOpenReq::OM_SYNC) && ! (flags & FsOpenReq::OM_INIT))
@@ -355,15 +428,19 @@ void AsyncFile::openReq(Request* request
     new_flags |= O_SYNC;
 #endif
   }
-    
+
+  const char * rw = "";
   switch(flags & 0x3){
   case FsOpenReq::OM_READONLY:
+    rw = "r";
     new_flags |= O_RDONLY;
     break;
   case FsOpenReq::OM_WRITEONLY:
+    rw = "w";
     new_flags |= O_WRONLY;
     break;
   case FsOpenReq::OM_READWRITE:
+    rw = "rw";
     new_flags |= O_RDWR;
     break;
   default:
@@ -404,11 +481,6 @@ no_odirect:
 	if (new_flags & O_DIRECT)
 	{
 	  new_flags &= ~O_DIRECT;
-	  flags |= FsOpenReq::OM_SYNC;
-#ifdef O_SYNC
-	  if (! (flags & FsOpenReq::OM_INIT))
-	    new_flags |= O_SYNC;
-#endif
 	  goto no_odirect;
 	}
 #endif
@@ -421,11 +493,6 @@ no_odirect:
     else if (new_flags & O_DIRECT)
     {
       new_flags &= ~O_DIRECT;
-      flags |= FsOpenReq::OM_SYNC;
-#ifdef O_SYNC
-      if (! (flags & FsOpenReq::OM_INIT))
-	new_flags |= O_SYNC;
-#endif
       goto no_odirect;
     }
 #endif
@@ -512,7 +579,6 @@ no_odirect:
 	{
 	  ndbout_c("error on first write(%d), disable O_DIRECT", err);
 	  new_flags &= ~O_DIRECT;
-	  flags |= FsOpenReq::OM_SYNC;
 	  close(theFd);
 	  theFd = ::open(theFileName.c_str(), new_flags, mode);
 	  if (theFd != -1)
@@ -532,26 +598,32 @@ no_odirect:
   else if (flags & FsOpenReq::OM_DIRECT)
   {
 #ifdef O_DIRECT
-    do {
-      int ret;
-      char * bufptr = (char*)((UintPtr(g_odirect_readbuf)+(GLOBAL_PAGE_SIZE - 1)) &
~(GLOBAL_PAGE_SIZE - 1));
-      while (((ret = ::read(theFd, bufptr, GLOBAL_PAGE_SIZE)) == -1) && (errno ==
EINTR));
-      if (ret == -1)
-      {
-	ndbout_c("%s Failed to read using O_DIRECT, disabling", theFileName.c_str());
-	flags |= FsOpenReq::OM_SYNC;
-	flags |= FsOpenReq::OM_INIT;
-	break;
-      }
-      if(lseek(theFd, 0, SEEK_SET) != 0)
-      {
-	request->error = errno;
-	return;
-      }
-    } while (0);
+    if (flags & (FsOpenReq::OM_TRUNCATE | FsOpenReq::OM_CREATE))
+    {
+      request->error = check_odirect_write(flags, new_flags, mode);
+    }
+    else
+    {
+      request->error = check_odirect_read(flags, new_flags, mode);
+    }
+    
+    if (request->error)
+      return;
 #endif
   }
-
+#ifdef VM_TRACE
+  if (flags & FsOpenReq::OM_DIRECT)
+  {
+#ifdef O_DIRECT
+    ndbout_c("%s %s O_DIRECT: %d",
+             theFileName.c_str(), rw,
+             !!(new_flags & O_DIRECT));
+#else
+    ndbout_c("%s %s O_DIRECT: 0",
+             theFileName.c_str(), rw);
+#endif
+  }
+#endif  
   if ((flags & FsOpenReq::OM_SYNC) && (flags & FsOpenReq::OM_INIT))
   {
 #ifdef O_SYNC
@@ -562,6 +634,10 @@ no_odirect:
     new_flags &= ~(O_CREAT | O_TRUNC);
     new_flags |= O_SYNC;
     theFd = ::open(theFileName.c_str(), new_flags, mode);
+    if (theFd == -1)
+    {
+      request->error = errno;
+    }
 #endif
   }
 #endif
@@ -1079,7 +1155,8 @@ AsyncFile::rmrfReq(Request * request, ch
 void AsyncFile::endReq()
 {
   // Thread is ended with return
-  if (theWriteBuffer) ndbd_free(theWriteBuffer, theWriteBufferSize);
+  if (theWriteBufferUnaligned)
+    ndbd_free(theWriteBufferUnaligned, theWriteBufferSize);
 }
 
 

--- 1.9/storage/ndb/src/kernel/blocks/ndbfs/AsyncFile.hpp	2006-12-23 20:20:18 +01:00
+++ 1.10/storage/ndb/src/kernel/blocks/ndbfs/AsyncFile.hpp	2007-06-05 17:29:34 +02:00
@@ -232,9 +232,13 @@ private:
   bool   theStartFlag;
   int theWriteBufferSize;
   char* theWriteBuffer;
+  void* theWriteBufferUnaligned;
   
   size_t m_write_wo_sync;  // Writes wo/ sync
   size_t m_auto_sync_freq; // Auto sync freq in bytes
+
+  int check_odirect_read(Uint32 flags, int&new_flags, int mode);
+  int check_odirect_write(Uint32 flags, int&new_flags, int mode);
 public:
   SimulatedBlock& m_fs;
   Ptr<GlobalPage> m_page_ptr;

--- 1.40/storage/ndb/src/kernel/vm/SimulatedBlock.cpp	2007-06-05 17:06:24 +02:00
+++ 1.41/storage/ndb/src/kernel/vm/SimulatedBlock.cpp	2007-06-05 17:29:35 +02:00
@@ -39,6 +39,9 @@
 #include <AttributeDescriptor.hpp>
 #include <NdbSqlUtil.hpp>
 
+#include <EventLogger.hpp>
+extern EventLogger g_eventLogger;
+
 #define ljamEntry() jamEntryLine(30000 + __LINE__)
 #define ljam() jamLine(30000 + __LINE__)
 
@@ -656,13 +659,19 @@ SimulatedBlock::getBatSize(Uint16 blockN
   return sb->theBATSize;
 }
 
+void* SimulatedBlock::allocRecord(const char * type, size_t s, size_t n, bool clear,
Uint32 paramId)
+{
+  return allocRecordAligned(type, s, n, 0, 0, clear, paramId);
+}
+
 void* 
-SimulatedBlock::allocRecord(const char * type, size_t s, size_t n, bool clear, Uint32
paramId) 
+SimulatedBlock::allocRecordAligned(const char * type, size_t s, size_t n, void
**unaligned_buffer, Uint32 align, bool clear, Uint32 paramId)
 {
 
   void * p = NULL;
-  size_t size = n*s;
-  Uint64 real_size = (Uint64)((Uint64)n)*((Uint64)s);
+  Uint32 over_alloc = unaligned_buffer ? (align - 1) : 0;
+  size_t size = n*s + over_alloc;
+  Uint64 real_size = (Uint64)((Uint64)n)*((Uint64)s) + over_alloc;
   refresh_watch_dog(9);
   if (real_size > 0){
 #ifdef VM_TRACE_MEM
@@ -704,6 +713,16 @@ SimulatedBlock::allocRecord(const char *
       }
       refresh_watch_dog(9);
       memset(ptr, 0, size);
+    }
+    if (unaligned_buffer)
+    {
+      *unaligned_buffer = p;
+      p = (void *)(((UintPtr)p + over_alloc) & ~(UintPtr)(over_alloc));
+#ifdef VM_TRACE
+      g_eventLogger.info("'%s' (%u) %llu %llu, alignment correction %u bytes",
+                         type, align, (Uint64)p, (Uint64)p+n*s,
+                         (Uint32)((UintPtr)p - (UintPtr)*unaligned_buffer));
+#endif
     }
   }
   return p;

--- 1.30/storage/ndb/src/kernel/vm/SimulatedBlock.hpp	2007-06-05 17:06:24 +02:00
+++ 1.31/storage/ndb/src/kernel/vm/SimulatedBlock.hpp	2007-06-05 17:29:35 +02:00
@@ -378,6 +378,7 @@ protected:
    *
    */
   void* allocRecord(const char * type, size_t s, size_t n, bool clear = true, Uint32
paramId = 0);
+  void* allocRecordAligned(const char * type, size_t s, size_t n, void
**unaligned_buffer, Uint32 align = NDB_O_DIRECT_WRITE_ALIGNMENT, bool clear = true,
Uint32 paramId = 0);
   
   /**
    * Deallocate record

--- 1.20/mysql-test/ndb/ndb_config_2_node.ini	2006-08-21 07:53:25 +02:00
+++ 1.21/mysql-test/ndb/ndb_config_2_node.ini	2007-06-05 17:29:31 +02:00
@@ -12,6 +12,7 @@ MaxNoOfAttributes= CHOOSE_MaxNoOfAttribu
 TimeBetweenGlobalCheckpoints= 500
 NoOfFragmentLogFiles= 3
 DiskPageBufferMemory= CHOOSE_DiskPageBufferMemory
+ODirect= 1
 # the following parametes just function as a small regression
 # test that the parameter exists
 InitialNoOfOpenFiles= 27
Thread
bk commit into 5.1 tree (tomas:1.2490) BUG#28751tomas5 Jun