List: Commits    « Previous Message | Next Message »
From: knielsen    Date: November 1 2007 11:30am
Subject:bk commit into 5.1 tree (knielsen:1.2614)
View as plain text  
Below is the list of changes that have just been committed into a local
5.1 repository of knielsen. When knielsen does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet@stripped, 2007-11-01 11:30:33+01:00, knielsen@ymer.(none) +1 -0
  WL#1498: Multi-threaded ndbd
  
  Add some prefetching found with OProfile to decrease cache misses.

  storage/ndb/src/kernel/vm/mt/mt.cpp@stripped, 2007-11-01 11:30:29+01:00, knielsen@ymer.(none) +24 -3
    Add some prefetching found with OProfile to decrease cache misses.

diff -Nrup a/storage/ndb/src/kernel/vm/mt/mt.cpp b/storage/ndb/src/kernel/vm/mt/mt.cpp
--- a/storage/ndb/src/kernel/vm/mt/mt.cpp	2007-10-30 11:34:36 +01:00
+++ b/storage/ndb/src/kernel/vm/mt/mt.cpp	2007-11-01 11:30:29 +01:00
@@ -33,6 +33,7 @@
 #include <sys/syscall.h>
 #include <sys/types.h>
 
+//#define memcpy __builtin_memcpy
 
 /* Constants found by benchmarks to be reasonable values. */
 
@@ -977,11 +978,21 @@ execute_signals(thr_data *selfptr, thr_j
       }
     }
 
+    /*
+     * These prefetches were found using OProfile to reduce cache misses.
+     * (Though on Intel Core 2, they do not give much speedup, as apparently
+     * the hardware prefetcher is already doing a fairly good job).
+     */
+    __builtin_prefetch (read_buffer->m_data + read_pos + 16, 0, 3);
+    __builtin_prefetch ((Uint32 *)&sig->header + 16, 1, 3);
+
     /* Now execute the signal. */
     SignalHeader* s =
       reinterpret_cast<SignalHeader*>(&(read_buffer->m_data[read_pos]));
     Uint32 seccnt = s->m_noOfSections;
     Uint32 siglen = (sizeof(*s)>>2) + s->theLength;
+    if(siglen>16)
+      __builtin_prefetch (read_buffer->m_data + read_pos + 32, 0, 3);
     Uint32 bno = s->theReceiversBlockNumber;
     Uint32 gsn = s->theVerId_signalNumber;
     SimulatedBlock * block = blockptr[bno];
@@ -1077,7 +1088,8 @@ extern "C"
 void *
 mt_thr_main(void *thr_arg)
 {
-  Signal signal;
+  unsigned char signal_buf[sizeof(Signal) + 63 + 256 * MAX_THREADS];
+  Signal *signal;
   struct timespec nowait;
   nowait.tv_sec = 0;
   nowait.tv_nsec = 10 * 1000000;
@@ -1100,6 +1112,15 @@ mt_thr_main(void *thr_arg)
     the numeric thread id in this slightly backwards way.
   */
   unsigned thr_no = selfptr - &(rep->m_thread[0]);
+  /*
+   * Align signal buffer for better cache performance.
+   * Also skew it a little for each thread to avoid cache pollution.
+   */
+  UintPtr sigtmp= (UintPtr)signal_buf;
+  sigtmp= (sigtmp+63) & (~(UintPtr)63);
+  sigtmp+= thr_no*256;
+  signal = (Signal *)sigtmp;
+
   pid_t tid = (unsigned)syscall(SYS_gettid);
   ndbout_c("Tread %u started, tid=%u", thr_no, tid);
 #ifdef NDB_MT_LOCK_TO_CPU
@@ -1139,11 +1160,11 @@ mt_thr_main(void *thr_arg)
     watchDogCounter = 2;
     scan_time_queues(selfptr);
 
-    Uint32 sum = run_job_buffers(selfptr, &signal,
+    Uint32 sum = run_job_buffers(selfptr, signal,
                                  &watchDogCounter, &thrSignalId);
     
     watchDogCounter = 1;
-    sendpacked(selfptr, &signal, thr_no);
+    sendpacked(selfptr, signal, thr_no);
     
     if (sum)
     {
Thread:
bk commit into 5.1 tree (knielsen:1.2614) - knielsen, 1 Nov