List: Commits
From: Mattias Jonsson  Date: October 13 2009 2:59pm
Subject: bzr commit into mysql-5.1-bugteam branch (mattias.jonsson:3160) Bug#47261
#At file:///Users/mattiasj/clones/bzrroot/b47261-51-bugteam/ based on revid:v.narayanan@stripped

 3160 Mattias Jonsson	2009-10-13
      Bug#47261: 5.1 fix for Partitioning performance drop...
      with hundreds of partitions
      
      One problem that can be solved in 5.1 is that the sorting of locks
      is not partitioning aware: every partition has its own lock, and
      within a partitioned table the locks are already stored in
      partition order, but sort_locks sorted them as independent locks.
      
      The solution is to group the already sorted locks together and to
      sort only by the first lock of each group.
      
      As a part of this change I also changed the sorting algorithm from
      an O(n^2) insertion sort to O(n log n) by using my_qsort instead.
      The resulting order remains the same. As an illustration, locking
      10 tables with 100 partitions each gives 1000 lock elements: the
      old sort may need on the order of n^2/2 = 500,000 comparisons,
      while the new code only sorts the 10 group heads and then copies
      the 1000 pointers in group order.
     @ include/thr_lock.h
        Bug#47261: 5.1 fix for Partitioning performance drop...
        with hundreds of partitions
        
        Added a flag for grouping locks, for use with partitioning,
        since all locks in a partitioned table are already stored in
        partition order.
     @ mysys/thr_lock.c
        Bug#47261: 5.1 fix for Partitioning performance drop...
        with hundreds of partitions
        
        Split the sorting of locks into two steps: first pick out the
        locks that actually need to be sorted (only the first lock of
        each already sorted group, such as the locks of a partitioned
        table), then sort only those locks and copy the groups out in
        the resulting order.
        
        To ease the ordering, I changed thr_multi_lock to no longer sort
        in place: it reads the first count members and writes the ordered
        locks into the second count members (sql/lock.cc already handled
        the array that way).
     @ sql/ha_partition.cc
        Bug#47261: 5.1 fix for Partitioning performance drop...
        with hundreds of partitions
        
        Added grouping of locks to speed up the sorting of locks for
        heavily partitioned tables, using the fact that the locks within
        a partitioned table are already stored in partition order.
        
        Note the DBUG_ASSERT(prev == (to - 1)): if an engine stores
        several locks per call we cannot ensure the correct order.
     @ sql/lock.cc
        Bug#47261: 5.1 fix for Partitioning performance drop...
        with hundreds of partitions
        
        Since the lock array was already allocated with twice the size,
        I moved the copying of the sorted locks into thr_multi_lock
        instead, to ease the implementation.
        
        (Fixed a typo in a comment too...)

    modified:
      include/thr_lock.h
      mysys/thr_lock.c
      sql/ha_partition.cc
      sql/lock.cc
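
For reference, a minimal standalone sketch of the grouping idea (hypothetical
names, plain libc qsort instead of my_qsort; an illustration only, not part of
the patch below): only the first element of every already sorted group is fed
to the sort, and each group is then copied out as a whole.

#include <stdio.h>
#include <stdlib.h>

typedef struct {
  int key;                 /* stands in for the lock address */
  int have_more_in_group;  /* like have_more_sorted_locks in the patch */
} elem;

typedef struct { elem *head; int index; } group_head;

static int cmp_head(const void *a, const void *b)
{
  const group_head *ga= (const group_head*) a;
  const group_head *gb= (const group_head*) b;
  return (ga->head->key > gb->head->key) - (ga->head->key < gb->head->key);
}

/* Sort 'count' elements that consist of already sorted groups:
   qsort only the group heads, then copy each group as a whole. */
static void group_sort(elem **in, elem **out, int count)
{
  group_head *heads= (group_head*) malloc(count * sizeof(group_head));
  int i, n_heads= 0, j= 0;

  if (!heads)
    return;                               /* sketch only: no error handling */
  /* Pick out only the first element of every pre-sorted group */
  for (i= 0; i < count; i++)
  {
    heads[n_heads].index= i;
    heads[n_heads++].head= in[i];
    while (in[i]->have_more_in_group)
      i++;                                /* skip the rest of the group */
  }
  qsort(heads, n_heads, sizeof(group_head), cmp_head);
  /* Copy each group, in head order, into the output array */
  for (i= 0; i < n_heads; i++)
  {
    int k= heads[i].index;
    out[j++]= in[k];
    while (in[k]->have_more_in_group)
      out[j++]= in[++k];
  }
  free(heads);
}

int main(void)
{
  /* Two pre-sorted groups: (30,31,32) and (10,11) */
  elem e[]= { {30,1},{31,1},{32,0}, {10,1},{11,0} };
  elem *in[5], *out[5];
  int i;
  for (i= 0; i < 5; i++)
    in[i]= &e[i];
  group_sort(in, out, 5);
  for (i= 0; i < 5; i++)
    printf("%d ", out[i]->key);           /* prints: 10 11 30 31 32 */
  putchar('\n');
  return 0;
}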
=== modified file 'include/thr_lock.h'
--- a/include/thr_lock.h	2009-04-14 12:05:32 +0000
+++ b/include/thr_lock.h	2009-10-13 14:59:43 +0000
@@ -119,6 +119,13 @@ typedef struct st_thr_lock_data {
   enum thr_lock_type type;
   void *status_param;			/* Param to status functions */
   void *debug_print_param;
+  /*
+    For already sorted locks, i.e. locks on partitions which are always stored
+    in partition id order, this is set for all but the last lock.
+    This is to be able to sort the whole group only by the first lock of that
+    group of locks.
+  */
+  my_bool have_more_sorted_locks;
 } THR_LOCK_DATA;
 
 struct st_lock_list {

=== modified file 'mysys/thr_lock.c'
--- a/mysys/thr_lock.c	2009-09-29 15:38:40 +0000
+++ b/mysys/thr_lock.c	2009-10-13 14:59:43 +0000
@@ -358,6 +358,7 @@ void thr_lock_data_init(THR_LOCK *lock,T
   data->owner= 0;                               /* no owner yet */
   data->status_param=param;
   data->cond=0;
+  data->have_more_sorted_locks= FALSE;
 }
 
 
@@ -958,49 +959,155 @@ end:
 */
 
 
-#define LOCK_CMP(A,B) ((uchar*) (A->lock) - (uint) ((A)->type) < (uchar*) (B->lock)- (uint) ((B)->type))
+/** Structure to be able to use my_qsort for groups of already sorted locks */
+struct st_sort_node {
+  THR_LOCK_DATA *lock;
+  uint          index;
+};
 
-static void sort_locks(THR_LOCK_DATA **data,uint count)
-{
-  THR_LOCK_DATA **pos,**end,**prev,*tmp;
+typedef struct st_sort_node SORT_NODE;
+
+
+/**
+  Compare two locks
+
+  @param  a  sort_node pointer (including THR_LOCK_DATA pointer)
+  @param  b  sort_node pointer (including THR_LOCK_DATA pointer)
+
+  @return a cmp b
+    @retval -1  a < b
+    @retval  0  a = b
+    @retval  1  a > b
 
-  /* Sort locks with insertion sort (fast because almost always few locks) */
+  @note Sort locks by address (lock position), and on secondary level on type.
+*/
 
-  for (pos=data+1,end=data+count; pos < end ; pos++)
+static int thr_lock_data_cmp(SORT_NODE *a, SORT_NODE *b)
+{
+  THR_LOCK_DATA *al= a->lock;
+  THR_LOCK_DATA *bl= b->lock;
+  /* First sort by lock address (position) */
+  if ((uchar*) al->lock < (uchar*) bl->lock)
+    return -1;
+  else if ((uchar*) al->lock > (uchar*) bl->lock)
+    return 1;
+  else
   {
-    tmp= *pos;
-    if (LOCK_CMP(tmp,pos[-1]))
-    {
-      prev=pos;
-      do {
-	prev[0]=prev[-1];
-      } while (--prev != data && LOCK_CMP(tmp,prev[-1]));
-      prev[0]=tmp;
-    }
+    /* if same lock, sort higher priority (higher type) before lower */
+    if ((uint) al->type > (uint) bl->type)
+      return -1;
+    else if ((uint) al->type < (uint) bl->type)
+      return 1;
   }
+  return 0;
 }
 
 
+
+
+/**
+  Lock multiple locks in a consistent order
+
+  @param  data    pointer to an array of 2*count THR_LOCK_DATA pointers,
+                  where the first count are the locks to lock (read)
+                  and the second count receive the sorted locks (written)
+  @param  count   number of locks to lock
+  @param  owner   owner of the lock
+
+  @return Operation status
+
+  @note Data is read from the first count members, sorted, written to
+  the second count part and then locked in that order.
+*/
+
 enum enum_thr_lock_result
 thr_multi_lock(THR_LOCK_DATA **data, uint count, THR_LOCK_OWNER *owner)
 {
-  THR_LOCK_DATA **pos,**end;
+  THR_LOCK_DATA **pos, **end;
+  THR_LOCK_DATA **sorted_data= data + count;
   DBUG_ENTER("thr_multi_lock");
   DBUG_PRINT("lock",("data: 0x%lx  count: %d", (long) data, count));
+
   if (count > 1)
-    sort_locks(data,count);
+  {
+    SORT_NODE *sort_array;
+    uint i;
+    uint sort_count= 0;
+    /*
+      Prepare an array of pointers to the lock_data's, with only the first
+      lock_data in every group of already sorted lock_data's.
+      I.e. only including the first partition's lock_data in a partitioned
+      table.
+    */
+    sort_array= (SORT_NODE*) my_malloc(count*sizeof(SORT_NODE), MYF(0));
+    if (!sort_array)
+      DBUG_RETURN(THR_LOCK_ABORTED);
+    i= 0;
+    for (pos=data, end=data+count; pos < end ; pos++)
+    {
+      sort_array[sort_count].index= i++;
+      sort_array[sort_count++].lock= *pos;
+
+      /* Skip the rest of the already sorted group of lock_data's */
+      while ((*pos)->have_more_sorted_locks)
+      {
+        i++;
+        pos++;
+      }
+    }
+
+    if (sort_count > 1)
+    {
+      uint j= 0;
+      /* Sort the array of pointers.  */
+      my_qsort(sort_array, sort_count, sizeof(SORT_NODE),
+               (qsort_cmp) thr_lock_data_cmp);
+
+      /* Write the sorted 'data' to 'sorted_data' */
+      for (i= 0; i < sort_count; i++)
+      {
+        DBUG_PRINT("info", ("moving lock %u to %u", sort_array[i].index, j));
+        sorted_data[j++]= sort_array[i].lock;
+
+        /*
+          Copy the rest of the already sorted group of lock_data's
+          after the newly sorted lock_data (which was the first lock_data
+          of the group).
+        */
+        pos= &data[sort_array[i].index];
+        while ((*pos)->have_more_sorted_locks)
+        {
+          DBUG_PRINT("info", ("moving secondary partition lock of %u to %u",
+                              sort_array[i].index, j));
+          sorted_data[j++]= *(++pos);
+        }
+      }
+      DBUG_ASSERT(count == j);
+    }
+    else
+      memcpy(sorted_data, data, count * sizeof(THR_LOCK_DATA*));
+    my_free(sort_array, MYF(0));
+  }
+  else
+    sorted_data[0]= data[0];
+
+#ifdef MAIN
+  for (pos=sorted_data,end=sorted_data+count; pos < end ; pos++)
+    printf("Thread: %s lock order: 0x%lx (0x%lx) type: %d\n",my_thread_name(),
+	   (long) pos[0]->lock, (ulong) pos[0], pos[0]->type); fflush(stdout);
+#endif
   /* lock everything */
-  for (pos=data,end=data+count; pos < end ; pos++)
+  for (pos=sorted_data,end=sorted_data+count; pos < end ; pos++)
   {
     enum enum_thr_lock_result result= thr_lock(*pos, owner, (*pos)->type);
     if (result != THR_LOCK_SUCCESS)
     {						/* Aborted */
-      thr_multi_unlock(data,(uint) (pos-data));
+      thr_multi_unlock(sorted_data,(uint) (pos-sorted_data));
       DBUG_RETURN(result);
     }
 #ifdef MAIN
-    printf("Thread: %s  Got lock: 0x%lx  type: %d\n",my_thread_name(),
-	   (long) pos[0]->lock, pos[0]->type); fflush(stdout);
+    printf("Thread: %s  Got lock: 0x%lx (0x%lx) type: %d\n",my_thread_name(),
+	   (long) pos[0]->lock, (ulong) pos[0], pos[0]->type); fflush(stdout);
 #endif
   }
   /*
@@ -1029,7 +1136,7 @@ thr_multi_lock(THR_LOCK_DATA **data, uin
 	  */
 	  for (;
 	       (*pos)->type <= TL_READ_NO_INSERT &&
-		 pos != data &&
+		 pos != sorted_data &&
 		 pos[-1]->lock == (*pos)->lock ;
 	       pos--) ;
 
@@ -1047,7 +1154,7 @@ thr_multi_lock(THR_LOCK_DATA **data, uin
       }
       else
 	last_lock=(*pos);
-    } while (pos != data);
+    } while (pos != sorted_data);
   }
 #endif
   DBUG_RETURN(THR_LOCK_SUCCESS);
@@ -1064,8 +1171,8 @@ void thr_multi_unlock(THR_LOCK_DATA **da
   for (pos=data,end=data+count; pos < end ; pos++)
   {
 #ifdef MAIN
-    printf("Thread: %s  Rel lock: 0x%lx  type: %d\n",
-	   my_thread_name(), (long) pos[0]->lock, pos[0]->type);
+    printf("Thread: %s  Rel lock: 0x%lx (0x%lx) type: %d\n",
+	   my_thread_name(), (long) pos[0]->lock, (ulong) pos[0], pos[0]->type);
     fflush(stdout);
 #endif
     if ((*pos)->type != TL_UNLOCK)
@@ -1621,7 +1728,7 @@ static void *test_thread(void *arg)
   THR_LOCK_DATA data[MAX_LOCK_COUNT];
   THR_LOCK_OWNER owner;
   THR_LOCK_INFO lock_info;
-  THR_LOCK_DATA *multi_locks[MAX_LOCK_COUNT];
+  THR_LOCK_DATA *multi_locks[2*MAX_LOCK_COUNT];
   my_thread_init();
 
   printf("Thread %s (%d) started\n",my_thread_name(),param); fflush(stdout);
@@ -1630,7 +1737,12 @@ static void *test_thread(void *arg)
   thr_lock_info_init(&lock_info);
   thr_lock_owner_init(&owner, &lock_info);
   for (i=0; i < lock_counts[param] ; i++)
+  {
     thr_lock_data_init(locks+tests[param][i].lock_nr,data+i,NULL);
+    printf("Thread %s (%d) uses lock %u (%u/%u)\n",
+           my_thread_name(), param, tests[param][i].lock_nr,
+           i, lock_counts[param]);
+  }
   for (j=1 ; j < 10 ; j++)		/* try locking 10 times */
   {
     for (i=0; i < lock_counts[param] ; i++)
@@ -1638,9 +1750,14 @@ static void *test_thread(void *arg)
       multi_locks[i]= &data[i];
       data[i].type= tests[param][i].lock_type;
     }
+    printf("Thread %s (%d) trying to get %u locks\n",my_thread_name(),param,
+           lock_counts[param]);
+    fflush(stdout);
     thr_multi_lock(multi_locks, lock_counts[param], &owner);
     pthread_mutex_lock(&LOCK_thread_count);
     {
+      printf("Thread %s (%d) working\n",my_thread_name(),param);
+      fflush(stdout);
       int tmp=rand() & 7;			/* Do something from 0-2 sec */
       if (tmp == 0)
 	sleep(1);
@@ -1654,7 +1771,7 @@ static void *test_thread(void *arg)
       }
     }
     pthread_mutex_unlock(&LOCK_thread_count);
-    thr_multi_unlock(multi_locks,lock_counts[param]);
+    thr_multi_unlock(multi_locks+lock_counts[param],lock_counts[param]);
   }
 
   printf("Thread %s (%d) ended\n",my_thread_name(),param); fflush(stdout);
@@ -1753,7 +1870,9 @@ int main(int argc __attribute__((unused)
   {
     if ((error=pthread_cond_wait(&COND_thread_count,&LOCK_thread_count)))
       fprintf(stderr,"Got error: %d from pthread_cond_wait\n",error);
+    printf("Thread count %u\n", thread_count);
   }
+  printf("All threads are done\n");
   if ((error=pthread_mutex_unlock(&LOCK_thread_count)))
     fprintf(stderr,"Got error: %d from pthread_mutex_unlock\n",error);
   for (i=0 ; i < (int) array_elements(locks) ; i++)

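To summarize the new calling convention, here is a hedged sketch modeled on
the test_thread() changes above (assuming the mysys environment and an already
initialized THR_LOCK array; lock_example and EXAMPLE_LOCKS are made-up names
for illustration): the pointer array is doubled, the caller fills the first
half, thr_multi_lock() writes the sorted order into the second half and locks
in that order, so thr_multi_unlock() must be called on the second half.

#include <my_global.h>
#include <thr_lock.h>

#define EXAMPLE_LOCKS 4      /* made-up count for this sketch */

/* 'locks' is assumed to point at EXAMPLE_LOCKS THR_LOCKs initialized with
   thr_lock_init(); 'owner' set up as in test_thread() above. */
void lock_example(THR_LOCK *locks, THR_LOCK_OWNER *owner)
{
  THR_LOCK_DATA data[EXAMPLE_LOCKS];
  /* The array must now hold 2*count pointers, as in the patched test */
  THR_LOCK_DATA *multi_locks[2*EXAMPLE_LOCKS];
  uint i;

  for (i= 0; i < EXAMPLE_LOCKS; i++)
  {
    thr_lock_data_init(&locks[i], &data[i], NULL);
    data[i].type= TL_READ;
    multi_locks[i]= &data[i];                 /* first half: input locks */
  }

  if (thr_multi_lock(multi_locks, EXAMPLE_LOCKS, owner) == THR_LOCK_SUCCESS)
  {
    /* ... do work under the locks ... */

    /* second half now holds the locks in the order they were taken */
    thr_multi_unlock(multi_locks + EXAMPLE_LOCKS, EXAMPLE_LOCKS);
  }
}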
=== modified file 'sql/ha_partition.cc'
--- a/sql/ha_partition.cc	2009-10-09 07:56:07 +0000
+++ b/sql/ha_partition.cc	2009-10-13 14:59:43 +0000
@@ -2774,13 +2774,36 @@ THR_LOCK_DATA **ha_partition::store_lock
 					 enum thr_lock_type lock_type)
 {
   handler **file;
+  THR_LOCK_DATA **first= to;
+#ifndef DBUG_OFF
+  THR_LOCK_DATA **prev;
+#endif
   DBUG_ENTER("ha_partition::store_lock");
   file= m_file;
   do
   {
+#ifndef DBUG_OFF
     DBUG_PRINT("info", ("store lock %d iteration", (int) (file - m_file)));
+    prev= to;
+#endif
     to= (*file)->store_lock(thd, to, lock_type);
+    /*
+      Ensure that the engine stores exactly one lock; have_more_sorted_locks
+      may not work properly otherwise.
+    */
+#ifndef DBUG_OFF
+    DBUG_ASSERT(prev == (to - 1));
+#endif
   } while (*(++file));
+
+  /* Mark all partitions except the last with have_more_sorted_locks */
+  while (first < (to - 1))
+  {
+    /* May already be set by concurrent handler */
+    (*(first++))->have_more_sorted_locks= TRUE;
+  }
+  /* Last should never be set! */
+  DBUG_ASSERT(!(*first)->have_more_sorted_locks);
   DBUG_RETURN(to);
 }
 

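The DBUG_ASSERT(prev == (to - 1)) above relies on each underlying engine
registering exactly one THR_LOCK_DATA per store_lock() call. For reference,
this is the usual single-lock pattern, approximately as in the example storage
engine (storage/example/ha_example.cc); treat it as an illustration rather
than a guarantee for every engine.

THR_LOCK_DATA **ha_example::store_lock(THD *thd,
                                       THR_LOCK_DATA **to,
                                       enum thr_lock_type lock_type)
{
  /* Adjust the requested lock type if needed, then register exactly one
     THR_LOCK_DATA and advance 'to' by one, which is what makes
     prev == (to - 1) hold in ha_partition::store_lock(). */
  if (lock_type != TL_IGNORE && lock.type == TL_UNLOCK)
    lock.type= lock_type;
  *to++ = &lock;
  return to;
}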
=== modified file 'sql/lock.cc'
--- a/sql/lock.cc	2009-08-28 16:21:54 +0000
+++ b/sql/lock.cc	2009-10-13 14:59:43 +0000
@@ -269,12 +269,11 @@ MYSQL_LOCK *mysql_lock_tables(THD *thd, 
     thd_proc_info(thd, "Table lock");
     DBUG_PRINT("info", ("thd->proc_info %s", thd->proc_info));
     thd->locked=1;
-    /* Copy the lock data array. thr_multi_lock() reorders its contens. */
-    memcpy(sql_lock->locks + sql_lock->lock_count, sql_lock->locks,
-           sql_lock->lock_count * sizeof(*sql_lock->locks));
-    /* Lock on the copied half of the lock data array. */
-    rc= thr_lock_errno_to_mysql[(int) thr_multi_lock(sql_lock->locks +
-                                                     sql_lock->lock_count,
+    /*
+      thr_multi_lock reads the locks in the first lock_count elements,
+      and writes them ordered into the second lock_count elements. 
+    */
+    rc= thr_lock_errno_to_mysql[(int) thr_multi_lock(sql_lock->locks,
                                                      sql_lock->lock_count,
                                                      thd->lock_id)];
     if (rc > 1)                                 /* a timeout or a deadlock */
@@ -832,7 +831,7 @@ static MYSQL_LOCK *get_lock_data(THD *th
 
   /*
     Allocating twice the number of pointers for lock data for use in
-    thr_mulit_lock(). This function reorders the lock data, but cannot
+    thr_multi_lock(). This function reorders the lock data, but cannot
     update the table values. So the second part of the array is copied
     from the first part immediately before calling thr_multi_lock().
   */


Attachment: [text/bzr-bundle]