List:Commits« Previous MessageNext Message »
From:Kristofer Pettersson Date:September 1 2008 2:20pm
Subject:bzr commit into mysql-5.1 branch (kpettersson:2728) Bug#38804
View as plain text  
#At file:///home/thek/Development/cpp/mysqlbzr/mysql-5.1-bug38804/

 2728 Kristofer Pettersson	2008-09-01
      Bug#38804 Query deadlock causes all tables to be inaccessible.
      
      Backport 6.0 -> 5.1 of bug 33479: auto_increment failures in partitioning
      
      Several problems with auto_increment in partitioning
      (with MyISAM, InnoDB and Falcon. Locking issues, not handling
      multi-row INSERTs properly etc.)
        
      This is a full patch which I have updated according to Serg's and
      Guilhem's comments.
      Changed the auto_increment handling for partitioning:
      Added a ha_data variable in table_share for storage engine specific data
      such as auto_increment value handling in partitioning, also see WL 4305
      and using the ha_data->mutex to lock around read + update.
      
      The idea is this:
      Store the table's reserved auto_increment value in
      the TABLE_SHARE and use a mutex to lock it for reading, updating,
      and unlocking, all in one block. All partitions are accessed only
      when the value is not yet initialized.
      Also allow reservations of ranges, and if no one has done a reservation
      afterwards, lower the reservation to what was actually used after
      the statement is done (via release_auto_increment from WL 3146).
      The lock is kept from the first reservation if it is statement based
      replication and a multi-row INSERT statement where the number of
      candidate rows to insert is not known in advance (like INSERT SELECT,
      LOAD DATA, unlike INSERT VALUES (row1), (row2), ..., (rowN)).
      
      This should also lead to better concurrency (no need to have a mutex
      protection around write_row in all cases)
      and work with any local storage engine.
modified:
  sql/ha_partition.cc
  sql/ha_partition.h
  sql/handler.h
  sql/table.h

=== modified file 'sql/ha_partition.cc'
--- a/sql/ha_partition.cc	2008-08-20 15:29:14 +0000
+++ b/sql/ha_partition.cc	2008-09-01 14:20:12 +0000
@@ -160,7 +160,8 @@ const uint ha_partition::NO_CURRENT_PART
 
 ha_partition::ha_partition(handlerton *hton, TABLE_SHARE *share)
   :handler(hton, share), m_part_info(NULL), m_create_handler(FALSE),
-   m_is_sub_partitioned(0), is_clone(FALSE)
+   m_is_sub_partitioned(0), is_clone(FALSE), auto_increment_lock(FALSE),
+   auto_increment_safe_stmt_log_lock(FALSE)
 {
   DBUG_ENTER("ha_partition::ha_partition(table)");
   init_handler_variables();
@@ -182,7 +183,8 @@ ha_partition::ha_partition(handlerton *h
 ha_partition::ha_partition(handlerton *hton, partition_info *part_info)
   :handler(hton, NULL), m_part_info(part_info),
    m_create_handler(TRUE),
-   m_is_sub_partitioned(m_part_info->is_sub_partitioned()), is_clone(FALSE)
+   m_is_sub_partitioned(m_part_info->is_sub_partitioned()), is_clone(FALSE),
+   auto_increment_lock(FALSE), auto_increment_safe_stmt_log_lock(FALSE)
 {
   DBUG_ENTER("ha_partition::ha_partition(part_info)");
   init_handler_variables();
@@ -1248,7 +1250,7 @@ int ha_partition::prepare_new_partition(
     assumes that external_lock() is last call that may fail here.
     Otherwise see description for cleanup_new_partition().
   */
-  if ((error= file->ha_external_lock(current_thd, m_lock_type)))
+  if ((error= file->ha_external_lock(ha_thd(), m_lock_type)))
     goto error;
 
   DBUG_RETURN(0);
@@ -1336,8 +1338,8 @@ void ha_partition::cleanup_new_partition
 
 int ha_partition::change_partitions(HA_CREATE_INFO *create_info,
                                     const char *path,
-                                    ulonglong *copied,
-                                    ulonglong *deleted,
+                                    ulonglong * const copied,
+                                    ulonglong * const deleted,
                                     const uchar *pack_frm_data
                                     __attribute__((unused)),
                                     size_t pack_frm_len
@@ -1354,7 +1356,7 @@ int ha_partition::change_partitions(HA_C
   int error= 1;
   bool first;
   uint temp_partitions= m_part_info->temp_partitions.elements;
-  THD *thd= current_thd;
+  THD *thd= ha_thd();
   DBUG_ENTER("ha_partition::change_partitions");
 
   /*
@@ -1628,7 +1630,8 @@ int ha_partition::change_partitions(HA_C
     partitions.
 */
 
-int ha_partition::copy_partitions(ulonglong *copied, ulonglong *deleted)
+int ha_partition::copy_partitions(ulonglong * const copied,
+                                  ulonglong * const deleted)
 {
   uint reorg_part= 0;
   int result= 0;
@@ -1674,13 +1677,13 @@ int ha_partition::copy_partitions(ulongl
            table since it doesn't fit into any partition any longer due to
            changed partitioning ranges or list values.
         */
-        deleted++;
+        (*deleted)++;
       }
       else
       {
         THD *thd= ha_thd();
         /* Copy record to new handler */
-        copied++;
+        (*copied)++;
         tmp_disable_binlog(thd); /* Do not replicate the low-level changes. */
         result= m_new_file[new_part]->ha_write_row(m_rec0);
         reenable_binlog(thd);
@@ -1804,7 +1807,7 @@ uint ha_partition::del_ren_cre_table(con
   handler **file, **abort_file;
   DBUG_ENTER("del_ren_cre_table()");
 
-  if (get_from_handler_file(from, current_thd->mem_root))
+  if (get_from_handler_file(from, ha_thd()->mem_root))
     DBUG_RETURN(TRUE);
   DBUG_ASSERT(m_file_buffer);
   DBUG_PRINT("enter", ("from: (%s) to: (%s)", from, to));
@@ -1897,7 +1900,7 @@ partition_element *ha_partition::find_pa
   }
   DBUG_ASSERT(0);
   my_error(ER_OUT_OF_RESOURCES, MYF(0));
-  current_thd->fatal_error();                   // Abort
+  ha_thd()->fatal_error();                      // Abort
   return NULL;
 }
 
@@ -1931,7 +1934,7 @@ int ha_partition::set_up_table_before_cr
 {
   int error= 0;
   const char *partition_name;
-  THD *thd= current_thd;
+  THD *thd= ha_thd();
   DBUG_ENTER("set_up_table_before_create");
 
   if (!part_elem)
@@ -2327,7 +2330,7 @@ bool ha_partition::get_from_handler_file
   tot_partition_words= (m_tot_parts + 3) / 4;
   engine_array= (handlerton **) my_alloca(m_tot_parts * sizeof(handlerton*));
   for (i= 0; i < m_tot_parts; i++)
-    engine_array[i]= ha_resolve_by_legacy_type(current_thd,
+    engine_array[i]= ha_resolve_by_legacy_type(ha_thd(),
                                                (enum legacy_db_type)
                                                *(uchar *) ((file_buffer) + 12 + i));
   address_tot_name_len= file_buffer + 12 + 4 * tot_partition_words;
@@ -2398,8 +2401,10 @@ int ha_partition::open(const char *name,
   uint alloc_len;
   handler **file;
   char name_buff[FN_REFLEN];
+  bool is_not_tmp_table= (table_share->tmp_table == NO_TMP_TABLE);
   DBUG_ENTER("ha_partition::open");
 
+  DBUG_ASSERT(table->s == table_share);
   ref_length= 0;
   m_mode= mode;
   m_open_test_lock= test_if_locked;
@@ -2408,9 +2413,9 @@ int ha_partition::open(const char *name,
     DBUG_RETURN(1);
   m_start_key.length= 0;
   m_rec0= table->record[0];
-  m_rec_length= table->s->reclength;
+  m_rec_length= table_share->reclength;
   alloc_len= m_tot_parts * (m_rec_length + PARTITION_BYTES_IN_POS);
-  alloc_len+= table->s->max_key_length;
+  alloc_len+= table_share->max_key_length;
   if (!m_ordered_rec_buffer)
   {
     if (!(m_ordered_rec_buffer= (uchar*)my_malloc(alloc_len, MYF(MY_WME))))
@@ -2483,6 +2488,26 @@ int ha_partition::open(const char *name,
     goto err_handler;
 
   /*
+    Use table_share->ha_data to share auto_increment_value among all handlers
+    for the same table.
+  */
+  if (is_not_tmp_table)
+    pthread_mutex_lock(&table_share->mutex);
+  if (!table_share->ha_data)
+  {
+    HA_DATA_PARTITION *ha_data;
+    /* currently only needed for auto_increment */
+    table_share->ha_data= ha_data= (HA_DATA_PARTITION*)
+                                   alloc_root(&table_share->mem_root,
+                                              sizeof(HA_DATA_PARTITION));
+    if (!ha_data)
+      goto err_handler;
+    DBUG_PRINT("info", ("table_share->ha_data 0x%p", ha_data));
+    bzero(ha_data, sizeof(HA_DATA_PARTITION));
+  }
+  if (is_not_tmp_table)
+    pthread_mutex_unlock(&table_share->mutex);
+  /*
     Some handlers update statistics as part of the open call. This will in
     some cases corrupt the statistics of the partition handler and thus
     to ensure we have correct statistics we call info from open after
@@ -2539,6 +2564,7 @@ int ha_partition::close(void)
   handler **file;
   DBUG_ENTER("ha_partition::close");
 
+  DBUG_ASSERT(table->s == table_share);
   delete_queue(&m_queue);
   if (!is_clone)
     bitmap_free(&(m_part_info->used_partitions));
@@ -2607,6 +2633,7 @@ int ha_partition::external_lock(THD *thd
   handler **file;
   DBUG_ENTER("ha_partition::external_lock");
 
+  DBUG_ASSERT(!auto_increment_lock && !auto_increment_safe_stmt_log_lock);
   file= m_file;
   m_lock_type= lock_type;
 
@@ -2825,8 +2852,9 @@ int ha_partition::write_row(uchar * buf)
   uint32 part_id;
   int error;
   longlong func_value;
-  bool autoincrement_lock= FALSE;
+  bool have_auto_increment= table->next_number_field && buf == table->record[0];
   my_bitmap_map *old_map;
+  HA_DATA_PARTITION *ha_data= (HA_DATA_PARTITION*) table_share->ha_data;
   THD *thd= ha_thd();
   timestamp_auto_set_type orig_timestamp_type= table->timestamp_field_type;
 #ifdef NOT_NEEDED
@@ -2844,31 +2872,18 @@ int ha_partition::write_row(uchar * buf)
     If we have an auto_increment column and we are writing a changed row
     or a new row, then update the auto_increment value in the record.
   */
-  if (table->next_number_field && buf == table->record[0])
+  if (have_auto_increment)
   {
-    /*
-      Some engines (InnoDB for example) can change autoincrement
-      counter only after 'table->write_row' operation.
-      So if another thread gets inside the ha_partition::write_row
-      before it is complete, it gets same auto_increment value,
-      which means DUP_KEY error (bug #27405)
-      Here we separate the access using table_share->mutex, and
-      use autoincrement_lock variable to avoid unnecessary locks.
-      Probably not an ideal solution.
-    */
-    if (table_share->tmp_table == NO_TMP_TABLE)
+    if (!ha_data->auto_inc_initialized &&
+        !table->s->next_number_keypart)
     {
       /*
-        Bug#30878 crash when alter table from non partitioned table
-        to partitioned.
-        Checking if tmp table then there is no need to lock,
-        and the table_share->mutex may not be initialised.
+        If auto_increment in table_share is not initialized, start by
+        initializing it.
       */
-      autoincrement_lock= TRUE;
-      pthread_mutex_lock(&table_share->mutex);
+      info(HA_STATUS_AUTO);
     }
     error= update_auto_increment();
-
     /*
       If we have failed to set the auto-increment value for this row,
       it is highly likely that we will not be able to insert it into
@@ -2903,11 +2918,11 @@ int ha_partition::write_row(uchar * buf)
   DBUG_PRINT("info", ("Insert in partition %d", part_id));
   tmp_disable_binlog(thd); /* Do not replicate the low-level changes. */
   error= m_file[part_id]->ha_write_row(buf);
+  if (have_auto_increment && !table->s->next_number_keypart)
+    set_auto_increment_if_higher();
   reenable_binlog(thd);
 exit:
   table->timestamp_field_type= orig_timestamp_type;
-  if (autoincrement_lock)
-    pthread_mutex_unlock(&table_share->mutex);
   DBUG_RETURN(error);
 }
 
@@ -2969,17 +2984,21 @@ int ha_partition::update_row(const uchar
     goto exit;
   }
 
-  /*
-    TODO:
-      set_internal_auto_increment=
-        max(set_internal_auto_increment, new_data->auto_increment)
-  */
   m_last_part= new_part_id;
   if (new_part_id == old_part_id)
   {
     DBUG_PRINT("info", ("Update in partition %d", new_part_id));
     tmp_disable_binlog(thd); /* Do not replicate the low-level changes. */
     error= m_file[new_part_id]->ha_update_row(old_data, new_data);
+    /*
+      if updating an auto_increment column, update
+      table_share->ha_data->next_auto_inc_val if needed.
+      (not to be used if auto_increment on secondary field in a multi-
+      column index)
+    */
+    if (table->next_number_field && new_data == table->record[0] &&
+        !table->s->next_number_keypart)
+      set_auto_increment_if_higher();
     reenable_binlog(thd);
     goto exit;
   }
@@ -3084,8 +3103,17 @@ int ha_partition::delete_all_rows()
 {
   int error;
   handler **file;
+  THD *thd= ha_thd();
   DBUG_ENTER("ha_partition::delete_all_rows");
 
+  if (thd->lex->sql_command == SQLCOM_TRUNCATE)
+  {
+    HA_DATA_PARTITION *ha_data= (HA_DATA_PARTITION*) table_share->ha_data;
+    lock_auto_increment();
+    ha_data->next_auto_inc_val= 0;
+    ha_data->auto_inc_initialized= FALSE;
+    unlock_auto_increment();
+  }
   file= m_file;
   do
   {
@@ -4544,21 +4572,54 @@ int ha_partition::handle_ordered_prev(uc
 
 int ha_partition::info(uint flag)
 {
-  handler *file, **file_array;
-  DBUG_ENTER("ha_partition:info");
+  DBUG_ENTER("ha_partition::info");
 
   if (flag & HA_STATUS_AUTO)
   {
-    ulonglong auto_increment_value= 0;
+    bool auto_inc_is_first_in_idx= (table_share->next_number_keypart == 0);
+    HA_DATA_PARTITION *ha_data= (HA_DATA_PARTITION*) table_share->ha_data;
     DBUG_PRINT("info", ("HA_STATUS_AUTO"));
-    file_array= m_file;
-    do
+    if (!table->found_next_number_field)
+      stats.auto_increment_value= 0;
+    else if (ha_data->auto_inc_initialized)
+    {
+      lock_auto_increment();
+      stats.auto_increment_value= ha_data->next_auto_inc_val;
+      unlock_auto_increment();
+    }
+    else
     {
-      file= *file_array;
-      file->info(HA_STATUS_AUTO);
-      set_if_bigger(auto_increment_value, file->stats.auto_increment_value);
-    } while (*(++file_array));
-    stats.auto_increment_value= auto_increment_value;
+      lock_auto_increment();
+      /* to avoid two concurrent initializations, check again when locked */
+      if (ha_data->auto_inc_initialized)
+        stats.auto_increment_value= ha_data->next_auto_inc_val;
+      else
+      {
+        handler *file, **file_array;
+        ulonglong auto_increment_value= 0;
+        file_array= m_file;
+        DBUG_PRINT("info",
+                   ("checking all partitions for auto_increment_value"));
+        do
+        {
+          file= *file_array;
+          file->info(HA_STATUS_AUTO);
+          set_if_bigger(auto_increment_value,
+                        file->stats.auto_increment_value);
+        } while (*(++file_array));
+
+        DBUG_ASSERT(auto_increment_value);
+        stats.auto_increment_value= auto_increment_value;
+        if (auto_inc_is_first_in_idx)
+        {
+          set_if_bigger(ha_data->next_auto_inc_val, auto_increment_value);
+          ha_data->auto_inc_initialized= TRUE;
+          DBUG_PRINT("info", ("initializing next_auto_inc_val to %lu",
+                              (ulong) ha_data->next_auto_inc_val));
+        }
+      }
+      unlock_auto_increment();
+    }
   }
   if (flag & HA_STATUS_VARIABLE)
   {
@@ -4582,6 +4643,7 @@ int ha_partition::info(uint flag)
       check_time:        Time of last check (only applicable to MyISAM)
       We report last time of all underlying handlers
     */
+    handler *file, **file_array;
     stats.records= 0;
     stats.deleted= 0;
     stats.data_file_length= 0;
@@ -4663,6 +4725,7 @@ int ha_partition::info(uint flag)
       So we calculate these constants by using the variables on the first
       handler.
     */
+    handler *file;
 
     file= m_file[0];
     file->info(HA_STATUS_CONST);
@@ -4684,6 +4747,7 @@ int ha_partition::info(uint flag)
   }
   if (flag & HA_STATUS_TIME)
   {
+    handler *file, **file_array;
     DBUG_PRINT("info", ("info: HA_STATUS_TIME"));
     /*
       This flag is used to set the latest update time of the table.
@@ -5744,19 +5808,33 @@ int ha_partition::cmp_ref(const uchar *r
                 MODULE auto increment
 ****************************************************************************/
 
-void ha_partition::restore_auto_increment(ulonglong)
-{
-  DBUG_ENTER("ha_partition::restore_auto_increment");
 
-  DBUG_VOID_RETURN;
+int ha_partition::reset_auto_increment(ulonglong value)
+{
+  handler **file= m_file;
+  int res;
+  HA_DATA_PARTITION *ha_data= (HA_DATA_PARTITION*) table_share->ha_data;
+  DBUG_ENTER("ha_partition::reset_auto_increment");
+  lock_auto_increment();
+  ha_data->auto_inc_initialized= FALSE;
+  ha_data->next_auto_inc_val= 0;
+  do
+  {
+    if ((res= (*file)->ha_reset_auto_increment(value)) != 0)
+      break;
+  } while (*(++file));
+  unlock_auto_increment();
+  DBUG_RETURN(res);
 }
 
 
-/*
+/**
   This method is called by update_auto_increment which in turn is called
-  by the individual handlers as part of write_row. We will always let
-  the first handler keep track of the auto increment value for all
-  partitions.
+  by the individual handlers as part of write_row. We use the
+  table_share->ha_data->next_auto_inc_val, or search all
+  partitions for the highest auto_increment_value if not initialized or
+  if auto_increment field is a secondary part of a key, we must search
+  every partition when holding a mutex to be sure of correctness.
 */
 
 void ha_partition::get_auto_increment(ulonglong offset, ulonglong increment,
@@ -5764,59 +5842,87 @@ void ha_partition::get_auto_increment(ul
                                       ulonglong *first_value,
                                       ulonglong *nb_reserved_values)
 {
-  ulonglong first_value_part, last_value_part, nb_reserved_values_part,
-    last_value= ~ (ulonglong) 0;
-  handler **pos, **end;
-  bool retry= TRUE;
   DBUG_ENTER("ha_partition::get_auto_increment");
-
-again:
-  for (pos=m_file, end= m_file+ m_tot_parts; pos != end ; pos++)
+  DBUG_PRINT("info", ("offset: %lu inc: %lu desired_values: %lu "
+                      "first_value: %lu", (ulong) offset, (ulong) increment,
+                      (ulong) nb_desired_values, (ulong) *first_value));
+  DBUG_ASSERT(increment && nb_desired_values);
+  *first_value= 0;
+  if (table->s->next_number_keypart)
   {
-    first_value_part= *first_value;
-    (*pos)->get_auto_increment(offset, increment, nb_desired_values,
-                               &first_value_part, &nb_reserved_values_part);
-    if (first_value_part == ~(ulonglong)(0)) // error in one partition
-    {
-      *first_value= first_value_part;
-      sql_print_error("Partition failed to reserve auto_increment value");
-      DBUG_VOID_RETURN;
-    }
     /*
-      Partition has reserved an interval. Intersect it with the intervals
-      already reserved for the previous partitions.
+      next_number_keypart is != 0 if the auto_increment column is a secondary
+      column in the index (it is allowed in MyISAM)
     */
-    last_value_part= (nb_reserved_values_part == ULONGLONG_MAX) ?
-      ULONGLONG_MAX : (first_value_part + nb_reserved_values_part * increment);
-    set_if_bigger(*first_value, first_value_part);
-    set_if_smaller(last_value, last_value_part);
+    DBUG_PRINT("info", ("next_number_keypart != 0"));
+    ulonglong nb_reserved_values_part;
+    ulonglong first_value_part, max_first_value;
+    handler **file= m_file;
+    first_value_part= max_first_value= *first_value;
+    /* Must lock and find highest value among all partitions. */
+    lock_auto_increment();
+    do
+    {
+      /* Only nb_desired_values = 1 makes sense */
+      (*file)->get_auto_increment(offset, increment, 1,
+                                 &first_value_part, &nb_reserved_values_part);
+      if (first_value_part == ~(ulonglong)(0)) // error in one partition
+      {
+        *first_value= first_value_part;
+        sql_print_error("Partition failed to reserve auto_increment value");
+        unlock_auto_increment();
+        DBUG_VOID_RETURN;
+      }
+      DBUG_PRINT("info", ("first_value_part: %lu", (ulong) first_value_part));
+      set_if_bigger(max_first_value, first_value_part);
+    } while (*(++file));
+    *first_value= max_first_value;
+    *nb_reserved_values= 1;
+    unlock_auto_increment();
   }
-  if (last_value < *first_value) /* empty intersection, error */
+  else
   {
+    THD *thd= ha_thd();
+    HA_DATA_PARTITION *ha_data= (HA_DATA_PARTITION*) table_share->ha_data;
     /*
-      When we have an empty intersection, it means that one or more
-      partitions may have a significantly different autoinc next value.
-      We should not fail here - it just means that we should try to
-      find a new reservation making use of the current *first_value
-      wbich should now be compatible with all partitions.
+      This is initialized in the beginning of the first write_row call.
     */
-    if (retry)
-    {
-      retry= FALSE;
-      last_value= ~ (ulonglong) 0;
-      release_auto_increment();
-      goto again;
-    }
+    DBUG_ASSERT(ha_data->auto_inc_initialized);
+    /*
+      Get a lock for handling the auto_increment in table_share->ha_data
+      for avoiding two concurrent statements getting the same number.
+    */ 
+
+    lock_auto_increment();
+
     /*
-      We should not get here.
+      In a multi-row insert statement like INSERT SELECT and LOAD DATA
+      where the number of candidate rows to insert is not known in advance
+      we must hold a lock/mutex for the whole statement if we have statement
+      based replication. Because the statement-based binary log contains
+      only the first generated value used by the statement, and slaves assumes
+      all other generated values used by this statement were consecutive to
+      this first one, we must exclusively lock the generator until the statement
+      is done.
     */
-    sql_print_error("Failed to calculate auto_increment value for partition");
-    
-    *first_value= ~(ulonglong)(0);
+    if (!auto_increment_safe_stmt_log_lock &&
+        thd->lex->sql_command != SQLCOM_INSERT &&
+        mysql_bin_log.is_open() &&
+        !thd->current_stmt_binlog_row_based &&
+        (thd->options & OPTION_BIN_LOG))
+    {
+      DBUG_PRINT("info", ("locking auto_increment_safe_stmt_log_lock"));
+      auto_increment_safe_stmt_log_lock= TRUE;
+    }
+
+    /* this gets corrected (for offset/increment) in update_auto_increment */
+    *first_value= ha_data->next_auto_inc_val;
+    ha_data->next_auto_inc_val+= nb_desired_values * increment;
+
+    unlock_auto_increment();
+    DBUG_PRINT("info", ("*first_value: %lu", (ulong) *first_value));
+    *nb_reserved_values= nb_desired_values;
   }
-  if (increment)                                // If not check for values
-    *nb_reserved_values= (last_value == ULONGLONG_MAX) ?
-      ULONGLONG_MAX : ((last_value - *first_value) / increment);
   DBUG_VOID_RETURN;
 }
 
@@ -5824,9 +5930,31 @@ void ha_partition::release_auto_incremen
 {
   DBUG_ENTER("ha_partition::release_auto_increment");
 
-  for (uint i= 0; i < m_tot_parts; i++)
+  if (table->s->next_number_keypart)
+  {
+    for (uint i= 0; i < m_tot_parts; i++)
+      m_file[i]->ha_release_auto_increment();
+  }
+  else if (next_insert_id)
   {
-    m_file[i]->ha_release_auto_increment();
+    HA_DATA_PARTITION *ha_data= (HA_DATA_PARTITION*) table_share->ha_data;
+    ulonglong next_auto_inc_val;
+    lock_auto_increment();
+    next_auto_inc_val= ha_data->next_auto_inc_val;
+    if (next_insert_id < next_auto_inc_val &&
+        auto_inc_interval_for_cur_row.maximum() >= next_auto_inc_val)
+      ha_data->next_auto_inc_val= next_insert_id;
+    DBUG_PRINT("info", ("ha_data->next_auto_inc_val: %lu",
+                        (ulong) ha_data->next_auto_inc_val));
+
+    /* Unlock the multi row statement lock taken in get_auto_increment */
+    if (auto_increment_safe_stmt_log_lock)
+    {
+      auto_increment_safe_stmt_log_lock= FALSE;
+      DBUG_PRINT("info", ("unlocking auto_increment_safe_stmt_log_lock"));
+    }
+
+    unlock_auto_increment();
   }
   DBUG_VOID_RETURN;
 }

=== modified file 'sql/ha_partition.h'
--- a/sql/ha_partition.h	2008-08-11 18:06:08 +0000
+++ b/sql/ha_partition.h	2008-09-01 14:20:12 +0000
@@ -37,6 +37,15 @@ typedef struct st_partition_share
 } PARTITION_SHARE;
 #endif
 
+/**
+  Partition specific ha_data struct.
+  @todo: move all partition specific data from TABLE_SHARE here.
+*/
+typedef struct st_ha_data_partition
+{
+  ulonglong next_auto_inc_val;                 /**< first non reserved value */
+  bool auto_inc_initialized;
+} HA_DATA_PARTITION;
 
 #define PARTITION_BYTES_IN_POS 2
 class ha_partition :public handler
@@ -141,6 +150,12 @@ private:
     "own" the m_part_info structure.
   */
   bool is_clone;
+  bool auto_increment_lock;             /**< lock reading/updating auto_inc */
+  /**
+    Flag to keep the auto_increment lock through out the statement.
+    This to ensure it will work with statement based replication.
+  */
+  bool auto_increment_safe_stmt_log_lock;
 public:
   handler *clone(MEM_ROOT *mem_root);
   virtual void set_part_info(partition_info *part_info)
@@ -197,8 +212,8 @@ public:
   virtual char *update_table_comment(const char *comment);
   virtual int change_partitions(HA_CREATE_INFO *create_info,
                                 const char *path,
-                                ulonglong *copied,
-                                ulonglong *deleted,
+                                ulonglong * const copied,
+                                ulonglong * const deleted,
                                 const uchar *pack_frm_data,
                                 size_t pack_frm_len);
   virtual int drop_partitions(const char *path);
@@ -212,7 +227,7 @@ public:
   virtual void change_table_ptr(TABLE *table_arg, TABLE_SHARE *share);
 private:
   int prepare_for_rename();
-  int copy_partitions(ulonglong *copied, ulonglong *deleted);
+  int copy_partitions(ulonglong * const copied, ulonglong * const deleted);
   void cleanup_new_partition(uint part_count);
   int prepare_new_partition(TABLE *table, HA_CREATE_INFO *create_info,
                             handler *file, const char *part_name,
@@ -829,12 +844,52 @@ public:
     auto_increment_column_changed
      -------------------------------------------------------------------------
   */
-  virtual void restore_auto_increment(ulonglong prev_insert_id);
   virtual void get_auto_increment(ulonglong offset, ulonglong increment,
                                   ulonglong nb_desired_values,
                                   ulonglong *first_value,
                                   ulonglong *nb_reserved_values);
   virtual void release_auto_increment();
+private:
+  virtual int reset_auto_increment(ulonglong value);
+  virtual void lock_auto_increment()
+  {
+    /* lock already taken */
+    if (auto_increment_safe_stmt_log_lock)
+      return;
+    DBUG_ASSERT(table_share->ha_data && !auto_increment_lock);
+    if(table_share->tmp_table == NO_TMP_TABLE)
+    {
+      auto_increment_lock= TRUE;
+      pthread_mutex_lock(&table_share->mutex);
+    }
+  }
+  virtual void unlock_auto_increment()
+  {
+    DBUG_ASSERT(table_share->ha_data);
+    /*
+      If auto_increment_safe_stmt_log_lock is true, we have to keep the lock.
+      It will be set to false and thus unlocked at the end of the statement by
+      ha_partition::release_auto_increment.
+    */
+    if(auto_increment_lock && !auto_increment_safe_stmt_log_lock)
+    {
+      pthread_mutex_unlock(&table_share->mutex);
+      auto_increment_lock= FALSE;
+    }
+  }
+  virtual void set_auto_increment_if_higher()
+  {
+    ulonglong nr= table->next_number_field->val_int();
+    HA_DATA_PARTITION *ha_data= (HA_DATA_PARTITION*) table_share->ha_data;
+    lock_auto_increment();
+    /* must check when the mutex is taken */
+    if (nr >= ha_data->next_auto_inc_val)
+      ha_data->next_auto_inc_val= nr + 1;
+    ha_data->auto_inc_initialized= TRUE;
+    unlock_auto_increment();
+  }
+
+public:
 
   /*
      -------------------------------------------------------------------------

=== modified file 'sql/handler.h'
--- a/sql/handler.h	2008-08-11 18:02:03 +0000
+++ b/sql/handler.h	2008-09-01 14:20:12 +0000
@@ -1241,8 +1241,8 @@ public:
 
   int ha_change_partitions(HA_CREATE_INFO *create_info,
                            const char *path,
-                           ulonglong *copied,
-                           ulonglong *deleted,
+                           ulonglong * const copied,
+                           ulonglong * const deleted,
                            const uchar *pack_frm_data,
                            size_t pack_frm_len);
   int ha_drop_partitions(const char *path);
@@ -1859,7 +1859,8 @@ private:
     This is called to delete all rows in a table
     If the handler don't support this, then this function will
     return HA_ERR_WRONG_COMMAND and MySQL will delete the rows one
-    by one.
+    by one. It should reset auto_increment if
+    thd->lex->sql_command == SQLCOM_TRUNCATE.
   */
   virtual int delete_all_rows()
   { return (my_errno=HA_ERR_WRONG_COMMAND); }
@@ -1898,8 +1899,8 @@ private:
 
   virtual int change_partitions(HA_CREATE_INFO *create_info,
                                 const char *path,
-                                ulonglong *copied,
-                                ulonglong *deleted,
+                                ulonglong * const copied,
+                                ulonglong * const deleted,
                                 const uchar *pack_frm_data,
                                 size_t pack_frm_len)
   { return HA_ERR_WRONG_COMMAND; }

=== modified file 'sql/table.h'
--- a/sql/table.h	2008-07-24 20:38:44 +0000
+++ b/sql/table.h	2008-09-01 14:20:12 +0000
@@ -360,6 +360,7 @@ typedef struct st_table_share
   int cached_row_logging_check;
 
 #ifdef WITH_PARTITION_STORAGE_ENGINE
+  /** @todo: Move into *ha_data for partitioning */
   bool auto_partitioned;
   const char *partition_info;
   uint  partition_info_len;
@@ -369,6 +370,9 @@ typedef struct st_table_share
   handlerton *default_part_db_type;
 #endif
 
+  /** place to store storage engine specific data */
+  void *ha_data;
+
 
   /*
     Set share's table cache key and update its db and table name appropriately.

Thread
bzr commit into mysql-5.1 branch (kpettersson:2728) Bug#38804Kristofer Pettersson1 Sep