Hi!
On Feb 21, mattiasj@stripped wrote:
> ChangeSet@stripped, 2008-02-21 10:09:43+01:00, mattiasj@witty. +3 -0
> Bug#33479: auto_increment failures in partitioning
>
> Several problems with auto_increment in partitioning
> (with MyISAM, InnoDB and Falcon)
>
> Code only patch (the test cases will be in a separate patches
> 5.1+ (InnoDB+MyISAM) and 6.0 (Falcon)
>
> Changed the auto_increment handling for partitioning:
> Added a ha_data variable in table_share for storage engine specific data such as
> auto_increment value handling in partitioning
> and using the ha_data->mutex to lock around read + update.
>
> The idea is this:
> Store the table's reserved auto_increment value in
> the TABLE_SHARE and use a mutex to lock it for reading and updateing it,
> only accessing all partitions when it is not initialized.
> Also allow reservations of ranges, and if no one has done a reservation
> afterwards, lower the reservation to what was actually used after
> the statement is done (via release_auto_increment from WL#3146).
>
> This should be quite fast and work with any local storage engine.
Generally, I like it, it's certainly on the right track.
Thanks for adding TABLE_SHARE::ha_data.
Still I have some comments and questions, see below:
> diff -Nrup a/sql/ha_partition.cc b/sql/ha_partition.cc
> --- a/sql/ha_partition.cc 2007-12-20 19:16:51 +01:00
> +++ b/sql/ha_partition.cc 2008-02-21 10:09:42 +01:00
> @@ -4412,20 +4443,46 @@ int ha_partition::handle_ordered_prev(uc
> int ha_partition::info(uint flag)
> {
> handler *file, **file_array;
> + HA_DATA_PARTITION *ha_data= (HA_DATA_PARTITION*) table_share->ha_data;
> DBUG_ENTER("ha_partition:info");
>
> if (flag & HA_STATUS_AUTO)
> {
> + bool auto_increment_lock= FALSE;
> DBUG_PRINT("info", ("HA_STATUS_AUTO"));
> + if (ha_data->auto_inc_initialized &&
> !table_share->next_number_keypart)
> {
> + DBUG_PRINT("info", ("Using ha_data->next_auto_inc_val: %llu",
> + ha_data->next_auto_inc_val));
> +
> + lock_auto_increment(&auto_increment_lock);
> + stats.auto_increment_value= ha_data->next_auto_inc_val;
> + unlock_auto_increment(&auto_increment_lock);
> + }
> + else
> + {
> + ulonglong auto_increment_value= 0;
> + file_array= m_file;
> + DBUG_PRINT("info", ("checking all partitions for auto_increment_value"));
> + do
> + {
> + file= *file_array;
> + file->info(HA_STATUS_AUTO);
> + set_if_bigger(auto_increment_value, file->stats.auto_increment_value);
> + DBUG_PRINT("info", ("file->stats.auto_increment_value: %llu",
> + file->stats.auto_increment_value));
> + } while (*(++file_array));
> + stats.auto_increment_value= auto_increment_value;
> + if (!table_share->next_number_keypart)
> + {
> + lock_auto_increment(&auto_increment_lock);
> + set_if_bigger(ha_data->next_auto_inc_val, auto_increment_value);
why set_if_bigger here ?
> + ha_data->auto_inc_initialized= TRUE;
> + unlock_auto_increment(&auto_increment_lock);
> + }
> + }
> + DBUG_PRINT("info", ("stats.auto_increment_value: %llu",
> + stats.auto_increment_value));
> }
> if (flag & HA_STATUS_VARIABLE)
> {
> @@ -5579,19 +5636,29 @@ int ha_partition::cmp_ref(const uchar *r
> MODULE auto increment
> ****************************************************************************/
>
> -void ha_partition::restore_auto_increment(ulonglong)
> -{
>
> +int ha_partition::ha_reset_auto_increment(ulonglong value)
Never. You can only override reset_auto_increment(), and never ha_xxx
methods. By the way, ha_reset_auto_increment() (and all other ha_xxx
methods) are not even virtual.
Something must be wrong with your test, if you didn't notice that you
"override" a non-virtual method, didn't notice that your
ha_reset_auto_increment() is never called.
> +{
> + handler **pos, **end;
> + int res;
> + HA_DATA_PARTITION *ha_data= (HA_DATA_PARTITION*) table_share->ha_data;
> + DBUG_ENTER("ha_partition::ha_reset_auto_increment");
> + DBUG_PRINT("info", ("value: %llu", value));
> + for (pos=m_file, end= m_file+ m_tot_parts; pos != end ; pos++)
> + if ((res= (*pos)->ha_reset_auto_increment(value)) != 0)
> + DBUG_RETURN(res);
> + ha_data->auto_inc_initialized= FALSE;
> + DBUG_RETURN(0);
> }
>
>
> /*
> This method is called by update_auto_increment which in turn is called
> + by the individual handlers as part of write_row. We use the
> + table_share->ha_data->next_auto_inc_val, or search all
> + partitions for the highest auto_increment_value if not initialized or
> + if auto_increment field is a secondary part of a key, we must search
> + every partition with a mutex to be sure of correctness.
> */
>
> void ha_partition::get_auto_increment(ulonglong offset, ulonglong increment,
> ulonglong *nb_desired_values,
> ulonglong *first_value,
> ulonglong *nb_reserved_values)
> {
> + HA_DATA_PARTITION *ha_data= (HA_DATA_PARTITION*) table_share->ha_data;
> DBUG_ENTER("ha_partition::get_auto_increment");
> -
> -again:
> + DBUG_PRINT("info", ("offset: %llu inc: %llu desired_values: %llu "
> + "first_value: %llu", offset, increment,
> + nb_desired_values, *first_value));
> + DBUG_ASSERT(increment && nb_desired_values);
> + *first_value= 0;
> + if (table->s->next_number_keypart)
> {
> /*
> + next_number_keypart is != 0 if the auto_increment column is a secondary
> + column in the index (it is allowed in MyISAM)
> */
> + DBUG_PRINT("info", ("next_number_keypart != 0"));
> + ulonglong first_value_part, last_value_part, nb_reserved_values_part,
> + last_value= ~ (ulonglong) 0;
> + handler **pos, **end;
> + bool auto_increment_lock= FALSE;
> /*
> + Must lock and find highest value among all partitions
> */
> + lock_auto_increment(&auto_increment_lock);
> + for (pos=m_file, end= m_file+ m_tot_parts; pos != end ; pos++)
> {
> + first_value_part= 0;
> + (*pos)->get_auto_increment(offset, increment, nb_desired_values,
> + &first_value_part,
> &nb_reserved_values_part);
> + if (first_value_part == ~(ulonglong)(0)) // error in one partition
> + {
> + *first_value= first_value_part;
> + sql_print_error("Partition failed to reserve auto_increment value");
> + unlock_auto_increment(&auto_increment_lock);
> + DBUG_VOID_RETURN;
> + }
> + DBUG_PRINT("info", ("partition loop, first_value_part: %llu",
> first_value_part));
> + /*
> + Partition has reserved an interval. Intersect it with the intervals
> + already reserved for the previous partitions.
> + */
> + last_value_part= (nb_reserved_values_part == ULONGLONG_MAX) ?
> + ULONGLONG_MAX : (first_value_part + nb_reserved_values_part * increment);
> + set_if_bigger(*first_value, first_value_part);
> + set_if_smaller(last_value, last_value_part);
> + }
> + DBUG_PRINT("info", ("*first_value: %llu last_value: %llu", *first_value,
> + last_value));
> + if (last_value < *first_value)
> + {
> + /*
> + Set last_value to *first_value. This is safe because of the mutex.
I don't understand the comment
> + */
> + last_value= *first_value;
> + }
> + if (increment) // If not check for values
> + {
> + *nb_reserved_values= (last_value == ULONGLONG_MAX) ?
> + ULONGLONG_MAX : ((last_value - *first_value) / increment);
> }
> + unlock_auto_increment(&auto_increment_lock);
> + }
> + else if (!ha_data->auto_inc_initialized)
> + {
> + ulonglong first_value_part;
> + handler **pos, **end;
> /*
> + Not initialized, but should be locked in ha_partition::write_row().
> + (Or is a temporary table).
> + Initialize to highest value among all partitions
> */
> + DBUG_PRINT("info", ("Initializing auto_increment in table_share"));
> + for (pos=m_file, end= m_file+ m_tot_parts; pos != end ; pos++)
> + {
> + first_value_part= 0;
> + /*
> + Use (*pos)->info() and (*pos)->stats.auto_increment_value instead of
> + (*pos)->get_auto_increment() since InnoDB does not handle it correctly
> + for partitioned tables.
> + */
> + (*pos)->info(HA_STATUS_AUTO);
> + first_value_part= (*pos)->stats.auto_increment_value;
> + if (first_value_part == ~(ulonglong)(0)) // error in one partition
> + {
> + *first_value= first_value_part;
> + sql_print_error("Partition failed to reserve auto_increment value");
> + DBUG_VOID_RETURN;
> + }
> + DBUG_PRINT("info", ("partition loop, first_value_part: %llu",
> first_value_part));
> + set_if_bigger(*first_value, first_value_part);
> + }
eh, why don't you just call ::info(HA_STATUS_AUTO) ?
> + if (*first_value == 0 &&
> + !(table->auto_increment_field_not_null &&
> + table->next_number_field->val_int() == 0 &&
> + current_thd->variables.sql_mode & MODE_NO_AUTO_VALUE_ON_ZERO))
> + {
> + /*
> + First auto_increment, set to 1
> + (if not explicit number 0 and not null and sql_mode set)
> + */
> + DBUG_PRINT("info", ("Setting *first_value to 1"));
> + *first_value= 1;
> + }
> + DBUG_PRINT("info", ("*first_value: %llu", *first_value));
> +
> + ha_data->next_auto_inc_val= *first_value + nb_desired_values * increment;
> + ha_data->auto_inc_initialized= TRUE;
> + *nb_reserved_values= nb_desired_values;
> }
> + else
> + {
> + /*
> + This is an optimized solution that not require
> + a mutex around write_row for auto_increment handling
> + Only for a picking the next auto_increment_value.
> +
> + Get a lock for handling the auto_increment in table_share->ha_data
> + for avoiding two concurrent statements getting the same
> + number.
> + */
> + bool auto_increment_lock= FALSE;
> +
> + DBUG_PRINT("info", ("reserving %llu auto_inc values", nb_desired_values));
> + lock_auto_increment(&auto_increment_lock);
> +
> + DBUG_PRINT("info", ("ha_data->next_auto_inc_val: %llu",
> + ha_data->next_auto_inc_val));
> + /* this gets corrected (for offset) in update_auto_increment */
> + *first_value= ha_data->next_auto_inc_val;
> + ha_data->next_auto_inc_val= *first_value + nb_desired_values * increment;
> +
> + unlock_auto_increment(&auto_increment_lock);
> + *nb_reserved_values= nb_desired_values;
> + }
> + DBUG_PRINT("info", ("first_value: %llu next_auto_inc_val: %llu "
> + "nb_reserved_values: %llu", *first_value,
> + ha_data->next_auto_inc_val,
> + *nb_reserved_values));
> +
> DBUG_VOID_RETURN;
> }
>
> diff -Nrup a/sql/ha_partition.h b/sql/ha_partition.h
> --- a/sql/ha_partition.h 2007-09-24 15:30:28 +02:00
> +++ b/sql/ha_partition.h 2008-02-21 10:09:42 +01:00
> @@ -828,12 +835,43 @@ public:
> auto_increment_column_changed
> -------------------------------------------------------------------------
> */
> virtual void get_auto_increment(ulonglong offset, ulonglong increment,
> ulonglong nb_desired_values,
> ulonglong *first_value,
> ulonglong *nb_reserved_values);
> virtual void release_auto_increment();
> + virtual int ha_reset_auto_increment(ulonglong value);
> +private:
> + inline virtual void lock_auto_increment(bool *is_locked)
> + {
> + if(!(*is_locked) && table_share->tmp_table == NO_TMP_TABLE)
> + {
> + HA_DATA_PARTITION *ha_data= (HA_DATA_PARTITION*) table_share->ha_data;
> + *is_locked= TRUE;
> + pthread_mutex_lock(&ha_data->mutex);
> + }
> + }
> + inline virtual void unlock_auto_increment(bool *is_locked)
> + {
> + if(*is_locked)
> + {
> + HA_DATA_PARTITION *ha_data= (HA_DATA_PARTITION*) table_share->ha_data;
> + pthread_mutex_unlock(&ha_data->mutex);
> + *is_locked= FALSE;
> + }
> + }
> + inline virtual void set_auto_increment_if_higher(bool *is_locked)
> + {
> + ulonglong nr= table->next_number_field->val_int();
> + HA_DATA_PARTITION *ha_data= (HA_DATA_PARTITION*) table_share->ha_data;
> + if (nr >= ha_data->next_auto_inc_val)
> + {
> + lock_auto_increment(is_locked);
as you check ha_data->next_auto_inc_val without a mutex, you need to
re-check it under a mutex.
> + ha_data->next_auto_inc_val= nr + 1;
> + }
> + }
> +
> +public:
>
Regards / Mit vielen Grüssen,
Sergei
--
__ ___ ___ ____ __
/ |/ /_ __/ __/ __ \/ / Sergei Golubchik <serg@stripped>
/ /|_/ / // /\ \/ /_/ / /__ Principal Software Developer/Server Architect
/_/ /_/\_, /___/\___\_\___/ MySQL GmbH, Dachauer Str. 37, D-80335 München
<___/ Geschäftsführer: Kaj Arnö - HRB
München 162140