List:Commits« Previous MessageNext Message »
From:Davi Arnaut Date:June 1 2010 3:50pm
Subject:Review for WL#4445
View as plain text  
Hi Mattias,

Some review comments for WL#4445 below.

> === added file 'mysql-test/suite/parts/inc/partition_crash.inc'
> --- mysql-test/suite/parts/inc/partition_crash.inc	1970-01-01 00:00:00 +0000
> +++ mysql-test/suite/parts/inc/partition_crash.inc	2010-05-30 13:48:32 +0000
> @@ -0,0 +1,44 @@
> +# Include file to decrease test code duplication
> +
> +--eval $create_statement
> +--eval $insert_statement
> +--echo # State before crash
> +--list_files $DATADIR/test
> +SHOW CREATE TABLE t1;
> +--sorted_result
> +SELECT * FROM t1;
> +--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
> +wait
> +EOF
> +--error 2013

2013 -> CR_SERVER_LOST

> +--eval $crash_statement
> +--echo # State after crash (before recovery)
> +--list_files_write_file $MYSQLTEST_VARDIR/tmp/ls_of_crash.txt $DATADIR/test
> +--let LS_FILE=$MYSQLTEST_VARDIR/tmp/ls_of_crash.txt
> +--let EXPECT_FILE=$MYSQLTEST_VARDIR/tmp/mysqld.1.expect
> +--perl
> +open(FH, "<", $ENV{'LS_FILE'}) or die "Failed to open ls file";
> +while (<FH>)
> +{
> +  s/sqlx.*\./sqlx-nnnn_nnnn./;
> +  print;
> +}
> +close(FH);
> +unlink $ENV{'LS_FILE'};
> +open(FH, ">", $ENV{'EXPECT_FILE'}) or die "Failed to open expect file";
> +print FH "restart";
> +close(FH);
> +EOF

Can't the above be implemented with list_files using a wild-card and 
using replace_regex to clean up the result?

> +#--remove_file $MYSQLTEST_VARDIR/tmp/ls_of_crash.txt
> +#--remove_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
> +#--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
> +#restart
> +#EOF

Remove the commented out stuff?

> +--enable_reconnect
> +--source include/wait_until_connected_again.inc
> +--echo # State after crash recovery
> +--list_files $DATADIR/test
> +SHOW CREATE TABLE t1;
> +--sorted_result
> +SELECT * FROM t1;
> +DROP TABLE t1;
>

[..]


> === modified file 'sql/ha_partition.cc'
> --- sql/ha_partition.cc	2010-05-28 20:41:55 +0000
> +++ sql/ha_partition.cc	2010-05-30 13:48:32 +0000
> @@ -149,7 +149,7 @@
>            HA_FAST_CHANGE_PARTITION);
>  }
>
> -const uint ha_partition::NO_CURRENT_PART_ID= 0xFFFFFFFF;
> +const uint ha_partition::NO_CURRENT_PART_ID= NOT_A_PARTITION_ID;

I see that NOT_A_PARTITION_ID is defined in handler.h and 
sql_partition.h, can this be cleaned up? Also, we have the type uint32 
and UINT_MAX32.

>  /*
>    Constructor method
>
> === modified file 'sql/lex.h'
> --- sql/lex.h	2010-05-23 16:36:34 +0000
> +++ sql/lex.h	2010-05-30 13:48:32 +0000
> @@ -207,6 +207,7 @@
>    { "EXISTS",		SYM(EXISTS)},
>    { "EXIT",             SYM(EXIT_SYM)},
>    { "EXPANSION",	SYM(EXPANSION_SYM)},
> +  { "EXCHANGE",         SYM(EXCHANGE_SYM)},
>    { "EXPLAIN",		SYM(DESCRIBE)},
>    { "EXTENDED",		SYM(EXTENDED_SYM)},
>    { "EXTENT_SIZE",	SYM(EXTENT_SIZE_SYM)},
>

[..]

>
> === modified file 'sql/sql_parse.cc'
> --- sql/sql_parse.cc	2010-05-28 20:41:55 +0000
> +++ sql/sql_parse.cc	2010-05-30 13:48:32 +0000
> @@ -2819,18 +2819,38 @@
>
>        /* Must be set in the parser */
>        DBUG_ASSERT(select_lex->db);
> -      if (check_access(thd, priv_needed, first_table->db,
> -                       &first_table->grant.privilege,
> -                       &first_table->grant.m_internal,
> -                       0, 0) ||
> -          check_access(thd, INSERT_ACL | CREATE_ACL, select_lex->db,
> -                       &priv,
> -                       NULL, /* Do not use first_table->grant with
> select_lex->db */
> -                       0, 0) ||
> -	  check_merge_table_access(thd, first_table->db,
> -				   (TABLE_LIST *)
> -				   create_info.merge_list.first))
> -	goto error;				/* purecov: inspected */
> +#ifdef WITH_PARTITION_STORAGE_ENGINE
> +      /* also check the table to be exchanged with the partition */
> +      if (alter_info.flags & ALTER_EXCHANGE_PARTITION)
> +      {
> +        priv_needed|= DROP_ACL | INSERT_ACL | CREATE_ACL;
> +        if (check_access(thd, priv_needed, first_table->db,
> +                         &first_table->grant.privilege,
> +                         &first_table->grant.m_internal,
> +                         0, 0) ||
> +            check_access(thd, priv_needed, first_table->next_local->db,
> +                         &first_table->next_local->grant.privilege,
> +                         &first_table->next_local->grant.m_internal,
> +                         0, 0))
> +          goto error;
> +      }
> +      else
> +#endif
> +      {
> +        if (check_access(thd, priv_needed, first_table->db,
> +                         &first_table->grant.privilege,
> +                         &first_table->grant.m_internal,
> +                         0, 0) ||
> +            check_access(thd, INSERT_ACL | CREATE_ACL, select_lex->db,
> +                         &priv,
> +                         NULL, /* Don't use first_tab->grant with sel_lex->db
> */
> +                         0, 0) ||
> +            check_merge_table_access(thd, first_table->db,
> +                                     (TABLE_LIST *)
> +                                     create_info.merge_list.first))
> +          goto error;				/* purecov: inspected */
> +      }
> +
>        if (check_grant(thd, priv_needed, all_tables, FALSE, UINT_MAX, FALSE))
>          goto error;
>        if (lex->name.str && !test_all_bits(priv,INSERT_ACL | CREATE_ACL))
> @@ -2857,13 +2877,19 @@
>        create_info.data_file_name= create_info.index_file_name= NULL;
>
>        thd->enable_slow_log= opt_log_slow_admin_statements;
> -      res= mysql_alter_table(thd, select_lex->db, lex->name.str,
> -                             &create_info,
> -                             first_table,
> -                             &alter_info,
> -                             select_lex->order_list.elements,
> -                             (ORDER *) select_lex->order_list.first,
> -                             lex->ignore);
> +#ifdef WITH_PARTITION_STORAGE_ENGINE
> +      if (alter_info.flags & ALTER_EXCHANGE_PARTITION)
> +        res= mysql_exchange_partition(thd, first_table, &alter_info,
> +                                      lex->ignore);
> +      else
> +#endif
> +        res= mysql_alter_table(thd, select_lex->db, lex->name.str,
> +                               &create_info,
> +                               first_table,
> +                               &alter_info,
> +                               select_lex->order_list.elements,
> +                               (ORDER *) select_lex->order_list.first,
> +                               lex->ignore);
>        break;
>      }

Hum, unfortunately alter table is a bit of a rat's nest, we need to 
invest some time into re-engineering it.. this ifdef approach is not 
viable long term.

One thing we can do is to take a more object oriented approach to the 
various alter table variations. This approach can be based on the model 
initially introduced for the SIGNAL statement.

Take a look in sql_yacc.yy and sql_parse.cc for SQLCOM_SIGNAL, and later 
at the Signal_statement class. Following a object structure similar to 
that used for Signal_statement, we could introduce a common class 
hierarchy for alter table and exchange could be a specialization.

There is no need to untangle everything in alter table, just separating 
exchange from alter table through a object hierarchy is a good first 
step. We can discuss this further on IRC.

>    case SQLCOM_RENAME_TABLE:
> @@ -2959,7 +2985,7 @@
>            contains any table-specific privilege.
>          */
>          DBUG_PRINT("debug", ("first_table->grant.privilege: %x",
> -                             first_table->grant.privilege));
> +                             (uint) first_table->grant.privilege));
>          if (check_some_access(thd, SHOW_CREATE_TABLE_ACLS, first_table) ||
>              (first_table->grant.privilege & SHOW_CREATE_TABLE_ACLS) == 0)
>          {
>
> === modified file 'sql/sql_partition.cc'
> --- sql/sql_partition.cc	2010-05-28 20:41:55 +0000
> +++ sql/sql_partition.cc	2010-05-30 13:48:32 +0000
> @@ -58,6 +58,7 @@
>  #include <m_ctype.h>
>  #include "my_md5.h"
>  #include "transaction.h"
> +#include "debug_sync.h"
>
>  #include "sql_base.h"                           // close_thread_tables
>  #include "sql_table.h"                  // build_table_filename,
> @@ -571,6 +572,7 @@
>            } while (++inx < num_fields);
>            if (inx == num_fields)
>            {
> +            /* TODO: Set to correct error message! */

Now is not a good time? :-)

>              mem_alloc_error(1);
>              result= TRUE;
>              continue;
> @@ -3890,6 +3892,88 @@
>    DBUG_VOID_RETURN;
>  }
>
> +
> +/**
> +  @brief Verify that all rows in a table is in the given partition
> +
> +  @param table      Table which contains the data that will be checked if
> +                    it is matching the partition definition.
> +  @param part_table Partitioned table containing the partition to check.
> +  @param part_id    Which partition to match with.

Please document the return value. Something along the lines of "@return 
TRUE if error, FALSE otherwise." should do.

> +*/
> +bool verify_data_with_partition(TABLE *table, TABLE *part_table,
> +                                uint32 part_id)
> +{
> +  uint32 found_part_id;
> +  longlong func_value;                     /* Unused */
> +  handler *file;
> +  int error;
> +  uchar *old_rec;
> +  partition_info *part_info;
> +  DBUG_ENTER("verify_data_with_partition");
> +  DBUG_ASSERT (table && table->file && part_table &&
> part_table->part_info &&
> +               part_table->file);

Remove space before (.

[..]

> +/**
> +  @brief Check if partition is exchangable with table by checking table options
> +
> +  @param table_create_info Table options from table.
> +  @param part_elem         All the info of the partition.
> +
> +  @retval FALSE if they are equal, otherwise TRUE.
> +
> +  @note Any differens that would cause a change in the frm file is prohibited.
> +  Such options as data_file_name, index_file_name, min_rows, max_rows etc. are
> +  not allowed to differ. But comment is allowed to differ.
> +*/
> +bool compare_partition_options(HA_CREATE_INFO *table_create_info,
> +                               partition_element *part_elem)
> +{
> +  bool error= FALSE;
> +  DBUG_ENTER("compare_partition_options");
> +  DBUG_ASSERT(!part_elem->tablespace_name &&
> +              !table_create_info->tablespace);
> +
> +  /*
> +    Note that there are not yet any engine supporting tablespace together
> +    with partitioning. TODO: when there are, add compare.
> +  */
> +  if (part_elem->tablespace_name || table_create_info->tablespace)
> +  {
> +    my_error(ER_PARTITION_EXCHANGE_DIFFERENT_OPTION, MYF(0),
> +             "TABLESPACE");
> +    error= TRUE;
> +  }
> +  if (part_elem->part_max_rows != table_create_info->max_rows)
> +  {
> +    my_error(ER_PARTITION_EXCHANGE_DIFFERENT_OPTION, MYF(0),
> +             "MAX_ROWS");
> +    error= TRUE;
> +  }
> +  if (part_elem->part_min_rows != table_create_info->min_rows)
> +  {
> +    my_error(ER_PARTITION_EXCHANGE_DIFFERENT_OPTION, MYF(0),
> +             "MIN_ROWS");
> +    error= TRUE;
> +  }
> +  if (part_elem->data_file_name || table_create_info->data_file_name)
> +  {
> +    my_error(ER_PARTITION_EXCHANGE_DIFFERENT_OPTION, MYF(0),
> +             "DATA DIRECTORY");
> +    error= TRUE;
> +  }
> +  if (part_elem->index_file_name || table_create_info->index_file_name)
> +  {
> +    my_error(ER_PARTITION_EXCHANGE_DIFFERENT_OPTION, MYF(0),
> +             "INDEX DIRECTORY");
> +    error= TRUE;
> +  }

You could get rid of error and use thd->is_error() or use:

const char *option= NULL;

if (..)
   option= "TABLESPACE";
else if (..)
   option= "MAX_ROWS";
else if (..)
   ..

if (option)
   my_error(ER_PARTITION_EXCHANGE_DIFFERENT_OPTION, .., option);

  ..

> +  DBUG_RETURN(error);
> +}
> +

[..]

>
> === modified file 'sql/sql_rename.cc'
> --- sql/sql_rename.cc	2010-05-23 16:36:34 +0000
> +++ sql/sql_rename.cc	2010-05-30 13:48:32 +0000
> @@ -36,8 +36,8 @@
>  static TABLE_LIST *reverse_table_list(TABLE_LIST *table_list);
>
>  /*
> -  Every second entry in the table_list is the original name and every
> -  second entry is the new name.
> +  Every two entries in the table_list form a pair of original name and
> +  the new name.
>  */
>
>  bool mysql_rename_tables(THD *thd, TABLE_LIST *table_list, bool silent)
>
> === modified file 'sql/sql_table.cc'
> --- sql/sql_table.cc	2010-05-28 20:41:55 +0000
> +++ sql/sql_table.cc	2010-05-30 13:48:32 +0000
> @@ -643,6 +643,7 @@
>
>     History:
>     First version written in 2006 by Mikael Ronstrom
> +   Updated 2010 by Mattias Jonsson, added EXCHANGE action + refactoring

Remove this history, we have authors.h for this kind of stuff. Feel free 
to add your name to it.

>  --------------------------------------------------------------------------
>  */
>
> @@ -684,14 +685,14 @@
>  #define DDL_LOG_NAME_LEN_POS 4
>  #define DDL_LOG_IO_SIZE_POS 8
>
> -/*
> -  Read one entry from ddl log file
> -  SYNOPSIS
> -    read_ddl_log_file_entry()
> -    entry_no                     Entry number to read
> -  RETURN VALUES
> -    TRUE                         Error
> -    FALSE                        Success
> +/**
> +  Read one entry from ddl log file.
> +
> +  @param entry_no                     Entry number to read
> +
> +  @return Operation status
> +    @retval TRUE                      Error
> +    @retval FALSE                     Success
>  */
>
>  static bool read_ddl_log_file_entry(uint entry_no)
> @@ -702,6 +703,7 @@
>    uint io_size= global_ddl_log.io_size;
>    DBUG_ENTER("read_ddl_log_file_entry");
>
> +  mysql_mutex_assert_owner(&LOCK_gdl);
>    if (mysql_file_pread(file_id, file_entry_buf, io_size, io_size * entry_no,
>                         MYF(MY_WME)) != io_size)
>      error= TRUE;
> @@ -709,14 +711,14 @@
>  }
>
>
> -/*
> -  Write one entry from ddl log file
> -  SYNOPSIS
> -    write_ddl_log_file_entry()
> -    entry_no                     Entry number to write
> -  RETURN VALUES
> -    TRUE                         Error
> -    FALSE                        Success
> +/**
> +  Write one entry from ddl log file.
> +
> +  @param entry_no                     Entry number to write
> +
> +  @return Operation status
> +    @retval TRUE                      Error
> +    @retval FALSE                     Success
>  */
>
>  static bool write_ddl_log_file_entry(uint entry_no)
> @@ -726,6 +728,7 @@
>    char *file_entry_buf= (char*)global_ddl_log.file_entry_buf;
>    DBUG_ENTER("write_ddl_log_file_entry");
>
> +  mysql_mutex_assert_owner(&LOCK_gdl);
>    if (mysql_file_pwrite(file_id, (uchar*)file_entry_buf,
>                          IO_SIZE, IO_SIZE * entry_no, MYF(MY_WME)) != IO_SIZE)
>      error= TRUE;
> @@ -733,13 +736,34 @@
>  }
>
>
> -/*
> -  Write ddl log header
> -  SYNOPSIS
> -    write_ddl_log_header()
> -  RETURN VALUES
> -    TRUE                      Error
> -    FALSE                     Success
> +/**
> +  Sync the ddl log file.
> +
> +  @return Operation status
> +    @retval FALSE  Success
> +    @retval TRUE   Error
> +*/
> +
> +static bool sync_ddl_log_file()
> +{
> +  bool error= FALSE;
> +  DBUG_ENTER("sync_ddl_log_file");
> +  if (mysql_file_sync(global_ddl_log.file_id, MYF(0)))
> +  {
> +    /* Write to error log */
> +    sql_print_error("Failed to sync ddl log file");

Wouldn't be better to use the flag MY_WME and let the error be 
propagated? It seems more useful as the error will include the errno and 
other relevant information.

> +    error= TRUE;
> +  }
> +  DBUG_RETURN(error);
> +}
> +
> +
> +/**
> +  Write ddl log header.
> +
> +  @return Operation status
> +    @retval TRUE                      Error
> +    @retval FALSE                     Success
>  */
>
>  static bool write_ddl_log_header()
> @@ -750,7 +774,7 @@
>
>    int4store(&global_ddl_log.file_entry_buf[DDL_LOG_NUM_ENTRY_POS],
>              global_ddl_log.num_entries);
> -  const_var= FN_LEN;
> +  const_var= FN_REFLEN;
>    int4store(&global_ddl_log.file_entry_buf[DDL_LOG_NAME_LEN_POS],
>              (ulong) const_var);
>    const_var= IO_SIZE;
> @@ -761,18 +785,14 @@
>      sql_print_error("Error writing ddl log header");
>      DBUG_RETURN(TRUE);
>    }
> -  (void) sync_ddl_log();
> +  (void) sync_ddl_log_file();

Shouldn't be ignored?

>    DBUG_RETURN(error);
>  }
>
>
> -/*
> -  Create ddl log file name
> -  SYNOPSIS
> -    create_ddl_log_file_name()
> -    file_name                   Filename setup
> -  RETURN VALUES
> -    NONE
> +/**
> +  Create ddl log file name.
> +  @param file_name                   Filename setup
>  */
>
>  static inline void create_ddl_log_file_name(char *file_name)
> @@ -781,17 +801,14 @@
>  }
>
>
> -/*
> -  Read header of ddl log file
> -  SYNOPSIS
> -    read_ddl_log_header()
> -  RETURN VALUES
> -    > 0                  Last entry in ddl log
> -    0                    No entries in ddl log
> -  DESCRIPTION
> -    When we read the ddl log header we get information about maximum sizes
> -    of names in the ddl log and we also get information about the number
> -    of entries in the ddl log.
> +/**
> +  Read header of ddl log file.
> +
> +  When we read the ddl log header we get information about maximum sizes
> +  of names in the ddl log and we also get information about the number
> +  of entries in the ddl log.
> +
> +  @return Last entry in ddl log (0 if no entries)
>  */
>
>  static uint read_ddl_log_header()
> @@ -802,6 +819,8 @@
>    bool successful_open= FALSE;
>    DBUG_ENTER("read_ddl_log_header");
>
> +  mysql_mutex_init(key_LOCK_gdl, &LOCK_gdl, MY_MUTEX_INIT_FAST);

As LOCK_gdl is mostly used for I/O, MY_MUTEX_INIT_FAST seems a bit overkill.

> +  mysql_mutex_lock(&LOCK_gdl);
>    create_ddl_log_file_name(file_name);
>    if ((global_ddl_log.file_id= mysql_file_open(key_file_global_ddl_log,
>                                                 file_name,
> @@ -830,36 +849,69 @@
>    global_ddl_log.first_free= NULL;
>    global_ddl_log.first_used= NULL;
>    global_ddl_log.num_entries= 0;
> -  mysql_mutex_init(key_LOCK_gdl, &LOCK_gdl, MY_MUTEX_INIT_FAST);
>    global_ddl_log.do_release= true;
> +  mysql_mutex_unlock(&LOCK_gdl);
>    DBUG_RETURN(entry_no);
>  }
>
>
> -/*
> -  Read a ddl log entry
> -  SYNOPSIS
> -    read_ddl_log_entry()
> -    read_entry               Number of entry to read
> -    out:entry_info           Information from entry
> -  RETURN VALUES
> -    TRUE                     Error
> -    FALSE                    Success
> -  DESCRIPTION
> -    Read a specified entry in the ddl log
> -*/
> -

[..]

> +/**
> +  Convert from file_entry_buf binary blob to ddl_log_entry struct.
> +
> +  @param[out] ddl_log_entry   struct to fill in.
> +*/
> +
> +static void set_ddl_log_entry_from_global(DDL_LOG_ENTRY *ddl_log_entry,
> +                                          const uint read_entry)
>  {
>    char *file_entry_buf= (char*)&global_ddl_log.file_entry_buf;

Highly unusual...

[..]

> +
> +
> +/**
> +  @brief Deactivate an individual entry.
> +
> +  @details For complex rename operations we need to deactivate individual
> +  entries.
> +
> +  During replace operations where we start with an existing table called
> +  t1 and a replacement table called t1#temp or something else and where
> +  we want to delete t1 and rename t1#temp to t1 this is not possible to
> +  do in a safe manner unless the ddl log is informed of the phases in
> +  the change.
> +
> +  Delete actions are 1-phase actions that can be ignored immediately after
> +  being executed.
> +  Rename actions from x to y is also a 1-phase action since there is no
> +  interaction with any other handlers named x and y.
> +  Replace action where drop y and x -> y happens needs to be a two-phase
> +  action. Thus the first phase will drop y and the second phase will
> +  rename x -> y.
> +
> +  @param entry_no     Entry position of record to change
> +
> +  @return Operation status
> +    @retval TRUE      Error
> +    @retval FALSE     Success
> +*/
> +
> +static bool deactivate_ddl_log_entry_no_lock(uint entry_no)
> +{
> +  char *file_entry_buf= (char*)global_ddl_log.file_entry_buf;
> +  DBUG_ENTER("deactivate_ddl_log_entry_no_lock");
> +
> +  mysql_mutex_assert_owner(&LOCK_gdl);
> +  if (!read_ddl_log_file_entry(entry_no))
> +  {
> +    if (file_entry_buf[DDL_LOG_ENTRY_TYPE_POS] == DDL_LOG_ENTRY_CODE)
> +    {
> +      if (file_entry_buf[DDL_LOG_ACTION_TYPE_POS] == DDL_LOG_DELETE_ACTION ||
> +          file_entry_buf[DDL_LOG_ACTION_TYPE_POS] == DDL_LOG_RENAME_ACTION ||
> +          (file_entry_buf[DDL_LOG_ACTION_TYPE_POS] == DDL_LOG_REPLACE_ACTION
> &&
> +           file_entry_buf[DDL_LOG_PHASE_POS] == 1) ||
> +          (file_entry_buf[DDL_LOG_ACTION_TYPE_POS] == DDL_LOG_EXCHANGE_ACTION
> &&
> +           file_entry_buf[DDL_LOG_PHASE_POS] > 1))
> +        file_entry_buf[DDL_LOG_ENTRY_TYPE_POS]= DDL_IGNORE_LOG_ENTRY_CODE;
> +      else if (file_entry_buf[DDL_LOG_ACTION_TYPE_POS] == DDL_LOG_REPLACE_ACTION)

I wonder what this stuff means :-)

> +      {
> +        DBUG_ASSERT(file_entry_buf[DDL_LOG_PHASE_POS] == 0);
> +        file_entry_buf[DDL_LOG_PHASE_POS]= 1;
> +      }
> +      else if (file_entry_buf[DDL_LOG_ACTION_TYPE_POS] == DDL_LOG_EXCHANGE_ACTION)
> +      {
> +        DBUG_ASSERT(file_entry_buf[DDL_LOG_PHASE_POS] <= 1);
> +        file_entry_buf[DDL_LOG_PHASE_POS]++;
> +      }
> +      else
> +      {
> +        DBUG_ASSERT(0);
> +      }

[..]

> +    case DDL_LOG_EXCHANGE_ACTION:
> +    {
> +      /* We hold LOCK_gdl, so we can alter global_ddl_log.file_entry_buf */
> +      char *file_entry_buf= (char*)&global_ddl_log.file_entry_buf;
> +      /* not yet implemented for frm */
> +      DBUG_ASSERT(!frm_action);
> +      /*
> +        Using a case-switch here to revert all currently done phases,
> +        since it will fall through until the first phase is undone.
> +      */
> +      switch (ddl_log_entry->phase) {
> +        case 2:

Perhaps phase should be a enum?

> +          /* tmp_name -> from_name possibly done */
> +          (void) file->ha_rename_table(ddl_log_entry->from_name,
> +                                       ddl_log_entry->tmp_name);
> +          /* decrease the phase and sync */
> +          file_entry_buf[DDL_LOG_PHASE_POS]--;
> +          if (write_ddl_log_file_entry(ddl_log_entry->entry_pos))
> +            break;
> +          if (sync_ddl_log_no_lock())
> +            break;
> +          /* fall through */

[..]


> +/**
> +  Execute the ddl log at recovery of MySQL Server.
>  */
>
>  void execute_ddl_log_recovery()
> @@ -1477,6 +1658,7 @@
>    THD *thd;
>    DDL_LOG_ENTRY ddl_log_entry;
>    char file_name[FN_REFLEN];
> +  char recover_query_string[]= "INTERNAL DDL LOG RECOVER IN PROGRESS";
>    DBUG_ENTER("execute_ddl_log_recovery");
>
>    /*
> @@ -1496,18 +1678,23 @@
>    thd->thread_stack= (char*) &thd;
>    thd->store_globals();
>
> +  /* Needed for InnoDB, since it checks for DROP FOREIGN KEY */
> +  thd->query_string.str= recover_query_string;
> +  thd->query_string.length= strlen(recover_query_string);

Must use THD::set_query even if not in a concurrent environment.

> +  /* this also initialize LOCK_gdl */
>    num_entries= read_ddl_log_header();
> +  mysql_mutex_lock(&LOCK_gdl);

[..]

> +
> +/**
> +  @brief Exchange partition/table with ddl log.
> +
> +  @details How to handle a crash in the middle of the rename (break on error):
> +  1) register in ddl_log that we are going to exchange swap_table with part.
> +  2) do the first rename (swap_table -> tmp-name) and sync the ddl_log.
> +  3) do the second rename (part -> swap_table) and sync the ddl_log.
> +  4) do the last rename (tmp-name -> part).
> +  5) mark the entry done.
> +
> +  Recover by:
> +    5) is done, All completed. Nothing to recover.
> +    4) is done see 3). (No mark or sync in the ddl_log...)
> +    3) is done -> try rename part -> tmp-name (ignore failure) goto 2).
> +    2) is done -> try rename swap_table -> part (ignore failure) goto 1).
> +    1) is done -> try rename tmp-name -> swap_table (ignore failure).
> +    before 1) Nothing to recover...
> +
> +  @param thd        Thread handle
> +  @param name       name of table/partition 1 (to be exchanged with 2)
> +  @param from_name  name of table/partition 2 (to be exchanged with 1)
> +  @param tmp_name   temporary name to use while exchaning
> +  @param ht         handlerton of the table/partitions
> +
> +  @return Operation status
> +    @retval TRUE    Error
> +    @retval FALSE   Success
> +
> +  @note ha_heap always succeeds in rename (since it is created upon usage).
> +  This is OK when to recover from a crash since all heap are empty and the
> +  recover is done early in the startup of the server (right before
> +  read_init_file which can populate the tables).
> +
> +  And if no crash we can trust the syncs in the ddl_log.
> +
> +  What about if the rename is put into a background thread? That will cause
> +  corruption and is avoided by the exlusive metadata lock.
> +*/
> +static bool exchange_name_with_ddl_log(THD *thd,
> +                                       const char *name,
> +                                       const char *from_name,
> +                                       const char *tmp_name,
> +                                       handlerton *ht)
> +{
> +  DDL_LOG_ENTRY exchange_entry;
> +  DDL_LOG_MEMORY_ENTRY *log_entry= NULL;
> +  DDL_LOG_MEMORY_ENTRY *exec_log_entry= NULL;
> +  bool error= TRUE;
> +  bool error_set= FALSE;
> +  handler *file= NULL;
> +  DBUG_ENTER("exchange_name_with_ddl_log");
> +
> +  if (!(file= get_new_handler((TABLE_SHARE*) 0, thd->mem_root, ht)))

Use NULL instead of 0 and then no need for cast.

[..]


> +
> +/**
> +  @brief Swap places between a partition and a table.
> +
> +  @details Verify that the tables are compatible (same engine, definition etc),
> +  if not IGNORE is given, verify that all rows in the table will fit in the
> +  partition, if all OK, rename table to tmp name, rename partition to table
> +  and finally rename tmp name to partition.
> +
> +  1) Take upgradable mdl, open tables and then lock them (inited in parse)
> +  2) Verify that metadata matches
> +  3) If not ignore, verify data
> +  4) Upgrade to exclusive mdl for both tables
> +  5) Rename table <-> partition
> +  6) Rely on close_thread_tables to release mdl and table locks
> +
> +  @param thd            Thread handle
> +  @param table_list     Table where the partition exists as first table,
> +                        Table to swap with the partition as second table
> +  @param alter_info     Contains partition name to swap
> +  @param ignore         flag to skip verification of partition values
> +
> +  @note This is a DDL operation so triggers will not be used.
> +*/
> +bool mysql_exchange_partition(THD *thd,
> +                              TABLE_LIST *table_list,
> +                              Alter_info *alter_info,
> +                              bool ignore)
> +{
> +  TABLE  *part_table, *swap_table;
> +  TABLE_LIST *swap_table_list;
> +  handlerton *table_hton;
> +  partition_element *part_elem;
> +  char *partition_name;
> +  char temp_name[FN_REFLEN+1];
> +  char part_file_name[FN_REFLEN+1];
> +  char swap_file_name[FN_REFLEN+1];
> +  char temp_file_name[FN_REFLEN+1];
> +  uint swap_part_id;
> +  uint part_file_name_len;
> +  Alter_table_prelocking_strategy alter_prelocking_strategy(alter_info);
> +  DBUG_ENTER("mysql_exchange_partition");
> +  DBUG_ASSERT(alter_info->flags & ALTER_EXCHANGE_PARTITION);
> +
> +  partition_name= alter_info->partition_names.head();
> +

[..]

> +
> +  /*
> +    Currently no MDL lock that allows both read and write and is upgradeable
> +    to exclusive, so leave the lock type to TL_WRITE_ALLOW_READ also on the
> +    partitioned table.
> +
> +    TODO: add MDL lock that allows both read and write and is upgradable to
> +    exclusive lock. This would allow to continue using the partitioned table
> +    also with update/insert/delete while the verification of the swap table
> +    is running.
> +  */
> +
> +  /*
> +    NOTE: It is not possible to exchange a crashed partition/table since
> +    we need som info from the engine, which we can only access after open,

som ->  some

> +    to be able to verify the structure/metadata.
> +  */
> +  if (open_and_lock_tables(thd, table_list, FALSE,
> +                           MYSQL_OPEN_TAKE_UPGRADABLE_MDL,
> +                           &alter_prelocking_strategy))
> +    goto err;
> +
> +  part_table= table_list->table;
> +  swap_table= swap_table_list->table;
> +  table_hton= swap_table->file->ht;

Do we need to check for stuff like merge tables, etc here?

> +  if (check_exchange_partition(swap_table, part_table))
> +    goto err;
> +
> +
> +  thd_proc_info(thd, "verifying table");
> +
> +  /* Will append the partition name later in part_info->get_part_elem() */
> +  part_file_name_len= build_table_filename(part_file_name,
> +                                           sizeof(part_file_name),
> +                                           table_list->db,
> +                                           table_list->table_name,
> +                                           "", 0);

Shoudl check the table_name as it might contain stuff like "../.."

> +  build_table_filename(swap_file_name,
> +                       sizeof(swap_file_name),
> +                       swap_table_list->db,
> +                       swap_table_list->table_name,
> +                       "", 0);
> +  /* create a unique temp name #sqlx-nnnn_nnnn, x for eXchange */
> +  my_snprintf(temp_name, sizeof(temp_name), "%sx-%lx_%lx",
> +              tmp_file_prefix, current_pid, thd->thread_id);
> +  if (lower_case_table_names)
> +    my_casedn_str(files_charset_info, temp_name);
> +  build_table_filename(temp_file_name, sizeof(temp_file_name),
> +                       table_list->next_local->db,
> +                       temp_name, "", FN_IS_TMP);
> +

[..]

> +
> +  /*
> +    Get exclusive mdl lock on both tables, alway the non partitioned table
> +    first.
> +  */
> +
> +  /*
> +    No need to set used_partitions to only propagate
> +    HA_EXTRA_PREPARE_FOR_RENAME to one part since no built in engine uses
> +    that flag. And the action would probably be to force close all other
> +    instances which is what we are doing any way.
> +  */
> +  if (wait_while_table_is_used(thd, swap_table, HA_EXTRA_PREPARE_FOR_RENAME) ||
> +      wait_while_table_is_used(thd, part_table, HA_EXTRA_PREPARE_FOR_RENAME))
> +    goto err;
> +
> +  DEBUG_SYNC(thd, "swap_partition_after_wait");
> +
> +  close_all_tables_for_name(thd, swap_table->s, FALSE);
> +  close_all_tables_for_name(thd, part_table->s, FALSE);
> +  DEBUG_SYNC(thd, "swap_partition_before_rename");
> +  if (exchange_name_with_ddl_log(thd, swap_file_name, part_file_name,
> +                                 temp_file_name, table_hton))
> +    goto err;
> +  /* Skip ha_binlog_log_query since this function is not supported by NDB */
> +
> +  /* Reopen tables under LOCK TABLES */
> +  if (thd->locked_tables_list.reopen_tables(thd))
> +    goto err;
> +
> +  if (write_bin_log(thd, TRUE, thd->query(), thd->query_length()))
> +  {
>

Shouldn't be ignored, the binlog code will set a error. The whole thing 
should be rolled back if possible. 0therwise, just let the error be 
returned to the user.

>
> +    /*
> +      Should we only report failure, even if the operation was successful,
> +      or should we also revert the operation? We continue and only report the
> +      failed bin log call.
> +    */
> +    sql_print_error("Failed to write to binlog in mysql_exchange_partition:"
> +                    " ALTER TABLE ... EXCHANGE PARTITION ... WITH TABLE ..."
> +                    " (Part 'file name' '%s' Exchange table 'file name' '%s')",
> +                    part_file_name, swap_file_name);

No need to print a error either, binlog code will generate a incident.

> +
> +  }

Under locked tables we must downgrade the lock. Take a look at 
mysql_create_or_drop_trigger for an example.

> +  my_ok(thd);
> +  table_list->table= NULL;			// For query cache
> +  table_list->next_local->table= NULL;
> +  query_cache_invalidate3(thd, table_list, 0);
> +  DBUG_RETURN(FALSE);
> +err:
> +  DBUG_RETURN(TRUE);
> +}
> +#endif
> +
> +

That's all for now.
Thread
Review for WL#4445Davi Arnaut1 Jun
  • Re: Review for WL#4445Mattias Jonsson3 Jun