MySQL Lists are EOL. Please join:

List:Commits« Previous MessageNext Message »
From:Mats Kindahl Date:February 16 2009 8:39pm
Subject:Re: bzr commit into mysql-6.0-rpl branch (alfranio.correia:2811)
Bug#40337
View as plain text  
Hi Alfranio,

I see no problems with the patch, so I approve it. However, have a look at the
comments inline below.

Just my few cents,
Mats Kindahl

Alfranio Correia wrote:
> #At
> file:///home/acorreia/workspace.sun/repository.mysql/bzrwork/bug-40337/mysql-6.0-rpl/
> based on revid:alfranio.correia@stripped
> 
>  2811 Alfranio Correia	2009-02-12
>       BUG#40337  Fsyncing master and relay log to disk after every event is too slow
>       
>       Post-fix.
> added:
>   mysql-test/suite/rpl/r/rpl_sync.result
>   mysql-test/suite/rpl/t/rpl_sync-slave.opt
>   mysql-test/suite/rpl/t/rpl_sync.test
> modified:
>   sql/rpl_mi.cc
>   sql/rpl_mi.h
>   sql/rpl_rli.cc
>   sql/rpl_rli.h
>   sql/slave.cc
>   sql/sql_binlog.cc
> 
> === added file 'mysql-test/suite/rpl/r/rpl_sync.result'
> --- a/mysql-test/suite/rpl/r/rpl_sync.result	1970-01-01 00:00:00 +0000
> +++ b/mysql-test/suite/rpl/r/rpl_sync.result	2009-02-12 09:13:57 +0000
> @@ -0,0 +1,75 @@
> +=====Configuring the enviroment=======;
> +stop slave;
> +drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
> +reset master;
> +reset slave;
> +drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
> +start slave;
> +CREATE TABLE t1(a INT, PRIMARY KEY(a)) engine=innodb;
> +insert into t1(a) values(1);
> +insert into t1(a) values(2);
> +insert into t1(a) values(3);
> +=====Inserting data on the master but without the SQL Thread being running=======;
> +stop slave SQL_THREAD;
> +insert into t1(a) values(4);
> +insert into t1(a) values(5);
> +insert into t1(a) values(6);
> +=====Removing relay log files and crashing/recoverying the slave=======;
> +stop slave IO_THREAD;
> +failure
> +SET SESSION debug="d,crash_before_rotate_relaylog";
> +FLUSH LOGS;
> +ERROR HY000: Lost connection to MySQL server during query
> +=====Dumping and comparing tables=======;
> +start slave;
> +select * from t1;
> +a
> +1
> +2
> +3
> +4
> +5
> +6
> +select * from t1;
> +a
> +1
> +2
> +3
> +4
> +5
> +6

Could you use diff_tables.inc instead? It works since the tables are supposed to
be equal, but otherwise, it doesn't work.

> +=====Corrupting the master.info=======;
> +stop slave;
> +FLUSH LOGS;
> +insert into t1(a) values(7);
> +insert into t1(a) values(8);
> +insert into t1(a) values(9);
> +SET SESSION debug="d,crash_before_rotate_relaylog";
> +FLUSH LOGS;
> +ERROR HY000: Lost connection to MySQL server during query
> +=====Dumping and comparing tables=======;
> +start slave;
> +select * from t1;
> +a
> +1
> +2
> +3
> +4
> +5
> +6
> +7
> +8
> +9
> +select * from t1;
> +a
> +1
> +2
> +3
> +4
> +5
> +6
> +7
> +8
> +9

Same here.

> +=====Clean up=======;
> +drop table t1;
> 
> === added file 'mysql-test/suite/rpl/t/rpl_sync-slave.opt'
> --- a/mysql-test/suite/rpl/t/rpl_sync-slave.opt	1970-01-01 00:00:00 +0000
> +++ b/mysql-test/suite/rpl/t/rpl_sync-slave.opt	2009-02-12 09:13:57 +0000
> @@ -0,0 +1 @@
> +--sync-relay-log-info=1 --relay-log-recovery=1
> 
> === added file 'mysql-test/suite/rpl/t/rpl_sync.test'
> --- a/mysql-test/suite/rpl/t/rpl_sync.test	1970-01-01 00:00:00 +0000
> +++ b/mysql-test/suite/rpl/t/rpl_sync.test	2009-02-12 09:13:57 +0000
> @@ -0,0 +1,143 @@
>
> +########################################################################################
> +# This test verifies the options --sync-relay-log-info and --relay-log-recovery by 
> +# crashing the slave in two different situations:
> +#  (case-1) - Corrupt the relay log with changes which were not processed by
> +#  the SQL Thread and crashes it.
> +#  (case-2) - Corrupt the master.info with wrong coordinates and crashes it.
> +#
> +#  Case 1:
> +#    1 - Stops the SQL Thread
> +#    2 - Inserts new records into the master.
> +#    3 - Corrupts the relay-log.bin* which most likely has such changes.
> +#    4 - Crashes the slave
> +#    5 - Verifies if the slave is sync with the master which means that the
> information
> +#    loss was circumvented by the recovery process.
> +#
> +#  Case 2:
> +#    1 - Stops the SQL/IO Threads
> +#    2 - Inserts new records into the master.
> +#    3 - Corrupts the master.info with wrong coordinates.
> +#    4 - Crashes the slave
> +#    5 - Verifies if the slave is sync with the master which means that the
> information
> +#    loss was circumvented by the recovery process.
>
> +########################################################################################
> +
>
> +########################################################################################
> +#                                Configuring the environment
>
> +########################################################################################
> +--echo =====Configuring the enviroment=======;
> +--source include/master-slave.inc
> +--source include/not_embedded.inc
> +--source include/not_valgrind.inc
> +
> +--source include/have_debug.inc
> +
> +--source include/have_innodb.inc
> +
> +CREATE TABLE t1(a INT, PRIMARY KEY(a)) engine=innodb;
> +
> +insert into t1(a) values(1);
> +insert into t1(a) values(2);
> +insert into t1(a) values(3);
> +
>
> +########################################################################################
> +#                             Case 1: Corrupt a relay-log.bin*
>
> +########################################################################################
> +--echo =====Inserting data on the master but without the SQL Thread being
> running=======;
> +save_master_pos;
> +connection slave;
> +sync_with_master;

These three can be replaced with "sync_slave_with_master;"

> +
> +connection slave;
> +let $MYSQLD_SLAVE_DATADIR= `select @@datadir`;
> +--replace_result $MYSQLD_SLAVE_DATADIR MYSQLD_SLAVE_DATADIR
> +--copy_file $MYSQLD_SLAVE_DATADIR/master.info $MYSQLD_SLAVE_DATADIR/master.backup
> +stop slave SQL_THREAD;
> +source include/wait_for_slave_sql_to_stop.inc;
> +
> +connection master;
> +insert into t1(a) values(4);
> +insert into t1(a) values(5);
> +insert into t1(a) values(6);
> +
> +--echo =====Removing relay log files and crashing/recoverying the slave=======;
> +connection slave;
> +stop slave IO_THREAD;
> +source include/wait_for_slave_io_to_stop.inc;
> +
> +let $file= query_get_value("SHOW SLAVE STATUS", Relay_Log_File, 1);
> +--replace_result $MYSQLD_SLAVE_DATADIR MYSQLD_SLAVE_DATADIR
> +--exec echo "failure" > $MYSQLD_SLAVE_DATADIR/$file
> +--exec cat  $MYSQLD_SLAVE_DATADIR/$file
> +
> +--exec echo "restart" > $MYSQLTEST_VARDIR/tmp/mysqld.2.expect
> +SET SESSION debug="d,crash_before_rotate_relaylog";
> +--error 2013
> +FLUSH LOGS;
> +
> +--enable_reconnect
> +--source include/wait_until_connected_again.inc
> +
> +--echo =====Dumping and comparing tables=======;
> +start slave;
> +source include/wait_for_slave_to_start.inc;
> +
> +connection master;
> +save_master_pos;
> +connection slave;
> +sync_with_master;

sync_slave_with_master;

> +
> +connection master;
> +select * from t1;
> +
> +connection slave;
> +select * from t1;
> +
>
> +########################################################################################
> +#                             Case 2: Corrupt a master.info
>
> +########################################################################################
> +--echo =====Corrupting the master.info=======;
> +connection slave;
> +stop slave;
> +source include/wait_for_slave_to_stop.inc;
> +
> +connection master;
> +FLUSH LOGS;
> +
> +insert into t1(a) values(7);
> +insert into t1(a) values(8);
> +insert into t1(a) values(9);
> +
> +connection slave;
> +--replace_result $MYSQLD_SLAVE_DATADIR MYSQLD_SLAVE_DATADIR
> +--exec cat $MYSQLD_SLAVE_DATADIR/master.backup >
> $MYSQLD_SLAVE_DATADIR/master.info
> +
> +--exec echo "restart" > $MYSQLTEST_VARDIR/tmp/mysqld.2.expect
> +SET SESSION debug="d,crash_before_rotate_relaylog";
> +--error 2013
> +FLUSH LOGS;
> +
> +--enable_reconnect
> +--source include/wait_until_connected_again.inc
> +
> +--echo =====Dumping and comparing tables=======;
> +start slave;
> +source include/wait_for_slave_to_start.inc;
> +
> +connection master;
> +save_master_pos;
> +connection slave;
> +sync_with_master;

sync_slave_with_master;

> +
> +connection master;
> +select * from t1;
> +
> +connection slave;
> +select * from t1;
> +
>
> +########################################################################################
> +#                                      Clean up
>
> +########################################################################################
> +--echo =====Clean up=======;
> +connection master;
> +drop table t1;
> 
> === modified file 'sql/rpl_mi.cc'
> --- a/sql/rpl_mi.cc	2009-02-04 16:11:20 +0000
> +++ b/sql/rpl_mi.cc	2009-02-12 09:13:57 +0000
> @@ -30,13 +30,14 @@ int init_strvar_from_file(char *var, int
>  int init_floatvar_from_file(float* var, IO_CACHE* f, float default_val);
>  int init_dynarray_intvar_from_file(DYNAMIC_ARRAY* arr, IO_CACHE* f);
>  
> -Master_info::Master_info()
> +Master_info::Master_info(bool is_slave_recovery)
>    :Slave_reporting_capability("I/O"),
>     ssl(0), ssl_verify_server_cert(0), fd(-1),  io_thd(0), port(MYSQL_PORT),
>     connect_retry(DEFAULT_CONNECT_RETRY), heartbeat_period(0),
>     received_heartbeats(0), inited(0), master_id(0),
>     abort_slave(0), slave_running(0), slave_run_id(0),
> -   sync_counter(0)
> +   sync_counter(0),
> +   rli(is_slave_recovery) 
>  {
>    host[0] = 0; user[0] = 0; password[0] = 0;
>    ssl_ca[0]= 0; ssl_capath[0]= 0; ssl_cert[0]= 0;
> 
> === modified file 'sql/rpl_mi.h'
> --- a/sql/rpl_mi.h	2009-02-04 16:11:20 +0000
> +++ b/sql/rpl_mi.h	2009-02-12 09:13:57 +0000
> @@ -59,7 +59,7 @@
>  class Master_info : public Slave_reporting_capability
>  {
>   public:
> -  Master_info();
> +  Master_info(bool is_slave_recovery);
>    ~Master_info();
>    bool shall_ignore_server_id(ulong s_id);
>  
> 
> === modified file 'sql/rpl_rli.cc'
> --- a/sql/rpl_rli.cc	2009-02-04 16:11:20 +0000
> +++ b/sql/rpl_rli.cc	2009-02-12 09:13:57 +0000
> @@ -30,7 +30,7 @@ int init_strvar_from_file(char *var, int
>  			  const char *default_val);
>  
>  
> -Relay_log_info::Relay_log_info()
> +Relay_log_info::Relay_log_info(bool is_slave_recovery)
>    :Slave_reporting_capability("SQL"),
>     no_storage(FALSE), replicate_same_server_id(::replicate_same_server_id),
>     info_fd(-1), cur_log_fd(-1), 
> @@ -50,6 +50,8 @@ Relay_log_info::Relay_log_info()
>  {
>    DBUG_ENTER("Relay_log_info::Relay_log_info");
>  
> +  is_relay_log_recovery= is_slave_recovery;
> +

Why don't you add this to the initializer list above?

>    group_relay_log_name[0]= event_relay_log_name[0]=
>      group_master_log_name[0]= 0;
>    until_log_name[0]= ign_master_log_name_end[0]= 0;
> @@ -265,14 +267,14 @@ Failed to open the existing relay log in
>    }
>  
>  #ifndef DBUG_OFF
> +  if (!(rli->is_relay_log_recovery))
>    {
>      char llbuf1[22], llbuf2[22];
>      DBUG_PRINT("info", ("my_b_tell(rli->cur_log)=%s
> rli->event_relay_log_pos=%s",
>                          llstr(my_b_tell(rli->cur_log),llbuf1),
>                          llstr(rli->event_relay_log_pos,llbuf2)));
>      DBUG_ASSERT(rli->event_relay_log_pos >= BIN_LOG_HEADER_SIZE);
> -    DBUG_ASSERT(rli->is_relay_log_recovery || 
> -                (my_b_tell(rli->cur_log) == rli->event_relay_log_pos));
> +    DBUG_ASSERT((my_b_tell(rli->cur_log) == rli->event_relay_log_pos));
>    }
>  #endif
>  
> 
> === modified file 'sql/rpl_rli.h'
> --- a/sql/rpl_rli.h	2009-02-04 16:11:20 +0000
> +++ b/sql/rpl_rli.h	2009-02-12 09:13:57 +0000
> @@ -273,7 +273,7 @@ public:
>    char ign_master_log_name_end[FN_REFLEN];
>    ulonglong ign_master_log_pos_end;
>  
> -  Relay_log_info();
> +  Relay_log_info(bool is_slave_recovery);
>    ~Relay_log_info();
>  
>    /*
> 
> === modified file 'sql/slave.cc'
> --- a/sql/slave.cc	2009-02-04 16:11:20 +0000
> +++ b/sql/slave.cc	2009-02-12 09:13:57 +0000
> @@ -220,6 +220,7 @@ void unlock_slave_threads(Master_info* m
>  int init_slave()
>  {
>    DBUG_ENTER("init_slave");
> +  int error= 0;
>  
>    /*
>      This is called when mysqld starts. Before client connections are
> @@ -233,9 +234,12 @@ int init_slave()
>    */
>  
>    if (pthread_key_create(&RPL_MASTER_INFO, NULL))
> +  {
> +    error= 1;
>      goto err;
> +  }
>  
> -  active_mi= new Master_info;
> +  active_mi= new Master_info(relay_log_recovery);

Hmmm... OK.

Caravaning through the memory allocator does not scale very well, but this is a
single thread and it is not very commonly executed.

>  
>    /*
>      If master_host is not specified, try to read it from the master_info file.
> @@ -245,6 +249,7 @@ int init_slave()
>    if (!active_mi)
>    {
>      sql_print_error("Failed to allocate memory for the master info structure");
> +    error= 1;
>      goto err;
>    }
>  
> @@ -252,12 +257,15 @@ int init_slave()
>                         1, (SLAVE_IO | SLAVE_SQL)))
>    {
>      sql_print_error("Failed to initialize the master info structure");
> +    error= 1;
>      goto err;
>    }
>  
> -  active_mi->rli.is_relay_log_recovery= relay_log_recovery;
>    if (active_mi->rli.is_relay_log_recovery && init_recovery(active_mi))
> +  {
> +    error= 1;
>      goto err;
> +  }
>  
>    /* If server id is not set, start_slave_thread() will say it */
>  
> @@ -271,43 +279,43 @@ int init_slave()
>                              SLAVE_IO | SLAVE_SQL))
>      {
>        sql_print_error("Failed to create slave threads");
> +      error= 1;
>        goto err;
>      }
>    }
> -  active_mi->rli.is_relay_log_recovery= 0;
> -  pthread_mutex_unlock(&LOCK_active_mi);
> -  DBUG_RETURN(0);
>  
>  err:
> -  active_mi->rli.is_relay_log_recovery= 0;
> +  active_mi->rli.is_relay_log_recovery= false;

We use FALSE instead of false.

>    pthread_mutex_unlock(&LOCK_active_mi);
> -  DBUG_RETURN(1);
> +  DBUG_RETURN(error);
>  }
>  
>  /*
> - * Updates the master info based on the information stored in the
> - * relay info and ignores relay logs previously retrieved by the IO 
> - * thread, which thus starts fetching again based on to the  
> - * group_master_log_pos and group_master_log_name. Eventually, the old
> - * relay logs will be purged by the normal purge mechanism.
> - * 
> - * In the feature, we can improve this routine in order to avoid throwing
> - * away logs that are safely stored in the disk.
> - * 
> - * There is no need for a mutex as the caller (i.e. init_slave) already 
> - * has one acquired.
> - * 
> - * Specifically, the following structures are updated:
> - * 
> - * 1 - mi->master_log_pos  <-- rli->group_master_log_pos
> - * 2 - mi->master_log_name <-- rli->group_master_log_name
> - * 3 - It moves the relay log to the new relay log file, by
> - *     rli->group_relay_log_pos  <-- BIN_LOG_HEADER_SIZE;
> - *     rli->event_relay_log_pos  <-- BIN_LOG_HEADER_SIZE;
> - *     rli->group_relay_log_name <-- rli->relay_log.get_log_fname();
> - *     rli->event_relay_log_name <-- rli->relay_log.get_log_fname();
> - * 
> - *  If there is an error, it returns (1), otherwise returns (0).
> +  Updates the master info based on the information stored in the
> +  relay info and ignores relay logs previously retrieved by the IO 
> +  thread, which thus starts fetching again based on to the  
> +  group_master_log_pos and group_master_log_name. Eventually, the old
> +  relay logs will be purged by the normal purge mechanism.
> +
> +  In the feature, we should improve this routine in order to avoid throwing
> +  away logs that are safely stored in the disk. Note also that this recovery 
> +  routine relies on the correctness of the relay-log.info and only tolerates 
> +  coordinate problems in master.info.
> +  
> +  In this function, there is no need for a mutex as the caller 
> +  (i.e. init_slave) already has one acquired.
> +  
> +  Specifically, the following structures are updated:
> + 
> +  1 - mi->master_log_pos  <-- rli->group_master_log_pos
> +  2 - mi->master_log_name <-- rli->group_master_log_name
> +  3 - It moves the relay log to the new relay log file, by
> +      rli->group_relay_log_pos  <-- BIN_LOG_HEADER_SIZE;
> +      rli->event_relay_log_pos  <-- BIN_LOG_HEADER_SIZE;
> +      rli->group_relay_log_name <-- rli->relay_log.get_log_fname();
> +      rli->event_relay_log_name <-- rli->relay_log.get_log_fname();
> +  
> +   If there is an error, it returns (1), otherwise returns (0).
>   */
>  static int init_recovery(Master_info* mi)
>  {
> @@ -4316,6 +4324,8 @@ void rotate_relay_log(Master_info* mi)
>    DBUG_ENTER("rotate_relay_log");
>    Relay_log_info* rli= &mi->rli;
>  
> +  DBUG_EXECUTE_IF("crash_before_rotate_relaylog", exit(1););
> +
>    /* We don't lock rli->run_lock. This would lead to deadlocks. */
>    pthread_mutex_lock(&mi->run_lock);
>  
> 
> === modified file 'sql/sql_binlog.cc'
> --- a/sql/sql_binlog.cc	2009-01-26 16:03:39 +0000
> +++ b/sql/sql_binlog.cc	2009-02-12 09:13:57 +0000
> @@ -58,7 +58,7 @@ void mysql_client_binlog_statement(THD* 
>    my_bool have_fd_event= TRUE;
>    if (!thd->rli_fake)
>    {
> -    thd->rli_fake= new Relay_log_info;
> +    thd->rli_fake= new Relay_log_info(false);

We use 'FALSE' instead of 'false'.

>  #ifdef HAVE_purify
>      thd->rli_fake->is_fake= TRUE;
>  #endif
> 
> 

-- 
Mats Kindahl
Lead Software Developer
Replication Team
MySQL AB, www.mysql.com

Thread
bzr commit into mysql-6.0-rpl branch (alfranio.correia:2811) Bug#40337Alfranio Correia12 Feb
  • Re: bzr commit into mysql-6.0-rpl branch (alfranio.correia:2811)Bug#40337Mats Kindahl16 Feb
    • Re: bzr commit into mysql-6.0-rpl branch (alfranio.correia:2811)Bug#40337Alfranio Correia17 Feb