Hi Alfranio,
Patch approved, thank you!
Alfranio Correia wrote:
> #At
> file:///home/acorreia/workspace.sun/repository.mysql/bzrwork/bug-43075/mysql-6.0-rpl/
> based on revid:aelkin@stripped
>
> 2821 Alfranio Correia 2009-03-12
> BUG#43075 rpl.rpl_sync fails sporadically on pushbuild
>
> The slave was crashing while failing to execute the init_slave() function.
>
> The issue stems from two different reasons:
>
> 1 - A failure while allocating the master info structure generated a
> segfault due to a NULL pointer.
>
> 2 - A failure while recovering generated a segfault due to a non-initialized
> relay log file. In other words, the mi->init and rli->init were both
> set to true
> before executing the recovery process thus creating an inconsistent state
> as the
> relay log file was not initialized.
>
> To circumvent such problems, we refactored the recovery process which is now
> executed while
> initializing the relay log. It is ensured that the master info structure is
> created
> before accessing it and any error is propagated thus avoiding to set
> mi->init and
> rli->init to true when for instance the relay log is not initialized or the
> relay info
> is not flushed.
>
> The changes related to the refactory are described below:
>
> 1 - Removed call to init_recovery from init_slave.
>
> 2 - Changed the signature of the function init_recovery.
>
> 3 - Removed flushes. They are called while initializing the relay log and
> master info.
>
> 4 - Made sure that if the relay info is not flushed the mi-init and rli-init
> are not
> set to true.
>
> In this patch, we also replaced the exit(1) in the fault injection by
> DBUG_ABORT() to
> make it compliant with the code guidelines.
> modified:
> mysql-test/suite/rpl/r/rpl_sync.result
> mysql-test/suite/rpl/t/rpl_sync.test
> sql/rpl_mi.cc
> sql/rpl_rli.cc
> sql/slave.cc
> sql/slave.h
>
> per-file messages:
> mysql-test/suite/rpl/r/rpl_sync.result
> Added abort while simulating crashes.
> mysql-test/suite/rpl/t/rpl_sync.test
> Added abort while simulating crashes.
> sql/rpl_mi.cc
> If evertything went well the recovery is over.
> sql/rpl_rli.cc
> The mi->init and rli->init were both set to true before executing the
> recovery
> process which might fail thus generating an iconsistent state as the relay
> log file was not intialized.
>
> To avoid problems, the recovery process is now executed while initializing
> the relay log and any error is propagated thus avoiding to set mi->init
> and rli->init to true.
>
> It is also ensured that the relay log is flushed.
> sql/slave.cc
> Changed due to the refactory.
> sql/slave.h
> Changes due to the refactory.
> === modified file 'mysql-test/suite/rpl/r/rpl_sync.result'
> --- a/mysql-test/suite/rpl/r/rpl_sync.result 2009-02-17 23:06:00 +0000
> +++ b/mysql-test/suite/rpl/r/rpl_sync.result 2009-03-12 11:16:58 +0000
> @@ -5,6 +5,7 @@ reset master;
> reset slave;
> drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
> start slave;
> +call mtr.add_suppression('Attempting backtrace');
> CREATE TABLE t1(a INT, PRIMARY KEY(a)) engine=innodb;
> insert into t1(a) values(1);
> insert into t1(a) values(2);
>
> === modified file 'mysql-test/suite/rpl/t/rpl_sync.test'
> --- a/mysql-test/suite/rpl/t/rpl_sync.test 2009-02-17 23:06:00 +0000
> +++ b/mysql-test/suite/rpl/t/rpl_sync.test 2009-03-12 11:16:58 +0000
> @@ -29,11 +29,11 @@
> --source include/master-slave.inc
> --source include/not_embedded.inc
> --source include/not_valgrind.inc
> -
> --source include/have_debug.inc
> -
> --source include/have_innodb.inc
>
> +call mtr.add_suppression('Attempting backtrace');
> +
> CREATE TABLE t1(a INT, PRIMARY KEY(a)) engine=innodb;
>
> insert into t1(a) values(1);
>
> === modified file 'sql/rpl_mi.cc'
> --- a/sql/rpl_mi.cc 2009-02-19 08:54:07 +0000
> +++ b/sql/rpl_mi.cc 2009-03-12 11:16:58 +0000
> @@ -380,6 +380,7 @@ file '%s')", fname);
> goto err;
>
> mi->inited = 1;
> + mi->rli.is_relay_log_recovery= FALSE;
> // now change cache READ -> WRITE - must do this before flush_master_info
> reinit_io_cache(&mi->file, WRITE_CACHE, 0L, 0, 1);
> if ((error=test(flush_master_info(mi, 1))))
>
> === modified file 'sql/rpl_rli.cc'
> --- a/sql/rpl_rli.cc 2009-03-09 12:55:05 +0000
> +++ b/sql/rpl_rli.cc 2009-03-12 11:16:58 +0000
> @@ -250,8 +250,10 @@ Failed to open the existing relay log in
> rli->group_relay_log_pos= rli->event_relay_log_pos= relay_log_pos;
> rli->group_master_log_pos= master_log_pos;
>
> - if (!rli->is_relay_log_recovery &&
> - init_relay_log_pos(rli,
> + if (rli->is_relay_log_recovery && init_recovery(rli->mi,
> &msg))
> + goto err;
> +
> + if (init_relay_log_pos(rli,
> rli->group_relay_log_name,
> rli->group_relay_log_pos,
> 0 /* no data lock*/,
> @@ -266,7 +268,6 @@ Failed to open the existing relay log in
> }
>
> #ifndef DBUG_OFF
> - if (!(rli->is_relay_log_recovery))
> {
> char llbuf1[22], llbuf2[22];
> DBUG_PRINT("info", ("my_b_tell(rli->cur_log)=%s
> rli->event_relay_log_pos=%s",
> @@ -283,7 +284,10 @@ Failed to open the existing relay log in
> */
> reinit_io_cache(&rli->info_file, WRITE_CACHE,0L,0,1);
> if ((error= flush_relay_log_info(rli)))
> - sql_print_error("Failed to flush relay log info file");
> + {
> + msg="Failed to flush relay log info file";
> + goto err;
> + }
> if (count_relay_log_space(rli))
> {
> msg="Error counting relay log space";
>
> === modified file 'sql/slave.cc'
> --- a/sql/slave.cc 2009-03-09 12:55:05 +0000
> +++ b/sql/slave.cc 2009-03-12 11:16:58 +0000
> @@ -132,7 +132,6 @@ static bool wait_for_relay_log_space(Rel
> static inline bool io_slave_killed(THD* thd,Master_info* mi);
> static inline bool sql_slave_killed(THD* thd,Relay_log_info* rli);
> static int init_slave_thread(THD* thd, SLAVE_THD_TYPE thd_type);
> -static int init_recovery(Master_info* mi);
> static int safe_connect(THD* thd, MYSQL* mysql, Master_info* mi);
> static int safe_reconnect(THD* thd, MYSQL* mysql, Master_info* mi,
> bool suppress_warnings);
> @@ -261,12 +260,6 @@ int init_slave()
> goto err;
> }
>
> - if (active_mi->rli.is_relay_log_recovery && init_recovery(active_mi))
> - {
> - error= 1;
> - goto err;
> - }
> -
> /* If server id is not set, start_slave_thread() will say it */
>
> if (active_mi->host[0] && !opt_skip_slave_start)
> @@ -285,7 +278,6 @@ int init_slave()
> }
>
> err:
> - active_mi->rli.is_relay_log_recovery= FALSE;
> pthread_mutex_unlock(&LOCK_active_mi);
> DBUG_RETURN(error);
> }
> @@ -317,9 +309,8 @@ err:
>
> If there is an error, it returns (1), otherwise returns (0).
> */
> -static int init_recovery(Master_info* mi)
> +int init_recovery(Master_info* mi, const char** errmsg)
> {
> - const char *errmsg= 0;
> DBUG_ENTER("init_recovery");
>
> Relay_log_info *rli= &mi->rli;
> @@ -339,26 +330,8 @@ static int init_recovery(Master_info* mi
> sizeof(mi->rli.event_relay_log_name)-1);
>
> rli->group_relay_log_pos= rli->event_relay_log_pos= BIN_LOG_HEADER_SIZE;
> -
> - if (init_relay_log_pos(rli,
> - rli->group_relay_log_name,
> - rli->group_relay_log_pos,
> - 0 /*no data lock*/,
> - &errmsg, 0))
> - DBUG_RETURN(1);
> -
> - if (flush_master_info(mi, 0))
> - {
> - sql_print_error("Failed to flush master info file");
> - DBUG_RETURN(1);
> - }
> - if (flush_relay_log_info(rli))
> - {
> - sql_print_error("Failed to flush relay info file");
> - DBUG_RETURN(1);
> - }
> }
> -
> +
> DBUG_RETURN(0);
> }
>
> @@ -4331,7 +4304,7 @@ void rotate_relay_log(Master_info* mi)
> DBUG_ENTER("rotate_relay_log");
> Relay_log_info* rli= &mi->rli;
>
> - DBUG_EXECUTE_IF("crash_before_rotate_relaylog", exit(1););
> + DBUG_EXECUTE_IF("crash_before_rotate_relaylog", DBUG_ABORT(););
>
> /* We don't lock rli->run_lock. This would lead to deadlocks. */
> pthread_mutex_lock(&mi->run_lock);
>
> === modified file 'sql/slave.h'
> --- a/sql/slave.h 2008-12-24 10:48:24 +0000
> +++ b/sql/slave.h 2009-03-12 11:16:58 +0000
> @@ -144,6 +144,7 @@ extern ulonglong relay_log_space_limit;
> #define SLAVE_FORCE_ALL 4
>
> int init_slave();
> +int init_recovery(Master_info* mi, const char** errmsg);
> void init_slave_skip_errors(const char* arg);
> bool flush_relay_log_info(Relay_log_info* rli);
> int register_slave_on_master(MYSQL* mysql);
>
>