From: Alfranio Correia Date: February 11 2009 10:23am Subject: bzr commit into mysql-6.0-rpl branch (alfranio.correia:2811) Bug#40337 List-Archive: http://lists.mysql.com/commits/65869 X-Bug: 40337 Message-Id: <0KEW008GOCUKK5C0@fe-emea-09.sun.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Transfer-Encoding: 7BIT #At file:///home/acorreia/workspace.sun/repository.mysql/bzrwork/bug-40337/mysql-6.0-rpl/ based on revid:alfranio.correia@stripped 2811 Alfranio Correia 2009-02-11 BUG#40337 Fsyncing master and relay log to disk after every event is too slow Post-fix. added: mysql-test/suite/rpl/r/rpl_sync.result mysql-test/suite/rpl/t/rpl_sync-slave.opt mysql-test/suite/rpl/t/rpl_sync.test modified: sql/rpl_mi.cc sql/rpl_mi.h sql/rpl_rli.cc sql/rpl_rli.h sql/slave.cc sql/sql_binlog.cc === added file 'mysql-test/suite/rpl/r/rpl_sync.result' --- a/mysql-test/suite/rpl/r/rpl_sync.result 1970-01-01 00:00:00 +0000 +++ b/mysql-test/suite/rpl/r/rpl_sync.result 2009-02-11 10:22:51 +0000 @@ -0,0 +1,75 @@ +=====Configuring the enviroment=======; +stop slave; +drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9; +reset master; +reset slave; +drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9; +start slave; +CREATE TABLE t1(a INT, PRIMARY KEY(a)) engine=innodb; +insert into t1(a) values(1); +insert into t1(a) values(2); +insert into t1(a) values(3); +=====Inserting data on the master but without the SQL Thread being running=======; +stop slave SQL_THREAD; +insert into t1(a) values(4); +insert into t1(a) values(5); +insert into t1(a) values(6); +=====Removing relay log files and crashing/recoverying the slave=======; +stop slave IO_THREAD; +failure +SET SESSION debug="d,crash_sync"; +FLUSH LOGS; +ERROR HY000: Lost connection to MySQL server during query +=====Dumping and comparing tables=======; +start slave; +select * from t1; +a +1 +2 +3 +4 +5 +6 +select * from t1; +a +1 +2 +3 +4 +5 +6 +=====Corrupting the master.info=======; +stop slave; +FLUSH LOGS; +insert into t1(a) values(7); +insert into t1(a) values(8); +insert into t1(a) values(9); +SET SESSION debug="d,crash_sync"; +FLUSH LOGS; +ERROR HY000: Lost connection to MySQL server during query +=====Dumping and comparing tables=======; +start slave; +select * from t1; +a +1 +2 +3 +4 +5 +6 +7 +8 +9 +select * from t1; +a +1 +2 +3 +4 +5 +6 +7 +8 +9 +=====Clean up=======; +drop table t1; === added file 'mysql-test/suite/rpl/t/rpl_sync-slave.opt' --- a/mysql-test/suite/rpl/t/rpl_sync-slave.opt 1970-01-01 00:00:00 +0000 +++ b/mysql-test/suite/rpl/t/rpl_sync-slave.opt 2009-02-11 10:22:51 +0000 @@ -0,0 +1 @@ +--sync-master-info=999 --sync-relay-log=999 --sync-binlog=999 --sync-relay-log-info=1 --relay-log-recovery=1 === added file 'mysql-test/suite/rpl/t/rpl_sync.test' --- a/mysql-test/suite/rpl/t/rpl_sync.test 1970-01-01 00:00:00 +0000 +++ b/mysql-test/suite/rpl/t/rpl_sync.test 2009-02-11 10:22:51 +0000 @@ -0,0 +1,143 @@ +######################################################################################## +# This test verifies the options --sync-relay-log-info and --relay-log-recovery by +# crashing the slave in two different situations: +# (case-1) - Corrupt the relay log with changes which were not processed by +# the SQL Thread and crashes it. +# (case-2) - Corrupt the master.info with wrong coordinates and crashes it. +# +# Case 1: +# 1 - Stops the SQL Thread +# 2 - Inserts new records into the master. +# 3 - Corrupts the relay-log.bin* which most likely has such changes. +# 4 - Crashes the slave +# 5 - Verifies if the slave is sync with the master which means that the information +# loss was circumvented by the recovery process. +# +# Case 2: +# 1 - Stops the SQL/IO Threads +# 2 - Inserts new records into the master. +# 3 - Corrupts the master.info with wrong coordinates. +# 4 - Crashes the slave +# 5 - Verifies if the slave is sync with the master which means that the information +# loss was circumvented by the recovery process. +######################################################################################## + +######################################################################################## +# Configuring the environment +######################################################################################## +--echo =====Configuring the enviroment=======; +--source include/master-slave.inc +--source include/not_embedded.inc +--source include/not_valgrind.inc + +--source include/have_debug.inc + +--source include/have_innodb.inc + +CREATE TABLE t1(a INT, PRIMARY KEY(a)) engine=innodb; + +insert into t1(a) values(1); +insert into t1(a) values(2); +insert into t1(a) values(3); + +######################################################################################## +# Case 1: Corrupt a relay-log.bin* +######################################################################################## +--echo =====Inserting data on the master but without the SQL Thread being running=======; +save_master_pos; +connection slave; +sync_with_master; + +connection slave; +let $MYSQLD_SLAVE_DATADIR= `select @@datadir`; +--replace_result $MYSQLD_SLAVE_DATADIR MYSQLD_SLAVE_DATADIR +--copy_file $MYSQLD_SLAVE_DATADIR/master.info $MYSQLD_SLAVE_DATADIR/master.backup +stop slave SQL_THREAD; +source include/wait_for_slave_sql_to_stop.inc; + +connection master; +insert into t1(a) values(4); +insert into t1(a) values(5); +insert into t1(a) values(6); + +--echo =====Removing relay log files and crashing/recoverying the slave=======; +connection slave; +stop slave IO_THREAD; +source include/wait_for_slave_io_to_stop.inc; + +let $file= query_get_value("SHOW SLAVE STATUS", Relay_Log_File, 1); +--replace_result $MYSQLD_SLAVE_DATADIR MYSQLD_SLAVE_DATADIR +--exec echo "failure" > $MYSQLD_SLAVE_DATADIR/$file +--exec cat $MYSQLD_SLAVE_DATADIR/$file + +--exec echo "restart" > $MYSQLTEST_VARDIR/tmp/mysqld.2.expect +SET SESSION debug="d,crash_sync"; +--error 2013 +FLUSH LOGS; + +--enable_reconnect +--source include/wait_until_connected_again.inc + +--echo =====Dumping and comparing tables=======; +start slave; +source include/wait_for_slave_to_start.inc; + +connection master; +save_master_pos; +connection slave; +sync_with_master; + +connection master; +select * from t1; + +connection slave; +select * from t1; + +######################################################################################## +# Case 1: Corrupt a master.info +######################################################################################## +--echo =====Corrupting the master.info=======; +connection slave; +stop slave; +source include/wait_for_slave_to_stop.inc; + +connection master; +FLUSH LOGS; + +insert into t1(a) values(7); +insert into t1(a) values(8); +insert into t1(a) values(9); + +connection slave; +--replace_result $MYSQLD_SLAVE_DATADIR MYSQLD_SLAVE_DATADIR +--exec cat $MYSQLD_SLAVE_DATADIR/master.backup > $MYSQLD_SLAVE_DATADIR/master.info + +--exec echo "restart" > $MYSQLTEST_VARDIR/tmp/mysqld.2.expect +SET SESSION debug="d,crash_sync"; +--error 2013 +FLUSH LOGS; + +--enable_reconnect +--source include/wait_until_connected_again.inc + +--echo =====Dumping and comparing tables=======; +start slave; +source include/wait_for_slave_to_start.inc; + +connection master; +save_master_pos; +connection slave; +sync_with_master; + +connection master; +select * from t1; + +connection slave; +select * from t1; + +######################################################################################## +# Clean up +######################################################################################## +--echo =====Clean up=======; +connection master; +drop table t1; === modified file 'sql/rpl_mi.cc' --- a/sql/rpl_mi.cc 2009-02-04 16:11:20 +0000 +++ b/sql/rpl_mi.cc 2009-02-11 10:22:51 +0000 @@ -30,13 +30,14 @@ int init_strvar_from_file(char *var, int int init_floatvar_from_file(float* var, IO_CACHE* f, float default_val); int init_dynarray_intvar_from_file(DYNAMIC_ARRAY* arr, IO_CACHE* f); -Master_info::Master_info() +Master_info::Master_info(bool is_slave_recovery) :Slave_reporting_capability("I/O"), ssl(0), ssl_verify_server_cert(0), fd(-1), io_thd(0), port(MYSQL_PORT), connect_retry(DEFAULT_CONNECT_RETRY), heartbeat_period(0), received_heartbeats(0), inited(0), master_id(0), abort_slave(0), slave_running(0), slave_run_id(0), - sync_counter(0) + sync_counter(0), + rli(is_slave_recovery) { host[0] = 0; user[0] = 0; password[0] = 0; ssl_ca[0]= 0; ssl_capath[0]= 0; ssl_cert[0]= 0; === modified file 'sql/rpl_mi.h' --- a/sql/rpl_mi.h 2009-02-04 16:11:20 +0000 +++ b/sql/rpl_mi.h 2009-02-11 10:22:51 +0000 @@ -59,7 +59,7 @@ class Master_info : public Slave_reporting_capability { public: - Master_info(); + Master_info(bool is_slave_recovery); ~Master_info(); bool shall_ignore_server_id(ulong s_id); === modified file 'sql/rpl_rli.cc' --- a/sql/rpl_rli.cc 2009-02-04 16:11:20 +0000 +++ b/sql/rpl_rli.cc 2009-02-11 10:22:51 +0000 @@ -30,7 +30,7 @@ int init_strvar_from_file(char *var, int const char *default_val); -Relay_log_info::Relay_log_info() +Relay_log_info::Relay_log_info(bool is_slave_recovery) :Slave_reporting_capability("SQL"), no_storage(FALSE), replicate_same_server_id(::replicate_same_server_id), info_fd(-1), cur_log_fd(-1), @@ -50,6 +50,8 @@ Relay_log_info::Relay_log_info() { DBUG_ENTER("Relay_log_info::Relay_log_info"); + is_relay_log_recovery= is_slave_recovery; + group_relay_log_name[0]= event_relay_log_name[0]= group_master_log_name[0]= 0; until_log_name[0]= ign_master_log_name_end[0]= 0; @@ -265,14 +267,14 @@ Failed to open the existing relay log in } #ifndef DBUG_OFF + if (!(rli->is_relay_log_recovery)) { char llbuf1[22], llbuf2[22]; DBUG_PRINT("info", ("my_b_tell(rli->cur_log)=%s rli->event_relay_log_pos=%s", llstr(my_b_tell(rli->cur_log),llbuf1), llstr(rli->event_relay_log_pos,llbuf2))); DBUG_ASSERT(rli->event_relay_log_pos >= BIN_LOG_HEADER_SIZE); - DBUG_ASSERT(rli->is_relay_log_recovery || - (my_b_tell(rli->cur_log) == rli->event_relay_log_pos)); + DBUG_ASSERT((my_b_tell(rli->cur_log) == rli->event_relay_log_pos)); } #endif === modified file 'sql/rpl_rli.h' --- a/sql/rpl_rli.h 2009-02-04 16:11:20 +0000 +++ b/sql/rpl_rli.h 2009-02-11 10:22:51 +0000 @@ -273,7 +273,7 @@ public: char ign_master_log_name_end[FN_REFLEN]; ulonglong ign_master_log_pos_end; - Relay_log_info(); + Relay_log_info(bool is_slave_recovery); ~Relay_log_info(); /* === modified file 'sql/slave.cc' --- a/sql/slave.cc 2009-02-04 16:11:20 +0000 +++ b/sql/slave.cc 2009-02-11 10:22:51 +0000 @@ -220,6 +220,7 @@ void unlock_slave_threads(Master_info* m int init_slave() { DBUG_ENTER("init_slave"); + int error= 0; /* This is called when mysqld starts. Before client connections are @@ -233,9 +234,12 @@ int init_slave() */ if (pthread_key_create(&RPL_MASTER_INFO, NULL)) + { + error= 1; goto err; + } - active_mi= new Master_info; + active_mi= new Master_info(relay_log_recovery); /* If master_host is not specified, try to read it from the master_info file. @@ -245,6 +249,7 @@ int init_slave() if (!active_mi) { sql_print_error("Failed to allocate memory for the master info structure"); + error= 1; goto err; } @@ -252,12 +257,15 @@ int init_slave() 1, (SLAVE_IO | SLAVE_SQL))) { sql_print_error("Failed to initialize the master info structure"); + error= 1; goto err; } - active_mi->rli.is_relay_log_recovery= relay_log_recovery; if (active_mi->rli.is_relay_log_recovery && init_recovery(active_mi)) + { + error= 1; goto err; + } /* If server id is not set, start_slave_thread() will say it */ @@ -271,43 +279,43 @@ int init_slave() SLAVE_IO | SLAVE_SQL)) { sql_print_error("Failed to create slave threads"); + error= 1; goto err; } } - active_mi->rli.is_relay_log_recovery= 0; - pthread_mutex_unlock(&LOCK_active_mi); - DBUG_RETURN(0); err: - active_mi->rli.is_relay_log_recovery= 0; + active_mi->rli.is_relay_log_recovery= false; pthread_mutex_unlock(&LOCK_active_mi); - DBUG_RETURN(1); + DBUG_RETURN(error); } /* - * Updates the master info based on the information stored in the - * relay info and ignores relay logs previously retrieved by the IO - * thread, which thus starts fetching again based on to the - * group_master_log_pos and group_master_log_name. Eventually, the old - * relay logs will be purged by the normal purge mechanism. - * - * In the feature, we can improve this routine in order to avoid throwing - * away logs that are safely stored in the disk. - * - * There is no need for a mutex as the caller (i.e. init_slave) already - * has one acquired. - * - * Specifically, the following structures are updated: - * - * 1 - mi->master_log_pos <-- rli->group_master_log_pos - * 2 - mi->master_log_name <-- rli->group_master_log_name - * 3 - It moves the relay log to the new relay log file, by - * rli->group_relay_log_pos <-- BIN_LOG_HEADER_SIZE; - * rli->event_relay_log_pos <-- BIN_LOG_HEADER_SIZE; - * rli->group_relay_log_name <-- rli->relay_log.get_log_fname(); - * rli->event_relay_log_name <-- rli->relay_log.get_log_fname(); - * - * If there is an error, it returns (1), otherwise returns (0). + Updates the master info based on the information stored in the + relay info and ignores relay logs previously retrieved by the IO + thread, which thus starts fetching again based on to the + group_master_log_pos and group_master_log_name. Eventually, the old + relay logs will be purged by the normal purge mechanism. + + In the feature, we should improve this routine in order to avoid throwing + away logs that are safely stored in the disk. Note also that this recovery + routine relies on the correctness of the relay-log.info and only tolerates + coordinate problems in master.info. + + In this function, there is no need for a mutex as the caller + (i.e. init_slave) already has one acquired. + + Specifically, the following structures are updated: + + 1 - mi->master_log_pos <-- rli->group_master_log_pos + 2 - mi->master_log_name <-- rli->group_master_log_name + 3 - It moves the relay log to the new relay log file, by + rli->group_relay_log_pos <-- BIN_LOG_HEADER_SIZE; + rli->event_relay_log_pos <-- BIN_LOG_HEADER_SIZE; + rli->group_relay_log_name <-- rli->relay_log.get_log_fname(); + rli->event_relay_log_name <-- rli->relay_log.get_log_fname(); + + If there is an error, it returns (1), otherwise returns (0). */ static int init_recovery(Master_info* mi) { @@ -4316,6 +4324,8 @@ void rotate_relay_log(Master_info* mi) DBUG_ENTER("rotate_relay_log"); Relay_log_info* rli= &mi->rli; + DBUG_EXECUTE_IF("crash_sync", exit(1);); + /* We don't lock rli->run_lock. This would lead to deadlocks. */ pthread_mutex_lock(&mi->run_lock); === modified file 'sql/sql_binlog.cc' --- a/sql/sql_binlog.cc 2009-01-26 16:03:39 +0000 +++ b/sql/sql_binlog.cc 2009-02-11 10:22:51 +0000 @@ -58,7 +58,7 @@ void mysql_client_binlog_statement(THD* my_bool have_fd_event= TRUE; if (!thd->rli_fake) { - thd->rli_fake= new Relay_log_info; + thd->rli_fake= new Relay_log_info(false); #ifdef HAVE_purify thd->rli_fake->is_fake= TRUE; #endif