Hi Sven
I think your fix also has a race condition.
1) start slave until ...;
2) source include/wait_for_slave_io_to_start.inc;
3) source include/wait_for_slave_sql_to_stop.inc;
If, before 2) runs, the slave I/O thread has already started, finished
replicating up to the given position, and stopped, then 2) will fail while
waiting for the slave I/O thread to start, and thus the test will fail.
Sven Sandberg wrote:
> #At file:///home/sven/bzr/b37717-rpl_stm_until/5.1-rpl/
>
> 2633 Sven Sandberg 2008-07-23
> BUG#37717: rpl.rpl_stm_until 'stmt' fails sporadically on pushbuild
> Problem: After START SLAVE, the Slave_IO_Status column of
> SHOW SLAVE STATUS goes from No to Yes asynchronously. That
> caused sporadic failures on pushbuild in rpl_stm_until since
> the test contains SHOW SLAVE STATUS right after START SLAVE.
> Fix: Wait until Slave_IO_Status becomes Yes after each
> START SLAVE.
> added:
> mysql-test/include/wait_for_slave_io_to_start.inc
> modified:
> mysql-test/suite/rpl/r/rpl_stm_until.result
> mysql-test/suite/rpl/t/rpl_stm_until.test
>
> per-file messages:
> mysql-test/include/wait_for_slave_io_to_start.inc
> Macro that waits until the Slave_IO_Running field of
> SHOW SLAVE STATUS becomes Yes.
> mysql-test/suite/rpl/r/rpl_stm_until.result
> updated result file
> mysql-test/suite/rpl/t/rpl_stm_until.test
> - Added wait_for_slave_io_to_start after each start slave.
> - Removed unused initialization of test variable $VERSION
> - Added comments.
> === added file 'mysql-test/include/wait_for_slave_io_to_start.inc'
> --- a/mysql-test/include/wait_for_slave_io_to_start.inc 1970-01-01 00:00:00 +0000
> +++ b/mysql-test/include/wait_for_slave_io_to_start.inc 2008-07-23 11:23:52 +0000
> @@ -0,0 +1,19 @@
> +# ==== Purpose ====
> +#
> +# Waits until the IO thread of the current connection has started and
> +# connected to the master (i.e., until SHOW SLAVE STATUS returns Yes
> +# in the Slave_IO_Running field), or until a timeout is reached.
> +#
> +# ==== Usage ====
> +#
> +# source include/wait_for_slave_io_to_start.inc;
> +#
> +# Parameters to this macro are $slave_timeout and
> +# $slave_keep_connection. See wait_for_slave_param.inc for
> +# descriptions.
> +
> +let $slave_param= Slave_IO_Running;
> +let $slave_param_value= Yes;
> +let $slave_error_message= Failed while waiting for slave IO thread to start;
> +source include/wait_for_slave_param.inc;
> +let $slave_error_message= ;
>
> === modified file 'mysql-test/suite/rpl/r/rpl_stm_until.result'
> --- a/mysql-test/suite/rpl/r/rpl_stm_until.result 2008-07-10 16:09:39 +0000
> +++ b/mysql-test/suite/rpl/r/rpl_stm_until.result 2008-07-23 11:23:52 +0000
> @@ -4,7 +4,10 @@ reset master;
> reset slave;
> drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
> start slave;
> +[on slave]
> include/stop_slave.inc
> +==== Create some events on master ====
> +[on master]
> create table t1(n int not null auto_increment primary key);
> insert into t1 values (1),(2),(3),(4);
> drop table t1;
> @@ -12,6 +15,8 @@ create table t2(n int not null auto_incr
> insert into t2 values (1),(2);
> insert into t2 values (3),(4);
> drop table t2;
> +==== Replicate one event at a time on slave ====
> +[on slave]
> start slave until master_log_file='master-bin.000001', master_log_pos=323;
> select * from t1;
> n
> @@ -149,6 +154,8 @@ Last_IO_Error #
> Last_SQL_Errno 0
> Last_SQL_Error
> start slave;
> +[on master]
> +[on slave]
> include/stop_slave.inc
> start slave until master_log_file='master-bin.000001', master_log_pos=776;
> SHOW SLAVE STATUS;
> @@ -190,6 +197,7 @@ Last_IO_Errno #
> Last_IO_Error #
> Last_SQL_Errno 0
> Last_SQL_Error
> +==== Test various error conditions ====
> start slave until master_log_file='master-bin', master_log_pos=561;
> ERROR HY000: Incorrect parameter or combination of parameters for START SLAVE UNTIL
> start slave until master_log_file='master-bin.000001', master_log_pos=561,
> relay_log_pos=12;
>
> === modified file 'mysql-test/suite/rpl/t/rpl_stm_until.test'
> --- a/mysql-test/suite/rpl/t/rpl_stm_until.test 2008-07-10 16:09:39 +0000
> +++ b/mysql-test/suite/rpl/t/rpl_stm_until.test 2008-07-23 11:23:52 +0000
> @@ -1,18 +1,37 @@
> +# ==== Purpose ====
> +#
> +# Verify that START SLAVE UNTIL replicates until the given binlog
> +# position but not longer. Verify that START SLAVE UNTIL with various
> +# incorrect arguments gives an error.
> +#
> +# ==== Method ====
> +#
> +# On master, create a table and insert some rows. On slave, START
> +# SLAVE UNTIL so that it reads one event at a time, and check the
> +# table and the slave status each time.
> +#
> +# Then, on slave, run START SLAVE UNTIL with incorrect arguments and
> +# verify that it gives an error.
> +#
> +# ==== Related bugs ====
> +#
> +# Bug in this test: BUG#37717: rpl.rpl_stm_until 'stmt' fails sporadically on pushbuild
> +
> -- source include/have_binlog_format_mixed_or_statement.inc
> -- source include/master-slave.inc
>
> # Test is dependent on binlog positions
>
> -# prepare version for substitutions
> -let $VERSION=`select version()`;
> -
> -# stop slave before he will start replication also sync with master
> -# for avoiding undetermenistic behaviour
> +# Stop slave before it starts replication. Also sync with master
> +# to avoid nondeterministic behaviour.
> +--echo [on slave]
> sync_slave_with_master;
> --source include/stop_slave.inc
>
> +--echo ==== Create some events on master ====
> +
> +--echo [on master]
> connection master;
> -# create some events on master
> create table t1(n int not null auto_increment primary key);
> insert into t1 values (1),(2),(3),(4);
> drop table t1;
> @@ -21,9 +40,13 @@ insert into t2 values (1),(2);
> insert into t2 values (3),(4);
> drop table t2;
>
> +--echo ==== Replicate one event at a time on slave ====
> +
> # try to replicate all queries until drop of t1
> +--echo [on slave]
> connection slave;
> start slave until master_log_file='master-bin.000001', master_log_pos=323;
> +--source include/wait_for_slave_io_to_start.inc
> --source include/wait_for_slave_sql_to_stop.inc
> # here table should be still not deleted
> select * from t1;
> @@ -31,6 +54,7 @@ source include/show_slave_status2.inc;
>
> # this should fail right after start
> start slave until master_log_file='master-no-such-bin.000001', master_log_pos=291;
> +--source include/wait_for_slave_io_to_start.inc
> --source include/wait_for_slave_sql_to_stop.inc
> # again this table should be still not deleted
> select * from t1;
> @@ -38,24 +62,28 @@ source include/show_slave_status2.inc;
>
> # try replicate all up to and not including the second insert to t2;
> start slave until relay_log_file='slave-relay-bin.000004', relay_log_pos=746;
> +--source include/wait_for_slave_io_to_start.inc
> --source include/wait_for_slave_sql_to_stop.inc
> select * from t2;
> source include/show_slave_status2.inc;
>
> # clean up
> start slave;
> +--echo [on master]
> connection master;
> +--echo [on slave]
> sync_slave_with_master;
> --source include/stop_slave.inc
>
> # this should stop immediately as we are already there
> start slave until master_log_file='master-bin.000001', master_log_pos=776;
> +--source include/wait_for_slave_io_to_start.inc
> --source include/wait_for_slave_sql_to_stop.inc
> -# here the sql slave thread should be stopped
> --replace_result bin.000005 bin.000004 bin.000006 bin.000004 bin.000007 bin.000004
> source include/show_slave_status2.inc;
>
> -#testing various error conditions
> +--echo ==== Test various error conditions ====
> +
> --error 1277
> start slave until master_log_file='master-bin', master_log_pos=561;
> --error 1277
> @@ -70,4 +98,3 @@ start slave until relay_log_file='slave-
> start slave sql_thread;
> start slave until master_log_file='master-bin.000001', master_log_pos=776;
>
> -# End of 4.1 tests
>