Hi Daogang,
Good work, I have some comments, please see below.
Dao-Gang.Qu@stripped wrote:
> #At file:///home/daogangq/mysql/bzrwork/bug45214/5.1-bt/ based on
> revid:davi.arnaut@stripped
>
> 2941 Dao-Gang.Qu@stripped 2009-06-27
> Bug #45214 get_master_version_and_clock does not report error when queries
> fail
>
> The "get_master_version_and_clock(...)" function in sql/slave.cc ignores error
>
> and passes directly when queries get a failed response from master because of
> transient network problems between master and slave or get a successful
> response
> but the value of relational variable is not existent in the result retrieved.
>
> The "get_master_version_and_clock(...)" function should try to reconnect master
>
> if the queries get a failed response from master because of transient network
> problems between master and slave. And it should report a nonexistent value
> error
> or print a warning information if the queries get a successful response but the
>
> value of relational variable is not existent in the result retrieved.
> @ mysql-test/extra/rpl_tests/rpl_get_master_version_clock_on_disconnection.test
> Test script for bug #45214
> @ mysql-test/suite/rpl/r/rpl_get_master_version_clock_on_disconnection.result
> Test result for bug #45214
> @ mysql-test/suite/rpl/t/rpl_get_master_version_clock_on_disconnection.test
> Test script for bug #45214
> @ sql/slave.cc
> Update the slave.cc for fixing bug #45214
>
> added:
> mysql-test/extra/rpl_tests/rpl_get_master_version_clock_on_disconnection.test
> mysql-test/suite/rpl/r/rpl_get_master_version_clock_on_disconnection.result
> mysql-test/suite/rpl/t/rpl_get_master_version_clock_on_disconnection.test
> modified:
> sql/slave.cc
> === added file
> 'mysql-test/extra/rpl_tests/rpl_get_master_version_clock_on_disconnection.test'
> ---
> a/mysql-test/extra/rpl_tests/rpl_get_master_version_clock_on_disconnection.test 1970-01-01
> 00:00:00 +0000
> +++
> b/mysql-test/extra/rpl_tests/rpl_get_master_version_clock_on_disconnection.test 2009-06-27
> 02:32:25 +0000
> @@ -0,0 +1,53 @@
> +# Restart slave under network disconnection between slave and master
> +
This is not clear, please provide a better description about what this
test does.
Please also add test cases to test that the slave IO thread will fail if
the configuration (SERVER_ID, COLLATION_SERVER, etc) is not compatible
on master and slave.
> +connection slave;
> +if (`SELECT '$debug_lock' = ''`)
> +{
> + --die Cannot continue. Please set value for $debug_lock.
> +}
> +
> +# Restart slave
> +--disable_warnings
> +stop slave;
> +source include/wait_for_slave_to_stop.inc;
> +start slave;
> +source include/wait_for_slave_to_start.inc;
> +
> +connection master;
> +# Write file to make mysql-test-run.pl expect the "crash", but don't start
> +# it until it's told to
> +--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
> +wait
> +EOF
> +
> +# Send shutdown to the connected server and give
> +# it 10 seconds to die before zapping it
> +shutdown_server 10;
> +
> +connection slave;
> +eval SELECT RELEASE_LOCK($debug_lock);
> +
> +# Write file to make mysql-test-run.pl start up the server again
> +--append_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
> +restart
> +EOF
> +
> +connection master;
> +# Turn on reconnect
> +--enable_reconnect
> +
> +# Call script that will poll the server waiting for it to be back online again
> +--source include/wait_until_connected_again.inc
> +
> +# Turn off reconnect again
> +--disable_reconnect
> +
> +connection slave;
> +source include/wait_for_slave_to_start.inc;
> +
> +# Show slave last IO errno
> +let $last_io_errno= query_get_value("show slave status", Last_IO_Errno, 1);
> +echo Slave_IO_Errno= $last_io_errno;
> +
> +
> +# End of tests
>
> === added file
> 'mysql-test/suite/rpl/r/rpl_get_master_version_clock_on_disconnection.result'
> ---
> a/mysql-test/suite/rpl/r/rpl_get_master_version_clock_on_disconnection.result 1970-01-01
> 00:00:00 +0000
> +++
> b/mysql-test/suite/rpl/r/rpl_get_master_version_clock_on_disconnection.result 2009-06-27
> 02:32:25 +0000
> @@ -0,0 +1,59 @@
> +stop slave;
> +drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
> +reset master;
> +reset slave;
> +drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
> +start slave;
> +SELECT IS_FREE_LOCK("debug_lock.before_get_UNIX_TIMESTAMP");
> +IS_FREE_LOCK("debug_lock.before_get_UNIX_TIMESTAMP")
> +1
> +SELECT GET_LOCK("debug_lock.before_get_UNIX_TIMESTAMP", 1000);
> +GET_LOCK("debug_lock.before_get_UNIX_TIMESTAMP", 1000)
> +1
> +set global debug= 'd,debug_lock.before_get_UNIX_TIMESTAMP';
> +stop slave;
> +start slave;
> +SELECT RELEASE_LOCK("debug_lock.before_get_UNIX_TIMESTAMP");
> +RELEASE_LOCK("debug_lock.before_get_UNIX_TIMESTAMP")
> +1
> +Slave_IO_Errno= 2013
> +SELECT IS_FREE_LOCK("debug_lock.before_get_SERVER_ID");
> +IS_FREE_LOCK("debug_lock.before_get_SERVER_ID")
> +1
> +SELECT GET_LOCK("debug_lock.before_get_SERVER_ID", 1000);
> +GET_LOCK("debug_lock.before_get_SERVER_ID", 1000)
> +1
> +set global debug= 'd,debug_lock.before_get_SERVER_ID';
> +stop slave;
> +start slave;
> +SELECT RELEASE_LOCK("debug_lock.before_get_SERVER_ID");
> +RELEASE_LOCK("debug_lock.before_get_SERVER_ID")
> +1
> +Slave_IO_Errno= 2013
> +SELECT IS_FREE_LOCK("debug_lock.before_get_COLLATION_SERVER_VALUE");
> +IS_FREE_LOCK("debug_lock.before_get_COLLATION_SERVER_VALUE")
> +1
> +SELECT GET_LOCK("debug_lock.before_get_COLLATION_SERVER_VALUE", 1000);
> +GET_LOCK("debug_lock.before_get_COLLATION_SERVER_VALUE", 1000)
> +1
> +set global debug= 'd,debug_lock.before_get_COLLATION_SERVER_VALUE';
> +stop slave;
> +start slave;
> +SELECT RELEASE_LOCK("debug_lock.before_get_COLLATION_SERVER_VALUE");
> +RELEASE_LOCK("debug_lock.before_get_COLLATION_SERVER_VALUE")
> +1
> +Slave_IO_Errno= 2013
> +SELECT IS_FREE_LOCK("debug_lock.before_get_TIME_ZONE_VALUE");
> +IS_FREE_LOCK("debug_lock.before_get_TIME_ZONE_VALUE")
> +1
> +SELECT GET_LOCK("debug_lock.before_get_TIME_ZONE_VALUE", 1000);
> +GET_LOCK("debug_lock.before_get_TIME_ZONE_VALUE", 1000)
> +1
> +set global debug= 'd,debug_lock.before_get_TIME_ZONE_VALUE';
> +stop slave;
> +start slave;
> +SELECT RELEASE_LOCK("debug_lock.before_get_TIME_ZONE_VALUE");
> +RELEASE_LOCK("debug_lock.before_get_TIME_ZONE_VALUE")
> +1
> +Slave_IO_Errno= 2013
> +set global debug= '';
>
> === added file
> 'mysql-test/suite/rpl/t/rpl_get_master_version_clock_on_disconnection.test'
> ---
> a/mysql-test/suite/rpl/t/rpl_get_master_version_clock_on_disconnection.test 1970-01-01
> 00:00:00 +0000
> +++
> b/mysql-test/suite/rpl/t/rpl_get_master_version_clock_on_disconnection.test 2009-06-27
> 02:32:25 +0000
> @@ -0,0 +1,56 @@
> +#
> +# BUG#45214: http://bugs.mysql.com/bug.php?id=45214
> +# This test verifies if the slave I/O tread tries to reconnect to
> +# master when it tries to get the values of the UNIX_TIMESTAMP, SERVER_ID,
> +# COLLATION_SERVER and TIME_ZONE from master under network disconnection.
> +#
> +# Steps:
> +# 1 - Got DBUG_SYNC_POINT lock
> +# 2 - Set DBUG_SYNC_POINT before call mysql_real_query(...) function in
> get_master_version_and_clock(...) function and hang here
> +# 3 - shutdown master server for simulating network disconnection
> +# 4 - Release DBUG_SYNC_POINT lock
> +# 5 - Check if the slave I/O thread tries to reconnect to master.
> +
> +source include/master-slave.inc;
> +
> +#Test case 1: Try to get the value of the UNIX_TIMESTAMP from master under network
> disconnection
> +connection slave;
> +let $debug_saved= `select @@global.debug`;
> +
> +let $debug_lock= "debug_lock.before_get_UNIX_TIMESTAMP";
> +eval SELECT IS_FREE_LOCK($debug_lock);
> +eval SELECT GET_LOCK($debug_lock, 1000);
> +
> +set global debug= 'd,debug_lock.before_get_UNIX_TIMESTAMP';
> +source extra/rpl_tests/rpl_get_master_version_clock_on_disconnection.test;
> +
> +#Test case 2: Try to get the value of the SERVER_ID from master under network
> disconnection
> +connection slave;
> +let $debug_lock= "debug_lock.before_get_SERVER_ID";
> +eval SELECT IS_FREE_LOCK($debug_lock);
> +eval SELECT GET_LOCK($debug_lock, 1000);
> +
> +set global debug= 'd,debug_lock.before_get_SERVER_ID';
> +source extra/rpl_tests/rpl_get_master_version_clock_on_disconnection.test;
> +
> +#Test case 3: Try to get the value of the COLLATION_SERVER from master under network
> disconnection
> +connection slave;
> +let $debug_lock= "debug_lock.before_get_COLLATION_SERVER_VALUE";
> +eval SELECT IS_FREE_LOCK($debug_lock);
> +eval SELECT GET_LOCK($debug_lock, 1000);
> +
> +set global debug= 'd,debug_lock.before_get_COLLATION_SERVER_VALUE';
> +source extra/rpl_tests/rpl_get_master_version_clock_on_disconnection.test;
> +
> +#Test case 4: Try to get the value of the TIME_ZONE from master under network
> disconnection
> +connection slave;
> +let $debug_lock= "debug_lock.before_get_TIME_ZONE_VALUE";
> +eval SELECT IS_FREE_LOCK($debug_lock);
> +eval SELECT GET_LOCK($debug_lock, 1000);
> +
> +set global debug= 'd,debug_lock.before_get_TIME_ZONE_VALUE';
> +source extra/rpl_tests/rpl_get_master_version_clock_on_disconnection.test;
> +
> +eval set global debug= '$debug_saved';
> +
> +# End of tests
>
> === modified file 'sql/slave.cc'
> --- a/sql/slave.cc 2009-06-09 16:44:26 +0000
> +++ b/sql/slave.cc 2009-06-27 02:32:25 +0000
> @@ -854,6 +854,7 @@ int init_intvar_from_file(int* var, IO_C
> RETURNS
> 0 ok
> 1 error
> + 2 transient network problem, the caller should try to reconnect
> */
>
> static int get_master_version_and_clock(MYSQL* mysql, Master_info* mi)
> @@ -864,7 +865,7 @@ static int get_master_version_and_clock(
> MYSQL_RES *master_res= 0;
> MYSQL_ROW master_row;
> DBUG_ENTER("get_master_version_and_clock");
> -
> + int query_re= 0;
> /*
> Free old description_event_for_queue (that is needed if we are in
> a reconnection).
> @@ -938,10 +939,16 @@ static int get_master_version_and_clock(
> Compare the master and slave's clock. Do not die if master's clock is
> unavailable (very old master not supporting UNIX_TIMESTAMP()?).
> */
> -
> - if (!mysql_real_query(mysql, STRING_WITH_LEN("SELECT UNIX_TIMESTAMP()"))
> &&
> - (master_res= mysql_store_result(mysql)) &&
> - (master_row= mysql_fetch_row(master_res)))
> + DBUG_SYNC_POINT("debug_lock.before_get_UNIX_TIMESTAMP", 10);
> + query_re= mysql_real_query(mysql, STRING_WITH_LEN("SELECT UNIX_TIMESTAMP()"));
> + if (query_re)
> + DBUG_RETURN(2);
> +
> + master_res= mysql_store_result(mysql);
> + if (!master_res)
> + DBUG_RETURN(2);
> +
> + if (master_row= mysql_fetch_row(master_res))
> {
> mi->clock_diff_with_master=
> (long) (time((time_t*) 0) - strtoul(master_row[0], 0, 10));
> @@ -967,26 +974,34 @@ static int get_master_version_and_clock(
> Note: we could have put a @@SERVER_ID in the previous SELECT
> UNIX_TIMESTAMP() instead, but this would not have worked on 3.23 masters.
> */
> - if (!mysql_real_query(mysql,
> - STRING_WITH_LEN("SHOW VARIABLES LIKE 'SERVER_ID'"))
> &&
> - (master_res= mysql_store_result(mysql)))
> - {
> - if ((master_row= mysql_fetch_row(master_res)) &&
> - (::server_id == strtoul(master_row[1], 0, 10)) &&
> - !mi->rli.replicate_same_server_id)
> - {
> - errmsg= "The slave I/O thread stops because master and slave have equal \
> + DBUG_SYNC_POINT("debug_lock.before_get_SERVER_ID", 10);
> + query_re= mysql_real_query(mysql, STRING_WITH_LEN("SHOW VARIABLES LIKE
> 'SERVER_ID'"));
> + if (query_re)
> + DBUG_RETURN(2);
> +
> + master_res= mysql_store_result(mysql);
> + if (!master_res)
> + DBUG_RETURN(2);
> +
> + master_row= mysql_fetch_row(master_res);
> + if (master_row && (::server_id == strtoul(master_row[1], 0, 10))
> &&
> + !mi->rli.replicate_same_server_id)
> + {
> + errmsg= "The slave I/O thread stops because master and slave have equal \
> MySQL server ids; these ids must be different for replication to work (or \
> the --replicate-same-server-id option must be used on slave but this does \
> not always make sense; please check the manual before using it).";
> - err_code= ER_SLAVE_FATAL_ERROR;
> - sprintf(err_buff, ER(err_code), errmsg);
> - }
> - mysql_free_result(master_res);
> - if (errmsg)
> - goto err;
> + err_code= ER_SLAVE_FATAL_ERROR;
> + sprintf(err_buff, ER(err_code), errmsg);
> }
> + else if (!master_row)
> + sql_print_warning("\"SHOW VARIABLES LIKE 'SERVER_ID'\" failed on master, \
> +maybe it is a very old master.");
I'd suggest change 'failed on master' to 'get empty result on master'
> + mysql_free_result(master_res);
>
> + if (errmsg)
> + goto err;
> +
> /*
> Check that the master's global character_set_server and ours are the same.
> Not fatal if query fails (old master?).
> @@ -1008,13 +1023,20 @@ not always make sense; please check the
> if (*mysql->server_version == '3')
> goto err;
>
> - if ((*mysql->server_version == '4') &&
> - !mysql_real_query(mysql,
> - STRING_WITH_LEN("SELECT @@GLOBAL.COLLATION_SERVER"))
> &&
> - (master_res= mysql_store_result(mysql)))
> + if (*mysql->server_version == '4')
> {
> - if ((master_row= mysql_fetch_row(master_res)) &&
> - strcmp(master_row[0], global_system_variables.collation_server->name))
> + DBUG_SYNC_POINT("debug_lock.before_get_COLLATION_SERVER_VALUE", 10);
> + query_re= mysql_real_query(mysql, STRING_WITH_LEN("SELECT
> @@GLOBAL.COLLATION_SERVER"));
> + if (query_re)
> + DBUG_RETURN(2);
Please note that for 'SHOW VARIABLES LIKE ...' queries, if the variable
does not exist, the query will success with empty result, but for SELECT
queries, the query will fail with error code 1193.
So when master does not support 'COLLATION_SERVER', the query above will
fail with error code 1193 (ER_UNKNOWN_SYSTEM_VARIABLE), and here the
patch should check if mysql_real_query fails with this error, if it is
then this is an old master.
> +
> + master_res= mysql_store_result(mysql);
> + if (!master_res)
> + DBUG_RETURN(2);
> +
> + master_row= mysql_fetch_row(master_res);
> + if (master_row && strcmp(master_row[0],
> + global_system_variables.collation_server->name))
> {
> errmsg= "The slave I/O thread stops because master and slave have \
> different values for the COLLATION_SERVER global variable. The values must \
> @@ -1022,7 +1044,11 @@ be equal for replication to work";
> err_code= ER_SLAVE_FATAL_ERROR;
> sprintf(err_buff, ER(err_code), errmsg);
> }
> + else if (!master_row)
> + sql_print_warning("\"SELECT @@GLOBAL.COLLATION_SERVER\" failed on master, \
> +maybe it is a very old master.");
For SELECT, this should never happen, if master does not have the
variable, then mysql_real_query should fail with error 1193, see
comments above.
> mysql_free_result(master_res);
> +
> if (errmsg)
> goto err;
> }
> @@ -1042,13 +1068,20 @@ be equal for replication to work";
> This check is only necessary for 4.x masters (and < 5.0.4 masters but
> those were alpha).
> */
> - if ((*mysql->server_version == '4') &&
> - !mysql_real_query(mysql, STRING_WITH_LEN("SELECT @@GLOBAL.TIME_ZONE"))
> &&
> - (master_res= mysql_store_result(mysql)))
> - {
> - if ((master_row= mysql_fetch_row(master_res)) &&
> - strcmp(master_row[0],
> - global_system_variables.time_zone->get_name()->ptr()))
> + if (*mysql->server_version == '4')
> + {
> + DBUG_SYNC_POINT("debug_lock.before_get_TIME_ZONE_VALUE", 10);
> + query_re= mysql_real_query(mysql, STRING_WITH_LEN("SELECT
> @@GLOBAL.TIME_ZONE"));
> + if (query_re)
> + DBUG_RETURN(2);
> +
Here too, see comments above.
> + master_res= mysql_store_result(mysql);
> + if (!master_res)
> + DBUG_RETURN(2);
> +
> + master_row= mysql_fetch_row(master_res);
> + if (master_row && strcmp(master_row[0],
> +
> global_system_variables.time_zone->get_name()->ptr()))
> {
> errmsg= "The slave I/O thread stops because master and slave have \
> different values for the TIME_ZONE global variable. The values must \
> @@ -1056,6 +1089,13 @@ be equal for replication to work";
> err_code= ER_SLAVE_FATAL_ERROR;
> sprintf(err_buff, ER(err_code), errmsg);
> }
> + else if (!master_row)
> + {
> + errmsg= "The slave I/O thread stops because the value of \
> +the TIME_ZONE global variable is not existent on master.";
> + err_code= ER_SLAVE_FATAL_ERROR;
> + sprintf(err_buff, ER(err_code), errmsg);
> + }
and here
> mysql_free_result(master_res);
>
> if (errmsg)
> @@ -2372,6 +2412,7 @@ pthread_handler_t handle_slave_io(void *
> char llbuff[22];
> uint retry_count;
> bool suppress_warnings;
> + int ret;
> #ifndef DBUG_OFF
> uint retry_count_reg= 0, retry_count_dump= 0, retry_count_event= 0;
> #endif
> @@ -2382,7 +2423,7 @@ pthread_handler_t handle_slave_io(void *
> DBUG_ASSERT(mi->inited);
> mysql= NULL ;
> retry_count= 0;
> -
> +
> pthread_mutex_lock(&mi->run_lock);
> /* Inform waiting threads that slave has started */
> mi->slave_run_id++;
> @@ -2451,8 +2492,19 @@ connected:
> mi->slave_running= MYSQL_SLAVE_RUN_CONNECT;
> thd->slave_net = &mysql->net;
> thd_proc_info(thd, "Checking master version");
> - if (get_master_version_and_clock(mysql, mi))
> + ret= get_master_version_and_clock(mysql, mi);
> + if (ret == 1)
> + /* Fatal error */
> goto err;
> +
> + if (ret == 2)
> + {
> + /* Try to reconnect because the error was caused by a transient network problem
> */
> + if (try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings,
> + reconnect_messages[SLAVE_RECON_ACT_REG]))
> + goto err;
> + goto connected;
> + }
>
> if (mi->rli.relay_log.description_event_for_queue->binlog_version > 1)
> {
>