#At file:///home/daogangq/mysql/bzrwork/bug45214no/mysql-pe/ based on revid:joro@stripped
3465 Dao-Gang.Qu@stripped 2009-07-15 [merge]
Bug #45214 get_master_version_and_clock does not report error when queries fail
The "get_master_version_and_clock(...)" function in sql/slave.cc ignores
error and passes directly when queries fail, or queries succeed
but the result retrieved is empty.
The "get_master_version_and_clock(...)" function should try to reconnect master
if queries fail because of transient network problems, and fail otherwise.
The I/O thread should print a warning if the some system variables do not
exist on master (very old master)
@ mysql-test/extra/rpl_tests/rpl_get_master_version_and_clock.test
Added test file for bug #45214
@ mysql-test/suite/rpl/r/rpl_get_master_version_and_clock.result
Added test result for bug #45214
@ mysql-test/suite/rpl/t/rpl_get_master_version_and_clock.test
Added test file for bug #45214
@ sql/slave.cc
The 'is_network_error()' function is added for checking if the error is caused by network.
Added a new return value (2) to 'get_master_version_and_clock()' function result set
to indicate transient network errors when queries fail, and the caller should
try to reconnect in this case.
added:
mysql-test/extra/rpl_tests/rpl_get_master_version_and_clock.test
mysql-test/suite/rpl/r/rpl_get_master_version_and_clock.result
mysql-test/suite/rpl/t/rpl_get_master_version_and_clock.test
modified:
sql/slave.cc
=== added file 'mysql-test/extra/rpl_tests/rpl_get_master_version_and_clock.test'
--- a/mysql-test/extra/rpl_tests/rpl_get_master_version_and_clock.test 1970-01-01 00:00:00 +0000
+++ b/mysql-test/extra/rpl_tests/rpl_get_master_version_and_clock.test 2009-07-15 14:47:55 +0000
@@ -0,0 +1,47 @@
+#
+# BUG#45214
+# The common part of the "rpl_get_master_version_and_clock" test.
+# Restart slave under network disconnection between slave and master
+#
+connection slave;
+# Restart slave
+--disable_warnings
+stop slave;
+source include/wait_for_slave_to_stop.inc;
+start slave;
+source include/wait_for_slave_to_start.inc;
+connection master;
+# Write file to make mysql-test-run.pl expect the "crash", but don't start
+# it until it's told to
+--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
+wait
+EOF
+
+# Send shutdown to the connected server and give
+# it 10 seconds to die before zapping it
+shutdown_server 10;
+
+# Show slave last IO errno
+connection slave;
+source include/wait_for_slave_io_error.inc;
+let $last_io_errno= query_get_value("show slave status", Last_IO_Errno, 1);
+echo Slave_IO_Errno= $last_io_errno;
+
+# Write file to make mysql-test-run.pl start up the server again
+--append_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
+restart
+EOF
+
+connection master;
+# Turn on reconnect
+--enable_reconnect
+
+# Call script that will poll the server waiting for it to be back online again
+--source include/wait_until_connected_again.inc
+
+# Turn off reconnect again
+--disable_reconnect
+
+connection slave;
+source include/wait_for_slave_to_start.inc;
+
=== added file 'mysql-test/suite/rpl/r/rpl_get_master_version_and_clock.result'
--- a/mysql-test/suite/rpl/r/rpl_get_master_version_and_clock.result 1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/rpl/r/rpl_get_master_version_and_clock.result 2009-07-15 14:47:55 +0000
@@ -0,0 +1,22 @@
+stop slave;
+drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
+reset master;
+reset slave;
+drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
+start slave;
+set global debug= 'd,before_get_UNIX_TIMESTAMP_bug45214';
+stop slave;
+start slave;
+Slave_IO_Errno= 2013
+set global debug= 'd,before_get_SERVER_ID_bug45214';
+stop slave;
+start slave;
+Slave_IO_Errno= 2013
+set global debug= '';
+reset master;
+include/stop_slave.inc
+change master to master_port=SLAVE_PORT;
+start slave;
+*** must be having the replicate-same-server-id IO thread error ***
+Slave_IO_Errno= 1593
+Slave_IO_Error= Fatal error: The slave I/O thread stops because master and slave have equal MySQL server ids; these ids must be different for replication to work (or the --replicate-same-server-id option must be used on slave but this does not always make sense; please check the manual before using it).
=== added file 'mysql-test/suite/rpl/t/rpl_get_master_version_and_clock.test'
--- a/mysql-test/suite/rpl/t/rpl_get_master_version_and_clock.test 1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/rpl/t/rpl_get_master_version_and_clock.test 2009-07-15 14:47:55 +0000
@@ -0,0 +1,51 @@
+#
+# BUG#45214
+# This test verifies if the slave I/O tread tries to reconnect to
+# master when it tries to get the values of the UNIX_TIMESTAMP, SERVER_ID,
+# COLLATION_SERVER and TIME_ZONE from master under network disconnection.
+# The COLLATION_SERVER and TIME_ZONE are got only on master server version 4.
+# So they can't be verified by test case here.
+# Finish the following tests by calling its common test script:
+# extra/rpl_tests/rpl_get_master_version_and_clock.test.
+# And meanwhile this test checks that the slave I/O thread refuses to start if slave
+# and master have the same server id (because this is a useless setup,
+# and otherwise SHOW SLAVE STATUS shows progress but all queries are
+# ignored, which has caught our customers), unless
+# --replicate-same-server-id.
+#
+
+source include/master-slave.inc;
+
+#Test case 1: Try to get the value of the UNIX_TIMESTAMP from master under network disconnection
+connection slave;
+let $debug_saved= `select @@global.debug`;
+set global debug= 'd,before_get_UNIX_TIMESTAMP_bug45214';
+source extra/rpl_tests/rpl_get_master_version_and_clock.test;
+
+#Test case 2: Try to get the value of the SERVER_ID from master under network disconnection
+connection slave;
+set global debug= 'd,before_get_SERVER_ID_bug45214';
+source extra/rpl_tests/rpl_get_master_version_and_clock.test;
+
+eval set global debug= '$debug_saved';
+
+#Test case 3: This test checks that the slave I/O thread refuses to start
+#if slave and master have the same server id.
+connection slave;
+reset master;
+# replicate ourselves
+source include/stop_slave.inc;
+--replace_result $SLAVE_MYPORT SLAVE_PORT
+eval change master to master_port=$SLAVE_MYPORT;
+start slave;
+
+let $slave_param= Last_IO_Errno;
+let $slave_param_value= 1593;
+source include/wait_for_slave_param.inc;
+--echo *** must be having the replicate-same-server-id IO thread error ***
+let $last_io_errno= query_get_value("show slave status", Last_IO_Errno, 1);
+let $last_io_error= query_get_value("show slave status", Last_IO_Error, 1);
+echo Slave_IO_Errno= $last_io_errno;
+echo Slave_IO_Error= $last_io_error;
+
+# End of tests
=== modified file 'sql/slave.cc'
--- a/sql/slave.cc 2009-07-06 08:12:12 +0000
+++ b/sql/slave.cc 2009-07-15 14:47:55 +0000
@@ -42,6 +42,7 @@
#include <my_dir.h>
#include <sql_common.h>
#include <errmsg.h>
+#include <mysqld_error.h>
#include <mysys_err.h>
#include "rpl_handler.h"
@@ -1145,6 +1146,28 @@ static bool check_io_slave_killed(THD *t
/*
+ Check if the error is caused by network.
+ @param[in] errorno Number of the error.
+ RETURNS:
+ TRUE network error
+ FALSE not network error
+*/
+
+bool is_network_error(uint errorno)
+{
+ if (errorno == CR_CONNECTION_ERROR ||
+ errorno == CR_CONN_HOST_ERROR ||
+ errorno == CR_SERVER_GONE_ERROR ||
+ errorno == CR_SERVER_LOST ||
+ errorno == ER_CON_COUNT_ERROR ||
+ errorno == ER_SERVER_SHUTDOWN)
+ return TRUE;
+
+ return FALSE;
+}
+
+
+/*
Note that we rely on the master's version (3.23, 4.0.14 etc) instead of
relying on the binlog's version. This is not perfect: imagine an upgrade
of the master without waiting that all slaves are in sync with the master;
@@ -1156,6 +1179,7 @@ static bool check_io_slave_killed(THD *t
RETURNS
0 ok
1 error
+ 2 transient network problem, the caller should try to reconnect
*/
static int get_master_version_and_clock(MYSQL* mysql, Master_info* mi)
@@ -1168,6 +1192,7 @@ static int get_master_version_and_clock(
MYSQL_RES *master_res= 0;
MYSQL_ROW master_row;
DBUG_ENTER("get_master_version_and_clock");
+ int query_re= 0;
err_msg.length(0);
/*
@@ -1247,6 +1272,8 @@ static int get_master_version_and_clock(
unavailable (very old master not supporting UNIX_TIMESTAMP()?).
*/
+ DBUG_EXECUTE_IF("before_get_UNIX_TIMESTAMP_bug45214", sleep(5););
+ master_res= NULL;
if (!mysql_real_query(mysql, STRING_WITH_LEN("SELECT UNIX_TIMESTAMP()")) &&
(master_res= mysql_store_result(mysql)) &&
(master_row= mysql_fetch_row(master_res)))
@@ -1254,7 +1281,13 @@ static int get_master_version_and_clock(
mi->clock_diff_with_master=
(long) (time((time_t*) 0) - strtoul(master_row[0], 0, 10));
}
- else if (!check_io_slave_killed(mi->info_thd, mi, NULL))
+ else if (is_network_error(mysql_errno(mysql)))
+ {
+ mi->report(WARNING_LEVEL, mysql_errno(mysql),
+ "Get master clock failed with error: %s", mysql_error(mysql));
+ goto network_err;
+ }
+ else
{
mi->clock_diff_with_master= 0; /* The "most sensible" value */
sql_print_warning("\"SELECT UNIX_TIMESTAMP()\" failed on master, "
@@ -1263,7 +1296,10 @@ static int get_master_version_and_clock(
mysql_error(mysql), mysql_errno(mysql));
}
if (master_res)
+ {
mysql_free_result(master_res);
+ master_res= NULL;
+ }
/*
Check that the master's server id and ours are different. Because if they
@@ -1275,13 +1311,16 @@ static int get_master_version_and_clock(
Note: we could have put a @@SERVER_ID in the previous SELECT
UNIX_TIMESTAMP() instead, but this would not have worked on 3.23 masters.
*/
+ DBUG_EXECUTE_IF("before_get_SERVER_ID_bug45214", sleep(5););
+ master_res= NULL;
+ master_row= NULL;
if (!mysql_real_query(mysql,
STRING_WITH_LEN("SHOW VARIABLES LIKE 'SERVER_ID'")) &&
- (master_res= mysql_store_result(mysql)))
+ (master_res= mysql_store_result(mysql)) &&
+ (master_row= mysql_fetch_row(master_res)))
{
- if ((master_row= mysql_fetch_row(master_res)) &&
- (::server_id == (mi->master_id= strtoul(master_row[1], 0, 10))) &&
- !mi->rli->replicate_same_server_id)
+ if ((::server_id == (mi->master_id= strtoul(master_row[1], 0, 10))) &&
+ !mi->rli->replicate_same_server_id)
{
errmsg=
"The slave I/O thread stops because master and slave have equal"
@@ -1291,10 +1330,35 @@ static int get_master_version_and_clock(
err_code= ER_SLAVE_FATAL_ERROR;
sprintf(err_buff, ER(err_code), errmsg);
err_msg.append(err_buff);
+ goto err;
}
+ }
+ else if (mysql_errno(mysql))
+ {
+ if (is_network_error(mysql_errno(mysql)))
+ {
+ mi->report(WARNING_LEVEL, mysql_errno(mysql),
+ "Get master SERVER_ID failed with error: %s", mysql_error(mysql));
+ goto network_err;
+ }
+ /* Fatal error */
+ errmsg= "The slave I/O thread stops because a fatal error is encountered \
+when it try to get the value of SERVER_ID variable from master.";
+ err_code= mysql_errno(mysql);
+ sprintf(err_buff, "%s Error: %s", errmsg, mysql_error(mysql));
+ err_msg.append(err_buff);
+ goto err;
+ }
+ else if (!master_row && master_res)
+ {
+ mi->report(WARNING_LEVEL, ER_UNKNOWN_SYSTEM_VARIABLE,
+ "Unknown system variable 'SERVER_ID' on master, \
+maybe it is a *VERY OLD MASTER*.");
+ }
+ if (master_res)
+ {
mysql_free_result(master_res);
- if (errmsg)
- goto err;
+ master_res= NULL;
}
if (mi->master_id == 0 && mi->ignore_server_ids.elements > 0)
{
@@ -1325,25 +1389,47 @@ static int get_master_version_and_clock(
if (*mysql->server_version == '3')
goto err;
- if ((*mysql->server_version == '4') &&
- !mysql_real_query(mysql,
- STRING_WITH_LEN("SELECT @@GLOBAL.COLLATION_SERVER")) &&
- (master_res= mysql_store_result(mysql)))
+ if (*mysql->server_version == '4')
{
- if ((master_row= mysql_fetch_row(master_res)) &&
- strcmp(master_row[0], global_system_variables.collation_server->name))
+ master_res= NULL;
+ if (!mysql_real_query(mysql,
+ STRING_WITH_LEN("SELECT @@GLOBAL.COLLATION_SERVER")) &&
+ (master_res= mysql_store_result(mysql)) &&
+ (master_row= mysql_fetch_row(master_res)))
{
- errmsg=
- "The slave I/O thread stops because master and slave have"
- " different values for the COLLATION_SERVER global variable."
- " The values must be equal for replication to work";
- err_code= ER_SLAVE_FATAL_ERROR;
- sprintf(err_buff, ER(err_code), errmsg);
- err_msg.append(err_buff);
+ if (strcmp(master_row[0], global_system_variables.collation_server->name))
+ {
+ errmsg= "The slave I/O thread stops because master and slave have \
+different values for the COLLATION_SERVER global variable. The values must \
+be equal for the Statement-format replication to work";
+ err_code= ER_SLAVE_FATAL_ERROR;
+ sprintf(err_buff, ER(err_code), errmsg);
+ err_msg.append(err_buff);
+ goto err;
+ }
}
- mysql_free_result(master_res);
- if (errmsg)
+ else if (is_network_error(mysql_errno(mysql)))
+ {
+ mi->report(WARNING_LEVEL, mysql_errno(mysql),
+ "Get master COLLATION_SERVER failed with error: %s", mysql_error(mysql));
+ goto network_err;
+ }
+ else
+ {
+ /* Fatal error */
+ errmsg= "The slave I/O thread stops because a fatal error is encountered \
+when it try to get the value of COLLATION_SERVER global variable from master.";
+ err_code= mysql_errno(mysql);
+ sprintf(err_buff, "%s Error: %s", errmsg, mysql_error(mysql));
+ err_msg.append(err_buff);
goto err;
+ }
+
+ if (master_res)
+ {
+ mysql_free_result(master_res);
+ master_res= NULL;
+ }
}
/*
@@ -1361,26 +1447,46 @@ static int get_master_version_and_clock(
This check is only necessary for 4.x masters (and < 5.0.4 masters but
those were alpha).
*/
- if ((*mysql->server_version == '4') &&
- !mysql_real_query(mysql, STRING_WITH_LEN("SELECT @@GLOBAL.TIME_ZONE")) &&
- (master_res= mysql_store_result(mysql)))
- {
- if ((master_row= mysql_fetch_row(master_res)) &&
- strcmp(master_row[0],
- global_system_variables.time_zone->get_name()->ptr()))
+ if (*mysql->server_version == '4')
+ {
+ master_res= NULL;
+ if (!mysql_real_query(mysql, STRING_WITH_LEN("SELECT @@GLOBAL.TIME_ZONE")) &&
+ (master_res= mysql_store_result(mysql)) &&
+ (master_row= mysql_fetch_row(master_res)))
{
- errmsg=
- "The slave I/O thread stops because master and slave have"
- " different values for the TIME_ZONE global variable."
- " The values must be equal for replication to work";
- err_code= ER_SLAVE_FATAL_ERROR;
- sprintf(err_buff, ER(err_code), errmsg);
- err_msg.append(err_buff);
+ if (strcmp(master_row[0],
+ global_system_variables.time_zone->get_name()->ptr()))
+ {
+ errmsg= "The slave I/O thread stops because master and slave have \
+different values for the TIME_ZONE global variable. The values must \
+be equal for the Statement-format replication to work";
+ err_code= ER_SLAVE_FATAL_ERROR;
+ sprintf(err_buff, ER(err_code), errmsg);
+ err_msg.append(err_buff);
+ goto err;
+ }
}
- mysql_free_result(master_res);
-
- if (errmsg)
+ else if (is_network_error(mysql_errno(mysql)))
+ {
+ mi->report(WARNING_LEVEL, mysql_errno(mysql),
+ "Get master TIME_ZONE failed with error: %s", mysql_error(mysql));
+ goto network_err;
+ }
+ else
+ {
+ /* Fatal error */
+ errmsg= "The slave I/O thread stops because a fatal error is encountered \
+when it try to get the value of TIME_ZONE global variable from master.";
+ err_code= mysql_errno(mysql);
+ sprintf(err_buff, "%s Error: %s", errmsg, mysql_error(mysql));
+ err_msg.append(err_buff);
goto err;
+ }
+ if (master_res)
+ {
+ mysql_free_result(master_res);
+ master_res= NULL;
+ }
}
if (mi->heartbeat_period != 0.0)
@@ -1416,12 +1522,20 @@ err:
if (err_msg.length() != 0)
{
sql_print_error(err_msg.ptr());
+ if (master_res)
+ mysql_free_result(master_res);
+
DBUG_ASSERT(err_code != 0);
mi->report(ERROR_LEVEL, err_code, err_msg.ptr());
DBUG_RETURN(1);
}
DBUG_RETURN(0);
+
+network_err:
+ if (master_res)
+ mysql_free_result(master_res);
+ DBUG_RETURN(2);
}
@@ -2517,6 +2631,7 @@ pthread_handler_t handle_slave_io(void *
char llbuff[22];
uint retry_count;
bool suppress_warnings;
+ int ret;
#ifndef DBUG_OFF
uint retry_count_reg= 0, retry_count_dump= 0, retry_count_event= 0;
#endif
@@ -2608,8 +2723,23 @@ connected:
mi->slave_running= MYSQL_SLAVE_RUN_CONNECT;
thd->slave_net = &mysql->net;
thd_proc_info(thd, "Checking master version");
- if (get_master_version_and_clock(mysql, mi))
+ ret= get_master_version_and_clock(mysql, mi);
+ if (ret == 1)
+ /* Fatal error */
goto err;
+
+ if (ret == 2)
+ {
+ if (check_io_slave_killed(thd, mi, "Slave I/O thread killed"
+ "while calling get_master_version_and_clock(...)"))
+ goto err;
+ suppress_warnings= FALSE;
+ /* Try to reconnect because the error was caused by a transient network problem */
+ if (try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings,
+ reconnect_messages[SLAVE_RECON_ACT_REG]))
+ goto err;
+ goto connected;
+ }
if (mi->rli->relay_log.description_event_for_queue->binlog_version > 1)
{
Attachment: [text/bzr-bundle] bzr/dao-gang.qu@sun.com-20090715144755-p9uytrnfxn0r02kj.bundle
| Thread |
|---|
| • bzr commit into mysql-5.4 branch (Dao-Gang.Qu:3465) Bug#45214 | Dao-Gang.Qu | 15 Jul |