| List: | Commits | « Previous MessageNext Message » | |
| From: | Daogang Qu | Date: | July 2 2009 6:50am |
| Subject: | Re: bzr commit into mysql-5.1-bugteam branch (Dao-Gang.Qu:2941) Bug#45214 | ||
| View as plain text | |||
Andrei Elkin 写道: > Dao Gang, hello. > > Surprise but I seem could not find the latest patch which Zhen Xing > commented on in a mail that I replied to. > > Thanks for addressing report()-ing warnings in that patch. > I am leaving few comments that probably would remain for that latest inaccessible > patch. > > > > >> #At file:///home/daogangq/mysql/bzrwork/bug45214/5.1-bt/ based on >> revid:davi.arnaut@stripped >> >> 2941 Dao-Gang.Qu@stripped 2009-07-01 >> Bug #45214 get_master_version_and_clock does not report error when queries > fail >> >> The "get_master_version_and_clock(...)" function in sql/slave.cc ignores >> error and passes directly when queries fail, or queries succeed >> but the result retrieved is empty. >> >> > > fine > > >> The "get_master_version_and_clock(...)" function should try to reconnect > master >> if queries fail because of transient network problems, and fail otherwise. >> The I/O thread should print a warning if the some system variables do not >> exist on master (very old master) >> @ mysql-test/extra/rpl_tests/rpl_get_master_version_and_clock.test >> Test script for bug #45214 >> @ mysql-test/suite/rpl/r/rpl_get_master_version_and_clock.result >> Test result for bug #45214 >> @ mysql-test/suite/rpl/t/rpl_get_master_version_and_clock.test >> Test script for bug #45214 >> > > >> @ sql/slave.cc >> Update the slave.cc for fixing bug #45214 >> > > Please be verbose. We are used to list some summary of changes in > per-file comments. > You would say `is_network_error() is added; > get_master_version_and_clock() result set is changed with the new value > to force reconnecing...' > > Good comments. I have added the following summary: The 'is_network_error()' function is added for checking if the error is caused by network; The 'get_master_version_and_clock()' result set function is changed with the new value to force reconnecting when queries fail. OK? >> added: >> mysql-test/extra/rpl_tests/rpl_get_master_version_and_clock.test >> mysql-test/suite/rpl/r/rpl_get_master_version_and_clock.result >> mysql-test/suite/rpl/t/rpl_get_master_version_and_clock.test >> modified: >> sql/slave.cc >> === added file > 'mysql-test/extra/rpl_tests/rpl_get_master_version_and_clock.test' >> --- a/mysql-test/extra/rpl_tests/rpl_get_master_version_and_clock.test 1970-01-01 > 00:00:00 +0000 >> +++ b/mysql-test/extra/rpl_tests/rpl_get_master_version_and_clock.test 2009-07-01 > 13:34:00 +0000 >> @@ -0,0 +1,63 @@ >> +# >> +# BUG#45214: http://bugs.mysql.com/bug.php?id=45214 >> +# The common part of the "rpl_get_master_version_and_clock" test. >> +# Restart slave under network disconnection between slave and master >> +# following the steps: >> +# 1 - Got DBUG_SYNC_POINT lock >> +# 2 - Set DBUG_SYNC_POINT before call mysql_real_query(...) function in > get_master_version_and_clock(...) function and hang here >> > > >> +# 3 - shutdown master server for simulating network disconnection >> > > Ok. > > Just a remark question, would not merely killing the dump thread on master be okay > too? > > > >> +# 4 - Release DBUG_SYNC_POINT lock >> +# 5 - Check if the slave I/O thread tries to reconnect to master. >> +# >> +# Note: Please make sure initialize the $debug_lock when call the test script. >> +# >> +connection slave; >> +if (`SELECT '$debug_lock' = ''`) >> +{ >> + --die Cannot continue. Please set value for $debug_lock. >> +} >> + >> +# Restart slave >> +--disable_warnings >> +stop slave; >> +source include/wait_for_slave_to_stop.inc; >> +start slave; >> +source include/wait_for_slave_to_start.inc; >> +connection master; >> +# Write file to make mysql-test-run.pl expect the "crash", but don't start >> +# it until it's told to >> +--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect >> +wait >> +EOF >> + >> +# Send shutdown to the connected server and give >> +# it 10 seconds to die before zapping it >> +shutdown_server 10; >> + >> +connection slave; >> +eval SELECT RELEASE_LOCK($debug_lock); >> + >> +# Write file to make mysql-test-run.pl start up the server again >> +--append_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect >> +restart >> +EOF >> + >> +connection master; >> +# Turn on reconnect >> +--enable_reconnect >> + >> +# Call script that will poll the server waiting for it to be back online again >> +--source include/wait_until_connected_again.inc >> + >> +# Turn off reconnect again >> +--disable_reconnect >> + >> +connection slave; >> +source include/wait_for_slave_to_start.inc; >> + >> +# Show slave last IO errno >> +let $last_io_errno= query_get_value("show slave status", Last_IO_Errno, 1); >> +echo Slave_IO_Errno= $last_io_errno; >> + >> + >> +# End of tests >> >> === added file 'mysql-test/suite/rpl/r/rpl_get_master_version_and_clock.result' >> --- a/mysql-test/suite/rpl/r/rpl_get_master_version_and_clock.result 1970-01-01 > 00:00:00 +0000 >> +++ b/mysql-test/suite/rpl/r/rpl_get_master_version_and_clock.result 2009-07-01 > 13:34:00 +0000 >> @@ -0,0 +1,40 @@ >> +stop slave; >> +drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9; >> +reset master; >> +reset slave; >> +drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9; >> +start slave; >> +SELECT IS_FREE_LOCK("debug_lock.before_get_UNIX_TIMESTAMP"); >> +IS_FREE_LOCK("debug_lock.before_get_UNIX_TIMESTAMP") >> +1 >> +SELECT GET_LOCK("debug_lock.before_get_UNIX_TIMESTAMP", 1000); >> +GET_LOCK("debug_lock.before_get_UNIX_TIMESTAMP", 1000) >> +1 >> +set global debug= 'd,debug_lock.before_get_UNIX_TIMESTAMP'; >> +stop slave; >> +start slave; >> +SELECT RELEASE_LOCK("debug_lock.before_get_UNIX_TIMESTAMP"); >> +RELEASE_LOCK("debug_lock.before_get_UNIX_TIMESTAMP") >> +1 >> +Slave_IO_Errno= 2013 >> +SELECT IS_FREE_LOCK("debug_lock.before_get_SERVER_ID"); >> +IS_FREE_LOCK("debug_lock.before_get_SERVER_ID") >> +1 >> +SELECT GET_LOCK("debug_lock.before_get_SERVER_ID", 1000); >> +GET_LOCK("debug_lock.before_get_SERVER_ID", 1000) >> +1 >> +set global debug= 'd,debug_lock.before_get_SERVER_ID'; >> +stop slave; >> +start slave; >> +SELECT RELEASE_LOCK("debug_lock.before_get_SERVER_ID"); >> +RELEASE_LOCK("debug_lock.before_get_SERVER_ID") >> +1 >> +Slave_IO_Errno= 2013 >> +set global debug= ''; >> +reset master; >> +include/stop_slave.inc >> +change master to master_port=SLAVE_PORT; >> +start slave; >> +*** must be having the replicate-same-server-id IO thread error *** >> +Slave_IO_Errno= 1593 >> +Slave_IO_Error= Fatal error: The slave I/O thread stops because master and slave > have equal MySQL server ids; these ids must be different for replication to work (or the > --replicate-same-server-id option must be used on slave but this does not always make > sense; please check the manual before using it). >> >> === added file 'mysql-test/suite/rpl/t/rpl_get_master_version_and_clock.test' >> --- a/mysql-test/suite/rpl/t/rpl_get_master_version_and_clock.test 1970-01-01 > 00:00:00 +0000 >> +++ b/mysql-test/suite/rpl/t/rpl_get_master_version_and_clock.test 2009-07-01 > 13:34:00 +0000 >> @@ -0,0 +1,60 @@ >> +# >> > > >> +# BUG#45214: http://bugs.mysql.com/bug.php?id=45214 >> > > `http://bugs.mysql.com/bug.php?id=45214' is redundant. Dare to assume everyone knows > how to find the bug having its id; popularity comes with bugs as well :-). > > Yes. Right. >> +# This test verifies if the slave I/O tread tries to reconnect to >> +# master when it tries to get the values of the UNIX_TIMESTAMP, SERVER_ID, >> +# COLLATION_SERVER and TIME_ZONE from master under network disconnection. >> +# The COLLATION_SERVER and TIME_ZONE are got only on master server version 4. >> +# So they can't be verified by test case here. >> +# Finish the following tests by calling its common test script: >> +# extra/rpl_tests/rpl_get_master_version_and_clock.test. >> +# And meanwhile this test checks that the slave I/O thread refuses to start if > slave >> +# and master have the same server id (because this is a useless setup, >> +# and otherwise SHOW SLAVE STATUS shows progress but all queries are >> +# ignored, which has caught our customers), unless >> +# --replicate-same-server-id. >> +# >> + >> +source include/master-slave.inc; >> + >> +#Test case 1: Try to get the value of the UNIX_TIMESTAMP from master under > network disconnection >> +connection slave; >> +let $debug_saved= `select @@global.debug`; >> + >> +let $debug_lock= "debug_lock.before_get_UNIX_TIMESTAMP"; >> +eval SELECT IS_FREE_LOCK($debug_lock); >> +eval SELECT GET_LOCK($debug_lock, 1000); >> + >> +set global debug= 'd,debug_lock.before_get_UNIX_TIMESTAMP'; >> +source extra/rpl_tests/rpl_get_master_version_and_clock.test; >> + >> +#Test case 2: Try to get the value of the SERVER_ID from master under network > disconnection >> +connection slave; >> +let $debug_lock= "debug_lock.before_get_SERVER_ID"; >> +eval SELECT IS_FREE_LOCK($debug_lock); >> +eval SELECT GET_LOCK($debug_lock, 1000); >> + >> +set global debug= 'd,debug_lock.before_get_SERVER_ID'; >> +source extra/rpl_tests/rpl_get_master_version_and_clock.test; >> + >> +eval set global debug= '$debug_saved'; >> + >> +#Test case 3: This test checks that the slave I/O thread refuses to start >> +#if slave and master have the same server id. >> +connection slave; >> +reset master; >> +# replicate ourselves >> +source include/stop_slave.inc; >> +--replace_result $SLAVE_MYPORT SLAVE_PORT >> +eval change master to master_port=$SLAVE_MYPORT; >> +start slave; >> + >> +let $slave_param= Last_IO_Errno; >> +let $slave_param_value= 1593; >> +source include/wait_for_slave_param.inc; >> +--echo *** must be having the replicate-same-server-id IO thread error *** >> +let $last_io_errno= query_get_value("show slave status", Last_IO_Errno, 1); >> +let $last_io_error= query_get_value("show slave status", Last_IO_Error, 1); >> +echo Slave_IO_Errno= $last_io_errno; >> +echo Slave_IO_Error= $last_io_error; >> + >> +# End of tests >> >> === modified file 'sql/slave.cc' >> --- a/sql/slave.cc 2009-06-09 16:44:26 +0000 >> +++ b/sql/slave.cc 2009-07-01 13:34:00 +0000 >> @@ -38,6 +38,7 @@ >> #include <my_dir.h> >> #include <sql_common.h> >> #include <errmsg.h> >> +#include <mysqld_error.h> >> #include <mysys_err.h> >> >> #ifdef HAVE_REPLICATION >> @@ -842,6 +843,32 @@ int init_intvar_from_file(int* var, IO_C >> DBUG_RETURN(1); >> } >> >> + >> +/* >> + Check if the error is caused by network. >> + >> + RETURNS: >> + TRUE network error >> + FALSE not network error >> +*/ >> + >> +bool is_network_error(int errorno) >> +{ >> + int errno; >> + errno= errorno; >> + >> + if (errno == CR_CONNECTION_ERROR || >> + errno == CR_CONN_HOST_ERROR || >> + errno == CR_SERVER_GONE_ERROR || >> + errno == CR_SERVER_LOST || >> + errno == ER_CON_COUNT_ERROR || >> + errno == ER_SERVER_SHUTDOWN) >> > > Thanks for refusing my too eager suggestion I made on #rep last evening! > To confess I was checking my appeared to be wrong claim to remove ER_ from the list > but managed to push the button too soon. > > Indeed, grepping mysqltest.c proves CR_ and ER_ can coexist. > > >> + return TRUE; >> + >> + return FALSE; >> +} >> + >> + >> /* >> Note that we rely on the master's version (3.23, 4.0.14 etc) instead of >> relying on the binlog's version. This is not perfect: imagine an upgrade >> @@ -854,6 +881,7 @@ int init_intvar_from_file(int* var, IO_C >> RETURNS >> 0 ok >> 1 error >> + 2 transient network problem, the caller should try to reconnect >> */ >> >> static int get_master_version_and_clock(MYSQL* mysql, Master_info* mi) >> @@ -864,6 +892,7 @@ static int get_master_version_and_clock( >> MYSQL_RES *master_res= 0; >> MYSQL_ROW master_row; >> DBUG_ENTER("get_master_version_and_clock"); >> + int query_re= 0; >> >> /* >> Free old description_event_for_queue (that is needed if we are in >> @@ -938,14 +967,28 @@ static int get_master_version_and_clock( >> Compare the master and slave's clock. Do not die if master's clock is >> unavailable (very old master not supporting UNIX_TIMESTAMP()?). >> */ >> + DBUG_SYNC_POINT("debug_lock.before_get_UNIX_TIMESTAMP", 10); >> + query_re= mysql_real_query(mysql, STRING_WITH_LEN("SELECT > UNIX_TIMESTAMP()")); >> + if ((query_re || !(master_res= mysql_store_result(mysql))) && >> + is_network_error(mysql_errno(mysql))) >> + { >> > > >> + sql_print_warning("The slave I/O thread tries to reconnect " >> + "due to network error: %s (%d)", >> + mysql_error(mysql), mysql_errno(mysql)); >> > > I assume the latest patch keeps warning generation inside is_network_error(), > which is fine. > > NO. I think we should make a function more common, and the function name should be consistent with functions provided by it. So the 'is_network_error()' function shouldn't report warning. >> + DBUG_RETURN(2); >> + } >> >> - if (!mysql_real_query(mysql, STRING_WITH_LEN("SELECT UNIX_TIMESTAMP()")) > && >> - (master_res= mysql_store_result(mysql)) && >> - (master_row= mysql_fetch_row(master_res))) >> + if ((master_row= mysql_fetch_row(master_res))) >> { >> mi->clock_diff_with_master= >> (long) (time((time_t*) 0) - strtoul(master_row[0], 0, 10)); >> } >> > > >> + else if (!master_row && is_network_error(mysql_errno(mysql))) >> > > !master_row is true because of the if(), you may remove it. > > Yes. Right. >> + { >> + if (master_res) >> + mysql_free_result(master_res); >> + DBUG_RETURN(2); >> + } >> else if (!check_io_slave_killed(mi->io_thd, mi, NULL)) >> { >> mi->clock_diff_with_master= 0; /* The "most sensible" value */ >> @@ -967,26 +1010,54 @@ static int get_master_version_and_clock( >> Note: we could have put a @@SERVER_ID in the previous SELECT >> UNIX_TIMESTAMP() instead, but this would not have worked on 3.23 masters. >> */ >> - if (!mysql_real_query(mysql, >> - STRING_WITH_LEN("SHOW VARIABLES LIKE 'SERVER_ID'")) > && >> - (master_res= mysql_store_result(mysql))) >> - { >> - if ((master_row= mysql_fetch_row(master_res)) && >> - (::server_id == strtoul(master_row[1], 0, 10)) && >> - !mi->rli.replicate_same_server_id) >> + DBUG_SYNC_POINT("debug_lock.before_get_SERVER_ID", 10); >> + query_re= mysql_real_query(mysql, STRING_WITH_LEN("SHOW VARIABLES LIKE > 'SERVER_ID'")); >> + if (query_re || !(master_res= mysql_store_result(mysql))) >> + { >> + if (is_network_error(mysql_errno(mysql))) >> { >> - errmsg= "The slave I/O thread stops because master and slave have equal \ >> + sql_print_warning("The slave I/O thread tries to reconnect " >> + "due to network error: %s (%d)", >> + mysql_error(mysql), mysql_errno(mysql)); >> + DBUG_RETURN(2); >> + } >> > > `else /* Fatal error */' is asking to be there, but it's up to you. > > >> + /* Fatal error */ >> > > >> + sql_print_error("The slave I/O thread stops. Error: %s (%d)", >> + mysql_error(mysql), mysql_errno(mysql)); >> > > as said in the ealier mail, this sql_print_error is redundant because of the > eventual > report(). > > Yes. Right. >> + errmsg= "The slave I/O thread stops because a fatal error is encountered \ >> +when it try to get the value of SERVER_ID variable from master."; >> + err_code= ER_SLAVE_FATAL_ERROR; >> + sprintf(err_buff, ER(err_code), errmsg); >> + goto err; >> + } >> + >> + master_row= mysql_fetch_row(master_res); >> + if (master_row && (::server_id == strtoul(master_row[1], 0, 10)) > && >> + !mi->rli.replicate_same_server_id) >> + { >> + errmsg= "The slave I/O thread stops because master and slave have equal \ >> MySQL server ids; these ids must be different for replication to work (or \ >> the --replicate-same-server-id option must be used on slave but this does \ >> not always make sense; please check the manual before using it)."; >> - err_code= ER_SLAVE_FATAL_ERROR; >> - sprintf(err_buff, ER(err_code), errmsg); >> + err_code= ER_SLAVE_FATAL_ERROR; >> + sprintf(err_buff, ER(err_code), errmsg); >> + } >> > > >> + else if (!master_row) >> > > as above !master_row is true and you may remove if (!master_row). > > NO.We didn't remove it because its previous judgment criteria involved the following criteria: && (::server_id == strtoul(master_row[1], 0, 10)) && !mi->rli.replicate_same_server_id It's different with above situation. >> + { >> + if (is_network_error(mysql_errno(mysql))) >> + { >> + mysql_free_result(master_res); >> + DBUG_RETURN(2); >> } >> - mysql_free_result(master_res); >> - if (errmsg) >> - goto err; >> + else >> > > >> + sql_print_warning("Unknown system variable 'SERVER_ID' on master, \ >> +maybe it is a very old master."); >> > > sql_print_warning() is logical here, but as we agreed I am expecting mi->report() > here. > > Yes. Right. >> } >> + mysql_free_result(master_res); >> >> + if (errmsg) >> + goto err; >> + >> /* >> Check that the master's global character_set_server and ours are the same. >> Not fatal if query fails (old master?). >> @@ -1008,23 +1079,55 @@ not always make sense; please check the >> if (*mysql->server_version == '3') >> goto err; >> >> - if ((*mysql->server_version == '4') && >> - !mysql_real_query(mysql, >> - STRING_WITH_LEN("SELECT @@GLOBAL.COLLATION_SERVER")) > && >> - (master_res= mysql_store_result(mysql))) >> + if (*mysql->server_version == '4') >> { >> - if ((master_row= mysql_fetch_row(master_res)) && >> - strcmp(master_row[0], > global_system_variables.collation_server->name)) >> + query_re= mysql_real_query(mysql, STRING_WITH_LEN("SELECT > @@GLOBAL.COLLATION_SERVER")); >> + if (query_re || !(master_res= mysql_store_result(mysql))) >> { >> - errmsg= "The slave I/O thread stops because master and slave have \ >> + if (is_network_error(mysql_errno(mysql))) >> + { >> + sql_print_warning("The slave I/O thread tries to reconnect " >> + "due to network error: %s (%d)", >> + mysql_error(mysql), mysql_errno(mysql)); >> + DBUG_RETURN(2); >> + } >> > > >> + if (mysql_errno(mysql) != ER_UNKNOWN_SYSTEM_VARIABLE) >> > > I guess the if above can yield only true. > I don't believe COLLATION_SERVER does not exist in mysql->server_version == '4'. > Even if I am wrong in that, > the empty of the master would not be equal to a non-empty of the slave and therefore > the following error message makes sense as well: > >> + { >> + /* Fatal error */ >> + sql_print_error("The slave I/O thread stops. Error: %s (%d)", >> + mysql_error(mysql), mysql_errno(mysql)); >> + errmsg= "The slave I/O thread stops because a fatal error is encountered > \ >> +when it try to get the value of COLLATION_SERVER global variable from master."; >> + err_code= ER_SLAVE_FATAL_ERROR; >> + sprintf(err_buff, ER(err_code), errmsg); >> + goto err; >> + } >> + } >> + >> > > >> + if (mysql_errno(mysql) == ER_UNKNOWN_SYSTEM_VARIABLE) >> + sql_print_warning("Unknown system variable 'COLLATION_SERVER' on master, > \ >> +maybe it is a very old master."); >> > > I suggest to restore the only requirement COLLATION_SERVER must be equal. > The empty trivially is not equal to any non-empty, and then this case is the same > error as above. > > NO. I think we should consider the very old master as the block's comments said: /* Check that the master's global character_set_server and ours are the same. Not fatal if query fails (old master?). Note that we don't check for equality of global character_set_client and collation_connection (neither do we prevent their setting in set_var.cc). That's because from what I (Guilhem) have tested, the global values of these 2 are never used (new connections don't use them). We don't test equality of global collation_database either as it's is going to be deprecated (made read-only) in 4.1 very soon. The test is only relevant if master < 5.0.3 (we'll test only if it's older than the 5 branch; < 5.0.3 was alpha...), as >= 5.0.3 master stores charset info in each binlog event. We don't do it for 3.23 because masters <3.23.50 hang on SELECT @@unknown_var (BUG#7965 - see changelog of 3.23.50). So finally we test only if master is 4.x. */ >> + else >> + { >> + master_row= mysql_fetch_row(master_res); >> + if (master_row && strcmp(master_row[0], >> + > global_system_variables.collation_server->name)) >> + { >> + errmsg= "The slave I/O thread stops because master and slave have \ >> different values for the COLLATION_SERVER global variable. The values must \ >> be equal for replication to work"; >> - err_code= ER_SLAVE_FATAL_ERROR; >> - sprintf(err_buff, ER(err_code), errmsg); >> + err_code= ER_SLAVE_FATAL_ERROR; >> + sprintf(err_buff, ER(err_code), errmsg); >> + } >> + else if (!master_row && is_network_error(mysql_errno(mysql))) >> + { >> + mysql_free_result(master_res); >> + DBUG_RETURN(2); >> + } >> + mysql_free_result(master_res); >> + if (errmsg) >> + goto err; >> } >> - mysql_free_result(master_res); >> - if (errmsg) >> - goto err; >> } >> >> /* >> @@ -1042,13 +1145,31 @@ be equal for replication to work"; >> This check is only necessary for 4.x masters (and < 5.0.4 masters but >> those were alpha). >> */ >> - if ((*mysql->server_version == '4') && >> - !mysql_real_query(mysql, STRING_WITH_LEN("SELECT @@GLOBAL.TIME_ZONE")) > && >> - (master_res= mysql_store_result(mysql))) >> - { >> - if ((master_row= mysql_fetch_row(master_res)) && >> - strcmp(master_row[0], >> - global_system_variables.time_zone->get_name()->ptr())) >> + if (*mysql->server_version == '4') >> + { >> + query_re= mysql_real_query(mysql, STRING_WITH_LEN("SELECT > @@GLOBAL.TIME_ZONE")); >> + if (query_re || !(master_res= mysql_store_result(mysql))) >> + { >> + if (is_network_error(mysql_errno(mysql))) >> + { >> + sql_print_warning("The slave I/O thread tries to reconnect " >> + "due to network error: %s (%d)", >> + mysql_error(mysql), mysql_errno(mysql)); >> + DBUG_RETURN(2); >> + } >> > > >> + /* Fatal error */ >> + sql_print_error("The slave I/O thread stops. Error: %s (%d)", >> + mysql_error(mysql), mysql_errno(mysql)); >> + errmsg= "The slave I/O thread stops because a fatal error is encountered > \ >> +when it try to get the value of TIME_ZONE global variable from master."; >> + err_code= ER_SLAVE_FATAL_ERROR; >> + sprintf(err_buff, ER(err_code), errmsg); >> + goto err; >> + } >> > > This block is fine. You must have noticed that this last case is similar to the > previous > where, I think, you overdid with checking of COLLATION_SERVER. > > NO. They are different situation. the 'COLLATION_SERVER' may be empty on very old master, but the 'TIME_ZONE' can't be empty on very old master. >> + >> + master_row= mysql_fetch_row(master_res); >> + if (master_row && strcmp(master_row[0], >> + > global_system_variables.time_zone->get_name()->ptr())) >> { >> errmsg= "The slave I/O thread stops because master and slave have \ >> different values for the TIME_ZONE global variable. The values must \ >> > > Here is an important question which was not supposed to be a part of this bug. > In row-based replication TIME_ZONE is irrelevant. > However, the IO thread might not know on what type of event the master will send. > > So, ideally we are better to defer the error until after the slave has > received a first query-event. It would be reasonable to stop by the SQL > slave only if the query deals with time_zone objects. > > I think you should not try to fix this rather complicated issue, but rather > to report a bug on this matter referring to this analysis. > > > OK. I will report the bug later. >> @@ -1056,6 +1177,11 @@ be equal for replication to work"; >> err_code= ER_SLAVE_FATAL_ERROR; >> sprintf(err_buff, ER(err_code), errmsg); >> } >> + else if (!master_row && is_network_error(mysql_errno(mysql))) >> + { >> + mysql_free_result(master_res); >> + DBUG_RETURN(2); >> + } >> mysql_free_result(master_res); >> >> if (errmsg) >> @@ -2372,6 +2498,7 @@ pthread_handler_t handle_slave_io(void * >> char llbuff[22]; >> uint retry_count; >> bool suppress_warnings; >> + int ret; >> #ifndef DBUG_OFF >> uint retry_count_reg= 0, retry_count_dump= 0, retry_count_event= 0; >> #endif >> @@ -2451,8 +2578,19 @@ connected: >> mi->slave_running= MYSQL_SLAVE_RUN_CONNECT; >> thd->slave_net = &mysql->net; >> thd_proc_info(thd, "Checking master version"); >> - if (get_master_version_and_clock(mysql, mi)) >> + ret= get_master_version_and_clock(mysql, mi); >> + if (ret == 1) >> + /* Fatal error */ >> goto err; >> + >> + if (ret == 2) >> + { >> + /* Try to reconnect because the error was caused by a transient network > problem */ >> > > >> + if (try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings, >> > > I assume you got my message on `suppress_warnings' is not defined at this point > and your latest patch has fixed that. > > Yes. done it. >> + reconnect_messages[SLAVE_RECON_ACT_REG])) >> + goto err; >> + goto connected; >> + } >> >> if (mi->rli.relay_log.description_event_for_queue->binlog_version > > 1) >> { >> >> >> > > cheers, > > Andrei >
