List:Commits« Previous MessageNext Message »
From:Andrei Elkin Date:June 29 2009 2:52pm
Subject:Re: bzr commit into mysql-5.1-bugteam branch (Dao-Gang.Qu:2941)
Bug#45214
View as plain text  
Dao-Gang, hello.

To follow our telephone conversation,

> #At file:///home/daogangq/mysql/bzrwork/bug45214/5.1-bt/ based on
> revid:davi.arnaut@stripped
>
>  2941 Dao-Gang.Qu@stripped	2009-06-27
>       Bug #45214  get_master_version_and_clock does not report error when queries
> fail
>             
>       The "get_master_version_and_clock(...)" function in sql/slave.cc ignores error
> 
>       and passes directly when queries get a failed response from master because of 
>       transient network problems between master and slave or get a successful
> response 
>       but the value of relational variable is not existent in the result retrieved.
>       
>       The "get_master_version_and_clock(...)" function should try to reconnect master
> 
>       if the queries get a failed response from master because of transient network 
>       problems between master and slave. 

>       And it should report a nonexistent value error 

It's not clear what's `nonexistent'. You mean the master reported an
inappropiate value, like a different time_zone, don't you?


>       or print a warning information if the queries get a successful response but the
> 
>       value of relational variable is not existent in the result retrieved.   	

We need to state the following. 

A failure of mysql_real_query() is tollerable; we need only to print out a warning
which your patch does not do (TODO).

A failure of mysql_store_result() or mysql_fetch_row() to retrieve
result set because of different from a network issue, such as
`CR_SERVER_LOST', is a critical one; what kind of issue is checked with
mysql_error().

A failure of mysql_fetch_row() to produce not empty set results despite of
zero return of mysql_error() is a critical failure because of nature of
being retrieved data. In neither of 4 interaction with the master they
can't be selected as the null set.

Finally, failures due to the existing logics like
`The slave I/O thread stops because master and slave have equal MySQL server ids'
should remain on the error level.


>      @ mysql-test/extra/rpl_tests/rpl_get_master_version_clock_on_disconnection.test
>         Test script for bug #45214
>      @ mysql-test/suite/rpl/r/rpl_get_master_version_clock_on_disconnection.result
>         Test result for bug #45214
>      @ mysql-test/suite/rpl/t/rpl_get_master_version_clock_on_disconnection.test
>         Test script for bug #45214
>      @ sql/slave.cc
>         Update the slave.cc for fixing bug #45214
>
>     added:
>       mysql-test/extra/rpl_tests/rpl_get_master_version_clock_on_disconnection.test
>       mysql-test/suite/rpl/r/rpl_get_master_version_clock_on_disconnection.result
>       mysql-test/suite/rpl/t/rpl_get_master_version_clock_on_disconnection.test
>     modified:
>       sql/slave.cc
> === added file
> 'mysql-test/extra/rpl_tests/rpl_get_master_version_clock_on_disconnection.test'
> ---
> a/mysql-test/extra/rpl_tests/rpl_get_master_version_clock_on_disconnection.test	1970-01-01
> 00:00:00 +0000
> +++
> b/mysql-test/extra/rpl_tests/rpl_get_master_version_clock_on_disconnection.test	2009-06-27
> 02:32:25 +0000
> @@ -0,0 +1,53 @@
> +# Restart slave under network disconnection between slave and master
> +
> +connection slave;
> +if (`SELECT '$debug_lock' = ''`)
> +{
> +    --die Cannot continue. Please set value for $debug_lock.
> +}
> +
> +# Restart slave
> +--disable_warnings
> +stop slave;
> +source include/wait_for_slave_to_stop.inc;
> +start slave;
> +source include/wait_for_slave_to_start.inc;
> +
> +connection master;
> +# Write file to make mysql-test-run.pl expect the "crash", but don't start
> +# it until it's told to
> +--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
> +wait
> +EOF
> +
> +# Send shutdown to the connected server and give
> +# it 10 seconds to die before zapping it
> +shutdown_server 10;
> +
> +connection slave;
> +eval SELECT RELEASE_LOCK($debug_lock);
> +
> +# Write file to make mysql-test-run.pl start up the server again
> +--append_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
> +restart
> +EOF
> +
> +connection master;
> +# Turn on reconnect
> +--enable_reconnect
> +
> +# Call script that will poll the server waiting for it to be back online again
> +--source include/wait_until_connected_again.inc
> +
> +# Turn off reconnect again
> +--disable_reconnect
> +
> +connection slave;
> +source include/wait_for_slave_to_start.inc;
> +
> +# Show slave last IO errno
> +let $last_io_errno= query_get_value("show slave status", Last_IO_Errno, 1);
> +echo Slave_IO_Errno= $last_io_errno;
> +
> +
> +# End of tests
>
> === added file
> 'mysql-test/suite/rpl/r/rpl_get_master_version_clock_on_disconnection.result'
> ---
> a/mysql-test/suite/rpl/r/rpl_get_master_version_clock_on_disconnection.result	1970-01-01
> 00:00:00 +0000
> +++
> b/mysql-test/suite/rpl/r/rpl_get_master_version_clock_on_disconnection.result	2009-06-27
> 02:32:25 +0000
> @@ -0,0 +1,59 @@
> +stop slave;
> +drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
> +reset master;
> +reset slave;
> +drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
> +start slave;
> +SELECT IS_FREE_LOCK("debug_lock.before_get_UNIX_TIMESTAMP");
> +IS_FREE_LOCK("debug_lock.before_get_UNIX_TIMESTAMP")
> +1
> +SELECT GET_LOCK("debug_lock.before_get_UNIX_TIMESTAMP", 1000);
> +GET_LOCK("debug_lock.before_get_UNIX_TIMESTAMP", 1000)
> +1
> +set global debug= 'd,debug_lock.before_get_UNIX_TIMESTAMP';
> +stop slave;
> +start slave;
> +SELECT RELEASE_LOCK("debug_lock.before_get_UNIX_TIMESTAMP");
> +RELEASE_LOCK("debug_lock.before_get_UNIX_TIMESTAMP")
> +1
> +Slave_IO_Errno= 2013
> +SELECT IS_FREE_LOCK("debug_lock.before_get_SERVER_ID");
> +IS_FREE_LOCK("debug_lock.before_get_SERVER_ID")
> +1
> +SELECT GET_LOCK("debug_lock.before_get_SERVER_ID", 1000);
> +GET_LOCK("debug_lock.before_get_SERVER_ID", 1000)
> +1
> +set global debug= 'd,debug_lock.before_get_SERVER_ID';
> +stop slave;
> +start slave;
> +SELECT RELEASE_LOCK("debug_lock.before_get_SERVER_ID");
> +RELEASE_LOCK("debug_lock.before_get_SERVER_ID")
> +1
> +Slave_IO_Errno= 2013
> +SELECT IS_FREE_LOCK("debug_lock.before_get_COLLATION_SERVER_VALUE");
> +IS_FREE_LOCK("debug_lock.before_get_COLLATION_SERVER_VALUE")
> +1
> +SELECT GET_LOCK("debug_lock.before_get_COLLATION_SERVER_VALUE", 1000);
> +GET_LOCK("debug_lock.before_get_COLLATION_SERVER_VALUE", 1000)
> +1
> +set global debug= 'd,debug_lock.before_get_COLLATION_SERVER_VALUE';
> +stop slave;
> +start slave;
> +SELECT RELEASE_LOCK("debug_lock.before_get_COLLATION_SERVER_VALUE");
> +RELEASE_LOCK("debug_lock.before_get_COLLATION_SERVER_VALUE")
> +1
> +Slave_IO_Errno= 2013
> +SELECT IS_FREE_LOCK("debug_lock.before_get_TIME_ZONE_VALUE");
> +IS_FREE_LOCK("debug_lock.before_get_TIME_ZONE_VALUE")
> +1
> +SELECT GET_LOCK("debug_lock.before_get_TIME_ZONE_VALUE", 1000);
> +GET_LOCK("debug_lock.before_get_TIME_ZONE_VALUE", 1000)
> +1
> +set global debug= 'd,debug_lock.before_get_TIME_ZONE_VALUE';
> +stop slave;
> +start slave;
> +SELECT RELEASE_LOCK("debug_lock.before_get_TIME_ZONE_VALUE");
> +RELEASE_LOCK("debug_lock.before_get_TIME_ZONE_VALUE")
> +1
> +Slave_IO_Errno= 2013
> +set global debug= '';
>
> === added file
> 'mysql-test/suite/rpl/t/rpl_get_master_version_clock_on_disconnection.test'
> ---
> a/mysql-test/suite/rpl/t/rpl_get_master_version_clock_on_disconnection.test	1970-01-01
> 00:00:00 +0000
> +++
> b/mysql-test/suite/rpl/t/rpl_get_master_version_clock_on_disconnection.test	2009-06-27
> 02:32:25 +0000
> @@ -0,0 +1,56 @@
> +#
> +# BUG#45214: http://bugs.mysql.com/bug.php?id=45214 
> +# This test verifies if the slave I/O tread tries to reconnect to
> +# master when it tries to get the values of the UNIX_TIMESTAMP, SERVER_ID,
> +# COLLATION_SERVER and TIME_ZONE from master under network disconnection.
> +#
> +# Steps:
> +#    1 - Got DBUG_SYNC_POINT lock
> +#    2 - Set DBUG_SYNC_POINT before call mysql_real_query(...) function in
> get_master_version_and_clock(...) function and hang here
> +#    3 - shutdown master server for simulating network disconnection
> +#    4 - Release DBUG_SYNC_POINT lock
> +#    5 - Check if the slave I/O thread tries to reconnect to master. 
> +
> +source include/master-slave.inc;
> +
> +#Test case 1: Try to get the value of the UNIX_TIMESTAMP from master under network
> disconnection
> +connection slave;
> +let $debug_saved= `select @@global.debug`;
> +
> +let $debug_lock= "debug_lock.before_get_UNIX_TIMESTAMP";
> +eval SELECT IS_FREE_LOCK($debug_lock);
> +eval SELECT GET_LOCK($debug_lock, 1000);
> +
> +set global debug= 'd,debug_lock.before_get_UNIX_TIMESTAMP';
> +source extra/rpl_tests/rpl_get_master_version_clock_on_disconnection.test; 
> +
> +#Test case 2: Try to get the value of the SERVER_ID from master under network
> disconnection
> +connection slave;
> +let $debug_lock= "debug_lock.before_get_SERVER_ID";
> +eval SELECT IS_FREE_LOCK($debug_lock);
> +eval SELECT GET_LOCK($debug_lock, 1000);
> +
> +set global debug= 'd,debug_lock.before_get_SERVER_ID';
> +source extra/rpl_tests/rpl_get_master_version_clock_on_disconnection.test;
> +
> +#Test case 3: Try to get the value of the COLLATION_SERVER from master under network
> disconnection
> +connection slave;
> +let $debug_lock= "debug_lock.before_get_COLLATION_SERVER_VALUE";
> +eval SELECT IS_FREE_LOCK($debug_lock);
> +eval SELECT GET_LOCK($debug_lock, 1000);
> +
> +set global debug= 'd,debug_lock.before_get_COLLATION_SERVER_VALUE';
> +source extra/rpl_tests/rpl_get_master_version_clock_on_disconnection.test;
> +
> +#Test case 4: Try to get the value of the TIME_ZONE from master under network
> disconnection
> +connection slave;
> +let $debug_lock= "debug_lock.before_get_TIME_ZONE_VALUE";
> +eval SELECT IS_FREE_LOCK($debug_lock);
> +eval SELECT GET_LOCK($debug_lock, 1000);
> +
> +set global debug= 'd,debug_lock.before_get_TIME_ZONE_VALUE';
> +source extra/rpl_tests/rpl_get_master_version_clock_on_disconnection.test;
> +
> +eval set global debug= '$debug_saved';
> +
> +# End of tests
>
> === modified file 'sql/slave.cc'
> --- a/sql/slave.cc	2009-06-09 16:44:26 +0000
> +++ b/sql/slave.cc	2009-06-27 02:32:25 +0000
> @@ -854,6 +854,7 @@ int init_intvar_from_file(int* var, IO_C
>    RETURNS
>    0       ok
>    1       error
> +  2       transient network problem, the caller should try to reconnect
>  */
>  
>  static int get_master_version_and_clock(MYSQL* mysql, Master_info* mi)
> @@ -864,7 +865,7 @@ static int get_master_version_and_clock(
>    MYSQL_RES *master_res= 0;
>    MYSQL_ROW master_row;
>    DBUG_ENTER("get_master_version_and_clock");
> -
> +  int query_re= 0;
>    /*
>      Free old description_event_for_queue (that is needed if we are in
>      a reconnection).
> @@ -938,10 +939,16 @@ static int get_master_version_and_clock(
>      Compare the master and slave's clock. Do not die if master's clock is
>      unavailable (very old master not supporting UNIX_TIMESTAMP()?).
>    */
> -
> -  if (!mysql_real_query(mysql, STRING_WITH_LEN("SELECT UNIX_TIMESTAMP()"))
> &&
> -      (master_res= mysql_store_result(mysql)) &&
> -      (master_row= mysql_fetch_row(master_res)))
> +  DBUG_SYNC_POINT("debug_lock.before_get_UNIX_TIMESTAMP", 10);
> +  query_re= mysql_real_query(mysql, STRING_WITH_LEN("SELECT UNIX_TIMESTAMP()"));
> +  if (query_re)
> +    DBUG_RETURN(2);

I think you need call to rli->report(WARNING_LEVEL, a message).
Warnings can be error-logged to provide the user info that can possibly
indicate on existence of network issues and so on.

> +  
> +  master_res= mysql_store_result(mysql);
> +  if (!master_res)
> +    DBUG_RETURN(2);

As said above, that could be a Warning only if mysql_error() confirms a network problem.
You may choose engage the working goto-technique:

 if (!master_res && mysql_error() is  a network issue)

   err_buff=...
   error_level= WARNING_LEVEL;
   goto err;


provided that you added a variable

 enum loglevel err_level= ERR_LEVEL; // default


and changed s/ERR_LEVEL/err_level/ the trainling part of the function

  if (errmsg)
  {
    DBUG_ASSERT(err_code != 0);
    mi->report(err_level, err_code, err_buff);
    DBUG_RETURN(1);
  }


> +  
> +  if (master_row= mysql_fetch_row(master_res))
>    {
>      mi->clock_diff_with_master=
>        (long) (time((time_t*) 0) - strtoul(master_row[0], 0, 10));

Conversly to above, you need to change the remained 

  sql_print_warning("\"SELECT UNIX_TIMESTAMP()\"

to generate an Error if mysql_error() does not confirm a network problem.


The remained 3 interractions with the master follow the same as above pattern.

> @@ -967,26 +974,34 @@ static int get_master_version_and_clock(
>      Note: we could have put a @@SERVER_ID in the previous SELECT
>      UNIX_TIMESTAMP() instead, but this would not have worked on 3.23 masters.
>    */
> -  if (!mysql_real_query(mysql,
> -                        STRING_WITH_LEN("SHOW VARIABLES LIKE 'SERVER_ID'"))
> &&
> -      (master_res= mysql_store_result(mysql)))
> -  {
> -    if ((master_row= mysql_fetch_row(master_res)) &&
> -        (::server_id == strtoul(master_row[1], 0, 10)) &&
> -        !mi->rli.replicate_same_server_id)
> -    {
> -      errmsg= "The slave I/O thread stops because master and slave have equal \
> +  DBUG_SYNC_POINT("debug_lock.before_get_SERVER_ID", 10);
> +  query_re= mysql_real_query(mysql, STRING_WITH_LEN("SHOW VARIABLES LIKE
> 'SERVER_ID'"));
> +  if (query_re)
> +    DBUG_RETURN(2);
> +
> +  master_res= mysql_store_result(mysql);
> +  if (!master_res)
> +    DBUG_RETURN(2);
> +
> +  master_row= mysql_fetch_row(master_res);
> +  if (master_row && (::server_id == strtoul(master_row[1], 0, 10))
> &&
> +      !mi->rli.replicate_same_server_id)
> +  {
> +    errmsg= "The slave I/O thread stops because master and slave have equal \
>  MySQL server ids; these ids must be different for replication to work (or \
>  the --replicate-same-server-id option must be used on slave but this does \
>  not always make sense; please check the manual before using it).";
> -      err_code= ER_SLAVE_FATAL_ERROR;
> -      sprintf(err_buff, ER(err_code), errmsg);
> -    }
> -    mysql_free_result(master_res);
> -    if (errmsg)
> -      goto err;
> +    err_code= ER_SLAVE_FATAL_ERROR;
> +    sprintf(err_buff, ER(err_code), errmsg);
>    }
> +  else if (!master_row)
> +    sql_print_warning("\"SHOW VARIABLES LIKE 'SERVER_ID'\" failed on master, \
> +maybe it is a very old master.");
> +  mysql_free_result(master_res);
>  
> +  if (errmsg)
> +    goto err;
> + 
>    /*
>      Check that the master's global character_set_server and ours are the same.
>      Not fatal if query fails (old master?).
> @@ -1008,13 +1023,20 @@ not always make sense; please check the 
>    if (*mysql->server_version == '3')
>      goto err;
>  
> -  if ((*mysql->server_version == '4') &&
> -      !mysql_real_query(mysql,
> -                        STRING_WITH_LEN("SELECT @@GLOBAL.COLLATION_SERVER"))
> &&
> -      (master_res= mysql_store_result(mysql)))
> +  if (*mysql->server_version == '4')
>    {
> -    if ((master_row= mysql_fetch_row(master_res)) &&
> -        strcmp(master_row[0], global_system_variables.collation_server->name))
> +    DBUG_SYNC_POINT("debug_lock.before_get_COLLATION_SERVER_VALUE", 10);
> +    query_re= mysql_real_query(mysql, STRING_WITH_LEN("SELECT
> @@GLOBAL.COLLATION_SERVER"));
> +    if (query_re)
> +      DBUG_RETURN(2);
> +
> +    master_res= mysql_store_result(mysql);
> +    if (!master_res)
> +      DBUG_RETURN(2);
> +
> +    master_row= mysql_fetch_row(master_res);
> +    if (master_row && strcmp(master_row[0],
> +                             global_system_variables.collation_server->name))
>      {
>        errmsg= "The slave I/O thread stops because master and slave have \
>  different values for the COLLATION_SERVER global variable. The values must \
> @@ -1022,7 +1044,11 @@ be equal for replication to work";
>        err_code= ER_SLAVE_FATAL_ERROR;
>        sprintf(err_buff, ER(err_code), errmsg);
>      }
> +    else if (!master_row)
> +      sql_print_warning("\"SELECT @@GLOBAL.COLLATION_SERVER\" failed on master, \
> +maybe it is a very old master.");
>      mysql_free_result(master_res);
> +
>      if (errmsg)
>        goto err;
>    }
> @@ -1042,13 +1068,20 @@ be equal for replication to work";
>      This check is only necessary for 4.x masters (and < 5.0.4 masters but
>      those were alpha).
>    */
> -  if ((*mysql->server_version == '4') &&
> -      !mysql_real_query(mysql, STRING_WITH_LEN("SELECT @@GLOBAL.TIME_ZONE"))
> &&
> -      (master_res= mysql_store_result(mysql)))
> -  {
> -    if ((master_row= mysql_fetch_row(master_res)) &&
> -        strcmp(master_row[0],
> -               global_system_variables.time_zone->get_name()->ptr()))
> +  if (*mysql->server_version == '4')
> +  {
> +    DBUG_SYNC_POINT("debug_lock.before_get_TIME_ZONE_VALUE", 10);
> +    query_re= mysql_real_query(mysql, STRING_WITH_LEN("SELECT
> @@GLOBAL.TIME_ZONE"));
> +    if (query_re)
> +      DBUG_RETURN(2);
> +
> +    master_res= mysql_store_result(mysql);
> +    if (!master_res)
> +      DBUG_RETURN(2);
> +
> +    master_row= mysql_fetch_row(master_res);
> +    if (master_row && strcmp(master_row[0],
> +                            
> global_system_variables.time_zone->get_name()->ptr()))
>      {
>        errmsg= "The slave I/O thread stops because master and slave have \
>  different values for the TIME_ZONE global variable. The values must \
> @@ -1056,6 +1089,13 @@ be equal for replication to work";
>        err_code= ER_SLAVE_FATAL_ERROR;
>        sprintf(err_buff, ER(err_code), errmsg);
>      }
> +    else if (!master_row)
> +    {
> +      errmsg= "The slave I/O thread stops because the value of \
> +the TIME_ZONE global variable is not existent on master.";
> +      err_code= ER_SLAVE_FATAL_ERROR;
> +      sprintf(err_buff, ER(err_code), errmsg);
> +    }
>      mysql_free_result(master_res);
>  
>      if (errmsg)
> @@ -2372,6 +2412,7 @@ pthread_handler_t handle_slave_io(void *
>    char llbuff[22];
>    uint retry_count;
>    bool suppress_warnings;
> +  int ret;
>  #ifndef DBUG_OFF
>    uint retry_count_reg= 0, retry_count_dump= 0, retry_count_event= 0;
>  #endif
> @@ -2382,7 +2423,7 @@ pthread_handler_t handle_slave_io(void *
>    DBUG_ASSERT(mi->inited);
>    mysql= NULL ;
>    retry_count= 0;
> -
> +  
>    pthread_mutex_lock(&mi->run_lock);
>    /* Inform waiting threads that slave has started */
>    mi->slave_run_id++;
> @@ -2451,8 +2492,19 @@ connected:
>    mi->slave_running= MYSQL_SLAVE_RUN_CONNECT;
>    thd->slave_net = &mysql->net;
>    thd_proc_info(thd, "Checking master version");
> -  if (get_master_version_and_clock(mysql, mi))
> +  ret= get_master_version_and_clock(mysql, mi);
> +  if (ret == 1)
> +    /* Fatal error */
>      goto err;
> +  
> +  if (ret == 2) 
> +  {
> +    /* Try to reconnect because the error was caused by a transient network problem
> */
> +    if (try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings,
> +                             reconnect_messages[SLAVE_RECON_ACT_REG]))
> +      goto err;
> +    goto connected;
> +  } 
>  
>    if (mi->rli.relay_log.description_event_for_queue->binlog_version > 1)
>    {
>
>
> # Bazaar merge directive format 2 (Bazaar 0.90)
> # revision_id: dao-gang.qu@stripped
> # target_branch: file:///home/daogangq/mysql/bzrwork/bug45214/5.1-bt/
> # testament_sha1: 314bc0d7091b26dc84ed77172332b7ba88c24146
> # timestamp: 2009-06-27 10:32:41 +0800
> # base_revision_id: davi.arnaut@stripped\
> #   jrxxjz0fx90hj9lv
> # 

Finally, please check more carefully next time if you add an unnecessary
black line changes like

@@ -2382,7 +2423,7 @@ pthread_handler_t handle_slave_io(void *
   DBUG_ASSERT(mi->inited);
   mysql= NULL ;
   retry_count= 0;
-
+  

@@ -1022,7 +1044,11 @@ be equal for replication to work";
       err_code= ER_SLAVE_FATAL_ERROR;
       sprintf(err_buff, ER(err_code), errmsg);
     }
+    else if (!master_row)
+      sql_print_warning("\"SELECT @@GLOBAL.COLLATION_SERVER\" failed on master, \
+maybe it is a very old master.");
     mysql_free_result(master_res);
+
     if (errmsg)
       goto err;
   }


cheers,

Andrei
Thread
bzr commit into mysql-5.1-bugteam branch (Dao-Gang.Qu:2941) Bug#45214Dao-Gang.Qu27 Jun
  • Re: bzr commit into mysql-5.1-bugteam branch (Dao-Gang.Qu:2941)Bug#45214He Zhenxing29 Jun
    • Re: bzr commit into mysql-5.1-bugteam branch (Dao-Gang.Qu:2941)Bug#45214Daogang Qu1 Jul
  • Re: bzr commit into mysql-5.1-bugteam branch (Dao-Gang.Qu:2941)Bug#45214Andrei Elkin29 Jun
    • Re: bzr commit into mysql-5.1-bugteam branch (Dao-Gang.Qu:2941)Bug#45214Daogang Qu1 Jul