List:Commits« Previous MessageNext Message »
From:Dao-Gang.Qu Date:July 6 2009 7:23am
Subject:bzr commit into mysql-5.1-bugteam branch (Dao-Gang.Qu:2941) Bug#45214
View as plain text  
#At file:///home/daogangq/mysql/bzrwork/bug45214/5.1-bt/ based on revid:davi.arnaut@stripped

 2941 Dao-Gang.Qu@stripped	2009-07-06
      Bug #45214  get_master_version_and_clock does not report error when queries fail
            
      The "get_master_version_and_clock(...)" function in sql/slave.cc ignores 
      error and passes directly when queries fail, or queries succeed 
      but the result retrieved is empty.
      
      The "get_master_version_and_clock(...)" function should try to reconnect master
      if queries fail because of transient network problems, and fail otherwise.
      The I/O thread should print a warning if the some system variables do not 
      exist on master (very old master)
     @ mysql-test/extra/rpl_tests/rpl_get_master_version_and_clock.test
        Added test file for bug #45214
     @ mysql-test/suite/rpl/r/rpl_get_master_version_and_clock.result
        Added test result for bug #45214
     @ mysql-test/suite/rpl/t/rpl_get_master_version_and_clock.test
        Added test file for bug #45214
     @ sql/slave.cc
        The 'is_network_error()' function is added for checking if the error is caused by network.
        Added a new return value (2) to 'get_master_version_and_clock()' function result set 
        to indicate transient network errors when queries fail, and the caller should 
        try to reconnect in this case.

    added:
      mysql-test/extra/rpl_tests/rpl_get_master_version_and_clock.test
      mysql-test/suite/rpl/r/rpl_get_master_version_and_clock.result
      mysql-test/suite/rpl/t/rpl_get_master_version_and_clock.test
    modified:
      sql/slave.cc
=== added file 'mysql-test/extra/rpl_tests/rpl_get_master_version_and_clock.test'
--- a/mysql-test/extra/rpl_tests/rpl_get_master_version_and_clock.test	1970-01-01 00:00:00 +0000
+++ b/mysql-test/extra/rpl_tests/rpl_get_master_version_and_clock.test	2009-07-06 07:23:14 +0000
@@ -0,0 +1,63 @@
+#
+# BUG#45214
+# The common part of the "rpl_get_master_version_and_clock" test. 
+# Restart slave under network disconnection between slave and master
+# following the steps:
+#    1 - Got DBUG_SYNC_POINT lock
+#    2 - Set DBUG_SYNC_POINT before call mysql_real_query(...) function in get_master_version_and_clock(...) function and hang here
+#    3 - shutdown master server for simulating network disconnection
+#    4 - Release DBUG_SYNC_POINT lock
+#    5 - Check if the slave I/O thread tries to reconnect to master. 
+#
+# Note: Please make sure initialize the $debug_lock when call the test script.
+#
+connection slave;
+if (`SELECT '$debug_lock' = ''`)
+{
+    --die Cannot continue. Please set value for $debug_lock.
+}
+
+# Restart slave
+--disable_warnings
+stop slave;
+source include/wait_for_slave_to_stop.inc;
+start slave;
+source include/wait_for_slave_to_start.inc;
+connection master;
+# Write file to make mysql-test-run.pl expect the "crash", but don't start
+# it until it's told to
+--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
+wait
+EOF
+
+# Send shutdown to the connected server and give
+# it 10 seconds to die before zapping it
+shutdown_server 10;
+
+connection slave;
+eval SELECT RELEASE_LOCK($debug_lock);
+
+# Write file to make mysql-test-run.pl start up the server again
+--append_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
+restart
+EOF
+
+connection master;
+# Turn on reconnect
+--enable_reconnect
+
+# Call script that will poll the server waiting for it to be back online again
+--source include/wait_until_connected_again.inc
+
+# Turn off reconnect again
+--disable_reconnect
+
+connection slave;
+source include/wait_for_slave_to_start.inc;
+
+# Show slave last IO errno
+let $last_io_errno= query_get_value("show slave status", Last_IO_Errno, 1);
+echo Slave_IO_Errno= $last_io_errno;
+
+
+# End of tests

=== added file 'mysql-test/suite/rpl/r/rpl_get_master_version_and_clock.result'
--- a/mysql-test/suite/rpl/r/rpl_get_master_version_and_clock.result	1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/rpl/r/rpl_get_master_version_and_clock.result	2009-07-06 07:23:14 +0000
@@ -0,0 +1,40 @@
+stop slave;
+drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
+reset master;
+reset slave;
+drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
+start slave;
+SELECT IS_FREE_LOCK("debug_lock.before_get_UNIX_TIMESTAMP");
+IS_FREE_LOCK("debug_lock.before_get_UNIX_TIMESTAMP")
+1
+SELECT GET_LOCK("debug_lock.before_get_UNIX_TIMESTAMP", 1000);
+GET_LOCK("debug_lock.before_get_UNIX_TIMESTAMP", 1000)
+1
+set global debug= 'd,debug_lock.before_get_UNIX_TIMESTAMP';
+stop slave;
+start slave;
+SELECT RELEASE_LOCK("debug_lock.before_get_UNIX_TIMESTAMP");
+RELEASE_LOCK("debug_lock.before_get_UNIX_TIMESTAMP")
+1
+Slave_IO_Errno= 2013
+SELECT IS_FREE_LOCK("debug_lock.before_get_SERVER_ID");
+IS_FREE_LOCK("debug_lock.before_get_SERVER_ID")
+1
+SELECT GET_LOCK("debug_lock.before_get_SERVER_ID", 1000);
+GET_LOCK("debug_lock.before_get_SERVER_ID", 1000)
+1
+set global debug= 'd,debug_lock.before_get_SERVER_ID';
+stop slave;
+start slave;
+SELECT RELEASE_LOCK("debug_lock.before_get_SERVER_ID");
+RELEASE_LOCK("debug_lock.before_get_SERVER_ID")
+1
+Slave_IO_Errno= 2013
+set global debug= '';
+reset master;
+include/stop_slave.inc
+change master to master_port=SLAVE_PORT;
+start slave;
+*** must be having the replicate-same-server-id IO thread error ***
+Slave_IO_Errno= 1593
+Slave_IO_Error= Fatal error: The slave I/O thread stops because master and slave have equal MySQL server ids; these ids must be different for replication to work (or the --replicate-same-server-id option must be used on slave but this does not always make sense; please check the manual before using it).

=== added file 'mysql-test/suite/rpl/t/rpl_get_master_version_and_clock.test'
--- a/mysql-test/suite/rpl/t/rpl_get_master_version_and_clock.test	1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/rpl/t/rpl_get_master_version_and_clock.test	2009-07-06 07:23:14 +0000
@@ -0,0 +1,60 @@
+#
+# BUG#45214 
+# This test verifies if the slave I/O tread tries to reconnect to
+# master when it tries to get the values of the UNIX_TIMESTAMP, SERVER_ID,
+# COLLATION_SERVER and TIME_ZONE from master under network disconnection.
+# The COLLATION_SERVER and TIME_ZONE are got only on master server version 4.
+# So they can't be verified by test case here.
+# Finish the following tests by calling its common test script:  
+# extra/rpl_tests/rpl_get_master_version_and_clock.test. 
+# And meanwhile this test checks that the slave I/O thread refuses to start if slave
+# and master have the same server id (because this is a useless setup,
+# and otherwise SHOW SLAVE STATUS shows progress but all queries are
+# ignored, which has caught our customers), unless
+# --replicate-same-server-id.
+#
+
+source include/master-slave.inc;
+
+#Test case 1: Try to get the value of the UNIX_TIMESTAMP from master under network disconnection
+connection slave;
+let $debug_saved= `select @@global.debug`;
+
+let $debug_lock= "debug_lock.before_get_UNIX_TIMESTAMP";
+eval SELECT IS_FREE_LOCK($debug_lock);
+eval SELECT GET_LOCK($debug_lock, 1000);
+
+set global debug= 'd,debug_lock.before_get_UNIX_TIMESTAMP';
+source extra/rpl_tests/rpl_get_master_version_and_clock.test; 
+
+#Test case 2: Try to get the value of the SERVER_ID from master under network disconnection
+connection slave;
+let $debug_lock= "debug_lock.before_get_SERVER_ID";
+eval SELECT IS_FREE_LOCK($debug_lock);
+eval SELECT GET_LOCK($debug_lock, 1000);
+
+set global debug= 'd,debug_lock.before_get_SERVER_ID';
+source extra/rpl_tests/rpl_get_master_version_and_clock.test;
+
+eval set global debug= '$debug_saved';
+
+#Test case 3: This test checks that the slave I/O thread refuses to start
+#if slave and master have the same server id.
+connection slave;
+reset master;
+# replicate ourselves
+source include/stop_slave.inc;
+--replace_result $SLAVE_MYPORT SLAVE_PORT
+eval change master to master_port=$SLAVE_MYPORT;
+start slave;
+
+let $slave_param= Last_IO_Errno;
+let $slave_param_value= 1593;
+#source include/wait_for_slave_param.inc;
+--echo *** must be having the replicate-same-server-id IO thread error ***
+let $last_io_errno= query_get_value("show slave status", Last_IO_Errno, 1);
+let $last_io_error= query_get_value("show slave status", Last_IO_Error, 1);
+echo Slave_IO_Errno= $last_io_errno;
+echo Slave_IO_Error= $last_io_error;
+
+# End of tests

=== modified file 'sql/slave.cc'
--- a/sql/slave.cc	2009-06-09 16:44:26 +0000
+++ b/sql/slave.cc	2009-07-06 07:23:14 +0000
@@ -38,6 +38,7 @@
 #include <my_dir.h>
 #include <sql_common.h>
 #include <errmsg.h>
+#include <mysqld_error.h>
 #include <mysys_err.h>
 
 #ifdef HAVE_REPLICATION
@@ -842,6 +843,29 @@ int init_intvar_from_file(int* var, IO_C
   DBUG_RETURN(1);
 }
 
+
+/*
+  Check if the error is caused by network.
+  @param[in]   errorno   Number of the error.
+  RETURNS:
+  TRUE         network error
+  FALSE        not network error
+*/
+
+bool is_network_error(uint errorno)
+{ 
+  if (errorno == CR_CONNECTION_ERROR || 
+      errorno == CR_CONN_HOST_ERROR ||
+      errorno == CR_SERVER_GONE_ERROR ||
+      errorno == CR_SERVER_LOST ||
+      errorno == ER_CON_COUNT_ERROR ||
+      errorno == ER_SERVER_SHUTDOWN)
+    return TRUE;
+
+  return FALSE;   
+}
+
+
 /*
   Note that we rely on the master's version (3.23, 4.0.14 etc) instead of
   relying on the binlog's version. This is not perfect: imagine an upgrade
@@ -854,6 +878,7 @@ int init_intvar_from_file(int* var, IO_C
   RETURNS
   0       ok
   1       error
+  2       transient network problem, the caller should try to reconnect
 */
 
 static int get_master_version_and_clock(MYSQL* mysql, Master_info* mi)
@@ -864,6 +889,7 @@ static int get_master_version_and_clock(
   MYSQL_RES *master_res= 0;
   MYSQL_ROW master_row;
   DBUG_ENTER("get_master_version_and_clock");
+  int query_re= 0;
 
   /*
     Free old description_event_for_queue (that is needed if we are in
@@ -939,6 +965,8 @@ static int get_master_version_and_clock(
     unavailable (very old master not supporting UNIX_TIMESTAMP()?).
   */
 
+  DBUG_SYNC_POINT("debug_lock.before_get_UNIX_TIMESTAMP", 10);
+  master_res= NULL;
   if (!mysql_real_query(mysql, STRING_WITH_LEN("SELECT UNIX_TIMESTAMP()")) &&
       (master_res= mysql_store_result(mysql)) &&
       (master_row= mysql_fetch_row(master_res)))
@@ -946,7 +974,13 @@ static int get_master_version_and_clock(
     mi->clock_diff_with_master=
       (long) (time((time_t*) 0) - strtoul(master_row[0], 0, 10));
   }
-  else if (!check_io_slave_killed(mi->io_thd, mi, NULL))
+  else if (is_network_error(mysql_errno(mysql)))
+  {
+    mi->report(WARNING_LEVEL, mysql_errno(mysql),
+               "Get master clock failed with error: %s", mysql_error(mysql));
+    goto network_err;
+  }
+  else 
   {
     mi->clock_diff_with_master= 0; /* The "most sensible" value */
     sql_print_warning("\"SELECT UNIX_TIMESTAMP()\" failed on master, "
@@ -955,7 +989,10 @@ static int get_master_version_and_clock(
                       mysql_error(mysql), mysql_errno(mysql));
   }
   if (master_res)
+  {
     mysql_free_result(master_res);
+    master_res= NULL;
+  }
 
   /*
     Check that the master's server id and ours are different. Because if they
@@ -967,12 +1004,15 @@ static int get_master_version_and_clock(
     Note: we could have put a @@SERVER_ID in the previous SELECT
     UNIX_TIMESTAMP() instead, but this would not have worked on 3.23 masters.
   */
+  DBUG_SYNC_POINT("debug_lock.before_get_SERVER_ID", 10);
+  master_res= NULL;
+  master_row= NULL;
   if (!mysql_real_query(mysql,
                         STRING_WITH_LEN("SHOW VARIABLES LIKE 'SERVER_ID'")) &&
-      (master_res= mysql_store_result(mysql)))
+      (master_res= mysql_store_result(mysql)) &&
+      (master_row= mysql_fetch_row(master_res)))
   {
-    if ((master_row= mysql_fetch_row(master_res)) &&
-        (::server_id == strtoul(master_row[1], 0, 10)) &&
+    if ((::server_id == strtoul(master_row[1], 0, 10)) &&
         !mi->rli.replicate_same_server_id)
     {
       errmsg= "The slave I/O thread stops because master and slave have equal \
@@ -981,10 +1021,34 @@ the --replicate-same-server-id option mu
 not always make sense; please check the manual before using it).";
       err_code= ER_SLAVE_FATAL_ERROR;
       sprintf(err_buff, ER(err_code), errmsg);
+      goto err;
     }
+  }
+  else if (mysql_errno(mysql))
+  {
+    if (is_network_error(mysql_errno(mysql)))
+    {
+      mi->report(WARNING_LEVEL, mysql_errno(mysql),
+                 "Get master SERVER_ID failed with error: %s", mysql_error(mysql));
+      goto network_err;
+    }
+    /* Fatal error */
+    errmsg= "The slave I/O thread stops because a fatal error is encountered \
+when it try to get the value of SERVER_ID variable from master.";
+    err_code= ER_SLAVE_FATAL_ERROR;
+    sprintf(err_buff, ER(err_code), errmsg);
+    goto err;
+  }
+  else if (!master_row && master_res)
+  {
+    mi->report(WARNING_LEVEL, ER_UNKNOWN_SYSTEM_VARIABLE,
+               "Unknown system variable 'SERVER_ID' on master, \
+maybe it is a very old master.");
+  }
+  if (master_res)
+  {
     mysql_free_result(master_res);
-    if (errmsg)
-      goto err;
+    master_res= NULL;
   }
 
   /*
@@ -1008,23 +1072,49 @@ not always make sense; please check the 
   if (*mysql->server_version == '3')
     goto err;
 
-  if ((*mysql->server_version == '4') &&
-      !mysql_real_query(mysql,
-                        STRING_WITH_LEN("SELECT @@GLOBAL.COLLATION_SERVER")) &&
-      (master_res= mysql_store_result(mysql)))
+  if (*mysql->server_version == '4')
   {
-    if ((master_row= mysql_fetch_row(master_res)) &&
-        strcmp(master_row[0], global_system_variables.collation_server->name))
+    master_res= NULL;
+    if (!mysql_real_query(mysql,
+                          STRING_WITH_LEN("SELECT @@GLOBAL.COLLATION_SERVER")) &&
+        (master_res= mysql_store_result(mysql)) &&
+        (master_row= mysql_fetch_row(master_res)))
     {
-      errmsg= "The slave I/O thread stops because master and slave have \
+      if (strcmp(master_row[0], global_system_variables.collation_server->name))
+      {
+        errmsg= "The slave I/O thread stops because master and slave have \
 different values for the COLLATION_SERVER global variable. The values must \
 be equal for replication to work";
+        err_code= ER_SLAVE_FATAL_ERROR;
+        sprintf(err_buff, ER(err_code), errmsg);
+        goto err;
+      }
+    }
+    else if (is_network_error(mysql_errno(mysql)))
+    {
+      mi->report(WARNING_LEVEL, mysql_errno(mysql),
+                 "Get master COLLATION_SERVER failed with error: %s", mysql_error(mysql));
+      goto network_err;
+    }
+    else if (mysql_errno(mysql) != ER_UNKNOWN_SYSTEM_VARIABLE)
+    {
+      /* Fatal error */
+      errmsg= "The slave I/O thread stops because a fatal error is encountered \
+when it try to get the value of COLLATION_SERVER global variable from master.";
       err_code= ER_SLAVE_FATAL_ERROR;
       sprintf(err_buff, ER(err_code), errmsg);
-    }
-    mysql_free_result(master_res);
-    if (errmsg)
       goto err;
+    }
+    else
+      mi->report(WARNING_LEVEL, ER_UNKNOWN_SYSTEM_VARIABLE,
+                 "Unknown system variable 'COLLATION_SERVER' on master, \
+maybe it is a very old master.");
+
+    if (master_res)
+    {
+      mysql_free_result(master_res);
+      master_res= NULL;
+    }
   }
 
   /*
@@ -1042,35 +1132,62 @@ be equal for replication to work";
     This check is only necessary for 4.x masters (and < 5.0.4 masters but
     those were alpha).
   */
-  if ((*mysql->server_version == '4') &&
-      !mysql_real_query(mysql, STRING_WITH_LEN("SELECT @@GLOBAL.TIME_ZONE")) &&
-      (master_res= mysql_store_result(mysql)))
-  {
-    if ((master_row= mysql_fetch_row(master_res)) &&
-        strcmp(master_row[0],
-               global_system_variables.time_zone->get_name()->ptr()))
+  if (*mysql->server_version == '4')
+  {
+    master_res= NULL;
+    if (!mysql_real_query(mysql, STRING_WITH_LEN("SELECT @@GLOBAL.TIME_ZONE")) &&
+        (master_res= mysql_store_result(mysql)) &&
+        (master_row= mysql_fetch_row(master_res)))
     {
-      errmsg= "The slave I/O thread stops because master and slave have \
+      if (strcmp(master_row[0],
+                 global_system_variables.time_zone->get_name()->ptr()))
+      {
+        errmsg= "The slave I/O thread stops because master and slave have \
 different values for the TIME_ZONE global variable. The values must \
 be equal for replication to work";
+        err_code= ER_SLAVE_FATAL_ERROR;
+        sprintf(err_buff, ER(err_code), errmsg);
+        goto err;
+      }
+    }
+    else if (is_network_error(mysql_errno(mysql)))
+    {
+      mi->report(WARNING_LEVEL, mysql_errno(mysql),
+                 "Get master TIME_ZONE failed with error: %s", mysql_error(mysql));
+      goto network_err;
+    } 
+    else
+    {
+      /* Fatal error */
+      errmsg= "The slave I/O thread stops because a fatal error is encountered \
+when it try to get the value of TIME_ZONE global variable from master.";
       err_code= ER_SLAVE_FATAL_ERROR;
       sprintf(err_buff, ER(err_code), errmsg);
-    }
-    mysql_free_result(master_res);
-
-    if (errmsg)
       goto err;
+    }
+    if (master_res)
+    {
+      mysql_free_result(master_res);
+      master_res= NULL;
+    }
   }
 
 err:
   if (errmsg)
   {
+    if (master_res)
+      mysql_free_result(master_res);
     DBUG_ASSERT(err_code != 0);
     mi->report(ERROR_LEVEL, err_code, err_buff);
     DBUG_RETURN(1);
   }
 
   DBUG_RETURN(0);
+
+network_err:
+  if (master_res)
+    mysql_free_result(master_res);
+  DBUG_RETURN(2);
 }
 
 /*
@@ -2372,6 +2489,7 @@ pthread_handler_t handle_slave_io(void *
   char llbuff[22];
   uint retry_count;
   bool suppress_warnings;
+  int ret;
 #ifndef DBUG_OFF
   uint retry_count_reg= 0, retry_count_dump= 0, retry_count_event= 0;
 #endif
@@ -2451,8 +2569,22 @@ connected:
   mi->slave_running= MYSQL_SLAVE_RUN_CONNECT;
   thd->slave_net = &mysql->net;
   thd_proc_info(thd, "Checking master version");
-  if (get_master_version_and_clock(mysql, mi))
+  ret= get_master_version_and_clock(mysql, mi);
+  if (ret == 1)
+    /* Fatal error */
     goto err;
+  
+  if (ret == 2) 
+  { 
+    if (check_io_slave_killed(mi->io_thd, mi, NULL))
+      goto err;
+    suppress_warnings= FALSE;
+    /* Try to reconnect because the error was caused by a transient network problem */
+    if (try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings,
+                             reconnect_messages[SLAVE_RECON_ACT_REG]))
+      goto err;
+    goto connected;
+  } 
 
   if (mi->rli.relay_log.description_event_for_queue->binlog_version > 1)
   {


Attachment: [text/bzr-bundle] bzr/dao-gang.qu@sun.com-20090706072314-g0gdgpx060olayax.bundle
Thread
bzr commit into mysql-5.1-bugteam branch (Dao-Gang.Qu:2941) Bug#45214Dao-Gang.Qu6 Jul