List:Commits« Previous MessageNext Message »
From:ramil Date:June 29 2007 6:48pm
Subject:bk commit into 5.1 tree (ramil:1.2524) BUG#19328
View as plain text  
Below is the list of changes that have just been committed into a local
5.1 repository of ram. When ram does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet@stripped, 2007-06-29 21:48:22+05:00, ramil@stripped +1 -0
  Fix for bug #19328: Slave timeout with COM_REGISTER_SLAVE error causing stop
  
  Problem: "Under high load, the slave registering to the master can timeout 
  during the COM_REGISTER_SLAVE execution. This causes an error, which 
  prevents the slave from connecting at all."
  
  Fix: Do not abort immediately, but retry registering on master.

  sql/slave.cc@stripped, 2007-06-29 21:48:20+05:00, ramil@stripped +208 -100
    Fix for bug #19328: Slave timeout with COM_REGISTER_SLAVE error causing stop
      - retry connecting after a failed slave registration on master.
        Reconnecting has been factored out (see try_to_reconnect() and 
        check_io_slave_killed()) to get rid of similar code used during 
        registering on master/dump requesting/event reading.
        All messages have been moved to reconnect_messages[] array 
        for easy usage and maintenance.
      - now one can force slave to reconnect using 
        ./mtr --mysqld=--loose-debug=d,FORCE_SLAVE_TO_RECONNECT_{REG, DUMP, EVENT}

diff -Nrup a/sql/slave.cc b/sql/slave.cc
--- a/sql/slave.cc	2007-06-12 01:15:29 +05:00
+++ b/sql/slave.cc	2007-06-29 21:48:20 +05:00
@@ -59,6 +59,58 @@ ulonglong relay_log_space_limit = 0;
 int disconnect_slave_event_count = 0, abort_slave_event_count = 0;
 int events_till_abort = -1;
 
+enum enum_slave_reconnect_actions
+{
+  SLAVE_RECON_ACT_REG= 0,
+  SLAVE_RECON_ACT_DUMP= 1,
+  SLAVE_RECON_ACT_EVENT= 2,
+  SLAVE_RECON_ACT_MAX
+};
+
+enum enum_slave_reconnect_messages
+{
+  SLAVE_RECON_MSG_WAIT= 0,
+  SLAVE_RECON_MSG_KILLED_WAITING= 1,
+  SLAVE_RECON_MSG_AFTER= 2,
+  SLAVE_RECON_MSG_FAILED= 3,
+  SLAVE_RECON_MSG_COMMAND= 4,
+  SLAVE_RECON_MSG_KILLED_AFTER= 5,
+  SLAVE_RECON_MSG_MAX
+};
+
+static const char *reconnect_messages[SLAVE_RECON_ACT_MAX][SLAVE_RECON_MSG_MAX]=
+{
+  {
+    "Waiting to reconnect after a failed registration on master",
+    "Slave I/O thread killed while waitnig to reconnect after a failed \
+registration on master",
+    "Reconnecting after a failed registration on master",
+    "failed registering on master, reconnecting to try again, \
+log '%s' at postion %s",
+    "COM_REGISTER_SLAVE",
+    "Slave I/O thread killed during or after reconnect"
+  },
+  {
+    "Waiting to reconnect after a failed binlog dump request",
+    "Slave I/O thread killed while retrying master dump",
+    "Reconnecting after a failed binlog dump request",
+    "failed dump request, reconnecting to try again, log '%s' at postion %s",
+    "COM_BINLOG_DUMP",
+    "Slave I/O thread killed during or after reconnect"
+  },
+  {
+    "Waiting to reconnect after a failed master event read",
+    "Slave I/O thread killed while waiting to reconnect after a failed read",
+    "Reconnecting after a failed master event read",
+    "Slave I/O thread: Failed reading log event, reconnecting to retry, \
+log '%s' at postion %s",
+    "",
+    "Slave I/O thread killed during or after a reconnect done to recover from \
+failed read"
+  }
+};
+ 
+
 typedef enum { SLAVE_THD_IO, SLAVE_THD_SQL} SLAVE_THD_TYPE;
 
 static int process_io_rotate(MASTER_INFO* mi, Rotate_log_event* rev);
@@ -1098,12 +1150,14 @@ static void write_ignored_events_info_to
 }
 
 
-int register_slave_on_master(MYSQL* mysql, MASTER_INFO *mi)
+int register_slave_on_master(MYSQL* mysql, MASTER_INFO *mi,
+                             bool *suppress_warnings)
 {
   uchar buf[1024], *pos= buf;
   uint report_host_len, report_user_len=0, report_password_len=0;
   DBUG_ENTER("register_slave_on_master");
 
+  *suppress_warnings= FALSE;
   if (!report_host)
     DBUG_RETURN(0);
   report_host_len= strlen(report_host);
@@ -1127,11 +1181,18 @@ int register_slave_on_master(MYSQL* mysq
 
   if (simple_command(mysql, COM_REGISTER_SLAVE, buf, (size_t) (pos- buf), 0))
   {
-    char buf[256];
-    my_snprintf(buf, sizeof(buf),
-                "%s (Errno: %d)", mysql_error(mysql), mysql_errno(mysql));
-    mi->report(ERROR_LEVEL, ER_SLAVE_MASTER_COM_FAILURE,
-               ER(ER_SLAVE_MASTER_COM_FAILURE), "COM_REGISTER_SLAVE", buf);
+    if (mysql_errno(mysql) == ER_NET_READ_INTERRUPTED)
+    {
+      *suppress_warnings= TRUE;                 // Suppress reconnect warning
+    }
+    else
+    {
+      char buf[256];
+      my_snprintf(buf, sizeof(buf), "%s (Errno: %d)", mysql_error(mysql), 
+                  mysql_errno(mysql));
+      mi->report(ERROR_LEVEL, ER_SLAVE_MASTER_COM_FAILURE,
+                 ER(ER_SLAVE_MASTER_COM_FAILURE), "COM_REGISTER_SLAVE", buf);
+    }
     DBUG_RETURN(1);
   }
   DBUG_RETURN(0);
@@ -1460,6 +1521,8 @@ static int request_dump(MYSQL* mysql, MA
   int binlog_flags = 0; // for now
   char* logname = mi->master_log_name;
   DBUG_ENTER("request_dump");
+  
+  *suppress_warnings= FALSE;
 
   // TODO if big log files: Change next to int8store()
   int4store(buf, (ulong) mi->master_log_pos);
@@ -1475,7 +1538,7 @@ static int request_dump(MYSQL* mysql, MA
       now we just fill up the error log :-)
     */
     if (mysql_errno(mysql) == ER_NET_READ_INTERRUPTED)
-      *suppress_warnings= 1;                    // Suppress reconnect warning
+      *suppress_warnings= TRUE;                 // Suppress reconnect warning
     else
       sql_print_error("Error on COM_BINLOG_DUMP: %d  %s, will retry in %d secs",
                       mysql_errno(mysql), mysql_error(mysql),
@@ -1539,7 +1602,7 @@ static ulong read_event(MYSQL* mysql, MA
   ulong len;
   DBUG_ENTER("read_event");
 
-  *suppress_warnings= 0;
+  *suppress_warnings= FALSE;
   /*
     my_real_read() will time us out
     We check if we were told to die, and if not, try reading again
@@ -1879,6 +1942,94 @@ on this slave.\
 }
 
 
+static bool check_io_slave_killed(THD *thd, MASTER_INFO *mi, const char *info)
+{
+  if (io_slave_killed(thd, mi))
+  {
+    if (global_system_variables.log_warnings)
+      sql_print_information(info);
+    return TRUE;
+  }
+  return FALSE;
+}
+
+
+/**
+  @brief Try to reconnect slave IO thread.
+
+  @details Terminates current connection to master, sleeps for
+  @c mi->connect_retry msecs and initiates new connection with
+  @c safe_reconnect(). Variable pointed by @c retry_count is increased -
+  if it exceeds @c master_retry_count then connection is not re-established
+  and function signals error.
+  Unless @c suppres_warnings is TRUE, a warning is put in the server error log
+  when reconnecting. The warning message and messages used to report errors
+  are taken from @c messages array. In case @c master_retry_count is exceeded,
+  no messages are added to the log.
+
+  @param[in]     thd                 Thread context.
+  @param[in]     mysql               MySQL connection.
+  @param[in]     mi                  Master connection information.
+  @param[in,out] retry_count         Number of attempts to reconnect.
+  @param[in]     suppress_warnings   TRUE when a normal net read timeout 
+                                     has caused to reconnecting.
+  @param[in]     messages            Messages to print/log, see 
+                                     reconnect_messages[] array.
+
+  @retval        0                   OK.
+  @retval        1                   There was an error.
+*/
+
+static int try_to_reconnect(THD *thd, MYSQL *mysql, MASTER_INFO *mi,
+                            uint *retry_count, bool suppress_warnings,
+                            const char *messages[SLAVE_RECON_MSG_MAX])
+{
+  mi->slave_running= MYSQL_SLAVE_RUN_NOT_CONNECT;
+  thd->proc_info= messages[SLAVE_RECON_MSG_WAIT];
+#ifdef SIGNAL_WITH_VIO_CLOSE  
+  thd->clear_active_vio();
+#endif
+  end_server(mysql);
+  if ((*retry_count)++)
+  {
+    if (*retry_count > master_retry_count)
+      return 1;                             // Don't retry forever
+    safe_sleep(thd, mi->connect_retry, (CHECK_KILLED_FUNC) io_slave_killed,
+               (void *) mi);
+  }
+  if (check_io_slave_killed(thd, mi, messages[SLAVE_RECON_MSG_KILLED_WAITING]))
+    return 1;
+  thd->proc_info = messages[SLAVE_RECON_MSG_AFTER];
+  if (!suppress_warnings) 
+  {
+    char buf[256], llbuff[22];
+    my_snprintf(buf, sizeof(buf), messages[SLAVE_RECON_MSG_FAILED], 
+                IO_RPL_LOG_NAME, llstr(mi->master_log_pos, llbuff));
+    /* 
+      Raise a warining during registering on master/requesting dump.
+      Log a message reading event.
+    */
+    if (messages[SLAVE_RECON_MSG_COMMAND][0])
+    {
+      mi->report(WARNING_LEVEL, ER_SLAVE_MASTER_COM_FAILURE,
+                 ER(ER_SLAVE_MASTER_COM_FAILURE), 
+                 messages[SLAVE_RECON_MSG_COMMAND], buf);
+    }
+    else
+    {
+      sql_print_information(buf);
+    }
+  }
+  if (safe_reconnect(thd, mysql, mi, 1) || io_slave_killed(thd, mi))
+  {
+    if (global_system_variables.log_warnings)
+      sql_print_information(messages[SLAVE_RECON_MSG_KILLED_AFTER]);
+    return 1;
+  }
+  return 0;
+}
+
+
 /* Slave I/O Thread entry point */
 
 pthread_handler_t handle_slave_io(void *arg)
@@ -1889,7 +2040,10 @@ pthread_handler_t handle_slave_io(void *
   RELAY_LOG_INFO *rli= &mi->rli;
   char llbuff[22];
   uint retry_count;
-
+  bool suppress_warnings;
+#ifndef DBUG_OFF
+  uint retry_count_reg= 0, retry_count_dump= 0, retry_count_event= 0;
+#endif
   // needs to call my_thread_init(), otherwise we get a coredump in DBUG_ stuff
   my_thread_init();
   DBUG_ENTER("handle_slave_io");
@@ -1975,79 +2129,56 @@ connected:
       Register ourselves with the master.
     */
     thd->proc_info = "Registering slave on master";
-    if (register_slave_on_master(mysql, mi))
+    if (register_slave_on_master(mysql, mi, &suppress_warnings))
     {
       sql_print_error("Slave I/O thread couldn't register on master");
-      goto err;
+      if (check_io_slave_killed(thd, mi, "Slave I/O thread killed while \
+registering slave on master") ||
+          try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings,
+                           reconnect_messages[SLAVE_RECON_ACT_REG]))
+        goto err;
+      goto connected;
     }
+    DBUG_EXECUTE_IF("FORCE_SLAVE_TO_RECONNECT_REG", 
+      if (!retry_count_reg)
+      {
+        retry_count_reg++;
+        sql_print_information("Forcing to reconnect slave I/O thread");
+        if (try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings,
+                             reconnect_messages[SLAVE_RECON_ACT_REG]))
+          goto err;
+        goto connected;
+      });
   }
 
   DBUG_PRINT("info",("Starting reading binary log from master"));
   while (!io_slave_killed(thd,mi))
   {
-    bool suppress_warnings= 0;
     thd->proc_info = "Requesting binlog dump";
     if (request_dump(mysql, mi, &suppress_warnings))
     {
       sql_print_error("Failed on request_dump()");
-      if (io_slave_killed(thd,mi))
-      {
-        sql_print_information("Slave I/O thread killed while requesting master \
-dump");
-        goto err;
-      }
-
-      mi->slave_running= MYSQL_SLAVE_RUN_NOT_CONNECT;
-      thd->proc_info= "Waiting to reconnect after a failed binlog dump request";
-#ifdef SIGNAL_WITH_VIO_CLOSE
-      thd->clear_active_vio();
-#endif
-      end_server(mysql);
-      /*
-        First time retry immediately, assuming that we can recover
-        right away - if first time fails, sleep between re-tries
-        hopefuly the admin can fix the problem sometime
-      */
-      if (retry_count++)
-      {
-        if (retry_count > master_retry_count)
-          goto err;                             // Don't retry forever
-        safe_sleep(thd,mi->connect_retry,(CHECK_KILLED_FUNC)io_slave_killed,
-                   (void*)mi);
-      }
-      if (io_slave_killed(thd,mi))
-      {
-        sql_print_information("Slave I/O thread killed while retrying master \
-dump");
-        goto err;
-      }
-
-      thd->proc_info = "Reconnecting after a failed binlog dump request";
-      if (!suppress_warnings) {
-        char buf[256];
-        my_snprintf(buf, sizeof(buf),
-                    "failed dump request, reconnecting to try again,"
-                    " log '%s' at postion %s",
-                    IO_RPL_LOG_NAME,
-                    llstr(mi->master_log_pos,llbuff));
-        mi->report(WARNING_LEVEL, ER_SLAVE_MASTER_COM_FAILURE,
-                   ER(ER_SLAVE_MASTER_COM_FAILURE), "COM_BINLOG_DUMP", buf);
-      }
-      if (safe_reconnect(thd, mysql, mi, suppress_warnings) ||
-          io_slave_killed(thd,mi))
-      {
-        sql_print_information("Slave I/O thread killed during or \
-after reconnect");
+      if (check_io_slave_killed(thd, mi, "Slave I/O thread killed while \
+requesting master dump") ||
+          try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings,
+                           reconnect_messages[SLAVE_RECON_ACT_DUMP]))
         goto err;
-      }
-
       goto connected;
     }
+    DBUG_EXECUTE_IF("FORCE_SLAVE_TO_RECONNECT_DUMP", 
+      if (!retry_count_dump)
+      {
+        retry_count_dump++;
+        sql_print_information("Forcing to reconnect slave I/O thread");
+        if (try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings,
+                             reconnect_messages[SLAVE_RECON_ACT_DUMP]))
+          goto err;
+        goto connected;
+      });
 
     while (!io_slave_killed(thd,mi))
     {
       ulong event_len;
-      suppress_warnings= 0;
       /*
          We say "waiting" because read_event() will wait if there's nothing to
          read. But if there's something to read, it will not wait. The
@@ -2056,12 +2187,19 @@ after reconnect");
       */
       thd->proc_info= "Waiting for master to send event";
       event_len= read_event(mysql, mi, &suppress_warnings);
-      if (io_slave_killed(thd,mi))
-      {
-        if (global_system_variables.log_warnings)
-          sql_print_information("Slave I/O thread killed while reading event");
+      if (check_io_slave_killed(thd, mi, "Slave I/O thread killed while \
+reading event"))
         goto err;
-      }
+      DBUG_EXECUTE_IF("FORCE_SLAVE_TO_RECONNECT_EVENT",
+        if (!retry_count_event)
+        {
+          retry_count_event++;
+          sql_print_information("Forcing to reconnect slave I/O thread");
+          if (try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings,
+                               reconnect_messages[SLAVE_RECON_ACT_EVENT]))
+            goto err;
+          goto connected;
+        });
 
       if (event_len == packet_error)
       {
@@ -2081,39 +2219,9 @@ max_allowed_packet",
                           mysql_error(mysql));
           goto err;
         }
-        mi->slave_running= MYSQL_SLAVE_RUN_NOT_CONNECT;
-        thd->proc_info = "Waiting to reconnect after a failed master event read";
-#ifdef SIGNAL_WITH_VIO_CLOSE
-        thd->clear_active_vio();
-#endif
-        end_server(mysql);
-        if (retry_count++)
-        {
-          if (retry_count > master_retry_count)
-            goto err;                           // Don't retry forever
-          safe_sleep(thd,mi->connect_retry,(CHECK_KILLED_FUNC)io_slave_killed,
-                     (void*) mi);
-        }
-        if (io_slave_killed(thd,mi))
-        {
-          if (global_system_variables.log_warnings)
-            sql_print_information("Slave I/O thread killed while waiting to \
-reconnect after a failed read");
+        if (try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings,
+                             reconnect_messages[SLAVE_RECON_ACT_EVENT]))
           goto err;
-        }
-        thd->proc_info = "Reconnecting after a failed master event read";
-        if (!suppress_warnings)
-          sql_print_information("Slave I/O thread: Failed reading log event, \
-reconnecting to retry, log '%s' position %s", IO_RPL_LOG_NAME,
-                          llstr(mi->master_log_pos, llbuff));
-        if (safe_reconnect(thd, mysql, mi, suppress_warnings) ||
-            io_slave_killed(thd,mi))
-        {
-          if (global_system_variables.log_warnings)
-            sql_print_information("Slave I/O thread killed during or after a \
-reconnect done to recover from failed read");
-          goto err;
-        }
         goto connected;
       } // if (event_len == packet_error)
 
Thread
bk commit into 5.1 tree (ramil:1.2524) BUG#19328ramil29 Jun