List:Commits« Previous MessageNext Message »
From:Rafal Somla Date:June 28 2007 10:37am
Subject:Re: bk commit into 5.1 tree (ramil:1.2524) BUG#19328
View as plain text  
Hi Ramil,

See my comments inlined below.

ramil@stripped wrote:
> Below is the list of changes that have just been committed into a local
> 5.1 repository of ram. When ram does a push these changes will
> be propagated to the main repository and, within 24 hours after the
> push, to the public repository.
> For information on how to access the public repository
> see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html
> 
> ChangeSet@stripped, 2007-06-28 12:38:43+05:00, ramil@stripped +1 -0
>   Fix for bug #19328: Slave timeout with COM_REGISTER_SLAVE error causing stop
>   
>   Problem: "Under high load, the slave registering to the master can timeout 
>   during the COM_REGISTER_SLAVE execution. This causes an error, which 
>   prevents the slave from connecting at all."
>   
>   Fix: Do not abort immediately, but retry registering on master.
> 
>   sql/slave.cc@stripped, 2007-06-28 12:38:42+05:00, ramil@stripped +165 -100
>     Fix for bug #19328: Slave timeout with COM_REGISTER_SLAVE error causing stop
>       - retry connecting after a failed slave registration on master.
>         Reconnecting has been factored out (see try_to_reconnect() and 
>         check_io_slave_killed()) to get rid of similar code used during 
>         registering on master/dump requesting/event reading.
>         All messages have been moved to reconnect_messages[] array 
>         for easy usage and maintenance.
>       - DBUG_FORCE_SLAVE_TO_RECONNECT() introduced; now one can force slave to
>         reconnect using 
>         ./mtr --mysqld=--loose-debug=d,FORCE_SLAVE_TO_RECONNECT_{REG, DUMP, EVENT}
> 
> diff -Nrup a/sql/slave.cc b/sql/slave.cc
> --- a/sql/slave.cc	2007-06-12 01:15:29 +05:00
> +++ b/sql/slave.cc	2007-06-28 12:38:42 +05:00
> @@ -59,6 +59,68 @@ ulonglong relay_log_space_limit = 0;
>  int disconnect_slave_event_count = 0, abort_slave_event_count = 0;
>  int events_till_abort = -1;
>  
> +#define DBUG_FORCE_SLAVE_TO_RECONNECT(var, count, msg) \

I suggest to change parameter name "msg" to "act".

> +  DBUG_EXECUTE_IF(var, \
> +      if (!count) \
> +      { \
> +        count++; \
> +        sql_print_information("Forcing to reconnect slave I/O thread"); \
> +        if (try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings, \
> +                             reconnect_messages[msg])) \

Are retry_count and suppress_warning global variables? If not, then it is not so 
good.

> +          goto err; \
> +        goto connected; \
> +      });
> +
> +enum enum_slave_reconnect_actions
> +{
> +  SLAVE_RECON_ACT_REG= 0,
> +  SLAVE_RECON_ACT_DUMP= 1,
> +  SLAVE_RECON_ACT_EVENT= 2,
> +  SLAVE_RECON_ACT_MAX
> +};
> +
> +enum enum_slave_reconnect_messages
> +{
> +  SLAVE_RECON_MSG_WAIT= 0,
> +  SLAVE_RECON_MSG_KILLED_WAITING= 1,
> +  SLAVE_RECON_MSG_AFTER= 2,
> +  SLAVE_RECON_MSG_FAILED= 3,
> +  SLAVE_RECON_MSG_COMMAND= 4,
> +  SLAVE_RECON_MSG_KILLED_AFTER= 5,
> +  SLAVE_RECON_MSG_MAX
> +};
> +
> +static const char *reconnect_messages[SLAVE_RECON_ACT_MAX][SLAVE_RECON_MSG_MAX]=
> +{
> +  {
> +    "Waiting to reconnect after a failed registration on master",
> +    "Slave I/O thread killed while waitnig to reconnect after a failed \
> +registration on master",
> +    "Reconnecting after a failed registration on master",
> +    "failed registering on master, reconnecting to try again",
> +    "COM_REGISTER_SLAVE",
> +    "Slave I/O thread killed during or after reconnect"
> +  },
> +  {
> +    "Waiting to reconnect after a failed binlog dump request",
> +    "Slave I/O thread killed while retrying master dump",
> +    "Reconnecting after a failed binlog dump request",
> +    "failed dump request, reconnecting to try again",
> +    "COM_BINLOG_DUMP",
> +    "Slave I/O thread killed during or after reconnect"
> +  },
> +  {
> +    "Waiting to reconnect after a failed master event read",
> +    "Slave I/O thread killed while waiting to reconnect after a failed read",
> +    "Reconnecting after a failed master event read",
> +    "Slave I/O thread: Failed reading log event, reconnecting to retry,",
> +    "",
> +    "Slave I/O thread killed during or after a reconnect done to recover from \
> +failed read"
> +  }
> +};
> + 
> +
>  typedef enum { SLAVE_THD_IO, SLAVE_THD_SQL} SLAVE_THD_TYPE;
>  
>  static int process_io_rotate(MASTER_INFO* mi, Rotate_log_event* rev);
> @@ -1098,12 +1160,14 @@ static void write_ignored_events_info_to
>  }
>  
>  
> -int register_slave_on_master(MYSQL* mysql, MASTER_INFO *mi)
> +int register_slave_on_master(MYSQL* mysql, MASTER_INFO *mi,
> +                             bool *suppress_warnings)
>  {
>    uchar buf[1024], *pos= buf;
>    uint report_host_len, report_user_len=0, report_password_len=0;
>    DBUG_ENTER("register_slave_on_master");
>  
> +  *suppress_warnings= FALSE;
>    if (!report_host)
>      DBUG_RETURN(0);
>    report_host_len= strlen(report_host);
> @@ -1127,11 +1191,18 @@ int register_slave_on_master(MYSQL* mysq
>  
>    if (simple_command(mysql, COM_REGISTER_SLAVE, buf, (size_t) (pos- buf), 0))
>    {
> -    char buf[256];
> -    my_snprintf(buf, sizeof(buf),
> -                "%s (Errno: %d)", mysql_error(mysql), mysql_errno(mysql));
> -    mi->report(ERROR_LEVEL, ER_SLAVE_MASTER_COM_FAILURE,
> -               ER(ER_SLAVE_MASTER_COM_FAILURE), "COM_REGISTER_SLAVE", buf);
> +    if (mysql_errno(mysql) == ER_NET_READ_INTERRUPTED)
> +    {
> +      *suppress_warnings= TRUE;                 // Suppress reconnect warning
> +    }
> +    else
> +    {
> +      char buf[256];
> +      my_snprintf(buf, sizeof(buf), "%s (Errno: %d)", mysql_error(mysql), 
> +                  mysql_errno(mysql));
> +      mi->report(ERROR_LEVEL, ER_SLAVE_MASTER_COM_FAILURE,
> +                 ER(ER_SLAVE_MASTER_COM_FAILURE), "COM_REGISTER_SLAVE", buf);
> +    }
>      DBUG_RETURN(1);
>    }
>    DBUG_RETURN(0);
> @@ -1460,6 +1531,8 @@ static int request_dump(MYSQL* mysql, MA
>    int binlog_flags = 0; // for now
>    char* logname = mi->master_log_name;
>    DBUG_ENTER("request_dump");
> +  
> +  *suppress_warnings= FALSE;
>  
>    // TODO if big log files: Change next to int8store()
>    int4store(buf, (ulong) mi->master_log_pos);
> @@ -1475,7 +1548,7 @@ static int request_dump(MYSQL* mysql, MA
>        now we just fill up the error log :-)
>      */
>      if (mysql_errno(mysql) == ER_NET_READ_INTERRUPTED)
> -      *suppress_warnings= 1;                    // Suppress reconnect warning
> +      *suppress_warnings= TRUE;                 // Suppress reconnect warning
>      else
>        sql_print_error("Error on COM_BINLOG_DUMP: %d  %s, will retry in %d secs",
>                        mysql_errno(mysql), mysql_error(mysql),
> @@ -1539,7 +1612,7 @@ static ulong read_event(MYSQL* mysql, MA
>    ulong len;
>    DBUG_ENTER("read_event");
>  
> -  *suppress_warnings= 0;
> +  *suppress_warnings= FALSE;
>    /*
>      my_real_read() will time us out
>      We check if we were told to die, and if not, try reading again
> @@ -1879,6 +1952,65 @@ on this slave.\
>  }
>  
>  
> +static bool check_io_slave_killed(THD *thd, MASTER_INFO *mi, const char *info)
> +{
> +  if (io_slave_killed(thd, mi))
> +  {
> +    if (global_system_variables.log_warnings)
> +      sql_print_information(info);
> +    return TRUE;
> +  }
> +  return FALSE;
> +}
> +
> +
> +static int try_to_reconnect(THD *thd, MYSQL *mysql, MASTER_INFO *mi,
> +                            uint *retry_count, bool suppress_warnings,
> +                            const char *messages[SLAVE_RECON_MSG_MAX])

I know I'm asking for much, but it would be nice to add short description to 
each new introduced function. Lack of documentation makes reading our code a 
nightmare...

> +{
> +  mi->slave_running= MYSQL_SLAVE_RUN_NOT_CONNECT;
> +  thd->proc_info= messages[SLAVE_RECON_MSG_WAIT];
> +#ifdef SIGNAL_WITH_VIO_CLOSE  
> +  thd->clear_active_vio();
> +#endif
> +  end_server(mysql);
> +  if ((*retry_count)++)
> +  {
> +    if (*retry_count > master_retry_count)
> +      return 1;                             // Don't retry forever
> +    safe_sleep(thd, mi->connect_retry, (CHECK_KILLED_FUNC) io_slave_killed,
> +               (void *) mi);
> +  }
> +  if (check_io_slave_killed(thd, mi, messages[SLAVE_RECON_MSG_KILLED_WAITING]))
> +    return 1;
> +  thd->proc_info = messages[SLAVE_RECON_MSG_AFTER];
> +  if (!suppress_warnings) 
> +  {
> +    char buf[256], llbuff[22];
> +    my_snprintf(buf, sizeof(buf), messages[SLAVE_RECON_MSG_FAILED], 
> +                " log '%s' at postion %s", IO_RPL_LOG_NAME,
> +                llstr(mi->master_log_pos, llbuff));

This my_snprintf() invocation is not correct. I mean, it will work but the log 
positions will be ignored. This is because messages[SLAVE_RECON_MSG_FAILED] will 
be used as format string and it contains no %... fields.

Originally it was

my_snprintf(buf, sizeof(buf),
                     "failed dump request, reconnecting to try again,"
                     " log '%s' at postion %s",
                     IO_RPL_LOG_NAME,
                     llstr(mi->master_log_pos,llbuff));

where two strings are concatenated forming single format string. In your code 
the two formats will not be joined.

> +    if (messages[SLAVE_RECON_MSG_COMMAND][0])

Why above if condition? Could you add explaining comment?

> +    {
> +      mi->report(WARNING_LEVEL, ER_SLAVE_MASTER_COM_FAILURE,
> +                 ER(ER_SLAVE_MASTER_COM_FAILURE), 
> +                 messages[SLAVE_RECON_MSG_COMMAND], buf);
> +    }
> +    else
> +    {
> +      sql_print_information(buf);
> +    }
> +  }
> +  if (safe_reconnect(thd, mysql, mi, 1) || io_slave_killed(thd, mi))

Why not check_io_slave_killed() above?

> +  {
> +    if (global_system_variables.log_warnings)
> +      sql_print_information(messages[SLAVE_RECON_MSG_KILLED_AFTER]);
> +    return 1;
> +  }
> +  return 0;
> +}
> +
> +
>  /* Slave I/O Thread entry point */
>  
>  pthread_handler_t handle_slave_io(void *arg)
> @@ -1889,7 +2021,10 @@ pthread_handler_t handle_slave_io(void *
>    RELAY_LOG_INFO *rli= &mi->rli;
>    char llbuff[22];
>    uint retry_count;
> -
> +  bool suppress_warnings;
> +#ifndef DBUG_OFF
> +  uint retry_count_reg= 0, retry_count_dump= 0, retry_count_event= 0;
> +#endif
>    // needs to call my_thread_init(), otherwise we get a coredump in DBUG_ stuff
>    my_thread_init();
>    DBUG_ENTER("handle_slave_io");
> @@ -1975,79 +2110,40 @@ connected:
>        Register ourselves with the master.
>      */
>      thd->proc_info = "Registering slave on master";
> -    if (register_slave_on_master(mysql, mi))
> +    if (register_slave_on_master(mysql, mi, &suppress_warnings))
>      {
>        sql_print_error("Slave I/O thread couldn't register on master");
> -      goto err;
> +      if (check_io_slave_killed(thd, mi, "Slave I/O thread killed while \
> +registering slave on master") ||
> +          try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings,
> +                           reconnect_messages[SLAVE_RECON_ACT_REG]))
> +        goto err;
> +      goto connected;
>      }
> +    DBUG_FORCE_SLAVE_TO_RECONNECT("FORCE_SLAVE_TO_RECONNECT_REG", 
> +                                  retry_count_reg, SLAVE_RECON_ACT_REG);
>    }
>  
>    DBUG_PRINT("info",("Starting reading binary log from master"));
>    while (!io_slave_killed(thd,mi))
>    {
> -    bool suppress_warnings= 0;
>      thd->proc_info = "Requesting binlog dump";
>      if (request_dump(mysql, mi, &suppress_warnings))
>      {
>        sql_print_error("Failed on request_dump()");
> -      if (io_slave_killed(thd,mi))
> -      {
> -        sql_print_information("Slave I/O thread killed while requesting master \
> -dump");
> -        goto err;
> -      }
> -
> -      mi->slave_running= MYSQL_SLAVE_RUN_NOT_CONNECT;
> -      thd->proc_info= "Waiting to reconnect after a failed binlog dump request";
> -#ifdef SIGNAL_WITH_VIO_CLOSE
> -      thd->clear_active_vio();
> -#endif
> -      end_server(mysql);
> -      /*
> -        First time retry immediately, assuming that we can recover
> -        right away - if first time fails, sleep between re-tries
> -        hopefuly the admin can fix the problem sometime
> -      */
> -      if (retry_count++)
> -      {
> -        if (retry_count > master_retry_count)
> -          goto err;                             // Don't retry forever
> -        safe_sleep(thd,mi->connect_retry,(CHECK_KILLED_FUNC)io_slave_killed,
> -                   (void*)mi);
> -      }
> -      if (io_slave_killed(thd,mi))
> -      {
> -        sql_print_information("Slave I/O thread killed while retrying master \
> -dump");
> -        goto err;
> -      }
> -
> -      thd->proc_info = "Reconnecting after a failed binlog dump request";
> -      if (!suppress_warnings) {
> -        char buf[256];
> -        my_snprintf(buf, sizeof(buf),
> -                    "failed dump request, reconnecting to try again,"
> -                    " log '%s' at postion %s",
> -                    IO_RPL_LOG_NAME,
> -                    llstr(mi->master_log_pos,llbuff));
> -        mi->report(WARNING_LEVEL, ER_SLAVE_MASTER_COM_FAILURE,
> -                   ER(ER_SLAVE_MASTER_COM_FAILURE), "COM_BINLOG_DUMP", buf);
> -      }
> -      if (safe_reconnect(thd, mysql, mi, suppress_warnings) ||
> -          io_slave_killed(thd,mi))
> -      {
> -        sql_print_information("Slave I/O thread killed during or \
> -after reconnect");
> +      if (check_io_slave_killed(thd, mi, "Slave I/O thread killed while \
> +requesting master dump") ||
> +          try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings,
> +                           reconnect_messages[SLAVE_RECON_ACT_DUMP]))
>          goto err;
> -      }
> -
>        goto connected;
>      }
> +    DBUG_FORCE_SLAVE_TO_RECONNECT("FORCE_SLAVE_TO_RECONNECT_DUMP", 
> +                                  retry_count_dump, SLAVE_RECON_ACT_DUMP);
>  
>      while (!io_slave_killed(thd,mi))
>      {
>        ulong event_len;
> -      suppress_warnings= 0;
>        /*
>           We say "waiting" because read_event() will wait if there's nothing to
>           read. But if there's something to read, it will not wait. The
> @@ -2056,12 +2152,11 @@ after reconnect");
>        */
>        thd->proc_info= "Waiting for master to send event";
>        event_len= read_event(mysql, mi, &suppress_warnings);
> -      if (io_slave_killed(thd,mi))
> -      {
> -        if (global_system_variables.log_warnings)
> -          sql_print_information("Slave I/O thread killed while reading event");
> +      if (check_io_slave_killed(thd, mi, "Slave I/O thread killed while \
> +reading event"))
>          goto err;
> -      }
> +      DBUG_FORCE_SLAVE_TO_RECONNECT("FORCE_SLAVE_TO_RECONNECT_EVENT", 
> +                                    retry_count_event, SLAVE_RECON_ACT_EVENT);
>  
>        if (event_len == packet_error)
>        {
> @@ -2081,39 +2176,9 @@ max_allowed_packet",
>                            mysql_error(mysql));
>            goto err;
>          }
> -        mi->slave_running= MYSQL_SLAVE_RUN_NOT_CONNECT;
> -        thd->proc_info = "Waiting to reconnect after a failed master event
> read";
> -#ifdef SIGNAL_WITH_VIO_CLOSE
> -        thd->clear_active_vio();
> -#endif
> -        end_server(mysql);
> -        if (retry_count++)
> -        {
> -          if (retry_count > master_retry_count)
> -            goto err;                           // Don't retry forever
> -          safe_sleep(thd,mi->connect_retry,(CHECK_KILLED_FUNC)io_slave_killed,
> -                     (void*) mi);
> -        }
> -        if (io_slave_killed(thd,mi))
> -        {
> -          if (global_system_variables.log_warnings)
> -            sql_print_information("Slave I/O thread killed while waiting to \
> -reconnect after a failed read");
> -          goto err;
> -        }
> -        thd->proc_info = "Reconnecting after a failed master event read";
> -        if (!suppress_warnings)
> -          sql_print_information("Slave I/O thread: Failed reading log event, \
> -reconnecting to retry, log '%s' position %s", IO_RPL_LOG_NAME,
> -                          llstr(mi->master_log_pos, llbuff));
> -        if (safe_reconnect(thd, mysql, mi, suppress_warnings) ||
> -            io_slave_killed(thd,mi))
> -        {
> -          if (global_system_variables.log_warnings)
> -            sql_print_information("Slave I/O thread killed during or after a \
> -reconnect done to recover from failed read");
> +        if (try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings,
> +                             reconnect_messages[SLAVE_RECON_ACT_EVENT]))
>            goto err;
> -        }
>          goto connected;
>        } // if (event_len == packet_error)
>  
> 

Rafal
Thread
bk commit into 5.1 tree (ramil:1.2524) BUG#19328ramil28 Jun
  • Re: bk commit into 5.1 tree (ramil:1.2524) BUG#19328Rafal Somla28 Jun
    • Re: bk commit into 5.1 tree (ramil:1.2524) BUG#19328Ramil Kalimullin28 Jun