Hi Ramil,
See my comments inlined below.
ramil@stripped wrote:
> Below is the list of changes that have just been committed into a local
> 5.1 repository of ram. When ram does a push these changes will
> be propagated to the main repository and, within 24 hours after the
> push, to the public repository.
> For information on how to access the public repository
> see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html
>
> ChangeSet@stripped, 2007-06-28 12:38:43+05:00, ramil@stripped +1 -0
> Fix for bug #19328: Slave timeout with COM_REGISTER_SLAVE error causing stop
>
> Problem: "Under high load, the slave registering to the master can timeout
> during the COM_REGISTER_SLAVE execution. This causes an error, which
> prevents the slave from connecting at all."
>
> Fix: Do not abort immediately, but retry registering on master.
>
> sql/slave.cc@stripped, 2007-06-28 12:38:42+05:00, ramil@stripped +165 -100
> Fix for bug #19328: Slave timeout with COM_REGISTER_SLAVE error causing stop
> - retry connecting after a failed slave registration on master.
> Reconnecting has been factored out (see try_to_reconnect() and
> check_io_slave_killed()) to get rid of similar code used during
> registering on master/dump requesting/event reading.
> All messages have been moved to reconnect_messages[] array
> for easy usage and maintenance.
> - DBUG_FORCE_SLAVE_TO_RECONNECT() introduced; now one can force slave to
> reconnect using
> ./mtr --mysqld=--loose-debug=d,FORCE_SLAVE_TO_RECONNECT_{REG, DUMP, EVENT}
>
> diff -Nrup a/sql/slave.cc b/sql/slave.cc
> --- a/sql/slave.cc 2007-06-12 01:15:29 +05:00
> +++ b/sql/slave.cc 2007-06-28 12:38:42 +05:00
> @@ -59,6 +59,68 @@ ulonglong relay_log_space_limit = 0;
> int disconnect_slave_event_count = 0, abort_slave_event_count = 0;
> int events_till_abort = -1;
>
> +#define DBUG_FORCE_SLAVE_TO_RECONNECT(var, count, msg) \
I suggest to change parameter name "msg" to "act".
> + DBUG_EXECUTE_IF(var, \
> + if (!count) \
> + { \
> + count++; \
> + sql_print_information("Forcing to reconnect slave I/O thread"); \
> + if (try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings, \
> + reconnect_messages[msg])) \
Are retry_count and suppress_warning global variables? If not, then it is not so
good.
> + goto err; \
> + goto connected; \
> + });
> +
> +enum enum_slave_reconnect_actions
> +{
> + SLAVE_RECON_ACT_REG= 0,
> + SLAVE_RECON_ACT_DUMP= 1,
> + SLAVE_RECON_ACT_EVENT= 2,
> + SLAVE_RECON_ACT_MAX
> +};
> +
> +enum enum_slave_reconnect_messages
> +{
> + SLAVE_RECON_MSG_WAIT= 0,
> + SLAVE_RECON_MSG_KILLED_WAITING= 1,
> + SLAVE_RECON_MSG_AFTER= 2,
> + SLAVE_RECON_MSG_FAILED= 3,
> + SLAVE_RECON_MSG_COMMAND= 4,
> + SLAVE_RECON_MSG_KILLED_AFTER= 5,
> + SLAVE_RECON_MSG_MAX
> +};
> +
> +static const char *reconnect_messages[SLAVE_RECON_ACT_MAX][SLAVE_RECON_MSG_MAX]=
> +{
> + {
> + "Waiting to reconnect after a failed registration on master",
> + "Slave I/O thread killed while waitnig to reconnect after a failed \
> +registration on master",
> + "Reconnecting after a failed registration on master",
> + "failed registering on master, reconnecting to try again",
> + "COM_REGISTER_SLAVE",
> + "Slave I/O thread killed during or after reconnect"
> + },
> + {
> + "Waiting to reconnect after a failed binlog dump request",
> + "Slave I/O thread killed while retrying master dump",
> + "Reconnecting after a failed binlog dump request",
> + "failed dump request, reconnecting to try again",
> + "COM_BINLOG_DUMP",
> + "Slave I/O thread killed during or after reconnect"
> + },
> + {
> + "Waiting to reconnect after a failed master event read",
> + "Slave I/O thread killed while waiting to reconnect after a failed read",
> + "Reconnecting after a failed master event read",
> + "Slave I/O thread: Failed reading log event, reconnecting to retry,",
> + "",
> + "Slave I/O thread killed during or after a reconnect done to recover from \
> +failed read"
> + }
> +};
> +
> +
> typedef enum { SLAVE_THD_IO, SLAVE_THD_SQL} SLAVE_THD_TYPE;
>
> static int process_io_rotate(MASTER_INFO* mi, Rotate_log_event* rev);
> @@ -1098,12 +1160,14 @@ static void write_ignored_events_info_to
> }
>
>
> -int register_slave_on_master(MYSQL* mysql, MASTER_INFO *mi)
> +int register_slave_on_master(MYSQL* mysql, MASTER_INFO *mi,
> + bool *suppress_warnings)
> {
> uchar buf[1024], *pos= buf;
> uint report_host_len, report_user_len=0, report_password_len=0;
> DBUG_ENTER("register_slave_on_master");
>
> + *suppress_warnings= FALSE;
> if (!report_host)
> DBUG_RETURN(0);
> report_host_len= strlen(report_host);
> @@ -1127,11 +1191,18 @@ int register_slave_on_master(MYSQL* mysq
>
> if (simple_command(mysql, COM_REGISTER_SLAVE, buf, (size_t) (pos- buf), 0))
> {
> - char buf[256];
> - my_snprintf(buf, sizeof(buf),
> - "%s (Errno: %d)", mysql_error(mysql), mysql_errno(mysql));
> - mi->report(ERROR_LEVEL, ER_SLAVE_MASTER_COM_FAILURE,
> - ER(ER_SLAVE_MASTER_COM_FAILURE), "COM_REGISTER_SLAVE", buf);
> + if (mysql_errno(mysql) == ER_NET_READ_INTERRUPTED)
> + {
> + *suppress_warnings= TRUE; // Suppress reconnect warning
> + }
> + else
> + {
> + char buf[256];
> + my_snprintf(buf, sizeof(buf), "%s (Errno: %d)", mysql_error(mysql),
> + mysql_errno(mysql));
> + mi->report(ERROR_LEVEL, ER_SLAVE_MASTER_COM_FAILURE,
> + ER(ER_SLAVE_MASTER_COM_FAILURE), "COM_REGISTER_SLAVE", buf);
> + }
> DBUG_RETURN(1);
> }
> DBUG_RETURN(0);
> @@ -1460,6 +1531,8 @@ static int request_dump(MYSQL* mysql, MA
> int binlog_flags = 0; // for now
> char* logname = mi->master_log_name;
> DBUG_ENTER("request_dump");
> +
> + *suppress_warnings= FALSE;
>
> // TODO if big log files: Change next to int8store()
> int4store(buf, (ulong) mi->master_log_pos);
> @@ -1475,7 +1548,7 @@ static int request_dump(MYSQL* mysql, MA
> now we just fill up the error log :-)
> */
> if (mysql_errno(mysql) == ER_NET_READ_INTERRUPTED)
> - *suppress_warnings= 1; // Suppress reconnect warning
> + *suppress_warnings= TRUE; // Suppress reconnect warning
> else
> sql_print_error("Error on COM_BINLOG_DUMP: %d %s, will retry in %d secs",
> mysql_errno(mysql), mysql_error(mysql),
> @@ -1539,7 +1612,7 @@ static ulong read_event(MYSQL* mysql, MA
> ulong len;
> DBUG_ENTER("read_event");
>
> - *suppress_warnings= 0;
> + *suppress_warnings= FALSE;
> /*
> my_real_read() will time us out
> We check if we were told to die, and if not, try reading again
> @@ -1879,6 +1952,65 @@ on this slave.\
> }
>
>
> +static bool check_io_slave_killed(THD *thd, MASTER_INFO *mi, const char *info)
> +{
> + if (io_slave_killed(thd, mi))
> + {
> + if (global_system_variables.log_warnings)
> + sql_print_information(info);
> + return TRUE;
> + }
> + return FALSE;
> +}
> +
> +
> +static int try_to_reconnect(THD *thd, MYSQL *mysql, MASTER_INFO *mi,
> + uint *retry_count, bool suppress_warnings,
> + const char *messages[SLAVE_RECON_MSG_MAX])
I know I'm asking for much, but it would be nice to add short description to
each new introduced function. Lack of documentation makes reading our code a
nightmare...
> +{
> + mi->slave_running= MYSQL_SLAVE_RUN_NOT_CONNECT;
> + thd->proc_info= messages[SLAVE_RECON_MSG_WAIT];
> +#ifdef SIGNAL_WITH_VIO_CLOSE
> + thd->clear_active_vio();
> +#endif
> + end_server(mysql);
> + if ((*retry_count)++)
> + {
> + if (*retry_count > master_retry_count)
> + return 1; // Don't retry forever
> + safe_sleep(thd, mi->connect_retry, (CHECK_KILLED_FUNC) io_slave_killed,
> + (void *) mi);
> + }
> + if (check_io_slave_killed(thd, mi, messages[SLAVE_RECON_MSG_KILLED_WAITING]))
> + return 1;
> + thd->proc_info = messages[SLAVE_RECON_MSG_AFTER];
> + if (!suppress_warnings)
> + {
> + char buf[256], llbuff[22];
> + my_snprintf(buf, sizeof(buf), messages[SLAVE_RECON_MSG_FAILED],
> + " log '%s' at postion %s", IO_RPL_LOG_NAME,
> + llstr(mi->master_log_pos, llbuff));
This my_snprintf() invocation is not correct. I mean, it will work but the log
positions will be ignored. This is because messages[SLAVE_RECON_MSG_FAILED] will
be used as format string and it contains no %... fields.
Originally it was
my_snprintf(buf, sizeof(buf),
"failed dump request, reconnecting to try again,"
" log '%s' at postion %s",
IO_RPL_LOG_NAME,
llstr(mi->master_log_pos,llbuff));
where two strings are concatenated forming single format string. In your code
the two formats will not be joined.
> + if (messages[SLAVE_RECON_MSG_COMMAND][0])
Why above if condition? Could you add explaining comment?
> + {
> + mi->report(WARNING_LEVEL, ER_SLAVE_MASTER_COM_FAILURE,
> + ER(ER_SLAVE_MASTER_COM_FAILURE),
> + messages[SLAVE_RECON_MSG_COMMAND], buf);
> + }
> + else
> + {
> + sql_print_information(buf);
> + }
> + }
> + if (safe_reconnect(thd, mysql, mi, 1) || io_slave_killed(thd, mi))
Why not check_io_slave_killed() above?
> + {
> + if (global_system_variables.log_warnings)
> + sql_print_information(messages[SLAVE_RECON_MSG_KILLED_AFTER]);
> + return 1;
> + }
> + return 0;
> +}
> +
> +
> /* Slave I/O Thread entry point */
>
> pthread_handler_t handle_slave_io(void *arg)
> @@ -1889,7 +2021,10 @@ pthread_handler_t handle_slave_io(void *
> RELAY_LOG_INFO *rli= &mi->rli;
> char llbuff[22];
> uint retry_count;
> -
> + bool suppress_warnings;
> +#ifndef DBUG_OFF
> + uint retry_count_reg= 0, retry_count_dump= 0, retry_count_event= 0;
> +#endif
> // needs to call my_thread_init(), otherwise we get a coredump in DBUG_ stuff
> my_thread_init();
> DBUG_ENTER("handle_slave_io");
> @@ -1975,79 +2110,40 @@ connected:
> Register ourselves with the master.
> */
> thd->proc_info = "Registering slave on master";
> - if (register_slave_on_master(mysql, mi))
> + if (register_slave_on_master(mysql, mi, &suppress_warnings))
> {
> sql_print_error("Slave I/O thread couldn't register on master");
> - goto err;
> + if (check_io_slave_killed(thd, mi, "Slave I/O thread killed while \
> +registering slave on master") ||
> + try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings,
> + reconnect_messages[SLAVE_RECON_ACT_REG]))
> + goto err;
> + goto connected;
> }
> + DBUG_FORCE_SLAVE_TO_RECONNECT("FORCE_SLAVE_TO_RECONNECT_REG",
> + retry_count_reg, SLAVE_RECON_ACT_REG);
> }
>
> DBUG_PRINT("info",("Starting reading binary log from master"));
> while (!io_slave_killed(thd,mi))
> {
> - bool suppress_warnings= 0;
> thd->proc_info = "Requesting binlog dump";
> if (request_dump(mysql, mi, &suppress_warnings))
> {
> sql_print_error("Failed on request_dump()");
> - if (io_slave_killed(thd,mi))
> - {
> - sql_print_information("Slave I/O thread killed while requesting master \
> -dump");
> - goto err;
> - }
> -
> - mi->slave_running= MYSQL_SLAVE_RUN_NOT_CONNECT;
> - thd->proc_info= "Waiting to reconnect after a failed binlog dump request";
> -#ifdef SIGNAL_WITH_VIO_CLOSE
> - thd->clear_active_vio();
> -#endif
> - end_server(mysql);
> - /*
> - First time retry immediately, assuming that we can recover
> - right away - if first time fails, sleep between re-tries
> - hopefuly the admin can fix the problem sometime
> - */
> - if (retry_count++)
> - {
> - if (retry_count > master_retry_count)
> - goto err; // Don't retry forever
> - safe_sleep(thd,mi->connect_retry,(CHECK_KILLED_FUNC)io_slave_killed,
> - (void*)mi);
> - }
> - if (io_slave_killed(thd,mi))
> - {
> - sql_print_information("Slave I/O thread killed while retrying master \
> -dump");
> - goto err;
> - }
> -
> - thd->proc_info = "Reconnecting after a failed binlog dump request";
> - if (!suppress_warnings) {
> - char buf[256];
> - my_snprintf(buf, sizeof(buf),
> - "failed dump request, reconnecting to try again,"
> - " log '%s' at postion %s",
> - IO_RPL_LOG_NAME,
> - llstr(mi->master_log_pos,llbuff));
> - mi->report(WARNING_LEVEL, ER_SLAVE_MASTER_COM_FAILURE,
> - ER(ER_SLAVE_MASTER_COM_FAILURE), "COM_BINLOG_DUMP", buf);
> - }
> - if (safe_reconnect(thd, mysql, mi, suppress_warnings) ||
> - io_slave_killed(thd,mi))
> - {
> - sql_print_information("Slave I/O thread killed during or \
> -after reconnect");
> + if (check_io_slave_killed(thd, mi, "Slave I/O thread killed while \
> +requesting master dump") ||
> + try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings,
> + reconnect_messages[SLAVE_RECON_ACT_DUMP]))
> goto err;
> - }
> -
> goto connected;
> }
> + DBUG_FORCE_SLAVE_TO_RECONNECT("FORCE_SLAVE_TO_RECONNECT_DUMP",
> + retry_count_dump, SLAVE_RECON_ACT_DUMP);
>
> while (!io_slave_killed(thd,mi))
> {
> ulong event_len;
> - suppress_warnings= 0;
> /*
> We say "waiting" because read_event() will wait if there's nothing to
> read. But if there's something to read, it will not wait. The
> @@ -2056,12 +2152,11 @@ after reconnect");
> */
> thd->proc_info= "Waiting for master to send event";
> event_len= read_event(mysql, mi, &suppress_warnings);
> - if (io_slave_killed(thd,mi))
> - {
> - if (global_system_variables.log_warnings)
> - sql_print_information("Slave I/O thread killed while reading event");
> + if (check_io_slave_killed(thd, mi, "Slave I/O thread killed while \
> +reading event"))
> goto err;
> - }
> + DBUG_FORCE_SLAVE_TO_RECONNECT("FORCE_SLAVE_TO_RECONNECT_EVENT",
> + retry_count_event, SLAVE_RECON_ACT_EVENT);
>
> if (event_len == packet_error)
> {
> @@ -2081,39 +2176,9 @@ max_allowed_packet",
> mysql_error(mysql));
> goto err;
> }
> - mi->slave_running= MYSQL_SLAVE_RUN_NOT_CONNECT;
> - thd->proc_info = "Waiting to reconnect after a failed master event
> read";
> -#ifdef SIGNAL_WITH_VIO_CLOSE
> - thd->clear_active_vio();
> -#endif
> - end_server(mysql);
> - if (retry_count++)
> - {
> - if (retry_count > master_retry_count)
> - goto err; // Don't retry forever
> - safe_sleep(thd,mi->connect_retry,(CHECK_KILLED_FUNC)io_slave_killed,
> - (void*) mi);
> - }
> - if (io_slave_killed(thd,mi))
> - {
> - if (global_system_variables.log_warnings)
> - sql_print_information("Slave I/O thread killed while waiting to \
> -reconnect after a failed read");
> - goto err;
> - }
> - thd->proc_info = "Reconnecting after a failed master event read";
> - if (!suppress_warnings)
> - sql_print_information("Slave I/O thread: Failed reading log event, \
> -reconnecting to retry, log '%s' position %s", IO_RPL_LOG_NAME,
> - llstr(mi->master_log_pos, llbuff));
> - if (safe_reconnect(thd, mysql, mi, suppress_warnings) ||
> - io_slave_killed(thd,mi))
> - {
> - if (global_system_variables.log_warnings)
> - sql_print_information("Slave I/O thread killed during or after a \
> -reconnect done to recover from failed read");
> + if (try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings,
> + reconnect_messages[SLAVE_RECON_ACT_EVENT]))
> goto err;
> - }
> goto connected;
> } // if (event_len == packet_error)
>
>
Rafal