#At file:///home/andrei/MySQL/BZR/FIXES/az-bf-bug36524-dealock_overreaction/ based on
revid:jon.hauglid@stripped
2801 Andrei Elkin 2009-06-29
Bug #36524 incident of deadlock on slave is overdramatized
In cases of temporary errors in replication event execution (deadlock, timeout)
the error log gained an overreacting error message whereas just a warning would be
fine.
Fixed with checking of the temporary status of the error
inside Slave_reporting_capability::report() to demote the error to the warning level
if
needed.
The warning gets converted into the error whenever number of successive attempts to
re-apply the event(s) gets equal to the global server variable slave_trans_retries.
The patch addresses some BUG#25597 issues also to indicate the demotion can not
create any loose in logics as long as the developer sticks to stardard tools of
reporting and displaying errors:
report() mimics my_error() and display() does so to send_error().
Two test file results advertise a new pattern of report()/display().
modified:
mysql-test/suite/rpl/r/rpl_extraCol_myisam.result
mysql-test/suite/rpl/r/rpl_row_conflicts.result
sql/log_event.cc
sql/rpl_reporting.cc
sql/rpl_reporting.h
sql/slave.cc
sql/slave.h
per-file messages:
mysql-test/suite/rpl/r/rpl_extraCol_myisam.result
results updated.
mysql-test/suite/rpl/r/rpl_row_conflicts.result
results updated.
sql/log_event.cc
treating rli->report() as an error reporting similar to my_error().
Gathering errors in the diag area should be completed with displaying
via rli->display() each time a slave thread breaks the applying
loop.
sql/rpl_reporting.cc
report() method is turned to be a wrapper of functions dealing with the diag area
like my_error() and ones with the error log.
One of its use case to register an error and print it out the error log.
A new display() method "transfers" the gathered errors the area of the Show Slave
Status
when a slave thread is about to exit.
sql/rpl_reporting.h
a new display() method added as well as comments for report().
sql/slave.cc
defeating a compilation warning;
exporting has_temporary_error() to be used in log_event.cc;
moving definition of clear_all_errors() into slave.cc to use in both modules;
run-out-of-tries case is finished with raising on the fatal error flag;
finalizing the error branch of events execution with display() to
make gathered error be saved in show-slave-status area;
deploying display() at the end of io-thread and sql-thread.
sql/slave.h
exporting functions shared with slave.cc and log_event.cc.
=== modified file 'mysql-test/suite/rpl/r/rpl_extraCol_myisam.result'
--- a/mysql-test/suite/rpl/r/rpl_extraCol_myisam.result 2009-04-08 13:09:37 +0000
+++ b/mysql-test/suite/rpl/r/rpl_extraCol_myisam.result 2009-06-29 10:47:20 +0000
@@ -270,7 +270,7 @@ Replicate_Ignore_Table #
Replicate_Wild_Do_Table
Replicate_Wild_Ignore_Table
Last_Errno 1535
-Last_Error Table definition on master and slave does not match: Column 5 type mismatch -
received type 4, test.t5 has type 246
+Last_Error Table definition on master and slave does not match: Column 1 type mismatch -
received type 15, test.t5 has type 254, Error_code: 1535; Table definition on master and
slave does not match: Column 2 type mismatch - received type 246, test.t5 has type 4,
Error_code: 1535; Table definition on master and slave does not match: Column 3 type
mismatch - received type 16, test.t5 has type 3, Error_code: 1535; Table definition on
master and slave does not match: Column 4 type mismatch - received type 252, test.t5 has
type 5, Error_code: 1535; Table definition on master and slave does not match: Column 5
type mismatch - received type 4, test.t5 has type 246
Skip_Counter 0
Exec_Master_Log_Pos #
Relay_Log_Space #
@@ -288,7 +288,7 @@ Master_SSL_Verify_Server_Cert No
Last_IO_Errno #
Last_IO_Error #
Last_SQL_Errno 1535
-Last_SQL_Error Table definition on master and slave does not match: Column 5 type
mismatch - received type 4, test.t5 has type 246
+Last_SQL_Error Table definition on master and slave does not match: Column 1 type
mismatch - received type 15, test.t5 has type 254, Error_code: 1535; Table definition on
master and slave does not match: Column 2 type mismatch - received type 246, test.t5 has
type 4, Error_code: 1535; Table definition on master and slave does not match: Column 3
type mismatch - received type 16, test.t5 has type 3, Error_code: 1535; Table definition
on master and slave does not match: Column 4 type mismatch - received type 252, test.t5
has type 5, Error_code: 1535; Table definition on master and slave does not match: Column
5 type mismatch - received type 4, test.t5 has type 246
Replicate_Ignore_Server_Ids
Master_Server_Id 1
SET GLOBAL SQL_SLAVE_SKIP_COUNTER=2;
@@ -333,7 +333,7 @@ Replicate_Ignore_Table #
Replicate_Wild_Do_Table
Replicate_Wild_Ignore_Table
Last_Errno 1535
-Last_Error Table definition on master and slave does not match: Column 3 type mismatch -
received type 16, test.t6 has type 3
+Last_Error Table definition on master and slave does not match: Column 1 type mismatch -
received type 15, test.t6 has type 254, Error_code: 1535; Table definition on master and
slave does not match: Column 2 type mismatch - received type 246, test.t6 has type 4,
Error_code: 1535; Table definition on master and slave does not match: Column 3 type
mismatch - received type 16, test.t6 has type 3
Skip_Counter 0
Exec_Master_Log_Pos #
Relay_Log_Space #
@@ -351,7 +351,7 @@ Master_SSL_Verify_Server_Cert No
Last_IO_Errno #
Last_IO_Error #
Last_SQL_Errno 1535
-Last_SQL_Error Table definition on master and slave does not match: Column 3 type
mismatch - received type 16, test.t6 has type 3
+Last_SQL_Error Table definition on master and slave does not match: Column 1 type
mismatch - received type 15, test.t6 has type 254, Error_code: 1535; Table definition on
master and slave does not match: Column 2 type mismatch - received type 246, test.t6 has
type 4, Error_code: 1535; Table definition on master and slave does not match: Column 3
type mismatch - received type 16, test.t6 has type 3
Replicate_Ignore_Server_Ids
Master_Server_Id 1
SET GLOBAL SQL_SLAVE_SKIP_COUNTER=3;
@@ -786,7 +786,7 @@ Replicate_Ignore_Table #
Replicate_Wild_Do_Table
Replicate_Wild_Ignore_Table
Last_Errno 1060
-Last_Error Error 'Duplicate column name 'c6'' on query. Default database: 'test'. Query:
'ALTER TABLE t15 ADD COLUMN c6 INT AFTER c5'
+Last_Error Duplicate column name 'c6', Error_code: 1060; Error 'Duplicate column name
'c6'' on query. Default database: 'test'. Query: 'ALTER TABLE t15 ADD COLUMN c6 INT AFTER
c5'
Skip_Counter 0
Exec_Master_Log_Pos #
Relay_Log_Space #
@@ -804,7 +804,7 @@ Master_SSL_Verify_Server_Cert No
Last_IO_Errno #
Last_IO_Error #
Last_SQL_Errno 1060
-Last_SQL_Error Error 'Duplicate column name 'c6'' on query. Default database: 'test'.
Query: 'ALTER TABLE t15 ADD COLUMN c6 INT AFTER c5'
+Last_SQL_Error Duplicate column name 'c6', Error_code: 1060; Error 'Duplicate column name
'c6'' on query. Default database: 'test'. Query: 'ALTER TABLE t15 ADD COLUMN c6 INT AFTER
c5'
Replicate_Ignore_Server_Ids
Master_Server_Id 1
SET GLOBAL SQL_SLAVE_SKIP_COUNTER=1;
=== modified file 'mysql-test/suite/rpl/r/rpl_row_conflicts.result'
--- a/mysql-test/suite/rpl/r/rpl_row_conflicts.result 2009-01-09 15:26:52 +0000
+++ b/mysql-test/suite/rpl/r/rpl_row_conflicts.result 2009-06-29 10:47:20 +0000
@@ -24,7 +24,7 @@ a
1
[on slave]
---- Wait until slave stops with an error ----
-Last_SQL_Error = Could not execute Write_rows event on table test.t1; Duplicate entry '1'
for key 'PRIMARY', Error_code: 1062; handler error HA_ERR_FOUND_DUPP_KEY; the event's
master log master-bin.000001, end_log_pos 347 (expected "duplicate key" error)
+Last_SQL_Error = Duplicate entry '1' for key 'PRIMARY', Error_code: 1062; Could not
execute Write_rows event on table test.t1;handler error HA_ERR_FOUND_DUPP_KEY; the
event's master log master-bin.000001, end_log_pos 347 (expected "duplicate key" error)
SELECT * FROM t1;
a
1
@@ -50,7 +50,7 @@ SELECT * FROM t1;
a
[on slave]
---- Wait until slave stops with an error ----
-Last_SQL_Error = Could not execute Delete_rows event on table test.t1; Can't find record
in 't1', Error_code: 1032; handler error HA_ERR_KEY_NOT_FOUND; the event's master log
master-bin.000001, end_log_pos 983 (expected "can't find record" error)
+Last_SQL_Error = Can't find record in 't1', Error_code: 1032; Could not execute
Delete_rows event on table test.t1;handler error HA_ERR_KEY_NOT_FOUND; the event's master
log master-bin.000001, end_log_pos 983 (expected "can't find record" error)
SELECT * FROM t1;
a
---- Resolve the conflict on the slave and restart SQL thread ----
=== modified file 'sql/log_event.cc'
--- a/sql/log_event.cc 2009-05-13 16:04:01 +0000
+++ b/sql/log_event.cc 2009-06-29 10:47:20 +0000
@@ -132,28 +132,13 @@ static void inline slave_rows_error_repo
const char *log_name, ulong pos)
{
const char *handler_error= HA_ERR(ha_error);
- char buff[MAX_SLAVE_ERRMSG], *slider;
- const char *buff_end= buff + sizeof(buff);
- uint len;
- List_iterator_fast<MYSQL_ERROR> it(thd->warning_info->warn_list());
- MYSQL_ERROR *err;
- buff[0]= 0;
-
- for (err= it++, slider= buff; err && slider < buff_end - 1;
- slider += len, err= it++)
- {
- len= my_snprintf(slider, buff_end - slider,
- " %s, Error_code: %d;", err->get_message_text(),
- err->get_sql_errno());
- }
rli->report(level, thd->is_error()? thd->stmt_da->sql_errno() : 0,
"Could not execute %s event on table %s.%s;"
- "%s handler error %s; "
+ "handler error %s; "
"the event's master log %s, end_log_pos %lu",
type, table->s->db.str,
table->s->table_name.str,
- buff,
handler_error == NULL? "<unknown>" : handler_error,
log_name, pos);
}
@@ -272,13 +257,6 @@ static void pretty_print_str(IO_CACHE* c
#if defined(HAVE_REPLICATION) && !defined(MYSQL_CLIENT)
-static void clear_all_errors(THD *thd, Relay_log_info *rli)
-{
- thd->is_slave_error = 0;
- thd->clear_error();
- rli->clear_error();
-}
-
inline int idempotent_error_code(int err_code)
{
int ret= 0;
@@ -3177,7 +3155,7 @@ compare_errors:
!ignored_error_code(actual_error) &&
!ignored_error_code(expected_error))
{
- rli->report(ERROR_LEVEL, 0,
+ rli->report(ERROR_LEVEL, ER_UNKNOWN_ERROR,
"\
Query caused different errors on master and slave. \
Error on master: '%s' (%d), Error on slave: '%s' (%d). \
=== modified file 'sql/rpl_reporting.cc'
--- a/sql/rpl_reporting.cc 2009-03-31 11:16:20 +0000
+++ b/sql/rpl_reporting.cc 2009-06-29 10:47:20 +0000
@@ -2,53 +2,149 @@
#include "mysql_priv.h"
#include "rpl_reporting.h"
+/**
+ The method treats a supplied message depending on the level:
+
+ - an error is queued in the diagnostic area
+ to be recorded eventually in Show Slave status area and
+ in the error log.
+ - a warning or an info level are optinally (log_warnings)
+ stored into the error log
+
+ @param level severity of the message.
+ @param err_code the user level error code
+ @param msg the format string of the message,
+ can be followed with more arguments
+ providing the data for the format string
+
+ @note Queueing in the da is protected by @err_lock mutex
+ to collabote with a concurrent reader from the da.
+*/
void
Slave_reporting_capability::report(loglevel level, int err_code,
const char *msg, ...) const
{
+ THD *thd= current_thd;
void (*report_function)(const char *, ...);
- char buff[MAX_SLAVE_ERRMSG];
+ char buff[sizeof(m_last_error.message)];
char *pbuff= buff;
- uint pbuffsize= sizeof(buff);
va_list args;
+
+ if (level == ERROR_LEVEL && has_temporary_error(thd))
+ level= WARNING_LEVEL;
+
va_start(args, msg);
+ my_vsnprintf(pbuff, sizeof(buff), msg, args);
+ va_end(args);
pthread_mutex_lock(&err_lock);
switch (level)
{
case ERROR_LEVEL:
/*
- It's an error, it must be reported in Last_error and Last_errno in SHOW
- SLAVE STATUS.
+ It's an error, it goes in a slave thread's main_da.
*/
- pbuff= m_last_error.message;
- pbuffsize= sizeof(m_last_error.message);
- m_last_error.number = err_code;
+ if (err_code > 0)
+ {
+ if (thd->stmt_da->is_set())
+ thd->stmt_da->can_overwrite_status= TRUE;
+ my_printf_error(err_code, "%s", MYF(0), pbuff);
+ if (thd->stmt_da->can_overwrite_status)
+ thd->stmt_da->can_overwrite_status= FALSE;
+ }
report_function= sql_print_error;
break;
case WARNING_LEVEL:
- report_function= sql_print_warning;
+ report_function= global_system_variables.log_warnings?
+ sql_print_warning : NULL;
break;
case INFORMATION_LEVEL:
- report_function= sql_print_information;
+ report_function= global_system_variables.log_warnings?
+ sql_print_information : NULL;
break;
default:
DBUG_ASSERT(0); // should not come here
return; // don't crash production builds, just do nothing
}
+ pthread_mutex_unlock(&err_lock);
- my_vsnprintf(pbuff, pbuffsize, msg, args);
+ if (report_function)
+ report_function("Slave %s: %s%s Error_code: %d", m_thread_name, pbuff,
+ (pbuff[0] && *(strend(pbuff)-1) == '.') ?
+ "" : ",", err_code);
+}
- pthread_mutex_unlock(&err_lock);
+/**
+ compose Last_Error to be shown by SHOW SLAVE STATUS
+
+ @param err_code The summary error code to display. If zero
+ the last reported default will be set to Last*Errno
+
+ @param msg The summary message's format string optinally followed
+ by arguments. It can be NULL.
+
+ @note The method tries gathering all queued *error-level* messages
+ into the limited-size buffer and may not compose the whole
+ list. In that case the rest can be found anyway in the error log.
+ The Last_Error's code corresponds to the last queued error if
+ the supplied @c err_code is zero.
+
+ The gathering procedure is protected by @c err_lock
+ to provide collaboration with a concurrent reader such as
+ show slave status.
+*/
+void
+Slave_reporting_capability::display(int err_code, const char *msg, ...) const
+{
+ uint len, size= sizeof(m_last_error.message);
+ char *slider, *buff= m_last_error.message, *buff_end= buff + size;
+ List_iterator_fast<MYSQL_ERROR> it(current_thd->warning_info->warn_list());
+ MYSQL_ERROR *err;
+ uint32 err_number;
+ const char err_code_fmt[]= ", Error_code: %d; ";
+ va_list args;
+
+ va_start(args, msg);
+ len= msg == NULL? 0 : my_vsnprintf(buff, size, msg, args);
va_end(args);
- /* If the msg string ends with '.', do not add a ',' it would be ugly */
- report_function("Slave %s: %s%s Error_code: %d",
- m_thread_name, pbuff,
- (pbuff[0] && *(strend(pbuff)-1) == '.') ? "" : ",",
- err_code);
+ pthread_mutex_lock(&err_lock);
+
+ /*
+ In case of multiple errors the error codes are printed along with msgs ...
+ */
+ for (err= it++,
+ err_number= 0,
+ slider= buff + len;
+ err;
+ slider += len, err= it++)
+ {
+ if (err->get_level() != MYSQL_ERROR::WARN_LEVEL_ERROR)
+ continue;
+ if (err_number != 0)
+ {
+ len= my_snprintf(slider, buff_end - slider, err_code_fmt, err_number);
+ slider += len;
+ }
+ err_number= err->get_sql_errno(); // Last*Errno is of the error level
+ if (slider < buff_end)
+ len= my_snprintf(slider, buff_end - slider,"%s", err->get_message_text());
+ }
+ m_last_error.number= err_code == 0? err_number : err_code;
+ /*
+ ... except for the last error if its code is equal to @c err_code
+ the supplied summary's.
+ */
+ if ((err_number != 0 && err_number != m_last_error.number) &&
+ slider < buff_end)
+ my_snprintf(slider, buff_end - slider, err_code_fmt, err_number);
+
+ pthread_mutex_unlock(&err_lock);
+
}
+
+
Slave_reporting_capability::~Slave_reporting_capability()
{
pthread_mutex_destroy(&err_lock);
=== modified file 'sql/rpl_reporting.h'
--- a/sql/rpl_reporting.h 2009-03-31 11:16:20 +0000
+++ b/sql/rpl_reporting.h 2009-06-29 10:47:20 +0000
@@ -30,8 +30,7 @@ public:
}
/**
- Writes a message and, if it's an error message, to Last_Error
- (which will be displayed by SHOW SLAVE STATUS).
+ Writes a message into diag area and/or/either the error log
@param level The severity level
@param err_code The error code
@@ -43,6 +42,13 @@ public:
ATTRIBUTE_FORMAT(printf, 4, 5);
/**
+ compose Last_Error to be shown by SHOW SLAVE STATUS
+ @param err_code The summary code to display instead of the last reported
+ @param msg A summary message to set afront of the reported errors
+ */
+ void display(int err_code, const char *msg, ...) const;
+
+ /**
Clear errors. They will not show up under <code>SHOW SLAVE
STATUS</code>.
*/
=== modified file 'sql/slave.cc'
--- a/sql/slave.cc 2009-05-13 16:04:01 +0000
+++ b/sql/slave.cc 2009-06-29 10:47:20 +0000
@@ -966,7 +966,7 @@ int init_strvar_from_file(char *var, int
up to and including newline.
*/
int c;
- while (((c=my_b_get(f)) != '\n' && c != my_b_EOF));
+ while (((c=my_b_get(f)) != '\n' && c != my_b_EOF)) {};
}
DBUG_RETURN(0);
}
@@ -2032,7 +2032,7 @@ int check_expected_error(THD* thd, Relay
that the error is temporary by pushing a warning with the error code
ER_GET_TEMPORARY_ERRMSG, if the originating error is temporary.
*/
-static int has_temporary_error(THD *thd)
+int has_temporary_error(THD *thd)
{
DBUG_ENTER("has_temporary_error");
@@ -2085,6 +2085,13 @@ static int has_temporary_error(THD *thd)
DBUG_RETURN(0);
}
+void clear_all_errors(THD *thd, Relay_log_info *rli)
+{
+ thd->is_slave_error = 0;
+ thd->clear_error();
+ rli->clear_error();
+}
+
/**
Applies the given event and advances the relay log position.
@@ -2286,7 +2293,6 @@ static int exec_relay_log_event(THD* thd
if (ev)
{
int exec_res;
-
/*
This tests if the position of the beginning of the current event
hits the UNTIL barrier.
@@ -2381,10 +2387,13 @@ static int exec_relay_log_event(THD* thd
}
}
else
+ {
sql_print_error("Slave SQL thread retried transaction %lu time(s) "
"in vain, giving up. Consider raising the value of "
"the slave_transaction_retries variable.",
slave_trans_retries);
+ thd->is_fatal_error= 1; // to force has_temporary_error() return 0
+ }
}
else if (exec_res && !temp_err ||
(opt_using_transactions &&
@@ -2800,6 +2809,7 @@ err:
mi->mysql=0;
}
write_ignored_events_info_to_relay_log(thd, mi);
+ mi->display(0, NULL);
thd_proc_info(thd, "Waiting for slave mutex on exit");
pthread_mutex_lock(&mi->run_lock);
@@ -2943,6 +2953,7 @@ pthread_handler_t handle_slave_sql(void
now.
But the master timestamp is reset by RESET SLAVE & CHANGE MASTER.
*/
+
rli->clear_error();
//tell the I/O thread to take relay_log_space_limit into account from now on
@@ -3053,40 +3064,11 @@ log '%s' at position %s, relay log '%s'
if (!sql_slave_killed(thd,rli))
{
/*
- retrieve as much info as possible from the thd and, error
- codes and warnings and print this to the error log as to
- allow the user to locate the error
- */
- uint32 const last_errno= rli->last_error().number;
-
- if (thd->is_error())
- {
- char const *const errmsg= thd->stmt_da->message();
-
- DBUG_PRINT("info",
- ("thd->stmt_da->sql_errno()=%d; rli->last_error.number=%d",
- thd->stmt_da->sql_errno(), last_errno));
- if (last_errno == 0)
- {
- /*
- This function is reporting an error which was not reported
- while executing exec_relay_log_event().
- */
- rli->report(ERROR_LEVEL, thd->stmt_da->sql_errno(), errmsg);
- }
- else if (last_errno != thd->stmt_da->sql_errno())
- {
- /*
- * An error was reported while executing exec_relay_log_event()
- * however the error code differs from what is in the thread.
- * This function prints out more information to help finding
- * what caused the problem.
- */
- sql_print_error("Slave (additional info): %s Error_code: %d",
- errmsg, thd->stmt_da->sql_errno());
- }
- }
-
+ Reporting all gather errors while executing exec_relay_log_event()
+ analogously send_error()
+ */
+ rli->display(0, NULL);
+
/* Print any warnings issued */
List_iterator_fast<MYSQL_ERROR> it(thd->warning_info->warn_list());
MYSQL_ERROR *err;
@@ -3117,6 +3099,14 @@ the slave SQL thread with \"SLAVE START\
}
goto err;
}
+ else
+ {
+ /*
+ Event applying succeeded, reset errors and diag area status
+ */
+ clear_all_errors(thd, const_cast<Relay_log_info*>(rli));
+ thd->stmt_da->reset_diagnostics_area();
+ }
}
/* Thread stopped. Print the current replication position to the log */
@@ -3125,7 +3115,7 @@ the slave SQL thread with \"SLAVE START\
RPL_LOG_NAME, llstr(rli->group_master_log_pos,llbuff));
err:
-
+ rli->display(0, NULL);
/*
Some events set some playgrounds, which won't be cleared because thread
stops. Stopping of this thread may not be known to these events ("stop"
=== modified file 'sql/slave.h'
--- a/sql/slave.h 2009-04-30 14:51:06 +0000
+++ b/sql/slave.h 2009-06-29 10:47:20 +0000
@@ -203,6 +203,8 @@ void set_slave_thread_default_charset(TH
void rotate_relay_log(Master_info* mi);
int apply_event_and_update_pos(Log_event* ev, THD* thd, Relay_log_info* rli,
bool skip);
+int has_temporary_error(THD *thd);
+void clear_all_errors(THD *thd, Relay_log_info *rli);
pthread_handler_t handle_slave_io(void *arg);
pthread_handler_t handle_slave_sql(void *arg);