#At file:///home/andrei/MySQL/BZR/2a-23May/FIXES/bug11748510_36524_less_drama/ based on revid:magne.mahre@stripped
3311 Andrei Elkin 2011-03-27
bug#11748510 incident of deadlock on slave is overdramatized
In cases of temporary errors in replication event execution (deadlock,
timeout) the error log gained an overreacting error message whereas
just a warning would be fine.
Fixed with checking of the temporary status of the error inside
Slave_reporting_capability::report() to demote the error to the
warning level if needed.
The warning gets converted into the error
whenever number of successive attempts to re-apply the event(s) gets
equal to @@global.slave_trans_retries.
@ mysql-test/extra/rpl_tests/rpl_deadlock.test
regression test for bug#11748510 is added.
@ mysql-test/suite/rpl/r/rpl_deadlock_innodb.result
results get updated.
@ sql/rpl_reporting.cc
implementing error to warning demotion and refining stderr report func:s invocation
with prior checking of global_system_variables.log_warnings.
@ sql/rpl_slave.cc
has_temporary_error() is extended with a new argument and a new rule
to return 0 if there has been the fatal error flag raised.
A minor refinement is done to not regard an error as temporary in case
there is no retry.
@ sql/rpl_slave.h
providing has_temporary_error() interface for rpl_reporting.cc.
modified:
mysql-test/extra/rpl_tests/rpl_deadlock.test
mysql-test/suite/rpl/r/rpl_deadlock_innodb.result
sql/rpl_reporting.cc
sql/rpl_slave.cc
sql/rpl_slave.h
=== modified file 'mysql-test/extra/rpl_tests/rpl_deadlock.test'
--- a/mysql-test/extra/rpl_tests/rpl_deadlock.test 2010-12-19 17:07:28 +0000
+++ b/mysql-test/extra/rpl_tests/rpl_deadlock.test 2011-03-27 18:16:32 +0000
@@ -123,6 +123,57 @@ SELECT * FROM t3;
source include/check_slave_is_running.inc;
--echo
+#
+# bug#11748510/36524 incident of deadlock on slave is overdramatized
+#
+# Observe that the slave stopped when the number of transation retries
+# exceeds @@global.slave_transaction_retries
+#
+connection master;
+
+--echo *** Test the deadlock warning to be escalated into the error ***
+
+delete from t1;
+delete from t2;
+delete from t3;
+
+sync_slave_with_master;
+
+# make sure slave's unilateral row gone as well
+delete from t1;
+delete from t2;
+delete from t3;
+
+# the first attempt to run a deadlock scenario of p 1) leads to the error
+set @save.slave_transaction_retries= @@global.slave_transaction_retries;
+set @@global.slave_transaction_retries= 0;
+source include/stop_slave.inc;
+
+connection master;
+
+BEGIN;
+INSERT INTO t1 VALUES (1);
+# We make a long transaction here
+INSERT INTO t2 VALUES (2), (2), (2), (2), (2), (2), (2), (2), (2), (2);
+INSERT INTO t3 VALUES (3);
+COMMIT;
+
+connection slave;
+BEGIN;
+SELECT count(*) as zero FROM t1 FOR UPDATE;
+
+start slave;
+
+--echo *** Now the slave must be stopped due to timeout ***
+
+let $slave_sql_errno= 1205; # ER_LOCK_TIMEOUT
+let $show_slave_sql_error= 0;
+source include/wait_for_slave_sql_error.inc;
+
+rollback;
+
+set @@global.slave_transaction_retries= @save.slave_transaction_retries;
+source include/start_slave.inc;
# Clean up
--echo *** Clean up ***
connection master;
=== modified file 'mysql-test/suite/rpl/r/rpl_deadlock_innodb.result'
--- a/mysql-test/suite/rpl/r/rpl_deadlock_innodb.result 2010-12-19 17:22:30 +0000
+++ b/mysql-test/suite/rpl/r/rpl_deadlock_innodb.result 2011-03-27 18:16:32 +0000
@@ -103,6 +103,31 @@ a
3
include/check_slave_is_running.inc
+*** Test the deadlock warning to be escalated into the error ***
+delete from t1;
+delete from t2;
+delete from t3;
+delete from t1;
+delete from t2;
+delete from t3;
+set @save.slave_transaction_retries= @@global.slave_transaction_retries;
+set @@global.slave_transaction_retries= 0;
+include/stop_slave.inc
+BEGIN;
+INSERT INTO t1 VALUES (1);
+INSERT INTO t2 VALUES (2), (2), (2), (2), (2), (2), (2), (2), (2), (2);
+INSERT INTO t3 VALUES (3);
+COMMIT;
+BEGIN;
+SELECT count(*) as zero FROM t1 FOR UPDATE;
+zero
+0
+start slave;
+*** Now the slave must be stopped due to timeout ***
+include/wait_for_slave_sql_error.inc [errno=1205]
+rollback;
+set @@global.slave_transaction_retries= @save.slave_transaction_retries;
+include/start_slave.inc
*** Clean up ***
DROP TABLE t1,t2,t3;
SET global max_relay_log_size= @my_max_relay_log_size;
=== modified file 'sql/rpl_reporting.cc'
--- a/sql/rpl_reporting.cc 2010-08-05 17:45:25 +0000
+++ b/sql/rpl_reporting.cc 2011-03-27 18:16:32 +0000
@@ -17,6 +17,7 @@
#include "rpl_reporting.h"
#include "log.h" // sql_print_error, sql_print_warning,
// sql_print_information
+#include "rpl_slave.h"
Slave_reporting_capability::Slave_reporting_capability(char const *thread_name)
: m_thread_name(thread_name)
@@ -29,11 +30,17 @@ void
Slave_reporting_capability::report(loglevel level, int err_code,
const char *msg, ...) const
{
+#if !defined(EMBEDDED_LIBRARY)
+ THD *thd= current_thd;
void (*report_function)(const char *, ...);
char buff[MAX_SLAVE_ERRMSG];
char *pbuff= buff;
uint pbuffsize= sizeof(buff);
va_list args;
+
+ if (level == ERROR_LEVEL && has_temporary_error(thd, err_code))
+ level= WARNING_LEVEL;
+
va_start(args, msg);
mysql_mutex_lock(&err_lock);
@@ -51,10 +58,12 @@ Slave_reporting_capability::report(logle
report_function= sql_print_error;
break;
case WARNING_LEVEL:
- report_function= sql_print_warning;
+ report_function= global_system_variables.log_warnings?
+ sql_print_warning : NULL;
break;
case INFORMATION_LEVEL:
- report_function= sql_print_information;
+ report_function= global_system_variables.log_warnings?
+ sql_print_information : NULL;
break;
default:
DBUG_ASSERT(0); // should not come here
@@ -71,6 +80,7 @@ Slave_reporting_capability::report(logle
m_thread_name, pbuff,
(pbuff[0] && *(strend(pbuff)-1) == '.') ? "" : ",",
err_code);
+#endif
}
Slave_reporting_capability::~Slave_reporting_capability()
=== modified file 'sql/rpl_slave.cc'
--- a/sql/rpl_slave.cc 2011-03-17 13:20:36 +0000
+++ b/sql/rpl_slave.cc 2011-03-27 18:16:32 +0000
@@ -2514,15 +2514,23 @@ static ulong read_event(MYSQL* mysql, Ma
DBUG_RETURN(len - 1);
}
-/*
- Check if the current error is of temporary nature of not.
+/**
+ Check if the current error is of temporary nature or not.
Some errors are temporary in nature, such as
ER_LOCK_DEADLOCK and ER_LOCK_WAIT_TIMEOUT. Ndb also signals
that the error is temporary by pushing a warning with the error code
ER_GET_TEMPORARY_ERRMSG, if the originating error is temporary.
+
+ @param thd a THD instance, typically of the slave SQL thread's.
+ @error_arg the error code for assessment.
+ defaults to zero which makes the function check the top
+ of the reported errors stack.
+
+ @return 1 as the positive and 0 as the negative verdict
*/
-static int has_temporary_error(THD *thd)
+int has_temporary_error(THD *thd, uint error_arg)
{
+ uint error;
DBUG_ENTER("has_temporary_error");
DBUG_EXECUTE_IF("all_errors_are_temporary_errors",
@@ -2533,20 +2541,25 @@ static int has_temporary_error(THD *thd)
});
/*
- If there is no message in THD, we can't say if it's a temporary
+ The state of the slave thread can't be regarded as
+ experiencing a temporary failure in cases of @c is_slave_error was set TRUE,
+ or if there is no message in THD, we can't say if it's a temporary
error or not. This is currently the case for Incident_log_event,
- which sets no message. Return FALSE.
+ which sets no message.
*/
- if (!thd->is_error())
+ if (thd->is_fatal_error || !thd->is_error())
DBUG_RETURN(0);
+ error= (error_arg == 0)? thd->stmt_da->sql_errno() : error_arg;
+
/*
Temporary error codes:
currently, InnoDB deadlock detected by InnoDB or lock
- wait timeout (innodb_lock_wait_timeout exceeded
+ wait timeout (innodb_lock_wait_timeout exceeded).
+ Notice, the temporary error requires slave_trans_retries != 0)
*/
- if (thd->stmt_da->sql_errno() == ER_LOCK_DEADLOCK ||
- thd->stmt_da->sql_errno() == ER_LOCK_WAIT_TIMEOUT)
+ if (slave_trans_retries &&
+ (error == ER_LOCK_DEADLOCK || error == ER_LOCK_WAIT_TIMEOUT))
DBUG_RETURN(1);
#ifdef HAVE_NDB_BINLOG
@@ -3007,10 +3020,13 @@ static int exec_relay_log_event(THD* thd
}
}
else
- sql_print_error("Slave SQL thread retried transaction %lu time(s) "
- "in vain, giving up. Consider raising the value of "
- "the slave_transaction_retries variable.",
- slave_trans_retries);
+ {
+ thd->is_fatal_error= 1;
+ rli->report(ERROR_LEVEL, thd->stmt_da->sql_errno(),
+ "Slave SQL thread retried transaction %lu time(s) "
+ "in vain, giving up. Consider raising the value of "
+ "the slave_transaction_retries variable.", rli->trans_retries);
+ }
}
else if ((exec_res && !temp_err) ||
(opt_using_transactions &&
=== modified file 'sql/rpl_slave.h'
--- a/sql/rpl_slave.h 2011-02-16 17:13:30 +0000
+++ b/sql/rpl_slave.h 2011-03-27 18:16:32 +0000
@@ -213,6 +213,7 @@ void set_slave_thread_options(THD* thd);
void set_slave_thread_default_charset(THD *thd, Relay_log_info const *rli);
int apply_event_and_update_pos(Log_event* ev, THD* thd, Relay_log_info* rli);
int rotate_relay_log(Master_info* mi);
+int has_temporary_error(THD *thd, uint err= 0);
pthread_handler_t handle_slave_io(void *arg);
pthread_handler_t handle_slave_sql(void *arg);
Attachment: [text/bzr-bundle] bzr/andrei.elkin@oracle.com-20110327181632-zcv44cu2cawtepw5.bundle