Hi Andrei,
Great work !!!
Thanks for addressing my comments.
Patch approved.
I have just one suggestion that you may do if you agree on it.
Please, make "has_temporary_error" a static member function in
Slave_reporting_capability.
Cheers.
On 03/27/2011 07:16 PM, Andrei Elkin wrote:
> #At file:///home/andrei/MySQL/BZR/2a-23May/FIXES/bug11748510_36524_less_drama/ based
> on revid:magne.mahre@stripped
>
> 3311 Andrei Elkin 2011-03-27
> bug#11748510 incident of deadlock on slave is overdramatized
>
> In cases of temporary errors in replication event execution (deadlock,
> timeout) the error log gained an overreacting error message whereas
> just a warning would be fine.
>
> Fixed with checking of the temporary status of the error inside
> Slave_reporting_capability::report() to demote the error to the
> warning level if needed.
> The warning gets converted into the error
> whenever number of successive attempts to re-apply the event(s) gets
> equal to @@global.slave_trans_retries.
> @ mysql-test/extra/rpl_tests/rpl_deadlock.test
> regression test for bug#11748510 is added.
> @ mysql-test/suite/rpl/r/rpl_deadlock_innodb.result
> results get updated.
> @ sql/rpl_reporting.cc
> implementing error to warning demotion and refining stderr report func:s
> invocation
> with prior checking of global_system_variables.log_warnings.
> @ sql/rpl_slave.cc
> has_temporary_error() is extended with a new argument and a new rule
> to return 0 if there has been the fatal error flag raised.
> A minor refinement is done to not regard an error as temporary in case
> there is no retry.
> @ sql/rpl_slave.h
> providing has_temporary_error() interface for rpl_reporting.cc.
>
> modified:
> mysql-test/extra/rpl_tests/rpl_deadlock.test
> mysql-test/suite/rpl/r/rpl_deadlock_innodb.result
> sql/rpl_reporting.cc
> sql/rpl_slave.cc
> sql/rpl_slave.h
> === modified file 'mysql-test/extra/rpl_tests/rpl_deadlock.test'
> --- a/mysql-test/extra/rpl_tests/rpl_deadlock.test 2010-12-19 17:07:28 +0000
> +++ b/mysql-test/extra/rpl_tests/rpl_deadlock.test 2011-03-27 18:16:32 +0000
> @@ -123,6 +123,57 @@ SELECT * FROM t3;
> source include/check_slave_is_running.inc;
> --echo
>
> +#
> +# bug#11748510/36524 incident of deadlock on slave is overdramatized
> +#
> +# Observe that the slave stopped when the number of transation retries
> +# exceeds @@global.slave_transaction_retries
> +#
> +connection master;
> +
> +--echo *** Test the deadlock warning to be escalated into the error ***
> +
> +delete from t1;
> +delete from t2;
> +delete from t3;
> +
> +sync_slave_with_master;
> +
> +# make sure slave's unilateral row gone as well
> +delete from t1;
> +delete from t2;
> +delete from t3;
> +
> +# the first attempt to run a deadlock scenario of p 1) leads to the error
> +set @save.slave_transaction_retries= @@global.slave_transaction_retries;
> +set @@global.slave_transaction_retries= 0;
> +source include/stop_slave.inc;
> +
> +connection master;
> +
> +BEGIN;
> +INSERT INTO t1 VALUES (1);
> +# We make a long transaction here
> +INSERT INTO t2 VALUES (2), (2), (2), (2), (2), (2), (2), (2), (2), (2);
> +INSERT INTO t3 VALUES (3);
> +COMMIT;
> +
> +connection slave;
> +BEGIN;
> +SELECT count(*) as zero FROM t1 FOR UPDATE;
> +
> +start slave;
> +
> +--echo *** Now the slave must be stopped due to timeout ***
> +
> +let $slave_sql_errno= 1205; # ER_LOCK_TIMEOUT
> +let $show_slave_sql_error= 0;
> +source include/wait_for_slave_sql_error.inc;
> +
> +rollback;
> +
> +set @@global.slave_transaction_retries= @save.slave_transaction_retries;
> +source include/start_slave.inc;
> # Clean up
> --echo *** Clean up ***
> connection master;
>
> === modified file 'mysql-test/suite/rpl/r/rpl_deadlock_innodb.result'
> --- a/mysql-test/suite/rpl/r/rpl_deadlock_innodb.result 2010-12-19 17:22:30 +0000
> +++ b/mysql-test/suite/rpl/r/rpl_deadlock_innodb.result 2011-03-27 18:16:32 +0000
> @@ -103,6 +103,31 @@ a
> 3
> include/check_slave_is_running.inc
>
> +*** Test the deadlock warning to be escalated into the error ***
> +delete from t1;
> +delete from t2;
> +delete from t3;
> +delete from t1;
> +delete from t2;
> +delete from t3;
> +set @save.slave_transaction_retries= @@global.slave_transaction_retries;
> +set @@global.slave_transaction_retries= 0;
> +include/stop_slave.inc
> +BEGIN;
> +INSERT INTO t1 VALUES (1);
> +INSERT INTO t2 VALUES (2), (2), (2), (2), (2), (2), (2), (2), (2), (2);
> +INSERT INTO t3 VALUES (3);
> +COMMIT;
> +BEGIN;
> +SELECT count(*) as zero FROM t1 FOR UPDATE;
> +zero
> +0
> +start slave;
> +*** Now the slave must be stopped due to timeout ***
> +include/wait_for_slave_sql_error.inc [errno=1205]
> +rollback;
> +set @@global.slave_transaction_retries= @save.slave_transaction_retries;
> +include/start_slave.inc
> *** Clean up ***
> DROP TABLE t1,t2,t3;
> SET global max_relay_log_size= @my_max_relay_log_size;
>
> === modified file 'sql/rpl_reporting.cc'
> --- a/sql/rpl_reporting.cc 2010-08-05 17:45:25 +0000
> +++ b/sql/rpl_reporting.cc 2011-03-27 18:16:32 +0000
> @@ -17,6 +17,7 @@
> #include "rpl_reporting.h"
> #include "log.h" // sql_print_error, sql_print_warning,
> // sql_print_information
> +#include "rpl_slave.h"
>
> Slave_reporting_capability::Slave_reporting_capability(char const *thread_name)
> : m_thread_name(thread_name)
> @@ -29,11 +30,17 @@ void
> Slave_reporting_capability::report(loglevel level, int err_code,
> const char *msg, ...) const
> {
> +#if !defined(EMBEDDED_LIBRARY)
> + THD *thd= current_thd;
> void (*report_function)(const char *, ...);
> char buff[MAX_SLAVE_ERRMSG];
> char *pbuff= buff;
> uint pbuffsize= sizeof(buff);
> va_list args;
> +
> + if (level == ERROR_LEVEL && has_temporary_error(thd, err_code))
> + level= WARNING_LEVEL;
> +
> va_start(args, msg);
>
> mysql_mutex_lock(&err_lock);
> @@ -51,10 +58,12 @@ Slave_reporting_capability::report(logle
> report_function= sql_print_error;
> break;
> case WARNING_LEVEL:
> - report_function= sql_print_warning;
> + report_function= global_system_variables.log_warnings?
> + sql_print_warning : NULL;
> break;
> case INFORMATION_LEVEL:
> - report_function= sql_print_information;
> + report_function= global_system_variables.log_warnings?
> + sql_print_information : NULL;
> break;
> default:
> DBUG_ASSERT(0); // should not come here
> @@ -71,6 +80,7 @@ Slave_reporting_capability::report(logle
> m_thread_name, pbuff,
> (pbuff[0] && *(strend(pbuff)-1) == '.') ? "" : ",",
> err_code);
> +#endif
> }
>
> Slave_reporting_capability::~Slave_reporting_capability()
>
> === modified file 'sql/rpl_slave.cc'
> --- a/sql/rpl_slave.cc 2011-03-17 13:20:36 +0000
> +++ b/sql/rpl_slave.cc 2011-03-27 18:16:32 +0000
> @@ -2514,15 +2514,23 @@ static ulong read_event(MYSQL* mysql, Ma
> DBUG_RETURN(len - 1);
> }
>
> -/*
> - Check if the current error is of temporary nature of not.
> +/**
> + Check if the current error is of temporary nature or not.
> Some errors are temporary in nature, such as
> ER_LOCK_DEADLOCK and ER_LOCK_WAIT_TIMEOUT. Ndb also signals
> that the error is temporary by pushing a warning with the error code
> ER_GET_TEMPORARY_ERRMSG, if the originating error is temporary.
> +
> + @param thd a THD instance, typically of the slave SQL thread's.
> + @error_arg the error code for assessment.
> + defaults to zero which makes the function check the top
> + of the reported errors stack.
> +
> + @return 1 as the positive and 0 as the negative verdict
> */
> -static int has_temporary_error(THD *thd)
> +int has_temporary_error(THD *thd, uint error_arg)
> {
> + uint error;
> DBUG_ENTER("has_temporary_error");
>
> DBUG_EXECUTE_IF("all_errors_are_temporary_errors",
> @@ -2533,20 +2541,25 @@ static int has_temporary_error(THD *thd)
> });
>
> /*
> - If there is no message in THD, we can't say if it's a temporary
> + The state of the slave thread can't be regarded as
> + experiencing a temporary failure in cases of @c is_slave_error was set TRUE,
> + or if there is no message in THD, we can't say if it's a temporary
> error or not. This is currently the case for Incident_log_event,
> - which sets no message. Return FALSE.
> + which sets no message.
> */
> - if (!thd->is_error())
> + if (thd->is_fatal_error || !thd->is_error())
> DBUG_RETURN(0);
>
> + error= (error_arg == 0)? thd->stmt_da->sql_errno() : error_arg;
> +
> /*
> Temporary error codes:
> currently, InnoDB deadlock detected by InnoDB or lock
> - wait timeout (innodb_lock_wait_timeout exceeded
> + wait timeout (innodb_lock_wait_timeout exceeded).
> + Notice, the temporary error requires slave_trans_retries != 0)
> */
> - if (thd->stmt_da->sql_errno() == ER_LOCK_DEADLOCK ||
> - thd->stmt_da->sql_errno() == ER_LOCK_WAIT_TIMEOUT)
> + if (slave_trans_retries &&
> + (error == ER_LOCK_DEADLOCK || error == ER_LOCK_WAIT_TIMEOUT))
> DBUG_RETURN(1);
>
> #ifdef HAVE_NDB_BINLOG
> @@ -3007,10 +3020,13 @@ static int exec_relay_log_event(THD* thd
> }
> }
> else
> - sql_print_error("Slave SQL thread retried transaction %lu time(s) "
> - "in vain, giving up. Consider raising the value of "
> - "the slave_transaction_retries variable.",
> - slave_trans_retries);
> + {
> + thd->is_fatal_error= 1;
> + rli->report(ERROR_LEVEL, thd->stmt_da->sql_errno(),
> + "Slave SQL thread retried transaction %lu time(s) "
> + "in vain, giving up. Consider raising the value of "
> + "the slave_transaction_retries variable.",
> rli->trans_retries);
> + }
> }
> else if ((exec_res && !temp_err) ||
> (opt_using_transactions &&
>
> === modified file 'sql/rpl_slave.h'
> --- a/sql/rpl_slave.h 2011-02-16 17:13:30 +0000
> +++ b/sql/rpl_slave.h 2011-03-27 18:16:32 +0000
> @@ -213,6 +213,7 @@ void set_slave_thread_options(THD* thd);
> void set_slave_thread_default_charset(THD *thd, Relay_log_info const *rli);
> int apply_event_and_update_pos(Log_event* ev, THD* thd, Relay_log_info* rli);
> int rotate_relay_log(Master_info* mi);
> +int has_temporary_error(THD *thd, uint err= 0);
>
> pthread_handler_t handle_slave_io(void *arg);
> pthread_handler_t handle_slave_sql(void *arg);
>
>
>
>
>