Mats, hej.
The idea of the patch is fair enough.
I need some clarification. The principal question is whether a group
of events can be replayable without being a transaction.
I think that's the only situation.
The comments back up it saying:
/*
We were in a transaction which has been rolled back because of a
temporary error;
let's seek back to BEGIN log event and retry it all again.
...
Your opinion?
> Below is the list of changes that have just been committed into a local
> 5.1 repository of mats. When mats does a push these changes will
> be propagated to the main repository and, within 24 hours after the
> push, to the public repository.
> For information on how to access the public repository
> see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html
>
> ChangeSet@stripped, 2007-10-20 20:16:12+02:00, mats@stripped +4 -0
> BUG#24860 (Incorrect SLAVE_TRANSACTION_RETRIES code can result in slave stuck):
>
> If a temporary error occured inside a group on an event that was not the first
> event of the group, the slave could get stuck because the retry counter is reset
> whenever an event was executed successfully.
>
> This patch only reset the retry counter when an entire group has been successfully
> executed, or failed with a non-transient error.
>
> mysql-test/suite/rpl/r/rpl_temporary_errors.result@stripped, 2007-10-20 20:16:08+02:00,
> mats@stripped +81 -0
> New BitKeeper file ``mysql-test/suite/rpl/r/rpl_temporary_errors.result''
>
> mysql-test/suite/rpl/r/rpl_temporary_errors.result@stripped, 2007-10-20 20:16:08+02:00,
> mats@stripped +0 -0
>
> mysql-test/suite/rpl/t/rpl_temporary_errors-slave.opt@stripped, 2007-10-20
> 20:16:08+02:00, mats@stripped +3 -0
> New BitKeeper file ``mysql-test/suite/rpl/t/rpl_temporary_errors-slave.opt''
>
> mysql-test/suite/rpl/t/rpl_temporary_errors-slave.opt@stripped, 2007-10-20
> 20:16:08+02:00, mats@stripped +0 -0
>
> mysql-test/suite/rpl/t/rpl_temporary_errors.test@stripped, 2007-10-20 20:16:08+02:00,
> mats@stripped +29 -0
> New BitKeeper file ``mysql-test/suite/rpl/t/rpl_temporary_errors.test''
>
> mysql-test/suite/rpl/t/rpl_temporary_errors.test@stripped, 2007-10-20 20:16:08+02:00,
> mats@stripped +0 -0
>
> sql/slave.cc@stripped, 2007-10-20 20:16:08+02:00, mats@stripped +21
> -5
> Adding debug printouts to every place where Relay_log_info::trans_retries
> is changed and to has_temporary_error() to print the error when returning.
>
> Adding debug variable all_errors_are_temporary_errors to make all errors
> for slave thread temporary errors.
>
> Adding code so that the Relay_log_info::trans_retries is only reset when
> an entire group was sucessfully executed, or a non-temporary error occured.
>
> diff -Nrup a/mysql-test/suite/rpl/r/rpl_temporary_errors.result
> b/mysql-test/suite/rpl/r/rpl_temporary_errors.result
> --- /dev/null Wed Dec 31 16:00:00 196900
> +++ b/mysql-test/suite/rpl/r/rpl_temporary_errors.result 2007-10-20 20:16:08 +02:00
> @@ -0,0 +1,81 @@
> +stop slave;
> +drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
> +reset master;
> +reset slave;
> +drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
> +start slave;
> +**** On Master ****
> +SET SESSION BINLOG_FORMAT=ROW;
> +CREATE TABLE t1 (a INT PRIMARY KEY, b INT);
> +INSERT INTO t1 VALUES (1,1), (2,2), (3,3), (4,4);
> +**** On Slave ****
> +SHOW STATUS LIKE 'Slave_retried_transactions';
> +Variable_name Value
> +Slave_retried_transactions 0
> +UPDATE t1 SET a = 5, b = 47 WHERE a = 1;
> +SELECT * FROM t1;
> +a b
> +5 47
> +2 2
> +3 3
> +4 4
> +**** On Master ****
> +UPDATE t1 SET a = 5, b = 5 WHERE a = 1;
> +SELECT * FROM t1;
> +a b
> +5 5
> +2 2
> +3 3
> +4 4
> +**** On Slave ****
> +SHOW STATUS LIKE 'Slave_retried_transactions';
> +Variable_name Value
> +Slave_retried_transactions 2
> +SELECT * FROM t1;
> +a b
> +5 47
> +2 2
> +3 3
> +4 4
> +SHOW SLAVE STATUS;
> +Slave_IO_State #
> +Master_Host 127.0.0.1
> +Master_User root
> +Master_Port MASTER_PORT
> +Connect_Retry 1
> +Master_Log_File master-bin.000001
> +Read_Master_Log_Pos 408
> +Relay_Log_File #
> +Relay_Log_Pos #
> +Relay_Master_Log_File master-bin.000001
> +Slave_IO_Running Yes
> +Slave_SQL_Running No
> +Replicate_Do_DB
> +Replicate_Ignore_DB
> +Replicate_Do_Table
> +Replicate_Ignore_Table #
> +Replicate_Wild_Do_Table
> +Replicate_Wild_Ignore_Table
> +Last_Errno 1032
> +Last_Error Error in Update_rows event: error during transaction execution on table
> test.t1. Can't find record in 't1'
> +Skip_Counter 0
> +Exec_Master_Log_Pos 318
> +Relay_Log_Space #
> +Until_Condition None
> +Until_Log_File
> +Until_Log_Pos 0
> +Master_SSL_Allowed No
> +Master_SSL_CA_File
> +Master_SSL_CA_Path
> +Master_SSL_Cert
> +Master_SSL_Cipher
> +Master_SSL_Key
> +Seconds_Behind_Master #
> +Master_SSL_Verify_Server_Cert No
> +Last_IO_Errno #
> +Last_IO_Error #
> +Last_SQL_Errno 1032
> +Last_SQL_Error Error in Update_rows event: error during transaction execution on
> table test.t1. Can't find record in 't1'
This one should not be a temporary error. So slave does not have to retry.
> +DROP TABLE t1;
> +**** On Master ****
> +DROP TABLE t1;
> diff -Nrup a/mysql-test/suite/rpl/t/rpl_temporary_errors-slave.opt
> b/mysql-test/suite/rpl/t/rpl_temporary_errors-slave.opt
> --- /dev/null Wed Dec 31 16:00:00 196900
> +++ b/mysql-test/suite/rpl/t/rpl_temporary_errors-slave.opt 2007-10-20 20:16:08
> +02:00
> @@ -0,0 +1,3 @@
> +--loose-debug="+d,all_errors_are_temporary_errors" --slave-transaction-retries=2
> +
> +
> diff -Nrup a/mysql-test/suite/rpl/t/rpl_temporary_errors.test
> b/mysql-test/suite/rpl/t/rpl_temporary_errors.test
> --- /dev/null Wed Dec 31 16:00:00 196900
> +++ b/mysql-test/suite/rpl/t/rpl_temporary_errors.test 2007-10-20 20:16:08 +02:00
> @@ -0,0 +1,29 @@
> +source include/master-slave.inc;
> +
> +--echo **** On Master ****
> +connection master;
> +SET SESSION BINLOG_FORMAT=ROW;
> +CREATE TABLE t1 (a INT PRIMARY KEY, b INT);
> +INSERT INTO t1 VALUES (1,1), (2,2), (3,3), (4,4);
> +--echo **** On Slave ****
> +sync_slave_with_master;
> +SHOW STATUS LIKE 'Slave_retried_transactions';
> +UPDATE t1 SET a = 5, b = 47 WHERE a = 1;
> +SELECT * FROM t1;
> +--echo **** On Master ****
> +connection master;
> +UPDATE t1 SET a = 5, b = 5 WHERE a = 1;
> +save_master_pos;
> +SELECT * FROM t1;
> +#SHOW BINLOG EVENTS;
> +--echo **** On Slave ****
> +connection slave;
> +source include/wait_for_slave_sql_to_stop.inc;
> +SHOW STATUS LIKE 'Slave_retried_transactions';
> +SELECT * FROM t1;
> +source include/show_slave_status.inc;
> +DROP TABLE t1;
> +
> +--echo **** On Master ****
> +connection master;
> +DROP TABLE t1;
> diff -Nrup a/sql/slave.cc b/sql/slave.cc
> --- a/sql/slave.cc 2007-10-13 14:58:06 +02:00
> +++ b/sql/slave.cc 2007-10-20 20:16:08 +02:00
> @@ -1714,7 +1714,14 @@ static int has_temporary_error(THD *thd)
> DBUG_ENTER("has_temporary_error");
>
> if (thd->is_fatal_error)
> + {
> + DBUG_PRINT("info", ("thd->net.last_errno: %s", ER(thd->net.last_errno)));
> DBUG_RETURN(0);
> + }
> +
> + DBUG_EXECUTE_IF("all_errors_are_temporary_errors",
> + if (thd->net.last_errno)
> + thd->net.last_errno= ER_LOCK_DEADLOCK;);
>
> /*
> Temporary error codes:
> @@ -1723,7 +1730,10 @@ static int has_temporary_error(THD *thd)
> */
> if (thd->net.last_errno == ER_LOCK_DEADLOCK ||
> thd->net.last_errno == ER_LOCK_WAIT_TIMEOUT)
> + {
> + DBUG_PRINT("info", ("thd->net.last_errno: %s", ER(thd->net.last_errno)));
> DBUG_RETURN(1);
> + }
>
> #ifdef HAVE_NDB_BINLOG
> /*
> @@ -1905,7 +1915,8 @@ static int exec_relay_log_event(THD* thd
> }
> if (slave_trans_retries)
> {
> - if (exec_res && has_temporary_error(thd))
> + int temp_err;
> + if (exec_res && (temp_err= has_temporary_error(thd)))
> {
> const char *errmsg;
> /*
> @@ -1953,15 +1964,19 @@ static int exec_relay_log_event(THD* thd
> "the slave_transaction_retries variable.",
> slave_trans_retries);
> }
> - else if (!((thd->options & OPTION_BEGIN) &&
> opt_using_transactions))
> + else if (exec_res && !temp_err ||
> + (opt_using_transactions &&
> + rli->group_relay_log_pos == rli->event_relay_log_pos))
> {
this is actually a generalization of the former `OPTION_BEGIN'
condition.
I really wonder if we need that without having a use case - e.g a
non-transactional binlog group with an event of it suffering a
temporary failure.
> /*
> - Only reset the retry counter if the event succeeded or
> - failed with a non-transient error. On a successful event,
> - the execution will proceed as usual; in the case of a
> + Only reset the retry counter if the entire group succeeded
> + or failed with a non-transient error. On a successful
> + event, the execution will proceed as usual; in the case of a
> non-transient error, the slave will stop with an error.
> */
> rli->trans_retries= 0; // restart from fresh
> + DBUG_PRINT("info", ("Resetting retry counter, rli->trans_retries: %d",
> + rli->trans_retries));
> }
> }
> DBUG_RETURN(exec_res);
> @@ -2450,6 +2465,7 @@ pthread_handler_t handle_slave_sql(void
> rli->ignore_log_space_limit= 0;
> pthread_mutex_unlock(&rli->log_space_lock);
> rli->trans_retries= 0; // start from "no error"
> + DBUG_PRINT("info", ("rli->trans_retries: %d", rli->trans_retries));
>
> if (init_relay_log_pos(rli,
> rli->group_relay_log_name,
>
> --
> MySQL Code Commits Mailing List
> For list archives: http://lists.mysql.com/commits
> To unsubscribe: http://lists.mysql.com/commits?unsub=1
>
regards,
Andrei