From: Date: October 7 2006 9:02pm Subject: bk commit into 5.0 tree (aelkin:1.2295) BUG#16228 List-Archive: http://lists.mysql.com/commits/13303 X-Bug: 16228 Message-Id: <200610071902.k97J2puc018166@dsl-hkigw8-feb9fb00-191.dhcp.inet.fi> Below is the list of changes that have just been committed into a local 5.0 repository of elkin. When elkin does a push these changes will be propagated to the main repository and, within 24 hours after the push, to the public repository. For information on how to access the public repository see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html ChangeSet@stripped, 2006-10-07 22:02:43+03:00, aelkin@stripped +3 -0 BUG#20697 slave fails to rollback replicated transaction hang over innodb_lock_wait_timeou Transaction on the slave sql thread got blocked against a slave's mysqld local ta's lock. Since the default, slave-transaction-retries=10, there was replaying of the replicated ta. That failed because of a new 5.0.13 started policy not to rollback a timeouted transaction. Effectively the first round of a timed-out ta became committed by the replaying's first "BEGIN". It was decided to backport already existed method working in 5.1 implemented in bug #16228 for handling symmetrical deadlock problem. That patch introduced end_trans execution whenever a replicated ta deadlocks or timed-out. Note, that this solution can be practically suboptimal - in the light of the changed behavior due to timeout we still could replay only the last statement - only with a high rate of timeouting replicated transactions. mysql-test/r/rpl_deadlock.result@stripped, 2006-10-07 22:02:40+03:00, aelkin@stripped +110 -15 results changed mysql-test/t/rpl_deadlock.test@stripped, 2006-10-07 22:02:40+03:00, aelkin@stripped +10 -6 Refining the timeout part of the test to display that the timeouted transaction is rolled back prior its replaying by slave sql. Non-zero select count would mean the first round work became persistent - wrong. sql/slave.cc@stripped, 2006-10-07 22:02:41+03:00, aelkin@stripped +16 -7 applying bug#16228 fix, approbated for deadlock use case in 5.1, almost verbatim. Another alternative to replay only the offending statement requires significant efforts, incl design work. # This is a BitKeeper patch. What follows are the unified diffs for the # set of deltas contained in the patch. The rest of the patch, the part # that BitKeeper cares about, is below these diffs. # User: aelkin # Host: dsl-hkigw8-feb9fb00-191.dhcp.inet.fi # Root: /home/elkin/MySQL/TEAM/FIXES/5.0/bug20697_slave_msta_retry --- 1.279/sql/slave.cc 2006-10-07 22:02:51 +03:00 +++ 1.280/sql/slave.cc 2006-10-07 22:02:51 +03:00 @@ -3345,9 +3345,9 @@ static int exec_relay_log_event(THD* thd const char *errmsg; /* We were in a transaction which has been rolled back because of a - deadlock (currently, InnoDB deadlock detected by InnoDB) or lock - wait timeout (innodb_lock_wait_timeout exceeded); let's seek back to - BEGIN log event and retry it all again. + Sonera deadlock. if lock wait timeout (innodb_lock_wait_timeout exceeded) + there is no rollback since 5.0.13 (ref: manual). + let's seek back to BEGIN log event and retry it all again. We have to not only seek but also a) init_master_info(), to seek back to hot relay log's start for later (for when we will come back to this hot log after re-processing the @@ -3369,6 +3369,7 @@ static int exec_relay_log_event(THD* thd else { exec_res= 0; + end_trans(thd, ROLLBACK); /* chance for concurrent connection to get more locks */ safe_sleep(thd, min(rli->trans_retries, MAX_SLAVE_RETRY_PAUSE), (CHECK_KILLED_FUNC)sql_slave_killed, (void*)rli); @@ -3386,9 +3387,17 @@ static int exec_relay_log_event(THD* thd "the slave_transaction_retries variable.", slave_trans_retries); } - if (!((thd->options & OPTION_BEGIN) && opt_using_transactions)) - rli->trans_retries= 0; // restart from fresh - } + else if (!((thd->options & OPTION_BEGIN) && opt_using_transactions)) + { + /* + Only reset the retry counter if the event succeeded or + failed with a non-transient error. On a successful event, + the execution will proceed as usual; in the case of a + non-transient error, the slave will stop with an error. + */ + rli->trans_retries= 0; // restart from fresh + } + } return exec_res; } else @@ -4613,7 +4622,7 @@ static int connect_to_master(THD* thd, M suppress_warnings= 0; sql_print_error("Slave I/O thread: error %s to master \ '%s@%s:%d': \ -Error: '%s' errno: %d retry-time: %d retries: %d", +Error: '%s' errno: %d retry-time: %d retries: %lu", (reconnect ? "reconnecting" : "connecting"), mi->user,mi->host,mi->port, mysql_error(mysql), last_errno, --- 1.10/mysql-test/r/rpl_deadlock.result 2006-10-07 22:02:51 +03:00 +++ 1.11/mysql-test/r/rpl_deadlock.result 2006-10-07 22:02:51 +03:00 @@ -6,7 +6,7 @@ drop table if exists t1,t2,t3,t4,t5,t6,t start slave; create table t1 (a int not null, key(a)) engine=innodb; create table t2 (a int not null, key(a)) engine=innodb; -create table t3 (a int) engine=innodb; +create table t3 (a int unique) engine=innodb; create table t4 (a int) engine=innodb; show variables like 'slave_transaction_retries'; Variable_name Value @@ -35,14 +35,14 @@ begin; select * from t1 for update; a start slave; -insert into t2 values(22); +insert into t2 values(201); commit; select * from t1; a 1 select * from t2; a -22 +201 show slave status; Slave_IO_State # Master_Host 127.0.0.1 @@ -50,7 +50,7 @@ Master_User root Master_Port MASTER_MYPORT Connect_Retry 1 Master_Log_File master-bin.000001 -Read_Master_Log_Pos 18911 +Read_Master_Log_Pos 18918 Relay_Log_File # Relay_Log_Pos # Relay_Master_Log_File master-bin.000001 @@ -65,7 +65,7 @@ Replicate_Wild_Ignore_Table Last_Errno 0 Last_Error Skip_Counter 0 -Exec_Master_Log_Pos 18911 +Exec_Master_Log_Pos 18918 Relay_Log_Space # Until_Condition None Until_Log_File @@ -78,12 +78,16 @@ Master_SSL_Cipher Master_SSL_Key Seconds_Behind_Master # stop slave; -change master to master_log_pos=532; +delete from t3; +change master to master_log_pos=539; begin; select * from t2 for update; a -22 +201 start slave; +select count(*) from t3 /* must be zero */; +count(*) +0 commit; select * from t1; a @@ -91,7 +95,7 @@ a 1 select * from t2; a -22 +201 show slave status; Slave_IO_State # Master_Host 127.0.0.1 @@ -99,7 +103,7 @@ Master_User root Master_Port MASTER_MYPORT Connect_Retry 1 Master_Log_File master-bin.000001 -Read_Master_Log_Pos 18911 +Read_Master_Log_Pos 18918 Relay_Log_File # Relay_Log_Pos # Relay_Master_Log_File master-bin.000001 @@ -114,7 +118,7 @@ Replicate_Wild_Ignore_Table Last_Errno 0 Last_Error Skip_Counter 0 -Exec_Master_Log_Pos 18911 +Exec_Master_Log_Pos 18918 Relay_Log_Space # Until_Condition None Until_Log_File @@ -128,12 +132,16 @@ Master_SSL_Key Seconds_Behind_Master # set global max_relay_log_size=0; stop slave; -change master to master_log_pos=532; +delete from t3; +change master to master_log_pos=539; begin; select * from t2 for update; a -22 +201 start slave; +select count(*) from t3 /* must be zero */; +count(*) +0 commit; select * from t1; a @@ -142,7 +150,7 @@ a 1 select * from t2; a -22 +201 show slave status; Slave_IO_State # Master_Host 127.0.0.1 @@ -150,7 +158,7 @@ Master_User root Master_Port MASTER_MYPORT Connect_Retry 1 Master_Log_File master-bin.000001 -Read_Master_Log_Pos 18911 +Read_Master_Log_Pos 18918 Relay_Log_File # Relay_Log_Pos # Relay_Master_Log_File master-bin.000001 @@ -165,7 +173,7 @@ Replicate_Wild_Ignore_Table Last_Errno 0 Last_Error Skip_Counter 0 -Exec_Master_Log_Pos 18911 +Exec_Master_Log_Pos 18918 Relay_Log_Space # Until_Condition None Until_Log_File @@ -177,4 +185,91 @@ Master_SSL_Cert Master_SSL_Cipher Master_SSL_Key Seconds_Behind_Master # +drop table if exists t1; +create table t1 (f int unique) engine=innodb; +insert into t1 values (0); +begin; +select * from t1 where f = 0 for update; +f +0 +begin; +insert into t1 values (1); +update t1 set f=-1 where f = 0; +commit; +insert into t1 values (2); +show slave status; +Slave_IO_State # +Master_Host 127.0.0.1 +Master_User root +Master_Port MASTER_MYPORT +Connect_Retry 1 +Master_Log_File master-bin.000001 +Read_Master_Log_Pos 19618 +Relay_Log_File # +Relay_Log_Pos # +Relay_Master_Log_File master-bin.000001 +Slave_IO_Running # +Slave_SQL_Running No +Replicate_Do_DB +Replicate_Ignore_DB +Replicate_Do_Table +Replicate_Ignore_Table +Replicate_Wild_Do_Table +Replicate_Wild_Ignore_Table +Last_Errno 1205 +Last_Error Error 'Lock wait timeout exceeded; try restarting transaction' on query. Default database: 'test'. Query: 'update t1 set f=-1 where f = 0' +Skip_Counter 0 +Exec_Master_Log_Pos 19227 +Relay_Log_Space # +Until_Condition None +Until_Log_File +Until_Log_Pos 0 +Master_SSL_Allowed No +Master_SSL_CA_File +Master_SSL_CA_Path +Master_SSL_Cert +Master_SSL_Cipher +Master_SSL_Key +Seconds_Behind_Master # +set @@global.sql_slave_skip_counter = 4; +start slave; +show slave status; +Slave_IO_State # +Master_Host 127.0.0.1 +Master_User root +Master_Port MASTER_MYPORT +Connect_Retry 1 +Master_Log_File master-bin.000001 +Read_Master_Log_Pos 19618 +Relay_Log_File # +Relay_Log_Pos # +Relay_Master_Log_File master-bin.000001 +Slave_IO_Running # +Slave_SQL_Running Yes +Replicate_Do_DB +Replicate_Ignore_DB +Replicate_Do_Table +Replicate_Ignore_Table +Replicate_Wild_Do_Table +Replicate_Wild_Ignore_Table +Last_Errno 0 +Last_Error +Skip_Counter 0 +Exec_Master_Log_Pos 19618 +Relay_Log_Space # +Until_Condition None +Until_Log_File +Until_Log_Pos 0 +Master_SSL_Allowed No +Master_SSL_CA_File +Master_SSL_CA_Path +Master_SSL_Cert +Master_SSL_Cipher +Master_SSL_Key +Seconds_Behind_Master # +select * from t1; +f +0 +2 +commit; drop table t1,t2,t3,t4; --- 1.12/mysql-test/t/rpl_deadlock.test 2006-10-07 22:02:51 +03:00 +++ 1.13/mysql-test/t/rpl_deadlock.test 2006-10-07 22:02:51 +03:00 @@ -16,7 +16,8 @@ source include/master-slave.inc; connection master; create table t1 (a int not null, key(a)) engine=innodb; create table t2 (a int not null, key(a)) engine=innodb; -create table t3 (a int) engine=innodb; +# requiring 'unique' for the timeout part of the test +create table t3 (a int unique) engine=innodb; create table t4 (a int) engine=innodb; show variables like 'slave_transaction_retries'; sync_slave_with_master; @@ -31,8 +32,7 @@ stop slave; connection master; begin; # Let's keep BEGIN and the locked statement in two different relay logs. -let $1=200; -disable_query_log; +let $1=200;disable_query_log; while ($1) { eval insert into t3 values( $1 ); @@ -59,7 +59,7 @@ enable_query_log; select * from t1 for update; start slave; --real_sleep 3 # hope that slave is blocked now -insert into t2 values(22); # provoke deadlock, slave should be victim +insert into t2 values(201); # provoke deadlock, slave should be victim commit; sync_with_master; select * from t1; # check that slave succeeded finally @@ -74,11 +74,13 @@ show slave status; # 2) Test lock wait timeout stop slave; -change master to master_log_pos=532; # the BEGIN log event +delete from t3; +change master to master_log_pos=539; # the BEGIN log event begin; select * from t2 for update; # hold lock start slave; --real_sleep 10 # slave should have blocked, and be retrying +select count(*) from t3 /* must be zero */; # replaying begins after rollback commit; sync_with_master; select * from t1; # check that slave succeeded finally @@ -97,11 +99,13 @@ set global max_relay_log_size=0; # This is really copy-paste of 2) of above stop slave; -change master to master_log_pos=532; +delete from t3; +change master to master_log_pos=539; begin; select * from t2 for update; start slave; --real_sleep 10 +select count(*) from t3 /* must be zero */; # replaying begins after rollback commit; sync_with_master; select * from t1;