Below is the list of changes that have just been committed into a local
5.0 repository of elkin. When elkin does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html
ChangeSet@stripped, 2006-10-11 10:16:37+03:00, aelkin@stripped +3 -0
BUG#20697 slave fails to rollback replicated transaction hang over innodb_lock_wait_timeou
Transaction on the slave sql thread got blocked against a slave's mysqld local ta's
lock. Since the default, slave-transaction-retries=10, there was replaying of the
replicated ta. That failed because of a new started from 5.0.13 policy not to rollback
a timed-out transaction. Effectively the first round of a timed-out ta becomes committed
by the replaying's first "BEGIN".
It was decided to backport already existed method working in 5.1 implemented in
bug #16228 for handling symmetrical deadlock problem. That patch introduced end_trans
execution whenever a replicated ta deadlocks or timed-out.
Note, that this solution can be practically suboptimal - in the light of the changed behavior
due to timeout we still could replay only the last statement - only with a high rate of timeouting
replicated transactions.
mysql-test/r/rpl_deadlock.result@stripped, 2006-10-11 10:16:34+03:00, aelkin@stripped +110 -15
results changed
mysql-test/t/rpl_deadlock.test@stripped, 2006-10-11 10:16:34+03:00, aelkin@stripped +10 -6
Refining the timeout part of the test to display that the timed-out transaction
is rolled back prior its replaying by slave sql.
Non-zero select's count would mean the first round work became persistent - wrong.
sql/slave.cc@stripped, 2006-10-11 10:16:34+03:00, aelkin@stripped +16 -7
applying bug#16228 fix, approbated for deadlock use case in 5.1, almost verbatim.
Another alternative to replay only the offending statement requires significant
efforts, incl design work.
# This is a BitKeeper patch. What follows are the unified diffs for the
# set of deltas contained in the patch. The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User: aelkin
# Host: dsl-hkigw8-feb9fb00-191.dhcp.inet.fi
# Root: /home/elkin/MySQL/TEAM/FIXES/5.0/bug20697_slave_msta_retry
--- 1.279/sql/slave.cc 2006-10-11 10:16:44 +03:00
+++ 1.280/sql/slave.cc 2006-10-11 10:16:44 +03:00
@@ -3345,9 +3345,9 @@ static int exec_relay_log_event(THD* thd
const char *errmsg;
/*
We were in a transaction which has been rolled back because of a
- deadlock (currently, InnoDB deadlock detected by InnoDB) or lock
- wait timeout (innodb_lock_wait_timeout exceeded); let's seek back to
- BEGIN log event and retry it all again.
+ Sonera deadlock. if lock wait timeout (innodb_lock_wait_timeout exceeded)
+ there is no rollback since 5.0.13 (ref: manual).
+ let's seek back to BEGIN log event and retry it all again.
We have to not only seek but also
a) init_master_info(), to seek back to hot relay log's start for later
(for when we will come back to this hot log after re-processing the
@@ -3369,6 +3369,7 @@ static int exec_relay_log_event(THD* thd
else
{
exec_res= 0;
+ end_trans(thd, ROLLBACK);
/* chance for concurrent connection to get more locks */
safe_sleep(thd, min(rli->trans_retries, MAX_SLAVE_RETRY_PAUSE),
(CHECK_KILLED_FUNC)sql_slave_killed, (void*)rli);
@@ -3386,9 +3387,17 @@ static int exec_relay_log_event(THD* thd
"the slave_transaction_retries variable.",
slave_trans_retries);
}
- if (!((thd->options & OPTION_BEGIN) && opt_using_transactions))
- rli->trans_retries= 0; // restart from fresh
- }
+ else if (!((thd->options & OPTION_BEGIN) && opt_using_transactions))
+ {
+ /*
+ Only reset the retry counter if the event succeeded or
+ failed with a non-transient error. On a successful event,
+ the execution will proceed as usual; in the case of a
+ non-transient error, the slave will stop with an error.
+ */
+ rli->trans_retries= 0; // restart from fresh
+ }
+ }
return exec_res;
}
else
@@ -4613,7 +4622,7 @@ static int connect_to_master(THD* thd, M
suppress_warnings= 0;
sql_print_error("Slave I/O thread: error %s to master \
'%s@%s:%d': \
-Error: '%s' errno: %d retry-time: %d retries: %d",
+Error: '%s' errno: %d retry-time: %d retries: %lu",
(reconnect ? "reconnecting" : "connecting"),
mi->user,mi->host,mi->port,
mysql_error(mysql), last_errno,
--- 1.10/mysql-test/r/rpl_deadlock.result 2006-10-11 10:16:44 +03:00
+++ 1.11/mysql-test/r/rpl_deadlock.result 2006-10-11 10:16:44 +03:00
@@ -6,7 +6,7 @@ drop table if exists t1,t2,t3,t4,t5,t6,t
start slave;
create table t1 (a int not null, key(a)) engine=innodb;
create table t2 (a int not null, key(a)) engine=innodb;
-create table t3 (a int) engine=innodb;
+create table t3 (a int unique) engine=innodb;
create table t4 (a int) engine=innodb;
show variables like 'slave_transaction_retries';
Variable_name Value
@@ -35,14 +35,14 @@ begin;
select * from t1 for update;
a
start slave;
-insert into t2 values(22);
+insert into t2 values(201);
commit;
select * from t1;
a
1
select * from t2;
a
-22
+201
show slave status;
Slave_IO_State #
Master_Host 127.0.0.1
@@ -50,7 +50,7 @@ Master_User root
Master_Port MASTER_MYPORT
Connect_Retry 1
Master_Log_File master-bin.000001
-Read_Master_Log_Pos 18911
+Read_Master_Log_Pos 18918
Relay_Log_File #
Relay_Log_Pos #
Relay_Master_Log_File master-bin.000001
@@ -65,7 +65,7 @@ Replicate_Wild_Ignore_Table
Last_Errno 0
Last_Error
Skip_Counter 0
-Exec_Master_Log_Pos 18911
+Exec_Master_Log_Pos 18918
Relay_Log_Space #
Until_Condition None
Until_Log_File
@@ -78,12 +78,16 @@ Master_SSL_Cipher
Master_SSL_Key
Seconds_Behind_Master #
stop slave;
-change master to master_log_pos=532;
+delete from t3;
+change master to master_log_pos=539;
begin;
select * from t2 for update;
a
-22
+201
start slave;
+select count(*) from t3 /* must be zero */;
+count(*)
+0
commit;
select * from t1;
a
@@ -91,7 +95,7 @@ a
1
select * from t2;
a
-22
+201
show slave status;
Slave_IO_State #
Master_Host 127.0.0.1
@@ -99,7 +103,7 @@ Master_User root
Master_Port MASTER_MYPORT
Connect_Retry 1
Master_Log_File master-bin.000001
-Read_Master_Log_Pos 18911
+Read_Master_Log_Pos 18918
Relay_Log_File #
Relay_Log_Pos #
Relay_Master_Log_File master-bin.000001
@@ -114,7 +118,7 @@ Replicate_Wild_Ignore_Table
Last_Errno 0
Last_Error
Skip_Counter 0
-Exec_Master_Log_Pos 18911
+Exec_Master_Log_Pos 18918
Relay_Log_Space #
Until_Condition None
Until_Log_File
@@ -128,12 +132,16 @@ Master_SSL_Key
Seconds_Behind_Master #
set global max_relay_log_size=0;
stop slave;
-change master to master_log_pos=532;
+delete from t3;
+change master to master_log_pos=539;
begin;
select * from t2 for update;
a
-22
+201
start slave;
+select count(*) from t3 /* must be zero */;
+count(*)
+0
commit;
select * from t1;
a
@@ -142,7 +150,7 @@ a
1
select * from t2;
a
-22
+201
show slave status;
Slave_IO_State #
Master_Host 127.0.0.1
@@ -150,7 +158,7 @@ Master_User root
Master_Port MASTER_MYPORT
Connect_Retry 1
Master_Log_File master-bin.000001
-Read_Master_Log_Pos 18911
+Read_Master_Log_Pos 18918
Relay_Log_File #
Relay_Log_Pos #
Relay_Master_Log_File master-bin.000001
@@ -165,7 +173,7 @@ Replicate_Wild_Ignore_Table
Last_Errno 0
Last_Error
Skip_Counter 0
-Exec_Master_Log_Pos 18911
+Exec_Master_Log_Pos 18918
Relay_Log_Space #
Until_Condition None
Until_Log_File
@@ -177,4 +185,91 @@ Master_SSL_Cert
Master_SSL_Cipher
Master_SSL_Key
Seconds_Behind_Master #
+drop table if exists t1;
+create table t1 (f int unique) engine=innodb;
+insert into t1 values (0);
+begin;
+select * from t1 where f = 0 for update;
+f
+0
+begin;
+insert into t1 values (1);
+update t1 set f=-1 where f = 0;
+commit;
+insert into t1 values (2);
+show slave status;
+Slave_IO_State #
+Master_Host 127.0.0.1
+Master_User root
+Master_Port MASTER_MYPORT
+Connect_Retry 1
+Master_Log_File master-bin.000001
+Read_Master_Log_Pos 19618
+Relay_Log_File #
+Relay_Log_Pos #
+Relay_Master_Log_File master-bin.000001
+Slave_IO_Running #
+Slave_SQL_Running No
+Replicate_Do_DB
+Replicate_Ignore_DB
+Replicate_Do_Table
+Replicate_Ignore_Table
+Replicate_Wild_Do_Table
+Replicate_Wild_Ignore_Table
+Last_Errno 1205
+Last_Error Error 'Lock wait timeout exceeded; try restarting transaction' on query. Default database: 'test'. Query: 'update t1 set f=-1 where f = 0'
+Skip_Counter 0
+Exec_Master_Log_Pos 19227
+Relay_Log_Space #
+Until_Condition None
+Until_Log_File
+Until_Log_Pos 0
+Master_SSL_Allowed No
+Master_SSL_CA_File
+Master_SSL_CA_Path
+Master_SSL_Cert
+Master_SSL_Cipher
+Master_SSL_Key
+Seconds_Behind_Master #
+set @@global.sql_slave_skip_counter = 4;
+start slave;
+show slave status;
+Slave_IO_State #
+Master_Host 127.0.0.1
+Master_User root
+Master_Port MASTER_MYPORT
+Connect_Retry 1
+Master_Log_File master-bin.000001
+Read_Master_Log_Pos 19618
+Relay_Log_File #
+Relay_Log_Pos #
+Relay_Master_Log_File master-bin.000001
+Slave_IO_Running #
+Slave_SQL_Running Yes
+Replicate_Do_DB
+Replicate_Ignore_DB
+Replicate_Do_Table
+Replicate_Ignore_Table
+Replicate_Wild_Do_Table
+Replicate_Wild_Ignore_Table
+Last_Errno 0
+Last_Error
+Skip_Counter 0
+Exec_Master_Log_Pos 19618
+Relay_Log_Space #
+Until_Condition None
+Until_Log_File
+Until_Log_Pos 0
+Master_SSL_Allowed No
+Master_SSL_CA_File
+Master_SSL_CA_Path
+Master_SSL_Cert
+Master_SSL_Cipher
+Master_SSL_Key
+Seconds_Behind_Master #
+select * from t1;
+f
+0
+2
+commit;
drop table t1,t2,t3,t4;
--- 1.12/mysql-test/t/rpl_deadlock.test 2006-10-11 10:16:44 +03:00
+++ 1.13/mysql-test/t/rpl_deadlock.test 2006-10-11 10:16:44 +03:00
@@ -16,7 +16,8 @@ source include/master-slave.inc;
connection master;
create table t1 (a int not null, key(a)) engine=innodb;
create table t2 (a int not null, key(a)) engine=innodb;
-create table t3 (a int) engine=innodb;
+# requiring 'unique' for the timeout part of the test
+create table t3 (a int unique) engine=innodb;
create table t4 (a int) engine=innodb;
show variables like 'slave_transaction_retries';
sync_slave_with_master;
@@ -31,8 +32,7 @@ stop slave;
connection master;
begin;
# Let's keep BEGIN and the locked statement in two different relay logs.
-let $1=200;
-disable_query_log;
+let $1=200;disable_query_log;
while ($1)
{
eval insert into t3 values( $1 );
@@ -59,7 +59,7 @@ enable_query_log;
select * from t1 for update;
start slave;
--real_sleep 3 # hope that slave is blocked now
-insert into t2 values(22); # provoke deadlock, slave should be victim
+insert into t2 values(201); # provoke deadlock, slave should be victim
commit;
sync_with_master;
select * from t1; # check that slave succeeded finally
@@ -74,11 +74,13 @@ show slave status;
# 2) Test lock wait timeout
stop slave;
-change master to master_log_pos=532; # the BEGIN log event
+delete from t3;
+change master to master_log_pos=539; # the BEGIN log event
begin;
select * from t2 for update; # hold lock
start slave;
--real_sleep 10 # slave should have blocked, and be retrying
+select count(*) from t3 /* must be zero */; # replaying begins after rollback
commit;
sync_with_master;
select * from t1; # check that slave succeeded finally
@@ -97,11 +99,13 @@ set global max_relay_log_size=0;
# This is really copy-paste of 2) of above
stop slave;
-change master to master_log_pos=532;
+delete from t3;
+change master to master_log_pos=539;
begin;
select * from t2 for update;
start slave;
--real_sleep 10
+select count(*) from t3 /* must be zero */; # replaying begins after rollback
commit;
sync_with_master;
select * from t1;
| Thread |
|---|
| • bk commit into 5.0 tree (aelkin:1.2295) BUG#16228 | Andrei Elkin | 11 Oct |