List:Commits« Previous MessageNext Message »
From:Andrei Elkin Date:October 11 2006 7:16am
Subject:bk commit into 5.0 tree (aelkin:1.2295) BUG#16228
View as plain text  
Below is the list of changes that have just been committed into a local
5.0 repository of elkin. When elkin does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet@stripped, 2006-10-11 10:16:37+03:00, aelkin@stripped +3 -0
  BUG#20697 slave fails to rollback replicated transaction hang over innodb_lock_wait_timeou
  
  Transaction on the slave sql thread got blocked against a slave's mysqld local ta's
  lock. Since the default, slave-transaction-retries=10, there was replaying of the 
  replicated ta. That failed because of a new started from 5.0.13 policy not to rollback
  a timed-out transaction. Effectively the first round of a timed-out ta becomes committed
  by the replaying's first "BEGIN".
  
  It was decided to backport already existed method working in 5.1 implemented in
  bug #16228 for handling symmetrical deadlock problem. That patch introduced end_trans
  execution whenever a replicated ta deadlocks or timed-out.
  
  Note, that this solution can be practically suboptimal - in the light of the changed behavior
  due to timeout we still could replay only the last statement -  only with a high rate of timeouting
  replicated transactions.

  mysql-test/r/rpl_deadlock.result@stripped, 2006-10-11 10:16:34+03:00, aelkin@stripped +110 -15
    results changed

  mysql-test/t/rpl_deadlock.test@stripped, 2006-10-11 10:16:34+03:00, aelkin@stripped +10 -6
    Refining the timeout part of the test to display that the timed-out transaction
    is rolled back prior its replaying by slave sql.
    Non-zero select's count would mean the first round work became persistent - wrong.

  sql/slave.cc@stripped, 2006-10-11 10:16:34+03:00, aelkin@stripped +16 -7
    applying bug#16228 fix, approbated for deadlock use case in 5.1, almost verbatim. 
    Another alternative to replay only the offending statement requires significant
    efforts, incl design work.

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User:	aelkin
# Host:	dsl-hkigw8-feb9fb00-191.dhcp.inet.fi
# Root:	/home/elkin/MySQL/TEAM/FIXES/5.0/bug20697_slave_msta_retry

--- 1.279/sql/slave.cc	2006-10-11 10:16:44 +03:00
+++ 1.280/sql/slave.cc	2006-10-11 10:16:44 +03:00
@@ -3345,9 +3345,9 @@ static int exec_relay_log_event(THD* thd
         const char *errmsg;
         /*
           We were in a transaction which has been rolled back because of a
-          deadlock (currently, InnoDB deadlock detected by InnoDB) or lock
-          wait timeout (innodb_lock_wait_timeout exceeded); let's seek back to
-          BEGIN log event and retry it all again.
+        Sonera  deadlock. if lock wait timeout (innodb_lock_wait_timeout exceeded)
+	  there is no rollback since 5.0.13 (ref: manual).
+	  let's seek back to BEGIN log event and retry it all again.
           We have to not only seek but also
           a) init_master_info(), to seek back to hot relay log's start for later
           (for when we will come back to this hot log after re-processing the
@@ -3369,6 +3369,7 @@ static int exec_relay_log_event(THD* thd
           else
           {
             exec_res= 0;
+	    end_trans(thd, ROLLBACK);
 	    /* chance for concurrent connection to get more locks */
             safe_sleep(thd, min(rli->trans_retries, MAX_SLAVE_RETRY_PAUSE),
 		       (CHECK_KILLED_FUNC)sql_slave_killed, (void*)rli);
@@ -3386,9 +3387,17 @@ static int exec_relay_log_event(THD* thd
                           "the slave_transaction_retries variable.",
                           slave_trans_retries);
       }
-      if (!((thd->options & OPTION_BEGIN) && opt_using_transactions))
-         rli->trans_retries= 0; // restart from fresh
-     }
+      else if (!((thd->options & OPTION_BEGIN) && opt_using_transactions))
+      {
+        /*
+          Only reset the retry counter if the event succeeded or
+          failed with a non-transient error.  On a successful event,
+          the execution will proceed as usual; in the case of a
+          non-transient error, the slave will stop with an error.
+	*/
+        rli->trans_retries= 0; // restart from fresh
+      }
+    }
     return exec_res;
   }
   else
@@ -4613,7 +4622,7 @@ static int connect_to_master(THD* thd, M
       suppress_warnings= 0;
       sql_print_error("Slave I/O thread: error %s to master \
 '%s@%s:%d': \
-Error: '%s'  errno: %d  retry-time: %d  retries: %d",
+Error: '%s'  errno: %d  retry-time: %d  retries: %lu",
 		      (reconnect ? "reconnecting" : "connecting"),
 		      mi->user,mi->host,mi->port,
 		      mysql_error(mysql), last_errno,

--- 1.10/mysql-test/r/rpl_deadlock.result	2006-10-11 10:16:44 +03:00
+++ 1.11/mysql-test/r/rpl_deadlock.result	2006-10-11 10:16:44 +03:00
@@ -6,7 +6,7 @@ drop table if exists t1,t2,t3,t4,t5,t6,t
 start slave;
 create table t1 (a int not null, key(a)) engine=innodb;
 create table t2 (a int not null, key(a)) engine=innodb;
-create table t3 (a int) engine=innodb;
+create table t3 (a int unique) engine=innodb;
 create table t4 (a int) engine=innodb;
 show variables like 'slave_transaction_retries';
 Variable_name	Value
@@ -35,14 +35,14 @@ begin;
 select * from t1 for update;
 a
 start slave;
-insert into t2 values(22);
+insert into t2 values(201);
 commit;
 select * from t1;
 a
 1
 select * from t2;
 a
-22
+201
 show slave status;
 Slave_IO_State	#
 Master_Host	127.0.0.1
@@ -50,7 +50,7 @@ Master_User	root
 Master_Port	MASTER_MYPORT
 Connect_Retry	1
 Master_Log_File	master-bin.000001
-Read_Master_Log_Pos	18911
+Read_Master_Log_Pos	18918
 Relay_Log_File	#
 Relay_Log_Pos	#
 Relay_Master_Log_File	master-bin.000001
@@ -65,7 +65,7 @@ Replicate_Wild_Ignore_Table	
 Last_Errno	0
 Last_Error	
 Skip_Counter	0
-Exec_Master_Log_Pos	18911
+Exec_Master_Log_Pos	18918
 Relay_Log_Space	#
 Until_Condition	None
 Until_Log_File	
@@ -78,12 +78,16 @@ Master_SSL_Cipher	
 Master_SSL_Key	
 Seconds_Behind_Master	#
 stop slave;
-change master to master_log_pos=532;
+delete from t3;
+change master to master_log_pos=539;
 begin;
 select * from t2 for update;
 a
-22
+201
 start slave;
+select count(*) from t3  /* must be zero */;
+count(*)
+0
 commit;
 select * from t1;
 a
@@ -91,7 +95,7 @@ a
 1
 select * from t2;
 a
-22
+201
 show slave status;
 Slave_IO_State	#
 Master_Host	127.0.0.1
@@ -99,7 +103,7 @@ Master_User	root
 Master_Port	MASTER_MYPORT
 Connect_Retry	1
 Master_Log_File	master-bin.000001
-Read_Master_Log_Pos	18911
+Read_Master_Log_Pos	18918
 Relay_Log_File	#
 Relay_Log_Pos	#
 Relay_Master_Log_File	master-bin.000001
@@ -114,7 +118,7 @@ Replicate_Wild_Ignore_Table	
 Last_Errno	0
 Last_Error	
 Skip_Counter	0
-Exec_Master_Log_Pos	18911
+Exec_Master_Log_Pos	18918
 Relay_Log_Space	#
 Until_Condition	None
 Until_Log_File	
@@ -128,12 +132,16 @@ Master_SSL_Key	
 Seconds_Behind_Master	#
 set global max_relay_log_size=0;
 stop slave;
-change master to master_log_pos=532;
+delete from t3;
+change master to master_log_pos=539;
 begin;
 select * from t2 for update;
 a
-22
+201
 start slave;
+select count(*) from t3  /* must be zero */;
+count(*)
+0
 commit;
 select * from t1;
 a
@@ -142,7 +150,7 @@ a
 1
 select * from t2;
 a
-22
+201
 show slave status;
 Slave_IO_State	#
 Master_Host	127.0.0.1
@@ -150,7 +158,7 @@ Master_User	root
 Master_Port	MASTER_MYPORT
 Connect_Retry	1
 Master_Log_File	master-bin.000001
-Read_Master_Log_Pos	18911
+Read_Master_Log_Pos	18918
 Relay_Log_File	#
 Relay_Log_Pos	#
 Relay_Master_Log_File	master-bin.000001
@@ -165,7 +173,7 @@ Replicate_Wild_Ignore_Table	
 Last_Errno	0
 Last_Error	
 Skip_Counter	0
-Exec_Master_Log_Pos	18911
+Exec_Master_Log_Pos	18918
 Relay_Log_Space	#
 Until_Condition	None
 Until_Log_File	
@@ -177,4 +185,91 @@ Master_SSL_Cert	
 Master_SSL_Cipher	
 Master_SSL_Key	
 Seconds_Behind_Master	#
+drop table if exists t1;
+create table t1 (f int unique) engine=innodb;
+insert into t1  values (0);
+begin;
+select * from t1 where f = 0 for update;
+f
+0
+begin;
+insert into t1 values (1);
+update t1 set f=-1 where f = 0;
+commit;
+insert into t1 values (2);
+show slave status;
+Slave_IO_State	#
+Master_Host	127.0.0.1
+Master_User	root
+Master_Port	MASTER_MYPORT
+Connect_Retry	1
+Master_Log_File	master-bin.000001
+Read_Master_Log_Pos	19618
+Relay_Log_File	#
+Relay_Log_Pos	#
+Relay_Master_Log_File	master-bin.000001
+Slave_IO_Running	#
+Slave_SQL_Running	No
+Replicate_Do_DB	
+Replicate_Ignore_DB	
+Replicate_Do_Table	
+Replicate_Ignore_Table	
+Replicate_Wild_Do_Table	
+Replicate_Wild_Ignore_Table	
+Last_Errno	1205
+Last_Error	Error 'Lock wait timeout exceeded; try restarting transaction' on query. Default database: 'test'. Query: 'update t1 set f=-1 where f = 0'
+Skip_Counter	0
+Exec_Master_Log_Pos	19227
+Relay_Log_Space	#
+Until_Condition	None
+Until_Log_File	
+Until_Log_Pos	0
+Master_SSL_Allowed	No
+Master_SSL_CA_File	
+Master_SSL_CA_Path	
+Master_SSL_Cert	
+Master_SSL_Cipher	
+Master_SSL_Key	
+Seconds_Behind_Master	#
+set @@global.sql_slave_skip_counter = 4;
+start slave;
+show slave status;
+Slave_IO_State	#
+Master_Host	127.0.0.1
+Master_User	root
+Master_Port	MASTER_MYPORT
+Connect_Retry	1
+Master_Log_File	master-bin.000001
+Read_Master_Log_Pos	19618
+Relay_Log_File	#
+Relay_Log_Pos	#
+Relay_Master_Log_File	master-bin.000001
+Slave_IO_Running	#
+Slave_SQL_Running	Yes
+Replicate_Do_DB	
+Replicate_Ignore_DB	
+Replicate_Do_Table	
+Replicate_Ignore_Table	
+Replicate_Wild_Do_Table	
+Replicate_Wild_Ignore_Table	
+Last_Errno	0
+Last_Error	
+Skip_Counter	0
+Exec_Master_Log_Pos	19618
+Relay_Log_Space	#
+Until_Condition	None
+Until_Log_File	
+Until_Log_Pos	0
+Master_SSL_Allowed	No
+Master_SSL_CA_File	
+Master_SSL_CA_Path	
+Master_SSL_Cert	
+Master_SSL_Cipher	
+Master_SSL_Key	
+Seconds_Behind_Master	#
+select * from t1;
+f
+0
+2
+commit;
 drop table t1,t2,t3,t4;

--- 1.12/mysql-test/t/rpl_deadlock.test	2006-10-11 10:16:44 +03:00
+++ 1.13/mysql-test/t/rpl_deadlock.test	2006-10-11 10:16:44 +03:00
@@ -16,7 +16,8 @@ source include/master-slave.inc;
 connection master;
 create table t1 (a int not null, key(a)) engine=innodb;
 create table t2 (a int not null, key(a)) engine=innodb;
-create table t3 (a int) engine=innodb;
+# requiring 'unique' for the timeout part of the test
+create table t3 (a int unique) engine=innodb;
 create table t4 (a int) engine=innodb;
 show variables like 'slave_transaction_retries';
 sync_slave_with_master;
@@ -31,8 +32,7 @@ stop slave;
 connection master;
 begin;
 # Let's keep BEGIN and the locked statement in two different relay logs.
-let $1=200;
-disable_query_log;
+let $1=200;disable_query_log;
 while ($1)
 {
  eval insert into t3 values( $1 );
@@ -59,7 +59,7 @@ enable_query_log;
 select * from t1 for update;
 start slave;
 --real_sleep 3 # hope that slave is blocked now
-insert into t2 values(22); # provoke deadlock, slave should be victim
+insert into t2 values(201); # provoke deadlock, slave should be victim
 commit;
 sync_with_master;
 select * from t1; # check that slave succeeded finally
@@ -74,11 +74,13 @@ show slave status;
 # 2) Test lock wait timeout
 
 stop slave;
-change master to master_log_pos=532; # the BEGIN log event
+delete from t3;
+change master to master_log_pos=539; # the BEGIN log event
 begin;
 select * from t2 for update; # hold lock
 start slave;
 --real_sleep 10 # slave should have blocked, and be retrying
+select count(*) from t3  /* must be zero */; # replaying begins after rollback
 commit;
 sync_with_master;
 select * from t1; # check that slave succeeded finally
@@ -97,11 +99,13 @@ set global max_relay_log_size=0;
 
 # This is really copy-paste of 2) of above
 stop slave;
-change master to master_log_pos=532;
+delete from t3;
+change master to master_log_pos=539;
 begin;
 select * from t2 for update;
 start slave;
 --real_sleep 10
+select count(*) from t3  /* must be zero */; # replaying begins after rollback
 commit;
 sync_with_master;
 select * from t1;
Thread
bk commit into 5.0 tree (aelkin:1.2295) BUG#16228Andrei Elkin11 Oct