Below is the list of changes that have just been committed into a local
4.1 repository of gbichot. When gbichot does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html
ChangeSet
1.2142 05/03/23 19:19:36 gbichot@stripped +9 -0
"After Monty's review" changes to the fix for BUG#8325 "Deadlock in replication thread
stops replication":
s/sleep/safe_sleep (thread safe); sleep 0/1/2/3/4/5/5/5 (get slave less late);
no message on error log (deadlock is too common sometimes), a global counter
instead (SHOW STATUS LIKE 'slave_retried_transactions').
Plus a fix for libmysql/Makefile.shared
BitKeeper/etc/logging_ok
1.373 05/03/23 19:19:35 gbichot@stripped +1 -0
Logging to logging@stripped accepted
sql/structs.h
1.39 05/03/23 19:19:10 gbichot@stripped +1 -1
new SHOW_SLAVE_RETRIED_TRANS
sql/sql_show.cc
1.197 05/03/23 19:19:09 gbichot@stripped +13 -0
SHOW STATUS LIKE "slave_retried_transactions"; needs replication mutexes.
Can't be a simple SHOW_LONG, because active_mi is unset (not alloced yet)
when the static global status_vars is created (active_mi is set
in init_slave()).
sql/slave.h
1.85 05/03/23 19:19:09 gbichot@stripped +8 -1
new global counter rli->retried_trans
sql/slave.cc
1.268 05/03/23 19:19:09 gbichot@stripped +16 -17
If slave retries automatically a transaction, no message on error log
(too common situation); sleep 0 secs at first retry, then 1, 2, 3, 4,
5, 5, 5... Sleeping 0 is to get the least possible late, as deadlocks
are usually resolved at first try. New global counter rli->retried_trans
(for SHOW STATUS: total number of times the slave had to retry
any transaction). safe_sleep() is thread-safe, sleep() was not.
I change the rli->trans_retries counter to go from 0 to max instead
of the other way (better for new sleep()).
sql/mysqld.cc
1.561 05/03/23 19:19:09 gbichot@stripped +13 -3
if active_mi could not be alloced, die. New SHOW STATUS LIKE
"slave_retried_transactions".
mysql-test/t/rpl_deadlock.test
1.2 05/03/23 19:19:08 gbichot@stripped +4 -0
small test addition
mysql-test/r/rpl_deadlock.result
1.2 05/03/23 19:19:08 gbichot@stripped +6 -0
result fix
libmysql/Makefile.shared
1.56 05/03/23 19:19:08 gbichot@stripped +2 -1
When we "make clean" in libmysql/ we remove the symlinks there, so we
need to mark that they have to be recreated later: this is done by removing
../linked_libmysql_sources. If we don't do this, 'make' will fail after 'cd
libmysql;make clean'.
This Makefile.shared is used by libmysql_r too.
No reason to remove linked_client_sources as we don't remove the links in client/.
# This is a BitKeeper patch. What follows are the unified diffs for the
# set of deltas contained in the patch. The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User: gbichot
# Host: quadita2.mysql.com
# Root: /nfstmp1/guilhem/mysql-4.1-4ita
--- 1.560/sql/mysqld.cc 2005-03-09 13:41:50 +01:00
+++ 1.561/sql/mysqld.cc 2005-03-23 19:19:09 +01:00
@@ -3062,8 +3062,17 @@
#endif
if (opt_bootstrap) /* If running with bootstrap, do not start replication. */
opt_skip_slave_start= 1;
- /* init_slave() must be called after the thread keys are created */
- init_slave();
+ /*
+ init_slave() must be called after the thread keys are created.
+ Some parts of the code (e.g. SHOW STATUS LIKE 'slave_running' and other
+ places) assume that active_mi != 0, so let's fail if it's 0 (out of
+ memory); a message has already been printed.
+ */
+ if (init_slave() && !active_mi)
+ {
+ end_thr_alarm(1); // Don't allow alarms
+ unireg_abort(1);
+ }
if (opt_bootstrap)
{
@@ -5494,7 +5503,8 @@
{"Select_range_check", (char*) &select_range_check_count, SHOW_LONG},
{"Select_scan", (char*) &select_scan_count, SHOW_LONG},
{"Slave_open_temp_tables", (char*) &slave_open_temp_tables, SHOW_LONG},
- {"Slave_running", (char*) 0, SHOW_SLAVE_RUNNING},
+ {"Slave_running", (char*) 0, SHOW_SLAVE_RUNNING},
+ {"Slave_retried_transactions",(char*) 0,
SHOW_SLAVE_RETRIED_TRANS},
{"Slow_launch_threads", (char*) &slow_launch_threads, SHOW_LONG},
{"Slow_queries", (char*) &long_query_count, SHOW_LONG},
{"Sort_merge_passes", (char*) &filesort_merge_passes, SHOW_LONG},
--- 1.267/sql/slave.cc 2005-03-02 11:29:32 +01:00
+++ 1.268/sql/slave.cc 2005-03-23 19:19:09 +01:00
@@ -27,6 +27,7 @@
#include <my_dir.h>
#include <sql_common.h>
+#define MAX_SLAVE_RETRY_PAUSE 5
bool use_slave_mask = 0;
MY_BITMAP slave_error_mask;
@@ -2335,7 +2336,7 @@
ignore_log_space_limit(0), last_master_timestamp(0), slave_skip_counter(0),
abort_pos_wait(0), slave_run_id(0), sql_thd(0), last_slave_errno(0),
inited(0), abort_slave(0), slave_running(0), until_condition(UNTIL_NONE),
- until_log_pos(0)
+ until_log_pos(0), retried_trans(0)
{
group_relay_log_name[0]= event_relay_log_name[0]=
group_master_log_name[0]= 0;
@@ -2980,9 +2981,8 @@
init_master_info()).
b) init_relay_log_pos(), because the BEGIN may be an older relay log.
*/
- if (rli->trans_retries--)
+ if (rli->trans_retries < slave_trans_retries)
{
- sql_print_information("Slave SQL thread retries transaction");
if (init_master_info(rli->mi, 0, 0, 0, SLAVE_SQL))
sql_print_error("Failed to initialize the master info structure");
else if (init_relay_log_pos(rli,
@@ -2994,8 +2994,16 @@
else
{
exec_res= 0;
- sleep(2); // chance for concurrent connection to get more locks
- }
+ /* chance for concurrent connection to get more locks */
+ safe_sleep(thd, min(rli->trans_retries, MAX_SLAVE_RETRY_PAUSE),
+ (CHECK_KILLED_FUNC)sql_slave_killed, (void*)rli);
+ pthread_mutex_lock(&rli->data_lock); // because of SHOW STATUS
+ rli->trans_retries++;
+ rli->retried_trans++;
+ pthread_mutex_unlock(&rli->data_lock);
+ DBUG_PRINT("info", ("Slave retries transaction "
+ "rli->trans_retries: %lu", rli->trans_retries));
+ }
}
else
sql_print_error("Slave SQL thread retried transaction %lu time(s) "
@@ -3004,17 +3012,8 @@
slave_trans_retries);
}
if (!((thd->options & OPTION_BEGIN) && opt_using_transactions))
- {
- rli->trans_retries= slave_trans_retries; // restart from fresh
- /*
- TODO: when merged into 5.0, when slave does auto-rollback if
- corrupted binlog, this should reset the retry counter too
- (any rollback should). In fact it will work, as here we are just out
- of a Format_description_log_event::exec_event() which rolled back.
- But check repl code in 5.0 for new ha_rollback calls, just in case.
- */
- }
- }
+ rli->trans_retries= 0; // restart from fresh
+ }
return exec_res;
}
else
@@ -3426,7 +3425,7 @@
pthread_mutex_lock(&rli->log_space_lock);
rli->ignore_log_space_limit= 0;
pthread_mutex_unlock(&rli->log_space_lock);
- rli->trans_retries= slave_trans_retries; // start from "no error"
+ rli->trans_retries= 0; // start from "no error"
if (init_relay_log_pos(rli,
rli->group_relay_log_name,
--- 1.196/sql/sql_show.cc 2005-03-15 23:23:59 +01:00
+++ 1.197/sql/sql_show.cc 2005-03-23 19:19:09 +01:00
@@ -1887,6 +1887,19 @@
pthread_mutex_unlock(&LOCK_active_mi);
break;
}
+ case SHOW_SLAVE_RETRIED_TRANS:
+ {
+ /*
+ TODO: in 5.1 with multimaster, have one such counter per line in SHOW
+ SLAVE STATUS, and have the sum over all lines here.
+ */
+ pthread_mutex_lock(&LOCK_active_mi);
+ pthread_mutex_lock(&active_mi->rli.data_lock);
+ end= int10_to_str(active_mi->rli.retried_trans, buff, 10);
+ pthread_mutex_unlock(&active_mi->rli.data_lock);
+ pthread_mutex_unlock(&LOCK_active_mi);
+ break;
+ }
#endif /* HAVE_REPLICATION */
case SHOW_OPENTABLES:
end= int10_to_str((long) cached_tables(), buff, 10);
--- 1.38/sql/structs.h 2004-09-08 00:07:03 +02:00
+++ 1.39/sql/structs.h 2005-03-23 19:19:10 +01:00
@@ -180,7 +180,7 @@
SHOW_SSL_CTX_SESS_TIMEOUTS, SHOW_SSL_CTX_SESS_CACHE_FULL,
SHOW_SSL_GET_CIPHER_LIST,
#endif /* HAVE_OPENSSL */
- SHOW_RPL_STATUS, SHOW_SLAVE_RUNNING,
+ SHOW_RPL_STATUS, SHOW_SLAVE_RUNNING, SHOW_SLAVE_RETRIED_TRANS,
SHOW_KEY_CACHE_LONG, SHOW_KEY_CACHE_CONST_LONG
};
--- 1.372/BitKeeper/etc/logging_ok 2005-03-21 21:13:01 +01:00
+++ 1.373/BitKeeper/etc/logging_ok 2005-03-23 19:19:35 +01:00
@@ -44,6 +44,7 @@
dlenev@stripped
dlenev@stripped
ejonore@stripped
+gbichot@stripped
gbichot@stripped
georg@stripped
georg@stripped
--- 1.1/mysql-test/r/rpl_deadlock.result 2005-03-02 11:29:32 +01:00
+++ 1.2/mysql-test/r/rpl_deadlock.result 2005-03-23 19:19:08 +01:00
@@ -8,6 +8,9 @@
create table t2 (a int not null, key(a)) engine=innodb;
create table t3 (a int) engine=innodb;
create table t4 (a int) engine=innodb;
+show variables like 'slave_transaction_retries';
+Variable_name Value
+slave_transaction_retries 0
show create table t1;
Table Create Table
t1 CREATE TABLE `t1` (
@@ -20,6 +23,9 @@
`a` int(11) NOT NULL default '0',
KEY `a` (`a`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1
+show variables like 'slave_transaction_retries';
+Variable_name Value
+slave_transaction_retries 2
stop slave;
begin;
insert into t3 select * from t2 for update;
--- 1.1/mysql-test/t/rpl_deadlock.test 2005-03-02 11:29:34 +01:00
+++ 1.2/mysql-test/t/rpl_deadlock.test 2005-03-23 19:19:08 +01:00
@@ -7,6 +7,8 @@
# (Guilhem) have seen the test manage to provoke lock wait timeout
# error but not deadlock error; that is ok as code deals with the two
# errors in exactly the same way.
+# We don't 'show status like 'slave_retried_transactions'' because this
+# is not repeatable (depends on sleeps).
source include/have_innodb.inc;
source include/master-slave.inc;
@@ -16,10 +18,12 @@
create table t2 (a int not null, key(a)) engine=innodb;
create table t3 (a int) engine=innodb;
create table t4 (a int) engine=innodb;
+show variables like 'slave_transaction_retries';
sync_slave_with_master;
show create table t1;
show create table t2;
+show variables like 'slave_transaction_retries';
stop slave;
# 1) Test deadlock
--- 1.84/sql/slave.h 2005-03-02 11:29:32 +01:00
+++ 1.85/sql/slave.h 2005-03-23 19:19:09 +01:00
@@ -295,7 +295,14 @@
UNTIL_LOG_NAMES_CMP_EQUAL= 0, UNTIL_LOG_NAMES_CMP_GREATER= 1
} until_log_names_cmp_result;
- ulong trans_retries;
+ /*
+ trans_retries varies between 0 to slave_transaction_retries and counts how
+ many times the slave has retried the present transaction; gets reset to 0
+ when the transaction finally succeeds. retried_trans is a cumulative
+ counter: how many times the slave has retried a transaction (any) since
+ slave started.
+ */
+ ulong trans_retries, retried_trans;
st_relay_log_info();
~st_relay_log_info();
--- 1.55/libmysql/Makefile.shared 2004-09-25 14:20:33 +02:00
+++ 1.56/libmysql/Makefile.shared 2005-03-23 19:19:08 +01:00
@@ -94,7 +94,8 @@
`echo $(sql_cmn_objects) | sed "s;\.lo;.c;g"` \
$(CHARSET_SRCS) $(CHARSET_OBJS) \
$(mystringsextra) $(mysysheaders) $(vioheaders)\
- ../linked_client_sources net.c
+ ../linked_libmysql_sources ../linked_libmysql_r_sources \
+ net.c
conf_to_src_SOURCES = conf_to_src.c
conf_to_src_LDADD=
| Thread |
|---|
| • bk commit into 4.1 tree (gbichot:1.2142) BUG#8325 | guilhem | 23 Mar |