From: Date: March 23 2005 7:19pm Subject: bk commit into 4.1 tree (gbichot:1.2142) BUG#8325 List-Archive: http://lists.mysql.com/internals/23323 X-Bug: 8325 Message-Id: <200503231819.j2NIJkxO003926@quadita2.mysql.com> Below is the list of changes that have just been committed into a local 4.1 repository of gbichot. When gbichot does a push these changes will be propagated to the main repository and, within 24 hours after the push, to the public repository. For information on how to access the public repository see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html ChangeSet 1.2142 05/03/23 19:19:36 gbichot@stripped +9 -0 "After Monty's review" changes to the fix for BUG#8325 "Deadlock in replication thread stops replication": s/sleep/safe_sleep (thread safe); sleep 0/1/2/3/4/5/5/5 (get slave less late); no message on error log (deadlock is too common sometimes), a global counter instead (SHOW STATUS LIKE 'slave_retried_transactions'). Plus a fix for libmysql/Makefile.shared BitKeeper/etc/logging_ok 1.373 05/03/23 19:19:35 gbichot@stripped +1 -0 Logging to logging@stripped accepted sql/structs.h 1.39 05/03/23 19:19:10 gbichot@stripped +1 -1 new SHOW_SLAVE_RETRIED_TRANS sql/sql_show.cc 1.197 05/03/23 19:19:09 gbichot@stripped +13 -0 SHOW STATUS LIKE "slave_retried_transactions"; needs replication mutexes. Can't be a simple SHOW_LONG, because active_mi is unset (not alloced yet) when the static global status_vars is created (active_mi is set in init_slave()). sql/slave.h 1.85 05/03/23 19:19:09 gbichot@stripped +8 -1 new global counter rli->retried_trans sql/slave.cc 1.268 05/03/23 19:19:09 gbichot@stripped +16 -17 If slave retries automatically a transaction, no message on error log (too common situation); sleep 0 secs at first retry, then 1, 2, 3, 4, 5, 5, 5... Sleeping 0 is to get the least possible late, as deadlocks are usually resolved at first try. New global counter rli->retried_trans (for SHOW STATUS: total number of times the slave had to retry any transaction). safe_sleep() is thread-safe, sleep() was not. I change the rli->trans_retries counter to go from 0 to max instead of the other way (better for new sleep()). sql/mysqld.cc 1.561 05/03/23 19:19:09 gbichot@stripped +13 -3 if active_mi could not be alloced, die. New SHOW STATUS LIKE "slave_retried_transactions". mysql-test/t/rpl_deadlock.test 1.2 05/03/23 19:19:08 gbichot@stripped +4 -0 small test addition mysql-test/r/rpl_deadlock.result 1.2 05/03/23 19:19:08 gbichot@stripped +6 -0 result fix libmysql/Makefile.shared 1.56 05/03/23 19:19:08 gbichot@stripped +2 -1 When we "make clean" in libmysql/ we remove the symlinks there, so we need to mark that they have to be recreated later: this is done by removing ../linked_libmysql_sources. If we don't do this, 'make' will fail after 'cd libmysql;make clean'. This Makefile.shared is used by libmysql_r too. No reason to remove linked_client_sources as we don't remove the links in client/. # This is a BitKeeper patch. What follows are the unified diffs for the # set of deltas contained in the patch. The rest of the patch, the part # that BitKeeper cares about, is below these diffs. # User: gbichot # Host: quadita2.mysql.com # Root: /nfstmp1/guilhem/mysql-4.1-4ita --- 1.560/sql/mysqld.cc 2005-03-09 13:41:50 +01:00 +++ 1.561/sql/mysqld.cc 2005-03-23 19:19:09 +01:00 @@ -3062,8 +3062,17 @@ #endif if (opt_bootstrap) /* If running with bootstrap, do not start replication. */ opt_skip_slave_start= 1; - /* init_slave() must be called after the thread keys are created */ - init_slave(); + /* + init_slave() must be called after the thread keys are created. + Some parts of the code (e.g. SHOW STATUS LIKE 'slave_running' and other + places) assume that active_mi != 0, so let's fail if it's 0 (out of + memory); a message has already been printed. + */ + if (init_slave() && !active_mi) + { + end_thr_alarm(1); // Don't allow alarms + unireg_abort(1); + } if (opt_bootstrap) { @@ -5494,7 +5503,8 @@ {"Select_range_check", (char*) &select_range_check_count, SHOW_LONG}, {"Select_scan", (char*) &select_scan_count, SHOW_LONG}, {"Slave_open_temp_tables", (char*) &slave_open_temp_tables, SHOW_LONG}, - {"Slave_running", (char*) 0, SHOW_SLAVE_RUNNING}, + {"Slave_running", (char*) 0, SHOW_SLAVE_RUNNING}, + {"Slave_retried_transactions",(char*) 0, SHOW_SLAVE_RETRIED_TRANS}, {"Slow_launch_threads", (char*) &slow_launch_threads, SHOW_LONG}, {"Slow_queries", (char*) &long_query_count, SHOW_LONG}, {"Sort_merge_passes", (char*) &filesort_merge_passes, SHOW_LONG}, --- 1.267/sql/slave.cc 2005-03-02 11:29:32 +01:00 +++ 1.268/sql/slave.cc 2005-03-23 19:19:09 +01:00 @@ -27,6 +27,7 @@ #include #include +#define MAX_SLAVE_RETRY_PAUSE 5 bool use_slave_mask = 0; MY_BITMAP slave_error_mask; @@ -2335,7 +2336,7 @@ ignore_log_space_limit(0), last_master_timestamp(0), slave_skip_counter(0), abort_pos_wait(0), slave_run_id(0), sql_thd(0), last_slave_errno(0), inited(0), abort_slave(0), slave_running(0), until_condition(UNTIL_NONE), - until_log_pos(0) + until_log_pos(0), retried_trans(0) { group_relay_log_name[0]= event_relay_log_name[0]= group_master_log_name[0]= 0; @@ -2980,9 +2981,8 @@ init_master_info()). b) init_relay_log_pos(), because the BEGIN may be an older relay log. */ - if (rli->trans_retries--) + if (rli->trans_retries < slave_trans_retries) { - sql_print_information("Slave SQL thread retries transaction"); if (init_master_info(rli->mi, 0, 0, 0, SLAVE_SQL)) sql_print_error("Failed to initialize the master info structure"); else if (init_relay_log_pos(rli, @@ -2994,8 +2994,16 @@ else { exec_res= 0; - sleep(2); // chance for concurrent connection to get more locks - } + /* chance for concurrent connection to get more locks */ + safe_sleep(thd, min(rli->trans_retries, MAX_SLAVE_RETRY_PAUSE), + (CHECK_KILLED_FUNC)sql_slave_killed, (void*)rli); + pthread_mutex_lock(&rli->data_lock); // because of SHOW STATUS + rli->trans_retries++; + rli->retried_trans++; + pthread_mutex_unlock(&rli->data_lock); + DBUG_PRINT("info", ("Slave retries transaction " + "rli->trans_retries: %lu", rli->trans_retries)); + } } else sql_print_error("Slave SQL thread retried transaction %lu time(s) " @@ -3004,17 +3012,8 @@ slave_trans_retries); } if (!((thd->options & OPTION_BEGIN) && opt_using_transactions)) - { - rli->trans_retries= slave_trans_retries; // restart from fresh - /* - TODO: when merged into 5.0, when slave does auto-rollback if - corrupted binlog, this should reset the retry counter too - (any rollback should). In fact it will work, as here we are just out - of a Format_description_log_event::exec_event() which rolled back. - But check repl code in 5.0 for new ha_rollback calls, just in case. - */ - } - } + rli->trans_retries= 0; // restart from fresh + } return exec_res; } else @@ -3426,7 +3425,7 @@ pthread_mutex_lock(&rli->log_space_lock); rli->ignore_log_space_limit= 0; pthread_mutex_unlock(&rli->log_space_lock); - rli->trans_retries= slave_trans_retries; // start from "no error" + rli->trans_retries= 0; // start from "no error" if (init_relay_log_pos(rli, rli->group_relay_log_name, --- 1.196/sql/sql_show.cc 2005-03-15 23:23:59 +01:00 +++ 1.197/sql/sql_show.cc 2005-03-23 19:19:09 +01:00 @@ -1887,6 +1887,19 @@ pthread_mutex_unlock(&LOCK_active_mi); break; } + case SHOW_SLAVE_RETRIED_TRANS: + { + /* + TODO: in 5.1 with multimaster, have one such counter per line in SHOW + SLAVE STATUS, and have the sum over all lines here. + */ + pthread_mutex_lock(&LOCK_active_mi); + pthread_mutex_lock(&active_mi->rli.data_lock); + end= int10_to_str(active_mi->rli.retried_trans, buff, 10); + pthread_mutex_unlock(&active_mi->rli.data_lock); + pthread_mutex_unlock(&LOCK_active_mi); + break; + } #endif /* HAVE_REPLICATION */ case SHOW_OPENTABLES: end= int10_to_str((long) cached_tables(), buff, 10); --- 1.38/sql/structs.h 2004-09-08 00:07:03 +02:00 +++ 1.39/sql/structs.h 2005-03-23 19:19:10 +01:00 @@ -180,7 +180,7 @@ SHOW_SSL_CTX_SESS_TIMEOUTS, SHOW_SSL_CTX_SESS_CACHE_FULL, SHOW_SSL_GET_CIPHER_LIST, #endif /* HAVE_OPENSSL */ - SHOW_RPL_STATUS, SHOW_SLAVE_RUNNING, + SHOW_RPL_STATUS, SHOW_SLAVE_RUNNING, SHOW_SLAVE_RETRIED_TRANS, SHOW_KEY_CACHE_LONG, SHOW_KEY_CACHE_CONST_LONG }; --- 1.372/BitKeeper/etc/logging_ok 2005-03-21 21:13:01 +01:00 +++ 1.373/BitKeeper/etc/logging_ok 2005-03-23 19:19:35 +01:00 @@ -44,6 +44,7 @@ dlenev@stripped dlenev@stripped ejonore@stripped +gbichot@stripped gbichot@stripped georg@stripped georg@stripped --- 1.1/mysql-test/r/rpl_deadlock.result 2005-03-02 11:29:32 +01:00 +++ 1.2/mysql-test/r/rpl_deadlock.result 2005-03-23 19:19:08 +01:00 @@ -8,6 +8,9 @@ create table t2 (a int not null, key(a)) engine=innodb; create table t3 (a int) engine=innodb; create table t4 (a int) engine=innodb; +show variables like 'slave_transaction_retries'; +Variable_name Value +slave_transaction_retries 0 show create table t1; Table Create Table t1 CREATE TABLE `t1` ( @@ -20,6 +23,9 @@ `a` int(11) NOT NULL default '0', KEY `a` (`a`) ) ENGINE=InnoDB DEFAULT CHARSET=latin1 +show variables like 'slave_transaction_retries'; +Variable_name Value +slave_transaction_retries 2 stop slave; begin; insert into t3 select * from t2 for update; --- 1.1/mysql-test/t/rpl_deadlock.test 2005-03-02 11:29:34 +01:00 +++ 1.2/mysql-test/t/rpl_deadlock.test 2005-03-23 19:19:08 +01:00 @@ -7,6 +7,8 @@ # (Guilhem) have seen the test manage to provoke lock wait timeout # error but not deadlock error; that is ok as code deals with the two # errors in exactly the same way. +# We don't 'show status like 'slave_retried_transactions'' because this +# is not repeatable (depends on sleeps). source include/have_innodb.inc; source include/master-slave.inc; @@ -16,10 +18,12 @@ create table t2 (a int not null, key(a)) engine=innodb; create table t3 (a int) engine=innodb; create table t4 (a int) engine=innodb; +show variables like 'slave_transaction_retries'; sync_slave_with_master; show create table t1; show create table t2; +show variables like 'slave_transaction_retries'; stop slave; # 1) Test deadlock --- 1.84/sql/slave.h 2005-03-02 11:29:32 +01:00 +++ 1.85/sql/slave.h 2005-03-23 19:19:09 +01:00 @@ -295,7 +295,14 @@ UNTIL_LOG_NAMES_CMP_EQUAL= 0, UNTIL_LOG_NAMES_CMP_GREATER= 1 } until_log_names_cmp_result; - ulong trans_retries; + /* + trans_retries varies between 0 to slave_transaction_retries and counts how + many times the slave has retried the present transaction; gets reset to 0 + when the transaction finally succeeds. retried_trans is a cumulative + counter: how many times the slave has retried a transaction (any) since + slave started. + */ + ulong trans_retries, retried_trans; st_relay_log_info(); ~st_relay_log_info(); --- 1.55/libmysql/Makefile.shared 2004-09-25 14:20:33 +02:00 +++ 1.56/libmysql/Makefile.shared 2005-03-23 19:19:08 +01:00 @@ -94,7 +94,8 @@ `echo $(sql_cmn_objects) | sed "s;\.lo;.c;g"` \ $(CHARSET_SRCS) $(CHARSET_OBJS) \ $(mystringsextra) $(mysysheaders) $(vioheaders)\ - ../linked_client_sources net.c + ../linked_libmysql_sources ../linked_libmysql_r_sources \ + net.c conf_to_src_SOURCES = conf_to_src.c conf_to_src_LDADD=