List: Internals    « Previous Message | Next Message »
From: guilhem    Date: March 23 2005 6:19pm
Subject:bk commit into 4.1 tree (gbichot:1.2142) BUG#8325
View as plain text  
Below is the list of changes that have just been committed into a local
4.1 repository of gbichot. When gbichot does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet
  1.2142 05/03/23 19:19:36 gbichot@stripped +9 -0
  "After Monty's review" changes to the fix for BUG#8325 "Deadlock in replication thread stops replication":
  s/sleep/safe_sleep (thread-safe); sleep 0/1/2/3/4/5/5/5 seconds between
  retries (so that the slave falls behind less); no message in the error log
  (deadlocks are sometimes too common), a global counter instead
  (SHOW STATUS LIKE 'slave_retried_transactions').
  Plus a fix for libmysql/Makefile.shared

  BitKeeper/etc/logging_ok
    1.373 05/03/23 19:19:35 gbichot@stripped +1 -0
    Logging to logging@stripped accepted

  sql/structs.h
    1.39 05/03/23 19:19:10 gbichot@stripped +1 -1
    new SHOW_SLAVE_RETRIED_TRANS

  sql/sql_show.cc
    1.197 05/03/23 19:19:09 gbichot@stripped +13 -0
    SHOW STATUS LIKE "slave_retried_transactions"; needs replication mutexes.
    Can't be a simple SHOW_LONG, because active_mi is unset (not allocated yet)
    when the static global status_vars is created (active_mi is set
    in init_slave()).

  sql/slave.h
    1.85 05/03/23 19:19:09 gbichot@stripped +8 -1
    new global counter rli->retried_trans

  sql/slave.cc
    1.268 05/03/23 19:19:09 gbichot@stripped +16 -17
    If the slave automatically retries a transaction, print no message to the
    error log (too common a situation); sleep 0 secs at the first retry, then
    1, 2, 3, 4, 5, 5, 5... Sleeping 0 secs first keeps the slave's lag as small
    as possible, as deadlocks are usually resolved at the first try. New global
    counter rli->retried_trans (for SHOW STATUS: total number of times the
    slave had to retry any transaction). safe_sleep() is thread-safe, sleep()
    was not. I changed the rli->trans_retries counter to go from 0 to max
    instead of the other way (better for computing the new sleep duration).

  sql/mysqld.cc
    1.561 05/03/23 19:19:09 gbichot@stripped +13 -3
    If active_mi could not be allocated, die. New SHOW STATUS LIKE "slave_retried_transactions".

  mysql-test/t/rpl_deadlock.test
    1.2 05/03/23 19:19:08 gbichot@stripped +4 -0
    small test addition

  mysql-test/r/rpl_deadlock.result
    1.2 05/03/23 19:19:08 gbichot@stripped +6 -0
    result fix

  libmysql/Makefile.shared
    1.56 05/03/23 19:19:08 gbichot@stripped +2 -1
    When we "make clean" in libmysql/ we remove the symlinks there, so we
    need to mark that they have to be recreated later: this is done by removing
    ../linked_libmysql_sources. If we don't do this, 'make' will fail after 'cd libmysql;make clean'.
    This Makefile.shared is used by libmysql_r too.
    No reason to remove linked_client_sources as we don't remove the links in client/.

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User:	gbichot
# Host:	quadita2.mysql.com
# Root:	/nfstmp1/guilhem/mysql-4.1-4ita

--- 1.560/sql/mysqld.cc	2005-03-09 13:41:50 +01:00
+++ 1.561/sql/mysqld.cc	2005-03-23 19:19:09 +01:00
@@ -3062,8 +3062,17 @@
 #endif
   if (opt_bootstrap) /* If running with bootstrap, do not start replication. */
     opt_skip_slave_start= 1;
-  /* init_slave() must be called after the thread keys are created */
-  init_slave();
+  /*
+    init_slave() must be called after the thread keys are created.
+    Some parts of the code (e.g. SHOW STATUS LIKE 'slave_running' and other
+    places) assume that active_mi != 0, so let's fail if it's 0 (out of
+    memory); a message has already been printed.
+  */
+  if (init_slave() && !active_mi)
+  {
+    end_thr_alarm(1);				// Don't allow alarms
+    unireg_abort(1);
+  }
 
   if (opt_bootstrap)
   {
@@ -5494,7 +5503,8 @@
   {"Select_range_check",       (char*) &select_range_check_count, SHOW_LONG},
   {"Select_scan",	       (char*) &select_scan_count,	SHOW_LONG},
   {"Slave_open_temp_tables",   (char*) &slave_open_temp_tables, SHOW_LONG},
-  {"Slave_running",            (char*) 0, SHOW_SLAVE_RUNNING},
+  {"Slave_running",            (char*) 0,                       SHOW_SLAVE_RUNNING},
+  {"Slave_retried_transactions",(char*) 0,                      SHOW_SLAVE_RETRIED_TRANS},
   {"Slow_launch_threads",      (char*) &slow_launch_threads,    SHOW_LONG},
   {"Slow_queries",             (char*) &long_query_count,       SHOW_LONG},
   {"Sort_merge_passes",	       (char*) &filesort_merge_passes,  SHOW_LONG},

--- 1.267/sql/slave.cc	2005-03-02 11:29:32 +01:00
+++ 1.268/sql/slave.cc	2005-03-23 19:19:09 +01:00
@@ -27,6 +27,7 @@
 #include <my_dir.h>
 #include <sql_common.h>
 
+#define MAX_SLAVE_RETRY_PAUSE 5
 bool use_slave_mask = 0;
 MY_BITMAP slave_error_mask;
 
@@ -2335,7 +2336,7 @@
    ignore_log_space_limit(0), last_master_timestamp(0), slave_skip_counter(0),
    abort_pos_wait(0), slave_run_id(0), sql_thd(0), last_slave_errno(0),
    inited(0), abort_slave(0), slave_running(0), until_condition(UNTIL_NONE),
-   until_log_pos(0)
+   until_log_pos(0), retried_trans(0)
 {
   group_relay_log_name[0]= event_relay_log_name[0]=
     group_master_log_name[0]= 0;
@@ -2980,9 +2981,8 @@
           init_master_info()).
           b) init_relay_log_pos(), because the BEGIN may be an older relay log.
         */
-        if (rli->trans_retries--)
+        if (rli->trans_retries < slave_trans_retries)
         {
-          sql_print_information("Slave SQL thread retries transaction");
           if (init_master_info(rli->mi, 0, 0, 0, SLAVE_SQL))
             sql_print_error("Failed to initialize the master info structure");
           else if (init_relay_log_pos(rli,
@@ -2994,8 +2994,16 @@
           else
           {
             exec_res= 0;
-            sleep(2); // chance for concurrent connection to get more locks
-          }
+	    /* chance for concurrent connection to get more locks */
+            safe_sleep(thd, min(rli->trans_retries, MAX_SLAVE_RETRY_PAUSE),
+		       (CHECK_KILLED_FUNC)sql_slave_killed, (void*)rli);
+            pthread_mutex_lock(&rli->data_lock); // because of SHOW STATUS
+	    rli->trans_retries++;
+            rli->retried_trans++;
+            pthread_mutex_unlock(&rli->data_lock);
+            DBUG_PRINT("info", ("Slave retries transaction "
+                                "rli->trans_retries: %lu", rli->trans_retries));
+	  }
         }
         else
           sql_print_error("Slave SQL thread retried transaction %lu time(s) "
@@ -3004,17 +3012,8 @@
                           slave_trans_retries);
       }
       if (!((thd->options & OPTION_BEGIN) && opt_using_transactions))
-      {
-        rli->trans_retries= slave_trans_retries; // restart from fresh
-        /*
-          TODO: when merged into 5.0, when slave does auto-rollback if
-          corrupted binlog, this should reset the retry counter too
-          (any rollback should). In fact it will work, as here we are just out
-          of a Format_description_log_event::exec_event() which rolled back.
-          But check repl code in 5.0 for new ha_rollback calls, just in case.
-        */
-      }
-    }
+         rli->trans_retries= 0; // restart from fresh
+     }
     return exec_res;
   }
   else
@@ -3426,7 +3425,7 @@
   pthread_mutex_lock(&rli->log_space_lock);
   rli->ignore_log_space_limit= 0;
   pthread_mutex_unlock(&rli->log_space_lock);
-  rli->trans_retries= slave_trans_retries; // start from "no error"
+  rli->trans_retries= 0; // start from "no error"
 
   if (init_relay_log_pos(rli,
 			 rli->group_relay_log_name,

--- 1.196/sql/sql_show.cc	2005-03-15 23:23:59 +01:00
+++ 1.197/sql/sql_show.cc	2005-03-23 19:19:09 +01:00
@@ -1887,6 +1887,19 @@
 	pthread_mutex_unlock(&LOCK_active_mi);
 	break;
       }
+      case SHOW_SLAVE_RETRIED_TRANS:
+      {
+        /*
+          TODO: in 5.1 with multimaster, have one such counter per line in SHOW
+          SLAVE STATUS, and have the sum over all lines here.
+        */
+	pthread_mutex_lock(&LOCK_active_mi);
+        pthread_mutex_lock(&active_mi->rli.data_lock);
+	end= int10_to_str(active_mi->rli.retried_trans, buff, 10);
+        pthread_mutex_unlock(&active_mi->rli.data_lock);
+	pthread_mutex_unlock(&LOCK_active_mi);
+	break;
+      }
 #endif /* HAVE_REPLICATION */
       case SHOW_OPENTABLES:
 	end= int10_to_str((long) cached_tables(), buff, 10);

--- 1.38/sql/structs.h	2004-09-08 00:07:03 +02:00
+++ 1.39/sql/structs.h	2005-03-23 19:19:10 +01:00
@@ -180,7 +180,7 @@
   SHOW_SSL_CTX_SESS_TIMEOUTS, SHOW_SSL_CTX_SESS_CACHE_FULL,
   SHOW_SSL_GET_CIPHER_LIST,
 #endif /* HAVE_OPENSSL */
-  SHOW_RPL_STATUS, SHOW_SLAVE_RUNNING,
+  SHOW_RPL_STATUS, SHOW_SLAVE_RUNNING, SHOW_SLAVE_RETRIED_TRANS,
   SHOW_KEY_CACHE_LONG, SHOW_KEY_CACHE_CONST_LONG
 };
 

--- 1.372/BitKeeper/etc/logging_ok	2005-03-21 21:13:01 +01:00
+++ 1.373/BitKeeper/etc/logging_ok	2005-03-23 19:19:35 +01:00
@@ -44,6 +44,7 @@
 dlenev@stripped
 dlenev@stripped
 ejonore@stripped
+gbichot@stripped
 gbichot@stripped
 georg@stripped
 georg@stripped

--- 1.1/mysql-test/r/rpl_deadlock.result	2005-03-02 11:29:32 +01:00
+++ 1.2/mysql-test/r/rpl_deadlock.result	2005-03-23 19:19:08 +01:00
@@ -8,6 +8,9 @@
 create table t2 (a int not null, key(a)) engine=innodb;
 create table t3 (a int) engine=innodb;
 create table t4 (a int) engine=innodb;
+show variables like 'slave_transaction_retries';
+Variable_name	Value
+slave_transaction_retries	0
 show create table t1;
 Table	Create Table
 t1	CREATE TABLE `t1` (
@@ -20,6 +23,9 @@
   `a` int(11) NOT NULL default '0',
   KEY `a` (`a`)
 ) ENGINE=InnoDB DEFAULT CHARSET=latin1
+show variables like 'slave_transaction_retries';
+Variable_name	Value
+slave_transaction_retries	2
 stop slave;
 begin;
 insert into t3 select * from t2 for update;

--- 1.1/mysql-test/t/rpl_deadlock.test	2005-03-02 11:29:34 +01:00
+++ 1.2/mysql-test/t/rpl_deadlock.test	2005-03-23 19:19:08 +01:00
@@ -7,6 +7,8 @@
 # (Guilhem) have seen the test manage to provoke lock wait timeout
 # error but not deadlock error; that is ok as code deals with the two
 # errors in exactly the same way.
+# We don't 'show status like 'slave_retried_transactions'' because this
+# is not repeatable (depends on sleeps).
 
 source include/have_innodb.inc;
 source include/master-slave.inc;
@@ -16,10 +18,12 @@
 create table t2 (a int not null, key(a)) engine=innodb;
 create table t3 (a int) engine=innodb;
 create table t4 (a int) engine=innodb;
+show variables like 'slave_transaction_retries';
 sync_slave_with_master;
 
 show create table t1;
 show create table t2;
+show variables like 'slave_transaction_retries';
 stop slave;
 
 # 1) Test deadlock

--- 1.84/sql/slave.h	2005-03-02 11:29:32 +01:00
+++ 1.85/sql/slave.h	2005-03-23 19:19:09 +01:00
@@ -295,7 +295,14 @@
     UNTIL_LOG_NAMES_CMP_EQUAL= 0, UNTIL_LOG_NAMES_CMP_GREATER= 1
   } until_log_names_cmp_result;
 
-  ulong trans_retries;
+  /*
+    trans_retries varies between 0 to slave_transaction_retries and counts how
+    many times the slave has retried the present transaction; gets reset to 0
+    when the transaction finally succeeds. retried_trans is a cumulative
+    counter: how many times the slave has retried a transaction (any) since
+    slave started.
+  */
+  ulong trans_retries, retried_trans;
 
   st_relay_log_info();
   ~st_relay_log_info();

--- 1.55/libmysql/Makefile.shared	2004-09-25 14:20:33 +02:00
+++ 1.56/libmysql/Makefile.shared	2005-03-23 19:19:08 +01:00
@@ -94,7 +94,8 @@
 	      `echo $(sql_cmn_objects) | sed "s;\.lo;.c;g"` \
 	       $(CHARSET_SRCS) $(CHARSET_OBJS) \
 	       $(mystringsextra) $(mysysheaders) $(vioheaders)\
-	       ../linked_client_sources net.c
+	       ../linked_libmysql_sources ../linked_libmysql_r_sources \
+               net.c
 
 conf_to_src_SOURCES = conf_to_src.c
 conf_to_src_LDADD=
Thread
bk commit into 4.1 tree (gbichot:1.2142) BUG#8325 | guilhem | 23 Mar