List:Internals« Previous MessageNext Message »
From:guilhem Date:December 7 2005 2:29pm
Subject:bk commit into 5.1 tree (guilhem:1.1984) BUG#14418
View as plain text  
Below is the list of changes that have just been committed into a local
5.1 repository of guilhem. When guilhem does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet
  1.1984 05/12/07 15:29:49 guilhem@stripped +11 -0
  WL#1012 "row-based replication": Temporary fix for BUG#14418 "RBR: Uncertain behaviour when stopping updates in the middle"
  as agreed by the team: if slave SQL thread is in execution of RBR events and has not executed
  the last one of the statement, and table does not guarantee idempotency (no primary key
  AND not transactional), wait until executing the last event, or if no new events come, 
  time out after one minute and print error.
  Plus fixes of --abort-slave-event-count and --disconnect-slave-event-count

  mysql-test/t/rpl_row_stop_middle_update.test
    1.1 05/12/07 15:29:43 guilhem@stripped +31 -0
    A test (row-based replication specific) to see if slave stops in the middle of a sequence
    of Rows_log_event part of the same statement (BUG#14418), or waits until the last of them (the fix
    brought by this patch).

  mysql-test/t/rpl_row_stop_middle_update.test
    1.0 05/12/07 15:29:43 guilhem@stripped +0 -0
    BitKeeper file /home/mysql_src/mysql-5.1/mysql-test/t/rpl_row_stop_middle_update.test

  mysql-test/t/rpl_row_stop_middle_update-slave.opt
    1.1 05/12/07 15:29:42 guilhem@stripped +1 -0
    try to stop slave after one Rows_log_event event

  mysql-test/t/rpl_row_stop_middle_update-master.opt
    1.1 05/12/07 15:29:42 guilhem@stripped +1 -0
    create small events

  mysql-test/r/rpl_row_stop_middle_update.result
    1.1 05/12/07 15:29:42 guilhem@stripped +16 -0
    result for new test

  sql/slave.h
    1.95 05/12/07 15:29:42 guilhem@stripped +3 -1
    new members (see slave.cc). Renaming of events_till_abort to events_till_disconnect to
    be inline with the option name --disconnect_slave_event_count

  sql/slave.cc
    1.260 05/12/07 15:29:42 guilhem@stripped +71 -37
    1) Making --disconnect-slave-event-count and --abort-slave-event-count work (the latter had
    been broken since years).
    2) rli->cleanup_context() introduced, it's to clean up any half-done work when we exit some replication
    work without having finished it: rollback any updated records, unlock and close tables, clear RBR table mappings.
    3) if slave has declared itself in an unsafe situation, it will accept to stop only after having
    gone out of the unsafe situation, OR if impossible (end of relay log reached and no new event comes in
    one minute), will timeout and print an error message. 

  sql/log_event.h
    1.143 05/12/07 15:29:42 guilhem@stripped +0 -7
    Table_map_log_event::cleanup moved into a general method of RELAY_LOG_INFO, because in fact
    it's not completely RBR-specific.

  sql/log_event.cc
    1.197 05/12/07 15:29:42 guilhem@stripped +26 -28
    1) A DBUG_EXECUTE_IF to simulate a STOP SLAVE request after executing the first Rows_log_event.
    2) Table_map_log_event::cleanup() gone
    3) If we notice we are in an unsafe situation (after executing a Rows_log_event which is not
    the last one of the statement, on a table where idempotency is not guaranteed: no PK AND no transactions),
    record it

  mysql-test/t/rpl_row_stop_middle_update-slave.opt
    1.0 05/12/07 15:29:42 guilhem@stripped +0 -0
    BitKeeper file /home/mysql_src/mysql-5.1/mysql-test/t/rpl_row_stop_middle_update-slave.opt

  mysql-test/t/rpl_row_stop_middle_update-master.opt
    1.0 05/12/07 15:29:42 guilhem@stripped +0 -0
    BitKeeper file /home/mysql_src/mysql-5.1/mysql-test/t/rpl_row_stop_middle_update-master.opt

  mysql-test/r/rpl_row_stop_middle_update.result
    1.0 05/12/07 15:29:42 guilhem@stripped +0 -0
    BitKeeper file /home/mysql_src/mysql-5.1/mysql-test/r/rpl_row_stop_middle_update.result

  mysql-test/r/rpl_relayrotate.result
    1.22 05/12/07 15:29:41 guilhem@stripped +0 -3
    result update

  mysql-test/extra/rpl_tests/rpl_relayrotate.test
    1.23 05/12/07 15:29:41 guilhem@stripped +2 -3
    sync_with_master is more robust than MASTER_POS_WAIT (if slave starts slowly)

  BitKeeper/deleted/.del-rpl000013-slave.opt~18266ad8a2403e8d
    1.5 05/12/06 16:30:34 guilhem@stripped +1 -1
    Delete: mysql-test/t/rpl000013-slave.opt

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User:	guilhem
# Host:	gbichot3.local
# Root:	/home/mysql_src/mysql-5.1

--- 1.196/sql/log_event.cc	2005-12-06 14:54:14 +01:00
+++ 1.197/sql/log_event.cc	2005-12-07 15:29:42 +01:00
@@ -2227,12 +2227,7 @@
                     "or ROLLBACK in relay log). A probable cause is that "
                     "the master died while writing the transaction to "
                     "its binary log, thus rolled back too."); 
-    end_trans(thd, ROLLBACK);
-    close_thread_tables(thd); // some tables may be open, must close them
-    /* we also need to clear table mappings as tables are closed: */
-#ifdef HAVE_ROW_BASED_REPLICATION
-    Table_map_log_event::cleanup(thd, rli);
-#endif
+    rli->cleanup_context(thd, 1);
   }
 #endif
   /*
@@ -5342,14 +5337,15 @@
 
       row_start= row_end;
     }
+    DBUG_EXECUTE_IF("STOP_SLAVE_after_first_Rows_event",
+                    rli->abort_slave=1;);
     error= do_after_row_operations(table, error);
+    
   }
 
   if (error)
   {                     /* error has occured during the transaction */
     /*
-      We do the two next calls to be sure to rollback whether it was
-      autocommit or in a transaction.
       If one day we honour --skip-slave-errors in row-based replication, and
       the error should be skipped, then we would clear mappings, rollback,
       close tables, but the slave SQL thread would not stop and then may
@@ -5359,10 +5355,7 @@
       For now we code, knowing that error is not skippable and so slave SQL
       thread is certainly going to stop.
     */
-    ha_autocommit_or_rollback(thd, 1);
-    ha_rollback(thd);
-
-    Table_map_log_event::cleanup(thd, rli);
+    rli->cleanup_context(thd, 1);
     thd->query_error= 1;
     DBUG_RETURN(error);
   }
@@ -5405,7 +5398,7 @@
       pending event flushed.
     */
 
-    Table_map_log_event::cleanup(thd, rli);
+    rli->cleanup_context(thd, 0);
     rli->transaction_end(thd);
 
     if (error == 0)
@@ -5444,6 +5437,26 @@
     */
     mysql_unlock_tables(thd, thd->lock);
     thd->lock= 0;
+    if ((table->s->primary_key == MAX_KEY) &&
+        !table->file->has_transactions())
+    {
+      /*
+        ------------ Temporary fix until WL#2975 is implemented ---------
+        This event is not the last one (no STMT_END_F). If we stop now (in
+        case of terminate_slave_thread()), how will we restart? We have to
+        restart from Table_map_log_event, but as this table is not
+        transactional, the rows already inserted will still be present, and
+        idempotency is not guaranteed (no PK) so we risk that repeating leads
+        to double insert. So we desperately try to continue, hope we'll
+        eventually leave this buggy situation (by executing the final
+        Rows_log_event). If we are in a hopeless wait (reached end of last
+        relay log and nothing gets appended there), we timeout after one
+        minute, and notify DBA about the problem.
+        When WL#2975 is implemented, just remove the member
+        st_relay_log_info::unsafe_to_stop_at and all its occurences.
+      */
+      rli->unsafe_to_stop_at= time((time_t*) 0);
+    }
   }
 
   DBUG_ASSERT(error == 0);
@@ -5921,21 +5934,6 @@
             "`%s`.`%s` mapped to number %lu\n",
             m_dbnam, m_tblnam, m_table_id);
   }
-}
-#endif
-
-#if !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION)
-void Table_map_log_event::cleanup(THD *thd_arg, struct st_relay_log_info *rli)
-{
-  /*
-    Instances of Table_map_log_event, if ::exec_event() was called on them,
-    may have opened tables, which we cannot be sure have been closed (because
-    maybe the Rows_log_event have not been found or will not be, because slave
-    SQL thread is stopping, or relay log has a missing tail etc). So we close
-    all thread's tables. And so the table mappings have to be cancelled.
-  */
-  rli->m_table_map.clear_tables();
-  close_thread_tables(thd_arg);
 }
 #endif
 

--- 1.142/sql/log_event.h	2005-12-06 11:13:09 +01:00
+++ 1.143/sql/log_event.h	2005-12-07 15:29:42 +01:00
@@ -1714,13 +1714,6 @@
   virtual void print(FILE *file, PRINT_EVENT_INFO *print_event_info);
 #endif
 
-  /*
-    When you use ::exec_event(), you have to use ::cleanup() before exiting
-    your main function. Mats: if you have a better place for this function...
-  */
-#if !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION) && defined(HAVE_ROW_BASED_REPLICATION)
-  static void cleanup(THD *thd, struct st_relay_log_info *rli);
-#endif
 
 private:
 #ifndef MYSQL_CLIENT

--- 1.259/sql/slave.cc	2005-12-06 11:13:09 +01:00
+++ 1.260/sql/slave.cc	2005-12-07 15:29:42 +01:00
@@ -50,8 +50,6 @@
 */
 
 int disconnect_slave_event_count = 0, abort_slave_event_count = 0;
-int events_till_abort = -1;
-static int events_till_disconnect = -1;
 
 typedef enum { SLAVE_THD_IO, SLAVE_THD_SQL} SLAVE_THD_TYPE;
 
@@ -862,7 +860,37 @@
 {
   DBUG_ASSERT(rli->sql_thd == thd);
   DBUG_ASSERT(rli->slave_running == 1);// tracking buffer overrun
-  return rli->abort_slave || abort_loop || thd->killed;
+  if (abort_loop || thd->killed || rli->abort_slave)
+  {
+    /*
+      If we are in an unsafe situation (stopping could corrupt replication),
+      we give one minute to the slave SQL thread of grace before really
+      terminating, in the hope that it will be able to read more events and
+      the unsafe situation will soon be left. Note that this one minute starts
+      from the last time anything happened in the slave SQL thread. So it's
+      really one minute of idleness, we don't timeout if the slave SQL thread
+      is actively working.
+    */
+    if (!rli->unsafe_to_stop_at)
+      return 1;
+    DBUG_PRINT("info", ("Slave SQL thread is in an unsafe situation, giving "
+                        "it some grace period"));
+    fprintf(stderr,"Slave SQL thread is in an unsafe situation, giving "
+                        "it some grace period\n");
+
+    if (((ulong)time((time_t)0) - (ulong)rli->unsafe_to_stop_at) > 60)
+    {
+      slave_print_msg(ERROR_LEVEL, rli, 0,
+                      "SQL thread had to stop in an unsafe situation, in "
+                      "the middle of applying updates to a "
+                      "non-transactional table without any primary key. "
+                      "There is a risk of duplicate updates when the slave "
+                      "SQL thread is restarted. Please check your table "
+                      "contents after restart.");
+      return 1;
+    }
+  }
+  return 0;
 }
 
 
@@ -2266,7 +2294,8 @@
    ignore_log_space_limit(0), last_master_timestamp(0), slave_skip_counter(0),
    abort_pos_wait(0), slave_run_id(0), sql_thd(0), last_slave_errno(0),
    inited(0), abort_slave(0), slave_running(0), until_condition(UNTIL_NONE),
-   until_log_pos(0), retried_trans(0), m_reload_flags(RELOAD_NONE_F)
+   until_log_pos(0), retried_trans(0), m_reload_flags(RELOAD_NONE_F),
+   unsafe_to_stop_at(0)
 {
   group_relay_log_name[0]= event_relay_log_name[0]=
     group_master_log_name[0]= 0;
@@ -2690,11 +2719,9 @@
   /*
     my_real_read() will time us out
     We check if we were told to die, and if not, try reading again
-
-    TODO:  Move 'events_till_disconnect' to the MASTER_INFO structure
   */
 #ifndef DBUG_OFF
-  if (disconnect_slave_event_count && !(events_till_disconnect--))
+  if (disconnect_slave_event_count && !(mi->events_till_disconnect--))
     return packet_error;      
 #endif
   
@@ -3070,9 +3097,6 @@
   my_thread_init();
   DBUG_ENTER("handle_slave_io");
 
-#ifndef DBUG_OFF
-slave_begin:
-#endif
   DBUG_ASSERT(mi->inited);
   mysql= NULL ;
   retry_count= 0;
@@ -3082,7 +3106,7 @@
   mi->slave_run_id++;
 
 #ifndef DBUG_OFF
-  mi->events_till_abort = abort_slave_event_count;
+  mi->events_till_disconnect = disconnect_slave_event_count;
 #endif
 
   thd= new THD; // note that contructor of THD uses DBUG_ !
@@ -3320,14 +3344,6 @@
 log space");
 	  goto err;
 	}
-      // TODO: check debugging abort code
-#ifndef DBUG_OFF
-      if (abort_slave_event_count && !--events_till_abort)
-      {
-	sql_print_error("Slave I/O thread: debugging abort");
-	goto err;
-      }
-#endif
     } 
   }
 
@@ -3366,10 +3382,6 @@
   pthread_mutex_unlock(&LOCK_thread_count);
   pthread_cond_broadcast(&mi->stop_cond);	// tell the world we are done
   pthread_mutex_unlock(&mi->run_lock);
-#ifndef DBUG_OFF
-  if (abort_slave_event_count && !events_till_abort)
-    goto slave_begin;
-#endif
   my_thread_end();
   pthread_exit(0);
   DBUG_RETURN(0);				// Can't return anything here
@@ -3389,10 +3401,6 @@
   my_thread_init();
   DBUG_ENTER("handle_slave_sql");
 
-#ifndef DBUG_OFF
-slave_begin:
-#endif  
-
   DBUG_ASSERT(rli->inited);
   pthread_mutex_lock(&rli->run_lock);
   DBUG_ASSERT(!rli->slave_running);
@@ -3540,16 +3548,13 @@
 
  err:
 
-#ifdef HAVE_ROW_BASED_REPLICATION
   /*
     Some events set some playgrounds, which won't be cleared because thread
     stops. Stopping of this thread may not be known to these events ("stop"
     request is detected only by the present function, not by events), so we
     must "proactively" clear playgrounds:
   */
-  Table_map_log_event::cleanup(thd, rli);
-#endif
-
+  rli->cleanup_context(thd, 1);
   VOID(pthread_mutex_lock(&LOCK_thread_count));
   /*
     Some extra safety, which should not been needed (normally, event deletion
@@ -3595,10 +3600,6 @@
   pthread_cond_broadcast(&rli->stop_cond);
   // tell the world we are done
   pthread_mutex_unlock(&rli->run_lock);
-#ifndef DBUG_OFF // TODO: reconsider the code below
-  if (abort_slave_event_count && !rli->events_till_abort)
-    goto slave_begin;
-#endif  
   my_thread_end();
   pthread_exit(0);
   DBUG_RETURN(0);				// Can't return anything here
@@ -3751,7 +3752,7 @@
     rotate event forever, so we need to not disconnect after one.
   */
   if (disconnect_slave_event_count)
-    events_till_disconnect++;
+    mi->events_till_disconnect++;
 #endif
 
   /*
@@ -4207,7 +4208,7 @@
   DBUG_ENTER("connect_to_master");
 
 #ifndef DBUG_OFF
-  events_till_disconnect = disconnect_slave_event_count;
+  mi->events_till_disconnect = disconnect_slave_event_count;
 #endif
   ulong client_flag= CLIENT_REMEMBER_OPTIONS;
   if (opt_slave_compressed_protocol)
@@ -4398,6 +4399,11 @@
   DBUG_ENTER("next_event");
   DBUG_ASSERT(thd != 0);
 
+#ifndef DBUG_OFF
+  if (abort_slave_event_count && !rli->events_till_abort--)
+    DBUG_RETURN(0);
+#endif
+
   /*
     For most operations we need to protect rli members with data_lock,
     so we assume calling function acquired this mutex for us and we will
@@ -4842,6 +4848,34 @@
     m_reload_flags= RELOAD_NONE_F;
   }
 }
+
+#if !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION)
+void st_relay_log_info::cleanup_context(THD *thd, bool error)
+{
+  DBUG_ASSERT(sql_thd == thd);
+  /*
+    1) Instances of Table_map_log_event, if ::exec_event() was called on them,
+    may have opened tables, which we cannot be sure have been closed (because
+    maybe the Rows_log_event have not been found or will not be, because slave
+    SQL thread is stopping, or relay log has a missing tail etc). So we close
+    all thread's tables. And so the table mappings have to be cancelled.
+    2) Rows_log_event::exec_event() may even have started statements or
+    transactions on them, which we need to rollback in case of error.
+    3) If finding a Format_description_log_event after a BEGIN, we also need
+    to rollback before continuing with the next events.
+    4) so we need this "context cleanup" function.
+  */
+  if (error)
+  {
+    ha_autocommit_or_rollback(thd, 1); // if a "statement transaction"
+    end_trans(thd, ROLLBACK); // if a "real transaction"
+  }
+  m_table_map.clear_tables();
+  close_thread_tables(thd);
+  unsafe_to_stop_at= 0;
+}
+#endif
+
 
 #ifdef HAVE_EXPLICIT_TEMPLATE_INSTANTIATION
 template class I_List_iterator<i_string>;

--- 1.21/mysql-test/r/rpl_relayrotate.result	2005-12-06 11:13:10 +01:00
+++ 1.22/mysql-test/r/rpl_relayrotate.result	2005-12-07 15:29:41 +01:00
@@ -10,9 +10,6 @@
 start slave;
 stop slave;
 start slave;
-select master_pos_wait('master-bin.001',3000)>=0;
-master_pos_wait('master-bin.001',3000)>=0
-1
 select max(a) from t1;
 max(a)
 8000
--- New file ---
+++ mysql-test/r/rpl_row_stop_middle_update.result	05/12/07 15:29:42
stop slave;
drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
reset master;
reset slave;
drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
start slave;
create table t1 (words varchar(20)) engine=myisam;
load data infile '../../std_data/words.dat' into table t1 (words);
select count(*) from t1;
count(*)
70
select count(*) from t1;
count(*)
70
drop table t1;
drop table t1;


--- 1.22/mysql-test/extra/rpl_tests/rpl_relayrotate.test	2005-12-06 11:13:10 +01:00
+++ 1.23/mysql-test/extra/rpl_tests/rpl_relayrotate.test	2005-12-07 15:29:41 +01:00
@@ -52,9 +52,8 @@
 # which proves that the transaction restarted at
 # the right place.
 # We must wait for the transaction to commit before
-# reading, MASTER_POS_WAIT() will do it for sure
-# (the only statement with position>=3000 is COMMIT).
-select master_pos_wait('master-bin.001',3000)>=0;
+# reading, with a sync_with_master.
+sync_with_master;
 select max(a) from t1;
 connection master;
 
--- New file ---
+++ mysql-test/t/rpl_row_stop_middle_update-master.opt	05/12/07 15:29:42
--loose-binlog-row-event-max-size=256

--- New file ---
+++ mysql-test/t/rpl_row_stop_middle_update-slave.opt	05/12/07 15:29:42
--loose-debug=d,STOP_SLAVE_after_first_Rows_event

--- New file ---
+++ mysql-test/t/rpl_row_stop_middle_update.test	05/12/07 15:29:43
-- source include/have_binlog_format_row.inc
-- source include/have_debug.inc
-- source include/master-slave.inc

# master is asked to create small Rows events: if only one event is
# created, stopping slave at the end of that one will show no bug, we
# need at least two (and stop after first); in this test we use three.

connection master;
create table t1 (words varchar(20)) engine=myisam;

load data infile '../../std_data/words.dat' into table t1 (words);
select count(*) from t1;
save_master_pos;

connection slave;

# slave will automatically tell itself to stop thanks to the .opt
# file; it will initiate the stop request after the first
# Rows_log_event (out of 3) but should wait until the last one is
# executed before stopping.

wait_for_slave_to_stop;

# check that we inserted all rows (waited until the last Rows event)
select count(*) from t1;

connection master;
drop table t1;
connection slave; # slave SQL thread is stopped
drop table t1;


--- 1.94/sql/slave.h	2005-12-06 11:13:09 +01:00
+++ 1.95/sql/slave.h	2005-12-07 15:29:42 +01:00
@@ -390,6 +390,8 @@
 
   void touching_table(char const* db, char const* table, ulong table_id);
   void transaction_end(THD*);
+  void cleanup_context(THD *, bool);
+  time_t unsafe_to_stop_at;
 } RELAY_LOG_INFO;
 
 
@@ -452,7 +454,7 @@
   uint port;
   uint connect_retry;
 #ifndef DBUG_OFF
-  int events_till_abort;
+  int events_till_disconnect;
 #endif
   bool inited;
   volatile bool abort_slave;

--- 1.4/mysql-test/t/rpl000013-slave.opt	2001-10-11 02:59:42 +02:00
+++ 1.5/BitKeeper/deleted/.del-rpl000013-slave.opt~18266ad8a2403e8d	2005-12-06 16:30:34 +01:00
@@ -1 +1 @@
---abort-slave-event-count=1
+--abort-slave-event-count=0
Thread
bk commit into 5.1 tree (guilhem:1.1984) BUG#14418guilhem7 Dec