List:Commits« Previous MessageNext Message »
From:Andrei Elkin Date:July 5 2011 10:35pm
Subject:bzr push into mysql-next-mr-wl5569 branch (andrei.elkin:3333 to 3334)
Bug#12719875
View as plain text  
 3334 Andrei Elkin	2011-07-05
      bug#12719875 possible MTS recovery issue.
      
      MTS stopped with an error after failing to apply an event.
      It turned out that the event was scheduled incorrectly due to
      an earlier stop by the Single-Threaded Slave not at the group boundary
      but rather in the middle of it.
      
      Fixed by forcing CREATE..SELECT to be logged as two groups.
      The CREATE-TABLE group is surrounded with its own BEGIN/COMMIT braces.
     @ mysql-test/suite/rpl/r/rpl_parallel_switch_sequential.result
        new results file is added.
     @ mysql-test/suite/rpl/t/rpl_parallel_switch_sequential-slave.opt
        transaction retry is not supported yet by MTS.
     @ mysql-test/suite/rpl/t/rpl_parallel_switch_sequential.test
        Regression test for bug#12719875 is added.
        Note that the created tables also use the InnoDB engine, because with MyISAM stop-slave can actually happen
        in the middle of a group of MyISAM table events, so the following restart fails with a dup key error.
        CREATE-SELECT is not tested because of another bug, as noted in a comment in the test.
     @ sql/log_event.cc
        changing the error report style to be actually effective: rli->report() does not make is_error() of
        rli->info_thd return true.
        The my_error() message eventually gets to the show-slave-status sql-error at the end of the slave sql thread.
     @ sql/rpl_slave.cc
        fixing a possible hang that can happen due to an errored-out worker at a time when gaq is full and
        the worker was the first to update it;
        refining asserts;
        shifting the stop_workers() routine to a point where the slave sql thread has not yet reset its errors,
        which satisfies a refined assert in slave_stop_workers(rli).

    added:
      mysql-test/suite/rpl/r/rpl_parallel_switch_sequential.result
      mysql-test/suite/rpl/t/rpl_parallel_switch_sequential-slave.opt
      mysql-test/suite/rpl/t/rpl_parallel_switch_sequential.test
    modified:
      sql/log_event.cc
      sql/rpl_slave.cc
 3333 Andrei Elkin	2011-07-04
      wl#5569 MTS
      
      Adding a rule to run PB with all suites in MTS with binlog-format ROW.
     @ .bzr-mysql/default.conf
        restoring commits@.
     @ mysql-test/collections/default.push
        adding a rule to run all suites in MTS with binlog-format ROW.

    modified:
      .bzr-mysql/default.conf
      mysql-test/collections/default.push
=== added file 'mysql-test/suite/rpl/r/rpl_parallel_switch_sequential.result'
--- a/mysql-test/suite/rpl/r/rpl_parallel_switch_sequential.result	1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/rpl/r/rpl_parallel_switch_sequential.result	2011-07-05 17:43:04 +0000
@@ -0,0 +1,48 @@
+include/master-slave.inc
+[connection master]
+SET @save_slave_parallel_workers= @@slave_parallel_workers;
+SET @save_default_storage_engine=@@global.default_storage_engine;
+SET @@global.default_storage_engine='innodb';
+SET GLOBAL slave_parallel_workers= 4;
+call mtr.add_suppression("Request to stop slave SQL Thread received while applying a group that has non-transactional changes");
+include/stop_slave.inc
+include/start_slave.inc
+SET @save_default_storage_engine=@@global.default_storage_engine;
+SET @@global.default_storage_engine='innodb';
+SET @@session.default_storage_engine='innodb';
+call mtr.add_suppression('.*Unsafe statement written to the binary log using statement format since BINLOG_FORMAT = STATEMENT.*');
+CREATE DATABASE test_10;
+CREATE TABLE test_10.t1 (a INT NOT NULL PRIMARY KEY, b VARCHAR(20), c BLOB, d INT NOT NULL DEFAULT 0);
+CREATE DATABASE test_9;
+CREATE TABLE test_9.t1 (a INT NOT NULL PRIMARY KEY, b VARCHAR(20), c BLOB, d INT NOT NULL DEFAULT 0);
+CREATE DATABASE test_8;
+CREATE TABLE test_8.t1 (a INT NOT NULL PRIMARY KEY, b VARCHAR(20), c BLOB, d INT NOT NULL DEFAULT 0);
+CREATE DATABASE test_7;
+CREATE TABLE test_7.t1 (a INT NOT NULL PRIMARY KEY, b VARCHAR(20), c BLOB, d INT NOT NULL DEFAULT 0);
+CREATE DATABASE test_6;
+CREATE TABLE test_6.t1 (a INT NOT NULL PRIMARY KEY, b VARCHAR(20), c BLOB, d INT NOT NULL DEFAULT 0);
+CREATE DATABASE test_5;
+CREATE TABLE test_5.t1 (a INT NOT NULL PRIMARY KEY, b VARCHAR(20), c BLOB, d INT NOT NULL DEFAULT 0);
+CREATE DATABASE test_4;
+CREATE TABLE test_4.t1 (a INT NOT NULL PRIMARY KEY, b VARCHAR(20), c BLOB, d INT NOT NULL DEFAULT 0);
+CREATE DATABASE test_3;
+CREATE TABLE test_3.t1 (a INT NOT NULL PRIMARY KEY, b VARCHAR(20), c BLOB, d INT NOT NULL DEFAULT 0);
+CREATE DATABASE test_2;
+CREATE TABLE test_2.t1 (a INT NOT NULL PRIMARY KEY, b VARCHAR(20), c BLOB, d INT NOT NULL DEFAULT 0);
+CREATE DATABASE test_1;
+CREATE TABLE test_1.t1 (a INT NOT NULL PRIMARY KEY, b VARCHAR(20), c BLOB, d INT NOT NULL DEFAULT 0);
+include/start_slave.inc
+DROP DATABASE test_10;
+DROP DATABASE test_9;
+DROP DATABASE test_8;
+DROP DATABASE test_7;
+DROP DATABASE test_6;
+DROP DATABASE test_5;
+DROP DATABASE test_4;
+DROP DATABASE test_3;
+DROP DATABASE test_2;
+DROP DATABASE test_1;
+SET @@global.default_storage_engine= @save_default_storage_engine;
+include/stop_slave.inc
+SET @@global.default_storage_engine= @save_default_storage_engine;
+SET GLOBAL slave_parallel_workers= @save_slave_parallel_workers;

=== added file 'mysql-test/suite/rpl/t/rpl_parallel_switch_sequential-slave.opt'
--- a/mysql-test/suite/rpl/t/rpl_parallel_switch_sequential-slave.opt	1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/rpl/t/rpl_parallel_switch_sequential-slave.opt	2011-07-05 17:43:04 +0000
@@ -0,0 +1 @@
+--slave-transaction-retries=0

=== added file 'mysql-test/suite/rpl/t/rpl_parallel_switch_sequential.test'
--- a/mysql-test/suite/rpl/t/rpl_parallel_switch_sequential.test	1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/rpl/t/rpl_parallel_switch_sequential.test	2011-07-05 17:43:04 +0000
@@ -0,0 +1,153 @@
+--source include/master-slave.inc
+#--source include/have_binlog_format_row.inc
+
+let $max_workers= 4;
+
+--connection slave
+SET @save_slave_parallel_workers= @@slave_parallel_workers;
+SET @save_default_storage_engine=@@global.default_storage_engine;
+SET @@global.default_storage_engine='innodb';
+
+eval SET GLOBAL slave_parallel_workers= $max_workers;
+#call mtr.add_suppression("The slave coordinator and worker threads are stopped, possibly leaving data in inconsistent state");
+call mtr.add_suppression("Request to stop slave SQL Thread received while applying a group that has non-transactional changes");
+
+--source include/stop_slave.inc
+--source include/start_slave.inc
+
+--connection master
+SET @save_default_storage_engine=@@global.default_storage_engine;
+SET @@global.default_storage_engine='innodb';
+SET @@session.default_storage_engine='innodb';
+
+call mtr.add_suppression('.*Unsafe statement written to the binary log using statement format since BINLOG_FORMAT = STATEMENT.*');
+
+let $i= 10;
+let $slave_status= 0;
+while ($i) {
+  eval CREATE DATABASE test_$i;
+  eval CREATE TABLE test_$i.t1 (a INT NOT NULL PRIMARY KEY, b VARCHAR(20), c BLOB, d INT NOT NULL DEFAULT 0);
+  dec $i;
+}
+--sync_slave_with_master
+--connection master
+
+
+--disable_query_log
+--disable_warnings
+
+let $slave_status= 0;
+let $trx= 0;
+let $alter= 0;
+let $i= 300;
+if (`select @@binlog_format like "STATEMENT"`) {
+   # relax mtr to scan unsafe warnings
+   let $i=100;
+}
+while ($i) {
+  if (`SELECT 10*RAND() > 8`) {
+    if ($trx) {
+      COMMIT;
+      let $trx= 0;
+    }
+  }  
+  if (`SELECT 10*RAND() > 8`) {
+    if (!$trx) {
+      BEGIN;
+      let $trx= 1;
+    }
+  }  
+  let $db= `SELECT FLOOR(10*RAND())+1`;
+  let $k= 10;
+  while ($k) {
+    eval INSERT INTO test_$k.t1 VALUES ($i, 'test', NULL, 0);
+    dec $k;
+  }
+  let $k= 10;
+  while ($k) {
+    eval UPDATE test_$k.t1 SET d=a WHERE a = FLOOR(1000*RAND());
+    dec $k;
+  }
+  eval DELETE FROM test_$db.t1 WHERE a = FLOOR(1000*RAND());
+  if (`SELECT 10*RAND() > 5`) {
+    eval UPDATE test_$db.t1 SET c=REPEAT('a', 1000) WHERE a < $i;
+  }  
+  if ($trx) {
+    if (`SELECT 10*RAND() > 8`) {
+      let $k= 10;
+      while ($k) {
+
+	# Can't be CREATE .. SELECT here because of BUG#11756034 that allows STOP SLAVE
+	# in the middle of CREATE-SELECT caused group.
+	# TODO: fix the bug and create a use case.
+
+        eval CREATE TABLE test_$k.t0  (a INT NOT NULL PRIMARY KEY, b VARCHAR(20), c BLOB, d INT NOT NULL DEFAULT 0);
+	eval INSERT INTO test_$k.t0 SELECT * FROM test_$k.t1;
+        dec $k;
+      }
+      let $k= 10;
+      while ($k) {
+        eval DROP TABLE test_$k.t1;
+        dec $k;
+      }
+      let $k= 10;
+      while ($k) {
+        eval RENAME TABLE test_$k.t0 TO test_$k.t1 /* i= $i */;
+        dec $k;
+      }
+    }
+ }
+  if (`SELECT 10*RAND() > 9`) {
+    --connection slave
+    FLUSH LOGS;
+    --connection master 
+  }
+  if (`SELECT 100*RAND() > 95`) {
+    if ($slave_status) {
+      --connection slave
+      let $workers= $max_workers;
+      if (`SELECT 10*RAND() > 5`) {
+        let $workers= 0;
+      }
+      eval SET GLOBAL slave_parallel_workers= $workers;
+      START SLAVE;
+      --connection master
+      let $slave_status= 0;
+    }
+  }
+  if (`SELECT 100*RAND() > 95`) {
+    if (!$slave_status)
+    {
+      --connection slave
+      STOP SLAVE;
+      --connection master
+      let $slave_status= 1;
+    }
+  }
+  dec $i;
+}
+--enable_warnings
+
+--enable_query_log
+--connection slave
+--disable_warnings
+--source include/start_slave.inc
+--enable_warnings
+--sync_with_master
+
+#cleanup
+
+--connection master
+let $i= 10;
+while ($i) {
+  eval DROP DATABASE test_$i;
+  dec $i;
+}
+
+SET @@global.default_storage_engine= @save_default_storage_engine;
+
+--sync_slave_with_master
+
+--source include/stop_slave.inc
+SET @@global.default_storage_engine= @save_default_storage_engine;
+SET GLOBAL slave_parallel_workers= @save_slave_parallel_workers;

=== modified file 'sql/log_event.cc'
--- a/sql/log_event.cc	2011-07-01 13:41:35 +0000
+++ b/sql/log_event.cc	2011-07-05 17:43:04 +0000
@@ -2504,10 +2504,8 @@ Slave_worker *Log_event::get_slave_worke
                              ret_worker)))
       {
         llstr(rli->get_event_relay_log_pos(), llbuff);
-        rli->report(ERROR_LEVEL, ER_MTS_CANT_PARALLEL,
-                    ER(ER_MTS_CANT_PARALLEL),
-                    get_type_str(), rli->get_event_relay_log_name(),
-                    rli->get_event_relay_log_pos());
+        my_error(ER_MTS_CANT_PARALLEL, MYF(0),
+                 get_type_str(), rli->get_event_relay_log_name(), llbuff);
         return ret_worker;
       }
       // all temporary tables are transferred from Coordinator in over-max case
@@ -2553,10 +2551,9 @@ Slave_worker *Log_event::get_slave_worke
         DBUG_ASSERT(!ret_worker);
         
         llstr(rli->get_event_relay_log_pos(), llbuff);
-        rli->report(ERROR_LEVEL, ER_MTS_CANT_PARALLEL,
-                    ER(ER_MTS_CANT_PARALLEL),
-                    get_type_str(), rli->get_event_relay_log_name(),
-                    llbuff);
+        my_error(ER_MTS_CANT_PARALLEL, MYF(0),
+                 get_type_str(), rli->get_event_relay_log_name(), llbuff);
+
         return ret_worker;
       }
 
@@ -2722,11 +2719,9 @@ int Log_event::apply_event(Relay_log_inf
              MTS has to stop to suggest restart in the permanent sequential mode.
           */
           llstr(rli->get_event_relay_log_pos(), llbuff);
-          rli->report(ERROR_LEVEL, ER_MTS_CANT_PARALLEL,
-                      ER(ER_MTS_CANT_PARALLEL),
-                      get_type_str(), rli->get_event_relay_log_name(),
-                      rli->get_event_relay_log_pos());
-          
+          my_error(ER_MTS_CANT_PARALLEL, MYF(0),
+                   get_type_str(), rli->get_event_relay_log_name(), llbuff);
+
           /* Coordinator cant continue, it marks MTS group status accordingly */
           rli->mts_group_status= Relay_log_info::MTS_KILLED_GROUP;
 

=== modified file 'sql/rpl_slave.cc'
--- a/sql/rpl_slave.cc	2011-07-02 07:58:56 +0000
+++ b/sql/rpl_slave.cc	2011-07-05 17:43:04 +0000
@@ -4169,7 +4169,8 @@ bool mts_checkpoint_routine(Relay_log_in
         cnt != mts_checkpoint_period)
       sql_print_error("This an error cnt != mts_checkpoint_period");
 #endif
-  } while (cnt == 0 && (rli->gaq->full()  || force) &&
+  } while (!sql_slave_killed(rli->info_thd, rli) &&
+           cnt == 0 && (rli->gaq->full() || force) &&
            !DBUG_EVALUATE_IF("check_slave_debug_group", 1, 0) &&
            (my_sleep(rli->mts_coordinator_basic_nap), 1));
   /*
@@ -4413,7 +4414,8 @@ void slave_stop_workers(Relay_log_info *
   if (rli->mts_group_status != Relay_log_info::MTS_KILLED_GROUP &&
       thd->killed == THD::NOT_KILLED)
   {
-    DBUG_ASSERT(rli->mts_group_status != Relay_log_info::MTS_IN_GROUP);
+    DBUG_ASSERT(rli->mts_group_status != Relay_log_info::MTS_IN_GROUP ||
+                thd->is_error());
 
 #ifndef DBUG_OFF
     if (DBUG_EVALUATE_IF("check_slave_debug_group", 1, 0))
@@ -4819,6 +4821,8 @@ llstr(rli->get_group_master_log_pos(), l
 
  err:
 
+  slave_stop_workers(rli); // stopping worker pool before clearing own error
+
   /*
     Some events set some playgrounds, which won't be cleared because thread
     stops. Stopping of this thread may not be known to these events ("stop"
@@ -4836,8 +4840,6 @@ llstr(rli->get_group_master_log_pos(), l
   thd->reset_query();
   thd->reset_db(NULL, 0);
 
-  slave_stop_workers(rli);
-
   THD_STAGE_INFO(thd, stage_waiting_for_slave_mutex_on_exit);
   mysql_mutex_lock(&rli->run_lock);
   /* We need data_lock, at least to wake up any waiting master_pos_wait() */
@@ -6059,7 +6061,9 @@ static Log_event* next_event(Relay_log_i
           the checkpoint routine must be periodically invoked.
         */
         (void) mts_checkpoint_routine(rli, period, force, TRUE); // TODO: ALFRANIO ERROR
-        DBUG_ASSERT(!force || (force && (rli->checkpoint_seqno <= (rli->checkpoint_group - 1))));
+        DBUG_ASSERT(!force ||
+                    (force && (rli->checkpoint_seqno <= (rli->checkpoint_group - 1))) ||
+                    sql_slave_killed(thd, rli));
         mysql_mutex_lock(&rli->data_lock);
       }
       DBUG_RETURN(ev);

No bundle (reason: useless for push emails).
Thread
bzr push into mysql-next-mr-wl5569 branch (andrei.elkin:3333 to 3334)Bug#12719875Andrei Elkin6 Jul