List:Commits« Previous MessageNext Message »
From:Andrei Elkin Date:July 15 2011 11:12pm
Subject:bzr push into mysql-next-mr-wl5569 branch (andrei.elkin:3347 to 3348)
Bug#12755663
View as plain text  
 3348 Andrei Elkin	2011-07-16
      bug#12755663 MTS: RPL_CIRCULAR_FOR_4_HOSTS FAILS: CANT EXECUTE THE CURRENT EVENT GROUP
      
      MTS stopped with an error in the middle of the test.
      The reason is that a group of events from the slave itself was processed partly to 
      modify the group position. In the following restart the wrong group bondary made slave
      either to error out or assert.
      
      Fixed with locating a possible race condition allowin Coordinator to ignore
      actual failed status of a Worker.
      So in the case of the test, the slave server group can't be started.
      Notice, this is a trial patch since I can't catch the failure on available to me hosts at all.
     @ sql/rpl_rli_pdb.cc
        Changing the running status of the Worker before it releases assigned entries.
        That ensure that the waiting in wait_for_workers_to_finish() Coordinator exits the function
        with a negative result and therefore stops without attempting to apply an event
        due to which it attempted synchronization.
        Couple of diagnostics into error log are added. They may be removed in short while but
        currently might be helpful to provide details if the failure won't disappear after this push.

    modified:
      mysql-test/suite/rpl/r/rpl_circular_for_4_hosts.result
      mysql-test/suite/rpl/t/rpl_circular_for_4_hosts.test
      sql/rpl_rli_pdb.cc
 3347 Andrei Elkin	2011-07-14
      WL#5569 MTS
      
      further extensive rpl_circular_for_4_hosts exersices with --repeat 10 --parallel=8 
      revealed a race condition in that Coordinator might miss to catch not-running status
      for a Worker. That made Coordinator to skip only a part of a group of the slave server own events
      so the slave stops at not the bondary of a group.
      
      Fixed with moving marking of the errored-out Worker as failed prior to its APH entries release.
      
      TODO: notice there can be a possibility to stop at not the boundary due to graceful STOP SLAVE
      if one is run at time of skipping self-originated events.
      However this issue belongs to STS and might be similar with BUG@12604951 and BUG@12728160.
     @ mysql-test/suite/rpl/r/rpl_circular_for_4_hosts.result
        results are updated.
     @ mysql-test/suite/rpl/t/rpl_circular_for_4_hosts.test
        tests is updated with a new text of a suppression.
     @ sql/log_event.cc
        Adding clarifying text to an error message when parallel execution fails.
     @ sql/rpl_rli_pdb.cc
        Moving marking of the errored-out Worker as failed prior to its APH entries release.
        That ensures Coordinator always finds the non-running status in a case it has to
        know that (wait_for_workers_to_finish()).
     @ sql/share/errmsg-utf8.txt
        Adding a format specifier for a clarifying text.

    modified:
      mysql-test/suite/rpl/r/rpl_circular_for_4_hosts.result
      mysql-test/suite/rpl/t/rpl_circular_for_4_hosts.test
      sql/log_event.cc
      sql/rpl_rli_pdb.cc
      sql/share/errmsg-utf8.txt
=== modified file 'mysql-test/suite/rpl/r/rpl_circular_for_4_hosts.result'
--- a/mysql-test/suite/rpl/r/rpl_circular_for_4_hosts.result	2011-07-14 09:40:06 +0000
+++ b/mysql-test/suite/rpl/r/rpl_circular_for_4_hosts.result	2011-07-15 23:11:11 +0000
@@ -48,9 +48,9 @@ STOP SLAVE;
 SET GLOBAL SQL_SLAVE_SKIP_COUNTER = 1;
 include/start_slave.inc
 INSERT INTO t1 VALUES(6,'C',2);
-call mtr.add_suppression("Slave SQL.*Duplicate entry .6. for key .PRIMARY.* Error_code: 1062");
 lock table t1 write  /* must block B_2^6 coming */;
 INSERT INTO t1(b,c) VALUES('B',2);
+call mtr.add_suppression("Slave SQL.*Duplicate entry .6. for key .PRIMARY.* Error_code: 1062");
 unlock tables;
 include/wait_for_slave_sql_error.inc [errno=1062]
 INSERT INTO t1(b,c) VALUES('A',2);

=== modified file 'mysql-test/suite/rpl/t/rpl_circular_for_4_hosts.test'
--- a/mysql-test/suite/rpl/t/rpl_circular_for_4_hosts.test	2011-07-14 09:40:06 +0000
+++ b/mysql-test/suite/rpl/t/rpl_circular_for_4_hosts.test	2011-07-15 23:11:11 +0000
@@ -85,23 +85,23 @@ INSERT INTO t1 VALUES(6,'C',2);
 
 #
 # MTS part of the test makes sure that server B will have received
-# the being failed C's `call mtr.add_suppression' so it will send it
+# the being failed C's `call mtr.add_suppression' and it will send it
 # to D after its own INSERT INTO t1 VALUES(6,'C',2).
 #
 --connection server_3
 
-  call mtr.add_suppression("Slave SQL.*Duplicate entry .6. for key .PRIMARY.* Error_code: 1062");
-
   lock table t1 write  /* must block B_2^6 coming */;
 
 --connection server_2
 INSERT INTO t1(b,c) VALUES('B',2);
 
+--connection server_3_1
+  call mtr.add_suppression("Slave SQL.*Duplicate entry .6. for key .PRIMARY.* Error_code: 1062");
+
 # MTS: catching failure
 --connection server_3
-
   unlock tables;
- 
+
 # Wait while C will stop.
 --connection server_3
 # 1062 = ER_DUP_ENTRY

=== modified file 'sql/rpl_rli_pdb.cc'
--- a/sql/rpl_rli_pdb.cc	2011-07-14 09:40:06 +0000
+++ b/sql/rpl_rli_pdb.cc	2011-07-15 23:11:11 +0000
@@ -913,15 +913,15 @@ void Slave_worker::slave_worker_ends_gro
   }
   else
   {
-    // Killing Coordinator to indicate eventual consistency error
-    mysql_mutex_lock(&c_rli->info_thd->LOCK_thd_data);
-    c_rli->info_thd->awake(THD::KILL_QUERY);
-    mysql_mutex_unlock(&c_rli->info_thd->LOCK_thd_data);
-
     // tagging as exiting so Coordinator won't be able synchronize with it
     mysql_mutex_lock(&jobs_lock);
     running_status= ERROR_LEAVING;
     mysql_mutex_unlock(&jobs_lock);
+
+    // Killing Coordinator to indicate eventual consistency error
+    mysql_mutex_lock(&c_rli->info_thd->LOCK_thd_data);
+    c_rli->info_thd->awake(THD::KILL_QUERY);
+    mysql_mutex_unlock(&c_rli->info_thd->LOCK_thd_data);
   }
 
   /*
@@ -1346,9 +1346,13 @@ int wait_for_workers_to_finish(Relay_log
   const char info_format[]=
     "Waiting for Slave Worker %d to release partition `%s`";
   bool cant_sync= FALSE;
+  char llbuf[22];
 
   DBUG_ENTER("wait_for_workers_to_finish");
 
+  llstr(const_cast<Relay_log_info*>(rli)->get_event_relay_log_pos(), llbuf);
+  sql_print_information("Coordinator enter synchronization when distributes event relay-log: %s pos: %s", const_cast<Relay_log_info*>(rli)->get_event_relay_log_name(), llbuf);
+
   for (uint i= 0, ret= 0; i < hash->records; i++)
   {
     db_worker_hash_entry *entry;
@@ -1401,7 +1405,11 @@ int wait_for_workers_to_finish(Relay_log
   }
 
   if (!ignore)
+  {
+    sql_print_information("Coordinator synchronized with Workers, waited entries: %d, cant_sync: %d", ret, cant_sync);
+
     const_cast<Relay_log_info*>(rli)->mts_group_status= Relay_log_info::MTS_NOT_IN_GROUP;
+  }
 
   DBUG_RETURN(!cant_sync ? ret : -1);
 }

No bundle (reason: useless for push emails).
Thread
bzr push into mysql-next-mr-wl5569 branch (andrei.elkin:3347 to 3348)Bug#12755663Andrei Elkin17 Jul