3347 Andrei Elkin 2011-07-14
WL#5569 MTS
further extensive rpl_circular_for_4_hosts exersices with --repeat 10 --parallel=8
revealed a race condition in that Coordinator might miss to catch not-running status
for a Worker. That made Coordinator to skip only a part of a group of the slave server own events
so the slave stops at not the bondary of a group.
Fixed with moving marking of the errored-out Worker as failed prior to its APH entries release.
TODO: notice there can be a possibility to stop at not the boundary due to graceful STOP SLAVE
if one is run at time of skipping self-originated events.
However this issue belongs to STS and might be similar with BUG@12604951 and BUG@12728160.
@ mysql-test/suite/rpl/r/rpl_circular_for_4_hosts.result
results are updated.
@ mysql-test/suite/rpl/t/rpl_circular_for_4_hosts.test
tests is updated with a new text of a suppression.
@ sql/log_event.cc
Adding clarifying text to an error message when parallel execution fails.
@ sql/rpl_rli_pdb.cc
Moving marking of the errored-out Worker as failed prior to its APH entries release.
That ensures Coordinator always finds the non-running status in a case it has to
know that (wait_for_workers_to_finish()).
@ sql/share/errmsg-utf8.txt
Adding a format specifier for a clarifying text.
modified:
mysql-test/suite/rpl/r/rpl_circular_for_4_hosts.result
mysql-test/suite/rpl/t/rpl_circular_for_4_hosts.test
sql/log_event.cc
sql/rpl_rli_pdb.cc
sql/share/errmsg-utf8.txt
3346 Alfranio Correia 2011-07-14
Post-push fixes for WL#5569
Injecting faults while updating a myisam table requires to flush the
changes before committing suicide. So we have introduced the follwing
code:
DBUG_EXECUTE_IF("crash_after_commit_and_update_pos",
- DBUG_SUICIDE(););
+ sql_print_information("Crashing crash_after_commit_and_update_pos.");
+ flush_info(TRUE);
+ DBUG_SUICIDE();
Besides we improved some comments.
modified:
mysql-test/r/mysqld--help-win.result
mysql-test/suite/rpl/t/rpl_mixed_mts_crash_safe-slave.opt
mysql-test/suite/rpl/t/rpl_mixed_mts_rec_crash_safe-slave.opt
mysql-test/suite/rpl/t/rpl_row_mts_crash_safe-slave.opt
mysql-test/suite/rpl/t/rpl_row_mts_rec_crash_safe-slave.opt
mysql-test/suite/rpl/t/rpl_stm_mts_crash_safe-slave.opt
mysql-test/suite/rpl/t/rpl_stm_mts_rec_crash_safe-slave.opt
sql/log_event.cc
sql/rpl_rli_pdb.cc
sql/rpl_slave.cc
sql/sys_vars.cc
=== modified file 'mysql-test/suite/rpl/r/rpl_circular_for_4_hosts.result'
--- a/mysql-test/suite/rpl/r/rpl_circular_for_4_hosts.result 2011-07-12 20:36:17 +0000
+++ b/mysql-test/suite/rpl/r/rpl_circular_for_4_hosts.result 2011-07-14 09:40:06 +0000
@@ -7,7 +7,7 @@ CREATE TABLE t2 (a INT NOT NULL AUTO_INC
include/rpl_sync.inc
call mtr.add_suppression("Slave SQL.*Request to stop slave SQL Thread received while applying a group that has non-transactional changes; waiting for completion of the group");
call mtr.add_suppression("The slave coordinator and worker threads are stopped, possibly leaving data in inconsistent state");
-call mtr.add_suppression("Slave SQL: Coordinator thread of multi-threaded slave is exiting seeing a failed to apply an event Worker");
+call mtr.add_suppression("Slave SQL: Coordinator thread of multi-threaded slave is exiting seeing a failed Worker to apply an event");
*** Testing schema A->B->C->D->A ***
=== modified file 'mysql-test/suite/rpl/t/rpl_circular_for_4_hosts.test'
--- a/mysql-test/suite/rpl/t/rpl_circular_for_4_hosts.test 2011-07-12 20:36:17 +0000
+++ b/mysql-test/suite/rpl/t/rpl_circular_for_4_hosts.test 2011-07-14 09:40:06 +0000
@@ -32,7 +32,7 @@ CREATE TABLE t2 (a INT NOT NULL AUTO_INC
--connection server_4
call mtr.add_suppression("Slave SQL.*Request to stop slave SQL Thread received while applying a group that has non-transactional changes; waiting for completion of the group");
call mtr.add_suppression("The slave coordinator and worker threads are stopped, possibly leaving data in inconsistent state");
-call mtr.add_suppression("Slave SQL: Coordinator thread of multi-threaded slave is exiting seeing a failed to apply an event Worker");
+call mtr.add_suppression("Slave SQL: Coordinator thread of multi-threaded slave is exiting seeing a failed Worker to apply an event");
--echo
#
=== modified file 'sql/log_event.cc'
--- a/sql/log_event.cc 2011-07-14 07:03:55 +0000
+++ b/sql/log_event.cc 2011-07-14 09:40:06 +0000
@@ -2507,7 +2507,8 @@ Slave_worker *Log_event::get_slave_worke
{
llstr(rli->get_event_relay_log_pos(), llbuff);
my_error(ER_MTS_CANT_PARALLEL, MYF(0),
- get_type_str(), rli->get_event_relay_log_name(), llbuff);
+ get_type_str(), rli->get_event_relay_log_name(), llbuff,
+ "could not distribute the event to a Worker");
return ret_worker;
}
// all temporary tables are transferred from Coordinator in over-max case
@@ -2554,7 +2555,9 @@ Slave_worker *Log_event::get_slave_worke
llstr(rli->get_event_relay_log_pos(), llbuff);
my_error(ER_MTS_CANT_PARALLEL, MYF(0),
- get_type_str(), rli->get_event_relay_log_name(), llbuff);
+ get_type_str(), rli->get_event_relay_log_name(), llbuff,
+ "the event is a part of a group that is unsupported in "
+ "the parallel execution mode");
return ret_worker;
}
@@ -2729,7 +2732,8 @@ int Log_event::apply_event(Relay_log_inf
*/
llstr(rli->get_event_relay_log_pos(), llbuff);
my_error(ER_MTS_CANT_PARALLEL, MYF(0),
- get_type_str(), rli->get_event_relay_log_name(), llbuff);
+ get_type_str(), rli->get_event_relay_log_name(), llbuff,
+ "possible malformed group of events from an old master");
/* Coordinator cant continue, it marks MTS group status accordingly */
rli->mts_group_status= Relay_log_info::MTS_KILLED_GROUP;
@@ -2744,7 +2748,7 @@ int Log_event::apply_event(Relay_log_inf
// handle synchronization error
rli->report(WARNING_LEVEL, 0,
"Coordinator thread of multi-threaded slave is exiting "
- "seeing a failed to apply an event Worker.");
+ "seeing a failed Worker to apply an event.");
DBUG_RETURN(-1);
}
/*
=== modified file 'sql/rpl_rli_pdb.cc'
--- a/sql/rpl_rli_pdb.cc 2011-07-14 07:03:55 +0000
+++ b/sql/rpl_rli_pdb.cc 2011-07-14 09:40:06 +0000
@@ -911,6 +911,18 @@ void Slave_worker::slave_worker_ends_gro
last_group_done_index= gaq_idx;
groups_done++;
}
+ else
+ {
+ // Killing Coordinator to indicate eventual consistency error
+ mysql_mutex_lock(&c_rli->info_thd->LOCK_thd_data);
+ c_rli->info_thd->awake(THD::KILL_QUERY);
+ mysql_mutex_unlock(&c_rli->info_thd->LOCK_thd_data);
+
+ // tagging as exiting so Coordinator won't be able synchronize with it
+ mysql_mutex_lock(&jobs_lock);
+ running_status= ERROR_LEAVING;
+ mysql_mutex_unlock(&jobs_lock);
+ }
/*
Cleanup relating to the last executed group regardless of error.
@@ -973,16 +985,6 @@ void Slave_worker::slave_worker_ends_gro
if (error)
{
- // Killing Coordinator to indicate eventual consistency error
- mysql_mutex_lock(&c_rli->info_thd->LOCK_thd_data);
- c_rli->info_thd->awake(THD::KILL_QUERY);
- mysql_mutex_unlock(&c_rli->info_thd->LOCK_thd_data);
-
- // tagging as exiting so Coordinator won't be able synchronize with it
- mysql_mutex_lock(&jobs_lock);
- running_status= ERROR_LEAVING;
- mysql_mutex_unlock(&jobs_lock);
-
// Awakening Coordinator that could be waiting for entry release
mysql_mutex_lock(&slave_worker_hash_lock);
mysql_cond_signal(&slave_worker_hash_cond);
@@ -1332,8 +1334,8 @@ void Slave_worker::do_report(loglevel le
@return non-negative number of released by Workers partitions
(one partition by one Worker can count multiple times)
- or -1 to indicate there has been a failure on a Worker
- so synchronization can't succeed.
+ or -1 to indicate there has been a failure on a not-ignored Worker
+ as indicated by its running_status so synchronization can't succeed.
*/
int wait_for_workers_to_finish(Relay_log_info const *rli, Slave_worker *ignore)
=== modified file 'sql/share/errmsg-utf8.txt'
--- a/sql/share/errmsg-utf8.txt 2011-06-29 07:04:19 +0000
+++ b/sql/share/errmsg-utf8.txt 2011-07-14 09:40:06 +0000
@@ -6517,6 +6517,6 @@ ER_MTS_FEATURE_IS_NOT_SUPPORTED
ER_MTS_UPDATED_DBS_GREATER_MAX
eng "Modified database names number exceeds the maximum %d; the names are not written into the replication event."
ER_MTS_CANT_PARALLEL
- eng "Can't execute the current event group in parallel mode. Encountered event %s, relay-log name %s, position %s which prevents execution of this event group in parallel mode."
+ eng "Can't execute the current event group in parallel mode. Encountered event %s, relay-log name %s, position %s which prevents execution of this event group in parallel mode. Reason: %s."
ER_MTS_PARALLEL_INCONSISTENT_DATA
eng "%s"
No bundle (reason: useless for push emails).
| Thread |
|---|
| • bzr push into mysql-next-mr-wl5569 branch (andrei.elkin:3346 to 3347) WL#5569 | Andrei Elkin | 17 Jul |