3348 Andrei Elkin 2011-07-16
bug#12755663 MTS: RPL_CIRCULAR_FOR_4_HOSTS FAILS: CANT EXECUTE THE CURRENT EVENT GROUP
MTS stopped with an error in the middle of the test.
The reason is that a group of events from the slave itself was processed partly to
modify the group position. In the following restart the wrong group bondary made slave
either to error out or assert.
Fixed with locating a possible race condition allowin Coordinator to ignore
actual failed status of a Worker.
So in the case of the test, the slave server group can't be started.
Notice, this is a trial patch since I can't catch the failure on available to me hosts at all.
@ sql/rpl_rli_pdb.cc
Changing the running status of the Worker before it releases assigned entries.
That ensure that the waiting in wait_for_workers_to_finish() Coordinator exits the function
with a negative result and therefore stops without attempting to apply an event
due to which it attempted synchronization.
Couple of diagnostics into error log are added. They may be removed in short while but
currently might be helpful to provide details if the failure won't disappear after this push.
modified:
mysql-test/suite/rpl/r/rpl_circular_for_4_hosts.result
mysql-test/suite/rpl/t/rpl_circular_for_4_hosts.test
sql/rpl_rli_pdb.cc
3347 Andrei Elkin 2011-07-14
WL#5569 MTS
further extensive rpl_circular_for_4_hosts exersices with --repeat 10 --parallel=8
revealed a race condition in that Coordinator might miss to catch not-running status
for a Worker. That made Coordinator to skip only a part of a group of the slave server own events
so the slave stops at not the bondary of a group.
Fixed with moving marking of the errored-out Worker as failed prior to its APH entries release.
TODO: notice there can be a possibility to stop at not the boundary due to graceful STOP SLAVE
if one is run at time of skipping self-originated events.
However this issue belongs to STS and might be similar with BUG@12604951 and BUG@12728160.
@ mysql-test/suite/rpl/r/rpl_circular_for_4_hosts.result
results are updated.
@ mysql-test/suite/rpl/t/rpl_circular_for_4_hosts.test
tests is updated with a new text of a suppression.
@ sql/log_event.cc
Adding clarifying text to an error message when parallel execution fails.
@ sql/rpl_rli_pdb.cc
Moving marking of the errored-out Worker as failed prior to its APH entries release.
That ensures Coordinator always finds the non-running status in a case it has to
know that (wait_for_workers_to_finish()).
@ sql/share/errmsg-utf8.txt
Adding a format specifier for a clarifying text.
modified:
mysql-test/suite/rpl/r/rpl_circular_for_4_hosts.result
mysql-test/suite/rpl/t/rpl_circular_for_4_hosts.test
sql/log_event.cc
sql/rpl_rli_pdb.cc
sql/share/errmsg-utf8.txt
=== modified file 'mysql-test/suite/rpl/r/rpl_circular_for_4_hosts.result'
--- a/mysql-test/suite/rpl/r/rpl_circular_for_4_hosts.result 2011-07-14 09:40:06 +0000
+++ b/mysql-test/suite/rpl/r/rpl_circular_for_4_hosts.result 2011-07-15 23:11:11 +0000
@@ -48,9 +48,9 @@ STOP SLAVE;
SET GLOBAL SQL_SLAVE_SKIP_COUNTER = 1;
include/start_slave.inc
INSERT INTO t1 VALUES(6,'C',2);
-call mtr.add_suppression("Slave SQL.*Duplicate entry .6. for key .PRIMARY.* Error_code: 1062");
lock table t1 write /* must block B_2^6 coming */;
INSERT INTO t1(b,c) VALUES('B',2);
+call mtr.add_suppression("Slave SQL.*Duplicate entry .6. for key .PRIMARY.* Error_code: 1062");
unlock tables;
include/wait_for_slave_sql_error.inc [errno=1062]
INSERT INTO t1(b,c) VALUES('A',2);
=== modified file 'mysql-test/suite/rpl/t/rpl_circular_for_4_hosts.test'
--- a/mysql-test/suite/rpl/t/rpl_circular_for_4_hosts.test 2011-07-14 09:40:06 +0000
+++ b/mysql-test/suite/rpl/t/rpl_circular_for_4_hosts.test 2011-07-15 23:11:11 +0000
@@ -85,23 +85,23 @@ INSERT INTO t1 VALUES(6,'C',2);
#
# MTS part of the test makes sure that server B will have received
-# the being failed C's `call mtr.add_suppression' so it will send it
+# the being failed C's `call mtr.add_suppression' and it will send it
# to D after its own INSERT INTO t1 VALUES(6,'C',2).
#
--connection server_3
- call mtr.add_suppression("Slave SQL.*Duplicate entry .6. for key .PRIMARY.* Error_code: 1062");
-
lock table t1 write /* must block B_2^6 coming */;
--connection server_2
INSERT INTO t1(b,c) VALUES('B',2);
+--connection server_3_1
+ call mtr.add_suppression("Slave SQL.*Duplicate entry .6. for key .PRIMARY.* Error_code: 1062");
+
# MTS: catching failure
--connection server_3
-
unlock tables;
-
+
# Wait while C will stop.
--connection server_3
# 1062 = ER_DUP_ENTRY
=== modified file 'sql/rpl_rli_pdb.cc'
--- a/sql/rpl_rli_pdb.cc 2011-07-14 09:40:06 +0000
+++ b/sql/rpl_rli_pdb.cc 2011-07-15 23:11:11 +0000
@@ -913,15 +913,15 @@ void Slave_worker::slave_worker_ends_gro
}
else
{
- // Killing Coordinator to indicate eventual consistency error
- mysql_mutex_lock(&c_rli->info_thd->LOCK_thd_data);
- c_rli->info_thd->awake(THD::KILL_QUERY);
- mysql_mutex_unlock(&c_rli->info_thd->LOCK_thd_data);
-
// tagging as exiting so Coordinator won't be able synchronize with it
mysql_mutex_lock(&jobs_lock);
running_status= ERROR_LEAVING;
mysql_mutex_unlock(&jobs_lock);
+
+ // Killing Coordinator to indicate eventual consistency error
+ mysql_mutex_lock(&c_rli->info_thd->LOCK_thd_data);
+ c_rli->info_thd->awake(THD::KILL_QUERY);
+ mysql_mutex_unlock(&c_rli->info_thd->LOCK_thd_data);
}
/*
@@ -1346,9 +1346,13 @@ int wait_for_workers_to_finish(Relay_log
const char info_format[]=
"Waiting for Slave Worker %d to release partition `%s`";
bool cant_sync= FALSE;
+ char llbuf[22];
DBUG_ENTER("wait_for_workers_to_finish");
+ llstr(const_cast<Relay_log_info*>(rli)->get_event_relay_log_pos(), llbuf);
+ sql_print_information("Coordinator enter synchronization when distributes event relay-log: %s pos: %s", const_cast<Relay_log_info*>(rli)->get_event_relay_log_name(), llbuf);
+
for (uint i= 0, ret= 0; i < hash->records; i++)
{
db_worker_hash_entry *entry;
@@ -1401,7 +1405,11 @@ int wait_for_workers_to_finish(Relay_log
}
if (!ignore)
+ {
+ sql_print_information("Coordinator synchronized with Workers, waited entries: %d, cant_sync: %d", ret, cant_sync);
+
const_cast<Relay_log_info*>(rli)->mts_group_status= Relay_log_info::MTS_NOT_IN_GROUP;
+ }
DBUG_RETURN(!cant_sync ? ret : -1);
}
No bundle (reason: useless for push emails).
| Thread |
|---|
| • bzr push into mysql-next-mr-wl5569 branch (andrei.elkin:3347 to 3348)Bug#12755663 | Andrei Elkin | 17 Jul |