3342 Andrei Elkin 2011-07-12
WL#5569 MTS
Fixing code and test due to rpl.rpl_circular_for_4_hosts mismatch failure, like
http://pb2.norway.sun.com/?action=archive_download&archive_id=3608382.
The reason for the mismatch was that, given two groups of events to execute,
the first for a Worker and the 2nd for the Coordinator, the Coordinator waited for
the 1st group's completion but did not verify the success of the synchronization.
So if applying the 1st group failed, processing of the 2nd
could find an inconsistent state and end up with a segfault (even though only
the mismatch has been seen so far).
@ mysql-test/suite/rpl/r/rpl_circular_for_4_hosts.result
results are updated.
@ mysql-test/suite/rpl/t/rpl_circular_for_4_hosts.test
Test is updated to include a part specific to MTS.
While all former conditions hold, the new section makes sure the B server
has two groups of events to send, which was previously neither guaranteed nor necessary.
Further, when the first of the two fails with Duplicate entry, then at applying of the 2nd the Coordinator
senses the first failure and gives up on the 2nd.
The first error remains visible in show-slave-status.
@ sql/log_event.cc
Checking the wait_for_workers_to_finish() return code in the case where the Coordinator executes a sequential-mode event.
Comments are added in a few other places where checking the return code is unnecessary.
@ sql/rpl_rli_pdb.cc
A Worker marks itself as having failed to apply, and this fact is also reported
to the Coordinator through wait_for_workers_to_finish().
The Coordinator shall check the error code in the branch that applies a sequential event.
@ sql/rpl_rli_pdb.h
Adding a new state that a Worker sets on itself to indicate its failure to apply.
@ sql/rpl_slave.cc
Refining an assert as a consequence of the new state and its actual setting by the Worker.
modified:
mysql-test/suite/rpl/r/rpl_circular_for_4_hosts.result
mysql-test/suite/rpl/t/rpl_circular_for_4_hosts.test
sql/log_event.cc
sql/rpl_rli_pdb.cc
sql/rpl_rli_pdb.h
sql/rpl_slave.cc
3341 Alfranio Correia 2011-07-10
Avoiding busy waiting when running mts recovery tests.
modified:
sql/rpl_slave.cc
=== modified file 'mysql-test/suite/rpl/r/rpl_circular_for_4_hosts.result'
--- a/mysql-test/suite/rpl/r/rpl_circular_for_4_hosts.result 2011-06-20 13:26:35 +0000
+++ b/mysql-test/suite/rpl/r/rpl_circular_for_4_hosts.result 2011-07-12 11:46:23 +0000
@@ -47,8 +47,10 @@ STOP SLAVE;
SET GLOBAL SQL_SLAVE_SKIP_COUNTER = 1;
include/start_slave.inc
INSERT INTO t1 VALUES(6,'C',2);
-INSERT INTO t1(b,c) VALUES('B',2);
call mtr.add_suppression("Slave SQL.*Duplicate entry .6. for key .PRIMARY.* Error_code: 1062");
+lock table t1 write /* must block B_2^6 coming */;
+INSERT INTO t1(b,c) VALUES('B',2);
+unlock tables;
include/wait_for_slave_sql_error.inc [errno=1062]
INSERT INTO t1(b,c) VALUES('A',2);
INSERT INTO t1(b,c) VALUES('D',2);
=== modified file 'mysql-test/suite/rpl/t/rpl_circular_for_4_hosts.test'
--- a/mysql-test/suite/rpl/t/rpl_circular_for_4_hosts.test 2011-06-20 13:26:35 +0000
+++ b/mysql-test/suite/rpl/t/rpl_circular_for_4_hosts.test 2011-07-12 11:46:23 +0000
@@ -83,17 +83,34 @@ source include/start_slave.inc;
INSERT INTO t1 VALUES(6,'C',2);
--sync_slave_with_master server_4
+#
+# MTS part of the test makes sure that server B will have received
+# the being failed C's `call mtr.add_suppression' so it will send it
+# to D after its own INSERT INTO t1 VALUES(6,'C',2).
+#
+--connection server_3
+
+ call mtr.add_suppression("Slave SQL.*Duplicate entry .6. for key .PRIMARY.* Error_code: 1062");
+
+ lock table t1 write /* must block B_2^6 coming */;
+
--connection server_2
INSERT INTO t1(b,c) VALUES('B',2);
+
+# MTS: catching failure
+--connection server_3
+
+ unlock tables;
+
# Wait while C will stop.
--connection server_3
# 1062 = ER_DUP_ENTRY
-call mtr.add_suppression("Slave SQL.*Duplicate entry .6. for key .PRIMARY.* Error_code: 1062");
--let $slave_sql_errno= 1062
--source include/wait_for_slave_sql_error.inc
--connection server_1
INSERT INTO t1(b,c) VALUES('A',2);
--connection server_4
+
INSERT INTO t1(b,c) VALUES('D',2);
=== modified file 'sql/log_event.cc'
--- a/sql/log_event.cc 2011-07-09 22:08:07 +0000
+++ b/sql/log_event.cc 2011-07-12 11:46:23 +0000
@@ -2483,6 +2483,7 @@ Slave_worker *Log_event::get_slave_worke
// Worker with id 0 to handle serial execution
if (!ret_worker)
ret_worker= *(Slave_worker**) dynamic_array_ptr(&rli->workers, 0);
+ // No need to know a possible error out of synchronization call.
(void) wait_for_workers_to_finish(rli, ret_worker);
/*
this marking is transferred further into T-event of the current group.
@@ -2739,7 +2740,14 @@ int Log_event::apply_event(Relay_log_inf
/*
Marking sure the event will be executed in sequential mode.
*/
- (void) wait_for_workers_to_finish(rli);
+ if (wait_for_workers_to_finish(rli) == -1)
+ {
+ // handle synchronization error
+ rli->report(WARNING_LEVEL, 0,
+ "Coordinator thread of multi-threaded slave is exiting "
+ "seeing a failed to apply an event Worker.");
+ DBUG_RETURN(-1);
+ }
/*
Given not in-group mark the event handler can invoke checkpoint
update routine in the following course.
=== modified file 'sql/rpl_rli_pdb.cc'
--- a/sql/rpl_rli_pdb.cc 2011-07-09 22:08:07 +0000
+++ b/sql/rpl_rli_pdb.cc 2011-07-12 11:46:23 +0000
@@ -997,6 +997,11 @@ void Slave_worker::slave_worker_ends_gro
c_rli->info_thd->awake(THD::KILL_QUERY);
mysql_mutex_unlock(&c_rli->info_thd->LOCK_thd_data);
+ // tagging as exiting so Coordinator won't be able synchronize with it
+ mysql_mutex_lock(&jobs_lock);
+ running_status= ERROR_LEAVING;
+ mysql_mutex_unlock(&jobs_lock);
+
// Awakening Coordinator that could be waiting for entry release
mysql_mutex_lock(&slave_worker_hash_lock);
mysql_cond_signal(&slave_worker_hash_cond);
@@ -1344,7 +1349,10 @@ void Slave_worker::do_report(loglevel le
of APH are relocated to the Coordinator placeholder.
@return non-negative number of released by Workers partitions
- (one partition by one Worker can count multiple times).
+ (one partition by one Worker can count multiple times)
+
+ or -1 to indicate there has been a failure on a Worker
+ so synchronization can't succeed.
*/
int wait_for_workers_to_finish(Relay_log_info const *rli, Slave_worker *ignore)
@@ -1354,6 +1362,7 @@ int wait_for_workers_to_finish(Relay_log
THD *thd= rli->info_thd;
const char info_format[]=
"Waiting for Slave Worker %d to release partition `%s`";
+ bool cant_sync= FALSE;
DBUG_ENTER("wait_for_workers_to_finish");
@@ -1404,12 +1413,14 @@ int wait_for_workers_to_finish(Relay_log
// resources relocation
mts_move_temp_tables_to_thd(thd, entry->temporary_tables);
entry->temporary_tables= NULL;
+ if (entry->worker->running_status != Slave_worker::RUNNING)
+ cant_sync= TRUE;
}
if (!ignore)
const_cast<Relay_log_info*>(rli)->mts_group_status= Relay_log_info::MTS_NOT_IN_GROUP;
- DBUG_RETURN(ret);
+ DBUG_RETURN(!cant_sync ? ret : -1);
}
=== modified file 'sql/rpl_rli_pdb.h'
--- a/sql/rpl_rli_pdb.h 2011-07-09 22:08:07 +0000
+++ b/sql/rpl_rli_pdb.h 2011-07-12 11:46:23 +0000
@@ -285,8 +285,13 @@ public:
{
NOT_RUNNING= 0,
RUNNING= 1,
- KILLED
+ ERROR_LEAVING, // is set by Worker
+ KILLED // is set by Coordinator
};
+ /*
+ The running status is guarded by jobs_lock mutex that a writer
+ Coordinator or Worker itself needs to hold when write a new value.
+ */
en_running_state volatile running_status;
bool inited_curr_group_exec_parts;
=== modified file 'sql/rpl_slave.cc'
--- a/sql/rpl_slave.cc 2011-07-10 21:40:01 +0000
+++ b/sql/rpl_slave.cc 2011-07-12 11:46:23 +0000
@@ -2870,6 +2870,7 @@ int apply_event_and_update_pos(Log_event
to the event worker.
Indeed MTS group status could be safely set to MTS_NOT_IN_GROUP
after wait_() returns.
+ No need to know a possible error out of synchronization call.
*/
(void) wait_for_workers_to_finish(rli);
}
@@ -4430,7 +4431,7 @@ void slave_stop_workers(Relay_log_info *
DBUG_SUICIDE();
}
#endif
-
+ // No need to know a possible error out of synchronization call.
(void) wait_for_workers_to_finish(rli);
/*
At this point the coordinator has been stopped and the checkpoint
@@ -4472,7 +4473,8 @@ void slave_stop_workers(Relay_log_info *
{
const char *save_proc_info;
- DBUG_ASSERT(w->running_status == Slave_worker::KILLED);
+ DBUG_ASSERT(w->running_status == Slave_worker::KILLED ||
+ w->running_status == Slave_worker::ERROR_LEAVING);
save_proc_info= thd->enter_cond(&w->jobs_cond, &w->jobs_lock,
"Waiting for workers to exit");
No bundle (reason: useless for push emails).
| Thread |
|---|
| • bzr push into mysql-next-mr-wl5569 branch (andrei.elkin:3341 to 3342) WL#5569 | Andrei Elkin | 13 Jul |