From: Andrei Elkin Date: July 14 2011 9:40am Subject: bzr push into mysql-next-mr-wl5569 branch (andrei.elkin:3346 to 3347) WL#5569 List-Archive: http://lists.mysql.com/commits/140310 Message-Id: <201107140940.p6E9ehU4002430@mysql1000.dsl.inet.fi> MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit 3347 Andrei Elkin 2011-07-14 WL#5569 MTS further extensive rpl_circular_for_4_hosts exersices with --repeat 10 --parallel=8 revealed a race condition in that Coordinator might miss to catch not-running status for a Worker. That made Coordinator to skip only a part of a group of the slave server own events so the slave stops at not the bondary of a group. Fixed with moving marking of the errored-out Worker as failed prior to its APH entries release. TODO: notice there can be a possibility to stop at not the boundary due to graceful STOP SLAVE if one is run at time of skipping self-originated events. However this issue belongs to STS and might be similar with BUG@12604951 and BUG@12728160. @ mysql-test/suite/rpl/r/rpl_circular_for_4_hosts.result results are updated. @ mysql-test/suite/rpl/t/rpl_circular_for_4_hosts.test tests is updated with a new text of a suppression. @ sql/log_event.cc Adding clarifying text to an error message when parallel execution fails. @ sql/rpl_rli_pdb.cc Moving marking of the errored-out Worker as failed prior to its APH entries release. That ensures Coordinator always finds the non-running status in a case it has to know that (wait_for_workers_to_finish()). @ sql/share/errmsg-utf8.txt Adding a format specifier for a clarifying text. modified: mysql-test/suite/rpl/r/rpl_circular_for_4_hosts.result mysql-test/suite/rpl/t/rpl_circular_for_4_hosts.test sql/log_event.cc sql/rpl_rli_pdb.cc sql/share/errmsg-utf8.txt 3346 Alfranio Correia 2011-07-14 Post-push fixes for WL#5569 Injecting faults while updating a myisam table requires to flush the changes before committing suicide. So we have introduced the follwing code: DBUG_EXECUTE_IF("crash_after_commit_and_update_pos", - DBUG_SUICIDE();); + sql_print_information("Crashing crash_after_commit_and_update_pos."); + flush_info(TRUE); + DBUG_SUICIDE(); Besides we improved some comments. modified: mysql-test/r/mysqld--help-win.result mysql-test/suite/rpl/t/rpl_mixed_mts_crash_safe-slave.opt mysql-test/suite/rpl/t/rpl_mixed_mts_rec_crash_safe-slave.opt mysql-test/suite/rpl/t/rpl_row_mts_crash_safe-slave.opt mysql-test/suite/rpl/t/rpl_row_mts_rec_crash_safe-slave.opt mysql-test/suite/rpl/t/rpl_stm_mts_crash_safe-slave.opt mysql-test/suite/rpl/t/rpl_stm_mts_rec_crash_safe-slave.opt sql/log_event.cc sql/rpl_rli_pdb.cc sql/rpl_slave.cc sql/sys_vars.cc === modified file 'mysql-test/suite/rpl/r/rpl_circular_for_4_hosts.result' --- a/mysql-test/suite/rpl/r/rpl_circular_for_4_hosts.result 2011-07-12 20:36:17 +0000 +++ b/mysql-test/suite/rpl/r/rpl_circular_for_4_hosts.result 2011-07-14 09:40:06 +0000 @@ -7,7 +7,7 @@ CREATE TABLE t2 (a INT NOT NULL AUTO_INC include/rpl_sync.inc call mtr.add_suppression("Slave SQL.*Request to stop slave SQL Thread received while applying a group that has non-transactional changes; waiting for completion of the group"); call mtr.add_suppression("The slave coordinator and worker threads are stopped, possibly leaving data in inconsistent state"); -call mtr.add_suppression("Slave SQL: Coordinator thread of multi-threaded slave is exiting seeing a failed to apply an event Worker"); +call mtr.add_suppression("Slave SQL: Coordinator thread of multi-threaded slave is exiting seeing a failed Worker to apply an event"); *** Testing schema A->B->C->D->A *** === modified file 'mysql-test/suite/rpl/t/rpl_circular_for_4_hosts.test' --- a/mysql-test/suite/rpl/t/rpl_circular_for_4_hosts.test 2011-07-12 20:36:17 +0000 +++ b/mysql-test/suite/rpl/t/rpl_circular_for_4_hosts.test 2011-07-14 09:40:06 +0000 @@ -32,7 +32,7 @@ CREATE TABLE t2 (a INT NOT NULL AUTO_INC --connection server_4 call mtr.add_suppression("Slave SQL.*Request to stop slave SQL Thread received while applying a group that has non-transactional changes; waiting for completion of the group"); call mtr.add_suppression("The slave coordinator and worker threads are stopped, possibly leaving data in inconsistent state"); -call mtr.add_suppression("Slave SQL: Coordinator thread of multi-threaded slave is exiting seeing a failed to apply an event Worker"); +call mtr.add_suppression("Slave SQL: Coordinator thread of multi-threaded slave is exiting seeing a failed Worker to apply an event"); --echo # === modified file 'sql/log_event.cc' --- a/sql/log_event.cc 2011-07-14 07:03:55 +0000 +++ b/sql/log_event.cc 2011-07-14 09:40:06 +0000 @@ -2507,7 +2507,8 @@ Slave_worker *Log_event::get_slave_worke { llstr(rli->get_event_relay_log_pos(), llbuff); my_error(ER_MTS_CANT_PARALLEL, MYF(0), - get_type_str(), rli->get_event_relay_log_name(), llbuff); + get_type_str(), rli->get_event_relay_log_name(), llbuff, + "could not distribute the event to a Worker"); return ret_worker; } // all temporary tables are transferred from Coordinator in over-max case @@ -2554,7 +2555,9 @@ Slave_worker *Log_event::get_slave_worke llstr(rli->get_event_relay_log_pos(), llbuff); my_error(ER_MTS_CANT_PARALLEL, MYF(0), - get_type_str(), rli->get_event_relay_log_name(), llbuff); + get_type_str(), rli->get_event_relay_log_name(), llbuff, + "the event is a part of a group that is unsupported in " + "the parallel execution mode"); return ret_worker; } @@ -2729,7 +2732,8 @@ int Log_event::apply_event(Relay_log_inf */ llstr(rli->get_event_relay_log_pos(), llbuff); my_error(ER_MTS_CANT_PARALLEL, MYF(0), - get_type_str(), rli->get_event_relay_log_name(), llbuff); + get_type_str(), rli->get_event_relay_log_name(), llbuff, + "possible malformed group of events from an old master"); /* Coordinator cant continue, it marks MTS group status accordingly */ rli->mts_group_status= Relay_log_info::MTS_KILLED_GROUP; @@ -2744,7 +2748,7 @@ int Log_event::apply_event(Relay_log_inf // handle synchronization error rli->report(WARNING_LEVEL, 0, "Coordinator thread of multi-threaded slave is exiting " - "seeing a failed to apply an event Worker."); + "seeing a failed Worker to apply an event."); DBUG_RETURN(-1); } /* === modified file 'sql/rpl_rli_pdb.cc' --- a/sql/rpl_rli_pdb.cc 2011-07-14 07:03:55 +0000 +++ b/sql/rpl_rli_pdb.cc 2011-07-14 09:40:06 +0000 @@ -911,6 +911,18 @@ void Slave_worker::slave_worker_ends_gro last_group_done_index= gaq_idx; groups_done++; } + else + { + // Killing Coordinator to indicate eventual consistency error + mysql_mutex_lock(&c_rli->info_thd->LOCK_thd_data); + c_rli->info_thd->awake(THD::KILL_QUERY); + mysql_mutex_unlock(&c_rli->info_thd->LOCK_thd_data); + + // tagging as exiting so Coordinator won't be able synchronize with it + mysql_mutex_lock(&jobs_lock); + running_status= ERROR_LEAVING; + mysql_mutex_unlock(&jobs_lock); + } /* Cleanup relating to the last executed group regardless of error. @@ -973,16 +985,6 @@ void Slave_worker::slave_worker_ends_gro if (error) { - // Killing Coordinator to indicate eventual consistency error - mysql_mutex_lock(&c_rli->info_thd->LOCK_thd_data); - c_rli->info_thd->awake(THD::KILL_QUERY); - mysql_mutex_unlock(&c_rli->info_thd->LOCK_thd_data); - - // tagging as exiting so Coordinator won't be able synchronize with it - mysql_mutex_lock(&jobs_lock); - running_status= ERROR_LEAVING; - mysql_mutex_unlock(&jobs_lock); - // Awakening Coordinator that could be waiting for entry release mysql_mutex_lock(&slave_worker_hash_lock); mysql_cond_signal(&slave_worker_hash_cond); @@ -1332,8 +1334,8 @@ void Slave_worker::do_report(loglevel le @return non-negative number of released by Workers partitions (one partition by one Worker can count multiple times) - or -1 to indicate there has been a failure on a Worker - so synchronization can't succeed. + or -1 to indicate there has been a failure on a not-ignored Worker + as indicated by its running_status so synchronization can't succeed. */ int wait_for_workers_to_finish(Relay_log_info const *rli, Slave_worker *ignore) === modified file 'sql/share/errmsg-utf8.txt' --- a/sql/share/errmsg-utf8.txt 2011-06-29 07:04:19 +0000 +++ b/sql/share/errmsg-utf8.txt 2011-07-14 09:40:06 +0000 @@ -6517,6 +6517,6 @@ ER_MTS_FEATURE_IS_NOT_SUPPORTED ER_MTS_UPDATED_DBS_GREATER_MAX eng "Modified database names number exceeds the maximum %d; the names are not written into the replication event." ER_MTS_CANT_PARALLEL - eng "Can't execute the current event group in parallel mode. Encountered event %s, relay-log name %s, position %s which prevents execution of this event group in parallel mode." + eng "Can't execute the current event group in parallel mode. Encountered event %s, relay-log name %s, position %s which prevents execution of this event group in parallel mode. Reason: %s." ER_MTS_PARALLEL_INCONSISTENT_DATA eng "%s" No bundle (reason: useless for push emails).