#At file:///C:/source/bzr/mysql-6.0-bug-40434/ based on revid:rafal.somla@stripped
2737 Chuck Bell 2008-12-02
BUG#40434 : Replication should not be allowed to start if restore is running
Currently, the work of WL#4209 and WL#4280 provides mechanisms to control
how replication and backup interact. However, we also need a way to
prevent replication from starting on a slave while a restore is ongoing on
that same slave.
This patch prohibits a slave from starting replication when a restore is in
progress.
modified:
mysql-test/suite/rpl/r/rpl_backup.result
mysql-test/suite/rpl/t/rpl_backup.test
sql/backup/kernel.cc
sql/mysql_priv.h
sql/mysqld.cc
sql/repl_failsafe.cc
sql/repl_failsafe.h
sql/share/errmsg.txt
sql/si_objects.cc
sql/si_objects.h
sql/sql_repl.cc
per-file messages:
mysql-test/suite/rpl/r/rpl_backup.result
Result file with additional test.
mysql-test/suite/rpl/t/rpl_backup.test
Added test for prohibiting a slave to start replication while a restore is
running.
sql/backup/kernel.cc
Added calls to restore_running() to tell server a restore is in progress.
sql/mysql_priv.h
Added extern declarations for the allow_slave_start variable and the LOCK_slave_start mutex that protects it.
sql/mysqld.cc
Defined the allow_slave_start variable and its LOCK_slave_start mutex.
Added calls to initialize and destroy the mutex and to initialize the variable.
sql/repl_failsafe.cc
Added methods to init and destroy mutex.
sql/repl_failsafe.h
Added declarations for methods to init and destroy mutex.
sql/share/errmsg.txt
New error message to tell the user a slave cannot start until the ongoing
restore is complete.
sql/si_objects.cc
Added method to tell server a restore is running.
sql/si_objects.h
Method declaration for restore running method.
sql/sql_repl.cc
Added code to prohibit the slave from starting if the variable allow_slave_start
is FALSE. restore_running() in si_objects.cc sets the variable to FALSE while
a restore is running and back to TRUE when the restore finishes.
=== modified file 'mysql-test/suite/rpl/r/rpl_backup.result'
--- a/mysql-test/suite/rpl/r/rpl_backup.result 2008-11-17 09:57:51 +0000
+++ b/mysql-test/suite/rpl/r/rpl_backup.result 2008-12-03 02:53:40 +0000
@@ -327,6 +327,110 @@ the after position of the master's binlo
should be 0.
Delta
0
+RESET MASTER;
+RESET SLAVE;
+SET DEBUG_SYNC = 'reset';
+SET DEBUG_SYNC = 'restore_before_end SIGNAL restore_running WAIT_FOR proceed';
+RESTORE FROM 'rpl_bup_s3.bak' OVERWRITE;
+SET DEBUG_SYNC = 'now WAIT_FOR restore_running';
+Try to start the slave while restore is running -- gets error.
+SLAVE START;
+ERROR HY000: Slave cannot start until restore is complete.
+SET DEBUG_SYNC = 'now SIGNAL proceed';
+SHOW SLAVE STATUS;
+Slave_IO_State #
+Master_Host 127.0.0.1
+Master_User root
+Master_Port MASTER_PORT
+Connect_Retry 1
+Master_Log_File #
+Read_Master_Log_Pos #
+Relay_Log_File #
+Relay_Log_Pos #
+Relay_Master_Log_File
+Slave_IO_Running No
+Slave_SQL_Running No
+Replicate_Do_DB
+Replicate_Ignore_DB
+Replicate_Do_Table
+Replicate_Ignore_Table
+Replicate_Wild_Do_Table
+Replicate_Wild_Ignore_Table
+Last_Errno 0
+Last_Error
+Skip_Counter 0
+Exec_Master_Log_Pos #
+Relay_Log_Space #
+Until_Condition None
+Until_Log_File
+Until_Log_Pos 0
+Master_SSL_Allowed No
+Master_SSL_CA_File
+Master_SSL_CA_Path
+Master_SSL_Cert
+Master_SSL_Cipher
+Master_SSL_Key
+Seconds_Behind_Master #
+Master_SSL_Verify_Server_Cert No
+Last_IO_Errno 0
+Last_IO_Error
+Last_SQL_Errno 0
+Last_SQL_Error
+Restore is now complete.
+backup_id
+#
+SET DEBUG_SYNC = 'now SIGNAL done';
+SET DEBUG_SYNC = 'now WAIT_FOR done';
+SHOW DATABASES;
+Database
+information_schema
+mysql
+rpl_backup
+test
+SET DEBUG_SYNC = 'reset';
+Try to start the slave after restore is done -- should succeed.
+SLAVE START;
+SHOW SLAVE STATUS;
+Slave_IO_State #
+Master_Host 127.0.0.1
+Master_User root
+Master_Port MASTER_PORT
+Connect_Retry 1
+Master_Log_File #
+Read_Master_Log_Pos #
+Relay_Log_File #
+Relay_Log_Pos #
+Relay_Master_Log_File master-bin.000001
+Slave_IO_Running Yes
+Slave_SQL_Running Yes
+Replicate_Do_DB
+Replicate_Ignore_DB
+Replicate_Do_Table
+Replicate_Ignore_Table
+Replicate_Wild_Do_Table
+Replicate_Wild_Ignore_Table
+Last_Errno 0
+Last_Error
+Skip_Counter 0
+Exec_Master_Log_Pos #
+Relay_Log_Space #
+Until_Condition None
+Until_Log_File
+Until_Log_Pos 0
+Master_SSL_Allowed No
+Master_SSL_CA_File
+Master_SSL_CA_Path
+Master_SSL_Cert
+Master_SSL_Cipher
+Master_SSL_Key
+Seconds_Behind_Master #
+Master_SSL_Verify_Server_Cert No
+Last_IO_Errno 0
+Last_IO_Error
+Last_SQL_Errno 0
+Last_SQL_Error
+Now stop the slave.
+SLAVE STOP;
FLUSH BACKUP LOGS;
PURGE BACKUP LOGS;
DROP DATABASE rpl_backup;
=== modified file 'mysql-test/suite/rpl/t/rpl_backup.test'
--- a/mysql-test/suite/rpl/t/rpl_backup.test 2008-11-17 09:57:51 +0000
+++ b/mysql-test/suite/rpl/t/rpl_backup.test 2008-12-03 02:53:40 +0000
@@ -375,6 +375,63 @@ eval SELECT $master_after_pos - $master_
--enable_query_log
#
+# Now test 'slave start' while restore is in progress on slave.
+#
+
+RESET MASTER;
+
+connection slave;
+
+RESET SLAVE;
+
+SET DEBUG_SYNC = 'reset';
+
+connection slave1;
+
+SET DEBUG_SYNC = 'restore_before_end SIGNAL restore_running WAIT_FOR proceed';
+SEND RESTORE FROM 'rpl_bup_s3.bak' OVERWRITE;
+
+connection slave;
+
+SET DEBUG_SYNC = 'now WAIT_FOR restore_running';
+
+--echo Try to start the slave while restore is running -- gets error.
+--error ER_RESTORE_CANNOT_START_SLAVE
+SLAVE START;
+
+SET DEBUG_SYNC = 'now SIGNAL proceed';
+
+--replace_result $MASTER_MYPORT MASTER_PORT
+--replace_column 1 # 6 # 7 # 8 # 9 # 22 # 23 # 33 #
+--query_vertical SHOW SLAVE STATUS
+
+connection slave1;
+--echo Restore is now complete.
+--replace_column 1 #
+reap;
+SET DEBUG_SYNC = 'now SIGNAL done';
+
+connection slave;
+
+SET DEBUG_SYNC = 'now WAIT_FOR done';
+
+SHOW DATABASES;
+
+SET DEBUG_SYNC = 'reset';
+
+--echo Try to start the slave after restore is done -- should succeed.
+SLAVE START;
+--source include/wait_for_slave_to_start.inc
+
+--replace_result $MASTER_MYPORT MASTER_PORT
+--replace_column 1 # 6 # 7 # 8 # 9 # 22 # 23 # 33 #
+--query_vertical SHOW SLAVE STATUS
+
+--echo Now stop the slave.
+SLAVE STOP;
+--source include/wait_for_slave_to_stop.inc
+
+#
# Cleanup
#
connection master;
=== modified file 'sql/backup/kernel.cc'
--- a/sql/backup/kernel.cc 2008-11-28 10:10:39 +0000
+++ b/sql/backup/kernel.cc 2008-12-03 02:53:40 +0000
@@ -219,10 +219,14 @@ execute_backup_command(THD *thd, LEX *le
DEBUG_SYNC(thd, "after_backup_start_restore");
+ obs::restore_running(TRUE);
+
res= context.do_restore(overwrite);
DEBUG_SYNC(thd, "restore_before_end");
+ obs::restore_running(FALSE);
+
if (res)
DBUG_RETURN(send_error(context, ER_BACKUP_RESTORE));
=== modified file 'sql/mysql_priv.h'
--- a/sql/mysql_priv.h 2008-11-06 18:39:27 +0000
+++ b/sql/mysql_priv.h 2008-12-03 02:53:40 +0000
@@ -1988,6 +1988,7 @@ extern ulong slow_launch_threads, slow_l
extern ulong table_cache_size, table_def_size;
extern ulong max_connections,max_connect_errors, connect_timeout;
extern my_bool slave_allow_batching;
+extern my_bool allow_slave_start;
extern ulong slave_net_timeout, slave_trans_retries;
extern uint max_user_connections;
extern ulong what_to_log,flush_time;
@@ -2078,7 +2079,7 @@ extern pthread_mutex_t LOCK_mysql_create
LOCK_error_log, LOCK_delayed_insert, LOCK_uuid_short,
LOCK_delayed_status, LOCK_delayed_create, LOCK_crypt, LOCK_timezone,
LOCK_slave_list, LOCK_active_mi, LOCK_manager, LOCK_global_read_lock,
- LOCK_global_system_variables, LOCK_user_conn,
+ LOCK_global_system_variables, LOCK_user_conn, LOCK_slave_start,
LOCK_prepared_stmt_count,
LOCK_connection_count;
#ifdef HAVE_OPENSSL
=== modified file 'sql/mysqld.cc'
--- a/sql/mysqld.cc 2008-11-17 11:17:59 +0000
+++ b/sql/mysqld.cc 2008-12-03 02:53:40 +0000
@@ -540,6 +540,7 @@ ulong query_buff_size, slow_launch_time,
ulong open_files_limit, max_binlog_size, max_relay_log_size;
ulong slave_net_timeout, slave_trans_retries;
my_bool slave_allow_batching;
+my_bool allow_slave_start= TRUE;
ulong slave_exec_mode_options;
const char *slave_exec_mode_str= "STRICT";
ulong thread_cache_size=0, thread_pool_size= 0;
@@ -690,7 +691,7 @@ pthread_mutex_t LOCK_mysql_create_db, LO
LOCK_crypt,
LOCK_global_system_variables,
LOCK_user_conn, LOCK_slave_list, LOCK_active_mi,
- LOCK_connection_count;
+ LOCK_connection_count, LOCK_slave_start;
/**
The below lock protects access to two global server variables:
@@ -1387,6 +1388,7 @@ void clean_up(bool print_message)
free_max_user_conn();
#ifdef HAVE_REPLICATION
end_slave_list();
+ end_slave_start();
#endif
delete binlog_filter;
delete rpl_filter;
@@ -3914,6 +3916,10 @@ static int init_server_components()
my_uuid_init((ulong) (my_rnd(&sql_rand))*12345,12345);
#ifdef HAVE_REPLICATION
init_slave_list();
+ init_slave_start();
+ pthread_mutex_lock(&LOCK_slave_start);
+ allow_slave_start= TRUE;
+ pthread_mutex_unlock(&LOCK_slave_start);
#endif
/* Setup logs */
=== modified file 'sql/repl_failsafe.cc'
--- a/sql/repl_failsafe.cc 2008-08-07 17:52:43 +0000
+++ b/sql/repl_failsafe.cc 2008-12-03 02:53:40 +0000
@@ -236,6 +236,22 @@ void end_slave_list()
}
}
+/**
+ Initialize LOCK_slave_start, the mutex guarding allow_slave_start.
+*/
+void init_slave_start()
+{
+ pthread_mutex_init(&LOCK_slave_start, MY_MUTEX_INIT_FAST);
+}
+
+/**
+ Destroy LOCK_slave_start, the mutex guarding allow_slave_start.
+*/
+void end_slave_start()
+{
+ pthread_mutex_destroy(&LOCK_slave_start);
+}
+
static int find_target_pos(LEX_MASTER_INFO *mi, IO_CACHE *log, char *errmsg)
{
my_off_t log_pos = (my_off_t) mi->pos;
=== modified file 'sql/repl_failsafe.h'
--- a/sql/repl_failsafe.h 2007-08-16 06:52:50 +0000
+++ b/sql/repl_failsafe.h 2008-12-03 02:53:40 +0000
@@ -45,6 +45,8 @@ bool show_slave_hosts(THD* thd);
int translate_master(THD* thd, LEX_MASTER_INFO* mi, char* errmsg);
void init_slave_list();
void end_slave_list();
+void init_slave_start();
+void end_slave_start();
int register_slave(THD* thd, uchar* packet, uint packet_length);
void unregister_slave(THD* thd, bool only_mine, bool need_mutex);
=== modified file 'sql/share/errmsg.txt'
--- a/sql/share/errmsg.txt 2008-11-28 10:10:39 +0000
+++ b/sql/share/errmsg.txt 2008-12-03 02:53:40 +0000
@@ -6442,3 +6442,5 @@ ER_BACKUP_BACKUP_DBS
eng "Backing up %u database(s) %.220s"
ER_BACKUP_RESTORE_DBS
eng "Restoring %u database(s) %.220s"
+ER_RESTORE_CANNOT_START_SLAVE
+ eng "Slave cannot start until restore is complete."
=== modified file 'sql/si_objects.cc'
--- a/sql/si_objects.cc 2008-10-30 12:29:54 +0000
+++ b/sql/si_objects.cc 2008-12-03 02:53:40 +0000
@@ -4073,6 +4073,22 @@ int disable_slave_connections(bool disab
}
/**
+ Record whether a restore is currently running on this server.
+
+ Sets allow_slave_start to the inverse of @c running while holding
+ LOCK_slave_start. start_slave() reads this flag and fails with
+ ER_RESTORE_CANNOT_START_SLAVE while it is FALSE.
+
+ @param[in] running TRUE = restore running, FALSE = no restore running
+*/
+void restore_running(bool running)
+{
+ pthread_mutex_lock(&LOCK_slave_start);
+ allow_slave_start= !running;
+ pthread_mutex_unlock(&LOCK_slave_start);
+}
+
+/**
Write an incident event in the binary log.
This method can be used to issue an incident event to inform the slave
=== modified file 'sql/si_objects.h'
--- a/sql/si_objects.h 2008-10-28 18:14:14 +0000
+++ b/sql/si_objects.h 2008-12-03 02:53:40 +0000
@@ -739,6 +739,11 @@ int num_slaves_attached();
*/
int disable_slave_connections(bool disable);
+/**
+ Tell the server whether a restore is running (TRUE) or not (FALSE).
+*/
+void restore_running(bool running);
+
/**
Enumeration of the incidents that can occur on the master.
*/
=== modified file 'sql/sql_repl.cc'
--- a/sql/sql_repl.cc 2008-10-28 18:14:14 +0000
+++ b/sql/sql_repl.cc 2008-12-03 02:53:40 +0000
@@ -1000,6 +1000,21 @@ int start_slave(THD* thd , Master_info*
if (check_access(thd, SUPER_ACL, any_db,0,0,0,0))
DBUG_RETURN(1);
+
+ /*
+ Refuse to start the slave while a restore is running on this server.
+ */
+ pthread_mutex_lock(&LOCK_slave_start);
+ bool proceed= allow_slave_start;
+ pthread_mutex_unlock(&LOCK_slave_start);
+ DBUG_PRINT("info", ("allow_slave_start: %d", proceed));
+ if (!proceed)
+ {
+ slave_errno= ER_RESTORE_CANNOT_START_SLAVE;
+ my_message(slave_errno, ER(slave_errno), MYF(0));
+ DBUG_RETURN(1);
+ }
+
lock_slave_threads(mi); // this allows us to cleanly read slave_running
// Get a mask of _stopped_ threads
init_thread_mask(&thread_mask,mi,1 /* inverse */);