#At file:///D:/source/bzr/mysql-6.0-bug-40434/
2741 Chuck Bell 2008-12-16
BUG#40434 : Replication should not be allowed to start if restore is running
Currently, the work of WL#4209 and WL#4280 provides mechanisms to control
how replication and backup interact. However, we also need to have a method
to prevent replication from starting on a slave if a restore is ongoing on that
same slave.
This patch prohibits a slave from starting replication when a restore is in
progress.
modified:
mysql-test/lib/mtr_report.pl
mysql-test/suite/rpl/r/rpl_backup.result
mysql-test/suite/rpl/t/rpl_backup.test
sql/backup/kernel.cc
sql/mysql_priv.h
sql/mysqld.cc
sql/share/errmsg.txt
sql/si_objects.cc
sql/si_objects.h
sql/sql_repl.cc
sql/sql_repl.h
per-file messages:
mysql-test/lib/mtr_report.pl
Mask for error trying to start restore while replicating.
mysql-test/suite/rpl/r/rpl_backup.result
Result file with additional test.
mysql-test/suite/rpl/t/rpl_backup.test
Added test for prohibiting a slave from starting replication while a restore
is running.
sql/backup/kernel.cc
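Added code to block replication during restore and release the
block after restore is complete.
Moved the ER_RESTORE_ON_SLAVE check from execute_backup_command() into
Backup_restore_ctx::prepare_for_restore().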
sql/mysql_priv.h
Added extern declarations for allow_slave_start and for LOCK_slave_start,
the mutex that protects it.
Added extern declaration for reason_slave_blocked, the 'reason' string used
in the error message.
sql/mysqld.cc
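Added LOCK_slave_start mutex and the allow_slave_start variable it protects.
Added calls to init_slave_start() and end_slave_start() to initialize and
destroy the mutex.
Added reason_slave_blocked, the string holding the reason reported in the
error message for replication start failure.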
sql/share/errmsg.txt
New error message to tell the user a slave cannot start until the ongoing
process is complete.
sql/si_objects.cc
Added block_replication() method to tell the server that an operation
(such as a restore) requires slave start to be blocked.
sql/si_objects.h
Declaration of the block_replication() method.
sql/sql_repl.cc
Added methods to init and destroy mutex.
Added code to prohibit slave from starting if the variable allow_slave_start
is set to FALSE. The variable is set to FALSE while a restore is running via
block_replication() in si_objects.cc.
sql/sql_repl.h
Added declarations for methods to init and destroy mutex.
=== modified file 'mysql-test/lib/mtr_report.pl'
--- a/mysql-test/lib/mtr_report.pl 2008-11-21 15:02:34 +0000
+++ b/mysql-test/lib/mtr_report.pl 2008-12-16 20:54:07 +0000
@@ -377,6 +377,13 @@ sub mtr_report_stats ($) {
/Backup: The MySQL server is running with the /
) or
+ # The rpl_backup test will throw an error about running restore
+ # on a slave.
+ ($testname eq 'rpl.rpl_backup') and
+ (
+ /A restore operation was attempted on a slave during replication/
+ ) or
+
# The views test triggers errors below on purpose
($testname eq 'backup.backup_views') and
(
=== modified file 'mysql-test/suite/rpl/r/rpl_backup.result'
--- a/mysql-test/suite/rpl/r/rpl_backup.result 2008-11-17 09:57:51 +0000
+++ b/mysql-test/suite/rpl/r/rpl_backup.result 2008-12-16 20:54:07 +0000
@@ -327,6 +327,110 @@ the after position of the master's binlo
should be 0.
Delta
0
+RESET MASTER;
+RESET SLAVE;
+SET DEBUG_SYNC = 'reset';
+SET DEBUG_SYNC = 'restore_before_end SIGNAL restore_running WAIT_FOR proceed';
+RESTORE FROM 'rpl_bup_s3.bak' OVERWRITE;
+SET DEBUG_SYNC = 'now WAIT_FOR restore_running';
+Try to start the slave while restore is running -- gets error.
+SLAVE START;
+ERROR HY000: Cannot start slave. SLAVE START is blocked by RESTORE.
+SET DEBUG_SYNC = 'now SIGNAL proceed';
+SHOW SLAVE STATUS;
+Slave_IO_State #
+Master_Host 127.0.0.1
+Master_User root
+Master_Port MASTER_PORT
+Connect_Retry 1
+Master_Log_File #
+Read_Master_Log_Pos #
+Relay_Log_File #
+Relay_Log_Pos #
+Relay_Master_Log_File
+Slave_IO_Running No
+Slave_SQL_Running No
+Replicate_Do_DB
+Replicate_Ignore_DB
+Replicate_Do_Table
+Replicate_Ignore_Table
+Replicate_Wild_Do_Table
+Replicate_Wild_Ignore_Table
+Last_Errno 0
+Last_Error
+Skip_Counter 0
+Exec_Master_Log_Pos #
+Relay_Log_Space #
+Until_Condition None
+Until_Log_File
+Until_Log_Pos 0
+Master_SSL_Allowed No
+Master_SSL_CA_File
+Master_SSL_CA_Path
+Master_SSL_Cert
+Master_SSL_Cipher
+Master_SSL_Key
+Seconds_Behind_Master #
+Master_SSL_Verify_Server_Cert No
+Last_IO_Errno 0
+Last_IO_Error
+Last_SQL_Errno 0
+Last_SQL_Error
+Restore is now complete.
+backup_id
+#
+SET DEBUG_SYNC = 'now SIGNAL done';
+SET DEBUG_SYNC = 'now WAIT_FOR done';
+SHOW DATABASES;
+Database
+information_schema
+mysql
+rpl_backup
+test
+SET DEBUG_SYNC = 'reset';
+Try to start the slave after restore is done -- should succeed.
+SLAVE START;
+SHOW SLAVE STATUS;
+Slave_IO_State #
+Master_Host 127.0.0.1
+Master_User root
+Master_Port MASTER_PORT
+Connect_Retry 1
+Master_Log_File #
+Read_Master_Log_Pos #
+Relay_Log_File #
+Relay_Log_Pos #
+Relay_Master_Log_File master-bin.000001
+Slave_IO_Running Yes
+Slave_SQL_Running Yes
+Replicate_Do_DB
+Replicate_Ignore_DB
+Replicate_Do_Table
+Replicate_Ignore_Table
+Replicate_Wild_Do_Table
+Replicate_Wild_Ignore_Table
+Last_Errno 0
+Last_Error
+Skip_Counter 0
+Exec_Master_Log_Pos #
+Relay_Log_Space #
+Until_Condition None
+Until_Log_File
+Until_Log_Pos 0
+Master_SSL_Allowed No
+Master_SSL_CA_File
+Master_SSL_CA_Path
+Master_SSL_Cert
+Master_SSL_Cipher
+Master_SSL_Key
+Seconds_Behind_Master #
+Master_SSL_Verify_Server_Cert No
+Last_IO_Errno 0
+Last_IO_Error
+Last_SQL_Errno 0
+Last_SQL_Error
+Now stop the slave.
+SLAVE STOP;
FLUSH BACKUP LOGS;
PURGE BACKUP LOGS;
DROP DATABASE rpl_backup;
=== modified file 'mysql-test/suite/rpl/t/rpl_backup.test'
--- a/mysql-test/suite/rpl/t/rpl_backup.test 2008-11-17 09:57:51 +0000
+++ b/mysql-test/suite/rpl/t/rpl_backup.test 2008-12-16 20:54:07 +0000
@@ -375,6 +375,63 @@ eval SELECT $master_after_pos - $master_
--enable_query_log
#
+# Now test 'slave start' while restore is in progress on slave.
+#
+
+RESET MASTER;
+
+connection slave;
+
+RESET SLAVE;
+
+SET DEBUG_SYNC = 'reset';
+
+connection slave1;
+
+SET DEBUG_SYNC = 'restore_before_end SIGNAL restore_running WAIT_FOR proceed';
+SEND RESTORE FROM 'rpl_bup_s3.bak' OVERWRITE;
+
+connection slave;
+
+SET DEBUG_SYNC = 'now WAIT_FOR restore_running';
+
+--echo Try to start the slave while restore is running -- gets error.
+--error ER_RESTORE_CANNOT_START_SLAVE
+SLAVE START;
+
+SET DEBUG_SYNC = 'now SIGNAL proceed';
+
+--replace_result $MASTER_MYPORT MASTER_PORT
+--replace_column 1 # 6 # 7 # 8 # 9 # 22 # 23 # 33 #
+--query_vertical SHOW SLAVE STATUS
+
+connection slave1;
+--echo Restore is now complete.
+--replace_column 1 #
+reap;
+SET DEBUG_SYNC = 'now SIGNAL done';
+
+connection slave;
+
+SET DEBUG_SYNC = 'now WAIT_FOR done';
+
+SHOW DATABASES;
+
+SET DEBUG_SYNC = 'reset';
+
+--echo Try to start the slave after restore is done -- should succeed.
+SLAVE START;
+--source include/wait_for_slave_to_start.inc
+
+--replace_result $MASTER_MYPORT MASTER_PORT
+--replace_column 1 # 6 # 7 # 8 # 9 # 22 # 23 # 33 #
+--query_vertical SHOW SLAVE STATUS
+
+--echo Now stop the slave.
+SLAVE STOP;
+--source include/wait_for_slave_to_stop.inc
+
+#
# Cleanup
#
connection master;
=== modified file 'sql/backup/kernel.cc'
--- a/sql/backup/kernel.cc 2008-12-10 15:53:06 +0000
+++ b/sql/backup/kernel.cc 2008-12-16 20:54:07 +0000
@@ -208,12 +208,6 @@ execute_backup_command(THD *thd, LEX *le
case SQLCOM_RESTORE:
{
- /*
- Restore cannot be run on a slave while connected to a master.
- */
- if (obs::is_slave())
- DBUG_RETURN(send_error(context, ER_RESTORE_ON_SLAVE));
-
Restore_info *info= context.prepare_for_restore(backupdir, lex->backup_dir,
thd->query);
@@ -701,6 +695,20 @@ Backup_restore_ctx::prepare_for_restore(
{
using namespace backup;
+ /*
+ Block replication from starting.
+ */
+ obs::block_replication(TRUE, "RESTORE");
+
+ /*
+ Restore cannot be run on a slave while connected to a master.
+ */
+ if (obs::is_slave())
+ {
+ fatal_error(report_error(ER_RESTORE_ON_SLAVE));
+ return NULL;
+ }
+
if (m_error)
return NULL;
@@ -934,6 +942,11 @@ int Backup_restore_ctx::close()
obs::disable_slave_connections(FALSE);
/*
+ Allow replication to start after restore is complete.
+ */
+ obs::block_replication(FALSE, "");
+
+ /*
Turn binlog back on iff it was turned off earlier.
*/
if (m_engage_binlog)
=== modified file 'sql/mysql_priv.h'
--- a/sql/mysql_priv.h 2008-12-16 11:51:34 +0000
+++ b/sql/mysql_priv.h 2008-12-16 20:54:07 +0000
@@ -1976,6 +1976,8 @@ extern ulong slow_launch_threads, slow_l
extern ulong table_cache_size, table_def_size;
extern ulong max_connections,max_connect_errors, connect_timeout;
extern my_bool slave_allow_batching;
+extern my_bool allow_slave_start;
+extern LEX_STRING reason_slave_blocked;
extern ulong slave_net_timeout, slave_trans_retries;
extern uint max_user_connections;
extern ulong what_to_log,flush_time;
@@ -2066,7 +2068,7 @@ extern pthread_mutex_t LOCK_mysql_create
LOCK_error_log, LOCK_delayed_insert, LOCK_uuid_short,
LOCK_delayed_status, LOCK_delayed_create, LOCK_crypt, LOCK_timezone,
LOCK_slave_list, LOCK_active_mi, LOCK_manager, LOCK_global_read_lock,
- LOCK_global_system_variables, LOCK_user_conn,
+ LOCK_global_system_variables, LOCK_user_conn, LOCK_slave_start,
LOCK_prepared_stmt_count,
LOCK_connection_count;
#ifdef HAVE_OPENSSL
=== modified file 'sql/mysqld.cc'
--- a/sql/mysqld.cc 2008-12-10 12:57:51 +0000
+++ b/sql/mysqld.cc 2008-12-16 20:54:07 +0000
@@ -540,6 +540,8 @@ ulong query_buff_size, slow_launch_time,
ulong open_files_limit, max_binlog_size, max_relay_log_size;
ulong slave_net_timeout, slave_trans_retries;
my_bool slave_allow_batching;
+my_bool allow_slave_start= TRUE;
+LEX_STRING reason_slave_blocked;
ulong slave_exec_mode_options;
const char *slave_exec_mode_str= "STRICT";
ulong thread_cache_size=0, thread_pool_size= 0;
@@ -690,7 +692,7 @@ pthread_mutex_t LOCK_mysql_create_db, LO
LOCK_crypt,
LOCK_global_system_variables,
LOCK_user_conn, LOCK_slave_list, LOCK_active_mi,
- LOCK_connection_count;
+ LOCK_connection_count, LOCK_slave_start;
/**
The below lock protects access to two global server variables:
@@ -1381,6 +1383,7 @@ void clean_up(bool print_message)
free_max_user_conn();
#ifdef HAVE_REPLICATION
end_slave_list();
+ end_slave_start();
#endif
delete binlog_filter;
delete rpl_filter;
@@ -3886,6 +3889,7 @@ static int init_server_components()
my_uuid_init((ulong) (my_rnd(&sql_rand))*12345,12345);
#ifdef HAVE_REPLICATION
init_slave_list();
+ init_slave_start();
#endif
/* Setup logs */
=== modified file 'sql/share/errmsg.txt'
--- a/sql/share/errmsg.txt 2008-12-15 09:22:24 +0000
+++ b/sql/share/errmsg.txt 2008-12-16 20:54:07 +0000
@@ -6447,4 +6447,5 @@ ER_BACKUP_RESTORE_DBS
eng "Restoring %u database(s) %.220s"
ER_BACKUP_SYNCHRONIZE
eng "Backup failed to synchronize table images."
-
+ER_RESTORE_CANNOT_START_SLAVE
+ eng "Cannot start slave. SLAVE START is blocked by %-.64s."
=== modified file 'sql/si_objects.cc'
--- a/sql/si_objects.cc 2008-12-06 00:02:44 +0000
+++ b/sql/si_objects.cc 2008-12-16 20:54:07 +0000
@@ -3137,6 +3137,28 @@ int disable_slave_connections(bool disab
}
/**
+ Set state where replication is blocked from starting.
+
+ This method tells the server that a process requiring replication to be
+ turned off is in progress.
+ This is used to prohibit slaves from starting.
+
+ @param[in] block TRUE = block slave start, FALSE = do not block
+ @param[in] reason Reason for the block
+*/
+void block_replication(bool block, char *reason)
+{
+  pthread_mutex_lock(&LOCK_slave_start);
+  if (block)
+  {
+    /* Keep the caller's pointer (no copy) for use in the error message. */
+    reason_slave_blocked.str= reason;
+    reason_slave_blocked.length= strlen(reason);
+  }
+  /* The flag is updated on both block and unblock; the reason only on block. */
+  allow_slave_start= !block;
+  pthread_mutex_unlock(&LOCK_slave_start);
+}
+
+/**
Write an incident event in the binary log.
This method can be used to issue an incident event to inform the slave
=== modified file 'sql/si_objects.h'
--- a/sql/si_objects.h 2008-12-04 23:14:30 +0000
+++ b/sql/si_objects.h 2008-12-16 20:54:07 +0000
@@ -509,6 +509,12 @@ int num_slaves_attached();
*/
int disable_slave_connections(bool disable);
+/**
+ Set state where replication is blocked (TRUE) or not blocked (FALSE)
+ from starting. Include reason for feedback to user.
+
+ @param[in] block TRUE = block slave start, FALSE = do not block
+ @param[in] reason Reason for the block; only used when block is TRUE.
+ The pointer itself is stored (the text is not copied).
+*/
+void block_replication(bool block, char *reason);
+
/**
Enumeration of the incidents that can occur on the master.
*/
=== modified file 'sql/sql_repl.cc'
--- a/sql/sql_repl.cc 2008-11-24 20:46:11 +0000
+++ b/sql/sql_repl.cc 2008-12-16 20:54:07 +0000
@@ -992,6 +992,27 @@ err:
DBUG_VOID_RETURN;
}
+/**
+  Set up the slave-start guard.
+
+  Creates LOCK_slave_start and, while holding it, puts the guarded state
+  into its initial form: slave start allowed and an empty reason string.
+*/
+void init_slave_start()
+{
+  pthread_mutex_init(&LOCK_slave_start, MY_MUTEX_INIT_FAST);
+
+  pthread_mutex_lock(&LOCK_slave_start);
+  reason_slave_blocked.str= "";
+  reason_slave_blocked.length= 0;
+  allow_slave_start= TRUE;
+  pthread_mutex_unlock(&LOCK_slave_start);
+}
+
+/**
+ Destroy LOCK_slave_start, the mutex that guards allow_slave_start and
+ reason_slave_blocked.
+
+ Counterpart of init_slave_start().
+*/
+void end_slave_start()
+{
+ pthread_mutex_destroy(&LOCK_slave_start);
+}
+
int start_slave(THD* thd , Master_info* mi, bool net_report)
{
int slave_errno= 0;
@@ -1000,6 +1021,25 @@ int start_slave(THD* thd , Master_info*
if (check_access(thd, SUPER_ACL, any_db,0,0,0,0))
DBUG_RETURN(1);
+
+
+  /*
+    Refuse to start the slave while an operation such as RESTORE has
+    blocked slave start. Both the flag and the reason string are read
+    while holding LOCK_slave_start.
+  */
+  pthread_mutex_lock(&LOCK_slave_start);
+  bool blocked= !allow_slave_start;
+  if (blocked)
+  {
+    slave_errno= ER_RESTORE_CANNOT_START_SLAVE;
+    /*
+      The message takes a %-.64s argument, so pass the character pointer.
+      Passing the LEX_STRING struct itself through varargs is not valid.
+    */
+    if (net_report)
+      my_error(slave_errno, MYF(0), reason_slave_blocked.str);
+  }
+  pthread_mutex_unlock(&LOCK_slave_start);
+  if (blocked)
+    DBUG_RETURN(1);
+
lock_slave_threads(mi); // this allows us to cleanly read slave_running
// Get a mask of _stopped_ threads
init_thread_mask(&thread_mask,mi,1 /* inverse */);
=== modified file 'sql/sql_repl.h'
--- a/sql/sql_repl.h 2008-05-09 10:27:23 +0000
+++ b/sql/sql_repl.h 2008-12-16 20:54:07 +0000
@@ -52,6 +52,8 @@ bool show_binlogs(THD* thd);
extern int init_master_info(Master_info* mi);
void kill_zombie_dump_threads(uint32 slave_server_id);
int check_binlog_magic(IO_CACHE* log, const char** errmsg);
+void init_slave_start();
+void end_slave_start();
typedef struct st_load_file_info
{