#At file:///home/acorreia/workspace.sun/repository.mysql/bzrwork/bug-40337/mysql-6.0-rpl/ based on revid:luis.soares@stripped
2810 Alfranio Correia 2009-02-04
BUG#40337 Fsyncing master and relay log to disk after every event is too slow
The fix proposed in BUG#35542 and BUG#31665 introduces a performance issue
when fsyncing the master.info, relay.info and relay-log.bin* after #th events.
Although such solution has been proposed to reduce the probability of corrupted
files due to a slave-crash, the performance penalty introduced by it has
made the approach impractical for highly intensive workloads.
In a nutshell, the option --syn-relay-log proposed in BUG#35542 and BUG#31665
simultaneously fsyncs master.info, relay-log.info and relay-log.bin* and
this is the main source of performance issues.
This patch introduces new options that give more control to the user on
what should be fsynced and how often:
1) (--sync-master-info, integer) which syncs the master.info after #th event;
2) (--sync-relay-log, integer) which syncs the relay-log.bin* after #th events.
3) (--sync-relay-log-info, integer) which syncs the relay.info after #th
transactions.
To provide both performance and increased reliability, we recommend the following
setup:
1) --sync-master-info = 0 eventually the operating system will fsync it;
2) --sync-relay-log = 0 eventually the operating system will fsync it;
3) --sync-relay-log-info = 1 fsyncs it after every transaction;
Notice, however, that the previous setup does not reduce the probability of corrupted
master.info and relay-log.bin*. To overcome the issue, this patch also introduces
a recovery mechanism that right after restart throws away relay-log.bin* retrieved
from a master and updates the master.info based on the relay.info:
4) (--relay-log-recovery, boolean) which enables a recovery mechanism that
throws away relay-log.bin* after a crash.
So, it is advisable to setup:
4) --syn-relay-log-info = 1.
modified:
mysql-test/suite/rpl/r/rpl_flushlog_loop.result
sql/mysql_priv.h
sql/mysqld.cc
sql/rpl_mi.cc
sql/rpl_mi.h
sql/rpl_rli.cc
sql/rpl_rli.h
sql/slave.cc
sql/sql_repl.cc
per-file messages:
mysql-test/suite/rpl/r/rpl_flushlog_loop.result
Fixed result as "show variables like 'relay_log%'" produces a new record:
relay_log_recovery OFF
sql/mysql_priv.h
Introduces the new options: sync_relayloginfo_period, sync_masterinfo_period and relay_log_recovery.
sql/mysqld.cc
Introduces the new options: sync_relayloginfo_period, sync_masterinfo_period and relay_log_recovery.
sql/rpl_mi.cc
This patch does three distinct things:
1) Initializes the counter that keeps track of the number of events before fsyncing.
2) Removes an unnecessary synchronization point that was causing two fsyncs on the relay-log.bin*.
3) Introduces the fsync for the master.info.
sql/rpl_mi.h
Creates a counter that keeps track of the number of events before fsyncing the master.info.
sql/rpl_rli.cc
This patch does two distinct things:
1) Initializes the counter that keeps track of the number of events before fsyncing.
2) Avoids reading the current relay-log.bin* after restarting because it may be
corrupted.
sql/rpl_rli.h
Creates a counter that keeps track of the number of events before fsyncing the master.info and
a flag that indentifies that the server is recoverying. This is important to avoid reading a
possilbe corrupted relay-log.bin*
sql/slave.cc
This patch does two distinct things:
1) Introduces a recovery mechanism that right after restart throws away relay-log.bin* retrieved
from a master and updates the master.info based on the relay.info
2) Introduces the fsync for the relay-log.info.
=== modified file 'mysql-test/suite/rpl/r/rpl_flushlog_loop.result'
--- a/mysql-test/suite/rpl/r/rpl_flushlog_loop.result 2009-01-08 09:49:02 +0000
+++ b/mysql-test/suite/rpl/r/rpl_flushlog_loop.result 2009-02-04 16:11:20 +0000
@@ -10,6 +10,7 @@ relay_log MYSQLD_DATADIR/relay-log
relay_log_index
relay_log_info_file relay-log.info
relay_log_purge ON
+relay_log_recovery OFF
relay_log_space_limit 0
stop slave;
change master to master_host='127.0.0.1',master_user='root',
=== modified file 'sql/mysql_priv.h'
--- a/sql/mysql_priv.h 2009-01-26 16:03:39 +0000
+++ b/sql/mysql_priv.h 2009-02-04 16:11:20 +0000
@@ -1969,10 +1969,12 @@ extern ulong specialflag;
#ifdef MYSQL_SERVER
extern ulong current_pid;
extern ulong expire_logs_days;
-extern uint sync_binlog_period, sync_relaylog_period;
+extern uint sync_binlog_period, sync_relaylog_period,
+ sync_relayloginfo_period, sync_masterinfo_period;
extern ulong opt_tc_log_size, tc_log_max_pages_used, tc_log_page_size;
extern ulong tc_log_page_waits;
extern my_bool relay_log_purge, opt_innodb_safe_binlog, opt_innodb;
+extern my_bool relay_log_recovery;
extern uint test_flags,select_errors,ha_open_options;
extern uint protocol_version, mysqld_port, dropping_tables;
extern uint delay_key_write_options;
=== modified file 'sql/mysqld.cc'
--- a/sql/mysqld.cc 2009-01-26 16:03:39 +0000
+++ b/sql/mysqld.cc 2009-02-04 16:11:20 +0000
@@ -499,6 +499,7 @@ extern const char *opt_ndb_distribution;
extern enum ndb_distribution opt_ndb_distribution_id;
#endif
my_bool opt_readonly, use_temp_pool, relay_log_purge;
+my_bool relay_log_recovery;
my_bool opt_sync_frm, opt_allow_suspicious_udfs;
my_bool opt_secure_auth= 0;
char* opt_secure_file_priv= 0;
@@ -580,7 +581,8 @@ ulong max_prepared_stmt_count;
ulong prepared_stmt_count=0;
ulong thread_id=1L,current_pid;
ulong slow_launch_threads = 0;
-uint sync_binlog_period= 0, sync_relaylog_period= 0;
+uint sync_binlog_period= 0, sync_relaylog_period= 0,
+ sync_relayloginfo_period= 0, sync_masterinfo_period= 0;
ulong expire_logs_days = 0;
ulong rpl_recovery_rank=0;
const char *log_output_str= "FILE";
@@ -5841,6 +5843,7 @@ enum options_mysqld
OPT_QUERY_CACHE_TYPE, OPT_QUERY_CACHE_WLOCK_INVALIDATE, OPT_RECORD_BUFFER,
OPT_RECORD_RND_BUFFER, OPT_DIV_PRECINCREMENT, OPT_RELAY_LOG_SPACE_LIMIT,
OPT_RELAY_LOG_PURGE,
+ OPT_RELAY_LOG_RECOVERY,
OPT_SLAVE_NET_TIMEOUT, OPT_SLAVE_COMPRESSED_PROTOCOL, OPT_SLOW_LAUNCH_TIME,
OPT_SLAVE_TRANS_RETRIES, OPT_READONLY, OPT_DEBUGGING,
OPT_SORT_BUFFER, OPT_TABLE_OPEN_CACHE, OPT_TABLE_DEF_CACHE,
@@ -5918,6 +5921,8 @@ enum options_mysqld
OPT_GENERAL_LOG_FILE,
OPT_SLOW_QUERY_LOG_FILE,
OPT_SYNC_RELAY_LOG,
+ OPT_SYNC_RELAY_LOG_INFO,
+ OPT_SYNC_MASTER_INFO,
OPT_BACKUP_HISTORY_LOG_FILE,
OPT_BACKUP_PROGRESS_LOG_FILE
};
@@ -7176,6 +7181,13 @@ The minimum value for this variable is 4
(uchar**) &relay_log_purge,
(uchar**) &relay_log_purge, 0, GET_BOOL, NO_ARG,
1, 0, 1, 0, 1, 0},
+ {"relay_log_recovery", OPT_RELAY_LOG_RECOVERY,
+ "Enables automatic relay log recovery right after the database startup, "
+ "which means that the IO Thread starts re-fetching from the master "
+ "right after the last transaction processed.",
+ (uchar**) &relay_log_recovery,
+ (uchar**) &relay_log_recovery, 0, GET_BOOL, NO_ARG,
+ 0, 0, 1, 0, 1, 0},
{"relay_log_space_limit", OPT_RELAY_LOG_SPACE_LIMIT,
"Maximum space to use for all relay logs.",
(uchar**) &relay_log_space_limit,
@@ -7221,6 +7233,16 @@ The minimum value for this variable is 4
"Use 0 (default) to disable synchronous flushing.",
(uchar**) &sync_relaylog_period, (uchar**) &sync_relaylog_period, 0, GET_UINT,
REQUIRED_ARG, 0, 0, (longlong) UINT_MAX, 0, 1, 0},
+ {"sync-relay-log-info", OPT_SYNC_RELAY_LOG_INFO,
+ "Synchronously flush relay log info to disk after #th transaction. "
+ "Use 0 (default) to disable synchronous flushing.",
+ (uchar**) &sync_relayloginfo_period, (uchar**) &sync_relayloginfo_period, 0, GET_UINT,
+ REQUIRED_ARG, 0, 0, (longlong) UINT_MAX, 0, 1, 0},
+ {"sync-master-info", OPT_SYNC_MASTER_INFO,
+ "Synchronously flush master info to disk after every #th event. "
+ "Use 0 (default) to disable synchronous flushing.",
+ (uchar**) &sync_masterinfo_period, (uchar**) &sync_masterinfo_period, 0, GET_UINT,
+ REQUIRED_ARG, 0, 0, (longlong) UINT_MAX, 0, 1, 0},
{"sync-frm", OPT_SYNC_FRM, "Sync .frm to disk on create. Enabled by default.",
(uchar**) &opt_sync_frm, (uchar**) &opt_sync_frm, 0, GET_BOOL, NO_ARG, 1, 0,
0, 0, 0, 0},
=== modified file 'sql/rpl_mi.cc'
--- a/sql/rpl_mi.cc 2008-12-24 10:48:24 +0000
+++ b/sql/rpl_mi.cc 2009-02-04 16:11:20 +0000
@@ -35,7 +35,8 @@ Master_info::Master_info()
ssl(0), ssl_verify_server_cert(0), fd(-1), io_thd(0), port(MYSQL_PORT),
connect_retry(DEFAULT_CONNECT_RETRY), heartbeat_period(0),
received_heartbeats(0), inited(0), master_id(0),
- abort_slave(0), slave_running(0), slave_run_id(0)
+ abort_slave(0), slave_running(0), slave_run_id(0),
+ sync_counter(0)
{
host[0] = 0; user[0] = 0; password[0] = 0;
ssl_ca[0]= 0; ssl_capath[0]= 0; ssl_cert[0]= 0;
@@ -432,11 +433,6 @@ int flush_master_info(Master_info* mi, b
IO_CACHE *log_file= mi->rli.relay_log.get_log_file();
if (flush_io_cache(log_file))
DBUG_RETURN(2);
-
- /* Sync to disk if --sync-relay-log is set */
- if (sync_relaylog_period &&
- my_sync(log_file->file, MY_WME))
- DBUG_RETURN(2);
}
/*
@@ -492,8 +488,12 @@ int flush_master_info(Master_info* mi, b
ignore_server_ids_buf);
my_free(ignore_server_ids_buf, MYF(0));
err= flush_io_cache(file);
- if (sync_relaylog_period && !err)
+ if (sync_masterinfo_period && !err &&
+ ++(mi->sync_counter) >= sync_masterinfo_period)
+ {
err= my_sync(mi->fd, MYF(MY_WME));
+ mi->sync_counter= 0;
+ }
DBUG_RETURN(-err);
}
=== modified file 'sql/rpl_mi.h'
--- a/sql/rpl_mi.h 2008-07-17 19:11:37 +0000
+++ b/sql/rpl_mi.h 2009-02-04 16:11:20 +0000
@@ -106,6 +106,13 @@ class Master_info : public Slave_reporti
*/
long clock_diff_with_master;
+
+ /*
+ * Keeps track of the number of events before fsyncing.
+ * The option --sync-master-info determines how many
+ * events should happen before fsyncing.
+ */
+ uint sync_counter;
};
void init_master_log_pos(Master_info* mi);
=== modified file 'sql/rpl_rli.cc'
--- a/sql/rpl_rli.cc 2008-12-24 10:48:24 +0000
+++ b/sql/rpl_rli.cc 2009-02-04 16:11:20 +0000
@@ -33,7 +33,8 @@ int init_strvar_from_file(char *var, int
Relay_log_info::Relay_log_info()
:Slave_reporting_capability("SQL"),
no_storage(FALSE), replicate_same_server_id(::replicate_same_server_id),
- info_fd(-1), cur_log_fd(-1), relay_log(&sync_relaylog_period),
+ info_fd(-1), cur_log_fd(-1),
+ relay_log(&sync_relaylog_period), sync_counter(0),
save_temporary_tables(0),
cur_log_old_open_count(0), group_relay_log_pos(0), event_relay_log_pos(0),
#if HAVE_purify
@@ -248,7 +249,8 @@ Failed to open the existing relay log in
rli->group_relay_log_pos= rli->event_relay_log_pos= relay_log_pos;
rli->group_master_log_pos= master_log_pos;
- if (init_relay_log_pos(rli,
+ if (!rli->is_relay_log_recovery &&
+ init_relay_log_pos(rli,
rli->group_relay_log_name,
rli->group_relay_log_pos,
0 /* no data lock*/,
@@ -269,7 +271,8 @@ Failed to open the existing relay log in
llstr(my_b_tell(rli->cur_log),llbuf1),
llstr(rli->event_relay_log_pos,llbuf2)));
DBUG_ASSERT(rli->event_relay_log_pos >= BIN_LOG_HEADER_SIZE);
- DBUG_ASSERT(my_b_tell(rli->cur_log) == rli->event_relay_log_pos);
+ DBUG_ASSERT(rli->is_relay_log_recovery ||
+ (my_b_tell(rli->cur_log) == rli->event_relay_log_pos));
}
#endif
=== modified file 'sql/rpl_rli.h'
--- a/sql/rpl_rli.h 2008-08-18 05:43:50 +0000
+++ b/sql/rpl_rli.h 2009-02-04 16:11:20 +0000
@@ -96,6 +96,19 @@ public:
LOG_INFO linfo;
IO_CACHE cache_buf,*cur_log;
+ /*
+ * Keeps track of the number of transactions that commits
+ * before fsyncing. The option --sync-relay-log-info determines
+ * how many transactions should commit before fsyncing.
+ */
+ uint sync_counter;
+
+ /*
+ * Identifies when the recovery process is going on.
+ * See sql/slave.cc:init_recovery for further details.
+ */
+ bool is_relay_log_recovery;
+
/* The following variables are safe to read any time */
/* IO_CACHE of the info file - set only during init or end */
=== modified file 'sql/slave.cc'
--- a/sql/slave.cc 2009-01-26 16:03:39 +0000
+++ b/sql/slave.cc 2009-02-04 16:11:20 +0000
@@ -132,6 +132,7 @@ static bool wait_for_relay_log_space(Rel
static inline bool io_slave_killed(THD* thd,Master_info* mi);
static inline bool sql_slave_killed(THD* thd,Relay_log_info* rli);
static int init_slave_thread(THD* thd, SLAVE_THD_TYPE thd_type);
+static int init_recovery(Master_info* mi);
static int safe_connect(THD* thd, MYSQL* mysql, Master_info* mi);
static int safe_reconnect(THD* thd, MYSQL* mysql, Master_info* mi,
bool suppress_warnings);
@@ -254,6 +255,10 @@ int init_slave()
goto err;
}
+ active_mi->rli.is_relay_log_recovery= relay_log_recovery;
+ if (active_mi->rli.is_relay_log_recovery && init_recovery(active_mi))
+ goto err;
+
/* If server id is not set, start_slave_thread() will say it */
if (active_mi->host[0] && !opt_skip_slave_start)
@@ -269,15 +274,86 @@ int init_slave()
goto err;
}
}
+ active_mi->rli.is_relay_log_recovery= 0;
pthread_mutex_unlock(&LOCK_active_mi);
DBUG_RETURN(0);
err:
+ active_mi->rli.is_relay_log_recovery= 0;
pthread_mutex_unlock(&LOCK_active_mi);
DBUG_RETURN(1);
}
-
+/*
+ * Updates the master info based on the information stored in the
+ * relay info and ignores relay logs previously retrieved by the IO
+ * thread, which thus starts fetching again based on to the
+ * group_master_log_pos and group_master_log_name. Eventually, the old
+ * relay logs will be purged by the normal purge mechanism.
+ *
+ * In the feature, we can improve this routine in order to avoid throwing
+ * away logs that are safely stored in the disk.
+ *
+ * There is no need for a mutex as the caller (i.e. init_slave) already
+ * has one acquired.
+ *
+ * Specifically, the following structures are updated:
+ *
+ * 1 - mi->master_log_pos <-- rli->group_master_log_pos
+ * 2 - mi->master_log_name <-- rli->group_master_log_name
+ * 3 - It moves the relay log to the new relay log file, by
+ * rli->group_relay_log_pos <-- BIN_LOG_HEADER_SIZE;
+ * rli->event_relay_log_pos <-- BIN_LOG_HEADER_SIZE;
+ * rli->group_relay_log_name <-- rli->relay_log.get_log_fname();
+ * rli->event_relay_log_name <-- rli->relay_log.get_log_fname();
+ *
+ * If there is an error, it returns (1), otherwise returns (0).
+ */
+static int init_recovery(Master_info* mi)
+{
+ const char *errmsg= 0;
+ DBUG_ENTER("init_recovery");
+
+ Relay_log_info *rli= &mi->rli;
+ if (rli->group_master_log_name[0])
+ {
+ mi->master_log_pos= max(BIN_LOG_HEADER_SIZE,
+ rli->group_master_log_pos);
+ strmake(mi->master_log_name, rli->group_master_log_name,
+ sizeof(mi->master_log_name)-1);
+
+ sql_print_warning("Recovery from master pos %ld and file %s.",
+ (ulong) mi->master_log_pos, mi->master_log_name);
+
+ strmake(rli->group_relay_log_name, rli->relay_log.get_log_fname(),
+ sizeof(rli->group_relay_log_name)-1);
+ strmake(rli->event_relay_log_name, rli->relay_log.get_log_fname(),
+ sizeof(mi->rli.event_relay_log_name)-1);
+
+ rli->group_relay_log_pos= rli->event_relay_log_pos= BIN_LOG_HEADER_SIZE;
+
+ if (init_relay_log_pos(rli,
+ rli->group_relay_log_name,
+ rli->group_relay_log_pos,
+ 0 /*no data lock*/,
+ &errmsg, 0))
+ DBUG_RETURN(1);
+
+ if (flush_master_info(mi, 0))
+ {
+ sql_print_error("Failed to flush master info file");
+ DBUG_RETURN(1);
+ }
+ if (flush_relay_log_info(rli))
+ {
+ sql_print_error("Failed to flush relay info file");
+ DBUG_RETURN(1);
+ }
+ }
+
+ DBUG_RETURN(0);
+}
+
/**
Convert slave skip errors bitmap into a printable string.
*/
@@ -3826,7 +3902,14 @@ bool flush_relay_log_info(Relay_log_info
error=1;
if (flush_io_cache(file))
error=1;
-
+ if (sync_relayloginfo_period &&
+ !error &&
+ ++(rli->sync_counter) >= sync_relayloginfo_period)
+ {
+ if (my_sync(rli->info_fd, MYF(MY_WME)))
+ error=1;
+ rli->sync_counter= 0;
+ }
/* Flushing the relay log is done by the slave I/O thread */
DBUG_RETURN(error);
}
=== modified file 'sql/sql_repl.cc'
--- a/sql/sql_repl.cc 2009-01-26 16:32:29 +0000
+++ b/sql/sql_repl.cc 2009-02-04 16:11:20 +0000
@@ -2036,6 +2036,16 @@ static sys_var_const sys_relay_log_in
(uchar*) &relay_log_info_file);
static sys_var_bool_ptr sys_relay_log_purge(&vars, "relay_log_purge",
&relay_log_purge);
+static sys_var_bool_ptr sys_relay_log_recovery(&vars, "relay_log_recovery",
+ &relay_log_recovery);
+static sys_var_int_ptr sys_sync_binlog_period(&vars, "sync_binlog",
+ &sync_binlog_period);
+static sys_var_int_ptr sys_sync_relaylog_period(&vars, "sync_relay_log",
+ &sync_relaylog_period);
+static sys_var_int_ptr sys_sync_relayloginfo_period(&vars, "sync_relay_log_info",
+ &sync_relayloginfo_period);
+static sys_var_int_ptr sys_sync_masterinfo_period(&vars, "sync_master_info",
+ &sync_masterinfo_period);
static sys_var_const sys_relay_log_space_limit(&vars,
"relay_log_space_limit",
OPT_GLOBAL, SHOW_LONGLONG,
@@ -2052,8 +2062,6 @@ static sys_var_const sys_slave_skip_e
(uchar*) slave_skip_error_names);
static sys_var_long_ptr sys_slave_trans_retries(&vars, "slave_transaction_retries",
&slave_trans_retries);
-static sys_var_int_ptr sys_sync_binlog_period(&vars, "sync_binlog", &sync_binlog_period);
-static sys_var_int_ptr sys_sync_relaylog_period(&vars, "sync_relay_log", &sync_relaylog_period);
static sys_var_slave_skip_counter sys_slave_skip_counter(&vars, "sql_slave_skip_counter");