#At file:///home/andrei/MySQL/BZR/FIXES/5.1-bt-bug33815-dupl_s_id-contrib/ based on revid:joro@sun.com-20090821144148-lo8rbehf04x094bz
3078 Andrei Elkin 2009-08-26
Bug #33815 Improve reporting with duplicate server-ids
The report designate actually a serious issue:
if two slaves happen to have the same server id then:
1. The first one's dump thread session will be killed
2. Neither of them in the following could establish connection ever until
manual action.
Fixed with
deploying an unique signature to the slave-to-master session that complements
server_id.
The signature is caclulated on the slave at its replication module start-up.
It's passed to the master at `start slave'.
In the following master verifies the signature and kills only those zombi dump threads
that have an existing signature more additionally to equality of the server_id.
If server_id:s are the same but not the signature then a new connection attempt is rejected
and the existing dump thread remains to operate undistirbingly.
modified:
sql/rpl_mi.cc
sql/rpl_mi.h
sql/share/errmsg.txt
sql/slave.cc
sql/sql_parse.cc
sql/sql_repl.cc
sql/sql_repl.h
per-file messages:
sql/rpl_mi.cc
initializing the slave session unique identifier along with active_mi.
sql/rpl_mi.h
adding a new member to Master_info class to hold slave-to-master connections
unique identifier.
sql/share/errmsg.txt
A new error constant is added;
sql/slave.cc
refining logics of request_dump() to never tolerate
the duplicate slave server id error;
augmenting connect_to_master() to ship
the slave's io_thread session id to the master.
sql/sql_parse.cc
Treating a new error situation out of kill_zombie_dump_threads() so that slave
will receive a new duplicate slave server_id and stop attempts to reconnect.
sql/sql_repl.cc
Slave session id access function on the master is added;
kill_zombie_dump_threads() is refactored to pay attention to the new slave session id.
sql/sql_repl.h
signature of kill_zombie_dump_threads() is modified.
=== modified file 'sql/rpl_mi.cc'
--- a/sql/rpl_mi.cc 2007-12-14 13:21:37 +0000
+++ b/sql/rpl_mi.cc 2009-08-26 08:41:47 +0000
@@ -308,6 +308,8 @@ file '%s')", fname);
mi->rli.mi = mi;
if (init_relay_log_info(&mi->rli, slave_info_fname))
goto err;
+ srand(time(0));
+ mi->io_thread_session_id= rand();
mi->inited = 1;
// now change cache READ -> WRITE - must do this before flush_master_info
=== modified file 'sql/rpl_mi.h'
--- a/sql/rpl_mi.h 2007-08-16 06:52:50 +0000
+++ b/sql/rpl_mi.h 2009-08-26 08:41:47 +0000
@@ -100,6 +100,14 @@ class Master_info : public Slave_reporti
*/
long clock_diff_with_master;
+ /*
+ The integer represents all slave-to-master connections unique identifier.
+ It is passed to the master at slave connecting time.
+ The value allows the master to distinquish sessions from different
+ servers particularly those that by configuration mistake supply
+ the same server id.
+ */
+ ulong io_thread_session_id;
};
void init_master_info_with_options(Master_info* mi);
=== modified file 'sql/share/errmsg.txt'
--- a/sql/share/errmsg.txt 2009-07-31 17:14:52 +0000
+++ b/sql/share/errmsg.txt 2009-08-26 08:41:47 +0000
@@ -6203,6 +6203,8 @@ ER_RENAMED_NAME
swe "Namn�rad"
ER_TOO_MANY_CONCURRENT_TRXS
eng "Too many active concurrent transactions"
+ER_DUP_SLAVE_CONNECTION
+ eng "Duplicate slave connection with server_id %d is rejected by the existing connection '%s'@'%s'"
WARN_NON_ASCII_SEPARATOR_NOT_IMPLEMENTED
eng "Non-ASCII separator arguments are not fully supported"
=== modified file 'sql/slave.cc'
--- a/sql/slave.cc 2009-08-13 20:45:01 +0000
+++ b/sql/slave.cc 2009-08-26 08:41:47 +0000
@@ -1885,22 +1885,33 @@ static int request_dump(MYSQL* mysql, Ma
int4store(buf + 6, server_id);
len = (uint) strlen(logname);
memcpy(buf + 10, logname,len);
- if (simple_command(mysql, COM_BINLOG_DUMP, buf, len + 10, 1))
+ if (simple_command(mysql, COM_BINLOG_DUMP, buf, len + 10, 0))
{
+ uint err= mysql_errno(mysql);
/*
- Something went wrong, so we will just reconnect and retry later
- in the future, we should do a better error analysis, but for
- now we just fill up the error log :-)
+ Duplicate server id can't be tolerated.
*/
- if (mysql_errno(mysql) == ER_NET_READ_INTERRUPTED)
- *suppress_warnings= TRUE; // Suppress reconnect warning
+ if (err == ER_DUP_SLAVE_CONNECTION)
+ {
+ sql_print_error("Error %d on COM_BINLOG_DUMP: %s",
+ err, mysql_error(mysql));
+ DBUG_RETURN(-1);
+ }
else
- sql_print_error("Error on COM_BINLOG_DUMP: %d %s, will retry in %d secs",
- mysql_errno(mysql), mysql_error(mysql),
- master_connect_retry);
- DBUG_RETURN(1);
+ {
+ /*
+ Otherwise try reconnecting
+ */
+ if (err == ER_NET_READ_INTERRUPTED)
+ *suppress_warnings= TRUE; // Suppress reconnect warning
+ else
+ sql_print_error("Error %d on COM_BINLOG_DUMP: %s, "
+ "will retry in %d secs",
+ err, mysql_error(mysql),
+ master_connect_retry);
+ DBUG_RETURN(1);
+ }
}
-
DBUG_RETURN(0);
}
@@ -2623,12 +2634,15 @@ connected:
DBUG_PRINT("info",("Starting reading binary log from master"));
while (!io_slave_killed(thd,mi))
{
+ int dump_err;
thd_proc_info(thd, "Requesting binlog dump");
- if (request_dump(mysql, mi, &suppress_warnings))
+ if ((dump_err= request_dump(mysql, mi, &suppress_warnings)))
{
sql_print_error("Failed on request_dump()");
- if (check_io_slave_killed(thd, mi, "Slave I/O thread killed while \
-requesting master dump") ||
+ if (dump_err == -1 ||
+ check_io_slave_killed(thd, mi,
+ "Slave I/O thread killed while "
+ "requesting master dump") ||
try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings,
reconnect_messages[SLAVE_RECON_ACT_DUMP]))
goto err;
@@ -3801,6 +3815,8 @@ static int connect_to_master(THD* thd, M
int last_errno= -2; // impossible error
ulong err_count=0;
char llbuff[22];
+ const char init_cmd_fmt[]= "set @slave_io_thread_session_id= %u";
+ char init_cmd[sizeof init_cmd_fmt + 12];
DBUG_ENTER("connect_to_master");
#ifndef DBUG_OFF
@@ -3812,6 +3828,8 @@ static int connect_to_master(THD* thd, M
mysql_options(mysql, MYSQL_OPT_CONNECT_TIMEOUT, (char *) &slave_net_timeout);
mysql_options(mysql, MYSQL_OPT_READ_TIMEOUT, (char *) &slave_net_timeout);
+ sprintf(init_cmd, init_cmd_fmt, mi->io_thread_session_id);
+ mysql_options(mysql, MYSQL_INIT_COMMAND, init_cmd);
#ifdef HAVE_OPENSSL
if (mi->ssl)
=== modified file 'sql/sql_parse.cc'
--- a/sql/sql_parse.cc 2009-08-21 14:41:48 +0000
+++ b/sql/sql_parse.cc 2009-08-26 08:41:47 +0000
@@ -1393,11 +1393,15 @@ bool dispatch_command(enum enum_server_c
break;
/* TODO: The following has to be changed to an 8 byte integer */
- pos = uint4korr(packet);
- flags = uint2korr(packet + 4);
- thd->server_id=0; /* avoid suicide */
+ pos= uint4korr(packet);
+ flags= uint2korr(packet + 4);
+ thd->server_id =0; /* avoid suicide */
if ((slave_server_id= uint4korr(packet+6))) // mysqlbinlog.server_id==0
- kill_zombie_dump_threads(slave_server_id);
+ if (kill_zombie_dump_threads(thd, slave_server_id))
+ {
+ error= TRUE;
+ break;
+ }
thd->server_id = slave_server_id;
general_log_print(thd, command, "Log: '%s' Pos: %ld", packet+10,
=== modified file 'sql/sql_repl.cc'
--- a/sql/sql_repl.cc 2009-07-24 16:04:55 +0000
+++ b/sql/sql_repl.cc 2009-08-26 08:41:47 +0000
@@ -1071,18 +1071,35 @@ err:
DBUG_RETURN(error);
}
+/**
+ An auxiliary function extracts slave session unique identifier.
+
+ @param[in] thd THD to access a user variable
+
+ @return slave session identifier or zero
+*/
+static ulong get_slave_session_id(THD * thd)
+{
+ my_bool null_value;
+ LEX_STRING name= { C_STRING_WITH_LEN("slave_io_thread_session_id")};
+ user_var_entry *entry=
+ (user_var_entry*) my_hash_search(&thd->user_vars, (uchar*) name.str,
+ name.length);
+ return entry? (ulong) entry->val_int(&null_value) : 0;
+}
+
/*
Kill all Binlog_dump threads which previously talked to the same slave
- ("same" means with the same server id). Indeed, if the slave stops, if the
- Binlog_dump thread is waiting (pthread_cond_wait) for binlog update, then it
- will keep existing until a query is written to the binlog. If the master is
- idle, then this could last long, and if the slave reconnects, we could have 2
- Binlog_dump threads in SHOW PROCESSLIST, until a query is written to the
+ i.e one having the same slave session id and the same server id.
+ If the slave stops the Binlog_dump thread does not immediately exit. It
+ would wait (pthread_cond_wait) for binlog update that can take long.
+ If the slave reconnects meanwhile, there will be 2 Binlog_dump threads
+ in SHOW PROCESSLIST all time until a query is written to the
binlog. To avoid this, when the slave reconnects and sends COM_BINLOG_DUMP,
- the master kills any existing thread with the slave's server id (if this id is
- not zero; it will be true for real slaves, but false for mysqlbinlog when it
- sends COM_BINLOG_DUMP to get a remote binlog dump).
+ the master kills any existing thread with the same slave session signature
+ and the slave's server id.
+ Notice, this id is not zero to designate a slave server not mysqlbinlog.
SYNOPSIS
kill_zombie_dump_threads()
@@ -1091,23 +1108,51 @@ err:
*/
-void kill_zombie_dump_threads(uint32 slave_server_id)
+int kill_zombie_dump_threads(THD *thd, uint32 slave_server_id)
{
- pthread_mutex_lock(&LOCK_thread_count);
+ int err= 0;
+ bool matched= false;
I_List_iterator<THD> it(threads);
THD *tmp;
+ DBUG_ASSERT(slave_server_id != 0);
+
+ pthread_mutex_lock(&LOCK_thread_count);
while ((tmp=it++))
{
if (tmp->command == COM_BINLOG_DUMP &&
tmp->server_id == slave_server_id)
{
- pthread_mutex_lock(&tmp->LOCK_thd_data); // Lock from delete
+ uint curr_id= get_slave_session_id(thd);
+ uint prev_id= get_slave_session_id(tmp);
+ if (curr_id == prev_id)
+ {
+ matched= true;
+ pthread_mutex_lock(&tmp->LOCK_thd_data); // Lock from delete
+ }
+ else
+ {
+ err= 1;
+ my_error(ER_DUP_SLAVE_CONNECTION, MYF(0), slave_server_id,
+ thd->main_security_ctx.user, thd->main_security_ctx.host_or_ip);
+ sql_print_information("A duplicate to current slave connection "
+ "with server_id %d from '%s'@'%s' "
+ "(signature %u) is rejected; the existing "
+ "connection '%s'@'%s' (signature %u) remains.",
+ slave_server_id,
+ thd->main_security_ctx.user,
+ thd->main_security_ctx.host_or_ip,
+ curr_id,
+ tmp->main_security_ctx.user,
+ tmp->main_security_ctx.host_or_ip,
+ prev_id);
+ }
break;
}
}
pthread_mutex_unlock(&LOCK_thread_count);
- if (tmp)
+
+ if (matched)
{
/*
Here we do not call kill_one_thread() as
@@ -1117,6 +1162,7 @@ void kill_zombie_dump_threads(uint32 sla
tmp->awake(THD::KILL_QUERY);
pthread_mutex_unlock(&tmp->LOCK_thd_data);
}
+ return err;
}
=== modified file 'sql/sql_repl.h'
--- a/sql/sql_repl.h 2008-03-14 17:38:54 +0000
+++ b/sql/sql_repl.h 2009-08-26 08:41:47 +0000
@@ -50,7 +50,7 @@ bool log_in_use(const char* log_name);
void adjust_linfo_offsets(my_off_t purge_offset);
bool show_binlogs(THD* thd);
extern int init_master_info(Master_info* mi);
-void kill_zombie_dump_threads(uint32 slave_server_id);
+int kill_zombie_dump_threads(THD *thd, uint32 slave_server_id);
int check_binlog_magic(IO_CACHE* log, const char** errmsg);
typedef struct st_load_file_info
| Thread |
|---|
| • bzr commit into mysql-5.1 branch (aelkin:3078) Bug#33815 | Andrei Elkin | 26 Aug |