List:Commits« Previous MessageNext Message »
From:Andrei Elkin Date:August 26 2009 8:41am
Subject:bzr commit into mysql-5.1 branch (aelkin:3078) Bug#33815
View as plain text  
#At file:///home/andrei/MySQL/BZR/FIXES/5.1-bt-bug33815-dupl_s_id-contrib/ based on revid:joro@sun.com-20090821144148-lo8rbehf04x094bz

 3078 Andrei Elkin	2009-08-26
      Bug #33815 Improve reporting with duplicate server-ids
      
      The report designate actually a serious issue:
      if two slaves happen to have the same server id then:
      
      1. The first one's dump thread session will be killed
      2. Neither of them in the following could establish connection ever until
      manual action.
      
      Fixed with
      deploying an unique signature to the slave-to-master session that complements
      server_id.
      The signature is caclulated on the slave at its replication module start-up.
      It's passed to the master at `start slave'.
      In the following master verifies the signature and kills only those zombi dump threads
      that have an existing signature more additionally to equality of the server_id.
      If server_id:s are the same but not the signature then a new connection attempt is rejected
      and the existing dump thread remains to operate undistirbingly.
      modified:
        sql/rpl_mi.cc
        sql/rpl_mi.h
        sql/share/errmsg.txt
        sql/slave.cc
        sql/sql_parse.cc
        sql/sql_repl.cc
        sql/sql_repl.h

per-file messages:
  sql/rpl_mi.cc
    initializing the slave session unique identifier along with active_mi.
  sql/rpl_mi.h
    adding a new member to Master_info class to hold slave-to-master connections
    unique identifier.
  sql/share/errmsg.txt
    A new error constant is added;
  sql/slave.cc
    refining logics of request_dump() to never tolerate
    the duplicate slave server id error;
    augmenting connect_to_master() to ship
    the slave's io_thread session id to the master.
  sql/sql_parse.cc
    Treating a new error situation out of kill_zombie_dump_threads() so that slave
    will receive a new duplicate slave server_id and stop attempts to reconnect.
  sql/sql_repl.cc
    Slave session id access function on the master is added;
    kill_zombie_dump_threads() is refactored to pay attention to the new slave session id.
  sql/sql_repl.h
    signature of kill_zombie_dump_threads() is modified.
=== modified file 'sql/rpl_mi.cc'
--- a/sql/rpl_mi.cc	2007-12-14 13:21:37 +0000
+++ b/sql/rpl_mi.cc	2009-08-26 08:41:47 +0000
@@ -308,6 +308,8 @@ file '%s')", fname);
   mi->rli.mi = mi;
   if (init_relay_log_info(&mi->rli, slave_info_fname))
     goto err;
+  srand(time(0));
+  mi->io_thread_session_id= rand();
 
   mi->inited = 1;
   // now change cache READ -> WRITE - must do this before flush_master_info

=== modified file 'sql/rpl_mi.h'
--- a/sql/rpl_mi.h	2007-08-16 06:52:50 +0000
+++ b/sql/rpl_mi.h	2009-08-26 08:41:47 +0000
@@ -100,6 +100,14 @@ class Master_info : public Slave_reporti
 
   */
   long clock_diff_with_master;
+  /*
+    The integer represents all slave-to-master connections unique identifier.
+    It is passed to the master at slave connecting time.
+    The value allows the master to distinquish sessions from different
+    servers particularly those that by configuration mistake supply 
+    the same server id.
+  */
+  ulong io_thread_session_id;
 };
 
 void init_master_info_with_options(Master_info* mi);

=== modified file 'sql/share/errmsg.txt'
--- a/sql/share/errmsg.txt	2009-07-31 17:14:52 +0000
+++ b/sql/share/errmsg.txt	2009-08-26 08:41:47 +0000
@@ -6203,6 +6203,8 @@ ER_RENAMED_NAME
   swe "Namn�rad"
 ER_TOO_MANY_CONCURRENT_TRXS
         eng  "Too many active concurrent transactions"
+ER_DUP_SLAVE_CONNECTION
+  eng "Duplicate slave connection with server_id %d is rejected by the existing connection '%s'@'%s'"
 
 WARN_NON_ASCII_SEPARATOR_NOT_IMPLEMENTED
   eng "Non-ASCII separator arguments are not fully supported"

=== modified file 'sql/slave.cc'
--- a/sql/slave.cc	2009-08-13 20:45:01 +0000
+++ b/sql/slave.cc	2009-08-26 08:41:47 +0000
@@ -1885,22 +1885,33 @@ static int request_dump(MYSQL* mysql, Ma
   int4store(buf + 6, server_id);
   len = (uint) strlen(logname);
   memcpy(buf + 10, logname,len);
-  if (simple_command(mysql, COM_BINLOG_DUMP, buf, len + 10, 1))
+  if (simple_command(mysql, COM_BINLOG_DUMP, buf, len + 10, 0))
   {
+    uint err= mysql_errno(mysql);
     /*
-      Something went wrong, so we will just reconnect and retry later
-      in the future, we should do a better error analysis, but for
-      now we just fill up the error log :-)
+      Duplicate server id can't be tolerated.
     */
-    if (mysql_errno(mysql) == ER_NET_READ_INTERRUPTED)
-      *suppress_warnings= TRUE;                 // Suppress reconnect warning
+    if (err == ER_DUP_SLAVE_CONNECTION)
+    {
+      sql_print_error("Error %d on COM_BINLOG_DUMP:  %s",
+                      err, mysql_error(mysql));
+      DBUG_RETURN(-1);
+    }
     else
-      sql_print_error("Error on COM_BINLOG_DUMP: %d  %s, will retry in %d secs",
-                      mysql_errno(mysql), mysql_error(mysql),
-                      master_connect_retry);
-    DBUG_RETURN(1);
+    {
+      /*
+        Otherwise try reconnecting
+      */
+      if (err == ER_NET_READ_INTERRUPTED)
+        *suppress_warnings= TRUE;                 // Suppress reconnect warning
+      else
+        sql_print_error("Error %d on COM_BINLOG_DUMP: %s, "
+                        "will retry in %d secs",
+                        err, mysql_error(mysql),
+                        master_connect_retry);
+      DBUG_RETURN(1);
+    }
   }
-
   DBUG_RETURN(0);
 }
 
@@ -2623,12 +2634,15 @@ connected:
   DBUG_PRINT("info",("Starting reading binary log from master"));
   while (!io_slave_killed(thd,mi))
   {
+    int dump_err;
     thd_proc_info(thd, "Requesting binlog dump");
-    if (request_dump(mysql, mi, &suppress_warnings))
+    if ((dump_err= request_dump(mysql, mi, &suppress_warnings)))
     {
       sql_print_error("Failed on request_dump()");
-      if (check_io_slave_killed(thd, mi, "Slave I/O thread killed while \
-requesting master dump") ||
+      if (dump_err == -1 ||
+          check_io_slave_killed(thd, mi,
+                                "Slave I/O thread killed while "
+                                "requesting master dump") ||
           try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings,
                            reconnect_messages[SLAVE_RECON_ACT_DUMP]))
         goto err;
@@ -3801,6 +3815,8 @@ static int connect_to_master(THD* thd, M
   int last_errno= -2;                           // impossible error
   ulong err_count=0;
   char llbuff[22];
+  const char init_cmd_fmt[]= "set @slave_io_thread_session_id= %u";
+  char init_cmd[sizeof init_cmd_fmt + 12];
   DBUG_ENTER("connect_to_master");
 
 #ifndef DBUG_OFF
@@ -3812,6 +3828,8 @@ static int connect_to_master(THD* thd, M
 
   mysql_options(mysql, MYSQL_OPT_CONNECT_TIMEOUT, (char *) &slave_net_timeout);
   mysql_options(mysql, MYSQL_OPT_READ_TIMEOUT, (char *) &slave_net_timeout);
+  sprintf(init_cmd, init_cmd_fmt, mi->io_thread_session_id);
+  mysql_options(mysql, MYSQL_INIT_COMMAND, init_cmd);
 
 #ifdef HAVE_OPENSSL
   if (mi->ssl)

=== modified file 'sql/sql_parse.cc'
--- a/sql/sql_parse.cc	2009-08-21 14:41:48 +0000
+++ b/sql/sql_parse.cc	2009-08-26 08:41:47 +0000
@@ -1393,11 +1393,15 @@ bool dispatch_command(enum enum_server_c
 	break;
 
       /* TODO: The following has to be changed to an 8 byte integer */
-      pos = uint4korr(packet);
-      flags = uint2korr(packet + 4);
-      thd->server_id=0; /* avoid suicide */
+      pos= uint4korr(packet);
+      flags= uint2korr(packet + 4);
+      thd->server_id =0; /* avoid suicide */
       if ((slave_server_id= uint4korr(packet+6))) // mysqlbinlog.server_id==0
-	kill_zombie_dump_threads(slave_server_id);
+	if (kill_zombie_dump_threads(thd, slave_server_id))
+        {
+          error= TRUE;
+          break;
+        }
       thd->server_id = slave_server_id;
 
       general_log_print(thd, command, "Log: '%s'  Pos: %ld", packet+10,

=== modified file 'sql/sql_repl.cc'
--- a/sql/sql_repl.cc	2009-07-24 16:04:55 +0000
+++ b/sql/sql_repl.cc	2009-08-26 08:41:47 +0000
@@ -1071,18 +1071,35 @@ err:
   DBUG_RETURN(error);
 }
 
+/**
+  An auxiliary function extracts slave session unique identifier.
+
+  @param[in]    thd  THD to access a user variable
+
+  @return       slave session identifier or zero
+*/ 
+static ulong get_slave_session_id(THD * thd)
+{
+  my_bool null_value;
+  LEX_STRING name=  { C_STRING_WITH_LEN("slave_io_thread_session_id")};
+  user_var_entry *entry= 
+    (user_var_entry*) my_hash_search(&thd->user_vars, (uchar*) name.str,
+                                     name.length);
+  return entry? (ulong) entry->val_int(&null_value) : 0;
+}
+
 /*
 
   Kill all Binlog_dump threads which previously talked to the same slave
-  ("same" means with the same server id). Indeed, if the slave stops, if the
-  Binlog_dump thread is waiting (pthread_cond_wait) for binlog update, then it
-  will keep existing until a query is written to the binlog. If the master is
-  idle, then this could last long, and if the slave reconnects, we could have 2
-  Binlog_dump threads in SHOW PROCESSLIST, until a query is written to the
+  i.e one having the same slave session id and the same server id. 
+  If the slave stops the Binlog_dump thread does not immediately exit. It
+  would wait (pthread_cond_wait) for binlog update that can take long.
+  If the slave reconnects meanwhile, there will be 2 Binlog_dump threads
+  in SHOW PROCESSLIST all time until a query is written to the
   binlog. To avoid this, when the slave reconnects and sends COM_BINLOG_DUMP,
-  the master kills any existing thread with the slave's server id (if this id is
-  not zero; it will be true for real slaves, but false for mysqlbinlog when it
-  sends COM_BINLOG_DUMP to get a remote binlog dump).
+  the master kills any existing thread with the same slave session  signature
+  and the slave's server id.
+  Notice, this id is not zero to designate a slave server not mysqlbinlog.
 
   SYNOPSIS
     kill_zombie_dump_threads()
@@ -1091,23 +1108,51 @@ err:
 */
 
 
-void kill_zombie_dump_threads(uint32 slave_server_id)
+int kill_zombie_dump_threads(THD *thd, uint32 slave_server_id)
 {
-  pthread_mutex_lock(&LOCK_thread_count);
+  int err= 0;
+  bool matched= false;
   I_List_iterator<THD> it(threads);
   THD *tmp;
 
+  DBUG_ASSERT(slave_server_id != 0);
+
+  pthread_mutex_lock(&LOCK_thread_count);
   while ((tmp=it++))
   {
     if (tmp->command == COM_BINLOG_DUMP &&
        tmp->server_id == slave_server_id)
     {
-      pthread_mutex_lock(&tmp->LOCK_thd_data);	// Lock from delete
+      uint curr_id= get_slave_session_id(thd);
+      uint prev_id= get_slave_session_id(tmp);
+      if (curr_id == prev_id)
+      {
+        matched= true;
+        pthread_mutex_lock(&tmp->LOCK_thd_data);	// Lock from delete
+      }
+      else
+      {
+        err= 1;
+        my_error(ER_DUP_SLAVE_CONNECTION, MYF(0), slave_server_id,
+                 thd->main_security_ctx.user, thd->main_security_ctx.host_or_ip);
+        sql_print_information("A duplicate to current slave connection "
+                              "with server_id %d from '%s'@'%s' "
+                              "(signature %u) is rejected; the existing "
+                              "connection '%s'@'%s' (signature %u) remains.",
+                              slave_server_id,
+                              thd->main_security_ctx.user,
+                              thd->main_security_ctx.host_or_ip,
+                              curr_id,
+                              tmp->main_security_ctx.user,
+                              tmp->main_security_ctx.host_or_ip,
+                              prev_id);
+      }
       break;
     }
   }
   pthread_mutex_unlock(&LOCK_thread_count);
-  if (tmp)
+
+  if (matched)
   {
     /*
       Here we do not call kill_one_thread() as
@@ -1117,6 +1162,7 @@ void kill_zombie_dump_threads(uint32 sla
     tmp->awake(THD::KILL_QUERY);
     pthread_mutex_unlock(&tmp->LOCK_thd_data);
   }
+  return err;
 }
 
 

=== modified file 'sql/sql_repl.h'
--- a/sql/sql_repl.h	2008-03-14 17:38:54 +0000
+++ b/sql/sql_repl.h	2009-08-26 08:41:47 +0000
@@ -50,7 +50,7 @@ bool log_in_use(const char* log_name);
 void adjust_linfo_offsets(my_off_t purge_offset);
 bool show_binlogs(THD* thd);
 extern int init_master_info(Master_info* mi);
-void kill_zombie_dump_threads(uint32 slave_server_id);
+int kill_zombie_dump_threads(THD *thd, uint32 slave_server_id);
 int check_binlog_magic(IO_CACHE* log, const char** errmsg);
 
 typedef struct st_load_file_info

Thread
bzr commit into mysql-5.1 branch (aelkin:3078) Bug#33815Andrei Elkin26 Aug