List:Commits« Previous MessageNext Message »
From:Andrei Elkin Date:June 15 2009 2:00pm
Subject:bzr commit into mysql-6.0-rpl branch (aelkin:2864) Bug#37714
View as plain text  
#At file:///home/andrei/MySQL/BZR/mysql-6.0-rpl/ based on revid:zhenxing.he@stripped

 2864 Andrei Elkin	2009-06-15
      Bug #37714  rpl.rpl_heartbeat fails sporadically in pushbuild due to timeout 
      
      The reason of the bug is a feature of pthread_cond_timedwait() having a time window
      in between of the timer elapsed that wakes up the thread and
      the thread re-acquired the mutex. There could be signals sent to the dump thread at
      times of the interval so that the dump thread was not aware of updating of the
      binlog and continued to stay in the loop.
            
      Fixed by augmenting MYSQL_BIN_LOG class with a counter that is checked before and
      after the wake-up to catch the fact of the binlog got updated.
modified:
  mysql-test/suite/rpl/r/rpl_heartbeat.result
  mysql-test/suite/rpl/t/rpl_heartbeat.test
  sql/log.cc
  sql/log.h
  sql/sql_repl.cc

per-file messages:
  mysql-test/suite/rpl/r/rpl_heartbeat.result
    Bug #37714 fixes updated results.
  mysql-test/suite/rpl/t/rpl_heartbeat.test
    Removing a debug printout.
    Restroring the pre-debug push aelkin@stripped
    values (requested as todo in that push).
  sql/log.cc
    MYSQL_BIN_LOG class is augmented with `signal_cnt' counter to increment
    per an event write to the binlog file.
  sql/log.h
    MYSQL_BIN_LOG class is augmented with `signal_cnt' counter.
  sql/sql_repl.cc
    In the inner-most while-loop the dump thread checks if mysql_bin_log.signal_cnt
    had changed while it it was asleep. A change designates a new event has been recorded
    and needs sending out, and that breaks the loop.
=== modified file 'mysql-test/suite/rpl/r/rpl_heartbeat.result'
--- a/mysql-test/suite/rpl/r/rpl_heartbeat.result	2009-04-03 16:24:50 +0000
+++ b/mysql-test/suite/rpl/r/rpl_heartbeat.result	2009-06-15 14:00:15 +0000
@@ -45,7 +45,7 @@ Warnings:
 Warning	1624	The currect value for master_heartbeat_period exceeds the new value of `slave_net_timeout' sec. A sensible value for the period should be less than the timeout.
 reset slave;
 drop table if exists t1;
-set @@global.slave_net_timeout= 20;
+set @@global.slave_net_timeout= 10;
 change master to master_host='127.0.0.1',master_port=MASTER_PORT, master_user='root', master_heartbeat_period= 0.5;
 show status like 'Slave_heartbeat_period';;
 Variable_name	Slave_heartbeat_period
@@ -139,7 +139,5 @@ Variable_name	Slave_heartbeat_period
 Value	0.500
 A heartbeat has been received by the slave
 drop table t1;
-*** debug: 356 ***
-#connection slave
 set @@global.slave_net_timeout=@restore_slave_net_timeout;
 End of tests

=== modified file 'mysql-test/suite/rpl/t/rpl_heartbeat.test'
--- a/mysql-test/suite/rpl/t/rpl_heartbeat.test	2009-04-03 16:24:50 +0000
+++ b/mysql-test/suite/rpl/t/rpl_heartbeat.test	2009-06-15 14:00:15 +0000
@@ -113,7 +113,7 @@ drop table if exists t1;
 #
 
 connection slave;
-set @@global.slave_net_timeout= 20;
+set @@global.slave_net_timeout= 10;
 --replace_result $MASTER_MYPORT MASTER_PORT
 # no error this time but rather a warning
 eval change master to master_host='127.0.0.1',master_port=$MASTER_MYPORT, master_user='root', master_heartbeat_period= 0.5;
@@ -133,7 +133,7 @@ source include/show_slave_status.inc;
 # That would cause reconnecting and relaylog rotation w/o the fix i.e
 # without a heartbeat received.
 
-real_sleep 30;
+real_sleep 15;
 
 # check (compare with the previous show's results) that no rotation happened
 source include/show_slave_status.inc;
@@ -173,10 +173,6 @@ while (`select $slave_value = 0`)
 connection master;
 drop table t1;
 
-# debug print out, todo: remove after bug#37714 is fixed
-let $last_pos = query_get_value("SHOW MASTER STATUS", Position, 1);
-echo *** debug: $last_pos ***
-
 #connection slave;
 sync_slave_with_master;
 set @@global.slave_net_timeout=@restore_slave_net_timeout;

=== modified file 'sql/log.cc'
--- a/sql/log.cc	2009-06-08 14:58:33 +0000
+++ b/sql/log.cc	2009-06-15 14:00:15 +0000
@@ -4218,7 +4218,7 @@ MYSQL_BIN_LOG::MYSQL_BIN_LOG(uint *sync_
   :bytes_written(0), prepared_xids(0), file_id(1), open_count(1),
    need_start_event(TRUE), m_table_map_version(0),
    sync_period_ptr(sync_period),
-   is_relay_log(0),
+   is_relay_log(0), signal_cnt(0),
    description_event_for_exec(0), description_event_for_queue(0)
 {
   /*
@@ -6687,6 +6687,7 @@ bool flush_error_log()
 void MYSQL_BIN_LOG::signal_update()
 {
   DBUG_ENTER("MYSQL_BIN_LOG::signal_update");
+  signal_cnt++;
   pthread_cond_broadcast(&update_cond);
   DBUG_VOID_RETURN;
 }

=== modified file 'sql/log.h'
--- a/sql/log.h	2009-06-08 14:58:33 +0000
+++ b/sql/log.h	2009-06-15 14:00:15 +0000
@@ -410,7 +410,7 @@ public:
 
   /* This is relay log */
   bool is_relay_log;
-
+  ulong signal_cnt;  // update of the counter is checked by heartbeat
   /*
     These describe the log's format. This is used only for relay logs.
     _for_exec is used by the SQL thread, _for_queue by the I/O thread. It's

=== modified file 'sql/sql_repl.cc'
--- a/sql/sql_repl.cc	2009-06-12 02:01:08 +0000
+++ b/sql/sql_repl.cc	2009-06-15 14:00:15 +0000
@@ -817,18 +817,19 @@ impossible position";
 	case LOG_READ_EOF:
         {
           int ret;
+          ulong signal_cnt;
 	  DBUG_PRINT("wait",("waiting for data in binary log"));
 	  if (thd->server_id==0) // for mysqlbinlog (mysqlbinlog.server_id==0)
 	  {
 	    pthread_mutex_unlock(log_lock);
 	    goto end;
 	  }
-
+          signal_cnt= mysql_bin_log.signal_cnt;
+          do
+          {
 #ifndef DBUG_OFF
-          ulong hb_info_counter= 0;
+            ulong hb_info_counter= 0;
 #endif
-          do 
-          {
             if (coord)
             {
               DBUG_ASSERT(heartbeat_ts && heartbeat_period != 0LL);
@@ -860,12 +861,13 @@ impossible position";
             }
             else
             {
-              DBUG_ASSERT(ret == 0);
+              DBUG_ASSERT(ret == 0 && signal_cnt != mysql_bin_log.signal_cnt ||
+                          thd->killed);
               DBUG_PRINT("wait",("binary log received update"));
             }
-          } while (ret != 0 && coord != NULL && !thd->killed);
+          } while (signal_cnt == mysql_bin_log.signal_cnt && !thd->killed);
           pthread_mutex_unlock(log_lock);
-        }    
+        }
         break;
             
         default:

Thread
bzr commit into mysql-6.0-rpl branch (aelkin:2864) Bug#37714Andrei Elkin15 Jun