List:Commits« Previous MessageNext Message »
From:Marc Alff Date:September 3 2010 5:29pm
Subject:bzr commit into mysql-5.5-bugfixing branch (marc.alff:3195) Bug#56324
View as plain text  
#At file:///home/malff/BZR_TREE/mysql-5.5-bugfixing-56324/ based on revid:svoj@stripped

 3195 Marc Alff	2010-09-03
      Bug#56324 Race Condition while shutting down MySQL: "PSI_server"
      
      Before this fix, the server could crash during shutdown,
      due to race conditions, that occured when killing the server.
      
      In particular, the performance schema instrumentation handle,
      PSI_server, and the performance schema itself would be cleaned up
      too soon, causing race conditions with a running kill server thread.
      
      The specifics of the race condition found are that:
      the main thread executing "PSI_server= NULL" can cause crashes in
      other threads still running, which are executing
      "if (PSI_server != NULL) PSI_server->xxx()"
      as part of the performance schema instrumentation.
      
      While the bug was reported for the kill server thread,
      in theory the same crash could happen with the signal thread,
      as found by code analysis.
      
      This fix re works the code in mysqld_main() used during shutdown,
      and in particular:
      - the performance schema is now shut down _after_ the signal thread
      is completed,
      - the performance schema is now shut down _after_ the kill server thread
      is completed,
      
      This code has been tested with running 300 times in a row the following
      tests in parallel, which have been known to fail with race conditions
      in the past:
      - rpl_change_master
      - binlog_max_extension
      - events_restart
      - rpl_heartbeat_basic
      and no crash of test failure has been seen with the changed code.

    modified:
      sql/mysqld.cc
=== modified file 'sql/mysqld.cc'
--- a/sql/mysqld.cc	2010-08-25 19:00:38 +0000
+++ b/sql/mysqld.cc	2010-09-03 17:29:54 +0000
@@ -323,6 +323,13 @@ static PSI_rwlock_key key_rwlock_openssl
 /* the default log output is log tables */
 static bool lower_case_table_names_used= 0;
 static bool volatile select_thread_in_use, signal_thread_in_use;
+/**
+  Indicate that the kill server thread is executing.
+  This flag is used to make sure that the server main thread (@c mysqld_main())
+  properly waits for @c kill_server_thread() to complete,
+  to avoid race conditions during shutdown.
+*/
+static bool volatile kill_server_thread_in_use;
 static bool volatile ready_to_exit;
 static my_bool opt_debugging= 0, opt_external_locking= 0, opt_console= 0;
 static my_bool opt_short_log_format= 0;
@@ -626,6 +633,7 @@ mysql_rwlock_t LOCK_system_variables_has
 mysql_cond_t COND_thread_count;
 mysql_cond_t COND_global_read_lock;
 pthread_t signal_thread;
+pthread_t kill_server_thread_id;
 pthread_attr_t connection_attrib;
 mysql_mutex_t LOCK_server_started;
 mysql_cond_t COND_server_started;
@@ -965,6 +973,7 @@ static void start_signal_handler(void);
 static void close_server_sock();
 static void clean_up_mutexes(void);
 static void wait_for_signal_thread_to_end(void);
+static void wait_for_kill_server_thread_to_end(void);
 static void create_pid_file();
 static void mysqld_exit(int exit_code) __attribute__((noreturn));
 #endif
@@ -1219,10 +1228,11 @@ void kill_mysql(void)
 #ifdef SIGNALS_DONT_BREAK_READ
   if (!kill_in_progress)
   {
-    pthread_t tmp;
     abort_loop=1;
     if (mysql_thread_create(0, /* Not instrumented */
-                            &tmp, &connection_attrib, kill_server_thread,
+                            &kill_server_thread_id,
+                            &connection_attrib,
+                            kill_server_thread,
                             (void*) 0))
       sql_print_error("Can't create thread to kill server");
   }
@@ -1306,10 +1316,12 @@ static void __cdecl kill_server(int sig_
 #if defined(USE_ONE_SIGNAL_HAND)
 pthread_handler_t kill_server_thread(void *arg __attribute__((unused)))
 {
+  kill_server_thread_in_use= 1;
   my_thread_init();				// Initialize new thread
   kill_server(0);
   /* purecov: begin deadcode */
   my_thread_end();
+  kill_server_thread_in_use= 0;
   pthread_exit(0);
   return 0;
   /* purecov: end */
@@ -1381,12 +1393,6 @@ extern "C" void unireg_abort(int exit_co
 
 static void mysqld_exit(int exit_code)
 {
-  /*
-    Important note: we wait for the signal thread to end,
-    but if a kill -15 signal was sent, the signal thread did
-    spawn the kill_server_thread thread, which is running concurrently.
-  */
-  wait_for_signal_thread_to_end();
   mysql_audit_finalize();
   clean_up_mutexes();
   clean_up_error_log_mutex();
@@ -1523,6 +1529,26 @@ static void wait_for_signal_thread_to_en
   }
 }
 
+static void wait_for_kill_server_thread_to_end()
+{
+  uint i;
+  /*
+    Wait up to 10 seconds to give a chance to the kill server thread
+    to complete and actually kill the server.
+    Note that this is a 'low tech' approach here (sleep), on purpose.
+    'High tech' approach such as:
+    - pthread_join()
+    - pthread_kill()
+    would just generate more events and more signals to trap,
+    which would interfere with the shutdown already in progress.
+    A guarantee to finish after a given delay, even with memory leaks,
+    is preferred to no guarantee to complete at all.
+    The mysqld process is about to die and exit() in a couple of instructions,
+    so memory 'leaks' are not real leaks.
+  */
+  for (i= 0; i<100 && kill_in_progress; i++)
+    my_sleep(100);
+}
 
 static void clean_up_mutexes()
 {
@@ -2728,9 +2754,10 @@ pthread_handler_t signal_hand(void *arg
       {
 	abort_loop=1;				// mark abort for threads
 #ifdef USE_ONE_SIGNAL_HAND
-	pthread_t tmp;
         if (mysql_thread_create(0, /* Not instrumented */
-                                &tmp, &connection_attrib, kill_server_thread,
+                                &kill_server_thread_id,
+                                &connection_attrib,
+                                kill_server_thread,
                                 (void*) &sig))
 	  sql_print_error("Can't create thread to kill server");
 #else
@@ -4614,18 +4641,44 @@ int mysqld_main(int argc, char **argv)
   }
 #endif
   clean_up(1);
+
 #ifdef HAVE_PSI_INTERFACE
   /*
-    Disable the instrumentation, to avoid recording events
+    Disable this thread instrumentation, to avoid recording events
     during the shutdown.
   */
   if (PSI_server)
-  {
     PSI_server->delete_current_thread();
+#endif
+  /*
+    Important note: we wait for the signal thread to end,
+    but if a kill -15 signal / console CTRL-C was sent, the event did
+    spawn the kill_server_thread thread, which is running concurrently.
+  */
+  wait_for_signal_thread_to_end();
+  /*
+    So now that no more signal thread is running,
+    we can hunt down any remaining kill server thread.
+    This call may not return.
+  */
+  wait_for_kill_server_thread_to_end();
+
+#ifdef WITH_PERFSCHEMA_STORAGE_ENGINE
+  /*
+    If the kill server thread somehow is still alive,
+    do not free any memory in the performance schema instrumentation,
+    to avoid race conditions, as mysqld_main() and kill_server_thread()
+    are still competing to end this process ...
+    This is for robustness in production, and should not cause
+    false positive with tools like valgrind.
+  */
+  if (! kill_in_progress)
+  {
     PSI_server= NULL;
+    shutdown_performance_schema();
   }
-  shutdown_performance_schema();
 #endif
+
   mysqld_exit(0);
 }
 
@@ -6700,6 +6753,7 @@ static int mysql_init_variables(void)
   opt_endinfo= using_udf_functions= 0;
   opt_using_transactions= 0;
   abort_loop= select_thread_in_use= signal_thread_in_use= 0;
+  kill_server_thread_in_use= 0;
   ready_to_exit= shutdown_in_progress= grant_option= 0;
   aborted_threads= aborted_connects= 0;
   delayed_insert_threads= delayed_insert_writes= delayed_rows_in_use= 0;


Attachment: [text/bzr-bundle] bzr/marc.alff@oracle.com-20100903172954-4r9ppqz8fvatx5ho.bundle
Thread
bzr commit into mysql-5.5-bugfixing branch (marc.alff:3195) Bug#56324Marc Alff3 Sep
  • Re: bzr commit into mysql-5.5-bugfixing branch (marc.alff:3195) Bug#56324Christopher Powers7 Sep