MySQL Lists are EOL. Please join:

List:Commits« Previous MessageNext Message »
From:Rafal Somla Date:November 26 2009 9:03am
Subject:bzr commit into mysql-6.0-backup branch (Rafal.Somla:2895) Bug#48930
View as plain text  
#At file:///ext/mysql/bzr/backup/bug48930/ based on revid:rafal.somla@stripped

 2895 Rafal Somla	2009-11-26
      Bug#48930 - Backup: Lock thread race condition for default backup driver.
      
      This patch fixes few issues in the synchronization between the table 
      locking thread used by the default backup driver and the main thread. It 
      basically implements a new synchronization protocol which mimics the one 
      used in the MyISAM backup driver. In particular, it uses a single mutex 
      for synchronization between the two threads.
     @ sql/backup/be_default.cc
        Change the logic for acquiring table locks, because now erros are not
        signaled through locking_thd->lock_state.
        
        Note: As in the original code, if driver is waiting for the locking
        thread it sends empty buffer with OK reply to the kernel. It works, but
        perhaps better would be to send BUSY reply in that case. However, we 
        do not change previous behavior and leave this refactoring for a separate
        patch.
     @ sql/backup/be_snapshot.cc
        Do not use lock_state to infer if all tables have been processed.
        Instead, use explicit information given by get_data() method.
     @ sql/backup/be_thread.cc
        - Changes in the locking thread: 
         -- use m_error for signalling errors, independent from lock_state;
         -- implement new synchronization protocol;
         -- add comments.
        - Changes in kill_locking_thread():
         -- make this method wait for the locking thread after it has been killed,
         -- implement new synchronization protocol;
         -- add extensive comments.
        - Removed wait_until_locking_thread_dies() which is not needed now.
     @ sql/backup/be_thread.h
        - Remove lock_states which are not used now.
        - Use only one mutex to synchronize locking thread with the main thread.
        - Introduce new conditions for improved synchronization protocol.
        - Introduce new members: m_error and do_not_terminate.
        - Removed wait_until_locking_thread_dies() method since now waiting is
          done inside kill_locking_thread().

    modified:
      sql/backup/be_default.cc
      sql/backup/be_snapshot.cc
      sql/backup/be_thread.cc
      sql/backup/be_thread.h
=== modified file 'sql/backup/be_default.cc'
--- a/sql/backup/be_default.cc	2009-10-26 16:48:43 +0000
+++ b/sql/backup/be_default.cc	2009-11-26 09:02:58 +0000
@@ -156,7 +156,6 @@ Backup::~Backup()
     thread has finished before freeing elements of this table list.
   */
   locking_thd->kill_locking_thread();
-  locking_thd->wait_until_locking_thread_dies();
   my_free(all_tables, MYF(0));
 }
 
@@ -336,6 +335,21 @@ result_t Backup::get_data(Buffer &buf)
 
   DBUG_ENTER("Default_backup::get_data(Buffer &buf)");
 
+  buf.table_num= 0;
+  buf.last= TRUE;
+
+  /*
+    If this is first get_data() call, inform the kernel that init phase
+    is over so that we will proceed to the other phases of the protocol. 
+    No data is sent to the kernel.
+  */
+  if (!init_phase_complete)
+  {
+    buf.size= 0;
+    init_phase_complete= TRUE;
+    DBUG_RETURN(READY);
+  }
+
   /*
     Check the lock state. Take action based on the availability of the lock.
 
@@ -345,25 +359,28 @@ result_t Backup::get_data(Buffer &buf)
   if (!locks_acquired)
   {
     buf.size= 0;
-    buf.table_num= 0;
-    buf.last= TRUE;
     switch (locking_thd->lock_state) {
-    case LOCK_ERROR:             // Something ugly happened in locking.
-      DBUG_RETURN(ERROR);
     case LOCK_ACQUIRED:          // First time lock ready for validity point.
     {
       locks_acquired= TRUE;
       DEBUG_SYNC(locking_thd->m_thd, "default_locking_thread_added");
+      /*
+        Since locks have been acquired, we can signal to the kernel that we 
+        are ready for lock() call.
+      */ 
       DBUG_RETURN(READY);
     }
-    default:                     // If first call, signal end of init phase.
-      if (init_phase_complete)
-        DBUG_RETURN(OK);
+    default:
+      if (locking_thd->m_error)
+        DBUG_RETURN(ERROR);     // Something ugly happened in locking.
       else
-      {
-        init_phase_complete= TRUE;
-        DBUG_RETURN(READY);
-      }
+        /*
+          We must wait for locks. We return OK sending an empty buffer (which 
+          will be ignored). Backup kernel will call get_data() again and then 
+          we will have another chance to check if the locking thread has 
+          acquired table locks.
+        */ 
+        DBUG_RETURN(OK);
     }
   }
 

=== modified file 'sql/backup/be_snapshot.cc'
--- a/sql/backup/be_snapshot.cc	2009-10-21 13:32:24 +0000
+++ b/sql/backup/be_snapshot.cc	2009-11-26 09:02:58 +0000
@@ -153,12 +153,11 @@ result_t Backup::get_data(Buffer &buf)
   res= default_backup::Backup::get_data(buf);
 
   /*
-    If this is the last table to be read, close the transaction
-    and unlock the tables. This is indicated by the lock state
-    being set to LOCK_SIGNAL from parent::get_data(). This is set
-    after the last table is finished reading.
+    If this was the last table to be read, close the transaction
+    and unlock the tables. This is indicated by DONE result from
+    parent::get_data().
   */
-  if ((locking_thd->lock_state == LOCK_SIGNAL) || m_cancel)
+  if (res == DONE || m_cancel)
     cleanup();
   return(res);
 }

=== modified file 'sql/backup/be_thread.cc'
--- a/sql/backup/be_thread.cc	2009-10-21 13:32:24 +0000
+++ b/sql/backup/be_thread.cc	2009-11-26 09:02:58 +0000
@@ -108,8 +108,6 @@ pthread_handler_t backup_thread_for_lock
 
   DBUG_PRINT("info", ("Default_backup - lock_tables_in_separate_thread"));
 
-  // Turn off condition variable check for lock.
-  locking_thd->lock_state= LOCK_NOT_STARTED;
   my_thread_init();
 
   // First, create a new THD object.
@@ -118,7 +116,7 @@ pthread_handler_t backup_thread_for_lock
 
   if (thd == 0 || thd->killed)
   {
-    locking_thd->lock_state= LOCK_ERROR;
+    locking_thd->m_error= TRUE;
     goto end2;
   }
 
@@ -126,39 +124,30 @@ pthread_handler_t backup_thread_for_lock
   thd->set_query(locking_thd->thd_name.c_ptr(), locking_thd->thd_name.length());
 
   pthread_detach_this_thread();
+
   locking_thd->lock_thd= thd;
+  locking_thd->lock_state= LOCK_IN_PROGRESS;
 
-  // Now open and lock the tables.
+  // There should be some tables to be opened. We error if this is not the case.
 
   DBUG_PRINT("info",("Online backup open tables in thread"));
   if (!locking_thd->tables_in_backup)
   {
     DBUG_PRINT("info",("Online backup locking error no tables to lock"));
     THD_SET_PROC_INFO(thd, "lock error");
-    locking_thd->lock_state= LOCK_ERROR;
+    locking_thd->m_error= TRUE;
     goto end2;
   }
 
-  pthread_mutex_lock(&locking_thd->THR_LOCK_caller);
-
-  if (thd->killed)
-  {
-    THD_SET_PROC_INFO(thd, "lock error");
-    locking_thd->lock_state= LOCK_ERROR;
-    pthread_mutex_unlock(&locking_thd->THR_LOCK_caller);
-    goto end;
-  }
-
-  /*
-    As locking tables can be a long operation, we need to support
-    killing the thread. In this case, we need to close the tables
-    and exit.
-  */
+  // Now open and lock the tables.
 
   /*
     The MYSQL_OPEN_SKIP_TEMPORARY flag is needed so that temporary tables are
-    not opened which would occulde the regular tables selected for backup
+    not opened which would occlude the regular tables selected for backup
     (BUG#33574).
+    
+    Note: open_and_lock_tables_derived() will signal error when the
+    locking thread is killed.
   */
   if (open_and_lock_tables_derived(thd, locking_thd->tables_in_backup,
                                    FALSE, /* Do not process derived tables. */
@@ -167,34 +156,25 @@ pthread_handler_t backup_thread_for_lock
   {
     DBUG_PRINT("info",("Online backup locking thread dying"));
     THD_SET_PROC_INFO(thd, "lock error");
-    locking_thd->lock_state= LOCK_ERROR;
-    pthread_mutex_unlock(&locking_thd->THR_LOCK_caller);
-    goto end;
-  }
-
-  if (thd->killed)
-  {
-    THD_SET_PROC_INFO(thd, "lock error");
-    locking_thd->lock_state= LOCK_ERROR;
-    pthread_mutex_unlock(&locking_thd->THR_LOCK_caller);
+    locking_thd->m_error= TRUE;
     goto end;
   }
 
-  pthread_mutex_unlock(&locking_thd->THR_LOCK_caller);
+  locking_thd->lock_state= LOCK_ACQUIRED;
 
   /*
-    Part of work is done. Rest until woken up.
-    We wait if the thread is not killed and the driver has not signaled us.
+    Part of work is done. Wait until we are killed (COND_killed condition), 
+    then release table locks and terminate. 
   */
   THD_SET_PROC_INFO(thd, "waiting for signal");
-  pthread_mutex_lock(&locking_thd->THR_LOCK_thread);
-  locking_thd->lock_state= LOCK_ACQUIRED;
-  thd->enter_cond(&locking_thd->COND_thread_wait,
-                  &locking_thd->THR_LOCK_thread,
+
+  pthread_mutex_lock(&locking_thd->THR_LOCK_cond);
+  thd->enter_cond(&locking_thd->COND_killed,
+                  &locking_thd->THR_LOCK_cond,
                   "Locking thread: holding table locks");
-  while (!thd->killed && (locking_thd->lock_state != LOCK_SIGNAL))
-    pthread_cond_wait(&locking_thd->COND_thread_wait,
-                      &locking_thd->THR_LOCK_thread);
+  while (!thd->killed)
+    pthread_cond_wait(&locking_thd->COND_killed,
+                      &locking_thd->THR_LOCK_cond);
   thd->exit_cond("Locking thread: terminating");
   THD_SET_PROC_INFO(thd, "terminating");
 
@@ -207,22 +187,39 @@ end:
 
 end2:
   THD_SET_PROC_INFO(thd, "lock done");
+
+  // Final synchronization with driver's thread.
+
+  pthread_mutex_lock(&locking_thd->THR_LOCK_cond);
+
+  /*
+    We must wait if do_not_terminate flag is TRUE. This is because driver's 
+    thread is about to use lock_thd to kill us. Therefore we can't destroy THD
+    instance yet, but must wait for COND_can_terminate.
+  */ 
+  while (locking_thd->do_not_terminate)
+    pthread_cond_wait(&locking_thd->COND_can_terminate,
+                      &locking_thd->THR_LOCK_cond);
+
+  locking_thd->lock_state= LOCK_DONE;
+  locking_thd->lock_thd= NULL;
+
+  // Signal COND_done condition in case driver's thread is waiting for us.
+  pthread_cond_signal(&locking_thd->COND_done);
+
+  pthread_mutex_unlock(&locking_thd->THR_LOCK_cond);
+
+  // Unregister this thread.
+  
   pthread_mutex_lock(&LOCK_thread_count);
   thread_count--;
   thread_running--;
   pthread_mutex_unlock(&LOCK_thread_count);
 
-  pthread_mutex_lock(&locking_thd->THR_LOCK_caller);
+  // Final cleanup.
+
   net_end(&thd->net);
   delete thd;
-  locking_thd->lock_thd= NULL;
-  if (locking_thd->lock_state != LOCK_ERROR)
-    locking_thd->lock_state= LOCK_DONE;
-
-  // Signal the driver thread that it's ok to proceed with destructor.
-
-  pthread_cond_signal(&locking_thd->COND_caller_wait);
-  pthread_mutex_unlock(&locking_thd->THR_LOCK_caller);
   my_thread_end(); /* always last, after all mutex usage */
   pthread_exit(0);
   return (0);
@@ -232,15 +229,15 @@ end2:
 /// Constructor for Locking_thread_st structure.
 
 Locking_thread_st::Locking_thread_st()
- :m_thread_started(FALSE)
+ : tables_in_backup(NULL), lock_thd(NULL), lock_state(LOCK_NOT_STARTED),
+   m_thd(NULL), m_thread_started(FALSE), do_not_terminate(FALSE), 
+   m_error(FALSE)
 {
   // Initialize the thread mutex and cond variable.
-  pthread_mutex_init(&THR_LOCK_thread, MY_MUTEX_INIT_FAST);
-  pthread_cond_init(&COND_thread_wait, NULL);
-  pthread_mutex_init(&THR_LOCK_caller, MY_MUTEX_INIT_FAST);
-  pthread_cond_init(&COND_caller_wait, NULL);
-  lock_state= LOCK_NOT_STARTED;
-  lock_thd= NULL; // Set to 0 as precaution for get_data being called too soon.
+  pthread_mutex_init(&THR_LOCK_cond, MY_MUTEX_INIT_FAST);
+  pthread_cond_init(&COND_killed, NULL);
+  pthread_cond_init(&COND_can_terminate, NULL);
+  pthread_cond_init(&COND_done, NULL);
   thd_name.length(0);
 };
 
@@ -249,23 +246,15 @@ Locking_thread_st::Locking_thread_st()
 
 Locking_thread_st::~Locking_thread_st()
 {
-  /*
-    If the locking thread has been started we need to kill it. We also need to
-    wait until it dies before destroying the mutexes so that the locking thread
-    won't access them any more.
-  */
+  // Kill the locking thread (if it is running)
   if (m_thread_started)
-  {
     kill_locking_thread();
-    wait_until_locking_thread_dies();
-  }
 
   // Destroy the thread mutexes and cond variables.
-
-  pthread_mutex_destroy(&THR_LOCK_thread);
-  pthread_cond_destroy(&COND_thread_wait);
-  pthread_mutex_destroy(&THR_LOCK_caller);
-  pthread_cond_destroy(&COND_caller_wait);
+  pthread_mutex_destroy(&THR_LOCK_cond);
+  pthread_cond_destroy(&COND_killed);
+  pthread_cond_destroy(&COND_can_terminate);
+  pthread_cond_destroy(&COND_done);
 }
 
 
@@ -293,11 +282,12 @@ result_t Locking_thread_st::start_lockin
 
 
 /**
-   Kill the driver's lock thread.
+   Kill the driver's lock thread and wait for it to terminate.
 
-   This method issues the awake and broadcast to kill the locking thread.
-   A mutex is used to prevent the locking thread from deleting the THD
-   structure until this operation is complete.
+   This method issues the awake and broadcast to kill the locking thread. It
+   also waits for COND_done so that we know that the locking thread will 
+   terminate shortly. A care is taken to ensure that the lock_thd instance is
+   not deleted when it is used to kill the locking thread. 
 */
 
 void Locking_thread_st::kill_locking_thread()
@@ -308,53 +298,77 @@ void Locking_thread_st::kill_locking_thr
   if (!m_thread_started)
     DBUG_VOID_RETURN;
 
-  pthread_mutex_lock(&THR_LOCK_caller);
-  if (lock_state == LOCK_ERROR)
-    THD_SET_PROC_INFO(m_thd, "error in the locking thread");
+ retry:
+
+  pthread_mutex_lock(&THR_LOCK_cond);
 
-  if (lock_thd && (lock_state != LOCK_DONE) && (lock_state != LOCK_SIGNAL))
+  /*
+    We hold THR_LOCK_cond. If lock_state == LOCK_DONE done then it means that 
+    the locking thread is in its final section (after end2 label) and will
+    terminate without any further action. 
+  */ 
+  if (lock_state == LOCK_DONE)
   {
-    lock_state= LOCK_SIGNAL;
-    pthread_mutex_lock(&lock_thd->LOCK_thd_data);
-    lock_thd->awake(THD::KILL_CONNECTION);
-    pthread_mutex_unlock(&lock_thd->LOCK_thd_data);
-    pthread_cond_signal(&COND_thread_wait);
+    pthread_mutex_unlock(&THR_LOCK_cond);  
+    DBUG_VOID_RETURN;
   }
-  pthread_mutex_unlock(&THR_LOCK_caller);
-
-  // This tells the CS driver that we're finished with the tables.
-  if (!lock_thd && (lock_state == LOCK_ACQUIRED))
-    lock_state= LOCK_SIGNAL;
-  DBUG_VOID_RETURN;
-}
 
+  /*
+    Otherwise (lock_state != LOCK_DONE), the locking thread must be still 
+    before the final synchronization phase (i.e., before waiting for 
+    COND_can_terminate). We want to ensure that lock_thd is set before we use
+    it for killing the locking thread. It could happen that the locking thread
+    have not yet created THD instance - in that case we must wait for it.
+    Since this situation is unlikely, we do not bother with introducing yet 
+    another condition, but we do active wait.
+  */    
+  if (unlikely(lock_thd == NULL))
+  {
+    pthread_mutex_unlock(&THR_LOCK_cond);  
+    my_sleep(1);
+    goto retry;
+  }
 
-/**
-   Wait until driver's lock thread finishes.
+  /*  
+    Now we know that lock_thd is set. Since we are holding THR_LOCK_cond mutex,
+    the THD instance will not be deleted by the locking thread. However, we 
+    must release the mutex because THD::awake() can not be used while we are 
+    holding one. To protect locking thread from terminating and deleting 
+    lock_thd in the meanwhile, we set do_not_terminate flag to TRUE.
+  */
+  do_not_terminate= TRUE;
+  pthread_mutex_unlock(&THR_LOCK_cond);  
 
-   @note It is important to use this function before freeing memory
-         or destroying objects to which such thread might access.
-*/
+ 
+  // Kill locking thread.
+  DBUG_ASSERT(lock_thd);
+  pthread_mutex_lock(&lock_thd->LOCK_thd_data);
+  lock_thd->awake(THD::KILL_CONNECTION);
+  pthread_mutex_unlock(&lock_thd->LOCK_thd_data);
 
-void Locking_thread_st::wait_until_locking_thread_dies()
-{
-  // Nothing to do if the locking thread has not been started.
-  if (!m_thread_started)
-    return;
+  /*
+    Now we will wait for COND_done to be sure that the locking thread goes 
+    away. But first we will signal both conditions on which the locking thread
+    might be waiting at the moment. 
+  */ 
+
+  pthread_mutex_lock(&THR_LOCK_cond);
+  do_not_terminate= FALSE;
+  pthread_cond_signal(&COND_killed);
+  pthread_cond_signal(&COND_can_terminate);
+  
+  // Now Wait for COND_done.
 
-  pthread_mutex_lock(&THR_LOCK_caller);
-  if (lock_state != LOCK_DONE)
-  {
-    m_thd->enter_cond(&COND_caller_wait, &THR_LOCK_caller,
+  m_thd->enter_cond(&COND_done, &THR_LOCK_cond,
                     "Locking thread: waiting until locking thread is done");
-    while (lock_state != LOCK_DONE)
-      pthread_cond_wait(&COND_caller_wait, &THR_LOCK_caller);
-    m_thd->exit_cond("Locking thread: terminating");
-
-    DBUG_PRINT("info",("Locking thread's locking thread terminated"));
-  }
-  else
-    pthread_mutex_unlock(&THR_LOCK_caller);
+  while (lock_state != LOCK_DONE)
+    pthread_cond_wait(&COND_done, &THR_LOCK_cond);
+  m_thd->exit_cond("Locking thread: terminating");
 
   m_thread_started= FALSE;
+
+  if (m_error)
+    THD_SET_PROC_INFO(m_thd, "error in the locking thread");
+
+  DBUG_VOID_RETURN;
 }

=== modified file 'sql/backup/be_thread.h'
--- a/sql/backup/be_thread.h	2009-10-21 13:32:24 +0000
+++ b/sql/backup/be_thread.h	2009-11-26 09:02:58 +0000
@@ -15,8 +15,6 @@ typedef enum {
   LOCK_IN_PROGRESS,
   LOCK_ACQUIRED,
   LOCK_DONE,
-  LOCK_ERROR,
-  LOCK_SIGNAL
 } LOCK_STATE;
 
 using backup::result_t;
@@ -53,10 +51,21 @@ public:
   Locking_thread_st();
   ~Locking_thread_st();
 
-  pthread_mutex_t THR_LOCK_thread; ///< Mutex for thread variables.
-  pthread_cond_t COND_thread_wait; ///< Condition variable for wait.
-  pthread_mutex_t THR_LOCK_caller; ///< Mutex for thread variables.
-  pthread_cond_t COND_caller_wait; ///< Condition variable for wait.
+  /** 
+    Signalled when the locking thread has been killed and should release
+    table locks.
+  */
+  pthread_cond_t  COND_killed;
+
+  /** 
+    Signalled when it is OK for the locking thread to delete the THD instance
+    and terminate.
+  */
+  pthread_cond_t  COND_can_terminate;
+
+  /// Signalled by locking thread when it has finished and is about to exit.
+  pthread_cond_t  COND_done;
+  pthread_mutex_t THR_LOCK_cond; ///< Mutex for thread variables.
 
   TABLE_LIST *tables_in_backup;    ///< List of tables used in backup.
   THD *lock_thd;                   ///< Locking thread pointer.
@@ -66,9 +75,19 @@ public:
   /// Indicates if the locking thread has been started.
   my_bool m_thread_started;
 
+  /** 
+    Set to TRUE if the locking thread should not yet terminate. In particular
+    the THD instance should not be deleted if this flag is TRUE.
+  */
+  my_bool  do_not_terminate;
+
+  /** 
+    TRUE if error was encountered by the locking thread.
+  */
+  my_bool  m_error;
+
   result_t start_locking_thread(const char *tname);
   void kill_locking_thread();
-  void wait_until_locking_thread_dies();
 
 }; // Locking_thread_st
 


Attachment: [text/bzr-bundle] bzr/rafal.somla@sun.com-20091126090258-038073fqwz1t01oh.bundle
Thread
bzr commit into mysql-6.0-backup branch (Rafal.Somla:2895) Bug#48930Rafal Somla26 Nov