List:Maria Storage Engine« Previous MessageNext Message »
From:Guilhem Bichot Date:October 13 2008 9:03pm
Subject:bzr commit into MySQL/Maria:mysql-maria branch (guilhem:2677) Bug#39210
View as plain text  
#At bzr+ssh://bk-internal.mysql.com/bzrroot/server/mysql-maria/

 2677 Guilhem Bichot	2008-10-13
      Fix for BUG#39210 "Maria deadlock in _ma_bitmap_wait_or_flush". It was actually a story of
      threads which nobody woke up (see comment of ma_blockrec.c). No testcase, this requires
      multiple threads and is automatically tested at push time by maria_stress.yy (pushbuild2).
modified:
  storage/maria/ma_bitmap.c
  storage/maria/ma_blockrec.c
  storage/maria/maria_def.h

per-file messages:
  storage/maria/ma_bitmap.c
    _ma_bitmap_wait_or_flush() didn't publish that it was waiting for bitmap to not
    be over-allocated (i.e. didn't modify bitmap->flush_all_requested) so nobody
    (_ma_bitmap_flushable(), _ma_bitmap_release_unused()) knew it had to wake it up => it stalled.
    After fixing that, this still stalled sometimes, because there can be two concurrent
    threads waiting for the bitmap to not be over-allocated (full table scan and checkpoint) so
    bitmap->flush_all_requested must be a counter, not just a boolean.
    After fixing that, this still stalled sometimes, because _ma_bitmap_flushable(1)
    (going to over-allocate the bitmap) intentionally waits for those who are waiting
    for the bitmap to not be over-allocated: in return, those must wake up
    _ma_bitmap_flushable(1), but _ma_bitmap_wait_or_flush() didn't do so.
    After fixing that, a thread which unpins bitmap pages in _ma_bitmap_unpin_all() sometimes
    hit an assertion in the page cache which states that you can unpin/unlock only what
    *you* have pinned/locked. Fixed by changing the pagecache_unlock_by_link() call
    in _ma_bitmap_unpin_all().
    After fixing that, we hit BUG 39665 :)
  storage/maria/ma_blockrec.c
    split assertion in two, I see the second half fire (filed as BUG 39665)
  storage/maria/maria_def.h
    At one moment, there can be more than one thread waiting for the bitmap to not be over-allocated
    (table scan and checkpoint are those threads).
=== modified file 'storage/maria/ma_bitmap.c'
--- a/storage/maria/ma_bitmap.c	2008-10-13 15:50:28 +0000
+++ b/storage/maria/ma_bitmap.c	2008-10-13 21:03:05 +0000
@@ -306,32 +306,50 @@ my_bool _ma_bitmap_flush(MARIA_SHARE *sh
 }
 
 
-/*
-  @brief Send updated bitmap to the page cache if bitmap is free
-
-  @note
-  This is used by reader threads which don't unpin things
+/**
+  Wait for bitmap page to not be over-allocated and send it to page cache
 */
 
 my_bool _ma_bitmap_wait_or_flush(MARIA_SHARE *share)
 {
   my_bool res= 0;
   MARIA_FILE_BITMAP *bitmap= &share->bitmap;
-  DBUG_ENTER("_ma_bitmap_flush");
+  DBUG_ENTER("_ma_bitmap_wait_or_flush");
+  /*
+    We need to flush what's in memory (bitmap.map) to page cache otherwise, as
+    we are going to read bitmaps from page cache in table scan (see
+    _ma_scan_block_record()), we may miss recently inserted rows (page in page
+    cache would be too old). This matters only for committed rows, that is,
+    rows for which there was a commit before our transaction started; as
+    commit and transaction's start are protected by the same mutex, we see
+    memory at least as new as at other transaction's commit time, so if the
+    committed rows caused bitmap->changed to be true, we see it; if we see 0
+    it really means a flush happened since then. So, read without bitmap's
+    mutex is safe here.
+  */
   if (bitmap->changed)
   {
     pthread_mutex_lock(&bitmap->bitmap_lock);
+    /*
+      The following wait is not for correctness (we could just go directly
+      into write_changed_bitmap() and thus pin the unflushable bitmap page),
+      but as pin means write-lock it could hurt concurrency.
+    */
+    bitmap->flush_all_requested++;
     while (bitmap->non_flushable && bitmap->changed)
     {
       DBUG_PRINT("info", ("waiting for bitmap to be flushable"));
       pthread_cond_wait(&bitmap->bitmap_cond, &bitmap->bitmap_lock);
     }
+    bitmap->flush_all_requested--;
     if (bitmap->changed)
     {
       bitmap->changed= 0;
       res= write_changed_bitmap(share, bitmap);
     }
     pthread_mutex_unlock(&bitmap->bitmap_lock);
+    DBUG_PRINT("info", ("bitmap flusher waking up others"));
+    pthread_cond_broadcast(&bitmap->bitmap_cond);
   }
   DBUG_RETURN(res);
 }
@@ -373,7 +391,7 @@ my_bool _ma_bitmap_flush_all(MARIA_SHARE
   pthread_mutex_lock(&bitmap->bitmap_lock);
   if (bitmap->changed)
   {
-    bitmap->flush_all_requested= TRUE;
+    bitmap->flush_all_requested++;
 #ifndef WRONG_BITMAP_FLUSH
     while (bitmap->non_flushable > 0)
     {
@@ -381,14 +399,18 @@ my_bool _ma_bitmap_flush_all(MARIA_SHARE
       pthread_cond_wait(&bitmap->bitmap_cond, &bitmap->bitmap_lock);
     }
 #endif
+    bitmap->flush_all_requested--;
     /*
       Bitmap is in a flushable state: its contents in memory are reflected by
       log records (complete REDO-UNDO groups) and all bitmap pages are
       unpinned. We keep the mutex to preserve this situation, and flush to the
       file.
     */
-    res= write_changed_bitmap(share, bitmap);
-    bitmap->changed= FALSE;
+    if (bitmap->changed)
+    {
+      res= write_changed_bitmap(share, bitmap);
+      bitmap->changed= FALSE;
+    }
     /*
       We do NOT use FLUSH_KEEP_LAZY because we must be sure that bitmap
       pages have been flushed. That's a condition of correctness of
@@ -405,7 +427,6 @@ my_bool _ma_bitmap_flush_all(MARIA_SHARE
                                            &bitmap->pages_covered) &
         PCFLUSH_PINNED_AND_ERROR)
       res= TRUE;
-    bitmap->flush_all_requested= FALSE;
     /*
       Some well-behaved threads may be waiting for flush_all_requested to
       become false, wake them up.
@@ -425,6 +446,8 @@ my_bool _ma_bitmap_flush_all(MARIA_SHARE
 
   @return Operation status
     @retval   0   ok
+
+  @note This unlocks and unpins pages locked and pinned by other threads.
 */
 
 static void _ma_bitmap_unpin_all(MARIA_SHARE *share)
@@ -438,7 +461,7 @@ static void _ma_bitmap_unpin_all(MARIA_S
   while (pinned_page-- != page_link)
     pagecache_unlock_by_link(share->pagecache, pinned_page->link,
                              pinned_page->unlock, PAGECACHE_UNPIN,
-                             LSN_IMPOSSIBLE, LSN_IMPOSSIBLE, TRUE, FALSE);
+                             LSN_IMPOSSIBLE, LSN_IMPOSSIBLE, TRUE, TRUE);
   bitmap->pinned_pages.elements= 0;
   DBUG_VOID_RETURN;
 }
@@ -2140,8 +2163,8 @@ my_bool _ma_bitmap_set_full_page_bits(MA
    function first waits for the flush to be done.
 
    @note
-   info->non_flushable_state is set to 1 if we have incremented
-   bitmap->info->non_flushable and not yet decremented it.
+   this sets info->non_flushable_state to 1 if we have incremented
+   bitmap->non_flushable and not yet decremented it.
 
    @param  share               Table's share
    @param  non_flushable_inc   Increment of MARIA_FILE_BITMAP::non_flushable
@@ -2152,20 +2175,21 @@ void _ma_bitmap_flushable(MARIA_HA *info
 {
   MARIA_SHARE *share= info->s;
   MARIA_FILE_BITMAP *bitmap;
+  DBUG_ENTER("_ma_bitmap_flushable");
 
   /*
     Not transactional tables are never automaticly flushed and needs no
     protection
   */
   if (!share->now_transactional)
-    return;
+    DBUG_VOID_RETURN;
 
   bitmap= &share->bitmap;
   if (non_flushable_inc == -1)
   {
     pthread_mutex_lock(&bitmap->bitmap_lock);
-    DBUG_ASSERT((int) bitmap->non_flushable > 0 &&
-                info->non_flushable_state == 1);
+    DBUG_ASSERT((int) bitmap->non_flushable > 0);
+    DBUG_ASSERT(info->non_flushable_state == 1);
     info->non_flushable_state= 0;
     if (--bitmap->non_flushable == 0)
     {
@@ -2183,14 +2207,15 @@ void _ma_bitmap_flushable(MARIA_HA *info
     }
     DBUG_PRINT("info", ("bitmap->non_flushable: %u", bitmap->non_flushable));
     pthread_mutex_unlock(&bitmap->bitmap_lock);
-    return;
+    DBUG_VOID_RETURN;
   }
-  DBUG_ASSERT(non_flushable_inc == 1 && info->non_flushable_state == 0);
+  DBUG_ASSERT(non_flushable_inc == 1);
+  DBUG_ASSERT(info->non_flushable_state == 0);
   /* It is a read without mutex because only an optimization */
   if (unlikely(bitmap->flush_all_requested))
   {
     /*
-      _ma_bitmap_flush_all() is waiting for the bitmap to become
+      Some other thread is waiting for the bitmap to become
       flushable. Not the moment to make the bitmap unflushable or more
       unflushable; let's rather back off and wait. If we didn't do this, with
       multiple writers, there may always be one thread causing the bitmap to
@@ -2215,6 +2240,7 @@ void _ma_bitmap_flushable(MARIA_HA *info
   bitmap->non_flushable++;
   info->non_flushable_state= 1;
   DBUG_PRINT("info", ("bitmap->non_flushable: %u", bitmap->non_flushable));
+  DBUG_VOID_RETURN;
 }
 
 
@@ -2322,10 +2348,12 @@ my_bool _ma_bitmap_release_unused(MARIA_
       goto err;
   }
 
-  if (info->s->now_transactional)
+  /*
+    This duplicates ma_bitmap_flushable(-1) except that it already has mutex.
+  */
+  if (info->non_flushable_state)
   {
-    DBUG_ASSERT((int) bitmap->non_flushable >= 0 &&
-                info->non_flushable_state);
+    DBUG_ASSERT((int) bitmap->non_flushable >= 0);
     info->non_flushable_state= 0;
     if (--bitmap->non_flushable == 0)
     {

=== modified file 'storage/maria/ma_blockrec.c'
--- a/storage/maria/ma_blockrec.c	2008-10-13 15:50:28 +0000
+++ b/storage/maria/ma_blockrec.c	2008-10-13 21:03:05 +0000
@@ -5141,7 +5141,9 @@ restart_record_read:
     if (end_of_data > info->scan.dir_end ||
         offset < PAGE_HEADER_SIZE || length < share->base.min_block_length)
     {
-      DBUG_ASSERT(0);
+      DBUG_ASSERT(!(end_of_data > info->scan.dir_end));
+      DBUG_ASSERT(!(offset < PAGE_HEADER_SIZE));
+      DBUG_ASSERT(!(length < share->base.min_block_length));
       goto err;
     }
 #endif

=== modified file 'storage/maria/maria_def.h'
--- a/storage/maria/maria_def.h	2008-10-09 20:03:54 +0000
+++ b/storage/maria/maria_def.h	2008-10-13 21:03:05 +0000
@@ -242,7 +242,7 @@ typedef struct st_maria_file_bitmap
   pgcache_page_no_t page;              /* Page number for current bitmap */
   uint used_size;                      /* Size of bitmap head that is not 0 */
   my_bool changed;                     /* 1 if page needs to be flushed */
-  my_bool flush_all_requested;         /**< If _ma_bitmap_flush_all waiting */
+  uint flush_all_requested;            /**< count of bitmap flushers */
   uint non_flushable;                  /**< 0 if bitmap and log are in sync */
   PAGECACHE_FILE file;		       /* datafile where bitmap is stored */
 

Thread
bzr commit into MySQL/Maria:mysql-maria branch (guilhem:2677) Bug#39210Guilhem Bichot13 Oct