List:Commits« Previous MessageNext Message »
From:Inaam Rana Date:October 3 2011 6:05pm
Subject:bzr push into mysql-trunk branch (inaam.rana:3471 to 3472) Bug#11759044
View as plain text  
 3472 Inaam Rana	2011-10-03
      Bug#11759044 - 51325: DROPPING AN EMPTY INNODB TABLE TAKES A LONG TIME
      WITH LARGE BUFFER POOL
      
      rb://767
      approved by: Marko
      
      When dropping a table (with an .ibd file i.e.: with
      innodb_file_per_table set) we scan entire LRU to invalidate pages from
      that table. This can be painful in case of large buffer pools as we hold
      the buf_pool->mutex for the scan. Note that gravity of the problem does
      not depend on the size of the table. Even with an empty table but a
      large and filled up buffer pool we'll end up scanning a very long LRU
      list.
      
      The fix is to scan flush_list and just remove the blocks belonging to
      the table from the flush_list, marking them as non-dirty. The blocks
      are left in the LRU list for eventual eviction due to aging. The
      flush_list is typically much smaller than the LRU list but for cases
      where it is very long we have the solution of releasing the
      buf_pool->mutex after scanning 1K pages.
      
      buf_page_[set|unset]_sticky(): Use new IO-state BUF_IO_PIN to ensure
      that a block stays in the flush_list and LRU list when we release
      buf_pool->mutex. Previously we have been abusing BUF_IO_READ to achieve
      this.

    modified:
      storage/innobase/buf/buf0buf.c
      storage/innobase/buf/buf0lru.c
      storage/innobase/handler/i_s.cc
      storage/innobase/include/buf0buf.h
      storage/innobase/include/buf0buf.ic
      storage/innobase/include/buf0types.h
 3471 Norvald H. Ryeng	2011-10-03
      Bug #12330344 CRASH AND/OR VALGRIND ERRORS IN FREE_IO_CACHE WITH JOIN,
      VIEW, PARTITIONED TABLE
      
      Problem: Server crashes when executing query using a TEMPTABLE view
      with a subselect using a partitioned table.
      
      In prune_partitions(), the memory root is substituted for a memory
      root with a life span as long as the prune_partions() call. During
      optimization of functions, subselects may allocate memory that is
      expected to have the same life span as the subselect. Due to the new
      memory root, this memory is freed at the end of prune_partitions() and
      still referred to by the subselect and attempted used during query
      execution.
      
      Fix: Switch memory root back to the normal root before calling
      Item_func::select_optimize().
     @ mysql-test/r/partition.result
        Add test case for bug#12330344
     @ mysql-test/t/partition.test
        Add test case for bug#12330344
     @ sql/opt_range.cc
        Switch memory root before calling Item_func::select_optimize()

    modified:
      mysql-test/r/partition.result
      mysql-test/t/partition.test
      sql/opt_range.cc
=== modified file 'storage/innobase/buf/buf0buf.c'
--- a/storage/innobase/buf/buf0buf.c	revid:norvald.ryeng@stripped
+++ b/storage/innobase/buf/buf0buf.c	revid:inaam.rana@stripped
@@ -4255,6 +4255,9 @@ assert_s_latched:
 					ut_a(rw_lock_is_locked(&block->lock,
 							       RW_LOCK_EX));
 					break;
+
+				case BUF_IO_PIN:
+					break;
 				}
 
 				n_lru++;
@@ -4284,6 +4287,7 @@ assert_s_latched:
 		ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
 		switch (buf_page_get_io_fix(b)) {
 		case BUF_IO_NONE:
+		case BUF_IO_PIN:
 			/* All clean blocks should be I/O-unfixed. */
 			break;
 		case BUF_IO_READ:
@@ -4324,6 +4328,7 @@ assert_s_latched:
 			switch (buf_page_get_io_fix(b)) {
 			case BUF_IO_NONE:
 			case BUF_IO_READ:
+			case BUF_IO_PIN:
 				break;
 			case BUF_IO_WRITE:
 				switch (buf_page_get_flush_type(b)) {

=== modified file 'storage/innobase/buf/buf0lru.c'
--- a/storage/innobase/buf/buf0lru.c	revid:norvald.ryeng@stripped
+++ b/storage/innobase/buf/buf0lru.c	revid:inaam.rana@stripped
@@ -70,8 +70,12 @@ allowed to point to either end of the LR
 
 /** When dropping the search hash index entries before deleting an ibd
 file, we build a local array of pages belonging to that tablespace
-in the buffer pool. Following is the size of that array. */
-#define BUF_LRU_DROP_SEARCH_HASH_SIZE	1024
+in the buffer pool. Following is the size of that array.
+We also release buf_pool->mutex after scanning this many pages of the
+flush_list when dropping a table. This is to ensure that other threads
+are not blocked for extended period of time when using very large
+buffer pools. */
+#define BUF_LRU_DROP_SEARCH_SIZE	1024
 
 /** If we switch on the InnoDB monitor because there are too few available
 frames in the buffer pool, we set this to TRUE */
@@ -216,7 +220,7 @@ buf_LRU_drop_page_hash_batch(
 	ulint	i;
 
 	ut_ad(arr != NULL);
-	ut_ad(count <= BUF_LRU_DROP_SEARCH_HASH_SIZE);
+	ut_ad(count <= BUF_LRU_DROP_SEARCH_SIZE);
 
 	for (i = 0; i < count; ++i) {
 		btr_search_drop_page_hash_when_freed(space_id, zip_size,
@@ -250,7 +254,7 @@ buf_LRU_drop_page_hash_for_tablespace(
 	}
 
 	page_arr = ut_malloc(
-		sizeof(ulint) * BUF_LRU_DROP_SEARCH_HASH_SIZE);
+		sizeof(ulint) * BUF_LRU_DROP_SEARCH_SIZE);
 
 	buf_pool_mutex_enter(buf_pool);
 	num_entries = 0;
@@ -289,10 +293,10 @@ next_page:
 		/* Store the page number so that we can drop the hash
 		index in a batch later. */
 		page_arr[num_entries] = bpage->offset;
-		ut_a(num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE);
+		ut_a(num_entries < BUF_LRU_DROP_SEARCH_SIZE);
 		++num_entries;
 
-		if (num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE) {
+		if (num_entries < BUF_LRU_DROP_SEARCH_SIZE) {
 			goto next_page;
 		}
 
@@ -337,38 +341,39 @@ next_page:
 }
 
 /******************************************************************//**
-Invalidates all pages belonging to a given tablespace inside a specific
+Remove all dirty pages belonging to a given tablespace inside a specific
 buffer pool instance when we are deleting the data file(s) of that
-tablespace. */
+tablespace. The pages still remain a part of LRU and are evicted from
+the list as they age towards the tail of the LRU. */
 static
 void
-buf_LRU_invalidate_tablespace_buf_pool_instance(
-/*============================================*/
+buf_LRU_remove_dirty_pages_for_tablespace(
+/*======================================*/
 	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
 	ulint		id)		/*!< in: space id */
 {
 	buf_page_t*	bpage;
 	ibool		all_freed;
+	ulint		i;
 
 scan_again:
 	buf_pool_mutex_enter(buf_pool);
+	buf_flush_list_mutex_enter(buf_pool);
 
 	all_freed = TRUE;
 
-	bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+	for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list), i = 0;
+	     bpage != NULL; ++i) {
 
-	while (bpage != NULL) {
 		buf_page_t*	prev_bpage;
-		ulint		fold;
-		rw_lock_t* 	hash_lock = NULL;
-		mutex_t* 	block_mutex = NULL;
+		mutex_t*	block_mutex = NULL;
 
 		ut_a(buf_page_in_file(bpage));
 
-		prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
+		prev_bpage = UT_LIST_GET_PREV(list, bpage);
 
 		/* bpage->space and bpage->io_fix are protected by
-		buf_pool->mutex and block_mutex.  It is safe to check
+		buf_pool->mutex and block_mutex. It is safe to check
 		them while holding buf_pool->mutex only. */
 
 		if (buf_page_get_space(bpage) != id) {
@@ -382,95 +387,83 @@ scan_again:
 
 			all_freed = FALSE;
 			goto next_page;
-		} else {
-			fold = buf_page_address_fold(bpage->space,
-						     bpage->offset);
-			hash_lock = buf_page_hash_lock_get(buf_pool, fold);
-			block_mutex = buf_page_get_mutex(bpage);
-
-			rw_lock_x_lock(hash_lock);
-			mutex_enter(block_mutex);
-
-			if (bpage->buf_fix_count > 0) {
+		}
 
-				rw_lock_x_unlock(hash_lock);
-				mutex_exit(block_mutex);
+		/* We have to release the flush_list_mutex to obey the
+		latching order. We are however guaranteed that the page
+		will stay in the flush_list because buf_flush_remove()
+		needs buf_pool->mutex as well. */
+		buf_flush_list_mutex_exit(buf_pool);
+		block_mutex = buf_page_get_mutex(bpage);
+		mutex_enter(block_mutex);
 
-				/* We cannot remove this page during
-				this scan yet; maybe the system is
-				currently reading it in, or flushing
-				the modifications to the file */
+		if (bpage->buf_fix_count > 0) {
+			mutex_exit(block_mutex);
+			buf_flush_list_mutex_enter(buf_pool);
 
-				all_freed = FALSE;
-				goto next_page;
-			}
-		}
+			/* We cannot remove this page during
+			this scan yet; maybe the system is
+			currently reading it in, or flushing
+			the modifications to the file */
 
-		ut_ad(mutex_own(block_mutex));
-#ifdef UNIV_SYNC_DEBUG
-		ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX));
-#endif /* UNIV_SYNC_DEBUG */
-
-#ifdef UNIV_DEBUG
-		if (buf_debug_prints) {
-			fprintf(stderr,
-				"Dropping space %lu page %lu\n",
-				(ulong) buf_page_get_space(bpage),
-				(ulong) buf_page_get_page_no(bpage));
+			all_freed = FALSE;
+			goto next_page;
 		}
-#endif
-		if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
-			/* This is a compressed-only block
-			descriptor. Do nothing. */
-		} else if (((buf_block_t*) bpage)->is_hashed) {
-			ulint	page_no;
-			ulint	zip_size;
 
-			buf_pool_mutex_exit(buf_pool);
+		ut_ad(bpage->oldest_modification != 0);
 
-			zip_size = buf_page_get_zip_size(bpage);
-			page_no = buf_page_get_page_no(bpage);
+		buf_flush_remove(bpage);
 
-			rw_lock_x_unlock(hash_lock);
-			mutex_exit(block_mutex);
-
-			/* Note that the following call will acquire
-			an S-latch on the page */
+		mutex_exit(block_mutex);
+		buf_flush_list_mutex_enter(buf_pool);
+next_page:
+		bpage = prev_bpage;
 
-			btr_search_drop_page_hash_when_freed(
-				id, zip_size, page_no);
-			goto scan_again;
+		if (!bpage) {
+			break;
 		}
 
-		if (bpage->oldest_modification != 0) {
+		/* Every BUF_LRU_DROP_SEARCH_SIZE iterations in the
+		loop we release buf_pool->mutex to let other threads
+		do their job. */
+		if (i < BUF_LRU_DROP_SEARCH_SIZE) {
+			continue;
+		}
 
-			buf_flush_remove(bpage);
+		/* We IO-fix the block to make sure that the block
+		stays in its position in the flush_list. */
+		if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
+			/* Block is already IO-fixed. We don't
+			want to change the value. Lets leave
+			this block alone. */
+			continue;
 		}
 
-		/* Remove from the LRU list. */
+		buf_flush_list_mutex_exit(buf_pool);
+		block_mutex = buf_page_get_mutex(bpage);
+		mutex_enter(block_mutex);
+		buf_page_set_sticky(bpage);
+		mutex_exit(block_mutex);
 
-		if (buf_LRU_block_remove_hashed_page(bpage, TRUE)
-		    != BUF_BLOCK_ZIP_FREE) {
-			buf_LRU_block_free_hashed_page((buf_block_t*)
-						       bpage);
-		} else {
-			/* The block_mutex should have been
-			released by buf_LRU_block_remove_hashed_page() */
-			ut_ad(block_mutex == &buf_pool->zip_mutex);
-		}
+		/* Now it is safe to release the buf_pool->mutex. */
+		buf_pool_mutex_exit(buf_pool);
+		os_thread_yield();
+		buf_pool_mutex_enter(buf_pool);
 
-		ut_ad(!mutex_own(block_mutex));
-#ifdef UNIV_SYNC_DEBUG
-		ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX)
-		      && !rw_lock_own(hash_lock, RW_LOCK_SHARED));
-#endif /* UNIV_SYNC_DEBUG */
-		ut_ad(!mutex_own(block_mutex));
-next_page:
-		bpage = prev_bpage;
+		mutex_enter(block_mutex);
+		buf_page_unset_sticky(bpage);
+		mutex_exit(block_mutex);
 
+		buf_flush_list_mutex_enter(buf_pool);
+		ut_ad(bpage->in_flush_list);
+
+		i = 0;
 	}
 
 	buf_pool_mutex_exit(buf_pool);
+	buf_flush_list_mutex_exit(buf_pool);
+
+	ut_ad(buf_flush_validate(buf_pool));
 
 	if (!all_freed) {
 		os_thread_sleep(20000);
@@ -501,7 +494,7 @@ buf_LRU_invalidate_tablespace(
 
 		buf_pool = buf_pool_from_array(i);
 		buf_LRU_drop_page_hash_for_tablespace(buf_pool, id);
-		buf_LRU_invalidate_tablespace_buf_pool_instance(buf_pool, id);
+		buf_LRU_remove_dirty_pages_for_tablespace(buf_pool, id);
 	}
 }
 
@@ -1581,15 +1574,18 @@ func_exit:
 
 		bpage->zip.data = NULL;
 		page_zip_set_size(&bpage->zip, 0);
+		mutex_exit(block_mutex);
 
 		/* Prevent buf_page_get_gen() from
 		decompressing the block while we release
 		buf_pool->mutex and block_mutex. */
-		b->buf_fix_count++;
-		b->io_fix = BUF_IO_READ;
+		block_mutex = buf_page_get_mutex(b);
+		mutex_enter(block_mutex);
+		buf_page_set_sticky(b);
+		mutex_exit(block_mutex);
 
 		rw_lock_x_unlock(hash_lock);
-		mutex_exit(block_mutex);
+
 	} else {
 
 		/* There can be multiple threads doing an LRU scan to
@@ -1600,11 +1596,9 @@ func_exit:
 		else considers this block as a victim for page
 		replacement. This block is already out of page_hash
 		and we are about to remove it from the LRU list and put
-		it on the free list. To avoid this situation we set the
-		buf_fix_count and io_fix fields here. */
+		it on the free list. */
 		mutex_enter(block_mutex);
-		buf_block_buf_fix_inc((buf_block_t*) bpage, __FILE__, __LINE__);
-		buf_page_set_io_fix(bpage, BUF_IO_READ);
+		buf_page_set_sticky(bpage);
 		mutex_exit(block_mutex);
 	}
 
@@ -1642,19 +1636,9 @@ func_exit:
 
 	buf_pool_mutex_enter(buf_pool);
 
-	if (b) {
-		mutex_enter(&buf_pool->zip_mutex);
-		b->buf_fix_count--;
-		buf_page_set_io_fix(b, BUF_IO_NONE);
-		mutex_exit(&buf_pool->zip_mutex);
-	} else {
-		mutex_enter(block_mutex);
-		ut_ad(bpage->buf_fix_count > 0);
-		ut_ad(bpage->io_fix == BUF_IO_READ);
-		buf_block_buf_fix_dec((buf_block_t*) bpage);
-		buf_page_set_io_fix(bpage, BUF_IO_NONE);
-		mutex_exit(block_mutex);
-	}
+	mutex_enter(block_mutex);
+	buf_page_unset_sticky(b != NULL ? b : bpage);
+	mutex_exit(block_mutex);
 
 	buf_LRU_block_free_hashed_page((buf_block_t*) bpage);
 	return(TRUE);

=== modified file 'storage/innobase/handler/i_s.cc'
--- a/storage/innobase/handler/i_s.cc	revid:norvald.ryeng@stripped
+++ b/storage/innobase/handler/i_s.cc	revid:inaam.rana@stripped
@@ -3285,6 +3285,10 @@ i_s_innodb_buffer_page_fill(
 			OK(field_store_string(fields[IDX_BUFFER_PAGE_IO_FIX],
 					      "IO_WRITE"));
 			break;
+		case BUF_IO_PIN:
+			OK(field_store_string(fields[IDX_BUFFER_PAGE_IO_FIX],
+					      "IO_PIN"));
+			break;
 		}
 
 		OK(field_store_string(fields[IDX_BUFFER_PAGE_IS_OLD],

=== modified file 'storage/innobase/include/buf0buf.h'
--- a/storage/innobase/include/buf0buf.h	revid:norvald.ryeng@stripped
+++ b/storage/innobase/include/buf0buf.h	revid:inaam.rana@stripped
@@ -938,7 +938,27 @@ buf_block_set_io_fix(
 /*=================*/
 	buf_block_t*	block,	/*!< in/out: control block */
 	enum buf_io_fix	io_fix);/*!< in: io_fix state */
-
+/*********************************************************************//**
+Makes a block sticky. A sticky block implies that even after we release
+the buf_pool->mutex and the block->mutex:
+* it cannot be removed from the flush_list
+* the block descriptor cannot be relocated
+* it cannot be removed from the LRU list
+Note that:
+* the block can still change its position in the LRU list
+* the next and previous pointers can change. */
+UNIV_INLINE
+void
+buf_page_set_sticky(
+/*================*/
+	buf_page_t*	bpage);	/*!< in/out: control block */
+/*********************************************************************//**
+Removes stickiness of a block. */
+UNIV_INLINE
+void
+buf_page_unset_sticky(
+/*==================*/
+	buf_page_t*	bpage);	/*!< in/out: control block */
 /********************************************************************//**
 Determine if a buffer block can be relocated in memory.  The block
 can be dirty, but it must not be I/O-fixed or bufferfixed. */

=== modified file 'storage/innobase/include/buf0buf.ic'
--- a/storage/innobase/include/buf0buf.ic	revid:norvald.ryeng@stripped
+++ b/storage/innobase/include/buf0buf.ic	revid:inaam.rana@stripped
@@ -425,6 +425,7 @@ buf_page_get_io_fix(
 	case BUF_IO_NONE:
 	case BUF_IO_READ:
 	case BUF_IO_WRITE:
+	case BUF_IO_PIN:
 		return(io_fix);
 	}
 	ut_error;
@@ -475,6 +476,49 @@ buf_block_set_io_fix(
 	buf_page_set_io_fix(&block->page, io_fix);
 }
 
+/*********************************************************************//**
+Makes a block sticky. A sticky block implies that even after we release
+the buf_pool->mutex and the block->mutex:
+* it cannot be removed from the flush_list
+* the block descriptor cannot be relocated
+* it cannot be removed from the LRU list
+Note that:
+* the block can still change its position in the LRU list
+* the next and previous pointers can change. */
+UNIV_INLINE
+void
+buf_page_set_sticky(
+/*================*/
+	buf_page_t*	bpage)	/*!< in/out: control block */
+{
+#ifdef UNIV_DEBUG
+	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
+	ut_ad(buf_pool_mutex_own(buf_pool));
+#endif
+	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
+
+	bpage->io_fix = BUF_IO_PIN;
+}
+
+/*********************************************************************//**
+Removes stickiness of a block. */
+UNIV_INLINE
+void
+buf_page_unset_sticky(
+/*==================*/
+	buf_page_t*	bpage)	/*!< in/out: control block */
+{
+#ifdef UNIV_DEBUG
+	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
+	ut_ad(buf_pool_mutex_own(buf_pool));
+#endif
+	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_PIN);
+
+	bpage->io_fix = BUF_IO_NONE;
+}
+
 /********************************************************************//**
 Determine if a buffer block can be relocated in memory.  The block
 can be dirty, but it must not be I/O-fixed or bufferfixed. */

=== modified file 'storage/innobase/include/buf0types.h'
--- a/storage/innobase/include/buf0types.h	revid:norvald.ryeng@stripped
+++ b/storage/innobase/include/buf0types.h	revid:inaam.rana@stripped
@@ -56,7 +56,10 @@ enum buf_flush {
 enum buf_io_fix {
 	BUF_IO_NONE = 0,		/**< no pending I/O */
 	BUF_IO_READ,			/**< read pending */
-	BUF_IO_WRITE			/**< write pending */
+	BUF_IO_WRITE,			/**< write pending */
+	BUF_IO_PIN			/**< disallow relocation of
+					the block and its removal from
+					the flush_list */
 };
 
 /** Alternatives for srv_checksum_algorithm, which can be changed by


Attachment: [text/bzr-bundle] bzr/inaam.rana@oracle.com-20111003175354-4ww5itiev1sfq7md.bundle
Thread
bzr push into mysql-trunk branch (inaam.rana:3471 to 3472) Bug#11759044Inaam Rana5 Oct