List: Commits
From: Inaam Rana
Date: October 3, 2011 6:01pm
Subject: bzr commit into mysql-trunk branch (inaam.rana:3472) Bug#11759044
#At file:///home/inaam/w/buf_io_fix/ based on revid:norvald.ryeng@stripped

 3472 Inaam Rana	2011-10-03
      Bug#11759044 - 51325: DROPPING AN EMPTY INNODB TABLE TAKES A LONG TIME
      WITH LARGE BUFFER POOL
      
      rb://767
      approved by: Marko
      
      When dropping a table (one with an .ibd file, i.e., with
      innodb_file_per_table set) we scan the entire LRU list to
      invalidate pages belonging to that table. This can be painful with
      large buffer pools because we hold buf_pool->mutex for the whole
      scan. Note that the severity of the problem does not depend on the
      size of the table: even with an empty table but a large, filled-up
      buffer pool we end up scanning a very long LRU list.
      
      The fix is to scan the flush_list instead and remove only the
      blocks belonging to the table from it, marking them as non-dirty.
      The blocks are left in the LRU list for eventual eviction as they
      age. The flush_list is typically much smaller than the LRU list,
      but for cases where it is still very long we release
      buf_pool->mutex after every 1K pages scanned.
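      
      As a rough illustration of the batched-scan pattern described
      above, here is a minimal standalone sketch. All names in it
      (page_t, pool_mutex, flush_list_tail, drop_dirty_pages,
      SCAN_BATCH) are invented for the example and only stand in for
      the real InnoDB structures; the actual list unlinking is elided.
      
          #include <pthread.h>
          #include <sched.h>
          #include <stddef.h>
          
          #define SCAN_BATCH 1024 /* stand-in for BUF_LRU_DROP_SEARCH_SIZE */
          
          typedef struct page_t page_t;
          struct page_t {
              unsigned long space_id; /* tablespace the page belongs to */
              page_t*       prev;     /* towards the head of the list */
          };
          
          static pthread_mutex_t pool_mutex = PTHREAD_MUTEX_INITIALIZER;
          static page_t* flush_list_tail;  /* oldest pages at the tail */
          
          /* Unlinking is elided; the sketch shows the locking pattern. */
          static void flush_list_detach(page_t* p) { (void) p; }
          
          /* Remove all pages of a given tablespace from the flush list,
          releasing pool_mutex every SCAN_BATCH iterations so that other
          threads are not starved for the duration of the whole scan. */
          void drop_dirty_pages(unsigned long space_id)
          {
              page_t*       p;
              unsigned long i = 0;
          
              pthread_mutex_lock(&pool_mutex);
          
              for (p = flush_list_tail; p != NULL; ++i) {
                  page_t* prev = p->prev; /* save before detaching p */
          
                  if (p->space_id == space_id) {
                      flush_list_detach(p);
                  }
          
                  p = prev;
          
                  if (p != NULL && i >= SCAN_BATCH) {
                      /* The real patch IO-pins the current page first
                      so it cannot leave the list while the mutex is
                      released; see BUF_IO_PIN below. */
                      pthread_mutex_unlock(&pool_mutex);
                      sched_yield();
                      pthread_mutex_lock(&pool_mutex);
                      i = 0;
                  }
              }
          
              pthread_mutex_unlock(&pool_mutex);
          }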
      
      buf_page_[set|unset]_sticky(): use the new IO-state BUF_IO_PIN to
      ensure that a block stays in the flush_list and the LRU list while
      we release buf_pool->mutex. Previously we had been abusing
      BUF_IO_READ to achieve this.
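      
      The pin itself is tiny; a hedged sketch of the idea follows, with
      invented names (block_t, block_set_sticky) rather than the real
      buf_page_set_sticky() implementation, which additionally asserts
      that buf_pool->mutex and the block mutex are held:
      
          #include <assert.h>
          
          enum io_fix { IO_NONE, IO_READ, IO_WRITE, IO_PIN };
          
          typedef struct {
              enum io_fix io_fix;
          } block_t;
          
          /* Pin the block: eviction and relocation code paths skip any
          block whose io_fix is not IO_NONE, so a pinned block stays put
          in the flush_list and LRU list while the pool mutex is
          released. Only an unfixed block may be pinned. */
          static void block_set_sticky(block_t* b)
          {
              assert(b->io_fix == IO_NONE);
              b->io_fix = IO_PIN;
          }
          
          static void block_unset_sticky(block_t* b)
          {
              assert(b->io_fix == IO_PIN);
              b->io_fix = IO_NONE;
          }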

    modified:
      storage/innobase/buf/buf0buf.c
      storage/innobase/buf/buf0lru.c
      storage/innobase/handler/i_s.cc
      storage/innobase/include/buf0buf.h
      storage/innobase/include/buf0buf.ic
      storage/innobase/include/buf0types.h
=== modified file 'storage/innobase/buf/buf0buf.c'
--- a/storage/innobase/buf/buf0buf.c	revid:norvald.ryeng@stripped
+++ b/storage/innobase/buf/buf0buf.c	revid:inaam.rana@stripped
@@ -4255,6 +4255,9 @@ assert_s_latched:
 					ut_a(rw_lock_is_locked(&block->lock,
 							       RW_LOCK_EX));
 					break;
+
+				case BUF_IO_PIN:
+					break;
 				}
 
 				n_lru++;
@@ -4284,6 +4287,7 @@ assert_s_latched:
 		ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
 		switch (buf_page_get_io_fix(b)) {
 		case BUF_IO_NONE:
+		case BUF_IO_PIN:
 			/* All clean blocks should be I/O-unfixed. */
 			break;
 		case BUF_IO_READ:
@@ -4324,6 +4328,7 @@ assert_s_latched:
 			switch (buf_page_get_io_fix(b)) {
 			case BUF_IO_NONE:
 			case BUF_IO_READ:
+			case BUF_IO_PIN:
 				break;
 			case BUF_IO_WRITE:
 				switch (buf_page_get_flush_type(b)) {

=== modified file 'storage/innobase/buf/buf0lru.c'
--- a/storage/innobase/buf/buf0lru.c	revid:norvald.ryeng@stripped
+++ b/storage/innobase/buf/buf0lru.c	revid:inaam.rana@stripped
@@ -70,8 +70,12 @@ allowed to point to either end of the LR
 
 /** When dropping the search hash index entries before deleting an ibd
 file, we build a local array of pages belonging to that tablespace
-in the buffer pool. Following is the size of that array. */
-#define BUF_LRU_DROP_SEARCH_HASH_SIZE	1024
+in the buffer pool. Following is the size of that array.
+We also release buf_pool->mutex after scanning this many pages of the
+flush_list when dropping a table. This is to ensure that other threads
+are not blocked for an extended period of time when using very
+large buffer pools. */
+#define BUF_LRU_DROP_SEARCH_SIZE	1024
 
 /** If we switch on the InnoDB monitor because there are too few available
 frames in the buffer pool, we set this to TRUE */
@@ -216,7 +220,7 @@ buf_LRU_drop_page_hash_batch(
 	ulint	i;
 
 	ut_ad(arr != NULL);
-	ut_ad(count <= BUF_LRU_DROP_SEARCH_HASH_SIZE);
+	ut_ad(count <= BUF_LRU_DROP_SEARCH_SIZE);
 
 	for (i = 0; i < count; ++i) {
 		btr_search_drop_page_hash_when_freed(space_id, zip_size,
@@ -250,7 +254,7 @@ buf_LRU_drop_page_hash_for_tablespace(
 	}
 
 	page_arr = ut_malloc(
-		sizeof(ulint) * BUF_LRU_DROP_SEARCH_HASH_SIZE);
+		sizeof(ulint) * BUF_LRU_DROP_SEARCH_SIZE);
 
 	buf_pool_mutex_enter(buf_pool);
 	num_entries = 0;
@@ -289,10 +293,10 @@ next_page:
 		/* Store the page number so that we can drop the hash
 		index in a batch later. */
 		page_arr[num_entries] = bpage->offset;
-		ut_a(num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE);
+		ut_a(num_entries < BUF_LRU_DROP_SEARCH_SIZE);
 		++num_entries;
 
-		if (num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE) {
+		if (num_entries < BUF_LRU_DROP_SEARCH_SIZE) {
 			goto next_page;
 		}
 
@@ -337,38 +341,39 @@ next_page:
 }
 
 /******************************************************************//**
-Invalidates all pages belonging to a given tablespace inside a specific
+Removes all dirty pages belonging to a given tablespace inside a specific
 buffer pool instance when we are deleting the data file(s) of that
-tablespace. */
+tablespace. The pages still remain a part of the LRU list and are
+evicted from it as they age towards its tail. */
 static
 void
-buf_LRU_invalidate_tablespace_buf_pool_instance(
-/*============================================*/
+buf_LRU_remove_dirty_pages_for_tablespace(
+/*======================================*/
 	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
 	ulint		id)		/*!< in: space id */
 {
 	buf_page_t*	bpage;
 	ibool		all_freed;
+	ulint		i;
 
 scan_again:
 	buf_pool_mutex_enter(buf_pool);
+	buf_flush_list_mutex_enter(buf_pool);
 
 	all_freed = TRUE;
 
-	bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+	for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list), i = 0;
+	     bpage != NULL; ++i) {
 
-	while (bpage != NULL) {
 		buf_page_t*	prev_bpage;
-		ulint		fold;
-		rw_lock_t* 	hash_lock = NULL;
-		mutex_t* 	block_mutex = NULL;
+		mutex_t*	block_mutex = NULL;
 
 		ut_a(buf_page_in_file(bpage));
 
-		prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
+		prev_bpage = UT_LIST_GET_PREV(list, bpage);
 
 		/* bpage->space and bpage->io_fix are protected by
-		buf_pool->mutex and block_mutex.  It is safe to check
+		buf_pool->mutex and block_mutex. It is safe to check
 		them while holding buf_pool->mutex only. */
 
 		if (buf_page_get_space(bpage) != id) {
@@ -382,95 +387,83 @@ scan_again:
 
 			all_freed = FALSE;
 			goto next_page;
-		} else {
-			fold = buf_page_address_fold(bpage->space,
-						     bpage->offset);
-			hash_lock = buf_page_hash_lock_get(buf_pool, fold);
-			block_mutex = buf_page_get_mutex(bpage);
-
-			rw_lock_x_lock(hash_lock);
-			mutex_enter(block_mutex);
-
-			if (bpage->buf_fix_count > 0) {
+		}
 
-				rw_lock_x_unlock(hash_lock);
-				mutex_exit(block_mutex);
+		/* We have to release the flush_list_mutex to obey the
+		latching order. We are however guaranteed that the page
+		will stay in the flush_list because buf_flush_remove()
+		needs buf_pool->mutex as well. */
+		buf_flush_list_mutex_exit(buf_pool);
+		block_mutex = buf_page_get_mutex(bpage);
+		mutex_enter(block_mutex);
 
-				/* We cannot remove this page during
-				this scan yet; maybe the system is
-				currently reading it in, or flushing
-				the modifications to the file */
+		if (bpage->buf_fix_count > 0) {
+			mutex_exit(block_mutex);
+			buf_flush_list_mutex_enter(buf_pool);
 
-				all_freed = FALSE;
-				goto next_page;
-			}
-		}
+			/* We cannot remove this page during
+			this scan yet; maybe the system is
+			currently reading it in, or flushing
+			the modifications to the file */
 
-		ut_ad(mutex_own(block_mutex));
-#ifdef UNIV_SYNC_DEBUG
-		ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX));
-#endif /* UNIV_SYNC_DEBUG */
-
-#ifdef UNIV_DEBUG
-		if (buf_debug_prints) {
-			fprintf(stderr,
-				"Dropping space %lu page %lu\n",
-				(ulong) buf_page_get_space(bpage),
-				(ulong) buf_page_get_page_no(bpage));
+			all_freed = FALSE;
+			goto next_page;
 		}
-#endif
-		if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
-			/* This is a compressed-only block
-			descriptor. Do nothing. */
-		} else if (((buf_block_t*) bpage)->is_hashed) {
-			ulint	page_no;
-			ulint	zip_size;
 
-			buf_pool_mutex_exit(buf_pool);
+		ut_ad(bpage->oldest_modification != 0);
 
-			zip_size = buf_page_get_zip_size(bpage);
-			page_no = buf_page_get_page_no(bpage);
+		buf_flush_remove(bpage);
 
-			rw_lock_x_unlock(hash_lock);
-			mutex_exit(block_mutex);
-
-			/* Note that the following call will acquire
-			an S-latch on the page */
+		mutex_exit(block_mutex);
+		buf_flush_list_mutex_enter(buf_pool);
+next_page:
+		bpage = prev_bpage;
 
-			btr_search_drop_page_hash_when_freed(
-				id, zip_size, page_no);
-			goto scan_again;
+		if (!bpage) {
+			break;
 		}
 
-		if (bpage->oldest_modification != 0) {
+		/* Every BUF_LRU_DROP_SEARCH_SIZE iterations in the
+		loop we release buf_pool->mutex to let other threads
+		do their job. */
+		if (i < BUF_LRU_DROP_SEARCH_SIZE) {
+			continue;
+		}
 
-			buf_flush_remove(bpage);
+		/* We IO-fix the block to make sure that the block
+		stays in its position in the flush_list. */
+		if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
+			/* Block is already IO-fixed. We don't
+			want to change the value. Lets leave
+			this block alone. */
+			continue;
 		}
 
-		/* Remove from the LRU list. */
+		buf_flush_list_mutex_exit(buf_pool);
+		block_mutex = buf_page_get_mutex(bpage);
+		mutex_enter(block_mutex);
+		buf_page_set_sticky(bpage);
+		mutex_exit(block_mutex);
 
-		if (buf_LRU_block_remove_hashed_page(bpage, TRUE)
-		    != BUF_BLOCK_ZIP_FREE) {
-			buf_LRU_block_free_hashed_page((buf_block_t*)
-						       bpage);
-		} else {
-			/* The block_mutex should have been
-			released by buf_LRU_block_remove_hashed_page() */
-			ut_ad(block_mutex == &buf_pool->zip_mutex);
-		}
+		/* Now it is safe to release the buf_pool->mutex. */
+		buf_pool_mutex_exit(buf_pool);
+		os_thread_yield();
+		buf_pool_mutex_enter(buf_pool);
 
-		ut_ad(!mutex_own(block_mutex));
-#ifdef UNIV_SYNC_DEBUG
-		ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX)
-		      && !rw_lock_own(hash_lock, RW_LOCK_SHARED));
-#endif /* UNIV_SYNC_DEBUG */
-		ut_ad(!mutex_own(block_mutex));
-next_page:
-		bpage = prev_bpage;
+		mutex_enter(block_mutex);
+		buf_page_unset_sticky(bpage);
+		mutex_exit(block_mutex);
 
+		buf_flush_list_mutex_enter(buf_pool);
+		ut_ad(bpage->in_flush_list);
+
+		i = 0;
 	}
 
 	buf_pool_mutex_exit(buf_pool);
+	buf_flush_list_mutex_exit(buf_pool);
+
+	ut_ad(buf_flush_validate(buf_pool));
 
 	if (!all_freed) {
 		os_thread_sleep(20000);
@@ -501,7 +494,7 @@ buf_LRU_invalidate_tablespace(
 
 		buf_pool = buf_pool_from_array(i);
 		buf_LRU_drop_page_hash_for_tablespace(buf_pool, id);
-		buf_LRU_invalidate_tablespace_buf_pool_instance(buf_pool, id);
+		buf_LRU_remove_dirty_pages_for_tablespace(buf_pool, id);
 	}
 }
 
@@ -1581,15 +1574,18 @@ func_exit:
 
 		bpage->zip.data = NULL;
 		page_zip_set_size(&bpage->zip, 0);
+		mutex_exit(block_mutex);
 
 		/* Prevent buf_page_get_gen() from
 		decompressing the block while we release
 		buf_pool->mutex and block_mutex. */
-		b->buf_fix_count++;
-		b->io_fix = BUF_IO_READ;
+		block_mutex = buf_page_get_mutex(b);
+		mutex_enter(block_mutex);
+		buf_page_set_sticky(b);
+		mutex_exit(block_mutex);
 
 		rw_lock_x_unlock(hash_lock);
-		mutex_exit(block_mutex);
+
 	} else {
 
 		/* There can be multiple threads doing an LRU scan to
@@ -1600,11 +1596,9 @@ func_exit:
 		else considers this block as a victim for page
 		replacement. This block is already out of page_hash
 		and we are about to remove it from the LRU list and put
-		it on the free list. To avoid this situation we set the
-		buf_fix_count and io_fix fields here. */
+		it on the free list. */
 		mutex_enter(block_mutex);
-		buf_block_buf_fix_inc((buf_block_t*) bpage, __FILE__, __LINE__);
-		buf_page_set_io_fix(bpage, BUF_IO_READ);
+		buf_page_set_sticky(bpage);
 		mutex_exit(block_mutex);
 	}
 
@@ -1642,19 +1636,9 @@ func_exit:
 
 	buf_pool_mutex_enter(buf_pool);
 
-	if (b) {
-		mutex_enter(&buf_pool->zip_mutex);
-		b->buf_fix_count--;
-		buf_page_set_io_fix(b, BUF_IO_NONE);
-		mutex_exit(&buf_pool->zip_mutex);
-	} else {
-		mutex_enter(block_mutex);
-		ut_ad(bpage->buf_fix_count > 0);
-		ut_ad(bpage->io_fix == BUF_IO_READ);
-		buf_block_buf_fix_dec((buf_block_t*) bpage);
-		buf_page_set_io_fix(bpage, BUF_IO_NONE);
-		mutex_exit(block_mutex);
-	}
+	mutex_enter(block_mutex);
+	buf_page_unset_sticky(b != NULL ? b : bpage);
+	mutex_exit(block_mutex);
 
 	buf_LRU_block_free_hashed_page((buf_block_t*) bpage);
 	return(TRUE);

=== modified file 'storage/innobase/handler/i_s.cc'
--- a/storage/innobase/handler/i_s.cc	revid:norvald.ryeng@stripped
+++ b/storage/innobase/handler/i_s.cc	revid:inaam.rana@stripped
@@ -3285,6 +3285,10 @@ i_s_innodb_buffer_page_fill(
 			OK(field_store_string(fields[IDX_BUFFER_PAGE_IO_FIX],
 					      "IO_WRITE"));
 			break;
+		case BUF_IO_PIN:
+			OK(field_store_string(fields[IDX_BUFFER_PAGE_IO_FIX],
+					      "IO_PIN"));
+			break;
 		}
 
 		OK(field_store_string(fields[IDX_BUFFER_PAGE_IS_OLD],

=== modified file 'storage/innobase/include/buf0buf.h'
--- a/storage/innobase/include/buf0buf.h	revid:norvald.ryeng@stripped
+++ b/storage/innobase/include/buf0buf.h	revid:inaam.rana@stripped
@@ -938,7 +938,27 @@ buf_block_set_io_fix(
 /*=================*/
 	buf_block_t*	block,	/*!< in/out: control block */
 	enum buf_io_fix	io_fix);/*!< in: io_fix state */
-
+/*********************************************************************//**
+Makes a block sticky. A sticky block implies that even after we release
+the buf_pool->mutex and the block->mutex:
+* it cannot be removed from the flush_list
+* the block descriptor cannot be relocated
+* it cannot be removed from the LRU list
+Note that:
+* the block can still change its position in the LRU list
+* the next and previous pointers can change. */
+UNIV_INLINE
+void
+buf_page_set_sticky(
+/*================*/
+	buf_page_t*	bpage);	/*!< in/out: control block */
+/*********************************************************************//**
+Removes stickiness of a block. */
+UNIV_INLINE
+void
+buf_page_unset_sticky(
+/*==================*/
+	buf_page_t*	bpage);	/*!< in/out: control block */
 /********************************************************************//**
 Determine if a buffer block can be relocated in memory.  The block
 can be dirty, but it must not be I/O-fixed or bufferfixed. */

=== modified file 'storage/innobase/include/buf0buf.ic'
--- a/storage/innobase/include/buf0buf.ic	revid:norvald.ryeng@stripped
+++ b/storage/innobase/include/buf0buf.ic	revid:inaam.rana@stripped
@@ -425,6 +425,7 @@ buf_page_get_io_fix(
 	case BUF_IO_NONE:
 	case BUF_IO_READ:
 	case BUF_IO_WRITE:
+	case BUF_IO_PIN:
 		return(io_fix);
 	}
 	ut_error;
@@ -475,6 +476,49 @@ buf_block_set_io_fix(
 	buf_page_set_io_fix(&block->page, io_fix);
 }
 
+/*********************************************************************//**
+Makes a block sticky. A sticky block implies that even after we release
+the buf_pool->mutex and the block->mutex:
+* it cannot be removed from the flush_list
+* the block descriptor cannot be relocated
+* it cannot be removed from the LRU list
+Note that:
+* the block can still change its position in the LRU list
+* the next and previous pointers can change. */
+UNIV_INLINE
+void
+buf_page_set_sticky(
+/*================*/
+	buf_page_t*	bpage)	/*!< in/out: control block */
+{
+#ifdef UNIV_DEBUG
+	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
+	ut_ad(buf_pool_mutex_own(buf_pool));
+#endif
+	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
+
+	bpage->io_fix = BUF_IO_PIN;
+}
+
+/*********************************************************************//**
+Removes stickiness of a block. */
+UNIV_INLINE
+void
+buf_page_unset_sticky(
+/*==================*/
+	buf_page_t*	bpage)	/*!< in/out: control block */
+{
+#ifdef UNIV_DEBUG
+	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
+	ut_ad(buf_pool_mutex_own(buf_pool));
+#endif
+	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_PIN);
+
+	bpage->io_fix = BUF_IO_NONE;
+}
+
 /********************************************************************//**
 Determine if a buffer block can be relocated in memory.  The block
 can be dirty, but it must not be I/O-fixed or bufferfixed. */

=== modified file 'storage/innobase/include/buf0types.h'
--- a/storage/innobase/include/buf0types.h	revid:norvald.ryeng@stripped
+++ b/storage/innobase/include/buf0types.h	revid:inaam.rana@stripped
@@ -56,7 +56,10 @@ enum buf_flush {
 enum buf_io_fix {
 	BUF_IO_NONE = 0,		/**< no pending I/O */
 	BUF_IO_READ,			/**< read pending */
-	BUF_IO_WRITE			/**< write pending */
+	BUF_IO_WRITE,			/**< write pending */
+	BUF_IO_PIN			/**< disallow relocation of
+					the block and its removal
+					from the flush_list */
 };
 
 /** Alternatives for srv_checksum_algorithm, which can be changed by


Attachment: [text/bzr-bundle] bzr/inaam.rana@oracle.com-20111003175354-4ww5itiev1sfq7md.bundle