3472 Inaam Rana 2011-10-03
Bug#11759044 - 51325: DROPPING AN EMPTY INNODB TABLE TAKES A LONG TIME
WITH LARGE BUFFER POOL
rb://767
approved by: Marko
When dropping a table (with an .ibd file i.e.: with
innodb_file_per_table set) we scan entire LRU to invalidate pages from
that table. This can be painful in case of large buffer pools as we hold
the buf_pool->mutex for the scan. Note that gravity of the problem does
not depend on the size of the table. Even with an empty table but a
large and filled up buffer pool we'll end up scanning a very long LRU
list.
The fix is to scan flush_list and just remove the blocks belonging to
the table from the flush_list, marking them as non-dirty. The blocks
are left in the LRU list for eventual eviction due to aging. The
flush_list is typically much smaller than the LRU list but for cases
where it is very long we have the solution of releasing the
buf_pool->mutex after scanning 1K pages.
buf_page_[set|unset]_sticky(): Use new IO-state BUF_IO_PIN to ensure
that a block stays in the flush_list and LRU list when we release
buf_pool->mutex. Previously we have been abusing BUF_IO_READ to achieve
this.
modified:
storage/innobase/buf/buf0buf.c
storage/innobase/buf/buf0lru.c
storage/innobase/handler/i_s.cc
storage/innobase/include/buf0buf.h
storage/innobase/include/buf0buf.ic
storage/innobase/include/buf0types.h
3471 Norvald H. Ryeng 2011-10-03
Bug #12330344 CRASH AND/OR VALGRIND ERRORS IN FREE_IO_CACHE WITH JOIN,
VIEW, PARTITIONED TABLE
Problem: Server crashes when executing query using a TEMPTABLE view
with a subselect using a partitioned table.
In prune_partitions(), the memory root is substituted for a memory
root with a life span as long as the prune_partions() call. During
optimization of functions, subselects may allocate memory that is
expected to have the same life span as the subselect. Due to the new
memory root, this memory is freed at the end of prune_partitions() and
still referred to by the subselect and attempted used during query
execution.
Fix: Switch memory root back to the normal root before calling
Item_func::select_optimize().
@ mysql-test/r/partition.result
Add test case for bug#12330344
@ mysql-test/t/partition.test
Add test case for bug#12330344
@ sql/opt_range.cc
Switch memory root before calling Item_func::select_optimize()
modified:
mysql-test/r/partition.result
mysql-test/t/partition.test
sql/opt_range.cc
=== modified file 'storage/innobase/buf/buf0buf.c'
--- a/storage/innobase/buf/buf0buf.c revid:norvald.ryeng@stripped
+++ b/storage/innobase/buf/buf0buf.c revid:inaam.rana@stripped
@@ -4255,6 +4255,9 @@ assert_s_latched:
ut_a(rw_lock_is_locked(&block->lock,
RW_LOCK_EX));
break;
+
+ case BUF_IO_PIN:
+ break;
}
n_lru++;
@@ -4284,6 +4287,7 @@ assert_s_latched:
ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
switch (buf_page_get_io_fix(b)) {
case BUF_IO_NONE:
+ case BUF_IO_PIN:
/* All clean blocks should be I/O-unfixed. */
break;
case BUF_IO_READ:
@@ -4324,6 +4328,7 @@ assert_s_latched:
switch (buf_page_get_io_fix(b)) {
case BUF_IO_NONE:
case BUF_IO_READ:
+ case BUF_IO_PIN:
break;
case BUF_IO_WRITE:
switch (buf_page_get_flush_type(b)) {
=== modified file 'storage/innobase/buf/buf0lru.c'
--- a/storage/innobase/buf/buf0lru.c revid:norvald.ryeng@stripped
+++ b/storage/innobase/buf/buf0lru.c revid:inaam.rana@stripped
@@ -70,8 +70,12 @@ allowed to point to either end of the LR
/** When dropping the search hash index entries before deleting an ibd
file, we build a local array of pages belonging to that tablespace
-in the buffer pool. Following is the size of that array. */
-#define BUF_LRU_DROP_SEARCH_HASH_SIZE 1024
+in the buffer pool. Following is the size of that array.
+We also release buf_pool->mutex after scanning this many pages of the
+flush_list when dropping a table. This is to ensure that other threads
+are not blocked for extended period of time when using very large
+buffer pools. */
+#define BUF_LRU_DROP_SEARCH_SIZE 1024
/** If we switch on the InnoDB monitor because there are too few available
frames in the buffer pool, we set this to TRUE */
@@ -216,7 +220,7 @@ buf_LRU_drop_page_hash_batch(
ulint i;
ut_ad(arr != NULL);
- ut_ad(count <= BUF_LRU_DROP_SEARCH_HASH_SIZE);
+ ut_ad(count <= BUF_LRU_DROP_SEARCH_SIZE);
for (i = 0; i < count; ++i) {
btr_search_drop_page_hash_when_freed(space_id, zip_size,
@@ -250,7 +254,7 @@ buf_LRU_drop_page_hash_for_tablespace(
}
page_arr = ut_malloc(
- sizeof(ulint) * BUF_LRU_DROP_SEARCH_HASH_SIZE);
+ sizeof(ulint) * BUF_LRU_DROP_SEARCH_SIZE);
buf_pool_mutex_enter(buf_pool);
num_entries = 0;
@@ -289,10 +293,10 @@ next_page:
/* Store the page number so that we can drop the hash
index in a batch later. */
page_arr[num_entries] = bpage->offset;
- ut_a(num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE);
+ ut_a(num_entries < BUF_LRU_DROP_SEARCH_SIZE);
++num_entries;
- if (num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE) {
+ if (num_entries < BUF_LRU_DROP_SEARCH_SIZE) {
goto next_page;
}
@@ -337,38 +341,39 @@ next_page:
}
/******************************************************************//**
-Invalidates all pages belonging to a given tablespace inside a specific
+Remove all dirty pages belonging to a given tablespace inside a specific
buffer pool instance when we are deleting the data file(s) of that
-tablespace. */
+tablespace. The pages still remain a part of LRU and are evicted from
+the list as they age towards the tail of the LRU. */
static
void
-buf_LRU_invalidate_tablespace_buf_pool_instance(
-/*============================================*/
+buf_LRU_remove_dirty_pages_for_tablespace(
+/*======================================*/
buf_pool_t* buf_pool, /*!< buffer pool instance */
ulint id) /*!< in: space id */
{
buf_page_t* bpage;
ibool all_freed;
+ ulint i;
scan_again:
buf_pool_mutex_enter(buf_pool);
+ buf_flush_list_mutex_enter(buf_pool);
all_freed = TRUE;
- bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+ for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list), i = 0;
+ bpage != NULL; ++i) {
- while (bpage != NULL) {
buf_page_t* prev_bpage;
- ulint fold;
- rw_lock_t* hash_lock = NULL;
- mutex_t* block_mutex = NULL;
+ mutex_t* block_mutex = NULL;
ut_a(buf_page_in_file(bpage));
- prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
+ prev_bpage = UT_LIST_GET_PREV(list, bpage);
/* bpage->space and bpage->io_fix are protected by
- buf_pool->mutex and block_mutex. It is safe to check
+ buf_pool->mutex and block_mutex. It is safe to check
them while holding buf_pool->mutex only. */
if (buf_page_get_space(bpage) != id) {
@@ -382,95 +387,83 @@ scan_again:
all_freed = FALSE;
goto next_page;
- } else {
- fold = buf_page_address_fold(bpage->space,
- bpage->offset);
- hash_lock = buf_page_hash_lock_get(buf_pool, fold);
- block_mutex = buf_page_get_mutex(bpage);
-
- rw_lock_x_lock(hash_lock);
- mutex_enter(block_mutex);
-
- if (bpage->buf_fix_count > 0) {
+ }
- rw_lock_x_unlock(hash_lock);
- mutex_exit(block_mutex);
+ /* We have to release the flush_list_mutex to obey the
+ latching order. We are however guaranteed that the page
+ will stay in the flush_list because buf_flush_remove()
+ needs buf_pool->mutex as well. */
+ buf_flush_list_mutex_exit(buf_pool);
+ block_mutex = buf_page_get_mutex(bpage);
+ mutex_enter(block_mutex);
- /* We cannot remove this page during
- this scan yet; maybe the system is
- currently reading it in, or flushing
- the modifications to the file */
+ if (bpage->buf_fix_count > 0) {
+ mutex_exit(block_mutex);
+ buf_flush_list_mutex_enter(buf_pool);
- all_freed = FALSE;
- goto next_page;
- }
- }
+ /* We cannot remove this page during
+ this scan yet; maybe the system is
+ currently reading it in, or flushing
+ the modifications to the file */
- ut_ad(mutex_own(block_mutex));
-#ifdef UNIV_SYNC_DEBUG
- ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX));
-#endif /* UNIV_SYNC_DEBUG */
-
-#ifdef UNIV_DEBUG
- if (buf_debug_prints) {
- fprintf(stderr,
- "Dropping space %lu page %lu\n",
- (ulong) buf_page_get_space(bpage),
- (ulong) buf_page_get_page_no(bpage));
+ all_freed = FALSE;
+ goto next_page;
}
-#endif
- if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
- /* This is a compressed-only block
- descriptor. Do nothing. */
- } else if (((buf_block_t*) bpage)->is_hashed) {
- ulint page_no;
- ulint zip_size;
- buf_pool_mutex_exit(buf_pool);
+ ut_ad(bpage->oldest_modification != 0);
- zip_size = buf_page_get_zip_size(bpage);
- page_no = buf_page_get_page_no(bpage);
+ buf_flush_remove(bpage);
- rw_lock_x_unlock(hash_lock);
- mutex_exit(block_mutex);
-
- /* Note that the following call will acquire
- an S-latch on the page */
+ mutex_exit(block_mutex);
+ buf_flush_list_mutex_enter(buf_pool);
+next_page:
+ bpage = prev_bpage;
- btr_search_drop_page_hash_when_freed(
- id, zip_size, page_no);
- goto scan_again;
+ if (!bpage) {
+ break;
}
- if (bpage->oldest_modification != 0) {
+ /* Every BUF_LRU_DROP_SEARCH_SIZE iterations in the
+ loop we release buf_pool->mutex to let other threads
+ do their job. */
+ if (i < BUF_LRU_DROP_SEARCH_SIZE) {
+ continue;
+ }
- buf_flush_remove(bpage);
+ /* We IO-fix the block to make sure that the block
+ stays in its position in the flush_list. */
+ if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
+ /* Block is already IO-fixed. We don't
+ want to change the value. Lets leave
+ this block alone. */
+ continue;
}
- /* Remove from the LRU list. */
+ buf_flush_list_mutex_exit(buf_pool);
+ block_mutex = buf_page_get_mutex(bpage);
+ mutex_enter(block_mutex);
+ buf_page_set_sticky(bpage);
+ mutex_exit(block_mutex);
- if (buf_LRU_block_remove_hashed_page(bpage, TRUE)
- != BUF_BLOCK_ZIP_FREE) {
- buf_LRU_block_free_hashed_page((buf_block_t*)
- bpage);
- } else {
- /* The block_mutex should have been
- released by buf_LRU_block_remove_hashed_page() */
- ut_ad(block_mutex == &buf_pool->zip_mutex);
- }
+ /* Now it is safe to release the buf_pool->mutex. */
+ buf_pool_mutex_exit(buf_pool);
+ os_thread_yield();
+ buf_pool_mutex_enter(buf_pool);
- ut_ad(!mutex_own(block_mutex));
-#ifdef UNIV_SYNC_DEBUG
- ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX)
- && !rw_lock_own(hash_lock, RW_LOCK_SHARED));
-#endif /* UNIV_SYNC_DEBUG */
- ut_ad(!mutex_own(block_mutex));
-next_page:
- bpage = prev_bpage;
+ mutex_enter(block_mutex);
+ buf_page_unset_sticky(bpage);
+ mutex_exit(block_mutex);
+ buf_flush_list_mutex_enter(buf_pool);
+ ut_ad(bpage->in_flush_list);
+
+ i = 0;
}
buf_pool_mutex_exit(buf_pool);
+ buf_flush_list_mutex_exit(buf_pool);
+
+ ut_ad(buf_flush_validate(buf_pool));
if (!all_freed) {
os_thread_sleep(20000);
@@ -501,7 +494,7 @@ buf_LRU_invalidate_tablespace(
buf_pool = buf_pool_from_array(i);
buf_LRU_drop_page_hash_for_tablespace(buf_pool, id);
- buf_LRU_invalidate_tablespace_buf_pool_instance(buf_pool, id);
+ buf_LRU_remove_dirty_pages_for_tablespace(buf_pool, id);
}
}
@@ -1581,15 +1574,18 @@ func_exit:
bpage->zip.data = NULL;
page_zip_set_size(&bpage->zip, 0);
+ mutex_exit(block_mutex);
/* Prevent buf_page_get_gen() from
decompressing the block while we release
buf_pool->mutex and block_mutex. */
- b->buf_fix_count++;
- b->io_fix = BUF_IO_READ;
+ block_mutex = buf_page_get_mutex(b);
+ mutex_enter(block_mutex);
+ buf_page_set_sticky(b);
+ mutex_exit(block_mutex);
rw_lock_x_unlock(hash_lock);
- mutex_exit(block_mutex);
+
} else {
/* There can be multiple threads doing an LRU scan to
@@ -1600,11 +1596,9 @@ func_exit:
else considers this block as a victim for page
replacement. This block is already out of page_hash
and we are about to remove it from the LRU list and put
- it on the free list. To avoid this situation we set the
- buf_fix_count and io_fix fields here. */
+ it on the free list. */
mutex_enter(block_mutex);
- buf_block_buf_fix_inc((buf_block_t*) bpage, __FILE__, __LINE__);
- buf_page_set_io_fix(bpage, BUF_IO_READ);
+ buf_page_set_sticky(bpage);
mutex_exit(block_mutex);
}
@@ -1642,19 +1636,9 @@ func_exit:
buf_pool_mutex_enter(buf_pool);
- if (b) {
- mutex_enter(&buf_pool->zip_mutex);
- b->buf_fix_count--;
- buf_page_set_io_fix(b, BUF_IO_NONE);
- mutex_exit(&buf_pool->zip_mutex);
- } else {
- mutex_enter(block_mutex);
- ut_ad(bpage->buf_fix_count > 0);
- ut_ad(bpage->io_fix == BUF_IO_READ);
- buf_block_buf_fix_dec((buf_block_t*) bpage);
- buf_page_set_io_fix(bpage, BUF_IO_NONE);
- mutex_exit(block_mutex);
- }
+ mutex_enter(block_mutex);
+ buf_page_unset_sticky(b != NULL ? b : bpage);
+ mutex_exit(block_mutex);
buf_LRU_block_free_hashed_page((buf_block_t*) bpage);
return(TRUE);
=== modified file 'storage/innobase/handler/i_s.cc'
--- a/storage/innobase/handler/i_s.cc revid:norvald.ryeng@stripped
+++ b/storage/innobase/handler/i_s.cc revid:inaam.rana@stripped
@@ -3285,6 +3285,10 @@ i_s_innodb_buffer_page_fill(
OK(field_store_string(fields[IDX_BUFFER_PAGE_IO_FIX],
"IO_WRITE"));
break;
+ case BUF_IO_PIN:
+ OK(field_store_string(fields[IDX_BUFFER_PAGE_IO_FIX],
+ "IO_PIN"));
+ break;
}
OK(field_store_string(fields[IDX_BUFFER_PAGE_IS_OLD],
=== modified file 'storage/innobase/include/buf0buf.h'
--- a/storage/innobase/include/buf0buf.h revid:norvald.ryeng@stripped
+++ b/storage/innobase/include/buf0buf.h revid:inaam.rana@stripped
@@ -938,7 +938,27 @@ buf_block_set_io_fix(
/*=================*/
buf_block_t* block, /*!< in/out: control block */
enum buf_io_fix io_fix);/*!< in: io_fix state */
-
+/*********************************************************************//**
+Makes a block sticky. A sticky block implies that even after we release
+the buf_pool->mutex and the block->mutex:
+* it cannot be removed from the flush_list
+* the block descriptor cannot be relocated
+* it cannot be removed from the LRU list
+Note that:
+* the block can still change its position in the LRU list
+* the next and previous pointers can change. */
+UNIV_INLINE
+void
+buf_page_set_sticky(
+/*================*/
+ buf_page_t* bpage); /*!< in/out: control block */
+/*********************************************************************//**
+Removes stickiness of a block. */
+UNIV_INLINE
+void
+buf_page_unset_sticky(
+/*==================*/
+ buf_page_t* bpage); /*!< in/out: control block */
/********************************************************************//**
Determine if a buffer block can be relocated in memory. The block
can be dirty, but it must not be I/O-fixed or bufferfixed. */
=== modified file 'storage/innobase/include/buf0buf.ic'
--- a/storage/innobase/include/buf0buf.ic revid:norvald.ryeng@stripped
+++ b/storage/innobase/include/buf0buf.ic revid:inaam.rana@stripped
@@ -425,6 +425,7 @@ buf_page_get_io_fix(
case BUF_IO_NONE:
case BUF_IO_READ:
case BUF_IO_WRITE:
+ case BUF_IO_PIN:
return(io_fix);
}
ut_error;
@@ -475,6 +476,49 @@ buf_block_set_io_fix(
buf_page_set_io_fix(&block->page, io_fix);
}
+/*********************************************************************//**
+Makes a block sticky. A sticky block implies that even after we release
+the buf_pool->mutex and the block->mutex:
+* it cannot be removed from the flush_list
+* the block descriptor cannot be relocated
+* it cannot be removed from the LRU list
+Note that:
+* the block can still change its position in the LRU list
+* the next and previous pointers can change. */
+UNIV_INLINE
+void
+buf_page_set_sticky(
+/*================*/
+ buf_page_t* bpage) /*!< in/out: control block */
+{
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+#endif
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+ ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
+
+ bpage->io_fix = BUF_IO_PIN;
+}
+
+/*********************************************************************//**
+Removes stickiness of a block. */
+UNIV_INLINE
+void
+buf_page_unset_sticky(
+/*==================*/
+ buf_page_t* bpage) /*!< in/out: control block */
+{
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+#endif
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+ ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_PIN);
+
+ bpage->io_fix = BUF_IO_NONE;
+}
+
/********************************************************************//**
Determine if a buffer block can be relocated in memory. The block
can be dirty, but it must not be I/O-fixed or bufferfixed. */
=== modified file 'storage/innobase/include/buf0types.h'
--- a/storage/innobase/include/buf0types.h revid:norvald.ryeng@stripped
+++ b/storage/innobase/include/buf0types.h revid:inaam.rana@stripped
@@ -56,7 +56,10 @@ enum buf_flush {
enum buf_io_fix {
BUF_IO_NONE = 0, /**< no pending I/O */
BUF_IO_READ, /**< read pending */
- BUF_IO_WRITE /**< write pending */
+ BUF_IO_WRITE, /**< write pending */
+ BUF_IO_PIN /**< disallow relocation of
+ block and its removal of from
+ the flush_list */
};
/** Alternatives for srv_checksum_algorithm, which can be changed by
Attachment: [text/bzr-bundle] bzr/inaam.rana@oracle.com-20111003175354-4ww5itiev1sfq7md.bundle
| Thread |
|---|
| • bzr push into mysql-trunk branch (inaam.rana:3471 to 3472) Bug#11759044 | Inaam Rana | 5 Oct |