#At file:///home/inaam/w/buf_io_fix/ based on revid:norvald.ryeng@stripped
3472 Inaam Rana 2011-10-03
Bug#11759044 - 51325: DROPPING AN EMPTY INNODB TABLE TAKES A LONG TIME
WITH LARGE BUFFER POOL
rb://767
approved by: Marko
When dropping a table (with an .ibd file i.e.: with
innodb_file_per_table set) we scan entire LRU to invalidate pages from
that table. This can be painful in case of large buffer pools as we hold
the buf_pool->mutex for the scan. Note that gravity of the problem does
not depend on the size of the table. Even with an empty table but a
large and filled up buffer pool we'll end up scanning a very long LRU
list.
The fix is to scan flush_list and just remove the blocks belonging to
the table from the flush_list, marking them as non-dirty. The blocks
are left in the LRU list for eventual eviction due to aging. The
flush_list is typically much smaller than the LRU list but for cases
where it is very long we have the solution of releasing the
buf_pool->mutex after scanning 1K pages.
buf_page_[set|unset]_sticky(): Use new IO-state BUF_IO_PIN to ensure
that a block stays in the flush_list and LRU list when we release
buf_pool->mutex. Previously we have been abusing BUF_IO_READ to achieve
this.
modified:
storage/innobase/buf/buf0buf.c
storage/innobase/buf/buf0lru.c
storage/innobase/handler/i_s.cc
storage/innobase/include/buf0buf.h
storage/innobase/include/buf0buf.ic
storage/innobase/include/buf0types.h
=== modified file 'storage/innobase/buf/buf0buf.c'
--- a/storage/innobase/buf/buf0buf.c revid:norvald.ryeng@stripped
+++ b/storage/innobase/buf/buf0buf.c revid:inaam.rana@stripped
@@ -4255,6 +4255,9 @@ assert_s_latched:
ut_a(rw_lock_is_locked(&block->lock,
RW_LOCK_EX));
break;
+
+ case BUF_IO_PIN:
+ break;
}
n_lru++;
@@ -4284,6 +4287,7 @@ assert_s_latched:
ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
switch (buf_page_get_io_fix(b)) {
case BUF_IO_NONE:
+ case BUF_IO_PIN:
/* All clean blocks should be I/O-unfixed. */
break;
case BUF_IO_READ:
@@ -4324,6 +4328,7 @@ assert_s_latched:
switch (buf_page_get_io_fix(b)) {
case BUF_IO_NONE:
case BUF_IO_READ:
+ case BUF_IO_PIN:
break;
case BUF_IO_WRITE:
switch (buf_page_get_flush_type(b)) {
=== modified file 'storage/innobase/buf/buf0lru.c'
--- a/storage/innobase/buf/buf0lru.c revid:norvald.ryeng@stripped
+++ b/storage/innobase/buf/buf0lru.c revid:inaam.rana@stripped
@@ -70,8 +70,12 @@ allowed to point to either end of the LR
/** When dropping the search hash index entries before deleting an ibd
file, we build a local array of pages belonging to that tablespace
-in the buffer pool. Following is the size of that array. */
-#define BUF_LRU_DROP_SEARCH_HASH_SIZE 1024
+in the buffer pool. Following is the size of that array.
+We also release buf_pool->mutex after scanning this many pages of the
+flush_list when dropping a table. This is to ensure that other threads
+are not blocked for extended period of time when using very large
+buffer pools. */
+#define BUF_LRU_DROP_SEARCH_SIZE 1024
/** If we switch on the InnoDB monitor because there are too few available
frames in the buffer pool, we set this to TRUE */
@@ -216,7 +220,7 @@ buf_LRU_drop_page_hash_batch(
ulint i;
ut_ad(arr != NULL);
- ut_ad(count <= BUF_LRU_DROP_SEARCH_HASH_SIZE);
+ ut_ad(count <= BUF_LRU_DROP_SEARCH_SIZE);
for (i = 0; i < count; ++i) {
btr_search_drop_page_hash_when_freed(space_id, zip_size,
@@ -250,7 +254,7 @@ buf_LRU_drop_page_hash_for_tablespace(
}
page_arr = ut_malloc(
- sizeof(ulint) * BUF_LRU_DROP_SEARCH_HASH_SIZE);
+ sizeof(ulint) * BUF_LRU_DROP_SEARCH_SIZE);
buf_pool_mutex_enter(buf_pool);
num_entries = 0;
@@ -289,10 +293,10 @@ next_page:
/* Store the page number so that we can drop the hash
index in a batch later. */
page_arr[num_entries] = bpage->offset;
- ut_a(num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE);
+ ut_a(num_entries < BUF_LRU_DROP_SEARCH_SIZE);
++num_entries;
- if (num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE) {
+ if (num_entries < BUF_LRU_DROP_SEARCH_SIZE) {
goto next_page;
}
@@ -337,38 +341,39 @@ next_page:
}
/******************************************************************//**
-Invalidates all pages belonging to a given tablespace inside a specific
+Remove all dirty pages belonging to a given tablespace inside a specific
buffer pool instance when we are deleting the data file(s) of that
-tablespace. */
+tablespace. The pages still remain a part of LRU and are evicted from
+the list as they age towards the tail of the LRU. */
static
void
-buf_LRU_invalidate_tablespace_buf_pool_instance(
-/*============================================*/
+buf_LRU_remove_dirty_pages_for_tablespace(
+/*======================================*/
buf_pool_t* buf_pool, /*!< buffer pool instance */
ulint id) /*!< in: space id */
{
buf_page_t* bpage;
ibool all_freed;
+ ulint i;
scan_again:
buf_pool_mutex_enter(buf_pool);
+ buf_flush_list_mutex_enter(buf_pool);
all_freed = TRUE;
- bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+ for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list), i = 0;
+ bpage != NULL; ++i) {
- while (bpage != NULL) {
buf_page_t* prev_bpage;
- ulint fold;
- rw_lock_t* hash_lock = NULL;
- mutex_t* block_mutex = NULL;
+ mutex_t* block_mutex = NULL;
ut_a(buf_page_in_file(bpage));
- prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
+ prev_bpage = UT_LIST_GET_PREV(list, bpage);
/* bpage->space and bpage->io_fix are protected by
- buf_pool->mutex and block_mutex. It is safe to check
+ buf_pool->mutex and block_mutex. It is safe to check
them while holding buf_pool->mutex only. */
if (buf_page_get_space(bpage) != id) {
@@ -382,95 +387,83 @@ scan_again:
all_freed = FALSE;
goto next_page;
- } else {
- fold = buf_page_address_fold(bpage->space,
- bpage->offset);
- hash_lock = buf_page_hash_lock_get(buf_pool, fold);
- block_mutex = buf_page_get_mutex(bpage);
-
- rw_lock_x_lock(hash_lock);
- mutex_enter(block_mutex);
-
- if (bpage->buf_fix_count > 0) {
+ }
- rw_lock_x_unlock(hash_lock);
- mutex_exit(block_mutex);
+ /* We have to release the flush_list_mutex to obey the
+ latching order. We are however guaranteed that the page
+ will stay in the flush_list because buf_flush_remove()
+ needs buf_pool->mutex as well. */
+ buf_flush_list_mutex_exit(buf_pool);
+ block_mutex = buf_page_get_mutex(bpage);
+ mutex_enter(block_mutex);
- /* We cannot remove this page during
- this scan yet; maybe the system is
- currently reading it in, or flushing
- the modifications to the file */
+ if (bpage->buf_fix_count > 0) {
+ mutex_exit(block_mutex);
+ buf_flush_list_mutex_enter(buf_pool);
- all_freed = FALSE;
- goto next_page;
- }
- }
+ /* We cannot remove this page during
+ this scan yet; maybe the system is
+ currently reading it in, or flushing
+ the modifications to the file */
- ut_ad(mutex_own(block_mutex));
-#ifdef UNIV_SYNC_DEBUG
- ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX));
-#endif /* UNIV_SYNC_DEBUG */
-
-#ifdef UNIV_DEBUG
- if (buf_debug_prints) {
- fprintf(stderr,
- "Dropping space %lu page %lu\n",
- (ulong) buf_page_get_space(bpage),
- (ulong) buf_page_get_page_no(bpage));
+ all_freed = FALSE;
+ goto next_page;
}
-#endif
- if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
- /* This is a compressed-only block
- descriptor. Do nothing. */
- } else if (((buf_block_t*) bpage)->is_hashed) {
- ulint page_no;
- ulint zip_size;
- buf_pool_mutex_exit(buf_pool);
+ ut_ad(bpage->oldest_modification != 0);
- zip_size = buf_page_get_zip_size(bpage);
- page_no = buf_page_get_page_no(bpage);
+ buf_flush_remove(bpage);
- rw_lock_x_unlock(hash_lock);
- mutex_exit(block_mutex);
-
- /* Note that the following call will acquire
- an S-latch on the page */
+ mutex_exit(block_mutex);
+ buf_flush_list_mutex_enter(buf_pool);
+next_page:
+ bpage = prev_bpage;
- btr_search_drop_page_hash_when_freed(
- id, zip_size, page_no);
- goto scan_again;
+ if (!bpage) {
+ break;
}
- if (bpage->oldest_modification != 0) {
+ /* Every BUF_LRU_DROP_SEARCH_SIZE iterations in the
+ loop we release buf_pool->mutex to let other threads
+ do their job. */
+ if (i < BUF_LRU_DROP_SEARCH_SIZE) {
+ continue;
+ }
- buf_flush_remove(bpage);
+ /* We IO-fix the block to make sure that the block
+ stays in its position in the flush_list. */
+ if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
+ /* Block is already IO-fixed. We don't
+ want to change the value. Lets leave
+ this block alone. */
+ continue;
}
- /* Remove from the LRU list. */
+ buf_flush_list_mutex_exit(buf_pool);
+ block_mutex = buf_page_get_mutex(bpage);
+ mutex_enter(block_mutex);
+ buf_page_set_sticky(bpage);
+ mutex_exit(block_mutex);
- if (buf_LRU_block_remove_hashed_page(bpage, TRUE)
- != BUF_BLOCK_ZIP_FREE) {
- buf_LRU_block_free_hashed_page((buf_block_t*)
- bpage);
- } else {
- /* The block_mutex should have been
- released by buf_LRU_block_remove_hashed_page() */
- ut_ad(block_mutex == &buf_pool->zip_mutex);
- }
+ /* Now it is safe to release the buf_pool->mutex. */
+ buf_pool_mutex_exit(buf_pool);
+ os_thread_yield();
+ buf_pool_mutex_enter(buf_pool);
- ut_ad(!mutex_own(block_mutex));
-#ifdef UNIV_SYNC_DEBUG
- ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX)
- && !rw_lock_own(hash_lock, RW_LOCK_SHARED));
-#endif /* UNIV_SYNC_DEBUG */
- ut_ad(!mutex_own(block_mutex));
-next_page:
- bpage = prev_bpage;
+ mutex_enter(block_mutex);
+ buf_page_unset_sticky(bpage);
+ mutex_exit(block_mutex);
+ buf_flush_list_mutex_enter(buf_pool);
+ ut_ad(bpage->in_flush_list);
+
+ i = 0;
}
buf_pool_mutex_exit(buf_pool);
+ buf_flush_list_mutex_exit(buf_pool);
+
+ ut_ad(buf_flush_validate(buf_pool));
if (!all_freed) {
os_thread_sleep(20000);
@@ -501,7 +494,7 @@ buf_LRU_invalidate_tablespace(
buf_pool = buf_pool_from_array(i);
buf_LRU_drop_page_hash_for_tablespace(buf_pool, id);
- buf_LRU_invalidate_tablespace_buf_pool_instance(buf_pool, id);
+ buf_LRU_remove_dirty_pages_for_tablespace(buf_pool, id);
}
}
@@ -1581,15 +1574,18 @@ func_exit:
bpage->zip.data = NULL;
page_zip_set_size(&bpage->zip, 0);
+ mutex_exit(block_mutex);
/* Prevent buf_page_get_gen() from
decompressing the block while we release
buf_pool->mutex and block_mutex. */
- b->buf_fix_count++;
- b->io_fix = BUF_IO_READ;
+ block_mutex = buf_page_get_mutex(b);
+ mutex_enter(block_mutex);
+ buf_page_set_sticky(b);
+ mutex_exit(block_mutex);
rw_lock_x_unlock(hash_lock);
- mutex_exit(block_mutex);
+
} else {
/* There can be multiple threads doing an LRU scan to
@@ -1600,11 +1596,9 @@ func_exit:
else considers this block as a victim for page
replacement. This block is already out of page_hash
and we are about to remove it from the LRU list and put
- it on the free list. To avoid this situation we set the
- buf_fix_count and io_fix fields here. */
+ it on the free list. */
mutex_enter(block_mutex);
- buf_block_buf_fix_inc((buf_block_t*) bpage, __FILE__, __LINE__);
- buf_page_set_io_fix(bpage, BUF_IO_READ);
+ buf_page_set_sticky(bpage);
mutex_exit(block_mutex);
}
@@ -1642,19 +1636,9 @@ func_exit:
buf_pool_mutex_enter(buf_pool);
- if (b) {
- mutex_enter(&buf_pool->zip_mutex);
- b->buf_fix_count--;
- buf_page_set_io_fix(b, BUF_IO_NONE);
- mutex_exit(&buf_pool->zip_mutex);
- } else {
- mutex_enter(block_mutex);
- ut_ad(bpage->buf_fix_count > 0);
- ut_ad(bpage->io_fix == BUF_IO_READ);
- buf_block_buf_fix_dec((buf_block_t*) bpage);
- buf_page_set_io_fix(bpage, BUF_IO_NONE);
- mutex_exit(block_mutex);
- }
+ mutex_enter(block_mutex);
+ buf_page_unset_sticky(b != NULL ? b : bpage);
+ mutex_exit(block_mutex);
buf_LRU_block_free_hashed_page((buf_block_t*) bpage);
return(TRUE);
=== modified file 'storage/innobase/handler/i_s.cc'
--- a/storage/innobase/handler/i_s.cc revid:norvald.ryeng@stripped
+++ b/storage/innobase/handler/i_s.cc revid:inaam.rana@stripped
@@ -3285,6 +3285,10 @@ i_s_innodb_buffer_page_fill(
OK(field_store_string(fields[IDX_BUFFER_PAGE_IO_FIX],
"IO_WRITE"));
break;
+ case BUF_IO_PIN:
+ OK(field_store_string(fields[IDX_BUFFER_PAGE_IO_FIX],
+ "IO_PIN"));
+ break;
}
OK(field_store_string(fields[IDX_BUFFER_PAGE_IS_OLD],
=== modified file 'storage/innobase/include/buf0buf.h'
--- a/storage/innobase/include/buf0buf.h revid:norvald.ryeng@stripped
+++ b/storage/innobase/include/buf0buf.h revid:inaam.rana@stripped
@@ -938,7 +938,27 @@ buf_block_set_io_fix(
/*=================*/
buf_block_t* block, /*!< in/out: control block */
enum buf_io_fix io_fix);/*!< in: io_fix state */
-
+/*********************************************************************//**
+Makes a block sticky. A sticky block implies that even after we release
+the buf_pool->mutex and the block->mutex:
+* it cannot be removed from the flush_list
+* the block descriptor cannot be relocated
+* it cannot be removed from the LRU list
+Note that:
+* the block can still change its position in the LRU list
+* the next and previous pointers can change. */
+UNIV_INLINE
+void
+buf_page_set_sticky(
+/*================*/
+ buf_page_t* bpage); /*!< in/out: control block */
+/*********************************************************************//**
+Removes stickiness of a block. */
+UNIV_INLINE
+void
+buf_page_unset_sticky(
+/*==================*/
+ buf_page_t* bpage); /*!< in/out: control block */
/********************************************************************//**
Determine if a buffer block can be relocated in memory. The block
can be dirty, but it must not be I/O-fixed or bufferfixed. */
=== modified file 'storage/innobase/include/buf0buf.ic'
--- a/storage/innobase/include/buf0buf.ic revid:norvald.ryeng@stripped
+++ b/storage/innobase/include/buf0buf.ic revid:inaam.rana@stripped
@@ -425,6 +425,7 @@ buf_page_get_io_fix(
case BUF_IO_NONE:
case BUF_IO_READ:
case BUF_IO_WRITE:
+ case BUF_IO_PIN:
return(io_fix);
}
ut_error;
@@ -475,6 +476,49 @@ buf_block_set_io_fix(
buf_page_set_io_fix(&block->page, io_fix);
}
+/*********************************************************************//**
+Makes a block sticky. A sticky block implies that even after we release
+the buf_pool->mutex and the block->mutex:
+* it cannot be removed from the flush_list
+* the block descriptor cannot be relocated
+* it cannot be removed from the LRU list
+Note that:
+* the block can still change its position in the LRU list
+* the next and previous pointers can change. */
+UNIV_INLINE
+void
+buf_page_set_sticky(
+/*================*/
+ buf_page_t* bpage) /*!< in/out: control block */
+{
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+#endif
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+ ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
+
+ bpage->io_fix = BUF_IO_PIN;
+}
+
+/*********************************************************************//**
+Removes stickiness of a block. */
+UNIV_INLINE
+void
+buf_page_unset_sticky(
+/*==================*/
+ buf_page_t* bpage) /*!< in/out: control block */
+{
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+#endif
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+ ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_PIN);
+
+ bpage->io_fix = BUF_IO_NONE;
+}
+
/********************************************************************//**
Determine if a buffer block can be relocated in memory. The block
can be dirty, but it must not be I/O-fixed or bufferfixed. */
=== modified file 'storage/innobase/include/buf0types.h'
--- a/storage/innobase/include/buf0types.h revid:norvald.ryeng@stripped
+++ b/storage/innobase/include/buf0types.h revid:inaam.rana@stripped
@@ -56,7 +56,10 @@ enum buf_flush {
enum buf_io_fix {
BUF_IO_NONE = 0, /**< no pending I/O */
BUF_IO_READ, /**< read pending */
- BUF_IO_WRITE /**< write pending */
+ BUF_IO_WRITE, /**< write pending */
+ BUF_IO_PIN /**< disallow relocation of
+ block and its removal of from
+ the flush_list */
};
/** Alternatives for srv_checksum_algorithm, which can be changed by
Attachment: [text/bzr-bundle] bzr/inaam.rana@oracle.com-20111003175354-4ww5itiev1sfq7md.bundle
| Thread |
|---|
| • bzr commit into mysql-trunk branch (inaam.rana:3472) Bug#11759044 | Inaam Rana | 5 Oct |