From: Date: May 9 2009 5:16pm Subject: bzr commit into mysql-5.1 branch (mikael:2847) List-Archive: http://lists.mysql.com/commits/73739 Message-Id: <200905091516.n49FGaxt008052@dator6> MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit #At file:///home/mikael/mysql_clones/buf_flush_list_mutex/ 2847 Mikael Ronstrom 2009-05-09 Broke out BUF_FLUSH_LIST from buffer pool mutex and thus decreased pressure on both buffer pool mutex and log_sys mutex modified: storage/innobase/buf/buf0buf.c storage/innobase/buf/buf0flu.c storage/innobase/buf/buf0lru.c storage/innobase/include/buf0buf.h storage/innobase/include/buf0buf.ic storage/innobase/include/buf0flu.ic storage/innobase/include/sync0sync.h storage/innobase/mtr/mtr0mtr.c storage/innobase/sync/sync0sync.c === modified file 'storage/innobase/buf/buf0buf.c' --- a/storage/innobase/buf/buf0buf.c 2008-10-15 18:54:18 +0000 +++ b/storage/innobase/buf/buf0buf.c 2009-05-09 15:13:22 +0000 @@ -86,6 +86,14 @@ create a separate mutex for the page has accessing the hash table takes 2 microseconds, about half of the total buf_pool mutex hold time. +We have added a mutex protecting only the flush_list, in order to allow +some operations (buf_flush_note_modification, buf_get_oldest_modification) +to be performed without acquiring the buffer pool mutex. This reduces +log_sys mutex contention, since the log_sys holder no longer has to wait +for the buffer pool mutex. This new mutex must be acquired after the +buffer pool mutex, AND after the block->mutex protecting the meta-data +(control block) for an individual buffer pool page. + Control blocks -------------- @@ -724,7 +732,11 @@ buf_pool_init( /* 2. Initialize flushing fields ---------------------------- */ + mutex_create(&buf_pool->flush_list_mutex, SYNC_BUF_FLUSH_LIST); + + mutex_enter(&(buf_pool->flush_list_mutex)); UT_LIST_INIT(buf_pool->flush_list); + mutex_exit(&(buf_pool->flush_list_mutex)); for (i = BUF_FLUSH_LRU; i <= BUF_FLUSH_LIST; i++) { buf_pool->n_flush[i] = 0; @@ -2147,6 +2159,8 @@ buf_validate(void) mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->flush_list_mutex)); + for (i = 0; i < buf_pool->curr_size; i++) { block = buf_pool_get_nth_block(buf_pool, i); @@ -2221,6 +2235,7 @@ buf_validate(void) ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush); mutex_exit(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->flush_list_mutex)); ut_a(buf_LRU_validate()); ut_a(buf_flush_validate()); === modified file 'storage/innobase/buf/buf0flu.c' --- a/storage/innobase/buf/buf0flu.c 2008-02-01 10:55:39 +0000 +++ b/storage/innobase/buf/buf0flu.c 2009-05-09 15:13:22 +0000 @@ -48,7 +48,8 @@ buf_flush_insert_into_flush_list( /*=============================*/ buf_block_t* block) /* in: block which is modified */ { - ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(mutex_own(&block->mutex)); + ut_ad(mutex_own(&(buf_pool->flush_list_mutex))); ut_a(block->state == BUF_BLOCK_FILE_PAGE); ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL) @@ -74,11 +75,17 @@ buf_flush_insert_sorted_into_flush_list( buf_block_t* prev_b; buf_block_t* b; - ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(mutex_own(&block->mutex)); + ut_ad(mutex_own(&(buf_pool->flush_list_mutex))); prev_b = NULL; b = UT_LIST_GET_FIRST(buf_pool->flush_list); + /* These reads of oldest_modification are not protected by b->mutex: + This should be safe, since oldest_modifcation for flush list blocks + can only change when the block is flushed, and thus removed from the + flush list, which requires holding the flush_list_mutex. */ + while (b && (ut_dulint_cmp(b->oldest_modification, block->oldest_modification) > 0)) { prev_b = b; @@ -174,15 +181,23 @@ buf_flush_write_complete( ut_ad(block); #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(mutex_own(&block->mutex)); #endif /* UNIV_SYNC_DEBUG */ ut_a(block->state == BUF_BLOCK_FILE_PAGE); + /* Get flush_list_mutex before updating oldest_modification to + ensure the flush list holds exactly those blocks with non-zero + oldest_modification. */ + mutex_enter(&(buf_pool->flush_list_mutex)); + block->oldest_modification = ut_dulint_zero; UT_LIST_REMOVE(flush_list, buf_pool->flush_list, block); ut_d(UT_LIST_VALIDATE(flush_list, buf_block_t, buf_pool->flush_list)); + mutex_exit(&(buf_pool->flush_list_mutex)); + (buf_pool->n_flush[block->flush_type])--; if (block->flush_type == BUF_FLUSH_LRU) { @@ -842,6 +857,7 @@ buf_flush_batch( ulint space; ulint offset; ibool found; + ulint remaining = 0; ut_ad((flush_type == BUF_FLUSH_LRU) || (flush_type == BUF_FLUSH_LIST)); @@ -878,7 +894,11 @@ buf_flush_batch( } else { ut_ad(flush_type == BUF_FLUSH_LIST); + mutex_enter(&(buf_pool->flush_list_mutex)); + remaining = UT_LIST_GET_LEN(buf_pool->flush_list); block = UT_LIST_GET_LAST(buf_pool->flush_list); + mutex_exit(&(buf_pool->flush_list_mutex)); + if (!block || (ut_dulint_cmp(block->oldest_modification, lsn_limit) >= 0)) { @@ -931,14 +951,17 @@ buf_flush_batch( ut_ad(flush_type == BUF_FLUSH_LIST); mutex_exit(&block->mutex); - + mutex_enter(&(buf_pool->flush_list_mutex)); block = UT_LIST_GET_PREV(flush_list, block); + mutex_exit(&(buf_pool->flush_list_mutex)); + remaining--; } } - /* If we could not find anything to flush, leave the loop */ - - if (!found) { + /* If we could not find anything to flush, leave the loop. + If remaining > 0, the list changed during our loop, so we + may want to try again. */ + if (!found && !remaining) { break; } } @@ -1075,11 +1098,21 @@ buf_flush_validate_low(void) buf_block_t* block; dulint om; +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&buf_pool->flush_list_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + UT_LIST_VALIDATE(flush_list, buf_block_t, buf_pool->flush_list); block = UT_LIST_GET_FIRST(buf_pool->flush_list); while (block != NULL) { + /* oldest_modification can't change here, since the block + cannot be removed from the flush_list while we hold the + flush list mutex (and removal is the only way to change the + oldest_modification of a block in the flush list). + Therefore, we do not need the block->mutex. */ + om = block->oldest_modification; ut_a(block->state == BUF_BLOCK_FILE_PAGE); ut_a(ut_dulint_cmp(om, ut_dulint_zero) > 0); @@ -1105,11 +1138,11 @@ buf_flush_validate(void) { ibool ret; - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(buf_pool->flush_list_mutex)); ret = buf_flush_validate_low(); - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&(buf_pool->flush_list_mutex)); return(ret); } === modified file 'storage/innobase/buf/buf0lru.c' --- a/storage/innobase/buf/buf0lru.c 2008-12-14 20:47:17 +0000 +++ b/storage/innobase/buf/buf0lru.c 2009-05-09 15:13:22 +0000 @@ -261,10 +261,12 @@ scan_again: /* Remove from the flush list of modified blocks */ + mutex_enter(&(buf_pool->flush_list_mutex)); block->oldest_modification = ut_dulint_zero; UT_LIST_REMOVE(flush_list, buf_pool->flush_list, block); + mutex_exit(&(buf_pool->flush_list_mutex)); } /* Remove from the LRU list */ === modified file 'storage/innobase/include/buf0buf.h' --- a/storage/innobase/include/buf0buf.h 2008-08-20 00:37:41 +0000 +++ b/storage/innobase/include/buf0buf.h 2009-05-09 15:13:22 +0000 @@ -955,6 +955,8 @@ struct buf_pool_struct{ ulint n_pages_awe_remapped_old; /* 2. Page flushing algorithm fields */ + mutex_t flush_list_mutex;/* mutex protecting the + flush_list */ UT_LIST_BASE_NODE_T(buf_block_t) flush_list; /* base node of the modified block list */ === modified file 'storage/innobase/include/buf0buf.ic' --- a/storage/innobase/include/buf0buf.ic 2008-10-15 18:54:18 +0000 +++ b/storage/innobase/include/buf0buf.ic 2009-05-09 15:13:22 +0000 @@ -104,19 +104,32 @@ buf_pool_get_oldest_modification(void) buf_block_t* block; dulint lsn; - mutex_enter(&(buf_pool->mutex)); +try_again: + + mutex_enter(&(buf_pool->flush_list_mutex)); block = UT_LIST_GET_LAST(buf_pool->flush_list); + mutex_exit(&(buf_pool->flush_list_mutex)); + if (block == NULL) { lsn = ut_dulint_zero; } else { lsn = block->oldest_modification; - } + /* If lsn is zero, than the block may have been removed + from the flush_list since we released flush_list mutex. + We try again to get a more accurate oldest_modification.*/ + if(ut_dulint_cmp(lsn, ut_dulint_zero) == 0) { + /* TODO: should determine the frequency of this */ + goto try_again; + } - mutex_exit(&(buf_pool->mutex)); + } return(lsn); + + /* The returned answer may be out of date: the flush_list can + change after mutex release and before the function returns. */ } /*********************************************************************** @@ -623,12 +636,6 @@ buf_page_release( ut_a(block->state == BUF_BLOCK_FILE_PAGE); ut_a(block->buf_fix_count > 0); - if (rw_latch == RW_X_LATCH && mtr->modifications) { - mutex_enter(&buf_pool->mutex); - buf_flush_note_modification(block, mtr); - mutex_exit(&buf_pool->mutex); - } - mutex_enter(&block->mutex); #ifdef UNIV_SYNC_DEBUG === modified file 'storage/innobase/include/buf0flu.ic' --- a/storage/innobase/include/buf0flu.ic 2007-03-22 21:59:35 +0000 +++ b/storage/innobase/include/buf0flu.ic 2009-05-09 15:13:22 +0000 @@ -43,25 +43,37 @@ buf_flush_note_modification( #ifdef UNIV_SYNC_DEBUG ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ - ut_ad(mutex_own(&(buf_pool->mutex))); ut_ad(ut_dulint_cmp(mtr->start_lsn, ut_dulint_zero) != 0); ut_ad(mtr->modifications); + + mutex_enter(&block->mutex); + ut_ad(ut_dulint_cmp(block->newest_modification, mtr->end_lsn) <= 0); block->newest_modification = mtr->end_lsn; if (ut_dulint_is_zero(block->oldest_modification)) { + /* Get flush_list_mutex before updating oldest_modification to + ensure the flush list holds exactly those blocks with non-zero + oldest_modification. */ + + mutex_enter(&buf_pool->flush_list_mutex); block->oldest_modification = mtr->start_lsn; ut_ad(!ut_dulint_is_zero(block->oldest_modification)); buf_flush_insert_into_flush_list(block); + + mutex_exit(&buf_pool->flush_list_mutex); + } else { ut_ad(ut_dulint_cmp(block->oldest_modification, mtr->start_lsn) <= 0); } + mutex_exit(&block->mutex); + ++srv_buf_pool_write_requests; } @@ -84,23 +96,30 @@ buf_flush_recv_note_modification( ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&block->mutex); ut_ad(ut_dulint_cmp(block->newest_modification, end_lsn) <= 0); block->newest_modification = end_lsn; if (ut_dulint_is_zero(block->oldest_modification)) { + /* Get flush_list_mutex before updating oldest_modification to + ensure the flush list holds exactly those blocks with non-zero + oldest_modification. */ + + mutex_enter(&buf_pool->flush_list_mutex); block->oldest_modification = start_lsn; ut_ad(!ut_dulint_is_zero(block->oldest_modification)); buf_flush_insert_sorted_into_flush_list(block); + + mutex_exit(&buf_pool->flush_list_mutex); } else { ut_ad(ut_dulint_cmp(block->oldest_modification, start_lsn) <= 0); } - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&block->mutex); } === modified file 'storage/innobase/include/sync0sync.h' --- a/storage/innobase/include/sync0sync.h 2008-12-04 10:57:56 +0000 +++ b/storage/innobase/include/sync0sync.h 2009-05-09 15:13:22 +0000 @@ -454,6 +454,7 @@ or row lock! */ the level is SYNC_MEM_HASH. */ #define SYNC_BUF_POOL 150 #define SYNC_BUF_BLOCK 149 +#define SYNC_BUF_FLUSH_LIST 145 #define SYNC_DOUBLEWRITE 140 #define SYNC_ANY_LATCH 135 #define SYNC_THR_LOCAL 133 === modified file 'storage/innobase/mtr/mtr0mtr.c' --- a/storage/innobase/mtr/mtr0mtr.c 2007-07-10 14:34:21 +0000 +++ b/storage/innobase/mtr/mtr0mtr.c 2009-05-09 15:13:22 +0000 @@ -13,6 +13,7 @@ Created 11/26/1995 Heikki Tuuri #endif #include "buf0buf.h" +#include "buf0flu.h" #include "page0types.h" #include "mtr0log.h" #include "log0log.h" @@ -103,6 +104,40 @@ mtr_memo_pop_all( } } +/************************************************************** +Updates the lsn for buffer pages modified by this mtr. Pages will be +added to the buf_pool->flush_list as well, but locks and bufferfix will +not be released until after log_sys->mutex is released. */ +UNIV_INLINE +void +mtr_memo_note_modification( +/*=============*/ + mtr_t* mtr) /* in: mtr */ +{ + mtr_memo_slot_t* slot; + dyn_array_t* memo; + ulint offset; + + ut_ad(mtr); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_COMMITTING); /* Currently only used in + commit */ + ut_ad(mtr->modifications); + + memo = &(mtr->memo); + + offset = dyn_array_get_data_size(memo); + while (offset > 0) { + offset -= sizeof(mtr_memo_slot_t); + slot = dyn_array_get_element(memo, offset); + if (UNIV_LIKELY(slot->object != NULL) && + slot->type == MTR_MEMO_PAGE_X_FIX) { + buf_flush_note_modification( + (buf_block_t*)slot->object, mtr); + } + } +} + /**************************************************************** Writes the contents of a mini-transaction log, if any, to the database log. */ static @@ -178,7 +213,6 @@ mtr_commit( #endif if (mtr->modifications) { mtr_log_reserve_and_write(mtr); - } /* We first update the modification info to buffer pages, and only after that release the log mutex: this guarantees that when the log @@ -188,12 +222,14 @@ mtr_commit( required when we insert modified buffer pages in to the flush list which must be sorted on oldest_modification. */ - mtr_memo_pop_all(mtr); - - if (mtr->modifications) { + mtr_memo_note_modification(mtr); log_release(); } + + /* All unlocking has been moved here, after log_sys mutex release. */ + mtr_memo_pop_all(mtr); + #ifdef UNIV_DEBUG mtr->state = MTR_COMMITTED; #endif @@ -263,7 +299,12 @@ mtr_memo_release( slot = dyn_array_get_element(memo, offset); if ((object == slot->object) && (type == slot->type)) { - + if (mtr->modifications && + UNIV_LIKELY(slot->object != NULL) && + slot->type == MTR_MEMO_PAGE_X_FIX) { + buf_flush_note_modification( + (buf_block_t*)slot->object, mtr); + } mtr_memo_slot_release(mtr, slot); break; === modified file 'storage/innobase/sync/sync0sync.c' --- a/storage/innobase/sync/sync0sync.c 2008-10-30 09:23:36 +0000 +++ b/storage/innobase/sync/sync0sync.c 2009-05-09 15:13:22 +0000 @@ -1107,6 +1107,9 @@ sync_thread_add_level( case SYNC_BUF_POOL: ut_a(sync_thread_levels_g(array, SYNC_BUF_POOL)); break; + case SYNC_BUF_FLUSH_LIST: + ut_a(sync_thread_levels_g(array, SYNC_BUF_FLUSH_LIST)); + break; case SYNC_SEARCH_SYS: ut_a(sync_thread_levels_g(array, SYNC_SEARCH_SYS)); break;