#At file:///home/mikael/mysql_clones/buf_flush_list_mutex/
2847 Mikael Ronstrom 2009-05-09
Broke out BUF_FLUSH_LIST from buffer pool mutex and thus decreased pressure on both buffer pool mutex and log_sys mutex
modified:
storage/innobase/buf/buf0buf.c
storage/innobase/buf/buf0flu.c
storage/innobase/buf/buf0lru.c
storage/innobase/include/buf0buf.h
storage/innobase/include/buf0buf.ic
storage/innobase/include/buf0flu.ic
storage/innobase/include/sync0sync.h
storage/innobase/mtr/mtr0mtr.c
storage/innobase/sync/sync0sync.c
=== modified file 'storage/innobase/buf/buf0buf.c'
--- a/storage/innobase/buf/buf0buf.c 2008-10-15 18:54:18 +0000
+++ b/storage/innobase/buf/buf0buf.c 2009-05-09 15:13:22 +0000
@@ -86,6 +86,14 @@ create a separate mutex for the page has
accessing the hash table takes 2 microseconds, about half
of the total buf_pool mutex hold time.
+We have added a mutex protecting only the flush_list, in order to allow
+some operations (buf_flush_note_modification, buf_get_oldest_modification)
+to be performed without acquiring the buffer pool mutex. This reduces
+log_sys mutex contention, since the log_sys holder no longer has to wait
+for the buffer pool mutex. This new mutex must be acquired after the
+buffer pool mutex, AND after the block->mutex protecting the meta-data
+(control block) for an individual buffer pool page.
+
Control blocks
--------------
@@ -724,7 +732,11 @@ buf_pool_init(
/* 2. Initialize flushing fields
---------------------------- */
+ mutex_create(&buf_pool->flush_list_mutex, SYNC_BUF_FLUSH_LIST);
+
+ mutex_enter(&(buf_pool->flush_list_mutex));
UT_LIST_INIT(buf_pool->flush_list);
+ mutex_exit(&(buf_pool->flush_list_mutex));
for (i = BUF_FLUSH_LRU; i <= BUF_FLUSH_LIST; i++) {
buf_pool->n_flush[i] = 0;
@@ -2147,6 +2159,8 @@ buf_validate(void)
mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&(buf_pool->flush_list_mutex));
+
for (i = 0; i < buf_pool->curr_size; i++) {
block = buf_pool_get_nth_block(buf_pool, i);
@@ -2221,6 +2235,7 @@ buf_validate(void)
ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush);
mutex_exit(&(buf_pool->mutex));
+ mutex_enter(&(buf_pool->flush_list_mutex));
ut_a(buf_LRU_validate());
ut_a(buf_flush_validate());
=== modified file 'storage/innobase/buf/buf0flu.c'
--- a/storage/innobase/buf/buf0flu.c 2008-02-01 10:55:39 +0000
+++ b/storage/innobase/buf/buf0flu.c 2009-05-09 15:13:22 +0000
@@ -48,7 +48,8 @@ buf_flush_insert_into_flush_list(
/*=============================*/
buf_block_t* block) /* in: block which is modified */
{
- ut_ad(mutex_own(&(buf_pool->mutex)));
+ ut_ad(mutex_own(&block->mutex));
+ ut_ad(mutex_own(&(buf_pool->flush_list_mutex)));
ut_a(block->state == BUF_BLOCK_FILE_PAGE);
ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
@@ -74,11 +75,17 @@ buf_flush_insert_sorted_into_flush_list(
buf_block_t* prev_b;
buf_block_t* b;
- ut_ad(mutex_own(&(buf_pool->mutex)));
+ ut_ad(mutex_own(&block->mutex));
+ ut_ad(mutex_own(&(buf_pool->flush_list_mutex)));
prev_b = NULL;
b = UT_LIST_GET_FIRST(buf_pool->flush_list);
+ /* These reads of oldest_modification are not protected by b->mutex:
+ This should be safe, since oldest_modifcation for flush list blocks
+ can only change when the block is flushed, and thus removed from the
+ flush list, which requires holding the flush_list_mutex. */
+
while (b && (ut_dulint_cmp(b->oldest_modification,
block->oldest_modification) > 0)) {
prev_b = b;
@@ -174,15 +181,23 @@ buf_flush_write_complete(
ut_ad(block);
#ifdef UNIV_SYNC_DEBUG
ut_ad(mutex_own(&(buf_pool->mutex)));
+ ut_ad(mutex_own(&block->mutex));
#endif /* UNIV_SYNC_DEBUG */
ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+ /* Get flush_list_mutex before updating oldest_modification to
+ ensure the flush list holds exactly those blocks with non-zero
+ oldest_modification. */
+ mutex_enter(&(buf_pool->flush_list_mutex));
+
block->oldest_modification = ut_dulint_zero;
UT_LIST_REMOVE(flush_list, buf_pool->flush_list, block);
ut_d(UT_LIST_VALIDATE(flush_list, buf_block_t, buf_pool->flush_list));
+ mutex_exit(&(buf_pool->flush_list_mutex));
+
(buf_pool->n_flush[block->flush_type])--;
if (block->flush_type == BUF_FLUSH_LRU) {
@@ -842,6 +857,7 @@ buf_flush_batch(
ulint space;
ulint offset;
ibool found;
+ ulint remaining = 0;
ut_ad((flush_type == BUF_FLUSH_LRU)
|| (flush_type == BUF_FLUSH_LIST));
@@ -878,7 +894,11 @@ buf_flush_batch(
} else {
ut_ad(flush_type == BUF_FLUSH_LIST);
+ mutex_enter(&(buf_pool->flush_list_mutex));
+ remaining = UT_LIST_GET_LEN(buf_pool->flush_list);
block = UT_LIST_GET_LAST(buf_pool->flush_list);
+ mutex_exit(&(buf_pool->flush_list_mutex));
+
if (!block
|| (ut_dulint_cmp(block->oldest_modification,
lsn_limit) >= 0)) {
@@ -931,14 +951,17 @@ buf_flush_batch(
ut_ad(flush_type == BUF_FLUSH_LIST);
mutex_exit(&block->mutex);
-
+ mutex_enter(&(buf_pool->flush_list_mutex));
block = UT_LIST_GET_PREV(flush_list, block);
+ mutex_exit(&(buf_pool->flush_list_mutex));
+ remaining--;
}
}
- /* If we could not find anything to flush, leave the loop */
-
- if (!found) {
+ /* If we could not find anything to flush, leave the loop.
+ If remaining > 0, the list changed during our loop, so we
+ may want to try again. */
+ if (!found && !remaining) {
break;
}
}
@@ -1075,11 +1098,21 @@ buf_flush_validate_low(void)
buf_block_t* block;
dulint om;
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&buf_pool->flush_list_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
UT_LIST_VALIDATE(flush_list, buf_block_t, buf_pool->flush_list);
block = UT_LIST_GET_FIRST(buf_pool->flush_list);
while (block != NULL) {
+ /* oldest_modification can't change here, since the block
+ cannot be removed from the flush_list while we hold the
+ flush list mutex (and removal is the only way to change the
+ oldest_modification of a block in the flush list).
+ Therefore, we do not need the block->mutex. */
+
om = block->oldest_modification;
ut_a(block->state == BUF_BLOCK_FILE_PAGE);
ut_a(ut_dulint_cmp(om, ut_dulint_zero) > 0);
@@ -1105,11 +1138,11 @@ buf_flush_validate(void)
{
ibool ret;
- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&(buf_pool->flush_list_mutex));
ret = buf_flush_validate_low();
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&(buf_pool->flush_list_mutex));
return(ret);
}
=== modified file 'storage/innobase/buf/buf0lru.c'
--- a/storage/innobase/buf/buf0lru.c 2008-12-14 20:47:17 +0000
+++ b/storage/innobase/buf/buf0lru.c 2009-05-09 15:13:22 +0000
@@ -261,10 +261,12 @@ scan_again:
/* Remove from the flush list of modified
blocks */
+ mutex_enter(&(buf_pool->flush_list_mutex));
block->oldest_modification = ut_dulint_zero;
UT_LIST_REMOVE(flush_list,
buf_pool->flush_list, block);
+ mutex_exit(&(buf_pool->flush_list_mutex));
}
/* Remove from the LRU list */
=== modified file 'storage/innobase/include/buf0buf.h'
--- a/storage/innobase/include/buf0buf.h 2008-08-20 00:37:41 +0000
+++ b/storage/innobase/include/buf0buf.h 2009-05-09 15:13:22 +0000
@@ -955,6 +955,8 @@ struct buf_pool_struct{
ulint n_pages_awe_remapped_old;
/* 2. Page flushing algorithm fields */
+ mutex_t flush_list_mutex;/* mutex protecting the
+ flush_list */
UT_LIST_BASE_NODE_T(buf_block_t) flush_list;
/* base node of the modified block
list */
=== modified file 'storage/innobase/include/buf0buf.ic'
--- a/storage/innobase/include/buf0buf.ic 2008-10-15 18:54:18 +0000
+++ b/storage/innobase/include/buf0buf.ic 2009-05-09 15:13:22 +0000
@@ -104,19 +104,32 @@ buf_pool_get_oldest_modification(void)
buf_block_t* block;
dulint lsn;
- mutex_enter(&(buf_pool->mutex));
+try_again:
+
+ mutex_enter(&(buf_pool->flush_list_mutex));
block = UT_LIST_GET_LAST(buf_pool->flush_list);
+ mutex_exit(&(buf_pool->flush_list_mutex));
+
if (block == NULL) {
lsn = ut_dulint_zero;
} else {
lsn = block->oldest_modification;
- }
+ /* If lsn is zero, than the block may have been removed
+ from the flush_list since we released flush_list mutex.
+ We try again to get a more accurate oldest_modification.*/
+ if(ut_dulint_cmp(lsn, ut_dulint_zero) == 0) {
+ /* TODO: should determine the frequency of this */
+ goto try_again;
+ }
- mutex_exit(&(buf_pool->mutex));
+ }
return(lsn);
+
+ /* The returned answer may be out of date: the flush_list can
+ change after mutex release and before the function returns. */
}
/***********************************************************************
@@ -623,12 +636,6 @@ buf_page_release(
ut_a(block->state == BUF_BLOCK_FILE_PAGE);
ut_a(block->buf_fix_count > 0);
- if (rw_latch == RW_X_LATCH && mtr->modifications) {
- mutex_enter(&buf_pool->mutex);
- buf_flush_note_modification(block, mtr);
- mutex_exit(&buf_pool->mutex);
- }
-
mutex_enter(&block->mutex);
#ifdef UNIV_SYNC_DEBUG
=== modified file 'storage/innobase/include/buf0flu.ic'
--- a/storage/innobase/include/buf0flu.ic 2007-03-22 21:59:35 +0000
+++ b/storage/innobase/include/buf0flu.ic 2009-05-09 15:13:22 +0000
@@ -43,25 +43,37 @@ buf_flush_note_modification(
#ifdef UNIV_SYNC_DEBUG
ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
#endif /* UNIV_SYNC_DEBUG */
- ut_ad(mutex_own(&(buf_pool->mutex)));
ut_ad(ut_dulint_cmp(mtr->start_lsn, ut_dulint_zero) != 0);
ut_ad(mtr->modifications);
+
+ mutex_enter(&block->mutex);
+
ut_ad(ut_dulint_cmp(block->newest_modification, mtr->end_lsn) <= 0);
block->newest_modification = mtr->end_lsn;
if (ut_dulint_is_zero(block->oldest_modification)) {
+ /* Get flush_list_mutex before updating oldest_modification to
+ ensure the flush list holds exactly those blocks with non-zero
+ oldest_modification. */
+
+ mutex_enter(&buf_pool->flush_list_mutex);
block->oldest_modification = mtr->start_lsn;
ut_ad(!ut_dulint_is_zero(block->oldest_modification));
buf_flush_insert_into_flush_list(block);
+
+ mutex_exit(&buf_pool->flush_list_mutex);
+
} else {
ut_ad(ut_dulint_cmp(block->oldest_modification,
mtr->start_lsn) <= 0);
}
+ mutex_exit(&block->mutex);
+
++srv_buf_pool_write_requests;
}
@@ -84,23 +96,30 @@ buf_flush_recv_note_modification(
ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
#endif /* UNIV_SYNC_DEBUG */
- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&block->mutex);
ut_ad(ut_dulint_cmp(block->newest_modification, end_lsn) <= 0);
block->newest_modification = end_lsn;
if (ut_dulint_is_zero(block->oldest_modification)) {
+ /* Get flush_list_mutex before updating oldest_modification to
+ ensure the flush list holds exactly those blocks with non-zero
+ oldest_modification. */
+
+ mutex_enter(&buf_pool->flush_list_mutex);
block->oldest_modification = start_lsn;
ut_ad(!ut_dulint_is_zero(block->oldest_modification));
buf_flush_insert_sorted_into_flush_list(block);
+
+ mutex_exit(&buf_pool->flush_list_mutex);
} else {
ut_ad(ut_dulint_cmp(block->oldest_modification,
start_lsn) <= 0);
}
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&block->mutex);
}
=== modified file 'storage/innobase/include/sync0sync.h'
--- a/storage/innobase/include/sync0sync.h 2008-12-04 10:57:56 +0000
+++ b/storage/innobase/include/sync0sync.h 2009-05-09 15:13:22 +0000
@@ -454,6 +454,7 @@ or row lock! */
the level is SYNC_MEM_HASH. */
#define SYNC_BUF_POOL 150
#define SYNC_BUF_BLOCK 149
+#define SYNC_BUF_FLUSH_LIST 145
#define SYNC_DOUBLEWRITE 140
#define SYNC_ANY_LATCH 135
#define SYNC_THR_LOCAL 133
=== modified file 'storage/innobase/mtr/mtr0mtr.c'
--- a/storage/innobase/mtr/mtr0mtr.c 2007-07-10 14:34:21 +0000
+++ b/storage/innobase/mtr/mtr0mtr.c 2009-05-09 15:13:22 +0000
@@ -13,6 +13,7 @@ Created 11/26/1995 Heikki Tuuri
#endif
#include "buf0buf.h"
+#include "buf0flu.h"
#include "page0types.h"
#include "mtr0log.h"
#include "log0log.h"
@@ -103,6 +104,40 @@ mtr_memo_pop_all(
}
}
+/**************************************************************
+Updates the lsn for buffer pages modified by this mtr. Pages will be
+added to the buf_pool->flush_list as well, but locks and bufferfix will
+not be released until after log_sys->mutex is released. */
+UNIV_INLINE
+void
+mtr_memo_note_modification(
+/*=============*/
+ mtr_t* mtr) /* in: mtr */
+{
+ mtr_memo_slot_t* slot;
+ dyn_array_t* memo;
+ ulint offset;
+
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->state == MTR_COMMITTING); /* Currently only used in
+ commit */
+ ut_ad(mtr->modifications);
+
+ memo = &(mtr->memo);
+
+ offset = dyn_array_get_data_size(memo);
+ while (offset > 0) {
+ offset -= sizeof(mtr_memo_slot_t);
+ slot = dyn_array_get_element(memo, offset);
+ if (UNIV_LIKELY(slot->object != NULL) &&
+ slot->type == MTR_MEMO_PAGE_X_FIX) {
+ buf_flush_note_modification(
+ (buf_block_t*)slot->object, mtr);
+ }
+ }
+}
+
/****************************************************************
Writes the contents of a mini-transaction log, if any, to the database log. */
static
@@ -178,7 +213,6 @@ mtr_commit(
#endif
if (mtr->modifications) {
mtr_log_reserve_and_write(mtr);
- }
/* We first update the modification info to buffer pages, and only
after that release the log mutex: this guarantees that when the log
@@ -188,12 +222,14 @@ mtr_commit(
required when we insert modified buffer pages in to the flush list
which must be sorted on oldest_modification. */
- mtr_memo_pop_all(mtr);
-
- if (mtr->modifications) {
+ mtr_memo_note_modification(mtr);
log_release();
}
+
+ /* All unlocking has been moved here, after log_sys mutex release. */
+ mtr_memo_pop_all(mtr);
+
#ifdef UNIV_DEBUG
mtr->state = MTR_COMMITTED;
#endif
@@ -263,7 +299,12 @@ mtr_memo_release(
slot = dyn_array_get_element(memo, offset);
if ((object == slot->object) && (type == slot->type)) {
-
+ if (mtr->modifications &&
+ UNIV_LIKELY(slot->object != NULL) &&
+ slot->type == MTR_MEMO_PAGE_X_FIX) {
+ buf_flush_note_modification(
+ (buf_block_t*)slot->object, mtr);
+ }
mtr_memo_slot_release(mtr, slot);
break;
=== modified file 'storage/innobase/sync/sync0sync.c'
--- a/storage/innobase/sync/sync0sync.c 2008-10-30 09:23:36 +0000
+++ b/storage/innobase/sync/sync0sync.c 2009-05-09 15:13:22 +0000
@@ -1107,6 +1107,9 @@ sync_thread_add_level(
case SYNC_BUF_POOL:
ut_a(sync_thread_levels_g(array, SYNC_BUF_POOL));
break;
+ case SYNC_BUF_FLUSH_LIST:
+ ut_a(sync_thread_levels_g(array, SYNC_BUF_FLUSH_LIST));
+ break;
case SYNC_SEARCH_SYS:
ut_a(sync_thread_levels_g(array, SYNC_SEARCH_SYS));
break;
| Thread |
|---|
| • bzr commit into mysql-5.1 branch (mikael:2847) | Mikael Ronstrom | 11 May |