List:Commits« Previous MessageNext Message »
From:Mikael Ronstrom Date:May 9 2009 3:16pm
Subject:bzr commit into mysql-5.1 branch (mikael:2847)
View as plain text  
#At file:///home/mikael/mysql_clones/buf_flush_list_mutex/

 2847 Mikael Ronstrom	2009-05-09
      Broke out BUF_FLUSH_LIST from buffer pool mutex and thus decreased pressure on both buffer pool mutex and log_sys mutex
      modified:
        storage/innobase/buf/buf0buf.c
        storage/innobase/buf/buf0flu.c
        storage/innobase/buf/buf0lru.c
        storage/innobase/include/buf0buf.h
        storage/innobase/include/buf0buf.ic
        storage/innobase/include/buf0flu.ic
        storage/innobase/include/sync0sync.h
        storage/innobase/mtr/mtr0mtr.c
        storage/innobase/sync/sync0sync.c

=== modified file 'storage/innobase/buf/buf0buf.c'
--- a/storage/innobase/buf/buf0buf.c	2008-10-15 18:54:18 +0000
+++ b/storage/innobase/buf/buf0buf.c	2009-05-09 15:13:22 +0000
@@ -86,6 +86,14 @@ create a separate mutex for the page has
 accessing the hash table takes 2 microseconds, about half
 of the total buf_pool mutex hold time.
 
+We have added a mutex protecting only the flush_list, in order to allow
+some operations (buf_flush_note_modification, buf_get_oldest_modification)
+to be performed without acquiring the buffer pool mutex. This reduces
+log_sys mutex contention, since the log_sys holder no longer has to wait
+for the buffer pool mutex. This new mutex must be acquired after the
+buffer pool mutex, AND after the block->mutex protecting the meta-data
+(control block) for an individual buffer pool page.
+
 		Control blocks
 		--------------
 
@@ -724,7 +732,11 @@ buf_pool_init(
 
 	/* 2. Initialize flushing fields
 	---------------------------- */
+        mutex_create(&buf_pool->flush_list_mutex, SYNC_BUF_FLUSH_LIST);
+ 
+        mutex_enter(&(buf_pool->flush_list_mutex));
 	UT_LIST_INIT(buf_pool->flush_list);
+        mutex_exit(&(buf_pool->flush_list_mutex));
 
 	for (i = BUF_FLUSH_LRU; i <= BUF_FLUSH_LIST; i++) {
 		buf_pool->n_flush[i] = 0;
@@ -2147,6 +2159,8 @@ buf_validate(void)
 
 	mutex_enter(&(buf_pool->mutex));
 
+	mutex_enter(&(buf_pool->flush_list_mutex));
+
 	for (i = 0; i < buf_pool->curr_size; i++) {
 
 		block = buf_pool_get_nth_block(buf_pool, i);
@@ -2221,6 +2235,7 @@ buf_validate(void)
 	ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush);
 
 	mutex_exit(&(buf_pool->mutex));
+	mutex_enter(&(buf_pool->flush_list_mutex));
 
 	ut_a(buf_LRU_validate());
 	ut_a(buf_flush_validate());

=== modified file 'storage/innobase/buf/buf0flu.c'
--- a/storage/innobase/buf/buf0flu.c	2008-02-01 10:55:39 +0000
+++ b/storage/innobase/buf/buf0flu.c	2009-05-09 15:13:22 +0000
@@ -48,7 +48,8 @@ buf_flush_insert_into_flush_list(
 /*=============================*/
 	buf_block_t*	block)	/* in: block which is modified */
 {
-	ut_ad(mutex_own(&(buf_pool->mutex)));
+	ut_ad(mutex_own(&block->mutex));
+	ut_ad(mutex_own(&(buf_pool->flush_list_mutex)));
 	ut_a(block->state == BUF_BLOCK_FILE_PAGE);
 
 	ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
@@ -74,11 +75,17 @@ buf_flush_insert_sorted_into_flush_list(
 	buf_block_t*	prev_b;
 	buf_block_t*	b;
 
-	ut_ad(mutex_own(&(buf_pool->mutex)));
+	ut_ad(mutex_own(&block->mutex));
+	ut_ad(mutex_own(&(buf_pool->flush_list_mutex)));
 
 	prev_b = NULL;
 	b = UT_LIST_GET_FIRST(buf_pool->flush_list);
 
+        /* These reads of oldest_modification are not protected by b->mutex:
+        This should be safe, since oldest_modifcation for flush list blocks
+        can only change when the block is flushed, and thus removed from the
+        flush list, which requires holding the flush_list_mutex. */
+
 	while (b && (ut_dulint_cmp(b->oldest_modification,
 				   block->oldest_modification) > 0)) {
 		prev_b = b;
@@ -174,15 +181,23 @@ buf_flush_write_complete(
 	ut_ad(block);
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(mutex_own(&(buf_pool->mutex)));
+	ut_ad(mutex_own(&block->mutex));
 #endif /* UNIV_SYNC_DEBUG */
 	ut_a(block->state == BUF_BLOCK_FILE_PAGE);
 
+        /* Get flush_list_mutex before updating oldest_modification to
+        ensure the flush list holds exactly those blocks with non-zero
+        oldest_modification. */
+        mutex_enter(&(buf_pool->flush_list_mutex));
+ 
 	block->oldest_modification = ut_dulint_zero;
 
 	UT_LIST_REMOVE(flush_list, buf_pool->flush_list, block);
 
 	ut_d(UT_LIST_VALIDATE(flush_list, buf_block_t, buf_pool->flush_list));
 
+        mutex_exit(&(buf_pool->flush_list_mutex));
+ 
 	(buf_pool->n_flush[block->flush_type])--;
 
 	if (block->flush_type == BUF_FLUSH_LRU) {
@@ -842,6 +857,7 @@ buf_flush_batch(
 	ulint		space;
 	ulint		offset;
 	ibool		found;
+        ulint           remaining       = 0;
 
 	ut_ad((flush_type == BUF_FLUSH_LRU)
 	      || (flush_type == BUF_FLUSH_LIST));
@@ -878,7 +894,11 @@ buf_flush_batch(
 		} else {
 			ut_ad(flush_type == BUF_FLUSH_LIST);
 
+                        mutex_enter(&(buf_pool->flush_list_mutex));
+                        remaining = UT_LIST_GET_LEN(buf_pool->flush_list);
 			block = UT_LIST_GET_LAST(buf_pool->flush_list);
+                        mutex_exit(&(buf_pool->flush_list_mutex));
+
 			if (!block
 			    || (ut_dulint_cmp(block->oldest_modification,
 					      lsn_limit) >= 0)) {
@@ -931,14 +951,17 @@ buf_flush_batch(
 				ut_ad(flush_type == BUF_FLUSH_LIST);
 
 				mutex_exit(&block->mutex);
-
+                                mutex_enter(&(buf_pool->flush_list_mutex));
 				block = UT_LIST_GET_PREV(flush_list, block);
+                                mutex_exit(&(buf_pool->flush_list_mutex));
+                                remaining--;
 			}
 		}
 
-		/* If we could not find anything to flush, leave the loop */
-
-		if (!found) {
+                /* If we could not find anything to flush, leave the loop.
+                If remaining > 0, the list changed during our loop, so we
+                may want to try again. */
+                if (!found && !remaining) {
 			break;
 		}
 	}
@@ -1075,11 +1098,21 @@ buf_flush_validate_low(void)
 	buf_block_t*	block;
 	dulint		om;
 
+#ifdef UNIV_SYNC_DEBUG
+        ut_ad(mutex_own(&buf_pool->flush_list_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
 	UT_LIST_VALIDATE(flush_list, buf_block_t, buf_pool->flush_list);
 
 	block = UT_LIST_GET_FIRST(buf_pool->flush_list);
 
 	while (block != NULL) {
+                /* oldest_modification can't change here, since the block
+                 cannot be removed from the flush_list while we hold the
+                 flush list mutex (and removal is the only way to change the
+                 oldest_modification of a block in the flush list).
+                 Therefore, we do not need the block->mutex. */
+
 		om = block->oldest_modification;
 		ut_a(block->state == BUF_BLOCK_FILE_PAGE);
 		ut_a(ut_dulint_cmp(om, ut_dulint_zero) > 0);
@@ -1105,11 +1138,11 @@ buf_flush_validate(void)
 {
 	ibool	ret;
 
-	mutex_enter(&(buf_pool->mutex));
+	mutex_enter(&(buf_pool->flush_list_mutex));
 
 	ret = buf_flush_validate_low();
 
-	mutex_exit(&(buf_pool->mutex));
+	mutex_exit(&(buf_pool->flush_list_mutex));
 
 	return(ret);
 }

=== modified file 'storage/innobase/buf/buf0lru.c'
--- a/storage/innobase/buf/buf0lru.c	2008-12-14 20:47:17 +0000
+++ b/storage/innobase/buf/buf0lru.c	2009-05-09 15:13:22 +0000
@@ -261,10 +261,12 @@ scan_again:
 
 				/* Remove from the flush list of modified
 				blocks */
+				mutex_enter(&(buf_pool->flush_list_mutex));
 				block->oldest_modification = ut_dulint_zero;
 
 				UT_LIST_REMOVE(flush_list,
 					       buf_pool->flush_list, block);
+				mutex_exit(&(buf_pool->flush_list_mutex));
 			}
 
 			/* Remove from the LRU list */

=== modified file 'storage/innobase/include/buf0buf.h'
--- a/storage/innobase/include/buf0buf.h	2008-08-20 00:37:41 +0000
+++ b/storage/innobase/include/buf0buf.h	2009-05-09 15:13:22 +0000
@@ -955,6 +955,8 @@ struct buf_pool_struct{
 	ulint		n_pages_awe_remapped_old;
 	/* 2. Page flushing algorithm fields */
 
+        mutex_t                flush_list_mutex;/* mutex protecting the
+                                                flush_list */
 	UT_LIST_BASE_NODE_T(buf_block_t) flush_list;
 					/* base node of the modified block
 					list */

=== modified file 'storage/innobase/include/buf0buf.ic'
--- a/storage/innobase/include/buf0buf.ic	2008-10-15 18:54:18 +0000
+++ b/storage/innobase/include/buf0buf.ic	2009-05-09 15:13:22 +0000
@@ -104,19 +104,32 @@ buf_pool_get_oldest_modification(void)
 	buf_block_t*	block;
 	dulint		lsn;
 
-	mutex_enter(&(buf_pool->mutex));
+try_again:
+
+	mutex_enter(&(buf_pool->flush_list_mutex));
 
 	block = UT_LIST_GET_LAST(buf_pool->flush_list);
 
+	mutex_exit(&(buf_pool->flush_list_mutex));
+
 	if (block == NULL) {
 		lsn = ut_dulint_zero;
 	} else {
 		lsn = block->oldest_modification;
-	}
+                /* If lsn is zero, than the block may have been removed
+                from the flush_list since we released flush_list mutex.
+                We try again to get a more accurate oldest_modification.*/
+                if(ut_dulint_cmp(lsn, ut_dulint_zero) == 0) {
+                   /* TODO: should determine the frequency of this */
+                      goto try_again;
+                }
 
-	mutex_exit(&(buf_pool->mutex));
+	}
 
 	return(lsn);
+
+        /* The returned answer may be out of date: the flush_list can
+        change after mutex release and before the function returns. */
 }
 
 /***********************************************************************
@@ -623,12 +636,6 @@ buf_page_release(
 	ut_a(block->state == BUF_BLOCK_FILE_PAGE);
 	ut_a(block->buf_fix_count > 0);
 
-	if (rw_latch == RW_X_LATCH && mtr->modifications) {
-		mutex_enter(&buf_pool->mutex);
-		buf_flush_note_modification(block, mtr);
-		mutex_exit(&buf_pool->mutex);
-	}
-
 	mutex_enter(&block->mutex);
 
 #ifdef UNIV_SYNC_DEBUG

=== modified file 'storage/innobase/include/buf0flu.ic'
--- a/storage/innobase/include/buf0flu.ic	2007-03-22 21:59:35 +0000
+++ b/storage/innobase/include/buf0flu.ic	2009-05-09 15:13:22 +0000
@@ -43,25 +43,37 @@ buf_flush_note_modification(
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
 #endif /* UNIV_SYNC_DEBUG */
-	ut_ad(mutex_own(&(buf_pool->mutex)));
 
 	ut_ad(ut_dulint_cmp(mtr->start_lsn, ut_dulint_zero) != 0);
 	ut_ad(mtr->modifications);
+
+        mutex_enter(&block->mutex);
+
 	ut_ad(ut_dulint_cmp(block->newest_modification, mtr->end_lsn) <= 0);
 
 	block->newest_modification = mtr->end_lsn;
 
 	if (ut_dulint_is_zero(block->oldest_modification)) {
+                /* Get flush_list_mutex before updating oldest_modification to
+                ensure the flush list holds exactly those blocks with non-zero
+                oldest_modification. */
+
+                mutex_enter(&buf_pool->flush_list_mutex);
 
 		block->oldest_modification = mtr->start_lsn;
 		ut_ad(!ut_dulint_is_zero(block->oldest_modification));
 
 		buf_flush_insert_into_flush_list(block);
+
+                mutex_exit(&buf_pool->flush_list_mutex);
+
 	} else {
 		ut_ad(ut_dulint_cmp(block->oldest_modification,
 				    mtr->start_lsn) <= 0);
 	}
 
+        mutex_exit(&block->mutex);
+
 	++srv_buf_pool_write_requests;
 }
 
@@ -84,23 +96,30 @@ buf_flush_recv_note_modification(
 	ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
 #endif /* UNIV_SYNC_DEBUG */
 
-	mutex_enter(&(buf_pool->mutex));
+	mutex_enter(&block->mutex);
 
 	ut_ad(ut_dulint_cmp(block->newest_modification, end_lsn) <= 0);
 
 	block->newest_modification = end_lsn;
 
 	if (ut_dulint_is_zero(block->oldest_modification)) {
+                /* Get flush_list_mutex before updating oldest_modification to
+                ensure the flush list holds exactly those blocks with non-zero
+                oldest_modification. */
+
+                mutex_enter(&buf_pool->flush_list_mutex);
 
 		block->oldest_modification = start_lsn;
 
 		ut_ad(!ut_dulint_is_zero(block->oldest_modification));
 
 		buf_flush_insert_sorted_into_flush_list(block);
+
+                mutex_exit(&buf_pool->flush_list_mutex);
 	} else {
 		ut_ad(ut_dulint_cmp(block->oldest_modification,
 				    start_lsn) <= 0);
 	}
 
-	mutex_exit(&(buf_pool->mutex));
+	mutex_exit(&block->mutex);
 }

=== modified file 'storage/innobase/include/sync0sync.h'
--- a/storage/innobase/include/sync0sync.h	2008-12-04 10:57:56 +0000
+++ b/storage/innobase/include/sync0sync.h	2009-05-09 15:13:22 +0000
@@ -454,6 +454,7 @@ or row lock! */
 					the level is SYNC_MEM_HASH. */
 #define	SYNC_BUF_POOL		150
 #define	SYNC_BUF_BLOCK		149
+#define SYNC_BUF_FLUSH_LIST     145
 #define SYNC_DOUBLEWRITE	140
 #define	SYNC_ANY_LATCH		135
 #define SYNC_THR_LOCAL		133

=== modified file 'storage/innobase/mtr/mtr0mtr.c'
--- a/storage/innobase/mtr/mtr0mtr.c	2007-07-10 14:34:21 +0000
+++ b/storage/innobase/mtr/mtr0mtr.c	2009-05-09 15:13:22 +0000
@@ -13,6 +13,7 @@ Created 11/26/1995 Heikki Tuuri
 #endif
 
 #include "buf0buf.h"
+#include "buf0flu.h"
 #include "page0types.h"
 #include "mtr0log.h"
 #include "log0log.h"
@@ -103,6 +104,40 @@ mtr_memo_pop_all(
 	}
 }
 
+/**************************************************************
+Updates the lsn for buffer pages modified by this mtr. Pages will be
+added to the buf_pool->flush_list as well, but locks and bufferfix will
+not be released until after log_sys->mutex is released. */
+UNIV_INLINE
+void
+mtr_memo_note_modification(
+/*=============*/
+        mtr_t*  mtr)    /* in: mtr */
+{
+        mtr_memo_slot_t* slot;
+        dyn_array_t*    memo;
+        ulint           offset;
+   
+        ut_ad(mtr);
+        ut_ad(mtr->magic_n == MTR_MAGIC_N);
+        ut_ad(mtr->state == MTR_COMMITTING); /* Currently only used in
+                                                                commit */
+        ut_ad(mtr->modifications);
+   
+        memo = &(mtr->memo);
+   
+        offset = dyn_array_get_data_size(memo);
+        while (offset > 0) {
+                offset -= sizeof(mtr_memo_slot_t);
+                slot = dyn_array_get_element(memo, offset);
+                if (UNIV_LIKELY(slot->object != NULL) &&
+                    slot->type == MTR_MEMO_PAGE_X_FIX) {
+                         buf_flush_note_modification(
+                             (buf_block_t*)slot->object, mtr);
+                }
+        }
+}
+
 /****************************************************************
 Writes the contents of a mini-transaction log, if any, to the database log. */
 static
@@ -178,7 +213,6 @@ mtr_commit(
 #endif
 	if (mtr->modifications) {
 		mtr_log_reserve_and_write(mtr);
-	}
 
 	/* We first update the modification info to buffer pages, and only
 	after that release the log mutex: this guarantees that when the log
@@ -188,12 +222,14 @@ mtr_commit(
 	required when we insert modified buffer pages in to the flush list
 	which must be sorted on oldest_modification. */
 
-	mtr_memo_pop_all(mtr);
-
-	if (mtr->modifications) {
+                mtr_memo_note_modification(mtr);
 		log_release();
 	}
 
+
+        /* All unlocking has been moved here, after log_sys mutex release. */
+        mtr_memo_pop_all(mtr);
+
 #ifdef UNIV_DEBUG
 	mtr->state = MTR_COMMITTED;
 #endif
@@ -263,7 +299,12 @@ mtr_memo_release(
 		slot = dyn_array_get_element(memo, offset);
 
 		if ((object == slot->object) && (type == slot->type)) {
-
+                        if (mtr->modifications &&
+                            UNIV_LIKELY(slot->object != NULL) &&
+                            slot->type == MTR_MEMO_PAGE_X_FIX) {
+                                buf_flush_note_modification(
+                                    (buf_block_t*)slot->object, mtr);
+                        }
 			mtr_memo_slot_release(mtr, slot);
 
 			break;

=== modified file 'storage/innobase/sync/sync0sync.c'
--- a/storage/innobase/sync/sync0sync.c	2008-10-30 09:23:36 +0000
+++ b/storage/innobase/sync/sync0sync.c	2009-05-09 15:13:22 +0000
@@ -1107,6 +1107,9 @@ sync_thread_add_level(
 	case SYNC_BUF_POOL:
 		ut_a(sync_thread_levels_g(array, SYNC_BUF_POOL));
 		break;
+        case SYNC_BUF_FLUSH_LIST:
+                ut_a(sync_thread_levels_g(array, SYNC_BUF_FLUSH_LIST));
+                break;
 	case SYNC_SEARCH_SYS:
 		ut_a(sync_thread_levels_g(array, SYNC_SEARCH_SYS));
 		break;

Thread
bzr commit into mysql-5.1 branch (mikael:2847) Mikael Ronstrom11 May