List:Commits« Previous MessageNext Message »
From:Mikael Ronstrom Date:May 12 2009 6:09pm
Subject:bzr commit into mysql-5.1 branch (mikael:2847)
View as plain text  
#At file:///home/mikael/mysql_clones/buf_page_hash_mutex_split/

 2847 Mikael Ronstrom	2009-05-12
      Split out buffer page hash access from buffer pool mutex based on Foogle v3 patch
      modified:
        storage/innobase/buf/buf0buf.c
        storage/innobase/buf/buf0flu.c
        storage/innobase/buf/buf0lru.c
        storage/innobase/include/buf0buf.h
        storage/innobase/include/buf0buf.ic
        storage/innobase/include/sync0sync.h
        storage/innobase/sync/sync0sync.c

=== modified file 'storage/innobase/buf/buf0buf.c'
--- a/storage/innobase/buf/buf0buf.c	2008-10-15 18:54:18 +0000
+++ b/storage/innobase/buf/buf0buf.c	2009-05-12 18:09:48 +0000
@@ -37,6 +37,7 @@ Created 11/5/1995 Heikki Tuuri
 #include "log0log.h"
 #include "trx0undo.h"
 #include "srv0srv.h"
+#include "ha0ha.h"
 
 /*
 		IMPLEMENTATION OF THE BUFFER POOL
@@ -81,8 +82,8 @@ maybe every 10 microseconds. We gave up 
 for each control block, for instance, because it seemed to be
 complicated.
 
-A solution to reduce mutex contention of the buf_pool mutex is to
-create a separate mutex for the page hash table. On Pentium,
+To reduce mutex contention of the buf_pool mutex we have created
+a separate array of mutexes for the page hash table. On Pentium,
 accessing the hash table takes 2 microseconds, about half
 of the total buf_pool mutex hold time.
 
@@ -704,8 +705,8 @@ buf_pool_init(
 		}
 	}
 
-	buf_pool->page_hash = hash_create(2 * max_size);
-
+        buf_pool->page_hash = ha_create(FALSE, 2 * max_size, 64,
+                                        SYNC_BUF_PAGE_HASH);
 	buf_pool->n_pend_reads = 0;
 
 	buf_pool->last_printout_time = time(NULL);
@@ -1174,27 +1175,42 @@ buf_page_get_gen(
 	buf_pool->n_page_gets++;
 loop:
 	block = NULL;
-	mutex_enter_fast(&(buf_pool->mutex));
 
+        /* We have removed buf_pool->mutex here. I have verified it is safe
+        to access the following block members below with only block->mutex:
+        offset, space, state, io_fix, buf_fix_count. Other functions call
+        buf_block_align without protection, so that should be fine too. */
 	if (guess) {
 		block = buf_block_align(guess);
 
+                mutex_enter(&block->mutex);
 		if ((offset != block->offset) || (space != block->space)
 		    || (block->state != BUF_BLOCK_FILE_PAGE)) {
 
+                        mutex_exit(&block->mutex);
 			block = NULL;
 		}
 	}
 
 	if (block == NULL) {
 		block = buf_page_hash_get(space, offset);
+                if(block) {
+                        mutex_enter(&block->mutex);
+                        /* Verify block contains the data we want. It may have
+                        changed before acquiring block->mutex, because we don't
+                        lock buf_pool->mutex before buf_page_hash_get. */
+                        if (UNIV_UNLIKELY((offset != block->offset) ||
+                                      (space != block->space) ||
+                                      (block->state != BUF_BLOCK_FILE_PAGE))) {
+                                mutex_exit(&block->mutex);
+                                block = NULL;
+                        }
+                }
 	}
 
 	if (block == NULL) {
 		/* Page not in buf_pool: needs to be read from file */
 
-		mutex_exit(&(buf_pool->mutex));
-
 		if (mode == BUF_GET_IF_IN_POOL) {
 
 			return(NULL);
@@ -1212,7 +1228,7 @@ loop:
 		goto loop;
 	}
 
-	mutex_enter(&block->mutex);
+        /* Now we know block is not null, and we hold block->mutex */
 
 	ut_a(block->state == BUF_BLOCK_FILE_PAGE);
 
@@ -1224,7 +1240,6 @@ loop:
 
 		if (mode == BUF_GET_IF_IN_POOL) {
 			/* The page is only being read to buffer */
-			mutex_exit(&buf_pool->mutex);
 			mutex_exit(&block->mutex);
 
 			return(NULL);
@@ -1237,11 +1252,18 @@ loop:
 	if (block->frame == NULL) {
 		ut_a(srv_use_awe);
 
+                /* TODO: Let buf_awe_map_page_to_frame do its own locking,
+                but this requires an overhaul to buf_flush_try_page */
+                mutex_exit(&(block->mutex));
+                mutex_enter(&(buf_pool->mutex));
+                mutex_enter(&(block->mutex));
+ 
 		/* We set second parameter TRUE because the block is in the
 		LRU list and we must put it to awe_LRU_free_mapped list once
 		mapped to a frame */
 
 		buf_awe_map_page_to_frame(block, TRUE);
+                mutex_exit(&(buf_pool->mutex));
 	}
 
 #ifdef UNIV_SYNC_DEBUG
@@ -1249,7 +1271,6 @@ loop:
 #else
 	buf_block_buf_fix_inc(block);
 #endif
-	mutex_exit(&buf_pool->mutex);
 
 	/* Check if this is the first access to the page */
 
@@ -1630,6 +1651,7 @@ buf_page_init(
 				in units of a page */
 	buf_block_t*	block)	/* in: block to init */
 {
+        ulint fold;
 
 	ut_ad(mutex_own(&(buf_pool->mutex)));
 	ut_ad(mutex_own(&(block->mutex)));
@@ -1673,8 +1695,10 @@ buf_page_init(
 		ut_a(0);
 	}
 
-	HASH_INSERT(buf_block_t, hash, buf_pool->page_hash,
-		    buf_page_address_fold(space, offset), block);
+        fold = buf_page_address_fold(space, offset);
+        hash_mutex_enter(buf_pool->page_hash, fold);
+        HASH_INSERT(buf_block_t, hash, buf_pool->page_hash, fold, block);
+        hash_mutex_exit(buf_pool->page_hash, fold);
 
 	block->freed_page_clock = 0;
 
@@ -2151,8 +2175,6 @@ buf_validate(void)
 
 		block = buf_pool_get_nth_block(buf_pool, i);
 
-		mutex_enter(&block->mutex);
-
 		if (block->state == BUF_BLOCK_FILE_PAGE) {
 
 			ut_a(buf_page_hash_get(block->space,
@@ -2197,8 +2219,6 @@ buf_validate(void)
 		} else if (block->state == BUF_BLOCK_NOT_USED) {
 			n_free++;
 		}
-
-		mutex_exit(&block->mutex);
 	}
 
 	if (n_lru + n_free > buf_pool->curr_size) {
@@ -2385,16 +2405,13 @@ buf_get_modified_ratio_pct(void)
 {
 	ulint	ratio;
 
-	mutex_enter(&(buf_pool->mutex));
-
+        /* Unprotected reads of buf_pool variables should be okay here. */
 	ratio = (100 * UT_LIST_GET_LEN(buf_pool->flush_list))
 		/ (1 + UT_LIST_GET_LEN(buf_pool->LRU)
 		   + UT_LIST_GET_LEN(buf_pool->free));
 
 	/* 1 + is there to avoid division by zero */
 
-	mutex_exit(&(buf_pool->mutex));
-
 	return(ratio);
 }
 
@@ -2426,6 +2443,7 @@ buf_print_io(
 			(ulong)
 			UT_LIST_GET_LEN(buf_pool->awe_LRU_free_mapped));
 	}
+        if (file) {
 	fprintf(file,
 		"Buffer pool size   %lu\n"
 		"Free buffers       %lu\n"
@@ -2443,12 +2461,13 @@ buf_print_io(
 		(ulong) buf_pool->n_flush[BUF_FLUSH_LIST]
 		+ buf_pool->init_flush[BUF_FLUSH_LIST],
 		(ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]);
-
+        } // if (file)
 	current_time = time(NULL);
 	time_elapsed = 0.001 + difftime(current_time,
 					buf_pool->last_printout_time);
 	buf_pool->last_printout_time = current_time;
 
+        if (file) {
 	fprintf(file,
 		"Pages read %lu, created %lu, written %lu\n"
 		"%.2f reads/s, %.2f creates/s, %.2f writes/s\n",
@@ -2461,6 +2480,7 @@ buf_print_io(
 		/ time_elapsed,
 		(buf_pool->n_pages_written - buf_pool->n_pages_written_old)
 		/ time_elapsed);
+        } // if (file)
 
 	if (srv_use_awe) {
 		fprintf(file, "AWE: %.2f page remaps/s\n",
@@ -2470,15 +2490,18 @@ buf_print_io(
 	}
 
 	if (buf_pool->n_page_gets > buf_pool->n_page_gets_old) {
-		fprintf(file, "Buffer pool hit rate %lu / 1000\n",
-			(ulong)
-			(1000 - ((1000 * (buf_pool->n_pages_read
-					  - buf_pool->n_pages_read_old))
-				 / (buf_pool->n_page_gets
-				    - buf_pool->n_page_gets_old))));
+                ulong buf_pool_hit_per_k = (ulong) (1000 - ((1000 *
+                        (buf_pool->n_pages_read - buf_pool->n_pages_read_old))
+                        / (buf_pool->n_page_gets - buf_pool->n_page_gets_old)));
+                if (file) {
+                        fprintf(file, "Buffer pool hit rate %lu / 1000\n",
+                                buf_pool_hit_per_k);
+                } // if (file)
 	} else {
-		fputs("No buffer pool page gets since the last printout\n",
-		      file);
+                if (file) {
+                fputs("No buffer pool page gets since the last printout\n",
+                      file);
+                } // if (file)
 	}
 
 	buf_pool->n_page_gets_old = buf_pool->n_page_gets;

=== modified file 'storage/innobase/buf/buf0flu.c'
--- a/storage/innobase/buf/buf0flu.c	2008-02-01 10:55:39 +0000
+++ b/storage/innobase/buf/buf0flu.c	2009-05-12 18:09:48 +0000
@@ -546,22 +546,30 @@ buf_flush_try_page(
 	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST
 	      || flush_type == BUF_FLUSH_SINGLE_PAGE);
 
-	mutex_enter(&(buf_pool->mutex));
-
 	block = buf_page_hash_get(space, offset);
 
-	ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE);
-
 	if (!block) {
-		mutex_exit(&(buf_pool->mutex));
 		return(0);
 	}
 
+        mutex_enter(&(buf_pool->mutex));
+ 
 	mutex_enter(&block->mutex);
 
-	if (flush_type == BUF_FLUSH_LIST
+        /* Verify block contains the data we want. It may have
+        changed before acquiring block->mutex, because we don't
+        lock buf_pool->mutex before buf_page_hash_get. */
+        if (UNIV_UNLIKELY((offset != block->offset) ||
+                           (space != block->space) ||
+                           (block->state != BUF_BLOCK_FILE_PAGE))) {
+                /* Block changed before we acquired block->mutex. Do not
+                try to flush. */
+   
+	} else if (flush_type == BUF_FLUSH_LIST
 	    && buf_flush_ready_for_flush(block, flush_type)) {
 
+                ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+ 
 		block->io_fix = BUF_IO_WRITE;
 
 		/* If AWE is enabled and the page is not mapped to a frame,
@@ -630,6 +638,8 @@ buf_flush_try_page(
 		the page not to be bufferfixed (in function
 		..._ready_for_flush). */
 
+                ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+
 		block->io_fix = BUF_IO_WRITE;
 
 		/* If AWE is enabled and the page is not mapped to a frame,
@@ -670,6 +680,8 @@ buf_flush_try_page(
 	} else if (flush_type == BUF_FLUSH_SINGLE_PAGE
 		   && buf_flush_ready_for_flush(block, flush_type)) {
 
+                ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+
 		block->io_fix = BUF_IO_WRITE;
 
 		/* If AWE is enabled and the page is not mapped to a frame,

=== modified file 'storage/innobase/buf/buf0lru.c'
--- a/storage/innobase/buf/buf0lru.c	2008-12-14 20:47:17 +0000
+++ b/storage/innobase/buf/buf0lru.c	2009-05-12 18:09:48 +0000
@@ -1029,6 +1029,8 @@ buf_LRU_block_remove_hashed_page(
 				be in a state where it can be freed; there
 				may or may not be a hash index to the page */
 {
+        ulint           fold;
+
 	ut_ad(mutex_own(&(buf_pool->mutex)));
 	ut_ad(mutex_own(&block->mutex));
 	ut_ad(block);
@@ -1073,10 +1075,10 @@ buf_LRU_block_remove_hashed_page(
 #endif
 		ut_a(0);
 	}
-
-	HASH_DELETE(buf_block_t, hash, buf_pool->page_hash,
-		    buf_page_address_fold(block->space, block->offset),
-		    block);
+        fold = buf_page_address_fold(block->space, block->offset);
+        hash_mutex_enter(buf_pool->page_hash, fold);
+        HASH_DELETE(buf_block_t, hash, buf_pool->page_hash, fold, block);
+        hash_mutex_exit(buf_pool->page_hash, fold);
 
 	UNIV_MEM_INVALID(block->frame, UNIV_PAGE_SIZE);
 	block->state = BUF_BLOCK_REMOVE_HASH;
@@ -1197,6 +1199,7 @@ buf_LRU_print(void)
 
 	while (block != NULL) {
 
+	        mutex_enter(&block->mutex);
 		fprintf(stderr, "BLOCK %lu ", (ulong) block->offset);
 
 		if (block->old) {
@@ -1225,6 +1228,8 @@ buf_LRU_print(void)
 			(ulong) ut_dulint_get_low
 			(btr_page_get_index_id(frame)));
 
+	        mutex_exit(&block->mutex);
+
 		block = UT_LIST_GET_NEXT(LRU, block);
 		if (++len == 10) {
 			len = 0;

=== modified file 'storage/innobase/include/buf0buf.h'
--- a/storage/innobase/include/buf0buf.h	2008-08-20 00:37:41 +0000
+++ b/storage/innobase/include/buf0buf.h	2009-05-12 18:09:48 +0000
@@ -733,8 +733,10 @@ struct buf_block_struct{
 					UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE
 					(normally = 4) physical memory
 					pages; otherwise NULL */
-	ulint		space;		/* space id of the page */
-	ulint		offset;		/* page number within the space */
+        ulint           space;          /* space id of the page.
+                                         protected by block->mutex.*/
+        ulint           offset;         /* page number within the space.
+                                         protected by block->mutex.*/
 	ulint		lock_hash_val;	/* hashed value of the page address
 					in the record lock hash table */
 	mutex_t		mutex;		/* mutex protecting this block:
@@ -925,7 +927,10 @@ struct buf_pool_struct{
 	ulint		curr_size;	/* current pool size in pages;
 					currently always the same as
 					max_size */
-	hash_table_t*	page_hash;	/* hash table of the file pages */
+        hash_table_t*   page_hash;      /* hash table of the file pages.
+                                         We protect this hash table with an
+                                         array of mutexes, using the ha_create
+                                         and hash_mutex_enter/exit functions. */
 
 	ulint		n_pend_reads;	/* number of pending read operations */
 

=== modified file 'storage/innobase/include/buf0buf.ic'
--- a/storage/innobase/include/buf0buf.ic	2008-10-15 18:54:18 +0000
+++ b/storage/innobase/include/buf0buf.ic	2009-05-12 18:09:48 +0000
@@ -421,7 +421,7 @@ buf_frame_get_newest_modification(
 
 	block = buf_block_align(frame);
 
-	mutex_enter(&(buf_pool->mutex));
+	mutex_enter(&block->mutex);
 
 	if (block->state == BUF_BLOCK_FILE_PAGE) {
 		lsn = block->newest_modification;
@@ -429,7 +429,7 @@ buf_frame_get_newest_modification(
 		lsn = ut_dulint_zero;
 	}
 
-	mutex_exit(&(buf_pool->mutex));
+	mutex_exit(&block->mutex);
 
 	return(lsn);
 }
@@ -547,14 +547,15 @@ buf_page_hash_get(
 	ulint		fold;
 
 	ut_ad(buf_pool);
-	ut_ad(mutex_own(&(buf_pool->mutex)));
 
 	/* Look for the page in the hash table */
 
 	fold = buf_page_address_fold(space, offset);
 
+        hash_mutex_enter(buf_pool->page_hash, fold);
 	HASH_SEARCH(hash, buf_pool->page_hash, fold, block,
 		    (block->space == space) && (block->offset == offset));
+        hash_mutex_exit(buf_pool->page_hash, fold);
 	ut_a(block == NULL || block->state == BUF_BLOCK_FILE_PAGE);
 
 	return(block);

=== modified file 'storage/innobase/include/sync0sync.h'
--- a/storage/innobase/include/sync0sync.h	2008-12-04 10:57:56 +0000
+++ b/storage/innobase/include/sync0sync.h	2009-05-12 18:09:48 +0000
@@ -454,6 +454,7 @@ or row lock! */
 					the level is SYNC_MEM_HASH. */
 #define	SYNC_BUF_POOL		150
 #define	SYNC_BUF_BLOCK		149
+#define SYNC_BUF_PAGE_HASH      143
 #define SYNC_DOUBLEWRITE	140
 #define	SYNC_ANY_LATCH		135
 #define SYNC_THR_LOCAL		133

=== modified file 'storage/innobase/sync/sync0sync.c'
--- a/storage/innobase/sync/sync0sync.c	2008-10-30 09:23:36 +0000
+++ b/storage/innobase/sync/sync0sync.c	2009-05-12 18:09:48 +0000
@@ -1107,6 +1107,9 @@ sync_thread_add_level(
 	case SYNC_BUF_POOL:
 		ut_a(sync_thread_levels_g(array, SYNC_BUF_POOL));
 		break;
+	case SYNC_BUF_PAGE_HASH:
+		ut_a(sync_thread_levels_g(array, SYNC_BUF_PAGE_HASH));
+		break;
 	case SYNC_SEARCH_SYS:
 		ut_a(sync_thread_levels_g(array, SYNC_SEARCH_SYS));
 		break;

Thread
bzr commit into mysql-5.1 branch (mikael:2847) Mikael Ronstrom12 May