List: Commits
From: Mikael Ronstrom
Date: May 19 2009 11:15am
Subject: bzr commit into mysql-5.1 branch (mikael:2847)
#At file:///home/mikael/mysql_clones/mixed_buf_page_hash/

 2847 Mikael Ronstrom	2009-05-19
      Split the buffer page hash mutex from the buffer pool mutex, using a mix of ideas from Sun, Google and Percona
      modified:
        storage/innobase/buf/buf0buf.c
        storage/innobase/buf/buf0flu.c
        storage/innobase/buf/buf0lru.c
        storage/innobase/buf/buf0rea.c
        storage/innobase/include/buf0buf.h
        storage/innobase/include/buf0buf.ic
        storage/innobase/include/sync0sync.h
        storage/innobase/sync/sync0sync.c

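Note for readers skimming the patch: the core idea is sketched below in plain C with pthreads. This is illustration only, not InnoDB code; names such as page_hash_part_t, page_hash_part_no and page_hash_lookup are made up for the sketch, and the fold is a simple placeholder rather than InnoDB's buf_page_address_fold(). The real implementation is buf_page_hash_table()/buf_page_hash_get() in the diff below. The single page hash is split into a small power-of-two number of hash tables, each guarded by its own rw-lock, and a page's (space, offset) pair selects the partition.

/* Minimal sketch of a partitioned page hash; illustration only. */
#include <stddef.h>
#include <pthread.h>

#define N_PAGE_HASH_PARTS  16   /* power of two so '&' can replace '%' */
#define READ_AHEAD_SHIFT   6    /* pages in the same 1 MB area share a part */
#define N_BUCKETS          1024

typedef struct page_node {
        unsigned long           space;
        unsigned long           offset;
        struct page_node*       next;
} page_node_t;

typedef struct {
        page_node_t*            buckets[N_BUCKETS];
        pthread_rwlock_t        latch;  /* protects this partition only */
} page_hash_part_t;

static page_hash_part_t page_hash[N_PAGE_HASH_PARTS];

static void page_hash_init(void)
{
        int     i;

        for (i = 0; i < N_PAGE_HASH_PARTS; i++) {
                pthread_rwlock_init(&page_hash[i].latch, NULL);
        }
}

/* Pick the partition: ignore the low offset bits so that neighbouring
pages (neighbour flushing, read-ahead) all map to one latch. The fold
below is a placeholder, not InnoDB's buf_page_address_fold(). */
static unsigned long page_hash_part_no(unsigned long space, unsigned long offset)
{
        unsigned long   fold = (space << 20) + space + (offset >> READ_AHEAD_SHIFT);

        return(fold & (N_PAGE_HASH_PARTS - 1));
}

static page_node_t* page_hash_lookup(unsigned long space, unsigned long offset)
{
        page_hash_part_t*       part = &page_hash[page_hash_part_no(space, offset)];
        page_node_t*            node;

        pthread_rwlock_rdlock(&part->latch);    /* shared latch for lookups */
        node = part->buckets[(space + offset) % N_BUCKETS];
        while (node && (node->space != space || node->offset != offset)) {
                node = node->next;
        }
        pthread_rwlock_unlock(&part->latch);

        return(node);   /* caller must revalidate, see the note after buf0buf.c */
}

In the patch, inserts and deletes (HASH_INSERT/HASH_DELETE) take the corresponding latch in exclusive mode, lookups take it in shared mode, and a looked-up block is then revalidated under its own block mutex.
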
=== modified file 'storage/innobase/buf/buf0buf.c'
--- a/storage/innobase/buf/buf0buf.c	2008-10-15 18:54:18 +0000
+++ b/storage/innobase/buf/buf0buf.c	2009-05-19 11:15:22 +0000
@@ -81,8 +81,8 @@ maybe every 10 microseconds. We gave up 
 for each control block, for instance, because it seemed to be
 complicated.
 
-A solution to reduce mutex contention of the buf_pool mutex is to
-create a separate mutex for the page hash table. On Pentium,
+To reduce contention on the buf_pool mutex we have created a
+separate array of rw-locks for the page hash tables. On Pentium,
 accessing the hash table takes 2 microseconds, about half
 of the total buf_pool mutex hold time.
 
@@ -595,6 +595,7 @@ buf_pool_init(
 	/* 1. Initialize general fields
 	---------------------------- */
 	mutex_create(&buf_pool->mutex, SYNC_BUF_POOL);
+        buf_page_hash_create_locks();
 
 	mutex_enter(&(buf_pool->mutex));
 
@@ -704,8 +705,7 @@ buf_pool_init(
 		}
 	}
 
-	buf_pool->page_hash = hash_create(2 * max_size);
-
+        buf_page_hash_create(max_size);
 	buf_pool->n_pend_reads = 0;
 
 	buf_pool->last_printout_time = time(NULL);
@@ -998,7 +998,7 @@ buf_page_peek_block(
 
 	mutex_enter_fast(&(buf_pool->mutex));
 
-	block = buf_page_hash_get(space, offset);
+	block = buf_page_hash_get(space, offset, FALSE);
 
 	mutex_exit(&(buf_pool->mutex));
 
@@ -1019,7 +1019,7 @@ buf_reset_check_index_page_at_flush(
 
 	mutex_enter_fast(&(buf_pool->mutex));
 
-	block = buf_page_hash_get(space, offset);
+	block = buf_page_hash_get(space, offset, FALSE);
 
 	if (block) {
 		block->check_index_page_at_flush = FALSE;
@@ -1046,7 +1046,7 @@ buf_page_peek_if_search_hashed(
 
 	mutex_enter_fast(&(buf_pool->mutex));
 
-	block = buf_page_hash_get(space, offset);
+	block = buf_page_hash_get(space, offset, FALSE);
 
 	if (!block) {
 		is_hashed = FALSE;
@@ -1098,7 +1098,7 @@ buf_page_set_file_page_was_freed(
 
 	mutex_enter_fast(&(buf_pool->mutex));
 
-	block = buf_page_hash_get(space, offset);
+	block = buf_page_hash_get(space, offset, FALSE);
 
 	if (block) {
 		block->file_page_was_freed = TRUE;
@@ -1127,7 +1127,7 @@ buf_page_reset_file_page_was_freed(
 
 	mutex_enter_fast(&(buf_pool->mutex));
 
-	block = buf_page_hash_get(space, offset);
+	block = buf_page_hash_get(space, offset, FALSE);
 
 	if (block) {
 		block->file_page_was_freed = FALSE;
@@ -1174,27 +1174,42 @@ buf_page_get_gen(
 	buf_pool->n_page_gets++;
 loop:
 	block = NULL;
-	mutex_enter_fast(&(buf_pool->mutex));
 
+        /* We have removed buf_pool->mutex here. I have verified that it is
+        safe to access the block members used below with only block->mutex:
+        offset, space, state, io_fix, buf_fix_count. Other functions call
+        buf_block_align without protection, so that should be fine too. */
 	if (guess) {
 		block = buf_block_align(guess);
 
+                mutex_enter(&block->mutex);
 		if ((offset != block->offset) || (space != block->space)
 		    || (block->state != BUF_BLOCK_FILE_PAGE)) {
 
+                        mutex_exit(&block->mutex);
 			block = NULL;
 		}
 	}
 
 	if (block == NULL) {
-		block = buf_page_hash_get(space, offset);
+		block = buf_page_hash_get(space, offset, FALSE);
+                if (block) {
+                        mutex_enter(&block->mutex);
+                        /* Verify block contains the data we want. It may have
+                        changed before acquiring block->mutex, because we don't
+                        lock buf_pool->mutex before buf_page_hash_get. */
+                        if (UNIV_UNLIKELY((offset != block->offset) ||
+                                      (space != block->space) ||
+                                      (block->state != BUF_BLOCK_FILE_PAGE))) {
+                                mutex_exit(&block->mutex);
+                                block = NULL;
+                        }
+                }
 	}
 
 	if (block == NULL) {
 		/* Page not in buf_pool: needs to be read from file */
 
-		mutex_exit(&(buf_pool->mutex));
-
 		if (mode == BUF_GET_IF_IN_POOL) {
 
 			return(NULL);
@@ -1212,7 +1227,7 @@ loop:
 		goto loop;
 	}
 
-	mutex_enter(&block->mutex);
+        /* Now we know that block != NULL and that we hold block->mutex. */
 
 	ut_a(block->state == BUF_BLOCK_FILE_PAGE);
 
@@ -1224,7 +1239,6 @@ loop:
 
 		if (mode == BUF_GET_IF_IN_POOL) {
 			/* The page is only being read to buffer */
-			mutex_exit(&buf_pool->mutex);
 			mutex_exit(&block->mutex);
 
 			return(NULL);
@@ -1237,11 +1251,18 @@ loop:
 	if (block->frame == NULL) {
 		ut_a(srv_use_awe);
 
+                /* TODO: Let buf_awe_map_page_to_frame do its own locking,
+                but this requires an overhaul of buf_flush_try_page */
+                mutex_exit(&(block->mutex));
+                mutex_enter(&(buf_pool->mutex));
+                mutex_enter(&(block->mutex));
+ 
 		/* We set second parameter TRUE because the block is in the
 		LRU list and we must put it to awe_LRU_free_mapped list once
 		mapped to a frame */
 
 		buf_awe_map_page_to_frame(block, TRUE);
+                mutex_exit(&(buf_pool->mutex));
 	}
 
 #ifdef UNIV_SYNC_DEBUG
@@ -1249,7 +1270,6 @@ loop:
 #else
 	buf_block_buf_fix_inc(block);
 #endif
-	mutex_exit(&buf_pool->mutex);
 
 	/* Check if this is the first access to the page */
 
@@ -1630,6 +1650,8 @@ buf_page_init(
 				in units of a page */
 	buf_block_t*	block)	/* in: block to init */
 {
+        ulint fold;
+        ulint latch_number;
 
 	ut_ad(mutex_own(&(buf_pool->mutex)));
 	ut_ad(mutex_own(&(block->mutex)));
@@ -1658,7 +1680,7 @@ buf_page_init(
 
 	/* Insert into the hash table of file pages */
 
-	if (buf_page_hash_get(space, offset)) {
+	if (buf_page_hash_get(space, offset, FALSE)) {
 		fprintf(stderr,
 			"InnoDB: Error: page %lu %lu already found"
 			" in the hash table\n",
@@ -1673,8 +1695,12 @@ buf_page_init(
 		ut_a(0);
 	}
 
-	HASH_INSERT(buf_block_t, hash, buf_pool->page_hash,
-		    buf_page_address_fold(space, offset), block);
+        fold = buf_page_address_fold(space, offset);
+        latch_number = buf_page_hash_table(space, offset);
+        rw_lock_x_lock(&(buf_pool->hash_latches[latch_number]));
+        HASH_INSERT(buf_block_t, hash, buf_pool->page_hash[latch_number],
+                    fold, block);
+        rw_lock_x_unlock(&(buf_pool->hash_latches[latch_number]));
 
 	block->freed_page_clock = 0;
 
@@ -1756,7 +1782,7 @@ buf_page_init_for_read(
 	}
 
 	if (*err == DB_TABLESPACE_DELETED
-	    || NULL != buf_page_hash_get(space, offset)) {
+	    || NULL != buf_page_hash_get(space, offset, FALSE)) {
 
 		/* The page belongs to a space which has been
 		deleted or is being deleted, or the page is
@@ -1832,7 +1858,7 @@ buf_page_create(
 
 	mutex_enter(&(buf_pool->mutex));
 
-	block = buf_page_hash_get(space, offset);
+	block = buf_page_hash_get(space, offset, FALSE);
 
 	if (block != NULL) {
 #ifdef UNIV_IBUF_DEBUG
@@ -2151,12 +2177,11 @@ buf_validate(void)
 
 		block = buf_pool_get_nth_block(buf_pool, i);
 
-		mutex_enter(&block->mutex);
-
 		if (block->state == BUF_BLOCK_FILE_PAGE) {
 
 			ut_a(buf_page_hash_get(block->space,
-					       block->offset) == block);
+					       block->offset,
+					       FALSE) == block);
 			n_page++;
 
 #ifdef UNIV_IBUF_DEBUG
@@ -2197,8 +2222,6 @@ buf_validate(void)
 		} else if (block->state == BUF_BLOCK_NOT_USED) {
 			n_free++;
 		}
-
-		mutex_exit(&block->mutex);
 	}
 
 	if (n_lru + n_free > buf_pool->curr_size) {
@@ -2385,16 +2408,13 @@ buf_get_modified_ratio_pct(void)
 {
 	ulint	ratio;
 
-	mutex_enter(&(buf_pool->mutex));
-
+        /* Unprotected reads of these buf_pool list lengths are acceptable here; the ratio is only a heuristic. */
 	ratio = (100 * UT_LIST_GET_LEN(buf_pool->flush_list))
 		/ (1 + UT_LIST_GET_LEN(buf_pool->LRU)
 		   + UT_LIST_GET_LEN(buf_pool->free));
 
 	/* 1 + is there to avoid division by zero */
 
-	mutex_exit(&(buf_pool->mutex));
-
 	return(ratio);
 }
 
@@ -2426,6 +2446,7 @@ buf_print_io(
 			(ulong)
 			UT_LIST_GET_LEN(buf_pool->awe_LRU_free_mapped));
 	}
+        if (file) {
 	fprintf(file,
 		"Buffer pool size   %lu\n"
 		"Free buffers       %lu\n"
@@ -2443,12 +2464,13 @@ buf_print_io(
 		(ulong) buf_pool->n_flush[BUF_FLUSH_LIST]
 		+ buf_pool->init_flush[BUF_FLUSH_LIST],
 		(ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]);
-
+        } /* if (file) */
 	current_time = time(NULL);
 	time_elapsed = 0.001 + difftime(current_time,
 					buf_pool->last_printout_time);
 	buf_pool->last_printout_time = current_time;
 
+        if (file) {
 	fprintf(file,
 		"Pages read %lu, created %lu, written %lu\n"
 		"%.2f reads/s, %.2f creates/s, %.2f writes/s\n",
@@ -2461,6 +2483,7 @@ buf_print_io(
 		/ time_elapsed,
 		(buf_pool->n_pages_written - buf_pool->n_pages_written_old)
 		/ time_elapsed);
+        } /* if (file) */
 
 	if (srv_use_awe) {
 		fprintf(file, "AWE: %.2f page remaps/s\n",
@@ -2470,15 +2493,18 @@ buf_print_io(
 	}
 
 	if (buf_pool->n_page_gets > buf_pool->n_page_gets_old) {
-		fprintf(file, "Buffer pool hit rate %lu / 1000\n",
-			(ulong)
-			(1000 - ((1000 * (buf_pool->n_pages_read
-					  - buf_pool->n_pages_read_old))
-				 / (buf_pool->n_page_gets
-				    - buf_pool->n_page_gets_old))));
+                ulong buf_pool_hit_per_k = (ulong) (1000 - ((1000 *
+                        (buf_pool->n_pages_read - buf_pool->n_pages_read_old))
+                        / (buf_pool->n_page_gets - buf_pool->n_page_gets_old)));
+                if (file) {
+                        fprintf(file, "Buffer pool hit rate %lu / 1000\n",
+                                buf_pool_hit_per_k);
+                } /* if (file) */
 	} else {
-		fputs("No buffer pool page gets since the last printout\n",
-		      file);
+                if (file) {
+                        fputs("No buffer pool page gets since the last printout\n",
+                              file);
+                } /* if (file) */
 	}
 
 	buf_pool->n_page_gets_old = buf_pool->n_page_gets;

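A note on the buf_page_get_gen() changes above: since the hash lookup no longer happens under buf_pool->mutex, the block returned by buf_page_hash_get() can be evicted and reused for another page before block->mutex is acquired, which is why the patch re-checks space, offset and state after locking. A condensed sketch of that pattern follows; it assumes the InnoDB declarations from buf0buf.h are in scope, and the helper name get_block_checked is hypothetical, not a function added by the patch.

/* Sketch of the lookup-then-revalidate pattern used in buf_page_get_gen(). */
static buf_block_t*
get_block_checked(ulint space, ulint offset)
{
        buf_block_t*    block = buf_page_hash_get(space, offset, FALSE);

        if (block == NULL) {
                return(NULL);           /* page is not in the buffer pool */
        }

        mutex_enter(&block->mutex);

        /* The block may have been evicted and reused between the hash
        lookup and acquiring block->mutex: verify that it still holds
        the page we asked for before trusting any of its fields. */
        if (UNIV_UNLIKELY((block->space != space)
                          || (block->offset != offset)
                          || (block->state != BUF_BLOCK_FILE_PAGE))) {
                mutex_exit(&block->mutex);
                return(NULL);           /* caller must retry or read the page */
        }

        return(block);                  /* returned with block->mutex held */
}
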
=== modified file 'storage/innobase/buf/buf0flu.c'
--- a/storage/innobase/buf/buf0flu.c	2008-02-01 10:55:39 +0000
+++ b/storage/innobase/buf/buf0flu.c	2009-05-19 11:15:22 +0000
@@ -546,22 +546,30 @@ buf_flush_try_page(
 	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST
 	      || flush_type == BUF_FLUSH_SINGLE_PAGE);
 
-	mutex_enter(&(buf_pool->mutex));
-
-	block = buf_page_hash_get(space, offset);
-
-	ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE);
+	block = buf_page_hash_get(space, offset, FALSE);
 
 	if (!block) {
-		mutex_exit(&(buf_pool->mutex));
 		return(0);
 	}
 
+        mutex_enter(&(buf_pool->mutex));
+ 
 	mutex_enter(&block->mutex);
 
-	if (flush_type == BUF_FLUSH_LIST
+        /* Verify block contains the data we want. It may have
+        changed before acquiring block->mutex, because we don't
+        lock buf_pool->mutex before buf_page_hash_get. */
+        if (UNIV_UNLIKELY((offset != block->offset) ||
+                           (space != block->space) ||
+                           (block->state != BUF_BLOCK_FILE_PAGE))) {
+                /* Block changed before we acquired block->mutex. Do not
+                try to flush. */
+   
+	} else if (flush_type == BUF_FLUSH_LIST
 	    && buf_flush_ready_for_flush(block, flush_type)) {
 
+                ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+ 
 		block->io_fix = BUF_IO_WRITE;
 
 		/* If AWE is enabled and the page is not mapped to a frame,
@@ -630,6 +638,8 @@ buf_flush_try_page(
 		the page not to be bufferfixed (in function
 		..._ready_for_flush). */
 
+                ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+
 		block->io_fix = BUF_IO_WRITE;
 
 		/* If AWE is enabled and the page is not mapped to a frame,
@@ -670,6 +680,8 @@ buf_flush_try_page(
 	} else if (flush_type == BUF_FLUSH_SINGLE_PAGE
 		   && buf_flush_ready_for_flush(block, flush_type)) {
 
+                ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+
 		block->io_fix = BUF_IO_WRITE;
 
 		/* If AWE is enabled and the page is not mapped to a frame,
@@ -759,7 +771,7 @@ buf_flush_try_neighbors(
 
 	for (i = low; i < high; i++) {
 
-		block = buf_page_hash_get(space, i);
+		block = buf_page_hash_get(space, i, FALSE);
 		ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE);
 
 		if (!block) {

=== modified file 'storage/innobase/buf/buf0lru.c'
--- a/storage/innobase/buf/buf0lru.c	2008-12-14 20:47:17 +0000
+++ b/storage/innobase/buf/buf0lru.c	2009-05-19 11:15:22 +0000
@@ -1029,6 +1029,9 @@ buf_LRU_block_remove_hashed_page(
 				be in a state where it can be freed; there
 				may or may not be a hash index to the page */
 {
+        ulint           fold;
+        ulint           latch_number;
+
 	ut_ad(mutex_own(&(buf_pool->mutex)));
 	ut_ad(mutex_own(&block->mutex));
 	ut_ad(block);
@@ -1046,22 +1049,22 @@ buf_LRU_block_remove_hashed_page(
 
 	buf_block_modify_clock_inc(block);
 
-	if (block != buf_page_hash_get(block->space, block->offset)) {
+	if (block != buf_page_hash_get(block->space, block->offset, FALSE)) {
 		fprintf(stderr,
 			"InnoDB: Error: page %lu %lu not found"
 			" in the hash table\n",
 			(ulong) block->space,
 			(ulong) block->offset);
-		if (buf_page_hash_get(block->space, block->offset)) {
+		if (buf_page_hash_get(block->space, block->offset, FALSE)) {
 			fprintf(stderr,
 				"InnoDB: In hash table we find block"
 				" %p of %lu %lu which is not %p\n",
 				(void*) buf_page_hash_get
-				(block->space, block->offset),
+				(block->space, block->offset, FALSE),
 				(ulong) buf_page_hash_get
-				(block->space, block->offset)->space,
+				(block->space, block->offset, FALSE)->space,
 				(ulong) buf_page_hash_get
-				(block->space, block->offset)->offset,
+				(block->space, block->offset, FALSE)->offset,
 				(void*) block);
 		}
 
@@ -1073,10 +1076,12 @@ buf_LRU_block_remove_hashed_page(
 #endif
 		ut_a(0);
 	}
-
-	HASH_DELETE(buf_block_t, hash, buf_pool->page_hash,
-		    buf_page_address_fold(block->space, block->offset),
-		    block);
+        fold = buf_page_address_fold(block->space, block->offset);
+        latch_number = buf_page_hash_table(block->space, block->offset);
+        rw_lock_x_lock(&(buf_pool->hash_latches[latch_number]));
+        HASH_DELETE(buf_block_t, hash, buf_pool->page_hash[latch_number],
+                    fold, block);
+        rw_lock_x_unlock(&(buf_pool->hash_latches[latch_number]));
 
 	UNIV_MEM_INVALID(block->frame, UNIV_PAGE_SIZE);
 	block->state = BUF_BLOCK_REMOVE_HASH;
@@ -1197,6 +1202,7 @@ buf_LRU_print(void)
 
 	while (block != NULL) {
 
+	        mutex_enter(&block->mutex);
 		fprintf(stderr, "BLOCK %lu ", (ulong) block->offset);
 
 		if (block->old) {
@@ -1225,6 +1231,8 @@ buf_LRU_print(void)
 			(ulong) ut_dulint_get_low
 			(btr_page_get_index_id(frame)));
 
+	        mutex_exit(&block->mutex);
+
 		block = UT_LIST_GET_NEXT(LRU, block);
 		if (++len == 10) {
 			len = 0;

=== modified file 'storage/innobase/buf/buf0rea.c'
--- a/storage/innobase/buf/buf0rea.c	2006-09-21 07:39:09 +0000
+++ b/storage/innobase/buf/buf0rea.c	2009-05-19 11:15:22 +0000
@@ -175,6 +175,7 @@ buf_read_ahead_random(
 	ulint		low, high;
 	ulint		err;
 	ulint		i;
+	ulint		latch_number;
 
 	if (srv_startup_is_before_trx_rollback_phase) {
 		/* No read-ahead to avoid thread deadlocks */
@@ -223,8 +224,10 @@ buf_read_ahead_random(
 	/* Count how many blocks in the area have been recently accessed,
 	that is, reside near the start of the LRU list. */
 
+        latch_number = buf_page_hash_table(space, low);
+        rw_lock_s_lock(&(buf_pool->hash_latches[latch_number]));
 	for (i = low; i < high; i++) {
-		block = buf_page_hash_get(space, i);
+		block = buf_page_hash_get(space, i, TRUE);
 
 		if ((block)
 		    && (block->LRU_position > LRU_recent_limit)
@@ -233,6 +236,7 @@ buf_read_ahead_random(
 			recent_blocks++;
 		}
 	}
+        rw_lock_s_unlock(&(buf_pool->hash_latches[latch_number]));
 
 	mutex_exit(&(buf_pool->mutex));
 
@@ -385,6 +389,7 @@ buf_read_ahead_linear(
 	ulint		low, high;
 	ulint		err;
 	ulint		i;
+	ulint		latch_number;
 
 	if (srv_startup_is_before_trx_rollback_phase) {
 		/* No read-ahead to avoid thread deadlocks */
@@ -445,8 +450,10 @@ buf_read_ahead_linear(
 
 	fail_count = 0;
 
+        latch_number = buf_page_hash_table(space, low);
+        rw_lock_s_lock(&(buf_pool->hash_latches[latch_number]));
 	for (i = low; i < high; i++) {
-		block = buf_page_hash_get(space, i);
+		block = buf_page_hash_get(space, i, TRUE);
 
 		if ((block == NULL) || !block->accessed) {
 			/* Not accessed */
@@ -462,6 +469,7 @@ buf_read_ahead_linear(
 			pred_block = block;
 		}
 	}
+        rw_lock_s_unlock(&(buf_pool->hash_latches[latch_number]));
 
 	if (fail_count > BUF_READ_AHEAD_LINEAR_AREA
 	    - BUF_READ_AHEAD_LINEAR_THRESHOLD) {
@@ -475,7 +483,7 @@ buf_read_ahead_linear(
 	/* If we got this far, we know that enough pages in the area have
 	been accessed in the right order: linear read-ahead can be sensible */
 
-	block = buf_page_hash_get(space, offset);
+	block = buf_page_hash_get(space, offset, FALSE);
 
 	if (block == NULL) {
 		mutex_exit(&(buf_pool->mutex));

=== modified file 'storage/innobase/include/buf0buf.h'
--- a/storage/innobase/include/buf0buf.h	2008-08-20 00:37:41 +0000
+++ b/storage/innobase/include/buf0buf.h	2009-05-19 11:15:22 +0000
@@ -690,7 +690,25 @@ buf_page_hash_get(
 /*==============*/
 			/* out: block, NULL if not found */
 	ulint	space,	/* in: space id */
-	ulint	offset);/* in: offset of the page within space */
+	ulint	offset, /* in: offset of the page within space */
+        ibool   hold_lock);/* in: TRUE if the caller already holds the page hash RW-lock */
+/***********************************************************************
+Creates the page hash tables. */
+UNIV_INLINE
+void
+buf_page_hash_create(ulint max_size);
+/***********************************************************************
+Creates the page hash RW-locks. */
+UNIV_INLINE
+void
+buf_page_hash_create_locks(void);
+/***********************************************************************
+Calculates which page hash table (and latch) to use for a page. */
+UNIV_INLINE
+ulint
+buf_page_hash_table(
+       ulint   space,  /* in: space id */
+       ulint   offset);/* in: offset of the page within space */
 /***********************************************************************
 Increments the pool clock by one and returns its new value. Remember that
 in the 32 bit version the clock wraps around at 4 billion! */
@@ -733,8 +751,10 @@ struct buf_block_struct{
 					UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE
 					(normally = 4) physical memory
 					pages; otherwise NULL */
-	ulint		space;		/* space id of the page */
-	ulint		offset;		/* page number within the space */
+        ulint           space;          /* space id of the page;
+                                         protected by block->mutex */
+        ulint           offset;         /* page number within the space;
+                                         protected by block->mutex */
 	ulint		lock_hash_val;	/* hashed value of the page address
 					in the record lock hash table */
 	mutex_t		mutex;		/* mutex protecting this block:
@@ -884,6 +904,11 @@ struct buf_block_struct{
 
 #define BUF_BLOCK_MAGIC_N	41526563
 
+/* Number of page hash latches, that is, how many different latches
+   we use to protect the page hash tables. Must be a power of two
+   (2**n) so that the latch can be selected with a fast bitwise AND. */
+#define NUM_PAGE_HASH_LATCHES 16
+
 /* The buffer pool structure. NOTE! The definition appears here only for
 other modules of this directory (buf) to see it. Do not use from outside! */
 
@@ -925,7 +950,11 @@ struct buf_pool_struct{
 	ulint		curr_size;	/* current pool size in pages;
 					currently always the same as
 					max_size */
-	hash_table_t*	page_hash;	/* hash table of the file pages */
+        hash_table_t*   page_hash[NUM_PAGE_HASH_LATCHES];
+                                        /* hash tables of the file pages */
+        rw_lock_t       hash_latches[NUM_PAGE_HASH_LATCHES];
+                                        /* read-write latches, one protecting
+                                         each of the page hash tables above */
 
 	ulint		n_pend_reads;	/* number of pending read operations */
 

=== modified file 'storage/innobase/include/buf0buf.ic'
--- a/storage/innobase/include/buf0buf.ic	2008-10-15 18:54:18 +0000
+++ b/storage/innobase/include/buf0buf.ic	2009-05-19 11:15:22 +0000
@@ -421,7 +421,7 @@ buf_frame_get_newest_modification(
 
 	block = buf_block_align(frame);
 
-	mutex_enter(&(buf_pool->mutex));
+	mutex_enter(&block->mutex);
 
 	if (block->state == BUF_BLOCK_FILE_PAGE) {
 		lsn = block->newest_modification;
@@ -429,7 +429,7 @@ buf_frame_get_newest_modification(
 		lsn = ut_dulint_zero;
 	}
 
-	mutex_exit(&(buf_pool->mutex));
+	mutex_exit(&block->mutex);
 
 	return(lsn);
 }
@@ -533,6 +533,27 @@ buf_block_buf_fix_inc(
 	block->buf_fix_count++;
 }
 #endif /* UNIV_SYNC_DEBUG */
+/************************************************************************
+Calculates which page hash table (and latch) to use for a page. */
+#define LOG_BUF_READ_AHEAD_AREA 6
+UNIV_INLINE
+ulint
+buf_page_hash_table(
+       ulint   space,  /* in: space id */
+       ulint   offset) /* in: offset of the page within space */
+{
+        ulint ignored_offset, fold, hash_table;
+        /* Calculate the proper page hash to use for this space and offset.
+           We ignore the first six bits of the offset so that all pages
+           within a 1 MByte area map to the same page hash table; thus the
+           neighbour-flushing and read-ahead algorithms only need to take
+           a single hash latch. */
+        ignored_offset = offset >> LOG_BUF_READ_AHEAD_AREA;
+        fold = buf_page_address_fold(space, ignored_offset);
+        hash_table = fold & (NUM_PAGE_HASH_LATCHES - 1);
+        return(hash_table);
+}
+
 /**********************************************************************
 Returns the control block of a file page, NULL if not found. */
 UNIV_INLINE
@@ -541,26 +562,60 @@ buf_page_hash_get(
 /*==============*/
 			/* out: block, NULL if not found */
 	ulint	space,	/* in: space id */
-	ulint	offset)	/* in: offset of the page within space */
+	ulint	offset,	/* in: offset of the page within space */
+        ibool   hold_lock) /* in: TRUE if the caller already holds the page hash RW-lock */
 {
 	buf_block_t*	block;
 	ulint		fold;
+	ulint		latch_number;
 
 	ut_ad(buf_pool);
-	ut_ad(mutex_own(&(buf_pool->mutex)));
 
 	/* Look for the page in the hash table */
 
 	fold = buf_page_address_fold(space, offset);
 
-	HASH_SEARCH(hash, buf_pool->page_hash, fold, block,
+        latch_number = buf_page_hash_table(space, offset);
+        if (!hold_lock)
+          rw_lock_s_lock(&buf_pool->hash_latches[latch_number]);
+	HASH_SEARCH(hash, buf_pool->page_hash[latch_number], fold, block,
 		    (block->space == space) && (block->offset == offset));
+        if (!hold_lock)
+          rw_lock_s_unlock(&buf_pool->hash_latches[latch_number]);
 	ut_a(block == NULL || block->state == BUF_BLOCK_FILE_PAGE);
 
 	return(block);
 }
 
 /************************************************************************
+Creates the page hash tables. */
+UNIV_INLINE
+void
+buf_page_hash_create(ulint max_size)
+{
+        ulint i;
+        for (i = 0; i < NUM_PAGE_HASH_LATCHES; i++)
+        {
+                buf_pool->page_hash[i] = hash_create((2 * max_size) /
+                                         NUM_PAGE_HASH_LATCHES);
+        }
+}
+
+/************************************************************************
+Creates the page hash RW-locks. */
+UNIV_INLINE
+void
+buf_page_hash_create_locks(void)
+{
+        ulint i;
+        for (i = 0; i < NUM_PAGE_HASH_LATCHES; i++)
+        {
+          rw_lock_create(&(buf_pool->hash_latches[i]),
+                         SYNC_NO_ORDER_CHECK);
+        }
+}
+
+/************************************************************************
 Tries to get the page, but if file io is required, releases all latches
 in mtr down to the given savepoint. If io is required, this function
 retrieves the page to buffer buf_pool, but does not bufferfix it or latch

=== modified file 'storage/innobase/include/sync0sync.h'
--- a/storage/innobase/include/sync0sync.h	2008-12-04 10:57:56 +0000
+++ b/storage/innobase/include/sync0sync.h	2009-05-19 11:15:22 +0000
@@ -454,6 +454,7 @@ or row lock! */
 					the level is SYNC_MEM_HASH. */
 #define	SYNC_BUF_POOL		150
 #define	SYNC_BUF_BLOCK		149
+#define SYNC_BUF_PAGE_HASH      143
 #define SYNC_DOUBLEWRITE	140
 #define	SYNC_ANY_LATCH		135
 #define SYNC_THR_LOCAL		133

=== modified file 'storage/innobase/sync/sync0sync.c'
--- a/storage/innobase/sync/sync0sync.c	2008-10-30 09:23:36 +0000
+++ b/storage/innobase/sync/sync0sync.c	2009-05-19 11:15:22 +0000
@@ -1107,6 +1107,9 @@ sync_thread_add_level(
 	case SYNC_BUF_POOL:
 		ut_a(sync_thread_levels_g(array, SYNC_BUF_POOL));
 		break;
+	case SYNC_BUF_PAGE_HASH:
+		ut_a(sync_thread_levels_g(array, SYNC_BUF_PAGE_HASH));
+		break;
 	case SYNC_SEARCH_SYS:
 		ut_a(sync_thread_levels_g(array, SYNC_SEARCH_SYS));
 		break;
