#At file:///home/mikael/mysql_clones/buf_page_hash_mutex/
2847 Mikael Ronstrom 2009-05-09
Split out buffer page hash access from buffer pool mutex
modified:
storage/innobase/buf/buf0buf.c
storage/innobase/buf/buf0flu.c
storage/innobase/buf/buf0lru.c
storage/innobase/include/buf0buf.h
storage/innobase/include/buf0buf.ic
storage/innobase/include/sync0sync.h
storage/innobase/sync/sync0sync.c
=== modified file 'storage/innobase/buf/buf0buf.c'
--- a/storage/innobase/buf/buf0buf.c 2008-10-15 18:54:18 +0000
+++ b/storage/innobase/buf/buf0buf.c 2009-05-09 15:32:13 +0000
@@ -37,6 +37,7 @@ Created 11/5/1995 Heikki Tuuri
#include "log0log.h"
#include "trx0undo.h"
#include "srv0srv.h"
+#include "ha0ha.h"
/*
IMPLEMENTATION OF THE BUFFER POOL
@@ -81,8 +82,8 @@ maybe every 10 microseconds. We gave up
for each control block, for instance, because it seemed to be
complicated.
-A solution to reduce mutex contention of the buf_pool mutex is to
-create a separate mutex for the page hash table. On Pentium,
+To reduce mutex contention of the buf_pool mutex we have created
+a separate array of mutexes for the page hash table. On Pentium,
accessing the hash table takes 2 microseconds, about half
of the total buf_pool mutex hold time.
@@ -704,8 +705,8 @@ buf_pool_init(
}
}
- buf_pool->page_hash = hash_create(2 * max_size);
-
+ buf_pool->page_hash = ha_create(FALSE, 2 * max_size, 64,
+ SYNC_BUF_PAGE_HASH);
buf_pool->n_pend_reads = 0;
buf_pool->last_printout_time = time(NULL);
@@ -1174,27 +1175,42 @@ buf_page_get_gen(
buf_pool->n_page_gets++;
loop:
block = NULL;
- mutex_enter_fast(&(buf_pool->mutex));
+ /* We have removed buf_pool->mutex here. I have verified it is safe
+ to access the following block members below with only block->mutex:
+ offset, space, state, io_fix, buf_fix_count. Other functions call
+ buf_block_align without protection, so that should be fine too. */
if (guess) {
block = buf_block_align(guess);
+ mutex_enter(&block->mutex);
if ((offset != block->offset) || (space != block->space)
|| (block->state != BUF_BLOCK_FILE_PAGE)) {
block = NULL;
+ mutex_exit(&block->mutex);
}
}
if (block == NULL) {
block = buf_page_hash_get(space, offset);
+ if(block) {
+ mutex_enter(&block->mutex);
+ /* Verify block contains the data we want. It may have
+ changed before acquiring block->mutex, because we don't
+ lock buf_pool->mutex before buf_page_hash_get. */
+ if (UNIV_UNLIKELY((offset != block->offset) ||
+ (space != block->space) ||
+ (block->state != BUF_BLOCK_FILE_PAGE))) {
+ mutex_exit(&block->mutex);
+ block = NULL;
+ }
+ }
}
if (block == NULL) {
/* Page not in buf_pool: needs to be read from file */
- mutex_exit(&(buf_pool->mutex));
-
if (mode == BUF_GET_IF_IN_POOL) {
return(NULL);
@@ -1212,7 +1228,7 @@ loop:
goto loop;
}
- mutex_enter(&block->mutex);
+ /* Now we know block is not null, and we hold block->mutex */
ut_a(block->state == BUF_BLOCK_FILE_PAGE);
@@ -1224,7 +1240,6 @@ loop:
if (mode == BUF_GET_IF_IN_POOL) {
/* The page is only being read to buffer */
- mutex_exit(&buf_pool->mutex);
mutex_exit(&block->mutex);
return(NULL);
@@ -1237,11 +1252,18 @@ loop:
if (block->frame == NULL) {
ut_a(srv_use_awe);
+ /* TODO: Let buf_awe_map_page_to_frame do its own locking,
+ but this requires an overhaul to buf_flush_try_page */
+ mutex_exit(&(block->mutex));
+ mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&(block->mutex));
+
/* We set second parameter TRUE because the block is in the
LRU list and we must put it to awe_LRU_free_mapped list once
mapped to a frame */
buf_awe_map_page_to_frame(block, TRUE);
+ mutex_exit(&(buf_pool->mutex));
}
#ifdef UNIV_SYNC_DEBUG
@@ -1249,7 +1271,6 @@ loop:
#else
buf_block_buf_fix_inc(block);
#endif
- mutex_exit(&buf_pool->mutex);
/* Check if this is the first access to the page */
@@ -1630,6 +1651,7 @@ buf_page_init(
in units of a page */
buf_block_t* block) /* in: block to init */
{
+ ulint fold;
ut_ad(mutex_own(&(buf_pool->mutex)));
ut_ad(mutex_own(&(block->mutex)));
@@ -1673,8 +1695,10 @@ buf_page_init(
ut_a(0);
}
- HASH_INSERT(buf_block_t, hash, buf_pool->page_hash,
- buf_page_address_fold(space, offset), block);
+ fold = buf_page_address_fold(space, offset);
+ hash_mutex_enter(buf_pool->page_hash, fold);
+ HASH_INSERT(buf_block_t, hash, buf_pool->page_hash, fold, block);
+ hash_mutex_exit(buf_pool->page_hash, fold);
block->freed_page_clock = 0;
@@ -2151,8 +2175,6 @@ buf_validate(void)
block = buf_pool_get_nth_block(buf_pool, i);
- mutex_enter(&block->mutex);
-
if (block->state == BUF_BLOCK_FILE_PAGE) {
ut_a(buf_page_hash_get(block->space,
@@ -2197,8 +2219,6 @@ buf_validate(void)
} else if (block->state == BUF_BLOCK_NOT_USED) {
n_free++;
}
-
- mutex_exit(&block->mutex);
}
if (n_lru + n_free > buf_pool->curr_size) {
@@ -2385,16 +2405,13 @@ buf_get_modified_ratio_pct(void)
{
ulint ratio;
- mutex_enter(&(buf_pool->mutex));
-
+ /* Unprotected reads of buf_pool variables should be okay here. */
ratio = (100 * UT_LIST_GET_LEN(buf_pool->flush_list))
/ (1 + UT_LIST_GET_LEN(buf_pool->LRU)
+ UT_LIST_GET_LEN(buf_pool->free));
/* 1 + is there to avoid division by zero */
- mutex_exit(&(buf_pool->mutex));
-
return(ratio);
}
@@ -2426,6 +2443,7 @@ buf_print_io(
(ulong)
UT_LIST_GET_LEN(buf_pool->awe_LRU_free_mapped));
}
+ if (file) {
fprintf(file,
"Buffer pool size %lu\n"
"Free buffers %lu\n"
@@ -2443,12 +2461,13 @@ buf_print_io(
(ulong) buf_pool->n_flush[BUF_FLUSH_LIST]
+ buf_pool->init_flush[BUF_FLUSH_LIST],
(ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]);
-
+ } // if (file)
current_time = time(NULL);
time_elapsed = 0.001 + difftime(current_time,
buf_pool->last_printout_time);
buf_pool->last_printout_time = current_time;
+ if (file) {
fprintf(file,
"Pages read %lu, created %lu, written %lu\n"
"%.2f reads/s, %.2f creates/s, %.2f writes/s\n",
@@ -2461,6 +2480,7 @@ buf_print_io(
/ time_elapsed,
(buf_pool->n_pages_written - buf_pool->n_pages_written_old)
/ time_elapsed);
+ } // if (file)
if (srv_use_awe) {
fprintf(file, "AWE: %.2f page remaps/s\n",
@@ -2470,15 +2490,18 @@ buf_print_io(
}
if (buf_pool->n_page_gets > buf_pool->n_page_gets_old) {
- fprintf(file, "Buffer pool hit rate %lu / 1000\n",
- (ulong)
- (1000 - ((1000 * (buf_pool->n_pages_read
- - buf_pool->n_pages_read_old))
- / (buf_pool->n_page_gets
- - buf_pool->n_page_gets_old))));
+ ulong buf_pool_hit_per_k = (ulong) (1000 - ((1000 *
+ (buf_pool->n_pages_read - buf_pool->n_pages_read_old))
+ / (buf_pool->n_page_gets - buf_pool->n_page_gets_old)));
+ if (file) {
+ fprintf(file, "Buffer pool hit rate %lu / 1000\n",
+ buf_pool_hit_per_k);
+ } // if (file)
} else {
- fputs("No buffer pool page gets since the last printout\n",
- file);
+ if (file) {
+ fputs("No buffer pool page gets since the last printout\n",
+ file);
+ } // if (file)
}
buf_pool->n_page_gets_old = buf_pool->n_page_gets;
=== modified file 'storage/innobase/buf/buf0flu.c'
--- a/storage/innobase/buf/buf0flu.c 2008-02-01 10:55:39 +0000
+++ b/storage/innobase/buf/buf0flu.c 2009-05-09 15:32:13 +0000
@@ -546,22 +546,30 @@ buf_flush_try_page(
ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST
|| flush_type == BUF_FLUSH_SINGLE_PAGE);
- mutex_enter(&(buf_pool->mutex));
-
block = buf_page_hash_get(space, offset);
- ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE);
-
if (!block) {
- mutex_exit(&(buf_pool->mutex));
return(0);
}
+ mutex_enter(&(buf_pool->mutex));
+
mutex_enter(&block->mutex);
- if (flush_type == BUF_FLUSH_LIST
+ /* Verify block contains the data we want. It may have
+ changed before acquiring block->mutex, because we don't
+ lock buf_pool->mutex before buf_page_hash_get. */
+ if (UNIV_UNLIKELY((offset != block->offset) ||
+ (space != block->space) ||
+ (block->state != BUF_BLOCK_FILE_PAGE))) {
+ /* Block changed before we acquired block->mutex. Do not
+ try to flush. */
+
+ } else if (flush_type == BUF_FLUSH_LIST
&& buf_flush_ready_for_flush(block, flush_type)) {
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+
block->io_fix = BUF_IO_WRITE;
/* If AWE is enabled and the page is not mapped to a frame,
@@ -630,6 +638,8 @@ buf_flush_try_page(
the page not to be bufferfixed (in function
..._ready_for_flush). */
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+
block->io_fix = BUF_IO_WRITE;
/* If AWE is enabled and the page is not mapped to a frame,
@@ -670,6 +680,8 @@ buf_flush_try_page(
} else if (flush_type == BUF_FLUSH_SINGLE_PAGE
&& buf_flush_ready_for_flush(block, flush_type)) {
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+
block->io_fix = BUF_IO_WRITE;
/* If AWE is enabled and the page is not mapped to a frame,
=== modified file 'storage/innobase/buf/buf0lru.c'
--- a/storage/innobase/buf/buf0lru.c 2008-12-14 20:47:17 +0000
+++ b/storage/innobase/buf/buf0lru.c 2009-05-09 15:32:13 +0000
@@ -1029,6 +1029,8 @@ buf_LRU_block_remove_hashed_page(
be in a state where it can be freed; there
may or may not be a hash index to the page */
{
+ ulint fold;
+
ut_ad(mutex_own(&(buf_pool->mutex)));
ut_ad(mutex_own(&block->mutex));
ut_ad(block);
@@ -1073,10 +1075,10 @@ buf_LRU_block_remove_hashed_page(
#endif
ut_a(0);
}
-
- HASH_DELETE(buf_block_t, hash, buf_pool->page_hash,
- buf_page_address_fold(block->space, block->offset),
- block);
+ fold = buf_page_address_fold(block->space, block->offset);
+ hash_mutex_enter(buf_pool->page_hash, fold);
+ HASH_DELETE(buf_block_t, hash, buf_pool->page_hash, fold, block);
+ hash_mutex_exit(buf_pool->page_hash, fold);
UNIV_MEM_INVALID(block->frame, UNIV_PAGE_SIZE);
block->state = BUF_BLOCK_REMOVE_HASH;
@@ -1197,6 +1199,7 @@ buf_LRU_print(void)
while (block != NULL) {
+ mutex_enter(&block->mutex);
fprintf(stderr, "BLOCK %lu ", (ulong) block->offset);
if (block->old) {
@@ -1225,6 +1228,8 @@ buf_LRU_print(void)
(ulong) ut_dulint_get_low
(btr_page_get_index_id(frame)));
+ mutex_exit(&block->mutex);
+
block = UT_LIST_GET_NEXT(LRU, block);
if (++len == 10) {
len = 0;
=== modified file 'storage/innobase/include/buf0buf.h'
--- a/storage/innobase/include/buf0buf.h 2008-08-20 00:37:41 +0000
+++ b/storage/innobase/include/buf0buf.h 2009-05-09 15:32:13 +0000
@@ -733,8 +733,10 @@ struct buf_block_struct{
UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE
(normally = 4) physical memory
pages; otherwise NULL */
- ulint space; /* space id of the page */
- ulint offset; /* page number within the space */
+ ulint space; /* space id of the page.
+ protected by block->mutex.*/
+ ulint offset; /* page number within the space.
+ protected by block->mutex.*/
ulint lock_hash_val; /* hashed value of the page address
in the record lock hash table */
mutex_t mutex; /* mutex protecting this block:
@@ -925,7 +927,10 @@ struct buf_pool_struct{
ulint curr_size; /* current pool size in pages;
currently always the same as
max_size */
- hash_table_t* page_hash; /* hash table of the file pages */
+ hash_table_t* page_hash; /* hash table of the file pages.
+ We protect this hash table with an
+ array of mutexes, using the ha_create
+ and hash_mutex_enter/exit functions. */
ulint n_pend_reads; /* number of pending read operations */
=== modified file 'storage/innobase/include/buf0buf.ic'
--- a/storage/innobase/include/buf0buf.ic 2008-10-15 18:54:18 +0000
+++ b/storage/innobase/include/buf0buf.ic 2009-05-09 15:32:13 +0000
@@ -421,7 +421,7 @@ buf_frame_get_newest_modification(
block = buf_block_align(frame);
- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&block->mutex);
if (block->state == BUF_BLOCK_FILE_PAGE) {
lsn = block->newest_modification;
@@ -429,7 +429,7 @@ buf_frame_get_newest_modification(
lsn = ut_dulint_zero;
}
- mutex_exit(&(buf_pool->mutex));
+ mutex_exit(&block->mutex);
return(lsn);
}
@@ -547,14 +547,15 @@ buf_page_hash_get(
ulint fold;
ut_ad(buf_pool);
- ut_ad(mutex_own(&(buf_pool->mutex)));
/* Look for the page in the hash table */
fold = buf_page_address_fold(space, offset);
+ hash_mutex_enter(buf_pool->page_hash, fold);
HASH_SEARCH(hash, buf_pool->page_hash, fold, block,
(block->space == space) && (block->offset == offset));
+ hash_mutex_exit(buf_pool->page_hash, fold);
ut_a(block == NULL || block->state == BUF_BLOCK_FILE_PAGE);
return(block);
=== modified file 'storage/innobase/include/sync0sync.h'
--- a/storage/innobase/include/sync0sync.h 2008-12-04 10:57:56 +0000
+++ b/storage/innobase/include/sync0sync.h 2009-05-09 15:32:13 +0000
@@ -454,6 +454,7 @@ or row lock! */
the level is SYNC_MEM_HASH. */
#define SYNC_BUF_POOL 150
#define SYNC_BUF_BLOCK 149
+#define SYNC_BUF_PAGE_HASH 143
#define SYNC_DOUBLEWRITE 140
#define SYNC_ANY_LATCH 135
#define SYNC_THR_LOCAL 133
=== modified file 'storage/innobase/sync/sync0sync.c'
--- a/storage/innobase/sync/sync0sync.c 2008-10-30 09:23:36 +0000
+++ b/storage/innobase/sync/sync0sync.c 2009-05-09 15:32:13 +0000
@@ -1107,6 +1107,9 @@ sync_thread_add_level(
case SYNC_BUF_POOL:
ut_a(sync_thread_levels_g(array, SYNC_BUF_POOL));
break;
+ case SYNC_BUF_PAGE_HASH:
+ ut_a(sync_thread_levels_g(array, SYNC_BUF_PAGE_HASH));
+ break;
case SYNC_SEARCH_SYS:
ut_a(sync_thread_levels_g(array, SYNC_SEARCH_SYS));
break;
| Thread |
|---|
| • bzr commit into mysql-5.1 branch (mikael:2847) | Mikael Ronstrom | 11 May |