From: Date: May 12 2009 8:09pm Subject: bzr commit into mysql-5.1 branch (mikael:2847) List-Archive: http://lists.mysql.com/commits/73859 Message-Id: <200905121809.n4CI9rB7029884@dator6.liljeholmen.mysql.com> MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit #At file:///home/mikael/mysql_clones/buf_page_hash_mutex_split/ 2847 Mikael Ronstrom 2009-05-12 Split out buffer page hash access from buffer pool mutex based on Foogle v3 patch modified: storage/innobase/buf/buf0buf.c storage/innobase/buf/buf0flu.c storage/innobase/buf/buf0lru.c storage/innobase/include/buf0buf.h storage/innobase/include/buf0buf.ic storage/innobase/include/sync0sync.h storage/innobase/sync/sync0sync.c === modified file 'storage/innobase/buf/buf0buf.c' --- a/storage/innobase/buf/buf0buf.c 2008-10-15 18:54:18 +0000 +++ b/storage/innobase/buf/buf0buf.c 2009-05-12 18:09:48 +0000 @@ -37,6 +37,7 @@ Created 11/5/1995 Heikki Tuuri #include "log0log.h" #include "trx0undo.h" #include "srv0srv.h" +#include "ha0ha.h" /* IMPLEMENTATION OF THE BUFFER POOL @@ -81,8 +82,8 @@ maybe every 10 microseconds. We gave up for each control block, for instance, because it seemed to be complicated. -A solution to reduce mutex contention of the buf_pool mutex is to -create a separate mutex for the page hash table. On Pentium, +To reduce mutex contention of the buf_pool mutex we have created +a separate array of mutexes for the page hash table. On Pentium, accessing the hash table takes 2 microseconds, about half of the total buf_pool mutex hold time. @@ -704,8 +705,8 @@ buf_pool_init( } } - buf_pool->page_hash = hash_create(2 * max_size); - + buf_pool->page_hash = ha_create(FALSE, 2 * max_size, 64, + SYNC_BUF_PAGE_HASH); buf_pool->n_pend_reads = 0; buf_pool->last_printout_time = time(NULL); @@ -1174,27 +1175,42 @@ buf_page_get_gen( buf_pool->n_page_gets++; loop: block = NULL; - mutex_enter_fast(&(buf_pool->mutex)); + /* We have removed buf_pool->mutex here. I have verified it is safe + to access the following block members below with only block->mutex: + offset, space, state, io_fix, buf_fix_count. Other functions call + buf_block_align without protection, so that should be fine too. */ if (guess) { block = buf_block_align(guess); + mutex_enter(&block->mutex); if ((offset != block->offset) || (space != block->space) || (block->state != BUF_BLOCK_FILE_PAGE)) { + mutex_exit(&block->mutex); block = NULL; } } if (block == NULL) { block = buf_page_hash_get(space, offset); + if(block) { + mutex_enter(&block->mutex); + /* Verify block contains the data we want. It may have + changed before acquiring block->mutex, because we don't + lock buf_pool->mutex before buf_page_hash_get. */ + if (UNIV_UNLIKELY((offset != block->offset) || + (space != block->space) || + (block->state != BUF_BLOCK_FILE_PAGE))) { + mutex_exit(&block->mutex); + block = NULL; + } + } } if (block == NULL) { /* Page not in buf_pool: needs to be read from file */ - mutex_exit(&(buf_pool->mutex)); - if (mode == BUF_GET_IF_IN_POOL) { return(NULL); @@ -1212,7 +1228,7 @@ loop: goto loop; } - mutex_enter(&block->mutex); + /* Now we know block is not null, and we hold block->mutex */ ut_a(block->state == BUF_BLOCK_FILE_PAGE); @@ -1224,7 +1240,6 @@ loop: if (mode == BUF_GET_IF_IN_POOL) { /* The page is only being read to buffer */ - mutex_exit(&buf_pool->mutex); mutex_exit(&block->mutex); return(NULL); @@ -1237,11 +1252,18 @@ loop: if (block->frame == NULL) { ut_a(srv_use_awe); + /* TODO: Let buf_awe_map_page_to_frame do its own locking, + but this requires an overhaul to buf_flush_try_page */ + mutex_exit(&(block->mutex)); + mutex_enter(&(buf_pool->mutex)); + mutex_enter(&(block->mutex)); + /* We set second parameter TRUE because the block is in the LRU list and we must put it to awe_LRU_free_mapped list once mapped to a frame */ buf_awe_map_page_to_frame(block, TRUE); + mutex_exit(&(buf_pool->mutex)); } #ifdef UNIV_SYNC_DEBUG @@ -1249,7 +1271,6 @@ loop: #else buf_block_buf_fix_inc(block); #endif - mutex_exit(&buf_pool->mutex); /* Check if this is the first access to the page */ @@ -1630,6 +1651,7 @@ buf_page_init( in units of a page */ buf_block_t* block) /* in: block to init */ { + ulint fold; ut_ad(mutex_own(&(buf_pool->mutex))); ut_ad(mutex_own(&(block->mutex))); @@ -1673,8 +1695,10 @@ buf_page_init( ut_a(0); } - HASH_INSERT(buf_block_t, hash, buf_pool->page_hash, - buf_page_address_fold(space, offset), block); + fold = buf_page_address_fold(space, offset); + hash_mutex_enter(buf_pool->page_hash, fold); + HASH_INSERT(buf_block_t, hash, buf_pool->page_hash, fold, block); + hash_mutex_exit(buf_pool->page_hash, fold); block->freed_page_clock = 0; @@ -2151,8 +2175,6 @@ buf_validate(void) block = buf_pool_get_nth_block(buf_pool, i); - mutex_enter(&block->mutex); - if (block->state == BUF_BLOCK_FILE_PAGE) { ut_a(buf_page_hash_get(block->space, @@ -2197,8 +2219,6 @@ buf_validate(void) } else if (block->state == BUF_BLOCK_NOT_USED) { n_free++; } - - mutex_exit(&block->mutex); } if (n_lru + n_free > buf_pool->curr_size) { @@ -2385,16 +2405,13 @@ buf_get_modified_ratio_pct(void) { ulint ratio; - mutex_enter(&(buf_pool->mutex)); - + /* Unprotected reads of buf_pool variables should be okay here. */ ratio = (100 * UT_LIST_GET_LEN(buf_pool->flush_list)) / (1 + UT_LIST_GET_LEN(buf_pool->LRU) + UT_LIST_GET_LEN(buf_pool->free)); /* 1 + is there to avoid division by zero */ - mutex_exit(&(buf_pool->mutex)); - return(ratio); } @@ -2426,6 +2443,7 @@ buf_print_io( (ulong) UT_LIST_GET_LEN(buf_pool->awe_LRU_free_mapped)); } + if (file) { fprintf(file, "Buffer pool size %lu\n" "Free buffers %lu\n" @@ -2443,12 +2461,13 @@ buf_print_io( (ulong) buf_pool->n_flush[BUF_FLUSH_LIST] + buf_pool->init_flush[BUF_FLUSH_LIST], (ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]); - + } // if (file) current_time = time(NULL); time_elapsed = 0.001 + difftime(current_time, buf_pool->last_printout_time); buf_pool->last_printout_time = current_time; + if (file) { fprintf(file, "Pages read %lu, created %lu, written %lu\n" "%.2f reads/s, %.2f creates/s, %.2f writes/s\n", @@ -2461,6 +2480,7 @@ buf_print_io( / time_elapsed, (buf_pool->n_pages_written - buf_pool->n_pages_written_old) / time_elapsed); + } // if (file) if (srv_use_awe) { fprintf(file, "AWE: %.2f page remaps/s\n", @@ -2470,15 +2490,18 @@ buf_print_io( } if (buf_pool->n_page_gets > buf_pool->n_page_gets_old) { - fprintf(file, "Buffer pool hit rate %lu / 1000\n", - (ulong) - (1000 - ((1000 * (buf_pool->n_pages_read - - buf_pool->n_pages_read_old)) - / (buf_pool->n_page_gets - - buf_pool->n_page_gets_old)))); + ulong buf_pool_hit_per_k = (ulong) (1000 - ((1000 * + (buf_pool->n_pages_read - buf_pool->n_pages_read_old)) + / (buf_pool->n_page_gets - buf_pool->n_page_gets_old))); + if (file) { + fprintf(file, "Buffer pool hit rate %lu / 1000\n", + buf_pool_hit_per_k); + } // if (file) } else { - fputs("No buffer pool page gets since the last printout\n", - file); + if (file) { + fputs("No buffer pool page gets since the last printout\n", + file); + } // if (file) } buf_pool->n_page_gets_old = buf_pool->n_page_gets; === modified file 'storage/innobase/buf/buf0flu.c' --- a/storage/innobase/buf/buf0flu.c 2008-02-01 10:55:39 +0000 +++ b/storage/innobase/buf/buf0flu.c 2009-05-12 18:09:48 +0000 @@ -546,22 +546,30 @@ buf_flush_try_page( ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST || flush_type == BUF_FLUSH_SINGLE_PAGE); - mutex_enter(&(buf_pool->mutex)); - block = buf_page_hash_get(space, offset); - ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE); - if (!block) { - mutex_exit(&(buf_pool->mutex)); return(0); } + mutex_enter(&(buf_pool->mutex)); + mutex_enter(&block->mutex); - if (flush_type == BUF_FLUSH_LIST + /* Verify block contains the data we want. It may have + changed before acquiring block->mutex, because we don't + lock buf_pool->mutex before buf_page_hash_get. */ + if (UNIV_UNLIKELY((offset != block->offset) || + (space != block->space) || + (block->state != BUF_BLOCK_FILE_PAGE))) { + /* Block changed before we acquired block->mutex. Do not + try to flush. */ + + } else if (flush_type == BUF_FLUSH_LIST && buf_flush_ready_for_flush(block, flush_type)) { + ut_a(block->state == BUF_BLOCK_FILE_PAGE); + block->io_fix = BUF_IO_WRITE; /* If AWE is enabled and the page is not mapped to a frame, @@ -630,6 +638,8 @@ buf_flush_try_page( the page not to be bufferfixed (in function ..._ready_for_flush). */ + ut_a(block->state == BUF_BLOCK_FILE_PAGE); + block->io_fix = BUF_IO_WRITE; /* If AWE is enabled and the page is not mapped to a frame, @@ -670,6 +680,8 @@ buf_flush_try_page( } else if (flush_type == BUF_FLUSH_SINGLE_PAGE && buf_flush_ready_for_flush(block, flush_type)) { + ut_a(block->state == BUF_BLOCK_FILE_PAGE); + block->io_fix = BUF_IO_WRITE; /* If AWE is enabled and the page is not mapped to a frame, === modified file 'storage/innobase/buf/buf0lru.c' --- a/storage/innobase/buf/buf0lru.c 2008-12-14 20:47:17 +0000 +++ b/storage/innobase/buf/buf0lru.c 2009-05-12 18:09:48 +0000 @@ -1029,6 +1029,8 @@ buf_LRU_block_remove_hashed_page( be in a state where it can be freed; there may or may not be a hash index to the page */ { + ulint fold; + ut_ad(mutex_own(&(buf_pool->mutex))); ut_ad(mutex_own(&block->mutex)); ut_ad(block); @@ -1073,10 +1075,10 @@ buf_LRU_block_remove_hashed_page( #endif ut_a(0); } - - HASH_DELETE(buf_block_t, hash, buf_pool->page_hash, - buf_page_address_fold(block->space, block->offset), - block); + fold = buf_page_address_fold(block->space, block->offset); + hash_mutex_enter(buf_pool->page_hash, fold); + HASH_DELETE(buf_block_t, hash, buf_pool->page_hash, fold, block); + hash_mutex_exit(buf_pool->page_hash, fold); UNIV_MEM_INVALID(block->frame, UNIV_PAGE_SIZE); block->state = BUF_BLOCK_REMOVE_HASH; @@ -1197,6 +1199,7 @@ buf_LRU_print(void) while (block != NULL) { + mutex_enter(&block->mutex); fprintf(stderr, "BLOCK %lu ", (ulong) block->offset); if (block->old) { @@ -1225,6 +1228,8 @@ buf_LRU_print(void) (ulong) ut_dulint_get_low (btr_page_get_index_id(frame))); + mutex_exit(&block->mutex); + block = UT_LIST_GET_NEXT(LRU, block); if (++len == 10) { len = 0; === modified file 'storage/innobase/include/buf0buf.h' --- a/storage/innobase/include/buf0buf.h 2008-08-20 00:37:41 +0000 +++ b/storage/innobase/include/buf0buf.h 2009-05-12 18:09:48 +0000 @@ -733,8 +733,10 @@ struct buf_block_struct{ UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE (normally = 4) physical memory pages; otherwise NULL */ - ulint space; /* space id of the page */ - ulint offset; /* page number within the space */ + ulint space; /* space id of the page. + protected by block->mutex.*/ + ulint offset; /* page number within the space. + protected by block->mutex.*/ ulint lock_hash_val; /* hashed value of the page address in the record lock hash table */ mutex_t mutex; /* mutex protecting this block: @@ -925,7 +927,10 @@ struct buf_pool_struct{ ulint curr_size; /* current pool size in pages; currently always the same as max_size */ - hash_table_t* page_hash; /* hash table of the file pages */ + hash_table_t* page_hash; /* hash table of the file pages. + We protect this hash table with an + array of mutexes, using the ha_create + and hash_mutex_enter/exit functions. */ ulint n_pend_reads; /* number of pending read operations */ === modified file 'storage/innobase/include/buf0buf.ic' --- a/storage/innobase/include/buf0buf.ic 2008-10-15 18:54:18 +0000 +++ b/storage/innobase/include/buf0buf.ic 2009-05-12 18:09:48 +0000 @@ -421,7 +421,7 @@ buf_frame_get_newest_modification( block = buf_block_align(frame); - mutex_enter(&(buf_pool->mutex)); + mutex_enter(&block->mutex); if (block->state == BUF_BLOCK_FILE_PAGE) { lsn = block->newest_modification; @@ -429,7 +429,7 @@ buf_frame_get_newest_modification( lsn = ut_dulint_zero; } - mutex_exit(&(buf_pool->mutex)); + mutex_exit(&block->mutex); return(lsn); } @@ -547,14 +547,15 @@ buf_page_hash_get( ulint fold; ut_ad(buf_pool); - ut_ad(mutex_own(&(buf_pool->mutex))); /* Look for the page in the hash table */ fold = buf_page_address_fold(space, offset); + hash_mutex_enter(buf_pool->page_hash, fold); HASH_SEARCH(hash, buf_pool->page_hash, fold, block, (block->space == space) && (block->offset == offset)); + hash_mutex_exit(buf_pool->page_hash, fold); ut_a(block == NULL || block->state == BUF_BLOCK_FILE_PAGE); return(block); === modified file 'storage/innobase/include/sync0sync.h' --- a/storage/innobase/include/sync0sync.h 2008-12-04 10:57:56 +0000 +++ b/storage/innobase/include/sync0sync.h 2009-05-12 18:09:48 +0000 @@ -454,6 +454,7 @@ or row lock! */ the level is SYNC_MEM_HASH. */ #define SYNC_BUF_POOL 150 #define SYNC_BUF_BLOCK 149 +#define SYNC_BUF_PAGE_HASH 143 #define SYNC_DOUBLEWRITE 140 #define SYNC_ANY_LATCH 135 #define SYNC_THR_LOCAL 133 === modified file 'storage/innobase/sync/sync0sync.c' --- a/storage/innobase/sync/sync0sync.c 2008-10-30 09:23:36 +0000 +++ b/storage/innobase/sync/sync0sync.c 2009-05-12 18:09:48 +0000 @@ -1107,6 +1107,9 @@ sync_thread_add_level( case SYNC_BUF_POOL: ut_a(sync_thread_levels_g(array, SYNC_BUF_POOL)); break; + case SYNC_BUF_PAGE_HASH: + ut_a(sync_thread_levels_g(array, SYNC_BUF_PAGE_HASH)); + break; case SYNC_SEARCH_SYS: ut_a(sync_thread_levels_g(array, SYNC_SEARCH_SYS)); break;