List:Commits« Previous MessageNext Message »
From:marko.makela Date:February 28 2011 2:04pm
Subject:bzr push into mysql-5.1-innodb branch (marko.makela:3714 to 3715) Bug#58549
View as plain text  
 3715 Marko Mäkelä	2011-02-28
      Bug #58549 Race condition in buf_LRU_drop_page_hash_for_tablespace()
      and compressed tables
      
      buf_LRU_drop_page_hash_for_tablespace(): after releasing and
      reacquiring the buffer pool mutex, do not dereference any block
      descriptor pointer that is not known to be a pointer to an
      uncompressed page frame (type buf_block_t; state ==
      BUF_BLOCK_FILE_PAGE). Also, defer the acquisition of the block_mutex
      until it is needed.
      
      buf_page_get_gen(): Add mode == BUF_GET_IF_IN_POOL_PEEK for
      buffer-fixing a block without making it young in the LRU list.
      
      buf_page_get_gen(), buf_page_init(), buf_LRU_block_remove_hashed_page():
      Set bpage->state = BUF_BLOCK_ZIP_FREE before buf_buddy_free(bpage),
      so that similar race conditions might be detected a little easier.
      
      btr_search_drop_page_hash_when_freed(): Use BUF_GET_IF_IN_POOL_PEEK
      when dropping the hash indexes.
      
      rb://528 approved by Jimmy Yang

    modified:
      storage/innodb_plugin/ChangeLog
      storage/innodb_plugin/btr/btr0sea.c
      storage/innodb_plugin/buf/buf0buf.c
      storage/innodb_plugin/buf/buf0lru.c
      storage/innodb_plugin/include/buf0buf.h
 3714 Vasil Dimov	2011-02-25
      Fix BUG#11798085 - INCORRECT INTEGER TYPES USED IN CALCULATION RESULT
      IN OVERFLOW
      
      Do not assign the result of the difference to a signed variable and
      checking whether it is negative afterwards because this limits the max diff
      to 2G on 32 bit systems. E.g. "signed = 3.5G - 1G" would be negative and the
      code would assume that 3.5G < 1G. Instead compare the two variables directly
      and assign to unsigned only if we know that the result of the subtraction
      will be positive.
      
      Discussed with:	Jimmy and Sunny (via IRC)

    modified:
      storage/innodb_plugin/buf/buf0buf.c
=== modified file 'storage/innodb_plugin/ChangeLog'
--- a/storage/innodb_plugin/ChangeLog	revid:vasil.dimov@strippedm-20110225095018-jgmv1pnuprrjzat1
+++ b/storage/innodb_plugin/ChangeLog	revid:marko.makela@stripped18-ogs3ib1eaz9bsgkt
@@ -1,3 +1,9 @@
+2011-02-28	The InnoDB Team
+
+	* btr/btr0sea.c, buf/buf0buf.c, buf/buf0lru.c:
+	Fix Bug#58549 Race condition in buf_LRU_drop_page_hash_for_tablespace()
+	and compressed tables
+
 2011-02-15	The InnoDB Team
 
 	* sync/sync0rw.c, innodb_bug59307.test:

=== modified file 'storage/innodb_plugin/btr/btr0sea.c'
--- a/storage/innodb_plugin/btr/btr0sea.c	revid:vasil.dimov@stripped
+++ b/storage/innodb_plugin/btr/btr0sea.c	revid:marko.makela@oracle.com-20110228115118-ogs3ib1eaz9bsgkt
@@ -1201,8 +1201,8 @@ btr_search_drop_page_hash_when_freed(
 	having to fear a deadlock. */
 
 	block = buf_page_get_gen(space, zip_size, page_no, RW_S_LATCH, NULL,
-				BUF_GET_IF_IN_POOL, __FILE__, __LINE__,
-				&mtr);
+				 BUF_PEEK_IF_IN_POOL, __FILE__, __LINE__,
+				 &mtr);
 	/* Because the buffer pool mutex was released by
 	buf_page_peek_if_search_hashed(), it is possible that the
 	block was removed from the buffer pool by another thread

=== modified file 'storage/innodb_plugin/buf/buf0buf.c'
--- a/storage/innodb_plugin/buf/buf0buf.c	revid:vasil.dimov@stripped95018-jgmv1pnuprrjzat1
+++ b/storage/innodb_plugin/buf/buf0buf.c	revid:marko.makela@strippedib1eaz9bsgkt
@@ -2031,7 +2031,7 @@ buf_page_get_gen(
 	ulint		rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
 	buf_block_t*	guess,	/*!< in: guessed block or NULL */
 	ulint		mode,	/*!< in: BUF_GET, BUF_GET_IF_IN_POOL,
-				BUF_GET_NO_LATCH */
+				BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH */
 	const char*	file,	/*!< in: file name */
 	ulint		line,	/*!< in: line where called */
 	mtr_t*		mtr)	/*!< in: mini-transaction */
@@ -2047,9 +2047,19 @@ buf_page_get_gen(
 	ut_ad((rw_latch == RW_S_LATCH)
 	      || (rw_latch == RW_X_LATCH)
 	      || (rw_latch == RW_NO_LATCH));
-	ut_ad((mode != BUF_GET_NO_LATCH) || (rw_latch == RW_NO_LATCH));
-	ut_ad((mode == BUF_GET) || (mode == BUF_GET_IF_IN_POOL)
-	      || (mode == BUF_GET_NO_LATCH));
+#ifdef UNIV_DEBUG
+	switch (mode) {
+	case BUF_GET_NO_LATCH:
+		ut_ad(rw_latch == RW_NO_LATCH);
+		break;
+	case BUF_GET:
+	case BUF_GET_IF_IN_POOL:
+	case BUF_PEEK_IF_IN_POOL:
+		break;
+	default:
+		ut_error;
+	}
+#endif /* UNIV_DEBUG */
 	ut_ad(zip_size == fil_space_get_zip_size(space));
 	ut_ad(ut_is_2pow(zip_size));
 #ifndef UNIV_LOG_DEBUG
@@ -2091,7 +2101,8 @@ loop2:
 
 		buf_pool_mutex_exit();
 
-		if (mode == BUF_GET_IF_IN_POOL) {
+		if (mode == BUF_GET_IF_IN_POOL
+		    || mode == BUF_PEEK_IF_IN_POOL) {
 
 			return(NULL);
 		}
@@ -2130,7 +2141,8 @@ loop2:
 
 	must_read = buf_block_get_io_fix(block) == BUF_IO_READ;
 
-	if (must_read && mode == BUF_GET_IF_IN_POOL) {
+	if (must_read && (mode == BUF_GET_IF_IN_POOL
+			  || mode == BUF_PEEK_IF_IN_POOL)) {
 		/* The page is only being read to buffer */
 		buf_pool_mutex_exit();
 
@@ -2248,6 +2260,7 @@ wait_until_unfixed:
 		mutex_exit(&buf_pool_zip_mutex);
 		buf_pool->n_pend_unzip++;
 
+		bpage->state = BUF_BLOCK_ZIP_FREE;
 		buf_buddy_free(bpage, sizeof *bpage);
 
 		buf_pool_mutex_exit();
@@ -2324,7 +2337,9 @@ wait_until_unfixed:
 
 	buf_pool_mutex_exit();
 
-	buf_page_set_accessed_make_young(&block->page, access_time);
+	if (UNIV_LIKELY(mode != BUF_PEEK_IF_IN_POOL)) {
+		buf_page_set_accessed_make_young(&block->page, access_time);
+	}
 
 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
 	ut_a(!block->page.file_page_was_freed);
@@ -2377,7 +2392,7 @@ wait_until_unfixed:
 
 	mtr_memo_push(mtr, block, fix_type);
 
-	if (!access_time) {
+	if (UNIV_LIKELY(mode != BUF_PEEK_IF_IN_POOL) && !access_time) {
 		/* In the case of a first access, try to apply linear
 		read-ahead */
 
@@ -2926,6 +2941,7 @@ err_exit:
 		    && UNIV_LIKELY_NULL(buf_page_hash_get(space, offset))) {
 
 			/* The block was added by some other thread. */
+			bpage->state = BUF_BLOCK_ZIP_FREE;
 			buf_buddy_free(bpage, sizeof *bpage);
 			buf_buddy_free(data, zip_size);
 

=== modified file 'storage/innodb_plugin/buf/buf0lru.c'
--- a/storage/innodb_plugin/buf/buf0lru.c	revid:vasil.dimov@stripped
+++ b/storage/innodb_plugin/buf/buf0lru.c	revid:marko.makela@oracle.com-20110228115118-ogs3ib1eaz9bsgkt
@@ -246,71 +246,75 @@ buf_LRU_drop_page_hash_for_tablespace(
 	page_arr = ut_malloc(sizeof(ulint)
 			     * BUF_LRU_DROP_SEARCH_HASH_SIZE);
 	buf_pool_mutex_enter();
+	num_entries = 0;
 
 scan_again:
-	num_entries = 0;
 	bpage = UT_LIST_GET_LAST(buf_pool->LRU);
 
 	while (bpage != NULL) {
-		mutex_t*	block_mutex = buf_page_get_mutex(bpage);
 		buf_page_t*	prev_bpage;
+		ibool		is_fixed;
 
-		mutex_enter(block_mutex);
 		prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
 
 		ut_a(buf_page_in_file(bpage));
 
 		if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE
 		    || bpage->space != id
-		    || bpage->buf_fix_count > 0
 		    || bpage->io_fix != BUF_IO_NONE) {
-			/* We leave the fixed pages as is in this scan.
-			To be dealt with later in the final scan. */
-			mutex_exit(block_mutex);
-			goto next_page;
+			/* Compressed pages are never hashed.
+			Skip blocks of other tablespaces.
+			Skip I/O-fixed blocks (to be dealt with later). */
+next_page:
+			bpage = prev_bpage;
+			continue;
 		}
 
-		if (((buf_block_t*) bpage)->is_hashed) {
+		mutex_enter(&((buf_block_t*) bpage)->mutex);
+		is_fixed = bpage->buf_fix_count > 0
+			|| !((buf_block_t*) bpage)->is_hashed;
+		mutex_exit(&((buf_block_t*) bpage)->mutex);
 
-			/* Store the offset(i.e.: page_no) in the array
-			so that we can drop hash index in a batch
-			later. */
-			page_arr[num_entries] = bpage->offset;
-			mutex_exit(block_mutex);
-			ut_a(num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE);
-			++num_entries;
+		if (is_fixed) {
+			goto next_page;
+		}
 
-			if (num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE) {
-				goto next_page;
-			}
-			/* Array full. We release the buf_pool_mutex to
-			obey the latching order. */
-			buf_pool_mutex_exit();
-
-			buf_LRU_drop_page_hash_batch(id, zip_size, page_arr,
-						     num_entries);
-			num_entries = 0;
-			buf_pool_mutex_enter();
-		} else {
-			mutex_exit(block_mutex);
+		/* Store the page number so that we can drop the hash
+		index in a batch later. */
+		page_arr[num_entries] = bpage->offset;
+		ut_a(num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE);
+		++num_entries;
+
+		if (num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE) {
+			goto next_page;
 		}
 
-next_page:
-		/* Note that we may have released the buf_pool mutex
-		above after reading the prev_bpage during processing
-		of a page_hash_batch (i.e.: when the array was full).
-		This means that prev_bpage can change in LRU list.
-		This is OK because this function is a 'best effort'
-		to drop as many search hash entries as possible and
-		it does not guarantee that ALL such entries will be
-		dropped. */
-		bpage = prev_bpage;
+		/* Array full. We release the buf_pool_mutex to
+		obey the latching order. */
+		buf_pool_mutex_exit();
+		buf_LRU_drop_page_hash_batch(id, zip_size, page_arr,
+					     num_entries);
+		buf_pool_mutex_enter();
+		num_entries = 0;
+
+		/* Note that we released the buf_pool mutex above
+		after reading the prev_bpage during processing of a
+		page_hash_batch (i.e.: when the array was full).
+		Because prev_bpage could belong to a compressed-only
+		block, it may have been relocated, and thus the
+		pointer cannot be trusted. Because bpage is of type
+		buf_block_t, it is safe to dereference.
+
+		bpage can change in the LRU list. This is OK because
+		this function is a 'best effort' to drop as many
+		search hash entries as possible and it does not
+		guarantee that ALL such entries will be dropped. */
 
 		/* If, however, bpage has been removed from LRU list
 		to the free list then we should restart the scan.
 		bpage->state is protected by buf_pool mutex. */
-		if (bpage && !buf_page_in_file(bpage)) {
-			ut_a(num_entries == 0);
+		if (bpage
+		    && buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
 			goto scan_again;
 		}
 	}
@@ -1799,6 +1803,7 @@ buf_LRU_block_remove_hashed_page(
 		buf_pool_mutex_exit_forbid();
 		buf_buddy_free(bpage->zip.data,
 			       page_zip_get_size(&bpage->zip));
+		bpage->state = BUF_BLOCK_ZIP_FREE;
 		buf_buddy_free(bpage, sizeof(*bpage));
 		buf_pool_mutex_exit_allow();
 		UNIV_MEM_UNDESC(bpage);

=== modified file 'storage/innodb_plugin/include/buf0buf.h'
--- a/storage/innodb_plugin/include/buf0buf.h	revid:vasil.dimov@stripped5018-jgmv1pnuprrjzat1
+++ b/storage/innodb_plugin/include/buf0buf.h	revid:marko.makela@strippedgs3ib1eaz9bsgkt
@@ -41,6 +41,8 @@ Created 11/5/1995 Heikki Tuuri
 /* @{ */
 #define BUF_GET			10	/*!< get always */
 #define	BUF_GET_IF_IN_POOL	11	/*!< get if in pool */
+#define BUF_PEEK_IF_IN_POOL	12	/*!< get if in pool, do not make
+					the block young in the LRU list */
 #define BUF_GET_NO_LATCH	14	/*!< get and bufferfix, but
 					set no latch; we have
 					separated this case, because
@@ -284,7 +286,7 @@ buf_page_get_gen(
 	ulint		rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
 	buf_block_t*	guess,	/*!< in: guessed block or NULL */
 	ulint		mode,	/*!< in: BUF_GET, BUF_GET_IF_IN_POOL,
-				BUF_GET_NO_LATCH */
+				BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH */
 	const char*	file,	/*!< in: file name */
 	ulint		line,	/*!< in: line where called */
 	mtr_t*		mtr);	/*!< in: mini-transaction */

Attachment: [text/bzr-bundle] bzr/marko.makela@oracle.com-20110228115118-ogs3ib1eaz9bsgkt.bundle
Thread
bzr push into mysql-5.1-innodb branch (marko.makela:3714 to 3715) Bug#58549marko.makela28 Feb