From: Date: October 7 2005 3:52pm Subject: bk commit into 5.0 tree (marko:1.2022) BUG#3300 List-Archive: http://lists.mysql.com/internals/30816 X-Bug: 3300 Message-Id: <200510071352.j97DqFHO020150@hundin.mysql.fi> Below is the list of changes that have just been committed into a local 5.0 repository of marko. When marko does a push these changes will be propagated to the main repository and, within 24 hours after the push, to the public repository. For information on how to access the public repository see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html ChangeSet 1.2022 05/10/07 16:52:00 marko@stripped +11 -0 Implement semi-consistent read in UPDATE to avoid record locks on non-matching rows when performing table scanning. TODO: Before the fetch loop, explicitly enable the semi-consistent reads in the storage engine. This is a partial fix to Bug #3300. It will not yet handle records that are found through secondary indexes. sql/sql_update.cc 1.173 05/10/07 16:50:43 marko@stripped +6 -0 Implement semi-consistent read in UPDATE to avoid record locks on non-matching rows when performing table scanning. TODO: Before the fetch loop, explicitly enable the semi-consistent reads in the storage engine. sql/handler.h 1.161 05/10/07 16:50:43 marko@stripped +14 -0 Add was_semi_consistent_read() sql/ha_innodb.h 1.107 05/10/07 16:50:43 marko@stripped +1 -0 Add was_semi_consistent_read() sql/ha_innodb.cc 1.271 05/10/07 16:50:43 marko@stripped +24 -4 unlock_row(): Check prebuilt->was_semi_consistent_read Add was_semi_consistent_read() innobase/row/row0vers.c 1.18 05/10/07 16:50:43 marko@stripped +114 -0 Add row_vers_build_for_semi_consistent_read(). innobase/row/row0sel.c 1.105 05/10/07 16:50:43 marko@stripped +68 -3 Add row_sel_build_committed_vers_for_mysql(). row_search_for_mysql(): Implement semi-consistent read. 
innobase/row/row0mysql.c 1.115 05/10/07 16:50:43 marko@stripped +3 -0 row_prebuilt_t: Add the flags was_semi_consistent_read and force_re_read_of_update_row. innobase/lock/lock0lock.c 1.66 05/10/07 16:50:43 marko@stripped +1 -1 Declare lock_rec_cancel() in the global scope. innobase/include/row0vers.h 1.4 05/10/07 16:50:43 marko@stripped +28 -0 Add row_vers_build_for_semi_consistent_read(). innobase/include/row0mysql.h 1.46 05/10/07 16:50:42 marko@stripped +27 -0 row_prebuilt_t: Add the flags was_semi_consistent_read and force_re_read_of_update_row. innobase/include/lock0lock.h 1.26 05/10/07 16:50:42 marko@stripped +9 -0 Declare lock_rec_cancel() in the global scope. # This is a BitKeeper patch. What follows are the unified diffs for the # set of deltas contained in the patch. The rest of the patch, the part # that BitKeeper cares about, is below these diffs. # User: marko # Host: hundin.mysql.fi # Root: /home/marko/mysql-5.0-bk --- 1.160/sql/handler.h 2005-10-05 17:38:40 +03:00 +++ 1.161/sql/handler.h 2005-10-07 16:50:43 +03:00 @@ -676,6 +676,20 @@ { return extra(operation); } virtual int reset() { return extra(HA_EXTRA_RESET); } virtual int external_lock(THD *thd, int lock_type) { return 0; } + /* + In an UPDATE or DELETE, if the row under the cursor was locked by another + transaction, and the engine used an optimistic read of the last + committed row value under the cursor, then the engine returns 1 from this + function. MySQL must NOT try to update this optimistic value. If the + optimistic value does not match the WHERE condition, MySQL can decide to + skip over this row. Currently only works for InnoDB. This can be used to + avoid unnecessary lock waits. + + If this method returns true, it will also set a flag in the storage + engine to indicate that the next read will be a locking re-read of + the current version of the row. 
+ */ + virtual bool was_semi_consistent_read() { return 0; } virtual void unlock_row() {} virtual int start_stmt(THD *thd, thr_lock_type lock_type) {return 0;} /* --- 1.172/sql/sql_update.cc 2005-09-28 12:28:37 +03:00 +++ 1.173/sql/sql_update.cc 2005-10-07 16:50:43 +03:00 @@ -332,6 +332,9 @@ { if (!(select && select->skip_record())) { + if (table->file->was_semi_consistent_read()) + continue; /* repeat the read of the same row if it still exists */ + table->file->position(table->record[0]); if (my_b_write(&tempfile,table->file->ref, table->file->ref_length)) @@ -403,6 +406,9 @@ { if (!(select && select->skip_record())) { + if (table->file->was_semi_consistent_read()) + continue; /* repeat the read of the same row if it still exists */ + store_record(table,record[1]); if (fill_record_n_invoke_before_triggers(thd, fields, values, 0, table->triggers, --- 1.25/innobase/include/lock0lock.h 2005-06-21 07:36:01 +03:00 +++ 1.26/innobase/include/lock0lock.h 2005-10-07 16:50:42 +03:00 @@ -64,6 +64,15 @@ dict_index_t* index, /* in: clustered index */ const ulint* offsets);/* in: rec_get_offsets(rec, index) */ /***************************************************************** +Cancels a waiting record lock request and releases the waiting transaction +that requested it. NOTE: does NOT check if waiting lock requests behind this +one can now be granted! */ + +void +lock_rec_cancel( +/*============*/ + lock_t* lock); /* in: waiting record lock request */ +/***************************************************************** Resets the lock bits for a single record. Releases transactions waiting for lock requests here. */ --- 1.45/innobase/include/row0mysql.h 2005-09-23 11:20:22 +03:00 +++ 1.46/innobase/include/row0mysql.h 2005-10-07 16:50:42 +03:00 @@ -612,6 +612,33 @@ that was decided in ha_innodb.cc, ::store_lock(), ::external_lock(), etc. 
*/ + ibool was_semi_consistent_read; /* if + innodb_locks_unsafe_for_binlog is TRUE, + then this is set to TRUE if the row + under an UPDATE or DELETE cursor was + locked by another transaction, and we + resorted to reading the last committed + value ('semi-consistent read'); if + innodb_locks_unsafe_for_binlog is TRUE + and the last committed value of the + row does not match the WHERE condition, + then MySQL can simply skip over the + row; this eliminates lock waits in some + cases; note that this breaks + serializability */ + ibool force_re_read_of_update_row; /* if + innodb_locks_unsafe_for_binlog is TRUE, + and MySQL finds out that a row read + in the semi-consistent way WOULD match + the WHERE condition, then MySQL sets + this flag TRUE before calling fetch: + InnoDB tries to read the same row + again, this time using a normal locking + read; if purge had removed the row + under the cursor, then this flag has + no effect and we reset this flag; + we reset this flag always when we + return a row to MySQL */ ulint mysql_prefix_len;/* byte offset of the end of the last requested column */ ulint mysql_row_len; /* length in bytes of a row in the --- 1.3/innobase/include/row0vers.h 2005-03-07 15:22:47 +02:00 +++ 1.4/innobase/include/row0vers.h 2005-10-07 16:50:43 +03:00 @@ -92,6 +92,34 @@ record does not exist in the view, that is, it was freshly inserted afterwards */ +/********************************************************************* +Constructs the last committed version of a clustered index record, +which should be seen by a semi-consistent read. We assume that the +trx id stored in rec is such that the consistent read should not see +rec in its present version. 
*/ + +ulint +row_vers_build_for_semi_consistent_read( +/*====================================*/ + /* out: DB_SUCCESS or DB_MISSING_HISTORY */ + rec_t* rec, /* in: record in a clustered index; the + caller must have a latch on the page; this + latch locks the top of the stack of versions + of this record */ + mtr_t* mtr, /* in: mtr holding the latch on rec */ + dict_index_t* index, /* in: the clustered index */ + ulint** offsets,/* in/out: offsets returned by + rec_get_offsets(rec, index) */ + mem_heap_t** offset_heap,/* in/out: memory heap from which + the offsets are allocated */ + mem_heap_t* in_heap,/* in: memory heap from which the memory for + old_vers is allocated; memory for possible + intermediate versions is allocated and freed + locally within the function */ + rec_t** old_vers);/* out, own: old version, or NULL if the + record does not exist in the view, that is, + it was freshly inserted afterwards */ + #ifndef UNIV_NONINL #include "row0vers.ic" --- 1.65/innobase/lock/lock0lock.c 2005-08-17 11:55:41 +03:00 +++ 1.66/innobase/lock/lock0lock.c 2005-10-07 16:50:43 +03:00 @@ -2249,7 +2249,7 @@ Cancels a waiting record lock request and releases the waiting transaction that requested it. NOTE: does NOT check if waiting lock requests behind this one can now be granted! 
*/ -static + void lock_rec_cancel( /*============*/ --- 1.114/innobase/row/row0mysql.c 2005-09-23 11:20:22 +03:00 +++ 1.115/innobase/row/row0mysql.c 2005-10-07 16:50:43 +03:00 @@ -626,6 +626,9 @@ prebuilt->select_lock_type = LOCK_NONE; prebuilt->stored_select_lock_type = 99999999; + prebuilt->was_semi_consistent_read = FALSE; + prebuilt->force_re_read_of_update_row = FALSE; + prebuilt->sel_graph = NULL; prebuilt->search_tuple = dtuple_create(heap, --- 1.104/innobase/row/row0sel.c 2005-09-03 12:48:03 +03:00 +++ 1.105/innobase/row/row0sel.c 2005-10-07 16:50:43 +03:00 @@ -536,6 +536,41 @@ } /************************************************************************* +Builds the last committed version of a clustered index record for a +semi-consistent read */ +static +ulint +row_sel_build_committed_vers_for_mysql( +/*===================================*/ + /* out: DB_SUCCESS or error code */ + dict_index_t* clust_index, /* in: clustered index */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct */ + rec_t* rec, /* in: record in a clustered index */ + ulint** offsets, /* in/out: offsets returned by + rec_get_offsets(rec, clust_index) */ + mem_heap_t** offset_heap, /* in/out: memory heap from which + the offsets are allocated */ + rec_t** old_vers, /* out: old version, or NULL if the + record does not exist in the view: + i.e., it was freshly inserted + afterwards */ + mtr_t* mtr) /* in: mtr */ +{ + ulint err; + + if (prebuilt->old_vers_heap) { + mem_heap_empty(prebuilt->old_vers_heap); + } else { + prebuilt->old_vers_heap = mem_heap_create(200); + } + + err = row_vers_build_for_semi_consistent_read(rec, mtr, clust_index, + offsets, offset_heap, + prebuilt->old_vers_heap, old_vers); + return(err); +} + +/************************************************************************* Tests the conditions which determine when the index segment we are searching through has been exhausted. 
*/ UNIV_INLINE @@ -3163,7 +3198,7 @@ trx->search_latch_timeout = BTR_SEA_TIMEOUT; } - /* Reset the new record lock info if we srv_locks_unsafe_for_binlog + /* Reset the new record lock info if srv_locks_unsafe_for_binlog is set. Then we are able to remove the record locks set here on an individual row. */ @@ -3433,7 +3468,9 @@ if (UNIV_LIKELY(direction != 0)) { if (!sel_restore_position_for_mysql(&same_user_rec, BTR_SEARCH_LEAF, - pcur, moves_up, &mtr)) { + pcur, moves_up, &mtr) + && !prebuilt->was_semi_consistent_read) { + goto next_rec; } @@ -3751,7 +3788,33 @@ prebuilt->select_lock_type, lock_type, thr); - if (err != DB_SUCCESS) { + switch (err) { + case DB_SUCCESS: + break; + case DB_LOCK_WAIT: + if (!srv_locks_unsafe_for_binlog + || prebuilt->force_re_read_of_update_row + || prebuilt->select_lock_type != LOCK_X + || index != clust_index) { + + goto lock_wait_or_error; + } + + err = row_sel_build_committed_vers_for_mysql( + clust_index, prebuilt, rec, + &offsets, &heap, + &old_vers, &mtr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + lock_rec_cancel(prebuilt->trx->wait_lock); + prebuilt->was_semi_consistent_read = TRUE; + + goto got_clust_rec; + default: goto lock_wait_or_error; } @@ -3903,6 +3966,8 @@ result_rec = rec; } } else { +got_clust_rec: + result_rec = rec; } --- 1.17/innobase/row/row0vers.c 2005-04-27 12:28:35 +03:00 +++ 1.18/innobase/row/row0vers.c 2005-10-07 16:50:43 +03:00 @@ -490,3 +490,117 @@ return(err); } + + +/********************************************************************* +Constructs the last committed version of a clustered index record, +which should be seen by a semi-consistent read. We assume that the +trx id stored in rec is such that the consistent read should not see +rec in its present version. 
*/ + +ulint +row_vers_build_for_semi_consistent_read( +/*====================================*/ + /* out: DB_SUCCESS or DB_MISSING_HISTORY */ + rec_t* rec, /* in: record in a clustered index; the + caller must have a latch on the page; this + latch locks the top of the stack of versions + of this record */ + mtr_t* mtr, /* in: mtr holding the latch on rec */ + dict_index_t* index, /* in: the clustered index */ + ulint** offsets,/* in/out: offsets returned by + rec_get_offsets(rec, index) */ + mem_heap_t** offset_heap,/* in/out: memory heap from which + the offsets are allocated */ + mem_heap_t* in_heap,/* in: memory heap from which the memory for + old_vers is allocated; memory for possible + intermediate versions is allocated and freed + locally within the function */ + rec_t** old_vers)/* out, own: old version, or NULL if the + record does not exist in the view, that is, + it was freshly inserted afterwards */ +{ + rec_t* version; + rec_t* prev_version; + dulint prev_trx_id; + trx_t* prev_trx; + mem_heap_t* heap = NULL; + byte* buf; + ulint err; + + ut_ad(index->type & DICT_CLUSTERED); + ut_ad(mtr_memo_contains(mtr, buf_block_align(rec), MTR_MEMO_PAGE_X_FIX) + || mtr_memo_contains(mtr, buf_block_align(rec), + MTR_MEMO_PAGE_S_FIX)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + ut_ad(rec_offs_validate(rec, index, *offsets)); + + rw_lock_s_lock(&(purge_sys->latch)); + version = rec; + + for (;;) { + mem_heap_t* heap2 = heap; + heap = mem_heap_create(1024); + + err = trx_undo_prev_version_build(rec, mtr, version, index, + *offsets, heap, &prev_version); + if (heap2) { + mem_heap_free(heap2); /* free version */ + } + + if (err != DB_SUCCESS) { + break; + } + + if (prev_version == NULL) { + /* It was a freshly inserted version */ + *old_vers = NULL; + err = DB_SUCCESS; + + break; + } + + *offsets = rec_get_offsets(prev_version, index, *offsets, + ULINT_UNDEFINED, offset_heap); + prev_trx_id = 
row_get_rec_trx_id(prev_version, index, + *offsets); + prev_trx = trx_get_on_id(prev_trx_id); + + if (!prev_trx + || prev_trx->conc_state == TRX_COMMITTED_IN_MEMORY) { + + /* This version belongs to a committed transaction: + we can copy it to in_heap and return. */ + + /* We assume that rolled-back transaction stays in + TRX_ACTIVE state until all the changes have been + rolled back and the transaction is removed from + the global list of transactions. */ + + /* We also assume that a transaction in + TRX_COMMITTED_IN_MEMORY state cannot be + overtaken, i.e., that the redo log is flushed + in a FIFO manner. Thus, when prev_trx + && prev_trx->conc_state == TRX_COMMITTED_IN_MEMORY + and the system crashes, both prev_trx and the + current trx will be rolled back in crash recovery. */ + + buf = mem_heap_alloc(in_heap, rec_offs_size(*offsets)); + *old_vers = rec_copy(buf, prev_version, *offsets); + rec_offs_make_valid(*old_vers, index, *offsets); + err = DB_SUCCESS; + + break; + } + + version = prev_version; + }/* for (;;) */ + + mem_heap_free(heap); + rw_lock_s_unlock(&(purge_sys->latch)); + + return(err); +} --- 1.270/sql/ha_innodb.cc 2005-10-06 15:36:54 +03:00 +++ 1.271/sql/ha_innodb.cc 2005-10-07 16:50:43 +03:00 @@ -3682,9 +3682,9 @@ } /************************************************************************** -Removes a new lock set on a row. This can be called after a row has been read -in the processing of an UPDATE or a DELETE query, if the option -innodb_locks_unsafe_for_binlog is set. */ +Removes a new lock set on a row, if it was not read optimistically. This can +be called after a row has been read in the processing of an UPDATE or a DELETE +query, if the option innodb_locks_unsafe_for_binlog is set. 
*/ void ha_innobase::unlock_row(void) @@ -3704,8 +3704,28 @@ } if (srv_locks_unsafe_for_binlog) { - row_unlock_for_mysql(prebuilt, FALSE); + if (!prebuilt->was_semi_consistent_read) { + + row_unlock_for_mysql(prebuilt, FALSE); + } + prebuilt->was_semi_consistent_read = FALSE; + prebuilt->force_re_read_of_update_row = FALSE; } +} + +/* See handler.h and row0mysql.h for docs on this function. */ +bool +ha_innobase::was_semi_consistent_read(void) +/*=======================================*/ +{ + row_prebuilt_t* prebuilt = (row_prebuilt_t*) innobase_prebuilt; + bool ret = (bool)prebuilt->was_semi_consistent_read; + + if (ret) { + prebuilt->force_re_read_of_update_row = TRUE; + } + + return(ret); } /********************************************************************** --- 1.106/sql/ha_innodb.h 2005-10-04 07:08:55 +03:00 +++ 1.107/sql/ha_innodb.h 2005-10-07 16:50:43 +03:00 @@ -122,6 +122,7 @@ int write_row(byte * buf); int update_row(const byte * old_data, byte * new_data); int delete_row(const byte * buf); + bool was_semi_consistent_read(); void unlock_row(); int index_init(uint index);