List: Commits « Previous Message | Next Message »
From: Bjorn Munch  Date: August 30 2011 10:57am
Subject:bzr push into mysql-5.1-mtr branch (bjorn.munch:3011 to 3013)
View as plain text  
 3013 Bjorn Munch	2011-08-30 [merge]
      new merge from 5.1

    modified:
      mysql-test/r/group_by.result
      mysql-test/t/group_by.test
      mysql-test/valgrind.supp
      sql/filesort.cc
 3012 Bjorn Munch	2011-08-29 [merge]
      merge from 5.1 main

    modified:
      extra/innochecksum.c
      include/decimal.h
      mysql-test/r/merge.result
      mysql-test/r/type_newdecimal.result
      mysql-test/suite/innodb_plugin/r/innodb-index.result
      mysql-test/suite/innodb_plugin/t/innodb-index.test
      mysql-test/t/merge.test
      mysql-test/t/type_newdecimal.test
      mysql-test/valgrind.supp
      sql/filesort.cc
      sql/my_decimal.h
      sql/rpl_rli.h
      sql/slave.cc
      sql/sql_base.cc
      storage/innobase/btr/btr0btr.c
      storage/innobase/btr/btr0cur.c
      storage/innobase/buf/buf0buf.c
      storage/innobase/fsp/fsp0fsp.c
      storage/innobase/include/btr0btr.h
      storage/innobase/include/btr0cur.h
      storage/innobase/include/buf0buf.h
      storage/innobase/include/fsp0fsp.h
      storage/innobase/include/mtr0mtr.h
      storage/innobase/include/mtr0mtr.ic
      storage/innobase/mtr/mtr0mtr.c
      storage/innobase/row/row0ins.c
      storage/innobase/row/row0row.c
      storage/innobase/row/row0upd.c
      storage/innobase/trx/trx0undo.c
      storage/innodb_plugin/ChangeLog
      storage/innodb_plugin/btr/btr0btr.c
      storage/innodb_plugin/btr/btr0cur.c
      storage/innodb_plugin/buf/buf0buf.c
      storage/innodb_plugin/fsp/fsp0fsp.c
      storage/innodb_plugin/include/btr0btr.h
      storage/innodb_plugin/include/btr0cur.h
      storage/innodb_plugin/include/buf0buf.h
      storage/innodb_plugin/include/fsp0fsp.h
      storage/innodb_plugin/include/mtr0mtr.h
      storage/innodb_plugin/include/mtr0mtr.ic
      storage/innodb_plugin/include/trx0undo.h
      storage/innodb_plugin/mtr/mtr0mtr.c
      storage/innodb_plugin/row/row0ins.c
      storage/innodb_plugin/row/row0row.c
      storage/innodb_plugin/row/row0upd.c
      storage/innodb_plugin/sync/sync0sync.c
      storage/innodb_plugin/trx/trx0rec.c
      storage/innodb_plugin/trx/trx0undo.c
      strings/decimal.c
      support-files/mysql.spec.sh
 3011 Bjorn Munch	2011-08-22 [merge]
      merge from 5.1 main

    modified:
      configure.in
      mysql-test/collections/default.experimental
      mysql-test/r/func_time.result
      mysql-test/r/sp.result
      mysql-test/r/sp_trans.result
      mysql-test/suite/innodb_plugin/r/innodb-zip.result
      mysql-test/suite/innodb_plugin/t/innodb-zip.test
      mysql-test/t/func_time.test
      mysql-test/t/sp.test
      mysql-test/t/sp_trans.test
      sql/item_timefunc.cc
      sql/sql_base.cc
      sql/sql_class.cc
      sql/sql_class.h
      sql/sql_insert.cc
      sql/sql_lex.cc
      sql/sql_lex.h
      sql/sql_parse.cc
      sql/sql_prepare.cc
      sql/sql_select.cc
      sql/sql_yacc.yy
      storage/innobase/row/row0sel.c
      storage/innodb_plugin/ChangeLog
      storage/innodb_plugin/btr/btr0btr.c
      storage/innodb_plugin/btr/btr0cur.c
      storage/innodb_plugin/btr/btr0pcur.c
      storage/innodb_plugin/btr/btr0sea.c
      storage/innodb_plugin/dict/dict0crea.c
      storage/innodb_plugin/dict/dict0dict.c
      storage/innodb_plugin/handler/ha_innodb.cc
      storage/innodb_plugin/ibuf/ibuf0ibuf.c
      storage/innodb_plugin/include/btr0btr.h
      storage/innodb_plugin/include/btr0btr.ic
      storage/innodb_plugin/include/sync0sync.h
      storage/innodb_plugin/row/row0sel.c
      storage/innodb_plugin/sync/sync0sync.c
=== modified file 'extra/innochecksum.c'
--- a/extra/innochecksum.c	2011-07-03 18:08:47 +0000
+++ b/extra/innochecksum.c	2011-08-16 22:34:11 +0000
@@ -25,12 +25,7 @@
   Published with a permission.
 */
 
-/* needed to have access to 64 bit file functions */
-#define _LARGEFILE_SOURCE
-#define _LARGEFILE64_SOURCE
-
-#define _XOPEN_SOURCE 500 /* needed to include getopt.h on some platforms. */
-
+#include <my_global.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>
@@ -53,7 +48,6 @@
 /* another argument to specify page ranges... seek to right spot and go from there */
 
 typedef unsigned long int ulint;
-typedef unsigned char uchar;
 
 /* innodb function in name; modified slightly to not have the ASM version (lots of #ifs that didn't apply) */
 ulint mach_read_from_4(uchar *b)

=== modified file 'include/decimal.h'
--- a/include/decimal.h	2007-05-24 10:24:36 +0000
+++ b/include/decimal.h	2011-08-29 09:24:36 +0000
@@ -21,6 +21,15 @@ typedef enum
   decimal_round_mode;
 typedef int32 decimal_digit_t;
 
+/**
+    intg is the number of *decimal* digits (NOT number of decimal_digit_t's !)
+         before the point
+    frac is the number of decimal digits after the point
+    len  is the length of buf (length of allocated space) in decimal_digit_t's,
+         not in bytes
+    sign false means positive, true means negative
+    buf  is an array of decimal_digit_t's
+ */
 typedef struct st_decimal_t {
   int    intg, frac, len;
   my_bool sign;

=== modified file 'mysql-test/r/group_by.result'
--- a/mysql-test/r/group_by.result	2011-02-18 10:50:06 +0000
+++ b/mysql-test/r/group_by.result	2011-08-30 07:56:07 +0000
@@ -1891,4 +1891,36 @@ a	AVG(t1.b)	t11c	t12c
 1	4.0000	6	6
 2	2.0000	7	7
 DROP TABLE t1;
+#
+# Bug#11765254 (58200): Assertion failed: param.sort_length when grouping
+# by functions
+#
+SET SQL_BIG_TABLES=1;
+CREATE TABLE t1(a INT);
+INSERT INTO t1 VALUES (0),(0);
+SELECT 1 FROM t1 GROUP BY IF(`a`,'','');
+1
+1
+SELECT 1 FROM t1 GROUP BY TRIM(LEADING RAND() FROM '');
+1
+1
+SELECT 1 FROM t1 GROUP BY SUBSTRING('',SLEEP(0),'');
+1
+1
+Warnings:
+Warning	1292	Truncated incorrect INTEGER value: ''
+Warning	1292	Truncated incorrect INTEGER value: ''
+Warning	1292	Truncated incorrect INTEGER value: ''
+SELECT 1 FROM t1 GROUP BY SUBSTRING(SYSDATE() FROM 'K' FOR 'jxW<');
+1
+1
+Warnings:
+Warning	1292	Truncated incorrect INTEGER value: 'K'
+Warning	1292	Truncated incorrect INTEGER value: 'jxW<'
+Warning	1292	Truncated incorrect INTEGER value: 'K'
+Warning	1292	Truncated incorrect INTEGER value: 'jxW<'
+Warning	1292	Truncated incorrect INTEGER value: 'K'
+Warning	1292	Truncated incorrect INTEGER value: 'jxW<'
+DROP TABLE t1;
+SET SQL_BIG_TABLES=0;
 # End of 5.1 tests

=== modified file 'mysql-test/r/merge.result'
--- a/mysql-test/r/merge.result	2010-11-30 04:46:43 +0000
+++ b/mysql-test/r/merge.result	2011-08-18 06:38:51 +0000
@@ -2341,4 +2341,33 @@ REPAIR TABLE m1;
 Table	Op	Msg_type	Msg_text
 test.m1	repair	note	The storage engine for the table doesn't support repair
 DROP TABLE m1, t1;
+#
+# BUG#11763712 - 56458: KILLING A FLUSH TABLE FOR A MERGE/CHILD
+#                       CRASHES SERVER
+#
+CREATE TABLE t1(a INT);
+CREATE TABLE t2(a INT);
+CREATE TABLE t3(a INT, b INT);
+CREATE TABLE m1(a INT) ENGINE=MERGE UNION=(t1, t2);
+# Test reopen merge parent failure
+LOCK TABLES m1 READ;
+# Remove 'm1' table using file operations.
+FLUSH TABLES;
+ERROR 42S02: Table 'test.m1' doesn't exist
+UNLOCK TABLES;
+CREATE TABLE m1(a INT) ENGINE=MERGE UNION=(t1, t2);
+# Test reopen merge child failure
+LOCK TABLES m1 READ;
+# Remove 't1' table using file operations.
+FLUSH TABLES;
+ERROR 42S02: Table 'test.t1' doesn't exist
+UNLOCK TABLES;
+CREATE TABLE t1(a INT);
+# Test reattach merge failure
+LOCK TABLES m1 READ;
+# Replace 't1' with 't3' table using file operations.
+FLUSH TABLES;
+ERROR HY000: Can't reopen table: 'm1'
+UNLOCK TABLES;
+DROP TABLE t1, t2, t3, m1;
 End of 5.1 tests

=== modified file 'mysql-test/r/type_newdecimal.result'
--- a/mysql-test/r/type_newdecimal.result	2010-11-11 09:46:49 +0000
+++ b/mysql-test/r/type_newdecimal.result	2011-08-29 09:24:36 +0000
@@ -1927,3 +1927,14 @@ f1
 0.000000000000000000000000
 DROP TABLE IF EXISTS t1;
 End of 5.1 tests
+#
+# BUG#12911710 - VALGRIND FAILURE IN 
+# ROW-DEBUG:PERFSCHEMA.SOCKET_SUMMARY_BY_INSTANCE_FUNC 
+#
+CREATE TABLE t1(d1 DECIMAL(60,0) NOT NULL,
+d2 DECIMAL(60,0) NOT NULL);
+INSERT INTO t1 (d1, d2) VALUES(0.0, 0.0);
+SELECT d1 * d2 FROM t1;
+d1 * d2
+0
+DROP TABLE t1;

=== modified file 'mysql-test/suite/innodb_plugin/r/innodb-index.result'
--- a/mysql-test/suite/innodb_plugin/r/innodb-index.result	2011-07-07 21:29:30 +0000
+++ b/mysql-test/suite/innodb_plugin/r/innodb-index.result	2011-08-29 08:16:42 +0000
@@ -1024,6 +1024,15 @@ INSERT INTO t1 VALUES(9,@r,@r,@r,@r,@r,@
 UPDATE t1 SET a=1000;
 DELETE FROM t1;
 DROP TABLE t1;
+CREATE TABLE bug12547647(
+a INT NOT NULL, b BLOB NOT NULL, c TEXT,
+PRIMARY KEY (b(10), a), INDEX (c(10))
+) ENGINE=InnoDB ROW_FORMAT=DYNAMIC;
+INSERT INTO bug12547647 VALUES (5,repeat('khdfo5AlOq',1900),repeat('g',7731));
+COMMIT;
+UPDATE bug12547647 SET c = REPEAT('b',16928);
+ERROR 42000: Row size too large. The maximum row size for the used table type, not counting BLOBs, is 8126. You have to change some columns to TEXT or BLOBs
+DROP TABLE bug12547647;
 set global innodb_file_per_table=0;
 set global innodb_file_format=Antelope;
 set global innodb_file_format_check=Antelope;

=== modified file 'mysql-test/suite/innodb_plugin/t/innodb-index.test'
--- a/mysql-test/suite/innodb_plugin/t/innodb-index.test	2011-07-07 21:29:30 +0000
+++ b/mysql-test/suite/innodb_plugin/t/innodb-index.test	2011-08-29 08:16:42 +0000
@@ -480,6 +480,19 @@ DELETE FROM t1;
 -- sleep 10
 DROP TABLE t1;
 
+# Bug#12547647 UPDATE LOGGING COULD EXCEED LOG PAGE SIZE
+CREATE TABLE bug12547647(
+a INT NOT NULL, b BLOB NOT NULL, c TEXT,
+PRIMARY KEY (b(10), a), INDEX (c(10))
+) ENGINE=InnoDB ROW_FORMAT=DYNAMIC;
+
+INSERT INTO bug12547647 VALUES (5,repeat('khdfo5AlOq',1900),repeat('g',7731));
+COMMIT;
+# The following used to cause infinite undo log allocation.
+--error ER_TOO_BIG_ROWSIZE
+UPDATE bug12547647 SET c = REPEAT('b',16928);
+DROP TABLE bug12547647;
+
 eval set global innodb_file_per_table=$per_table;
 eval set global innodb_file_format=$format;
 eval set global innodb_file_format_check=$format;

=== modified file 'mysql-test/t/group_by.test'
--- a/mysql-test/t/group_by.test	2011-02-18 10:50:06 +0000
+++ b/mysql-test/t/group_by.test	2011-08-30 07:56:07 +0000
@@ -1283,5 +1283,19 @@ FROM t1 GROUP BY a;
 
 DROP TABLE t1;
 
+--echo #
+--echo # Bug#11765254 (58200): Assertion failed: param.sort_length when grouping
+--echo # by functions
+--echo #
+
+SET SQL_BIG_TABLES=1;
+CREATE TABLE t1(a INT);
+INSERT INTO t1 VALUES (0),(0);
+SELECT 1 FROM t1 GROUP BY IF(`a`,'','');
+SELECT 1 FROM t1 GROUP BY TRIM(LEADING RAND() FROM '');
+SELECT 1 FROM t1 GROUP BY SUBSTRING('',SLEEP(0),'');
+SELECT 1 FROM t1 GROUP BY SUBSTRING(SYSDATE() FROM 'K' FOR 'jxW<');
+DROP TABLE t1;
+SET SQL_BIG_TABLES=0;
 
 --echo # End of 5.1 tests

=== modified file 'mysql-test/t/merge.test'
--- a/mysql-test/t/merge.test	2010-04-26 13:44:10 +0000
+++ b/mysql-test/t/merge.test	2011-08-18 06:38:51 +0000
@@ -1783,4 +1783,49 @@ REPAIR TABLE m1;
 #
 DROP TABLE m1, t1;
 
+
+--echo #
+--echo # BUG#11763712 - 56458: KILLING A FLUSH TABLE FOR A MERGE/CHILD
+--echo #                       CRASHES SERVER
+--echo #
+CREATE TABLE t1(a INT);
+CREATE TABLE t2(a INT);
+CREATE TABLE t3(a INT, b INT);
+CREATE TABLE m1(a INT) ENGINE=MERGE UNION=(t1, t2);
+
+--echo # Test reopen merge parent failure
+LOCK TABLES m1 READ;
+--echo # Remove 'm1' table using file operations.
+remove_file $MYSQLD_DATADIR/test/m1.MRG;
+remove_file $MYSQLD_DATADIR/test/m1.frm;
+--error ER_NO_SUCH_TABLE
+FLUSH TABLES;
+UNLOCK TABLES;
+CREATE TABLE m1(a INT) ENGINE=MERGE UNION=(t1, t2);
+
+--echo # Test reopen merge child failure
+LOCK TABLES m1 READ;
+--echo # Remove 't1' table using file operations.
+remove_file $MYSQLD_DATADIR/test/t1.frm;
+remove_file $MYSQLD_DATADIR/test/t1.MYI;
+remove_file $MYSQLD_DATADIR/test/t1.MYD;
+--error ER_NO_SUCH_TABLE
+FLUSH TABLES;
+UNLOCK TABLES;
+CREATE TABLE t1(a INT);
+
+--echo # Test reattach merge failure
+LOCK TABLES m1 READ;
+--echo # Replace 't1' with 't3' table using file operations.
+remove_file $MYSQLD_DATADIR/test/t1.frm;
+remove_file $MYSQLD_DATADIR/test/t1.MYI;
+remove_file $MYSQLD_DATADIR/test/t1.MYD;
+copy_file $MYSQLD_DATADIR/test/t3.frm $MYSQLD_DATADIR/test/t1.frm;
+copy_file $MYSQLD_DATADIR/test/t3.MYI $MYSQLD_DATADIR/test/t1.MYI;
+copy_file $MYSQLD_DATADIR/test/t3.MYD $MYSQLD_DATADIR/test/t1.MYD;
+--error ER_CANT_REOPEN_TABLE
+FLUSH TABLES;
+UNLOCK TABLES;
+DROP TABLE t1, t2, t3, m1;
+
 --echo End of 5.1 tests

=== modified file 'mysql-test/t/type_newdecimal.test'
--- a/mysql-test/t/type_newdecimal.test	2010-11-11 09:46:49 +0000
+++ b/mysql-test/t/type_newdecimal.test	2011-08-29 09:24:36 +0000
@@ -1526,3 +1526,17 @@ DROP TABLE IF EXISTS t1;
 
 
 --echo End of 5.1 tests
+
+--echo #
+--echo # BUG#12911710 - VALGRIND FAILURE IN 
+--echo # ROW-DEBUG:PERFSCHEMA.SOCKET_SUMMARY_BY_INSTANCE_FUNC 
+--echo #
+
+CREATE TABLE t1(d1 DECIMAL(60,0) NOT NULL,
+                d2 DECIMAL(60,0) NOT NULL);
+
+INSERT INTO t1 (d1, d2) VALUES(0.0, 0.0);
+SELECT d1 * d2 FROM t1;
+
+DROP TABLE t1;
+

=== modified file 'mysql-test/valgrind.supp'
--- a/mysql-test/valgrind.supp	2011-06-30 15:37:13 +0000
+++ b/mysql-test/valgrind.supp	2011-08-30 08:19:36 +0000
@@ -791,3 +791,40 @@
    fun:fil_delete_tablespace
    fun:row_drop_table_for_mysql
 }
+
+# Note the wildcard in the (mangled) function signatures of
+# write_keys() and find_all_keys().
+# They both return ha_rows, which is platform dependent.
+{
+   Bug#12856915 VALGRIND FAILURE IN FILESORT/CREATE_SORT_INDEX / one
+   Memcheck:Param
+   write(buf)
+   obj:*/libpthread*.so
+   fun:my_write
+   fun:my_b_flush_io_cache
+   fun:_my_b_write
+   fun:_Z*10write_keysP13st_sort_paramPPhjP11st_io_cacheS4_
+   fun:_Z*13find_all_keysP13st_sort_paramP10SQL_SELECTPPhP11st_io_cacheS6_S6_
+   fun:_Z8filesortP3THDP8st_tableP13st_sort_fieldjP10SQL_SELECTybPy
+}
+
+{
+   Bug#12856915 VALGRIND FAILURE IN FILESORT/CREATE_SORT_INDEX / two
+   Memcheck:Param
+   write(buf)
+   obj:*/libpthread*.so
+   fun:my_write
+   fun:my_b_flush_io_cache
+   fun:_Z15merge_many_buffP13st_sort_paramPhP10st_buffpekPjP11st_io_cache
+   fun:_Z8filesortP3THDP8st_tableP13st_sort_fieldjP10SQL_SELECTybPy
+}
+
+{
+   Bug#12856915 VALGRIND FAILURE IN FILESORT/CREATE_SORT_INDEX / three
+   Memcheck:Param
+   write(buf)
+   obj:*/libpthread*.so
+   fun:my_write
+   fun:my_b_flush_io_cache
+   fun:_Z8filesortP3THDP8st_tableP13st_sort_fieldjP10SQL_SELECTybPy
+}

=== modified file 'sql/filesort.cc'
--- a/sql/filesort.cc	2011-06-30 15:37:13 +0000
+++ b/sql/filesort.cc	2011-08-30 07:56:07 +0000
@@ -144,8 +144,6 @@ ha_rows filesort(THD *thd, TABLE *table,
   error= 1;
   bzero((char*) &param,sizeof(param));
   param.sort_length= sortlength(thd, sortorder, s_length, &multi_byte_charset);
-  /* filesort cannot handle zero-length records. */
-  DBUG_ASSERT(param.sort_length);
   param.ref_length= table->file->ref_length;
   param.addon_field= 0;
   param.addon_length= 0;
@@ -257,6 +255,9 @@ ha_rows filesort(THD *thd, TABLE *table,
   }
   else
   {
+    /* filesort cannot handle zero-length records during merge. */
+    DBUG_ASSERT(param.sort_length != 0);
+
     if (table_sort.buffpek && table_sort.buffpek_len < maxbuffer)
     {
       x_free(table_sort.buffpek);
@@ -959,21 +960,10 @@ static void make_sortkey(register SORTPA
       if (addonf->null_bit && field->is_null())
       {
         nulls[addonf->null_offset]|= addonf->null_bit;
-#ifdef HAVE_purify
-	bzero(to, addonf->length);
-#endif
       }
       else
       {
-#ifdef HAVE_purify
-        uchar *end= field->pack(to, field->ptr);
-	uint length= (uint) ((to + addonf->length) - end);
-	DBUG_ASSERT((int) length >= 0);
-	if (length)
-	  bzero(end, length);
-#else
         (void) field->pack(to, field->ptr);
-#endif
       }
       to+= addonf->length;
     }

=== modified file 'sql/my_decimal.h'
--- a/sql/my_decimal.h	2011-07-03 15:47:37 +0000
+++ b/sql/my_decimal.h	2011-08-29 09:24:36 +0000
@@ -101,12 +101,8 @@ public:
   {
     len= DECIMAL_BUFF_LENGTH;
     buf= buffer;
-#if !defined (HAVE_purify) && !defined(DBUG_OFF)
-    /* Set buffer to 'random' value to find wrong buffer usage */
-    for (uint i= 0; i < DECIMAL_BUFF_LENGTH; i++)
-      buffer[i]= i;
-#endif
   }
+
   my_decimal()
   {
     init();

=== modified file 'sql/rpl_rli.h'
--- a/sql/rpl_rli.h	2011-06-30 15:37:13 +0000
+++ b/sql/rpl_rli.h	2011-08-26 09:57:29 +0000
@@ -221,7 +221,13 @@ public:
 #endif  
 
   /* if not set, the value of other members of the structure are undefined */
-  bool inited;
+  /*
+    inited changes its value within LOCK_active_mi-guarded critical
+    sections  at times of start_slave_threads() (0->1) and end_slave() (1->0).
+    Readers may not acquire the mutex while they realize potential concurrency
+    issue.
+  */
+  volatile bool inited;
   volatile bool abort_slave;
   volatile uint slave_running;
 

=== modified file 'sql/slave.cc'
--- a/sql/slave.cc	2011-07-03 15:47:37 +0000
+++ b/sql/slave.cc	2011-08-26 09:57:29 +0000
@@ -598,11 +598,15 @@ int start_slave_thread(pthread_handler h
       DBUG_PRINT("sleep",("Waiting for slave thread to start"));
       const char* old_msg = thd->enter_cond(start_cond,cond_lock,
                                             "Waiting for slave thread to start");
-      pthread_cond_wait(start_cond,cond_lock);
+      pthread_cond_wait(start_cond, cond_lock);
       thd->exit_cond(old_msg);
       pthread_mutex_lock(cond_lock); // re-acquire it as exit_cond() released
       if (thd->killed)
+      {
+        if (start_lock)
+          pthread_mutex_unlock(start_lock);
         DBUG_RETURN(thd->killed_errno());
+      }
     }
   }
   if (start_lock)
@@ -2531,6 +2535,7 @@ pthread_handler_t handle_slave_io(void *
 
   thd= new THD; // note that contructor of THD uses DBUG_ !
   THD_CHECK_SENTRY(thd);
+  DBUG_ASSERT(mi->io_thd == 0);
   mi->io_thd = thd;
 
   pthread_detach_this_thread();
@@ -4489,9 +4494,6 @@ int rotate_relay_log(Master_info* mi)
   Relay_log_info* rli= &mi->rli;
   int error= 0;
 
-  /* We don't lock rli->run_lock. This would lead to deadlocks. */
-  pthread_mutex_lock(&mi->run_lock);
-
   /*
      We need to test inited because otherwise, new_file() will attempt to lock
      LOCK_log, which may not be inited (if we're not a slave).
@@ -4521,7 +4523,6 @@ int rotate_relay_log(Master_info* mi)
   */
   rli->relay_log.harvest_bytes_written(&rli->log_space_total);
 end:
-  pthread_mutex_unlock(&mi->run_lock);
   DBUG_RETURN(error);
 }
 

=== modified file 'sql/sql_base.cc'
--- a/sql/sql_base.cc	2011-08-02 07:33:45 +0000
+++ b/sql/sql_base.cc	2011-08-24 07:18:00 +0000
@@ -96,6 +96,13 @@ static TABLE_SHARE *oldest_unused_share,
 static pthread_mutex_t LOCK_table_share;
 static bool table_def_inited= 0;
 
+/**
+  Dummy TABLE instance which is used in reopen_tables() and reattach_merge()
+  functions to mark MERGE tables and their children with which there is some
+  kind of problem and which therefore we need to close.
+*/
+static TABLE bad_merge_marker;
+
 static int open_unireg_entry(THD *thd, TABLE *entry, TABLE_LIST *table_list,
 			     const char *alias,
                              char *cache_key, uint cache_key_length,
@@ -3215,46 +3222,65 @@ void close_data_files_and_morph_locks(TH
 
 
 /**
+  @brief Mark merge parent and children with bad_merge_marker
+
+  @param[in,out]  parent       the TABLE object of the parent
+*/
+
+static void mark_merge_parent_and_children_as_bad(TABLE *parent)
+{
+  TABLE_LIST *child_l;
+  DBUG_ENTER("mark_merge_parent_and_children_as_bad");
+  parent->parent= &bad_merge_marker;
+  for (child_l= parent->child_l; ; child_l= child_l->next_global)
+  {
+    child_l->table->parent= &bad_merge_marker;
+    child_l->table= NULL;
+    if (&child_l->next_global == parent->child_last_l)
+      break;
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+/**
   Reattach MERGE children after reopen.
 
   @param[in]     thd            thread context
-  @param[in,out] err_tables_p   pointer to pointer of tables in error
+
+  @note If reattach failed for certain MERGE table, the table (and all
+        it's children) are marked with bad_merge_marker.
 
   @return       status
-    @retval     FALSE           OK, err_tables_p unchanged
-    @retval     TRUE            Error, err_tables_p contains table(s)
+    @retval     FALSE           OK
+    @retval     TRUE            Error
 */
 
-static bool reattach_merge(THD *thd, TABLE **err_tables_p)
+static bool reattach_merge(THD *thd)
 {
   TABLE *table;
-  TABLE *next;
-  TABLE **prv_p= &thd->open_tables;
   bool error= FALSE;
   DBUG_ENTER("reattach_merge");
 
-  for (table= thd->open_tables; table; table= next)
+  for (table= thd->open_tables; table; table= table->next)
   {
-    next= table->next;
-    DBUG_PRINT("tcache", ("check table: '%s'.'%s' 0x%lx  next: 0x%lx",
+    DBUG_PRINT("tcache", ("check table: '%s'.'%s' 0x%lx",
                           table->s->db.str, table->s->table_name.str,
-                          (long) table, (long) next));
-    /* Reattach children for MERGE tables with "closed data files" only. */
-    if (table->child_l && !table->children_attached)
+                          (long) table));
+    /*
+      Reattach children only for MERGE tables that had children or parent
+      with "closed data files" and were reopen. For extra safety skip MERGE
+      tables which we failed to reopen (should not happen with current code).
+    */
+    if (table->child_l && table->parent != &bad_merge_marker &&
+        !table->children_attached)
     {
       DBUG_PRINT("tcache", ("MERGE parent, attach children"));
-      if(table->file->extra(HA_EXTRA_ATTACH_CHILDREN))
+      if (table->file->extra(HA_EXTRA_ATTACH_CHILDREN))
       {
         my_error(ER_CANT_REOPEN_TABLE, MYF(0), table->alias);
         error= TRUE;
-        /* Remove table from open_tables. */
-        *prv_p= next;
-        if (next)
-          prv_p= &next->next;
-        /* Stack table on error list. */
-        table->next= *err_tables_p;
-        *err_tables_p= table;
-        continue;
+        mark_merge_parent_and_children_as_bad(table);
       }
       else
       {
@@ -3264,7 +3290,6 @@ static bool reattach_merge(THD *thd, TAB
                             table->s->table_name.str, (long) table));
       }
     }
-    prv_p= &table->next;
   }
   DBUG_RETURN(error);
 }
@@ -3294,7 +3319,6 @@ bool reopen_tables(THD *thd, bool get_lo
 {
   TABLE *table,*next,**prev;
   TABLE **tables,**tables_ptr;			// For locks
-  TABLE *err_tables= NULL;
   bool error=0, not_used;
   bool merge_table_found= FALSE;
   const uint flags= MYSQL_LOCK_NOTIFY_IF_NEED_REOPEN |
@@ -3328,29 +3352,69 @@ bool reopen_tables(THD *thd, bool get_lo
   for (table=thd->open_tables; table ; table=next)
   {
     uint db_stat=table->db_stat;
+    TABLE *parent= table->child_l ? table : table->parent;
     next=table->next;
     DBUG_PRINT("tcache", ("open table: '%s'.'%s' 0x%lx  "
                           "parent: 0x%lx  db_stat: %u",
                           table->s->db.str, table->s->table_name.str,
                           (long) table, (long) table->parent, db_stat));
-    if (table->child_l && !db_stat)
+    /*
+      If we need to reopen child or parent table in a MERGE table, then
+      children in this MERGE table has to be already detached at this
+      point.
+    */
+    DBUG_ASSERT(db_stat || !parent || !parent->children_attached);
+    /*
+      Thanks to the above assumption the below condition will guarantee that
+      merge_table_found is TRUE when we need to reopen child or parent table.
+      Note that it works even in situation when it is only a child and not a
+      parent that needs reopen (this can happen when get_locks == FALSE).
+    */
+    if (table->child_l && !table->children_attached)
       merge_table_found= TRUE;
-    if (!tables || (!db_stat && reopen_table(table)))
+
+    if (!tables)
     {
-      my_error(ER_CANT_REOPEN_TABLE, MYF(0), table->alias);
       /*
-        If we could not allocate 'tables', we may close open tables
-        here. If a MERGE table is affected, detach the children first.
-        It is not necessary to clear the child or parent table reference
-        of this table because the TABLE is freed. But we need to clear
-        the child or parent references of the other belonging tables so
-        that they cannot be moved into the unused_tables chain with
-        these pointers set.
+        If we could not allocate 'tables' we close ALL open tables here.
+        Before closing MERGE child or parent we need to detach children
+        and/or clear references in/to them.
       */
-      if (table->child_l || table->parent)
+      if (parent)
         detach_merge_children(table, TRUE);
-      VOID(hash_delete(&open_cache,(uchar*) table));
-      error=1;
+    }
+    else if (table->parent == &bad_merge_marker)
+    {
+      /*
+        This is either a child or a parent of a MERGE table for which
+        we already decided that we are unable to reopen it. Close it.
+
+        Reset parent reference, it may be used while freeing the table.
+      */
+      table->parent= NULL;
+    }
+    else if (!db_stat && reopen_table(table))
+    {
+      /*
+        If we fail to reopen a child or a parent in a MERGE table and the
+        MERGE table is affected for the first time, mark all relevant tables
+        invalid. Otherwise handle it as usual.
+
+        All in all we must end up with:
+        - child tables are detached from parent. This was done earlier,
+          but child<->parent references were kept valid for reopen.
+        - parent is not in the to-be-locked tables
+        - all child tables and parent are not in the THD::open_tables.
+        - all child tables and parent are not in the open_cache.
+
+        Please note that below we do additional pass through THD::open_tables
+        list to achieve the last three points.
+      */
+      if (parent)
+      {
+        mark_merge_parent_and_children_as_bad(parent);
+        table->parent= NULL;
+      }
     }
     else
     {
@@ -3366,21 +3430,56 @@ bool reopen_tables(THD *thd, bool get_lo
 	table->s->version=0;
 	table->open_placeholder= 0;
       }
+      continue;
     }
+    my_error(ER_CANT_REOPEN_TABLE, MYF(0), table->alias);
+    VOID(hash_delete(&open_cache, (uchar *) table));
+    error= 1;
   }
   *prev=0;
   /*
     When all tables are open again, we can re-attach MERGE children to
-    their parents. All TABLE objects are still present.
+    their parents.
+
+    If there was an error while reopening a child or a parent of a MERGE
+    table, or while reattaching child tables to their parents, some tables
+    may have been kept open but marked for close with bad_merge_marker.
+    Close these tables now.
   */
-  DBUG_PRINT("tcache", ("re-attaching MERGE tables: %d", merge_table_found));
-  if (!error && merge_table_found && reattach_merge(thd, &err_tables))
+  if (tables && merge_table_found && (error|= reattach_merge(thd)))
   {
-    while (err_tables)
+    prev= &thd->open_tables;
+    for (table= thd->open_tables; table; table= next)
     {
-      VOID(hash_delete(&open_cache, (uchar*) err_tables));
-      err_tables= err_tables->next;
+      next= table->next;
+      if (table->parent == &bad_merge_marker)
+      {
+        /* Remove merge parent from to-be-locked tables array. */
+        if (get_locks && table->child_l)
+        {
+          TABLE **t;
+          for (t= tables; t < tables_ptr; t++)
+          {
+            if (*t == table)
+            {
+              tables_ptr--;
+              memmove(t, t + 1, (tables_ptr - t) * sizeof(TABLE *));
+              break;
+            }
+          }
+        }
+        /* Reset parent reference, it may be used while freeing the table. */
+        table->parent= NULL;
+        /* Free table. */
+        VOID(hash_delete(&open_cache, (uchar *) table));
+      }
+      else
+      {
+        *prev= table;
+        prev= &table->next;
+      }
     }
+    *prev= 0;
   }
   DBUG_PRINT("tcache", ("open tables to lock: %u",
                         (uint) (tables_ptr - tables)));

=== modified file 'storage/innobase/btr/btr0btr.c'
--- a/storage/innobase/btr/btr0btr.c	2011-06-16 13:11:43 +0000
+++ b/storage/innobase/btr/btr0btr.c	2011-08-29 08:16:42 +0000
@@ -300,29 +300,30 @@ btr_page_alloc_for_ibuf(
 /******************************************************************
 Allocates a new file page to be used in an index tree. NOTE: we assume
 that the caller has made the reservation for free extents! */
-
-page_t*
-btr_page_alloc(
-/*===========*/
-					/* out: new allocated page, x-latched;
-					NULL if out of space */
+static
+ulint
+btr_page_alloc_low(
+/*===============*/
+					/* out: allocated page number,
+					FIL_NULL if out of space */
 	dict_index_t*	index,		/* in: index */
 	ulint		hint_page_no,	/* in: hint of a good page */
 	byte		file_direction,	/* in: direction where a possible
 					page split is made */
 	ulint		level,		/* in: level where the page is placed
 					in the tree */
-	mtr_t*		mtr)		/* in: mtr */
+	mtr_t*		mtr,		/* in/out: mini-transaction
+					for the allocation */
+	mtr_t*		init_mtr)	/* in/out: mini-transaction
+					in which the page should be
+					initialized (may be the same
+					as mtr), or NULL if it should
+					not be initialized (the page
+					at hint was previously freed
+					in mtr) */
 {
 	fseg_header_t*	seg_header;
 	page_t*		root;
-	page_t*		new_page;
-	ulint		new_page_no;
-
-	if (index->type & DICT_IBUF) {
-
-		return(btr_page_alloc_for_ibuf(index, mtr));
-	}
 
 	root = btr_root_get(index, mtr);
 
@@ -336,19 +337,61 @@ btr_page_alloc(
 	reservation for free extents, and thus we know that a page can
 	be allocated: */
 
-	new_page_no = fseg_alloc_free_page_general(seg_header, hint_page_no,
-						   file_direction, TRUE, mtr);
+	return(fseg_alloc_free_page_general(seg_header, hint_page_no,
+					    file_direction, TRUE,
+					    mtr, init_mtr));
+}
+
+/**************************************************************//**
+Allocates a new file page to be used in an index tree. NOTE: we assume
+that the caller has made the reservation for free extents! */
+
+page_t*
+btr_page_alloc(
+/*===========*/
+					/* out:	new allocated block, x-latched;
+					NULL if out of space */
+	dict_index_t*	index,		/* in: index */
+	ulint		hint_page_no,	/* in: hint of a good page */
+	byte		file_direction,	/* in: direction where a possible
+					page split is made */
+	ulint		level,		/* in: level where the page is placed
+					in the tree */
+	mtr_t*		mtr,		/* in/out: mini-transaction
+					for the allocation */
+	mtr_t*		init_mtr)	/* in/out: mini-transaction
+					for x-latching and initializing
+					the page */
+{
+	page_t*		new_page;
+	ulint		new_page_no;
+
+	if (index->type & DICT_IBUF) {
+
+		return(btr_page_alloc_for_ibuf(index, mtr));
+	}
+
+	new_page_no = btr_page_alloc_low(
+		index, hint_page_no, file_direction, level, mtr, init_mtr);
+
 	if (new_page_no == FIL_NULL) {
 
 		return(NULL);
 	}
 
 	new_page = buf_page_get(dict_index_get_space(index), new_page_no,
-				RW_X_LATCH, mtr);
+				RW_X_LATCH, init_mtr);
 #ifdef UNIV_SYNC_DEBUG
 	buf_page_dbg_add_level(new_page, SYNC_TREE_NODE_NEW);
 #endif /* UNIV_SYNC_DEBUG */
 
+	if (mtr->freed_clust_leaf) {
+		mtr_memo_release(mtr, new_page, MTR_MEMO_FREE_CLUST_LEAF);
+		ut_ad(!mtr_memo_contains(mtr, buf_block_align(new_page),
+					 MTR_MEMO_FREE_CLUST_LEAF));
+	}
+
+	ut_ad(btr_freed_leaves_validate(mtr));
 	return(new_page);
 }
 
@@ -464,6 +507,16 @@ btr_page_free_low(
 	page_no = buf_frame_get_page_no(page);
 
 	fseg_free_page(seg_header, space, page_no, mtr);
+
+	/* The page was marked free in the allocation bitmap, but it
+	should remain buffer-fixed until mtr_commit(mtr) or until it
+	is explicitly freed from the mini-transaction. */
+	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
+				MTR_MEMO_PAGE_X_FIX));
+	/* TODO: Discard any operations on the page from the redo log
+	and remove the block from the flush list and the buffer pool.
+	This would free up buffer pool earlier and reduce writes to
+	both the tablespace and the redo log. */
 }
 
 /******************************************************************
@@ -479,12 +532,143 @@ btr_page_free(
 {
 	ulint		level;
 
+	ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
 	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
 				MTR_MEMO_PAGE_X_FIX));
 	level = btr_page_get_level(page, mtr);
 
 	btr_page_free_low(index, page, level, mtr);
+
+	/* The handling of MTR_MEMO_FREE_CLUST_LEAF assumes this. */
+	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
+				MTR_MEMO_PAGE_X_FIX));
+
+	if (level == 0 && (index->type & DICT_CLUSTERED)) {
+		/* We may have to call btr_mark_freed_leaves() to
+		temporarily mark the block nonfree for invoking
+		btr_store_big_rec_extern_fields() after an
+		update. Remember that the block was freed. */
+		mtr->freed_clust_leaf = TRUE;
+		mtr_memo_push(mtr, buf_block_align(page),
+			      MTR_MEMO_FREE_CLUST_LEAF);
+	}
+
+	ut_ad(btr_freed_leaves_validate(mtr));
+}
+
+/**************************************************************//**
+Marks all MTR_MEMO_FREE_CLUST_LEAF pages nonfree or free.
+For invoking btr_store_big_rec_extern_fields() after an update,
+we must temporarily mark freed clustered index pages allocated, so
+that off-page columns will not be allocated from them. Between the
+btr_store_big_rec_extern_fields() and mtr_commit() we have to
+mark the pages free again, so that no pages will be leaked. */
+
+void
+btr_mark_freed_leaves(
+/*==================*/
+	dict_index_t*	index,	/* in/out: clustered index */
+	mtr_t*		mtr,	/* in/out: mini-transaction */
+	ibool		nonfree)/* in: TRUE=mark nonfree, FALSE=mark freed */
+{
+	/* This is loosely based on mtr_memo_release(). */
+
+	ulint	offset;
+
+	ut_ad(index->type & DICT_CLUSTERED);
+	ut_ad(mtr->magic_n == MTR_MAGIC_N);
+	ut_ad(mtr->state == MTR_ACTIVE);
+
+	if (!mtr->freed_clust_leaf) {
+		return;
+	}
+
+	offset = dyn_array_get_data_size(&mtr->memo);
+
+	while (offset > 0) {
+		mtr_memo_slot_t*	slot;
+		buf_block_t*		block;
+
+		offset -= sizeof *slot;
+
+		slot = dyn_array_get_element(&mtr->memo, offset);
+
+		if (slot->type != MTR_MEMO_FREE_CLUST_LEAF) {
+			continue;
+		}
+
+		/* Because btr_page_alloc() does invoke
+		mtr_memo_release on MTR_MEMO_FREE_CLUST_LEAF, all
+		blocks tagged with MTR_MEMO_FREE_CLUST_LEAF in the
+		memo must still be clustered index leaf tree pages. */
+		block = slot->object;
+		ut_a(buf_block_get_space(block)
+		     == dict_index_get_space(index));
+		ut_a(fil_page_get_type(buf_block_get_frame(block))
+		     == FIL_PAGE_INDEX);
+		ut_a(btr_page_get_level(buf_block_get_frame(block), mtr) == 0);
+
+		if (nonfree) {
+			/* Allocate the same page again. */
+			ulint	page_no;
+			page_no = btr_page_alloc_low(
+				index, buf_block_get_page_no(block),
+				FSP_NO_DIR, 0, mtr, NULL);
+			ut_a(page_no == buf_block_get_page_no(block));
+		} else {
+			/* Assert that the page is allocated and free it. */
+			btr_page_free_low(index, buf_block_get_frame(block),
+					  0, mtr);
+		}
+	}
+
+	ut_ad(btr_freed_leaves_validate(mtr));
+}
+
+#ifdef UNIV_DEBUG
+/**************************************************************//**
+Validates all pages marked MTR_MEMO_FREE_CLUST_LEAF.
+See btr_mark_freed_leaves(). */
+
+ibool
+btr_freed_leaves_validate(
+/*======================*/
+			/* out: TRUE if valid */
+	mtr_t*	mtr)	/* in: mini-transaction */
+{
+	ulint	offset;
+
+	ut_ad(mtr->magic_n == MTR_MAGIC_N);
+	ut_ad(mtr->state == MTR_ACTIVE);
+
+	offset = dyn_array_get_data_size(&mtr->memo);
+
+	while (offset > 0) {
+		mtr_memo_slot_t*	slot;
+		buf_block_t*		block;
+
+		offset -= sizeof *slot;
+
+		slot = dyn_array_get_element(&mtr->memo, offset);
+
+		if (slot->type != MTR_MEMO_FREE_CLUST_LEAF) {
+			continue;
+		}
+
+		ut_a(mtr->freed_clust_leaf);
+		/* Because btr_page_alloc() does invoke
+		mtr_memo_release on MTR_MEMO_FREE_CLUST_LEAF, all
+		blocks tagged with MTR_MEMO_FREE_CLUST_LEAF in the
+		memo must still be clustered index leaf tree pages. */
+		block = slot->object;
+		ut_a(fil_page_get_type(buf_block_get_frame(block))
+		     == FIL_PAGE_INDEX);
+		ut_a(btr_page_get_level(buf_block_get_frame(block), mtr) == 0);
+	}
+
+	return(TRUE);
 }
+#endif /* UNIV_DEBUG */
 
 /******************************************************************
 Sets the child node file address in a node pointer. */
@@ -1015,7 +1199,7 @@ btr_root_raise_and_insert(
 	a node pointer to the new page, and then splitting the new page. */
 
 	new_page = btr_page_alloc(index, 0, FSP_NO_DIR,
-				  btr_page_get_level(root, mtr), mtr);
+				  btr_page_get_level(root, mtr), mtr, mtr);
 
 	btr_page_create(new_page, index, mtr);
 
@@ -1636,7 +1820,7 @@ func_start:
 
 	/* 2. Allocate a new page to the index */
 	new_page = btr_page_alloc(cursor->index, hint_page_no, direction,
-				  btr_page_get_level(page, mtr), mtr);
+				  btr_page_get_level(page, mtr), mtr, mtr);
 	btr_page_create(new_page, cursor->index, mtr);
 
 	/* 3. Calculate the first record on the upper half-page, and the

=== modified file 'storage/innobase/btr/btr0cur.c'
--- a/storage/innobase/btr/btr0cur.c	2011-06-16 08:51:04 +0000
+++ b/storage/innobase/btr/btr0cur.c	2011-08-29 08:16:42 +0000
@@ -2051,43 +2051,6 @@ return_after_reservations:
 	return(err);
 }
 
-/*****************************************************************
-Commits and restarts a mini-transaction so that it will retain an
-x-lock on index->lock and the cursor page. */
-
-void
-btr_cur_mtr_commit_and_start(
-/*=========================*/
-	btr_cur_t*	cursor,	/* in: cursor */
-	mtr_t*		mtr)	/* in/out: mini-transaction */
-{
-	buf_block_t*	block;
-
-	block = buf_block_align(btr_cur_get_rec(cursor));
-
-	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index),
-				MTR_MEMO_X_LOCK));
-	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
-	/* Keep the locks across the mtr_commit(mtr). */
-	rw_lock_x_lock(dict_index_get_lock(cursor->index));
-	rw_lock_x_lock(&block->lock);
-	mutex_enter(&block->mutex);
-#ifdef UNIV_SYNC_DEBUG
-	buf_block_buf_fix_inc_debug(block, __FILE__, __LINE__);
-#else
-	buf_block_buf_fix_inc(block);
-#endif
-	mutex_exit(&block->mutex);
-	/* Write out the redo log. */
-	mtr_commit(mtr);
-	mtr_start(mtr);
-	/* Reassociate the locks with the mini-transaction.
-	They will be released on mtr_commit(mtr). */
-	mtr_memo_push(mtr, dict_index_get_lock(cursor->index),
-		      MTR_MEMO_X_LOCK);
-	mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX);
-}
-
 /*==================== B-TREE DELETE MARK AND UNMARK ===============*/
 
 /********************************************************************
@@ -3494,6 +3457,11 @@ btr_store_big_rec_extern_fields(
 					this function returns */
 	big_rec_t*	big_rec_vec,	/* in: vector containing fields
 					to be stored externally */
+	mtr_t*		alloc_mtr,	/* in/out: in an insert, NULL;
+					in an update, local_mtr for
+					allocating BLOB pages and
+					updating BLOB pointers; alloc_mtr
+					must not have freed any leaf pages */
 	mtr_t*		local_mtr __attribute__((unused))) /* in: mtr
 					containing the latch to rec and to the
 					tree */
@@ -3514,6 +3482,8 @@ btr_store_big_rec_extern_fields(
 	ulint	i;
 	mtr_t	mtr;
 
+	ut_ad(local_mtr);
+	ut_ad(!alloc_mtr || alloc_mtr == local_mtr);
 	ut_ad(rec_offs_validate(rec, index, offsets));
 	ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
 				MTR_MEMO_X_LOCK));
@@ -3523,6 +3493,25 @@ btr_store_big_rec_extern_fields(
 
 	space_id = buf_frame_get_space_id(rec);
 
+	if (alloc_mtr) {
+		/* Because alloc_mtr will be committed after
+		mtr, it is possible that the tablespace has been
+		extended when the B-tree record was updated or
+		inserted, or it will be extended while allocating
+		pages for big_rec.
+
+		TODO: In mtr (not alloc_mtr), write a redo log record
+		about extending the tablespace to its current size,
+		and remember the current size. Whenever the tablespace
+		grows as pages are allocated, write further redo log
+		records to mtr. (Currently tablespace extension is not
+		covered by the redo log. If it were, the record would
+		only be written to alloc_mtr, which is committed after
+		mtr.) */
+	} else {
+		alloc_mtr = &mtr;
+	}
+
 	/* We have to create a file segment to the tablespace
 	for each field and put the pointer to the field in rec */
 
@@ -3549,7 +3538,7 @@ btr_store_big_rec_extern_fields(
 			}
 
 			page = btr_page_alloc(index, hint_page_no,
-					      FSP_NO_DIR, 0, &mtr);
+					      FSP_NO_DIR, 0, alloc_mtr, &mtr);
 			if (page == NULL) {
 
 				mtr_commit(&mtr);
@@ -3603,37 +3592,42 @@ btr_store_big_rec_extern_fields(
 
 			extern_len -= store_len;
 
+			if (alloc_mtr == &mtr) {
 #ifdef UNIV_SYNC_DEBUG
-			rec_page =
+				rec_page =
 #endif /* UNIV_SYNC_DEBUG */
-			buf_page_get(space_id,
-				     buf_frame_get_page_no(data),
-				     RW_X_LATCH, &mtr);
+					buf_page_get(
+						space_id,
+						buf_frame_get_page_no(data),
+						RW_X_LATCH, &mtr);
 #ifdef UNIV_SYNC_DEBUG
-			buf_page_dbg_add_level(rec_page, SYNC_NO_ORDER_CHECK);
+				buf_page_dbg_add_level(
+					rec_page, SYNC_NO_ORDER_CHECK);
 #endif /* UNIV_SYNC_DEBUG */
+			}
+
 			mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, 0,
-					 MLOG_4BYTES, &mtr);
+					 MLOG_4BYTES, alloc_mtr);
 			mlog_write_ulint(data + local_len + BTR_EXTERN_LEN + 4,
 					 big_rec_vec->fields[i].len
 					 - extern_len,
-					 MLOG_4BYTES, &mtr);
+					 MLOG_4BYTES, alloc_mtr);
 
 			if (prev_page_no == FIL_NULL) {
 				mlog_write_ulint(data + local_len
 						 + BTR_EXTERN_SPACE_ID,
 						 space_id,
-						 MLOG_4BYTES, &mtr);
+						 MLOG_4BYTES, alloc_mtr);
 
 				mlog_write_ulint(data + local_len
 						 + BTR_EXTERN_PAGE_NO,
 						 page_no,
-						 MLOG_4BYTES, &mtr);
+						 MLOG_4BYTES, alloc_mtr);
 
 				mlog_write_ulint(data + local_len
 						 + BTR_EXTERN_OFFSET,
 						 FIL_PAGE_DATA,
-						 MLOG_4BYTES, &mtr);
+						 MLOG_4BYTES, alloc_mtr);
 
 				/* Set the bit denoting that this field
 				in rec is stored externally */
@@ -3641,7 +3635,7 @@ btr_store_big_rec_extern_fields(
 				rec_set_nth_field_extern_bit(
 					rec, index,
 					big_rec_vec->fields[i].field_no,
-					TRUE, &mtr);
+					TRUE, alloc_mtr);
 			}
 
 			prev_page_no = page_no;

=== modified file 'storage/innobase/buf/buf0buf.c'
--- a/storage/innobase/buf/buf0buf.c	2011-01-06 07:05:45 +0000
+++ b/storage/innobase/buf/buf0buf.c	2011-08-29 08:16:42 +0000
@@ -1009,29 +1009,6 @@ buf_page_peek_block(
 }
 
 /************************************************************************
-Resets the check_index_page_at_flush field of a page if found in the buffer
-pool. */
-
-void
-buf_reset_check_index_page_at_flush(
-/*================================*/
-	ulint	space,	/* in: space id */
-	ulint	offset)	/* in: page number */
-{
-	buf_block_t*	block;
-
-	mutex_enter_fast(&(buf_pool->mutex));
-
-	block = buf_page_hash_get(space, offset);
-
-	if (block) {
-		block->check_index_page_at_flush = FALSE;
-	}
-
-	mutex_exit(&(buf_pool->mutex));
-}
-
-/************************************************************************
 Returns the current state of is_hashed of a page. FALSE if the page is
 not in the pool. NOTE that this operation does not fix the page in the
 pool if it is found there. */

=== modified file 'storage/innobase/fsp/fsp0fsp.c'
--- a/storage/innobase/fsp/fsp0fsp.c	2011-01-06 07:05:45 +0000
+++ b/storage/innobase/fsp/fsp0fsp.c	2011-08-29 08:16:42 +0000
@@ -293,15 +293,19 @@ fseg_alloc_free_page_low(
 				/* out: the allocated page number, FIL_NULL
 				if no page could be allocated */
 	ulint		space,	/* in: space */
-	fseg_inode_t*	seg_inode, /* in: segment inode */
+	fseg_inode_t*	seg_inode, /* in/out: segment inode */
 	ulint		hint,	/* in: hint of which page would be desirable */
 	byte		direction, /* in: if the new page is needed because
 				of an index page split, and records are
 				inserted there in order, into which
 				direction they go alphabetically: FSP_DOWN,
 				FSP_UP, FSP_NO_DIR */
-	mtr_t*		mtr);	/* in: mtr handle */
-
+	mtr_t*		mtr,	/* in/out: mini-transaction */
+	mtr_t*		init_mtr);/* in/out: mini-transaction in which the
+				page should be initialized
+				(may be the same as mtr), or NULL if it
+				should not be initialized (the page at hint
+				was previously freed in mtr) */
 
 /**************************************************************************
 Reads the file space size stored in the header page. */
@@ -1371,6 +1375,43 @@ fsp_alloc_free_extent(
 	return(descr);
 }
 
+/**********************************************************************//**
+Allocates a single free page from a space. */
+static __attribute__((nonnull))
+void
+fsp_alloc_from_free_frag(
+/*=====================*/
+	fsp_header_t*	header,	/* in/out: tablespace header */
+	xdes_t*		descr,	/* in/out: extent descriptor */
+	ulint		bit,	/* in: slot to allocate in the extent */
+	mtr_t*		mtr)	/* in/out: mini-transaction */
+{
+	ulint		frag_n_used;
+
+	ut_ad(xdes_get_state(descr, mtr) == XDES_FREE_FRAG);
+	ut_a(xdes_get_bit(descr, XDES_FREE_BIT, bit, mtr));
+	xdes_set_bit(descr, XDES_FREE_BIT, bit, FALSE, mtr);
+
+	/* Update the FRAG_N_USED field */
+	frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES,
+				     mtr);
+	frag_n_used++;
+	mlog_write_ulint(header + FSP_FRAG_N_USED, frag_n_used, MLOG_4BYTES,
+			 mtr);
+	if (xdes_is_full(descr, mtr)) {
+		/* The fragment is full: move it to another list */
+		flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE,
+			    mtr);
+		xdes_set_state(descr, XDES_FULL_FRAG, mtr);
+
+		flst_add_last(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE,
+			      mtr);
+		mlog_write_ulint(header + FSP_FRAG_N_USED,
+				 frag_n_used - FSP_EXTENT_SIZE, MLOG_4BYTES,
+				 mtr);
+	}
+}
+
 /**************************************************************************
 Allocates a single free page from a space. The page is marked as used. */
 static
@@ -1381,19 +1422,22 @@ fsp_alloc_free_page(
 			be allocated */
 	ulint	space,	/* in: space id */
 	ulint	hint,	/* in: hint of which page would be desirable */
-	mtr_t*	mtr)	/* in: mtr handle */
+	mtr_t*	mtr,	/* in/out: mini-transaction */
+	mtr_t*	init_mtr)/* in/out: mini-transaction in which the
+			page should be initialized
+			(may be the same as mtr) */
 {
 	fsp_header_t*	header;
 	fil_addr_t	first;
 	xdes_t*		descr;
 	page_t*		page;
 	ulint		free;
-	ulint		frag_n_used;
 	ulint		page_no;
 	ulint		space_size;
 	ibool		success;
 
 	ut_ad(mtr);
+	ut_ad(init_mtr);
 
 	header = fsp_get_space_header(space, mtr);
 
@@ -1441,6 +1485,7 @@ fsp_alloc_free_page(
 	if (free == ULINT_UNDEFINED) {
 
 		ut_print_buf(stderr, ((byte*)descr) - 500, 1000);
+		putc('\n', stderr);
 
 		ut_error;
 	}
@@ -1472,40 +1517,21 @@ fsp_alloc_free_page(
 		}
 	}
 
-	xdes_set_bit(descr, XDES_FREE_BIT, free, FALSE, mtr);
-
-	/* Update the FRAG_N_USED field */
-	frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES,
-				     mtr);
-	frag_n_used++;
-	mlog_write_ulint(header + FSP_FRAG_N_USED, frag_n_used, MLOG_4BYTES,
-			 mtr);
-	if (xdes_is_full(descr, mtr)) {
-		/* The fragment is full: move it to another list */
-		flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE,
-			    mtr);
-		xdes_set_state(descr, XDES_FULL_FRAG, mtr);
-
-		flst_add_last(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE,
-			      mtr);
-		mlog_write_ulint(header + FSP_FRAG_N_USED,
-				 frag_n_used - FSP_EXTENT_SIZE, MLOG_4BYTES,
-				 mtr);
-	}
+	fsp_alloc_from_free_frag(header, descr, free, mtr);
 
 	/* Initialize the allocated page to the buffer pool, so that it can
 	be obtained immediately with buf_page_get without need for a disk
 	read. */
 
-	buf_page_create(space, page_no, mtr);
+	buf_page_create(space, page_no, init_mtr);
 
-	page = buf_page_get(space, page_no, RW_X_LATCH, mtr);
+	page = buf_page_get(space, page_no, RW_X_LATCH, init_mtr);
 #ifdef UNIV_SYNC_DEBUG
 	buf_page_dbg_add_level(page, SYNC_FSP_PAGE);
 #endif /* UNIV_SYNC_DEBUG */
 
 	/* Prior contents of the page should be ignored */
-	fsp_init_file_page(page, mtr);
+	fsp_init_file_page(page, init_mtr);
 
 	return(page_no);
 }
@@ -1724,7 +1750,7 @@ fsp_alloc_seg_inode_page(
 
 	space = buf_frame_get_space_id(space_header);
 
-	page_no = fsp_alloc_free_page(space, 0, mtr);
+	page_no = fsp_alloc_free_page(space, 0, mtr, mtr);
 
 	if (page_no == FIL_NULL) {
 
@@ -2094,7 +2120,8 @@ fseg_create_general(
 	}
 
 	if (page == 0) {
-		page = fseg_alloc_free_page_low(space, inode, 0, FSP_UP, mtr);
+		page = fseg_alloc_free_page_low(space,
+						inode, 0, FSP_UP, mtr, mtr);
 
 		if (page == FIL_NULL) {
 
@@ -2331,14 +2358,19 @@ fseg_alloc_free_page_low(
 				/* out: the allocated page number, FIL_NULL
 				if no page could be allocated */
 	ulint		space,	/* in: space */
-	fseg_inode_t*	seg_inode, /* in: segment inode */
+	fseg_inode_t*	seg_inode, /* in/out: segment inode */
 	ulint		hint,	/* in: hint of which page would be desirable */
 	byte		direction, /* in: if the new page is needed because
 				of an index page split, and records are
 				inserted there in order, into which
 				direction they go alphabetically: FSP_DOWN,
 				FSP_UP, FSP_NO_DIR */
-	mtr_t*		mtr)	/* in: mtr handle */
+	mtr_t*		mtr,	/* in/out: mini-transaction */
+	mtr_t*		init_mtr)/* in/out: mini-transaction in which the
+				page should be initialized
+				(may be the same as mtr), or NULL if it
+				should not be initialized (the page at hint
+				was previously freed in mtr) */
 {
 	fsp_header_t*	space_header;
 	ulint		space_size;
@@ -2350,7 +2382,6 @@ fseg_alloc_free_page_low(
 					if could not be allocated */
 	xdes_t*		ret_descr;	/* the extent of the allocated page */
 	page_t*		page;
-	ibool		frag_page_allocated = FALSE;
 	ibool		success;
 	ulint		n;
 
@@ -2371,6 +2402,8 @@ fseg_alloc_free_page_low(
 	if (descr == NULL) {
 		/* Hint outside space or too high above free limit: reset
 		hint */
+		ut_a(init_mtr);
+		/* The file space header page is always allocated. */
 		hint = 0;
 		descr = xdes_get_descriptor(space, hint, mtr);
 	}
@@ -2382,15 +2415,20 @@ fseg_alloc_free_page_low(
 						   mtr), seg_id))
 	    && (xdes_get_bit(descr, XDES_FREE_BIT,
 			     hint % FSP_EXTENT_SIZE, mtr) == TRUE)) {
-
+take_hinted_page:
 		/* 1. We can take the hinted page
 		=================================*/
 		ret_descr = descr;
 		ret_page = hint;
+		/* Skip the check for extending the tablespace. If the
+		page hint were not within the size of the tablespace,
+		we would have got (descr == NULL) above and reset the hint. */
+		goto got_hinted_page;
 		/*-----------------------------------------------------------*/
-	} else if ((xdes_get_state(descr, mtr) == XDES_FREE)
-		   && ((reserved - used) < reserved / FSEG_FILLFACTOR)
-		   && (used >= FSEG_FRAG_LIMIT)) {
+	} else if (xdes_get_state(descr, mtr) == XDES_FREE
+		   && (!init_mtr
+		       || ((reserved - used < reserved / FSEG_FILLFACTOR)
+			   && used >= FSEG_FRAG_LIMIT))) {
 
 		/* 2. We allocate the free extent from space and can take
 		=========================================================
@@ -2408,8 +2446,20 @@ fseg_alloc_free_page_low(
 		/* Try to fill the segment free list */
 		fseg_fill_free_list(seg_inode, space,
 				    hint + FSP_EXTENT_SIZE, mtr);
-		ret_page = hint;
+		goto take_hinted_page;
 		/*-----------------------------------------------------------*/
+	} else if (!init_mtr) {
+		ut_a(xdes_get_state(descr, mtr) == XDES_FREE_FRAG);
+		fsp_alloc_from_free_frag(space_header, descr,
+					 hint % FSP_EXTENT_SIZE, mtr);
+		ret_page = hint;
+		ret_descr = NULL;
+
+		/* Put the page in the fragment page array of the segment */
+		n = fseg_find_free_frag_page_slot(seg_inode, mtr);
+		ut_a(n != FIL_NULL);
+		fseg_set_nth_frag_page_no(seg_inode, n, ret_page, mtr);
+		goto got_hinted_page;
 	} else if ((direction != FSP_NO_DIR)
 		   && ((reserved - used) < reserved / FSEG_FILLFACTOR)
 		   && (used >= FSEG_FRAG_LIMIT)
@@ -2467,11 +2517,9 @@ fseg_alloc_free_page_low(
 	} else if (used < FSEG_FRAG_LIMIT) {
 		/* 6. We allocate an individual page from the space
 		===================================================*/
-		ret_page = fsp_alloc_free_page(space, hint, mtr);
+		ret_page = fsp_alloc_free_page(space, hint, mtr, init_mtr);
 		ret_descr = NULL;
 
-		frag_page_allocated = TRUE;
-
 		if (ret_page != FIL_NULL) {
 			/* Put the page in the fragment page array of the
 			segment */
@@ -2481,6 +2529,10 @@ fseg_alloc_free_page_low(
 			fseg_set_nth_frag_page_no(seg_inode, n, ret_page,
 						  mtr);
 		}
+
+		/* fsp_alloc_free_page() invoked fsp_init_file_page()
+		already. */
+		return(ret_page);
 		/*-----------------------------------------------------------*/
 	} else {
 		/* 7. We allocate a new extent and take its first page
@@ -2527,22 +2579,31 @@ fseg_alloc_free_page_low(
 		}
 	}
 
-	if (!frag_page_allocated) {
+got_hinted_page:
+	{
 		/* Initialize the allocated page to buffer pool, so that it
 		can be obtained immediately with buf_page_get without need
 		for a disk read */
+		mtr_t*	block_mtr = init_mtr ? init_mtr : mtr;
 
-		page = buf_page_create(space, ret_page, mtr);
+		page = buf_page_create(space, ret_page, block_mtr);
 
-		ut_a(page == buf_page_get(space, ret_page, RW_X_LATCH, mtr));
+		ut_a(page == buf_page_get(space, ret_page, RW_X_LATCH,
+					  block_mtr));
 
 #ifdef UNIV_SYNC_DEBUG
 		buf_page_dbg_add_level(page, SYNC_FSP_PAGE);
 #endif /* UNIV_SYNC_DEBUG */
 
-		/* The prior contents of the page should be ignored */
-		fsp_init_file_page(page, mtr);
+		if (init_mtr) {
+			/* The prior contents of the page should be ignored */
+			fsp_init_file_page(page, init_mtr);
+		}
+	}
 
+	/* ret_descr == NULL if the block was allocated from free_frag
+	(XDES_FREE_FRAG) */
+	if (ret_descr != NULL) {
 		/* At this point we know the extent and the page offset.
 		The extent is still in the appropriate list (FSEG_NOT_FULL
 		or FSEG_FREE), and the page is not yet marked as used. */
@@ -2554,8 +2615,6 @@ fseg_alloc_free_page_low(
 		fseg_mark_page_used(seg_inode, space, ret_page, mtr);
 	}
 
-	buf_reset_check_index_page_at_flush(space, ret_page);
-
 	return(ret_page);
 }
 
@@ -2569,7 +2628,7 @@ fseg_alloc_free_page_general(
 /*=========================*/
 				/* out: allocated page offset, FIL_NULL if no
 				page could be allocated */
-	fseg_header_t*	seg_header,/* in: segment header */
+	fseg_header_t*	seg_header,/* in/out: segment header */
 	ulint		hint,	/* in: hint of which page would be desirable */
 	byte		direction,/* in: if the new page is needed because
 				of an index page split, and records are
@@ -2581,7 +2640,11 @@ fseg_alloc_free_page_general(
 				with fsp_reserve_free_extents, then there
 				is no need to do the check for this individual
 				page */
-	mtr_t*		mtr)	/* in: mtr handle */
+	mtr_t*		mtr,	/* in/out: mini-transaction handle */
+	mtr_t*		init_mtr)/* in/out: mtr or another mini-transaction
+				in which the page should be initialized,
+				or NULL if this is a "fake allocation" of
+				a page that was previously freed in mtr */
 {
 	fseg_inode_t*	inode;
 	ulint		space;
@@ -2619,7 +2682,8 @@ fseg_alloc_free_page_general(
 	}
 
 	page_no = fseg_alloc_free_page_low(buf_frame_get_space_id(inode),
-					   inode, hint, direction, mtr);
+					   inode, hint, direction,
+					   mtr, init_mtr);
 	if (!has_done_reservation) {
 		fil_space_release_free_extents(space, n_reserved);
 	}
@@ -2647,7 +2711,7 @@ fseg_alloc_free_page(
 	mtr_t*		mtr)	/* in: mtr handle */
 {
 	return(fseg_alloc_free_page_general(seg_header, hint, direction,
-					    FALSE, mtr));
+					    FALSE, mtr, mtr));
 }
 
 /**************************************************************************

=== modified file 'storage/innobase/include/btr0btr.h'
--- a/storage/innobase/include/btr0btr.h	2011-06-16 07:27:21 +0000
+++ b/storage/innobase/include/btr0btr.h	2011-08-29 08:16:42 +0000
@@ -379,7 +379,11 @@ btr_page_alloc(
 					page split is made */
 	ulint		level,		/* in: level where the page is placed
 					in the tree */
-	mtr_t*		mtr);		/* in: mtr */
+	mtr_t*		mtr,		/* in/out: mini-transaction
+					for the allocation */
+	mtr_t*		init_mtr);	/* in/out: mini-transaction
+					for x-latching and initializing
+					the page */
 /******************************************************************
 Frees a file page used in an index tree. NOTE: cannot free field external
 storage pages because the page must contain info on its level. */
@@ -402,6 +406,31 @@ btr_page_free_low(
 	page_t*		page,	/* in: page to be freed, x-latched */
 	ulint		level,	/* in: page level */
 	mtr_t*		mtr);	/* in: mtr */
+/**************************************************************//**
+Marks all MTR_MEMO_FREE_CLUST_LEAF pages nonfree or free.
+For invoking btr_store_big_rec_extern_fields() after an update,
+we must temporarily mark freed clustered index pages allocated, so
+that off-page columns will not be allocated from them. Between the
+btr_store_big_rec_extern_fields() and mtr_commit() we have to
+mark the pages free again, so that no pages will be leaked. */
+
+void
+btr_mark_freed_leaves(
+/*==================*/
+	dict_index_t*	index,	/* in/out: clustered index */
+	mtr_t*		mtr,	/* in/out: mini-transaction */
+	ibool		nonfree);/* in: TRUE=mark nonfree, FALSE=mark freed */
+#ifdef UNIV_DEBUG
+/**************************************************************//**
+Validates all pages marked MTR_MEMO_FREE_CLUST_LEAF.
+See btr_mark_freed_leaves(). */
+
+ibool
+btr_freed_leaves_validate(
+/*======================*/
+			/* out: TRUE if valid */
+	mtr_t*	mtr);	/* in: mini-transaction */
+#endif /* UNIV_DEBUG */
 #ifdef UNIV_BTR_PRINT
 /*****************************************************************
 Prints size info of a B-tree. */

=== modified file 'storage/innobase/include/btr0cur.h'
--- a/storage/innobase/include/btr0cur.h	2011-06-16 07:27:21 +0000
+++ b/storage/innobase/include/btr0cur.h	2011-08-29 08:16:42 +0000
@@ -252,15 +252,6 @@ btr_cur_pessimistic_update(
 				updates */
 	que_thr_t*	thr,	/* in: query thread */
 	mtr_t*		mtr);	/* in: mtr */
-/*****************************************************************
-Commits and restarts a mini-transaction so that it will retain an
-x-lock on index->lock and the cursor page. */
-
-void
-btr_cur_mtr_commit_and_start(
-/*=========================*/
-	btr_cur_t*	cursor,	/* in: cursor */
-	mtr_t*		mtr);	/* in/out: mini-transaction */
 /***************************************************************
 Marks a clustered index record deleted. Writes an undo log record to
 undo log on this delete marking. Writes in the trx id field the id
@@ -471,6 +462,11 @@ btr_store_big_rec_extern_fields(
 					this function returns */
 	big_rec_t*	big_rec_vec,	/* in: vector containing fields
 					to be stored externally */
+	mtr_t*		alloc_mtr,	/* in/out: in an insert, NULL;
+					in an update, local_mtr for
+					allocating BLOB pages and
+					updating BLOB pointers; alloc_mtr
+					must not have freed any leaf pages */
 	mtr_t*		local_mtr);	/* in: mtr containing the latch to
 					rec and to the tree */
 /***********************************************************************

=== modified file 'storage/innobase/include/buf0buf.h'
--- a/storage/innobase/include/buf0buf.h	2011-06-16 07:27:21 +0000
+++ b/storage/innobase/include/buf0buf.h	2011-08-29 08:16:42 +0000
@@ -294,15 +294,6 @@ buf_page_peek_block(
 	ulint	space,	/* in: space id */
 	ulint	offset);/* in: page number */
 /************************************************************************
-Resets the check_index_page_at_flush field of a page if found in the buffer
-pool. */
-
-void
-buf_reset_check_index_page_at_flush(
-/*================================*/
-	ulint	space,	/* in: space id */
-	ulint	offset);/* in: page number */
-/************************************************************************
 Sets file_page_was_freed TRUE if the page is found in the buffer pool.
 This function should be called when we free a file page and want the
 debug version to check that it is not accessed any more unless

=== modified file 'storage/innobase/include/fsp0fsp.h'
--- a/storage/innobase/include/fsp0fsp.h	2009-05-26 12:57:12 +0000
+++ b/storage/innobase/include/fsp0fsp.h	2011-08-29 08:16:42 +0000
@@ -167,7 +167,7 @@ fseg_alloc_free_page_general(
 /*=========================*/
 				/* out: allocated page offset, FIL_NULL if no
 				page could be allocated */
-	fseg_header_t*	seg_header,/* in: segment header */
+	fseg_header_t*	seg_header,/* in/out: segment header */
 	ulint		hint,	/* in: hint of which page would be desirable */
 	byte		direction,/* in: if the new page is needed because
 				of an index page split, and records are
@@ -179,7 +179,11 @@ fseg_alloc_free_page_general(
 				with fsp_reserve_free_extents, then there
 				is no need to do the check for this individual
 				page */
-	mtr_t*		mtr);	/* in: mtr handle */
+	mtr_t*		mtr,	/* in/out: mini-transaction */
+	mtr_t*		init_mtr);/* in/out: mtr or another mini-transaction
+				in which the page should be initialized,
+				or NULL if this is a "fake allocation" of
+				a page that was previously freed in mtr */
 /**************************************************************************
 Reserves free pages from a tablespace. All mini-transactions which may
 use several pages from the tablespace should call this function beforehand

=== modified file 'storage/innobase/include/mtr0mtr.h'
--- a/storage/innobase/include/mtr0mtr.h	2009-12-15 13:23:54 +0000
+++ b/storage/innobase/include/mtr0mtr.h	2011-08-29 08:16:42 +0000
@@ -36,6 +36,8 @@ first 3 values must be RW_S_LATCH, RW_X_
 #define MTR_MEMO_MODIFY		54
 #define	MTR_MEMO_S_LOCK		55
 #define	MTR_MEMO_X_LOCK		56
+/* The mini-transaction freed a clustered index leaf page. */
+#define MTR_MEMO_FREE_CLUST_LEAF	57
 
 /* Log item types: we have made them to be of the type 'byte'
 for the compiler to warn if val and type parameters are switched
@@ -325,9 +327,12 @@ struct mtr_struct{
 	ulint		state;	/* MTR_ACTIVE, MTR_COMMITTING, MTR_COMMITTED */
 	dyn_array_t	memo;	/* memo stack for locks etc. */
 	dyn_array_t	log;	/* mini-transaction log */
-	ibool		modifications;
+	unsigned	modifications:1;
 				/* TRUE if the mtr made modifications to
 				buffer pool pages */
+	unsigned	freed_clust_leaf:1;
+				/* TRUE if MTR_MEMO_FREE_CLUST_LEAF
+				was logged in the mini-transaction */
 	ulint		n_log_recs;
 				/* count of how many page initial log records
 				have been written to the mtr log */

=== modified file 'storage/innobase/include/mtr0mtr.ic'
--- a/storage/innobase/include/mtr0mtr.ic	2006-09-26 07:08:48 +0000
+++ b/storage/innobase/include/mtr0mtr.ic	2011-08-29 08:16:42 +0000
@@ -26,6 +26,7 @@ mtr_start(
 
 	mtr->log_mode = MTR_LOG_ALL;
 	mtr->modifications = FALSE;
+	mtr->freed_clust_leaf = FALSE;
 	mtr->n_log_recs = 0;
 
 #ifdef UNIV_DEBUG
@@ -50,7 +51,8 @@ mtr_memo_push(
 
 	ut_ad(object);
 	ut_ad(type >= MTR_MEMO_PAGE_S_FIX);
-	ut_ad(type <= MTR_MEMO_X_LOCK);
+	ut_ad(type <= MTR_MEMO_FREE_CLUST_LEAF);
+	ut_ad(type != MTR_MEMO_FREE_CLUST_LEAF || mtr->freed_clust_leaf);
 	ut_ad(mtr);
 	ut_ad(mtr->magic_n == MTR_MAGIC_N);
 

=== modified file 'storage/innobase/mtr/mtr0mtr.c'
--- a/storage/innobase/mtr/mtr0mtr.c	2006-09-26 07:08:48 +0000
+++ b/storage/innobase/mtr/mtr0mtr.c	2011-08-29 08:16:42 +0000
@@ -53,17 +53,13 @@ mtr_memo_slot_release(
 			buf_page_release((buf_block_t*)object, type, mtr);
 		} else if (type == MTR_MEMO_S_LOCK) {
 			rw_lock_s_unlock((rw_lock_t*)object);
-#ifdef UNIV_DEBUG
-		} else if (type == MTR_MEMO_X_LOCK) {
-			rw_lock_x_unlock((rw_lock_t*)object);
-		} else {
-			ut_ad(type == MTR_MEMO_MODIFY);
+		} else if (type != MTR_MEMO_X_LOCK) {
+			ut_ad(type == MTR_MEMO_MODIFY
+			      || type == MTR_MEMO_FREE_CLUST_LEAF);
 			ut_ad(mtr_memo_contains(mtr, object,
 						MTR_MEMO_PAGE_X_FIX));
-#else
 		} else {
 			rw_lock_x_unlock((rw_lock_t*)object);
-#endif
 		}
 	}
 

=== modified file 'storage/innobase/row/row0ins.c'
--- a/storage/innobase/row/row0ins.c	2011-06-16 07:27:21 +0000
+++ b/storage/innobase/row/row0ins.c	2011-08-29 08:16:42 +0000
@@ -2089,15 +2089,20 @@ row_ins_index_entry_low(
 			if (big_rec) {
 				ut_a(err == DB_SUCCESS);
 				/* Write out the externally stored
-				columns while still x-latching
-				index->lock and block->lock. We have
-				to mtr_commit(mtr) first, so that the
-				redo log will be written in the
-				correct order. Otherwise, we would run
-				into trouble on crash recovery if mtr
-				freed B-tree pages on which some of
-				the big_rec fields will be written. */
-				btr_cur_mtr_commit_and_start(&cursor, &mtr);
+				columns, but allocate the pages and
+				write the pointers using the
+				mini-transaction of the record update.
+				If any pages were freed in the update,
+				temporarily mark them allocated so
+				that off-page columns will not
+				overwrite them. We must do this,
+				because we will write the redo log for
+				the BLOB writes before writing the
+				redo log for the record update. Thus,
+				redo log application at crash recovery
+				will see BLOBs being written to free pages. */
+
+				btr_mark_freed_leaves(index, &mtr, TRUE);
 
 				rec = btr_cur_get_rec(&cursor);
 				offsets = rec_get_offsets(rec, index, offsets,
@@ -2105,7 +2110,8 @@ row_ins_index_entry_low(
 							  &heap);
 
 				err = btr_store_big_rec_extern_fields(
-					index, rec, offsets, big_rec, &mtr);
+					index, rec, offsets, big_rec,
+					&mtr, &mtr);
 				/* If writing big_rec fails (for
 				example, because of DB_OUT_OF_FILE_SPACE),
 				the record will be corrupted. Even if
@@ -2118,6 +2124,9 @@ row_ins_index_entry_low(
 				undo log, and thus the record cannot
 				be rolled back. */
 				ut_a(err == DB_SUCCESS);
+				/* Free the pages again
+				in order to avoid a leak. */
+				btr_mark_freed_leaves(index, &mtr, FALSE);
 				goto stored_big_rec;
 			}
 		} else {
@@ -2165,7 +2174,8 @@ function_exit:
 					  ULINT_UNDEFINED, &heap);
 
 		err = btr_store_big_rec_extern_fields(index, rec,
-						      offsets, big_rec, &mtr);
+						      offsets, big_rec,
+						      NULL, &mtr);
 stored_big_rec:
 		if (modify) {
 			dtuple_big_rec_free(big_rec);

=== modified file 'storage/innobase/row/row0row.c'
--- a/storage/innobase/row/row0row.c	2011-06-29 06:57:15 +0000
+++ b/storage/innobase/row/row0row.c	2011-08-29 08:16:42 +0000
@@ -212,23 +212,27 @@ row_build(
 	}
 
 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
-	/* This condition can occur during crash recovery before
-	trx_rollback_or_clean_all_without_sess() has completed
-	execution.
+	if (rec_offs_any_null_extern(rec, offsets)) {
+		/* This condition can occur during crash recovery
+		before trx_rollback_or_clean_all_without_sess() has
+		completed execution.
 
-	This condition is possible if the server crashed
-	during an insert or update before
-	btr_store_big_rec_extern_fields() did mtr_commit() all
-	BLOB pointers to the clustered index record.
+		This condition is possible if the server crashed
+		during an insert or update before
+		btr_store_big_rec_extern_fields() did mtr_commit() all
+		BLOB pointers to the clustered index record.
 
-	If the record contains a null BLOB pointer, look up the
-	transaction that holds the implicit lock on this record, and
-	assert that it is active. (In this version of InnoDB, we
-	cannot assert that it was recovered, because there is no
-	trx->is_recovered field.) */
+		If the record contains a null BLOB pointer, look up the
+		transaction that holds the implicit lock on this record, and
+		assert that it is active. (In this version of InnoDB, we
+		cannot assert that it was recovered, because there is no
+		trx->is_recovered field.) */
 
-	ut_a(!rec_offs_any_null_extern(rec, offsets)
-	     || trx_assert_active(row_get_rec_trx_id(rec, index, offsets)));
+		ut_a(trx_assert_active(
+			     row_get_rec_trx_id(rec, index, offsets)));
+		ut_a(trx_undo_roll_ptr_is_insert(
+			     row_get_rec_roll_ptr(rec, index, offsets)));
+	}
 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
 
 	if (type != ROW_COPY_POINTERS) {

=== modified file 'storage/innobase/row/row0upd.c'
--- a/storage/innobase/row/row0upd.c	2011-06-16 07:27:21 +0000
+++ b/storage/innobase/row/row0upd.c	2011-08-29 08:16:42 +0000
@@ -1591,21 +1591,22 @@ row_upd_clust_rec(
 		*offsets_ = (sizeof offsets_) / sizeof *offsets_;
 
 		ut_a(err == DB_SUCCESS);
-		/* Write out the externally stored columns while still
-		x-latching index->lock and block->lock. We have to
-		mtr_commit(mtr) first, so that the redo log will be
-		written in the correct order. Otherwise, we would run
-		into trouble on crash recovery if mtr freed B-tree
-		pages on which some of the big_rec fields will be
-		written. */
-		btr_cur_mtr_commit_and_start(btr_cur, mtr);
+		/* Write out the externally stored columns, but
+		allocate the pages and write the pointers using the
+		mini-transaction of the record update. If any pages
+		were freed in the update, temporarily mark them
+		allocated so that off-page columns will not overwrite
+		them. We must do this, because we write the redo log
+		for the BLOB writes before writing the redo log for
+		the record update. */
 
+		btr_mark_freed_leaves(index, mtr, TRUE);
 		rec = btr_cur_get_rec(btr_cur);
 		err = btr_store_big_rec_extern_fields(
 			index, rec,
 			rec_get_offsets(rec, index, offsets_,
 					ULINT_UNDEFINED, &heap),
-			big_rec, mtr);
+			big_rec, mtr, mtr);
 		if (UNIV_LIKELY_NULL(heap)) {
 			mem_heap_free(heap);
 		}
@@ -1618,6 +1619,8 @@ row_upd_clust_rec(
 		to the undo log, and thus the record cannot be rolled
 		back. */
 		ut_a(err == DB_SUCCESS);
+		/* Free the pages again in order to avoid a leak. */
+		btr_mark_freed_leaves(index, mtr, FALSE);
 	}
 
 	mtr_commit(mtr);

=== modified file 'storage/innobase/trx/trx0undo.c'
--- a/storage/innobase/trx/trx0undo.c	2011-02-22 21:03:32 +0000
+++ b/storage/innobase/trx/trx0undo.c	2011-08-29 08:16:42 +0000
@@ -864,7 +864,7 @@ trx_undo_add_page(
 	page_no = fseg_alloc_free_page_general(header_page + TRX_UNDO_SEG_HDR
 					       + TRX_UNDO_FSEG_HEADER,
 					       undo->top_page_no + 1, FSP_UP,
-					       TRUE, mtr);
+					       TRUE, mtr, mtr);
 
 	fil_space_release_free_extents(undo->space, n_reserved);
 

=== modified file 'storage/innodb_plugin/ChangeLog'
--- a/storage/innodb_plugin/ChangeLog	2011-08-15 09:11:43 +0000
+++ b/storage/innodb_plugin/ChangeLog	2011-08-29 08:16:42 +0000
@@ -1,3 +1,17 @@
+2011-08-29	The InnoDB Team
+
+	* btr/btr0btr.c, btr/btr0cur.c, fsp/fsp0fsp.c,
+	include/btr0btr.h, include/btr0cur.h, include/fsp0fsp.h,
+	include/mtr0mtr.h, include/mtr0mtr.ic, mtr/mtr0mtr.c,
+	row/row0ins.c, row/row0row.c, row/row0upd.c, trx/trx0undo.c:
+	Fix Bug#12704861 Corruption after a crash during BLOB update
+	and other regressions from the fix of Bug#12612184
+
+2011-08-23	The InnoDB Team
+
+	* include/trx0undo.h, trx/trx0rec.c, trx/trx0undo.c:
+	Fix Bug#12547647 UPDATE LOGGING COULD EXCEED LOG PAGE SIZE
+
 2011-08-15	The InnoDB Team
 
 	* btr/btr0btr.c, btr/btr0cur.c, btr/btr0pcur.c, btr/btr0sea.c,

=== modified file 'storage/innodb_plugin/btr/btr0btr.c'
--- a/storage/innodb_plugin/btr/btr0btr.c	2011-08-15 09:11:43 +0000
+++ b/storage/innodb_plugin/btr/btr0btr.c	2011-08-29 08:16:42 +0000
@@ -906,28 +906,29 @@ btr_page_alloc_for_ibuf(
 /**************************************************************//**
 Allocates a new file page to be used in an index tree. NOTE: we assume
 that the caller has made the reservation for free extents!
-@return	new allocated block, x-latched; NULL if out of space */
-UNIV_INTERN
-buf_block_t*
-btr_page_alloc(
-/*===========*/
+@return	allocated page number, FIL_NULL if out of space */
+static __attribute__((nonnull(1,5), warn_unused_result))
+ulint
+btr_page_alloc_low(
+/*===============*/
 	dict_index_t*	index,		/*!< in: index */
 	ulint		hint_page_no,	/*!< in: hint of a good page */
 	byte		file_direction,	/*!< in: direction where a possible
 					page split is made */
 	ulint		level,		/*!< in: level where the page is placed
 					in the tree */
-	mtr_t*		mtr)		/*!< in: mtr */
+	mtr_t*		mtr,		/*!< in/out: mini-transaction
+					for the allocation */
+	mtr_t*		init_mtr)	/*!< in/out: mini-transaction
+					in which the page should be
+					initialized (may be the same
+					as mtr), or NULL if it should
+					not be initialized (the page
+					at hint was previously freed
+					in mtr) */
 {
 	fseg_header_t*	seg_header;
 	page_t*		root;
-	buf_block_t*	new_block;
-	ulint		new_page_no;
-
-	if (dict_index_is_ibuf(index)) {
-
-		return(btr_page_alloc_for_ibuf(index, mtr));
-	}
 
 	root = btr_root_get(index, mtr);
 
@@ -941,8 +942,42 @@ btr_page_alloc(
 	reservation for free extents, and thus we know that a page can
 	be allocated: */
 
-	new_page_no = fseg_alloc_free_page_general(seg_header, hint_page_no,
-						   file_direction, TRUE, mtr);
+	return(fseg_alloc_free_page_general(
+		       seg_header, hint_page_no, file_direction,
+		       TRUE, mtr, init_mtr));
+}
+
+/**************************************************************//**
+Allocates a new file page to be used in an index tree. NOTE: we assume
+that the caller has made the reservation for free extents!
+@return	new allocated block, x-latched; NULL if out of space */
+UNIV_INTERN
+buf_block_t*
+btr_page_alloc(
+/*===========*/
+	dict_index_t*	index,		/*!< in: index */
+	ulint		hint_page_no,	/*!< in: hint of a good page */
+	byte		file_direction,	/*!< in: direction where a possible
+					page split is made */
+	ulint		level,		/*!< in: level where the page is placed
+					in the tree */
+	mtr_t*		mtr,		/*!< in/out: mini-transaction
+					for the allocation */
+	mtr_t*		init_mtr)	/*!< in/out: mini-transaction
+					for x-latching and initializing
+					the page */
+{
+	buf_block_t*	new_block;
+	ulint		new_page_no;
+
+	if (dict_index_is_ibuf(index)) {
+
+		return(btr_page_alloc_for_ibuf(index, mtr));
+	}
+
+	new_page_no = btr_page_alloc_low(
+		index, hint_page_no, file_direction, level, mtr, init_mtr);
+
 	if (new_page_no == FIL_NULL) {
 
 		return(NULL);
@@ -950,9 +985,16 @@ btr_page_alloc(
 
 	new_block = buf_page_get(dict_index_get_space(index),
 				 dict_table_zip_size(index->table),
-				 new_page_no, RW_X_LATCH, mtr);
+				 new_page_no, RW_X_LATCH, init_mtr);
 	buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW);
 
+	if (mtr->freed_clust_leaf) {
+		mtr_memo_release(mtr, new_block, MTR_MEMO_FREE_CLUST_LEAF);
+		ut_ad(!mtr_memo_contains(mtr, new_block,
+					 MTR_MEMO_FREE_CLUST_LEAF));
+	}
+
+	ut_ad(btr_freed_leaves_validate(mtr));
 	return(new_block);
 }
 
@@ -1065,6 +1107,15 @@ btr_page_free_low(
 	fseg_free_page(seg_header,
 		       buf_block_get_space(block),
 		       buf_block_get_page_no(block), mtr);
+
+	/* The page was marked free in the allocation bitmap, but it
+	should remain buffer-fixed until mtr_commit(mtr) or until it
+	is explicitly freed from the mini-transaction. */
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	/* TODO: Discard any operations on the page from the redo log
+	and remove the block from the flush list and the buffer pool.
+	This would free up buffer pool earlier and reduce writes to
+	both the tablespace and the redo log. */
 }
 
 /**************************************************************//**
@@ -1078,12 +1129,139 @@ btr_page_free(
 	buf_block_t*	block,	/*!< in: block to be freed, x-latched */
 	mtr_t*		mtr)	/*!< in: mtr */
 {
-	ulint		level;
-
-	level = btr_page_get_level(buf_block_get_frame(block), mtr);
+	const page_t*	page	= buf_block_get_frame(block);
+	ulint		level	= btr_page_get_level(page, mtr);
 
+	ut_ad(fil_page_get_type(block->frame) == FIL_PAGE_INDEX);
 	btr_page_free_low(index, block, level, mtr);
+
+	/* The handling of MTR_MEMO_FREE_CLUST_LEAF assumes this. */
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+
+	if (level == 0 && dict_index_is_clust(index)) {
+		/* We may have to call btr_mark_freed_leaves() to
+		temporarily mark the block nonfree for invoking
+		btr_store_big_rec_extern_fields_func() after an
+		update. Remember that the block was freed. */
+		mtr->freed_clust_leaf = TRUE;
+		mtr_memo_push(mtr, block, MTR_MEMO_FREE_CLUST_LEAF);
+	}
+
+	ut_ad(btr_freed_leaves_validate(mtr));
+}
+
+/**************************************************************//**
+Marks all MTR_MEMO_FREE_CLUST_LEAF pages nonfree or free.
+For invoking btr_store_big_rec_extern_fields() after an update,
+we must temporarily mark freed clustered index pages allocated, so
+that off-page columns will not be allocated from them. Between the
+btr_store_big_rec_extern_fields() and mtr_commit() we have to
+mark the pages free again, so that no pages will be leaked. */
+UNIV_INTERN
+void
+btr_mark_freed_leaves(
+/*==================*/
+	dict_index_t*	index,	/*!< in/out: clustered index */
+	mtr_t*		mtr,	/*!< in/out: mini-transaction */
+	ibool		nonfree)/*!< in: TRUE=mark nonfree, FALSE=mark freed */
+{
+	/* This is loosely based on mtr_memo_release(). */
+
+	ulint	offset;
+
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(mtr->magic_n == MTR_MAGIC_N);
+	ut_ad(mtr->state == MTR_ACTIVE);
+
+	if (!mtr->freed_clust_leaf) {
+		return;
+	}
+
+	offset = dyn_array_get_data_size(&mtr->memo);
+
+	while (offset > 0) {
+		mtr_memo_slot_t*	slot;
+		buf_block_t*		block;
+
+		offset -= sizeof *slot;
+
+		slot = dyn_array_get_element(&mtr->memo, offset);
+
+		if (slot->type != MTR_MEMO_FREE_CLUST_LEAF) {
+			continue;
+		}
+
+		/* Because btr_page_alloc() does invoke
+		mtr_memo_release on MTR_MEMO_FREE_CLUST_LEAF, all
+		blocks tagged with MTR_MEMO_FREE_CLUST_LEAF in the
+		memo must still be clustered index leaf tree pages. */
+		block = slot->object;
+		ut_a(buf_block_get_space(block)
+		     == dict_index_get_space(index));
+		ut_a(fil_page_get_type(buf_block_get_frame(block))
+		     == FIL_PAGE_INDEX);
+		ut_a(page_is_leaf(buf_block_get_frame(block)));
+
+		if (nonfree) {
+			/* Allocate the same page again. */
+			ulint	page_no;
+			page_no = btr_page_alloc_low(
+				index, buf_block_get_page_no(block),
+				FSP_NO_DIR, 0, mtr, NULL);
+			ut_a(page_no == buf_block_get_page_no(block));
+		} else {
+			/* Assert that the page is allocated and free it. */
+			btr_page_free_low(index, block, 0, mtr);
+		}
+	}
+
+	ut_ad(btr_freed_leaves_validate(mtr));
+}
+
+#ifdef UNIV_DEBUG
+/**************************************************************//**
+Validates all pages marked MTR_MEMO_FREE_CLUST_LEAF.
+@see btr_mark_freed_leaves()
+@return TRUE */
+UNIV_INTERN
+ibool
+btr_freed_leaves_validate(
+/*======================*/
+	mtr_t*	mtr)	/*!< in: mini-transaction */
+{
+	ulint	offset;
+
+	ut_ad(mtr->magic_n == MTR_MAGIC_N);
+	ut_ad(mtr->state == MTR_ACTIVE);
+
+	offset = dyn_array_get_data_size(&mtr->memo);
+
+	while (offset > 0) {
+		const mtr_memo_slot_t*	slot;
+		const buf_block_t*	block;
+
+		offset -= sizeof *slot;
+
+		slot = dyn_array_get_element(&mtr->memo, offset);
+
+		if (slot->type != MTR_MEMO_FREE_CLUST_LEAF) {
+			continue;
+		}
+
+		ut_a(mtr->freed_clust_leaf);
+		/* Because btr_page_alloc() does invoke
+		mtr_memo_release on MTR_MEMO_FREE_CLUST_LEAF, all
+		blocks tagged with MTR_MEMO_FREE_CLUST_LEAF in the
+		memo must still be clustered index leaf tree pages. */
+		block = slot->object;
+		ut_a(fil_page_get_type(buf_block_get_frame(block))
+		     == FIL_PAGE_INDEX);
+		ut_a(page_is_leaf(buf_block_get_frame(block)));
+	}
+
+	return(TRUE);
 }
+#endif /* UNIV_DEBUG */
 
 /**************************************************************//**
 Sets the child node file address in a node pointer. */
@@ -1806,7 +1984,7 @@ btr_root_raise_and_insert(
 
 	level = btr_page_get_level(root, mtr);
 
-	new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr);
+	new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr, mtr);
 	new_page = buf_block_get_frame(new_block);
 	new_page_zip = buf_block_get_page_zip(new_block);
 	ut_a(!new_page_zip == !root_page_zip);
@@ -2542,7 +2720,7 @@ func_start:
 
 	/* 2. Allocate a new page to the index */
 	new_block = btr_page_alloc(cursor->index, hint_page_no, direction,
-				   btr_page_get_level(page, mtr), mtr);
+				   btr_page_get_level(page, mtr), mtr, mtr);
 	new_page = buf_block_get_frame(new_block);
 	new_page_zip = buf_block_get_page_zip(new_block);
 	btr_page_create(new_block, new_page_zip, cursor->index,

=== modified file 'storage/innodb_plugin/btr/btr0cur.c'
--- a/storage/innodb_plugin/btr/btr0cur.c	2011-08-15 09:11:43 +0000
+++ b/storage/innodb_plugin/btr/btr0cur.c	2011-08-29 08:16:42 +0000
@@ -2414,39 +2414,6 @@ return_after_reservations:
 	return(err);
 }
 
-/**************************************************************//**
-Commits and restarts a mini-transaction so that it will retain an
-x-lock on index->lock and the cursor page. */
-UNIV_INTERN
-void
-btr_cur_mtr_commit_and_start(
-/*=========================*/
-	btr_cur_t*	cursor,	/*!< in: cursor */
-	mtr_t*		mtr)	/*!< in/out: mini-transaction */
-{
-	buf_block_t*	block;
-
-	block = btr_cur_get_block(cursor);
-
-	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index),
-				MTR_MEMO_X_LOCK));
-	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
-	/* Keep the locks across the mtr_commit(mtr). */
-	rw_lock_x_lock(dict_index_get_lock(cursor->index));
-	rw_lock_x_lock(&block->lock);
-	mutex_enter(&block->mutex);
-	buf_block_buf_fix_inc(block, __FILE__, __LINE__);
-	mutex_exit(&block->mutex);
-	/* Write out the redo log. */
-	mtr_commit(mtr);
-	mtr_start(mtr);
-	/* Reassociate the locks with the mini-transaction.
-	They will be released on mtr_commit(mtr). */
-	mtr_memo_push(mtr, dict_index_get_lock(cursor->index),
-		      MTR_MEMO_X_LOCK);
-	mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX);
-}
-
 /*==================== B-TREE DELETE MARK AND UNMARK ===============*/
 
 /****************************************************************//**
@@ -3901,6 +3868,9 @@ btr_store_big_rec_extern_fields_func(
 					the "external storage" flags in offsets
 					will not correspond to rec when
 					this function returns */
+	const big_rec_t*big_rec_vec,	/*!< in: vector containing fields
+					to be stored externally */
+
 #ifdef UNIV_DEBUG
 	mtr_t*		local_mtr,	/*!< in: mtr containing the
 					latch to rec and to the tree */
@@ -3909,9 +3879,11 @@ btr_store_big_rec_extern_fields_func(
 	ibool		update_in_place,/*! in: TRUE if the record is updated
 					in place (not delete+insert) */
 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
-	const big_rec_t*big_rec_vec)	/*!< in: vector containing fields
-					to be stored externally */
-
+	mtr_t*		alloc_mtr)	/*!< in/out: in an insert, NULL;
+					in an update, local_mtr for
+					allocating BLOB pages and
+					updating BLOB pointers; alloc_mtr
+					must not have freed any leaf pages */
 {
 	ulint	rec_page_no;
 	byte*	field_ref;
@@ -3930,6 +3902,9 @@ btr_store_big_rec_extern_fields_func(
 
 	ut_ad(rec_offs_validate(rec, index, offsets));
 	ut_ad(rec_offs_any_extern(offsets));
+	ut_ad(local_mtr);
+	ut_ad(!alloc_mtr || alloc_mtr == local_mtr);
+	ut_ad(!update_in_place || alloc_mtr);
 	ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
 				MTR_MEMO_X_LOCK));
 	ut_ad(mtr_memo_contains(local_mtr, rec_block, MTR_MEMO_PAGE_X_FIX));
@@ -3945,6 +3920,25 @@ btr_store_big_rec_extern_fields_func(
 	rec_page_no = buf_block_get_page_no(rec_block);
 	ut_a(fil_page_get_type(page_align(rec)) == FIL_PAGE_INDEX);
 
+	if (alloc_mtr) {
+		/* Because alloc_mtr will be committed after
+		mtr, it is possible that the tablespace has been
+		extended when the B-tree record was updated or
+		inserted, or it will be extended while allocating
+		pages for big_rec.
+
+		TODO: In mtr (not alloc_mtr), write a redo log record
+		about extending the tablespace to its current size,
+		and remember the current size. Whenever the tablespace
+		grows as pages are allocated, write further redo log
+		records to mtr. (Currently tablespace extension is not
+		covered by the redo log. If it were, the record would
+		only be written to alloc_mtr, which is committed after
+		mtr.) */
+	} else {
+		alloc_mtr = &mtr;
+	}
+
 	if (UNIV_LIKELY_NULL(page_zip)) {
 		int	err;
 
@@ -4021,7 +4015,7 @@ btr_store_big_rec_extern_fields_func(
 			}
 
 			block = btr_page_alloc(index, hint_page_no,
-					       FSP_NO_DIR, 0, &mtr);
+					       FSP_NO_DIR, 0, alloc_mtr, &mtr);
 			if (UNIV_UNLIKELY(block == NULL)) {
 
 				mtr_commit(&mtr);
@@ -4148,11 +4142,15 @@ btr_store_big_rec_extern_fields_func(
 					goto next_zip_page;
 				}
 
-				rec_block = buf_page_get(space_id, zip_size,
-							 rec_page_no,
-							 RW_X_LATCH, &mtr);
-				buf_block_dbg_add_level(rec_block,
-							SYNC_NO_ORDER_CHECK);
+				if (alloc_mtr == &mtr) {
+					rec_block = buf_page_get(
+						space_id, zip_size,
+						rec_page_no,
+						RW_X_LATCH, &mtr);
+					buf_block_dbg_add_level(
+						rec_block,
+						SYNC_NO_ORDER_CHECK);
+				}
 
 				if (err == Z_STREAM_END) {
 					mach_write_to_4(field_ref
@@ -4186,7 +4184,8 @@ btr_store_big_rec_extern_fields_func(
 
 				page_zip_write_blob_ptr(
 					page_zip, rec, index, offsets,
-					big_rec_vec->fields[i].field_no, &mtr);
+					big_rec_vec->fields[i].field_no,
+					alloc_mtr);
 
 next_zip_page:
 				prev_page_no = page_no;
@@ -4231,19 +4230,23 @@ next_zip_page:
 
 				extern_len -= store_len;
 
-				rec_block = buf_page_get(space_id, zip_size,
-							 rec_page_no,
-							 RW_X_LATCH, &mtr);
-				buf_block_dbg_add_level(rec_block,
-							SYNC_NO_ORDER_CHECK);
+				if (alloc_mtr == &mtr) {
+					rec_block = buf_page_get(
+						space_id, zip_size,
+						rec_page_no,
+						RW_X_LATCH, &mtr);
+					buf_block_dbg_add_level(
+						rec_block,
+						SYNC_NO_ORDER_CHECK);
+				}
 
 				mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0,
-						 MLOG_4BYTES, &mtr);
+						 MLOG_4BYTES, alloc_mtr);
 				mlog_write_ulint(field_ref
 						 + BTR_EXTERN_LEN + 4,
 						 big_rec_vec->fields[i].len
 						 - extern_len,
-						 MLOG_4BYTES, &mtr);
+						 MLOG_4BYTES, alloc_mtr);
 
 				if (prev_page_no == FIL_NULL) {
 					btr_blob_dbg_add_blob(
@@ -4253,18 +4256,19 @@ next_zip_page:
 
 					mlog_write_ulint(field_ref
 							 + BTR_EXTERN_SPACE_ID,
-							 space_id,
-							 MLOG_4BYTES, &mtr);
+							 space_id, MLOG_4BYTES,
+							 alloc_mtr);
 
 					mlog_write_ulint(field_ref
 							 + BTR_EXTERN_PAGE_NO,
-							 page_no,
-							 MLOG_4BYTES, &mtr);
+							 page_no, MLOG_4BYTES,
+							 alloc_mtr);
 
 					mlog_write_ulint(field_ref
 							 + BTR_EXTERN_OFFSET,
 							 FIL_PAGE_DATA,
-							 MLOG_4BYTES, &mtr);
+							 MLOG_4BYTES,
+							 alloc_mtr);
 				}
 
 				prev_page_no = page_no;

=== modified file 'storage/innodb_plugin/buf/buf0buf.c'
--- a/storage/innodb_plugin/buf/buf0buf.c	2011-07-19 14:37:37 +0000
+++ b/storage/innodb_plugin/buf/buf0buf.c	2011-08-29 08:16:42 +0000
@@ -1175,29 +1175,6 @@ buf_page_set_accessed_make_young(
 }
 
 /********************************************************************//**
-Resets the check_index_page_at_flush field of a page if found in the buffer
-pool. */
-UNIV_INTERN
-void
-buf_reset_check_index_page_at_flush(
-/*================================*/
-	ulint	space,	/*!< in: space id */
-	ulint	offset)	/*!< in: page number */
-{
-	buf_block_t*	block;
-
-	buf_pool_mutex_enter();
-
-	block = (buf_block_t*) buf_page_hash_get(space, offset);
-
-	if (block && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE) {
-		block->check_index_page_at_flush = FALSE;
-	}
-
-	buf_pool_mutex_exit();
-}
-
-/********************************************************************//**
 Returns the current state of is_hashed of a page. FALSE if the page is
 not in the pool. NOTE that this operation does not fix the page in the
 pool if it is found there.

=== modified file 'storage/innodb_plugin/fsp/fsp0fsp.c'
--- a/storage/innodb_plugin/fsp/fsp0fsp.c	2011-02-02 12:05:12 +0000
+++ b/storage/innodb_plugin/fsp/fsp0fsp.c	2011-08-29 08:16:42 +0000
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -312,8 +312,9 @@ fsp_fill_free_list(
 					descriptor page and ibuf bitmap page;
 					then we do not allocate more extents */
 	ulint		space,		/*!< in: space */
-	fsp_header_t*	header,		/*!< in: space header */
-	mtr_t*		mtr);		/*!< in: mtr */
+	fsp_header_t*	header,		/*!< in/out: space header */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
+	__attribute__((nonnull));
 /**********************************************************************//**
 Allocates a single free page from a segment. This function implements
 the intelligent allocation strategy which tries to minimize file space
@@ -326,14 +327,20 @@ fseg_alloc_free_page_low(
 	ulint		space,	/*!< in: space */
 	ulint		zip_size,/*!< in: compressed page size in bytes
 				or 0 for uncompressed pages */
-	fseg_inode_t*	seg_inode, /*!< in: segment inode */
+	fseg_inode_t*	seg_inode, /*!< in/out: segment inode */
 	ulint		hint,	/*!< in: hint of which page would be desirable */
 	byte		direction, /*!< in: if the new page is needed because
 				of an index page split, and records are
 				inserted there in order, into which
 				direction they go alphabetically: FSP_DOWN,
 				FSP_UP, FSP_NO_DIR */
-	mtr_t*		mtr);	/*!< in: mtr handle */
+	mtr_t*		mtr,	/*!< in/out: mini-transaction */
+	mtr_t*		init_mtr)/*!< in/out: mini-transaction in which the
+				page should be initialized
+				(may be the same as mtr), or NULL if it
+				should not be initialized (the page at hint
+				was previously freed in mtr) */
+	__attribute__((warn_unused_result, nonnull(3,6)));
 #endif /* !UNIV_HOTBACKUP */
 
 /**********************************************************************//**
@@ -701,17 +708,18 @@ list, if not free limit == space size. T
 descriptor defined, as they are uninitialized above the free limit.
 @return pointer to the extent descriptor, NULL if the page does not
 exist in the space or if the offset exceeds the free limit */
-UNIV_INLINE
+UNIV_INLINE __attribute__((nonnull, warn_unused_result))
 xdes_t*
 xdes_get_descriptor_with_space_hdr(
 /*===============================*/
-	fsp_header_t*	sp_header,/*!< in/out: space header, x-latched */
-	ulint		space,	/*!< in: space id */
-	ulint		offset,	/*!< in: page offset;
-				if equal to the free limit,
-				we try to add new extents to
-				the space free list */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	fsp_header_t*	sp_header,	/*!< in/out: space header, x-latched
+					in mtr */
+	ulint		space,		/*!< in: space id */
+	ulint		offset,		/*!< in: page offset; if equal
+					to the free limit, we try to
+					add new extents to the space
+					free list */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
 {
 	ulint	limit;
 	ulint	size;
@@ -719,11 +727,9 @@ xdes_get_descriptor_with_space_hdr(
 	ulint	descr_page_no;
 	page_t*	descr_page;
 
-	ut_ad(mtr);
 	ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space, NULL),
 				MTR_MEMO_X_LOCK));
-	ut_ad(mtr_memo_contains_page(mtr, sp_header, MTR_MEMO_PAGE_S_FIX)
-	      || mtr_memo_contains_page(mtr, sp_header, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr_memo_contains_page(mtr, sp_header, MTR_MEMO_PAGE_X_FIX));
 	ut_ad(page_offset(sp_header) == FSP_HEADER_OFFSET);
 	/* Read free limit and space size */
 	limit = mach_read_from_4(sp_header + FSP_FREE_LIMIT);
@@ -773,7 +779,7 @@ is necessary to make the descriptor defi
 above the free limit.
 @return pointer to the extent descriptor, NULL if the page does not
 exist in the space or if the offset exceeds the free limit */
-static
+static __attribute__((nonnull, warn_unused_result))
 xdes_t*
 xdes_get_descriptor(
 /*================*/
@@ -782,7 +788,7 @@ xdes_get_descriptor(
 			or 0 for uncompressed pages */
 	ulint	offset,	/*!< in: page offset; if equal to the free limit,
 			we try to add new extents to the space free list */
-	mtr_t*	mtr)	/*!< in: mtr handle */
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
 {
 	buf_block_t*	block;
 	fsp_header_t*	sp_header;
@@ -1160,14 +1166,14 @@ fsp_header_get_tablespace_size(void)
 Tries to extend a single-table tablespace so that a page would fit in the
 data file.
 @return	TRUE if success */
-static
+static __attribute__((nonnull, warn_unused_result))
 ibool
 fsp_try_extend_data_file_with_pages(
 /*================================*/
 	ulint		space,		/*!< in: space */
 	ulint		page_no,	/*!< in: page number */
-	fsp_header_t*	header,		/*!< in: space header */
-	mtr_t*		mtr)		/*!< in: mtr */
+	fsp_header_t*	header,		/*!< in/out: space header */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
 {
 	ibool	success;
 	ulint	actual_size;
@@ -1192,7 +1198,7 @@ fsp_try_extend_data_file_with_pages(
 /***********************************************************************//**
 Tries to extend the last data file of a tablespace if it is auto-extending.
 @return	FALSE if not auto-extending */
-static
+static __attribute__((nonnull))
 ibool
 fsp_try_extend_data_file(
 /*=====================*/
@@ -1202,8 +1208,8 @@ fsp_try_extend_data_file(
 					the actual file size rounded down to
 					megabyte */
 	ulint		space,		/*!< in: space */
-	fsp_header_t*	header,		/*!< in: space header */
-	mtr_t*		mtr)		/*!< in: mtr */
+	fsp_header_t*	header,		/*!< in/out: space header */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
 {
 	ulint	size;
 	ulint	zip_size;
@@ -1339,7 +1345,7 @@ fsp_fill_free_list(
 					then we do not allocate more extents */
 	ulint		space,		/*!< in: space */
 	fsp_header_t*	header,		/*!< in/out: space header */
-	mtr_t*		mtr)		/*!< in: mtr */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
 {
 	ulint	limit;
 	ulint	size;
@@ -1538,9 +1544,46 @@ fsp_alloc_free_extent(
 }
 
 /**********************************************************************//**
+Allocates a single free page from a space. */
+static __attribute__((nonnull))
+void
+fsp_alloc_from_free_frag(
+/*=====================*/
+	fsp_header_t*	header,	/*!< in/out: tablespace header */
+	xdes_t*		descr,	/*!< in/out: extent descriptor */
+	ulint		bit,	/*!< in: slot to allocate in the extent */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+	ulint		frag_n_used;
+
+	ut_ad(xdes_get_state(descr, mtr) == XDES_FREE_FRAG);
+	ut_a(xdes_get_bit(descr, XDES_FREE_BIT, bit, mtr));
+	xdes_set_bit(descr, XDES_FREE_BIT, bit, FALSE, mtr);
+
+	/* Update the FRAG_N_USED field */
+	frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES,
+				     mtr);
+	frag_n_used++;
+	mlog_write_ulint(header + FSP_FRAG_N_USED, frag_n_used, MLOG_4BYTES,
+			 mtr);
+	if (xdes_is_full(descr, mtr)) {
+		/* The fragment is full: move it to another list */
+		flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE,
+			    mtr);
+		xdes_set_state(descr, XDES_FULL_FRAG, mtr);
+
+		flst_add_last(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE,
+			      mtr);
+		mlog_write_ulint(header + FSP_FRAG_N_USED,
+				 frag_n_used - FSP_EXTENT_SIZE, MLOG_4BYTES,
+				 mtr);
+	}
+}
+
+/**********************************************************************//**
 Allocates a single free page from a space. The page is marked as used.
 @return	the page offset, FIL_NULL if no page could be allocated */
-static
+static __attribute__((nonnull, warn_unused_result))
 ulint
 fsp_alloc_free_page(
 /*================*/
@@ -1548,19 +1591,22 @@ fsp_alloc_free_page(
 	ulint	zip_size,/*!< in: compressed page size in bytes
 			or 0 for uncompressed pages */
 	ulint	hint,	/*!< in: hint of which page would be desirable */
-	mtr_t*	mtr)	/*!< in: mtr handle */
+	mtr_t*	mtr,	/*!< in/out: mini-transaction */
+	mtr_t*	init_mtr)/*!< in/out: mini-transaction in which the
+			page should be initialized
+			(may be the same as mtr) */
 {
 	fsp_header_t*	header;
 	fil_addr_t	first;
 	xdes_t*		descr;
 	buf_block_t*	block;
 	ulint		free;
-	ulint		frag_n_used;
 	ulint		page_no;
 	ulint		space_size;
 	ibool		success;
 
 	ut_ad(mtr);
+	ut_ad(init_mtr);
 
 	header = fsp_get_space_header(space, zip_size, mtr);
 
@@ -1642,38 +1688,19 @@ fsp_alloc_free_page(
 		}
 	}
 
-	xdes_set_bit(descr, XDES_FREE_BIT, free, FALSE, mtr);
-
-	/* Update the FRAG_N_USED field */
-	frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES,
-				     mtr);
-	frag_n_used++;
-	mlog_write_ulint(header + FSP_FRAG_N_USED, frag_n_used, MLOG_4BYTES,
-			 mtr);
-	if (xdes_is_full(descr, mtr)) {
-		/* The fragment is full: move it to another list */
-		flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE,
-			    mtr);
-		xdes_set_state(descr, XDES_FULL_FRAG, mtr);
-
-		flst_add_last(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE,
-			      mtr);
-		mlog_write_ulint(header + FSP_FRAG_N_USED,
-				 frag_n_used - FSP_EXTENT_SIZE, MLOG_4BYTES,
-				 mtr);
-	}
+	fsp_alloc_from_free_frag(header, descr, free, mtr);
 
 	/* Initialize the allocated page to the buffer pool, so that it can
 	be obtained immediately with buf_page_get without need for a disk
 	read. */
 
-	buf_page_create(space, page_no, zip_size, mtr);
+	buf_page_create(space, page_no, zip_size, init_mtr);
 
-	block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, mtr);
+	block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, init_mtr);
 	buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
 
 	/* Prior contents of the page should be ignored */
-	fsp_init_file_page(block, mtr);
+	fsp_init_file_page(block, init_mtr);
 
 	return(page_no);
 }
@@ -1909,7 +1936,7 @@ fsp_alloc_seg_inode_page(
 	zip_size = dict_table_flags_to_zip_size(
 		mach_read_from_4(FSP_SPACE_FLAGS + space_header));
 
-	page_no = fsp_alloc_free_page(space, zip_size, 0, mtr);
+	page_no = fsp_alloc_free_page(space, zip_size, 0, mtr, mtr);
 
 	if (page_no == FIL_NULL) {
 
@@ -2323,7 +2350,7 @@ fseg_create_general(
 
 	if (page == 0) {
 		page = fseg_alloc_free_page_low(space, zip_size,
-						inode, 0, FSP_UP, mtr);
+						inode, 0, FSP_UP, mtr, mtr);
 
 		if (page == FIL_NULL) {
 
@@ -2572,14 +2599,19 @@ fseg_alloc_free_page_low(
 	ulint		space,	/*!< in: space */
 	ulint		zip_size,/*!< in: compressed page size in bytes
 				or 0 for uncompressed pages */
-	fseg_inode_t*	seg_inode, /*!< in: segment inode */
+	fseg_inode_t*	seg_inode, /*!< in/out: segment inode */
 	ulint		hint,	/*!< in: hint of which page would be desirable */
 	byte		direction, /*!< in: if the new page is needed because
 				of an index page split, and records are
 				inserted there in order, into which
 				direction they go alphabetically: FSP_DOWN,
 				FSP_UP, FSP_NO_DIR */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	mtr_t*		mtr,	/*!< in/out: mini-transaction */
+	mtr_t*		init_mtr)/*!< in/out: mini-transaction in which the
+				page should be initialized
+				(may be the same as mtr), or NULL if it
+				should not be initialized (the page at hint
+				was previously freed in mtr) */
 {
 	fsp_header_t*	space_header;
 	ulint		space_size;
@@ -2590,7 +2622,6 @@ fseg_alloc_free_page_low(
 	ulint		ret_page;	/*!< the allocated page offset, FIL_NULL
 					if could not be allocated */
 	xdes_t*		ret_descr;	/*!< the extent of the allocated page */
-	ibool		frag_page_allocated = FALSE;
 	ibool		success;
 	ulint		n;
 
@@ -2612,6 +2643,8 @@ fseg_alloc_free_page_low(
 	if (descr == NULL) {
 		/* Hint outside space or too high above free limit: reset
 		hint */
+		ut_a(init_mtr);
+		/* The file space header page is always allocated. */
 		hint = 0;
 		descr = xdes_get_descriptor(space, zip_size, hint, mtr);
 	}
@@ -2623,15 +2656,20 @@ fseg_alloc_free_page_low(
 						   mtr), seg_id))
 	    && (xdes_get_bit(descr, XDES_FREE_BIT,
 			     hint % FSP_EXTENT_SIZE, mtr) == TRUE)) {
-
+take_hinted_page:
 		/* 1. We can take the hinted page
 		=================================*/
 		ret_descr = descr;
 		ret_page = hint;
+		/* Skip the check for extending the tablespace. If the
+		page hint were not within the size of the tablespace,
+		we would have got (descr == NULL) above and reset the hint. */
+		goto got_hinted_page;
 		/*-----------------------------------------------------------*/
-	} else if ((xdes_get_state(descr, mtr) == XDES_FREE)
-		   && ((reserved - used) < reserved / FSEG_FILLFACTOR)
-		   && (used >= FSEG_FRAG_LIMIT)) {
+	} else if (xdes_get_state(descr, mtr) == XDES_FREE
+		   && (!init_mtr
+		       || ((reserved - used < reserved / FSEG_FILLFACTOR)
+			   && used >= FSEG_FRAG_LIMIT))) {
 
 		/* 2. We allocate the free extent from space and can take
 		=========================================================
@@ -2649,8 +2687,20 @@ fseg_alloc_free_page_low(
 		/* Try to fill the segment free list */
 		fseg_fill_free_list(seg_inode, space, zip_size,
 				    hint + FSP_EXTENT_SIZE, mtr);
-		ret_page = hint;
+		goto take_hinted_page;
 		/*-----------------------------------------------------------*/
+	} else if (!init_mtr) {
+		ut_a(xdes_get_state(descr, mtr) == XDES_FREE_FRAG);
+		fsp_alloc_from_free_frag(space_header, descr,
+					 hint % FSP_EXTENT_SIZE, mtr);
+		ret_page = hint;
+		ret_descr = NULL;
+
+		/* Put the page in the fragment page array of the segment */
+		n = fseg_find_free_frag_page_slot(seg_inode, mtr);
+		ut_a(n != FIL_NULL);
+		fseg_set_nth_frag_page_no(seg_inode, n, ret_page, mtr);
+		goto got_hinted_page;
 	} else if ((direction != FSP_NO_DIR)
 		   && ((reserved - used) < reserved / FSEG_FILLFACTOR)
 		   && (used >= FSEG_FRAG_LIMIT)
@@ -2710,11 +2760,10 @@ fseg_alloc_free_page_low(
 	} else if (used < FSEG_FRAG_LIMIT) {
 		/* 6. We allocate an individual page from the space
 		===================================================*/
-		ret_page = fsp_alloc_free_page(space, zip_size, hint, mtr);
+		ret_page = fsp_alloc_free_page(space, zip_size, hint,
+					       mtr, init_mtr);
 		ret_descr = NULL;
 
-		frag_page_allocated = TRUE;
-
 		if (ret_page != FIL_NULL) {
 			/* Put the page in the fragment page array of the
 			segment */
@@ -2724,6 +2773,10 @@ fseg_alloc_free_page_low(
 			fseg_set_nth_frag_page_no(seg_inode, n, ret_page,
 						  mtr);
 		}
+
+		/* fsp_alloc_free_page() invoked fsp_init_file_page()
+		already. */
+		return(ret_page);
 		/*-----------------------------------------------------------*/
 	} else {
 		/* 7. We allocate a new extent and take its first page
@@ -2771,26 +2824,34 @@ fseg_alloc_free_page_low(
 		}
 	}
 
-	if (!frag_page_allocated) {
+got_hinted_page:
+	{
 		/* Initialize the allocated page to buffer pool, so that it
 		can be obtained immediately with buf_page_get without need
 		for a disk read */
 		buf_block_t*	block;
 		ulint		zip_size = dict_table_flags_to_zip_size(
 			mach_read_from_4(FSP_SPACE_FLAGS + space_header));
+		mtr_t*		block_mtr = init_mtr ? init_mtr : mtr;
 
-		block = buf_page_create(space, ret_page, zip_size, mtr);
+		block = buf_page_create(space, ret_page, zip_size, block_mtr);
 		buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
 
 		if (UNIV_UNLIKELY(block != buf_page_get(space, zip_size,
 							ret_page, RW_X_LATCH,
-							mtr))) {
+							block_mtr))) {
 			ut_error;
 		}
 
-		/* The prior contents of the page should be ignored */
-		fsp_init_file_page(block, mtr);
+		if (init_mtr) {
+			/* The prior contents of the page should be ignored */
+			fsp_init_file_page(block, init_mtr);
+		}
+	}
 
+	/* ret_descr == NULL if the block was allocated from free_frag
+	(XDES_FREE_FRAG) */
+	if (ret_descr != NULL) {
 		/* At this point we know the extent and the page offset.
 		The extent is still in the appropriate list (FSEG_NOT_FULL
 		or FSEG_FREE), and the page is not yet marked as used. */
@@ -2803,8 +2864,6 @@ fseg_alloc_free_page_low(
 		fseg_mark_page_used(seg_inode, space, zip_size, ret_page, mtr);
 	}
 
-	buf_reset_check_index_page_at_flush(space, ret_page);
-
 	return(ret_page);
 }
 
@@ -2817,7 +2876,7 @@ UNIV_INTERN
 ulint
 fseg_alloc_free_page_general(
 /*=========================*/
-	fseg_header_t*	seg_header,/*!< in: segment header */
+	fseg_header_t*	seg_header,/*!< in/out: segment header */
 	ulint		hint,	/*!< in: hint of which page would be desirable */
 	byte		direction,/*!< in: if the new page is needed because
 				of an index page split, and records are
@@ -2829,7 +2888,11 @@ fseg_alloc_free_page_general(
 				with fsp_reserve_free_extents, then there
 				is no need to do the check for this individual
 				page */
-	mtr_t*		mtr)	/*!< in: mtr handle */
+	mtr_t*		mtr,	/*!< in/out: mini-transaction handle */
+	mtr_t*		init_mtr)/*!< in/out: mtr or another mini-transaction
+				in which the page should be initialized,
+				or NULL if this is a "fake allocation" of
+				a page that was previously freed in mtr */
 {
 	fseg_inode_t*	inode;
 	ulint		space;
@@ -2871,7 +2934,8 @@ fseg_alloc_free_page_general(
 	}
 
 	page_no = fseg_alloc_free_page_low(space, zip_size,
-					   inode, hint, direction, mtr);
+					   inode, hint, direction,
+					   mtr, init_mtr);
 	if (!has_done_reservation) {
 		fil_space_release_free_extents(space, n_reserved);
 	}
@@ -2880,28 +2944,6 @@ fseg_alloc_free_page_general(
 }
 
 /**********************************************************************//**
-Allocates a single free page from a segment. This function implements
-the intelligent allocation strategy which tries to minimize file space
-fragmentation.
-@return	allocated page offset, FIL_NULL if no page could be allocated */
-UNIV_INTERN
-ulint
-fseg_alloc_free_page(
-/*=================*/
-	fseg_header_t*	seg_header,/*!< in: segment header */
-	ulint		hint,	/*!< in: hint of which page would be desirable */
-	byte		direction,/*!< in: if the new page is needed because
-				of an index page split, and records are
-				inserted there in order, into which
-				direction they go alphabetically: FSP_DOWN,
-				FSP_UP, FSP_NO_DIR */
-	mtr_t*		mtr)	/*!< in: mtr handle */
-{
-	return(fseg_alloc_free_page_general(seg_header, hint, direction,
-					    FALSE, mtr));
-}
-
-/**********************************************************************//**
 Checks that we have at least 2 frag pages free in the first extent of a
 single-table tablespace, and they are also physically initialized to the data
 file. That is we have already extended the data file so that those pages are

=== modified file 'storage/innodb_plugin/include/btr0btr.h'
--- a/storage/innodb_plugin/include/btr0btr.h	2011-08-15 09:11:43 +0000
+++ b/storage/innodb_plugin/include/btr0btr.h	2011-08-29 08:16:42 +0000
@@ -557,7 +557,12 @@ btr_page_alloc(
 					page split is made */
 	ulint		level,		/*!< in: level where the page is placed
 					in the tree */
-	mtr_t*		mtr);		/*!< in: mtr */
+	mtr_t*		mtr,		/*!< in/out: mini-transaction
+					for the allocation */
+	mtr_t*		init_mtr)	/*!< in/out: mini-transaction
+					for x-latching and initializing
+					the page */
+	__attribute__((nonnull, warn_unused_result));
 /**************************************************************//**
 Frees a file page used in an index tree. NOTE: cannot free field external
 storage pages because the page must contain info on its level. */
@@ -580,6 +585,33 @@ btr_page_free_low(
 	buf_block_t*	block,	/*!< in: block to be freed, x-latched */
 	ulint		level,	/*!< in: page level */
 	mtr_t*		mtr);	/*!< in: mtr */
+/**************************************************************//**
+Marks all MTR_MEMO_FREE_CLUST_LEAF pages nonfree or free.
+For invoking btr_store_big_rec_extern_fields() after an update,
+we must temporarily mark freed clustered index pages allocated, so
+that off-page columns will not be allocated from them. Between the
+btr_store_big_rec_extern_fields() and mtr_commit() we have to
+mark the pages free again, so that no pages will be leaked. */
+UNIV_INTERN
+void
+btr_mark_freed_leaves(
+/*==================*/
+	dict_index_t*	index,	/*!< in/out: clustered index */
+	mtr_t*		mtr,	/*!< in/out: mini-transaction */
+	ibool		nonfree)/*!< in: TRUE=mark nonfree, FALSE=mark freed */
+	__attribute__((nonnull));
+#ifdef UNIV_DEBUG
+/**************************************************************//**
+Validates all pages marked MTR_MEMO_FREE_CLUST_LEAF.
+@see btr_mark_freed_leaves()
+@return TRUE */
+UNIV_INTERN
+ibool
+btr_freed_leaves_validate(
+/*======================*/
+	mtr_t*	mtr)	/*!< in: mini-transaction */
+	__attribute__((nonnull, warn_unused_result));
+#endif /* UNIV_DEBUG */
 #ifdef UNIV_BTR_PRINT
 /*************************************************************//**
 Prints size info of a B-tree. */

=== modified file 'storage/innodb_plugin/include/btr0cur.h'
--- a/storage/innodb_plugin/include/btr0cur.h	2011-06-16 07:27:21 +0000
+++ b/storage/innodb_plugin/include/btr0cur.h	2011-08-29 08:16:42 +0000
@@ -326,16 +326,6 @@ btr_cur_pessimistic_update(
 	que_thr_t*	thr,	/*!< in: query thread */
 	mtr_t*		mtr);	/*!< in: mtr; must be committed before
 				latching any further pages */
-/*****************************************************************
-Commits and restarts a mini-transaction so that it will retain an
-x-lock on index->lock and the cursor page. */
-UNIV_INTERN
-void
-btr_cur_mtr_commit_and_start(
-/*=========================*/
-	btr_cur_t*	cursor,	/*!< in: cursor */
-	mtr_t*		mtr)	/*!< in/out: mini-transaction */
-	__attribute__((nonnull));
 /***********************************************************//**
 Marks a clustered index record deleted. Writes an undo log record to
 undo log on this delete marking. Writes in the trx id field the id
@@ -540,6 +530,8 @@ btr_store_big_rec_extern_fields_func(
 					the "external storage" flags in offsets
 					will not correspond to rec when
 					this function returns */
+	const big_rec_t*big_rec_vec,	/*!< in: vector containing fields
+					to be stored externally */
 #ifdef UNIV_DEBUG
 	mtr_t*		local_mtr,	/*!< in: mtr containing the
 					latch to rec and to the tree */
@@ -548,9 +540,12 @@ btr_store_big_rec_extern_fields_func(
 	ibool		update_in_place,/*! in: TRUE if the record is updated
 					in place (not delete+insert) */
 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
-	const big_rec_t*big_rec_vec)	/*!< in: vector containing fields
-					to be stored externally */
-	__attribute__((nonnull));
+	mtr_t*		alloc_mtr)	/*!< in/out: in an insert, NULL;
+					in an update, local_mtr for
+					allocating BLOB pages and
+					updating BLOB pointers; alloc_mtr
+					must not have freed any leaf pages */
+	__attribute__((nonnull(1,2,3,4,5), warn_unused_result));
 
 /** Stores the fields in big_rec_vec to the tablespace and puts pointers to
 them in rec.  The extern flags in rec will have to be set beforehand.
@@ -559,21 +554,22 @@ file segment of the index tree.
 @param index	in: clustered index; MUST be X-latched by mtr
 @param b	in/out: block containing rec; MUST be X-latched by mtr
 @param rec	in/out: clustered index record
-@param offsets	in: rec_get_offsets(rec, index);
+@param offs	in: rec_get_offsets(rec, index);
 		the "external storage" flags in offsets will not be adjusted
+@param big	in: vector containing fields to be stored externally
 @param mtr	in: mini-transaction that holds x-latch on index and b
 @param upd	in: TRUE if the record is updated in place (not delete+insert)
-@param big	in: vector containing fields to be stored externally
+@param rmtr	in/out: in updates, the mini-transaction that holds rec
 @return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
 #ifdef UNIV_DEBUG
-# define btr_store_big_rec_extern_fields(index,b,rec,offsets,mtr,upd,big) \
-	btr_store_big_rec_extern_fields_func(index,b,rec,offsets,mtr,upd,big)
+# define btr_store_big_rec_extern_fields(index,b,rec,offs,big,mtr,upd,rmtr) \
+	btr_store_big_rec_extern_fields_func(index,b,rec,offs,big,mtr,upd,rmtr)
 #elif defined UNIV_BLOB_LIGHT_DEBUG
-# define btr_store_big_rec_extern_fields(index,b,rec,offsets,mtr,upd,big) \
-	btr_store_big_rec_extern_fields_func(index,b,rec,offsets,upd,big)
+# define btr_store_big_rec_extern_fields(index,b,rec,offs,big,mtr,upd,rmtr) \
+	btr_store_big_rec_extern_fields_func(index,b,rec,offs,big,upd,rmtr)
 #else
-# define btr_store_big_rec_extern_fields(index,b,rec,offsets,mtr,upd,big) \
-	btr_store_big_rec_extern_fields_func(index,b,rec,offsets,big)
+# define btr_store_big_rec_extern_fields(index,b,rec,offs,big,mtr,upd,rmtr) \
+	btr_store_big_rec_extern_fields_func(index,b,rec,offs,big,rmtr)
 #endif
 
 /*******************************************************************//**

=== modified file 'storage/innodb_plugin/include/buf0buf.h'
--- a/storage/innodb_plugin/include/buf0buf.h	2011-07-19 14:37:37 +0000
+++ b/storage/innodb_plugin/include/buf0buf.h	2011-08-29 08:16:42 +0000
@@ -372,15 +372,6 @@ buf_page_peek(
 /*==========*/
 	ulint	space,	/*!< in: space id */
 	ulint	offset);/*!< in: page number */
-/********************************************************************//**
-Resets the check_index_page_at_flush field of a page if found in the buffer
-pool. */
-UNIV_INTERN
-void
-buf_reset_check_index_page_at_flush(
-/*================================*/
-	ulint	space,	/*!< in: space id */
-	ulint	offset);/*!< in: page number */
 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
 /********************************************************************//**
 Sets file_page_was_freed TRUE if the page is found in the buffer pool.

=== modified file 'storage/innodb_plugin/include/fsp0fsp.h'
--- a/storage/innodb_plugin/include/fsp0fsp.h	2009-08-06 22:04:03 +0000
+++ b/storage/innodb_plugin/include/fsp0fsp.h	2011-08-29 08:16:42 +0000
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -176,19 +176,18 @@ fseg_n_reserved_pages(
 Allocates a single free page from a segment. This function implements
 the intelligent allocation strategy which tries to minimize
 file space fragmentation.
-@return	the allocated page offset FIL_NULL if no page could be allocated */
-UNIV_INTERN
-ulint
-fseg_alloc_free_page(
-/*=================*/
-	fseg_header_t*	seg_header, /*!< in: segment header */
-	ulint		hint,	/*!< in: hint of which page would be desirable */
-	byte		direction, /*!< in: if the new page is needed because
+@param[in/out] seg_header	segment header
+@param[in] hint			hint of which page would be desirable
+@param[in] direction		if the new page is needed because
 				of an index page split, and records are
 				inserted there in order, into which
 				direction they go alphabetically: FSP_DOWN,
-				FSP_UP, FSP_NO_DIR */
-	mtr_t*		mtr);	/*!< in: mtr handle */
+				FSP_UP, FSP_NO_DIR
+@param[in/out] mtr		mini-transaction
+@return	the allocated page offset FIL_NULL if no page could be allocated */
+#define fseg_alloc_free_page(seg_header, hint, direction, mtr)		\
+	fseg_alloc_free_page_general(seg_header, hint, direction,	\
+				     FALSE, mtr, mtr)
 /**********************************************************************//**
 Allocates a single free page from a segment. This function implements
 the intelligent allocation strategy which tries to minimize file space
@@ -198,7 +197,7 @@ UNIV_INTERN
 ulint
 fseg_alloc_free_page_general(
 /*=========================*/
-	fseg_header_t*	seg_header,/*!< in: segment header */
+	fseg_header_t*	seg_header,/*!< in/out: segment header */
 	ulint		hint,	/*!< in: hint of which page would be desirable */
 	byte		direction,/*!< in: if the new page is needed because
 				of an index page split, and records are
@@ -210,7 +209,12 @@ fseg_alloc_free_page_general(
 				with fsp_reserve_free_extents, then there
 				is no need to do the check for this individual
 				page */
-	mtr_t*		mtr);	/*!< in: mtr handle */
+	mtr_t*		mtr,	/*!< in/out: mini-transaction */
+	mtr_t*		init_mtr)/*!< in/out: mtr or another mini-transaction
+				in which the page should be initialized,
+				or NULL if this is a "fake allocation" of
+				a page that was previously freed in mtr */
+	__attribute__((warn_unused_result, nonnull(1,5)));
 /**********************************************************************//**
 Reserves free pages from a tablespace. All mini-transactions which may
 use several pages from the tablespace should call this function beforehand

=== modified file 'storage/innodb_plugin/include/mtr0mtr.h'
--- a/storage/innodb_plugin/include/mtr0mtr.h	2009-09-16 07:07:21 +0000
+++ b/storage/innodb_plugin/include/mtr0mtr.h	2011-08-29 08:16:42 +0000
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -53,6 +53,8 @@ first 3 values must be RW_S_LATCH, RW_X_
 #define MTR_MEMO_MODIFY		54
 #define	MTR_MEMO_S_LOCK		55
 #define	MTR_MEMO_X_LOCK		56
+/** The mini-transaction freed a clustered index leaf page. */
+#define MTR_MEMO_FREE_CLUST_LEAF	57
 
 /** @name Log item types
 The log items are declared 'byte' so that the compiler can warn if val
@@ -387,9 +389,12 @@ struct mtr_struct{
 #endif
 	dyn_array_t	memo;	/*!< memo stack for locks etc. */
 	dyn_array_t	log;	/*!< mini-transaction log */
-	ibool		modifications;
-				/* TRUE if the mtr made modifications to
-				buffer pool pages */
+	unsigned	modifications:1;
+				/*!< TRUE if the mini-transaction
+				modified buffer pool pages */
+	unsigned	freed_clust_leaf:1;
+				/*!< TRUE if MTR_MEMO_FREE_CLUST_LEAF
+				was logged in the mini-transaction */
 	ulint		n_log_recs;
 				/* count of how many page initial log records
 				have been written to the mtr log */

=== modified file 'storage/innodb_plugin/include/mtr0mtr.ic'
--- a/storage/innodb_plugin/include/mtr0mtr.ic	2010-03-26 14:19:01 +0000
+++ b/storage/innodb_plugin/include/mtr0mtr.ic	2011-08-29 08:16:42 +0000
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -44,6 +44,7 @@ mtr_start(
 
 	mtr->log_mode = MTR_LOG_ALL;
 	mtr->modifications = FALSE;
+	mtr->freed_clust_leaf = FALSE;
 	mtr->n_log_recs = 0;
 
 	ut_d(mtr->state = MTR_ACTIVE);
@@ -67,7 +68,8 @@ mtr_memo_push(
 
 	ut_ad(object);
 	ut_ad(type >= MTR_MEMO_PAGE_S_FIX);
-	ut_ad(type <= MTR_MEMO_X_LOCK);
+	ut_ad(type <= MTR_MEMO_FREE_CLUST_LEAF);
+	ut_ad(type != MTR_MEMO_FREE_CLUST_LEAF || mtr->freed_clust_leaf);
 	ut_ad(mtr);
 	ut_ad(mtr->magic_n == MTR_MAGIC_N);
 	ut_ad(mtr->state == MTR_ACTIVE);

=== modified file 'storage/innodb_plugin/include/trx0undo.h'
--- a/storage/innodb_plugin/include/trx0undo.h	2011-04-07 18:12:54 +0000
+++ b/storage/innodb_plugin/include/trx0undo.h	2011-08-29 08:16:42 +0000
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -204,17 +204,51 @@ trx_undo_add_page(
 	mtr_t*		mtr);	/*!< in: mtr which does not have a latch to any
 				undo log page; the caller must have reserved
 				the rollback segment mutex */
+/********************************************************************//**
+Frees the last undo log page.
+The caller must hold the rollback segment mutex. */
+UNIV_INTERN
+void
+trx_undo_free_last_page_func(
+/*==========================*/
+#ifdef UNIV_DEBUG
+	const trx_t*	trx,	/*!< in: transaction */
+#endif /* UNIV_DEBUG */
+	trx_undo_t*	undo,	/*!< in/out: undo log memory copy */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction which does not
+				have a latch to any undo log page or which
+				has allocated the undo log page */
+	__attribute__((nonnull));
+#ifdef UNIV_DEBUG
+# define trx_undo_free_last_page(trx,undo,mtr)	\
+	trx_undo_free_last_page_func(trx,undo,mtr)
+#else /* UNIV_DEBUG */
+# define trx_undo_free_last_page(trx,undo,mtr)	\
+	trx_undo_free_last_page_func(undo,mtr)
+#endif /* UNIV_DEBUG */
+
 /***********************************************************************//**
 Truncates an undo log from the end. This function is used during a rollback
 to free space from an undo log. */
 UNIV_INTERN
 void
-trx_undo_truncate_end(
-/*==================*/
-	trx_t*		trx,	/*!< in: transaction whose undo log it is */
-	trx_undo_t*	undo,	/*!< in: undo log */
-	undo_no_t	limit);	/*!< in: all undo records with undo number
+trx_undo_truncate_end_func(
+/*=======================*/
+#ifdef UNIV_DEBUG
+	const trx_t*	trx,	/*!< in: transaction whose undo log it is */
+#endif /* UNIV_DEBUG */
+	trx_undo_t*	undo,	/*!< in/out: undo log */
+	undo_no_t	limit)	/*!< in: all undo records with undo number
 				>= this value should be truncated */
+	__attribute__((nonnull));
+#ifdef UNIV_DEBUG
+# define trx_undo_truncate_end(trx,undo,limit)		\
+	trx_undo_truncate_end_func(trx,undo,limit)
+#else /* UNIV_DEBUG */
+# define trx_undo_truncate_end(trx,undo,limit)		\
+	trx_undo_truncate_end_func(undo,limit)
+#endif /* UNIV_DEBUG */
+
 /***********************************************************************//**
 Truncates an undo log from the start. This function is used during a purge
 operation. */

=== modified file 'storage/innodb_plugin/mtr/mtr0mtr.c'
--- a/storage/innodb_plugin/mtr/mtr0mtr.c	2009-09-30 12:55:23 +0000
+++ b/storage/innodb_plugin/mtr/mtr0mtr.c	2011-08-29 08:16:42 +0000
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -58,12 +58,11 @@ mtr_memo_slot_release(
 			buf_page_release((buf_block_t*)object, type, mtr);
 		} else if (type == MTR_MEMO_S_LOCK) {
 			rw_lock_s_unlock((rw_lock_t*)object);
-#ifdef UNIV_DEBUG
 		} else if (type != MTR_MEMO_X_LOCK) {
-			ut_ad(type == MTR_MEMO_MODIFY);
+			ut_ad(type == MTR_MEMO_MODIFY
+			      || type == MTR_MEMO_FREE_CLUST_LEAF);
 			ut_ad(mtr_memo_contains(mtr, object,
 						MTR_MEMO_PAGE_X_FIX));
-#endif /* UNIV_DEBUG */
 		} else {
 			rw_lock_x_unlock((rw_lock_t*)object);
 		}

=== modified file 'storage/innodb_plugin/row/row0ins.c'
--- a/storage/innodb_plugin/row/row0ins.c	2011-06-16 07:27:21 +0000
+++ b/storage/innodb_plugin/row/row0ins.c	2011-08-29 08:16:42 +0000
@@ -2094,15 +2094,20 @@ row_ins_index_entry_low(
 			if (big_rec) {
 				ut_a(err == DB_SUCCESS);
 				/* Write out the externally stored
-				columns while still x-latching
-				index->lock and block->lock. We have
-				to mtr_commit(mtr) first, so that the
-				redo log will be written in the
-				correct order. Otherwise, we would run
-				into trouble on crash recovery if mtr
-				freed B-tree pages on which some of
-				the big_rec fields will be written. */
-				btr_cur_mtr_commit_and_start(&cursor, &mtr);
+				columns, but allocate the pages and
+				write the pointers using the
+				mini-transaction of the record update.
+				If any pages were freed in the update,
+				temporarily mark them allocated so
+				that off-page columns will not
+				overwrite them. We must do this,
+				because we will write the redo log for
+				the BLOB writes before writing the
+				redo log for the record update. Thus,
+				redo log application at crash recovery
+				will see BLOBs being written to free pages. */
+
+				btr_mark_freed_leaves(index, &mtr, TRUE);
 
 				rec = btr_cur_get_rec(&cursor);
 				offsets = rec_get_offsets(
@@ -2111,7 +2116,8 @@ row_ins_index_entry_low(
 
 				err = btr_store_big_rec_extern_fields(
 					index, btr_cur_get_block(&cursor),
-					rec, offsets, &mtr, FALSE, big_rec);
+					rec, offsets, big_rec, &mtr,
+					FALSE, &mtr);
 				/* If writing big_rec fails (for
 				example, because of DB_OUT_OF_FILE_SPACE),
 				the record will be corrupted. Even if
@@ -2124,6 +2130,9 @@ row_ins_index_entry_low(
 				undo log, and thus the record cannot
 				be rolled back. */
 				ut_a(err == DB_SUCCESS);
+				/* Free the pages again
+				in order to avoid a leak. */
+				btr_mark_freed_leaves(index, &mtr, FALSE);
 				goto stored_big_rec;
 			}
 		} else {
@@ -2165,7 +2174,7 @@ function_exit:
 
 		err = btr_store_big_rec_extern_fields(
 			index, btr_cur_get_block(&cursor),
-			rec, offsets, &mtr, FALSE, big_rec);
+			rec, offsets, big_rec, &mtr, FALSE, NULL);
 
 stored_big_rec:
 		if (modify) {

=== modified file 'storage/innodb_plugin/row/row0row.c'
--- a/storage/innodb_plugin/row/row0row.c	2011-07-07 21:29:30 +0000
+++ b/storage/innodb_plugin/row/row0row.c	2011-08-29 08:16:42 +0000
@@ -243,19 +243,20 @@ row_build(
 	}
 
 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
-	/* This condition can occur during crash recovery before
-	trx_rollback_active() has completed execution.
+	if (rec_offs_any_null_extern(rec, offsets)) {
+		/* This condition can occur during crash recovery
+		before trx_rollback_active() has completed execution.
 
-	This condition is possible if the server crashed
-	during an insert or update before
-	btr_store_big_rec_extern_fields() did mtr_commit() all
-	BLOB pointers to the clustered index record.
-
-	If the record contains a null BLOB pointer, look up the
-	transaction that holds the implicit lock on this record, and
-	assert that it was recovered (and will soon be rolled back). */
-	ut_a(!rec_offs_any_null_extern(rec, offsets)
-	     || trx_assert_recovered(row_get_rec_trx_id(rec, index, offsets)));
+		This condition is possible if the server crashed
+		during an insert or update-by-delete-and-insert before
+		btr_store_big_rec_extern_fields() did mtr_commit() all
+		BLOB pointers to the freshly inserted clustered index
+		record. */
+		ut_a(trx_assert_recovered(
+			     row_get_rec_trx_id(rec, index, offsets)));
+		ut_a(trx_undo_roll_ptr_is_insert(
+			     row_get_rec_roll_ptr(rec, index, offsets)));
+	}
 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
 
 	if (type != ROW_COPY_POINTERS) {

=== modified file 'storage/innodb_plugin/row/row0upd.c'
--- a/storage/innodb_plugin/row/row0upd.c	2011-06-16 07:27:21 +0000
+++ b/storage/innodb_plugin/row/row0upd.c	2011-08-29 08:16:42 +0000
@@ -1978,21 +1978,22 @@ row_upd_clust_rec(
 		rec_offs_init(offsets_);
 
 		ut_a(err == DB_SUCCESS);
-		/* Write out the externally stored columns while still
-		x-latching index->lock and block->lock. We have to
-		mtr_commit(mtr) first, so that the redo log will be
-		written in the correct order. Otherwise, we would run
-		into trouble on crash recovery if mtr freed B-tree
-		pages on which some of the big_rec fields will be
-		written. */
-		btr_cur_mtr_commit_and_start(btr_cur, mtr);
+		/* Write out the externally stored columns, but
+		allocate the pages and write the pointers using the
+		mini-transaction of the record update. If any pages
+		were freed in the update, temporarily mark them
+		allocated so that off-page columns will not overwrite
+		them. We must do this, because we write the redo log
+		for the BLOB writes before writing the redo log for
+		the record update. */
 
+		btr_mark_freed_leaves(index, mtr, TRUE);
 		rec = btr_cur_get_rec(btr_cur);
 		err = btr_store_big_rec_extern_fields(
 			index, btr_cur_get_block(btr_cur), rec,
 			rec_get_offsets(rec, index, offsets_,
 					ULINT_UNDEFINED, &heap),
-			mtr, TRUE, big_rec);
+			big_rec, mtr, TRUE, mtr);
 		/* If writing big_rec fails (for example, because of
 		DB_OUT_OF_FILE_SPACE), the record will be corrupted.
 		Even if we did not update any externally stored
@@ -2002,6 +2003,8 @@ row_upd_clust_rec(
 		to the undo log, and thus the record cannot be rolled
 		back. */
 		ut_a(err == DB_SUCCESS);
+		/* Free the pages again in order to avoid a leak. */
+		btr_mark_freed_leaves(index, mtr, FALSE);
 	}
 
 	mtr_commit(mtr);

=== modified file 'storage/innodb_plugin/sync/sync0sync.c'
--- a/storage/innodb_plugin/sync/sync0sync.c	2011-08-15 09:11:43 +0000
+++ b/storage/innodb_plugin/sync/sync0sync.c	2011-08-22 14:03:07 +0000
@@ -1248,7 +1248,13 @@ sync_thread_add_level(
 					     TRUE));
 		break;
 	case SYNC_IBUF_TREE_NODE_NEW:
-		ut_a(sync_thread_levels_contain(array, SYNC_IBUF_MUTEX));
+		/* ibuf_add_free_page() allocates new pages for the
+		change buffer while only holding the tablespace
+		x-latch. These pre-allocated new pages may only be
+		taken in use while holding ibuf_mutex, in
+		btr_page_alloc_for_ibuf(). */
+		ut_a(sync_thread_levels_contain(array, SYNC_IBUF_MUTEX)
+		     || sync_thread_levels_contain(array, SYNC_FSP));
 		break;
 	case SYNC_IBUF_INDEX_TREE:
 		if (sync_thread_levels_contain(array, SYNC_FSP)) {

=== modified file 'storage/innodb_plugin/trx/trx0rec.c'
--- a/storage/innodb_plugin/trx/trx0rec.c	2011-06-16 08:51:04 +0000
+++ b/storage/innodb_plugin/trx/trx0rec.c	2011-08-29 08:16:42 +0000
@@ -1097,22 +1097,29 @@ trx_undo_rec_get_partial_row(
 #endif /* !UNIV_HOTBACKUP */
 
 /***********************************************************************//**
-Erases the unused undo log page end. */
-static
-void
+Erases the unused undo log page end.
+@return TRUE if the page contained something, FALSE if it was empty */
+static __attribute__((nonnull, warn_unused_result))
+ibool
 trx_undo_erase_page_end(
 /*====================*/
-	page_t*	undo_page,	/*!< in: undo page whose end to erase */
-	mtr_t*	mtr)		/*!< in: mtr */
+	page_t*	undo_page,	/*!< in/out: undo page whose end to erase */
+	mtr_t*	mtr)		/*!< in/out: mini-transaction */
 {
 	ulint	first_free;
 
 	first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
 				      + TRX_UNDO_PAGE_FREE);
+	if (first_free == TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE) {
+		/* This was an empty page to begin with.
+		Do nothing here; the caller should free the page. */
+		return(FALSE);
+	}
 	memset(undo_page + first_free, 0xff,
 	       (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END) - first_free);
 
 	mlog_write_initial_log_record(undo_page, MLOG_UNDO_ERASE_END, mtr);
+	return(TRUE);
 }
 
 /***********************************************************//**
@@ -1134,7 +1141,11 @@ trx_undo_parse_erase_page_end(
 		return(ptr);
 	}
 
-	trx_undo_erase_page_end(page, mtr);
+	if (!trx_undo_erase_page_end(page, mtr)) {
+		/* The function trx_undo_erase_page_end() should not
+		have done anything to an empty page. */
+		ut_ad(0);
+	}
 
 	return(ptr);
 }
@@ -1180,6 +1191,9 @@ trx_undo_report_row_operation(
 	mem_heap_t*	heap		= NULL;
 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
 	ulint*		offsets		= offsets_;
+#ifdef UNIV_DEBUG
+	int		loop_count	= 0;
+#endif /* UNIV_DEBUG */
 	rec_offs_init(offsets_);
 
 	ut_a(dict_index_is_clust(index));
@@ -1242,7 +1256,7 @@ trx_undo_report_row_operation(
 
 	mtr_start(&mtr);
 
-	for (;;) {
+	do {
 		buf_block_t*	undo_block;
 		page_t*		undo_page;
 		ulint		offset;
@@ -1271,7 +1285,19 @@ trx_undo_report_row_operation(
 			version the replicate page constructed using the log
 			records stays identical to the original page */
 
-			trx_undo_erase_page_end(undo_page, &mtr);
+			if (!trx_undo_erase_page_end(undo_page, &mtr)) {
+				/* The record did not fit on an empty
+				undo page. Discard the freshly allocated
+				page and return an error. */
+
+				mutex_enter(&rseg->mutex);
+				trx_undo_free_last_page(trx, undo, &mtr);
+				mutex_exit(&rseg->mutex);
+
+				err = DB_TOO_BIG_RECORD;
+				goto err_exit;
+			}
+
 			mtr_commit(&mtr);
 		} else {
 			/* Success */
@@ -1291,16 +1317,15 @@ trx_undo_report_row_operation(
 			*roll_ptr = trx_undo_build_roll_ptr(
 				op_type == TRX_UNDO_INSERT_OP,
 				rseg->id, page_no, offset);
-			if (UNIV_LIKELY_NULL(heap)) {
-				mem_heap_free(heap);
-			}
-			return(DB_SUCCESS);
+			err = DB_SUCCESS;
+			goto func_exit;
 		}
 
 		ut_ad(page_no == undo->last_page_no);
 
 		/* We have to extend the undo log by one page */
 
+		ut_ad(++loop_count < 2);
 		mtr_start(&mtr);
 
 		/* When we add a page to an undo log, this is analogous to
@@ -1312,18 +1337,19 @@ trx_undo_report_row_operation(
 		page_no = trx_undo_add_page(trx, undo, &mtr);
 
 		mutex_exit(&(rseg->mutex));
+	} while (UNIV_LIKELY(page_no != FIL_NULL));
 
-		if (UNIV_UNLIKELY(page_no == FIL_NULL)) {
-			/* Did not succeed: out of space */
+	/* Did not succeed: out of space */
+	err = DB_OUT_OF_FILE_SPACE;
 
-			mutex_exit(&(trx->undo_mutex));
-			mtr_commit(&mtr);
-			if (UNIV_LIKELY_NULL(heap)) {
-				mem_heap_free(heap);
-			}
-			return(DB_OUT_OF_FILE_SPACE);
-		}
+err_exit:
+	mutex_exit(&trx->undo_mutex);
+	mtr_commit(&mtr);
+func_exit:
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
 	}
+	return(err);
 }
 
 /*============== BUILDING PREVIOUS VERSION OF A RECORD ===============*/

=== modified file 'storage/innodb_plugin/trx/trx0undo.c'
--- a/storage/innodb_plugin/trx/trx0undo.c	2011-06-14 05:40:32 +0000
+++ b/storage/innodb_plugin/trx/trx0undo.c	2011-08-29 08:16:42 +0000
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -912,7 +912,7 @@ trx_undo_add_page(
 	page_no = fseg_alloc_free_page_general(header_page + TRX_UNDO_SEG_HDR
 					       + TRX_UNDO_FSEG_HEADER,
 					       undo->top_page_no + 1, FSP_UP,
-					       TRUE, mtr);
+					       TRUE, mtr, mtr);
 
 	fil_space_release_free_extents(undo->space, n_reserved);
 
@@ -998,29 +998,28 @@ trx_undo_free_page(
 }
 
 /********************************************************************//**
-Frees an undo log page when there is also the memory object for the undo
-log. */
-static
+Frees the last undo log page.
+The caller must hold the rollback segment mutex. */
+UNIV_INTERN
 void
-trx_undo_free_page_in_rollback(
-/*===========================*/
-	trx_t*		trx __attribute__((unused)), /*!< in: transaction */
-	trx_undo_t*	undo,	/*!< in: undo log memory copy */
-	ulint		page_no,/*!< in: page number to free: must not be the
-				header page */
-	mtr_t*		mtr)	/*!< in: mtr which does not have a latch to any
-				undo log page; the caller must have reserved
-				the rollback segment mutex */
-{
-	ulint	last_page_no;
-
-	ut_ad(undo->hdr_page_no != page_no);
-	ut_ad(mutex_own(&(trx->undo_mutex)));
-
-	last_page_no = trx_undo_free_page(undo->rseg, FALSE, undo->space,
-					  undo->hdr_page_no, page_no, mtr);
+trx_undo_free_last_page_func(
+/*==========================*/
+#ifdef UNIV_DEBUG
+	const trx_t*	trx,	/*!< in: transaction */
+#endif /* UNIV_DEBUG */
+	trx_undo_t*	undo,	/*!< in/out: undo log memory copy */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction which does not
+				have a latch to any undo log page or which
+				has allocated the undo log page */
+{
+	ut_ad(mutex_own(&trx->undo_mutex));
+	ut_ad(undo->hdr_page_no != undo->last_page_no);
+	ut_ad(undo->size > 0);
+
+	undo->last_page_no = trx_undo_free_page(
+		undo->rseg, FALSE, undo->space,
+		undo->hdr_page_no, undo->last_page_no, mtr);
 
-	undo->last_page_no = last_page_no;
 	undo->size--;
 }
 
@@ -1056,9 +1055,11 @@ Truncates an undo log from the end. This
 to free space from an undo log. */
 UNIV_INTERN
 void
-trx_undo_truncate_end(
-/*==================*/
-	trx_t*		trx,	/*!< in: transaction whose undo log it is */
+trx_undo_truncate_end_func(
+/*=======================*/
+#ifdef UNIV_DEBUG
+	const trx_t*	trx,	/*!< in: transaction whose undo log it is */
+#endif /* UNIV_DEBUG */
 	trx_undo_t*	undo,	/*!< in: undo log */
 	undo_no_t	limit)	/*!< in: all undo records with undo number
 				>= this value should be truncated */
@@ -1084,18 +1085,7 @@ trx_undo_truncate_end(
 
 		rec = trx_undo_page_get_last_rec(undo_page, undo->hdr_page_no,
 						 undo->hdr_offset);
-		for (;;) {
-			if (rec == NULL) {
-				if (last_page_no == undo->hdr_page_no) {
-
-					goto function_exit;
-				}
-
-				trx_undo_free_page_in_rollback(
-					trx, undo, last_page_no, &mtr);
-				break;
-			}
-
+		while (rec) {
 			if (ut_dulint_cmp(trx_undo_rec_get_undo_no(rec), limit)
 			    >= 0) {
 				/* Truncate at least this record off, maybe
@@ -1110,6 +1100,14 @@ trx_undo_truncate_end(
 							 undo->hdr_offset);
 		}
 
+		if (last_page_no == undo->hdr_page_no) {
+
+			goto function_exit;
+		}
+
+		ut_ad(last_page_no == undo->last_page_no);
+		trx_undo_free_last_page(trx, undo, &mtr);
+
 		mtr_commit(&mtr);
 	}
 

=== modified file 'strings/decimal.c'
--- a/strings/decimal.c	2011-07-03 15:47:37 +0000
+++ b/strings/decimal.c	2011-08-29 09:24:36 +0000
@@ -1423,11 +1423,18 @@ int bin2decimal(const uchar *from, decim
     buf++;
   }
   my_afree(d_copy);
+
+  /*
+    No digits? We have read the number zero, of unspecified precision.
+    Make it a proper zero, with non-zero precision.
+  */
+  if (to->intg == 0 && to->frac == 0)
+    decimal_make_zero(to);
   return error;
 
 err:
   my_afree(d_copy);
-  decimal_make_zero(((decimal_t*) to));
+  decimal_make_zero(to);
   return(E_DEC_BAD_NUM);
 }
 

=== modified file 'support-files/mysql.spec.sh'
--- a/support-files/mysql.spec.sh	2011-07-07 15:22:24 +0000
+++ b/support-files/mysql.spec.sh	2011-08-19 16:48:14 +0000
@@ -382,7 +382,7 @@ sh -c  "PATH=\"${MYSQL_BUILD_PATH:-$PATH
 	    --enable-local-infile \
 	    --with-fast-mutexes \
 	    --with-mysqld-user=%{mysqld_user} \
-	    --with-unix-socket-path=/var/lib/mysql/mysql.sock \
+	    --with-unix-socket-path=%{mysqldatadir}/mysql.sock \
 	    --with-pic \
 	    --prefix=/ \
 %if %{CLUSTER_BUILD}
@@ -858,6 +858,13 @@ chown -R %{mysqld_user}:%{mysqld_group}
 # ----------------------------------------------------------------------
 chmod -R og-rw $mysql_datadir/mysql
 
+# ----------------------------------------------------------------------
+# Deal with SELinux, if it is installed / used
+# ----------------------------------------------------------------------
+if [ -x /sbin/restorecon ] ; then
+	/sbin/restorecon -R %{mysqldatadir}
+fi
+
 # Was the server running before the upgrade? If so, restart the new one.
 if [ "$SERVER_TO_START" = "true" ] ; then
 	# Restart in the same way that mysqld will be started normally.
@@ -1165,6 +1172,15 @@ fi
 # merging BK trees)
 ##############################################################################
 %changelog
+* Fri Aug 19 2011 Joerg Bruehe <joerg.bruehe@stripped>
+
+- Fix bug#37165 "((Generic rpm)) fail to install on Fedora 9 x86_64"
+  On Fedora, certain accesses to "/var/lib/mysql/HOSTNAME.err" were blocked
+  by SELinux policy, this made the server start fail with the message
+      Manager of pid-file quit without updating file
+  Calling "/sbin/restorecon -R /var/lib/mysql" fixes this.
+- Replace occurrences of that path name by the spec file variable %{mysqldatadir}.
+
 * Thu Jul 07 2011 Joerg Bruehe <joerg.bruehe@stripped>
 
 - Fix bug#45415: "rpm upgrade recreates test database"

No bundle (reason: useless for push emails).
Thread
bzr push into mysql-5.1-mtr branch (bjorn.munch:3011 to 3013) Bjorn Munch30 Aug