List: Commits    « Previous Message | Next Message »
From: Sergei Golubchik    Date: February 17 2009 11:37am
Subject: bzr push into mysql-6.0 branch (serg:2715 to 2718)
View as plain text  
 2718 Sergei Golubchik	2009-02-17
      .bzr-mysql/default.conf -> 6.0 tree
modified:
  .bzr-mysql/default.conf

 2717 Sergei Golubchik	2009-02-17 [merge]
      merged
modified:
  mysql-test/suite/rpl_ndb/t/disabled.def

 2716 Sergei Golubchik	2009-02-17
      disabled failing tests
added:
  mysql-test/suite/maria/t/disabled.def
modified:
  mysql-test/suite/backup/t/disabled.def

 2715 Sergei Golubchik	2009-02-16 [merge]
      merge with 6.0
removed:
  cmd-line-utils/libedit/TEST/
  cmd-line-utils/libedit/TEST/test.c
  cmd-line-utils/libedit/compat.h
  cmd-line-utils/libedit/compat_conf.h
  cmd-line-utils/libedit/editline.3
  cmd-line-utils/libedit/editrc.5
  cmd-line-utils/libedit/fgetln.c
  cmd-line-utils/libedit/fgetln.h
  cmd-line-utils/libedit/libedit_term.h
  cmd-line-utils/libedit/strlcpy.c
  cmd-line-utils/libedit/strlcpy.h
  cmd-line-utils/libedit/tokenizer.h
  cmd-line-utils/libedit/unvis.c
  cmd-line-utils/libedit/vis.c
  cmd-line-utils/libedit/vis.h
  mysql-test/include/wait_for_query_to_suceed.inc
  mysql-test/suite/falcon_team/r/falcon_bug_29246.result
  mysql-test/suite/falcon_team/t/falcon_bug_29246.test
  storage/falcon/Blob.cpp
  storage/falcon/Page.cpp
  storage/falcon/SyncWait.cpp
added:
  cmd-line-utils/libedit/README
  cmd-line-utils/libedit/filecomplete.c
  cmd-line-utils/libedit/filecomplete.h
  mysql-test/extra/binlog_tests/binlog_truncate.test
  mysql-test/include/count_sessions.inc
  mysql-test/include/wait_for_query_to_succeed.inc
  mysql-test/include/wait_until_count_sessions.inc
  mysql-test/r/innodb_ignore_builtin.result
  mysql-test/suite/binlog/r/binlog_truncate_innodb.result
  mysql-test/suite/binlog/r/binlog_truncate_myisam.result
  mysql-test/suite/binlog/t/binlog_truncate_innodb-master.opt
  mysql-test/suite/binlog/t/binlog_truncate_innodb.test
  mysql-test/suite/binlog/t/binlog_truncate_myisam.test
  mysql-test/suite/falcon/r/falcon_bug_26433-big.result
  mysql-test/suite/falcon/r/falcon_bug_33148.result
  mysql-test/suite/falcon/r/falcon_bug_33720.result
  mysql-test/suite/falcon/r/falcon_bug_35257.result
  mysql-test/suite/falcon/r/falcon_bug_36186.result
  mysql-test/suite/falcon/r/falcon_bug_40607.result
  mysql-test/suite/falcon/r/falcon_bug_40801.result
  mysql-test/suite/falcon/r/falcon_bug_41548.result
  mysql-test/suite/falcon/r/falcon_bug_41582.result
  mysql-test/suite/falcon/r/falcon_bug_41688.result
  mysql-test/suite/falcon/r/falcon_bug_42069.result
  mysql-test/suite/falcon/r/falcon_bug_42196.result
  mysql-test/suite/falcon/r/falcon_ps_repeatable_read.result
  mysql-test/suite/falcon/t/falcon_bug_26433-big.test
  mysql-test/suite/falcon/t/falcon_bug_33148.test
  mysql-test/suite/falcon/t/falcon_bug_33720.test
  mysql-test/suite/falcon/t/falcon_bug_35257.test
  mysql-test/suite/falcon/t/falcon_bug_36186.test
  mysql-test/suite/falcon/t/falcon_bug_40607.test
  mysql-test/suite/falcon/t/falcon_bug_40801.test
  mysql-test/suite/falcon/t/falcon_bug_41548.test
  mysql-test/suite/falcon/t/falcon_bug_41582.test
  mysql-test/suite/falcon/t/falcon_bug_41688.test
  mysql-test/suite/falcon/t/falcon_bug_42069.test
  mysql-test/suite/falcon/t/falcon_bug_42196.test
  mysql-test/suite/falcon/t/falcon_ps_repeatable_read.test
  mysql-test/suite/rpl/r/rpl_drop_if_exists.result
  mysql-test/suite/rpl/t/rpl_drop_if_exists.test
  mysql-test/t/innodb_ignore_builtin-master.opt
  mysql-test/t/innodb_ignore_builtin.test
  storage/falcon/SRLInventoryPage.cpp
  storage/falcon/SRLInventoryPage.h
renamed:
  mysql-test/suite/falcon/r/falcon_bug_34351_C.result => mysql-test/suite/falcon_team/r/falcon_bug_34351_C.result
  mysql-test/suite/falcon/t/falcon_bug_34351_C.test => mysql-test/suite/falcon_team/t/falcon_bug_34351_C.test
  mysql-test/suite/falcon_team/r/falcon_bug_26433.result => mysql-test/suite/falcon/r/falcon_bug_26433.result
  mysql-test/suite/falcon_team/r/falcon_bug_28048.result => mysql-test/suite/falcon/r/falcon_bug_28048.result
  mysql-test/suite/falcon_team/r/falcon_bug_31296.result => mysql-test/suite/falcon/r/falcon_bug_31296.result
  mysql-test/suite/falcon_team/r/falcon_bug_36294.result => mysql-test/suite/falcon_team/r/falcon_bug_36294-big.result
  mysql-test/suite/falcon_team/t/falcon_bug_26433.test => mysql-test/suite/falcon/t/falcon_bug_26433.test
  mysql-test/suite/falcon_team/t/falcon_bug_28048.test => mysql-test/suite/falcon/t/falcon_bug_28048.test
  mysql-test/suite/falcon_team/t/falcon_bug_31296.test => mysql-test/suite/falcon/t/falcon_bug_31296.test
  mysql-test/suite/falcon_team/t/falcon_bug_36294.test => mysql-test/suite/falcon_team/t/falcon_bug_36294-big.test
  netware/BUILD/nwbootstrap => netware/BUILD/nwbuild
modified:
  .bzr-mysql/default.conf
  BUILD/Makefile.am
  Docs/Makefile.am
  Makefile.am
  client/Makefile.am
  client/mysql.cc
  client/mysqldump.c
  cmd-line-utils/Makefile.am
  cmd-line-utils/libedit/Makefile.am
  cmd-line-utils/libedit/chared.c
  cmd-line-utils/libedit/chared.h
  cmd-line-utils/libedit/common.c
  cmd-line-utils/libedit/config.h
  cmd-line-utils/libedit/el.c
  cmd-line-utils/libedit/el.h
  cmd-line-utils/libedit/el_term.h
  cmd-line-utils/libedit/emacs.c
  cmd-line-utils/libedit/hist.c
  cmd-line-utils/libedit/histedit.h
  cmd-line-utils/libedit/history.c
  cmd-line-utils/libedit/key.c
  cmd-line-utils/libedit/key.h
  cmd-line-utils/libedit/makelist.sh
  cmd-line-utils/libedit/map.c
  cmd-line-utils/libedit/np/fgetln.c
  cmd-line-utils/libedit/np/strlcat.c
  cmd-line-utils/libedit/np/strlcpy.c
  cmd-line-utils/libedit/np/unvis.c
  cmd-line-utils/libedit/np/vis.c
  cmd-line-utils/libedit/np/vis.h
  cmd-line-utils/libedit/parse.c
  cmd-line-utils/libedit/parse.h
  cmd-line-utils/libedit/prompt.c
  cmd-line-utils/libedit/read.c
  cmd-line-utils/libedit/read.h
  cmd-line-utils/libedit/readline.c
  cmd-line-utils/libedit/readline/readline.h
  cmd-line-utils/libedit/refresh.c
  cmd-line-utils/libedit/search.c
  cmd-line-utils/libedit/sig.c
  cmd-line-utils/libedit/sig.h
  cmd-line-utils/libedit/sys.h
  cmd-line-utils/libedit/term.c
  cmd-line-utils/libedit/tokenizer.c
  cmd-line-utils/libedit/tty.c
  cmd-line-utils/libedit/tty.h
  cmd-line-utils/libedit/vi.c
  cmd-line-utils/readline/Makefile.am
  config/ac-macros/alloca.m4
  config/ac-macros/check_cpu.m4
  config/ac-macros/compiler_flag.m4
  config/ac-macros/ha_ndbcluster.m4
  config/ac-macros/large_file.m4
  config/ac-macros/libevent_configure.m4
  config/ac-macros/misc.m4
  config/ac-macros/readline.m4
  config/ac-macros/ssl.m4
  config/ac-macros/zlib.m4
  configure.in
  dbug/Makefile.am
  extra/CMakeLists.txt*
  extra/Makefile.am
  extra/comp_err.c
  extra/libevent/Makefile.am
  extra/yassl/Makefile.am
  extra/yassl/src/Makefile.am
  extra/yassl/taocrypt/Makefile.am
  extra/yassl/taocrypt/benchmark/Makefile.am
  extra/yassl/taocrypt/src/Makefile.am
  extra/yassl/taocrypt/test/Makefile.am
  extra/yassl/testsuite/Makefile.am
  include/Makefile.am
  include/config-win.h
  include/my_global.h
  include/thr_lock.h
  libmysql/Makefile.am
  libmysql_r/Makefile.am
  libmysqld/Makefile.am
  libmysqld/examples/Makefile.am
  man/Makefile.am
  mysql-test/Makefile.am
  mysql-test/extra/rpl_tests/rpl_extraMaster_Col.test
  mysql-test/extra/rpl_tests/rpl_truncate.test
  mysql-test/extra/rpl_tests/rpl_truncate_helper.test
  mysql-test/include/ps_modify.inc
  mysql-test/include/wait_until_connected_again.inc
  mysql-test/lib/My/SafeProcess/safe_process_win.cc
  mysql-test/lib/mtr_cases.pm
  mysql-test/lib/mtr_report.pm
  mysql-test/lib/mtr_unique.pm
  mysql-test/lib/v1/mtr_report.pl
  mysql-test/lib/v1/mysql-test-run.pl
  mysql-test/mysql-test-run.pl
  mysql-test/r/auto_increment.result
  mysql-test/r/commit_1innodb.result
  mysql-test/r/csv_not_null.result
  mysql-test/r/delayed.result
  mysql-test/r/func_misc.result
  mysql-test/r/func_sapdb.result
  mysql-test/r/grant.result
  mysql-test/r/grant2.result
  mysql-test/r/information_schema.result
  mysql-test/r/innodb_mysql.result
  mysql-test/r/join_cache.result
  mysql-test/r/kill.result
  mysql-test/r/lock_tables_lost_commit.result
  mysql-test/r/lowercase_table.result
  mysql-test/r/merge.result
  mysql-test/r/multi_update.result
  mysql-test/r/myisampack.result
  mysql-test/r/mysqlbinlog_row_trans.result
  mysql-test/r/mysqldump.result
  mysql-test/r/null.result
  mysql-test/r/outfile.result
  mysql-test/r/packet.result
  mysql-test/r/partition_hash.result
  mysql-test/r/partition_pruning.result
  mysql-test/r/ps_2myisam.result
  mysql-test/r/ps_3innodb.result
  mysql-test/r/ps_4heap.result
  mysql-test/r/ps_5merge.result
  mysql-test/r/query_cache_notembedded.result
  mysql-test/r/read_only.result
  mysql-test/r/status.result
  mysql-test/r/subselect.result
  mysql-test/r/subselect_no_mat.result
  mysql-test/r/subselect_no_opts.result
  mysql-test/r/subselect_no_semijoin.result
  mysql-test/r/synchronization.result
  mysql-test/r/warnings.result
  mysql-test/r/windows.result
  mysql-test/r/xml.result
  mysql-test/suite/backup/r/backup_tablespace.result
  mysql-test/suite/backup_engines/t/disabled.def
  mysql-test/suite/binlog/r/binlog_row_mix_innodb_myisam.result
  mysql-test/suite/binlog/r/binlog_stm_mix_innodb_myisam.result
  mysql-test/suite/binlog/r/binlog_unsafe.result
  mysql-test/suite/binlog/t/binlog_unsafe.test
  mysql-test/suite/binlog/t/disabled.def
  mysql-test/suite/falcon/r/falcon_bug_29246.result
  mysql-test/suite/falcon/r/falcon_bug_32398.result
  mysql-test/suite/falcon/r/falcon_bug_32833.result
  mysql-test/suite/falcon/r/falcon_online_index.result
  mysql-test/suite/falcon/r/falcon_options.result
  mysql-test/suite/falcon/r/falcon_options2.result
  mysql-test/suite/falcon/r/ps_8falcon.result
  mysql-test/suite/falcon/t/disabled.def
  mysql-test/suite/falcon/t/falcon_bug.template
  mysql-test/suite/falcon/t/falcon_bug_22180.test
  mysql-test/suite/falcon/t/falcon_bug_23689.test
  mysql-test/suite/falcon/t/falcon_bug_24511.test
  mysql-test/suite/falcon/t/falcon_bug_24858.test
  mysql-test/suite/falcon/t/falcon_bug_24921.test
  mysql-test/suite/falcon/t/falcon_bug_25555.test
  mysql-test/suite/falcon/t/falcon_bug_26057.test
  mysql-test/suite/falcon/t/falcon_bug_27697.test
  mysql-test/suite/falcon/t/falcon_bug_29246.test
  mysql-test/suite/falcon/t/falcon_bug_30480_A.test
  mysql-test/suite/falcon/t/falcon_bug_30480_B.test
  mysql-test/suite/falcon/t/falcon_bug_32398.test
  mysql-test/suite/falcon/t/falcon_bug_32833.test
  mysql-test/suite/falcon/t/falcon_bug_38304.test
  mysql-test/suite/falcon/t/falcon_bug_39708.test
  mysql-test/suite/falcon/t/falcon_bugs.test
  mysql-test/suite/falcon/t/falcon_bugs2.test
  mysql-test/suite/falcon/t/falcon_deadlock_collection.test
  mysql-test/suite/falcon/t/falcon_online_index.test
  mysql-test/suite/falcon/t/falcon_select.test
  mysql-test/suite/falcon/t/falcon_unicode-big.test
  mysql-test/suite/falcon_team/r/falcon_deadlock.result
  mysql-test/suite/falcon_team/t/falcon_bug_34174.test
  mysql-test/suite/falcon_team/t/falcon_bug_34351_B.test
  mysql-test/suite/falcon_team/t/falcon_deadlock.test
  mysql-test/suite/falcon_team/t/test2bug.def
  mysql-test/suite/funcs_1/r/innodb_trig_09.result
  mysql-test/suite/funcs_1/r/memory_trig_09.result
  mysql-test/suite/funcs_1/r/myisam_trig_09.result
  mysql-test/suite/funcs_1/r/ndb_trig_09.result
  mysql-test/suite/funcs_1/triggers/triggers_09.inc
  mysql-test/suite/maria/r/ps_maria.result
  mysql-test/suite/ndb/r/ndb_read_multi_range.result
  mysql-test/suite/ndb/r/ps_7ndb.result
  mysql-test/suite/ndb/t/disabled.def
  mysql-test/suite/ndb/t/ndb_read_multi_range.test
  mysql-test/suite/parts/inc/partition_auto_increment.inc
  mysql-test/suite/parts/r/partition_auto_increment_blackhole.result
  mysql-test/suite/parts/r/partition_auto_increment_falcon.result
  mysql-test/suite/parts/r/partition_auto_increment_innodb.result
  mysql-test/suite/parts/r/partition_auto_increment_maria.result
  mysql-test/suite/parts/r/partition_auto_increment_memory.result
  mysql-test/suite/parts/r/partition_auto_increment_myisam.result
  mysql-test/suite/parts/r/partition_auto_increment_ndb.result
  mysql-test/suite/rpl/r/rpl_extraColmaster_falcon.result
  mysql-test/suite/rpl/r/rpl_extraColmaster_innodb.result
  mysql-test/suite/rpl/r/rpl_extraColmaster_myisam.result
  mysql-test/suite/rpl/r/rpl_innodb_mixed_dml.result
  mysql-test/suite/rpl/r/rpl_sp.result
  mysql-test/suite/rpl/r/rpl_truncate_2myisam.result
  mysql-test/suite/rpl/r/rpl_truncate_3innodb.result
  mysql-test/suite/rpl/r/rpl_truncate_falcon.result
  mysql-test/suite/rpl/t/disabled.def
  mysql-test/suite/rpl/t/rpl_err_ignoredtable.test
  mysql-test/suite/rpl/t/rpl_heartbeat.test
  mysql-test/suite/rpl/t/rpl_truncate_falcon.test
  mysql-test/suite/rpl_ndb_big/t/disabled.def
  mysql-test/suite/sys_vars/t/disabled.def
  mysql-test/t/auto_increment.test
  mysql-test/t/check.test
  mysql-test/t/compress.test
  mysql-test/t/csv_not_null.test
  mysql-test/t/delayed.test
  mysql-test/t/disabled.def
  mysql-test/t/func_misc.test
  mysql-test/t/func_sapdb.test
  mysql-test/t/grant.test
  mysql-test/t/grant2.test
  mysql-test/t/grant3.test
  mysql-test/t/information_schema.test
  mysql-test/t/innodb_mysql.test
  mysql-test/t/join_cache.test
  mysql-test/t/kill.test
  mysql-test/t/lock_tables_lost_commit.test
  mysql-test/t/lowercase_table.test
  mysql-test/t/merge.test
  mysql-test/t/multi_update.test
  mysql-test/t/myisampack.test
  mysql-test/t/mysqldump.test
  mysql-test/t/null.test
  mysql-test/t/openssl_1.test
  mysql-test/t/outfile.test
  mysql-test/t/overflow.test
  mysql-test/t/packet.test
  mysql-test/t/partition_pruning.test
  mysql-test/t/query_cache_notembedded.test
  mysql-test/t/read_only.test
  mysql-test/t/rename.test
  mysql-test/t/sp-threads.test
  mysql-test/t/status.test
  mysql-test/t/subselect.test
  mysql-test/t/synchronization.test
  mysql-test/t/timezone_grant.test
  mysql-test/t/warnings.test
  mysql-test/t/windows.test
  mysql-test/t/xml.test
  mysys/Makefile.am
  mysys/tests/Makefile.am
  mysys/thr_lock.c
  netware/Makefile.am
  plugin/Makefile.am
  plugin/audit_null/Makefile.am
  plugin/daemon_example/Makefile.am
  plugin/fulltext/Makefile.am
  pstack/Makefile.am
  pstack/aout/Makefile.am
  regex/Makefile.am
  scripts/Makefile.am
  scripts/make_win_bin_dist
  scripts/mysqld_multi.sh
  sql-bench/Makefile.am
  sql-bench/bench-init.pl.sh
  sql-common/Makefile.am
  sql/CMakeLists.txt
  sql/Makefile.am
  sql/backup/Makefile.am
  sql/item.cc
  sql/item_timefunc.h
  sql/item_xmlfunc.cc
  sql/lock.cc
  sql/log_event.cc
  sql/mysql_priv.h
  sql/mysqld.cc
  sql/set_var.cc
  sql/share/Makefile.am
  sql/share/errmsg.txt
  sql/sql_base.cc
  sql/sql_class.cc
  sql/sql_db.cc
  sql/sql_delete.cc
  sql/sql_insert.cc
  sql/sql_join_cache.cc
  sql/sql_parse.cc
  sql/sql_partition.cc
  sql/sql_plugin.cc
  sql/sql_select.cc
  sql/sql_select.h
  sql/sql_show.cc
  sql/sql_show.h
  sql/sql_tablespace.cc
  sql/sql_update.cc
  sql/sql_yacc.yy
  sql/table.cc
  storage/Makefile.am
  storage/archive/Makefile.am
  storage/blackhole/Makefile.am
  storage/csv/Makefile.am
  storage/example/Makefile.am
  storage/falcon/BackLog.cpp
  storage/falcon/CMakeLists.txt
  storage/falcon/Connection.cpp
  storage/falcon/DataPage.h
  storage/falcon/Database.cpp
  storage/falcon/Database.h
  storage/falcon/DateTime.cpp
  storage/falcon/Dbb.cpp
  storage/falcon/Dbb.h
  storage/falcon/DeferredIndex.cpp
  storage/falcon/DeferredIndex.h
  storage/falcon/Engine.h
  storage/falcon/IO.cpp
  storage/falcon/Index.cpp
  storage/falcon/Interlock.h
  storage/falcon/Makefile.am
  storage/falcon/MemMgr.cpp
  storage/falcon/MemMgr.h
  storage/falcon/PageInventoryPage.cpp
  storage/falcon/Record.cpp
  storage/falcon/Record.h
  storage/falcon/RecordGroup.cpp
  storage/falcon/RecordGroup.h
  storage/falcon/RecordLeaf.cpp
  storage/falcon/RecordLeaf.h
  storage/falcon/RecordLocatorPage.cpp
  storage/falcon/RecordScavenge.cpp
  storage/falcon/RecordScavenge.h
  storage/falcon/RecordSection.cpp
  storage/falcon/RecordSection.h
  storage/falcon/RecordVersion.cpp
  storage/falcon/RecordVersion.h
  storage/falcon/SRLCommit.cpp
  storage/falcon/SRLCreateTableSpace.cpp
  storage/falcon/SRLPrepare.cpp
  storage/falcon/SRLRollback.cpp
  storage/falcon/SRLRollback.h
  storage/falcon/SRLUpdateIndex.cpp
  storage/falcon/SRLUpdateRecords.cpp
  storage/falcon/SRLUpdateRecords.h
  storage/falcon/SRLVersion.h
  storage/falcon/Scavenger.cpp
  storage/falcon/Section.cpp
  storage/falcon/SerialLog.cpp
  storage/falcon/SerialLogControl.cpp
  storage/falcon/SerialLogControl.h
  storage/falcon/SerialLogRecord.cpp
  storage/falcon/SerialLogRecord.h
  storage/falcon/SparseArray.h
  storage/falcon/Statement.cpp
  storage/falcon/StorageDatabase.cpp
  storage/falcon/StorageDatabase.h
  storage/falcon/StorageHandler.cpp
  storage/falcon/StorageParameters.h
  storage/falcon/StorageTable.cpp
  storage/falcon/StorageTable.h
  storage/falcon/StorageTableShare.cpp
  storage/falcon/StorageTableShare.h
  storage/falcon/StorageVersion.h
  storage/falcon/SyncObject.cpp
  storage/falcon/SyncObject.h
  storage/falcon/Table.cpp
  storage/falcon/Table.h
  storage/falcon/TableSpaceManager.cpp
  storage/falcon/TableSpaceManager.h
  storage/falcon/Thread.cpp
  storage/falcon/Thread.h
  storage/falcon/Transaction.cpp
  storage/falcon/Transaction.h
  storage/falcon/TransactionManager.cpp
  storage/falcon/TransactionManager.h
  storage/falcon/TransformLib/Makefile.am
  storage/falcon/ha_falcon.cpp
  storage/falcon/ha_falcon.h
  storage/falcon/plug.in
  storage/federated/Makefile.am
  storage/heap/Makefile.am
  storage/innobase/Makefile.am
  storage/innobase/handler/ha_innodb.cc
  storage/maria/Makefile.am
  storage/maria/plug.in
  storage/maria/unittest/Makefile.am
  storage/myisam/Makefile.am
  storage/myisam/myisampack.c
  storage/myisammrg/Makefile.am
  storage/myisammrg/myrg_open.c
  storage/ndb/Makefile.am
  storage/ndb/config/common.mk.am
  storage/ndb/config/type_kernel.mk.am
  storage/ndb/config/type_mgmapiclient.mk.am
  storage/ndb/config/type_ndbapi.mk.am
  storage/ndb/config/type_ndbapiclient.mk.am
  storage/ndb/config/type_ndbapitest.mk.am
  storage/ndb/config/type_ndbapitools.mk.am
  storage/ndb/config/type_util.mk.am
  storage/ndb/docs/Makefile.am
  storage/ndb/include/Makefile.am
  storage/ndb/src/Makefile.am
  storage/ndb/src/common/Makefile.am
  storage/ndb/src/common/debugger/Makefile.am
  storage/ndb/src/common/debugger/signaldata/Makefile.am
  storage/ndb/src/common/logger/Makefile.am
  storage/ndb/src/common/mgmcommon/Makefile.am
  storage/ndb/src/common/portlib/Makefile.am
  storage/ndb/src/common/transporter/Makefile.am
  storage/ndb/src/common/transporter/TransporterRegistry.cpp
  storage/ndb/src/common/util/Makefile.am
  storage/ndb/src/cw/Makefile.am
  storage/ndb/src/cw/cpcd/Makefile.am
  storage/ndb/src/kernel/Makefile.am
  storage/ndb/src/kernel/blocks/Makefile.am
  storage/ndb/src/kernel/blocks/backup/Makefile.am
  storage/ndb/src/kernel/blocks/dbdict/Makefile.am
  storage/ndb/src/kernel/blocks/dbdih/Makefile.am
  storage/ndb/src/kernel/blocks/dblqh/Makefile.am
  storage/ndb/src/kernel/blocks/dbtup/Makefile.am
  storage/ndb/src/kernel/error/Makefile.am
  storage/ndb/src/kernel/vm/Makefile.am
  storage/ndb/src/mgmapi/Makefile.am
  storage/ndb/src/mgmclient/Makefile.am
  storage/ndb/src/mgmsrv/Makefile.am
  storage/ndb/src/ndbapi/Makefile.am
  storage/ndb/swig/Makefile.am
  storage/ndb/test/Makefile.am
  storage/ndb/test/ndbapi/Makefile.am
  storage/ndb/test/ndbapi/bank/Makefile.am
  storage/ndb/test/run-test/Makefile.am
  storage/ndb/test/src/Makefile.am
  storage/ndb/test/tools/Makefile.am
  storage/ndb/tools/Makefile.am
  strings/Makefile.am
  support-files/MacOSX/Makefile.am
  support-files/Makefile.am
  support-files/RHEL4-SElinux/Makefile.am
  tests/Makefile.am
  tests/mysql_client_test.c
  unittest/Makefile.am
  unittest/examples/Makefile.am
  unittest/mysys/Makefile.am
  unittest/mytap/Makefile.am
  unittest/mytap/t/Makefile.am
  vio/Makefile.am
  win/Makefile.am
  win/configure.js
  zlib/Makefile.am
  mysql-test/suite/falcon/r/falcon_bug_26433.result
  mysql-test/suite/falcon/t/falcon_bug_26433.test
  mysql-test/suite/falcon_team/t/falcon_bug_36294-big.test
  netware/BUILD/nwbuild

=== modified file '.bzrignore'
--- a/.bzrignore	2009-02-05 12:49:39 +0000
+++ b/.bzrignore	2009-02-13 16:30:54 +0000
@@ -2010,4 +2010,5 @@ libmysql/probes.h
 libmysql_r/probes.h
 unittest/tmp
 libmysqld/sql_join_cache.cc
+storage/maria/maria_non_trans_log
 libmysqld/examples/mysqltest.cc

=== modified file 'Makefile.am'
--- a/Makefile.am	2009-02-11 12:11:20 +0000
+++ b/Makefile.am	2009-02-16 21:18:45 +0000
@@ -132,10 +132,6 @@ smoke:
 	cd mysql-test ; \
 	    @PERL@ ./mysql-test-run.pl --do-test=s
 
-smoke:
-	cd mysql-test ; \
-	    @PERL@ ./mysql-test-run.pl --do-test=s
-
 test-full:	test test-nr test-ps
 
 test-force:

=== modified file 'client/mysqltest.cc'
--- a/client/mysqltest.cc	2009-02-12 17:56:03 +0000
+++ b/client/mysqltest.cc	2009-02-16 21:18:45 +0000
@@ -5912,7 +5912,6 @@ int parse_args(int argc, char **argv)
   if (debug_check_flag)
     my_end_arg= MY_CHECK_ERROR;
 
-
   if (!record)
   {
     /* Check that the result file exists */

=== modified file 'configure.in'
--- a/configure.in	2009-02-11 12:11:20 +0000
+++ b/configure.in	2009-02-16 21:18:45 +0000
@@ -835,7 +835,7 @@ AC_HEADER_STDC
 AC_HEADER_SYS_WAIT
 AC_CHECK_HEADERS(fcntl.h fenv.h float.h floatingpoint.h fpu_control.h ieeefp.h \
  limits.h memory.h pwd.h select.h \
- stdlib.h stddef.h sys/fpu.h sys/stat.h \
+ stdlib.h stddef.h sys/fpu.h sys/stat.h fnmatch.h \
  strings.h string.h synch.h sys/mman.h sys/socket.h netinet/in.h arpa/inet.h \
  sys/timeb.h sys/types.h sys/un.h sys/vadvise.h sys/wait.h term.h \
  unistd.h utime.h sys/utime.h termio.h termios.h sched.h crypt.h alloca.h \

=== modified file 'dbug/dbug.c'
--- a/dbug/dbug.c	2008-12-24 10:48:24 +0000
+++ b/dbug/dbug.c	2009-02-13 16:30:54 +0000
@@ -75,7 +75,7 @@
  *        (the logic is - think of a call stack as of a path.
  *        "function" means only this function, "function/" means the hierarchy.
  *        in the future, filters like function1/function2 could be supported.
- *        wildcards are a natural extension too: * and ?)
+ *        following this logic glob(7) wildcards are supported.)
  *
  */
 
@@ -88,6 +88,13 @@
 #include <my_global.h>
 #include <m_string.h>
 #include <errno.h>
+
+#ifdef HAVE_FNMATCH_H
+#include <fnmatch.h>
+#else
+#define fnmatch(A,B,C) strcmp(A,B)
+#endif
+
 #if defined(MSDOS) || defined(__WIN__)
 #include <process.h>
 #endif
@@ -1477,7 +1484,9 @@ next:
     {
       if (!strncmp((*cur)->str, start, len))
       {
-        if (todo == EXCLUDE)
+        if ((*cur)->flags & todo)  /* same action ? */
+          (*cur)->flags|= subdir;  /* just merge the SUBDIR flag */
+        else if (todo == EXCLUDE)
         {
           struct link *delme=*cur;
           *cur=(*cur)->next_link;
@@ -1571,7 +1580,7 @@ static int InList(struct link *linkp, co
 
   for (result=MATCHED; linkp != NULL; linkp= linkp->next_link)
   {
-    if (!strcmp(linkp->str, cp))
+    if (!fnmatch(linkp->str, cp, 0))
       return linkp->flags;
     if (!(linkp->flags & EXCLUDE))
       result=NOT_MATCHED;

=== modified file 'dbug/tests-t.pl'
--- a/dbug/tests-t.pl	2008-04-16 14:31:38 +0000
+++ b/dbug/tests-t.pl	2009-02-10 18:13:24 +0000
@@ -483,3 +483,14 @@ func2: info: s=ko
 | | <func3
 | <func2
 <main
+# repeated keyword
+% ./tests d:-d,info,info
+=> execute
+=> evaluate: ON
+=> evaluate_if: OFF
+main: explain: dbug explained: d:-d,info
+% ./tests d:-d,info/,info
+=> execute
+=> evaluate: ON
+=> evaluate_if: OFF
+main: explain: dbug explained: d:-d,info/

=== modified file 'dbug/user.r'
--- a/dbug/user.r	2008-04-28 16:24:05 +0000
+++ b/dbug/user.r	2009-02-11 18:57:38 +0000
@@ -941,6 +941,9 @@ a complete log file in case of crashes. 
 multi-threaded environment.
 .LI d[,keywords]
 Enable output from macros with specified keywords.
+Every keyword can be a
+.I glob(7)
+pattern.
 An empty list of keywords implies that all keywords are selected.
 .LI D[,time]
 Delay for specified time after each output line, to let output drain.
@@ -948,6 +951,9 @@ Time is given in tenths of a second (val
 Default is zero.
 .LI f[,functions]
 Limit debugger actions to the specified list of functions.
+Every function can be a
+.I glob(7)
+pattern.
 An empty list of functions implies that all functions are selected.
 Every function in the list may optionally be followed by a '/' -
 this will implicitly select all the functions down the call stack.
@@ -966,6 +972,9 @@ Mark each debugger output line with the 
 current process.
 .LI g,[functions]
 Enable profiling for the specified list of functions.
+Every function can be a
+.I glob(7)
+pattern.
 An empty list of functions enables profiling for all functions.
 See
 .B PROFILING\ WITH\ DBUG
@@ -984,7 +993,11 @@ Like 'a[,file]' but overwrite old file, 
 .LI O[,file]
 Like 'A[,file]' but overwrite old file, do not append.
 .LI p[,processes]
-Limit debugger actions to the specified processes.  An empty list
+Limit debugger actions to the specified processes.
+Every name can be a
+.I glob(7)
+pattern.
+An empty list
 implies all processes.  This is useful for processes which run child
 processes.  Note that each debugger output line can be marked with the
 name of the current process via the 'P' flag.  The process name must

=== modified file 'include/atomic/generic-msvc.h'
--- a/include/atomic/generic-msvc.h	2008-10-20 09:16:47 +0000
+++ b/include/atomic/generic-msvc.h	2009-02-13 16:30:54 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -56,11 +56,11 @@ C_MODE_END
 #endif /*_M_IX86*/
 
 #define MY_ATOMIC_MODE "msvc-intrinsics"
-#define IL_EXCHG_ADD32(X,Y) InterlockedExchangeAdd((volatile LONG *)(X),(Y))
-#define IL_COMP_EXCHG32(X,Y,Z) InterlockedCompareExchange((volatile LONG *)(X),(Y),(Z))
-#define IL_COMP_EXCHGptr InterlockedCompareExchangePointer
-#define IL_EXCHG32       InterlockedExchange
-#define IL_EXCHGptr      InterlockedExchangePointer
+#define IL_EXCHG_ADD32(X,Y)     InterlockedExchangeAdd((volatile LONG *)(X),(Y))
+#define IL_COMP_EXCHG32(X,Y,Z)  InterlockedCompareExchange((volatile LONG *)(X),(Y),(Z))
+#define IL_COMP_EXCHGptr        InterlockedCompareExchangePointer
+#define IL_EXCHG32(X,Y)         InterlockedExchange((volatile LONG *)(X),(Y))
+#define IL_EXCHGptr             InterlockedExchangePointer
 #define make_atomic_add_body(S) \
   v= IL_EXCHG_ADD ## S (a, v)
 #define make_atomic_cas_body(S)                                 \

=== modified file 'include/lf.h'
--- a/include/lf.h	2009-01-27 02:08:48 +0000
+++ b/include/lf.h	2009-02-13 16:30:54 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007 MySQL AB
+/* Copyright (C) 2007-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -187,8 +187,8 @@ typedef struct st_lf_allocator {
   uchar * volatile top;
   uint element_size;
   uint32 volatile mallocs;
-  void (*constructor)(uchar *);
-  void (*destructor)(uchar *);
+  void (*constructor)(uchar *); /* called, when an object is malloc()'ed */
+  void (*destructor)(uchar *);  /* called, when an object is free()'d    */
 } LF_ALLOCATOR;
 
 void lf_alloc_init(LF_ALLOCATOR *allocator, uint size, uint free_ptr_offset);
@@ -219,7 +219,7 @@ lock_wrap(lf_alloc_new, void *,
 #define LF_HASH_UNIQUE 1
 
 /* lf_hash overhead per element (that is, sizeof(LF_SLIST) */
-#define LF_HASH_OVERHEAD (sizeof(int*)*4)
+extern const int LF_HASH_OVERHEAD;
 
 typedef struct {
   LF_DYNARRAY array;                    /* hash itself */

=== modified file 'include/maria.h'
--- a/include/maria.h	2008-10-14 21:23:33 +0000
+++ b/include/maria.h	2009-02-13 16:30:54 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -14,7 +14,10 @@
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
 
-/* This file should be included when using maria functions */
+/**
+  @file
+  This file should be included when using maria functions.
+*/
 
 #ifndef _maria_h
 #define _maria_h
@@ -33,6 +36,8 @@ extern "C" {
 #include <myisamchk.h>
 #include <mysql/plugin.h>
 
+#define MARIA_CANNOT_ROLLBACK
+
 /*
   Limit max keys according to HA_MAX_POSSIBLE_KEY; See myisamchk.h for details
 */
@@ -109,6 +114,7 @@ extern "C" {
 
 typedef ulonglong MARIA_RECORD_POS;
 
+/** Information local to the table's instance */
 typedef struct st_maria_info
 {
   ha_rows records;			/* Records in database */
@@ -189,7 +195,7 @@ typedef struct st_maria_keydef          
   uint16 maxlength;                     /* max length of (packed) key (auto) */
   uint32 write_comp_flag;		/* compare flag for write key (auto) */
   uint32 version;                       /* For concurrent read/write */
-  uint32 ftparser_nr;                   /* distinct ftparser number */
+  uint32 ftkey_nr;                      /* full-text index number */
 
   HA_KEYSEG *seg, *end;
   struct st_mysql_ftparser *parser;     /* Fulltext [pre]parser */
@@ -264,6 +270,8 @@ typedef struct st_maria_columndef		/* co
 } MARIA_COLUMNDEF;
 
 
+/** Physical logging is always compiled in. Undefine if want to benchmark */
+#define HAVE_MARIA_PHYSICAL_LOGGING 1
 extern ulong maria_block_size, maria_checkpoint_frequency;
 extern ulong maria_concurrent_insert;
 extern my_bool maria_flush, maria_single_user, maria_page_checksums;
@@ -323,6 +331,14 @@ extern int maria_extra(MARIA_HA *file,
 extern int maria_reset(MARIA_HA *file);
 extern ha_rows maria_records_in_range(MARIA_HA *info, int inx,
 				      key_range *min_key, key_range *max_key);
+/** Open/close actions allowed on a Maria physical log */
+enum enum_ma_log_action
+{
+  MA_LOG_ACTION_OPEN,
+  MA_LOG_ACTION_CLOSE_CONSISTENT, MA_LOG_ACTION_CLOSE_INCONSISTENT
+};
+extern int ma_log(enum enum_ma_log_action action,
+                  const char *log_filename, const HASH *tables);
 extern int maria_is_changed(MARIA_HA *info);
 extern int maria_delete_all_rows(MARIA_HA *info);
 extern uint maria_get_pointer_length(ulonglong file_length, uint def);

=== modified file 'include/my_pthread.h'
--- a/include/my_pthread.h	2009-01-26 19:15:24 +0000
+++ b/include/my_pthread.h	2009-02-13 16:30:54 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2000 MySQL AB
+/* Copyright (C) 2000-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -380,19 +380,19 @@ int my_pthread_mutex_trylock(pthread_mut
 
 /* adapt for two different flavors of struct timespec */
 #ifdef HAVE_TIMESPEC_TS_SEC
-#define TV_sec  ts_sec
-#define TV_nsec ts_nsec
+#define MY_tv_sec  ts_sec
+#define MY_tv_nsec ts_nsec
 #else
-#define TV_sec  tv_sec
-#define TV_nsec tv_nsec
+#define MY_tv_sec  tv_sec
+#define MY_tv_nsec tv_nsec
 #endif /* HAVE_TIMESPEC_TS_SEC */
 
 #ifndef set_timespec_time_nsec
 #define set_timespec_time_nsec(ABSTIME,TIME,NSEC) do {                  \
   ulonglong nsec= (NSEC);                                               \
   ulonglong now= (TIME) + (nsec/100);                                   \
-  (ABSTIME).TV_sec=  (now / ULL(10000000));                             \
-  (ABSTIME).TV_nsec= (now % ULL(10000000) * 100 + (nsec % 100));        \
+  (ABSTIME).MY_tv_sec=  (now / ULL(10000000));                          \
+  (ABSTIME).MY_tv_nsec= (now % ULL(10000000) * 100 + (nsec % 100));     \
 } while(0)
 #endif /* !set_timespec_time_nsec */
 

=== modified file 'include/myisam.h'
--- a/include/myisam.h	2009-01-30 14:13:39 +0000
+++ b/include/myisam.h	2009-02-13 16:30:54 +0000
@@ -52,7 +52,6 @@ extern "C" {
   The following defines can be increased if necessary.
   But beware the dependency of MI_MAX_POSSIBLE_KEY_BUFF and MI_MAX_KEY_LENGTH.
 */
-#define MI_MAX_KEY_LENGTH           1332            /* Max length in bytes */
 #define MI_MAX_KEY_SEG              16              /* Max segments for key */
 
 #define MI_MAX_POSSIBLE_KEY_BUFF    HA_MAX_POSSIBLE_KEY_BUFF

=== modified file 'include/waiting_threads.h'
--- a/include/waiting_threads.h	2008-12-17 18:40:14 +0000
+++ b/include/waiting_threads.h	2009-02-13 16:30:54 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2008 MySQL AB
+/* Copyright (C) 2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -24,16 +24,18 @@
 C_MODE_START
 
 typedef struct st_wt_resource_id WT_RESOURCE_ID;
+typedef struct st_wt_resource WT_RESOURCE;
 
 typedef struct st_wt_resource_type {
-  int (*compare)(void *a, void *b);
-  const void *(*make_key)(WT_RESOURCE_ID *id, uint *len);
+  my_bool (*compare)(const void *a, const void *b);
+  const void *(*make_key)(const WT_RESOURCE_ID *id, uint *len); /* not used */
 } WT_RESOURCE_TYPE;
 
 struct st_wt_resource_id {
   ulonglong value;
-  WT_RESOURCE_TYPE *type;
+  const WT_RESOURCE_TYPE *type;
 };
+/* the below differs from sizeof(WT_RESOURCE_ID) by the amount of padding */
 #define sizeof_WT_RESOURCE_ID (sizeof(ulonglong)+sizeof(void*))
 
 #define WT_WAIT_STATS  24
@@ -43,93 +45,17 @@ extern uint32    wt_wait_stats[WT_WAIT_S
 extern uint32    wt_cycle_stats[2][WT_CYCLE_STATS+1];
 extern uint32    wt_success_stats;
 
-/*
-  'lock' protects 'owners', 'state', and 'waiter_count'
-  'id' is read-only
-
-  a resource is picked up from a hash in a lock-free manner
-  it's returned pinned, so it cannot be freed at once
-  but it may be freed right after the pin is removed
-  to free a resource it should be
-    1. have no owners
-    2. have no waiters
-
-  two ways to access a resource:
-    1. find it in a hash
-       - it's returned pinned.
-        a) take a lock in exclusive mode
-        b) check the state, it should be ACTIVE
-        c) unpin
-    2. by a direct reference
-       - could only used if a resource cannot be freed
-       e.g. accessing a resource by thd->waiting_for is safe,
-       a resource cannot be freed as there's a thread waiting for it
-*/
-typedef struct st_wt_resource {
-  WT_RESOURCE_ID  id;
-  uint            waiter_count;
-  enum { ACTIVE, FREE } state;
-#ifndef DBUG_OFF
-  pthread_mutex_t  *mutex;
-#endif
-  /*
-    before the 'lock' all elements are mutable, after (and including) -
-    immutable in the sense that lf_hash_insert() won't memcpy() over them.
-    See wt_init().
-  */
-#ifdef WT_RWLOCKS_USE_MUTEXES
-  /*
-    we need a special rwlock-like 'lock' to allow readers bypass
-    waiting writers, otherwise readers can deadlock. For example:
-
-      A waits on resource x, owned by B, B waits on resource y, owned
-      by A, we have a cycle (A->x->B->y->A)
-      Both A and B start deadlock detection:
-
-        A locks x                          B locks y
-        A goes deeper                      B goes deeper
-        A locks y                          B locks x
-
-      with mutexes it would deadlock. With rwlocks it won't, as long
-      as both A and B are taking read locks (and they do).
-      But other threads may take write locks. Assume there's
-      C who wants to start waiting on x, and D who wants to start
-      waiting on y.
-
-        A read-locks x                       B read-locks y
-        A goes deeper                        B goes deeper
-     => C write-locks x (to add a new edge)  D write-locks y
-     .. C is blocked                         D is blocked
-        A read-locks y                       B read-locks x
-
-      Now, if a read lock can bypass a pending wrote lock request, we're fine.
-      If it can not, we have a deadlock.
-
-    writer starvation is technically possible, but unlikely, because
-    the contention is expected to be low.
-  */
-  struct {
-    pthread_cond_t   cond;
-    pthread_mutex_t  mutex;
-    uint readers: 16;
-    uint pending_writers: 15;
-    uint write_locked: 1;
-  } lock;
-#else
-  rw_lock_t lock;
-#endif
-  pthread_cond_t   cond;
-  DYNAMIC_ARRAY    owners;
-} WT_RESOURCE;
-
 typedef struct st_wt_thd {
   /*
     XXX
-    there's no protection (mutex) against concurrent access of
-    the dynarray below. it is assumed that a caller will have it
-    automatically (not to protect this array but to protect its
-    own - caller's - data structures, and we'll get it for free.
-    If not, we'll need to add a mutex
+    there's no protection (mutex) against concurrent access of the
+    dynarray below. it is assumed that a caller will have it anyway
+    (not to protect this array but to protect its own - caller's -
+    data structures), and we'll get it for free. A caller needs to
+    ensure that a blocker won't release a resource before a blocked
+    thread starts waiting, which is usually done with a mutex.
+    
+    If the above assumption is wrong, we'll need to add a mutex here.
   */
   DYNAMIC_ARRAY   my_resources;
   /*
@@ -141,8 +67,10 @@ typedef struct st_wt_thd {
   LF_PINS        *pins;
 
   /* pointers to values */
-  ulong *timeout_short, *deadlock_search_depth_short;
-  ulong *timeout_long, *deadlock_search_depth_long;
+  const ulong *timeout_short;
+  const ulong *deadlock_search_depth_short;
+  const ulong *timeout_long;
+  const ulong *deadlock_search_depth_long;
 
   /*
     weight relates to the desirability of a transaction being killed if it's
@@ -169,13 +97,13 @@ typedef struct st_wt_thd {
   */
   ulong volatile weight;
   /*
-    'killed' is indirectly protected by waiting_for->lock -
-    a killed thread needs to clear its 'waiting_for', and thus needs a lock.
+    'killed' is indirectly protected by waiting_for->lock because
+    a killed thread needs to clear its 'waiting_for' and thus needs a lock.
     That is a thread needs an exclusive lock to read 'killed' reliably.
     But other threads may change 'killed' from 0 to 1, a shared
     lock is enough for that.
    */
-  my_bool volatile killed;
+  my_bool killed;
 #ifndef DBUG_OFF
   const char     *name;
 #endif
@@ -185,16 +113,17 @@ typedef struct st_wt_thd {
 #define WT_OK                   0
 #define WT_DEADLOCK             -1
 #define WT_DEPTH_EXCEEDED       -2
+#define WT_FREE_TO_GO           -3
 
 void wt_init(void);
 void wt_end(void);
-void wt_thd_lazy_init(WT_THD *, ulong *, ulong *, ulong *, ulong *);
+void wt_thd_lazy_init(WT_THD *, const ulong *, const ulong *, const ulong *, const ulong *);
 void wt_thd_destroy(WT_THD *);
-int wt_thd_will_wait_for(WT_THD *, WT_THD *, WT_RESOURCE_ID *);
+int wt_thd_will_wait_for(WT_THD *, WT_THD *, const WT_RESOURCE_ID *);
 int wt_thd_cond_timedwait(WT_THD *, pthread_mutex_t *);
-void wt_thd_release(WT_THD *, WT_RESOURCE_ID *);
+void wt_thd_release(WT_THD *, const WT_RESOURCE_ID *);
 #define wt_thd_release_all(THD) wt_thd_release((THD), 0)
-int wt_resource_id_memcmp(void *, void *);
+my_bool wt_resource_id_memcmp(const void *, const void *);
 
 C_MODE_END
 

=== modified file 'mysql-test/Makefile.am'
--- a/mysql-test/Makefile.am	2009-02-12 17:56:03 +0000
+++ b/mysql-test/Makefile.am	2009-02-16 21:18:45 +0000
@@ -96,7 +96,7 @@ TEST_DIRS = t r include std_data std_dat
 	suite/jp suite/jp/t suite/jp/r suite/jp/std_data \
 	suite/manual/t suite/manual/r \
 	suite/rpl suite/rpl/data suite/rpl/include suite/rpl/r \
-	suite/rpl/t \
+	suite/rpl/t suite/maria/t suite/maria/r \
 	suite/stress/include suite/stress/t suite/stress/r \
 	suite/ndb suite/ndb/t suite/ndb/r \
 	suite/ndb_binlog suite/ndb_binlog/t suite/ndb_binlog/r \

=== added file 'mysql-test/include/have_myisam_or_maria_default.inc'
--- a/mysql-test/include/have_myisam_or_maria_default.inc	1970-01-01 00:00:00 +0000
+++ b/mysql-test/include/have_myisam_or_maria_default.inc	2009-01-28 11:08:55 +0000
@@ -0,0 +1,9 @@
+# This is for tests which create a table using the default engine
+# and expect MyISAM-like behaviour
+disable_query_log;
+let $have_myisam_or_maria_default=`select (@@storage_engine in ("myisam","maria")) as `TRUE``;
+if (!$have_myisam_or_maria_default)
+{
+  skip Test relies on MyISAM or Maria being the default storage engine;
+}
+enable_query_log;

=== modified file 'mysql-test/lib/mtr_cases.pm'
--- a/mysql-test/lib/mtr_cases.pm	2009-01-21 13:21:45 +0000
+++ b/mysql-test/lib/mtr_cases.pm	2009-02-16 21:18:45 +0000
@@ -55,7 +55,7 @@ sub collect_option {
 }
 
 use File::Basename;
-use File::Spec::Functions qw / splitdir /;
+use File::Spec::Functions qw/ splitdir /;
 use IO::File();
 use My::Config;
 use My::Platform;

=== modified file 'mysql-test/mysql-test-run.pl'
--- a/mysql-test/mysql-test-run.pl	2009-02-12 17:56:03 +0000
+++ b/mysql-test/mysql-test-run.pl	2009-02-16 21:18:45 +0000
@@ -115,11 +115,11 @@ my $path_config_file;           # The ge
 # executables will be used by the test suite.
 our $opt_vs_config = $ENV{'MTR_VS_CONFIG'};
 
-my $DEFAULT_SUITES= "main,backup,binlog,federated,rpl,rpl_ndb,ndb";
+my $DEFAULT_SUITES= "main,backup,binlog,federated,rpl,rpl_ndb,ndb,maria";
 
 our $opt_usage;
+our $opt_list_options;
 our $opt_suites;
-our $opt_suites_default= "main,backup,backup_engines,binlog,rpl,rpl_ndb,ndb"; # Default suites to run
 our $opt_script_debug= 0;  # Script debugging, enable with --script-debug
 our $opt_verbose= 0;  # Verbose output, enable with --verbose
 our $exe_mysql;
@@ -771,7 +771,7 @@ sub command_line_setup {
   # Read the command line options
   # Note: Keep list, and the order, in sync with usage at end of this file
   Getopt::Long::Configure("pass_through");
-  GetOptions(
+  my %options=(
              # Control what engine/variation to run
              'embedded-server'          => \$opt_embedded_server,
              'ps-protocol'              => \$opt_ps_protocol,
@@ -892,9 +892,13 @@ sub command_line_setup {
 	     'timediff'                 => \&report_option,
 
              'help|h'                   => \$opt_usage,
-            ) or usage("Can't read options");
+             'list-options'             => \$opt_list_options,
+            );
+
+  GetOptions(%options) or usage("Can't read options");
 
   usage("") if $opt_usage;
+  list_options(\%options) if $opt_list_options;
 
   # --------------------------------------------------------------------------
   # Setup verbosity
@@ -2618,6 +2622,7 @@ sub mysql_install_db {
   mtr_add_arg($args, "--loose-skip-innodb");
   mtr_add_arg($args, "--loose-skip-falcon");
   mtr_add_arg($args, "--loose-skip-ndbcluster");
+  mtr_add_arg($args, "--loose-skip-maria");
   mtr_add_arg($args, "--tmpdir=%s", "$opt_vardir/tmp/");
   mtr_add_arg($args, "--core-file");
 
@@ -5130,3 +5135,15 @@ HERE
 
 }
 
+
+sub list_options ($) {
+  my $hash= shift;
+
+  for (keys %$hash) {
+    s/(=.*|!)$//;
+    s/\|/\n--/g;
+    print "--$_\n";
+  }
+
+  exit(1);
+}

=== modified file 'mysql-test/r/ctype_utf8.result'
--- a/mysql-test/r/ctype_utf8.result	2008-12-13 19:55:44 +0000
+++ b/mysql-test/r/ctype_utf8.result	2009-02-13 16:30:54 +0000
@@ -240,7 +240,7 @@ select hex(s1) from t1;
 hex(s1)
 41
 drop table t1;
-create table t1 (a text character set utf8, primary key(a(360)));
+create table t1 (a text character set utf8, primary key(a(371)));
 ERROR 42000: Specified key was too long; max key length is 1332 bytes
 CREATE TABLE t1 ( a varchar(10) ) CHARACTER SET utf8;
 INSERT INTO t1 VALUES ( 'test' );

=== modified file 'mysql-test/suite/backup/r/backup.result'
--- a/mysql-test/suite/backup/r/backup.result	2008-12-10 15:53:06 +0000
+++ b/mysql-test/suite/backup/r/backup.result	2009-01-28 11:08:55 +0000
@@ -9,13 +9,13 @@ USE db1;
 CREATE TABLE `building` (
 `dir_code` char(4),
 `building` char(6)
-) ENGINE=MYISAM DEFAULT CHARSET=latin1;
+) DEFAULT CHARSET=latin1;
 INSERT INTO `building` VALUES ('N41','1300'),('N01','1453'),('M00','1000'),('N41','1301'),('N41','1305');
 CREATE TABLE `directorate` (
 `dir_code` char(4),
 `dir_name` char(30),
 `dir_head_id` char(9)
-) ENGINE=MYISAM DEFAULT CHARSET=latin1;
+) DEFAULT CHARSET=latin1;
 INSERT INTO `directorate` VALUES ('N41','Development','333445555'),('N01','Human Resources','123654321'),('M00','Management','333444444');
 USE db2;
 CREATE TABLE `staff` (
@@ -26,7 +26,7 @@ CREATE TABLE `staff` (
 `sex` char(1),
 `salary` int(11),
 `mgr_id` char(9)
-) ENGINE=MYISAM DEFAULT CHARSET=latin1;
+) DEFAULT CHARSET=latin1;
 INSERT INTO `staff` VALUES ('333445555','John','Q','Smith','M',30000,'333444444'),('123763153','William','E','Walters','M',25000,'123654321'),('333444444','Alicia','F','St.Cruz','F',25000,NULL),('921312388','Goy','X','Hong','F',40000,'123654321'),('800122337','Rajesh','G','Kardakarna','M',38000,'333445555'),('820123637','Monty','C','Smythe','M',38000,'333445555'),('830132335','Richard','E','Jones','M',38000,'333445555'),('333445665','Edward','E','Engles','M',25000,'333445555'),('123654321','Beware','D','Borg','F',55000,'333444444'),('123456789','Wilma','N','Maxima','F',43000,'333445555');
 CREATE TABLE `tasking` (
 `id` char(9),
@@ -39,8 +39,7 @@ Depending on the driver used, one of the
 and the result differs in PROCESSLIST state and info.
 SET DEBUG_SYNC= 'default_locking_thread_added SIGNAL bup_thread_added
                  WAIT_FOR finish';
-SET DEBUG_SYNC= 'myisam_locking_thread_added SIGNAL bup_thread_added
-                 WAIT_FOR finish';
+SET DEBUG_SYNC= concat(lower(@@storage_engine), '_locking_thread_added SIGNAL bup_thread_added WAIT_FOR finish');
 backup: Send the backup command.
 BACKUP DATABASE db1,db2 TO 'test.ba';
 breakpoints: Wait for BACKUP to reach its synchronization point.
@@ -190,7 +189,7 @@ CREATE TABLE bup_default.wide (
 PRIMARY KEY (`a`)
 ) ENGINE=INNODB DEFAULT CHARSET=latin1;
 CREATE TABLE bup_default.t1 (a int) engine=innodb;
-CREATE TABLE bup_default.t2 (a int) engine=MYISAM;
+CREATE TABLE bup_default.t2 (a int);
 CREATE TABLE bup_default.t1_blob (a int, b text) engine=innodb;
 INSERT INTO bup_default.wide VALUES (
 NULL,

=== modified file 'mysql-test/suite/backup/r/backup_commit_backup.result'
--- a/mysql-test/suite/backup/r/backup_commit_backup.result	2008-11-17 09:57:51 +0000
+++ b/mysql-test/suite/backup/r/backup_commit_backup.result	2009-01-28 11:08:55 +0000
@@ -5,10 +5,10 @@ CREATE DATABASE db2;
 CREATE TABLE db1.t1 (s1 CHAR(3)) ENGINE=innodb;
 CREATE TABLE db1.t2 (s1 CHAR(3)) ENGINE=falcon;
 CREATE TABLE db1.t3 (s1 CHAR(3)) ENGINE=memory;
-CREATE TABLE db1.t4 (s1 CHAR(3)) ENGINE=myisam;
+CREATE TABLE db1.t4 (s1 CHAR(3));
 CREATE TABLE db2.t1 (s1 CHAR(3)) ENGINE=innodb;
 CREATE TABLE db2.t2 (s1 CHAR(3)) ENGINE=falcon;
-CREATE TABLE db2.t3 (s1 CHAR(3)) ENGINE=myisam;
+CREATE TABLE db2.t3 (s1 CHAR(3));
 connection B - starting transaction B
 SET autocommit=0;
 BEGIN;

=== modified file 'mysql-test/suite/backup/r/backup_default.result'
--- a/mysql-test/suite/backup/r/backup_default.result	2008-12-04 13:04:16 +0000
+++ b/mysql-test/suite/backup/r/backup_default.result	2009-01-28 11:08:55 +0000
@@ -36,7 +36,6 @@ USE db1;
 CREATE TABLE partition_table (
 int_column int(11), 
 char_column char(5))
-ENGINE=myisam
 PARTITION BY HASH (int_column);
 INSERT INTO partition_table VALUES (0,'pVtIa');
 INSERT INTO partition_table VALUES (5,'jTfSg');
@@ -52,7 +51,7 @@ INSERT INTO csv_table VALUES(1,'aa1','bb
 CREATE TABLE myisam_table(
 i int,
 v varchar(20))
-ENGINE=myisam;
+;
 INSERT INTO myisam_table VALUES(1,'v1'),(2,'v2'),(3,'v3');
 backup on mixed table database
 BACKUP DATABASE db1 to 'bup_mixed.bak';

=== modified file 'mysql-test/suite/backup/r/backup_lock_myisam.result'
--- a/mysql-test/suite/backup/r/backup_lock_myisam.result	2008-12-04 23:14:30 +0000
+++ b/mysql-test/suite/backup/r/backup_lock_myisam.result	2009-01-28 11:08:55 +0000
@@ -5,7 +5,7 @@ DROP DATABASE IF EXISTS db2;
 DROP DATABASE IF EXISTS db3;
 Create database 1 and a table then populate it
 CREATE DATABASE db1;
-CREATE TABLE db1.t1 (a INT) ENGINE=MYISAM;
+CREATE TABLE db1.t1 (a INT);
 INSERT INTO db1.t1 VALUES (1),(2),(3),(4),(5),(6),(7),(8),(9),(0);
 INSERT INTO db1.t1 SELECT * FROM db1.t1;
 INSERT INTO db1.t1 SELECT * FROM db1.t1;
@@ -147,7 +147,7 @@ a
 SET DEBUG_SYNC= 'RESET';
 Create a database with a table and a view using the MyISAM engine.
 CREATE DATABASE db3;
-CREATE TABLE db3.t1(name CHAR(10)) ENGINE=MYISAM;
+CREATE TABLE db3.t1(name CHAR(10));
 INSERT INTO db3.t1 VALUES('A'),('B'),('C'),('D');
 CREATE VIEW db3.v1 AS SELECT * FROM db3.t1;
 Show the data before backup

=== modified file 'mysql-test/suite/backup/r/backup_logs.result'
--- a/mysql-test/suite/backup/r/backup_logs.result	2009-01-29 21:17:59 +0000
+++ b/mysql-test/suite/backup/r/backup_logs.result	2009-02-13 12:40:13 +0000
@@ -100,7 +100,7 @@ is included in backup image(BUG#39109)
 Include all objects in database(Databases, tables, procedures and
 functions, views, triggers and events) and perform backup operation.
 con1: Create tables
-CREATE TABLE backup_logs.t1 (a char(30)) ENGINE=MYISAM;
+CREATE TABLE backup_logs.t1 (a char(30));
 CREATE TABLE backup_logs.t2 (a char(30)) ENGINE=INNODB;
 CREATE TABLE backup_logs.t3 (a char(30)) ENGINE=MEMORY;
 CREATE TABLE backup_logs.t4(id INT, name CHAR(20))ENGINE=BLACKHOLE;

=== modified file 'mysql-test/suite/backup/r/backup_logs_purge.result'
--- a/mysql-test/suite/backup/r/backup_logs_purge.result	2009-01-08 14:57:41 +0000
+++ b/mysql-test/suite/backup/r/backup_logs_purge.result	2009-02-13 12:40:13 +0000
@@ -6,7 +6,7 @@ Now starting real tests
 DROP DATABASE IF EXISTS backup_logs;
 CREATE DATABASE backup_logs;
 Create table and populate with data.
-CREATE TABLE backup_logs.t1 (a char(30)) ENGINE=MYISAM;
+CREATE TABLE backup_logs.t1 (a char(30));
 CREATE TABLE backup_logs.t2 (a char(30)) ENGINE=INNODB;
 CREATE TABLE backup_logs.t3 (a char(30)) ENGINE=MEMORY;
 INSERT INTO backup_logs.t1 VALUES ("01 Test #1 - progress");

=== added file 'mysql-test/suite/backup/r/backup_maria.result'
--- a/mysql-test/suite/backup/r/backup_maria.result	1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/backup/r/backup_maria.result	2009-02-13 12:40:13 +0000
@@ -0,0 +1,257 @@
+DROP DATABASE IF EXISTS mysqltest;
+CREATE DATABASE mysqltest;
+USE mysqltest;
+CREATE TABLE t1 (a int, b varchar(100), c blob, index(a), index(b),
+index(c(10))) transactional=1 engine=maria;
+insert into t1 values(1,"abc","def");
+insert into t1 values(100,"UJL-JK","PPMLsn!");
+CREATE TABLE t2 like t1;
+alter table t2 transactional=0 row_format=page;
+insert into t2 select * from t1;
+CREATE TABLE t3 like t1;
+alter table t3 transactional=0 row_format=dynamic;
+insert into t3 select * from t1;
+CREATE TABLE t4 like t1;
+alter table t4 page_checksum=0;
+insert into t4 select * from t1;
+CHECKSUM TABLE t1;
+Table	Checksum
+mysqltest.t1	2361375562
+CHECKSUM TABLE t2;
+Table	Checksum
+mysqltest.t2	2361375562
+CHECKSUM TABLE t3;
+Table	Checksum
+mysqltest.t3	2361375562
+CHECKSUM TABLE t4;
+Table	Checksum
+mysqltest.t4	2361375562
+BACKUP DATABASE mysqltest TO 'test.ba';
+backup_id
+#
+DROP DATABASE mysqltest;
+RESTORE FROM 'test.ba';
+backup_id
+#
+select count(*) from t1;
+count(*)
+2
+CHECKSUM TABLE t1;
+Table	Checksum
+mysqltest.t1	2361375562
+CHECKSUM TABLE t2;
+Table	Checksum
+mysqltest.t2	2361375562
+CHECKSUM TABLE t3;
+Table	Checksum
+mysqltest.t3	2361375562
+CHECKSUM TABLE t4;
+Table	Checksum
+mysqltest.t4	2361375562
+
+connection backup: Start backup
+SET DEBUG_SYNC= 'before_backup_data_prepare SIGNAL bup_sync
+                     WAIT_FOR bup_finish';
+BACKUP DATABASE mysqltest TO 'test.ba';
+
+connection default: Wait for BACKUP to reach its sync point
+SET DEBUG_SYNC= 'now WAIT_FOR bup_sync';
+Modify tables
+update t1 set c=repeat("SUN", 4000);
+insert into t1 values(200,"UJLlkj-JK","PferfPMLsn!");
+insert into t1 select * from t1;
+insert into t1 select * from t1;
+delete from t1 where a=200;
+CHECKSUM TABLE t1;
+Table	Checksum
+mysqltest.t1	3493755364
+select count(*) from t1;
+count(*)
+8
+update t2 set c=repeat("SUN", 4000);
+insert into t2 values(200,"UJLlkj-JK","PferfPMLsn!");
+insert into t2 select * from t2;
+insert into t2 select * from t2;
+delete from t2 where a=200;
+CHECKSUM TABLE t2;
+Table	Checksum
+mysqltest.t2	3493755364
+select count(*) from t2;
+count(*)
+8
+update t3 set c=repeat("SUN", 4000);
+insert into t3 values(200,"UJLlkj-JK","PferfPMLsn!");
+insert into t3 select * from t3;
+insert into t3 select * from t3;
+delete from t3 where a=200;
+CHECKSUM TABLE t3;
+Table	Checksum
+mysqltest.t3	3493755364
+select count(*) from t3;
+count(*)
+8
+update t4 set c=repeat("SUN", 4000);
+insert into t4 values(200,"UJLlkj-JK","PferfPMLsn!");
+insert into t4 select * from t4;
+insert into t4 select * from t4;
+delete from t4 where a=200;
+CHECKSUM TABLE t4;
+Table	Checksum
+mysqltest.t4	3493755364
+select count(*) from t4;
+count(*)
+8
+Signal BACKUP to finish
+SET DEBUG_SYNC= 'now SIGNAL bup_finish';
+
+connection backup: Fetch result
+backup_id
+#
+DROP DATABASE mysqltest;
+RESTORE FROM 'test.ba';
+backup_id
+#
+CHECKSUM TABLE t1;
+Table	Checksum
+mysqltest.t1	3493755364
+CHECK TABLE t1 EXTENDED;
+Table	Op	Msg_type	Msg_text
+mysqltest.t1	check	status	OK
+CHECKSUM TABLE t2;
+Table	Checksum
+mysqltest.t2	3493755364
+CHECK TABLE t2 EXTENDED;
+Table	Op	Msg_type	Msg_text
+mysqltest.t2	check	status	OK
+CHECKSUM TABLE t3;
+Table	Checksum
+mysqltest.t3	3493755364
+CHECK TABLE t3 EXTENDED;
+Table	Op	Msg_type	Msg_text
+mysqltest.t3	check	status	OK
+CHECKSUM TABLE t4;
+Table	Checksum
+mysqltest.t4	3493755364
+CHECK TABLE t4 EXTENDED;
+Table	Op	Msg_type	Msg_text
+mysqltest.t4	check	status	OK
+
+connection backup: Start backup
+SET DEBUG_SYNC= 'before_backup_data_prepare SIGNAL bup_sync
+                     WAIT_FOR bup_finish';
+BACKUP DATABASE mysqltest TO 'test.ba';
+
+connection default: Wait for BACKUP to reach its sync point
+SET DEBUG_SYNC= 'now WAIT_FOR bup_sync';
+Modify tables
+delete from t1;
+insert into t1 values(2000,"UJLlk5454j-JK","PferZZZfPMLsn!");
+update t1 set c=repeat("NUS", 4000);
+insert into t1 values(200,"UJLlkj-JK","PferfPMLsn!");
+insert into t1 select * from t1;
+insert into t1 select * from t1;
+delete from t1 where a=200;
+CHECKSUM TABLE t1;
+Table	Checksum
+mysqltest.t1	3470596356
+select count(*) from t1;
+count(*)
+4
+delete from t2;
+insert into t2 values(2000,"UJLlk5454j-JK","PferZZZfPMLsn!");
+update t2 set c=repeat("NUS", 4000);
+insert into t2 values(200,"UJLlkj-JK","PferfPMLsn!");
+insert into t2 select * from t2;
+insert into t2 select * from t2;
+delete from t2 where a=200;
+CHECKSUM TABLE t2;
+Table	Checksum
+mysqltest.t2	3470596356
+select count(*) from t2;
+count(*)
+4
+delete from t3;
+insert into t3 values(2000,"UJLlk5454j-JK","PferZZZfPMLsn!");
+update t3 set c=repeat("NUS", 4000);
+insert into t3 values(200,"UJLlkj-JK","PferfPMLsn!");
+insert into t3 select * from t3;
+insert into t3 select * from t3;
+delete from t3 where a=200;
+CHECKSUM TABLE t3;
+Table	Checksum
+mysqltest.t3	3470596356
+select count(*) from t3;
+count(*)
+4
+delete from t4;
+insert into t4 values(2000,"UJLlk5454j-JK","PferZZZfPMLsn!");
+update t4 set c=repeat("NUS", 4000);
+insert into t4 values(200,"UJLlkj-JK","PferfPMLsn!");
+insert into t4 select * from t4;
+insert into t4 select * from t4;
+delete from t4 where a=200;
+CHECKSUM TABLE t4;
+Table	Checksum
+mysqltest.t4	3470596356
+select count(*) from t4;
+count(*)
+4
+Signal BACKUP to finish
+SET DEBUG_SYNC= 'now SIGNAL bup_finish';
+
+connection backup: Fetch result
+backup_id
+#
+DROP DATABASE mysqltest;
+RESTORE FROM 'test.ba';
+backup_id
+#
+CHECKSUM TABLE t1;
+Table	Checksum
+mysqltest.t1	3470596356
+CHECK TABLE t1 EXTENDED;
+Table	Op	Msg_type	Msg_text
+mysqltest.t1	check	status	OK
+CHECKSUM TABLE t2;
+Table	Checksum
+mysqltest.t2	3470596356
+CHECK TABLE t2 EXTENDED;
+Table	Op	Msg_type	Msg_text
+mysqltest.t2	check	status	OK
+CHECKSUM TABLE t3;
+Table	Checksum
+mysqltest.t3	3470596356
+CHECK TABLE t3 EXTENDED;
+Table	Op	Msg_type	Msg_text
+mysqltest.t3	check	status	OK
+CHECKSUM TABLE t4;
+Table	Checksum
+mysqltest.t4	3470596356
+CHECK TABLE t4 EXTENDED;
+Table	Op	Msg_type	Msg_text
+mysqltest.t4	check	status	OK
+drop database mysqltest;
+create database mysqltest;
+CREATE TABLE mysqltest.t1 (a int, b varchar(100), unique(a)) engine=maria;
+BACKUP DATABASE mysqltest TO 'test.ba';
+backup_id
+#
+DROP DATABASE mysqltest;
+SET DEBUG_SYNC= 'before_restore_locks_tables SIGNAL wait_for_restore WAIT_FOR finish';
+RESTORE FROM 'test.ba';
+SET DEBUG_SYNC= 'now WAIT_FOR wait_for_restore';
+insert into mysqltest.t1 values(1,"def");
+SET DEBUG_SYNC= 'now SIGNAL finish';
+backup_id
+#
+select * from mysqltest.t1;
+a	b
+check table mysqltest.t1;
+Table	Op	Msg_type	Msg_text
+mysqltest.t1	check	status	OK
+select * from mysqltest.t1;
+a	b
+
+connection default: cleanup
+drop database mysqltest;
+SET DEBUG_SYNC= 'RESET';

=== added file 'mysql-test/suite/backup/r/backup_maria_other_instance.result'
--- a/mysql-test/suite/backup/r/backup_maria_other_instance.result	1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/backup/r/backup_maria_other_instance.result	2009-01-28 11:08:55 +0000
@@ -0,0 +1,32 @@
+drop database if exists mysqltest;
+create database mysqltest;
+use mysqltest;
+create table t1(a int, unique(a)) engine=maria;
+insert into t1 values(1);
+insert into t1 values(2);
+BACKUP DATABASE mysqltest TO 'test.ba';
+backup_id
+#
+* shut down mysqld, removed logs, restarted it
+RESTORE FROM 'test.ba' OVERWRITE;
+backup_id
+#
+select * from t1;
+a
+1
+2
+Warnings:
+Error	1194	t1' is marked as crashed and should be repaired
+check table t1 extended;
+Table	Op	Msg_type	Msg_text
+mysqltest.t1	check	status	OK
+flush table t1;
+Status:              checked,analyzed,zerofilled,movable
+create_rename_lsn has magic value
+insert into t1 values(3);
+check table t1 extended;
+Table	Op	Msg_type	Msg_text
+mysqltest.t1	check	status	OK
+flush table t1;
+create_rename_lsn has non-magic value
+drop database mysqltest;

=== modified file 'mysql-test/suite/backup/r/backup_multi_blocks.result'
--- a/mysql-test/suite/backup/r/backup_multi_blocks.result	2008-10-07 17:15:44 +0000
+++ b/mysql-test/suite/backup/r/backup_multi_blocks.result	2009-01-28 11:08:55 +0000
@@ -2,7 +2,7 @@ DROP DATABASE IF EXISTS mysqltest;
 Creating database and tables ...
 CREATE DATABASE mysqltest;
 USE mysqltest;
-CREATE TABLE t1 (a LONGTEXT) ENGINE=MYISAM;
+CREATE TABLE t1 (a LONGTEXT);
 Inserting data ...
 USE mysqltest;
 INSERT INTO t1 VALUES ("text");

=== modified file 'mysql-test/suite/backup/r/backup_myisam.result'
--- a/mysql-test/suite/backup/r/backup_myisam.result	2008-12-30 12:13:31 +0000
+++ b/mysql-test/suite/backup/r/backup_myisam.result	2009-02-13 12:40:13 +0000
@@ -32,7 +32,7 @@ DROP DATABASE mysql_db1;
 #
 CREATE DATABASE mysqltest;
 USE mysqltest;
-CREATE TABLE `äöüß£å` (id SERIAL) ENGINE=MyISAM;
+CREATE TABLE `äöüß£å` (id SERIAL);
 BACKUP DATABASE mysqltest TO 'bup_myisam.bak';
 backup_id
 #

=== modified file 'mysql-test/suite/backup/r/backup_myisam_extlocking.result'
--- a/mysql-test/suite/backup/r/backup_myisam_extlocking.result	2008-12-30 12:13:31 +0000
+++ b/mysql-test/suite/backup/r/backup_myisam_extlocking.result	2009-02-13 12:40:13 +0000
@@ -2,7 +2,7 @@ USE test;
 DROP DATABASE IF EXISTS mysqltest;
 CREATE DATABASE mysqltest;
 USE mysqltest;
-CREATE TABLE t1 (a int) engine=myisam;
+CREATE TABLE t1 (a int);
 BACKUP DATABASE mysqltest TO 'test.ba';
 ERROR HY000: Got error -1 'online backup impossible with --external-locking' from MyISAM
 USE test;

=== modified file 'mysql-test/suite/backup/r/backup_myisam_sync.result'
--- a/mysql-test/suite/backup/r/backup_myisam_sync.result	2008-12-30 12:13:31 +0000
+++ b/mysql-test/suite/backup/r/backup_myisam_sync.result	2009-02-13 12:40:13 +0000
@@ -3,7 +3,7 @@ USE test;
 DROP DATABASE IF EXISTS mysqltest;
 CREATE DATABASE mysqltest;
 USE mysqltest;
-CREATE TABLE t1 (c1 LONGTEXT) ENGINE=MyISAM;
+CREATE TABLE t1 (c1 LONGTEXT);
 
 connection backup: Start backup
 SET DEBUG_SYNC= 'before_backup_data_prepare SIGNAL bup_sync
@@ -42,9 +42,6 @@ SET DEBUG_SYNC= 'now SIGNAL bup_finish';
 connection backup: Fetch result
 backup_id
 #
-REPAIR TABLE t1 QUICK;
-Table	Op	Msg_type	Msg_text
-mysqltest.t1	repair	status	OK
 DROP DATABASE mysqltest;
 RESTORE FROM 'bup_myisam_sync.bak';
 backup_id
@@ -55,6 +52,9 @@ LENGTH(c1)
 CHECKSUM TABLE t1;
 Table	Checksum
 mysqltest.t1	1728069308
+CHECK TABLE t1 EXTENDED;
+Table	Op	Msg_type	Msg_text
+mysqltest.t1	check	status	OK
 
 connection default: cleanup
 SET DEBUG_SYNC= 'RESET';

=== modified file 'mysql-test/suite/backup/r/backup_no_be.result'
--- a/mysql-test/suite/backup/r/backup_no_be.result	2008-11-17 09:57:51 +0000
+++ b/mysql-test/suite/backup/r/backup_no_be.result	2009-01-28 11:08:55 +0000
@@ -1,7 +1,7 @@
 DROP DATABASE IF EXISTS db1;
 CREATE DATABASE db1;
 USE db1;
-CREATE TABLE t1 (a int) ENGINE=Myisam;
+CREATE TABLE t1 (a int);
 INSERT INTO t1 VALUES (1),(2),(3);
 SET SESSION debug="-d,";
 SELECT @@debug;

=== modified file 'mysql-test/suite/backup/r/backup_objects.result'
--- a/mysql-test/suite/backup/r/backup_objects.result	2008-10-07 17:15:44 +0000
+++ b/mysql-test/suite/backup/r/backup_objects.result	2009-01-28 11:08:55 +0000
@@ -1,4 +1,3 @@
-SET storage_engine=MyISAM;
 
 Starting Test - Backup
 

=== modified file 'mysql-test/suite/backup/t/backup.test'
--- a/mysql-test/suite/backup/t/backup.test	2009-02-01 13:26:18 +0000
+++ b/mysql-test/suite/backup/t/backup.test	2009-02-13 12:40:13 +0000
@@ -30,7 +30,7 @@ USE db1;
 CREATE TABLE `building` (
   `dir_code` char(4),
   `building` char(6)
-) ENGINE=MYISAM DEFAULT CHARSET=latin1;
+) DEFAULT CHARSET=latin1;
 
 #
 # Dumping data for table `building`
@@ -47,7 +47,7 @@ CREATE TABLE `directorate` (
   `dir_code` char(4),
   `dir_name` char(30),
   `dir_head_id` char(9)
-) ENGINE=MYISAM DEFAULT CHARSET=latin1;
+) DEFAULT CHARSET=latin1;
 
 #
 # Dumping data for table `directorate`
@@ -69,7 +69,7 @@ CREATE TABLE `staff` (
   `sex` char(1),
   `salary` int(11),
   `mgr_id` char(9)
-) ENGINE=MYISAM DEFAULT CHARSET=latin1;
+) DEFAULT CHARSET=latin1;
 
 #
 # Dumping data for table `staff`
@@ -106,8 +106,7 @@ connection backup;
 --echo and the result differs in PROCESSLIST state and info.
 SET DEBUG_SYNC= 'default_locking_thread_added SIGNAL bup_thread_added
                  WAIT_FOR finish';
-SET DEBUG_SYNC= 'myisam_locking_thread_added SIGNAL bup_thread_added
-                 WAIT_FOR finish';
+SET DEBUG_SYNC= concat(lower(@@storage_engine), '_locking_thread_added SIGNAL bup_thread_added WAIT_FOR finish');
 --echo backup: Send the backup command.
 send BACKUP DATABASE db1,db2 TO 'test.ba';
 
@@ -275,7 +274,7 @@ CREATE TABLE bup_default.wide (
 ) ENGINE=INNODB DEFAULT CHARSET=latin1;
 
 CREATE TABLE bup_default.t1 (a int) engine=innodb;
-CREATE TABLE bup_default.t2 (a int) engine=MYISAM;
+CREATE TABLE bup_default.t2 (a int);
 CREATE TABLE bup_default.t1_blob (a int, b text) engine=innodb;
 
 # Insert some data.

=== modified file 'mysql-test/suite/backup/t/backup_commit_backup.test'
--- a/mysql-test/suite/backup/t/backup_commit_backup.test	2008-11-17 09:57:51 +0000
+++ b/mysql-test/suite/backup/t/backup_commit_backup.test	2009-01-28 11:08:55 +0000
@@ -9,7 +9,7 @@
 --source include/not_embedded.inc
 --source include/have_innodb.inc
 --source include/have_falcon.inc
-
+--source include/have_myisam_or_maria_default.inc
 
 LET $BDIR=`select @@backupdir`;
 
@@ -26,11 +26,11 @@ CREATE DATABASE db2;
 CREATE TABLE db1.t1 (s1 CHAR(3)) ENGINE=innodb; # CS driver
 CREATE TABLE db1.t2 (s1 CHAR(3)) ENGINE=falcon; # CS driver
 CREATE TABLE db1.t3 (s1 CHAR(3)) ENGINE=memory; # default driver
-CREATE TABLE db1.t4 (s1 CHAR(3)) ENGINE=myisam; # native driver
+CREATE TABLE db1.t4 (s1 CHAR(3)); # native driver
 
 CREATE TABLE db2.t1 (s1 CHAR(3)) ENGINE=innodb; # trx1
 CREATE TABLE db2.t2 (s1 CHAR(3)) ENGINE=falcon; # trx2
-CREATE TABLE db2.t3 (s1 CHAR(3)) ENGINE=myisam; # non-trx
+CREATE TABLE db2.t3 (s1 CHAR(3)); # non-trx
 
 
 # Create test connections. The setup is as follows

=== modified file 'mysql-test/suite/backup/t/backup_ddl_blocker.test'
--- a/mysql-test/suite/backup/t/backup_ddl_blocker.test	2008-12-06 00:24:23 +0000
+++ b/mysql-test/suite/backup/t/backup_ddl_blocker.test	2009-02-13 12:40:13 +0000
@@ -932,7 +932,7 @@ CREATE DATABASE bup_ddl_blocker_4;
 # Create transaction tables and load them with data.
 --echo con1: Creating tables
 CREATE TABLE bup_ddl_blocker_2.t1 (col_a CHAR(40)) ENGINE=INNODB;
-CREATE TABLE bup_ddl_blocker_4.t1 (col_a CHAR(40)) ENGINE=MYISAM;
+CREATE TABLE bup_ddl_blocker_4.t1 (col_a CHAR(40));
 
 --echo con1: Loading data
 INSERT INTO bup_ddl_blocker_2.t1 VALUES ("01 Some data to test");
@@ -1093,7 +1093,7 @@ CREATE DATABASE bup_ddl_blocker_4;
 # Create transaction tables and load them with data.
 --echo con1: Creating tables
 CREATE TABLE bup_ddl_blocker_2.t1 (col_a CHAR(40)) ENGINE=INNODB;
-CREATE TABLE bup_ddl_blocker_4.t1 (col_a CHAR(40)) ENGINE=MYISAM;
+CREATE TABLE bup_ddl_blocker_4.t1 (col_a CHAR(40));
 
 --echo con1: Loading data
 INSERT INTO bup_ddl_blocker_2.t1 VALUES ("01 Some data to test");
@@ -1245,7 +1245,7 @@ CREATE DATABASE bup_ddl_blocker_4 CHARAC
 # Create transaction tables and load them with data.
 --echo con1: Creating tables
 CREATE TABLE bup_ddl_blocker_2.t1 (col_a CHAR(40)) ENGINE=INNODB;
-CREATE TABLE bup_ddl_blocker_4.t1 (col_a CHAR(40)) ENGINE=MYISAM;
+CREATE TABLE bup_ddl_blocker_4.t1 (col_a CHAR(40));
 
 --echo con1: Loading data
 
@@ -1420,7 +1420,7 @@ CREATE DATABASE bup_ddl_blocker_4 CHARAC
 # Create transaction tables and load them with data.
 --echo con1: Creating tables
 CREATE TABLE bup_ddl_blocker_2.t1 (col_a CHAR(40)) ENGINE=INNODB;
-CREATE TABLE bup_ddl_blocker_4.t1 (col_a CHAR(40)) ENGINE=MYISAM;
+CREATE TABLE bup_ddl_blocker_4.t1 (col_a CHAR(40));
 
 --echo con1: Loading data
 
@@ -1959,7 +1959,7 @@ DROP TABLE IF EXISTS bup_ddl_blocker.t1,
 # Create transaction tables and load them with data.
 --echo con1: Creating tables
 CREATE TABLE bup_ddl_blocker.t1 (col_a CHAR(40)) ENGINE=INNODB;
-CREATE TABLE test.t2 (col_a CHAR(40)) ENGINE=MYISAM;
+CREATE TABLE test.t2 (col_a CHAR(40));
 CREATE TABLE bup_ddl_blocker.t3 (col_a CHAR(40)) ENGINE=MEMORY;
 
 --echo con1: Loading data

=== modified file 'mysql-test/suite/backup/t/backup_default.test'
--- a/mysql-test/suite/backup/t/backup_default.test	2008-12-24 10:48:24 +0000
+++ b/mysql-test/suite/backup/t/backup_default.test	2009-02-13 12:40:13 +0000
@@ -55,7 +55,6 @@ USE db1;
 CREATE TABLE partition_table (
   int_column int(11), 
   char_column char(5))
-ENGINE=myisam
 PARTITION BY HASH (int_column);
 
 INSERT INTO partition_table VALUES (0,'pVtIa');
@@ -77,7 +76,7 @@ INSERT INTO csv_table VALUES(1,'aa1','bb
 CREATE TABLE myisam_table(
    i int,
    v varchar(20))
-ENGINE=myisam;
+;
 
 INSERT INTO myisam_table VALUES(1,'v1'),(2,'v2'),(3,'v3');
 

=== modified file 'mysql-test/suite/backup/t/backup_lock_myisam.test'
--- a/mysql-test/suite/backup/t/backup_lock_myisam.test	2008-12-06 00:24:23 +0000
+++ b/mysql-test/suite/backup/t/backup_lock_myisam.test	2009-02-13 12:40:13 +0000
@@ -9,6 +9,7 @@
 --source include/not_embedded.inc
 --source include/have_innodb.inc
 --source include/have_debug_sync.inc
+--source include/have_myisam_or_maria_default.inc
 
 SET DEBUG_SYNC= 'RESET';
 
@@ -34,7 +35,7 @@ remove_file $MYSQLD_DATADIR/db3.bak;
 
 --echo Create database 1 and a table then populate it
 CREATE DATABASE db1;
-CREATE TABLE db1.t1 (a INT) ENGINE=MYISAM;
+CREATE TABLE db1.t1 (a INT);
 
 INSERT INTO db1.t1 VALUES (1),(2),(3),(4),(5),(6),(7),(8),(9),(0);
 INSERT INTO db1.t1 SELECT * FROM db1.t1;
@@ -194,7 +195,7 @@ SET DEBUG_SYNC= 'RESET';
 
 --echo Create a database with a table and a view using the MyISAM engine.
 CREATE DATABASE db3;
-CREATE TABLE db3.t1(name CHAR(10)) ENGINE=MYISAM;
+CREATE TABLE db3.t1(name CHAR(10));
 INSERT INTO db3.t1 VALUES('A'),('B'),('C'),('D');
 CREATE VIEW db3.v1 AS SELECT * FROM db3.t1;
 

=== modified file 'mysql-test/suite/backup/t/backup_logs.test'
--- a/mysql-test/suite/backup/t/backup_logs.test	2009-02-01 13:26:18 +0000
+++ b/mysql-test/suite/backup/t/backup_logs.test	2009-02-13 12:40:13 +0000
@@ -8,6 +8,7 @@
 --source include/have_innodb.inc
 --source include/not_embedded.inc
 --source include/have_debug.inc
+--source include/have_myisam_or_maria_default.inc
 --source include/blackhole.inc
 
 SET DEBUG_SYNC= 'RESET';
@@ -162,7 +163,7 @@ connection con1;
 # driver), Memory(Default driver) and Blackhole(no-data driver). Ensure that
 # drivers column indicates all types of drivers for backup database operation.
 
-CREATE TABLE backup_logs.t1 (a char(30)) ENGINE=MYISAM;
+CREATE TABLE backup_logs.t1 (a char(30));
 CREATE TABLE backup_logs.t2 (a char(30)) ENGINE=INNODB;
 CREATE TABLE backup_logs.t3 (a char(30)) ENGINE=MEMORY;
 CREATE TABLE backup_logs.t4(id INT, name CHAR(20))ENGINE=BLACKHOLE;

=== modified file 'mysql-test/suite/backup/t/backup_logs_purge.test'
--- a/mysql-test/suite/backup/t/backup_logs_purge.test	2009-01-29 21:17:59 +0000
+++ b/mysql-test/suite/backup/t/backup_logs_purge.test	2009-02-13 12:40:13 +0000
@@ -32,7 +32,7 @@ CREATE DATABASE backup_logs;
 
 --echo Create table and populate with data.
 
-CREATE TABLE backup_logs.t1 (a char(30)) ENGINE=MYISAM;
+CREATE TABLE backup_logs.t1 (a char(30));
 CREATE TABLE backup_logs.t2 (a char(30)) ENGINE=INNODB;
 CREATE TABLE backup_logs.t3 (a char(30)) ENGINE=MEMORY;
 

=== added file 'mysql-test/suite/backup/t/backup_maria.test'
--- a/mysql-test/suite/backup/t/backup_maria.test	1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/backup/t/backup_maria.test	2009-02-13 12:40:13 +0000
@@ -0,0 +1,243 @@
+# A simple test for Maria's online backup.
+# More advanced tests can be run like this:
+# perl mysql-test-run.pl --suite=backup,backup_engines --mysqld=--default-storage-engine=maria
+
+--source include/not_embedded.inc
+--source include/have_debug_sync.inc
+
+let $MYSQLD_DATADIR= `select @@datadir`;
+
+--disable_warnings
+DROP DATABASE IF EXISTS mysqltest;
+--error 0,1
+remove_file $MYSQLD_DATADIR/test.ba;
+--enable_warnings
+
+CREATE DATABASE mysqltest;
+USE mysqltest;
+CREATE TABLE t1 (a int, b varchar(100), c blob, index(a), index(b),
+index(c(10))) transactional=1 engine=maria;
+insert into t1 values(1,"abc","def");
+insert into t1 values(100,"UJL-JK","PPMLsn!");
+
+# test other table formats
+CREATE TABLE t2 like t1;
+alter table t2 transactional=0 row_format=page;
+insert into t2 select * from t1;
+
+CREATE TABLE t3 like t1;
+alter table t3 transactional=0 row_format=dynamic;
+insert into t3 select * from t1;
+
+CREATE TABLE t4 like t1;
+alter table t4 page_checksum=0;
+insert into t4 select * from t1;
+
+CHECKSUM TABLE t1;
+CHECKSUM TABLE t2;
+CHECKSUM TABLE t3;
+CHECKSUM TABLE t4;
+
+# First a backup when nobody is using the tables
+
+--replace_column 1 #
+BACKUP DATABASE mysqltest TO 'test.ba';
+DROP DATABASE mysqltest;
+--replace_column 1 #
+RESTORE FROM 'test.ba';
+
+select count(*) from t1;
+CHECKSUM TABLE t1;
+CHECKSUM TABLE t2;
+CHECKSUM TABLE t3;
+CHECKSUM TABLE t4;
+remove_file $MYSQLD_DATADIR/test.ba;
+
+# Then test that backup is "at end"
+
+    #
+    # Create a worker connection, using mysqltest as its default database.
+    #
+    --echo
+    --echo connection backup: Start backup
+    connect (backup,localhost,root,,mysqltest);
+
+    # Activate the sync point for BACKUP. Before starting the prepare phase,
+    # BACKUP reaches the sync point "before_backup_data_prepare", which will
+    # emit the signal "bup_sync" and then wait for the signal "bup_finish"
+    # to be emitted by another connection.
+    SET DEBUG_SYNC= 'before_backup_data_prepare SIGNAL bup_sync
+                     WAIT_FOR bup_finish';
+    send BACKUP DATABASE mysqltest TO 'test.ba';
+
+--echo
+--echo connection default: Wait for BACKUP to reach its sync point
+connection default;
+SET DEBUG_SYNC= 'now WAIT_FOR bup_sync';
+
+--echo Modify tables
+update t1 set c=repeat("SUN", 4000); # a multi-page blob
+insert into t1 values(200,"UJLlkj-JK","PferfPMLsn!");
+insert into t1 select * from t1;
+insert into t1 select * from t1;
+delete from t1 where a=200;
+CHECKSUM TABLE t1;
+select count(*) from t1;
+update t2 set c=repeat("SUN", 4000); # a multi-page blob
+insert into t2 values(200,"UJLlkj-JK","PferfPMLsn!");
+insert into t2 select * from t2;
+insert into t2 select * from t2;
+delete from t2 where a=200;
+CHECKSUM TABLE t2;
+select count(*) from t2;
+update t3 set c=repeat("SUN", 4000); # a multi-page blob
+insert into t3 values(200,"UJLlkj-JK","PferfPMLsn!");
+insert into t3 select * from t3;
+insert into t3 select * from t3;
+delete from t3 where a=200;
+CHECKSUM TABLE t3;
+select count(*) from t3;
+update t4 set c=repeat("SUN", 4000); # a multi-page blob
+insert into t4 values(200,"UJLlkj-JK","PferfPMLsn!");
+insert into t4 select * from t4;
+insert into t4 select * from t4;
+delete from t4 where a=200;
+CHECKSUM TABLE t4;
+select count(*) from t4;
+
+--echo Signal BACKUP to finish
+SET DEBUG_SYNC= 'now SIGNAL bup_finish';
+
+    --echo
+    --echo connection backup: Fetch result
+    connection backup;
+    --replace_column 1 #
+    reap;
+    DROP DATABASE mysqltest;
+
+    --replace_column 1 #
+    RESTORE FROM 'test.ba';
+    CHECKSUM TABLE t1;
+    CHECK TABLE t1 EXTENDED;
+    CHECKSUM TABLE t2;
+    CHECK TABLE t2 EXTENDED;
+    CHECKSUM TABLE t3;
+    CHECK TABLE t3 EXTENDED;
+    CHECKSUM TABLE t4;
+    CHECK TABLE t4 EXTENDED;
+
+connection default;
+remove_file $MYSQLD_DATADIR/test.ba;
+
+# Once more, with deletion of all rows
+
+    --echo
+    --echo connection backup: Start backup
+    connection backup;
+
+    SET DEBUG_SYNC= 'before_backup_data_prepare SIGNAL bup_sync
+                     WAIT_FOR bup_finish';
+    send BACKUP DATABASE mysqltest TO 'test.ba';
+
+--echo
+--echo connection default: Wait for BACKUP to reach its sync point
+connection default;
+SET DEBUG_SYNC= 'now WAIT_FOR bup_sync';
+
+--echo Modify tables
+delete from t1;
+insert into t1 values(2000,"UJLlk5454j-JK","PferZZZfPMLsn!");
+update t1 set c=repeat("NUS", 4000); # a multi-page blob
+insert into t1 values(200,"UJLlkj-JK","PferfPMLsn!");
+insert into t1 select * from t1;
+insert into t1 select * from t1;
+delete from t1 where a=200;
+CHECKSUM TABLE t1;
+select count(*) from t1;
+delete from t2;
+insert into t2 values(2000,"UJLlk5454j-JK","PferZZZfPMLsn!");
+update t2 set c=repeat("NUS", 4000); # a multi-page blob
+insert into t2 values(200,"UJLlkj-JK","PferfPMLsn!");
+insert into t2 select * from t2;
+insert into t2 select * from t2;
+delete from t2 where a=200;
+CHECKSUM TABLE t2;
+select count(*) from t2;
+delete from t3;
+insert into t3 values(2000,"UJLlk5454j-JK","PferZZZfPMLsn!");
+update t3 set c=repeat("NUS", 4000); # a multi-page blob
+insert into t3 values(200,"UJLlkj-JK","PferfPMLsn!");
+insert into t3 select * from t3;
+insert into t3 select * from t3;
+delete from t3 where a=200;
+CHECKSUM TABLE t3;
+select count(*) from t3;
+delete from t4;
+insert into t4 values(2000,"UJLlk5454j-JK","PferZZZfPMLsn!");
+update t4 set c=repeat("NUS", 4000); # a multi-page blob
+insert into t4 values(200,"UJLlkj-JK","PferfPMLsn!");
+insert into t4 select * from t4;
+insert into t4 select * from t4;
+delete from t4 where a=200;
+CHECKSUM TABLE t4;
+select count(*) from t4;
+
+
+--echo Signal BACKUP to finish
+SET DEBUG_SYNC= 'now SIGNAL bup_finish';
+
+    --echo
+    --echo connection backup: Fetch result
+    connection backup;
+    --replace_column 1 #
+    reap;
+    DROP DATABASE mysqltest;
+
+    --replace_column 1 #
+    RESTORE FROM 'test.ba';
+    CHECKSUM TABLE t1;
+    CHECK TABLE t1 EXTENDED;
+    CHECKSUM TABLE t2;
+    CHECK TABLE t2 EXTENDED;
+    CHECKSUM TABLE t3;
+    CHECK TABLE t3 EXTENDED;
+    CHECKSUM TABLE t4;
+    CHECK TABLE t4 EXTENDED;
+
+connection default;
+drop database mysqltest;
+remove_file $MYSQLD_DATADIR/test.ba;
+
+# test for BUG#42519 "Maria: RESTORE leads to corrupted table and assertion":
+
+create database mysqltest;
+CREATE TABLE mysqltest.t1 (a int, b varchar(100), unique(a)) engine=maria;
+
+--replace_column 1 #
+BACKUP DATABASE mysqltest TO 'test.ba';
+DROP DATABASE mysqltest;
+
+connection backup;
+SET DEBUG_SYNC= 'before_restore_locks_tables SIGNAL wait_for_restore WAIT_FOR finish';
+send RESTORE FROM 'test.ba';
+
+connection default;
+SET DEBUG_SYNC= 'now WAIT_FOR wait_for_restore';
+insert into mysqltest.t1 values(1,"def");
+SET DEBUG_SYNC= 'now SIGNAL finish';
+
+connection backup;
+--replace_column 1 #
+reap;
+
+connection default;
+select * from mysqltest.t1;
+check table mysqltest.t1;
+select * from mysqltest.t1;
+
+--echo
+--echo connection default: cleanup
+connection default;
+drop database mysqltest;
+remove_file $MYSQLD_DATADIR/test.ba;
+SET DEBUG_SYNC= 'RESET';

=== added file 'mysql-test/suite/backup/t/backup_maria_other_instance.test'
--- a/mysql-test/suite/backup/t/backup_maria_other_instance.test	1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/backup/t/backup_maria_other_instance.test	2009-02-13 12:40:13 +0000
@@ -0,0 +1,78 @@
+# Test to verify that auto-zerofilling happens when a table is
+# restored from a different Maria instance
+
+# can't restart in embedded
+--source include/not_embedded.inc
+--source include/have_maria.inc
+
+let $MARIA_LOG=.;
+let $MYSQLD_DATADIR= `select @@datadir`;
+
+--disable_warnings
+drop database if exists mysqltest;
+--error 0,1
+remove_file $MYSQLD_DATADIR/test.ba;
+--enable_warnings
+create database mysqltest;
+let $mms_tname=t;
+
+connect (admin, localhost, root,,mysqltest,,);
+--enable_reconnect
+
+connection default;
+use mysqltest;
+--enable_reconnect
+
+create table t1(a int, unique(a)) engine=maria;
+insert into t1 values(1);
+insert into t1 values(2);
+--replace_column 1 #
+BACKUP DATABASE mysqltest TO 'test.ba';
+
+# This removes the control file, which changes the uuid of the Maria
+# instance, so t1 will appear to have been imported from elsewhere.
+
+-- source include/maria_empty_logs.inc
+
+--replace_column 1 #
+RESTORE FROM 'test.ba' OVERWRITE;
+disable_ps_protocol; # see maria-recover.test
+replace_regex /Table.*t1/t1/ ;
+select * from t1;
+enable_ps_protocol;
+check table t1 extended;
+flush table t1;
+
+# Check that table is auto-zerofilled, movable
+--exec $MARIA_CHK -dv $MYSQLD_DATADIR/mysqltest/t1 >$MYSQLTEST_VARDIR/tmp/mariachk.txt
+perl;
+    use strict;
+    use warnings;
+    my $fname= "$ENV{'MYSQLTEST_VARDIR'}/tmp/mariachk.txt";
+    open(FILE, "<", $fname) or die;
+    my @content= <FILE>;
+    print grep(/Status:.*zerofilled/, @content);
+    print "create_rename_lsn has magic value\n" if grep(/create_rename \(0,0x2\)/, @content);
+    close FILE;
+EOF
+
+# this will attach t1 to the current Maria instance
+insert into t1 values(3);
+check table t1 extended;
+flush table t1;
+
+# Check that table is not zerofilled, not movable
+--exec $MARIA_CHK -dv $MYSQLD_DATADIR/mysqltest/t1 >$MYSQLTEST_VARDIR/tmp/mariachk.txt
+perl;
+    use strict;
+    use warnings;
+    my $fname= "$ENV{'MYSQLTEST_VARDIR'}/tmp/mariachk.txt";
+    open(FILE, "<", $fname) or die;
+    my @content= <FILE>;
+    print grep(/Status:.*(zerofilled|movable)/, @content);
+    print "create_rename_lsn has non-magic value\n" if grep(/create_rename \([0-9]+/, @content);
+    close FILE;
+EOF
+
+drop database mysqltest;
+remove_file $MYSQLD_DATADIR/test.ba;

=== modified file 'mysql-test/suite/backup/t/backup_multi_blocks.test'
--- a/mysql-test/suite/backup/t/backup_multi_blocks.test	2008-10-29 08:45:14 +0000
+++ b/mysql-test/suite/backup/t/backup_multi_blocks.test	2009-02-13 12:40:13 +0000
@@ -11,7 +11,7 @@ DROP DATABASE IF EXISTS mysqltest;
 CREATE DATABASE mysqltest;
 USE mysqltest;
 
-CREATE TABLE t1 (a LONGTEXT) ENGINE=MYISAM;
+CREATE TABLE t1 (a LONGTEXT);
 
 --echo Inserting data ...
 USE mysqltest;

=== modified file 'mysql-test/suite/backup/t/backup_myisam.test'
--- a/mysql-test/suite/backup/t/backup_myisam.test	2009-01-30 13:28:43 +0000
+++ b/mysql-test/suite/backup/t/backup_myisam.test	2009-02-13 12:40:13 +0000
@@ -53,7 +53,7 @@ DROP DATABASE mysql_db1;
 --echo #
 CREATE DATABASE mysqltest;
 USE mysqltest;
-CREATE TABLE `ц╓ц╤ц╪ц÷бёцCKUP DATABASE mysqltest TO 'bup_myisam.bak';
 DROP TABLE `äöüß£å`;

=== modified file 'mysql-test/suite/backup/t/backup_myisam_extlocking.test'
--- a/mysql-test/suite/backup/t/backup_myisam_extlocking.test	2009-01-29 21:17:59 +0000
+++ b/mysql-test/suite/backup/t/backup_myisam_extlocking.test	2009-02-13 12:40:13 +0000
@@ -2,6 +2,7 @@
 # see if --external-locking=1 causes backup to fail as expected
 
 --source include/not_embedded.inc
+--source include/have_myisam_or_maria_default.inc
 
 disable_query_log;
 call mtr.add_suppression("Backup:");
@@ -21,7 +22,7 @@ remove_file $MYSQLD_DATADIR/test.ba;
 
 CREATE DATABASE mysqltest;
 USE mysqltest;
-CREATE TABLE t1 (a int) engine=myisam;
+CREATE TABLE t1 (a int);
 
 --replace_column 1 #
 --error ER_GET_ERRMSG

=== modified file 'mysql-test/suite/backup/t/backup_myisam_sync.test'
--- a/mysql-test/suite/backup/t/backup_myisam_sync.test	2009-01-29 21:17:59 +0000
+++ b/mysql-test/suite/backup/t/backup_myisam_sync.test	2009-02-13 12:40:13 +0000
@@ -2,6 +2,7 @@
 
 --source include/not_embedded.inc
 --source include/have_debug_sync.inc
+--source include/have_myisam_or_maria_default.inc
 
 #
 # Cleanup from former test cases
@@ -25,7 +26,7 @@ USE mysqltest;
 # Create table with long records (causing records' length to be stored
 # in a long format in the backup log).
 #
-CREATE TABLE t1 (c1 LONGTEXT) ENGINE=MyISAM;
+CREATE TABLE t1 (c1 LONGTEXT);
 
     #
     # Create a worker connection, using mysqltest as its default database.
@@ -66,7 +67,6 @@ SET DEBUG_SYNC= 'now SIGNAL bup_finish';
     connection backup;
     --replace_column 1 #
     reap;
-    REPAIR TABLE t1 QUICK;
     DROP DATABASE mysqltest;
 
     --replace_column 1 #
@@ -74,6 +74,7 @@ SET DEBUG_SYNC= 'now SIGNAL bup_finish';
 
     SELECT LENGTH(c1) FROM t1;
     CHECKSUM TABLE t1;
+    CHECK TABLE t1 EXTENDED;
 
     disconnect backup;
 

=== modified file 'mysql-test/suite/backup/t/backup_no_be.test'
--- a/mysql-test/suite/backup/t/backup_no_be.test	2008-12-06 00:24:23 +0000
+++ b/mysql-test/suite/backup/t/backup_no_be.test	2009-02-13 12:40:13 +0000
@@ -1,6 +1,7 @@
 --source include/not_embedded.inc
 --source include/have_debug.inc
 --source include/not_embedded.inc
+--source include/have_myisam_or_maria_default.inc
 
 disable_query_log;
 call mtr.add_suppression("Backup:");
@@ -35,7 +36,7 @@ remove_file $MYSQLD_DATADIR/db1.bak;
 
 CREATE DATABASE db1;
 USE db1;
-CREATE TABLE t1 (a int) ENGINE=Myisam;
+CREATE TABLE t1 (a int);
 INSERT INTO t1 VALUES (1),(2),(3);
 
 # First check the normal behaviour when server is not modified.

=== modified file 'mysql-test/suite/backup/t/backup_no_data.test'
--- a/mysql-test/suite/backup/t/backup_no_data.test	2008-12-06 00:24:23 +0000
+++ b/mysql-test/suite/backup/t/backup_no_data.test	2009-02-13 12:40:13 +0000
@@ -56,7 +56,7 @@ DROP TABLE IF EXISTS t1;
 CREATE TABLE t1 (
   `dir_code` char(4),
   `building` char(6)
-) ENGINE=MYISAM DEFAULT CHARSET=latin1;
+) DEFAULT CHARSET=latin1;
 
 --error 0,1
 --remove_file $MYSQLD_DATADIR/empty_db.bak

=== modified file 'mysql-test/suite/backup/t/backup_no_engine.test'
--- a/mysql-test/suite/backup/t/backup_no_engine.test	2008-10-29 08:45:14 +0000
+++ b/mysql-test/suite/backup/t/backup_no_engine.test	2009-02-13 12:40:13 +0000
@@ -14,7 +14,7 @@ DROP DATABASE IF EXISTS db;
 
 CREATE DATABASE db;
 CREATE TABLE db.t1 (a int, b char(32))
-ENGINE=MYISAM;
+;
 
 # copy description of a table using non-existent storage engine
 copy_file $table_def $MYSQLD_DATADIR/db/t2.frm;

=== modified file 'mysql-test/suite/backup/t/backup_objects.test'
--- a/mysql-test/suite/backup/t/backup_objects.test	2008-10-29 08:45:14 +0000
+++ b/mysql-test/suite/backup/t/backup_objects.test	2009-02-13 12:40:13 +0000
@@ -16,6 +16,7 @@
 
 --source include/have_innodb.inc
 --source include/not_embedded.inc
+--source include/have_myisam_or_maria_default.inc
 
 # Note: BACKUP crashes server when InnoDB engine is used and views are 
 # included in the image (see BUG#34758). Until this issue is resolved we 
@@ -23,15 +24,14 @@
 # foreign key constraints.
 #
 # If tables use MyISAM storage engine the test works but all foreign
-# key constraints are silently ignored. When BUG#34758 is fixed, the line 
-# below should be changed to make InnoDB the default storage engine so that
+# key constraints are silently ignored. When BUG#34758 is fixed, 
+# "SET storage_engine=InnoDB;" should be added just below
+# to make InnoDB the default storage engine so that
 # handling of dependencies introduced by foreign keys can also be tested.
 #
 # Note that there is a separate test backup_fkey testing foreign key
 # constraints.
 
-SET storage_engine=MyISAM;
-
 ##############################################################
 --echo 
 --echo Starting Test - Backup

=== modified file 'mysql-test/suite/backup/t/disabled.def'
--- a/mysql-test/suite/backup/t/disabled.def	2008-12-29 12:06:48 +0000
+++ b/mysql-test/suite/backup/t/disabled.def	2009-02-17 10:45:58 +0000
@@ -14,3 +14,4 @@ backup_triggers_and_events    : Bug#3776
 backup_no_data                : Bug#41008 2008-12-08 alik union.test does not cleanup
 backup_ddl_blocker            : Bug#41008 2008-12-08 alik union.test does not cleanup
 backup_views                  : Bug#41360 2008-12-10 ingo Test fails after merge of main and backup trees
+backup_maria_other_instance   : Bug#42924 2009-02-17 serg

=== modified file 'mysql-test/suite/maria/r/maria-big.result'
--- a/mysql-test/suite/maria/r/maria-big.result	2008-12-10 14:30:52 +0000
+++ b/mysql-test/suite/maria/r/maria-big.result	2009-02-13 16:30:54 +0000
@@ -1,8 +1,6 @@
 set global max_allowed_packet=400000000;
 set storage_engine=maria;
 affected rows: 0
-set global maria_log_file_size=4294967295;
-affected rows: 0
 drop table if exists t1, t2;
 affected rows: 0
 create table t1(a char(3));

=== modified file 'mysql-test/suite/maria/r/maria-big2.result'
--- a/mysql-test/suite/maria/r/maria-big2.result	2008-10-01 12:13:39 +0000
+++ b/mysql-test/suite/maria/r/maria-big2.result	2009-02-12 14:08:56 +0000
@@ -3,3 +3,4 @@ Table	Op	Msg_type	Msg_text
 test.t2	check	status	OK
 Table	Op	Msg_type	Msg_text
 test.t2	check	status	OK
+drop table t2;

=== modified file 'mysql-test/suite/maria/r/maria-connect.result'
--- a/mysql-test/suite/maria/r/maria-connect.result	2008-10-20 09:16:47 +0000
+++ b/mysql-test/suite/maria/r/maria-connect.result	2009-02-13 16:30:54 +0000
@@ -1,6 +1,5 @@
 set global storage_engine=maria;
 set session storage_engine=maria;
-set global maria_log_file_size=4294967295;
 drop table if exists t1;
 SET SQL_WARNINGS=1;
 RESET MASTER;

=== modified file 'mysql-test/suite/maria/r/maria-purge.result'
--- a/mysql-test/suite/maria/r/maria-purge.result	2008-10-01 12:13:39 +0000
+++ b/mysql-test/suite/maria/r/maria-purge.result	2009-02-12 14:08:56 +0000
@@ -38,13 +38,13 @@ set global maria_log_file_size=16777216;
 set global maria_checkpoint_interval=30;
 SHOW ENGINE maria logs;
 Type	Name	Status
-MARIA	master-data/maria_log.00000002	in use
+MARIA	maria_log.00000002	in use
 insert into t2 select * from t1;
 insert into t1 select * from t2;
 set global maria_checkpoint_interval=30;
 SHOW ENGINE maria logs;
 Type	Name	Status
-MARIA	master-data/maria_log.00000004	in use
+MARIA	maria_log.00000004	in use
 set global maria_log_file_size=16777216;
 select @@global.maria_log_file_size;
 @@global.maria_log_file_size
@@ -52,7 +52,7 @@ select @@global.maria_log_file_size;
 set global maria_checkpoint_interval=30;
 SHOW ENGINE maria logs;
 Type	Name	Status
-MARIA	master-data/maria_log.00000004	in use
+MARIA	maria_log.00000004	in use
 set global maria_log_file_size=8388608;
 select @@global.maria_log_file_size;
 @@global.maria_log_file_size
@@ -62,32 +62,32 @@ insert into t1 select * from t2;
 set global maria_checkpoint_interval=30;
 SHOW ENGINE maria logs;
 Type	Name	Status
-MARIA	master-data/maria_log.00000004	free
-MARIA	master-data/maria_log.00000005	free
-MARIA	master-data/maria_log.00000006	free
-MARIA	master-data/maria_log.00000007	free
-MARIA	master-data/maria_log.00000008	in use
+MARIA	maria_log.00000004	free
+MARIA	maria_log.00000005	free
+MARIA	maria_log.00000006	free
+MARIA	maria_log.00000007	free
+MARIA	maria_log.00000008	in use
 flush logs;
 SHOW ENGINE maria logs;
 Type	Name	Status
-MARIA	master-data/maria_log.00000008	in use
+MARIA	maria_log.00000008	in use
 set global maria_log_file_size=16777216;
 set global maria_log_purge_type=external;
 insert into t1 select * from t2;
 set global maria_checkpoint_interval=30;
 SHOW ENGINE maria logs;
 Type	Name	Status
-MARIA	master-data/maria_log.00000008	free
-MARIA	master-data/maria_log.00000009	in use
+MARIA	maria_log.00000008	free
+MARIA	maria_log.00000009	in use
 flush logs;
 SHOW ENGINE maria logs;
 Type	Name	Status
-MARIA	master-data/maria_log.00000008	free
-MARIA	master-data/maria_log.00000009	in use
+MARIA	maria_log.00000008	free
+MARIA	maria_log.00000009	in use
 set global maria_log_purge_type=immediate;
 insert into t1 select * from t2;
 set global maria_checkpoint_interval=30;
 SHOW ENGINE maria logs;
 Type	Name	Status
-MARIA	master-data/maria_log.00000011	in use
+MARIA	maria_log.00000011	in use
 drop table t1, t2;

=== modified file 'mysql-test/suite/maria/r/maria-recover.result'
--- a/mysql-test/suite/maria/r/maria-recover.result	2008-11-12 15:23:22 +0000
+++ b/mysql-test/suite/maria/r/maria-recover.result	2009-02-13 16:30:54 +0000
@@ -35,3 +35,4 @@ select * from t_corrupted2;
 a
 ThursdayMorningsMarket
 drop database mysqltest;
+set global maria_recover=backup;

=== modified file 'mysql-test/suite/maria/r/maria-recovery-rtree-ft.result'
--- a/mysql-test/suite/maria/r/maria-recovery-rtree-ft.result	2008-10-20 09:16:47 +0000
+++ b/mysql-test/suite/maria/r/maria-recovery-rtree-ft.result	2009-02-13 16:30:54 +0000
@@ -156,4 +156,6 @@ mysqltest.t1	check	status	OK
 Checksum-check
 ok
 use mysqltest;
-drop table t1,t2;
+drop database mysqltest_for_feeding_recovery;
+drop database mysqltest_for_comparison;
+drop database mysqltest;

=== modified file 'mysql-test/suite/maria/r/maria-recovery3.result'
--- a/mysql-test/suite/maria/r/maria-recovery3.result	2008-12-05 21:11:46 +0000
+++ b/mysql-test/suite/maria/r/maria-recovery3.result	2009-02-12 14:08:56 +0000
@@ -25,5 +25,70 @@ Checksum-check
 ok
 use mysqltest;
 drop table t1;
+* TEST of logging of BLOBs
+CREATE TABLE `t1` (
+`blob` blob,
+`blob_key` blob
+) ENGINE=maria ROW_FORMAT=page
+;
+* copied t1 for feeding_recovery
+* compared t1 to old version
+set global maria_checkpoint_interval=0;
+INSERT INTO `t1` VALUES (NULL,repeat('A',5198));
+INSERT INTO `t1` VALUES (NULL,repeat('B',65535));
+INSERT INTO `t1` VALUES (repeat('K',5198),repeat('L',2325));
+INSERT INTO `t1` VALUES (repeat('C',65535),NULL);
+INSERT INTO `t1` VALUES (NULL,repeat('D',65535));
+INSERT INTO `t1` VALUES (repeat('E',65535),repeat('F',16111));
+INSERT INTO `t1` VALUES (repeat('G',65535),repeat('H',65535));
+INSERT INTO `t1` VALUES (repeat('I',5198),repeat('J',65535));
+check table t1 extended;
+Table	Op	Msg_type	Msg_text
+mysqltest.t1	check	status	OK
+flush table t1;
+* copied t1 for comparison
+* compared t1 to old version
+SET SESSION debug="+d,maria_flush_whole_log,maria_crash";
+* crashing mysqld intentionally
+set global maria_checkpoint_interval=1;
+ERROR HY000: Lost connection to MySQL server during query
+* copied t1 back for feeding_recovery
+* recovery happens
+check table t1 extended;
+Table	Op	Msg_type	Msg_text
+mysqltest.t1	check	status	OK
+* testing that checksum after recovery is as expected
+Checksum-check
+ok
+* compared t1 to old version
+use mysqltest;
+drop table t1;
+create table t1 engine=maria select 1;
+* copied t1 for feeding_recovery
+set global maria_checkpoint_interval=0;
+insert into t1 values(2);
+truncate table t1;
+flush table t1;
+* copied t1 for comparison
+truncate table t1;
+SET SESSION debug="+d,maria_flush_whole_log,maria_crash_create_table";
+* crashing mysqld intentionally
+truncate table t1;
+ERROR HY000: Lost connection to MySQL server during query
+* recovery happens
+check table t1 extended;
+Table	Op	Msg_type	Msg_text
+mysqltest.t1	check	warning	Size of indexfile is: 372      Should be: 8192
+mysqltest.t1	check	status	OK
+* testing that checksum after recovery is as expected
+Checksum-check
+ok
+use mysqltest;
+truncate table t1;
+check table t1 extended;
+Table	Op	Msg_type	Msg_text
+mysqltest.t1	check	status	OK
+drop table t1;
+drop database mysqltest_for_feeding_recovery;
 drop database mysqltest_for_comparison;
 drop database mysqltest;

=== modified file 'mysql-test/suite/maria/r/maria.result'
--- a/mysql-test/suite/maria/r/maria.result	2008-12-14 11:36:15 +0000
+++ b/mysql-test/suite/maria/r/maria.result	2009-02-13 16:30:54 +0000
@@ -2132,7 +2132,7 @@ c3 VARCHAR(10) NOT NULL,
 KEY (c1),
 KEY (c2)
 ) ENGINE=maria DEFAULT CHARSET=utf8 PACK_KEYS=0;
-MARIA file:          MYSQLTEST_VARDIR/master-data/test/t1
+MARIA file:          MYSQLD_DATADIR/test/t1
 Record format:       Block
 Crashsafe:           yes
 Character set:       utf8_general_ci (45)
@@ -2594,3 +2594,13 @@ flush tables with read lock;
 ERROR HY000: Can't execute the given command because you have active locked tables or an active transaction
 unlock tables;
 drop table t1, t2;
+create table t1(a int primary key, b blob, c blob) engine=maria;
+insert into t1 values(1,repeat('a',100), repeat('b',657860));
+Warnings:
+Warning	1265	Data truncated for column 'c' at row 1
+insert into t1 values(1,repeat('a',100), repeat('b',657860));
+ERROR 23000: Duplicate entry '1' for key 'PRIMARY'
+check table t1;
+Table	Op	Msg_type	Msg_text
+test.t1	check	status	OK
+drop table t1;

=== renamed file 'mysql-test/r/maria_mrr.result' => 'mysql-test/suite/maria/r/maria_mrr.result'
=== modified file 'mysql-test/suite/maria/r/maria_notembedded.result'
--- a/mysql-test/suite/maria/r/maria_notembedded.result	2008-11-27 15:18:17 +0000
+++ b/mysql-test/suite/maria/r/maria_notembedded.result	2009-02-13 16:30:54 +0000
@@ -30,9 +30,27 @@ insert t1 values (2);
 lock table t1 write concurrent;
 insert t1 values (3);
 insert t1 values (2);
+lock table t1 write concurrent;
+insert t1 values (4);
 insert t1 values (3);
+lock table t1 write concurrent;
+insert t1 values (5);
+insert t1 values (4);
+lock table t1 write concurrent;
+insert t1 values (6);
+insert t1 values (5);
+insert t1 values (6);
 ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
 unlock tables;
 ERROR 23000: Duplicate entry '2' for key 'a'
 unlock tables;
+ERROR 23000: Duplicate entry '3' for key 'a'
+unlock tables;
+ERROR 23000: Duplicate entry '4' for key 'a'
+unlock tables;
+ERROR 23000: Duplicate entry '5' for key 'a'
+unlock tables;
+check table t1;
+Table	Op	Msg_type	Msg_text
+test.t1	check	status	OK
 drop table t1;

=== modified file 'mysql-test/suite/maria/r/maria_partition.result'
--- a/mysql-test/suite/maria/r/maria_partition.result	2008-10-15 12:44:31 +0000
+++ b/mysql-test/suite/maria/r/maria_partition.result	2009-02-12 14:08:56 +0000
@@ -1,7 +1,6 @@
 set global storage_engine=maria;
 set session storage_engine=maria;
 set global maria_page_checksum=0;
-set global maria_log_file_size=4294967295;
 drop table if exists t1,t2;
 drop view if exists v1;
 SET SQL_WARNINGS=1;
@@ -10,3 +9,27 @@ insert into t1 values (1);
 alter table t1 partition by list (s1) (partition p1 values in (2));
 ERROR HY000: Table has no partition for value 1
 drop table t1;
+create table t2(a blob) engine=maria;
+create table t1(a int primary key) engine=maria;
+insert into t2 values ('foo'),('bar');
+select * from t2 left join t1 on (t2.a=t1.a) where t2.a='bbb';
+a	a
+insert into t1 values (1);
+select * from t2 left join t1 on (t2.a=t1.a) where t2.a='bbb';
+a	a
+insert into t1 values (2);
+select * from t2 left join t1 on (t2.a=t1.a) where t2.a='bbb';
+a	a
+drop table t1,t2;
+create table t2(a blob);
+create table t1(a int primary key) PARTITION BY HASH (a) PARTITIONS 2;
+insert into t2 values ('foo'),('bar');
+select * from t2 left join t1 on (t2.a=t1.a) where t2.a='bbb';
+a	a
+insert into t1 values (1);
+select * from t2 left join t1 on (t2.a=t1.a) where t2.a='bbb';
+a	a
+insert into t1 values (2);
+select * from t2 left join t1 on (t2.a=t1.a) where t2.a='bbb';
+a	a
+drop table t1,t2;

=== modified file 'mysql-test/suite/maria/r/ps_maria.result'
--- a/mysql-test/suite/maria/r/ps_maria.result	2008-10-20 12:42:30 +0000
+++ b/mysql-test/suite/maria/r/ps_maria.result	2009-02-16 21:18:45 +0000
@@ -1,4 +1,3 @@
-set global maria_log_file_size=4294967295;
 use test;
 drop table if exists t1, t9 ;
 create table t1
@@ -1304,11 +1303,12 @@ a	b
 set @arg00=NULL;
 set @arg01=2;
 execute stmt1 using @arg00, @arg01;
-ERROR 23000: Column 'a' cannot be null
+Warnings:
+Warning	1048	Column 'a' cannot be null
 select a,b from t1 order by a;
 a	b
+0	two
 1	one
-2	two
 3	three
 4	four
 set @arg00=0;

=== added file 'mysql-test/suite/maria/t/disabled.def'
--- a/mysql-test/suite/maria/t/disabled.def	1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/maria/t/disabled.def	2009-02-17 10:45:58 +0000
@@ -0,0 +1 @@
+maria-recovery-bitmap : BUG#42180

=== modified file 'mysql-test/suite/maria/t/maria-autozerofill.test'
--- a/mysql-test/suite/maria/t/maria-autozerofill.test	2008-11-12 15:23:22 +0000
+++ b/mysql-test/suite/maria/t/maria-autozerofill.test	2009-02-13 16:30:54 +0000
@@ -24,8 +24,8 @@ create table t1(a int) engine=maria;
 insert into t1 values(1);
 flush table t1;
 # Check that table is not zerofilled, not movable
-let $MYSQLTEST_DATADIR= `select @@datadir`;
---exec $MARIA_CHK -dv $MYSQLTEST_DATADIR/mysqltest/t1 >$MYSQLTEST_VARDIR/tmp/mariachk.txt
+let $MYSQLD_DATADIR= `select @@datadir`;
+--exec $MARIA_CHK -dv $MYSQLD_DATADIR/mysqltest/t1 >$MYSQLTEST_VARDIR/tmp/mariachk.txt
 perl;
     use strict;
     use warnings;
@@ -49,7 +49,7 @@ enable_ps_protocol;
 flush table t1;
 
 # Check that table is auto-zerofilled, movable
---exec $MARIA_CHK -dv $MYSQLTEST_DATADIR/mysqltest/t1 >$MYSQLTEST_VARDIR/tmp/mariachk.txt
+--exec $MARIA_CHK -dv $MYSQLD_DATADIR/mysqltest/t1 >$MYSQLTEST_VARDIR/tmp/mariachk.txt
 perl;
     use strict;
     use warnings;
@@ -66,7 +66,7 @@ insert into t1 values(2);
 flush table t1;
 
 # Check that table is not zerofilled, not movable
---exec $MARIA_CHK -dv $MYSQLTEST_DATADIR/mysqltest/t1 >$MYSQLTEST_VARDIR/tmp/mariachk.txt
+--exec $MARIA_CHK -dv $MYSQLD_DATADIR/mysqltest/t1 >$MYSQLTEST_VARDIR/tmp/mariachk.txt
 perl;
     use strict;
     use warnings;

=== modified file 'mysql-test/suite/maria/t/maria-big.test'
--- a/mysql-test/suite/maria/t/maria-big.test	2008-12-10 14:30:52 +0000
+++ b/mysql-test/suite/maria/t/maria-big.test	2009-02-13 16:30:54 +0000
@@ -2,6 +2,7 @@
 --source include/have_maria.inc
 --source include/big_test.inc
 
+let $default_max_allowed_packet=`select @@global.max_allowed_packet`;
 set global max_allowed_packet=400000000;
 # need new session to use setting above
 connect (root,localhost,root,,test,$MASTER_MYPORT,$MASTER_MYSOCK);
@@ -9,7 +10,6 @@ connection root;
 
 enable_info;
 set storage_engine=maria;
-set global maria_log_file_size=4294967295;
 disable_warnings;
 drop table if exists t1, t2;
 enable_warnings;
@@ -64,3 +64,8 @@ select a,length(b) from t1;
 check table t1;
 
 drop table t1;
+--disable_result_log
+--disable_query_log
+eval set global max_allowed_packet=$default_max_allowed_packet;
+--enable_result_log
+--enable_query_log

=== modified file 'mysql-test/suite/maria/t/maria-big2.test'
--- a/mysql-test/suite/maria/t/maria-big2.test	2008-10-01 12:13:39 +0000
+++ b/mysql-test/suite/maria/t/maria-big2.test	2009-02-12 14:08:56 +0000
@@ -4074,3 +4074,4 @@ insert into t2(a,b) values ('mozkakabudl
 ;
 }
 enable_query_log;
+drop table t2;

=== modified file 'mysql-test/suite/maria/t/maria-connect.test'
--- a/mysql-test/suite/maria/t/maria-connect.test	2008-10-20 09:16:47 +0000
+++ b/mysql-test/suite/maria/t/maria-connect.test	2009-02-13 16:30:54 +0000
@@ -9,8 +9,6 @@ let $default=`select @@global.storage_en
 set global storage_engine=maria;
 set session storage_engine=maria;
 
-set global maria_log_file_size=4294967295;
-
 # Initialise
 --disable_warnings
 drop table if exists t1;

=== added file 'mysql-test/suite/maria/t/maria-preload-master.opt'
--- a/mysql-test/suite/maria/t/maria-preload-master.opt	1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/maria/t/maria-preload-master.opt	2009-01-12 12:08:06 +0000
@@ -0,0 +1 @@
+--skip-safemalloc

=== modified file 'mysql-test/suite/maria/t/maria-purge.test'
--- a/mysql-test/suite/maria/t/maria-purge.test	2008-10-20 09:16:47 +0000
+++ b/mysql-test/suite/maria/t/maria-purge.test	2009-02-13 16:30:54 +0000
@@ -65,19 +65,19 @@ insert into t1 select * from t2;
 set global maria_log_file_size=16777216;
 # force a checkpoint to allow log purge
 eval set global maria_checkpoint_interval=$def_checkinterval;
---replace_regex /Size +[0-9]+ ; .+master-data/master-data/
+--replace_regex /Size +[0-9]+ ; .+maria_log/maria_log/
 SHOW ENGINE maria logs;
 
 insert into t2 select * from t1;
 insert into t1 select * from t2;
 
 eval set global maria_checkpoint_interval=$def_checkinterval;
---replace_regex /Size +[0-9]+ ; .+master-data/master-data/
+--replace_regex /Size +[0-9]+ ; .+maria_log/maria_log/
 SHOW ENGINE maria logs;
 set global maria_log_file_size=16777216;
 select @@global.maria_log_file_size;
 eval set global maria_checkpoint_interval=$def_checkinterval;
---replace_regex /Size +[0-9]+ ; .+master-data/master-data/
+--replace_regex /Size +[0-9]+ ; .+maria_log/maria_log/
 SHOW ENGINE maria logs;
 set global maria_log_file_size=8388608;
 select @@global.maria_log_file_size;
@@ -85,26 +85,26 @@ select @@global.maria_log_file_size;
 set global maria_log_purge_type=at_flush;
 insert into t1 select * from t2;
 eval set global maria_checkpoint_interval=$def_checkinterval;
---replace_regex /Size +[0-9]+ ; .+master-data/master-data/
+--replace_regex /Size +[0-9]+ ; .+maria_log/maria_log/
 SHOW ENGINE maria logs;
 flush logs;
---replace_regex /Size +[0-9]+ ; .+master-data/master-data/
+--replace_regex /Size +[0-9]+ ; .+maria_log/maria_log/
 SHOW ENGINE maria logs;
 
 set global maria_log_file_size=16777216;
 set global maria_log_purge_type=external;
 insert into t1 select * from t2;
 eval set global maria_checkpoint_interval=$def_checkinterval;
---replace_regex /Size +[0-9]+ ; .+master-data/master-data/
+--replace_regex /Size +[0-9]+ ; .+maria_log/maria_log/
 SHOW ENGINE maria logs;
 flush logs;
---replace_regex /Size +[0-9]+ ; .+master-data/master-data/
+--replace_regex /Size +[0-9]+ ; .+maria_log/maria_log/
 SHOW ENGINE maria logs;
 
 set global maria_log_purge_type=immediate;
 insert into t1 select * from t2;
 eval set global maria_checkpoint_interval=$def_checkinterval;
---replace_regex /Size +[0-9]+ ; .+master-data/master-data/
+--replace_regex /Size +[0-9]+ ; .+maria_log/maria_log/
 SHOW ENGINE maria logs;
 
 drop table t1, t2;

=== modified file 'mysql-test/suite/maria/t/maria-recover.test'
--- a/mysql-test/suite/maria/t/maria-recover.test	2008-11-12 15:23:22 +0000
+++ b/mysql-test/suite/maria/t/maria-recover.test	2009-02-13 16:30:54 +0000
@@ -62,3 +62,4 @@ enable_ps_protocol;
 select * from t_corrupted2; # should show just rows
 
 drop database mysqltest;
+set global maria_recover=backup;

=== modified file 'mysql-test/suite/maria/t/maria-recovery-rtree-ft.test'
--- a/mysql-test/suite/maria/t/maria-recovery-rtree-ft.test	2008-10-20 09:16:47 +0000
+++ b/mysql-test/suite/maria/t/maria-recovery-rtree-ft.test	2009-02-13 16:30:54 +0000
@@ -208,4 +208,10 @@ select count(*) from t2;
 }
 
 -- source include/maria_verify_recovery.inc
-drop table t1,t2;
+
+# clean up everything
+let $mms_purpose=feeding_recovery;
+eval drop database mysqltest_for_$mms_purpose;
+let $mms_purpose=comparison;
+eval drop database mysqltest_for_$mms_purpose;
+drop database mysqltest;

=== modified file 'mysql-test/suite/maria/t/maria-recovery3.test'
--- a/mysql-test/suite/maria/t/maria-recovery3.test	2008-12-05 21:11:46 +0000
+++ b/mysql-test/suite/maria/t/maria-recovery3.test	2009-02-12 14:08:56 +0000
@@ -6,7 +6,7 @@
 --source include/have_maria.inc
 
 set global maria_log_file_size=4294967295;
-let $MARIA_LOG=../tmp;
+let $MARIA_LOG=../../tmp;
 
 --disable_warnings
 drop database if exists mysqltest;
@@ -39,7 +39,6 @@ let $mvr_restore_old_snapshot=0;
 # UNDO phase prevents physical comparison, normally,
 # so we'll only use checksums to compare.
 let $mms_compare_physically=0;
-let $mvr_crash_statement= set global maria_checkpoint_interval=1;
 create table t1(a int primary key) engine=maria;
 insert into t1 values(1);
 -- source include/maria_make_snapshot_for_comparison.inc
@@ -65,7 +64,55 @@ drop table t1;
 # before checkpoint happens, test should still pass (though it won't
 # reproduce the conditions of the bug).
 
+# Test for BUG#41493 Maria: two recovery failures (wrong logging of BLOB pages)
+--echo * TEST of logging of BLOBs
+let $mvr_restore_old_snapshot=1;
+let $mms_compare_physically=1;
+CREATE TABLE `t1` (
+`blob` blob,
+`blob_key` blob
+) ENGINE=maria ROW_FORMAT=page
+;
+-- source include/maria_make_snapshot_for_feeding_recovery.inc
+set global maria_checkpoint_interval=0; # no checkpoints
+INSERT INTO `t1` VALUES (NULL,repeat('A',5198));
+INSERT INTO `t1` VALUES (NULL,repeat('B',65535));
+INSERT INTO `t1` VALUES (repeat('K',5198),repeat('L',2325));
+INSERT INTO `t1` VALUES (repeat('C',65535),NULL);
+INSERT INTO `t1` VALUES (NULL,repeat('D',65535));
+INSERT INTO `t1` VALUES (repeat('E',65535),repeat('F',16111));
+INSERT INTO `t1` VALUES (repeat('G',65535),repeat('H',65535));
+INSERT INTO `t1` VALUES (repeat('I',5198),repeat('J',65535));
+check table t1 extended;
+-- source include/maria_make_snapshot_for_comparison.inc
+-- source include/maria_verify_recovery.inc
+drop table t1;
+
+# Test for BUG#42112 "Maria: recovery failure (pushbuild2) Assertion
+# `rownr == 0 && new_page' failed"
+
+let $mvr_restore_old_snapshot=0;
+let $mms_compare_physically=0;
+create table t1 engine=maria select 1;
+-- source include/maria_make_snapshot_for_feeding_recovery.inc
+set global maria_checkpoint_interval=0; # no checkpoints
+insert into t1 values(2);
+truncate table t1;
+-- source include/maria_make_snapshot_for_comparison.inc
+let $mvr_crash_statement= truncate table t1;
+let $mvr_debug_option="+d,maria_flush_whole_log,maria_crash_create_table";
+truncate table t1;
+-- source include/maria_verify_recovery.inc
+# Table is bad but at least Recovery didn't crash and a new truncate
+# can succeed:
+truncate table t1;
+check table t1 extended;
+drop table t1;
+
+
 # clean up everything
+let $mms_purpose=feeding_recovery;
+eval drop database mysqltest_for_$mms_purpose;
 let $mms_purpose=comparison;
 eval drop database mysqltest_for_$mms_purpose;
 drop database mysqltest;

=== modified file 'mysql-test/suite/maria/t/maria.test'
--- a/mysql-test/suite/maria/t/maria.test	2008-11-27 15:18:17 +0000
+++ b/mysql-test/suite/maria/t/maria.test	2009-02-13 16:30:54 +0000
@@ -12,6 +12,7 @@ let $default_checksum=`select @@global.m
 set global storage_engine=maria;
 set session storage_engine=maria;
 set global maria_page_checksum=0;
+let $default_log_file_size=`select @@global.maria_log_file_size`;
 set global maria_log_file_size=4294967295;
 
 # Initialise
@@ -1407,8 +1408,9 @@ CREATE TABLE t1 (
   KEY (c1),
   KEY (c2)
 ) ENGINE=maria DEFAULT CHARSET=utf8 PACK_KEYS=0;
---replace_result $MYSQLTEST_VARDIR MYSQLTEST_VARDIR
---exec $MARIA_CHK -d $MYSQLTEST_VARDIR/master-data/test/t1
+let $MYSQLD_DATADIR= `select @@datadir`;
+--replace_result $MYSQLD_DATADIR MYSQLD_DATADIR
+--exec $MARIA_CHK -d $MYSQLD_DATADIR/test/t1
 DROP TABLE t1;
 
 # Test warnings with transactional=1 with MyISAM
@@ -1870,10 +1872,23 @@ unlock tables;
 drop table t1, t2;
 
 #
+# Bug #40311
+# Crash when aborting inserting of row with 2 blobs where first is short
+#
+
+create table t1(a int primary key, b blob, c blob) engine=maria;
+insert into t1 values(1,repeat('a',100), repeat('b',657860));
+--error ER_DUP_ENTRY
+insert into t1 values(1,repeat('a',100), repeat('b',657860));
+check table t1;
+drop table t1;
+
 # Set defaults back
 #
 --disable_result_log
 --disable_query_log
-eval set global storage_engine=$default_engine, maria_page_checksum=$default_checksum;
+eval set global storage_engine=$default_engine,
+maria_page_checksum=$default_checksum,
+maria_log_file_size=$default_log_file_size;
 --enable_result_log
 --enable_query_log

=== modified file 'mysql-test/suite/maria/t/maria3.test'
--- a/mysql-test/suite/maria/t/maria3.test	2008-12-10 14:30:52 +0000
+++ b/mysql-test/suite/maria/t/maria3.test	2009-02-13 16:30:54 +0000
@@ -7,6 +7,7 @@ let $default_checksum=`select @@global.m
 set global storage_engine=maria;
 set session storage_engine=maria;
 set global maria_page_checksum=0;
+let $default_log_file_size=`select @@global.maria_log_file_size`;
 set global maria_log_file_size=4294967295;
 
 # Initialise
@@ -481,6 +482,8 @@ drop table t1, t2;
 
 --disable_result_log
 --disable_query_log
-eval set global storage_engine=$default_engine, maria_page_checksum=$default_checksum;
+eval set global storage_engine=$default_engine,
+maria_page_checksum=$default_checksum,
+maria_log_file_size=$default_log_file_size;
 --enable_result_log
 --enable_query_log

=== renamed file 'mysql-test/t/maria_mrr.test' => 'mysql-test/suite/maria/t/maria_mrr.test'
=== modified file 'mysql-test/suite/maria/t/maria_notembedded.test'
--- a/mysql-test/suite/maria/t/maria_notembedded.test	2008-11-27 15:18:17 +0000
+++ b/mysql-test/suite/maria/t/maria_notembedded.test	2009-02-13 16:30:54 +0000
@@ -33,27 +33,65 @@ drop table t1;
 #
 create table t1 (a int unique) transactional=1;
 insert t1 values (1);
+
 lock table t1 write concurrent;
 insert t1 values (2);
-connect(con_d,localhost,root,,);
+
+connect(con_a,localhost,root,,);
 lock table t1 write concurrent;
 insert t1 values (3);
 send insert t1 values (2);
+
+connect(con_b,localhost,root,,);
+lock table t1 write concurrent;
+insert t1 values (4);
+send insert t1 values (3);
+
+connect(con_c,localhost,root,,);
+lock table t1 write concurrent;
+insert t1 values (5);
+send insert t1 values (4);
+
+connect(con_d,localhost,root,,);
+lock table t1 write concurrent;
+insert t1 values (6);
+send insert t1 values (5);
+
 connection default;
-let $wait_condition=select count(*) = 1 from information_schema.processlist where state="waiting for a resource";
+let $wait_condition=select count(*) = 4 from information_schema.processlist where state="waiting for a resource";
 --source include/wait_condition.inc
 --error ER_LOCK_DEADLOCK
-insert t1 values (3);
+insert t1 values (6);
 unlock tables;
+
+connection con_a;
+--error ER_DUP_ENTRY
+reap;
+unlock tables;
+disconnect con_a;
+
+connection con_b;
+--error ER_DUP_ENTRY
+reap;
+unlock tables;
+disconnect con_b;
+
+connection con_c;
+--error ER_DUP_ENTRY
+reap;
+unlock tables;
+disconnect con_c;
+
 connection con_d;
 --error ER_DUP_ENTRY
 reap;
 unlock tables;
 disconnect con_d;
+
 connection default;
+check table t1;
 drop table t1;
 
-
 --disable_result_log
 --disable_query_log
 eval set session storage_engine=$default_engine;

=== modified file 'mysql-test/suite/maria/t/maria_partition.test'
--- a/mysql-test/suite/maria/t/maria_partition.test	2008-10-15 12:44:31 +0000
+++ b/mysql-test/suite/maria/t/maria_partition.test	2009-02-12 14:08:56 +0000
@@ -8,7 +8,6 @@ let $default_checksum=`select @@global.m
 set global storage_engine=maria;
 set session storage_engine=maria;
 set global maria_page_checksum=0;
-set global maria_log_file_size=4294967295;
 
 # Initialise
 --disable_warnings
@@ -27,6 +26,29 @@ insert into t1 values (1);
 alter table t1 partition by list (s1) (partition p1 values in (2));
 drop table t1;
 
+#
+# Test outer join const propagation
+#
+create table t2(a blob) engine=maria;
+create table t1(a int primary key) engine=maria;
+insert into t2 values ('foo'),('bar');
+select * from t2 left join t1 on (t2.a=t1.a) where t2.a='bbb';
+insert into t1 values (1);
+select * from t2 left join t1 on (t2.a=t1.a) where t2.a='bbb';
+insert into t1 values (2);
+select * from t2 left join t1 on (t2.a=t1.a) where t2.a='bbb';
+drop table t1,t2;
+
+create table t2(a blob);
+create table t1(a int primary key) PARTITION BY HASH (a) PARTITIONS 2;
+insert into t2 values ('foo'),('bar');
+select * from t2 left join t1 on (t2.a=t1.a) where t2.a='bbb';
+insert into t1 values (1);
+select * from t2 left join t1 on (t2.a=t1.a) where t2.a='bbb';
+insert into t1 values (2);
+select * from t2 left join t1 on (t2.a=t1.a) where t2.a='bbb';
+drop table t1,t2;
+
 # Set defaults back
 --disable_result_log
 --disable_query_log

=== modified file 'mysql-test/suite/maria/t/maria_showlog_error.test'
--- a/mysql-test/suite/maria/t/maria_showlog_error.test	2008-12-05 09:15:23 +0000
+++ b/mysql-test/suite/maria/t/maria_showlog_error.test	2009-02-12 14:08:56 +0000
@@ -18,7 +18,8 @@ connection default;
 
 connection default;
 
-remove_file $MYSQLTEST_VARDIR/master-data/$MARIA_LOG/maria_log.00000001;
+let MYSQLD_DATADIR= `select @@datadir`;
+remove_file $MYSQLD_DATADIR/$MARIA_LOG/maria_log.00000001;
 --replace_regex /Size unknown ; .*maria_log.00000001/Size unknown ; maria_log.00000001/
 show engine maria logs;
 

=== modified file 'mysql-test/suite/maria/t/ps_maria.test'
--- a/mysql-test/suite/maria/t/ps_maria.test	2008-10-01 12:13:39 +0000
+++ b/mysql-test/suite/maria/t/ps_maria.test	2009-02-12 14:08:56 +0000
@@ -9,7 +9,6 @@
 #       BEFORE ADDING NEW TEST CASES HERE !!!
 
 -- source include/have_maria.inc
-set global maria_log_file_size=4294967295;
 
 use test;
 

=== modified file 'mysql-test/suite/rpl_ndb/t/disabled.def'
--- a/mysql-test/suite/rpl_ndb/t/disabled.def	2009-02-11 12:11:20 +0000
+++ b/mysql-test/suite/rpl_ndb/t/disabled.def	2009-02-16 18:11:41 +0000
@@ -10,6 +10,5 @@
 #
 ##############################################################################
 
-rpl_ndb_innodb2ndb       : Bug#34725 2008-02-26 hakank Currently failing.
-rpl_ndb_extraCol	: Bug#41369 2008-12-10 alik
-rpl_ndb_circular_2ch     : Bug #42396 2009-01-28 andrei test fails auto-inc insert
+rpl_ndb_extraCol	 : Bug#41369 2008-12-10 alik
+rpl_ndb_circular_2ch     : Bug#42396 2009-01-28 andrei test fails auto-inc insert

=== modified file 'mysql-test/t/ctype_utf8.test'
--- a/mysql-test/t/ctype_utf8.test	2008-11-21 13:38:27 +0000
+++ b/mysql-test/t/ctype_utf8.test	2009-02-13 16:30:54 +0000
@@ -164,7 +164,7 @@ drop table t1;
 # UTF8 breaks primary keys for cols > 333 characters
 #
 --error 1071
-create table t1 (a text character set utf8, primary key(a(360)));
+create table t1 (a text character set utf8, primary key(a(371)));
 
 
 #

=== modified file 'mysys/lf_alloc-pin.c'
--- a/mysys/lf_alloc-pin.c	2008-10-07 16:49:01 +0000
+++ b/mysys/lf_alloc-pin.c	2009-01-15 21:27:36 +0000
@@ -1,5 +1,5 @@
 /* QQ: TODO multi-pinbox */
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -330,7 +330,7 @@ static void _lf_pinbox_real_free(LF_PINS
 {
   int npins, alloca_size;
   void *list, **addr;
-  uchar *first, *last= NULL;
+  void *first, *last= NULL;
   LF_PINBOX *pinbox= pins->pinbox;
 
   LINT_INIT(first);

=== modified file 'mysys/lf_hash.c'
--- a/mysys/lf_hash.c	2009-01-27 02:08:48 +0000
+++ b/mysys/lf_hash.c	2009-02-13 16:30:54 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -42,6 +42,8 @@ typedef struct {
   */
 } LF_SLIST;
 
+const int LF_HASH_OVERHEAD= sizeof(LF_SLIST);
+
 /*
   a structure to pass the context (pointers two the three successive elements
   in a list) from lfind to linsert/ldelete
@@ -315,7 +317,6 @@ void lf_hash_init(LF_HASH *hash, uint el
                   uint key_offset, uint key_length, my_hash_get_key get_key,
                   CHARSET_INFO *charset)
 {
-  compile_time_assert(sizeof(LF_SLIST) == LF_HASH_OVERHEAD);
   lf_alloc_init(&hash->alloc, sizeof(LF_SLIST)+element_size,
                 offsetof(LF_SLIST, key));
   lf_dynarray_init(&hash->array, sizeof(LF_SLIST *));

=== modified file 'mysys/my_static.c'
--- a/mysys/my_static.c	2008-10-31 18:02:34 +0000
+++ b/mysys/my_static.c	2009-02-13 16:30:54 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2000 MySQL AB
+/* Copyright (C) 2000-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -101,6 +101,7 @@ static const char *proc_info_dummy(void 
   return 0;
 }
 
+/* this is to be able to call set_thd_proc_info from the C code */
 const char *(*proc_info_hook)(void *, const char *, const char *, const char *,
                               const unsigned int)= proc_info_dummy;
 

=== modified file 'mysys/my_thr_init.c'
--- a/mysys/my_thr_init.c	2009-01-26 19:15:24 +0000
+++ b/mysys/my_thr_init.c	2009-02-13 16:30:54 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2000 MySQL AB
+/* Copyright (C) 2000-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -336,7 +336,8 @@ my_bool my_thread_init(void)
                         0);
   pthread_cond_init(&tmp->suspend, NULL);
 
-  tmp->stack_ends_here= &tmp + STACK_DIRECTION * my_thread_stack_size;
+  tmp->stack_ends_here= (char*)&tmp +
+                         STACK_DIRECTION * (long)my_thread_stack_size;
 
   pthread_mutex_lock(&THR_LOCK_threads);
   tmp->id= ++thread_id;

=== modified file 'mysys/thr_lock.c'
--- a/mysys/thr_lock.c	2009-02-12 17:56:03 +0000
+++ b/mysys/thr_lock.c	2009-02-16 21:18:45 +0000
@@ -516,7 +516,8 @@ wait_for_lock(struct st_lock_list *wait,
   {
     result= THR_LOCK_SUCCESS;
     if (data->lock->get_status)
-      (*data->lock->get_status)(data->status_param, 0);
+      (*data->lock->get_status)(data->status_param,
+                                data->type == TL_WRITE_CONCURRENT_INSERT);
     check_locks(data->lock,"got wait_for_lock",0);
   }
   pthread_mutex_unlock(&data->lock->mutex);

=== modified file 'mysys/thr_mutex.c'
--- a/mysys/thr_mutex.c	2009-01-27 02:08:48 +0000
+++ b/mysys/thr_mutex.c	2009-02-13 16:30:54 +0000
@@ -745,15 +745,23 @@ static void print_deadlock_warning(safe_
   fprintf(stderr, "safe_mutex: Found wrong usage of mutex "
           "'%s' and '%s'\n",
           parent_mutex->name, new_mutex->name);
+  DBUG_PRINT("info", ("safe_mutex: Found wrong usage of mutex "
+                      "'%s' and '%s'",
+                      parent_mutex->name, new_mutex->name));
   fprintf(stderr, "Mutex currently locked (in reverse order):\n");
+  DBUG_PRINT("info", ("Mutex currently locked (in reverse order):"));
   fprintf(stderr, "%-32.32s  %s  line %u\n", new_mutex->name, new_mutex->file,
           new_mutex->line);
+  DBUG_PRINT("info", ("%-32.32s  %s  line %u\n", new_mutex->name,
+                      new_mutex->file, new_mutex->line));
   for (mutex_root= *my_thread_var_mutex_in_use() ;
        mutex_root;
        mutex_root= mutex_root->next)
   {
     fprintf(stderr, "%-32.32s  %s  line %u\n", mutex_root->name,
             mutex_root->file, mutex_root->line);
+    DBUG_PRINT("info", ("%-32.32s  %s  line %u", mutex_root->name,
+                        mutex_root->file, mutex_root->line));
   }
   fflush(stderr);
   DBUG_VOID_RETURN;

=== modified file 'mysys/waiting_threads.c'
--- a/mysys/waiting_threads.c	2008-12-17 18:40:14 +0000
+++ b/mysys/waiting_threads.c	2009-02-13 16:30:54 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2008 MySQL AB
+/* Copyright (C) 2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -13,74 +13,134 @@
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
 
-/*
+/**
+  @file
+
   "waiting threads" subsystem - a unified interface for threads to wait
   on each other, with built-in deadlock detection.
 
   Main concepts
   ^^^^^^^^^^^^^
-    a thread - is represented by a WT_THD structure. One physical thread
-      can have only one WT_THD descriptor.
+  a thread - is represented by a WT_THD structure. One physical thread
+    can have only one WT_THD descriptor at any given moment.
 
-    a resource - a thread does not wait for other threads directly,
-      instead it waits for a "resource", which is "owned" by other threads.
-      It waits, exactly, for all "owners" to "release" a resource.
-      It does not have to correspond to a physical resource. For example, it
-      may be convenient in certain cases to force resource == thread.
-      A resource is represented by a WT_RESOURCE structure. 
+  a resource - a thread does not wait for other threads directly,
+    instead it waits for a "resource", which is "owned" by other threads.
+    It waits, exactly, for all "owners" to "release" a resource.
+    It does not have to correspond to a physical resource. For example, it
+    may be convenient in certain cases to force resource == thread.
+    A resource is represented by a WT_RESOURCE structure. 
 
-    a resource identifier - a pair of {resource type, value}. A value is
-      an ulonglong number. Represented by a WT_RESOURCE_ID structure.
+  a resource identifier - a pair of {resource type, value}. A value is
+    an ulonglong number. Represented by a WT_RESOURCE_ID structure.
 
-    a resource type - a pointer to a statically defined instance of
+  a resource type - a pointer to a statically defined instance of
     WT_RESOURCE_TYPE structure. This structure contains a pointer to
     a function that knows how to compare values of this resource type.
     In the simple case it could be wt_resource_id_memcmp().
 
-   Usage
-   ^^^^^
-   to use the interface one needs to use this thread's WT_THD,
-   call wt_thd_will_wait_for() for every thread it needs to wait on,
-   then call wt_thd_cond_timedwait(). When thread releases a resource
-   it should call wt_thd_release() (or wt_thd_release_all()) - it will
-   notify (send a signal) threads waiting in wt_thd_cond_timedwait(),
-   if appropriate.
-
-   Just like with pthread's cond_wait, there could be spurious
-   wake-ups from wt_thd_cond_timedwait(). A caller is expected to
-   handle that.
-
-   wt_thd_will_wait_for() and wt_thd_cond_timedwait() return either
-   WT_OK or WT_DEADLOCK. Additionally wt_thd_cond_timedwait() can return
-   WT_TIMEOUT. Out of memory and other fatal errors are reported as
-   WT_DEADLOCK - and a transaction must be aborted just the same.
-
-   Configuration
-   ^^^^^^^^^^^^^
-   There are four config variables. Two deadlock search depths - short and
-   long - and two timeouts. Deadlock search is performed with the short
-   depth on every wt_thd_will_wait_for() call. wt_thd_cond_timedwait()
-   waits with a short timeout, performs a deadlock search with the long
-   depth, and waits with a long timeout. As most deadlock cycles are supposed
-   to be short, most deadlocks will be detected at once, and waits will
-   rarely be necessary.
-
-   These config variables are thread-local. Different threads may have
-   different search depth and timeout values.
-
-   Also, deadlock detector supports different killing strategies, the victim
-   in a deadlock cycle is selected based on the "weight". See "weight"
-   description in waiting_threads.h for details. It's up to the caller to
-   set weights accordingly.
-
-   Status
-   ^^^^^^
-   We calculate the number of successfull waits (WT_OK returned from
-   wt_thd_cond_timedwait()), a number of timeouts, a deadlock cycle
-   length distribution - number of deadlocks with every length from
-   1 to WT_CYCLE_STATS, and a wait time distribution - number
-   of waits with a time from 1 us to 1 min in WT_CYCLE_STATS
-   intervals on a log scale.
+  a wait-for graph - a graph, that represents "wait-for" relationships.
+    It has two types of nodes - threads and resources. There are directed
+    edges from a thread to a resource it is waiting for (WT_THD::waiting_for),
+    from a thread to resources that it "owns" (WT_THD::my_resources),
+    and from a resource to threads that "own" it (WT_RESOURCE::owners)
+
+  Graph completeness
+  ^^^^^^^^^^^^^^^^^^
+
+  For flawless deadlock detection wait-for graph must be complete.
+  It means that when a thread starts waiting it needs to know *all* its
+  blockers, and call wt_thd_will_wait_for() for every one of them.
+  Otherwise two phenomena should be expected:
+
+  1. Fuzzy timeouts:
+
+    thread A needs to get a lock, and is blocked by a thread B.
+    it waits.
+    Just before the timeout thread B releases the lock.
+    thread A is ready to grab the lock but discovers that it is also
+    blocked by a thread C.
+    It waits and times out.
+
+    As a result thread A has waited two timeout intervals, instead of one.
+
+  2. Unreliable cycle detection:
+
+     Thread A waits for threads B and C
+     Thread C waits for D
+     Thread D wants to start waiting for A
+
+     one can see immediately that thread D creates a cycle, and thus
+     a deadlock is detected.
+
+     But if thread A would only wait for B, and start waiting for C
+     when B would unlock, thread D would be allowed to wait, a deadlock
+     would be only detected when B unlocks or somebody times out.
+
+  These two phenomena don't affect correctness, and strictly speaking,
+  the caller is not required to call wt_thd_will_wait_for() for *all*
+  blockers - it may optimize wt_thd_will_wait_for() calls. But they
+  may be perceived as bugs by users, so it must be understood that such
+  an optimization comes at a price.
+
+  Usage
+  ^^^^^
+
+  First, the wt* subsystem must be initialized by calling
+  wt_init(). In the server you don't need to do it, it's done
+  in mysqld.cc.
+
+  Similarly, wt_end() frees wt* structures, should be called
+  at the end, but in the server mysqld.cc takes care of that.
+
+  Every WT_THD should be initialized with wt_thd_lazy_init().
+  After that they can be used in other wt_thd_* calls.
+  Before discarding, WT_THD should be free'd with
+  wt_thd_destroy(). In the server both are handled in sql_class.cc,
+  it's an error to try to do it manually.
+
+  To use the deadlock detection one needs to use this thread's WT_THD,
+  call wt_thd_will_wait_for() for every thread it needs to wait on,
+  then call wt_thd_cond_timedwait(). When thread releases a resource
+  it should call wt_thd_release() (or wt_thd_release_all()) - it will
+  notify (send a signal) threads waiting in wt_thd_cond_timedwait(),
+  if appropriate.
+
+  Just like with pthread's cond_wait, there could be spurious
+  wake-ups from wt_thd_cond_timedwait(). A caller is expected to
+  handle that (that is, to re-check the blocking criteria).
+
+  wt_thd_will_wait_for() and wt_thd_cond_timedwait() return either
+  WT_OK or WT_DEADLOCK. Additionally wt_thd_cond_timedwait() can return
+  WT_TIMEOUT. Out of memory and other fatal errors are reported as
+  WT_DEADLOCK - and a transaction must be aborted just the same.
+
+  Configuration
+  ^^^^^^^^^^^^^
+  There are four config variables. Two deadlock search depths - short and
+  long - and two timeouts. Deadlock search is performed with the short
+  depth on every wt_thd_will_wait_for() call. wt_thd_cond_timedwait()
+  waits with a short timeout, performs a deadlock search with the long
+  depth, and waits with a long timeout. As most deadlock cycles are supposed
+  to be short, most deadlocks will be detected at once, and waits will
+  rarely be necessary.
+
+  These config variables are thread-local. Different threads may have
+  different search depth and timeout values.
+
+  Also, deadlock detector supports different killing strategies, the victim
+  in a deadlock cycle is selected based on the "weight". See "weight"
+  description in waiting_threads.h for details. It's up to the caller to
+  set weights accordingly.
+
+  Status
+  ^^^^^^
+  We calculate the number of successful waits (WT_OK returned from
+  wt_thd_cond_timedwait()), a number of timeouts, a deadlock cycle
+  length distribution - number of deadlocks with every length from
+  1 to WT_CYCLE_STATS, and a wait time distribution - number
+  of waits with a time from 1 us to 1 min in WT_WAIT_STATS
+  intervals on a log e scale.
 */
 
 /*
@@ -93,10 +153,11 @@
 
       (example A=IX, B=IS, C=S, D=X)
 
-   you need to include lock level in the resource identifier - thread 1
-   waiting for lock A on resource R and thread 2 waiting for lock B
-   on resource R should wait on different WT_RESOURCE structures, on different
-   {lock, resource} pairs. Otherwise the following is possible:
+   you need to include lock level in the resource identifier - a
+   thread waiting for lock of the type A on resource R and another
+   thread waiting for lock of the type B on resource R should wait on
+   different WT_RESOURCE structures, on different {lock, resource}
+   pairs.  Otherwise the following is possible:
 
       thread1> take S-lock on R
       thread2> take IS-lock on R
@@ -113,40 +174,46 @@
 #include <waiting_threads.h>
 #include <m_string.h>
 
-/*
-  status variables:
-    distribution of cycle lengths
-    wait time log distribution
-
-  Note:
-
-    we call deadlock() twice per wait (with different search lengths).
-    it means a deadlock will be counted twice. It's difficult to avoid,
-    as on the second search we could find a *different* deadlock and we
-    *want* to count it too. So we just count all deadlocks - two searches
-    mean two increments on the wt_cycle_stats.
-*/
+/* status variables */
 
+/**
+  preset table of wait intervals
+*/
 ulonglong wt_wait_table[WT_WAIT_STATS];
-uint32    wt_wait_stats[WT_WAIT_STATS+1];
-uint32    wt_cycle_stats[2][WT_CYCLE_STATS+1], wt_success_stats;
+/**
+  wait time distribution (log e scale)
+*/
+uint32 wt_wait_stats[WT_WAIT_STATS+1];
+/**
+  distribution of cycle lengths
+  first column tells whether this was during short or long detection
+*/
+uint32 wt_cycle_stats[2][WT_CYCLE_STATS+1];
+uint32 wt_success_stats;
 
 static my_atomic_rwlock_t cycle_stats_lock, wait_stats_lock, success_stats_lock;
 
+#ifdef SAFE_STATISTICS
+#define incr(VAR, LOCK)                           \
+  do {                                            \
+    my_atomic_rwlock_wrlock(&(LOCK));             \
+    my_atomic_add32(&(VAR), 1);                   \
+    my_atomic_rwlock_wrunlock(&(LOCK));           \
+  } while(0)
+#else
+#define incr(VAR,LOCK)  do { (VAR)++; } while(0)
+#endif
+
 static void increment_success_stats()
 {
-  my_atomic_rwlock_wrlock(&success_stats_lock);
-  my_atomic_add32(&wt_success_stats, 1);
-  my_atomic_rwlock_wrunlock(&success_stats_lock);
+  incr(wt_success_stats, success_stats_lock);
 }
 
 static void increment_cycle_stats(uint depth, uint slot)
 {
   if (depth >= WT_CYCLE_STATS)
     depth= WT_CYCLE_STATS;
-  my_atomic_rwlock_wrlock(&cycle_stats_lock);
-  my_atomic_add32(&wt_cycle_stats[slot][depth], 1);
-  my_atomic_rwlock_wrunlock(&cycle_stats_lock);
+  incr(wt_cycle_stats[slot][depth], cycle_stats_lock);
 }
 
 static void increment_wait_stats(ulonglong waited,int ret)
@@ -155,12 +222,89 @@ static void increment_wait_stats(ulonglo
   if ((ret) == ETIMEDOUT)
     i= WT_WAIT_STATS;
   else
-    for (i=0; i < WT_WAIT_STATS && waited/10 > wt_wait_table[i]; i++) ;
-  my_atomic_rwlock_wrlock(&wait_stats_lock);
-  my_atomic_add32(wt_wait_stats+i, 1);
-  my_atomic_rwlock_wrunlock(&wait_stats_lock);
+    for (i= 0; i < WT_WAIT_STATS && waited/10 > wt_wait_table[i]; i++) ;
+  incr(wt_wait_stats[i], wait_stats_lock);
 }
 
+/*
+  'lock' protects 'owners', 'state', and 'waiter_count'
+  'id' is read-only
+
+  a resource is picked up from a hash in a lock-free manner
+  it's returned pinned, so it cannot be freed at once
+  but it may be freed right after the pin is removed
+  to free a resource it should
+    1. have no owners
+    2. have no waiters
+
+  two ways to access a resource:
+    1. find it in a hash
+       - it's returned pinned.
+        a) take a lock in exclusive mode
+        b) check the state, it should be ACTIVE to be usable
+        c) unpin
+    2. by a direct reference
+       - could only be used if a resource cannot be freed
+       e.g. accessing a resource by thd->waiting_for is safe,
+       a resource cannot be freed as there's a thread waiting for it
+*/
+struct st_wt_resource {
+  WT_RESOURCE_ID  id;
+  uint            waiter_count;
+  enum { ACTIVE, FREE } state;
+#ifndef DBUG_OFF
+  pthread_mutex_t  *cond_mutex; /* a mutex for the 'cond' below */
+#endif
+  /*
+    before the 'lock' all elements are mutable, after (and including) -
+    immutable in the sense that lf_hash_insert() won't memcpy() over them.
+    See wt_init().
+  */
+#ifdef WT_RWLOCKS_USE_MUTEXES
+  /*
+    we need a special rwlock-like 'lock' to let readers bypass
+    waiting writers, otherwise readers can deadlock. For example:
+
+      A waits on resource x, owned by B, B waits on resource y, owned
+      by A, we have a cycle (A->x->B->y->A)
+      Both A and B start deadlock detection:
+
+        A locks x                          B locks y
+        A goes deeper                      B goes deeper
+        A locks y                          B locks x
+
+      with mutexes it would deadlock. With rwlocks it won't, as long
+      as both A and B are taking read locks (and they do).
+      But other threads may take write locks. Assume there's
+      C who wants to start waiting on x, and D who wants to start
+      waiting on y.
+
+        A read-locks x                       B read-locks y
+        A goes deeper                        B goes deeper
+     => C write-locks x (to add a new edge)  D write-locks y
+     .. C is blocked                         D is blocked
+        A read-locks y                       B read-locks x
+
+      Now, if a read lock can bypass a pending write lock request, we're fine.
+      If it can not, we have a deadlock.
+
+    writer starvation is technically possible, but unlikely, because
+    the contention is expected to be low.
+  */
+  struct {
+    pthread_cond_t   cond;
+    pthread_mutex_t  mutex;
+    uint readers: 16;
+    uint pending_writers: 15;
+    uint write_locked: 1;
+  } lock;
+#else
+  rw_lock_t lock;
+#endif
+  pthread_cond_t   cond; /* the corresponding mutex is provided by the caller */
+  DYNAMIC_ARRAY    owners;
+};
+
 #ifdef  WT_RWLOCKS_USE_MUTEXES
 static void rc_rwlock_init(WT_RESOURCE *rc)
 {
@@ -169,6 +313,8 @@ static void rc_rwlock_init(WT_RESOURCE *
 }
 static void rc_rwlock_destroy(WT_RESOURCE *rc)
 {
+  DBUG_ASSERT(rc->lock.write_locked == 0);
+  DBUG_ASSERT(rc->lock.readers == 0);
   pthread_cond_destroy(&rc->lock.cond);
   pthread_mutex_destroy(&rc->lock.mutex);
 }
@@ -188,7 +334,7 @@ static void rc_wrlock(WT_RESOURCE *rc)
   pthread_mutex_lock(&rc->lock.mutex);
   while (rc->lock.write_locked || rc->lock.readers)
     pthread_cond_wait(&rc->lock.cond, &rc->lock.mutex);
-  rc->lock.write_locked=1;
+  rc->lock.write_locked= 1;
   pthread_mutex_unlock(&rc->lock.mutex);
   DBUG_PRINT("wt", ("LOCK resid=%ld for WRITE", (ulong)rc->id.value));
 }
@@ -198,7 +344,7 @@ static void rc_unlock(WT_RESOURCE *rc)
   pthread_mutex_lock(&rc->lock.mutex);
   if (rc->lock.write_locked)
   {
-    rc->lock.write_locked=0;
+    rc->lock.write_locked= 0;
     pthread_cond_broadcast(&rc->lock.cond);
   }
   else if (--rc->lock.readers == 0)
@@ -242,12 +388,12 @@ static LF_HASH      reshash;
 /**
   WT_RESOURCE constructor
 
-  It's called from lf_hash and takes an offset to LF_SLIST instance.
+  It's called from lf_hash and takes a pointer to an LF_SLIST instance.
   WT_RESOURCE is located at arg+sizeof(LF_SLIST)
 */
 static void wt_resource_init(uchar *arg)
 {
-  WT_RESOURCE *rc=(WT_RESOURCE*)(arg+LF_HASH_OVERHEAD);
+  WT_RESOURCE *rc= (WT_RESOURCE*)(arg+LF_HASH_OVERHEAD);
   DBUG_ENTER("wt_resource_init");
 
   bzero(rc, sizeof(*rc));
@@ -260,12 +406,12 @@ static void wt_resource_init(uchar *arg)
 /**
   WT_RESOURCE destructor
 
-  It's called from lf_hash and takes an offset to LF_SLIST instance.
+  It's called from lf_hash and takes a pointer to an LF_SLIST instance.
   WT_RESOURCE is located at arg+sizeof(LF_SLIST)
 */
 static void wt_resource_destroy(uchar *arg)
 {
-  WT_RESOURCE *rc=(WT_RESOURCE*)(arg+LF_HASH_OVERHEAD);
+  WT_RESOURCE *rc= (WT_RESOURCE*)(arg+LF_HASH_OVERHEAD);
   DBUG_ENTER("wt_resource_destroy");
 
   DBUG_ASSERT(rc->owners.elements == 0);
@@ -278,6 +424,7 @@ static void wt_resource_destroy(uchar *a
 void wt_init()
 {
   DBUG_ENTER("wt_init");
+  DBUG_ASSERT(reshash.alloc.constructor != wt_resource_init);
 
   lf_hash_init(&reshash, sizeof(WT_RESOURCE), LF_HASH_UNIQUE, 0,
                sizeof_WT_RESOURCE_ID, 0, 0);
@@ -293,15 +440,15 @@ void wt_init()
   reshash.element_size= offsetof(WT_RESOURCE, lock);
   bzero(wt_wait_stats, sizeof(wt_wait_stats));
   bzero(wt_cycle_stats, sizeof(wt_cycle_stats));
-  wt_success_stats=0;
-  { /* initialize wt_wait_table[]. from 1 us to 1 min, log scale */
+  wt_success_stats= 0;
+  { /* initialize wt_wait_table[]. from 1 us to 1 min, log e scale */
     int i;
-    double from=log(1);   /* 1 us */
-    double to=log(60e6);  /* 1 min */
-    for (i=0; i < WT_WAIT_STATS; i++)
+    double from= log(1);   /* 1 us */
+    double to= log(60e6);  /* 1 min */
+    for (i= 0; i < WT_WAIT_STATS; i++)
     {
-      wt_wait_table[i]=(ulonglong)exp((to-from)/(WT_WAIT_STATS-1)*i+from);
-      DBUG_ASSERT(i==0 || wt_wait_table[i-1] != wt_wait_table[i]);
+      wt_wait_table[i]= (ulonglong)exp((to-from)/(WT_WAIT_STATS-1)*i+from);
+      DBUG_ASSERT(i == 0 || wt_wait_table[i-1] != wt_wait_table[i]);
     }
   }
   my_atomic_rwlock_init(&cycle_stats_lock);
@@ -325,7 +472,7 @@ void wt_end()
 /**
   Lazy WT_THD initialization
 
-  Cheap initialization of WT_THD. Only initialized fields that don't require
+  Cheap initialization of WT_THD. Only initialize fields that don't require
   memory allocations - basically, it only does assignments. The rest of the
   WT_THD structure will be initialized on demand, on the first use.
   This allows one to initialize lazily all WT_THD structures, even if some
@@ -335,14 +482,18 @@ void wt_end()
   @param ts     a pointer to deadlock timeout short value
   @param dl     a pointer to deadlock search depth long value
   @param tl     a pointer to deadlock timeout long value
+
+  @note these are pointers to values, and WT_THD stores them as pointers.
+  It allows one later to change search depths and timeouts for existing
+  threads. It also means that the pointers must stay valid for the lifetime
+  of WT_THD.
 */
-void wt_thd_lazy_init(WT_THD *thd, ulong *ds, ulong *ts, ulong *dl, ulong *tl)
+void wt_thd_lazy_init(WT_THD *thd, const ulong *ds, const ulong *ts,
+                                   const ulong *dl, const ulong *tl)
 {
   DBUG_ENTER("wt_thd_lazy_init");
-  thd->waiting_for=0;
-  thd->my_resources.buffer= 0;
-  thd->my_resources.elements= 0;
-  thd->weight=0;
+  thd->waiting_for= 0;
+  thd->weight= 0;
   thd->deadlock_search_depth_short= ds;
   thd->timeout_short= ts;
   thd->deadlock_search_depth_long= dl;
@@ -350,7 +501,7 @@ void wt_thd_lazy_init(WT_THD *thd, ulong
   /* dynamic array is also initialized lazily - without memory allocations */
   my_init_dynamic_array(&thd->my_resources, sizeof(WT_RESOURCE *), 0, 5);
 #ifndef DBUG_OFF
-  thd->name=my_thread_name();
+  thd->name= my_thread_name();
 #endif
   DBUG_VOID_RETURN;
 }
@@ -367,9 +518,9 @@ static int fix_thd_pins(WT_THD *thd)
 {
   if (unlikely(thd->pins == 0))
   {
-    thd->pins=lf_hash_get_pins(&reshash);
+    thd->pins= lf_hash_get_pins(&reshash);
 #ifndef DBUG_OFF
-    thd->name=my_thread_name();
+    thd->name= my_thread_name();
 #endif
   }
   return thd->pins == 0;
@@ -380,12 +531,12 @@ void wt_thd_destroy(WT_THD *thd)
   DBUG_ENTER("wt_thd_destroy");
 
   DBUG_ASSERT(thd->my_resources.elements == 0);
+  DBUG_ASSERT(thd->waiting_for == 0);
 
   if (thd->pins != 0)
     lf_hash_put_pins(thd->pins);
 
   delete_dynamic(&thd->my_resources);
-  thd->waiting_for=0;
   DBUG_VOID_RETURN;
 }
 /**
@@ -394,7 +545,7 @@ void wt_thd_destroy(WT_THD *thd)
   It can be used in WT_RESOURCE_TYPE structures where bytewise
   comparison of values is sufficient.
 */
-int wt_resource_id_memcmp(void *a, void *b)
+my_bool wt_resource_id_memcmp(const void *a, const void *b)
 {
   /* we use the fact that there's no padding in the middle of WT_RESOURCE_ID */
   compile_time_assert(offsetof(WT_RESOURCE_ID, type) == sizeof(ulonglong));
@@ -405,10 +556,10 @@ int wt_resource_id_memcmp(void *a, void 
   arguments for the recursive deadlock_search function
 */
 struct deadlock_arg {
-  WT_THD *thd;          /**< starting point of a search */
-  uint    max_depth;    /**< search depth limit */
-  WT_THD *victim;       /**< a thread to be killed to resolve a deadlock */
-  WT_RESOURCE *rc;      /**< see comment at the end of deadlock_search() */
+  WT_THD * const thd;          /**< starting point of a search */
+  uint const max_depth;        /**< search depth limit */
+  WT_THD *victim;              /**< a thread to be killed to resolve a deadlock */
+  WT_RESOURCE *last_locked_rc; /**< see comment at the end of deadlock_search() */
 };
 
 /**
@@ -421,10 +572,10 @@ static void change_victim(WT_THD* found,
     if (arg->victim != arg->thd)
     {
       rc_unlock(arg->victim->waiting_for); /* release the previous victim */
-      DBUG_ASSERT(arg->rc == found->waiting_for);
+      DBUG_ASSERT(arg->last_locked_rc == found->waiting_for);
     }
     arg->victim= found;
-    arg->rc= 0;
+    arg->last_locked_rc= 0;
   }
 }
 
@@ -444,7 +595,7 @@ static int deadlock_search(struct deadlo
 
   LF_REQUIRE_PINS(1);
 
-  arg->rc= 0;
+  arg->last_locked_rc= 0;
 
   if (depth > arg->max_depth)
   {
@@ -453,7 +604,10 @@ static int deadlock_search(struct deadlo
   }
 
 retry:
-  /* safe dereference as explained in lf_alloc-pin.c */
+  /*
+    safe dereference as explained in lf_alloc-pin.c
+    (in short: protects against lf_alloc_free() in lf_hash_delete())
+  */
   do
   {
     rc= *shared_ptr;
@@ -469,6 +623,7 @@ retry:
   rc_rdlock(rc);
   if (rc->state != ACTIVE || *shared_ptr != rc)
   {
+    /* blocker is not waiting on this resource anymore */
     rc_unlock(rc);
     lf_unpin(arg->thd->pins, 0);
     goto retry;
@@ -480,20 +635,22 @@ retry:
     Below is not a pure depth-first search. It's a depth-first with a
     slightest hint of breadth-first. Depth-first is:
 
-      check(element):
+      check(element, X):
         foreach current in element->nodes[] do:
-          if current == element return error;
-          check(current);
+          if current == X return error;
+          check(current, X);
 
     while we do
 
-      check(element):
+      check(element, X):
         foreach current in element->nodes[] do:
-          if current == element return error;
+          if current == X return error;
         foreach current in element->nodes[] do:
-          check(current);
+          check(current, X);
+
+    preferring shorter deadlocks over longer ones.
   */
-  for (i=0; i < rc->owners.elements; i++)
+  for (i= 0; i < rc->owners.elements; i++)
   {
     cursor= *dynamic_element(&rc->owners, i, WT_THD**);
     /*
@@ -517,7 +674,7 @@ retry:
       goto end;
     }
   }
-  for (i=0; i < rc->owners.elements; i++)
+  for (i= 0; i < rc->owners.elements; i++)
   {
     cursor= *dynamic_element(&rc->owners, i, WT_THD**);
     switch (deadlock_search(arg, cursor, depth+1)) {
@@ -528,20 +685,21 @@ retry:
       break;
     case WT_DEADLOCK:
       ret= WT_DEADLOCK;
-      change_victim(cursor, arg);       /* also sets arg->rc to 0 */
+      change_victim(cursor, arg);       /* also sets arg->last_locked_rc to 0 */
       i= rc->owners.elements;           /* jump out of the loop */
       break;
     default:
       DBUG_ASSERT(0);
     }
-    if (arg->rc)
-      rc_unlock(arg->rc);
+    if (arg->last_locked_rc)
+      rc_unlock(arg->last_locked_rc);
   }
 end:
   /*
     Note that 'rc' is locked in this function, but it's never unlocked here.
-    Instead it's saved in arg->rc and the *caller* is expected to unlock it.
-    It's done to support different killing strategies. This is how it works:
+    Instead it's saved in arg->last_locked_rc and the *caller* is
+    expected to unlock it.  It's done to support different killing
+    strategies. This is how it works:
     Assuming a graph
 
       thd->A->B->C->thd
@@ -552,9 +710,9 @@ end:
     on. Goes down recursively, locks B. Goes down recursively, locks C.
     Notices that C is waiting on thd. Deadlock detected. Sets arg->victim=thd.
     Returns from the last deadlock_search() call. C stays locked!
-    Now it checks whether C is a more appropriate victim then 'thd'.
+    Now it checks whether C is a more appropriate victim than 'thd'.
     If yes - arg->victim=C, otherwise C is unlocked. Returns. B stays locked.
-    Now it checks whether B is a more appropriate victim then arg->victim.
+    Now it checks whether B is a more appropriate victim than arg->victim.
     If yes - old arg->victim is unlocked and arg->victim=B,
     otherwise B is unlocked. Return.
     And so on.
@@ -566,7 +724,7 @@ end:
     is unrolled and we are back to deadlock() function, there are only two
     locks left - on thd and on the victim.
   */
-  arg->rc= rc;
+  arg->last_locked_rc= rc;
   DBUG_PRINT("wt", ("exit: %s",
                     ret == WT_DEPTH_EXCEEDED ? "WT_DEPTH_EXCEEDED" :
                     ret ? "WT_DEADLOCK" : "OK"));
@@ -612,13 +770,31 @@ static int deadlock(WT_THD *thd, WT_THD 
   */
   if (ret == WT_DEADLOCK && depth)
     change_victim(blocker, &arg);
-  if (arg.rc)
-    rc_unlock(arg.rc);
+  if (arg.last_locked_rc)
+  {
+    /*
+      Special return code if there's nobody to wait for.
+
+      depth == 0 means that we start the search from thd (thd == blocker).
+      ret == WT_OK means that no cycle was found and
+        arg.last_locked_rc == thd->waiting_for.
+      and arg.last_locked_rc->owners.elements == 0 means that
+        (applying the rule above) thd->waiting_for->owners.elements == 0,
+        and thd doesn't have anybody to wait for.
+    */
+    if (depth == 0 && ret == WT_OK && arg.last_locked_rc->owners.elements == 0)
+    {
+      DBUG_ASSERT(thd == blocker);
+      DBUG_ASSERT(arg.last_locked_rc == thd->waiting_for);
+      ret= WT_FREE_TO_GO;
+    }
+    rc_unlock(arg.last_locked_rc);
+  }
   /* notify the victim, if appropriate */
   if (ret == WT_DEADLOCK && arg.victim != thd)
   {
     DBUG_PRINT("wt", ("killing %s", arg.victim->name));
-    arg.victim->killed=1;
+    arg.victim->killed= 1;
     pthread_cond_broadcast(&arg.victim->waiting_for->cond);
     rc_unlock(arg.victim->waiting_for);
     ret= WT_OK;
@@ -642,7 +818,7 @@ static int unlock_lock_and_free_resource
 
   if (rc->owners.elements || rc->waiter_count)
   {
-    DBUG_PRINT("wt", ("nothing to do, %d owners, %d waiters",
+    DBUG_PRINT("wt", ("nothing to do, %u owners, %u waiters",
                       rc->owners.elements, rc->waiter_count));
     rc_unlock(rc);
     DBUG_RETURN(0);
@@ -666,12 +842,8 @@ static int unlock_lock_and_free_resource
      2. set the state to FREE
      3. release the lock
      4. remove from the hash
-
-     I *think* it's safe to release the lock while the element is still
-     in the hash. If not, the corrected procedure should be
-     3. pin; 4; remove; 5; release; 6; unpin and it'll need pin[3].
   */
-  rc->state=FREE;
+  rc->state= FREE;
   rc_unlock(rc);
   DBUG_RETURN(lf_hash_delete(&reshash, thd->pins, key, keylen) == -1);
 }
@@ -722,15 +894,19 @@ static int stop_waiting(WT_THD *thd)
 /**
   notify the system that a thread needs to wait for another thread
 
-  called by a *waiter* to declare what resource it will wait for.
+  called by a *waiter* to declare that it (thd) will wait for another
+  thread (blocker) on a specific resource (resid).
   can be called many times, if many blockers own a blocking resource.
   but must always be called with the same resource id - a thread cannot
   wait for more than one resource at a time.
 
+  @return WT_OK or WT_DEADLOCK
+
   As a new edge is added to the wait-for graph, a deadlock detection is
   performed for this new edge.
 */
-int wt_thd_will_wait_for(WT_THD *thd, WT_THD *blocker, WT_RESOURCE_ID *resid)
+int wt_thd_will_wait_for(WT_THD *thd, WT_THD *blocker,
+                         const WT_RESOURCE_ID *resid)
 {
   uint i;
   WT_RESOURCE *rc;
@@ -805,7 +981,7 @@ retry:
 
     /*
       we can safely access the resource here, it's in the hash as it has
-      at least one owner, and non-zero waiter_count
+      non-zero waiter_count
     */
     rc= thd->waiting_for;
     rc_wrlock(rc);
@@ -818,7 +994,11 @@ retry:
       DBUG_RETURN(WT_DEADLOCK);
     }
   }
-  for (i=0; i < rc->owners.elements; i++)
+  /*
+    Another thread could be waiting on this resource for this very 'blocker'.
+    In this case we should not add it to the list for the second time.
+  */
+  for (i= 0; i < rc->owners.elements; i++)
     if (*dynamic_element(&rc->owners, i, WT_THD**) == blocker)
       break;
   if (i >= rc->owners.elements)
@@ -837,19 +1017,21 @@ retry:
   }
   rc_unlock(rc);
 
-  if (deadlock(thd, blocker, 1, *thd->deadlock_search_depth_short))
+  if (deadlock(thd, blocker, 1, *thd->deadlock_search_depth_short) != WT_OK)
   {
     stop_waiting(thd);
     DBUG_RETURN(WT_DEADLOCK);
   }
-  DBUG_RETURN(0);
+  DBUG_RETURN(WT_OK);
 }
 
 /**
-  called by a *waiter* to start waiting
+  called by a *waiter* (thd) to start waiting
 
   It's supposed to be a drop-in replacement for
   pthread_cond_timedwait(), and it takes mutex as an argument.
+
+  @return one of WT_TIMEOUT, WT_DEADLOCK, WT_OK
 */
 int wt_thd_cond_timedwait(WT_THD *thd, pthread_mutex_t *mutex)
 {
@@ -861,10 +1043,10 @@ int wt_thd_cond_timedwait(WT_THD *thd, p
   DBUG_PRINT("wt", ("enter: thd=%s, rc=%p", thd->name, rc));
 
 #ifndef DBUG_OFF
-  if (rc->mutex)
-    DBUG_ASSERT(rc->mutex == mutex);
+  if (rc->cond_mutex)
+    DBUG_ASSERT(rc->cond_mutex == mutex);
   else
-    rc->mutex= mutex;
+    rc->cond_mutex= mutex;
   safe_mutex_assert_owner(mutex);
 #endif
 
@@ -873,22 +1055,32 @@ int wt_thd_cond_timedwait(WT_THD *thd, p
 #ifdef __WIN__
   /*
     only for the sake of Windows we distinguish between
-    'before' and 'starttime'
+    'before' and 'starttime':
+
+    my_getsystime() returns high-resolution value, that cannot be used for
+    waiting (it doesn't follow system clock changes), but is good for time
+    intervals.
+
+    GetSystemTimeAsFileTime() follows system clock, but is low-resolution
+    and will result in lousy intervals.
   */
   GetSystemTimeAsFileTime((PFILETIME)&starttime);
 #endif
 
   rc_wrlock(rc);
-  if (rc->owners.elements == 0 || thd->killed)
+  if (rc->owners.elements == 0)
     ret= WT_OK;
   rc_unlock(rc);
 
   set_timespec_time_nsec(timeout, starttime, (*thd->timeout_short)*ULL(1000));
-  if (ret == WT_TIMEOUT)
+  if (ret == WT_TIMEOUT && !thd->killed)
     ret= pthread_cond_timedwait(&rc->cond, mutex, &timeout);
-  if (ret == WT_TIMEOUT)
+  if (ret == WT_TIMEOUT && !thd->killed)
   {
-    if (deadlock(thd, thd, 0, *thd->deadlock_search_depth_long))
+    int r= deadlock(thd, thd, 0, *thd->deadlock_search_depth_long);
+    if (r == WT_FREE_TO_GO)
+      ret= WT_OK;
+    else if (r != WT_OK)
       ret= WT_DEADLOCK;
     else if (*thd->timeout_long > *thd->timeout_short)
     {
@@ -915,24 +1107,25 @@ int wt_thd_cond_timedwait(WT_THD *thd, p
   @param resid   a resource to release. 0 to release all resources
 */
 
-void wt_thd_release(WT_THD *thd, WT_RESOURCE_ID *resid)
+void wt_thd_release(WT_THD *thd, const WT_RESOURCE_ID *resid)
 {
   uint i;
   DBUG_ENTER("wt_thd_release");
 
-  for (i=0; i < thd->my_resources.elements; i++)
+  for (i= 0; i < thd->my_resources.elements; i++)
   {
-    uint j;
     WT_RESOURCE *rc= *dynamic_element(&thd->my_resources, i, WT_RESOURCE**);
     if (!resid || (resid->type->compare(&rc->id, resid) == 0))
     {
+      uint j;
+
       rc_wrlock(rc);
       /*
         nobody's trying to free the resource now,
         as its owners[] array is not empty (at least thd must be there)
       */
       DBUG_ASSERT(rc->state == ACTIVE);
-      for (j=0; j < rc->owners.elements; j++)
+      for (j= 0; j < rc->owners.elements; j++)
         if (*dynamic_element(&rc->owners, j, WT_THD**) == thd)
           break;
       DBUG_ASSERT(j < rc->owners.elements);
@@ -941,8 +1134,8 @@ void wt_thd_release(WT_THD *thd, WT_RESO
       {
         pthread_cond_broadcast(&rc->cond);
 #ifndef DBUG_OFF
-        if (rc->mutex)
-          safe_mutex_assert_owner(rc->mutex);
+        if (rc->cond_mutex)
+          safe_mutex_assert_owner(rc->cond_mutex);
 #endif
       }
       unlock_lock_and_free_resource(thd, rc);

=== modified file 'sql/backup/backup_info.cc'
--- a/sql/backup/backup_info.cc	2009-02-04 10:49:16 +0000
+++ b/sql/backup/backup_info.cc	2009-02-13 12:40:13 +0000
@@ -1,3 +1,18 @@
+/* Copyright (C) 2008 MySQL AB, 2008 - 2009 Sun Microsystems, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
 /**
   @file
 
@@ -161,7 +176,11 @@ Backup_info::find_backup_engine(const ba
     {
       handlerton *hton= se_hton(se);
       saved_factory= hton->get_backup_engine;
-      if (hton == myisam_hton) 
+      if (hton == myisam_hton
+#ifdef WITH_MARIA_STORAGE_ENGINE
+          || hton == maria_hton
+#endif
+          ) 
         hton->get_backup_engine= dummy_backup_engine_factory;
     });
 #endif

=== modified file 'sql/backup/kernel.cc'
--- a/sql/backup/kernel.cc	2009-02-11 12:11:20 +0000
+++ b/sql/backup/kernel.cc	2009-02-16 21:18:45 +0000
@@ -1301,10 +1301,13 @@ int Backup_restore_ctx::do_restore(bool 
   close_thread_tables(m_thd);                   // Never errors
   m_thd->stmt_da->reset_diagnostics_area();     // Never errors
 
+  DEBUG_SYNC(m_thd, "before_restore_locks_tables");
+
   err= lock_tables_for_restore();               // logs errors
   if (err)
     DBUG_RETURN(fatal_error(err));
 
+  DEBUG_SYNC(m_thd, "after_restore_locks_tables");
   /* 
    Here restore drivers are created to restore table data. Data is being
    (potentially) changed so we set m_data_changed flag.

=== modified file 'sql/debug_sync.cc'
--- a/sql/debug_sync.cc	2008-12-17 10:11:14 +0000
+++ b/sql/debug_sync.cc	2009-02-13 12:40:13 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2008 MySQL AB
+/* Copyright (C) 2008 MySQL AB, 2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -1474,11 +1474,20 @@ bool sys_var_debug_sync::check(THD *thd,
 
 bool sys_var_debug_sync::update(THD *thd, set_var *var)
 {
-  char empty= '\0';
-  char *val_str= var ? var->value->str_value.c_ptr() : &empty;
+  char *val_str, buff[STRING_BUFFER_USUAL_SIZE], empty= '\0';
+  String *strres, str(buff, sizeof(buff), system_charset_info);
+
   DBUG_ENTER("sys_var_debug_sync::update");
   DBUG_ASSERT(thd);
 
+  if (var)
+  {
+    if ((strres= var->value->val_str(&str)) == NULL)
+      DBUG_RETURN(TRUE);
+    val_str= strres->c_ptr();
+  }
+  else
+    val_str= &empty;
   DBUG_PRINT("debug_sync", ("set action: '%s'", val_str));
 
   DBUG_RETURN(opt_debug_sync_timeout ?

=== modified file 'sql/item_cmpfunc.cc'
--- a/sql/item_cmpfunc.cc	2009-01-30 14:13:39 +0000
+++ b/sql/item_cmpfunc.cc	2009-02-13 16:30:54 +0000
@@ -5341,6 +5341,7 @@ void Item_equal::update_used_tables()
   not_null_tables_cache= used_tables_cache= 0;
   if ((const_item_cache= cond_false))
     return;
+  const_item_cache= 1;
   while ((item=li++))
   {
     item->update_used_tables();

=== modified file 'sql/mysqld.cc'
--- a/sql/mysqld.cc	2009-02-12 04:46:57 +0000
+++ b/sql/mysqld.cc	2009-02-16 21:18:45 +0000
@@ -1,4 +1,4 @@
-/* Copyright 2000-2008 MySQL AB, 2008 Sun Microsystems, Inc.
+/* Copyright (C) 2000-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -3937,8 +3937,6 @@ static int init_server_components()
   if (table_def_init() | hostname_cache_init())
     unireg_abort(1);
 
-  wt_init();
-
   query_cache_result_size_limit(query_cache_limit);
   query_cache_set_min_res_unit(query_cache_min_res_unit);
   query_cache_init();
@@ -3951,6 +3949,7 @@ static int init_server_components()
   init_slave_list();
   init_slave_start();
 #endif
+  wt_init();
 
   /* Setup logs */
 
@@ -7875,8 +7874,7 @@ static void usage(void)
     default_collation_name= (char*) default_charset_info->name;
   print_version();
   puts("\
-Copyright (C) 2000-2008 MySQL AB, by Monty and others\n\
-Copyright (C) 2008 Sun Microsystems, Inc.\n\
+Copyright (C) 2000-2008 MySQL AB, Monty and others, 2008-2009 Sun Microsystems, Inc.\n\
 This software comes with ABSOLUTELY NO WARRANTY. This is free software,\n\
 and you are welcome to modify and redistribute it under the GPL license\n\n\
 Starts the MySQL database server\n");

=== modified file 'sql/sql_class.cc'
--- a/sql/sql_class.cc	2009-02-11 12:11:20 +0000
+++ b/sql/sql_class.cc	2009-02-16 21:18:45 +0000
@@ -1,4 +1,4 @@
-/* Copyright 2000-2008 MySQL AB, 2008 Sun Microsystems, Inc.
+/* Copyright (C) 2000-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -1028,6 +1028,8 @@ bool THD::store_globals()
   */
   mysys_var->id= thread_id;
   real_id= pthread_self();                      // For debugging
+  mysys_var->stack_ends_here= thread_stack +    // for consistency, see libevent_thread_proc
+                              STACK_DIRECTION * (long)my_thread_stack_size;
 
   /*
     We have to call thr_lock_info_init() again here as THD may have been

=== modified file 'sql/sql_class.h'
--- a/sql/sql_class.h	2009-02-04 10:49:16 +0000
+++ b/sql/sql_class.h	2009-02-13 16:30:54 +0000
@@ -1,4 +1,4 @@
-/* Copyright 2000-2008 MySQL AB, 2008 Sun Microsystems, Inc.
+/* Copyright (C) 2000-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -1415,7 +1415,7 @@ public:
     THD_TRANS stmt;			// Trans for current statement
     bool on;                            // see ha_enable_transaction()
     XID_STATE xid_state;
-    WT_THD wt;
+    WT_THD wt;                          ///< for deadlock detection
     Rows_log_event *m_pending_rows_event;
 
     /*

=== modified file 'sql/sql_insert.cc'
--- a/sql/sql_insert.cc	2009-02-04 11:52:20 +0000
+++ b/sql/sql_insert.cc	2009-02-16 21:18:45 +0000
@@ -3565,6 +3565,12 @@ static TABLE *create_table_from_items(TH
                                     MYSQL_LOCK_IGNORE_FLUSH, &not_used)) ||
         hooks->postlock(&table, 1))
   {
+    /* purecov: begin tested */
+    /*
+      This can happen in innodb when you get a deadlock when using same table
+      in insert and select
+    */
+    my_error(ER_CANT_LOCK, MYF(0), my_errno);
     if (*lock)
     {
       mysql_unlock_tables(thd, *lock);
@@ -3574,6 +3580,7 @@ static TABLE *create_table_from_items(TH
     if (!create_info->table_existed)
       drop_open_table(thd, table, create_table->db, create_table->table_name);
     DBUG_RETURN(0);
+    /* purecov: end */
   }
   DBUG_RETURN(table);
 }

=== modified file 'sql/sql_select.cc'
--- a/sql/sql_select.cc	2009-02-13 17:44:21 +0000
+++ b/sql/sql_select.cc	2009-02-16 21:18:45 +0000
@@ -4036,11 +4036,10 @@ make_join_statistics(JOIN *join, TABLE_L
     s->needed_reg.init();
     table_vector[i]=s->table=table=tables->table;
     table->pos_in_table_list= tables;
-    error= table->file->info(HA_STATUS_VARIABLE | HA_STATUS_NO_LOCK);
-    if(error)
+    if ((error= table->file->info(HA_STATUS_VARIABLE | HA_STATUS_NO_LOCK)))
     {
-        table->file->print_error(error, MYF(0));
-        DBUG_RETURN(1);
+      table->file->print_error(error, MYF(0));
+      DBUG_RETURN(1);
     }
     table->quick_keys.clear_all();
     table->reginfo.join_tab=s;
@@ -16652,6 +16651,11 @@ join_read_const_table(JOIN_TAB *tab, POS
       if (!table->maybe_null || error > 0)
 	DBUG_RETURN(error);
     }
+    /*
+      The optimizer trusts the engine that when stats.records is 0, no
+      rows were found
+    */
+    DBUG_ASSERT(table->file->stats.records > 0 || error);
   }
   else
   {
@@ -16681,6 +16685,17 @@ join_read_const_table(JOIN_TAB *tab, POS
   }
   if (*tab->on_expr_ref && !table->null_row)
   {
+#if !defined(DBUG_OFF) && defined(NOT_USING_ITEM_EQUAL)
+    /*
+      This test could be very useful to find bugs in the optimizer
+      where we would call this function with an expression that can't be
+      evaluated yet. We can't have this enabled by default as long as we
+      have items like Item_equal, which don't report that they are const
+      but can still be called even if they contain non-const items.
+    */
+    (*tab->on_expr_ref)->update_used_tables();
+    DBUG_ASSERT((*tab->on_expr_ref)->const_item());
+#endif
     if ((table->null_row= test((*tab->on_expr_ref)->val_int() == 0)))
       mark_as_null_row(table);  
   }

=== modified file 'storage/maria/CMakeLists.txt'
--- a/storage/maria/CMakeLists.txt	2008-12-10 11:39:22 +0000
+++ b/storage/maria/CMakeLists.txt	2009-01-28 11:08:55 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2007 MySQL AB
+# Copyright (C) 2007 MySQL AB, 2008 - 2009 Sun Microsystems, Inc.
 # 
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -42,6 +42,8 @@ SET(MARIA_SOURCES ma_init.c ma_open.c ma
             ma_pagecache.c ma_pagecaches.c 
             ma_checkpoint.c ma_recovery.c ma_commit.c ma_pagecrc.c
             ha_maria.h maria_def.h ma_recovery_util.c
+            ma_non_trans_log.c ma_examine_non_trans_log.c
+            maria_backup_engine.cc
 )
 
 IF(NOT SOURCE_SUBLIBS)
@@ -55,6 +57,9 @@ TARGET_LINK_LIBRARIES(maria_ftdump maria
 ADD_EXECUTABLE(maria_chk maria_chk.c)
 TARGET_LINK_LIBRARIES(maria_chk maria myisam mysys dbug strings zlib)
 
+ADD_EXECUTABLE(maria_non_trans_log maria_non_trans_log.c)
+TARGET_LINK_LIBRARIES(maria_non_trans_log maria myisam mysys dbug strings zlib)
+
 ADD_EXECUTABLE(maria_read_log maria_read_log.c)
 TARGET_LINK_LIBRARIES(maria_read_log maria myisam mysys dbug strings zlib)
 
@@ -84,6 +89,7 @@ IF(EMBED_MANIFESTS)
   MYSQL_EMBED_MANIFEST("maria_ftdump" "asInvoker")
   MYSQL_EMBED_MANIFEST("maria_chk" "asInvoker")
   MYSQL_EMBED_MANIFEST("maria_read_log" "asInvoker")
+  MYSQL_EMBED_MANIFEST("maria_non_trans_log" "asInvoker")
   MYSQL_EMBED_MANIFEST("maria_pack" "asInvoker")
 ENDIF(EMBED_MANIFESTS)
 

=== modified file 'storage/maria/Makefile.am'
--- a/storage/maria/Makefile.am	2009-01-07 10:58:33 +0000
+++ b/storage/maria/Makefile.am	2009-02-16 21:18:45 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2000-2008 MySQL AB
+# Copyright (C) 2000-2008 MySQL AB, 2008 - 2009 Sun Microsystems, Inc.
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -34,7 +34,7 @@ EXTRA_DIST =		ma_test_all.sh ma_test_all
 pkgdata_DATA =		ma_test_all ma_test_all.res ma_test_recovery
 pkglib_LIBRARIES =	libmaria.a
 bin_PROGRAMS =		maria_chk maria_pack maria_ftdump maria_read_log \
-			maria_dump_log
+			maria_dump_log maria_non_trans_log
 maria_chk_DEPENDENCIES=	$(LIBRARIES)
 # Only reason to link with libmyisam.a here is that it's where some fulltext
 # pieces are (but soon we'll remove fulltext dependencies from Maria).
@@ -44,6 +44,12 @@ maria_chk_LDADD=		@CLIENT_EXTRA_LDFLAGS@
 			$(top_builddir)/mysys/libmysyslt.la \
 			$(top_builddir)/dbug/libdbuglt.la \
 			$(top_builddir)/strings/libmystringslt.la @ZLIB_LIBS@
+maria_non_trans_log_DEPENDENCIES=	$(LIBRARIES)
+maria_non_trans_log_LDADD=		@CLIENT_EXTRA_LDFLAGS@ libmaria.a \
+                        $(top_builddir)/storage/myisam/libmyisam.a \
+			$(top_builddir)/mysys/libmysyslt.la \
+			$(top_builddir)/dbug/libdbuglt.la \
+			$(top_builddir)/strings/libmystringslt.la @ZLIB_LIBS@
 maria_pack_DEPENDENCIES=$(LIBRARIES)
 maria_pack_LDADD=		@CLIENT_EXTRA_LDFLAGS@ libmaria.a \
                         $(top_builddir)/storage/myisam/libmyisam.a \
@@ -134,7 +140,9 @@ libmaria_a_SOURCES =	ma_init.c ma_open.c
 			ma_pagecache.c ma_pagecaches.c \
 			ma_checkpoint.c ma_recovery.c ma_commit.c \
 			ma_pagecrc.c ma_recovery_util.c \
-			ha_maria.cc
+			ha_maria.cc ma_non_trans_log.c ma_examine_non_trans_log.c \
+			maria_backup_engine.cc
+
 CLEANFILES =		test?.MA? FT?.MA? isam.log ma_test_all ma_rt_test.MA? sp_test.MA? maria_log_control maria_log.0000*
 
 SUFFIXES = .sh

=== modified file 'storage/maria/ha_maria.cc'
--- a/storage/maria/ha_maria.cc	2009-02-11 12:11:20 +0000
+++ b/storage/maria/ha_maria.cc	2009-02-16 21:18:45 +0000
@@ -1,4 +1,5 @@
-/* Copyright (C) 2006,2004 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+/* Copyright (C) 2004-2008 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+   Copyright (C) 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -40,9 +41,11 @@ C_MODE_END
   Note that in future versions, only *transactional* Maria tables can
   rollback, so this flag should be up or down conditionally.
 */
-#define MARIA_CANNOT_ROLLBACK HA_NO_TRANSACTIONS
 #ifdef MARIA_CANNOT_ROLLBACK
+#define CANNOT_ROLLBACK_FLAG HA_NO_TRANSACTIONS
 #define trans_register_ha(A, B, C)  do { /* nothing */ } while(0)
+#else
+#define CANNOT_ROLLBACK_FLAG 0
 #endif
 #define THD_TRN (*(TRN **)thd_ha_data(thd, maria_hton))
 
@@ -693,6 +696,44 @@ void _ma_check_print_warning(HA_CHECK *p
   DBUG_VOID_RETURN;
 }
 
+/**
+  Report list of threads (and queries) accessing a table, thread_id of a
+  thread that detected corruption, source file name and line number where
+  this corruption was detected, optional extra information (string).
+
+  This function is intended to be used when table corruption is detected.
+
+  @param[in] file      MARIA_HA object.
+  @param[in] message   Optional error message; nothing extra is printed if NULL.
+  @param[in] sfile     Name of source file.
+  @param[in] sline     Line number in source file.
+
+  @return void
+*/
+
+void _ma_report_crashed(MARIA_HA *file, const char *message,
+                        const char *sfile, uint sline)
+{
+  THD *cur_thd;
+  LIST *element;
+  char buf[1024];                               /* for thd_security_context() */
+  pthread_mutex_lock(&file->s->intern_lock);    /* held while walking in_use */
+  if ((cur_thd= (THD*) file->in_use.data))
+    sql_print_error("Got an error from thread_id=%lu, %s:%u", cur_thd->thread_id,
+                    sfile, sline);
+  else
+    sql_print_error("Got an error from unknown thread, %s:%u", sfile, sline);
+  if (message)
+    sql_print_error("%s", message);
+  for (element= file->s->in_use; element; element= list_rest(element))
+  {
+    THD *thd= (THD*) element->data;
+    sql_print_error("%s", thd ? thd_security_context(thd, buf, sizeof(buf), 0)
+                              : "Unknown thread accessing table");
+  }
+  pthread_mutex_unlock(&file->s->intern_lock);
+}
+
 }
 
 /**
@@ -716,7 +757,7 @@ handler(hton, table_arg), file(0),
 int_table_flags(HA_NULL_IN_KEY | HA_CAN_FULLTEXT | HA_CAN_SQL_HANDLER |
                 HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE |
                 HA_DUPLICATE_POS | HA_CAN_INDEX_BLOBS | HA_AUTO_PART_KEY |
-                HA_FILE_BASED | HA_CAN_GEOMETRY | MARIA_CANNOT_ROLLBACK |
+                HA_FILE_BASED | HA_CAN_GEOMETRY | CANNOT_ROLLBACK_FLAG |
                 HA_CAN_BIT_FIELD | HA_CAN_RTREEKEYS |
                 HA_HAS_RECORDS | HA_STATS_RECORDS_IS_EXACT),
 can_enable_indexes(1), bulk_insert_single_undo(BULK_INSERT_NONE)
@@ -785,90 +826,6 @@ uint ha_maria::max_supported_key_length(
 }
 
 
-#ifdef HAVE_REPLICATION
-int ha_maria::net_read_dump(NET * net)
-{
-  int data_fd= file->dfile.file;
-  int error= 0;
-
-  my_seek(data_fd, 0L, MY_SEEK_SET, MYF(MY_WME));
-  for (;;)
-  {
-    ulong packet_len= my_net_read(net);
-    if (!packet_len)
-      break;                                    // end of file
-    if (packet_len == packet_error)
-    {
-      sql_print_error("ha_maria::net_read_dump - read error ");
-      error= -1;
-      goto err;
-    }
-    if (my_write(data_fd, (uchar *) net->read_pos, (uint) packet_len,
-                 MYF(MY_WME | MY_FNABP)))
-    {
-      error= errno;
-      goto err;
-    }
-  }
-err:
-  return error;
-}
-
-
-int ha_maria::dump(THD * thd, int fd)
-{
-  MARIA_SHARE *share= file->s;
-  NET *net= &thd->net;
-  uint block_size= share->block_size;
-  my_off_t bytes_to_read= share->state.state.data_file_length;
-  int data_fd= file->dfile.file;
-  uchar *buf= (uchar *) my_malloc(block_size, MYF(MY_WME));
-  if (!buf)
-    return ENOMEM;
-
-  int error= 0;
-  my_seek(data_fd, 0L, MY_SEEK_SET, MYF(MY_WME));
-  for (; bytes_to_read > 0;)
-  {
-    size_t bytes= my_read(data_fd, buf, block_size, MYF(MY_WME));
-    if (bytes == MY_FILE_ERROR)
-    {
-      error= errno;
-      goto err;
-    }
-
-    if (fd >= 0)
-    {
-      if (my_write(fd, buf, bytes, MYF(MY_WME | MY_FNABP)))
-      {
-        error= errno ? errno : EPIPE;
-        goto err;
-      }
-    }
-    else
-    {
-      if (my_net_write(net, buf, bytes))
-      {
-        error= errno ? errno : EPIPE;
-        goto err;
-      }
-    }
-    bytes_to_read -= bytes;
-  }
-
-  if (fd < 0)
-  {
-    if (my_net_write(net, (uchar*) "", 0))
-      error= errno ? errno : EPIPE;
-    net_flush(net);
-  }
-
-err:
-  my_free((uchar*) buf, MYF(0));
-  return error;
-}
-#endif                                          /* HAVE_REPLICATION */
-
         /* Name is here without an extension */
 
 int ha_maria::open(const char *name, int mode, uint test_if_locked)
@@ -1037,7 +994,8 @@ int ha_maria::check(THD * thd, HA_CHECK_
       file->update |= HA_STATE_CHANGED | HA_STATE_ROW_CHANGED;
       pthread_mutex_lock(&share->intern_lock);
       share->state.changed &= ~(STATE_CHANGED | STATE_CRASHED |
-                                STATE_CRASHED_ON_REPAIR);
+                                STATE_CRASHED_ON_REPAIR |
+                                STATE_BAD_OPEN_COUNT);
       if (!(table->db_stat & HA_READ_ONLY))
         error= maria_update_state_info(&param, file,
                                        UPDATE_TIME | UPDATE_OPEN_COUNT |
@@ -1330,7 +1288,8 @@ int ha_maria::repair(THD *thd, HA_CHECK 
     if ((share->state.changed & STATE_CHANGED) || maria_is_crashed(file))
     {
       share->state.changed &= ~(STATE_CHANGED | STATE_CRASHED |
-                                STATE_CRASHED_ON_REPAIR);
+                                STATE_CRASHED_ON_REPAIR |
+                                STATE_BAD_OPEN_COUNT);
       file->update |= HA_STATE_CHANGED | HA_STATE_ROW_CHANGED;
     }
     /*
@@ -1457,7 +1416,7 @@ int ha_maria::preload_keys(THD * thd, HA
 
   if ((error= maria_preload(file, map, table_list->ignore_leaves)))
   {
-    char buf[MYSYS_ERRMSG_SIZE];
+    char buf[MYSQL_ERRMSG_SIZE];
     const char *errmsg;
 
     switch (error) {
@@ -2084,7 +2043,7 @@ int ha_maria::info(uint flag, my_bool lo
     stats.data_file_length=  maria_info.data_file_length;
     stats.index_file_length= maria_info.index_file_length;
     stats.delete_length=     maria_info.delete_length;
-    stats.check_time=        maria_info.check_time;
+    stats.check_time=        (ulong) maria_info.check_time;
     stats.mean_rec_length=   maria_info.mean_reclength;
   }
   if (flag & HA_STATUS_CONST)
@@ -2092,7 +2051,7 @@ int ha_maria::info(uint flag, my_bool lo
     TABLE_SHARE *share= table->s;
     stats.max_data_file_length=  maria_info.max_data_file_length;
     stats.max_index_file_length= maria_info.max_index_file_length;
-    stats.create_time= maria_info.create_time;
+    stats.create_time= (ulong) maria_info.create_time;
     ref_length= maria_info.reflength;
     share->db_options_in_use= maria_info.options;
     stats.block_size= maria_block_size;
@@ -2135,7 +2094,7 @@ int ha_maria::info(uint flag, my_bool lo
     my_store_ptr(dup_ref, ref_length, maria_info.dup_key_pos);
   }
   /* Faster to always update, than to do it based on flag */
-  stats.update_time= maria_info.update_time;
+  stats.update_time= (ulong) maria_info.update_time;
   stats.auto_increment_value= maria_info.auto_increment;
 
   return 0;
@@ -2186,6 +2145,9 @@ int ha_maria::extra_opt(enum ha_extra_fu
 
 int ha_maria::delete_all_rows()
 {
+  THD *thd= current_thd;
+  (void) translog_log_debug_info(file->trn, LOGREC_DEBUG_INFO_QUERY,
+                                 (uchar*) thd->query, thd->query_length);
   if (file->s->now_transactional &&
       ((table->in_use->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) ||
        table->in_use->locked_tables_mode))
@@ -2202,6 +2164,9 @@ int ha_maria::delete_all_rows()
 
 int ha_maria::delete_table(const char *name)
 {
+  THD *thd= current_thd;
+  (void) translog_log_debug_info(0, LOGREC_DEBUG_INFO_QUERY,
+                                 (uchar*) thd->query, thd->query_length);
   return maria_delete_table(name);
 }
 
@@ -2209,6 +2174,7 @@ int ha_maria::external_lock(THD *thd, in
 {
   TRN *trn= THD_TRN;
   DBUG_ENTER("ha_maria::external_lock");
+  file->in_use.data= thd;
   /*
     We don't test now_transactional because it may vary between lock/unlock
     and thus confuse our reference counting.
@@ -2244,13 +2210,7 @@ int ha_maria::external_lock(THD *thd, in
         trnman_new_statement(trn);
       }
 
-      /* If handler uses versioning */
-      if (file->s->lock_key_trees)
-      {
-        if (_ma_setup_live_state(file))
-          DBUG_RETURN(HA_ERR_OUT_OF_MEM);
-      }
-      else
+      if (!file->s->lock_key_trees)             // If we don't use versioning
       {
         /*
           We come here in the following cases:
@@ -2281,6 +2241,16 @@ int ha_maria::external_lock(THD *thd, in
         DBUG_PRINT("info", ("Disabling logging for table"));
         _ma_tmp_disable_logging_for_table(file, TRUE);
       }
+#ifdef EXTRA_DEBUG
+      if (lock_type == F_WRLCK &&
+          ! (trnman_get_flags(trn) & TRN_STATE_INFO_LOGGED))
+      {
+        trnman_set_flags(trn, trnman_get_flags(trn) | TRN_STATE_INFO_LOGGED |
+                         TRN_STATE_TABLES_CAN_CHANGE);
+        (void) translog_log_debug_info(trn, LOGREC_DEBUG_INFO_QUERY,
+                                       (uchar*) thd->query, thd->query_length);
+      }
+#endif
     }
     else
     {
@@ -2293,7 +2263,8 @@ int ha_maria::external_lock(THD *thd, in
         Note that we can come here without having an exclusive lock on the
         table, for example in this case:
         external_lock(F_(WR|RD)LCK); thr_lock() which fails due to lock
-        abortion; external_lock(F_UNLCK).
+        abortion; external_lock(F_UNLCK). Fortunately, the re-enabling happens
+        only if we were the thread which disabled logging.
       */
       if (_ma_reenable_logging_for_table(file, TRUE))
         DBUG_RETURN(1);
@@ -2305,9 +2276,10 @@ int ha_maria::external_lock(THD *thd, in
         external lock of the table
       */
       file->state= &file->s->state.state;
-      if (trn && trnman_has_locked_tables(trn))
+      if (trn)
       {
-        if (!trnman_decrement_locked_tables(trn))
+        if (trnman_has_locked_tables(trn) &&
+            !trnman_decrement_locked_tables(trn))
         {
           /*
             OK should not have been sent to client yet (ACID),
@@ -2331,6 +2303,7 @@ int ha_maria::external_lock(THD *thd, in
           }
 #endif
         }
+        trnman_set_flags(trn, trnman_get_flags(trn) & ~ TRN_STATE_INFO_LOGGED);
       }
     }
   } /* if transactional table */
@@ -2365,6 +2338,16 @@ int ha_maria::start_stmt(THD *thd, thr_l
       call to start_stmt().
     */
     trnman_new_statement(trn);
+
+#ifdef EXTRA_DEBUG
+    if (!(trnman_get_flags(trn) & TRN_STATE_INFO_LOGGED) &&
+        trnman_get_flags(trn) & TRN_STATE_TABLES_CAN_CHANGE)
+    {
+      trnman_set_flags(trn, trnman_get_flags(trn) | TRN_STATE_INFO_LOGGED);
+      (void) translog_log_debug_info(trn, LOGREC_DEBUG_INFO_QUERY,
+                                     (uchar*) thd->query, thd->query_length);
+    }
+#endif
   }
   return 0;
 }
@@ -2574,6 +2557,7 @@ int ha_maria::create(const char *name, r
   TABLE_SHARE *share= table_arg->s;
   uint options= share->db_options_in_use;
   enum data_file_type row_type;
+  THD *thd= current_thd;
   DBUG_ENTER("ha_maria::create");
 
   for (i= 0; i < share->keys; i++)
@@ -2638,6 +2622,9 @@ int ha_maria::create(const char *name, r
        ha_create_info->page_checksum ==  HA_CHOICE_YES)
     create_flags|= HA_CREATE_PAGE_CHECKSUM;
 
+  (void) translog_log_debug_info(0, LOGREC_DEBUG_INFO_QUERY,
+                                 (uchar*) thd->query, thd->query_length);
+
   /* TODO: Check that the following fn_format is really needed */
   error=
     maria_create(fn_format(buff, name, "", "",
@@ -2654,6 +2641,9 @@ int ha_maria::create(const char *name, r
 
 int ha_maria::rename_table(const char *from, const char *to)
 {
+  THD *thd= current_thd;
+  (void) translog_log_debug_info(0, LOGREC_DEBUG_INFO_QUERY,
+                                 (uchar*) thd->query, thd->query_length);
   return maria_rename(from, to);
 }
 
@@ -2798,6 +2788,8 @@ static int maria_commit(handlerton *hton
   TRN *trn= THD_TRN;
   DBUG_ENTER("maria_commit");
   trnman_reset_locked_tables(trn, 0);
+  trnman_set_flags(trn, trnman_get_flags(trn) & ~TRN_STATE_INFO_LOGGED);
+
   /* statement or transaction ? */
   if ((thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) && !all)
     DBUG_RETURN(0); // end of statement
@@ -3033,6 +3025,9 @@ static int ha_maria_init(void *p)
   maria_hton->show_status= maria_show_status;
   /* TODO: decide if we support Maria being used for log tables */
   maria_hton->flags= HTON_CAN_RECREATE | HTON_SUPPORT_LOG_TABLES;
+#if !defined(EMBEDDED_LIBRARY) && defined(HAVE_MARIA_PHYSICAL_LOGGING)
+  maria_hton->get_backup_engine= maria_backup_engine;
+#endif
   bzero(maria_log_pagecache, sizeof(*maria_log_pagecache));
   maria_tmpdir= &mysql_tmpdir_list;             /* For REDO */
   res= maria_init() || ma_control_file_open(TRUE, TRUE) ||
@@ -3065,8 +3060,8 @@ static int ha_maria_init(void *p)
   @brief Register a named table with a call back function to the query cache.
 
   @param thd The thread handle
-  @param table_key A pointer to the table name in the table cache
-  @param key_length The length of the table name
+  @param table_name A pointer to the table name in the table cache
+  @param table_name_len The length of the table name
   @param[out] engine_callback The pointer to the storage engine call back
     function, currently 0
   @param[out] engine_data Engine data will be set to 0.

=== modified file 'storage/maria/ha_maria.h'
--- a/storage/maria/ha_maria.h	2009-01-12 17:50:30 +0000
+++ b/storage/maria/ha_maria.h	2009-02-13 12:40:13 +0000
@@ -1,4 +1,5 @@
-/* Copyright (C) 2006,2004 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+/* Copyright (C) 2006,2004 MySQL AB & MySQL Finland AB & TCX DataKonsult AB,
+   2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -152,10 +153,6 @@ public:
   int assign_to_keycache(THD * thd, HA_CHECK_OPT * check_opt);
   int preload_keys(THD * thd, HA_CHECK_OPT * check_opt);
   bool check_if_incompatible_data(HA_CREATE_INFO * info, uint table_changes);
-#ifdef HAVE_REPLICATION
-  int dump(THD * thd, int fd);
-  int net_read_dump(NET * net);
-#endif
 #ifdef HAVE_QUERY_CACHE
   my_bool register_query_cache_table(THD *thd, char *table_key,
                                      uint key_length,
@@ -187,3 +184,8 @@ private:
   DsMrr_impl ds_mrr;
   friend my_bool index_cond_func_maria(void *arg);
 };
+
+#if !defined(EMBEDDED_LIBRARY) && defined(HAVE_MARIA_PHYSICAL_LOGGING)
+// If embedded, there is no online backup
+Backup_result_t maria_backup_engine(handlerton *self, Backup_engine* &be);
+#endif

=== modified file 'storage/maria/ma_bitmap.c'
--- a/storage/maria/ma_bitmap.c	2008-12-09 09:56:02 +0000
+++ b/storage/maria/ma_bitmap.c	2009-02-10 14:51:40 +0000
@@ -1,4 +1,5 @@
-/* Copyright (C) 2007 Michael Widenius
+/* Copyright (C) 2007 Michael Widenius,
+   2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -144,7 +145,7 @@ static inline my_bool write_changed_bitm
 {
   DBUG_ENTER("write_changed_bitmap");
   DBUG_ASSERT(share->pagecache->block_size == bitmap->block_size);
-  DBUG_ASSERT(bitmap->file.write_callback != 0);
+  DBUG_ASSERT(bitmap->file.pre_write_callback != NULL);
   DBUG_PRINT("info", ("bitmap->non_flushable: %u", bitmap->non_flushable));
 
   if ((bitmap->non_flushable == 0)
@@ -2523,6 +2524,15 @@ int _ma_bitmap_create_first(MARIA_SHARE 
                 block_size - sizeof(marker),
                 MYF(MY_NABP | MY_WME)))
     return 1;
+  if (unlikely(ma_get_physical_logging_state(share)))
+  {
+    maria_log_chsize_physical(share, MA_LOG_CHSIZE_MAD,
+                              block_size - sizeof(marker));
+    maria_log_pwrite_physical(MA_LOG_WRITE_BYTES_MAD, share, marker,
+                              sizeof(marker),
+                              block_size - sizeof(marker));
+  }
+
   share->state.state.data_file_length= block_size;
   _ma_bitmap_delete_all(share);
   return 0;
@@ -2572,21 +2582,25 @@ void _ma_bitmap_set_pagecache_callbacks(
   file->callback_data= (uchar*) share;
   file->flush_log_callback= maria_flush_log_for_page_none;
   file->write_fail= maria_page_write_failure;
+  file->post_write_callback= &maria_flush_log_for_page_none;
 
   if (share->temporary)
   {
     file->read_callback=  &maria_page_crc_check_none;
-    file->write_callback= &maria_page_filler_set_none;
+    file->pre_write_callback= &maria_page_filler_set_none;
   }
   else
   {
     file->read_callback=  &maria_page_crc_check_bitmap;
     if (share->options & HA_OPTION_PAGE_CHECKSUM)
-      file->write_callback= &maria_page_crc_set_normal;
+      file->pre_write_callback= &maria_page_crc_set_normal;
     else
-      file->write_callback= &maria_page_filler_set_bitmap;
+      file->pre_write_callback= &maria_page_filler_set_bitmap;
     if (share->now_transactional)
       file->flush_log_callback= flush_log_for_bitmap;
+#ifdef HAVE_MARIA_PHYSICAL_LOGGING
+    file->post_write_callback= &maria_log_data_page_flush_physical;
+#endif
   }
 }
 

=== modified file 'storage/maria/ma_blockrec.c'
--- a/storage/maria/ma_blockrec.c	2008-12-09 09:56:02 +0000
+++ b/storage/maria/ma_blockrec.c	2009-02-05 22:38:30 +0000
@@ -2408,11 +2408,23 @@ static my_bool free_full_page_range(MARI
                                     uint count)
 {
   my_bool res= 0;
+  uint delete_count;
   MARIA_SHARE *share= info->s;
   DBUG_ENTER("free_full_page_range");
 
-  if (pagecache_delete_pages(share->pagecache, &info->dfile,
-                             page, count, PAGECACHE_LOCK_WRITE, 0))
+  delete_count= count;
+  if (share->state.state.data_file_length ==
+      (page + count) * share->block_size)
+  {
+    /*
+      Don't delete last page from pagecache as this will make the file
+      shorter than expected if the last operation extended the file
+    */
+    delete_count--;
+  }
+  if (delete_count &&
+      pagecache_delete_pages(share->pagecache, &info->dfile,
+                             page, delete_count, PAGECACHE_LOCK_WRITE, 0))
     res= 1;
 
   if (share->now_transactional)
@@ -3134,8 +3146,8 @@ static my_bool write_block_record(MARIA_
 
           log_pos= store_page_range(log_pos, tmp_block, block_size,
                                     blob_length, &extents);
-          tmp_block+= tmp_block->sub_blocks;
         }
+        tmp_block+= tmp_block->sub_blocks;
       }
     }
 
@@ -3489,23 +3501,26 @@ my_bool _ma_write_abort_block_record(MAR
   for (block= blocks->block + 1, end= block + blocks->count - 1; block < end;
        block++)
   {
-    if (block->used & BLOCKUSED_TAIL)
-    {
-      /*
-        block->page_count is set to the tail directory entry number in
-        write_block_record()
-      */
-      if (delete_head_or_tail(info, block->page, block->page_count & ~TAIL_BIT,
-                              0, 0))
-        res= 1;
-    }
-    else if (block->used & BLOCKUSED_USED)
+    if (block->used & BLOCKUSED_USED)
     {
-      if (free_full_page_range(info, block->page, block->page_count))
-        res= 1;
+      if (block->used & BLOCKUSED_TAIL)
+      {
+        /*
+          block->page_count is set to the tail directory entry number in
+          write_block_record()
+        */
+        if (delete_head_or_tail(info, block->page,
+                                block->page_count & ~TAIL_BIT,
+                                0, 0))
+          res= 1;
+      }
+      else
+      {
+        if (free_full_page_range(info, block->page, block->page_count))
+          res= 1;
+      }
     }
   }
-
   if (share->now_transactional)
   {
     if (_ma_write_clr(info, info->cur_row.orig_undo_lsn,
@@ -6167,6 +6182,7 @@ err:
                              PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                              LSN_IMPOSSIBLE, 0, FALSE);
   _ma_mark_file_crashed(share);
+  DBUG_ASSERT(0); /* catch recovery errors early */
   DBUG_RETURN((my_errno= error));
 }
 
@@ -6265,6 +6281,7 @@ err:
                            PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                            LSN_IMPOSSIBLE, 0, FALSE);
   _ma_mark_file_crashed(share);
+  DBUG_ASSERT(0);
   DBUG_RETURN((my_errno= error));
 
 }
@@ -6322,6 +6339,7 @@ uint _ma_apply_redo_free_blocks(MARIA_HA
     if (res)
     {
       _ma_mark_file_crashed(share);
+      DBUG_ASSERT(0);
       DBUG_RETURN(res);
     }
   }
@@ -6405,6 +6423,7 @@ uint _ma_apply_redo_free_head_or_tail(MA
 
 err:
   _ma_mark_file_crashed(share);
+  DBUG_ASSERT(0);
   DBUG_RETURN(1);
 }
 
@@ -6416,6 +6435,10 @@ err:
    @parma  lsn             LSN to put on pages
    @param  header          Header (with FILEID)
    @param  redo_lsn        REDO record's LSN
+   @param[out] number_of_blobs Number of blobs found in log record
+   @param[out] number_of_ranges Number of ranges found
+   @param[out] first_page  First page touched
+   @param[out] last_page   Last page touched
 
    @note Write full pages (full head & blob pages)
 
@@ -6426,13 +6449,18 @@ err:
 
 uint _ma_apply_redo_insert_row_blobs(MARIA_HA *info,
                                      LSN lsn, const uchar *header,
-                                     LSN redo_lsn)
+                                     LSN redo_lsn,
+                                     uint * const number_of_blobs,
+                                     uint * const number_of_ranges,
+                                     pgcache_page_no_t * const first_page,
+                                     pgcache_page_no_t * const last_page)
 {
   MARIA_SHARE *share= info->s;
   const uchar *data;
   uint      data_size= FULL_PAGE_SIZE(share->block_size);
   uint      blob_count, ranges;
   uint16    sid;
+  pgcache_page_no_t first_page2= ULONGLONG_MAX, last_page2= 0;
   DBUG_ENTER("_ma_apply_redo_insert_row_blobs");
 
   share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
@@ -6440,9 +6468,9 @@ uint _ma_apply_redo_insert_row_blobs(MAR
 
   sid= fileid_korr(header);
   header+= FILEID_STORE_SIZE;
-  ranges= pagerange_korr(header);
+  *number_of_ranges= ranges= pagerange_korr(header);
   header+= PAGERANGE_STORE_SIZE;
-  blob_count= pagerange_korr(header);
+  *number_of_blobs= blob_count= pagerange_korr(header);
   header+= PAGERANGE_STORE_SIZE;
   DBUG_ASSERT(ranges >= blob_count);
 
@@ -6480,6 +6508,8 @@ uint _ma_apply_redo_insert_row_blobs(MAR
         enum pagecache_page_pin unpin_method;
         uint length;
 
+        set_if_smaller(first_page2, page);
+        set_if_bigger(last_page2, page);
         if (_ma_redo_not_needed_for_page(sid, redo_lsn, page, FALSE))
           continue;
 
@@ -6530,15 +6560,22 @@ uint _ma_apply_redo_insert_row_blobs(MAR
           }
           else
           {
+#ifndef DBUG_OFF
+            uchar found_page_type= (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK);
+#endif
             if (lsn_korr(buff) >= lsn)
             {
               /* Already applied */
+              DBUG_PRINT("info", ("already applied %llu >= %llu",
+                                  lsn_korr(buff), lsn));
               pagecache_unlock_by_link(share->pagecache, page_link.link,
                                        PAGECACHE_LOCK_WRITE_UNLOCK,
                                        PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                                        LSN_IMPOSSIBLE, 0, FALSE);
               continue;
             }
+            DBUG_ASSERT((found_page_type == (uchar) BLOB_PAGE) ||
+                        (found_page_type == (uchar) UNALLOCATED_PAGE));
           }
           unlock_method= PAGECACHE_LOCK_WRITE_UNLOCK;
           unpin_method=  PAGECACHE_UNPIN;
@@ -6580,10 +6617,13 @@ uint _ma_apply_redo_insert_row_blobs(MAR
         goto err;
     }
   }
+  *first_page= first_page2;
+  *last_page=  last_page2;
   DBUG_RETURN(0);
 
 err:
   _ma_mark_file_crashed(share);
+  DBUG_ASSERT(0);
   DBUG_RETURN(1);
 }
 
@@ -7075,7 +7115,10 @@ my_bool _ma_apply_undo_bulk_insert(MARIA
   error= (maria_delete_all_rows(info) ||
           maria_enable_indexes(info) ||
           /* we enabled indices so need '2' below */
-          _ma_state_info_write(info->s, 1|2|4) ||
+          _ma_state_info_write(info->s,
+                               MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+                               MA_STATE_INFO_WRITE_FULL_INFO |
+                               MA_STATE_INFO_WRITE_LOCK) ||
           _ma_write_clr(info, undo_lsn, LOGREC_UNDO_BULK_INSERT,
                         FALSE, 0, &lsn, NULL));
   DBUG_RETURN(error);

=== modified file 'storage/maria/ma_blockrec.h'
--- a/storage/maria/ma_blockrec.h	2008-12-05 21:11:46 +0000
+++ b/storage/maria/ma_blockrec.h	2009-02-09 21:52:42 +0000
@@ -122,6 +122,7 @@ static inline MARIA_RECORD_POS ma_record
                                             uint dir_entry)
 {
   DBUG_ASSERT(dir_entry <= 255);
+  DBUG_ASSERT(page > 0); /* page 0 is bitmap, not data page */
   return (MARIA_RECORD_POS) (((ulonglong) page << 8) | dir_entry);
 }
 
@@ -236,7 +237,11 @@ uint _ma_apply_redo_free_blocks(MARIA_HA
 uint _ma_apply_redo_free_head_or_tail(MARIA_HA *info, LSN lsn,
                                       const uchar *header);
 uint _ma_apply_redo_insert_row_blobs(MARIA_HA *info, LSN lsn,
-                                     const uchar *header, LSN redo_lsn);
+                                     const uchar *header, LSN redo_lsn,
+                                     uint * const number_of_blobs,
+                                     uint * const number_of_ranges,
+                                     pgcache_page_no_t * const first_page,
+                                     pgcache_page_no_t * const last_page);
 my_bool _ma_apply_redo_bitmap_new_page(MARIA_HA *info, LSN lsn,
                                        const uchar *header);
 my_bool _ma_apply_undo_row_insert(MARIA_HA *info, LSN undo_lsn,

=== modified file 'storage/maria/ma_check.c'
--- a/storage/maria/ma_check.c	2008-10-20 09:16:47 +0000
+++ b/storage/maria/ma_check.c	2009-02-13 16:30:54 +0000
@@ -1,4 +1,5 @@
-/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB,
+   2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -100,6 +101,7 @@ static my_bool _ma_flush_table_files_bef
 static TrID max_trid_in_system(void);
 static void _ma_check_print_not_visible_error(HA_CHECK *param, TrID used_trid);
 void retry_if_quick(MARIA_SORT_PARAM *param, int error);
+static int chsize_kfile(MARIA_HA *info);
 
 
 /* Initialize check param with default values */
@@ -808,7 +810,7 @@ static int chk_index(HA_CHECK *param, MA
 		     ha_checksum *key_checksum, uint level)
 {
   int flag;
-  uint used_length,comp_flag,page_flag,nod_flag;
+  uint used_length,comp_flag,page_flag,nod_flag,key_length=0;
   uchar *temp_buff, *keypos, *old_keypos, *endpos;
   my_off_t next_page,record;
   MARIA_SHARE *share= info->s;
@@ -888,8 +890,9 @@ static int chk_index(HA_CHECK *param, MA
     }
     old_keypos=keypos;
     if (keypos >= endpos ||
-	!(*keyinfo->get_key)(&tmp_key, page_flag, nod_flag, &keypos))
+	(key_length=(*keyinfo->get_key)(&tmp_key, page_flag, nod_flag, &keypos)) == 0)
       break;
+    DBUG_ASSERT(key_length <= sizeof(tmp_key_buff));
     if (keypos > endpos)
     {
       _ma_check_print_error(param,
@@ -2204,7 +2207,11 @@ static my_bool protect_against_repair_cr
                             FLUSH_FORCE_WRITE,
                             discard_index ? FLUSH_IGNORE_CHANGED :
                             FLUSH_FORCE_WRITE) ||
-      (share->changed && _ma_state_info_write(share, 1|2|4)))
+      (share->changed &&
+       _ma_state_info_write(share,
+                            MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+                            MA_STATE_INFO_WRITE_FULL_INFO |
+                            MA_STATE_INFO_WRITE_LOCK)))
     return TRUE;
   /* In maria_chk this is not needed: */
   if (maria_multi_threaded && share->base.born_transactional)
@@ -2213,7 +2220,9 @@ static my_bool protect_against_repair_cr
     {
       /* this can be true only for a transactional table */
       maria_mark_crashed_on_repair(info);
-      if (_ma_state_info_write(share, 1|4))
+      if (_ma_state_info_write(share,
+                               MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+                               MA_STATE_INFO_WRITE_LOCK))
         return TRUE;
     }
     if (translog_status == TRANSLOG_OK &&
@@ -2619,7 +2628,7 @@ int maria_repair(HA_CHECK *param, regist
   {
     (void)(fputs("          \r",stdout)); (void)(fflush(stdout));
   }
-  if (my_chsize(share->kfile.file, share->state.state.key_file_length, 0, MYF(0)))
+  if (chsize_kfile(info))
   {
     _ma_check_print_warning(param,
 			   "Can't change size of indexfile, error: %d",
@@ -2647,7 +2656,7 @@ int maria_repair(HA_CHECK *param, regist
   }
 
   (void)(end_io_cache(&sort_info.new_info->rec_cache));
-  info->opt_flag&= ~WRITE_CACHE_USED;
+  sort_info.new_info->opt_flag&= ~WRITE_CACHE_USED;
 
   /*
     As we have read the data file (sort_get_next_record()) we may have
@@ -2680,6 +2689,11 @@ int maria_repair(HA_CHECK *param, regist
       my_close(new_file, MYF(MY_WME));
     new_file= -1;
     change_data_file_descriptor(info, -1);
+    /*
+      File change like this is not handled in physical log. maria_filecopy()
+      above is also not handled.
+    */
+    DBUG_ASSERT(!share->physical_logging);
     if (maria_change_to_newfile(share->data_file_name.str, MARIA_NAME_DEXT,
                                 DATA_TMP_EXT,
                                 (param->testflag & T_BACKUP_DATA ?
@@ -2993,6 +3007,7 @@ int maria_sort_index(HA_CHECK *param, re
   share->kfile.file = -1;
   pthread_mutex_unlock(&share->intern_lock);
   (void) my_close(new_file,MYF(MY_WME));
+  DBUG_ASSERT(!share->physical_logging);
   if (maria_change_to_newfile(share->index_file_name.str, MARIA_NAME_IEXT,
                               INDEX_TMP_EXT, sync_dir) ||
       _ma_open_keyfile(share))
@@ -3129,6 +3144,7 @@ static int sort_one_index(HA_CHECK *para
   length= _ma_get_page_used(share, buff);
   bzero((uchar*) buff+length,keyinfo->block_length-length);
   put_crc(buff, new_page_pos, share);
+  DBUG_ASSERT(!info->s->physical_logging);
   if (my_pwrite(new_file,(uchar*) buff,(uint) keyinfo->block_length,
 		new_page_pos,MYF(MY_NABP | MY_WAIT_IF_FULL)))
   {
@@ -3249,7 +3265,7 @@ static my_bool maria_zerofill_data(HA_CH
   pgcache_page_no_t page;
   uint block_size= share->block_size;
   MARIA_FILE_BITMAP *bitmap= &share->bitmap;
-  my_bool zero_lsn= !(param->testflag & T_ZEROFILL_KEEP_LSN);
+  my_bool zero_lsn= !(param->testflag & T_ZEROFILL_KEEP_LSN), error;
   DBUG_ENTER("maria_zerofill_data");
 
   /* This works only with BLOCK_RECORD files */
@@ -3344,15 +3360,22 @@ static my_bool maria_zerofill_data(HA_CH
                              PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                              LSN_IMPOSSIBLE, 1, FALSE);
   }
-  DBUG_RETURN(_ma_bitmap_flush(share) ||
-              flush_pagecache_blocks(share->pagecache, &info->dfile,
-                                     FLUSH_FORCE_WRITE));
+  error= _ma_bitmap_flush(share);
+  if (flush_pagecache_blocks(share->pagecache, &info->dfile,
+                             FLUSH_FORCE_WRITE))
+    error= 1;
+  DBUG_RETURN(error);
 
 err:
   pagecache_unlock_by_link(share->pagecache, page_link.link,
                            PAGECACHE_LOCK_WRITE_UNLOCK,
                            PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                            LSN_IMPOSSIBLE, 0, FALSE);
+  /* flush what was changed so far */
+  (void) _ma_bitmap_flush(share);
+  (void) flush_pagecache_blocks(share->pagecache, &info->dfile,
+                                FLUSH_FORCE_WRITE);
+
   DBUG_RETURN(1);
 }
 
@@ -3662,7 +3685,7 @@ int maria_repair_by_sort(HA_CHECK *param
 
         Note, built-in parser is always nr. 0 - see ftparser_call_initializer()
       */
-      if (sort_param.keyinfo->ftparser_nr == 0)
+      if (sort_param.keyinfo->ftkey_nr == 0)
       {
         /*
           for built-in parser the number of generated index entries
@@ -3789,6 +3812,7 @@ int maria_repair_by_sort(HA_CHECK *param
         new_file= -1;
       }
       change_data_file_descriptor(info, -1);
+      DBUG_ASSERT(!share->physical_logging);
       if (maria_change_to_newfile(share->data_file_name.str, MARIA_NAME_DEXT,
                                   DATA_TMP_EXT,
                                   (param->testflag & T_BACKUP_DATA ?
@@ -3841,16 +3865,19 @@ int maria_repair_by_sort(HA_CHECK *param
       skr=share->base.reloc*share->base.min_pack_length;
 #endif
     if (skr != sort_info.filelength)
+    {
+      DBUG_ASSERT(!share->physical_logging);
       if (my_chsize(info->dfile.file, skr, 0, MYF(0)))
 	_ma_check_print_warning(param,
 			       "Can't change size of datafile,  error: %d",
 			       my_errno);
+    }
   }
 
   if (param->testflag & T_CALC_CHECKSUM)
     share->state.state.checksum=param->glob_crc;
 
-  if (my_chsize(share->kfile.file, share->state.state.key_file_length, 0, MYF(0)))
+  if (chsize_kfile(info))
     _ma_check_print_warning(param,
 			   "Can't change size of indexfile, error: %d",
 			   my_errno);
@@ -4209,6 +4236,9 @@ int maria_repair_parallel(HA_CHECK *para
   sort_param[0].fix_datafile= ! rep_quick;
   sort_param[0].calc_checksum= test(param->testflag & T_CALC_CHECKSUM);
 
+  if (!maria_ftparser_alloc_param(info))
+    goto err;
+
   sort_info.got_error=0;
   pthread_mutex_lock(&sort_info.mutex);
 
@@ -4343,15 +4373,18 @@ int maria_repair_parallel(HA_CHECK *para
       skr=share->base.reloc*share->base.min_pack_length;
 #endif
     if (skr != sort_info.filelength)
+    {
+      DBUG_ASSERT(!share->physical_logging);
       if (my_chsize(info->dfile.file, skr, 0, MYF(0)))
 	_ma_check_print_warning(param,
 			       "Can't change size of datafile,  error: %d",
 			       my_errno);
+    }
   }
   if (param->testflag & T_CALC_CHECKSUM)
     share->state.state.checksum=param->glob_crc;
 
-  if (my_chsize(share->kfile.file, share->state.state.key_file_length, 0, MYF(0)))
+  if (chsize_kfile(info))
     _ma_check_print_warning(param,
 			   "Can't change size of indexfile, error: %d",
                             my_errno);
@@ -4397,6 +4430,7 @@ err:
     {
       my_close(new_file,MYF(0));
       info->dfile.file= new_file= -1;
+      DBUG_ASSERT(!share->physical_logging);
       if (maria_change_to_newfile(share->data_file_name.str, MARIA_NAME_DEXT,
                                   DATA_TMP_EXT,
                                   MYF((param->testflag & T_BACKUP_DATA ?
@@ -5529,6 +5563,10 @@ static int sort_insert_key(MARIA_SORT_PA
     if (my_pwrite(share->kfile.file, anc_buff,
                   (uint) keyinfo->block_length, filepos, param->myf_rw))
       DBUG_RETURN(1);
+    if (unlikely(ma_log_index_pages_physical &&
+                 ma_get_physical_logging_state(info->s)))
+      maria_log_pwrite_physical(MA_LOG_WRITE_BYTES_MAI, info->s, anc_buff,
+                                keyinfo->block_length, filepos);
   }
   DBUG_DUMP("buff", anc_buff, _ma_get_page_used(share, anc_buff));
 
@@ -5658,6 +5696,11 @@ int _ma_flush_pending_blocks(MARIA_SORT_
       if (my_pwrite(info->s->kfile.file, key_block->buff,
                     (uint) keyinfo->block_length,filepos, myf_rw))
         goto err;
+      if (unlikely(ma_log_index_pages_physical &&
+                   ma_get_physical_logging_state(info->s)))
+        maria_log_pwrite_physical(MA_LOG_WRITE_BYTES_MAI, info->s,
+                                  key_block->buff, keyinfo->block_length,
+                                  filepos);
     }
     DBUG_DUMP("buff",key_block->buff,length);
     nod_flag=1;
@@ -5953,7 +5996,9 @@ int maria_update_state_info(HA_CHECK *pa
       if (!share->state.create_time)
 	share->state.create_time= share->state.check_time;
     }
-    if (_ma_state_info_write(share, 1|2))
+    if (_ma_state_info_write(share,
+                             MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+                             MA_STATE_INFO_WRITE_FULL_INFO))
       goto err;
     share->changed=0;
   }
@@ -6675,3 +6720,45 @@ void retry_if_quick(MARIA_SORT_PARAM *so
     param->testflag|=T_RETRY_WITHOUT_QUICK;
   }
 }
+
+
+/**
+  Changes the size of an index file, and logs the operation to the physical
+  log if needed.
+
+  The only known case where my_chsize(kfile) can happen on a table doing
+  physical logging is at the end of a bulk insert into a table which was
+  empty, when we re-enable indices (maria_repair*()). my_chsize() is then in
+  fact a no-op (the file has already grown, starting from empty, and
+  info->state->key_file_length is up-to-date, so the file already has the
+  requested size). We log the operation anyway, in case there are unknown
+  cases.
+
+  @param  info            table
+
+  @return Operation status
+    @retval 0      ok
+    @retval !=0    error
+*/
+
+static int chsize_kfile(MARIA_HA *info)
+{
+  MARIA_SHARE *share= info->s;
+  my_off_t new_length= info->state->key_file_length;
+  int ret;
+#ifndef DBUG_OFF
+  my_bool no_length_change=
+    (my_seek(share->kfile.file, 0L, MY_SEEK_END, MYF(0)) == new_length);
+#endif
+
+  ret= my_chsize(share->kfile.file, new_length, 0, MYF(0));
+
+  if (unlikely(ma_log_index_pages_physical &&
+               ma_get_physical_logging_state(share)))
+  {
+    DBUG_ASSERT(no_length_change);
+    maria_log_chsize_physical(share, MA_LOG_CHSIZE_MAI, new_length);
+  }
+
+  return ret;
+}

=== modified file 'storage/maria/ma_check_standalone.h'
--- a/storage/maria/ma_check_standalone.h	2008-05-29 15:44:11 +0000
+++ b/storage/maria/ma_check_standalone.h	2009-01-19 20:25:23 +0000
@@ -28,6 +28,7 @@
   dependencies on mysqld.o, which make linking fail.
   The solution is to declare a dummy _mi_report_crashed() in the present
   header file, and include it in Maria standalone programs.
+  Same for _ma_report_crashed().
 
   Some standalone Maria programs, but less numerous than above, use objects
   from ma_check.o like maria_repair(). This brings in linking dependencies of
@@ -49,6 +50,16 @@ void _mi_report_crashed(MI_INFO *file __
 {
 }
 
+#ifndef _maria_h
+struct st_maria_handler;
+typedef struct st_maria_handler MARIA_HA;
+#endif
+void _ma_report_crashed(MARIA_HA *file __attribute__((unused)),
+                        const char *message __attribute__((unused)),
+                        const char *sfile __attribute__((unused)),
+                        uint sline __attribute__((unused)))
+{
+}
 
 #if defined(MA_CHECK_STANDALONE) && (MA_CHECK_STANDALONE == 1)
 

=== modified file 'storage/maria/ma_checkpoint.c'
--- a/storage/maria/ma_checkpoint.c	2008-12-09 09:56:02 +0000
+++ b/storage/maria/ma_checkpoint.c	2009-02-13 16:30:54 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006,2007 MySQL AB
+/* Copyright (C) 2006,2007 MySQL AB, 2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -368,7 +368,9 @@ static void flush_all_tables(int what_to
                                    FLUSH_KEEP, FLUSH_KEEP);
         break;
       case 1:
-        res= _ma_state_info_write(info->s, 1|4);
+        res= _ma_state_info_write(info->s,
+                                  MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET|
+                                  MA_STATE_INFO_WRITE_LOCK);
         DBUG_PRINT("maria_flush_states",
                    ("is_of_horizon: LSN (%lu,0x%lx)",
                     LSN_IN_PARTS(info->s->state.is_of_horizon)));
@@ -651,6 +653,14 @@ pthread_handler_t ma_checkpoint_backgrou
             We use FLUSH_KEEP_LAZY: if a file is already in flush, it's
             smarter to move to the next file than wait for this one to be
             completely flushed, which may take long.
+            StaleFilePointersInFlush: notice how below we use "dfile" which
+            is an OS file descriptor plus some function and MARIA_SHARE
+            pointers; this data dates from a previous checkpoint; since then,
+            the table may have been closed (so MARIA_SHARE* became stale), and
+            the file descriptor reassigned to another table which does not
+            have the same CRC-read-set callbacks: it is thus important that
+            flush_pagecache_blocks_with_filter() does not use the pointers,
+            only the OS file descriptor.
           */
           int res=
             flush_pagecache_blocks_with_filter(maria_pagecache,
@@ -1046,7 +1056,9 @@ static int collect_tables(LEX_STRING *st
           state_copies_horizon;
         if (kfile.file >= 0)
           sync_error|=
-            _ma_state_info_write_sub(kfile.file, &state_copy->state, 1);
+            _ma_state_info_write_sub(share, kfile.file,
+                                     &state_copy->state,
+                                     MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET);
         /*
           We don't set share->changed=0 because it may interfere with a
           concurrent _ma_writeinfo() doing share->changed=1 (cancel its

=== modified file 'storage/maria/ma_close.c'
--- a/storage/maria/ma_close.c	2008-12-09 13:11:48 +0000
+++ b/storage/maria/ma_close.c	2009-02-13 16:30:54 +0000
@@ -1,4 +1,5 @@
-/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB,
+   2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -56,6 +57,8 @@ int maria_close(register MARIA_HA *info)
   }
   if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED))
   {
+    /* Logically there should not be a WRITE_CACHE at this stage */
+    DBUG_ASSERT(!(info->opt_flag & WRITE_CACHE_USED));
     if (end_io_cache(&info->rec_cache))
       error=my_errno;
     info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
@@ -81,7 +84,11 @@ int maria_close(register MARIA_HA *info)
                                  (share->temporary ?
                                   FLUSH_IGNORE_CHANGED :
                                   FLUSH_RELEASE)))
+      {
         error= my_errno;
+        maria_print_error(share, HA_ERR_CRASHED);
+        maria_mark_crashed(info);		/* Mark that table must be checked */
+      }
 #ifdef HAVE_MMAP
       if (share->file_map)
         _ma_unmap_file(info);
@@ -100,9 +107,12 @@ int maria_close(register MARIA_HA *info)
           State must be written to file as it was not done at table's
           unlocking.
         */
-        if (_ma_state_info_write(share, 1))
+        if (_ma_state_info_write(share, MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET))
           error= my_errno;
       }
+      if (share->MA_LOG_OPEN_stored_in_physical_log)
+        _maria_log_command(&maria_physical_log, MA_LOG_CLOSE, share,
+                           NULL, 0, error);
       /*
         File must be synced as it is going out of the maria_open_list and so
         becoming unknown to future Checkpoints.
@@ -114,6 +124,7 @@ int maria_close(register MARIA_HA *info)
     }
 #ifdef THREAD
     thr_lock_delete(&share->lock);
+    my_atomic_rwlock_destroy(&share->physical_logging_rwlock);
     (void) pthread_mutex_destroy(&share->key_del_lock);
     {
       int i,keys;
@@ -154,7 +165,7 @@ int maria_close(register MARIA_HA *info)
       MARIA_STATE_HISTORY_CLOSED *history;
       /*
         Here we ignore the unlikely case that we don't have memory to
-        store the case. In the worst case what happens is that any transaction
+        store the state. In the worst case what happens is that any transaction
         that tries to access this table will get a wrong status information.
       */
       if ((history= (MARIA_STATE_HISTORY_CLOSED *)
@@ -165,6 +176,8 @@ int maria_close(register MARIA_HA *info)
         if (my_hash_insert(&maria_stored_state, (uchar*) history))
           my_free(history, MYF(0));
       }
+      /* Marker for concurrent checkpoint */
+      share->state_history= 0;
     }
   }
   pthread_mutex_unlock(&THR_LOCK_maria);

=== modified file 'storage/maria/ma_commit.c'
--- a/storage/maria/ma_commit.c	2008-12-05 21:11:46 +0000
+++ b/storage/maria/ma_commit.c	2009-01-15 21:27:36 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007 MySQL AB
+/* Copyright (C) 2007-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -17,7 +17,7 @@
 #include "trnman.h"
 
 /**
-   @brief writes a COMMIT record to log and commits transaction in memory
+   writes a COMMIT record to log and commits transaction in memory
 
    @param  trn              transaction
 
@@ -82,7 +82,7 @@ int ma_commit(TRN *trn)
 
 
 /**
-   @brief Writes a COMMIT record for a transaciton associated with a file
+   Writes a COMMIT record for a transaction associated with a file
 
    @param  info              Maria handler
 
@@ -98,13 +98,17 @@ int maria_commit(MARIA_HA *info)
 
 
 /**
-   @brief Starts a transaction on a file handle
+   Starts a transaction on a file handle
 
    @param  info              Maria handler
 
    @return Operation status
      @retval 0      ok
      @retval #      Error code.
+
+   @note this can be used only in single-threaded programs (tests),
+   because we create a transaction (trnman_new_trn) with WT_THD=0.
+   XXX it needs to be fixed when we'll start using maria_begin from SQL.
 */
 
 int maria_begin(MARIA_HA *info)

=== modified file 'storage/maria/ma_create.c'
--- a/storage/maria/ma_create.c	2008-10-20 13:03:34 +0000
+++ b/storage/maria/ma_create.c	2009-02-13 16:30:54 +0000
@@ -1,4 +1,5 @@
-/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB,
+   2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -652,7 +653,8 @@ int maria_create(const char *name, enum 
   /* There are only 16 bits for the total header length. */
   if (info_length > 65535)
   {
-    my_printf_error(0, "Maria table '%s' has too many columns and/or "
+    my_printf_error(HA_WRONG_CREATE_OPTION,
+                    "Maria table '%s' has too many columns and/or "
                     "indexes and/or unique constraints.",
                     MYF(0), name + dirname_length(name));
     my_errno= HA_WRONG_CREATE_OPTION;
@@ -750,6 +752,13 @@ int maria_create(const char *name, enum 
       (via maria_recreate_table()) and it does not have a log.
     */
     sync_dir= MY_SYNC_DIR;
+    /*
+      If we crash between _ma_state_info_write_sub() and
+      _ma_update_state_lsns_sub(), table should be ignored by Recovery (or
+      old REDOs would fail), so we cannot let LSNs be 0:
+    */
+    share.state.skip_redo_lsn= share.state.is_of_horizon=
+      share.state.create_rename_lsn= LSN_MAX;
   }
 
   if (datafile_type == DYNAMIC_RECORD)
@@ -765,6 +774,10 @@ int maria_create(const char *name, enum 
 
   if (! (flags & HA_DONT_TOUCH_DATA))
     share.state.create_time= (long) time((time_t*) 0);
+#ifdef THREAD
+  /* This rwlock is used in _ma_state_info_write(). */
+  my_atomic_rwlock_init(&share.physical_logging_rwlock);
+#endif
 
   pthread_mutex_lock(&THR_LOCK_maria);
 
@@ -839,6 +852,14 @@ int maria_create(const char *name, enum 
     my_errno= HA_ERR_TABLE_EXIST;
     goto err;
   }
+  /*
+    TRUNCATE TABLE does not work with physical logging. If we changed TRUNCATE
+    to always use maria_delete_all_rows() (remove HTON_CAN_RECREATE from
+    Maria) this would solve the problem.
+  */
+  DBUG_ASSERT((options & HA_OPTION_TMP_TABLE) || !ma_log_tables_physical ||
+              !my_hash_search(ma_log_tables_physical, filename,
+                              strlen(filename)));
 
   if ((file= my_create_with_symlink(linkname_ptr, filename, 0, create_mode,
 				    MYF(MY_WME|create_flag))) < 0)
@@ -846,7 +867,8 @@ int maria_create(const char *name, enum 
   errpos=1;
 
   DBUG_PRINT("info", ("write state info and base info"));
-  if (_ma_state_info_write_sub(file, &share.state, 2) ||
+  if (_ma_state_info_write_sub(&share, file, &share.state,
+                               MA_STATE_INFO_WRITE_FULL_INFO) ||
       _ma_base_info_write(file, &share.base))
     goto err;
   DBUG_PRINT("info", ("base_pos: %d  base_info_size: %d",
@@ -1059,11 +1081,21 @@ int maria_create(const char *name, enum 
                                        log_array, NULL, NULL) ||
                  translog_flush(lsn)))
       goto err;
+    share.kfile.file= file;
+    DBUG_EXECUTE_IF("maria_flush_whole_log",
+                    {
+                      DBUG_PRINT("maria_flush_whole_log", ("now"));
+                      translog_flush(translog_get_horizon());
+                    });
+    DBUG_EXECUTE_IF("maria_crash_create_table",
+                    {
+                      DBUG_PRINT("maria_crash_create_table", ("now"));
+                      DBUG_ABORT();
+                    });
     /*
       store LSN into file, needed for Recovery to not be confused if a
       DROP+CREATE happened (applying REDOs to the wrong table).
     */
-    share.kfile.file= file;
     if (_ma_update_state_lsns_sub(&share, lsn, trnman_get_min_safe_trid(),
                                   FALSE, TRUE))
       goto err;
@@ -1140,10 +1172,16 @@ int maria_create(const char *name, enum 
   errpos=0;
   if (my_close(file,MYF(0)))
     res= my_errno;
+#ifdef THREAD
+  my_atomic_rwlock_destroy(&share.physical_logging_rwlock);
+#endif
   DBUG_RETURN(res);
 
 err:
   pthread_mutex_unlock(&THR_LOCK_maria);
+#ifdef THREAD
+  my_atomic_rwlock_destroy(&share.physical_logging_rwlock);
+#endif
 
 err_no_lock:
   save_errno=my_errno;
@@ -1342,11 +1380,11 @@ int _ma_update_state_lsns_sub(MARIA_SHAR
   uchar buf[LSN_STORE_SIZE * 3], *ptr;
   uchar trid_buff[8];
   File file= share->kfile.file;
+  int res;
   DBUG_ASSERT(file >= 0);
 
   if (lsn == LSN_IMPOSSIBLE)
   {
-    int res;
     LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
     /* table name is logged only for information */
     log_array[TRANSLOG_INTERNAL_PARTS + 0].str=
@@ -1387,13 +1425,24 @@ int _ma_update_state_lsns_sub(MARIA_SHAR
   }
   else
     lsn_store(buf, share->state.create_rename_lsn);
-  return (my_pwrite(file, buf, sizeof(buf),
-                    sizeof(share->state.header) +
-                    MARIA_FILE_CREATE_RENAME_LSN_OFFSET, MYF(MY_NABP)) ||
-          my_pwrite(file, trid_buff, sizeof(trid_buff),
-                    sizeof(share->state.header) +
-                    MARIA_FILE_CREATE_TRID_OFFSET, MYF(MY_NABP)) ||
-          (do_sync && my_sync(file, MYF(0))));
+  res= (my_pwrite(file, buf, sizeof(buf),
+                  sizeof(share->state.header) +
+                  MARIA_FILE_CREATE_RENAME_LSN_OFFSET, MYF(MY_NABP)) ||
+        my_pwrite(file, trid_buff, sizeof(trid_buff),
+                  sizeof(share->state.header) +
+                  MARIA_FILE_CREATE_TRID_OFFSET, MYF(MY_NABP)) ||
+        (do_sync && my_sync(file, MYF(0))));
+  if (unlikely(ma_get_physical_logging_state(share)))
+  {
+    maria_log_pwrite_physical(MA_LOG_WRITE_BYTES_MAI, share, buf,
+                              sizeof(buf),
+                              MARIA_FILE_CREATE_RENAME_LSN_OFFSET);
+    maria_log_pwrite_physical(MA_LOG_WRITE_BYTES_MAI, share, trid_buff,
+                              sizeof(trid_buff),
+                              sizeof(share->state.header) +
+                              MARIA_FILE_CREATE_TRID_OFFSET);
+  }
+  return res;
 }
 #if (_MSC_VER == 1310)
 #pragma optimize("",on)

=== modified file 'storage/maria/ma_dbug.c'
--- a/storage/maria/ma_dbug.c	2008-08-25 18:23:18 +0000
+++ b/storage/maria/ma_dbug.c	2009-02-13 16:30:54 +0000
@@ -180,6 +180,7 @@ my_bool _ma_check_table_is_closed(const 
   DBUG_ENTER("_ma_check_table_is_closed");
 
   (void) fn_format(filename,name,"",MARIA_NAME_IEXT,4+16+32);
+  pthread_mutex_lock(&THR_LOCK_maria);
   for (pos=maria_open_list ; pos ; pos=pos->next)
   {
     MARIA_HA *info=(MARIA_HA*) pos->data;
@@ -190,10 +191,12 @@ my_bool _ma_check_table_is_closed(const 
       {
 	fprintf(stderr,"Warning:  Table: %s is open on %s\n", name,where);
 	DBUG_PRINT("warning",("Table: %s is open on %s", name,where));
+        pthread_mutex_unlock(&THR_LOCK_maria);
 	DBUG_RETURN(1);
       }
     }
   }
+  pthread_mutex_unlock(&THR_LOCK_maria);
   DBUG_RETURN(0);
 }
 #endif /* EXTRA_DEBUG */

=== modified file 'storage/maria/ma_delete.c'
--- a/storage/maria/ma_delete.c	2008-11-20 19:18:59 +0000
+++ b/storage/maria/ma_delete.c	2009-01-28 11:08:55 +0000
@@ -1,4 +1,5 @@
-/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB,
+   2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -40,7 +41,6 @@ int maria_delete(MARIA_HA *info,const uc
   uint i;
   uchar *old_key;
   int save_errno;
-  char lastpos[8];
   MARIA_SHARE *share= info->s;
   MARIA_KEYDEF *keyinfo;
   DBUG_ENTER("maria_delete");
@@ -118,7 +118,6 @@ int maria_delete(MARIA_HA *info,const uc
                           STATE_NOT_ZEROFILLED);
   info->state->changed=1;
 
-  mi_sizestore(lastpos, info->cur_row.lastpos);
   (void)(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE));
   allow_break();			/* Allow SIGHUP & SIGINT */
   if (info->invalidator != 0)
@@ -136,7 +135,6 @@ err:
   if (!save_errno)
     save_errno= HA_ERR_INTERNAL_ERROR;          /* Should never happen */
 
-  mi_sizestore(lastpos, info->cur_row.lastpos);
   if (save_errno != HA_ERR_RECORD_CHANGED)
   {
     maria_print_error(share, HA_ERR_CRASHED);

=== modified file 'storage/maria/ma_delete_all.c'
--- a/storage/maria/ma_delete_all.c	2008-06-26 17:48:42 +0000
+++ b/storage/maria/ma_delete_all.c	2009-02-13 16:30:54 +0000
@@ -1,4 +1,5 @@
-/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB,
+   2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -101,6 +102,11 @@ int maria_delete_all_rows(MARIA_HA *info
       my_chsize(info->dfile.file, 0, 0, MYF(MY_WME)) ||
       my_chsize(share->kfile.file, share->base.keystart, 0, MYF(MY_WME)))
     goto err;
+  if (unlikely(ma_get_physical_logging_state(share)))
+  {
+    maria_log_chsize_physical(share, MA_LOG_CHSIZE_MAD, 0);
+    maria_log_chsize_physical(share, MA_LOG_CHSIZE_MAI, share->base.keystart);
+  }
 
   if (_ma_initialize_data_file(share, info->dfile.file))
     goto err;
@@ -115,8 +121,16 @@ int maria_delete_all_rows(MARIA_HA *info
       but redo_insert are skipped (dirty pages list is empty).
       To avoid this, we need to set skip_redo_lsn now, and thus need to sync
       files.
+      Also fixes the problem of:
+      bulk insert; insert; delete_all; crash:
+      "bulk insert" is skipped (no REDOs), so if "insert" would not be skipped
+      (if we didn't update skip_redo_lsn below) then "insert" would be tried
+      and fail, saying that it sees that the first page has to be created
+      though the inserted row has rownr>0.
     */
-    my_bool error= _ma_state_info_write(share, 1|4) ||
+    my_bool error= _ma_state_info_write(share,
+                                        MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+                                        MA_STATE_INFO_WRITE_LOCK) ||
       _ma_update_state_lsns(share, lsn, trnman_get_min_trid(), FALSE, FALSE) ||
       _ma_sync_table_files(info);
     info->trn->rec_lsn= LSN_IMPOSSIBLE;

=== modified file 'storage/maria/ma_delete_table.c'
--- a/storage/maria/ma_delete_table.c	2008-08-06 14:03:27 +0000
+++ b/storage/maria/ma_delete_table.c	2008-12-22 00:17:37 +0000
@@ -69,11 +69,6 @@ int maria_delete_table(const char *name)
       MY_SYNC_DIR : 0;
     maria_close(info);
   }
-#ifdef USE_RAID
-#ifdef EXTRA_DEBUG
-  _ma_check_table_is_closed(name,"delete");
-#endif
-#endif /* USE_RAID */
 
   if (sync_dir)
   {

=== modified file 'storage/maria/ma_dynrec.c'
--- a/storage/maria/ma_dynrec.c	2008-06-05 16:11:22 +0000
+++ b/storage/maria/ma_dynrec.c	2009-01-28 11:08:55 +0000
@@ -1,4 +1,5 @@
-/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB,
+   2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -190,9 +191,11 @@ size_t _ma_nommap_pread(MARIA_HA *info, 
 size_t _ma_mmap_pwrite(MARIA_HA *info, const uchar *Buffer,
 		       size_t Count, my_off_t offset, myf MyFlags)
 {
+  MARIA_SHARE *share= info->s;
+  size_t ret;                           /* same type as my_pwrite()'s result */
   DBUG_PRINT("info", ("maria_write with mmap %d\n", info->dfile.file));
-  if (info->s->lock_key_trees)
-    rw_rdlock(&info->s->mmap_lock);
+  if (share->lock_key_trees)
+    rw_rdlock(&share->mmap_lock);
 
   /*
     The following test may fail in the following cases:
@@ -201,21 +204,24 @@ size_t _ma_mmap_pwrite(MARIA_HA *info, c
     memory mapped area.
   */
 
-  if (info->s->mmaped_length >= offset + Count)
+  if (share->mmaped_length >= offset + Count)
   {
-    memcpy(info->s->file_map + offset, Buffer, Count);
-    if (info->s->lock_key_trees)
-      rw_unlock(&info->s->mmap_lock);
-    return 0;
+    memcpy(share->file_map + offset, Buffer, Count);
+    if (share->lock_key_trees)
+      rw_unlock(&share->mmap_lock);
+    ret= 0;                             /* fully served by the mapping */
   }
   else
   {
-    info->s->nonmmaped_inserts++;
-    if (info->s->lock_key_trees)
-      rw_unlock(&info->s->mmap_lock);
-    return my_pwrite(info->dfile.file, Buffer, Count, offset, MyFlags);
-  }
-
+    share->nonmmaped_inserts++;
+    if (share->lock_key_trees)
+      rw_unlock(&share->mmap_lock);
+    ret= my_pwrite(info->dfile.file, Buffer, Count, offset, MyFlags);
+  }
+  if (unlikely(ma_get_physical_logging_state(share)))
+    maria_log_pwrite_physical(MA_LOG_WRITE_BYTES_MAD,
+                              share, Buffer, Count, offset);
+  return ret;                           /* untruncated write status */
 }
 
 
@@ -224,7 +230,12 @@ size_t _ma_mmap_pwrite(MARIA_HA *info, c
 size_t _ma_nommap_pwrite(MARIA_HA *info, const uchar *Buffer,
 			 size_t Count, my_off_t offset, myf MyFlags)
 {
-  return my_pwrite(info->dfile.file, Buffer, Count, offset, MyFlags);
+  MARIA_SHARE *share= info->s;
+  size_t ret= my_pwrite(info->dfile.file, Buffer, Count, offset, MyFlags);
+  if (unlikely(ma_get_physical_logging_state(share)))
+    maria_log_pwrite_physical(MA_LOG_WRITE_BYTES_MAD,
+                              share, Buffer, Count, offset);
+  return ret;                   /* size_t: my_pwrite()'s result, untruncated */
 }
 
 
@@ -1768,7 +1779,7 @@ int _ma_read_rnd_dynamic_record(MARIA_HA
       {						/* Check if changed */
 	info_read=1;
 	info->rec_cache.seek_not_done=1;
-	if (_ma_state_info_read_dsk(share->kfile.file, &share->state))
+	if (_ma_state_info_read_dsk(share->kfile.file, &share->state, 0))
 	  goto panic;
       }
       if (filepos >= info->state->data_file_length)

=== added file 'storage/maria/ma_examine_non_trans_log.c'
--- a/storage/maria/ma_examine_non_trans_log.c	1970-01-01 00:00:00 +0000
+++ b/storage/maria/ma_examine_non_trans_log.c	2009-02-10 14:51:40 +0000
@@ -0,0 +1,627 @@
+/* Copyright (C) 2009 - 2009 Sun Microsystems, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/**
+  @file
+  Function to display and apply a Maria physical log to tables.
+*/
+
+#ifndef USE_MY_FUNC
+#define USE_MY_FUNC
+#endif
+
+#include "maria_def.h"
+#include <my_tree.h>
+#include <stdarg.h>
+#ifdef HAVE_GETRUSAGE
+#include <sys/resource.h>
+#endif
+
+/** Human-readable names of commands storable in Maria logs */
+const char *ma_log_command_name[]=
+{"open","close",
+ "write-bytes-to-MAD", "write-bytes-to-MAI", "chsize-MAD", "chsize-MAI",
+ /*
+   This one is special: it is never in log records, it's just used by
+   ma_examine_log() to tell the user that it failed when reopening a table. It
+   has to be last before NullS.
+ */
+ "re-open", NullS};
+
+#define FILENAME(A) (A ? A->show_name : "Unknown")
+
+/** In some cases we do not want to flush the index header in mi_close() */
+static my_bool update_index_on_close= TRUE;
+
+struct file_info {
+  long process;                 /* pid from the log record; 0 if not tracked */
+  /**
+    File descriptor of the table's index file at time of logging.
+    All log records contain a corresponding descriptor value to indicate the
+    table they are about.
+  */
+  int  filenr;
+  int id;                       /* 1 + highest id among same-named entries */
+  uint rnd;                     /* NOTE(review): not referenced in this file */
+  char *name, *show_name;       /* name as logged / adjusted name + "<id>" */
+  uchar *record;                /* reclength buffer; set only when applying */
+  MARIA_HA *isam;               /* table handle; meaningful while !closed */
+  /**
+    If 'isam' is currently closed. A not 'used' file is always 'closed' (why
+    open it?). A 'used' file may temporarily be closed because of the max
+    open file descriptors limit (but if we later meet a command which wants
+    to use this file, we will re-open it).
+  */
+  my_bool closed;
+  /** If this table matches the inclusion rules (or has to be ignored) */
+  my_bool used;
+  ulong accessed;               /* number of the last command touching it */
+};
+
+struct test_if_open_param {
+  char * name;
+  int max_id;
+};
+
+struct st_access_param
+{
+  ulong min_accessed;
+  struct file_info *found;
+};
+
+#define NO_FILEPOS HA_OFFSET_ERROR
+
+void ma_examine_log_param_init(MA_EXAMINE_LOG_PARAM *param);
+int ma_examine_log(MA_EXAMINE_LOG_PARAM *param);
+static int read_string(IO_CACHE *file,uchar* *to,uint length);
+static int file_info_compare(void *cmp_arg, void *a,void *b);
+static int test_if_open(struct file_info *key,element_count count,
+			struct test_if_open_param *param);
+static int test_when_accessed(struct file_info *key,element_count count,
+			      struct st_access_param *access_param);
+static void file_info_free(struct file_info *info);
+static int close_some_file(TREE *tree);
+static int reopen_closed_file(TREE *tree,struct file_info *file_info);
+static int mi_close_care_state(MARIA_HA *info);
+static void printf_log(uint verbose, ulong isamlog_process,
+                       my_off_t isamlog_filepos, const char *format,...);
+static my_bool cmp_filename(struct file_info *file_info, const char *name);
+
+
+void ma_examine_log_param_init(MA_EXAMINE_LOG_PARAM *mi_exl)
+{
+  bzero(mi_exl,sizeof(*mi_exl));        /* all other options start as 0 */
+  mi_exl->number_of_commands= (ulong) ~0L; /* ma_examine_log() stop count */
+  mi_exl->record_pos= NO_FILEPOS;       /* filter accepts any file position */
+}
+
+
+/**
+  Displays or applies the content of a Maria physical log to tables.
+
+  Applies either to all tables referenced by the log, or only to a subset
+  specified in mi_exl->table_selection_hook.
+  If applying the content of the log, this function should be called only
+  when all involved tables are closed and cannot be opened by any concurrent
+  thread/program. It indeed opens tables and modifies them without locking
+  them.
+  Is used both by the standalone program maria_non_trans_log and by the restore
+  code of the Maria online backup driver.
+
+  @param  mi_exl           Parameters of the applying
+
+  @return Operation status
+    @retval 0      ok
+    @retval !=0    error
+*/
+
+int ma_examine_log(MA_EXAMINE_LOG_PARAM *mi_exl)
+{
+  ulong isamlog_process;
+  my_off_t isamlog_filepos;
+  uint command, result, files_open, big_numbers;
+  ulong access_time,length;
+  my_off_t filepos;
+  char isam_file_name[FN_REFLEN], llbuff[21];
+  uchar head[20], *head_ptr;
+  uchar	*buff;
+  struct test_if_open_param open_param;
+  IO_CACHE cache;
+  File log_file;
+  FILE *write_file;
+  TREE tree;
+  struct file_info file_info,*curr_file_info;
+  uint head_len[][2]=           /* header bytes, [command][big_numbers] */
+    { { 11, 14 }, { 11, 14 }, { 9, 16 }, { 9, 16 }, { 7, 12 }, { 7, 12 } };
+  uint has_pid_and_result[]= {1, 1, 0, 0, 0, 0};
+  DBUG_ENTER("ma_examine_log");
+
+  compile_time_assert((sizeof(ma_log_command_name) /
+                       sizeof(ma_log_command_name[0]) ==
+                       (MA_LOG_END_SENTINEL + 2)) &&
+                      (sizeof(has_pid_and_result) /
+                       sizeof(has_pid_and_result[0]) ==
+                       MA_LOG_END_SENTINEL) &&
+                      (sizeof(head_len) / sizeof(head_len[0]) ==
+                       MA_LOG_END_SENTINEL) &&
+                       (MA_LOG_END_SENTINEL <= MA_LOG_BIG_NUMBERS) &&
+                      (sizeof(mi_exl->com_count) /
+                       sizeof(mi_exl->com_count[0]) == MA_LOG_END_SENTINEL));
+  if ((log_file=my_open(mi_exl->log_filename,O_RDONLY,MYF(MY_WME))) < 0)
+    DBUG_RETURN(1);
+  write_file=0;
+  if (mi_exl->write_filename)
+  {
+    if (!(write_file=my_fopen(mi_exl->write_filename,O_WRONLY,MYF(MY_WME))))
+    {
+      my_close(log_file,MYF(0));
+      DBUG_RETURN(1);
+    }
+  }
+
+  init_io_cache(&cache,log_file,0,READ_CACHE,mi_exl->start_offset,0,MYF(0));
+  bzero(mi_exl->com_count,sizeof(mi_exl->com_count));
+  init_tree(&tree,0,0,sizeof(file_info),(qsort_cmp2) file_info_compare,1,
+	    (tree_element_free) file_info_free, NULL);
+
+  files_open=0; access_time=0;
+  while (access_time++ != mi_exl->number_of_commands &&
+	 !my_b_read(&cache, head, 1))
+  {
+    isamlog_filepos=my_b_tell(&cache)-1L;
+    head_ptr= head;
+    big_numbers= (head[0] & MA_LOG_BIG_NUMBERS) ? 1 : 0;
+    command= ((uint) head[0]) & ~(uint) MA_LOG_BIG_NUMBERS;
+    if (command >= MA_LOG_END_SENTINEL) /* would index past head_len[] */
+      goto unknown_command;
+    if (my_b_read(&cache, head, head_len[command][big_numbers] - 1))
+      goto err;
+    if (big_numbers)
+    {
+      file_info.filenr= mi_uint3korr(head);
+      head_ptr+= 3;
+    }
+    else
+    {
+      file_info.filenr= mi_uint2korr(head);
+      head_ptr+= 2;
+    }
+    if (has_pid_and_result[command])
+    {
+      isamlog_process= file_info.process= (long) mi_uint4korr(head_ptr);
+      head_ptr+= 4;
+      if (!mi_exl->opt_processes)
+        file_info.process=0;
+      result= mi_uint2korr(head_ptr);
+      head_ptr+= 2;
+    }
+    else
+      isamlog_process= file_info.process= result= 0;
+    if ((curr_file_info=(struct file_info*) tree_search(&tree, &file_info,
+							tree.custom_arg)))
+    {
+      curr_file_info->accessed=access_time;
+      if (mi_exl->update && curr_file_info->used && curr_file_info->closed)
+      {
+	if (reopen_closed_file(&tree,curr_file_info))
+	{
+	  command=sizeof(mi_exl->com_count)/sizeof(mi_exl->com_count[0][0])/3;
+	  result=0;
+	  goto com_err;
+	}
+        mi_exl->re_open_count++;
+      }
+    }
+    DBUG_PRINT("info",("command: %u curr_file_info: 0x%lx used: %u",
+                       command, (ulong)curr_file_info,
+                       curr_file_info ? curr_file_info->used : 0));
+    /*
+      Update the per-command statistics if this is a valid command about a
+      file we want to include. For MA_LOG_OPEN the decision is postponed
+      (see that case below), as curr_file_info is meaningless for it.
+    */
+    if ((command <
+         sizeof(mi_exl->com_count)/sizeof(mi_exl->com_count[0][0])/3) &&
+        (!mi_exl->table_selection_hook ||
+         (curr_file_info && curr_file_info->used)) &&
+        (((enum maria_log_commands) command) != MA_LOG_OPEN))
+    {
+      mi_exl->com_count[command][0]++;
+      if (result)
+        mi_exl->com_count[command][1]++;
+    }
+    switch ((enum maria_log_commands) command) {
+    case MA_LOG_OPEN:
+      if (curr_file_info)
+	printf("\nWarning: %s is opened with same process and filenumber\n"
+               "Maybe you should use the -P option ?\n",
+	       curr_file_info->show_name);
+      file_info.name=0;
+      file_info.show_name=0;
+      file_info.record=0;
+      length= big_numbers ? mi_uint4korr(head_ptr) : mi_uint2korr(head_ptr);
+      if (read_string(&cache, (uchar **)&file_info.name, length))
+	goto err;
+      {
+	uint i;
+	char *pos,*to;
+
+	/* Fix if old DOS files to new format */
+	for (pos=file_info.name; (pos=strchr(pos,'\\')) ; pos++)
+	  *pos= '/';
+
+	pos=file_info.name;
+	for (i=0 ; i < mi_exl->prefix_remove ; i++)
+	{
+	  char *next;
+	  if (!(next=strchr(pos,'/')))
+	    break;
+	  pos=next+1;
+	}
+	to=isam_file_name;
+	if (mi_exl->filepath)
+	  to=convert_dirname(isam_file_name,mi_exl->filepath,NullS);
+	strmov(to,pos);
+	fn_ext(isam_file_name)[0]=0;	/* Remove extension */
+      }
+      open_param.name=file_info.name;
+      open_param.max_id=0;
+      (void) tree_walk(&tree, (tree_walk_action) test_if_open,
+                       (void*) &open_param, left_root_right);
+      file_info.id=open_param.max_id+1;
+      /*
+       * In the line below +10 is added to accommodate '<' and '>' chars
+       * plus '\0' at the end, so that there is place for 7 digits.
+       * It is improbable that same table can have that many entries in
+       * the table cache.
+       * The additional space is needed for the sprintf commands two lines
+       * below.
+       */
+      file_info.show_name=my_memdup(isam_file_name,
+				    (uint) strlen(isam_file_name)+10,
+				    MYF(MY_WME)); /* NOTE(review): unchecked */
+      if (file_info.id > 1)
+	sprintf(strend(file_info.show_name),"<%d>",file_info.id);
+      file_info.closed=1;
+      file_info.accessed=access_time;
+      file_info.used= !mi_exl->table_selection_hook ||
+        ((*(mi_exl->table_selection_hook))(isam_file_name));
+      if (mi_exl->update && file_info.used)
+      {
+	if (files_open >= mi_exl->max_files)
+	{
+	  if (close_some_file(&tree))
+	    goto com_err;       /* NOTE(review): name/show_name not freed */
+	  files_open--;
+	}
+        /*
+          index may be truncated (if physical logging excluded its pages so
+          use HA_OPEN_FOR_REPAIR).
+        */
+	if (!(file_info.isam= maria_open(isam_file_name, O_RDWR,
+                                      HA_OPEN_FOR_REPAIR |
+                                      HA_OPEN_WAIT_IF_LOCKED)))
+	  goto com_err;         /* NOTE(review): name/show_name not freed */
+	if (!(file_info.record=my_malloc(file_info.isam->s->base.reclength,
+					 MYF(MY_WME))))
+	  goto end;             /* NOTE(review): opened table is not closed */
+	files_open++;
+	file_info.closed=0;
+      }
+      (void) tree_insert(&tree, (uchar*) &file_info, 0, tree.custom_arg);
+      if (file_info.used)
+      {
+	if (mi_exl->verbose && !mi_exl->record_pos_file)
+	  printf_log(mi_exl->verbose, isamlog_process, isamlog_filepos,
+                     "%s: open -> %d",file_info.show_name, file_info.filenr);
+	mi_exl->com_count[command][0]++;
+        /* given how we log MA_LOG_OPEN, "result" is always 0 here */
+	if (result)
+	  mi_exl->com_count[command][1]++;
+      }
+      break;
+    case MA_LOG_CLOSE:
+      if (mi_exl->verbose && !mi_exl->record_pos_file &&
+	  (!mi_exl->table_selection_hook ||
+           (curr_file_info && curr_file_info->used)))
+	printf_log(mi_exl->verbose, isamlog_process, isamlog_filepos,
+                   "%s: %s -> %d",FILENAME(curr_file_info),
+                   ma_log_command_name[command],result);
+      if (curr_file_info)
+      {
+	if (!curr_file_info->closed)
+	  files_open--;
+	(void) tree_delete(&tree, (uchar*) curr_file_info, 0, tree.custom_arg);
+      }
+      break;
+    case MA_LOG_WRITE_BYTES_MAI:
+    case MA_LOG_WRITE_BYTES_MAD:
+      if (big_numbers)
+      {
+        filepos= mi_sizekorr(head_ptr);
+        head_ptr+= 8;
+        length= mi_uint4korr(head_ptr);
+      }
+      else
+      {
+        filepos= mi_uint4korr(head_ptr);
+        head_ptr+= 4;
+        length= mi_uint2korr(head_ptr);
+      }
+      buff=0;
+      if (read_string(&cache, &buff, length))
+        goto err;
+      if ((!mi_exl->record_pos_file ||
+           ((mi_exl->record_pos == filepos ||
+             mi_exl->record_pos == NO_FILEPOS) &&
+            !cmp_filename(curr_file_info,mi_exl->record_pos_file))) &&
+          (!mi_exl->table_selection_hook ||
+           (curr_file_info && curr_file_info->used)))
+      {
+        if (write_file &&
+            (my_fwrite(write_file, buff, length,
+                       MYF(MY_WAIT_IF_FULL | MY_NABP))))
+          { my_free(buff,MYF(0)); goto end; }   /* don't leak the payload */
+        if (mi_exl->verbose)
+          printf_log(mi_exl->verbose, isamlog_process, isamlog_filepos,
+                     "%s: %s at %s, length=%lu -> %d",
+                     FILENAME(curr_file_info),
+                     ma_log_command_name[command], llstr(filepos,llbuff),
+                     length, result);
+      }
+      if (mi_exl->update && curr_file_info && !curr_file_info->closed)
+      {
+        update_index_on_close= FALSE;
+        if (my_pwrite((command == MA_LOG_WRITE_BYTES_MAI) ?
+                      curr_file_info->isam->s->kfile.file :
+                      curr_file_info->isam->dfile.file,
+                      buff,length,filepos,MYF(MY_NABP)))
+          { my_free(buff,MYF(0)); goto com_err; } /* don't leak the payload */
+      }
+      my_free(buff,MYF(0));
+      break;
+    case MA_LOG_CHSIZE_MAD:
+    case MA_LOG_CHSIZE_MAI:
+      /* here 'filepos' means new length of file */
+      if (big_numbers)
+        filepos= mi_sizekorr(head_ptr);
+      else
+        filepos= mi_uint4korr(head_ptr);
+      if ((!mi_exl->record_pos_file ||
+           ((mi_exl->record_pos == filepos ||
+             mi_exl->record_pos == NO_FILEPOS) &&
+            !cmp_filename(curr_file_info, mi_exl->record_pos_file))) &&
+          (!mi_exl->table_selection_hook ||
+           (curr_file_info && curr_file_info->used)))
+      {
+        /* nothing to write to write_file ("length" is 0) */
+        if (mi_exl->verbose)
+          printf_log(mi_exl->verbose, isamlog_process, isamlog_filepos,
+                     "%s: %s at %s -> %d", FILENAME(curr_file_info),
+                     ma_log_command_name[command], llstr(filepos,llbuff),
+                     result);
+      }
+      if (mi_exl->update && curr_file_info && !curr_file_info->closed)
+      {
+        update_index_on_close= FALSE;
+        if (my_chsize((command == MA_LOG_CHSIZE_MAI) ?
+                      curr_file_info->isam->s->kfile.file :
+                      curr_file_info->isam->dfile.file,
+                      filepos, 0, MYF(MY_WME)))
+          goto com_err;
+      }
+      break;
+    default:
+ unknown_command:               /* also entered before the header is parsed */
+      fflush(stdout);
+      fprintf(stderr, "Error: found unknown command %d in logfile, aborted\n",
+              command);
+      fflush(stderr);
+      goto end;
+    }
+  }
+  delete_tree(&tree);
+  (void) end_io_cache(&cache);
+  (void) my_close(log_file,MYF(0));
+  if (write_file && my_fclose(write_file,MYF(MY_WME)))
+    DBUG_RETURN(1);
+  DBUG_RETURN(0);
+
+ err:
+  fflush(stdout);
+  fprintf(stderr,"Got error %d when reading from logfile\n",my_errno);
+  fflush(stderr);
+  goto end;
+ com_err:
+  fflush(stdout);
+  fprintf(stderr,"Got error %d, expected %d on command %s at %s\n",
+          my_errno,result,ma_log_command_name[command],
+          llstr(isamlog_filepos,llbuff));
+  fflush(stderr);
+ end:
+  delete_tree(&tree);
+  (void) end_io_cache(&cache);
+  (void) my_close(log_file,MYF(0));
+  if (write_file)
+    (void) my_fclose(write_file,MYF(MY_WME));
+  DBUG_RETURN(1);
+}
+
+
+static int read_string(IO_CACHE *file, register uchar* *to,
+                       register uint length)
+{
+  DBUG_ENTER("read_string");
+  /* Replaces *to by a new 0-terminated copy of the next 'length' log bytes */
+  if (*to)
+    my_free((uchar*) *to,MYF(0));
+  if (!(*to= (uchar*) my_malloc((size_t) length + 1,MYF(MY_WME))) ||
+      my_b_read(file, *to,length))      /* (size_t): no uint wrap of +1 */
+  {                                     /* allocation or read failed */
+    if (*to)
+      my_free(*to,MYF(0));
+    *to= 0;
+    DBUG_RETURN(1);
+  }
+  *((char*) *to+length)= '\0';
+  DBUG_RETURN (0);
+}				/* read_string */
+
+
+static int file_info_compare(void* cmp_arg __attribute__((unused)),
+			     void *a, void *b)
+{
+  const struct file_info *left= a, *right= b;
+
+  /* Order by process first; compare instead of subtracting (no overflow) */
+  if (left->process != right->process)
+    return left->process < right->process ? -1 : 1;
+  return left->filenr - right->filenr;  /* filenr is a 2- or 3-byte value */
+}
+
+	/* ARGSUSED */
+
+static int test_if_open (struct file_info *key,
+			 element_count count __attribute__((unused)),
+			 struct test_if_open_param *param)
+{
+  if (key->id > param->max_id && strcmp(key->name,param->name) == 0)
+    param->max_id= key->id;     /* highest id seen for this logged name */
+  return 0;
+}
+
+
+	/* find the open file which hasn't been accessed for the longest time */
+	/* ARGSUSED */
+
+static int test_when_accessed (struct file_info *key,
+			       element_count count __attribute__((unused)),
+			       struct st_access_param *access_param)
+{
+  if (!key->closed && key->accessed < access_param->min_accessed)
+  {                                     /* oldest still-open entry so far */
+    access_param->found= key;
+    access_param->min_accessed= key->accessed;
+  }
+  return 0;
+}
+
+
+static void file_info_free(struct file_info *fileinfo)
+{
+  DBUG_ENTER("file_info_free");
+  /* An open handle and a record buffer exist only when applying ('update') */
+  if (!fileinfo->closed)
+    (void) mi_close_care_state(fileinfo->isam);
+  if (fileinfo->record)
+    my_free(fileinfo->record,MYF(0));
+  my_free(fileinfo->name,MYF(0));
+  my_free(fileinfo->show_name,MYF(0));
+  DBUG_VOID_RETURN;
+}
+
+
+
+static int close_some_file(TREE *tree)
+{
+  struct st_access_param lru;
+
+  lru.found=0;
+  lru.min_accessed=LONG_MAX;
+
+  /* Pick the open entry with the smallest 'accessed' stamp */
+  (void) tree_walk(tree,(tree_walk_action) test_when_accessed,
+                   (void*) &lru,left_root_right);
+  /* Fail if nothing is open, or if closing the chosen table fails */
+  if (!lru.found || mi_close_care_state(lru.found->isam))
+    return 1;
+  lru.found->closed=1;
+  return 0;
+}
+
+
+static int reopen_closed_file(TREE *tree, struct file_info *fileinfo)
+{
+  char table_path[FN_REFLEN];
+  if (close_some_file(tree))
+    return 1;				/* No file to close */
+  strmov(table_path,fileinfo->show_name);
+  if (fileinfo->id > 1)
+    *strrchr(table_path,'<')='\0';	/* Remove "<id>" */
+  fileinfo->isam= maria_open(table_path, O_RDWR,
+                             HA_OPEN_FOR_REPAIR | HA_OPEN_WAIT_IF_LOCKED);
+  if (!fileinfo->isam)
+    return 1;
+  fileinfo->closed=0;
+  return 0;
+}
+
+
+/**
+  In practice this is only called if verbose>=1. When ma_examine_log() is
+  used in the server it is with verbose==0 so this is not called.
+*/
+
+static void printf_log(uint verbose, ulong isamlog_process,
+                       my_off_t isamlog_filepos, const char *format,...)
+{
+  char llbuff[21];
+  va_list args;
+  va_start(args,format);
+  DBUG_ASSERT(verbose > 0);
+  if (verbose > 2)                      /* prefix with the log file offset */
+    printf("%9s:",llstr(isamlog_filepos,llbuff));
+  if (verbose > 1)                      /* %lu: the argument is a ulong */
+    printf("%5lu ",isamlog_process);	/* Write process number */
+  (void) vprintf((char*) format,args);
+  putchar('\n');
+  va_end(args);
+}
+
+
+static my_bool cmp_filename(struct file_info *file_info, const char *name)
+{
+  /* strcmp()-like: 0 means "same name"; a missing entry never matches */
+  if (file_info && !strcmp(file_info->name,name))
+    return 0;
+  return 1;
+}
+
+
+/**
+  Closes a table but, if physical log, updates the share from disk first.
+
+  mi_close() calls mi_state_info_write() if the table is corrupted.
+  This can happen for example if the table is from an online backup which
+  made a copy of its data file and only its index' header.
+  But in that case, if we have executed some MA_LOG_WRITE_BYTES_MAI commands,
+  the state in memory is older than the state on disk, so we update the
+  share from disk.
+
+  @return Operation status
+    @retval 0      ok
+    @retval !=0    error
+*/
+
+static int mi_close_care_state(MARIA_HA *info)
+{
+  /* Files were patched directly: reload the state from disk before closing */
+  if (!update_index_on_close)
+  {
+    MARIA_SHARE *const table_share= info->s;
+    (void) _ma_state_info_read_dsk(table_share->kfile.file,
+                                   &table_share->state, 1);
+  }
+  return maria_close(info);
+}

=== modified file 'storage/maria/ma_extra.c'
--- a/storage/maria/ma_extra.c	2008-12-08 21:15:06 +0000
+++ b/storage/maria/ma_extra.c	2009-02-13 16:30:54 +0000
@@ -1,4 +1,5 @@
-/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB,
+   2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -21,6 +22,9 @@
 
 static void maria_extra_keyflag(MARIA_HA *info,
                                 enum ha_extra_function function);
+static int log_flushed_write_cache_physical(IO_CACHE *cache_to_table,
+                                            const uchar *buffert,
+                                            uint length, my_off_t offset);
 
 /**
    @brief Set options and buffers to optimize table handling
@@ -152,6 +156,19 @@ int maria_extra(MARIA_HA *info, enum ha_
                            HA_STATE_WRITE_AT_END |
                            HA_STATE_EXTEND_BLOCK);
       }
+#ifdef HAVE_MARIA_PHYSICAL_LOGGING
+    if (!share->temporary)
+    {
+      /*
+        This is a post_write: physical_logging_state has to be checked after
+        doing the table write (see ma_log_start_physical()).
+        We set it now as physical logging may be requested later when the
+        cache has started being used.
+      */
+      info->rec_cache.post_write= log_flushed_write_cache_physical;
+      info->rec_cache.arg= share;
+    }
+#endif
     break;
   case HA_EXTRA_PREPARE_FOR_UPDATE:
     if (info->s->data_file_type != DYNAMIC_RECORD)
@@ -272,7 +289,9 @@ int maria_extra(MARIA_HA *info, enum ha_
         for the posterity is by writing it to disk.
       */
       DBUG_ASSERT(!maria_in_recovery);
-      error= _ma_state_info_write(share, 1|2);
+      error= _ma_state_info_write(share,
+                                  MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+                                  MA_STATE_INFO_WRITE_FULL_INFO);
     }
     pthread_mutex_unlock(&share->intern_lock);
     break;
@@ -290,7 +309,9 @@ int maria_extra(MARIA_HA *info, enum ha_
     if (!error && share->changed)
     {
       pthread_mutex_lock(&share->intern_lock);
-      if (!(error= _ma_state_info_write(share, 1|2)))
+      if (!(error= _ma_state_info_write(share,
+                                        MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET|
+                                        MA_STATE_INFO_WRITE_FULL_INFO)))
         share->changed= 0;
       pthread_mutex_unlock(&share->intern_lock);
     }
@@ -348,7 +369,10 @@ int maria_extra(MARIA_HA *info, enum ha_
       if (do_flush)
       {
         /* Save the state so that others can find it from disk. */
-        if ((share->changed && _ma_state_info_write(share, 1 | 2)) ||
+        if ((share->changed &&
+             _ma_state_info_write(share,
+                                  MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+                                  MA_STATE_INFO_WRITE_FULL_INFO)) ||
             my_sync(share->kfile.file, MYF(0)))
           error= my_errno;
         else
@@ -508,6 +532,19 @@ int maria_reset(MARIA_HA *info)
   */
   if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED))
   {
+    /*
+      If there is a WRITE_CACHE here and we don't hold a write-lock or
+      intern_lock on the table, then ma_log_stop_physical() may be running
+      now in another thread and may be flushing the write cache now (and two
+      concurrent end_io_cache() will cause problems). For example when the
+      SQL layer unlocks tables and then calls ha_maria::reset() we must not
+      come here. Temp tables are not concerned.
+    */
+    if (!share->temporary && (info->opt_flag & WRITE_CACHE_USED) &&
+        (info->lock.type <= TL_READ_NO_INSERT))
+    {
+      safe_mutex_assert_owner(&share->intern_lock);
+    }
     info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
     error= end_io_cache(&info->rec_cache);
   }
@@ -594,7 +631,11 @@ int _ma_flush_table_files(MARIA_HA *info
           error= 1;
       }
       else
-        info->s->bitmap.changed= 0;
+      {
+        pthread_mutex_lock(&share->bitmap.bitmap_lock);
+        share->bitmap.changed= 0;
+        pthread_mutex_unlock(&share->bitmap.bitmap_lock);
+      }
       if (flush_pagecache_blocks(share->pagecache, &info->dfile,
                                  flush_type_for_data))
         error= 1;
@@ -611,3 +652,31 @@ int _ma_flush_table_files(MARIA_HA *info
   maria_mark_crashed(info);
   return 1;
 }
+
+
+/**
+  Logs when the WRITE_CACHE is flushed to the data file, to the physical
+  log.
+
+  @param  cache_for_table  pointer to the table's WRITE_CACHE IO_CACHE
+  @param  buffert          argument to the pwrite
+  @param  length           length of buffer
+  @param  filepos          offset in file where buffer was written
+
+  @return Operation status, always 0
+    @retval 0      ok. Yes, even if log write fails we return ok, don't want
+                   to make the table writer believe its table is now
+                   corrupted.
+*/
+
+static int log_flushed_write_cache_physical(IO_CACHE *cache_for_table,
+                                            const uchar *buffert,
+                                            uint length, my_off_t filepos)
+{
+  MARIA_SHARE *const table_share= (MARIA_SHARE *) cache_for_table->arg;
+  DBUG_ENTER("log_flushed_write_cache_physical");
+  if (unlikely(ma_get_physical_logging_state(table_share)))
+    maria_log_pwrite_physical(MA_LOG_WRITE_BYTES_MAD, table_share,
+                              buffert, length, filepos);
+  DBUG_RETURN(0);               /* the logging call's outcome is not used */
+}

=== modified file 'storage/maria/ma_ft_boolean_search.c'
--- a/storage/maria/ma_ft_boolean_search.c	2008-06-26 05:18:28 +0000
+++ b/storage/maria/ma_ft_boolean_search.c	2009-02-12 17:51:00 +0000
@@ -161,11 +161,11 @@ static int FTB_WORD_cmp(my_off_t *v, FTB
 
 static int FTB_WORD_cmp_list(CHARSET_INFO *cs, FTB_WORD **a, FTB_WORD **b)
 {
-  /* ORDER BY word DESC, ndepth DESC */
-  int i= ha_compare_text(cs, (uchar*) (*b)->word+1,(*b)->len-1,
-                             (uchar*) (*a)->word+1,(*a)->len-1,0,0);
+  /* ORDER BY word, ndepth */
+  int i= ha_compare_text(cs, (uchar*) (*a)->word + 1,(*a)->len - 1,
+                             (uchar*) (*b)->word + 1,(*b)->len - 1, 0, 0);
   if (!i)
-    i=CMP_NUM((*b)->ndepth,(*a)->ndepth);
+    i=CMP_NUM((*a)->ndepth, (*b)->ndepth);
   return i;
 }
 
@@ -879,23 +879,49 @@ static int ftb_find_relevance_add_word(M
   FT_INFO *ftb= ftb_param->ftb;
   FTB_WORD *ftbw;
   int a, b, c;
+  /*
+    Find right-most element in the array of query words matching this
+    word from a document.
+  */
   for (a= 0, b= ftb->queue.elements, c= (a+b)/2; b-a>1; c= (a+b)/2)
   {
     ftbw= ftb->list[c];
     if (ha_compare_text(ftb->charset, (uchar*)word, len,
                         (uchar*)ftbw->word+1, ftbw->len-1,
-                        (my_bool)(ftbw->flags&FTB_FLAG_TRUNC), 0) > 0)
+                        (my_bool)(ftbw->flags&FTB_FLAG_TRUNC), 0) < 0)
       b= c;
     else
       a= c;
   }
+  /*
+    If there were no words with truncation operator, we iterate to the
+    beginning of an array until array element is equal to the word from
+    a document. This is done mainly because the same word may be
+    mentioned twice (or more) in the query.
+
+    In case query has words with truncation operator we must iterate
+    to the beginning of the array. There may be non-matching query words
+    between matching word with truncation operator and the right-most
+    matching element. E.g., if we're looking for 'aaa15' in an array of
+    'aaa1* aaa14 aaa15 aaa16'.
+
+    Worse than that, there may still be a match even if the binary search
+    above didn't find a matching element. E.g., if we're looking for
+    'aaa15' in an array of 'aaa1* aaa14 aaa16'. The binary search will
+    stop at 'aaa16'.
+  */
   for (; c >= 0; c--)
   {
     ftbw= ftb->list[c];
     if (ha_compare_text(ftb->charset, (uchar*)word, len,
                         (uchar*)ftbw->word + 1,ftbw->len - 1,
                         (my_bool)(ftbw->flags & FTB_FLAG_TRUNC), 0))
-      break;
+    {
+      if (ftb->with_scan & FTB_FLAG_TRUNC)
+        continue;
+      else
+        break;
+    }
     if (ftbw->docid[1] == ftb->info->cur_row.lastpos)
       continue;
     ftbw->docid[1]= ftb->info->cur_row.lastpos;

=== modified file 'storage/maria/ma_ft_parser.c'
--- a/storage/maria/ma_ft_parser.c	2008-06-24 14:14:56 +0000
+++ b/storage/maria/ma_ft_parser.c	2009-02-12 17:51:00 +0000
@@ -326,59 +326,39 @@ int maria_ft_parse(TREE *wtree, uchar *d
 
 
 #define MAX_PARAM_NR 2
-MYSQL_FTPARSER_PARAM *maria_ftparser_call_initializer(MARIA_HA *info,
-                                                      uint keynr, uint paramnr)
+
+MYSQL_FTPARSER_PARAM* maria_ftparser_alloc_param(MARIA_HA *info)
 {
-  uint32 ftparser_nr;
-  struct st_mysql_ftparser *parser;
-  if (! info->ftparser_param)
+  if (!info->ftparser_param)
   {
-    /* info->ftparser_param can not be zero after the initialization,
-       because it always includes built-in fulltext parser. And built-in
-       parser can be called even if the table has no fulltext indexes and
-       no varchar/text fields. */
-    if (! info->s->ftparsers)
-    {
-      /* It's ok that modification to shared structure is done w/o mutex
-         locks, because all threads would set the same variables to the
-         same values. */
-      uint i, j, keys= info->s->state.header.keys, ftparsers= 1;
-      for (i= 0; i < keys; i++)
-      {
-        MARIA_KEYDEF *keyinfo= &info->s->keyinfo[i];
-        if (keyinfo->flag & HA_FULLTEXT)
-        {
-          for (j= 0;; j++)
-          {
-            if (j == i)
-            {
-              keyinfo->ftparser_nr= ftparsers++;
-              break;
-            }
-            if (info->s->keyinfo[j].flag & HA_FULLTEXT &&
-                keyinfo->parser == info->s->keyinfo[j].parser)
-            {
-              keyinfo->ftparser_nr= info->s->keyinfo[j].ftparser_nr;
-              break;
-            }
-          }
-        }
-      }
-      info->s->ftparsers= ftparsers;
-    }
-    /*
-      We have to allocate two MYSQL_FTPARSER_PARAM structures per plugin
-      because in a boolean search a parser is called recursively
-      ftb_find_relevance* calls ftb_check_phrase*
-      (MAX_PARAM_NR=2)
+    /* 
+      info->ftparser_param can not be zero after the initialization,
+      because it always includes built-in fulltext parser. And built-in
+      parser can be called even if the table has no fulltext indexes and
+      no varchar/text fields.
+
+      ftb_find_relevance... parser (ftb_find_relevance_parse,
+      ftb_find_relevance_add_word) calls ftb_check_phrase... parser
+      (ftb_check_phrase_internal, ftb_phrase_add_word). Thus MAX_PARAM_NR=2.
     */
     info->ftparser_param= (MYSQL_FTPARSER_PARAM *)
       my_malloc(MAX_PARAM_NR * sizeof(MYSQL_FTPARSER_PARAM) *
-                info->s->ftparsers, MYF(MY_WME|MY_ZEROFILL));
+                info->s->ftkeys, MYF(MY_WME | MY_ZEROFILL));
     init_alloc_root(&info->ft_memroot, FTPARSER_MEMROOT_ALLOC_SIZE, 0);
-    if (! info->ftparser_param)
-      return 0;
   }
+  return info->ftparser_param;
+}
+
+
+MYSQL_FTPARSER_PARAM *maria_ftparser_call_initializer(MARIA_HA *info,
+                                                      uint keynr, uint paramnr)
+{
+  uint32 ftparser_nr;
+  struct st_mysql_ftparser *parser;
+  
+  if (!maria_ftparser_alloc_param(info))
+    return 0;
+
   if (keynr == NO_SUCH_KEY)
   {
     ftparser_nr= 0;
@@ -386,7 +366,7 @@ MYSQL_FTPARSER_PARAM *maria_ftparser_cal
   }
   else
   {
-    ftparser_nr= info->s->keyinfo[keynr].ftparser_nr;
+    ftparser_nr= info->s->keyinfo[keynr].ftkey_nr;
     parser= info->s->keyinfo[keynr].parser;
   }
   DBUG_ASSERT(paramnr < MAX_PARAM_NR);
@@ -419,7 +399,7 @@ void maria_ftparser_call_deinitializer(M
     for (j=0; j < MAX_PARAM_NR; j++)
     {
       MYSQL_FTPARSER_PARAM *ftparser_param=
-        &info->ftparser_param[keyinfo->ftparser_nr*MAX_PARAM_NR + j];
+        &info->ftparser_param[keyinfo->ftkey_nr*MAX_PARAM_NR + j];
       if (keyinfo->flag & HA_FULLTEXT && ftparser_param->mysql_add_word)
       {
         if (keyinfo->parser->deinit)

=== modified file 'storage/maria/ma_ft_update.c'
--- a/storage/maria/ma_ft_update.c	2008-06-26 05:18:28 +0000
+++ b/storage/maria/ma_ft_update.c	2009-01-19 20:25:23 +0000
@@ -342,8 +342,10 @@ uint _ma_ft_convert_to_ft2(MARIA_HA *inf
   info->keyread_buff_used= info->page_changed=1;      /* info->buff is used */
   /**
     @todo RECOVERY BUG this is not logged yet. Ok as this code is never
-    called, but soon it will be.
+    called. We would need to pin pages until all REDOs are written. Probably
+    no need for an UNDO.
   */
+  DBUG_ASSERT(0);
   if ((root= _ma_new(info, DFLT_INIT_HITS, &page_link)) == HA_OFFSET_ERROR ||
       _ma_write_keypage(info, keyinfo, root, page_link->write_lock,
                         DFLT_INIT_HITS, info->buff))

=== modified file 'storage/maria/ma_ftdefs.h'
--- a/storage/maria/ma_ftdefs.h	2008-06-26 05:18:28 +0000
+++ b/storage/maria/ma_ftdefs.h	2009-02-12 17:51:00 +0000
@@ -147,6 +147,7 @@ void maria_ft_boolean_close_search(FT_IN
 float maria_ft_boolean_get_relevance(FT_INFO *);
 my_off_t maria_ft_boolean_get_docid(FT_INFO *);
 void maria_ft_boolean_reinit_search(FT_INFO *);
+MYSQL_FTPARSER_PARAM* maria_ftparser_alloc_param(MARIA_HA *info);
 extern MYSQL_FTPARSER_PARAM *maria_ftparser_call_initializer(MARIA_HA *info,
                                                              uint keynr,
                                                              uint paramnr);

=== modified file 'storage/maria/ma_init.c'
--- a/storage/maria/ma_init.c	2009-01-27 02:08:48 +0000
+++ b/storage/maria/ma_init.c	2009-02-13 12:40:13 +0000
@@ -1,4 +1,5 @@
-/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB,
+   2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -62,6 +63,7 @@ int maria_init(void)
   {
     maria_inited= TRUE;
     pthread_mutex_init(&THR_LOCK_maria,MY_MUTEX_INIT_SLOW);
+    pthread_mutex_init(&THR_LOCK_maria_log,MY_MUTEX_INIT_SLOW);
     _ma_init_block_record_data();
     trnman_end_trans_hook= _ma_trnman_end_trans_hook;
     my_handler_error_register();
@@ -99,6 +101,7 @@ void maria_end(void)
     end_pagecache(maria_pagecache, TRUE);
     ma_control_file_end();
     pthread_mutex_destroy(&THR_LOCK_maria);
+    pthread_mutex_destroy(&THR_LOCK_maria_log);
     my_hash_free(&maria_stored_state);
   }
 }

=== modified file 'storage/maria/ma_key_recover.c'
--- a/storage/maria/ma_key_recover.c	2008-10-14 15:18:14 +0000
+++ b/storage/maria/ma_key_recover.c	2009-01-08 08:20:04 +0000
@@ -123,12 +123,23 @@ my_bool _ma_write_clr(MARIA_HA *info, LS
   log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
   log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - log_data);
 
+
+  /*
+    We need intern_lock mutex for calling _ma_state_info_write in the trigger.
+    We do it here to have the same sequence of mutexes locking everywhere
+    (first intern_lock then transactional log  buffer lock)
+  */
+  if (undo_type == LOGREC_UNDO_BULK_INSERT)
+    pthread_mutex_lock(&info->s->intern_lock);
+
   res= translog_write_record(res_lsn, LOGREC_CLR_END,
                              info->trn, info,
                              (translog_size_t)
                              log_array[TRANSLOG_INTERNAL_PARTS + 0].length,
                              TRANSLOG_INTERNAL_PARTS + 1, log_array,
                              log_data + LSN_STORE_SIZE, &msg);
+  if (undo_type == LOGREC_UNDO_BULK_INSERT)
+    pthread_mutex_unlock(&info->s->intern_lock);
   DBUG_RETURN(res);
 }
 
@@ -149,6 +160,7 @@ my_bool write_hook_for_clr_end(enum tran
   struct st_msg_to_write_hook_for_clr_end *msg=
     (struct st_msg_to_write_hook_for_clr_end *)hook_arg;
   my_bool error= FALSE;
+  DBUG_ENTER("write_hook_for_clr_end");
   DBUG_ASSERT(trn->trid != 0);
   trn->undo_lsn= msg->previous_undo_lsn;
 
@@ -177,9 +189,12 @@ my_bool write_hook_for_clr_end(enum tran
   case LOGREC_UNDO_KEY_DELETE:
     break;
   case LOGREC_UNDO_BULK_INSERT:
+    safe_mutex_assert_owner(&share->intern_lock);
     error= (maria_enable_indexes(tbl_info) ||
             /* we enabled indices, need '2' below */
-            _ma_state_info_write(share, 1|2|4));
+            _ma_state_info_write(share,
+                                 MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+                                 MA_STATE_INFO_WRITE_FULL_INFO));
     /* no need for _ma_reset_status(): REDO_DELETE_ALL is just before us */
     break;
   default:
@@ -187,7 +202,7 @@ my_bool write_hook_for_clr_end(enum tran
   }
   if (trn->undo_lsn == LSN_IMPOSSIBLE) /* has fully rolled back */
     trn->first_undo_lsn= LSN_WITH_FLAGS_TO_FLAGS(trn->first_undo_lsn);
-  return error;
+  DBUG_RETURN(error);
 }
 
 

=== modified file 'storage/maria/ma_locking.c'
--- a/storage/maria/ma_locking.c	2008-10-23 16:29:52 +0000
+++ b/storage/maria/ma_locking.c	2009-02-13 16:30:54 +0000
@@ -1,4 +1,5 @@
-/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB,
+   2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -13,9 +14,10 @@
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
 
-/*
+/**
+  @file
   Locking of Maria-tables.
-  Must be first request before doing any furter calls to any Maria function.
+  Must be first request before doing any further calls to any Maria function.
   Is used to allow many process use the same non transactional Maria table
 */
 
@@ -43,6 +45,7 @@ int maria_lock_database(MARIA_HA *info, 
     ++share->w_locks;
     ++share->tot_locks;
     info->lock_type= lock_type;
+    info->s->in_use= list_add(info->s->in_use, &info->in_use);
     DBUG_RETURN(0);
   }
 
@@ -78,6 +81,12 @@ int maria_lock_database(MARIA_HA *info, 
       }
       if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED))
       {
+        /*
+          Logically there should not be a WRITE_CACHE at this stage, except
+          maybe for temporary tables.
+        */
+        DBUG_ASSERT(share->temporary ||
+                    !(info->opt_flag & WRITE_CACHE_USED));
 	if (end_io_cache(&info->rec_cache))
 	{
 	  error=my_errno;
@@ -87,55 +96,21 @@ int maria_lock_database(MARIA_HA *info, 
       }
       if (!count)
       {
+        int local_error;
 	DBUG_PRINT("info",("changed: %u  w_locks: %u",
 			   (uint) share->changed, share->w_locks));
-	if (share->changed && !share->w_locks)
-	{
-#ifdef HAVE_MMAP
-          if ((share->mmaped_length !=
-               share->state.state.data_file_length) &&
-              (share->nonmmaped_inserts > MAX_NONMAPPED_INSERTS))
-          {
-            if (share->lock_key_trees)
-              rw_wrlock(&share->mmap_lock);
-            _ma_remap_file(info, share->state.state.data_file_length);
-            share->nonmmaped_inserts= 0;
-            if (share->lock_key_trees)
-              rw_unlock(&share->mmap_lock);
-          }
-#endif
-#ifdef EXTERNAL_LOCKING
-	  share->state.process= share->last_process=share->this_process;
-	  share->state.unique=   info->last_unique=  info->this_unique;
-	  share->state.update_count= info->last_loop= ++info->this_loop;
-#endif
-          /* transactional tables rather flush their state at Checkpoint */
-          if (!share->base.born_transactional)
-          {
-            if (_ma_state_info_write_sub(share->kfile.file, &share->state, 1))
-              error= my_errno;
-            else
-            {
-              /* A value of 0 means below means "state flushed" */
-              share->changed= 0;
-            }
-          }
-	  if (maria_flush)
-	  {
-            if (_ma_sync_table_files(info))
-	      error= my_errno;
-	  }
-	  else
-	    share->not_flushed=1;
-	  if (error)
-          {
-            maria_print_error(info->s, HA_ERR_CRASHED);
-	    maria_mark_crashed(info);
-          }
+	if (share->changed && !share->w_locks &&
+            (local_error=
+             ma_remap_file_and_write_state_for_unlock(info, FALSE)))
+        {
+          error= local_error;
+          maria_print_error(share, HA_ERR_CRASHED);
+          maria_mark_crashed(info);
 	}
       }
       info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
       info->lock_type= F_UNLCK;
+      info->s->in_use= list_delete(info->s->in_use, &info->in_use);
       break;
     case F_RDLCK:
       if (info->lock_type == F_WRLCK)
@@ -166,6 +141,7 @@ int maria_lock_database(MARIA_HA *info, 
       share->r_locks++;
       share->tot_locks++;
       info->lock_type=lock_type;
+      info->s->in_use= list_add(info->s->in_use, &info->in_use);
       break;
     case F_WRLCK:
       if (info->lock_type == F_RDLCK)
@@ -216,13 +192,14 @@ int maria_lock_database(MARIA_HA *info, 
       info->invalidator=share->invalidator;
       share->w_locks++;
       share->tot_locks++;
+      info->s->in_use= list_add(info->s->in_use, &info->in_use);
       break;
     default:
       DBUG_ASSERT(0);
       break;				/* Impossible */
     }
   }
-#ifdef __WIN__
+#ifdef _WIN32
   else
   {
     /*
@@ -316,15 +293,13 @@ int _ma_writeinfo(register MARIA_HA *inf
       share->state.update_count= info->last_loop= ++info->this_loop;
 #endif
 
-      if ((error= _ma_state_info_write_sub(share->kfile.file,
-                                           &share->state, 1)))
+      if ((error= _ma_state_info_write_sub(share, share->kfile.file,
+                                           &share->state,
+                                           MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET)))
 	olderror=my_errno;
-#ifdef __WIN__
+#ifdef _WIN32
       if (maria_flush)
-      {
-	_commit(share->kfile.file);
-	_commit(info->dfile.file);
-      }
+        _ma_sync_table_files(info);
 #endif
       my_errno=olderror;
     }
@@ -388,9 +363,15 @@ int _ma_mark_file_changed(MARIA_HA *info
 {
   uchar buff[3];
   register MARIA_SHARE *share= info->s;
+  int error= 1;
   DBUG_ENTER("_ma_mark_file_changed");
 
-  if (!(share->state.changed & STATE_CHANGED) || ! share->global_changed)
+#define _MA_ALREADY_MARKED_FILE_CHANGED                                 \
+  ((share->state.changed & STATE_CHANGED) && share->global_changed)
+  if (_MA_ALREADY_MARKED_FILE_CHANGED)
+    DBUG_RETURN(0);
+  pthread_mutex_lock(&share->intern_lock); /* recheck under mutex */
+  if (! _MA_ALREADY_MARKED_FILE_CHANGED)
   {
     share->state.changed|=(STATE_CHANGED | STATE_NOT_ANALYZED |
 			   STATE_NOT_OPTIMIZED_KEYS);
@@ -413,11 +394,15 @@ int _ma_mark_file_changed(MARIA_HA *info
     {
       mi_int2store(buff,share->state.open_count);
       buff[2]=1;				/* Mark that it's changed */
+      /*
+        Don't need to log it to physical log, as online backup does dirty
+        copies anyway.
+      */
       if (my_pwrite(share->kfile.file, buff, sizeof(buff),
                     sizeof(share->state.header) +
                     MARIA_FILE_OPEN_COUNT_OFFSET,
                     MYF(MY_NABP)))
-        DBUG_RETURN(1);
+        goto err;
     }
     /* Set uuid of file if not yet set (zerofilled file) */
     if (share->base.born_transactional &&
@@ -429,11 +414,15 @@ int _ma_mark_file_changed(MARIA_HA *info
            _ma_update_state_lsns_sub(share, LSN_IMPOSSIBLE,
                                      trnman_get_min_trid(),
                                      TRUE, TRUE)))
-        DBUG_RETURN(1);
+        goto err;
       share->state.changed|= STATE_NOT_MOVABLE;
     }
   }
-  DBUG_RETURN(0);
+  error= 0;
+err:
+  pthread_mutex_unlock(&share->intern_lock);
+  DBUG_RETURN(error);
+#undef _MA_ALREADY_MARKED_FILE_CHANGED
 }
 
 /*
@@ -480,6 +469,10 @@ int _ma_decrement_open_count(MARIA_HA *i
       if (!share->temporary)
       {
         mi_int2store(buff,share->state.open_count);
+        /*
+          Don't need to log it to physical log, as online backup does dirty
+          copies anyway.
+        */
         write_error= (int) my_pwrite(share->kfile.file, buff, sizeof(buff),
                                      sizeof(share->state.header) +
                                      MARIA_FILE_OPEN_COUNT_OFFSET,
@@ -527,14 +520,81 @@ void _ma_mark_file_crashed(MARIA_SHARE *
 my_bool _ma_set_uuid(MARIA_HA *info, my_bool reset_uuid)
 {
   uchar buff[MY_UUID_SIZE], *uuid;
-
+  my_bool ret;
   uuid= maria_uuid;
   if (reset_uuid)
   {
     bzero(buff, sizeof(buff));
     uuid= buff;
   }
-  return (my_bool) my_pwrite(info->s->kfile.file, uuid, MY_UUID_SIZE,
-                             mi_uint2korr(info->s->state.header.base_pos),
-                             MYF(MY_NABP));
+  ret= (my_bool) my_pwrite(info->s->kfile.file, uuid, MY_UUID_SIZE,
+                           mi_uint2korr(info->s->state.header.base_pos),
+                           MYF(MY_NABP));
+  if (unlikely(ma_get_physical_logging_state(info->s)))
+    maria_log_pwrite_physical(MA_LOG_WRITE_BYTES_MAI, info->s, uuid,
+                              MY_UUID_SIZE,
+                              mi_uint2korr(info->s->state.header.base_pos));
+  return ret;
+}
+
+
+/**
+  Remaps the data file, and writes state to index file.
+
+  When we unlock a table and no other thread has announced it is going to
+  write to it (w_locks==0), we want to flush some information to disk, so
+  that in case of crash the table is not too much corrupted. Physical
+  logging, when it is turning logging off for a table, needs to do this too,
+  so that this information reaches the log.
+
+  @param  info            table
+  @param  force           if FALSE, don't flush state of transactional tables
+                          (they do it at checkpoint)
+
+  @return Operation status
+    @retval 0      ok
+    @retval !=0    error
+*/
+
+int ma_remap_file_and_write_state_for_unlock(MARIA_HA *info, my_bool force)
+{
+  MARIA_SHARE *share= info->s;
+  int error= 0;
+#ifdef HAVE_MMAP
+  if ((share->mmaped_length != share->state.state.data_file_length) &&
+      (share->nonmmaped_inserts > MAX_NONMAPPED_INSERTS))
+  {
+    if (share->lock_key_trees)
+      rw_wrlock(&share->mmap_lock);
+    _ma_remap_file(info, share->state.state.data_file_length);
+    share->nonmmaped_inserts= 0;
+    if (share->lock_key_trees)
+      rw_unlock(&share->mmap_lock);
+  }
+#endif
+#ifdef EXTERNAL_LOCKING
+  share->state.process= share->last_process=share->this_process;
+  share->state.unique=   info->last_unique=  info->this_unique;
+  share->state.update_count= info->last_loop= ++info->this_loop;
+#endif
+  if (!share->base.born_transactional || force)
+  {
+    if (_ma_state_info_write_sub(share, share->kfile.file, &share->state,
+                                 MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET))
+      error=my_errno;
+    else
+    {
+      /* A value of 0 below means "state flushed" */
+      share->changed= 0;
+    }
+  }
+  if (maria_flush)
+  {
+    if (_ma_sync_table_files(info))
+      error= my_errno;
+  }
+  else
+    share->not_flushed=1;
+
+  return error;
 }

=== modified file 'storage/maria/ma_loghandler.c'
--- a/storage/maria/ma_loghandler.c	2008-12-09 13:11:48 +0000
+++ b/storage/maria/ma_loghandler.c	2009-02-13 16:30:54 +0000
@@ -1,4 +1,5 @@
-/* Copyright (C) 2007 MySQL AB & Sanja Belkin
+/* Copyright (C) 2007 MySQL AB & Sanja Belkin,
+   2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -685,6 +686,10 @@ static LOG_DESC INIT_LOGREC_IMPORTED_TAB
 {LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0,
  "imported_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
 
+static LOG_DESC INIT_LOGREC_DEBUG_INFO=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0,
+ "info", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
 const myf log_write_flags= MY_WME | MY_NABP | MY_WAIT_IF_FULL;
 
 void translog_table_init()
@@ -774,6 +779,9 @@ void translog_table_init()
     INIT_LOGREC_REDO_BITMAP_NEW_PAGE;
   log_record_type_descriptor[LOGREC_IMPORTED_TABLE]=
     INIT_LOGREC_IMPORTED_TABLE;
+  log_record_type_descriptor[LOGREC_DEBUG_INFO]=
+    INIT_LOGREC_DEBUG_INFO;
+
   for (i= LOGREC_FIRST_FREE; i < LOGREC_NUMBER_OF_TYPES; i++)
     log_record_type_descriptor[i].rclass= LOGRECTYPE_NOT_ALLOWED;
 #ifndef DBUG_OFF
@@ -1512,7 +1520,9 @@ static void translog_file_init(TRANSLOG_
   pagecache_file_init(file->handler, &translog_page_validator,
                       &translog_dummy_callback,
                       &translog_dummy_write_failure,
-                      maria_flush_log_for_page_none, file);
+                      maria_flush_log_for_page_none,
+                      &translog_dummy_callback,
+                      file);
   file->number= number;
   file->was_recovered= 0;
   file->is_sync= is_sync;
@@ -7418,6 +7428,10 @@ static void translog_force_current_buffe
   log_descriptor.bc.buffer->offset= new_buff_beginning;
   log_descriptor.bc.write_counter= write_counter;
   log_descriptor.bc.previous_offset= previous_offset;
+  new_buffer->prev_last_lsn= BUFFER_MAX_LSN(old_buffer);
+  DBUG_PRINT("info", ("prev_last_lsn set to (%lu,0x%lx)  buffer: 0x%lx",
+                      LSN_IN_PARTS(new_buffer->prev_last_lsn),
+                      (ulong) new_buffer));
 
   /*
     Advances this log pointer, increases writers and let other threads to
@@ -8299,6 +8313,46 @@ void translog_set_file_size(uint32 size)
   DBUG_VOID_RETURN;
 }
 
+
+/**
+   Write debug information to log if EXTRA_DEBUG is enabled
+*/
+
+my_bool translog_log_debug_info(TRN *trn __attribute__((unused)),
+                                enum translog_debug_info_type type
+                                __attribute__((unused)),
+                                uchar *info __attribute__((unused)),
+                                size_t length __attribute__((unused)))
+{
+#ifdef EXTRA_DEBUG
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
+  uchar debug_type;
+  LSN lsn;
+
+  if (!trn)
+  {
+    /*
+      We can't log the current transaction because we don't have
+      an active transaction. Use a temporary transaction object instead
+    */
+    trn= &dummy_transaction_object;
+  }
+  debug_type= (uchar) type;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].str= &debug_type;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= 1;
+  log_array[TRANSLOG_INTERNAL_PARTS + 1].str= info;
+  log_array[TRANSLOG_INTERNAL_PARTS + 1].length= length;
+  return translog_write_record(&lsn, LOGREC_DEBUG_INFO,
+                               trn, NULL,
+                               (translog_size_t) (1+ length),
+                               sizeof(log_array)/sizeof(log_array[0]),
+                               log_array, NULL, NULL);
+#else
+  return 0;
+#endif
+}
+
+
 #ifdef MARIA_DUMP_LOG
 #include <my_getopt.h>
 extern void translog_example_table_init();
@@ -8655,7 +8709,7 @@ static void dump_datapage(uchar *buffer)
     }
     tfile.number= file;
     tfile.handler.file= handler;
-    pagecache_file_init(tfile.handler, NULL, NULL, NULL, NULL, NULL);
+    pagecache_file_init(tfile.handler, NULL, NULL, NULL, NULL, NULL, NULL);
     tfile.was_recovered= 0;
     tfile.is_sync= 1;
     if (translog_check_sector_protection(buffer, &tfile))

=== modified file 'storage/maria/ma_loghandler.h'
--- a/storage/maria/ma_loghandler.h	2008-07-09 09:02:27 +0000
+++ b/storage/maria/ma_loghandler.h	2009-01-15 22:25:53 +0000
@@ -144,6 +144,7 @@ enum translog_record_type
   LOGREC_UNDO_BULK_INSERT,
   LOGREC_REDO_BITMAP_NEW_PAGE,
   LOGREC_IMPORTED_TABLE,
+  LOGREC_DEBUG_INFO,
   LOGREC_FIRST_FREE,
   LOGREC_RESERVED_FUTURE_EXTENSION= 63
 };
@@ -167,6 +168,12 @@ enum en_key_op
   KEY_OP_COMPACT_PAGE   /* Compact key page */
 };
 
+
+enum translog_debug_info_type
+{
+  LOGREC_DEBUG_INFO_QUERY
+};
+
 /* Size of log file; One log file is restricted to 4G */
 typedef uint32 translog_size_t;
 
@@ -323,6 +330,9 @@ translog_assign_id_to_share_from_recover
 extern my_bool translog_walk_filenames(const char *directory,
                                        my_bool (*callback)(const char *,
                                                            const char *));
+extern my_bool translog_log_debug_info(TRN *trn,
+                                       enum translog_debug_info_type type,
+                                       uchar *info, size_t length);
 
 enum enum_translog_status
 {

=== added file 'storage/maria/ma_non_trans_log.c'
--- a/storage/maria/ma_non_trans_log.c	1970-01-01 00:00:00 +0000
+++ b/storage/maria/ma_non_trans_log.c	2009-02-13 12:40:13 +0000
@@ -0,0 +1,727 @@
+/* Copyright (C) 2009 - 2009 Sun Microsystems, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/**
+  @file
+  Logging of Maria commands and records, unrelated to transaction log.
+
+  The logs implemented here have NOTHING to do with the transaction log which
+  contains REDOs and UNDOs and is used for Recovery and ROLLBACK.
+
+  The physical log contains each call to OS write functions on the Maria
+  files. Most of its entries are physical for example "write these bytes at
+  this offset". For example, a maria_write() with lots of BLOBs in many places
+  will cause lots of entries in this log. It also contains some logical ones
+  like MA_LOG_DELETE_ALL (we wouldn't want to log the deletion of all rows
+  one by one).
+
+  In MyISAM there is a logical log (contains each call to higher-level
+  operations like mi_write()/mi_update()) but not in Maria.
+
+  Writes to the physical log happen when the physical operation happens,
+  i.e. when the file is written, which can be at three moments:
+  -# when the row write directly writes to the file (_ma_[no]mmap_pwrite())
+  -# if the row write went to a WRITE_CACHE, when this cache gets written to
+     the file (post_write callback in that cache)
+  -# if the row write went to the page cache, when this cache block gets
+     written ("flushed") to the file (post_write callback in that cache)
+  Additionally, an entry for opening and an entry for closing the table, are
+  written to the physical log: at the first "direct row write" or "WRITE_CACHE"
+  or "page cache block flush" log write for a certain MARIA_SHARE, an entry
+  for opening (MA_LOG_OPEN) is written. All entries refer to the table by the
+  file descriptor of the index file; the MA_LOG_OPEN entry links this number
+  to a table name. The entry for closing is written by maria_close() if an
+  entry for opening had been written before and if the index file is being
+  closed.
+
+  Physical log is used for online backup, because if applied to a dirtily
+  copied table it can make this table consistent.
+
+  This log:
+  - is idempotent (if you apply such log to a table, then applying it a
+  second time has no effect).
+  - can be used to debug Maria
+  - can be examined and applied to tables with the maria_non_trans_log
+  utility.
+
+  Physical log applies to a set of tables and can be turned on and off at any
+  time.
+
+  ma_log() is the entry point.
+*/
+
+#include "maria_def.h"
+#if defined(MSDOS) || defined(__WIN__)
+#include <fcntl.h>
+#ifndef __WIN__
+#include <process.h>
+#endif
+#endif
+#ifdef VMS
+#include <processes.h>
+#endif
+
+#undef GETPID					/* For HPUX */
+#ifdef THREAD
+#define GETPID() (log_type == 1 ? (long) maria_pid : (long) my_thread_dbug_id())
+#else
+#define GETPID() maria_pid
+#endif
+
+/** the log_type global variable is probably obsolete, it's always 0 now */
+static const int log_type=0;
+ulong maria_pid=0;
+static int ma_log_open_cache(const char *log_filename);
+static int ma_log_close_cache();
+static int ma_log_start_physical(const char *log_filename,
+                                 const HASH *tables);
+static int ma_log_stop_physical();
+
+
+/**
+  Starts Maria physical logging for a set of tables, or stops it.
+
+  @param  action           what to do (start, stop (in)consistently)
+  @param  log_filename     name of the log file to create
+  @param  tables           hash of names of tables for which we want logging
+                           (only for physical log)
+
+  @return Operation status
+    @retval 0      ok
+    @retval !=0    error; then caller should call ma_log_stop_physical(TRUE)
+*/
+
+int ma_log(enum enum_ma_log_action action,
+           const char *log_filename, const HASH *tables)
+{
+  int error;
+  DBUG_ENTER("ma_log");
+
+#ifndef HAVE_MARIA_PHYSICAL_LOGGING
+  DBUG_ASSERT(0);
+  DBUG_RETURN(1);
+#endif
+
+  /* starting/stopping are complex operations so split in functions */
+  switch (action)
+  {
+  case MA_LOG_ACTION_OPEN:
+    error= ma_log_start_physical(log_filename, tables);
+    break;
+  case MA_LOG_ACTION_CLOSE_CONSISTENT:
+  case MA_LOG_ACTION_CLOSE_INCONSISTENT:
+    error= ma_log_stop_physical(action);
+    break;
+  default:
+    DBUG_ASSERT(0);
+    error= 1;
+  }
+  DBUG_RETURN(error);
+}
+
+
+/**
+  Sets up a log's IO_CACHE (for physical log).
+
+  Log is IO_CACHE to be fast.
+
+  @param  log_filename     name of file to create
+
+  @note logs are not created with MY_WAIT_IF_FULL: a log can itself be the
+  cause of filling the disk, so better corrupt it (and make a backup
+  fail for example) than prevent other normal operations.
+
+  @todo A realistic benchmark to see if the size of the IO_CACHE makes any
+  speed difference.
+
+  @return Operation status
+    @retval 0      ok
+    @retval !=0    error
+*/
+
+static int ma_log_open_cache(const char *log_filename)
+{
+  int error=0;
+  char buff[FN_REFLEN];
+  int access_flags;
+  File file;
+  IO_CACHE *log;
+  uint cache_size;
+  DBUG_ENTER("ma_log_open_cache");
+
+  DBUG_ASSERT(log_filename != NULL);
+  pthread_mutex_lock(&THR_LOCK_maria_log);
+  log= &maria_physical_log;
+  /* We want to fail if file exists */
+  access_flags= O_WRONLY | O_BINARY | O_TRUNC | O_EXCL;
+  /*
+    We want a large IO_CACHE to have large contiguous disk writes.
+    In many systems this size is affordable. In small embedded ones it is
+    not, but would they use this log?
+  */
+  cache_size= IO_SIZE*256;
+
+  if (!maria_pid)
+    maria_pid=(ulong) getpid();
+  if (!my_b_inited(log))
+  {
+    DBUG_ASSERT(log_filename);
+    fn_format(buff, log_filename, "", "", MY_UNPACK_FILENAME);
+    if ((file= my_create(buff,
+                         0, access_flags,
+                         MYF(MY_WME | ME_WAITTANG))) < 0)
+      error= my_errno;
+    else if (init_io_cache(log, file,
+                           cache_size, WRITE_CACHE,
+                           my_tell(file,MYF(MY_WME)), 0,
+                           MYF(MY_WME | MY_NABP)))
+    {
+      error= my_errno;
+      my_close(file, MYF(MY_WME));
+    }
+  }
+  pthread_mutex_unlock(&THR_LOCK_maria_log);
+  DBUG_RETURN(error);
+}
+
+
+/**
+  Destroys a log's IO_CACHE for physical log
+
+  @return Operation status
+    @retval 0      ok
+    @retval !=0    error
+*/
+
+static int ma_log_close_cache()
+{
+  int error= 0;
+  IO_CACHE *log;
+  DBUG_ENTER("ma_log_close_cache");
+  pthread_mutex_lock(&THR_LOCK_maria_log);
+  log         = &maria_physical_log;
+  if (my_b_inited(log))
+  {
+    if (end_io_cache(log) ||
+        my_close(log->file,MYF(MY_WME)))
+      error= my_errno;
+    log->file= -1;
+  }
+  pthread_mutex_unlock(&THR_LOCK_maria_log);
+  DBUG_RETURN(error);
+}
+
+
+/**
+  Logs a Maria command and its return code to log.
+
+  If MA_LOG_OPEN has not already been stored for this MARIA_SHARE in this log,
+  also writes a MA_LOG_OPEN.
+
+  The log entry is a header followed by 'length' bytes of 'buffert'. The
+  header holds, in this order: command byte, index file descriptor, pid,
+  result, length. When both the descriptor and the length are below
+  UINT_MAX16 a compact 11-byte header is written (2-byte descriptor, 2-byte
+  length); otherwise the command byte is or-ed with MA_LOG_BIG_NUMBERS and a
+  14-byte header is written (3-byte descriptor, 4-byte length).
+
+  Nothing is written if the log's IO_CACHE is not initialized (log closed).
+  Failures to write to the log are ignored here. my_errno is saved on entry
+  and restored before returning.
+
+  @param  log              pointer to the log's IO_CACHE
+  @param  command          Maria command; only MA_LOG_OPEN and MA_LOG_CLOSE
+                           are allowed (asserted)
+  @param  share            MARIA_SHARE
+  @param  buffert          usually argument to the command (e.g. name of file
+                           to open for MA_LOG_OPEN), may be NULL
+  @param  length           length of buffert (0 if NULL)
+  @param  result           return code of the command; must fit in 2 bytes
+*/
+
+void _maria_log_command(IO_CACHE *log, enum maria_log_commands command,
+                         MARIA_SHARE *share,
+                         const uchar *buffert, uint length, int result)
+{
+  uchar header[14];
+  int old_errno, headerlen;
+  ulong pid=(ulong) GETPID();
+  File file= share->kfile.file;
+  old_errno=my_errno;
+  DBUG_ENTER("_maria_log_command");
+  DBUG_PRINT("enter", ("command: %u share->open_file_name.str '%s'",
+                       command, share->open_file_name.str));
+  DBUG_ASSERT(command == MA_LOG_OPEN  || command == MA_LOG_CLOSE);
+
+  DBUG_ASSERT(((uint)result) <= UINT_MAX16);
+  if (file >= UINT_MAX16 || length >= UINT_MAX16)
+  {
+    header[0]= ((uchar) command) | MA_LOG_BIG_NUMBERS;
+    /* descriptor is stored on 3 bytes, so it must be below 2^24 */
+    DBUG_ASSERT(file < (1 << 24));
+    mi_int3store(header + 1, file);
+    mi_int4store(header + 4, pid);
+    mi_int2store(header + 8, result);
+    mi_int4store(header + 10, length);
+    headerlen= 14;
+  }
+  else
+  {
+    /* use a compact encoding for all these small numbers */
+    header[0]= (uchar) command;
+    mi_int2store(header + 1, file);
+    mi_int4store(header + 3, pid);
+    mi_int2store(header + 7, result);
+    mi_int2store(header + 9, length);
+    headerlen= 11;
+  }
+retry:
+  /*
+    Reasons to not use THR_LOCK_maria to serialize log writes:
+    - better concurrency (not stealing THR_LOCK_maria which is used for opens
+    and closes including long table flushes)
+    - maria_close() flushes indexes while holding THR_LOCK_maria, and that
+    flush can cause log writes, so we would lock the mutex twice.
+  */
+  pthread_mutex_lock(&THR_LOCK_maria_log);
+  /*
+    We need to check that 'log' is not closed, this can happen for a physical
+    log. Indeed we do not have full control on the table from the thread doing
+    ma_log_stop_physical(); it could be an inconsistent logging stop (in
+    the middle of writes) or even a consistent one (table can be in
+    maria_lock_database(F_UNLCK) and thus want to flush its header). Log
+    might just have been closed while the table still has physical_logging
+    true.
+  */
+  if (likely(my_b_inited(log) != NULL))
+  {
+    if (command == MA_LOG_OPEN)
+    {
+      /*
+        If there could be two concurrent writers on a Maria table, it could
+        be that they both do a maria_log_command(c) where c!=MA_LOG_OPEN,
+        which both see MA_LOG_OPEN_stored_in_physical_log false, and both
+        call maria_log_command(MA_LOG_OPEN); we would then have to make one
+        single winner: one will run before the other, the other should
+        notice MA_LOG_OPEN_stored_in_physical_log became true and back off.
+        But there is always at most one writer to a Maria table, so the
+        assertion below should always be ok
+      */
+      DBUG_ASSERT(!share->MA_LOG_OPEN_stored_in_physical_log);
+      share->MA_LOG_OPEN_stored_in_physical_log= TRUE;
+      /*
+        We must keep the mutex between setting the boolean above and writing
+        to the log ; one instant after unlocking the mutex, the log may be
+        closed and so it would be wrong to say that the MA_LOG_OPEN is in
+        the log (it would possibly influence a next physical log).
+      */
+    }
+    else if (unlikely(!share->MA_LOG_OPEN_stored_in_physical_log))
+    {
+      /*
+        MA_LOG_OPEN is missing from the log: release the mutex, log the
+        MA_LOG_OPEN through a recursive call (which takes the mutex itself),
+        then start again from 'retry'.
+      */
+      DBUG_ASSERT(command != MA_LOG_CLOSE);
+      pthread_mutex_unlock(&THR_LOCK_maria_log);
+      _maria_log_command(&maria_physical_log, MA_LOG_OPEN, share,
+                         (uchar *)share->open_file_name.str,
+                         share->open_file_name.length, 0);
+      goto retry;
+    }
+    /*
+      Any failure to write the log does not prevent the table write (table
+      should still be usable even though log breaks).
+      but sets up log->hard_write_error_in_the_past, which can be tested by
+      those who want to use this log.
+    */
+    (void) my_b_write(log, header, headerlen);
+    if (buffert)
+      (void) my_b_write(log, buffert, length);
+    else
+    {
+      DBUG_ASSERT(length == 0);
+    }
+  }
+  pthread_mutex_unlock(&THR_LOCK_maria_log);
+  my_errno=old_errno;
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  Logs a my_pwrite() (done to data or index file) to the physical log.
+
+  Also logs MA_LOG_OPEN if first time. Thus, a MARIA_HA will write MA_LOG_OPEN
+  to the log only if it is doing a write to the table: a table which does
+  only reads logs nothing to the physical log.
+
+  The log entry is a header followed by the written bytes. The header holds:
+  command byte, index file descriptor, file offset, length. A compact 9-byte
+  header (2-byte descriptor, 4-byte offset, 2-byte length) is used when the
+  descriptor and length are below UINT_MAX16 and the offset is below
+  UINT_MAX32; otherwise the command byte is or-ed with MA_LOG_BIG_NUMBERS and
+  a 16-byte header (3-byte descriptor, 8-byte offset, 4-byte length) is used.
+
+  Nothing is written if the physical log is closed. Failures to write to the
+  log are ignored here. my_errno is saved on entry and restored on exit.
+
+  @param  command          Maria command (MA_LOG_WRITE_BYTES_MAD or
+                           MA_LOG_WRITE_BYTES_MAI)
+  @param  share            table's share
+  @param  buffert          argument to the pwrite, must not be NULL
+  @param  length           length of buffer, must be > 0
+  @param  filepos          offset in file where buffer was written
+
+  @note length may be small (for example, if updating only a numeric field of
+  a record, it could be only a few bytes), so we try to minimize the header's
+  size of the log entry (no 'pid', no 'result').
+*/
+
+void maria_log_pwrite_physical(enum maria_log_commands command,
+                                MARIA_SHARE *share, const uchar *buffert,
+                                uint length, my_off_t filepos)
+{
+  uchar header[21];
+  int old_errno, headerlen;
+  DBUG_ENTER("maria_log_pwrite_physical");
+  DBUG_ASSERT(command == MA_LOG_WRITE_BYTES_MAD ||
+              command == MA_LOG_WRITE_BYTES_MAI);
+  DBUG_ASSERT(buffert != NULL && length > 0);
+  old_errno= my_errno;
+  if (share->kfile.file >= UINT_MAX16 || filepos >= UINT_MAX32 ||
+      length >= UINT_MAX16)
+  {
+    header[0]= ((uchar) command) | MA_LOG_BIG_NUMBERS;
+    /* descriptor is stored on 3 bytes, so it must be below 2^24 */
+    DBUG_ASSERT(share->kfile.file < (1 << 24));
+    mi_int3store(header + 1, share->kfile.file);
+    mi_sizestore(header + 4, filepos);
+    mi_int4store(header + 12, length);
+    headerlen= 16;
+  }
+  else
+  {
+    /* compact encoding when all numbers are small */
+    header[0]= (uchar) command;
+    mi_int2store(header + 1, share->kfile.file);
+    mi_int4store(header + 3, filepos);
+    mi_int2store(header + 7, length);
+    headerlen= 9;
+  }
+  /* pid and result are not needed */
+retry:
+  pthread_mutex_lock(&THR_LOCK_maria_log);
+  if (likely(my_b_inited(&maria_physical_log) != NULL))
+  {
+    if (unlikely(!share->MA_LOG_OPEN_stored_in_physical_log))
+    {
+      /*
+        MA_LOG_OPEN must come first in the log: release the mutex (the call
+        below takes it itself), log the MA_LOG_OPEN, then start again.
+      */
+      pthread_mutex_unlock(&THR_LOCK_maria_log);
+      _maria_log_command(&maria_physical_log, MA_LOG_OPEN, share,
+                         (uchar *)share->open_file_name.str,
+                          share->open_file_name.length, 0);
+      goto retry;
+    }
+    (void) my_b_write(&maria_physical_log, header, headerlen);
+    (void) my_b_write(&maria_physical_log, buffert, length);
+  }
+  pthread_mutex_unlock(&THR_LOCK_maria_log);
+  my_errno= old_errno;
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  Logs a my_chsize() done to the data or index file to the physical log.
+
+  Also logs MA_LOG_OPEN if first time.
+
+  The log entry is only a header: command byte, index file descriptor, new
+  length. A compact 7-byte header (2-byte descriptor, 4-byte length) is used
+  when the descriptor is below UINT_MAX16 and the new length is below
+  UINT_MAX32; otherwise the command byte is or-ed with MA_LOG_BIG_NUMBERS and
+  a 12-byte header (3-byte descriptor, 8-byte length) is used.
+
+  Nothing is written if the physical log is closed. Failures to write to the
+  log are ignored here. my_errno is saved on entry and restored on exit.
+
+  @param  share            table's share
+  @param  command          MA_LOG_CHSIZE_MAD or MA_LOG_CHSIZE_MAI
+  @param  new_length       new length of the table's file
+*/
+
+void maria_log_chsize_physical(MARIA_SHARE *share,
+                               enum maria_log_commands command,
+                               my_off_t new_length)
+{
+  uchar header[12];
+  int old_errno, headerlen;
+  DBUG_ENTER("maria_log_chsize_physical");
+  old_errno= my_errno;
+  DBUG_ASSERT(command == MA_LOG_CHSIZE_MAD  || command == MA_LOG_CHSIZE_MAI);
+  if (share->kfile.file >= UINT_MAX16 || new_length >= UINT_MAX32)
+  {
+    header[0]= ((uchar) command) | MA_LOG_BIG_NUMBERS;
+    /* descriptor is stored on 3 bytes, so it must be below 2^24 */
+    DBUG_ASSERT(share->kfile.file < (1 << 24));
+    mi_int3store(header + 1, share->kfile.file);
+    mi_sizestore(header + 4, new_length);
+    headerlen= 12;
+  }
+  else
+  {
+    /* compact encoding when all numbers are small */
+    header[0]= (uchar)command;
+    mi_int2store(header + 1, share->kfile.file);
+    mi_int4store(header + 3, new_length);
+    headerlen= 7;
+  }
+  /* pid and result are not needed */
+retry:
+  pthread_mutex_lock(&THR_LOCK_maria_log);
+  if (likely(my_b_inited(&maria_physical_log) != NULL))
+  {
+    if (unlikely(!share->MA_LOG_OPEN_stored_in_physical_log))
+    {
+      /*
+        MA_LOG_OPEN must come first in the log: release the mutex (the call
+        below takes it itself), log the MA_LOG_OPEN, then start again.
+      */
+      pthread_mutex_unlock(&THR_LOCK_maria_log);
+      _maria_log_command(&maria_physical_log, MA_LOG_OPEN, share,
+                         (uchar *)share->open_file_name.str,
+                          share->open_file_name.length, 0);
+      goto retry;
+    }
+    (void) my_b_write(&maria_physical_log, header, headerlen);
+  }
+  pthread_mutex_unlock(&THR_LOCK_maria_log);
+  my_errno= old_errno;
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  Starts Maria physical logging for a set of tables.
+
+  Physical logging is used for online backup.
+  A condition of correctness of online backup is that:
+  after the copy process has started (i.e. after the function below has
+  terminated), any update done to a table-to-back-up must be present in the
+  log. This guides the algorithm below.
+
+  All writes (my_write, my_pwrite, memcpy to mmap'ed area, my_chsize) to the
+  data or index file are done this way:
+  @code
+  {
+    write_to_data_or_index_file;
+    if ((atomic read of MARIA_SHARE::physical_logging) != 0)
+      write log record to physical log;
+  }
+  @endcode
+
+  The present function sets MARIA_SHARE::physical_logging to 1 using an
+  atomic write. Atomic write happens before or after atomic read above, and
+  atomic read sees the latest value. If before, change will be in the log. If
+  after, it is also after the write_to_data_or_index_file and thus change
+  will be in the copy. So correctness is always guaranteed. Note the
+  importance of checking MARIA_SHARE::physical_logging always _after_
+  write_to_data_or_index_file, with an _atomic_read_ for the reasoning to
+  hold.
+
+  If the log file cannot be opened, physical logging is stopped again (with
+  MA_LOG_ACTION_CLOSE_INCONSISTENT) before returning the error.
+
+  @param  log_filename     Name of the physical log file to create
+  @param  tables           Hash of names of tables for which we want logging
+
+  @return Operation status
+    @retval 0      ok
+    @retval !=0    error (including: physical logging was already running)
+*/
+
+static int ma_log_start_physical(const char *log_filename, const HASH *tables)
+{
+  LIST *list_item;
+  int error;
+  DBUG_ENTER("ma_log_start_physical");
+  DBUG_ASSERT(log_filename != NULL);
+  DBUG_ASSERT(my_hash_inited(tables));
+
+  pthread_mutex_lock(&THR_LOCK_maria);
+  if (ma_log_tables_physical) /* physical logging already running */
+  {
+    pthread_mutex_unlock(&THR_LOCK_maria);
+    DBUG_ASSERT(0); /* because it should not happen */
+    DBUG_RETURN(1);
+  }
+  ma_log_tables_physical= tables;
+
+  if (unlikely(ma_log_open_cache(log_filename)))
+  {
+    error= 1;
+    goto end;
+  }
+  /* Go through all open Maria tables */
+  for (list_item= maria_open_list; list_item; list_item= list_item->next)
+  {
+    MARIA_HA *info= (MARIA_HA*)list_item->data;
+    MARIA_SHARE *share= info->s;
+    DBUG_PRINT("info",("table '%s' 0x%lx tested against hash",
+                       share->unique_file_name.str, (ulong)info));
+    if (!my_hash_search(ma_log_tables_physical,
+                        (uchar *)share->unique_file_name.str,
+                        share->unique_file_name.length))
+      continue;
+    /* Backup kernel shouldn't ask for temporary table's backup */
+    DBUG_ASSERT(!share->temporary);
+    /*
+      We don't need to flush key blocks, WRITE_CACHE or the state
+      because every time they are written to disk (at the latest in
+      ma_log_stop_physical()) they check for physical logging
+      (key cache always has log_key_page_flush_physical() as
+      post_write, WRITE_CACHE always has log_flushed_write_cache_physical()
+      as post_write, even when _not_ in backup), so any now cached info will
+      finally reach the log.
+      Conversely, if we wanted to register no callback in key cache and
+      WRITE_CACHE when no backup is running (to save function calls
+      and atomic reads when no backup is running), we would have to
+      flush key cache and WRITE_CACHE here.
+    */
+    ma_set_physical_logging_state(share, 1);
+  }
+  error= 0;
+end:
+  pthread_mutex_unlock(&THR_LOCK_maria);
+  /*
+    Undo what was set up above. This is done after releasing THR_LOCK_maria
+    because ma_log_stop_physical() takes that mutex itself.
+  */
+  if (unlikely(error))
+    ma_log_stop_physical(MA_LOG_ACTION_CLOSE_INCONSISTENT);
+  DBUG_RETURN(error);
+}
+
+
+/**
+  Stops Maria physical logging.
+
+  As part of this stop operation, user can request that the physical log ends
+  in a consistent state, i.e. that it contains copies of the currently cached
+  key pages etc. To be consistent assumes that the caller has relevant tables
+  write-locked, indeed otherwise the log could end in the middle of a
+  statement, and applying it would produce a likely corrupted table. Online
+  backup needs such a consistent log to be able to create consistent table
+  copies from the log. If online backup is being cancelled, then there is no
+  need that the physical log be consistent.
+
+  Steps, all done while holding THR_LOCK_maria:
+  - if a consistent close is requested, flush the files, the WRITE_CACHE and
+    the state of every open table which has physical logging on;
+  - close the log's IO_CACHE;
+  - turn physical logging off for every open table which had it on.
+
+  @param  action           MA_LOG_ACTION_CLOSE_CONSISTENT or
+                           MA_LOG_ACTION_CLOSE_INCONSISTENT.
+
+  @return Operation status
+    @retval 0      ok (also when no physical logging was running)
+    @retval !=0    error
+
+  @note Even if MA_LOG_ACTION_CLOSE_CONSISTENT, tables may be being written
+  now (in practice caller has read-locked tables, but those tables may be
+  just going out of a write (after thr_unlock(), before or inside
+  maria_lock_database(F_UNLCK) which may be flushing the index header or index
+  pages)).
+*/
+
+static int ma_log_stop_physical(enum enum_ma_log_action action)
+{
+  int error= 0;
+  LIST *list_item;
+  DBUG_ENTER("ma_log_stop_physical");
+  DBUG_ASSERT(action == MA_LOG_ACTION_CLOSE_CONSISTENT ||
+              action == MA_LOG_ACTION_CLOSE_INCONSISTENT);
+
+  pthread_mutex_lock(&THR_LOCK_maria);
+  if (ma_log_tables_physical == NULL) /* no physical logging running */
+  {
+    pthread_mutex_unlock(&THR_LOCK_maria);
+    DBUG_RETURN(0); /* it's ok if it happens */
+  }
+  /*
+    This is a pointer to an object provided by the caller through
+    ma_log_start_physical(); such object is to be freed by the caller.
+  */
+  ma_log_tables_physical= NULL;
+
+  if (action == MA_LOG_ACTION_CLOSE_CONSISTENT)
+  {
+    /**
+      @todo consider an algorithm which would not keep THR_LOCK_maria for the
+      time of flushing all these tables' indices; we could do a first loop
+      with THR_LOCK_maria to collect shares and "pin" them; then a second
+      loop without THR_LOCK_maria, flushing and unpinning them.
+    */
+    for (list_item= maria_open_list; list_item; list_item= list_item->next)
+    {
+      MARIA_HA *info= (MARIA_HA*)list_item->data;
+      MARIA_SHARE *share= info->s;
+      /*
+        Setting of the variable below always happens under THR_LOCK_maria,
+        which we have here, so we don't need atomic operations to read here.
+      */
+      if (!share->physical_logging)
+        continue;
+      /*
+        Must take intern_lock, at least because key cache isn't safe if two
+        calls to flush_key_blocks_int() run concurrently on the same file.
+        close_lock is taken before intern_lock, and both are released in
+        the reverse order once the flush is done.
+      */
+      pthread_mutex_lock(&share->close_lock);
+      pthread_mutex_lock(&share->intern_lock);
+      /*
+        It is possible that some statement just finished, has not called
+        maria_lock_database(F_UNLCK) yet, and so some data/index blocks would
+        still be in memory. So we have to flush below, to put them into the
+        log.
+
+        It is also possible (same scenario) that some WRITE_CACHE is not
+        flushed yet. This should not happen but it does (can just be a
+        forgotten maria_extra(HA_EXTRA_NO_CACHE)); so maria_close() and
+        maria_lock_database(F_UNLCK) flush the cache; so we have to do it here
+        too, to put the data into the log. Mutexes in maria_close() and
+        maria_lock_database() ensure that they don't flush at the same time as
+        us (which could corrupt the cache). Nobody should flush the
+        WRITE_CACHE without a write-lock or intern_lock (see assertion in
+        maria_reset()).
+
+        It is also possible (same scenario) that the index's header has not
+        been written yet and nobody is going to do it for us; indeed this can
+        happen (two concurrent threads): thread1 has just done
+        maria_lock_database(F_WRLCK), is blocked by the thr_lock of our caller,
+        thread2 has finished its write statement and is going to execute
+        maria_lock_database(F_UNLCK); no index header flush will be done by the
+        maria_lock_database(F_UNLCK) of thread2 as w_locks is >0 (due to
+        thread1). And no index header flush will be done by thread1 as it is
+        blocked. So, we need to flush the index header here, to put it into
+        the log.
+
+        Of course, for the flushing above to reach the log, it has to be done
+        before setting share->physical_logging to false and before closing the
+        log.
+
+        The three flushes below are chained with ||: the first one which
+        fails skips the following ones for this table.
+      */
+      if (_ma_flush_table_files(info, (((info->dfile.file >= 0)) ?
+                                       MARIA_FLUSH_DATA : 0) | 
+                                ((ma_log_index_pages_physical &&
+                                  (share->kfile.file >= 0)) ?
+                                 MARIA_FLUSH_INDEX : 0),
+                                FLUSH_KEEP, FLUSH_KEEP) ||
+          ((info->opt_flag & WRITE_CACHE_USED) &&
+           flush_io_cache(&info->rec_cache)) ||
+          (share->changed &&
+           ma_remap_file_and_write_state_for_unlock(info, TRUE)))
+      {
+        error= 1; /* we continue, because log has to be closed anyway */
+        maria_print_error(share, HA_ERR_CRASHED);
+        maria_mark_crashed(info);	/* Mark that table must be checked */
+      }
+      pthread_mutex_unlock(&share->intern_lock);
+      pthread_mutex_unlock(&share->close_lock);
+    } /* ... for (list_item=...) */
+  } /* ... if (action == MA_LOG_ACTION_CLOSE_CONSISTENT) */
+
+  /*
+    Online backup wants to pick this log with my_read() calls, to send it to
+    the backup stream. So we don't delete log but close it now, so that its
+    IO_CACHE goes to disk (so that all log is visible to the my_read()
+    calls). Another reason related to concurrency is mentioned below.
+  */
+  if (ma_log_close_cache())
+    error= 1;
+
+  for (list_item= maria_open_list; list_item; list_item= list_item->next)
+  {
+    MARIA_SHARE *share= ((MARIA_HA*)list_item->data)->s;
+    /*
+      Setting of the variable below always happens under THR_LOCK_maria,
+      which we have here, so we don't need atomic operations to read here.
+    */
+    if (!share->physical_logging)
+      continue;
+    ma_set_physical_logging_state(share, 0);
+    /*
+      We reset MA_LOG_OPEN_stored_in_physical_log. How is this safe with a
+      concurrent logging operation (like maria_log_pwrite_physical()) which
+      may want to set it to TRUE at the same time?
+      The concurrent logging operation runs either before or after log closing
+      (serialization ensured by THR_LOCK_maria_log). If before, it is before
+      us (us==resetter), because log closing is before us, so we win. If
+      after, the concurrent logging operation finds the log closed and so
+      will not change MA_LOG_OPEN_stored_in_physical_log (so we win again).
+      Note the importance of closing the log before, for the reasoning to
+      hold.
+    */
+    share->MA_LOG_OPEN_stored_in_physical_log= FALSE;
+  }
+
+  pthread_mutex_unlock(&THR_LOCK_maria);
+  /*
+    From this moment on, from the point of view of Maria, a new physical log
+    (a new backup) can start (new log will use a different tmp name).
+  */
+  DBUG_RETURN(error);
+}

=== modified file 'storage/maria/ma_open.c'
--- a/storage/maria/ma_open.c	2009-01-27 02:08:48 +0000
+++ b/storage/maria/ma_open.c	2009-02-13 16:30:54 +0000
@@ -1,4 +1,5 @@
-/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB,
+   2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -189,6 +190,12 @@ static MARIA_HA *maria_clone_internal(MA
   }
   info.state_start= info.state;                 /* Initial values */
 
+  if (unlikely(share->state.changed & STATE_BAD_OPEN_COUNT))
+  {
+    /* client may be a reader: ensure new state's flag not lost */
+    _ma_state_info_write(share, 1);
+  }
+
   pthread_mutex_unlock(&share->intern_lock);
 
   /* Allocate buffer for one record */
@@ -203,6 +210,11 @@ static MARIA_HA *maria_clone_internal(MA
 #ifdef THREAD
   thr_lock_data_init(&share->lock,&m_info->lock,(void*) m_info);
 #endif
+  if (ma_log_tables_physical &&
+      my_hash_search(ma_log_tables_physical,
+                     (uchar *)share->unique_file_name.str,
+                     share->unique_file_name.length))
+    m_info->s->physical_logging= TRUE; /* set before publishing table */
   m_info->open_list.data=(void*) m_info;
   maria_open_list=list_add(maria_open_list,&m_info->open_list);
 
@@ -397,18 +409,29 @@ MARIA_HA *maria_open(const char *name, i
     disk_pos= _ma_base_info_read(disk_cache + base_pos, &share->base);
     share->state.state_length=base_pos;
 
-    if (!(open_flags & HA_OPEN_FOR_REPAIR) &&
-	((share->state.changed & STATE_CRASHED) ||
-	 ((open_flags & HA_OPEN_ABORT_IF_CRASHED) &&
-	  (my_disable_locking && share->state.open_count))))
-    {
-      DBUG_PRINT("error",("Table is marked as crashed. open_flags: %u  "
-                          "changed: %u  open_count: %u  !locking: %d",
-                          open_flags, share->state.changed,
-                          share->state.open_count, my_disable_locking));
-      my_errno=((share->state.changed & STATE_CRASHED_ON_REPAIR) ?
-		HA_ERR_CRASHED_ON_REPAIR : HA_ERR_CRASHED_ON_USAGE);
-      goto err;
+    if (!(open_flags & HA_OPEN_FOR_REPAIR))
+    {
+      if ((share->state.changed & STATE_CRASHED) ||
+          ((open_flags & HA_OPEN_ABORT_IF_CRASHED) &&
+           (my_disable_locking && share->state.open_count)))
+      {
+        DBUG_PRINT("error",("Table is marked as crashed. open_flags: %u  "
+                            "changed: %u  open_count: %u  !locking: %d",
+                            open_flags, share->state.changed,
+                            share->state.open_count, my_disable_locking));
+        my_errno=((share->state.changed & STATE_CRASHED_ON_REPAIR) ?
+                  HA_ERR_CRASHED_ON_REPAIR : HA_ERR_CRASHED_ON_USAGE);
+        goto err;
+      }
+      /*
+        Tell future openers that open_count was positive at first open (sign
+        of a problem). See maria_backup_engine.cc.
+      */
+      if (my_disable_locking && share->state.open_count)
+      {
+        DBUG_PRINT("info", ("STATE_BAD_OPEN_COUNT set on"));
+        share->state.changed|= STATE_BAD_OPEN_COUNT;
+      }
     }
 
     /*
@@ -532,11 +555,13 @@ MARIA_HA *maria_open(const char *name, i
     strmov(share->unique_file_name.str, name_buff);
     strmov(share->index_file_name.str, index_name);
     strmov(share->data_file_name.str,  data_name);
+    /* unresolved name (no .sym or Unix symbolic link resolution) */
     strmov(share->open_file_name.str,  name);
 
     share->block_size= share->base.block_size;   /* Convenience */
     {
       HA_KEYSEG *pos=share->keyparts;
+      uint32 ftkey_nr= 1;
       for (i=0 ; i < keys ; i++)
       {
         share->keyinfo[i].share= share;
@@ -609,6 +634,7 @@ MARIA_HA *maria_open(const char *name, i
             share->ft2_keyinfo.end=pos;
             setup_key_functions(& share->ft2_keyinfo);
           }
+          share->keyinfo[i].ftkey_nr= ftkey_nr++;
 	}
         setup_key_functions(share->keyinfo+i);
 	share->keyinfo[i].end=pos;
@@ -646,7 +672,7 @@ MARIA_HA *maria_open(const char *name, i
 	pos->flag=0;
 	pos++;
       }
-      share->ftparsers= 0;
+      share->ftkeys= ftkey_nr;
     }
     share->data_file_type= share->state.header.data_file_type;
     share->base_length= (BASE_ROW_HEADER_SIZE +
@@ -660,6 +686,12 @@ MARIA_HA *maria_open(const char *name, i
                             KEYPAGE_USED_SIZE);
     share->kfile.file= kfile;
 
+#ifdef THREAD
+    /* we need this rwlock early for _ma_update_state_lsns_sub() */
+    my_atomic_rwlock_init(&share->physical_logging_rwlock);
+#endif
+    errpos= 5;
+
     if (open_flags & HA_OPEN_COPY)
     {
       /*
@@ -751,7 +783,7 @@ MARIA_HA *maria_open(const char *name, i
         goto err;
       data_file= info.dfile.file;
     }
-    errpos= 5;
+    errpos= 6;
 
     if (open_flags & HA_OPEN_DELAY_KEY_WRITE)
       share->options|= HA_OPTION_DELAY_KEY_WRITE;
@@ -915,13 +947,18 @@ err:
   if (save_errno == HA_ERR_OLD_FILE) /* uuid is different ? */
     save_errno= HA_ERR_CRASHED_ON_USAGE; /* the code to trigger auto-repair */
   switch (errpos) {
-  case 5:
+  case 6:
     if (data_file >= 0)
       (void)(my_close(data_file, MYF(0)));
     if (old_info)
       break;					/* Don't remove open table */
     (*share->once_end)(share);
     /* fall through */
+  case 5:
+#ifdef THREAD
+    my_atomic_rwlock_destroy(&share->physical_logging_rwlock);
+#endif
+    /* fall through */
   case 4:
     my_free((uchar*) share,MYF(0));
     /* fall through */
@@ -1173,11 +1210,13 @@ static void setup_key_functions(register
    Then calls _ma_state_info_write_sub().
 
    @param  share           table
-   @param  pWrite          bitmap: if 1 is set my_pwrite() is used otherwise
-                           my_write(); if 2 is set, info about keys is written
-                           (should only be needed after ALTER TABLE
-                           ENABLE/DISABLE KEYS, and REPAIR/OPTIMIZE); if 4 is
-                           set, MARIA_SHARE::intern_lock is taken.
+   @param  pWrite          bitmap: if 1 (MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET)
+                           is set my_pwrite() is used otherwise my_write();
+                           if 2 (MA_STATE_INFO_WRITE_FULL_INFO) is set, info
+                           about keys is written (should only be needed
+                           after ALTER TABLE ENABLE/DISABLE KEYS, and
+                           REPAIR/OPTIMIZE); if 4 (MA_STATE_INFO_WRITE_LOCK)
+                           is set, MARIA_SHARE::intern_lock is taken.
 
    @return Operation status
      @retval 0      OK
@@ -1190,7 +1229,7 @@ uint _ma_state_info_write(MARIA_SHARE *s
   if (share->options & HA_OPTION_READ_ONLY_DATA)
     return 0;
 
-  if (pWrite & 4)
+  if (pWrite & MA_STATE_INFO_WRITE_LOCK)
     pthread_mutex_lock(&share->intern_lock);
   else if (maria_multi_threaded)
   {
@@ -1208,8 +1247,8 @@ uint _ma_state_info_write(MARIA_SHARE *s
     DBUG_PRINT("info", ("is_of_horizon set to LSN (%lu,0x%lx)",
                         LSN_IN_PARTS(share->state.is_of_horizon)));
   }
-  res= _ma_state_info_write_sub(share->kfile.file, &share->state, pWrite);
-  if (pWrite & 4)
+  res= _ma_state_info_write_sub(share, share->kfile.file, &share->state, pWrite);
+  if (pWrite & MA_STATE_INFO_WRITE_LOCK)
     pthread_mutex_unlock(&share->intern_lock);
   share->changed= 0;
   return res;
@@ -1217,16 +1256,19 @@ uint _ma_state_info_write(MARIA_SHARE *s
 
 
 /**
-   @brief Function to save and store the header in the index file (.MYI).
+   @brief Function to save and store the header in the index file (.MAI).
 
    Shortcut to use instead of _ma_state_info_write() when appropriate.
 
+   @param  share           table's share
    @param  file            descriptor of the index file to write
    @param  state           state information to write to the file
-   @param  pWrite          bitmap: if 1 is set my_pwrite() is used otherwise
-                           my_write(); if 2 is set, info about keys is written
-                           (should only be needed after ALTER TABLE
-                           ENABLE/DISABLE KEYS, and REPAIR/OPTIMIZE).
+   @param  pWrite          bitmap: if 1 (MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET)
+                           is set my_pwrite() is used otherwise my_write();
+                           if 2 (MA_STATE_INFO_WRITE_FULL_INFO) is set, info
+                           about keys is written (should only be needed
+                           after ALTER TABLE ENABLE/DISABLE KEYS, and
+                           REPAIR/OPTIMIZE).
 
    @notes
      For transactional multiuser tables, this function is called
@@ -1240,13 +1282,19 @@ uint _ma_state_info_write(MARIA_SHARE *s
      @retval 1      Error
 */
 
-uint _ma_state_info_write_sub(File file, MARIA_STATE_INFO *state, uint pWrite)
+uint _ma_state_info_write_sub(MARIA_SHARE *share, File file,
+                              MARIA_STATE_INFO *state, uint pWrite)
 {
   uchar  buff[MARIA_STATE_INFO_SIZE + MARIA_STATE_EXTRA_SIZE];
   uchar *ptr=buff;
   uint	i, keys= (uint) state->header.keys;
   size_t res;
   DBUG_ENTER("_ma_state_info_write_sub");
+  DBUG_PRINT("enter",("records: %llu data_file_length: %llu "
+                      "key_file_length: %llu",
+                      (ulonglong)state->state.records,
+                      (ulonglong)state->state.data_file_length,
+                      (ulonglong)state->state.key_file_length));
 
   memcpy_fixed(ptr,&state->header,sizeof(state->header));
   ptr+=sizeof(state->header);
@@ -1286,7 +1334,7 @@ uint _ma_state_info_write_sub(File file,
     mi_sizestore(ptr,state->key_root[i]);		ptr+= 8;
   }
   mi_sizestore(ptr,state->key_del);	        	ptr+= 8;
-  if (pWrite & 2)				/* From maria_chk */
+  if (pWrite & MA_STATE_INFO_WRITE_FULL_INFO)	/* From maria_chk */
   {
     uint key_parts= mi_uint2korr(state->header.key_parts);
     mi_int4store(ptr,state->sec_index_changed); 	ptr+= 4;
@@ -1306,11 +1354,14 @@ uint _ma_state_info_write_sub(File file,
     }
   }
 
-  res= (pWrite & 1) ?
+  res= (pWrite & MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET) ?
     my_pwrite(file, buff, (size_t) (ptr-buff), 0L,
               MYF(MY_NABP | MY_THREADSAFE)) :
     my_write(file,  buff, (size_t) (ptr-buff),
              MYF(MY_NABP));
+  if (ma_get_physical_logging_state(share))
+    maria_log_pwrite_physical(MA_LOG_WRITE_BYTES_MAI,
+                              share, buff, (uint) (ptr-buff), 0L);
   DBUG_RETURN(res != 0);
 }
 
@@ -1381,21 +1432,53 @@ static uchar *_ma_state_info_read(uchar 
    @param  state           state which will be filled
 */
 
-uint _ma_state_info_read_dsk(File file __attribute__((unused)),
-                             MARIA_STATE_INFO *state __attribute__((unused)))
+/**
+  Read state info from file.
+
+  @param[in]        file        index file descriptor
+  @param[in,out]    state       state info to update from file
+  @param[in]        force       force read
+
+  @return           status
+    @retval         0           ok
+    @retval         1           error
+
+  Should not be called for transactional tables, as their state on disk is
+  rarely current and so is often misleading for a reader.
+  Does nothing in single user mode.
+
+  Normally this function does not read the state info from file if
+  'maria_single_user' is true. This means that mysqld is the only
+  program that works on the table files. No other program modifies the
+  files. Hence the in-memory state is expected to be current.
+
+  If there are other programs tampering with the files, mysqld must be
+  started with --external-locking. This makes 'maria_single_user'
+  false. In this case this function does indeed read the state from
+  disk.
+
+  In cases like restore, we modify the table files directly,
+  bypassing the Maria interface. We do this inside of mysqld, so
+  --external-locking need not be specified. We support this case by the
+  'force' parameter.
+*/
+
+uint _ma_state_info_read_dsk(File file, MARIA_STATE_INFO *state,
+                             my_bool force)
 {
-#ifdef EXTERNAL_LOCKING
   uchar	buff[MARIA_STATE_INFO_SIZE + MARIA_STATE_EXTRA_SIZE];
-
   /* trick to detect transactional tables */
-  DBUG_ASSERT(state->create_rename_lsn == LSN_IMPOSSIBLE);
-  if (!maria_single_user)
+  DBUG_ASSERT(force || (state->create_rename_lsn == LSN_IMPOSSIBLE));
+  if (
+#ifdef EXTERNAL_LOCKING
+      !maria_single_user ||
+#endif
+      force)
   {
     if (my_pread(file, buff, state->state_length, 0L, MYF(MY_NABP)))
       return 1;
     _ma_state_info_read(buff, state);
   }
-#endif
   return 0;
 }
 
@@ -1524,7 +1607,7 @@ uchar *_ma_keydef_read(uchar *ptr, MARIA
    keydef->underflow_block_length=keydef->block_length/3;
    keydef->version	= 0;			/* Not saved */
    keydef->parser       = &ft_default_parser;
-   keydef->ftparser_nr  = 0;
+   keydef->ftkey_nr     = 0;
    return ptr;
 }
 
@@ -1679,21 +1762,26 @@ void _ma_set_data_pagecache_callbacks(PA
 {
   file->callback_data= (uchar*) share;
   file->flush_log_callback= &maria_flush_log_for_page_none; /* Do nothing */
+  file->post_write_callback= &maria_flush_log_for_page_none;
 
   if (share->temporary)
   {
     file->read_callback=  &maria_page_crc_check_none;
-    file->write_callback= &maria_page_filler_set_none;
+    file->pre_write_callback= &maria_page_filler_set_none;
   }
   else
   {
+
     file->read_callback=  &maria_page_crc_check_data;
     if (share->options & HA_OPTION_PAGE_CHECKSUM)
-      file->write_callback= &maria_page_crc_set_normal;
+      file->pre_write_callback= &maria_page_crc_set_normal;
     else
-      file->write_callback= &maria_page_filler_set_normal;
+      file->pre_write_callback= &maria_page_filler_set_normal;
     if (share->now_transactional)
       file->flush_log_callback= maria_flush_log_for_page;
+#ifdef HAVE_MARIA_PHYSICAL_LOGGING
+    file->post_write_callback= &maria_log_data_page_flush_physical;
+#endif
   }
 }
 
@@ -1712,22 +1800,26 @@ void _ma_set_index_pagecache_callbacks(P
   file->callback_data= (uchar*) share;
   file->flush_log_callback= &maria_flush_log_for_page_none; /* Do nothing */
   file->write_fail= maria_page_write_failure;
+  file->post_write_callback= &maria_flush_log_for_page_none;
 
   if (share->temporary)
   {
     file->read_callback=  &maria_page_crc_check_none;
-    file->write_callback= &maria_page_filler_set_none;
+    file->pre_write_callback= &maria_page_filler_set_none;
   }
   else
   {
     file->read_callback=  &maria_page_crc_check_index;
     if (share->options & HA_OPTION_PAGE_CHECKSUM)
-      file->write_callback= &maria_page_crc_set_index;
+      file->pre_write_callback= &maria_page_crc_set_index;
     else
-      file->write_callback= &maria_page_filler_set_normal;
+      file->pre_write_callback= &maria_page_filler_set_normal;
 
     if (share->now_transactional)
       file->flush_log_callback= maria_flush_log_for_page;
+#ifdef HAVE_MARIA_PHYSICAL_LOGGING
+    file->post_write_callback= &maria_log_index_page_flush_physical;
+#endif
   }
 }
 

=== modified file 'storage/maria/ma_pagecache.c'
--- a/storage/maria/ma_pagecache.c	2009-01-27 02:08:48 +0000
+++ b/storage/maria/ma_pagecache.c	2009-02-13 16:30:54 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2000-2008 MySQL AB
+/* Copyright (C) 2000-2008 MySQL AB, 2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -617,12 +617,12 @@ static my_bool pagecache_fwrite(PAGECACH
   /* Todo: Integrate this with write_callback so we have only one callback */
   if ((*filedesc->flush_log_callback)(buffer, pageno, filedesc->callback_data))
     DBUG_RETURN(1);
-  DBUG_PRINT("info", ("write_callback: 0x%lx  data: 0x%lx",
-                      (ulong) filedesc->write_callback,
+  DBUG_PRINT("info", ("pre_write_callback: 0x%lx  data: 0x%lx",
+                      (ulong) filedesc->pre_write_callback,
                       (ulong) filedesc->callback_data));
-  if ((*filedesc->write_callback)(buffer, pageno, filedesc->callback_data))
+  if ((*filedesc->pre_write_callback)(buffer, pageno, filedesc->callback_data))
   {
-    DBUG_PRINT("error", ("write callback problem"));
+    DBUG_PRINT("error", ("pre_write callback problem"));
     DBUG_RETURN(1);
   }
   if (my_pwrite(filedesc->file, buffer, pagecache->block_size,
@@ -631,6 +631,11 @@ static my_bool pagecache_fwrite(PAGECACH
     (*filedesc->write_fail)(filedesc->callback_data);
     DBUG_RETURN(1);
   }
+  if ((*filedesc->post_write_callback)(buffer, pageno, filedesc->callback_data))
+  {
+    DBUG_PRINT("error", ("post_write callback problem"));
+    DBUG_RETURN(1);
+  }
   DBUG_RETURN(0);
 }
 
@@ -2974,7 +2979,11 @@ void pagecache_unlock_by_link(PAGECACHE 
     }
     if (lsn != LSN_IMPOSSIBLE)
       check_and_set_lsn(pagecache, lsn, block);
-    block->status&= ~PCBLOCK_ERROR;
+    /*
+      Reset error flag. Mark also that page is active; This may not have
+      been the case if there was an error reading the page
+    */
+    block->status= (block->status & ~PCBLOCK_ERROR) | PCBLOCK_READ;
   }
 
   /* if we lock for write we must link the block to changed blocks */
@@ -4223,11 +4232,11 @@ static int flush_cached_blocks(PAGECACHE
        @todo IO If page is contiguous with next page to flush, group flushes
        in one single my_pwrite().
     */
-    /*
+    /**
       It is important to use block->hash_link->file below and not 'file', as
-      the first one is right and the second may have different content (and
-      this matters for callbacks, bitmap pages and data pages have different
-      ones).
+      the first one is right and the second may have different out-of-date
+      content (see StaleFilePointersInFlush in ma_checkpoint.c).
+      @todo change argument of functions to be File.
     */
     error= pagecache_fwrite(pagecache, &block->hash_link->file,
                             block->buffer,

=== modified file 'storage/maria/ma_pagecache.h'
--- a/storage/maria/ma_pagecache.h	2008-10-14 15:18:14 +0000
+++ b/storage/maria/ma_pagecache.h	2009-01-28 11:08:55 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2006 MySQL AB, 2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -84,12 +84,15 @@ typedef struct st_pagecache_file
   my_bool (*read_callback)(uchar *page, pgcache_page_no_t offset,
                            uchar *data);
   /** Cannot be NULL */
-  my_bool (*write_callback)(uchar *page, pgcache_page_no_t offset,
-                            uchar *data);
+  my_bool (*pre_write_callback)(uchar *page, pgcache_page_no_t offset,
+                                uchar *data);
   void (*write_fail)(uchar *data);
   /** Cannot be NULL */
   my_bool (*flush_log_callback)(uchar *page, pgcache_page_no_t offset,
                                 uchar *data);
+  /** Cannot be NULL */
+  my_bool (*post_write_callback)(uchar *page, pgcache_page_no_t offset,
+                                 uchar *data);
   uchar *callback_data;
 } PAGECACHE_FILE;
 
@@ -264,10 +267,11 @@ extern void pagecache_unpin_by_link(PAGE
 /* PCFLUSH_ERROR and PCFLUSH_PINNED. */
 #define PCFLUSH_PINNED_AND_ERROR (PCFLUSH_ERROR|PCFLUSH_PINNED)
 
-#define pagecache_file_init(F,RC,WC,WF,GLC,D) \
+#define pagecache_file_init(F,RC,PREWC,WF,GLC,POSTWC,D) \
   do{ \
-    (F).read_callback= (RC); (F).write_callback= (WC); \
+    (F).read_callback= (RC); (F).pre_write_callback= (PREWC);   \
     (F).write_fail= (WF); \
+    (F).post_write_callback= (POSTWC);                             \
     (F).flush_log_callback= (GLC); (F).callback_data= (uchar*)(D); \
   } while(0)
 

=== modified file 'storage/maria/ma_pagecrc.c'
--- a/storage/maria/ma_pagecrc.c	2008-03-04 11:58:21 +0000
+++ b/storage/maria/ma_pagecrc.c	2009-01-28 11:08:55 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2008 MySQL AB
+/* Copyright (C) 2007-2008 MySQL AB, 2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -376,3 +376,52 @@ my_bool maria_flush_log_for_page_none(uc
 {
   return 0;
 }
+
+
+/**
+  Logs when the page cache flushes an index page to the file, to the physical
+  log.
+
+  Argument cannot be a MARIA_HA* (the MARIA_HA which put the page in the key
+  cache may have been freed long ago when the page is finally flushed), it is
+  MARIA_SHARE* which is sure to be valid.
+
+  @param page            The page data to set
+  @param page_no         The page number (<offset>/<page length>)
+  @param data_ptr        Write callback data pointer (pointer to MARIA_SHARE)
+
+  @return Operation status, always 0
+    @retval 0      ok. Yes, even if log write fails we return ok, don't want
+                   to make the table writer believe its table is now
+                   corrupted.
+*/
+
+my_bool maria_log_index_page_flush_physical(uchar *page,
+                                            pgcache_page_no_t page_no,
+                                            uchar *data_ptr)
+{
+  MARIA_SHARE *share= (MARIA_SHARE *)data_ptr;
+  DBUG_ENTER("maria_log_index_page_flush_physical");
+  if (unlikely(ma_log_index_pages_physical &&
+               ma_get_physical_logging_state(share)))
+    maria_log_pwrite_physical(MA_LOG_WRITE_BYTES_MAI, share, page,
+                              maria_block_size, page_no * maria_block_size);
+  DBUG_RETURN(0);
+}
+
+
+/**
+  Same as maria_log_index_page_flush_physical() but for data page.
+*/
+
+my_bool maria_log_data_page_flush_physical(uchar *page,
+                                           pgcache_page_no_t page_no,
+                                           uchar *data_ptr)
+{
+  MARIA_SHARE *share= (MARIA_SHARE *)data_ptr;
+  DBUG_ENTER("maria_log_data_page_flush_physical");
+  if (unlikely(ma_get_physical_logging_state(share)))
+    maria_log_pwrite_physical(MA_LOG_WRITE_BYTES_MAD, share, page,
+                              maria_block_size, page_no * maria_block_size);
+  DBUG_RETURN(0);
+}

=== modified file 'storage/maria/ma_recovery.c'
--- a/storage/maria/ma_recovery.c	2009-01-27 02:08:48 +0000
+++ b/storage/maria/ma_recovery.c	2009-02-13 16:30:54 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006, 2007 MySQL AB
+/* Copyright (C) 2006, 2007 MySQL AB, 2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -98,6 +98,7 @@ prototype_redo_exec_hook(UNDO_KEY_DELETE
 prototype_redo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT);
 prototype_redo_exec_hook(COMMIT);
 prototype_redo_exec_hook(CLR_END);
+prototype_redo_exec_hook(DEBUG_INFO);
 prototype_undo_exec_hook(UNDO_ROW_INSERT);
 prototype_undo_exec_hook(UNDO_ROW_DELETE);
 prototype_undo_exec_hook(UNDO_ROW_UPDATE);
@@ -488,6 +489,11 @@ static void display_record_position(cons
          number ? "" : "   ", number, LSN_IN_PARTS(rec->lsn),
          rec->short_trid, log_desc->name, rec->type,
          (ulong)rec->record_length);
+  if (rec->type == LOGREC_DEBUG_INFO)
+  {
+    /* Print some extra information */
+    (*log_desc->record_execute_in_redo_phase)(rec);
+  }
 }
 
 
@@ -1412,6 +1418,9 @@ prototype_redo_exec_hook(REDO_INSERT_ROW
 {
   int error= 1;
   uchar *buff;
+  uint number_of_blobs, number_of_ranges;
+  pgcache_page_no_t first_page, last_page;
+  char llbuf1[22], llbuf2[22];
   MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
   if (info == NULL)
     return 0;
@@ -1426,11 +1435,19 @@ prototype_redo_exec_hook(REDO_INSERT_ROW
   }
   buff= log_record_buffer.str;
   if (_ma_apply_redo_insert_row_blobs(info, current_group_end_lsn,
-                                      buff, rec->lsn))
-    goto end;
+                                      buff, rec->lsn, &number_of_blobs,
+                                      &number_of_ranges,
+                                      &first_page, &last_page))
+    goto end;
+  llstr(first_page, llbuf1);
+  llstr(last_page, llbuf2);
+  tprint(tracef, " %u blobs %u ranges, first page %s last %s",
+         number_of_blobs, number_of_ranges, llbuf1, llbuf2);
+
   error= 0;
 
 end:
+  tprint(tracef, " \n");
   return error;
 }
 
@@ -1993,6 +2010,37 @@ prototype_redo_exec_hook(CLR_END)
 
 
 /**
+   Hook to print debug information (like MySQL query)
+*/
+
+prototype_redo_exec_hook(DEBUG_INFO)
+{
+  uchar *data;
+  enum translog_debug_info_type debug_info;
+
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+      rec->record_length)
+  {
+    eprint(tracef, "Failed to read record debug record");
+    return 1;
+  }
+  debug_info= (enum translog_debug_info_type) log_record_buffer.str[0];
+  data= log_record_buffer.str + 1;
+  switch (debug_info) {
+  case LOGREC_DEBUG_INFO_QUERY:
+    tprint(tracef, "Query: %s\n", (char*) data);
+    break;
+  default:
+    DBUG_ASSERT(0);
+  }
+  return 0;
+}
+
+
+/**
   In some cases we have to skip execution of an UNDO record during the UNDO
   phase.
 */
@@ -2350,6 +2398,7 @@ static int run_redo_phase(LSN lsn, enum 
   install_redo_exec_hook(UNDO_BULK_INSERT);
   install_undo_exec_hook(UNDO_BULK_INSERT);
   install_redo_exec_hook(IMPORTED_TABLE);
+  install_redo_exec_hook(DEBUG_INFO);
 
   current_group_end_lsn= LSN_IMPOSSIBLE;
 #ifndef DBUG_OFF
@@ -2760,7 +2809,8 @@ static void prepare_table_for_close(MARI
       cmp_translog_addr(share->lsn_of_file_id, horizon) < 0)
   {
     share->state.is_of_horizon= horizon;
-    _ma_state_info_write_sub(share->kfile.file, &share->state, 1);
+    _ma_state_info_write_sub(share, share->kfile.file, &share->state,
+                             MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET);
   }
 
   /*
@@ -3276,7 +3326,10 @@ void _ma_tmp_disable_logging_for_table(M
 /**
    Re-enables logging for a table which had it temporarily disabled.
 
-   Only the thread which disabled logging is allowed to reenable it.
+   Only the thread which disabled logging is allowed to reenable it. Indeed,
+   re-enabling logging affects all open instances, so one must have exclusive
+   access to the table to do that. In practice, the one which disables has
+   such access.
 
    @param  info            table
    @param  flush_pages     if function needs to flush pages first
@@ -3315,7 +3368,9 @@ my_bool _ma_reenable_logging_for_table(M
       */
       if (_ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
                                 FLUSH_RELEASE, FLUSH_RELEASE) ||
-          _ma_state_info_write(share, 1|4) ||
+          _ma_state_info_write(share,
+                               MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+                               MA_STATE_INFO_WRITE_LOCK) ||
           _ma_sync_table_files(info))
         DBUG_RETURN(1);
     }
@@ -3392,6 +3447,7 @@ static void print_redo_phase_progress(TR
   }
 }
 
+
 #ifdef MARIA_EXTERNAL_LOCKING
 #error Marias Checkpoint and Recovery are really not ready for it
 #endif

=== modified file 'storage/maria/ma_rename.c'
--- a/storage/maria/ma_rename.c	2008-08-06 14:03:27 +0000
+++ b/storage/maria/ma_rename.c	2008-12-22 00:17:37 +0000
@@ -104,11 +104,6 @@ int maria_rename(const char *old_name, c
   }
 
   maria_close(info);
-#ifdef USE_RAID
-#ifdef EXTRA_DEBUG
-  _ma_check_table_is_closed(old_name,"rename raidcheck");
-#endif
-#endif /* USE_RAID */
 
   fn_format(from,old_name,"",MARIA_NAME_IEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
   fn_format(to,new_name,"",MARIA_NAME_IEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);

=== modified file 'storage/maria/ma_rt_index.c'
--- a/storage/maria/ma_rt_index.c	2008-06-26 05:18:28 +0000
+++ b/storage/maria/ma_rt_index.c	2009-01-26 21:14:43 +0000
@@ -433,7 +433,7 @@ int maria_rtree_get_first(MARIA_HA *info
   info->maria_rtree_recursion_depth= -1;
   info->keyread_buff_used= 1;
 
-  return maria_rtree_get_req(info, &keyinfo[keynr], key_length, root, 0);
+  return maria_rtree_get_req(info, keyinfo, key_length, root, 0);
 }
 
 

=== modified file 'storage/maria/ma_state.c'
--- a/storage/maria/ma_state.c	2008-12-09 12:36:51 +0000
+++ b/storage/maria/ma_state.c	2009-02-13 16:30:54 +0000
@@ -91,18 +91,26 @@ my_bool _ma_setup_live_state(MARIA_HA *i
     It's enough to compare trids here (instead of calling
     tranman_can_read_from) as history->trid is a commit_trid
   */
-  while (trn->trid < history->trid && history->trid != ~(TrID)0)
+  while (trn->trid <= history->trid)
     history= history->next;
   pthread_mutex_unlock(&share->intern_lock);
   /* The current item can't be deleted as it's the first one visible for us */
   tables->state_start=  tables->state_current= history->state;
-  tables->state_current.changed= 0;
+  tables->state_current.changed= tables->state_current.no_transid= 0;
 
   DBUG_PRINT("info", ("records: %ld", (ulong) tables->state_start.records));
 
 end:
   info->state_start= &tables->state_start;
   info->state= &tables->state_current;
+
+  /*
+    Record in the transaction state whether we are not using transid
+    (versioning) on rows. If we are not, _ma_trnman_end_trans_hook() will
+    ensure that the state is visible to all at the end of the transaction.
+  */
+  tables->state_current.no_transid|= !(info->row_flag & ROW_FLAG_TRANSID);
+
   DBUG_RETURN(0);
 }
 
@@ -406,51 +414,70 @@ my_bool _ma_trnman_end_trans_hook(TRN *t
       MARIA_STATE_HISTORY *history;
 
       pthread_mutex_lock(&share->intern_lock);
-      if (active_transactions && share->now_transactional &&
-          trnman_exists_active_transactions(share->state_history->trid,
-                                            trn->commit_trid, 1))
+
+      /* We only have to update history state if something changed */
+      if (tables->state_current.changed)
       {
-        /*
-          There exist transactions that are still using the current
-          share->state_history.  Create a new history item for this
-          commit and add it first in the state_history list. This
-          ensures that all history items are stored in the list in
-          decresing trid order.
-        */
-        if (!(history= my_malloc(sizeof(*history), MYF(MY_WME))))
+        if (tables->state_current.no_transid)
         {
-          /* purecov: begin inspected */
-          error= 1;
-          pthread_mutex_unlock(&share->intern_lock);
-          my_free(tables, MYF(0));
-          continue;
-          /* purecov: end */
+          /*
+            The change was done without using transid on rows (like in
+            bulk insert). In this case this thread is the only one
+            that is using the table and all rows will be visble
+            for all transactions.
+          */
+          _ma_reset_history(share);
+        }
+        else
+        {
+          if (active_transactions && share->now_transactional &&
+              trnman_exists_active_transactions(share->state_history->trid,
+                                                trn->commit_trid, 1))
+          {
+            /*
+              There exist transactions that are still using the current
+              share->state_history.  Create a new history item for this
+              commit and add it first in the state_history list. This
+              ensures that all history items are stored in the list in
+              decreasing trid order.
+            */
+            if (!(history= my_malloc(sizeof(*history), MYF(MY_WME))))
+            {
+              /* purecov: begin inspected */
+              error= 1;
+              pthread_mutex_unlock(&share->intern_lock);
+              my_free(tables, MYF(0));
+              continue;
+              /* purecov: end */
+            }
+            history->state= share->state_history->state;
+            history->next= share->state_history;
+            share->state_history= history;
+          }
+          else
+          {
+            /* Previous history can't be seen by anyone, reuse old memory */
+            history= share->state_history;
+            DBUG_PRINT("info", ("removing history->trid: %lu  new: %lu",
+                                (ulong) history->trid,
+                                (ulong) trn->commit_trid));
+          }
+
+          history->state.records+= (tables->state_current.records -
+                                    tables->state_start.records);
+          history->state.checksum+= (tables->state_current.checksum -
+                                     tables->state_start.checksum);
+          history->trid= trn->commit_trid;
+
+          if (history->next)
+          {
+            /* Remove not visible states */
+            share->state_history= _ma_remove_not_visible_states(history, 0, 1);
+          }
+          DBUG_PRINT("info", ("share: 0x%lx  in_trans: %d",
+                              (ulong) share, share->in_trans));
         }
-        history->state= share->state_history->state;
-        history->next= share->state_history;
-        share->state_history= history;
-      }
-      else
-      {
-        /* Previous history can't be seen by anyone, reuse old memory */
-        history= share->state_history;
-        DBUG_PRINT("info", ("removing history->trid: %lu  new: %lu",
-                            (ulong) history->trid, (ulong) trn->commit_trid));
-      }
-
-      history->state.records+= (tables->state_current.records -
-                                tables->state_start.records);
-      history->state.checksum+= (tables->state_current.checksum -
-                                 tables->state_start.checksum);
-      history->trid= trn->commit_trid;
-
-      if (history->next)
-      {
-        /* Remove not visible states */
-        share->state_history= _ma_remove_not_visible_states(history, 0, 1);
       }
-      DBUG_PRINT("info", ("share: 0x%lx  in_trans: %d",
-                          (ulong) share, share->in_trans));
       share->in_trans--;
       pthread_mutex_unlock(&share->intern_lock);
     }
@@ -511,7 +538,6 @@ void _ma_remove_table_from_trnman(MARIA_
 
 
 
-
 /****************************************************************************
   The following functions are called by thr_lock() in threaded applications
   for transactional tables.
@@ -536,9 +562,24 @@ void _ma_block_get_status(void* param, m
   info->row_flag= info->s->base.default_row_flag;
   if (concurrent_insert)
   {
+    DBUG_ASSERT(info->lock.type == TL_WRITE_CONCURRENT_INSERT);
     info->row_flag|= ROW_FLAG_TRANSID;
     info->row_base_length+= TRANSID_SIZE;
   }
+  else
+  {
+    DBUG_ASSERT(info->lock.type != TL_WRITE_CONCURRENT_INSERT);
+  }
+
+  if (info->s->lock_key_trees)
+  {
+    /*
+      Assume for now that this doesn't fail (It can only fail in
+      out of memory conditions)
+      TODO: Fix this by having one extra state pre-allocated
+    */
+    (void) _ma_setup_live_state(info);
+  }
   DBUG_VOID_RETURN;
 }
 
@@ -574,7 +615,16 @@ void maria_versioning(MARIA_HA *info, my
 {
   /* For now, this is a hack */
   if (info->s->have_versioning)
+  {
+    enum thr_lock_type save_lock_type;
+    /* Assume this is a non-threaded application (for now) */
+    info->s->lock_key_trees= 0;
+    /* Set up info->lock.type temporarily for _ma_block_get_status() */
+    save_lock_type= info->lock.type;
+    info->lock.type= versioning ? TL_WRITE_CONCURRENT_INSERT : TL_WRITE;
     _ma_block_get_status((void*) info, versioning);
+    info->lock.type= save_lock_type;
+  }
 }
 
 
@@ -609,6 +659,7 @@ void _ma_copy_nontrans_state_information
 void _ma_reset_history(MARIA_SHARE *share)
 {
   MARIA_STATE_HISTORY *history, *next;
+  DBUG_ENTER("_ma_reset_history");
 
   share->state_history->trid= 0;          /* Visibly by all */
   share->state_history->state= share->state.state;
@@ -620,6 +671,7 @@ void _ma_reset_history(MARIA_SHARE *shar
     next= history->next;
     my_free(history, MYF(0));
   }
+  DBUG_VOID_RETURN;
 }
 
 

=== modified file 'storage/maria/ma_state.h'
--- a/storage/maria/ma_state.h	2008-12-08 20:09:59 +0000
+++ b/storage/maria/ma_state.h	2008-12-22 00:17:37 +0000
@@ -17,14 +17,15 @@
 
 typedef struct st_maria_status_info
 {
-  ha_rows records;				/* Rows in table */
-  ha_rows del;					/* Removed rows */
-  my_off_t empty;				/* lost space in datafile */
-  my_off_t key_empty;				/* lost space in indexfile */
+  ha_rows records;                      /* Rows in table */
+  ha_rows del;                          /* Removed rows */
+  my_off_t empty;                       /* lost space in datafile */
+  my_off_t key_empty;                   /* lost space in indexfile */
   my_off_t key_file_length;
   my_off_t data_file_length;
   ha_checksum checksum;
-  my_bool     changed;
+  uint32 changed:1,                     /* Set if table was changed */
+         no_transid:1;                  /* Set if no transid was set on rows */
 } MARIA_STATUS_INFO;
 
 

=== modified file 'storage/maria/ma_static.c'
--- a/storage/maria/ma_static.c	2008-10-16 08:54:53 +0000
+++ b/storage/maria/ma_static.c	2009-01-28 11:08:55 +0000
@@ -1,4 +1,5 @@
-/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB,
+   2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -14,8 +15,9 @@
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
 
 
-/*
-  Static variables for MARIA library. All definied here for easy making of
+/**
+  @file
+  Static variables for MARIA library. All defined here for easy making of
   a shared library
 */
 
@@ -31,13 +33,17 @@ uchar	maria_pack_file_magic[]=
 { (uchar) 254, (uchar) 254, (uchar) 10, '\001', };
 /* Unique number for this maria instance */
 uchar   maria_uuid[MY_UUID_SIZE];
+IO_CACHE maria_physical_log; /**< Physical log (used by online backup) */
 uint	maria_quick_table_bits=9;
 ulong	maria_block_size= MARIA_KEY_BLOCK_LENGTH;
 my_bool maria_flush= 0, maria_single_user= 0;
 my_bool maria_delay_key_write= 0, maria_page_checksums= 1;
 my_bool maria_inited= FALSE;
 my_bool maria_in_ha_maria= FALSE; /* If used from ha_maria or not */
+/** For insert/delete in the list of Maria open tables */
 pthread_mutex_t THR_LOCK_maria;
+/** For writing to the Maria logs */
+pthread_mutex_t THR_LOCK_maria_log;
 #if defined(THREAD) && !defined(DONT_USE_RW_LOCKS)
 ulong maria_concurrent_insert= 2;
 #else
@@ -105,3 +111,34 @@ static int always_valid(const char *file
 }
 
 int (*maria_test_invalid_symlink)(const char *filename)= always_valid;
+
+/** Hash of all tables for which we want physical logging */
+const HASH *ma_log_tables_physical;
+/**
+  If page changes to the index file should be logged to the physical log.
+
+  @note Changes to the header of the index file of a table in physical
+  logging are always logged because the header is not redundant with the data
+  file.
+*/
+my_bool ma_log_index_pages_physical;
+
+/**
+  All Maria-specific error messages which may be sent to the user.
+  They will be localized (translated) as part of
+  http://forge.mysql.com/worklog/task.php?id=2940
+  "MySQL plugin interface: error reporting".
+  Same order as enum myisam_errors.
+*/
+const char *maria_error_messages[] =
+{
+  "online backup impossible with --external-locking",
+  "backup archive format has too recent version (%u) (current: %u)"
+};
+
+static inline void maria_error_messages_dummy_validator()
+{
+  compile_time_assert((sizeof(maria_error_messages) /
+                       sizeof(maria_error_messages[0])) ==
+                      (-MARIA_ERR_LAST-1));
+}

=== modified file 'storage/maria/ma_write.c'
--- a/storage/maria/ma_write.c	2008-11-20 19:18:59 +0000
+++ b/storage/maria/ma_write.c	2009-02-13 16:30:54 +0000
@@ -1,4 +1,5 @@
-/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+/* Copyright (C) 2004-2008 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+   Copyright (C) 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -39,8 +40,8 @@ static uchar *_ma_find_last_pos(MARIA_HA
                                 uchar *page, uchar **after_key);
 static my_bool _ma_ck_write_tree(register MARIA_HA *info, MARIA_KEY *key);
 static my_bool _ma_ck_write_btree(register MARIA_HA *info, MARIA_KEY *key);
-static int _ma_ck_write_btree_with_log(MARIA_HA *info, MARIA_KEY *key,
-                                       my_off_t *root, uint32 comp_flag);
+static int _ma_ck_write_btree_with_log(MARIA_HA *, MARIA_KEY *, my_off_t *,
+                                       uint32);
 static my_bool _ma_log_split(MARIA_HA *info, my_off_t page, const uchar *buff,
                              uint org_length, uint new_length,
                              const uchar *key_pos,
@@ -181,9 +182,8 @@ int maria_write(MARIA_HA *info, uchar *r
       else
       {
         while (keyinfo->ck_insert(info,
-                                  (*keyinfo->make_key)(info, &int_key, i,
-                                                       buff, record, filepos,
-                                                       info->trn->trid)))
+                 (*keyinfo->make_key)(info, &int_key, i, buff, record,
+                                      filepos, info->trn->trid)))
         {
           TRN *blocker;
           DBUG_PRINT("error",("Got error: %d on write",my_errno));
@@ -193,10 +193,12 @@ int maria_write(MARIA_HA *info, uchar *r
             below doesn't work for them.
             Also, filter out non-thread maria use, and table modified in
             the same transaction.
+            Finally, filter out non-dup-unique errors.
           */
           if (!local_lock_tree)
             goto err;
-          if (info->dup_key_trid == info->trn->trid)
+          if (info->dup_key_trid == info->trn->trid ||
+              my_errno != HA_ERR_FOUND_DUPP_KEY)
           {
 	    rw_unlock(&keyinfo->root_lock);
             goto err;
@@ -230,9 +232,11 @@ int maria_write(MARIA_HA *info, uchar *r
             /* running. now we wait */
             WT_RESOURCE_ID rc;
             int res;
+            const char *old_proc_info; 
 
             rc.type= &ma_rc_dup_unique;
-            rc.value= (intptr)blocker; /* TODO savepoint id when we'll have them */
+            /* TODO savepoint id when we'll have them */
+            rc.value= (intptr)blocker;
             res= wt_thd_will_wait_for(info->trn->wt, blocker->wt, & rc);
             if (res != WT_OK)
             {
@@ -240,14 +244,12 @@ int maria_write(MARIA_HA *info, uchar *r
               my_errno= HA_ERR_LOCK_DEADLOCK;
               goto err;
             }
-            {
-              const char *old_proc_info= proc_info_hook(0,
-                    "waiting for a resource", __func__, __FILE__, __LINE__);
-
-              res= wt_thd_cond_timedwait(info->trn->wt, & blocker->state_lock);
+            old_proc_info= proc_info_hook(0,
+                                          "waiting for a resource",
+                                          __func__, __FILE__, __LINE__);
+            res= wt_thd_cond_timedwait(info->trn->wt, & blocker->state_lock);
+            proc_info_hook(0, old_proc_info, __func__, __FILE__, __LINE__);
 
-              proc_info_hook(0, old_proc_info, __func__, __FILE__, __LINE__);
-            }
             pthread_mutex_unlock(& blocker->state_lock);
             if (res != WT_OK)
             {
@@ -257,6 +259,9 @@ int maria_write(MARIA_HA *info, uchar *r
             }
           }
           rw_wrlock(&keyinfo->root_lock);
+#ifndef MARIA_CANNOT_ROLLBACK
+          keyinfo->version++;
+#endif
         }
       }
 
@@ -320,6 +325,8 @@ err:
   fatal_error= 0;
   if (my_errno == HA_ERR_FOUND_DUPP_KEY ||
       my_errno == HA_ERR_RECORD_FILE_FULL ||
+      my_errno == HA_ERR_LOCK_DEADLOCK ||
+      my_errno == HA_ERR_LOCK_WAIT_TIMEOUT ||
       my_errno == HA_ERR_NULL_IN_SPATIAL ||
       my_errno == HA_ERR_OUT_OF_MEM)
   {
@@ -671,12 +678,14 @@ static int w_search(register MARIA_HA *i
         When the index will support true versioning - with multiple
         identical values in the UNIQUE index, invisible to each other -
         the following should be changed to "continue inserting keys, at the
-        end (of the row or statement) wait". Until it's done we cannot properly
-        support deadlock timeouts.
+        end (of the row or statement) wait". We need to wait on *all*
+        unique conflicts at once, not one-at-a-time, because we need to
+        know all blockers in advance, otherwise we'll have incomplete wait-for
+        graph.
       */
       /*
-        transaction that has inserted the conflicting key is in progress.
-        wait for it to be committed or aborted.
+        transaction that has inserted the conflicting key may be in progress.
+        the caller will wait for it to be committed or aborted.
       */
       info->dup_key_trid= _ma_trid_from_key(&tmp_key);
       info->dup_key_pos= dup_key_pos;

=== added file 'storage/maria/maria_backup_engine.cc'
--- a/storage/maria/maria_backup_engine.cc	1970-01-01 00:00:00 +0000
+++ b/storage/maria/maria_backup_engine.cc	2009-02-13 18:16:54 +0000
@@ -0,0 +1,2180 @@
+/* Copyright (C) 2009 - 2009 Sun Microsystems, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/**
+  @file
+  Online backup engine for the Maria storage engine.
+
+  This is a clone of MyISAM's online backup, same design, same good and bad
+  points.
+  Later, for transactional tables, we will change the design so that backup:
+  1) re-uses the transaction log instead of adding a physical log
+  2) has an instant validity-point creation (roughly, just block commits).
+
+  @see maria_backup
+*/
+
+#define MYSQL_SERVER 1 // need it to have mysql_tmpdir defined
+#include "mysql_priv.h"
+#include "ha_maria.h"
+C_MODE_START
+#include "maria_def.h" // to access dfile and kfile
+#include "ma_blockrec.h"
+C_MODE_END
+#include "backup/backup_engine.h"
+#include "backup/backup_aux.h"         // for build_table_list()
+#include "debug_sync.h"
+#include <hash.h>
+
+/**
+  Online backup engine for the Maria storage engine.
+
+  Reference of the Online Backup API:
+  http://forge.mysql.com/source/OnlineBackup.
+
+  Here is how the Maria online backup works.
+  It is online because we dirtily copy the data and index files,
+  and the tables maintain a physical idempotent log of changes done to them
+  during the copy process, applying this log to the dirty copy yields a clean
+  table corresponding to how the original table was when logging ended.
+  Idempotent means that if you apply such log to a table, then applying it a
+  second time has no effect.
+
+  A condition for this to work is that any update done to a table after the
+  copy process started must be present in the log. See the comment of
+  ma_log_start_physical() for how this is ensured.
+
+  HOW THE BACKUP WORKS
+
+  In Backup::begin(), we instruct all needed tables to do backup
+  logging; this does not have to wait for existing updates to complete,
+  neither does it stall new updates.
+
+  Then we dirtily copy them in Backup::get_data(). That copy is intensive on
+  the hard drive, so can be optionally throttled (via a configurable sleep).
+
+  When the copy process is done with tables, it signals the backup kernel
+  that it is ready to lock tables (to create a validity point).
+  To not waste its time until the Backup::prelock() request is sent by the
+  backup kernel, the copy process starts copying the log.
+
+  Now the Backup::prelock() request comes.
+  To finish the backup, we need to synchronize (=read-lock) all tables of the
+  backup (thus creating a consistent state across them), stop logging for
+  all of them, and unlock tables. This lock can wait for a long time if there
+  is a long running update. If it waited a long time, other drivers which have
+  already executed their lock(), would stay locked for a long time. To avoid
+  that, we do all the locking work before Backup::lock(), in
+  Backup::prelock() (called before lock() on any driver). Backup::prelock()
+  itself is not allowed to block, because it is called from the backup
+  kernel's thread: so it launches a separate thread (which will issue a LOCK
+  TABLES READ on our tables) and does not wait for completion of LOCK TABLES
+  READ: it immediately returns backup::OK which means "I have not completed my
+  preparations for locking".
+
+  In Backup::get_data(), the driver monitors the status of the locking
+  thread, and when finally that thread has managed to get its locks, we stop
+  logging and reply backup::READY.
+
+  So note the difference: this time, we have to wait for all updates to
+  finish, and stall new ones.
+
+  Next Backup::get_data() calls, if there are, send the final tail of the
+  log.
+
+  Backup::lock() comes, it's an empty operation for the driver.
+
+  Later we get a Backup::unlock() request. That kills the locking thread,
+  which thus unlocks tables. And Backup::end() cleans up memory.
+
+  HOW THE RESTORE WORKS
+
+  In Restore::send_data() we receive data which we write to tables (those
+  tables have just been created with their correct structure, but no data, by
+  the backup kernel). We similarly restore the log.
+
+  In Restore::end(), we apply the log to tables, making them clean.
+  If we backed up only the header of the index file (an option), we do an
+  index rebuild here.
+  Voila, the table is ready to work.
+
+  @todo if an index rebuild is needed, possibly do it at backup time.
+*/
+namespace maria_backup {
+
+using backup::byte;
+using backup::result_t;
+using backup::version_t;
+using backup::Table_list;
+using backup::Table_ref;
+using backup::Buffer;
+
+/**
+  The current version of the format stored in Maria backup images by this
+  code. Increase it when making a backward-incompatible change.
+*/
+#define MARIA_BACKUP_VERSION 1
+
+/**
+  Restore kernel opens tables and locks them for the duration of restore
+  (after having created them empty); this means that cached objects stay
+  around (MARIA_SHARE, MARIA_HA) and can become out-of-sync with the
+  data/index file filled by the driver, unless we take precautions which are
+  recognizable by this symbol.
+*/
+#define RESTORE_KERNEL_KEEPS_OPEN_TABLES 1
+/**
+  Restore kernel leaves a time window between end of creation of table (via
+  execution of CREATE TABLE) and locking of this table; in this window another
+  client can open/lock/modify/unlock the table, which conflicts with what the
+  driver is going to write to the data/index file, unless we take precautions
+  which are recognizable by this symbol.
+*/
+#define RESTORE_KERNEL_NOT_ATOMIC 1
+
+/**
+  Like Table_ref but with file name added.
+
+  Keeps its own String copies of the database name, the table name and the
+  internal name of the table; all three are filled by the constructor from
+  the Table_ref it is given.
+*/
+class Maria_table_ref
+{
+public:
+  /** Copies the names out of the given Table_ref */
+  Maria_table_ref(const Table_ref &);
+protected:
+  String db, name;
+  String file_name; ///< concatenation of db and table name
+};
+
+
+Maria_table_ref::Maria_table_ref(const Table_ref &tbl)
+{
+  char internal_path[FN_REFLEN];
+  bool append_failed;
+
+  /**
+    Local copies of the db and name are kept because, during restore, the
+    Table_ref is apparently modified before the Table_restore is done
+    (symptom: starting from the second Table_restore::send_data() the
+    Table_ref is garbage, which is a problem in
+    Table_restore::post_restore()). Rafal suspects a bug.
+    @todo Once fixed, "String db,name" can be replaced by "Table_ref &ref",
+    which will save memory.
+    As Rafal is changing relevant code now, it may go away.
+  */
+  append_failed= (db.append(tbl.db().name()) != 0);
+  /* every append is attempted, even if an earlier one already failed */
+  append_failed|= (name.append(tbl.name()) != 0);
+  /*
+    Note: repairing the table goes through open_temporary_table(), which
+    wants db and table name separated. The internal_name is the translated
+    table name with ASCII characters only.
+  */
+  (void) tbl.internal_name(internal_path, sizeof(internal_path));
+  append_failed|= (file_name.append(internal_path) != 0);
+  /*
+    If any string allocation failed, clear all three. This should be
+    noticed later, when the information is used.
+  */
+  if (append_failed)
+  {
+    db.set("", 0, system_charset_info);
+    name.set("", 0, system_charset_info);
+    file_name.set("", 0, system_charset_info);
+  }
+}
+
+
+/**
+  Backup engine class. It is the master class: a Backup_engine creates a
+  Backup_driver and a corresponding Restore_driver. @see backup::Engine.
+*/
+class Engine: public Backup_engine
+{
+public:
+  Engine() {}
+  /** Version of the image format produced by this engine */
+  virtual version_t version() const { return MARIA_BACKUP_VERSION; }
+  virtual result_t get_backup(const uint32, const Table_list &,
+                              Backup_driver* &);
+  virtual result_t get_restore(const version_t, const uint32,
+                               const Table_list &, Restore_driver* &);
+  /** Releasing the engine destroys the object itself */
+  virtual void free() { delete this; }
+};
+
+/*************************
+ *
+ *  BACKUP FUNCTIONALITY
+ *
+ *************************/
+
+class Object_backup;
+
+
+/**
+  Handles backup orders received from the backup kernel (implements the API).
+*/
+class Backup: public Backup_driver
+{
+public:
+  Backup(const Table_list &);
+  virtual ~Backup();
+  /** Estimates total size of backup. @todo improve it */
+  virtual size_t    size() { return UNKNOWN_SIZE; };
+  /** Estimates size of backup before lock. @todo improve it */
+  virtual size_t    init_size() { return UNKNOWN_SIZE; };
+  virtual result_t  begin(const size_t);
+  virtual result_t  end();
+  virtual result_t  get_data(Buffer &);
+  virtual result_t  prelock();
+  virtual result_t  lock();
+  virtual result_t  unlock();
+  virtual result_t  cancel()
+    {
+      return backup::OK ; // free() will be called and suffice
+    };
+  virtual void free() { delete this; };
+  void lock_tables_TL_READ_NO_INSERT();
+
+private:
+  enum { DUMPING_DATA_INDEX_FILES,
+         DUMPING_LOG_FILE_BEFORE_TABLES_ARE_LOCKED,
+         DUMPING_LOG_FILE_AFTER_TABLES_ARE_LOCKED,
+         DONE, ERROR } state;
+  Object_backup  *image; ///< object in backup currently
+  uint stream; ///< which stream we are currently writing to
+  char backup_log_name[FN_REFLEN];
+  /**
+    All db||table names in a HASH structure. Passed to Maria functions for
+    them to detect if a table is part of the backup (=> should do logging) or
+    not.
+  */
+  HASH *hash_of_tables;
+  /**
+     Locking of tables goes through these states. It is a delicate variable
+     which must be set correctly after inspecting thread-safety and race
+     conditions.
+  */
+  enum { LOCK_NOT_STARTED, LOCK_STARTED, LOCK_ACQUIRED, LOCK_ERROR }
+    lock_state;
+  /**
+    The locking thread (so that we can kill it). Creating a validity point is
+    only possible by locking all tables (it is the only way to have tables
+    consistent with each other, as we have no UNDO log). But locking via
+    thr_lock() is blocking. So, to have a non-blocking prelock() call, this
+    locking is done in a separate thread (named "the locking thread").
+  */
+  THD *lock_thd;
+  bool cannot_delete_lock_thd;
+  pthread_cond_t COND_lock_state; ///< for communication with locking thread
+  void kill_locking_thread();
+  static const size_t bytes_between_sleeps= 10*1024*1024;
+  /** After copying bytes_between_sleeps we sleep sleep_time */
+  ulong sleep_time;
+  size_t bytes_since_last_sleep; ///< how many bytes sent since we last slept
+};
+
+/* Needed for VisualAge 6.0 */
+const size_t Backup::bytes_between_sleeps;
+
+/**
+  When we send a backup packet to the backup kernel, we prefix it with a code
+  which tells which type of file this packet belongs to. Starts at 1 because
+  garbage is often zeros and we want to spot it.
+*/
+enum enum_file_code { DATA_FILE_CODE= 1,
+                      WHOLE_INDEX_FILE_CODE, HEADER_INDEX_FILE_CODE,
+                      LOG_FILE_CODE };
+
+/**
+  An object to backup; in practice, a table or the log.
+
+  Abstract base: get_data() and end() are pure virtual. The protected
+  'state' member is not set by this class; internal_error() merely reports
+  whether it currently equals ERROR.
+*/
+class Object_backup
+{
+public:
+  /** Fills the given Buffer with the next piece of backup data */
+  virtual result_t get_data(Buffer &)= 0;
+  virtual ~Object_backup() {};
+  /** @return true if 'state' is ERROR */
+  bool internal_error() { return state == ERROR; }
+  /**
+    The only reason to have an end() and call it from the destructor, instead
+    of putting the code into the destructor, is that when the caller does a
+    "delete image", it cannot be told about errors, while if the caller does
+    "image->end()" (and then "delete image") it can see an error.
+  */
+  virtual result_t end()= 0; ///< cleanups
+protected:
+  enum { OK, ERROR } state; ///< serves to detect an error during construction
+};
+
+
+/**
+  An object to back up is made of one or more such files. This class does not
+  open the file, user has to open it. This class provides a helper method if
+  its user wants to close the file.
+*/
+class File_backup
+{
+public:
+  /**
+    Builds an object not attached to any file. All members get a defined
+    value here, so that an object on which init() was never called holds no
+    indeterminate data (fd is -1, sizes are 0).
+  */
+  File_backup()
+    : fd(-1), file_size(0), file_code(DATA_FILE_CODE), backup_file_size(0) {}
+
+  /**
+    Initializes the object.
+
+    @param  fd_arg        file descriptor to attach to
+    @param  file_size_arg copy should stop after copying that many bytes
+    @param  file_code_arg code to store at start of each sent data packet
+  */
+
+  void init(int fd_arg, my_off_t file_size_arg, enum_file_code file_code_arg)
+    { fd= fd_arg; file_size= file_size_arg; file_code= file_code_arg; }
+
+  result_t get_data(Buffer &);
+  result_t close_file();
+private:
+  int fd; ///< file descriptor
+  /**
+    After backing up that many bytes of the file, we can stop. In case of
+    ftruncate() happening to the file, we may even copy less than this size.
+  */
+  my_off_t file_size;
+  enum_file_code file_code; ///< code stored at start of each backup block
+  my_off_t backup_file_size; ///< how much of the file we already backed up
+};
+
+
+/**
+  Handles backing up a single table.
+
+  Combines the table's names (Maria_table_ref) with the Object_backup
+  interface, and owns one File_backup for the data file and one for the
+  index file.
+*/
+class Table_backup: public Maria_table_ref, public Object_backup
+{
+public:
+  Table_backup(const backup::Table_ref &);
+  virtual ~Table_backup();
+  virtual result_t get_data(Buffer &);
+  virtual result_t end(); ///< cleanups
+private:
+  File_backup dfile_backup, kfile_backup;
+  enum { DATA_FILE, INDEX_FILE } in_file; ///< which file we are dumping now
+};
+
+
+/**
+  Handles backing up the log.
+
+  Keeps the pointer to the log's name as given to the constructor (the
+  string itself is not copied into this object) together with a File_backup
+  for the log file.
+*/
+class Log_backup: public Object_backup
+{
+public:
+  Log_backup(const char *);
+  virtual ~Log_backup();
+  virtual result_t get_data(Buffer &);
+  virtual result_t end();
+private:
+  const char *log_name;
+  File_backup log_file_backup;
+  bool log_deleted; ///< if we have already deleted the log or not
+};
+
+
+/**
+  Creates a backup driver, per the backup API. @see backup::Engine.
+
+  The output reference is written only when the driver was allocated.
+
+  @return Operation status
+    @retval backup::OK     driver created and stored in drv
+    @retval backup::ERROR  allocation failed, drv untouched
+*/
+
+result_t Engine::get_backup(const uint32, const Table_list &tables,
+                            Backup_driver* &drv)
+{
+  Backup *new_driver= new Backup(tables);
+  if (!unlikely(!new_driver))
+  {
+    drv= new_driver;
+    return backup::OK;
+  }
+  return backup::ERROR;
+}
+
+
+/**
+  Builds a driver which is not yet usable: state is ERROR until begin()
+  succeeds. Only cheap initializations are done here.
+
+  sleep_time and backup_log_name are given defined values too, so that no
+  member is indeterminate if begin() returns early or is never called.
+*/
+Backup::Backup(const Table_list &tables):
+  Backup_driver(tables), state(ERROR), image(NULL), stream(1),
+  hash_of_tables(NULL), lock_state(LOCK_NOT_STARTED), lock_thd(NULL),
+  cannot_delete_lock_thd(FALSE), sleep_time(0), bytes_since_last_sleep(0)
+{
+  /*
+    Driver is not ready at this point, so state is ERROR.
+    This constructor cannot fail, otherwise begin() would have to detect it.
+  */
+  backup_log_name[0]= '\0';
+  pthread_cond_init(&COND_lock_state, NULL);
+}
+
+
+/**
+  Kills the locking thread when it is time to unlock tables.
+
+  Does nothing if the locking thread was never started or has already
+  reported its death (lock_state is LOCK_NOT_STARTED or LOCK_ERROR).
+  Otherwise it waits (polling once per second) until the thread has
+  published its THD, awakes that THD with KILL_CONNECTION, and then blocks
+  on COND_lock_state until lock_state becomes LOCK_ERROR.
+*/
+
+void Backup::kill_locking_thread()
+{
+  DBUG_ENTER("maria_backup::Backup::kill_locking_thread");
+  /*
+    If everything worked well, when unlock() calls us we kill the thread and
+    so when free() calls us the locking thread is already dead here
+    (LOCK_ERROR).
+  */
+retry:
+  pthread_mutex_lock(&THR_LOCK_maria);
+  /*
+    If thread started and not already dead, kill it.
+    Both operands are plain comparisons, so the bitwise '&' below gives the
+    same result as a logical AND.
+  */
+  if ((lock_state != LOCK_NOT_STARTED) & (lock_state != LOCK_ERROR))
+  {
+    /*
+      If the locking thread has not yet created THD (very unlikely), wait
+      for it.
+    */
+    if (unlikely(lock_thd == NULL))
+    {
+      pthread_mutex_unlock(&THR_LOCK_maria);
+      DBUG_PRINT("info",("lock_thd not yet set"));
+      sleep(1);
+      goto retry;
+    }
+    /*
+      Locking thread had time to create its THD, may be inside table locking
+      (waiting for others to release locks etc), wake it up and kill it. Or it
+      may have locked tables successfully, and be waiting for us to kill it.
+      To do that we will use lock_thd, but how to be sure that lock_thd is not
+      being deleted now? One way would be to hold THR_LOCK_maria but
+      THD::awake() can't bear it (same mutex locked twice).
+      Another way is to take lock_thd->LOCK_delete (THD::awake() requires it
+      anyway), but again that requires that lock_thd is not deleted while we
+      access the mutex. We cannot hold THR_LOCK_maria to get LOCK_delete,
+      because that could deadlock if some other thread is doing a KILL on
+      the locking thread (it would indeed take LOCK_delete and then
+      THR_LOCK_maria to wake up the locking thread).
+      So we set a flag:
+    */
+    cannot_delete_lock_thd= TRUE;
+    pthread_mutex_unlock(&THR_LOCK_maria);
+    /*
+      So now lock_thd cannot be destroyed.
+      We kill the thread (which will in particular work if it is waiting for
+      some table locks).
+    */
+    pthread_mutex_lock(&lock_thd->LOCK_delete);
+    lock_thd->awake(THD::KILL_CONNECTION);
+    pthread_mutex_unlock(&lock_thd->LOCK_delete);
+    /* won't look at lock_thd anymore, allow its deletion */
+    pthread_mutex_lock(&THR_LOCK_maria);
+    cannot_delete_lock_thd= FALSE;
+    /* we wake up thread if it was blocked on the bool above */
+    pthread_cond_broadcast(&COND_lock_state);
+    /* And we wait for the thread to inform of its death */
+    while (lock_state != LOCK_ERROR)
+      pthread_cond_wait(&COND_lock_state, &THR_LOCK_maria);
+  }
+  pthread_mutex_unlock(&THR_LOCK_maria);
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  Destructor, reached only through the class' free().
+
+  Safe to run at any point of the driver's life: it undoes whatever is
+  still pending. In the normal (no error) case only the hash is left to
+  free; everything else was already released by earlier stages.
+
+  Order of operations: backup logging is stopped first, then the current
+  image and the hash of tables are released, and finally the locking thread
+  is killed and the condition variable destroyed.
+*/
+
+Backup::~Backup()
+{
+  DBUG_ENTER("maria_backup::Backup::~Backup");
+  /* Logging may still be active: stop it, not caring about consistency */
+  ma_log(MA_LOG_ACTION_CLOSE_INCONSISTENT, NULL, NULL);
+  delete image;
+  if (hash_of_tables != NULL)
+  {
+    HASH *doomed_hash= hash_of_tables;
+    my_hash_free(doomed_hash);
+    delete doomed_hash;
+    hash_of_tables= NULL;
+  }
+  kill_locking_thread();
+  pthread_cond_destroy(&COND_lock_state);
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  Key-extraction callback given to my_hash_init(): the hash record is a
+  LEX_STRING, its key is the string it points to.
+
+  @param      lsc     hash record (a LEX_STRING)
+  @param[out] length  set to the length of the key
+
+  @return pointer to the first byte of the key
+*/
+
+static uchar
+*backup_get_table_from_hash_key(const uchar *lsc, size_t *length,
+                                my_bool not_used __attribute__ ((unused)))
+{
+  const ::LEX_STRING &record= *reinterpret_cast<const ::LEX_STRING *>(lsc);
+  *length= record.length;
+  return reinterpret_cast< uchar *>(record.str);
+}
+
+
+/**
+  Record-freeing callback given to my_hash_init(): releases one hash record
+  with my_free().
+
+  @param  lsv  the record to free
+*/
+
+static void backup_free_hash_key(void *lsv)
+{
+  my_free(lsv, MYF(MY_WME));
+}
+
+
+/*
+  Error exit for functions returning result_t: sets the 'state' member in
+  scope to ERROR, traces the source location, and returns backup::ERROR.
+  NOTE(review): expands to a bare { } block, not do { } while (0), so
+  "if (x) MACRO; else ..." would not compile; keep it as the last statement
+  of an if without else.
+*/
+#define SET_STATE_TO_ERROR_AND_DBUG_RETURN {                                 \
+    state= ERROR;                                                       \
+    DBUG_PRINT("error",("driver got an error at %s:%d",__FILE__,__LINE__)); \
+    DBUG_RETURN(backup::ERROR); }
+
+/* same as above but returns no value; use this one only in constructors */
+#define SET_STATE_TO_ERROR_AND_DBUG_VOID_RETURN {                       \
+    state= ERROR;                                                       \
+    DBUG_PRINT("error",("driver got an error at %s:%d",__FILE__,__LINE__)); \
+    DBUG_VOID_RETURN; }
+
+
+/**
+  Sets Maria in a state ready for the copy to start. I.e. builds
+  a hash of tables and starts Maria physical logging for those tables.
+
+  Steps, in order:
+   - refuse to run if external locking is enabled;
+   - allocate and fill hash_of_tables with the real path of each table's
+     index file;
+   - compose the name of the backup log in the temporary directory;
+   - read the MARIA_BACKUP_NO_INDEX and MARIA_BACKUP_SLEEP environment
+     variables (a negative MARIA_BACKUP_SLEEP is treated as 0, i.e. no
+     throttling, instead of being converted to a huge unsigned delay);
+   - open the backup log.
+
+  @return Operation status
+    @retval backup::OK
+    @retval backup::ERROR
+*/
+
+result_t Backup::begin(const size_t)
+{
+  DBUG_ENTER("maria_backup::Backup::begin");
+  DBUG_PRINT("info",("%lu tables", m_tables.count()));
+
+  /*
+    per the API, all significant allocations (large mem, opening files) must
+    not be in the constructor but in begin() or later.
+  */
+  DBUG_ASSERT(!hash_of_tables); // no double begin() call or reuse of driver
+  DBUG_ASSERT(m_tables.count() > 0); // or bug in the backup kernel
+  /*
+    If external locking is on, some other processes may modify our tables
+    while we are copying them, those modifications will not reach the log,
+    backup will be corrupted.
+  */
+  if (!my_disable_locking
+#ifdef EXTERNAL_LOCKING
+ || !maria_single_user
+#endif
+)
+  {
+    my_error(ER_GET_ERRMSG, MYF(0),
+             MARIA_ERR_NO_BACKUP_WITH_EXTERNAL_LOCKING,
+             MARIA_ERR(MARIA_ERR_NO_BACKUP_WITH_EXTERNAL_LOCKING), "Maria");
+    SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+  }
+  hash_of_tables= new HASH;
+  if (!hash_of_tables ||
+      my_hash_init(hash_of_tables, &my_charset_bin, m_tables.count(), 0, 0,
+                   (my_hash_get_key)backup_get_table_from_hash_key,
+                   (my_hash_free_key)backup_free_hash_key, 0))
+    SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+  /* Build the hash of tables for the Maria layer (ma_log.c etc) */
+  for (uint n=0 ; n < m_tables.count() ; n++ )
+  {
+    char path[FN_REFLEN];
+    char unique_file_name[FN_REFLEN], *str;
+    size_t str_len;
+    ::LEX_STRING *hash_key;
+
+    /*
+      The internal_name is the translated table name with ASCII
+      characters only.
+    */
+    (void) m_tables[n].internal_name(path, sizeof(path));
+    if (my_realpath(unique_file_name,
+                    fn_format(unique_file_name, path, "", MARIA_NAME_IEXT,
+                              MY_UNPACK_FILENAME), MYF(MY_WME)))
+        SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+    str_len= strlen(unique_file_name);
+    /*
+      One allocation holds both the LEX_STRING and its characters, so
+      freeing hash_key frees the string too. The string is stored without a
+      terminating zero; its length is kept in hash_key->length.
+    */
+    my_multi_malloc(MYF(MY_WME),
+                    &hash_key, sizeof(*hash_key),
+                    &str, static_cast<uint>(str_len), NullS);
+    if (!hash_key)
+      SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+    memcpy(str, unique_file_name, str_len);
+    hash_key->length= str_len;
+    hash_key->str= str;
+    if (my_hash_insert(hash_of_tables,
+                       reinterpret_cast< uchar *>(hash_key)))
+    {
+      my_free(hash_key, MYF(MY_WME));
+      SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+    }
+    DBUG_PRINT("info",("table '%.*s' inserted in hash",
+                       static_cast<int>(hash_key->length), hash_key->str));
+  }
+
+  {
+    THD *thd= current_thd;
+    /*
+      If tmpdir is in RAM (/dev/shm etc), we may exhaust it if our log is big
+    */
+    my_snprintf(backup_log_name, sizeof(backup_log_name),
+                "%s/%s%lx_%lx_%x-backuplog", mysql_tmpdir,
+                tmp_file_prefix, current_pid, thd->thread_id,
+                thd->tmp_table++); // it's not a tmp table but what...
+    unpack_filename(backup_log_name, backup_log_name);
+  }
+
+  {
+    /**
+      Until there exists a framework by which the user tells, via SQL,
+      indications on how it wants the backup, and by which the backup kernel
+      tells it to the driver (API), we resort to this.
+    */
+    char *env_arg= getenv("MARIA_BACKUP_NO_INDEX");
+    int requested_sleep;
+    /* By default we log index pages */
+    ma_log_index_pages_physical= !(env_arg && atoi(env_arg));
+    env_arg= getenv("MARIA_BACKUP_SLEEP");
+    /*
+      By default we don't sleep at all; however, 500 ms every 10MB gives a
+      low penalty on clients, so it can be a good choice.
+      A negative value makes no sense as a delay and, stored into the
+      unsigned sleep_time, would become an enormous one: treat it as 0.
+    */
+    requested_sleep= env_arg ? atoi(env_arg) : 0;
+    sleep_time= (requested_sleep > 0) ? requested_sleep : 0;
+  }
+
+  if (ma_log(MA_LOG_ACTION_OPEN, backup_log_name, hash_of_tables))
+    SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+
+  state= DUMPING_DATA_INDEX_FILES;
+  DBUG_RETURN(backup::OK);
+}
+
+
+/**
+  End-of-backup notification from the backup kernel; intentionally does
+  nothing.
+
+  If some error happened, end() is not called but free() is. So we do all
+  cleanup in free() i.e. in the destructor, and nothing here.
+
+  @return Operation status
+    @retval backup::OK  always
+*/
+
+result_t Backup::end()
+{
+  DBUG_ENTER("maria_backup::Backup::end");
+  DBUG_RETURN(backup::OK);
+}
+
+
+/**
+  Sends backup data for tables and log to the backup kernel.
+
+  Each call forwards to the get_data() of the object currently being
+  backed up (a table on streams 1..N, then the log on stream 0), creating
+  that object first if needed, optionally throttles itself, and moves to
+  the next object when the current one reports its last chunk. While the
+  log is being sent before tables are locked, it also polls the locking
+  thread's progress.
+
+  @param  buf             reference to Buffer where data should be put
+
+  @return Operation status (see the API for when they are returned)
+    @retval backup::OK
+    @retval backup::DONE
+    @retval backup::READY
+    @retval backup::ERROR
+*/
+
+result_t Backup::get_data(Buffer &buf)
+{
+  result_t ret;
+  /* tag uses the real namespace name, like all other methods of the class */
+  DBUG_ENTER("maria_backup::Backup::get_data");
+  DBUG_PRINT("enter",("stream %u",stream));
+
+  /* we are currently on stream 'stream' */
+  buf.table_num= stream;
+
+  /*
+    Rafal and I agreed that one single ERROR from the driver will cause the
+    upper layer to not call the driver anymore except for free().
+  */
+  DBUG_ASSERT(state != ERROR);
+  DBUG_ASSERT(buf.data != NULL); // to check that caller gave room
+
+  if (state == DONE)
+  {
+    /*
+      We never come here, because after returning from the call where we sent
+      the last piece of the last stream (when we set our internal state to
+      DONE), all streams were closed, so the upper layer wouldn't call us
+      again. At least it was so during testing. But if it calls us, we do all
+      that the API expects us to do:
+    */
+    buf.size= buf.table_num= 0;
+    buf.last= TRUE;
+    DBUG_RETURN(backup::DONE);
+  }
+
+  if (unlikely(image == NULL))
+  {
+    /*
+      Let's create it.
+      Table 0 will be image 1 on stream 1. Table N will be image N+1 on stream
+      N+1. Log will be image 0 on stream 0.
+    */
+    if (stream >= 1)
+      image= new Table_backup(m_tables[stream-1]);
+    else
+      image= new Log_backup(backup_log_name);
+    if (image == NULL || image->internal_error())
+      SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+  }
+
+  if ((ret= image->get_data(buf)) != backup::OK)
+    SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+
+  if (sleep_time)
+  {
+    bytes_since_last_sleep+= buf.size;
+    /* sched_yield() is not as flexible (higher penalty) as sleep() */
+    if (bytes_since_last_sleep > bytes_between_sleeps)
+    {
+      /* sleep_time is in milliseconds, my_sleep() is given microseconds */
+      my_sleep(sleep_time * 1000UL);
+      bytes_since_last_sleep= 0;
+    }
+  }
+
+  if (state == DUMPING_LOG_FILE_BEFORE_TABLES_ARE_LOCKED)
+  {
+    DBUG_ASSERT(stream == 0);
+    /*
+      We are sending the log; even if reached its EOF, some more may be
+      appended to it before prelock() ends, so this is not the stream's end.
+    */
+    buf.last= FALSE;
+    /*
+      API docs say we should return READY, but Rafal says OK is better (one
+      READY to signal end of initial phase; then OKs; one READY to signal end
+      of prelock(); then OKs).
+    */
+    if (lock_state == LOCK_NOT_STARTED)
+      DBUG_RETURN(backup::OK);
+    /* Let's see if the locking thread has finished locking all tables */
+    pthread_mutex_lock(&THR_LOCK_maria);
+    if (lock_state == LOCK_STARTED) // not yet
+    {
+      pthread_mutex_unlock(&THR_LOCK_maria);
+      DBUG_RETURN(backup::OK);
+    }
+    if (lock_state !=  LOCK_ACQUIRED) // it failed, so do we
+    {
+      pthread_mutex_unlock(&THR_LOCK_maria);
+      SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+    }
+    DBUG_PRINT("info",("locking thread acquired locks on tables"));
+
+    pthread_mutex_unlock(&THR_LOCK_maria);
+    /* tables are locked: logging can be stopped in a consistent way */
+    if (ma_log(MA_LOG_ACTION_CLOSE_CONSISTENT, NULL, NULL))
+      SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+    state= DUMPING_LOG_FILE_AFTER_TABLES_ARE_LOCKED;
+    DEBUG_SYNC(current_thd, "maria_locking_thread_added");
+    /* signal "end of prepare-for-lock, ready for lock()" */
+    DBUG_RETURN(backup::READY);
+  }
+  else if (buf.last)
+  {
+    /*
+      we are sending the last chunk of the image, next call will be about the
+      next image:
+    */
+    if (image->end() != backup::OK)
+      SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+    delete image;
+    image= NULL; /* next call of this function should open the next object */
+    stream++;    /* and send it on the next stream */
+    if (state == DUMPING_DATA_INDEX_FILES && stream > m_tables.count())
+    {
+      /* all tables done */
+      stream= 0; // send the log on stream 0
+      state= DUMPING_LOG_FILE_BEFORE_TABLES_ARE_LOCKED;
+      ret= backup::READY; // end of initial phase
+    }
+    else if (state == DUMPING_LOG_FILE_AFTER_TABLES_ARE_LOCKED) // log done
+      state= DONE;
+  }
+
+  DBUG_RETURN(ret);
+}
+
+
+/**
+  Creates a validity point by locking all tables. This is the only job of the
+  locking thread: call this function which locks tables, then wait for being
+  killed (which will unlock tables).
+
+  The function builds a THD for the calling thread, publishes it in
+  lock_thd, opens and locks the tables of the backup with
+  TL_READ_NO_INSERT, sets lock_state to LOCK_ACQUIRED and sleeps on
+  COND_lock_state until the THD is killed. On every exit path, successful
+  or not, lock_state ends up as LOCK_ERROR and COND_lock_state is
+  broadcast, which is how the master thread learns that this thread is
+  finished.
+
+  If the THD could not be allocated, the cleanup at the end skips the
+  operations which need it instead of dereferencing a NULL pointer.
+
+  @todo GUILHEM_TODO use sql/backup/be_thread.cc instead.
+
+  @todo use a method which does not open closed tables. This will be needed
+  when backing up lots of tables (more than the limit of open file
+  descriptors).
+*/
+
+void Backup::lock_tables_TL_READ_NO_INSERT()
+{
+  THD *thd;
+  TABLE_LIST *tables_in_TABLE_LIST_form=NULL ; ///< for open_and_lock_tables()
+  const char thread_name[]= "Maria driver locking thread";
+  /* tag uses the real namespace name, like all other methods of the class */
+  DBUG_ENTER("maria_backup::Backup::lock_tables_TL_READ_NO_INSERT");
+
+  thd= new THD;
+  if (unlikely(!thd))
+    goto end2;
+  thd->thread_stack = reinterpret_cast< char *>(&thd);
+  pthread_mutex_lock(&LOCK_thread_count);
+  thd->thread_id= thread_id++;
+  pthread_mutex_unlock(&LOCK_thread_count);
+  if (unlikely(thd->store_globals())) // for a proper MEM_ROOT
+    goto end2;
+  thd->init_for_queries(); // opening tables needs a proper LEX
+  thd->command= COM_DAEMON;
+  thd->system_thread= SYSTEM_THREAD_BACKUP;
+  thd->version= refresh_version;
+  thd->set_time();
+  thd->main_security_ctx.host_or_ip= "";
+  thd->client_capabilities= 0;
+  my_net_init(&thd->net, 0);
+  thd->main_security_ctx.master_access= ~0;
+  thd->main_security_ctx.priv_user= 0;
+  thd->real_id= pthread_self();
+  /*
+    Making this thread visible to SHOW PROCESSLIST is useful for
+    troubleshooting a backup job (why does it stall etc).
+  */
+  pthread_mutex_lock(&LOCK_thread_count);
+  threads.append(thd);
+  pthread_mutex_unlock(&LOCK_thread_count);
+  /*
+    Set info for the process list. Used in test cases.
+  */
+  thd->query= (char*) thread_name;
+  thd->query_length= sizeof(thread_name) - 1;
+
+  lex_start(thd);
+  mysql_reset_thd_for_next_command(thd);
+  /*
+    As locking tables can be a long operation, we need to support
+    cancellability during that time. So we publish our THD now to the thread
+    which created us (the "master" thread), so that it can kill us early if
+    needed.
+  */
+  pthread_mutex_lock(&THR_LOCK_maria);
+  lock_thd= thd;
+  pthread_mutex_unlock(&THR_LOCK_maria);
+  /*
+    We need TL_READ_NO_INSERT (and not TL_READ) because we want to prevent
+    concurrent inserts (we indeed need to freeze the tables to correspond to
+    a position in the binlog).
+  */
+  tables_in_TABLE_LIST_form=
+    backup::build_table_list(m_tables, TL_READ_NO_INSERT);
+  if (!tables_in_TABLE_LIST_form)
+    goto end2;
+  if (open_and_lock_tables(thd, tables_in_TABLE_LIST_form))
+    goto end;
+
+  DBUG_PRINT("info",("Maria backup locking thread got locks"));
+  pthread_mutex_lock(&THR_LOCK_maria);
+  thd->enter_cond(&COND_lock_state, &THR_LOCK_maria,
+                  "Maria backup: holding table locks");
+  /* show master thread that we got locks */
+  lock_state= LOCK_ACQUIRED;
+  /* and wait for it to kill us */
+  while (!thd->killed)
+    pthread_cond_wait(&COND_lock_state, &THR_LOCK_maria);
+  thd->exit_cond("Maria backup: terminating");
+
+end:
+  DBUG_PRINT("info",("Maria backup locking thread dying"));
+  close_thread_tables(thd);
+end2:
+  pthread_mutex_lock(&THR_LOCK_maria);
+  while (cannot_delete_lock_thd)
+  {
+    /* master thread is looking at our THD; wait for authorization */
+    pthread_cond_wait(&COND_lock_state, &THR_LOCK_maria);
+  }
+  /* LOCK_ERROR doubles as "the locking thread is gone" for the master */
+  lock_state= LOCK_ERROR;
+  pthread_cond_broadcast(&COND_lock_state);
+  pthread_mutex_unlock(&THR_LOCK_maria);
+  backup::free_table_list(tables_in_TABLE_LIST_form);
+  /*
+    thd is NULL if we came here because its allocation failed: there is
+    then no net to end and nothing to delete.
+    NOTE(review): thd was appended to 'threads' under LOCK_thread_count but
+    is deleted here without that mutex; confirm this is safe against
+    concurrent walkers of the thread list.
+  */
+  if (thd)
+  {
+    net_end(&thd->net);
+    delete thd;
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+/** Entry point for the locking thread */
+
+pthread_handler_t maria_backup_separate_thread_for_locking(void *arg)
+{
+  /* "arg" is the Backup driver which prelock() handed to pthread_create() */
+  Backup *backup_driver= static_cast<Backup *>(arg);
+
+  my_thread_init();
+  DBUG_PRINT("info", ("maria_backup::separate_thread_for_locking"));
+  pthread_detach_this_thread();
+  /* all the real work (locking, waiting, unlocking) happens in there */
+  backup_driver->lock_tables_TL_READ_NO_INSERT();
+  my_thread_end();
+  pthread_exit(0);
+  return 0; /* pthread_exit() does not return; this satisfies the prototype */
+}
+
+
+/**
+  Launches a separate thread ("locking thread") which will lock
+  tables. Locking in a separate thread is needed to have a non-blocking
+  prelock() (given that thr_lock() is blocking). prelock() is indeed not
+  allowed to block, or it would block the entire backup kernel (see "HOW THE
+  BACKUP WORKS" at the start of this file).
+
+  @return Operation status
+    @retval backup::OK
+    @retval backup::ERROR
+*/
+
+result_t Backup::prelock()
+{
+  pthread_t locking_thread;
+  DBUG_ENTER("maria_backup::Backup::prelock");
+  /* we are going to launch a thread, we need to remember to kill it */
+  lock_state= LOCK_STARTED;
+  if (pthread_create(&locking_thread, &connection_attrib,
+                     maria_backup_separate_thread_for_locking, this) != 0)
+  {
+    /* the thread could not be created: record the failure */
+    lock_state= LOCK_ERROR;
+    SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+  }
+  /* the new thread now takes the locks on its own; we do not wait for it */
+  DBUG_RETURN(backup::OK);
+}
+
+
+/**
+  Does nothing: the thread launched by prelock() is the one taking the table
+  locks.
+
+  @return Always backup::OK
+*/
+result_t Backup::lock()
+{
+  DBUG_ENTER("maria_backup::Backup::lock");
+  /* locking was done in prelock() already, nothing to do */
+  DBUG_RETURN(backup::OK);
+}
+
+
+/**
+  Releases the table locks by asking the locking thread to terminate.
+
+  @return Always backup::OK (the outcome of kill_locking_thread() is not
+  examined here)
+*/
+result_t Backup::unlock()
+{
+  DBUG_ENTER("maria_backup::Backup::unlock");
+  /* kill the locking thread which owns table locks, it will unlock them */
+  kill_locking_thread();
+  DBUG_RETURN(backup::OK);
+}
+
+
+/**
+  Backs up the log.
+
+  @todo For now we read the log file from disk. We could instead try to
+  "steal" it from its IO_CACHE; that might reduce the log portion which goes
+  to disk, if the backup thread is fast enough to catch up on client threads
+  filling the log.
+*/
+
+Log_backup::Log_backup(const char *log_name_arg)
+  : log_name(log_name_arg),
+    log_deleted(FALSE)
+{
+  DBUG_ENTER("maria_backup::Log_backup::Log_backup");
+  /* the log is only read during backup, so a read-only descriptor suffices */
+  const int log_fd= my_open(log_name, O_RDONLY, MYF(MY_WME));
+  if (log_fd < 0)
+    SET_STATE_TO_ERROR_AND_DBUG_VOID_RETURN;
+  /*
+    The log is the only object on the shared stream for now, so
+    LOG_FILE_CODE carries no routing information; it merely lets restore
+    verify that what it receives is really a log.
+    The size limit is "all ones", i.e. no practical limit.
+  */
+  log_file_backup.init(log_fd, ~(ULL(0)), LOG_FILE_CODE);
+  state= OK;
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  Closes and deletes the log.
+
+  @return Operation status
+    @retval backup::OK
+    @retval backup::ERROR
+*/
+
+result_t Log_backup::end()
+{
+  DBUG_ENTER("maria_backup::Log_backup::end");
+  /*
+    Either the log is safely in the stream or the backup is cancelled: in
+    both cases the file is not needed anymore.
+  */
+  if (log_file_backup.close_file() != backup::OK)
+    SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+  /* delete only once: end() is also called from the destructor */
+  if (!log_deleted && my_delete(log_name, MYF(MY_WME)))
+    SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+  log_deleted= TRUE;
+  DBUG_RETURN(backup::OK);
+}
+
+
+Log_backup::~Log_backup()
+{
+  /*
+    When end() has already succeeded, this second call finds nothing left to
+    delete. Its return code is deliberately dropped: the calls which can
+    fail inside end() are made with MY_WME, so they report the error
+    themselves.
+  */
+  (void) end();
+}
+
+
+/** The header of a MAI index file always fits in this size */
+#define MAX_INDEX_HEADER_SIZE (64*1024)
+
+
+/**
+  Opens a Maria table for backing it up.
+
+  @param  tbl             The table to open
+*/
+
+Table_backup::Table_backup(const backup::Table_ref &tbl) :
+  Maria_table_ref(tbl)
+{
+  MARIA_HA *mi_info;
+  File dfiledes= -1, kfiledes= -1;
+  my_off_t file_size;
+  DBUG_ENTER("maria_backup::Table_backup::Table_backup");
+  DBUG_PRINT("info",("Initializing backup image for table %s",
+                     file_name.ptr()));
+  /*
+    Here we use low-level maria_* functions as all we want is a pair of file
+    descriptors.
+    O_RDONLY is not ok, as it forces all instances of the table to be
+    read-only (sets HA_OPTION_READ_ONLY_DATA of share->options).
+    We don't use HA_OPEN_FOR_REPAIR so will fail to back up a known corrupted
+    table (would be a corrupted backup).
+  */
+  mi_info= maria_open(file_name.ptr(), O_RDWR, 0);
+  if (!mi_info) // table does not exist or is corrupted? backup not ok
+    goto err;
+  /*
+    we create our own descriptors, to use my_read() (faster than my_pread()
+    which may use mutex).
+  */
+  dfiledes= my_open(mi_info->s->data_file_name.str, O_RDONLY, MYF(MY_WME));
+  kfiledes= my_open(mi_info->s->unique_file_name.str, O_RDONLY, MYF(MY_WME));
+  if ((dfiledes < 0) || (kfiledes < 0))
+    goto err;
+  /* the table instance was only needed to learn the file names */
+  maria_close(mi_info);
+  mi_info= NULL;
+  /* measure the data file, then rewind as we read it sequentially */
+  file_size= my_seek(dfiledes, 0, SEEK_END, MYF(MY_WME));
+  if (file_size == MY_FILEPOS_ERROR ||
+      my_seek(dfiledes, 0, SEEK_SET, MYF(MY_WME)) == MY_FILEPOS_ERROR)
+    goto err;
+  dfile_backup.init(dfiledes, file_size, DATA_FILE_CODE);
+  if (ma_log_index_pages_physical)
+  {
+    /* the whole index file goes into the backup */
+    file_size= my_seek(kfiledes, 0, SEEK_END, MYF(MY_WME));
+    if (file_size == MY_FILEPOS_ERROR ||
+        my_seek(kfiledes, 0, SEEK_SET, MYF(MY_WME)) == MY_FILEPOS_ERROR)
+      goto err;
+    kfile_backup.init(kfiledes, file_size, WHOLE_INDEX_FILE_CODE);
+  }
+  else
+    kfile_backup.init(kfiledes,
+                      MAX_INDEX_HEADER_SIZE /* upper limit */ ,
+                      HEADER_INDEX_FILE_CODE);
+  in_file= DATA_FILE; // dump the data file first (no specific reason)
+  state= OK;
+  DBUG_VOID_RETURN;
+  /*
+    Note: we are copying an index file of a table, which may have instances in
+    the MySQL table cache, so after restore it will show up as
+    "warning: 1 client is using or hasn't closed the table properly".
+    Maybe do a quick index update on the table at the end of restore to
+    remove this warning. But how to know if the problem pre-dates backup ?
+  */
+err:
+  /*
+    Both descriptors start at -1 ("not opened") and a failed my_open() is
+    detected above with "< 0", so anything >= 0 is an open descriptor; 0 is
+    a valid descriptor too and must be closed like any other.
+  */
+  if (dfiledes >= 0)
+    my_close(dfiledes, MYF(MY_WME));
+  if (kfiledes >= 0)
+    my_close(kfiledes, MYF(MY_WME));
+  if (mi_info != NULL)
+    maria_close(mi_info);
+  SET_STATE_TO_ERROR_AND_DBUG_VOID_RETURN;
+}
+
+
+/**
+  Closes the Maria table.
+
+  @return Operation status
+    @retval backup::OK
+    @retval backup::ERROR
+*/
+
+result_t Table_backup::end()
+{
+  DBUG_ENTER("maria_backup::Table_backup::end");
+  /*
+    Both files are closed unconditionally, so that a failure on the data
+    file does not leave the index file open.
+  */
+  const bool dfile_failed= (dfile_backup.close_file() != backup::OK);
+  const bool kfile_failed= (kfile_backup.close_file() != backup::OK);
+  if (dfile_failed || kfile_failed)
+    SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+  DBUG_RETURN(backup::OK);
+}
+
+
+Table_backup::~Table_backup()
+{
+  /*
+    Safety net: closes whatever file is still open. After a successful
+    end() there is nothing left to close. The return code is dropped.
+  */
+  (void) end();
+}
+
+
+/**
+  Sends backup data for one table to the backup kernel.
+
+  @param  buf             reference to Buffer where data should be put
+
+  @return Operation status
+    @retval backup::OK
+    @retval backup::ERROR
+*/
+
+result_t Table_backup::get_data(Buffer &buf)
+{
+  result_t ret;
+  DBUG_ENTER("maria_backup::Table_backup::get_data");
+  if (in_file == DATA_FILE)
+  {
+    ret= dfile_backup.get_data(buf);
+    if (buf.last)
+    {
+      /*
+        The data file is exhausted but the index file still has to go on
+        the same stream: switch files and hide the "last buffer" mark.
+      */
+      in_file= INDEX_FILE;
+      buf.last= FALSE;
+    }
+  }
+  else if (in_file == INDEX_FILE)
+  {
+    /* buf.last set here really means end of the table's stream */
+    ret= kfile_backup.get_data(buf);
+  }
+  else
+  {
+    /* no other file kind exists */
+    DBUG_ASSERT(0);
+    ret= backup::ERROR;
+  }
+  if (ret != backup::OK)
+    SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+  DBUG_RETURN(ret);
+}
+
+
+/**
+  Sends backup data for the log to the backup kernel.
+
+  @param  buf             reference to Buffer where data should be put
+
+  @return Operation status
+    @retval backup::OK
+    @retval backup::ERROR
+*/
+
+result_t Log_backup::get_data(Buffer &buf)
+{
+  DBUG_ENTER("maria_backup::Log_backup::get_data");
+  /* first fetch the next piece of the log file */
+  if (log_file_backup.get_data(buf) != backup::OK)
+    SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+  /*
+    Then, on every packet, check whether the Maria maria_log* and ma_log*
+    functions have hit a log write error: a log which could not be fully
+    written makes the backup unusable.
+  */
+  if (maria_physical_log.hard_write_error_in_the_past == -1)
+    SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+  DBUG_RETURN(backup::OK);
+}
+
+
+/**
+  Closes a file in backup.
+
+  @return Operation status
+    @retval backup::OK
+    @retval backup::ERROR
+*/
+
+result_t File_backup::close_file()
+{
+  /* a negative descriptor means "not open": closing again is harmless */
+  if (fd < 0)
+    return backup::OK;
+  const int close_error= my_close(fd, MYF(MY_WME));
+  /* forget the descriptor whatever the outcome, so we never close twice */
+  fd= -1;
+  if (close_error)
+    return backup::ERROR;
+  return backup::OK;
+}
+
+
+/**
+  Sends backup data for a single file to the backup kernel.
+
+  @param  buf             reference to Buffer where data should be put
+
+  @return Operation status
+    @retval backup::OK
+    @retval backup::ERROR
+*/
+
+result_t File_backup::get_data(Buffer &buf)
+{
+  result_t  status= backup::OK;
+  size_t    room= buf.size;   /* capacity offered by the caller */
+  size_t    bytes_read= 0;    /* stays 0 if the size limit is reached */
+
+  DBUG_ENTER("maria_backup::File_backup::get_data");
+
+  /* the first byte of every packet tells restore which file this is */
+  buf.size= 1;
+  DBUG_ASSERT(room >= 2); // need at least 2 bytes
+  *buf.data= static_cast<uchar>(file_code);
+  room--;
+
+  /* once file_size bytes were sent, the rest of the file is not read */
+  if (backup_file_size < file_size)
+    bytes_read= my_read(fd, buf.data + 1, room, MYF(MY_WME));
+
+  if (bytes_read == (size_t)(-1))
+    status= backup::ERROR;
+  else
+  {
+    backup_file_size+= bytes_read;
+    if (bytes_read == 0)
+    {
+      /* end of file: don't even send a packet */
+      buf.size= 0;
+      buf.last= TRUE;
+    }
+    else
+    {
+      buf.size+= bytes_read;
+      buf.last= FALSE;
+    }
+  }
+
+  DBUG_PRINT("info",("ret %d buf.last %d buf.size %u",
+                     status, buf.last, static_cast<uint>(buf.size)));
+  DBUG_RETURN(status);
+}
+
+
+/**************************************
+ *
+ *   RESTORE FUNCTIONALITY
+ *
+ **************************************/
+
+class Object_restore;
+
+/**
+  Handles restore orders received from the backup kernel (implements the
+  API).
+*/
+class Restore: public Restore_driver
+{
+public:
+  Restore(const Table_list &tables);
+  virtual ~Restore();
+  /** Builds the temporary log's name and allocates the images array */
+  virtual result_t  begin(const size_t);
+  /** Closes all images, applies the log, runs per-table post-restore work */
+  virtual result_t  end();
+  /** Routes one buffer to the image of its stream (buf.table_num) */
+  virtual result_t  send_data(Buffer &buf);
+  virtual result_t  cancel()
+    {
+      /* Nothing to do in cancel(); free() will suffice */
+      return backup::OK;
+    };
+  /** Destroys the driver; the destructor deletes all images */
+  virtual void      free() { delete this; };
+
+private:
+  /** Initialized to ERROR by the constructor, set to PUMPING by begin() */
+  enum { PUMPING, DONE, ERROR } state;
+  uint            images_left; ///< how many images left to restore
+  Object_restore  **images; ///< one for the log and one per table
+  /** Name of the temporary file receiving the restored log, set by begin() */
+  char restore_log_name[FN_REFLEN];
+};
+
+
+/** An object to restore; in practice, a table or the log */
+class Object_restore
+{
+public:
+  /** Receives one buffer of backup data belonging to this object */
+  virtual result_t send_data(const Buffer &buf)= 0;
+  virtual ~Object_restore() {};
+  /**
+    Closes the object, post_restore() can later be called. Whereas in
+    Object_backup, closing is done in end() (there is no close()), here we
+    have a dedicated close() method. This is because we must close tables and
+    the log then apply the log then repair indices: we need to close way
+    before end()).
+  */
+  virtual result_t close()= 0;
+  /** Does additional restore operations between close() and end() */
+  virtual result_t post_restore()= 0;
+  /** Tells if the object is in error state (e.g. its constructor failed) */
+  bool internal_error() { return state == ERROR; }
+  virtual result_t end()= 0; ///< cleanups
+protected:
+  /** Not set by this class: each derived class must assign it */
+  enum { OK, ERROR } state;
+};
+
+
+/**
+  An object to restore is made of one or more such files. This class does not
+  open the file, user has to open it. This class provides a helper method if
+  its user wants to close the file.
+*/
+class File_restore
+{
+public:
+  /** Starts with no file attached (descriptor -1) */
+  File_restore() : fd(-1) {}
+  /** Attaches an already open descriptor; this class never opens files */
+  void init(int fd_arg) { fd= fd_arg; }
+  /** Handles one buffer of backup data for this file (defined elsewhere) */
+  result_t send_data(const Buffer &);
+  /** Helper for users wanting to close the file (defined elsewhere) */
+  result_t close_file();
+private:
+  int fd; ///< file descriptor
+};
+
+
+/** Handles restoring a single table */
+class Table_restore: public Object_restore, public Maria_table_ref
+{
+public:
+  /** Opens the table's files; on failure the object reports an error */
+  Table_restore(const Table_ref &tbl);
+  virtual result_t send_data(const Buffer &buf);
+  /** Calls end() */
+  virtual ~Table_restore();
+  /** Closes the data and index file descriptors */
+  virtual result_t close();
+  /** Repairs the table's index if needed, after the log was applied */
+  virtual result_t post_restore();
+  virtual result_t end(); ///< cleanups
+ private:
+  /** Helpers for the data file and the index file, respectively */
+  File_restore dfile_restore, kfile_restore;
+  bool         rebuild_index; ///< if we have to rebuild index or not
+  THD          *thd; ///< rebuilding index requires a THD
+};
+
+
+/** Handles restoring the log */
+class Log_restore: public Object_restore
+{
+public:
+  /** Creates the file which will receive the restored log */
+  Log_restore(const char *log_name_arg);
+  virtual result_t send_data(const Buffer &buf);
+  /** Calls end() */
+  virtual ~Log_restore();
+  /** Closes the log file */
+  virtual result_t close();
+  /** Applies the restored log to the restored tables */
+  virtual result_t post_restore();
+  /** Closes the log file and deletes it */
+  virtual result_t end();
+private:
+  /** Not copied: the pointer given to the constructor must stay valid */
+  const char *log_name;
+  File_restore log_file_restore;
+  bool log_deleted; ///< if we have already deleted the log or not
+};
+
+
+/**
+  Creates a restore driver, per the backup API. @see backup::Engine.
+
+  @return Operation status
+    @retval backup::OK
+    @retval backup::ERROR
+*/
+
+result_t Engine::get_restore(const version_t ver, const uint32,
+                             const Table_list &tables, Restore_driver* &drv)
+{
+  /* refuse an image whose version is higher than the one we implement */
+  if (ver > MARIA_BACKUP_VERSION)
+  {
+    char message[200];
+    my_snprintf(message, sizeof(message),
+                MARIA_ERR(MARIA_ERR_BACKUP_TOO_RECENT),
+                ver, MARIA_BACKUP_VERSION);
+    my_error(ER_GET_ERRMSG, MYF(0),
+             MARIA_ERR_BACKUP_TOO_RECENT, message, "Maria");
+    return backup::ERROR;
+  }
+
+  Restore *restore_driver= new Restore(tables);
+  if (unlikely(restore_driver == NULL))
+    return backup::ERROR;
+  /* "drv" is assigned only on success */
+  drv= restore_driver;
+  return backup::OK;
+}
+
+
+Restore::Restore(const Table_list &tables)
+  : Restore_driver(tables),
+    state(ERROR),      /* stays so until begin() switches to PUMPING */
+    images_left(0),
+    images(NULL)       /* allocated by begin() */
+{
+  /*
+    Nothing here may fail: begin() has no way to notice a failed
+    construction.
+  */
+}
+
+
+/**
+  This destructor is only called by the class' free(). It cleans up any
+  leftover the driver could have. It is safe to call it at any point. In a
+  normal (no error) situation, it does nothing, all should already have been
+  done by earlier stages.
+*/
+
+Restore::~Restore()
+{
+  DBUG_ENTER("maria_backup::Restore::~Restore");
+  if (images != NULL)
+  {
+    /* one slot for the log plus one per table; unused slots are NULL */
+    const uint image_count= 1 + m_tables.count();
+    for (uint slot= 0; slot < image_count; slot++)
+      delete images[slot];
+    delete[] images;
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  Sets Maria in a state ready for us to restore. I.e. creates a temporary
+  file to host the log's restored copy.
+
+  @return Operation status
+    @retval backup::OK
+    @retval backup::ERROR
+*/
+
+result_t Restore::begin(const size_t)
+{
+  THD *thd= current_thd;
+  DBUG_ENTER("maria_backup::Restore::begin");
+  /*
+    Name of the temporary file for the restored log: built from the tmp
+    directory, the process id, the thread id and a per-thread counter.
+  */
+  my_snprintf(restore_log_name, sizeof(restore_log_name),
+              "%s/%s%lx_%lx_%x-restorelog", mysql_tmpdir,
+              tmp_file_prefix, current_pid, thd->thread_id,
+              thd->tmp_table++);
+  unpack_filename(restore_log_name, restore_log_name);
+
+  DBUG_ASSERT(m_tables.count() > 0); // or bug in the backup kernel
+  /* one image for the log, plus one per table */
+  images_left= m_tables.count() + 1;
+  images= new Object_restore*[images_left];
+  if (unlikely(images == NULL))
+    SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+  /* images are created lazily by send_data(): start with all slots empty */
+  bzero(images, sizeof(*images) * images_left);
+  state= PUMPING;
+  DBUG_RETURN(backup::OK);
+}
+
+
+/**
+  If no error happened, we have to apply the log and possibly repair
+  indexes; this has to be done here and not in the destructor (as it has to
+  be done only in case of success, while a destructor runs in all cases).
+  Because we have no "end of stream" notifications yet, when we come here all
+  our tables/logs are still open, and log is not applied (both things which could
+  be done in send_data() if we knew end-of-stream). Repairing indexes, on the
+  other hand, really has to be done here.
+
+  @todo selective restore (this is just passing a proper function which
+  checks if the table is in a hash of tables).
+
+  @return Operation status
+    @retval backup::OK
+    @retval backup::ERROR
+*/
+
+result_t Restore::end()
+{
+  DBUG_ENTER("maria_backup::Restore::end");
+  /*
+    Rafal said currently end() is called in case of error but said he'll fix
+    that (only free() will be called)
+  */
+  DBUG_ASSERT(state != ERROR);
+  if (images != NULL)
+  {
+    const uint last_slot= m_tables.count();
+    uint slot;
+
+    /* Step 1: close the log and all tables */
+    for (slot= 0; slot <= last_slot; slot++)
+    {
+      Object_restore *image= images[slot];
+      if (image != NULL && image->close() != backup::OK)
+        SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+    }
+
+    /*
+      Step 2: everything is closed. Slot 0 comes first, so the backup log
+      gets applied (if it exists; it does not if it was empty at backup
+      time) before indices are repaired where needed (other slots).
+    */
+    for (slot= 0; slot <= last_slot; slot++)
+    {
+      Object_restore *image= images[slot];
+      if (image != NULL && image->post_restore() != backup::OK)
+        SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+    }
+
+    /*
+      Step 3: end the log here rather than in the destructor, to be able to
+      test the return code. Tables do nothing in end() (except freeing
+      their memory), so for them it is left to the destructor.
+    */
+    Object_restore *log_image= images[0];
+    if (log_image != NULL && log_image->end() != backup::OK)
+      SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+  }
+
+  DBUG_RETURN(backup::OK);
+}
+
+
+/**
+  Receives and restores data for tables and log from the backup kernel.
+
+  @param  buf             reference to Buffer where data is
+
+  @return Operation status
+    @retval backup::OK
+    @retval backup::ERROR
+*/
+
+result_t Restore::send_data(Buffer &buf)
+{
+  result_t ret;
+  uint stream= buf.table_num;
+  DBUG_ENTER("maria_backup::Restore::send_data");
+  DBUG_PRINT("enter",("Got packet with %u bytes from stream %d",
+                      static_cast<uint>(buf.size), buf.table_num));
+
+  if (state == DONE)
+  {
+    /* we never come here */
+    DBUG_PRINT("info",("Ignoring the packet (all objects already restored)"));
+    DBUG_RETURN(backup::DONE);
+  }
+
+  /*
+    images[] only exists after a successful begin() and has exactly
+    1 + m_tables.count() slots (stream 0 is the log, stream N is table N-1).
+    A packet for any other stream, or one arriving without begin(), is
+    rejected instead of indexing outside of the array.
+  */
+  if (unlikely(images == NULL || stream > m_tables.count()))
+  {
+    DBUG_PRINT("error",("No image slot for stream %u", stream));
+    SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+  }
+
+  Object_restore *image= images[stream];
+
+  /*
+    We create an image when we see a new stream.
+    Still we have N open tables during the last table's restore.
+    But when Rafal implements that the last buffer of a stream has
+    buf.last==TRUE (soon), we can close tables earlier.
+  */
+  if (!image)
+  {
+    if (stream >= 1)
+      image= new Table_restore(m_tables[stream-1]);
+    else
+      image= new Log_restore(restore_log_name);
+    /* stored even if in error, so that the destructor deletes it */
+    images[stream]= image;
+    if (unlikely(!image || image->internal_error()))
+      SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+  }
+
+  if ((ret= image->send_data(buf)) != backup::OK)
+    SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+
+  /* for when we have "end of stream" notifications: */
+#ifdef TODO_HAVE_END_OF_STREAM
+  if (buf.last)
+  {
+    if (image->close() != backup::OK)
+      SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+    images_left--;
+    if (images_left == 0)
+    {
+      state= DONE;
+      /* DONE means done with all send_data() calls, but we have more work */
+      DBUG_RETURN(backup::DONE);
+    }
+  }
+#endif
+
+  DBUG_RETURN(backup::OK);
+}
+
+
+
+/**
+  Restores the log.
+
+  @param  log_name_arg    Name under which log should be created
+*/
+
+Log_restore::Log_restore(const char *log_name_arg)
+  : log_name(log_name_arg),
+    log_deleted(FALSE)
+{
+  DBUG_ENTER("maria_backup::Log_restore::Log_restore");
+  const int log_fd= my_create(log_name, 0, O_WRONLY, MYF(MY_WME));
+  if (log_fd < 0)
+  {
+    /* no file was created: make end() skip the deletion */
+    log_deleted= TRUE;
+    SET_STATE_TO_ERROR_AND_DBUG_VOID_RETURN;
+  }
+  log_file_restore.init(log_fd);
+  state= OK;
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  Closes the log.
+
+  @return Operation status
+    @retval backup::OK
+    @retval backup::ERROR
+*/
+
+result_t Log_restore::close()
+{
+  DBUG_ENTER("maria_backup::Log_restore::close");
+  /* the restored log must be closed before post_restore() applies it */
+  const result_t close_status= log_file_restore.close_file();
+  if (close_status != backup::OK)
+    SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+  DBUG_RETURN(backup::OK);
+}
+
+
+/**
+  Applies the log to restored tables, to make them consistent.
+
+  @return Operation status
+    @retval backup::OK
+    @retval backup::ERROR
+*/
+
+result_t Log_restore::post_restore()
+{
+  MA_EXAMINE_LOG_PARAM examine_param;
+  DBUG_ENTER("maria_backup::Log_restore::post_restore");
+  ma_examine_log_param_init(&examine_param);
+  /* examine the restored log, in update mode */
+  examine_param.log_filename= log_name;
+  examine_param.update= 1;
+  /*
+    The server had enough file descriptors at backup time, so it is assumed
+    to have as many now: allow up to open_files_limit files.
+  */
+  examine_param.max_files= open_files_limit;
+  if (ma_examine_log(&examine_param))
+    SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+
+  DBUG_RETURN(backup::OK);
+}
+
+
+/**
+  Closes and deletes the log.
+
+  @return Operation status
+    @retval backup::OK
+    @retval backup::ERROR
+*/
+
+result_t Log_restore::end()
+{
+  DBUG_ENTER("maria_backup::Log_restore::end");
+  /* the log is applied (or restore failed): the file is of no use now */
+  if (close() != backup::OK)
+    SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+  /* delete only once: end() is also called from the destructor */
+  if (!log_deleted && my_delete(log_name, MYF(MY_WME)))
+    SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+  log_deleted= TRUE;
+  DBUG_RETURN(backup::OK);
+}
+
+
+Log_restore::~Log_restore()
+{
+  /*
+    After a successful end() this finds the file closed and deleted, and so
+    does nothing. Otherwise it cleans up; the return code is dropped.
+  */
+  (void) end();
+}
+
+
+/** Opens a Maria table for restoring it */
+
+Table_restore::Table_restore(const Table_ref &tbl):
+  Maria_table_ref(tbl), rebuild_index(FALSE), thd(NULL)
+{
+  MARIA_HA *mi_info;
+  MARIA_SHARE *share;
+  File dfiledes= -1, kfiledes= -1;
+  my_bool save_transactional= FALSE;
+  DBUG_ENTER("maria_backup::Table_restore::Table_restore");
+  DBUG_PRINT("enter",("Initializing backup image for table %s",
+                      file_name.ptr()));
+  /*
+    Here we use low-level maria_* functions as all we want is a pair of file
+    descriptors.
+    Though we only want to write (O_WRONLY), the SQL layer uses only O_RDONLY
+    and O_RDWR, so here we don't try to be original.
+  */
+  mi_info= maria_open(file_name.ptr(), O_RDWR, 0);
+  if (!mi_info)
+  {
+    /* table does not exist or is corrupted? not normal, it's just created */
+    goto err;
+  }
+
+  share= mi_info->s;
+
+#ifdef RESTORE_KERNEL_NOT_ATOMIC
+  /*
+    Restore kernel leaves a window between creation of table and locking it;
+    in this window, another thread can modify the table, put pages in page
+    cache, changed cached bitmap or state, increase the files's length to
+    greater than what the driver has to write...
+    So we re-empty it here. We know we are alone using the table at this
+    point, as restore kernel has finished locking tables.
+    See BUG#42519, BUG#41716.
+  */
+  /* Another thread may have assigned an id */
+  pthread_mutex_lock(&share->intern_lock);
+  if (share->id != 0)
+  {
+    translog_deassign_id_from_share(share);
+    /*
+      Because id is 0, checkpoint will ignore this table, which is good
+      (otherwise Checkpoint may flush old info to the files, overwriting the
+      writes done by the driver.
+    */
+  }
+  pthread_mutex_unlock(&share->intern_lock);
+  save_transactional= share->now_transactional;
+  if (save_transactional) /* don't need logging */
+    _ma_tmp_disable_logging_for_table(mi_info, FALSE);
+  if (maria_delete_all_rows(mi_info))
+    goto err;
+  if (share->data_file_type == BLOCK_RECORD)
+  {
+    /*
+      maria_delete_all_rows() filled bitmap->map with zeroes and marked this
+      bitmap as changed. Flushing those zeroes would be wrong as soon as we
+      have restored the first bitmap page of the data file: prevent it.
+      Checkpoint can't flush between maria_delete_all_rows() and here, because
+      share->id is 0.
+    */
+    pthread_mutex_lock(&share->bitmap.bitmap_lock);
+    share->bitmap.changed= FALSE;
+    pthread_mutex_unlock(&share->bitmap.bitmap_lock);
+    DBUG_ASSERT(share->id == 0);
+  }
+  if (save_transactional)
+  {
+    /* reset the flag first so that the error path does not re-enable twice */
+    save_transactional= FALSE;
+    if (_ma_reenable_logging_for_table(mi_info, FALSE))
+      goto err;
+  }
+#endif
+
+  /*
+    It's ok to copy the kfile descriptor and write() to it as the upper layers
+    guarantee that we are the only user of the brand new table (nobody will
+    lseek() under our feet).
+  */
+  if (((dfiledes= my_dup(mi_info->dfile.file, MYF(MY_WME))) < 0) ||
+      ((kfiledes= my_dup(share->kfile.file, MYF(MY_WME))) < 0))
+    goto err;
+  /*
+    We are going to my_write() to the files without updating the table's
+    state (mi_info->state). If we called maria_close() only at end of restore,
+    that function may write its out-of-date state on the table.
+  */
+  maria_close(mi_info);
+  mi_info= NULL;
+  /* seek them at start, because we use my_write() */
+  if ((my_seek(dfiledes, 0, SEEK_SET, MYF(MY_WME)) == MY_FILEPOS_ERROR) ||
+      (my_seek(kfiledes, 0, SEEK_SET, MYF(MY_WME)) == MY_FILEPOS_ERROR))
+    goto err;
+  dfile_restore.init(dfiledes);
+  kfile_restore.init(kfiledes);
+  thd= current_thd;
+  state= OK;
+  DBUG_VOID_RETURN;
+err:
+  /*
+    Both descriptors start at -1 and a failed my_dup() is detected above
+    with "< 0", so anything >= 0 (0 included) is ours to close.
+  */
+  if (dfiledes >= 0)
+    my_close(dfiledes, MYF(MY_WME));
+  if (kfiledes >= 0)
+    my_close(kfiledes, MYF(MY_WME));
+  /* save_transactional can only be set while mi_info is still open */
+  if (save_transactional)
+    _ma_reenable_logging_for_table(mi_info, FALSE);
+  if (mi_info != NULL)
+    maria_close(mi_info);
+  /*
+    The creator of this object learns about the failure only through
+    internal_error(), which tests "state": it must be set here. This also
+    pairs the DBUG_ENTER above with a proper return.
+  */
+  SET_STATE_TO_ERROR_AND_DBUG_VOID_RETURN;
+}
+
+
+/**
+  Closes a table.
+
+  @return Operation status
+    @retval backup::OK
+    @retval backup::ERROR
+*/
+
+result_t Table_restore::close()
+{
+  DBUG_ENTER("maria_backup::Table_restore::close");
+  DBUG_PRINT("info",("table: %s", file_name.ptr()));
+  /* non-short-circuit "|": the index file is closed even if data file fails */
+  if ((dfile_restore.close_file() != backup::OK) |
+      (kfile_restore.close_file() != backup::OK))
+    SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+
+#ifdef RESTORE_KERNEL_KEEPS_OPEN_TABLES
+  /*
+    CAUTION! Ugliest hack ever!
+    This hack tries to recover from bypassing the Maria interface
+    by the Maria restore driver.
+    The situation is so:
+    The backup kernel opens and locks the tables in backup.
+    But the Maria restore driver does not use the open MARIA_HA
+    instance. Instead it opens another instance, duplicates its
+    file descriptors, and closes the instance. Then it uses the
+    duplicate file descriptors to write directly ("physically")
+    to the data and index files.
+    Among the writes are chunks of data from the index file, which
+    overwrite the index header with the state info.
+    In this function, called after all data have been written, the
+    duplicate file descriptors are closed (above). Now the index
+    and data files have the contents they ought to have.
+    Everything would be fine if no instance of the table would be
+    open at the time. Then a new open would read all table info from
+    disk and everybody would be happy.
+    However, the backup kernel still has the table open. Parts of
+    the index file are cached in the open MARIA_SHARE object.
+    If the backup kernel would close the tables, this old information
+    would be written to the index file, which crashes the table.
+    This hack tries to solve the problem by loading the share with
+    information from the index file. At first, we open a new MARIA_HA
+    instance from the table. This open does not read the state info
+    from the file because another instance is already open from the
+    same table. But the open gives us access to the share.
+    We do then explicitly call _ma_state_info_read_dsk(), which is
+    the function that loads the share from the index file at an
+    initial open. Well, not exactly. At open a similar function is
+    used, after the index header has been read by a direct read.
+    But the mentioned function includes both, read and share load.
+    Another small problem is that the function doesn't do anything
+    if external locking is disabled. It assumes that no external
+    (or bypassing) writes happen to the files. Since we did exactly
+    this, we must pretend that we are doing external locking. The
+    function uses the variable 'maria_single_user' for the
+    decision. So we temporarily change it.
+    NOTE(review): the code below does not touch 'maria_single_user';
+    presumably the last argument (1) of _ma_state_info_read_dsk()
+    replaces that trick -- to be confirmed against that function.
+    Now we can close the new table instance. This won't write the
+    state again, because it is not the last open instance.
+    But since the share does now cache the new values from the
+    index file, the backup kernel's close writes the correct
+    information back to the file.
+  */
+  {
+    MARIA_HA      *mi_info;
+    MARIA_SHARE *share;
+
+    mi_info= maria_open(file_name.ptr(), O_RDWR, HA_OPEN_FOR_REPAIR);
+    if (mi_info == NULL)
+      SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+    share= mi_info->s;
+    DBUG_PRINT("maria_backup", ("share data_file: %lu",
+                                 (ulong) share->state.state.data_file_length));
+    if (_ma_state_info_read_dsk(share->kfile.file, &share->state, 1))
+    {
+      /*
+        Don't leave our temporary instance open. We are failing anyway, so
+        a second error from maria_close() would add nothing.
+      */
+      (void) maria_close(mi_info);
+      SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+    }
+    DBUG_PRINT("maria_backup", ("share data_file: %lu",
+                                 (ulong)
+                                 share->state.state.data_file_length));
+    /*
+      Now follows the most dirty part of the hack.
+      We have correct information in the share, but the instance that
+      holds the lock on the table has a local copy of the state.
+      We must find this instance and fix the local info.
+    */
+    {
+      LIST *list_element ;
+      pthread_mutex_lock(&THR_LOCK_maria);
+      pthread_mutex_lock(&share->close_lock);
+      pthread_mutex_lock(&share->intern_lock);
+      for (list_element= maria_open_list;
+           list_element;
+           list_element= list_element->next)
+      {
+        MARIA_HA *tmpinfo= (MARIA_HA*) list_element->data;
+        if (tmpinfo->s == share)
+          *tmpinfo->state= share->state.state;
+      }
+      pthread_mutex_unlock(&THR_LOCK_maria);
+      pthread_mutex_unlock(&share->intern_lock);
+      pthread_mutex_unlock(&share->close_lock);
+    }
+    if (maria_close(mi_info))
+      SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+  }
+#endif
+
+  DBUG_RETURN(backup::OK);
+}
+
+
+/**
+  Closes a table.
+
+  @return Operation status
+    @retval backup::OK
+    @retval backup::ERROR
+*/
+
+result_t Table_restore::end()
+{
+  /* a table has no cleanup of its own beyond what close() does */
+  const result_t close_status= close();
+  return close_status;
+}
+
+
+Table_restore::~Table_restore()
+{
+  /* safety net for files still open; the return code is dropped */
+  (void) end();
+}
+
+
+/**
+  Repairs table's index if needed. Has to be done after applying the log.
+
+  Works in two steps:
+  - opens the table with maria_open(HA_OPEN_FOR_REPAIR), resets open_count
+    (unless STATE_BAD_OPEN_COUNT is set) and, for tables born transactional,
+    the state LSNs; writes the state to disk and closes the table;
+  - if rebuild_index is set, opens the table with open_temporary_table()
+    and runs ha_repair() on it with T_VERY_SILENT | T_QUICK.
+
+  A failure of the first step is reported to the caller and the index
+  rebuild is then not attempted, so that the status of the state write is
+  never silently replaced by the status of the repair.
+
+  @return Operation status
+    @retval backup::OK
+    @retval backup::ERROR
+*/
+
+result_t Table_restore::post_restore()
+{
+  HA_CHECK_OPT check_opt;
+  TABLE *table= NULL;
+  int error;
+  Vio* save_vio;
+  DBUG_ENTER("maria_backup::Table_restore::post_restore");
+
+  {
+    MARIA_HA *mi_info;
+    MARIA_SHARE *share;
+    /*
+      Table was copied while it was possibly open by other clients; we need to
+      correct open_count to not trigger superfluous warning messages or repair
+      by --maria-recover. If we rebuild the index, that will automatically
+      fix open_count.
+    */
+    mi_info= maria_open(file_name.ptr(), O_RDWR, HA_OPEN_FOR_REPAIR);
+    if ((error= (mi_info == NULL)))
+      goto err;
+    share= mi_info->s;
+    if (share->state.changed & STATE_BAD_OPEN_COUNT)
+    {
+      /* table already had a problem when backup started, leave open_count */
+      DBUG_PRINT("info", ("STATE_BAD_OPEN_COUNT is on"));
+    }
+    else
+    {
+      /* open_count>0 only because we copied while open, no problem */
+      share->state.open_count= 0;
+    }
+
+#ifdef RESTORE_KERNEL_KEEPS_OPEN_TABLES
+    if (share->data_file_type == BLOCK_RECORD)
+    {
+      /*
+        bitmap->map is full of zeroes (it dates from maria_delete_all_rows()
+        above). If we leave it like this, next threads may use it
+        (close_cached_tables() doesn't help here: if a thread has managed to
+        open the table while we had it locked, close_cached_tables() doesn't
+        close the table (BUG#40944)). So they will see it as empty, thus treat
+        data pages as empty, thus overwrite existing data records. To prevent
+        this, we reload this bitmap from disk.
+        We must do it in Table_restore::post_restore() and not in
+        Table_restore::close() (which is called two times): if we did it in
+        close(), the bitmap of before-log-applying would be read by page cache
+        and stay cached there, so the second close() (of after-log-applying)
+        will pick it from page cache instead of from disk, and so it will stay
+        old and empty.
+      */
+      pthread_mutex_lock(&share->bitmap.bitmap_lock);
+      share->bitmap.page= ~(ULL(0)); /* to force a read below */
+      (void)_ma_bitmap_get_page_bits(mi_info, &share->bitmap, 1);
+      pthread_mutex_unlock(&share->bitmap.bitmap_lock);
+    }
+#endif
+    if (share->base.born_transactional)
+    {
+      /*
+        This table starts a new life: old REDOs shouldn't apply to it,
+        otherwise there could be this wrong sequence:
+        create table empty; back it up; bulk insert (no REDO); insert (REDO);
+        drop; restore; crash: then recovery will fail on the REDO for insert.
+        However, we don't change the uuid: if the table originally comes from
+        our instance, we don't want to zerofill it for nothing.
+      */
+      share->state.create_rename_lsn= share->state.is_of_horizon=
+        share->state.skip_redo_lsn= LSN_NEEDS_NEW_STATE_LSNS;
+    }
+    /* force new open_count, LSNs to disk */
+    error= _ma_state_info_write_sub(share, share->kfile.file,
+                                    &share->state, 1);
+    error|= maria_close(mi_info);
+  }
+
+  /*
+    Stop here if there is nothing to rebuild, or if writing the state /
+    closing the table failed: "error" is assigned again below, so going on
+    would lose that failure.
+  */
+  if (error || !rebuild_index)
+    goto err;
+
+  /*
+    mariachk() as well as ha_maria::repair() do a lot of operations before
+    and after maria_repair(); to not duplicate code we reuse one of them.
+    As we are in the server here, we use the one of the server.
+    A "new ha_maria + ha_open()" is not sufficient as TABLE and TABLE_SHARE
+    are needed for ha_maria::open(). So we use open_temporary_table() which
+    sets up all fine without touching thread's structure (and so, without
+    causing problems to locks, without interfering with close_thread_tables()
+    which would be done by another driver in the same thread etc).
+    Note that as the table has just been created, and in theory is protected
+    from any usage, by the upper backup layer, opening it with
+    open_temporary_table() is correct.
+  */
+  char path[FN_REFLEN];
+  build_table_filename(path, sizeof(path), db.ptr(), name.ptr(), "", 0);
+  table= open_temporary_table(thd, path, db.ptr(), name.ptr(),
+                              false, OTM_OPEN);
+
+  if ((error= (!table || !table->file)))
+    goto err;
+
+  check_opt.init();
+  check_opt.flags|= T_VERY_SILENT | T_QUICK;
+  /*
+    We do not want repair() to spam us with messages (protocol->store() etc).
+    Just send them to the error log, and report the failure in case of
+    problems.
+    Note that ha_maria::restore() does not do that (merely uses the same
+    check_opt.flags as us), as it is allowed to return an array of errors.
+  */
+  save_vio= thd->net.vio;
+  thd->net.vio= NULL;
+  error= table->file->ha_repair(thd,&check_opt) != 0;
+  thd->net.vio= save_vio;
+
+err:
+  /* common exit: release the temporary TABLE if one was opened */
+  if (table)
+  {
+    intern_close_table(table);
+    my_free(table, MYF(MY_WME));
+  }
+  if (error)
+    SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+  DBUG_RETURN(backup::OK);
+}
+
+
+/**
+  Receives and restores data for one table from the backup kernel.
+
+  The first byte of the buffer is a file code which decides where the
+  buffer is forwarded: data file or index file. Receiving only the header
+  of the index file additionally sets rebuild_index.
+
+  @param  buf             reference to Buffer where data is
+
+  @return Operation status
+    @retval backup::OK
+    @retval backup::ERROR
+*/
+
+result_t Table_restore::send_data(const Buffer &buf)
+{
+  result_t ret;
+  enum enum_file_code file_code= static_cast<enum enum_file_code>(*buf.data);
+  DBUG_ENTER("maria_backup::Table_restore::send_data");
+
+  /* we are given only the index's header: index has to be rebuilt later */
+  if (file_code == HEADER_INDEX_FILE_CODE)
+    rebuild_index= TRUE;
+
+  switch (file_code)
+  {
+  case HEADER_INDEX_FILE_CODE:
+  case WHOLE_INDEX_FILE_CODE:
+    ret= kfile_restore.send_data(buf);
+    break;
+  case DATA_FILE_CODE:
+    ret= dfile_restore.send_data(buf);
+    break;
+  default:
+    DBUG_PRINT("info",("packet with code %d I didn't expect", file_code));
+    DBUG_ASSERT(0);
+    ret= backup::ERROR;
+    break;
+  }
+
+  if (ret != backup::OK)
+    SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+  DBUG_RETURN(ret);
+}
+
+
+/**
+  Receives and restores data for the log from the backup kernel.
+
+  Only buffers whose first byte is LOG_FILE_CODE are accepted; any other
+  code is an error.
+
+  @param  buf             reference to Buffer where data is
+
+  @return Operation status
+    @retval backup::OK
+    @retval backup::ERROR
+*/
+
+result_t Log_restore::send_data(const Buffer &buf)
+{
+  result_t ret;
+  enum enum_file_code file_code= static_cast<enum enum_file_code>(*buf.data);
+  DBUG_ENTER("maria_backup::Log_restore::send_data");
+
+  if (file_code == LOG_FILE_CODE)
+    ret= log_file_restore.send_data(buf);
+  else
+    ret= backup::ERROR;
+
+  if (ret != backup::OK)
+    SET_STATE_TO_ERROR_AND_DBUG_RETURN;
+  DBUG_RETURN(ret);
+}
+
+
+/**
+  Receives and restores data for one single file from the backup kernel.
+
+  The first byte of the buffer is the file code; the rest is written
+  as is to the file descriptor fd.
+
+  @param  buf             reference to Buffer where data is
+
+  @return Operation status
+    @retval backup::OK     all bytes after the code byte were written
+    @retval backup::ERROR  empty buffer, or write was short or failed
+*/
+
+result_t File_restore::send_data(const Buffer &buf)
+{
+  size_t howmuch= buf.size;
+
+  DBUG_ENTER("maria_backup::File_restore::send_data");
+  //DBUG_DUMP("receiving",buf.data + 1, 16);
+
+  // We should receive same buffers as those made at backup time
+  DBUG_ASSERT(howmuch >= 2);
+  /*
+    The assertion is compiled out of non-debug builds: refuse an empty
+    buffer there too, otherwise the decrement below wraps around and
+    my_write() is asked to write a huge amount of bytes.
+  */
+  if (howmuch == 0)
+    DBUG_RETURN(backup::ERROR);
+  howmuch--; // skip the first byte which contains the code
+  size_t res= my_write(fd, buf.data +1, howmuch, MYF(MY_WME));
+
+  DBUG_RETURN((res != howmuch) ? backup::ERROR : backup::OK);
+}
+
+
+/**
+  Closes a file in restore.
+
+  Does nothing if the file is not open (fd < 0). Otherwise fd is reset to
+  -1 whether my_close() succeeded or not, so a second call is a no-op.
+
+  @return Operation status
+    @retval backup::OK
+    @retval backup::ERROR
+*/
+
+result_t File_restore::close_file()
+{
+  result_t status= backup::OK;
+  if (fd >= 0)
+  {
+    if (my_close(fd, MYF(MY_WME)))
+      status= backup::ERROR;
+    fd= -1;
+  }
+  return status;
+}
+
+
+} // maria_backup namespace
+
+
+/**
+  Returns the backup Engine used by this storage engine, per the API.
+
+  @param       self       handlerton of the storage engine (not used here)
+  @param[out]  be         set to a newly allocated maria_backup::Engine
+
+  @return Operation status
+    @retval backup::OK
+    @retval backup::ERROR     allocation of the engine failed
+*/
+
+Backup_result_t maria_backup_engine(handlerton *self, Backup_engine* &be)
+{
+  be= new maria_backup::Engine();
+
+  return unlikely(!be) ? backup::ERROR : backup::OK;
+}

=== modified file 'storage/maria/maria_chk.c'
--- a/storage/maria/maria_chk.c	2008-10-20 09:16:47 +0000
+++ b/storage/maria/maria_chk.c	2009-01-28 11:08:55 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2003 MySQL AB
+/* Copyright (C) 2006-2003 MySQL AB, 2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -1226,7 +1226,8 @@ static int maria_chk(HA_CHECK *param, ch
       error= maria_zerofill(param, info, filename);
     if (!error)
       share->state.changed&= ~(STATE_CHANGED | STATE_CRASHED |
-                               STATE_CRASHED_ON_REPAIR);
+                               STATE_CRASHED_ON_REPAIR |
+                               STATE_BAD_OPEN_COUNT);
     else
       maria_mark_crashed(info);
   }
@@ -1279,7 +1280,8 @@ static int maria_chk(HA_CHECK *param, ch
           (param->testflag & T_UPDATE_STATE))
         info->update|=HA_STATE_CHANGED | HA_STATE_ROW_CHANGED;
       share->state.changed&= ~(STATE_CHANGED | STATE_CRASHED |
-                               STATE_CRASHED_ON_REPAIR);
+                               STATE_CRASHED_ON_REPAIR |
+                               STATE_BAD_OPEN_COUNT);
     }
     else if (!maria_is_crashed(info) &&
              (param->testflag & T_UPDATE_STATE))

=== modified file 'storage/maria/maria_def.h'
--- a/storage/maria/maria_def.h	2008-12-14 11:36:15 +0000
+++ b/storage/maria/maria_def.h	2009-02-13 16:30:54 +0000
@@ -1,4 +1,5 @@
-/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB,
+   2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -13,7 +14,10 @@
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
 
-/* This file is included by all internal maria files */
+/**
+  @file
+  This file is included by all internal maria files
+*/
 
 #include "maria.h"				/* Structs & some defines */
 #include <myisampack.h>				/* packing of keys */
@@ -263,6 +267,7 @@ typedef struct st_maria_file_bitmap
 #define MARIA_CHECKPOINT_SHOULD_FREE_ME 2
 #define MARIA_CHECKPOINT_SEEN_IN_LOOP 4
 
+/** Information shared by all open instances of the same table */
 typedef struct st_maria_share
 {					/* Shared between opens */
   MARIA_STATE_INFO state;
@@ -275,10 +280,12 @@ typedef struct st_maria_share
   MARIA_COLUMNDEF *columndef;		/* Pointer to column information */
   MARIA_PACK pack;			/* Data about packed records */
   MARIA_BLOB *blobs;			/* Pointer to blobs */
+  LIST *in_use;                         /* List of threads using this table */
   uint16 *column_nr;			/* Original column order */
   LEX_STRING unique_file_name;		/* realpath() of index file */
   LEX_STRING data_file_name;		/* Resolved path names from symlinks */
   LEX_STRING index_file_name;
+  /** File name before resolving any symlink or expanding directory */
   LEX_STRING open_file_name;		/* parameter to open filename */
   uchar *file_map;			/* mem-map of file if possible */
   PAGECACHE *pagecache;			/* ref to the current key cache */
@@ -348,7 +355,7 @@ typedef struct st_maria_share
   ulong state_diff_length;
   uint rec_reflength;			/* rec_reflength in use now */
   uint keypage_header;
-  uint32 ftparsers;			/* Number of distinct ftparsers
+  uint32 ftkeys;			/* Number of distinct full-text keys
 						   + 1 */
   PAGECACHE_FILE kfile;			/* Shared keyfile */
   File data_file;			/* Shared data file */
@@ -410,6 +417,21 @@ typedef struct st_maria_share
   MARIA_FILE_BITMAP bitmap;
   rw_lock_t mmap_lock;
   LSN lsn_of_file_id; /**< LSN of its last LOGREC_FILE_ID */
+  /**
+    If this table is doing physical logging (1) or not (0).
+    Set under MARIA_SHARE::physical_logging_rwlock and THR_LOCK_maria.
+    Read under either one of the two locks above.
+  */
+  volatile int32 physical_logging;
+  /** For protecting MARIA_SHARE::physical_logging */
+  my_atomic_rwlock_t physical_logging_rwlock;
+  /**
+    If we already stored MA_LOG_OPEN in physical log for this share.
+    Set to TRUE only by writer thread under THR_LOCK_maria_log atomically
+    with logging the MA_LOG_OPEN; set to FALSE only by ma_log_stop_physical()
+    after closing the log.
+  */
+  my_bool MA_LOG_OPEN_stored_in_physical_log;
 } MARIA_SHARE;
 
 
@@ -473,7 +495,7 @@ typedef struct st_maria_block_scan
   MARIA_RECORD_POS row_base_page;
 } MARIA_BLOCK_SCAN;
 
-//psergey-todo: do really need to have copies of this all over the place?
+/*psergey-todo: do really need to have copies of this all over the place?*/
 typedef my_bool (*index_cond_func_t)(void *param);
 
 struct st_maria_handler
@@ -496,6 +518,7 @@ struct st_maria_handler
   DYNAMIC_ARRAY *ft1_to_ft2;		/* used only in ft1->ft2 conversion */
   MEM_ROOT      ft_memroot;             /* used by the parser               */
   MYSQL_FTPARSER_PARAM *ftparser_param;	/* share info between init/deinit */
+  LIST in_use;                          /* Thread using this table          */
   uchar *buff;				/* page buffer */
   uchar *keyread_buff;                   /* Buffer for last key read */
   uchar *lastkey_buff;			/* Last used search key */
@@ -608,6 +631,12 @@ struct st_maria_handler
 #define STATE_NOT_ZEROFILLED     128
 #define STATE_NOT_MOVABLE        256
 #define STATE_MOVED              512 /* set if base->uuid != maria_uuid */
+/**
+   If open_count>0 the first time we opened this table; cleared after
+   successful check or repair
+*/
+#define STATE_BAD_OPEN_COUNT     1024
+
 
 /* options to maria_read_cache */
 
@@ -661,8 +690,10 @@ struct st_maria_handler
 */
 #define int4store_aligned(A,B) int4store((A),(B))
 
+#define ma_report_crashed(A, B) _ma_report_crashed((A), (B), __FILE__, __LINE__)
 #define maria_mark_crashed(x) do{(x)->s->state.changed|= STATE_CRASHED; \
     DBUG_PRINT("error", ("Marked table crashed"));                      \
+    ma_report_crashed((x), 0);                                          \
   }while(0)
 #define maria_mark_crashed_share(x)                                     \
   do{(x)->state.changed|= STATE_CRASHED;                                \
@@ -767,7 +798,7 @@ struct st_maria_handler
 #define maria_unique_store(A,B)    mi_int4store((A),(B))
 
 #ifdef THREAD
-extern pthread_mutex_t THR_LOCK_maria;
+extern pthread_mutex_t THR_LOCK_maria_log;
 #endif
 #if !defined(THREAD) || defined(DONT_USE_RW_LOCKS)
 #define rw_wrlock(A) {}
@@ -787,7 +818,10 @@ extern pthread_mutex_t THR_LOCK_maria;
 
 
 /* Some extern variables */
+
+C_MODE_START
 extern LIST *maria_open_list;
+C_MODE_END
 extern uchar maria_file_magic[], maria_pack_file_magic[];
 extern uchar maria_uuid[MY_UUID_SIZE];
 extern uint32 maria_read_vec[], maria_readnext_vec[];
@@ -796,6 +830,7 @@ extern char *maria_data_root;
 extern uchar maria_zero_string[];
 extern my_bool maria_inited, maria_in_ha_maria;
 extern HASH maria_stored_state;
+extern const HASH *ma_log_tables_physical;
 
 /* This is used by _ma_calc_xxx_key_length och _ma_store_key */
 typedef struct st_maria_s_param
@@ -1055,6 +1090,36 @@ typedef struct st_maria_block_info
 #define SORT_BUFFER_INIT	(1024L*1024L*64-MALLOC_OVERHEAD)
 #define MIN_SORT_BUFFER		(4096-MALLOC_OVERHEAD)
 
+/**
+  Commands storable in Maria non-transactional log (physical log).
+*/
+enum maria_log_commands {
+  MA_LOG_OPEN, /**< when maria_open() */
+  MA_LOG_CLOSE, /**< when maria_close() */
+  MA_LOG_WRITE_BYTES_MAD, /**< when Maria writes to the data file */
+  MA_LOG_WRITE_BYTES_MAI, /**< when Maria writes to the index file */
+  MA_LOG_CHSIZE_MAD,      /**< when Maria changes size of data file */
+  MA_LOG_CHSIZE_MAI,      /**< when Maria changes size of index file */
+  MA_LOG_END_SENTINEL /**< keep this one unused and last */
+};
+extern const char *ma_log_command_name[];
+/** If log record stores numerical info in long format */
+#define MA_LOG_BIG_NUMBERS 128
+
+/**
+  Maria-specific errors (not generic enough to be HA_ERR), sent to the
+  caller wrapped inside ER_GET_ERRMSG. Not yet stabilized, so not yet
+  exported to maria.h.
+*/
+enum maria_errors
+{
+  /* decrease, starting with -1 */
+  MARIA_ERR_NO_BACKUP_WITH_EXTERNAL_LOCKING= -1,
+  MARIA_ERR_BACKUP_TOO_RECENT= -2,
+  MARIA_ERR_LAST=-3 /**< keep it last and unused */
+  /* use only numbers<0, to not collide with OS errors, ER_, HA_ERR etc */
+};
+/**
+  Entry of maria_error_messages[] for a member of enum maria_errors:
+  error -1 maps to index 0, -2 to index 1, and so on.
+  The argument and the expansion are parenthesized so that the macro can
+  be given any expression and used inside any expression.
+*/
+#define MARIA_ERR(errnumber) (maria_error_messages[-(errnumber)-1])
 #define fast_ma_writeinfo(INFO) if (!(INFO)->s->tot_locks) (void) _ma_writeinfo((INFO),0)
 #define fast_ma_readinfo(INFO) ((INFO)->lock_type == F_UNLCK) && _ma_readinfo((INFO),F_RDLCK,1)
 
@@ -1065,6 +1130,57 @@ extern uint _ma_pack_get_block_info(MARI
                                     size_t *rec_buff_size,
                                     File file, my_off_t filepos);
 extern void _ma_store_blob_length(uchar *pos, uint pack_length, uint length);
+extern void _maria_log_command(IO_CACHE *log,
+                               enum maria_log_commands command,
+                               MARIA_SHARE *share,
+                               const uchar *buffert,
+                                uint length, int result);
+extern void maria_log_pwrite_physical(enum maria_log_commands command,
+                                       MARIA_SHARE *share,
+                                       const uchar *buffert, uint length,
+                                       my_off_t filepos);
+extern void maria_log_chsize_physical(MARIA_SHARE *share,
+                                      enum maria_log_commands command,
+                                      my_off_t new_length);
+#ifdef HAVE_MARIA_PHYSICAL_LOGGING
+/**
+  Reads MARIA_SHARE::physical_logging.
+
+  The value is loaded with my_atomic_load32(), between a read-lock and a
+  read-unlock of share->physical_logging_rwlock.
+
+  @param  share           share of the table
+
+  @return current value of share->physical_logging
+*/
+static inline int32 ma_get_physical_logging_state(MARIA_SHARE *share)
+{
+  int32 ret;
+  my_atomic_rwlock_rdlock(&share->physical_logging_rwlock);
+  ret= my_atomic_load32(&share->physical_logging);
+  my_atomic_rwlock_rdunlock(&share->physical_logging_rwlock);
+  return ret;
+}
+/**
+  Sets MARIA_SHARE::physical_logging.
+
+  The value is stored with my_atomic_store32(), between a write-lock and a
+  write-unlock of share->physical_logging_rwlock.
+
+  @param  share           share of the table
+  @param  new_state       value to store into share->physical_logging
+*/
+static inline void
+ma_set_physical_logging_state(MARIA_SHARE *share, int32 new_state)
+{
+  my_atomic_rwlock_wrlock(&share->physical_logging_rwlock);
+  my_atomic_store32(&share->physical_logging, new_state);
+  my_atomic_rwlock_wrunlock(&share->physical_logging_rwlock);
+}
+#else
+/*
+  Without HAVE_MARIA_PHYSICAL_LOGGING the state always reads as 0, and
+  setting it expands to nothing (the arguments are not evaluated).
+*/
+#define ma_get_physical_logging_state(share) 0
+#define ma_set_physical_logging_state(share, new_state)
+#endif
+C_MODE_START
+/**
+  IN and OUT structure for instructing how to apply a Maria log and later
+  getting statistics about this log.
+*/
+typedef struct ma_examine_log_param
+{
+  uint verbose, update, max_files, re_open_count, recover, prefix_remove,
+    opt_processes;
+  ulong number_of_commands;
+  my_off_t start_offset,record_pos;
+  const char *log_filename, *filepath, *write_filename, *record_pos_file;
+  /** Count of commands found in log and their errors */
+  ulong com_count[MA_LOG_END_SENTINEL][3];
+  my_bool (*table_selection_hook)(const char *); /**< to filter tables */
+} MA_EXAMINE_LOG_PARAM;
+extern void ma_examine_log_param_init(MA_EXAMINE_LOG_PARAM *param);
+extern int ma_examine_log(MA_EXAMINE_LOG_PARAM *param);
+C_MODE_END
 extern void _ma_report_error(int errcode, const LEX_STRING *file_name);
 extern my_bool _ma_memmap_file(MARIA_HA *info);
 extern void _ma_unmap_file(MARIA_HA *info);
@@ -1081,9 +1197,20 @@ extern size_t _ma_nommap_pread(MARIA_HA 
 extern size_t _ma_nommap_pwrite(MARIA_HA *info, const uchar *Buffer,
 				size_t Count, my_off_t offset, myf MyFlags);
 
+C_MODE_START
+/* my_pwrite instead of my_write used */
+#define MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET 1
+/* info should be written */
+#define MA_STATE_INFO_WRITE_FULL_INFO        2
+/* intern_lock taking is needed */
+#define MA_STATE_INFO_WRITE_LOCK             4
 uint _ma_state_info_write(MARIA_SHARE *share, uint pWrite);
-uint _ma_state_info_write_sub(File file, MARIA_STATE_INFO *state, uint pWrite);
-uint _ma_state_info_read_dsk(File file, MARIA_STATE_INFO *state);
+uint _ma_state_info_read_dsk(File file, MARIA_STATE_INFO *state,
+                             my_bool force);
+uint _ma_state_info_write_sub(MARIA_SHARE *share, File file,
+                              MARIA_STATE_INFO *state, uint pWrite);
+C_MODE_END
+int ma_remap_file_and_write_state_for_unlock(MARIA_HA *info, my_bool force);
 uint _ma_base_info_write(File file, MARIA_BASE_INFO *base);
 my_bool _ma_keyseg_write(File file, const HA_KEYSEG *keyseg);
 uchar *_ma_keyseg_read(uchar *ptr, HA_KEYSEG *keyseg);
@@ -1216,6 +1343,12 @@ extern my_bool maria_flush_log_for_page(
 extern my_bool maria_flush_log_for_page_none(uchar *page,
                                              pgcache_page_no_t page_no,
                                              uchar *data_ptr);
+extern my_bool maria_log_index_page_flush_physical(uchar *page,
+                                                   pgcache_page_no_t page_no,
+                                                   uchar *data_ptr);
+extern my_bool maria_log_data_page_flush_physical(uchar *page,
+                                                  pgcache_page_no_t page_no,
+                                                  uchar *data_ptr);
 void maria_concurrent_inserts(MARIA_HA *info, my_bool concurrent_insert);
 extern PAGECACHE *maria_log_pagecache;
 
@@ -1224,3 +1357,12 @@ extern void ma_set_index_cond_func(MARIA
                                    void *func_arg);
 int ma_check_index_cond(register MARIA_HA *info, uint keynr, uchar *record);
 
+void _ma_report_crashed(MARIA_HA *file, const char *message,
+                        const char *sfile, uint sline);
+
+C_MODE_START
+extern const char *maria_error_messages[];
+extern my_bool ma_log_index_pages_physical;
+extern IO_CACHE maria_physical_log;
+extern pthread_mutex_t THR_LOCK_maria;
+C_MODE_END

=== added file 'storage/maria/maria_non_trans_log.c'
--- a/storage/maria/maria_non_trans_log.c	1970-01-01 00:00:00 +0000
+++ b/storage/maria/maria_non_trans_log.c	2009-01-28 11:08:55 +0000
@@ -0,0 +1,313 @@
+/* Copyright (C) 2009 - 2009 Sun Microsystems, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/**
+  @file
+  Utility to display and apply a Maria physical log to tables.
+
+  Prints what is in a Maria physical log, optionally
+  applies the changes to tables (all tables or only a set specified on the
+  command line). Works standalone (tables must not be modified by the
+  server during this).
+*/
+
+#ifndef USE_MY_FUNC
+#define USE_MY_FUNC
+#endif
+
+#include "maria_def.h"
+#include <my_tree.h>
+#include <stdarg.h>
+#ifdef HAVE_GETRUSAGE
+#include <sys/resource.h>
+#endif
+
+#define NO_FILEPOS (ulong) ~0L
+
+static void get_options(int *argc,char ***argv);
+static my_bool matches_list_of_tables(const char *isam_file_name);
+
+static MA_EXAMINE_LOG_PARAM mi_exl;
+static char **table_names;
+
+static uint test_info=0;
+
+/**
+  Entry point of the utility.
+
+  Parses the command line, initializes Maria, runs ma_examine_log() on the
+  given log, then prints per-command and total statistics taken from
+  mi_exl.com_count[][] (used count, errors, recover errors).
+
+  The process exits with the value returned by ma_examine_log(), or with 1
+  if no log file name was given or if maria_init() failed.
+*/
+int main(int argc, char **argv)
+{
+  int error,i,first;
+  ulong total_count,total_error,total_recover;
+  MY_INIT(argv[0]);
+
+  ma_examine_log_param_init(&mi_exl);
+  get_options(&argc,&argv);
+  if (mi_exl.log_filename == NULL)
+  {
+    fprintf(stderr, "I need a log file name. Use option -? to learn about"
+            " usage of this program.\n");
+    exit(1);
+  }
+  if (argv[0]) /* some table names passed on command line */
+  {
+    table_names= argv;
+    mi_exl.table_selection_hook= matches_list_of_tables;
+  }
+
+  /* Number of Maria files we can have open at one time */
+  mi_exl.max_files= (my_set_max_open_files(max(mi_exl.max_files,8))-6)/2;
+
+  /*
+    Program must work in all conditions: support symbolic links.
+    It should not be a security risk.
+  */
+#ifdef USE_SYMDIR
+  my_use_symdir= 1;
+#endif
+
+  if (mi_exl.update)
+    printf("Trying to %s Maria files according to log '%s'\n",
+           (mi_exl.recover ? "recover" : "update"),mi_exl.log_filename);
+
+  if (maria_init())
+  {
+    fprintf(stderr, "Can't init Maria engine (%d)\n", errno);
+    exit(1);
+  }
+
+  error= ma_examine_log(&mi_exl);
+
+  if (mi_exl.update && ! error)
+    puts("Tables updated successfully");
+  total_count=total_error=total_recover=0;
+  for (i=first=0 ; ma_log_command_name[i] ; i++)
+  {
+    if (mi_exl.com_count[i][0])
+    {
+      if (!first++)
+      {
+        /* header is printed only once, before the first used command */
+        if (mi_exl.verbose || mi_exl.update)
+          puts("");
+        puts("Commands                         Used count    Errors"
+             " Recover errors");
+      }
+      /* counters are unsigned long: print them with %lu, not %ld */
+      printf("%-20s%9lu%10lu%15lu\n", ma_log_command_name[i],
+             mi_exl.com_count[i][0],
+             mi_exl.com_count[i][1],mi_exl.com_count[i][2]);
+      total_count+=mi_exl.com_count[i][0];
+      total_error+=mi_exl.com_count[i][1];
+      total_recover+=mi_exl.com_count[i][2];
+    }
+  }
+  if (total_count)
+    printf("%-12s%9lu%10lu%17lu\n","Total",total_count,total_error,
+           total_recover);
+  if (mi_exl.re_open_count)
+    printf("Had to do %u re-open because of too few possibly open files\n",
+           mi_exl.re_open_count);
+  (void) maria_panic(HA_PANIC_CLOSE);
+  my_free_open_file_info();
+  my_end(test_info ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
+  exit(error);
+  return 0;				/* No compiler warning */
+} /* main */
+
+
+/**
+  Parses the command line into mi_exl and test_info.
+
+  Leading arguments starting with '-' are options; several one-letter
+  options may be grouped in one argument. An option which takes a value
+  accepts it either glued to the letter or as the next argument; the rest
+  of that argument is then skipped.
+  The first argument which is not an option is stored as the log file name.
+
+  @param[in,out]  argc    number of arguments left to parse
+  @param[in,out]  argv    arguments; on return, points just after the log
+                          file name (at the first table name, if any)
+
+  Exits the process with 1 if an option lacks its value, and with 0 if help
+  was printed and no argument is left.
+*/
+static void get_options(register int *argc, register char ***argv)
+{
+  int help,version;
+  const char *pos,*usage;
+  char option;
+
+  help=0;
+  usage="Usage: %s [-?iruvDIV] [-c #] [-f #] [-F filepath/] [-o #] [-R file recordpos] [-w write_file] log-filename [table ...] \n";
+  pos="";
+
+  while (--*argc > 0 && *(pos = *(++*argv)) == '-' ) {
+    while (*++pos)
+    {
+      version=0;
+      switch((option=*pos)) {
+      case '#':
+        DBUG_PUSH (++pos);
+        pos=" ";				/* Skip rest of arg */
+        break;
+      case 'c':
+        if (! *++pos)
+        {
+          if (!--*argc)
+            goto err;
+          else
+            pos= *(++*argv);
+        }
+        mi_exl.number_of_commands= (ulong) atol(pos);
+        pos=" ";
+        break;
+      case 'u':
+        mi_exl.update=1;
+        break;
+      case 'f':
+        if (! *++pos)
+        {
+          if (!--*argc)
+            goto err;
+          else
+            pos= *(++*argv);
+        }
+        mi_exl.max_files=(uint) atoi(pos);
+        pos=" ";
+        break;
+      case 'i':
+        test_info=1;
+        break;
+      case 'o':
+        if (! *++pos)
+        {
+          if (!--*argc)
+            goto err;
+          else
+            pos= *(++*argv);
+        }
+        mi_exl.start_offset=(my_off_t) strtoll(pos,NULL,10);
+        pos=" ";
+        break;
+      case 'p':
+        if (! *++pos)
+        {
+          if (!--*argc)
+            goto err;
+          else
+            pos= *(++*argv);
+        }
+        mi_exl.prefix_remove=atoi(pos);
+        /*
+          Skip rest of arg, like all other options taking a value do;
+          without this the digits after the first one (as in "-p 12")
+          would be parsed as options and reported as illegal.
+        */
+        pos=" ";
+        break;
+      case 'r':
+        mi_exl.update=1;
+        mi_exl.recover++;
+        break;
+      case 'P':
+        mi_exl.opt_processes=1;
+        break;
+      case 'R':
+        if (! *++pos)
+        {
+          if (!--*argc)
+            goto err;
+          else
+            pos= *(++*argv);
+        }
+        mi_exl.record_pos_file=(char*) pos;
+        /* -R takes a second value: the record position */
+        if (!--*argc)
+          goto err;
+        mi_exl.record_pos=(my_off_t) strtoll(*(++*argv),NULL,10);
+        pos=" ";
+        break;
+      case 'v':
+        mi_exl.verbose++;
+        break;
+      case 'w':
+        if (! *++pos)
+        {
+          if (!--*argc)
+            goto err;
+          else
+            pos= *(++*argv);
+        }
+        mi_exl.write_filename=(char*) pos;
+        pos=" ";
+        break;
+      case 'F':
+        if (! *++pos)
+        {
+          if (!--*argc)
+            goto err;
+          else
+            pos= *(++*argv);
+        }
+        mi_exl.filepath= (char*) pos;
+        pos=" ";
+        break;
+      case 'V':
+        version=1;
+        /* Fall through */
+      case 'I':
+      case '?':
+#include <help_start.h>
+        printf("%s  Ver 2.0 for %s at %s\n",my_progname,SYSTEM_TYPE,
+               MACHINE_TYPE);
+        puts("By Monty, for your professional use\n");
+        if (version)
+          break;
+        puts("Write info about what is in a Maria non-transactional physical log file.");
+        printf("Requires a file name in argument\n");
+        puts("");
+        printf(usage,my_progname);
+        puts("");
+        puts("Options: -? or -I \"Info\"     -V \"version\"   -c \"do only # commands\"");
+        puts("         -f \"max open files\" -F \"filepath\"  -i \"extra info\"");
+        puts("         -o \"offset\"         -p # \"remove # components from path\"");
+        puts("         -r \"recover\"        -R \"file recordposition\"");
+        puts("         -u \"update\"         -v \"verbose\"   -w \"write file\"");
+        puts("         -D \"maria compiled with DBUG\"   -P \"processes\"");
+        puts("\nOne can give a second and a third '-v' for more verbose.");
+        puts("Normaly one does a update (-u).");
+        puts("If a recover is done all writes and all possibly updates and deletes is done\nand errors are only counted.");
+        puts("If one gives table names as arguments only these tables will be updated\n");
+        help=1;
+#include <help_end.h>
+        break;
+      default:
+        printf("illegal option: \"-%c\"\n",*pos);
+        break;
+      }
+    }
+  }
+  if (! *argc)
+  {
+    /* nothing left after the options */
+    if (help)
+      exit(0);
+    (*argv)++;
+  }
+  if (*argc >= 1)
+  {
+    /* pos is the first argument which is not an option */
+    mi_exl.log_filename=(char*) pos;
+    (*argc)--;
+    (*argv)++;
+  }
+  return;
+ err:
+  (void) fprintf(stderr,"option \"%c\" used without or with wrong argument\n",
+               option);
+  exit(1);
+}
+
+
+/**
+  Tells if a table is among those listed on the command line.
+
+  @param  isam_file_name  name to look up in table_names
+
+  @return 1 if the name is equal to an entry of table_names, or if
+          table_names is unset or empty (no filtering); 0 otherwise
+*/
+static my_bool matches_list_of_tables(const char *isam_file_name)
+{
+  char **candidate;
+
+  /* no explicit selection: every table matches */
+  if (!table_names || !table_names[0])
+    return 1;
+
+  for (candidate= table_names ; *candidate ; candidate++)
+    if (strcmp(*candidate, isam_file_name) == 0)
+      return 1;
+
+  return 0;
+}
+
+#define MA_CHECK_STANDALONE 1
+#include "ma_check_standalone.h"
+#undef MA_CHECK_STANDALONE
+

=== modified file 'storage/maria/maria_pack.c'
--- a/storage/maria/maria_pack.c	2008-08-25 18:23:18 +0000
+++ b/storage/maria/maria_pack.c	2009-02-13 16:30:54 +0000
@@ -1,4 +1,5 @@
-/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB,
+   2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -302,7 +303,7 @@ static void print_version(void)
 static void usage(void)
 {
   print_version();
-  puts("Copyright (C) 2002 MySQL AB");
+  puts("Copyright 2002-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.");
   puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,");
   puts("and you are welcome to modify and redistribute it under the GPL license\n");
 
@@ -3005,8 +3006,10 @@ static int save_state(MARIA_HA *isam_fil
   (void)(my_chsize(share->kfile.file, share->base.keystart, 0, MYF(0)));
   if (share->base.keys)
     isamchk_neaded=1;
-  DBUG_RETURN(_ma_state_info_write_sub(share->kfile.file,
-                                       &share->state, (1 + 2)));
+  DBUG_RETURN(_ma_state_info_write_sub(share, share->kfile.file,
+                                       &share->state,
+                                       MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+                                       MA_STATE_INFO_WRITE_FULL_INFO));
 }
 
 
@@ -3046,7 +3049,9 @@ static int save_state_mrg(File file,PACK
   if (isam_file->s->base.keys)
     isamchk_neaded=1;
   state.changed=STATE_CHANGED | STATE_NOT_ANALYZED; /* Force check of table */
-  DBUG_RETURN (_ma_state_info_write_sub(file,&state,1+2));
+  DBUG_RETURN (_ma_state_info_write_sub(isam_file->s, file,&state,
+                                        MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+                                        MA_STATE_INFO_WRITE_FULL_INFO));
 }
 
 

=== modified file 'storage/maria/maria_read_log.c'
--- a/storage/maria/maria_read_log.c	2008-06-05 16:11:22 +0000
+++ b/storage/maria/maria_read_log.c	2009-02-13 16:30:54 +0000
@@ -23,9 +23,9 @@ static const char *load_default_groups[]
 static void get_options(int *argc,char * * *argv);
 #ifndef DBUG_OFF
 #if defined(__WIN__)
-const char *default_dbug_option= "d:t:i:O,\\maria_read_log.trace";
+const char *default_dbug_option= "d:t:O,\\maria_read_log.trace";
 #else
-const char *default_dbug_option= "d:t:i:o,/tmp/maria_read_log.trace";
+const char *default_dbug_option= "d:t:o,/tmp/maria_read_log.trace";
 #endif
 #endif /* DBUG_OFF */
 static my_bool opt_display_only, opt_apply, opt_apply_undo, opt_silent;

=== modified file 'storage/maria/trnman.c'
--- a/storage/maria/trnman.c	2008-12-09 12:36:51 +0000
+++ b/storage/maria/trnman.c	2009-01-16 16:18:17 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -42,7 +42,7 @@ static TrID global_trid_generator;
   The default value is used when transaction manager not initialize;
   Probably called from maria_chk
 */
-static TrID trid_min_read_from= ~(TrID) 0;
+static TrID trid_min_read_from= MAX_TRID;
 
 /* the mutex for everything above */
 static pthread_mutex_t LOCK_trn_list;
@@ -59,6 +59,7 @@ static TRN **short_trid_to_active_trn;
 /* locks for short_trid_to_active_trn and pool */
 static my_atomic_rwlock_t LOCK_short_trid_to_trn, LOCK_pool;
 static my_bool default_trnman_end_trans_hook(TRN *, my_bool, my_bool);
+static void trnman_free_trn(TRN *);
 
 my_bool (*trnman_end_trans_hook)(TRN *, my_bool, my_bool)=
   default_trnman_end_trans_hook;
@@ -88,6 +89,19 @@ void trnman_reset_locked_tables(TRN *trn
   trn->locked_tables= locked_tables;
 }
 
+#ifdef EXTRA_DEBUG
+uint16 trnman_get_flags(TRN *trn)
+{
+  return trn->flags;
+}
+
+void trnman_set_flags(TRN *trn, uint16 flags)
+{
+  trn->flags= flags;
+}
+#endif
+
+/** Wake up threads waiting for this transaction */
 static void wt_thd_release_self(TRN *trn)
 {
   if (trn->wt)
@@ -149,12 +163,12 @@ int trnman_init(TrID initial_trid)
   */
 
   active_list_max.trid= active_list_min.trid= 0;
-  active_list_max.min_read_from= ~(TrID) 0;
+  active_list_max.min_read_from= MAX_TRID;
   active_list_max.next= active_list_min.prev= 0;
   active_list_max.prev= &active_list_min;
   active_list_min.next= &active_list_max;
 
-  committed_list_max.commit_trid= ~(TrID) 0;
+  committed_list_max.commit_trid= MAX_TRID;
   committed_list_max.next= committed_list_min.prev= 0;
   committed_list_max.prev= &committed_list_min;
   committed_list_min.next= &committed_list_max;
@@ -198,6 +212,7 @@ void trnman_destroy()
   {
     TRN *trn= pool;
     pool= pool->next;
+    DBUG_ASSERT(trn->wt == NULL);
     pthread_mutex_destroy(&trn->state_lock);
     my_free((void *)trn, MYF(0));
   }
@@ -251,10 +266,12 @@ static uint get_short_trid(TRN *trn)
   return res;
 }
 
-/*
-  DESCRIPTION
-    start a new transaction, allocate and initialize transaction object
-    mutex and cond will be used for lock waits
+/**
+  Allocates and initializes a new TRN object
+
+  @note the 'wt' parameter can only be 0 in a single-threaded code (or,
+  generally, where threads cannot block each other), otherwise the
+  first call to the deadlock detector will sigsegv.
 */
 
 TRN *trnman_new_trn(WT_THD *wt)
@@ -338,7 +355,8 @@ TRN *trnman_new_trn(WT_THD *wt)
     trn->min_read_from= trn->trid + 1;
   }
 
-  trn->commit_trid=  ~(TrID)0;
+  /* no other transaction can read changes done by this one */
+  trn->commit_trid=  MAX_TRID;
   trn->rec_lsn= trn->undo_lsn= trn->first_undo_lsn= 0;
   trn->used_tables= 0;
 
@@ -394,6 +412,7 @@ my_bool trnman_end_trn(TRN *trn, my_bool
 
   /* if a rollback, all UNDO records should have been executed */
   DBUG_ASSERT(commit || trn->undo_lsn == 0);
+  DBUG_ASSERT(trn != &dummy_transaction_object);
   DBUG_PRINT("info", ("pthread_mutex_lock LOCK_trn_list"));
 
   pthread_mutex_lock(&LOCK_trn_list);
@@ -429,7 +448,8 @@ my_bool trnman_end_trn(TRN *trn, my_bool
   }
 
   pthread_mutex_lock(&trn->state_lock);
-  trn->commit_trid= global_trid_generator;
+  if (commit)
+    trn->commit_trid= global_trid_generator;
   wt_thd_release_self(trn);
   pthread_mutex_unlock(&trn->state_lock);
 
@@ -502,7 +522,7 @@ my_bool trnman_end_trn(TRN *trn, my_bool
   running. It may even be called automatically on checkpoints if no
   transactions are running.
 */
-void trnman_free_trn(TRN *trn)
+static void trnman_free_trn(TRN *trn)
 {
   /*
      union is to solve strict aliasing issue.
@@ -580,6 +600,16 @@ int trnman_can_read_from(TRN *trn, TrID 
   return can;
 }
 
+/**
+  Finds a TRN by its TrID
+
+  @param trn    current trn. Needed for pinning pointers (see lf_pin)
+  @param trid   trid to search for
+
+  @return found trn or 0
+
+  @note that trn is returned with its state locked!
+*/
 TRN *trnman_trid_to_trn(TRN *trn, TrID trid)
 {
   TRN **found;
@@ -604,7 +634,7 @@ TRN *trnman_trid_to_trn(TRN *trn, TrID t
   lf_hash_search_unpin(trn->pins);
 
   /* Gotcha! */
-  return *found; /* note that TRN is returned locked !!! */
+  return *found;
 }
 
 /* TODO: the stubs below are waiting for savepoints to be implemented */
@@ -885,11 +915,22 @@ my_bool trnman_exists_active_transaction
   for (trn= active_list_min.next; trn != &active_list_max; trn= trn->next)
   {
     /*
-      We use >= for min_id as min_id is a commit_trid and trn->trid
-      is transaction id.  In the case they are the same, then the
-      trn started after the min_id was committed.
+      We use <= for max_id as max_id is a commit_trid and trn->trid
+      is transaction id.  When calculating commit_trid we use the
+      current value of global_trid_generator.  global_trid_generator is
+      incremented for each new transaction.
+
+      For example, assuming we have
+      min_id = 5
+      max_id = 10
+
+      A trid of value 5 can't see the history event between 5 & 10
+      as it was started before min_id 5 was committed.
+      A trid of value 10 can't see the next history event (max_id = 10)
+      as it started before this was committed. In this case it must use
+      this event.
     */
-    if (trn->trid >= min_id && trn->trid < max_id)
+    if (trn->trid > min_id && trn->trid <= max_id)
     {
       ret= 1;
       break;

=== modified file 'storage/maria/trnman.h'
--- a/storage/maria/trnman.h	2008-08-07 20:57:25 +0000
+++ b/storage/maria/trnman.h	2009-01-16 16:18:17 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -22,7 +22,7 @@ C_MODE_START
 #include "trnman_public.h"
 #include "ma_loghandler_lsn.h"
 
-/*
+/**
   trid - 6 uchar transaction identifier. Assigned when a transaction
   is created. Transaction can always be identified by its trid,
   even after transaction has ended.
@@ -33,7 +33,7 @@ C_MODE_START
   when short_id is 0, TRN is not initialized, for all practical purposes
   it could be considered unused.
 
-  when commit_trid is ~(TrID)0 the transaction is running, otherwise it's
+  when commit_trid is MAX_TRID the transaction is running, otherwise it's
   committed.
 
   state_lock mutex protects the state of a TRN, that is whether a TRN
@@ -46,16 +46,18 @@ struct st_ma_transaction
   LF_PINS              *pins;
   WT_THD               *wt;
   pthread_mutex_t      state_lock;
-  void                 *used_tables;  /* Tables used by transaction */
+  void                 *used_tables;  /**< Tables used by transaction */
   TRN                  *next, *prev;
   TrID                 trid, min_read_from, commit_trid;
   LSN		       rec_lsn, undo_lsn;
   LSN_WITH_FLAGS       first_undo_lsn;
   uint                 locked_tables;
   uint16               short_id;
+  uint16               flags;         /**< Various flags */
 };
 
 #define TRANSACTION_LOGGED_LONG_ID ULL(0x8000000000000000)
+#define MAX_TRID (~(TrID)0)
 
 extern WT_RESOURCE_TYPE ma_rc_dup_unique;
 

=== modified file 'storage/maria/trnman_public.h'
--- a/storage/maria/trnman_public.h	2008-12-08 20:09:59 +0000
+++ b/storage/maria/trnman_public.h	2009-01-16 16:18:17 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -44,7 +44,6 @@ my_bool trnman_end_trn(TRN *trn, my_bool
 #define trnman_commit_trn(T) trnman_end_trn(T, TRUE)
 #define trnman_abort_trn(T)  trnman_end_trn(T, FALSE)
 #define trnman_rollback_trn(T)  trnman_end_trn(T, FALSE)
-void trnman_free_trn(TRN *trn);
 int trnman_can_read_from(TRN *trn, TrID trid);
 TRN *trnman_trid_to_trn(TRN *trn, TrID trid);
 void trnman_new_statement(TRN *trn);
@@ -70,5 +69,17 @@ my_bool trnman_exists_active_transaction
 void trnman_lock();
 void trnman_unlock();
 my_bool trman_is_inited();
+#ifdef EXTRA_DEBUG
+uint16 trnman_get_flags(TRN *);
+void trnman_set_flags(TRN *, uint16 flags);
+#else
+#define trnman_get_flags(A) 0
+#define trnman_set_flags(A, B) do { } while (0)
+#endif
+
+/* Flag bits */
+#define TRN_STATE_INFO_LOGGED       1  /* Query is logged */
+#define TRN_STATE_TABLES_CAN_CHANGE 2  /* Things can change during trans. */
+
 C_MODE_END
 #endif

=== modified file 'storage/maria/unittest/ma_pagecache_consist.c'
--- a/storage/maria/unittest/ma_pagecache_consist.c	2008-05-29 15:44:11 +0000
+++ b/storage/maria/unittest/ma_pagecache_consist.c	2009-01-28 11:08:55 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2008 MySQL AB
+/* Copyright (C) 2006-2008 MySQL AB, 2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -368,7 +368,8 @@ int main(int argc __attribute__((unused)
     exit(1);
   }
   pagecache_file_init(file1, &dummy_callback, &dummy_callback,
-                      &dummy_fail_callback, &dummy_callback, NULL);
+                      &dummy_fail_callback, &dummy_callback, &dummy_callback,
+                      NULL);
   DBUG_PRINT("info", ("file1: %d", file1.file));
   if (my_chmod(file1_name, S_IRWXU | S_IRWXG | S_IRWXO, MYF(MY_WME)))
     exit(1);

=== modified file 'storage/maria/unittest/ma_pagecache_rwconsist.c'
--- a/storage/maria/unittest/ma_pagecache_rwconsist.c	2008-10-20 09:16:47 +0000
+++ b/storage/maria/unittest/ma_pagecache_rwconsist.c	2009-01-28 11:08:55 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2008 MySQL AB
+/* Copyright (C) 2006-2008 MySQL AB, 2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -238,7 +238,8 @@ int main(int argc __attribute__((unused)
     exit(1);
   }
   pagecache_file_init(file1, &dummy_callback, &dummy_callback,
-                      &dummy_fail_callback, &dummy_callback, NULL);
+                      &dummy_fail_callback, &dummy_callback, &dummy_callback,
+                      NULL);
   DBUG_PRINT("info", ("file1: %d", file1.file));
   if (my_chmod(file1_name, S_IRWXU | S_IRWXG | S_IRWXO, MYF(MY_WME)))
     exit(1);

=== modified file 'storage/maria/unittest/ma_pagecache_rwconsist2.c'
--- a/storage/maria/unittest/ma_pagecache_rwconsist2.c	2008-10-20 13:03:34 +0000
+++ b/storage/maria/unittest/ma_pagecache_rwconsist2.c	2009-01-28 11:08:55 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2008 MySQL AB, 2008 Sun Microsystems, Inc.
+/* Copyright (C) 2006-2008 MySQL AB, 2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -234,7 +234,8 @@ int main(int argc __attribute__((unused)
     exit(1);
   }
   pagecache_file_init(file1, &dummy_callback, &dummy_callback,
-                      &dummy_fail_callback, &dummy_callback, NULL);
+                      &dummy_fail_callback, &dummy_callback, &dummy_callback,
+                      NULL);
   DBUG_PRINT("info", ("file1: %d", file1.file));
   if (my_chmod(file1_name, S_IRWXU | S_IRWXG | S_IRWXO, MYF(MY_WME)))
     exit(1);

=== modified file 'storage/maria/unittest/ma_pagecache_single.c'
--- a/storage/maria/unittest/ma_pagecache_single.c	2008-10-20 09:16:47 +0000
+++ b/storage/maria/unittest/ma_pagecache_single.c	2009-01-28 11:08:55 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2008 MySQL AB
+/* Copyright (C) 2006-2008 MySQL AB, 2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -673,7 +673,8 @@ int main(int argc __attribute__((unused)
     exit(1);
   }
   pagecache_file_init(file1, &dummy_callback, &dummy_callback,
-                      &dummy_fail_callback, &dummy_callback, NULL);
+                      &dummy_fail_callback, &dummy_callback, &dummy_callback,
+                      NULL);
   my_close(tmp_file, MYF(0));
   my_delete(file2_name, MYF(0));
 

=== modified file 'storage/maria/unittest/ma_test_loghandler_pagecache-t.c'
--- a/storage/maria/unittest/ma_test_loghandler_pagecache-t.c	2008-10-20 09:16:47 +0000
+++ b/storage/maria/unittest/ma_test_loghandler_pagecache-t.c	2009-01-28 11:08:55 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2008 MySQL AB
+/* Copyright (C) 2006-2008 MySQL AB, 2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -153,7 +153,8 @@ int main(int argc __attribute__((unused)
     exit(1);
   }
   pagecache_file_init(file1, &dummy_callback, &dummy_callback,
-                      &dummy_fail_callback, maria_flush_log_for_page, NULL);
+                      &dummy_fail_callback, maria_flush_log_for_page,
+                      &dummy_callback, NULL);
   if (my_chmod(file1_name, S_IRWXU | S_IRWXG | S_IRWXO, MYF(MY_WME)))
     exit(1);
 

=== modified file 'storage/maria/unittest/trnman-t.c'
--- a/storage/maria/unittest/trnman-t.c	2008-10-20 09:16:47 +0000
+++ b/storage/maria/unittest/trnman-t.c	2009-02-13 16:30:54 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -40,7 +40,8 @@ pthread_handler_t test_trnman(void *arg)
   TRN    *trn[MAX_ITER];
   int    m= (*(int *)arg);
 
-  my_thread_init();
+  if (my_thread_init())
+    BAIL_OUT("my_thread_init failed!");
 
   for (x= ((int)(intptr)(&m)); m > 0; )
   {

=== modified file 'storage/myisam/mi_create.c'
--- a/storage/myisam/mi_create.c	2009-01-31 16:21:19 +0000
+++ b/storage/myisam/mi_create.c	2009-02-13 16:30:54 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2000-2006 MySQL AB
+/* Copyright (C) 2000-2006 MySQL AB, 2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -496,7 +496,8 @@ int mi_create(const char *name,uint keys
   /* There are only 16 bits for the total header length. */
   if (info_length > 65535)
   {
-    my_printf_error(0, "MyISAM table '%s' has too many columns and/or "
+    my_printf_error(HA_WRONG_CREATE_OPTION,
+                    "MyISAM table '%s' has too many columns and/or "
                     "indexes and/or unique constraints.",
                     MYF(0), name + dirname_length(name));
     my_errno= HA_WRONG_CREATE_OPTION;
@@ -855,6 +856,10 @@ int mi_create(const char *name,uint keys
 
 err:
   pthread_mutex_unlock(&THR_LOCK_myisam);
+#ifdef THREAD
+  my_atomic_rwlock_destroy(&share.physical_logging_rwlock);
+#endif
+
 err_no_lock:
 
   save_errno=my_errno;
@@ -876,9 +881,6 @@ err_no_lock:
                                        MY_UNPACK_FILENAME | MY_APPEND_EXT),
 			     MYF(0));
   }
-#ifdef THREAD
-  my_atomic_rwlock_destroy(&share.physical_logging_rwlock);
-#endif
   my_free((char*) rec_per_key_part, MYF(0));
   DBUG_RETURN(my_errno=save_errno);		/* return the fatal errno */
 }

=== modified file 'storage/myisam/mi_delete.c'
--- a/storage/myisam/mi_delete.c	2008-07-09 07:12:43 +0000
+++ b/storage/myisam/mi_delete.c	2009-01-28 11:08:55 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2000-2006 MySQL AB
+/* Copyright (C) 2000-2006 MySQL AB, 2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -117,7 +117,7 @@ err:
   save_errno=my_errno;
   mi_sizestore(lastpos,info->lastpos);
   myisam_log_command_logical(MI_LOG_DELETE, info,
-                           (uchar*) lastpos, sizeof(lastpos), 0);
+                           (uchar*) lastpos, sizeof(lastpos), save_errno);
   if (save_errno != HA_ERR_RECORD_CHANGED)
   {
     mi_print_error(info->s, HA_ERR_CRASHED);

=== modified file 'storage/myisam/mi_update.c'
--- a/storage/myisam/mi_update.c	2008-07-09 07:12:43 +0000
+++ b/storage/myisam/mi_update.c	2009-01-28 11:08:55 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2000-2006 MySQL AB
+/* Copyright (C) 2000-2006 MySQL AB, 2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -233,7 +233,7 @@ err:
 
  err_end:
   myisam_log_record_logical(MI_LOG_UPDATE, info, newrec,
-                            info->lastpos, my_errno);
+                            info->lastpos, save_errno);
   (void) _mi_writeinfo(info, WRITEINFO_UPDATE_KEYFILE);
   allow_break();				/* Allow SIGHUP & SIGINT */
   if (save_errno == HA_ERR_KEY_NOT_FOUND)

=== modified file 'storage/myisam/mi_write.c'
--- a/storage/myisam/mi_write.c	2008-07-09 07:12:43 +0000
+++ b/storage/myisam/mi_write.c	2009-01-28 11:08:55 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2000-2006 MySQL AB
+/* Copyright (C) 2000-2006 MySQL AB, 2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -232,7 +232,7 @@ err:
   my_errno=save_errno;
 err2:
   save_errno=my_errno;
-  myisam_log_record_logical(MI_LOG_WRITE, info, record, filepos, my_errno);
+  myisam_log_record_logical(MI_LOG_WRITE, info, record, filepos, save_errno);
   (void) _mi_writeinfo(info,WRITEINFO_UPDATE_KEYFILE);
   allow_break();			/* Allow SIGHUP & SIGINT */
   DBUG_RETURN(my_errno=save_errno);

=== modified file 'storage/myisam/myisam_backup_engine.cc'
--- a/storage/myisam/myisam_backup_engine.cc	2009-02-04 10:49:16 +0000
+++ b/storage/myisam/myisam_backup_engine.cc	2009-02-13 12:40:13 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007 MySQL AB
+/* Copyright (C) 2007 MySQL AB, 2008 - 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -119,6 +119,23 @@ using backup::Buffer;
 */
 #define MYISAM_BACKUP_VERSION 1
 
+/**
+  Restore kernel opens tables and locks them for the duration of restore
+  (after having created them empty); this means that cached objects stay
+  around (MYISAM_SHARE, MI_INFO) and can become out-of-sync with the
+  data/index file filled by the driver, unless we take precautions which are
+  recognizable by this symbol.
+*/
+#define RESTORE_KERNEL_KEEPS_OPEN_TABLES 1
+/**
+  Restore kernel leaves a time window between end of creation of table (via
+  execution of CREATE TABLE) and locking of this table; in this window another
+  client can open/lock/modify/unlock the table, which conflicts with what the
+  driver is going to write to the data/index file, unless we take precautions
+  which are recognizable by this symbol.
+*/
+#define RESTORE_KERNEL_NOT_ATOMIC 1
+
 /** Like Table_ref but with file name added */
 class Myisam_table_ref
 {
@@ -546,7 +563,7 @@ result_t Backup::begin(const size_t)
                   (my_hash_get_key)backup_get_table_from_hash_key,
                   (my_hash_free_key)backup_free_hash_key, 0))
     SET_STATE_TO_ERROR_AND_DBUG_RETURN;
-  /* Build the hash of tables for the MyISAM layer (mi_backup_log.c etc) */
+  /* Build the hash of tables for the MyISAM layer (mi_log.c etc) */
   for (uint n=0 ; n < m_tables.count() ; n++ )
   {
     char path[FN_REFLEN];
@@ -877,7 +894,7 @@ end2:
 
 /** Entry point for the locking thread */
 
-pthread_handler_t separate_thread_for_locking(void *arg)
+pthread_handler_t myisam_backup_separate_thread_for_locking(void *arg)
 {
   my_thread_init();
   DBUG_PRINT("info", ("myisam_backup::separate_thread_for_locking"));
@@ -909,7 +926,7 @@ result_t Backup::prelock()
   {
     pthread_t th;
     if (pthread_create(&th, &connection_attrib,
-                       separate_thread_for_locking, this))
+                       myisam_backup_separate_thread_for_locking, this))
     {
       lock_state= LOCK_ERROR;
       SET_STATE_TO_ERROR_AND_DBUG_RETURN;
@@ -1664,6 +1681,19 @@ Table_restore::Table_restore(const Table
     /* table does not exist or is corrupted? not normal, it's just created */
     goto err;
   }
+#ifdef RESTORE_KERNEL_NOT_ATOMIC
+  /*
+    Restore kernel leaves a window between creation of table and locking it;
+    in this window, another thread can modify the table, put pages in page
+    cache, alter state, increase the file's length to greater than what the
+    driver has to write...
+    So we re-empty it here. We know we are alone using the table at this
+    point, as restore kernel has finished locking tables.
+    See BUG#42519, BUG#41716.
+  */
+  if (mi_delete_all_rows(mi_info))
+    goto err;
+#endif
   /*
     It's ok to copy the kfile descriptor and write() to it as the upper layers
     guarantee that we are the only user of the brand new table (nobody will
@@ -1714,6 +1744,7 @@ result_t Table_restore::close()
       (kfile_restore.close_file() != backup::OK))
     SET_STATE_TO_ERROR_AND_DBUG_RETURN;
 
+#ifdef RESTORE_KERNEL_KEEPS_OPEN_TABLES
   /*
     CAUTION! Ugliest hack ever!
     This hack tries to recover from bypassing the MyISAM interface
@@ -1799,11 +1830,10 @@ result_t Table_restore::close()
       We must find this instance and fix the local info.
       Fortunately there is a state pointer, which can be set to the
       share. This invalidates the instance's local copy.
-      We need to acquire share->intern_lock when traversing the list
-      of open MyISAM instances.
     */
     {
       LIST *list_element ;
+      pthread_mutex_lock(&THR_LOCK_myisam);
       pthread_mutex_lock(&share->intern_lock);
       for (list_element= myisam_open_list;
            list_element;
@@ -1813,6 +1843,7 @@ result_t Table_restore::close()
         if (tmpinfo->s == share)
           tmpinfo->state= &share->state.state;
       }
+      pthread_mutex_unlock(&THR_LOCK_myisam);
       pthread_mutex_unlock(&share->intern_lock);
     }
     if (mi_close(mi_info))
@@ -1825,6 +1856,7 @@ result_t Table_restore::close()
   end :
     do {} while (0); /* Empty statement, syntactically required. */
   }
+#endif
 
   DBUG_RETURN(backup::OK);
 }
@@ -1866,7 +1898,6 @@ result_t Table_restore::post_restore()
   Vio* save_vio;
   DBUG_ENTER("myisam_backup::Table_restore::post_restore");
 
-  if (!rebuild_index)
   {
     MI_INFO *mi_info;
     MYISAM_SHARE *share;
@@ -1893,9 +1924,11 @@ result_t Table_restore::post_restore()
       error= mi_state_info_write(share, share->kfile, &share->state, 1);
     }
     error|= mi_close(mi_info);
-    goto err;
   }
 
+  if (!rebuild_index)
+    goto err;
+
   /*
     myisamchk() as well as ha_myisam::repair() do a lot of operations before
     and after mi_repair(); to not duplicate code we reuse one of them.

=== modified file 'storage/myisam/myisamdef.h'
--- a/storage/myisam/myisamdef.h	2009-01-30 14:13:39 +0000
+++ b/storage/myisam/myisamdef.h	2009-02-13 16:30:54 +0000
@@ -203,15 +203,15 @@ typedef struct st_mi_isam_share
   ulong min_pack_length;                /* Theese are used by packed data */
   ulong max_pack_length;
   ulong state_diff_length;
-  uint rec_reflength;                   /* rec_reflength in use now */
-  uint unique_name_length;
+  uint	rec_reflength;			/* rec_reflength in use now */
+  uint  unique_name_length;
   uint32 ftkeys;                        /* Number of full-text keys + 1 */
-  File kfile;                           /* Shared keyfile */
-  File data_file;                       /* Shared data file */
-  int mode;                             /* mode of file on open */
-  uint reopen;                          /* How many times reopened */
-  uint w_locks, r_locks, tot_locks;     /* Number of read/write locks */
-  uint blocksize;                       /* blocksize of keyfile */
+  File	kfile;				/* Shared keyfile */
+  File	data_file;			/* Shared data file */
+  int	mode;				/* mode of file on open */
+  uint	reopen;				/* How many times reopened */
+  uint	w_locks,r_locks,tot_locks;	/* Number of read/write locks */
+  uint	blocksize;			/* blocksize of keyfile */
   myf write_flag;
   enum data_file_type data_file_type;
   /* Below flag is needed to make log tables work with concurrent insert */

=== modified file 'support-files/mysql.spec.sh'
--- a/support-files/mysql.spec.sh	2008-12-28 07:07:52 +0000
+++ b/support-files/mysql.spec.sh	2009-02-13 16:30:54 +0000
@@ -387,7 +387,12 @@ CFLAGS="$CFLAGS" \
 CXXFLAGS="$CXXFLAGS" \
 BuildMySQL "\
 		--with-debug \
-		--with-comment=\"MySQL Community Server - Debug (%{license})\"")
+%if %{MARIA_BUILD}
+		--with-comment=\"MySQL Community Server - Debug [Maria] (%{license})\" \
+%else
+		--with-comment=\"MySQL Community Server - Debug (%{license})\" \
+%endif
+")
 
 # We might want to save the config log file
 if test -n "$MYSQL_DEBUGCONFLOG_DEST"

=== modified file 'unittest/mysys/lf-t.c'
--- a/unittest/mysys/lf-t.c	2008-07-29 14:10:24 +0000
+++ b/unittest/mysys/lf-t.c	2009-01-15 21:27:36 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2008-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -13,6 +13,12 @@
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
 
+/**
+  @file
+
+  Unit tests for lock-free algorithms of mysys
+*/
+
 #include "thr_template.c"
 
 #include <lf.h>
@@ -47,6 +53,10 @@ pthread_handler_t test_lf_pinbox(void *a
   return 0;
 }
 
+/*
+  thread local data area, allocated using lf_alloc.
+  union is required to enforce the minimum required element size (sizeof(ptr))
+*/
 typedef union {
   int32 data;
   void *not_used;

=== modified file 'unittest/mysys/my_atomic-t.c'
--- a/unittest/mysys/my_atomic-t.c	2008-10-20 09:16:47 +0000
+++ b/unittest/mysys/my_atomic-t.c	2009-02-13 16:30:54 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2006-2008 MySQL AB, 2008 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by

=== modified file 'unittest/mysys/thr_template.c'
--- a/unittest/mysys/thr_template.c	2008-08-29 19:50:04 +0000
+++ b/unittest/mysys/thr_template.c	2009-01-15 21:27:36 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2006-2008 MySQL AB, 2008 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by

=== modified file 'unittest/mysys/waiting_threads-t.c'
--- a/unittest/mysys/waiting_threads-t.c	2008-10-21 19:31:14 +0000
+++ b/unittest/mysys/waiting_threads-t.c	2009-02-12 10:06:03 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -16,7 +16,6 @@
 #include "thr_template.c"
 #include <waiting_threads.h>
 #include <m_string.h>
-#include <locale.h>
 
 struct test_wt_thd {
   WT_THD thd;
@@ -25,12 +24,14 @@ struct test_wt_thd {
 
 uint i, cnt;
 pthread_mutex_t lock;
+pthread_cond_t thread_sync;
 
 ulong wt_timeout_short=100, wt_deadlock_search_depth_short=4;
 ulong wt_timeout_long=10000, wt_deadlock_search_depth_long=15;
 
 #define reset(ARRAY) bzero(ARRAY, sizeof(ARRAY))
 
+/* see explanation of the kill strategies in waiting_threads.h */
 enum { LATEST, RANDOM, YOUNGEST, LOCKS } kill_strategy;
 
 WT_RESOURCE_TYPE restype={ wt_resource_id_memcmp, 0};
@@ -48,9 +49,19 @@ pthread_handler_t test_wt(void *arg)
 
   my_thread_init();
 
-  pthread_mutex_lock(&lock);
+  pthread_mutex_lock(&mutex);
   id= cnt++;
-  pthread_mutex_unlock(&lock);
+  wt_thd_lazy_init(& thds[id].thd,
+                   & wt_deadlock_search_depth_short, & wt_timeout_short,
+                   & wt_deadlock_search_depth_long, & wt_timeout_long);
+
+  /* now, wait for everybody to be ready to run */
+  if (cnt >= THREADS)
+    pthread_cond_broadcast(&thread_sync);
+  else
+    while (cnt < THREADS)
+      pthread_cond_wait(&thread_sync, &mutex);
+  pthread_mutex_unlock(&mutex);
 
   my_rnd_init(&rand, (ulong)(intptr)&m, id);
   if (kill_strategy == YOUNGEST)
@@ -68,13 +79,14 @@ pthread_handler_t test_wt(void *arg)
 
     res= 0;
 
+    /* prepare for waiting for a random number of random threads */
     for (j= n= (rnd() % THREADS)/10; !res && j >= 0; j--)
     {
 retry:
-      i= rnd() % (THREADS-1);
-      if (i >= id) i++;
+      i= rnd() % (THREADS-1); /* pick a random thread */
+      if (i >= id) i++;   /* with a number from 0 to THREADS-1 excluding ours */
 
-      for (k=n; k >=j; k--)
+      for (k=n; k >=j; k--) /* the one we didn't pick before */
         if (blockers[k] == i)
           goto retry;
       blockers[j]= i;
@@ -110,23 +122,23 @@ retry:
       thds[id].thd.weight++;
   }
 
+  pthread_mutex_lock(&mutex);
+  /* wait for everybody to finish */
+  if (!--cnt)
+    pthread_cond_broadcast(&thread_sync);
+  else
+    while (cnt)
+      pthread_cond_wait(&thread_sync, &mutex);
+
   pthread_mutex_lock(& thds[id].lock);
   pthread_mutex_lock(&lock);
   wt_thd_release_all(& thds[id].thd);
   pthread_mutex_unlock(&lock);
   pthread_mutex_unlock(& thds[id].lock);
+  wt_thd_destroy(& thds[id].thd);
 
-#ifndef DBUG_OFF
-  {
-#define DEL "(deleted)"
-    char *x=malloc(strlen(thds[id].thd.name)+sizeof(DEL)+1);
-    strxmov(x, thds[id].thd.name, DEL, 0);
-    thds[id].thd.name=x; /* it's a memory leak, go on, shoot me */
-  }
-#endif
-
-  pthread_mutex_lock(&mutex);
-  if (!--running_threads) pthread_cond_signal(&cond);
+  if (!--running_threads) /* now, signal when everybody is done with deinit */
+    pthread_cond_signal(&cond);
   pthread_mutex_unlock(&mutex);
   DBUG_PRINT("wt", ("exiting"));
   my_thread_end();
@@ -136,6 +148,7 @@ retry:
 void do_one_test()
 {
   double sum, sum0;
+  DBUG_ENTER("do_one_test");
 
   reset(wt_cycle_stats);
   reset(wt_wait_stats);
@@ -161,31 +174,33 @@ void do_one_test()
            wt_wait_table[cnt], wt_wait_stats[cnt]);
   diag("timed out: %u", wt_wait_stats[cnt]);
   diag("successes: %u", wt_success_stats);
+
+  DBUG_VOID_RETURN;
 }
 
 void do_tests()
 {
-  plan(12);
-  compile_time_assert(THREADS >= 3);
+  DBUG_ENTER("do_tests");
+  plan(14);
+  compile_time_assert(THREADS >= 4);
 
   DBUG_PRINT("wt", ("================= initialization ==================="));
 
   bad= my_atomic_initialize();
   ok(!bad, "my_atomic_initialize() returned %d", bad);
 
+  pthread_cond_init(&thread_sync, 0);
   pthread_mutex_init(&lock, 0);
   wt_init();
   for (cnt=0; cnt < THREADS; cnt++)
-  {
-    wt_thd_lazy_init(& thds[cnt].thd,
-                     & wt_deadlock_search_depth_short, & wt_timeout_short,
-                     & wt_deadlock_search_depth_long, & wt_timeout_long);
     pthread_mutex_init(& thds[cnt].lock, 0);
-  }
   {
-    WT_RESOURCE_ID resid[3];
-    for (i=0; i < 3; i++)
+    WT_RESOURCE_ID resid[4];
+    for (i=0; i < array_elements(resid); i++)
     {
+      wt_thd_lazy_init(& thds[i].thd,
+                       & wt_deadlock_search_depth_short, & wt_timeout_short,
+                       & wt_deadlock_search_depth_long, & wt_timeout_long);
       resid[i].value= i+1;
       resid[i].type= &restype;
     }
@@ -206,28 +221,26 @@ void do_tests()
     pthread_mutex_lock(&lock);
     bad= wt_thd_cond_timedwait(& thds[0].thd, &lock);
     pthread_mutex_unlock(&lock);
-    ok(bad == ETIMEDOUT, "timeout test returned %d", bad);
+    ok(bad == WT_TIMEOUT, "timeout test returned %d", bad);
 
     ok_wait(0,1,0);
     ok_wait(1,2,1);
     ok_deadlock(2,0,2);
 
     pthread_mutex_lock(&lock);
-    wt_thd_cond_timedwait(& thds[0].thd, &lock);
-    wt_thd_cond_timedwait(& thds[1].thd, &lock);
+    ok(wt_thd_cond_timedwait(& thds[0].thd, &lock) == WT_TIMEOUT, "as always");
+    ok(wt_thd_cond_timedwait(& thds[1].thd, &lock) == WT_TIMEOUT, "as always");
     wt_thd_release_all(& thds[0].thd);
     wt_thd_release_all(& thds[1].thd);
     wt_thd_release_all(& thds[2].thd);
     wt_thd_release_all(& thds[3].thd);
-    pthread_mutex_unlock(&lock);
 
-    for (cnt=0; cnt < 3; cnt++)
+    for (i=0; i < array_elements(resid); i++)
     {
-      wt_thd_destroy(& thds[cnt].thd);
-      wt_thd_lazy_init(& thds[cnt].thd,
-                       & wt_deadlock_search_depth_short, & wt_timeout_short,
-                       & wt_deadlock_search_depth_long, & wt_timeout_long);
+      wt_thd_release_all(& thds[i].thd);
+      wt_thd_destroy(& thds[i].thd);
     }
+    pthread_mutex_unlock(&lock);
   }
 
   wt_deadlock_search_depth_short=6;
@@ -243,27 +256,23 @@ void do_tests()
 
 #define test_kill_strategy(X)                   \
   diag("kill strategy: " #X);                   \
+  DBUG_EXECUTE("reset_file",                    \
+               { rewind(DBUG_FILE); ftruncate(fileno(DBUG_FILE), 0); }); \
+  DBUG_PRINT("info", ("kill strategy: " #X));   \
   kill_strategy=X;                              \
   do_one_test();
 
   test_kill_strategy(LATEST);
-  SKIP_BIG_TESTS(1)
-  {
-    test_kill_strategy(RANDOM);
-  }
+  test_kill_strategy(RANDOM);
   test_kill_strategy(YOUNGEST);
   test_kill_strategy(LOCKS);
 
   DBUG_PRINT("wt", ("================= cleanup ==================="));
-  pthread_mutex_lock(&lock);
   for (cnt=0; cnt < THREADS; cnt++)
-  {
-    wt_thd_release_all(& thds[cnt].thd);
-    wt_thd_destroy(& thds[cnt].thd);
     pthread_mutex_destroy(& thds[cnt].lock);
-  }
-  pthread_mutex_unlock(&lock);
   wt_end();
   pthread_mutex_destroy(&lock);
+  pthread_cond_destroy(&thread_sync);
+  DBUG_VOID_RETURN;
 }
 

Thread
bzr push into mysql-6.0 branch (serg:2715 to 2718) Sergei Golubchik 17 Feb