List: Commits « Previous Message | Next Message »
From: konstantin Date: March 6 2007 4:25pm
Subject: bk commit into 5.1 tree (kostja:1.2452)
View as plain text  
Below is the list of changes that have just been committed into a local
5.1 repository of kostja. When kostja does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet@stripped, 2007-03-06 19:24:52+03:00, kostja@stripped +30 -0
  Merge bodhi.local:/opt/local/work/mysql-5.0-runtime
  into  bodhi.local:/opt/local/work/mysql-5.1-runtime-merge
  MERGE: 1.1810.2372.115

  BitKeeper/deleted/.del-ha_berkeley.cc@stripped, 2007-03-06 18:55:01+03:00, kostja@stripped +0 -0
    Auto merged
    MERGE: 1.151.9.2

  BitKeeper/deleted/.del-ha_berkeley.cc@stripped, 2007-03-06 18:55:00+03:00, kostja@stripped +0 -0
    Merge rename: sql/ha_berkeley.cc -> BitKeeper/deleted/.del-ha_berkeley.cc

  client/mysql_upgrade.c@stripped, 2007-03-06 19:24:46+03:00, kostja@stripped +1 -2
    Manual merge.
    MERGE: 1.6.1.12

  client/mysqltest.c@stripped, 2007-03-06 19:24:46+03:00, kostja@stripped +0 -2
    Manual merge.
    MERGE: 1.155.9.88

  cmd-line-utils/readline/xmalloc.c@stripped, 2007-03-06 18:55:01+03:00, kostja@stripped +0 -3
    Auto merged
    MERGE: 1.8.1.1

  include/my_dbug.h@stripped, 2007-03-06 18:55:01+03:00, kostja@stripped +0 -2
    Auto merged
    MERGE: 1.16.2.6

  mysql-test/mysql-test-run.pl@stripped, 2007-03-06 18:55:01+03:00, kostja@stripped +0 -0
    Auto merged
    MERGE: 1.30.49.55

  mysql-test/r/subselect.result@stripped, 2007-03-06 19:00:06+03:00, kostja@stripped +0 -51
    Use local.
    MERGE: 1.134.1.42

  mysql-test/t/disabled.def@stripped, 2007-03-06 18:55:01+03:00, kostja@stripped +0 -0
    Auto merged
    MERGE: 1.2.4.24

  mysql-test/t/subselect.test@stripped, 2007-03-06 19:24:46+03:00, kostja@stripped +46 -47
    Manual merge.
    MERGE: 1.129.1.14

  server-tools/instance-manager/instance_options.cc@stripped, 2007-03-06 18:55:01+03:00, kostja@stripped +0 -0
    Auto merged
    MERGE: 1.29.1.7

  server-tools/instance-manager/mysqlmanager.cc@stripped, 2007-03-06 18:55:02+03:00, kostja@stripped +0 -0
    Auto merged
    MERGE: 1.17.1.5

  sql/field.cc@stripped, 2007-03-06 19:24:46+03:00, kostja@stripped +0 -3
    Manual merge.
    MERGE: 1.256.1.85

  sql/ha_ndbcluster.cc@stripped, 2007-03-06 19:16:09+03:00, kostja@stripped +0 -22
    Use local
    MERGE: 1.175.1.125

  sql/item_cmpfunc.cc@stripped, 2007-03-06 18:55:03+03:00, kostja@stripped +0 -0
    Auto merged
    MERGE: 1.187.1.50

  sql/item_cmpfunc.h@stripped, 2007-03-06 18:55:03+03:00, kostja@stripped +0 -0
    Auto merged
    MERGE: 1.122.2.20

  sql/item_func.cc@stripped, 2007-03-06 18:55:03+03:00, kostja@stripped +0 -0
    Auto merged
    MERGE: 1.270.1.56

  sql/item_subselect.cc@stripped, 2007-03-06 18:55:03+03:00, kostja@stripped +0 -0
    Auto merged
    MERGE: 1.113.1.40

  sql/item_subselect.h@stripped, 2007-03-06 18:55:03+03:00, kostja@stripped +0 -0
    Auto merged
    MERGE: 1.77.1.11

  sql/log.cc@stripped, 2007-03-06 18:55:04+03:00, kostja@stripped +0 -2
    Auto merged
    MERGE: 1.158.1.47

  sql/slave.cc@stripped, 2007-03-06 18:55:04+03:00, kostja@stripped +0 -1
    Auto merged
    MERGE: 1.241.1.54

  sql/sp_head.cc@stripped, 2007-03-06 18:55:04+03:00, kostja@stripped +0 -0
    Auto merged
    MERGE: 1.200.7.11

  sql/sql_base.cc@stripped, 2007-03-06 19:24:46+03:00, kostja@stripped +4 -6
    Manual merge.
    MERGE: 1.235.1.133

  sql/sql_class.cc@stripped, 2007-03-06 18:55:05+03:00, kostja@stripped +0 -0
    Auto merged
    MERGE: 1.223.1.39

  sql/sql_lex.cc@stripped, 2007-03-06 18:55:05+03:00, kostja@stripped +0 -0
    Auto merged
    MERGE: 1.142.1.72

  sql/sql_lex.h@stripped, 2007-03-06 18:55:05+03:00, kostja@stripped +0 -0
    Auto merged
    MERGE: 1.175.1.64

  sql/sql_parse.cc@stripped, 2007-03-06 18:55:05+03:00, kostja@stripped +0 -0
    Auto merged
    MERGE: 1.426.1.181

  sql/sql_repl.cc@stripped, 2007-03-06 18:55:06+03:00, kostja@stripped +0 -1
    Auto merged
    MERGE: 1.133.1.26

  sql/sql_yacc.yy@stripped, 2007-03-06 19:24:46+03:00, kostja@stripped +1 -2
    Manual merge.
    MERGE: 1.371.1.137

  storage/archive/ha_archive.cc@stripped, 2007-03-06 19:22:50+03:00, kostja@stripped +0 -3
    Use local.
    MERGE: 1.60.14.3

  storage/archive/ha_archive.cc@stripped, 2007-03-06 18:55:00+03:00, kostja@stripped +0 -0
    Merge rename: sql/ha_archive.cc -> storage/archive/ha_archive.cc

  support-files/compiler_warnings.supp@stripped, 2007-03-06 19:23:14+03:00, kostja@stripped +0 -56
    Use local
    MERGE: 1.2.1.6

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User:	kostja
# Host:	bodhi.local
# Root:	/opt/local/work/mysql-5.1-runtime-merge/RESYNC

--- 1.151.9.1/sql/ha_berkeley.cc	2007-02-28 23:23:33 +03:00
+++ 1.189/BitKeeper/deleted/.del-ha_berkeley.cc	2007-03-06 18:55:01 +03:00
@@ -24,7 +24,8 @@
     We will need an updated Berkeley DB version for this.
   - Killing threads that has got a 'deadlock'
   - SHOW TABLE STATUS should give more information about the table.
-  - Get a more accurate count of the number of rows (estimate_rows_upper_bound()).
+  - Get a more accurate count of the number of rows
+    (estimate_rows_upper_bound()).
     We could store the found number of rows when the table is scanned and
     then increment the counter for each attempted write.
   - We will need to extend the manager thread to makes checkpoints at
@@ -52,14 +53,17 @@
 
 #include "mysql_priv.h"
 
-#ifdef HAVE_BERKELEY_DB
 #include <m_ctype.h>
 #include <myisampack.h>
 #include <hash.h>
+
+#ifdef WITH_BERKELEY_STORAGE_ENGINE
 #include "ha_berkeley.h"
 #include "sql_manager.h"
 #include <stdarg.h>
 
+#include <mysql/plugin.h>
+
 #define HA_BERKELEY_ROWS_IN_TABLE 10000 /* to get optimization right */
 #define HA_BERKELEY_RANGE_COUNT   100
 #define HA_BERKELEY_MAX_ROWS	  10000000 /* Max rows in table */
@@ -71,13 +75,21 @@
 #define STATUS_ROW_COUNT_INIT	2
 #define STATUS_BDB_ANALYZE	4
 
+const u_int32_t bdb_DB_TXN_NOSYNC= DB_TXN_NOSYNC;
+const u_int32_t bdb_DB_RECOVER= DB_RECOVER;
+const u_int32_t bdb_DB_PRIVATE= DB_PRIVATE;
+const u_int32_t bdb_DB_DIRECT_DB= DB_DIRECT_DB;
+const u_int32_t bdb_DB_DIRECT_LOG= DB_DIRECT_LOG;
 const char *ha_berkeley_ext=".db";
 bool berkeley_shared_data=0;
-u_int32_t berkeley_init_flags= DB_PRIVATE | DB_RECOVER, berkeley_env_flags=0,
-          berkeley_lock_type=DB_LOCK_DEFAULT;
-ulong berkeley_cache_size, berkeley_log_buffer_size, berkeley_log_file_size=0;
+u_int32_t berkeley_init_flags= DB_PRIVATE | DB_RECOVER,
+          berkeley_env_flags= DB_LOG_AUTOREMOVE,
+          berkeley_lock_type= DB_LOCK_DEFAULT;
+ulong berkeley_log_buffer_size=0 , berkeley_log_file_size=0;
+ulonglong berkeley_cache_size= 0;
 char *berkeley_home, *berkeley_tmpdir, *berkeley_logdir;
 long berkeley_lock_scan_time=0;
+ulong berkeley_region_size=0, berkeley_cache_parts=1;
 ulong berkeley_trans_retry=1;
 ulong berkeley_max_lock;
 pthread_mutex_t bdb_mutex;
@@ -85,14 +97,21 @@
 static DB_ENV *db_env;
 static HASH bdb_open_tables;
 
+static const char berkeley_hton_name[]= "BerkeleyDB";
+static const int berkeley_hton_name_length=sizeof(berkeley_hton_name)-1;
+
 const char *berkeley_lock_names[] =
-{ "DEFAULT", "OLDEST","RANDOM","YOUNGEST",0 };
+{ "DEFAULT", "OLDEST", "RANDOM", "YOUNGEST", "EXPIRE", "MAXLOCKS",
+  "MAXWRITE", "MINLOCKS", "MINWRITE", 0 };
 u_int32_t berkeley_lock_types[]=
-{ DB_LOCK_DEFAULT, DB_LOCK_OLDEST, DB_LOCK_RANDOM };
+{ DB_LOCK_DEFAULT, DB_LOCK_OLDEST, DB_LOCK_RANDOM, DB_LOCK_YOUNGEST,
+  DB_LOCK_EXPIRE, DB_LOCK_MAXLOCKS, DB_LOCK_MAXWRITE, DB_LOCK_MINLOCKS,
+  DB_LOCK_MINWRITE };
 TYPELIB berkeley_lock_typelib= {array_elements(berkeley_lock_names)-1,"",
 				berkeley_lock_names, NULL};
 
-static void berkeley_print_error(const char *db_errpfx, char *buffer);
+static void berkeley_print_error(const DB_ENV *db_env, const char *db_errpfx,
+                                 const char *buffer);
 static byte* bdb_get_key(BDB_SHARE *share,uint *length,
 			 my_bool not_used __attribute__((unused)));
 static BDB_SHARE *get_share(const char *table_name, TABLE *table);
@@ -100,50 +119,53 @@
 		      bool mutex_is_locked);
 static int write_status(DB *status_block, char *buff, uint length);
 static void update_status(BDB_SHARE *share, TABLE *table);
-static void berkeley_noticecall(DB_ENV *db_env, db_notices notice);
 
 static int berkeley_close_connection(THD *thd);
 static int berkeley_commit(THD *thd, bool all);
 static int berkeley_rollback(THD *thd, bool all);
+static int berkeley_rollback_to_savepoint(THD* thd, void *savepoint);
+static int berkeley_savepoint(THD* thd, void *savepoint);
+static int berkeley_release_savepoint(THD* thd, void *savepoint);
+static handler *berkeley_create_handler(TABLE_SHARE *table,
+                                        MEM_ROOT *mem_root);
 
-handlerton berkeley_hton = {
-  "BerkeleyDB",
-  SHOW_OPTION_YES,
-  "Supports transactions and page-level locking", 
-  DB_TYPE_BERKELEY_DB,
-  berkeley_init,
-  0, /* slot */
-  0, /* savepoint size */
-  berkeley_close_connection,
-  NULL, /* savepoint_set */
-  NULL, /* savepoint_rollback */
-  NULL, /* savepoint_release */
-  berkeley_commit,
-  berkeley_rollback,
-  NULL, /* prepare */
-  NULL, /* recover */
-  NULL, /* commit_by_xid */
-  NULL, /* rollback_by_xid */
-  NULL, /* create_cursor_read_view */
-  NULL, /* set_cursor_read_view */
-  NULL, /* close_cursor_read_view */
-  HTON_CLOSE_CURSORS_AT_COMMIT
-};
+handlerton berkeley_hton;
+
+static handler *berkeley_create_handler(TABLE_SHARE *table, MEM_ROOT *mem_root)
+{
+  return new (mem_root) ha_berkeley(table);
+}
 
 typedef struct st_berkeley_trx_data {
   DB_TXN *all;
   DB_TXN *stmt;
+  DB_TXN *sp_level;
   uint bdb_lock_count;
 } berkeley_trx_data;
 
 /* General functions */
 
-bool berkeley_init(void)
+int berkeley_init(void)
 {
   DBUG_ENTER("berkeley_init");
 
+  berkeley_hton.state=SHOW_OPTION_YES;
+  berkeley_hton.db_type=DB_TYPE_BERKELEY_DB;
+  berkeley_hton.savepoint_offset=sizeof(DB_TXN *);
+  berkeley_hton.close_connection=berkeley_close_connection;
+  berkeley_hton.savepoint_set=berkeley_savepoint;
+  berkeley_hton.savepoint_rollback=berkeley_rollback_to_savepoint;
+  berkeley_hton.savepoint_release=berkeley_release_savepoint;
+  berkeley_hton.commit=berkeley_commit;
+  berkeley_hton.rollback=berkeley_rollback;
+  berkeley_hton.create=berkeley_create_handler;
+  berkeley_hton.panic=berkeley_end;
+  berkeley_hton.flush_logs=berkeley_flush_logs;
+  berkeley_hton.show_status=berkeley_show_status;
+  berkeley_hton.flags=HTON_CLOSE_CURSORS_AT_COMMIT | HTON_FLUSH_AFTER_RENAME;
+
   if (have_berkeley_db != SHOW_OPTION_YES)
-    goto error;
+    return 0; // nothing else to do
 
   if (!berkeley_tmpdir)
     berkeley_tmpdir=mysql_tmpdir;
@@ -173,7 +195,6 @@
     goto error;
   db_env->set_errcall(db_env,berkeley_print_error);
   db_env->set_errpfx(db_env,"bdb");
-  db_env->set_noticecall(db_env, berkeley_noticecall);
   db_env->set_tmp_dir(db_env, berkeley_tmpdir);
   db_env->set_data_dir(db_env, mysql_data_home);
   db_env->set_flags(db_env, berkeley_env_flags, 1);
@@ -182,13 +203,20 @@
 
   if (opt_endinfo)
     db_env->set_verbose(db_env,
-			DB_VERB_CHKPOINT | DB_VERB_DEADLOCK | DB_VERB_RECOVERY,
+			DB_VERB_DEADLOCK | DB_VERB_RECOVERY,
 			1);
 
-  db_env->set_cachesize(db_env, 0, berkeley_cache_size, 0);
+  if (berkeley_cache_size > (uint) ~0)
+    db_env->set_cachesize(db_env, berkeley_cache_size / (1024*1024L*1024L),
+                          berkeley_cache_size % (1024L*1024L*1024L),
+                          berkeley_cache_parts);
+  else
+    db_env->set_cachesize(db_env, 0, berkeley_cache_size, berkeley_cache_parts);
+
   db_env->set_lg_max(db_env, berkeley_log_file_size);
   db_env->set_lg_bsize(db_env, berkeley_log_buffer_size);
   db_env->set_lk_detect(db_env, berkeley_lock_type);
+  db_env->set_lg_regionmax(db_env, berkeley_region_size);
   if (berkeley_max_lock)
     db_env->set_lk_max(db_env, berkeley_max_lock);
 
@@ -213,18 +241,19 @@
 }
 
 
-bool berkeley_end(void)
+int berkeley_end(ha_panic_function type)
 {
-  int error;
+  int error= 0;
   DBUG_ENTER("berkeley_end");
-  if (!db_env)
-    return 1; /* purecov: tested */
-  berkeley_cleanup_log_files();
-  error=db_env->close(db_env,0);		// Error is logged
-  db_env=0;
-  hash_free(&bdb_open_tables);
-  pthread_mutex_destroy(&bdb_mutex);
-  DBUG_RETURN(error != 0);
+  if (db_env)
+  {
+    berkeley_cleanup_log_files();
+    error= db_env->close(db_env,0);		// Error is logged
+    db_env= 0;
+    hash_free(&bdb_open_tables);
+    pthread_mutex_destroy(&bdb_mutex);
+  }
+  DBUG_RETURN(error);
 }
 
 static int berkeley_close_connection(THD *thd)
@@ -258,7 +287,7 @@
   DBUG_PRINT("trans",("ending transaction %s", all ? "all" : "stmt"));
   berkeley_trx_data *trx=(berkeley_trx_data *)thd->ha_data[berkeley_hton.slot];
   DB_TXN **txn= all ? &trx->all : &trx->stmt;
-  int error=txn_commit(*txn,0);
+  int error= (*txn)->commit(*txn,0);
   *txn=0;
 #ifndef DBUG_OFF
   if (error)
@@ -273,13 +302,55 @@
   DBUG_PRINT("trans",("aborting transaction %s", all ? "all" : "stmt"));
   berkeley_trx_data *trx=(berkeley_trx_data *)thd->ha_data[berkeley_hton.slot];
   DB_TXN **txn= all ? &trx->all : &trx->stmt;
-  int error=txn_abort(*txn);
+  int error= (*txn)->abort(*txn);
   *txn=0;
   DBUG_RETURN(error);
 }
 
+static int berkeley_savepoint(THD* thd, void *savepoint)
+{
+  int error;
+  DB_TXN **save_txn= (DB_TXN**) savepoint;
+  DBUG_ENTER("berkeley_savepoint");
+  berkeley_trx_data *trx=(berkeley_trx_data *)thd->ha_data[berkeley_hton.slot];
+  if (!(error= db_env->txn_begin(db_env, trx->sp_level, save_txn, 0)))
+  {
+    trx->sp_level= *save_txn;
+  }
+  DBUG_RETURN(error);
+}
+
+static int berkeley_rollback_to_savepoint(THD* thd, void *savepoint)
+{
+  int error;
+  DB_TXN *parent, **save_txn= (DB_TXN**) savepoint;
+  DBUG_ENTER("berkeley_rollback_to_savepoint");
+  berkeley_trx_data *trx=(berkeley_trx_data *)thd->ha_data[berkeley_hton.slot];
+  parent= (*save_txn)->parent;
+  if (!(error= (*save_txn)->abort(*save_txn)))
+  {
+    trx->sp_level= parent;
+    error= berkeley_savepoint(thd, savepoint);
+  }
+  DBUG_RETURN(error);
+}
 
-int berkeley_show_logs(Protocol *protocol)
+static int berkeley_release_savepoint(THD* thd, void *savepoint)
+{
+  int error;
+  DB_TXN *parent, **save_txn= (DB_TXN**) savepoint;
+  DBUG_ENTER("berkeley_release_savepoint");
+  berkeley_trx_data *trx=(berkeley_trx_data *)thd->ha_data[berkeley_hton.slot];
+  parent= (*save_txn)->parent;
+  if (!(error= (*save_txn)->commit(*save_txn,0)))
+  {
+    trx->sp_level= parent;
+    *save_txn= 0;
+  }
+  DBUG_RETURN(error);
+}
+
+static bool berkeley_show_logs(THD *thd, stat_print_fn *stat_print)
 {
   char **all_logs, **free_logs, **a, **f;
   int error=1;
@@ -290,6 +361,7 @@
   init_sql_alloc(&show_logs_root, BDB_LOG_ALLOC_BLOCK_SIZE,
 		 BDB_LOG_ALLOC_BLOCK_SIZE);
   *root_ptr= &show_logs_root;
+  all_logs= free_logs= 0;
 
   if ((error= db_env->log_archive(db_env, &all_logs,
 				  DB_ARCH_ABS | DB_ARCH_LOG)) ||
@@ -306,50 +378,51 @@
   {
     for (a = all_logs, f = free_logs; *a; ++a)
     {
-      protocol->prepare_for_resend();
-      protocol->store(*a, system_charset_info);
-      protocol->store(STRING_WITH_LEN("BDB"), system_charset_info);
       if (f && *f && strcmp(*a, *f) == 0)
       {
-	f++;
-	protocol->store(SHOW_LOG_STATUS_FREE, system_charset_info);
+        f++;
+        if ((error= stat_print(thd, berkeley_hton_name,
+                               berkeley_hton_name_length, *a, strlen(*a),
+                               STRING_WITH_LEN(SHOW_LOG_STATUS_FREE))))
+          break;
       }
       else
-	protocol->store(SHOW_LOG_STATUS_INUSE, system_charset_info);
-
-      if (protocol->write())
       {
-	error=1;
-	goto err;
+        if ((error= stat_print(thd, berkeley_hton_name,
+                               berkeley_hton_name_length, *a, strlen(*a),
+                               STRING_WITH_LEN(SHOW_LOG_STATUS_INUSE))))
+          break;
       }
     }
   }
 err:
+  if (all_logs)
+    free(all_logs);
+  if (free_logs)
+    free(free_logs);
   free_root(&show_logs_root,MYF(0));
   *root_ptr= old_mem_root;
   DBUG_RETURN(error);
 }
 
-
-static void berkeley_print_error(const char *db_errpfx, char *buffer)
+bool berkeley_show_status(THD *thd, stat_print_fn *stat_print,
+                          enum ha_stat_type stat_type)
 {
-  sql_print_error("%s:  %s",db_errpfx,buffer); /* purecov: tested */
+  switch (stat_type) {
+  case HA_ENGINE_LOGS:
+    return berkeley_show_logs(thd, stat_print);
+  default:
+    return FALSE;
+  }
 }
 
-
-static void berkeley_noticecall(DB_ENV *db_env, db_notices notice)
+static void berkeley_print_error(const DB_ENV *db_env, const char *db_errpfx,
+                                 const char *buffer)
 {
-  switch (notice)
-  {
-  case DB_NOTICE_LOGFILE_CHANGED: /* purecov: tested */
-    pthread_mutex_lock(&LOCK_manager);
-    manager_status |= MANAGER_BERKELEY_LOG_CLEANUP;
-    pthread_mutex_unlock(&LOCK_manager);
-    pthread_cond_signal(&COND_manager);
-    break;
-  }
+  sql_print_error("%s:  %s",db_errpfx,buffer); /* purecov: tested */
 }
 
+
 void berkeley_cleanup_log_files(void)
 {
   DBUG_ENTER("berkeley_cleanup_log_files");
@@ -387,10 +460,10 @@
 ** Berkeley DB tables
 *****************************************************************************/
 
-ha_berkeley::ha_berkeley(TABLE *table_arg)
+ha_berkeley::ha_berkeley(TABLE_SHARE *table_arg)
   :handler(&berkeley_hton, table_arg), alloc_ptr(0), rec_buff(0), file(0),
   int_table_flags(HA_REC_NOT_IN_SEQ | HA_FAST_KEY_READ |
-                  HA_NULL_IN_KEY | HA_CAN_INDEX_BLOBS | HA_NOT_EXACT_COUNT |
+                  HA_NULL_IN_KEY | HA_CAN_INDEX_BLOBS |
                   HA_PRIMARY_KEY_IN_READ_INDEX | HA_FILE_BASED |
                   HA_CAN_GEOMETRY |
                   HA_AUTO_PART_KEY | HA_TABLE_SCAN_ON_INDEX),
@@ -414,13 +487,14 @@
                 | HA_READ_RANGE);
   for (uint i= all_parts ? 0 : part ; i <= part ; i++)
   {
-    if (table->key_info[idx].key_part[i].field->type() == FIELD_TYPE_BLOB)
+    KEY_PART_INFO *key_part= table_share->key_info[idx].key_part+i;
+    if (key_part->field->type() == FIELD_TYPE_BLOB)
     {
       /* We can't use BLOBS to shortcut sorts */
       flags&= ~(HA_READ_ORDER | HA_KEYREAD_ONLY | HA_READ_RANGE);
       break;
     }
-    switch (table->key_info[idx].key_part[i].field->key_type()) {
+    switch (key_part->field->key_type()) {
     case HA_KEYTYPE_TEXT:
     case HA_KEYTYPE_VARTEXT1:
     case HA_KEYTYPE_VARTEXT2:
@@ -428,8 +502,7 @@
         As BDB stores only one copy of equal strings, we can't use key read
         on these. Binary collations do support key read though.
       */
-      if (!(table->key_info[idx].key_part[i].field->charset()->state
-           & MY_CS_BINSORT))
+      if (!(key_part->field->charset()->state & MY_CS_BINSORT))
         flags&= ~HA_KEYREAD_ONLY;
       break;
     default:                                    // Keep compiler happy
@@ -567,7 +640,6 @@
   uint open_mode=(mode == O_RDONLY ? DB_RDONLY : 0) | DB_THREAD;
   uint max_key_length;
   int error;
-  TABLE_SHARE *table_share= table->s;
   DBUG_ENTER("ha_berkeley::open");
 
   /* Open primary key */
@@ -589,7 +661,7 @@
 			&key_buff2, max_key_length,
 			&primary_key_buff,
 			(hidden_primary_key ? 0 :
-			 table->key_info[table_share->primary_key].key_length),
+			 table_share->key_info[table_share->primary_key].key_length),
 			NullS)))
     DBUG_RETURN(1); /* purecov: inspected */
   if (!(rec_buff= (byte*) my_malloc((alloced_rec_buff_length=
@@ -632,10 +704,10 @@
 			  berkeley_cmp_packed_key));
     if (!hidden_primary_key)
       file->app_private= (void*) (table->key_info + table_share->primary_key);
-    if ((error= txn_begin(db_env, 0, (DB_TXN**) &transaction, 0)) ||
+    if ((error= db_env->txn_begin(db_env, NULL, (DB_TXN**) &transaction, 0)) ||
 	(error= (file->open(file, transaction,
 			    fn_format(name_buff, name, "", ha_berkeley_ext,
-				      2 | 4),
+				      MY_UNPACK_FILENAME|MY_APPEND_EXT),
 			    "main", DB_BTREE, open_mode, 0))) ||
 	(error= transaction->commit(transaction, 0)))
     {
@@ -648,7 +720,7 @@
 
     /* Open other keys;  These are part of the share structure */
     key_file[primary_key]=file;
-    key_type[primary_key]=DB_NOOVERWRITE;
+    key_type[primary_key]= hidden_primary_key ? 0 : DB_NOOVERWRITE;
 
     DB **ptr=key_file;
     for (uint i=0, used_keys=0; i < table_share->keys ; i++, ptr++)
@@ -671,7 +743,8 @@
 	  DBUG_PRINT("bdb",("Setting DB_DUP for key %u", i));
 	  (*ptr)->set_flags(*ptr, DB_DUP);
 	}
-	if ((error= txn_begin(db_env, 0, (DB_TXN**) &transaction, 0)) ||
+	if ((error= db_env->txn_begin(db_env, NULL, (DB_TXN**) &transaction,
+                                      0)) ||
 	    (error=((*ptr)->open(*ptr, transaction, name_buff, part, DB_BTREE,
 				 open_mode, 0))) ||
 	    (error= transaction->commit(transaction, 0)))
@@ -703,7 +776,7 @@
   transaction=0;
   cursor=0;
   key_read=0;
-  block_size=8192;				// Berkeley DB block size
+  stats.block_size=8192;                        // Berkeley DB block size
   share->fixed_length_row= !(table_share->db_create_options &
                              HA_OPTION_PACK_RECORD);
 
@@ -719,7 +792,7 @@
 
   my_free((char*) rec_buff,MYF(MY_ALLOW_ZERO_PTR));
   my_free(alloc_ptr,MYF(MY_ALLOW_ZERO_PTR));
-  ha_berkeley::extra(HA_EXTRA_RESET);		// current_row buffer
+  ha_berkeley::reset();                         // current_row buffer
   DBUG_RETURN(free_share(share,table, hidden_primary_key,0));
 }
 
@@ -745,9 +818,9 @@
 
 ulong ha_berkeley::max_row_length(const byte *buf)
 {
-  ulong length= table->s->reclength + table->s->fields*2;
+  ulong length= table_share->reclength + table_share->fields*2;
   uint *ptr, *end;
-  for (ptr= table->s->blob_field, end=ptr + table->s->blob_fields ;
+  for (ptr= table_share->blob_field, end=ptr + table_share->blob_fields ;
        ptr != end ;
        ptr++)
   {
@@ -774,25 +847,26 @@
   if (share->fixed_length_row)
   {
     row->data=(void*) record;
-    row->size= table->s->reclength+hidden_primary_key;
+    row->size= table_share->reclength+hidden_primary_key;
     if (hidden_primary_key)
     {
       if (new_row)
 	get_auto_primary_key(current_ident);
-      memcpy_fixed((char*) record+table->s->reclength, (char*) current_ident,
+      memcpy_fixed((char*) record+table_share->reclength,
+                   (char*) current_ident,
 		   BDB_HIDDEN_PRIMARY_KEY_LENGTH);
     }
     return 0;
   }
-  if (table->s->blob_fields)
+  if (table_share->blob_fields)
   {
     if (fix_rec_buff_for_blob(max_row_length(record)))
       return HA_ERR_OUT_OF_MEM; /* purecov: inspected */
   }
 
   /* Copy null bits */
-  memcpy(rec_buff, record, table->s->null_bytes);
-  ptr= rec_buff + table->s->null_bytes;
+  memcpy(rec_buff, record, table_share->null_bytes);
+  ptr= rec_buff + table_share->null_bytes;
 
   for (Field **field=table->field ; *field ; field++)
     ptr=(byte*) (*field)->pack((char*) ptr,
@@ -815,15 +889,17 @@
 void ha_berkeley::unpack_row(char *record, DBT *row)
 {
   if (share->fixed_length_row)
-    memcpy(record,(char*) row->data,table->s->reclength+hidden_primary_key);
+    memcpy(record,(char*) row->data,table_share->reclength+hidden_primary_key);
   else
   {
     /* Copy null bits */
+    my_bitmap_map *old_map= dbug_tmp_use_all_columns(table, table->write_set);
     const char *ptr= (const char*) row->data;
-    memcpy(record, ptr, table->s->null_bytes);
-    ptr+= table->s->null_bytes;
+    memcpy(record, ptr, table_share->null_bytes);
+    ptr+= table_share->null_bytes;
     for (Field **field=table->field ; *field ; field++)
       ptr= (*field)->unpack(record + (*field)->offset(), ptr);
+    dbug_tmp_restore_column_map(table->write_set, old_map);
   }
 }
 
@@ -881,6 +957,7 @@
   KEY *key_info=table->key_info+keynr;
   KEY_PART_INFO *key_part=key_info->key_part;
   KEY_PART_INFO *end=key_part+key_info->key_parts;
+  my_bitmap_map *old_map= dbug_tmp_use_all_columns(table, table->write_set);
   DBUG_ENTER("create_key");
 
   key->data=buff;
@@ -904,6 +981,7 @@
   }
   key->size= (u_int32_t) (buff  - (char*) key->data);
   DBUG_DUMP("key",(char*) key->data, key->size);
+  dbug_tmp_restore_column_map(table->write_set, old_map);
   DBUG_RETURN(key);
 }
 
@@ -921,6 +999,7 @@
   KEY *key_info=table->key_info+keynr;
   KEY_PART_INFO *key_part=key_info->key_part;
   KEY_PART_INFO *end=key_part+key_info->key_parts;
+  my_bitmap_map *old_map= dbug_tmp_use_all_columns(table, table->write_set);
   DBUG_ENTER("bdb:pack_key");
 
   bzero((char*) key,sizeof(*key));
@@ -948,6 +1027,7 @@
   }
   key->size= (u_int32_t) (buff  - (char*) key->data);
   DBUG_DUMP("key",(char*) key->data, key->size);
+  dbug_tmp_restore_column_map(table->write_set, old_map);
   DBUG_RETURN(key);
 }
 
@@ -970,7 +1050,7 @@
     DBUG_RETURN(error); /* purecov: inspected */
 
   table->insert_or_update= 1;                   // For handling of VARCHAR
-  if (table->s->keys + test(hidden_primary_key) == 1)
+  if (table_share->keys + test(hidden_primary_key) == 1)
   {
     error=file->put(file, transaction, create_key(&prim_key, primary_key,
 						  key_buff, record),
@@ -989,7 +1069,7 @@
 			    &row, key_type[primary_key])))
       {
 	changed_keys.set_bit(primary_key);
-	for (uint keynr=0 ; keynr < table->s->keys ; keynr++)
+	for (uint keynr=0 ; keynr < table_share->keys ; keynr++)
 	{
 	  if (keynr == primary_key)
 	    continue;
@@ -1017,7 +1097,7 @@
 	  {
 	    new_error = 0;
 	    for (uint keynr=0;
-                 keynr < table->s->keys+test(hidden_primary_key);
+                 keynr < table_share->keys+test(hidden_primary_key);
                  keynr++)
 	    {
 	      if (changed_keys.is_set(keynr))
@@ -1160,7 +1240,7 @@
      that one just put back the old value. */
   if (!changed_keys->is_clear_all())
   {
-    for (keynr=0 ; keynr < table->s->keys+test(hidden_primary_key) ; keynr++)
+    for (keynr=0 ; keynr < table_share->keys+test(hidden_primary_key) ; keynr++)
     {
       if (changed_keys->is_set(keynr))
       {
@@ -1189,8 +1269,8 @@
   DB_TXN *sub_trans;
   bool primary_key_changed;
   DBUG_ENTER("update_row");
-  LINT_INIT(error);
 
+  LINT_INIT(error);
   statistic_increment(table->in_use->status_var.ha_update_count,&LOCK_status);
   if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_UPDATE)
     table->timestamp_field->set_time();
@@ -1225,7 +1305,7 @@
 				   using_ignore)))
     {
       // Update all other keys
-      for (uint keynr=0 ; keynr < table->s->keys ; keynr++)
+      for (uint keynr=0 ; keynr < table_share->keys ; keynr++)
       {
 	if (keynr == primary_key)
 	  continue;
@@ -1337,7 +1417,7 @@
 {
   int result = 0;
   for (uint keynr=0;
-       keynr < table->s->keys+test(hidden_primary_key);
+       keynr < table_share->keys+test(hidden_primary_key);
        keynr++)
   {
     if (keys->is_set(keynr))
@@ -1358,7 +1438,7 @@
 {
   int error;
   DBT row, prim_key;
-  key_map keys= table->s->keys_in_use;
+  key_map keys= table_share->keys_in_use;
   DBUG_ENTER("delete_row");
   statistic_increment(table->in_use->status_var.ha_delete_count,&LOCK_status);
 
@@ -1390,11 +1470,12 @@
 }
 
 
-int ha_berkeley::index_init(uint keynr)
+int ha_berkeley::index_init(uint keynr, bool sorted)
 {
   int error;
   DBUG_ENTER("ha_berkeley::index_init");
-  DBUG_PRINT("enter",("table: '%s'  key: %d", table->s->table_name, keynr));
+  DBUG_PRINT("enter",("table: '%s'  key: %d", table_share->table_name.str,
+                      keynr));
 
   /*
     Under some very rare conditions (like full joins) we may already have
@@ -1421,7 +1502,7 @@
   DBUG_ENTER("ha_berkely::index_end");
   if (cursor)
   {
-    DBUG_PRINT("enter",("table: '%s'", table->s->table_name));
+    DBUG_PRINT("enter",("table: '%s'", table_share->table_name.str));
     error=cursor->c_close(cursor);
     cursor=0;
   }
@@ -1668,7 +1749,7 @@
 {
   DBUG_ENTER("rnd_init");
   current_row.flags=DB_DBT_REALLOC;
-  DBUG_RETURN(index_init(primary_key));
+  DBUG_RETURN(index_init(primary_key, 0));
 }
 
 int ha_berkeley::rnd_end()
@@ -1770,20 +1851,21 @@
   DBUG_ENTER("ha_berkeley::info");
   if (flag & HA_STATUS_VARIABLE)
   {
-    records = share->rows + changed_rows; // Just to get optimisations right
-    deleted = 0;
+    // Just to get optimizations right
+    stats.records = share->rows + changed_rows;
+    stats.deleted = 0;
   }
   if ((flag & HA_STATUS_CONST) || version != share->version)
   {
     version=share->version;
-    for (uint i=0 ; i < table->s->keys ; i++)
+    for (uint i=0 ; i < table_share->keys ; i++)
     {
       table->key_info[i].rec_per_key[table->key_info[i].key_parts-1]=
 	share->rec_per_key[i];
     }
   }
   /* Don't return key if we got an error for the internal primary key */
-  if (flag & HA_STATUS_ERRKEY && last_dup_key < table->s->keys)
+  if (flag & HA_STATUS_ERRKEY && last_dup_key < table_share->keys)
     errkey= last_dup_key;
   DBUG_RETURN(0);
 }
@@ -1792,19 +1874,8 @@
 int ha_berkeley::extra(enum ha_extra_function operation)
 {
   switch (operation) {
-  case HA_EXTRA_RESET:
   case HA_EXTRA_RESET_STATE:
-    key_read=0;
-    using_ignore=0;
-    if (current_row.flags & (DB_DBT_MALLOC | DB_DBT_REALLOC))
-    {
-      current_row.flags=0;
-      if (current_row.data)
-      {
-	free(current_row.data);
-	current_row.data=0;
-      }
-    }
+    reset();
     break;
   case HA_EXTRA_KEYREAD:
     key_read=1;					// Query satisfied with key
@@ -1827,8 +1898,17 @@
 
 int ha_berkeley::reset(void)
 {
-  ha_berkeley::extra(HA_EXTRA_RESET);
-  key_read=0;					// Reset to state after open
+  key_read= 0;
+  using_ignore= 0;
+  if (current_row.flags & (DB_DBT_MALLOC | DB_DBT_REALLOC))
+  {
+    current_row.flags= 0;
+    if (current_row.data)
+    {
+      free(current_row.data);
+      current_row.data= 0;
+    }
+  }
   return 0;
 }
 
@@ -1854,6 +1934,8 @@
     if (!trx)
       DBUG_RETURN(1);
   }
+  if (trx->all == 0)
+    trx->sp_level= 0;
   if (lock_type != F_UNLCK)
   {
     if (!trx->bdb_lock_count++)
@@ -1867,17 +1949,18 @@
 	/* We have to start a master transaction */
 	DBUG_PRINT("trans",("starting transaction all:  options: 0x%lx",
                             (ulong) thd->options));
-        if ((error=txn_begin(db_env, 0, &trx->all, 0)))
+        if ((error= db_env->txn_begin(db_env, NULL, &trx->all, 0)))
 	{
           trx->bdb_lock_count--;        // We didn't get the lock
           DBUG_RETURN(error);
 	}
+        trx->sp_level= trx->all;
         trans_register_ha(thd, TRUE, &berkeley_hton);
 	if (thd->in_lock_tables)
 	  DBUG_RETURN(0);			// Don't create stmt trans
       }
       DBUG_PRINT("trans",("starting transaction stmt"));
-      if ((error=txn_begin(db_env, trx->all, &trx->stmt, 0)))
+      if ((error= db_env->txn_begin(db_env, trx->sp_level, &trx->stmt, 0)))
       {
 	/* We leave the possible master transaction open */
         trx->bdb_lock_count--;                  // We didn't get the lock
@@ -1902,7 +1985,7 @@
 	   We must in this case commit the work to keep the row locks
 	*/
 	DBUG_PRINT("trans",("commiting non-updating transaction"));
-        error= txn_commit(trx->stmt,0);
+        error= trx->stmt->commit(trx->stmt,0);
         trx->stmt= transaction= 0;
       }
     }
@@ -1931,7 +2014,7 @@
   if (!trx->stmt)
   {
     DBUG_PRINT("trans",("starting transaction stmt"));
-    error=txn_begin(db_env, trx->all, &trx->stmt, 0);
+    error= db_env->txn_begin(db_env, trx->sp_level, &trx->stmt, 0);
     trans_register_ha(thd, FALSE, &berkeley_hton);
   }
   transaction= trx->stmt;
@@ -2026,13 +2109,14 @@
   int error;
   DBUG_ENTER("ha_berkeley::create");
 
-  fn_format(name_buff,name,"", ha_berkeley_ext,2 | 4);
+  fn_format(name_buff,name,"", ha_berkeley_ext,
+            MY_UNPACK_FILENAME|MY_APPEND_EXT);
 
   /* Create the main table that will hold the real rows */
   if ((error= create_sub_table(name_buff,"main",DB_BTREE,0)))
     DBUG_RETURN(error); /* purecov: inspected */
 
-  primary_key= table->s->primary_key;
+  primary_key= form->s->primary_key;
   /* Create the keys */
   for (uint i=0; i < form->s->keys; i++)
   {
@@ -2040,7 +2124,7 @@
     {
       sprintf(part,"key%02d",index++);
       if ((error= create_sub_table(name_buff, part, DB_BTREE,
-				   (table->key_info[i].flags & HA_NOSAME) ? 0 :
+				   (form->key_info[i].flags & HA_NOSAME) ? 0 :
 				   DB_DUP)))
 	DBUG_RETURN(error); /* purecov: inspected */
     }
@@ -2056,7 +2140,7 @@
 				    "status", DB_BTREE, DB_CREATE, 0))))
     {
       char rec_buff[4+MAX_KEY*4];
-      uint length= 4+ table->s->keys*4;
+      uint length= 4+ form->s->keys*4;
       bzero(rec_buff, length);
       error= write_status(status_block, rec_buff, length);
       status_block->close(status_block,0);
@@ -2075,8 +2159,9 @@
   if ((error=db_create(&file, db_env, 0)))
     my_errno=error; /* purecov: inspected */
   else
-    error=file->remove(file,fn_format(name_buff,name,"",ha_berkeley_ext,2 | 4),
-		       NULL,0);
+    error=file->remove(file,fn_format(name_buff,name,"",ha_berkeley_ext,
+                                      MY_UNPACK_FILENAME|MY_APPEND_EXT),
+                       NULL,0);
   file=0;					// Safety
   DBUG_RETURN(error);
 }
@@ -2094,9 +2179,11 @@
   {
     /* On should not do a file->close() after rename returns */
     error= file->rename(file, 
-			fn_format(from_buff, from, "", ha_berkeley_ext, 2 | 4),
+			fn_format(from_buff, from, "", 
+                                  ha_berkeley_ext,
+                                  MY_UNPACK_FILENAME|MY_APPEND_EXT),
 			NULL, fn_format(to_buff, to, "", ha_berkeley_ext,
-					2 | 4), 0);
+                                        MY_UNPACK_FILENAME|MY_APPEND_EXT), 0);
   }
   return error;
 }
@@ -2110,7 +2197,7 @@
 
 double ha_berkeley::scan_time()
 {
-  return rows2double(records/3);
+  return rows2double(stats.records/3);
 }
 
 ha_rows ha_berkeley::records_in_range(uint keynr, key_range *start_key,
@@ -2163,27 +2250,40 @@
     end_pos=end_range.less;
   else
     end_pos=end_range.less+end_range.equal;
-  rows=(end_pos-start_pos)*records;
+  rows=(end_pos-start_pos)*stats.records;
   DBUG_PRINT("exit",("rows: %g",rows));
   DBUG_RETURN((ha_rows)(rows <= 1.0 ? 1 : rows));
 }
 
 
-ulonglong ha_berkeley::get_auto_increment()
+void ha_berkeley::get_auto_increment(ulonglong offset, ulonglong increment,
+                                     ulonglong nb_desired_values,
+                                     ulonglong *first_value,
+                                     ulonglong *nb_reserved_values)
 {
+  /* Ideally in case of real error (not "empty table") nr should be ~ULL(0) */
   ulonglong nr=1;				// Default if error or new key
   int error;
   (void) ha_berkeley::extra(HA_EXTRA_KEYREAD);
 
   /* Set 'active_index' */
-  ha_berkeley::index_init(table->s->next_number_index);
+  ha_berkeley::index_init(table_share->next_number_index, 0);
 
-  if (!table->s->next_number_key_offset)
+  if (!table_share->next_number_key_offset)
   {						// Autoincrement at key-start
     error=ha_berkeley::index_last(table->record[1]);
+    /* Read lock is held on the page of the max key, so reserve all values */
+    *nb_reserved_values= ULONGLONG_MAX;
   }
   else
   {
+    /*
+      MySQL needs to call us for next row: assume we are inserting ("a",null)
+      here, we return 3, and next this statement will want to insert ("b",null):
+      there is no reason why ("b",3+1) would be the good row to insert: maybe it
+      already exists, maybe 3+1 is too large...
+    */
+    *nb_reserved_values= 1;
     DBT row,old_key;
     bzero((char*) &row,sizeof(row));
     KEY *key_info= &table->key_info[active_index];
@@ -2191,7 +2291,7 @@
     /* Reading next available number for a sub key */
     ha_berkeley::create_key(&last_key, active_index,
 			    key_buff, table->record[0],
-			    table->s->next_number_key_offset);
+			    table_share->next_number_key_offset);
     /* Store for compare */
     memcpy(old_key.data=key_buff2, key_buff, (old_key.size=last_key.size));
     old_key.app_private=(void*) key_info;
@@ -2221,10 +2321,10 @@
   }
   if (!error)
     nr= (ulonglong)
-      table->next_number_field->val_int_offset(table->s->rec_buff_length)+1;
+      table->next_number_field->val_int_offset(table_share->rec_buff_length)+1;
   ha_berkeley::index_end();
   (void) ha_berkeley::extra(HA_EXTRA_NO_KEYREAD);
-  return nr;
+  *first_value= nr;
 }
 
 void ha_berkeley::print_error(int error, myf errflag)
@@ -2271,48 +2371,14 @@
   berkeley_trx_data *trx=(berkeley_trx_data *)thd->ha_data[berkeley_hton.slot];
   DBUG_ASSERT(trx);
 
-  /*
-   Original bdb documentation says:
-   "The DB->stat method cannot be transaction-protected.
-   For this reason, it should be called in a thread of
-   control that has no open cursors or active transactions."
-   So, let's check if there are any changes have been done since
-   the beginning of the transaction..
-  */
-
-  if (!db_env->txn_stat(db_env, &txn_stat_ptr, 0) &&
-      txn_stat_ptr && txn_stat_ptr->st_nactive>=2)
-  {
-    DB_TXN_ACTIVE *atxn_stmt= 0, *atxn_all= 0;
-
-    u_int32_t all_id= trx->all->id(trx->all);
-    u_int32_t stmt_id= trx->stmt->id(trx->stmt);
-
-    DB_TXN_ACTIVE *cur= txn_stat_ptr->st_txnarray;
-    DB_TXN_ACTIVE *end= cur + txn_stat_ptr->st_nactive;
-    for (; cur!=end && (!atxn_stmt || !atxn_all); cur++)
-    {
-      if (cur->txnid==all_id) atxn_all= cur;
-      if (cur->txnid==stmt_id) atxn_stmt= cur;
-    }
-
-    if (atxn_stmt && atxn_all &&
-	log_compare(&atxn_stmt->lsn,&atxn_all->lsn))
-    {
-      free(txn_stat_ptr);
-      return HA_ADMIN_REJECT;
-    }
-    free(txn_stat_ptr);
-  }
-
-  for (i=0 ; i < table->s->keys ; i++)
+  for (i=0 ; i < table_share->keys ; i++)
   {
     if (stat)
     {
       free(stat);
       stat=0;
     }
-    if ((key_file[i]->stat)(key_file[i], (void*) &stat, 0))
+    if ((key_file[i]->stat)(key_file[i], trx->all, (void*) &stat, 0))
       goto err; /* purecov: inspected */
     share->rec_per_key[i]= (stat->bt_ndata /
 			    (stat->bt_nkeys ? stat->bt_nkeys : 1));
@@ -2325,7 +2391,7 @@
       free(stat);
       stat=0;
     }
-    if ((file->stat)(file, (void*) &stat, 0))
+    if ((file->stat)(file, trx->all, (void*) &stat, 0))
       goto err; /* purecov: inspected */
   }
   pthread_mutex_lock(&share->mutex);
@@ -2380,7 +2446,8 @@
 			   (hidden_primary_key ? berkeley_cmp_hidden_key :
 			    berkeley_cmp_packed_key));
   tmp_file->app_private= (void*) (table->key_info+table->primary_key);
-  fn_format(name_buff,share->table_name,"", ha_berkeley_ext, 2 | 4);
+  fn_format(name_buff,share->table_name.str,"", ha_berkeley_ext,
+            MY_UNPACK_FILENAME|MY_APPEND_EXT);      
   if ((error=tmp_file->verify(tmp_file, name_buff, NullS, (FILE*) 0,
 			      hidden_primary_key ? 0 : DB_NOORDERCHK)))
   {
@@ -2454,7 +2521,7 @@
       share->rec_per_key = rec_per_key;
       share->table_name = tmp_name;
       share->table_name_length=length;
-      strmov(share->table_name,table_name);
+      strmov(share->table_name, table_name);
       share->key_file = key_file;
       share->key_type = key_type;
       if (my_hash_insert(&bdb_open_tables, (byte*) share))
@@ -2515,7 +2582,7 @@
     if (!(share->status & STATUS_PRIMARY_KEY_INIT))
     {
       (void) extra(HA_EXTRA_KEYREAD);
-      index_init(primary_key);
+      index_init(primary_key, 0);
       if (!index_last(table->record[1]))
 	share->auto_ident=uint5korr(current_ident);
       index_end();
@@ -2526,7 +2593,8 @@
       char name_buff[FN_REFLEN];
       uint open_mode= (((table->db_stat & HA_READ_ONLY) ? DB_RDONLY : 0)
 		       | DB_THREAD);
-      fn_format(name_buff, share->table_name,"", ha_berkeley_ext, 2 | 4);
+      fn_format(name_buff, share->table_name, "", ha_berkeley_ext,
+                MY_UNPACK_FILENAME|MY_APPEND_EXT);
       if (!db_create(&share->status_block, db_env, 0))
       {
 	if (share->status_block->open(share->status_block, NULL, name_buff,
@@ -2540,7 +2608,7 @@
     if (!(share->status & STATUS_ROW_COUNT_INIT) && share->status_block)
     {
       share->org_rows= share->rows=
-	table->s->max_rows ? table->s->max_rows : HA_BERKELEY_MAX_ROWS;
+	table_share->max_rows ? table_share->max_rows : HA_BERKELEY_MAX_ROWS;
       if (!share->status_block->cursor(share->status_block, 0, &cursor, 0))
       {
 	DBT row;
@@ -2555,7 +2623,7 @@
 	  uint i;
 	  uchar *pos=(uchar*) row.data;
 	  share->org_rows=share->rows=uint4korr(pos); pos+=4;
-	  for (i=0 ; i < table->s->keys ; i++)
+	  for (i=0 ; i < table_share->keys ; i++)
 	  {
 	    share->rec_per_key[i]=uint4korr(pos);
             pos+=4;
@@ -2607,8 +2675,9 @@
 	goto end; /* purecov: inspected */
       share->status_block->set_flags(share->status_block,0); /* purecov: inspected */
       if (share->status_block->open(share->status_block, NULL,
-				    fn_format(name_buff,share->table_name,"",
-					      ha_berkeley_ext,2 | 4),
+				    fn_format(name_buff,share->table_name,
+                                              "", ha_berkeley_ext,
+                                              MY_UNPACK_FILENAME|MY_APPEND_EXT),
 				    "status", DB_BTREE,
 				    DB_THREAD | DB_CREATE, my_umask)) /* purecov: inspected */
 	goto end; /* purecov: inspected */
@@ -2620,7 +2689,7 @@
       {
 	int4store(pos,share->rec_per_key[i]); pos+=4;
       }
-      DBUG_PRINT("info",("updating status for %s",share->table_name));
+      DBUG_PRINT("info",("updating status for %s", share->table_name));
       (void) write_status(share->status_block, rec_buff,
 			  (uint) (pos-rec_buff));
       share->status&= ~STATUS_BDB_ANALYZE;
@@ -2650,7 +2719,7 @@
 
   int result;
   Field *field;
-  KEY *key_info=table->key_info+table->s->primary_key;
+  KEY *key_info=table->key_info+table_share->primary_key;
   KEY_PART_INFO *key_part=key_info->key_part;
   KEY_PART_INFO *end=key_part+key_info->key_parts;
 
@@ -2668,4 +2737,30 @@
   return 0;
 }
 
-#endif /* HAVE_BERKELEY_DB */
+
+bool ha_berkeley::check_if_incompatible_data(HA_CREATE_INFO *info,
+					     uint table_changes)
+{
+  if (table_changes < IS_EQUAL_YES)
+    return COMPATIBLE_DATA_NO;
+  return COMPATIBLE_DATA_YES;
+}
+
+struct st_mysql_storage_engine berkeley_storage_engine=
+{ MYSQL_HANDLERTON_INTERFACE_VERSION, &berkeley_hton };
+
+mysql_declare_plugin(berkeley)
+{
+  MYSQL_STORAGE_ENGINE_PLUGIN,
+  &berkeley_storage_engine,
+  berkeley_hton_name,
+  "Sleepycat Software",
+  "Supports transactions and page-level locking",
+  berkeley_init, /* Plugin Init */
+  NULL, /* Plugin Deinit */
+  0x0100, /* 1.0 */
+  0
+}
+mysql_declare_plugin_end;
+
+#endif

--- 1.379/sql/sql_base.cc	2007-03-01 03:58:09 +03:00
+++ 1.380/sql/sql_base.cc	2007-03-06 19:24:46 +03:00
@@ -1850,6 +1850,13 @@
   key_length= (create_table_def_key(thd, key, table_list, 1) -
                TMP_TABLE_KEY_EXTRA);
 
+  /*
+    Unless requested otherwise, try to resolve this table in the list
+    of temporary tables of this thread. In MySQL temporary tables
+    are always thread-local and "shadow" possible base tables with the
+    same name. This block implements the behaviour.
+    TODO: move this block into a separate function.
+  */
   if (!table_list->skip_temporary)
   {
     for (table= thd->temporary_tables; table ; table=table->next)
@@ -1859,6 +1866,12 @@
 	  !memcmp(table->s->table_cache_key.str, key,
 		  key_length + TMP_TABLE_KEY_EXTRA))
       {
+        /*
+          We're trying to use the same temporary table twice in a query.
+          Right now we don't support this because a temporary table
+          is always represented by only one TABLE object in THD, and
+          it can not be cloned. Emit an error for an unsupported behaviour.
+        */
 	if (table->query_id == thd->query_id ||
             thd->prelocked_mode && table->query_id)
 	{
@@ -1878,6 +1891,13 @@
     }
   }
 
+  /*
+    The table is not temporary - if we're in pre-locked or LOCK TABLES
+    mode, let's try to find the requested table in the list of pre-opened
+    and locked tables. If the table is not there, return an error - we can't
+    open not pre-opened tables in pre-locked/LOCK TABLES mode.
+    TODO: move this block into a separate function.
+  */
   if (!(flags & MYSQL_OPEN_IGNORE_LOCKED_TABLES) &&
       (thd->locked_tables || thd->prelocked_mode))
   {						// Using table locks
@@ -1949,7 +1969,7 @@
       goto reset;
     }
     /*
-      is it view?
+      Is this table a view and not a base table?
       (it is work around to allow to open view with locked tables,
       real fix will be made after definition cache will be made)
     */
@@ -1981,8 +2001,32 @@
     DBUG_RETURN(0);
   }
 
+  /*
+    Non pre-locked/LOCK TABLES mode, and the table is not temporary:
+    this is the normal use case.
+    Now we should:
+    - try to find the table in the table cache.
+    - if one of the discovered TABLE instances is name-locked
+      (table->s->version == 0) or some thread has started FLUSH TABLES
+      (refresh_version > table->s->version), back off -- we have to wait
+      until no one holds a name lock on the table.
+    - if there is no such TABLE in the name cache, read the table definition
+    and insert it into the cache.
+    We perform all of the above under LOCK_open which currently protects
+    the open cache (also known as table cache) and table definitions stored
+    on disk.
+  */
+
   VOID(pthread_mutex_lock(&LOCK_open));
 
+  /*
+    If it's the first table from a list of tables used in a query,
+    remember refresh_version (the version of open_cache state).
+    If the version changes while we're opening the remaining tables,
+    we will have to back off, close all the tables opened-so-far,
+    and try to reopen them.
+    Note: refresh_version is currently changed only during FLUSH TABLES.
+  */
   if (!thd->open_tables)
     thd->version=refresh_version;
   else if ((thd->version != refresh_version) &&
@@ -1999,6 +2043,16 @@
   if (thd->handler_tables)
     mysql_ha_flush(thd, (TABLE_LIST*) NULL, MYSQL_HA_REOPEN_ON_USAGE, TRUE);
 
+  /*
+    Actually try to find the table in the open_cache.
+    The cache may contain several "TABLE" instances for the same
+    physical table. The instances that are currently "in use" by
+    some thread have their "in_use" member != NULL.
+    There is no good reason for having more than one entry in the
+    hash for the same physical table, except that we use this as
+    an implicit "pending locks queue" - see
+    wait_for_locked_table_names for details.
+  */
   for (table= (TABLE*) hash_first(&open_cache, (byte*) key, key_length,
                                   &state);
        table && table->in_use ;
@@ -2008,6 +2062,21 @@
     /*
       Here we flush tables marked for flush. However we never flush log
       tables here. They are flushed only on FLUSH LOGS.
+      Normally, table->s->version contains the value of
+      refresh_version from the moment when this table was
+      (re-)opened and added to the cache.
+      If since then we did (or just started) FLUSH TABLES
+      statement, refresh_version has been increased.
+      For "name-locked" TABLE instances, table->s->version is set
+      to 0 (see lock_table_name for details).
+      In case there is a pending FLUSH TABLES or a name lock, we
+      need to back off and re-start opening tables.
+      If we do not back off now, we may deadlock in case of lock
+      order mismatch with some other thread:
+      c1: name lock t1; -- sort of exclusive lock 
+      c2: open t2;      -- sort of shared lock
+      c1: name lock t2; -- blocks
+      c2: open t1; -- blocks
     */
     if (table->s->version != refresh_version && !table->s->log_table)
     {
@@ -2023,16 +2092,35 @@
       }
 
       /*
-        There is a refresh in progress for this table
-        Wait until the table is freed or the thread is killed.
+        Back off, part 1: mark the table as "unused" for the
+        purpose of name-locking by setting table->db_stat to 0. Do
+        that only for the tables in this thread that have an old
+        table->s->version (this is an optimization (?)).
+        table->db_stat == 0 signals wait_for_locked_table_names
+        that the tables in question are not used any more. See
+        table_is_used call for details.
       */
       close_old_data_files(thd,thd->open_tables,0,0);
+      /*
+        Back-off part 2: try to avoid "busy waiting" on the table:
+        if the table is in use by some other thread, we suspend
+        and wait till the operation is complete: when any
+        operation that juggles with table->s->version completes,
+        it broadcasts COND_refresh condition variable.
+      */
       if (table->in_use != thd)
+      {
+        /* wait_for_condition will unlock LOCK_open for us */
         wait_for_condition(thd, &LOCK_open, &COND_refresh);
+      }
       else
       {
 	VOID(pthread_mutex_unlock(&LOCK_open));
       }
+      /*
+        There is a refresh in progress for this table.
+        Signal the caller that it has to try again.
+      */
       if (refresh)
 	*refresh=1;
       DBUG_RETURN(0);
@@ -2040,6 +2128,7 @@
   }
   if (table)
   {
+    /* Unlink the table from "unused_tables" list. */
     if (table == unused_tables)
     {						// First unused
       unused_tables=unused_tables->next;	// Remove from link
@@ -2052,6 +2141,7 @@
   }
   else
   {
+    /* Insert a new TABLE instance into the open cache */
     int error;
     /* Free cache if too big */
     while (open_cache.records > table_cache_size && unused_tables)
@@ -2906,6 +2996,10 @@
     }
   }
 
+  /*
+    For every table in the list of tables to open, try to find or open
+    a table.
+  */
   for (tables= *start; tables ;tables= tables->next_global)
   {
     /*
@@ -2920,6 +3014,12 @@
         goto process_view_routines;
       continue;
     }
+    /*
+      If this TABLE_LIST object is a placeholder for an information_schema
+      table, create a temporary table to represent the information_schema
+      table in the query. Do not fill it yet - will be filled during
+      execution.
+    */
     if (tables->schema_table)
     {
       if (!mysql_schema_table(thd, thd->lex, tables))
@@ -2927,7 +3027,11 @@
       DBUG_RETURN(-1);
     }
     (*counter)++;
-    
+
+    /*
+      Not a placeholder: must be a base table or a view, and the table is
+      not opened yet. Try to open the table.
+    */
     if (!tables->table &&
 	!(tables->table= open_table(thd, tables, &new_frm_mem, &refresh, flags)))
     {
@@ -3034,7 +3138,7 @@
       {
         /*
           Serious error during reading stored routines from mysql.proc table.
-          Something's wrong with the table or its contents, and an error has
+          Something is wrong with the table or its contents, and an error has
           been emitted; we must abort.
         */
         result= -1;

--- 1.545/sql/sql_yacc.yy	2007-03-06 16:35:30 +03:00
+++ 1.546/sql/sql_yacc.yy	2007-03-06 19:24:46 +03:00
@@ -47,7 +47,7 @@
 
 #define yyoverflow(A,B,C,D,E,F) {ulong val= *(F); if (my_yyoverflow((B), (D), &val)) { yyerror((char*) (A)); return 2; } else { *(F)= (YYSIZE_T)val; }}
 
-#define YYERROR_UNLESS(A)               \
+#define YYABORT_UNLESS(A)               \
   if (!(A))                             \
   {					\
     yyerror(ER(ER_SYNTAX_ERROR));	\
@@ -421,6 +421,11 @@
 %}
 
 %pure_parser					/* We have threads */
+/*
+  Currently there are 251 shift/reduce conflicts. We should not introduce
+  new conflicts any more.
+*/
+%expect 251
 
 /*
    Comments for TOKENS.
@@ -7140,7 +7145,7 @@
         ;
 
 join_table_list:
-	derived_table_list		{ YYERROR_UNLESS($$=$1); }
+	derived_table_list		{ YYABORT_UNLESS($$=$1); }
 	;
 
 /* Warning - may return NULL in case of incomplete SELECT */
@@ -7148,7 +7153,7 @@
         table_ref { $$=$1; }
         | derived_table_list ',' table_ref
           {
-            YYERROR_UNLESS($1 && ($$=$3));
+            YYABORT_UNLESS($1 && ($$=$3));
           }
         ;
 
@@ -7167,13 +7172,13 @@
           left-associative joins.
         */
         table_ref %prec TABLE_REF_PRIORITY normal_join table_ref
-          { YYERROR_UNLESS($1 && ($$=$3)); }
+          { YYABORT_UNLESS($1 && ($$=$3)); }
 	| table_ref STRAIGHT_JOIN table_factor
-	  { YYERROR_UNLESS($1 && ($$=$3)); $3->straight=1; }
+	  { YYABORT_UNLESS($1 && ($$=$3)); $3->straight=1; }
 	| table_ref normal_join table_ref
           ON
           {
-            YYERROR_UNLESS($1 && $3);
+            YYABORT_UNLESS($1 && $3);
             /* Change the current name resolution context to a local context. */
             if (push_new_name_resolution_context(YYTHD, $1, $3))
               YYABORT;
@@ -7188,7 +7193,7 @@
         | table_ref STRAIGHT_JOIN table_factor
           ON
           {
-            YYERROR_UNLESS($1 && $3);
+            YYABORT_UNLESS($1 && $3);
             /* Change the current name resolution context to a local context. */
             if (push_new_name_resolution_context(YYTHD, $1, $3))
               YYABORT;
@@ -7204,13 +7209,13 @@
 	| table_ref normal_join table_ref
 	  USING
 	  {
-            YYERROR_UNLESS($1 && $3);
+            YYABORT_UNLESS($1 && $3);
 	  }
 	  '(' using_list ')'
           { add_join_natural($1,$3,$7,Select); $$=$3; }
 	| table_ref NATURAL JOIN_SYM table_factor
 	  {
-            YYERROR_UNLESS($1 && ($$=$4));
+            YYABORT_UNLESS($1 && ($$=$4));
             add_join_natural($1,$4,NULL,Select);
           }
 
@@ -7218,7 +7223,7 @@
 	| table_ref LEFT opt_outer JOIN_SYM table_ref
           ON
           {
-            YYERROR_UNLESS($1 && $5);
+            YYABORT_UNLESS($1 && $5);
             /* Change the current name resolution context to a local context. */
             if (push_new_name_resolution_context(YYTHD, $1, $5))
               YYABORT;
@@ -7234,7 +7239,7 @@
           }
 	| table_ref LEFT opt_outer JOIN_SYM table_factor
 	  {
-            YYERROR_UNLESS($1 && $5);
+            YYABORT_UNLESS($1 && $5);
 	  }
 	  USING '(' using_list ')'
           { 
@@ -7244,7 +7249,7 @@
           }
 	| table_ref NATURAL LEFT opt_outer JOIN_SYM table_factor
 	  {
-            YYERROR_UNLESS($1 && $6);
+            YYABORT_UNLESS($1 && $6);
  	    add_join_natural($1,$6,NULL,Select);
 	    $6->outer_join|=JOIN_TYPE_LEFT;
 	    $$=$6;
@@ -7254,7 +7259,7 @@
 	| table_ref RIGHT opt_outer JOIN_SYM table_ref
           ON
           {
-            YYERROR_UNLESS($1 && $5);
+            YYABORT_UNLESS($1 && $5);
             /* Change the current name resolution context to a local context. */
             if (push_new_name_resolution_context(YYTHD, $1, $5))
               YYABORT;
@@ -7271,7 +7276,7 @@
           }
 	| table_ref RIGHT opt_outer JOIN_SYM table_factor
 	  {
-            YYERROR_UNLESS($1 && $5);
+            YYABORT_UNLESS($1 && $5);
 	  }
 	  USING '(' using_list ')'
           {
@@ -7282,7 +7287,7 @@
           }
 	| table_ref NATURAL RIGHT opt_outer JOIN_SYM table_factor
 	  {
-            YYERROR_UNLESS($1 && $6);
+            YYABORT_UNLESS($1 && $6);
 	    add_join_natural($6,$1,NULL,Select);
 	    LEX *lex= Lex;
             if (!($$= lex->current_select->convert_right_join()))
@@ -7325,7 +7330,7 @@
           expr '}'
 	  {
 	    LEX *lex= Lex;
-            YYERROR_UNLESS($3 && $7);
+            YYABORT_UNLESS($3 && $7);
             add_join_on($7,$10);
             Lex->pop_context();
             $7->outer_join|=JOIN_TYPE_LEFT;
@@ -11511,21 +11516,21 @@
 
 xid: text_string
      {
-       YYERROR_UNLESS($1->length() <= MAXGTRIDSIZE);
+       YYABORT_UNLESS($1->length() <= MAXGTRIDSIZE);
        if (!(Lex->xid=(XID *)YYTHD->alloc(sizeof(XID))))
          YYABORT;
        Lex->xid->set(1L, $1->ptr(), $1->length(), 0, 0);
      }
      | text_string ',' text_string
      {
-       YYERROR_UNLESS($1->length() <= MAXGTRIDSIZE && $3->length() <= MAXBQUALSIZE);
+       YYABORT_UNLESS($1->length() <= MAXGTRIDSIZE && $3->length() <= MAXBQUALSIZE);
        if (!(Lex->xid=(XID *)YYTHD->alloc(sizeof(XID))))
          YYABORT;
        Lex->xid->set(1L, $1->ptr(), $1->length(), $3->ptr(), $3->length());
      }
      | text_string ',' text_string ',' ulong_num
      {
-       YYERROR_UNLESS($1->length() <= MAXGTRIDSIZE && $3->length() <= MAXBQUALSIZE);
+       YYABORT_UNLESS($1->length() <= MAXGTRIDSIZE && $3->length() <= MAXBQUALSIZE);
        if (!(Lex->xid=(XID *)YYTHD->alloc(sizeof(XID))))
          YYABORT;
        Lex->xid->set($5, $1->ptr(), $1->length(), $3->ptr(), $3->length());

--- 1.146/mysql-test/t/subselect.test	2007-03-01 04:00:24 +03:00
+++ 1.147/mysql-test/t/subselect.test	2007-03-06 19:24:46 +03:00
@@ -2002,7 +2002,7 @@
 DROP TABLE t1;
 
 #
-# Bug 24653: sorting by expressions containing subselects 
+# Bug 24653: sorting by expressions containing subselects
 #            that return more than one row
 #
 
@@ -2014,12 +2014,12 @@
   (2,1), (1,3), (2,1), (4,4), (2,2), (1,4);
 
 SELECT a FROM t1 ORDER BY (SELECT c FROM t2 WHERE b > 2 );
---error 1242   
-SELECT a FROM t1 ORDER BY (SELECT c FROM t2 WHERE b > 1);  
-SELECT a FROM t1 ORDER BY (SELECT c FROM t2 WHERE b > 2), a;  
---error 1242   
+--error 1242
+SELECT a FROM t1 ORDER BY (SELECT c FROM t2 WHERE b > 1);
+SELECT a FROM t1 ORDER BY (SELECT c FROM t2 WHERE b > 2), a;
+--error 1242
 SELECT a FROM t1 ORDER BY (SELECT c FROM t2 WHERE b > 1), a;
- 
+
 SELECT b, MAX(c) FROM t2 GROUP BY b, (SELECT c FROM t2 WHERE b > 2);
 --error 1242
 SELECT b, MAX(c) FROM t2 GROUP BY b, (SELECT c FROM t2 WHERE b > 1);
@@ -2036,28 +2036,28 @@
 SELECT a FROM t1 GROUP BY a
   HAVING IFNULL((SELECT b FROM t2 WHERE b > 4),
                 (SELECT c FROM t2 WHERE c=a AND b > 2 ORDER BY b)) > 3;
---error 1242 
+--error 1242
 SELECT a FROM t1 GROUP BY a
   HAVING IFNULL((SELECT b FROM t2 WHERE b > 4),
                 (SELECT c FROM t2 WHERE c=a AND b > 1 ORDER BY b)) > 3;
 
-SELECT a FROM t1 
+SELECT a FROM t1
   ORDER BY IFNULL((SELECT b FROM t2 WHERE b > 2),
                   (SELECT c FROM t2 WHERE c=a AND b > 2 ORDER BY b));
 --error 1242
-SELECT a FROM t1 
+SELECT a FROM t1
   ORDER BY IFNULL((SELECT b FROM t2 WHERE b > 1),
                   (SELECT c FROM t2 WHERE c=a AND b > 1 ORDER BY b));
 
-SELECT a FROM t1 
+SELECT a FROM t1
   ORDER BY IFNULL((SELECT b FROM t2 WHERE b > 4),
                   (SELECT c FROM t2 WHERE c=a AND b > 2 ORDER BY b));
 --error 1242
-SELECT a FROM t1 
+SELECT a FROM t1
   ORDER BY IFNULL((SELECT b FROM t2 WHERE b > 4),
                   (SELECT c FROM t2 WHERE c=a AND b > 1 ORDER BY b));
 
-DROP TABLE t1,t2; 
+DROP TABLE t1,t2;
 
 # End of 4.1 tests
 
@@ -2571,7 +2571,7 @@
 #
 # Bug #25219: EXIST subquery with UNION over a mix of
 #             correlated and uncorrelated selects
-#             
+#
 
 CREATE TABLE t1 (id char(4) PRIMARY KEY, c int);
 CREATE TABLE t2 (c int);
@@ -2579,25 +2579,25 @@
 INSERT INTO t1 VALUES ('aa', 1);
 INSERT INTO t2 VALUES (1);
 
-SELECT * FROM t1 
+SELECT * FROM t1
   WHERE EXISTS (SELECT c FROM t2 WHERE c=1
-                UNION 
+                UNION
                 SELECT c from t2 WHERE c=t1.c);
 
 INSERT INTO t1 VALUES ('bb', 2), ('cc', 3), ('dd',1);
 
-SELECT * FROM t1 
+SELECT * FROM t1
   WHERE EXISTS (SELECT c FROM t2 WHERE c=1
-                UNION 
+                UNION
                 SELECT c from t2 WHERE c=t1.c);
 
 INSERT INTO t2 VALUES (2);
 CREATE TABLE t3 (c int);
 INSERT INTO t3 VALUES (1);
 
-SELECT * FROM t1 
+SELECT * FROM t1
   WHERE EXISTS (SELECT t2.c FROM t2 JOIN t3 ON t2.c=t3.c WHERE t2.c=1
-                UNION 
+                UNION
                 SELECT c from t2 WHERE c=t1.c);
 
 DROP TABLE t1,t2,t3;
@@ -2609,6 +2609,33 @@
 INSERT INTO t1 VALUES ('a');
 SELECT * FROM t1 WHERE _utf8'a' = ANY (SELECT s1 FROM t1);
 DROP TABLE t1;
+#                                                                             
+# Bug#23800: Outer fields in correlated subqueries is used in a temporary     
+#            table created for sorting.                                       
+#                                                                             
+CREATE TABLE t1(f1 int);
+CREATE TABLE t2(f2 int, f21 int, f3 timestamp);
+INSERT INTO t1 VALUES (1),(1),(2),(2);
+INSERT INTO t2 VALUES (1,1,"2004-02-29 11:11:11"), (2,2,"2004-02-29 11:11:11");
+SELECT ((SELECT f2 FROM t2 WHERE f21=f1 LIMIT 1) * COUNT(f1)) AS sq FROM t1 GROUP BY f1;
+SELECT (SELECT SUM(1) FROM t2 ttt GROUP BY t2.f3 LIMIT 1) AS tt FROM t2;
+PREPARE stmt1 FROM 'SELECT ((SELECT f2 FROM t2 WHERE f21=f1 LIMIT 1) * COUNT(f1)) AS sq FROM t1 GROUP BY f1';
+EXECUTE stmt1;
+EXECUTE stmt1;
+DEALLOCATE PREPARE stmt1;
+SELECT f2, AVG(f21), 
+      (SELECT t.f3 FROM t2 AS t WHERE t2.f2=t.f2 AND t.f3=MAX(t2.f3)) AS test
+  FROM t2 GROUP BY f2;
+DROP TABLE t1,t2;                                                             
+CREATE TABLE t1 (a int, b INT, c CHAR(10) NOT NULL);                          
+INSERT INTO t1 VALUES                                                         
+  (1,1,'a'), (1,2,'b'), (1,3,'c'), (1,4,'d'), (1,5,'e'),                      
+  (2,1,'f'), (2,2,'g'), (2,3,'h'), (3,4,'i'), (3,3,'j'),                      
+  (3,2,'k'), (3,1,'l'), (1,9,'m');                                            
+SELECT a, MAX(b),                                                             
+      (SELECT t.c FROM t1 AS t WHERE t1.a=t.a AND t.b=MAX(t1.b)) AS test      
+  FROM t1 GROUP BY a;                                                         
+DROP TABLE t1;      
 
 #
 # Bug#21904 (parser problem when using IN with a double "(())")

--- 1.60.14.2/sql/ha_archive.cc	2007-03-01 11:52:22 +03:00
+++ 1.130/storage/archive/ha_archive.cc	2007-03-06 19:22:50 +03:00
@@ -18,25 +18,27 @@
 #endif
 
 #include "mysql_priv.h"
+#include <myisam.h>
 
-#if defined(HAVE_ARCHIVE_DB)
 #include "ha_archive.h"
 #include <my_dir.h>
 
+#include <mysql/plugin.h>
+
 /*
   First, if you want to understand storage engines you should look at 
   ha_example.cc and ha_example.h. 
+
   This example was written as a test case for a customer who needed
   a storage engine without indexes that could compress data very well.
   So, welcome to a completely compressed storage engine. This storage
   engine only does inserts. No replace, deletes, or updates. All reads are 
-  complete table scans. Compression is done through gzip (bzip compresses
-  better, but only marginally, if someone asks I could add support for
-  it too, but beaware that it costs a lot more in CPU time then gzip).
+  complete table scans. Compression is done through a combination of packing
+  and making use of the zlib library.
   
   We keep a file pointer open for each instance of ha_archive for each read
   but for writes we keep one open file handle just for that. We flush it
-  only if we have a read occur. gzip handles compressing lots of records
+  only if we have a read occur. azip handles compressing lots of records
   at once much better then doing lots of little records between writes.
   It is possible to not lock on writes but this would then mean we couldn't
   handle bulk inserts as well (that is if someone was trying to read at
@@ -63,8 +65,7 @@
   pool. For MyISAM its a question of how much the file system caches the
   MyISAM file. With enough free memory MyISAM is faster. Its only when the OS
   doesn't have enough memory to cache entire table that archive turns out 
-  to be any faster. For writes it is always a bit slower then MyISAM. It has no
-  internal limits though for row length.
+  to be any faster. 
 
   Examples between MyISAM (packed) and Archive.
 
@@ -79,95 +80,58 @@
 
 
   TODO:
-   Add bzip optional support.
    Allow users to set compression level.
-   Add truncate table command.
    Implement versioning, should be easy.
    Allow for errors, find a way to mark bad rows.
-   Talk to the gzip guys, come up with a writable format so that updates are doable
-     without switching to a block method.
    Add optional feature so that rows can be flushed at interval (which will cause less
      compression but may speed up ordered searches).
    Checkpoint the meta file to allow for faster rebuilds.
-   Dirty open (right now the meta file is repaired if a crash occured).
    Option to allow for dirty reads, this would lower the sync calls, which would make
      inserts a lot faster, but would mean highly arbitrary reads.
 
     -Brian
 */
-/*
-  Notes on file formats.
-  The Meta file is layed out as:
-  check - Just an int of 254 to make sure that the the file we are opening was
-          never corrupted.
-  version - The current version of the file format.
-  rows - This is an unsigned long long which is the number of rows in the data
-         file.
-  check point - Reserved for future use
-  dirty - Status of the file, whether or not its values are the latest. This
-          flag is what causes a repair to occur
-
-  The data file:
-  check - Just an int of 254 to make sure that the the file we are opening was
-          never corrupted.
-  version - The current version of the file format.
-  data - The data is stored in a "row +blobs" format.
-*/
 
-/* If the archive storage engine has been inited */
-static bool archive_inited= FALSE;
 /* Variables for archive share methods */
 pthread_mutex_t archive_mutex;
 static HASH archive_open_tables;
-static z_off_t max_zfile_size;
-static int zoffset_size;
 
 /* The file extension */
 #define ARZ ".ARZ"               // The data file
 #define ARN ".ARN"               // Files used during an optimize call
-#define ARM ".ARM"               // Meta file
-/*
-  uchar + uchar + ulonglong + ulonglong + uchar
-*/
-#define META_BUFFER_SIZE 19      // Size of the data used in the meta file
+#define ARM ".ARM"               // Meta file (deprecated)
+
 /*
   uchar + uchar
 */
 #define DATA_BUFFER_SIZE 2       // Size of the data used in the data file
 #define ARCHIVE_CHECK_HEADER 254 // The number we use to determine corruption
 
-/* 
+/* Static declarations for handlerton */
+static handler *archive_create_handler(handlerton *hton, 
+                                       TABLE_SHARE *table, 
+                                       MEM_ROOT *mem_root);
+int archive_discover(handlerton *hton, THD* thd, const char *db, 
+                        const char *name,
+                        const void** frmblob, 
+                        uint* frmlen);
+
+/*
   Number of rows that will force a bulk insert.
 */
 #define ARCHIVE_MIN_ROWS_TO_USE_BULK_INSERT 2
 
+/*
+  Size of header used for row
+*/
+#define ARCHIVE_ROW_HEADER_SIZE 4
 
-
-/* dummy handlerton - only to have something to return from archive_db_init */
-handlerton archive_hton = {
-  "ARCHIVE",
-  SHOW_OPTION_YES,
-  "Archive storage engine", 
-  DB_TYPE_ARCHIVE_DB,
-  archive_db_init,
-  0,       /* slot */
-  0,       /* savepoint size. */
-  NULL,    /* close_connection */
-  NULL,    /* savepoint */
-  NULL,    /* rollback to savepoint */
-  NULL,    /* releas savepoint */
-  NULL,    /* commit */
-  NULL,    /* rollback */
-  NULL,    /* prepare */
-  NULL,    /* recover */
-  NULL,    /* commit_by_xid */
-  NULL,    /* rollback_by_xid */
-  NULL,    /* create_cursor_read_view */
-  NULL,    /* set_cursor_read_view */
-  NULL,    /* close_cursor_read_view */
-  HTON_NO_FLAGS
-};
-
+static handler *archive_create_handler(handlerton *hton,
+                                       TABLE_SHARE *table, 
+                                       MEM_ROOT *mem_root)
+{
+  return new (mem_root) ha_archive(hton, table);
+}
 
 /*
   Used for hash table that tracks open tables.
@@ -185,16 +149,25 @@
 
   SYNOPSIS
     archive_db_init()
-    void
+    void *
 
   RETURN
     FALSE       OK
     TRUE        Error
 */
 
-bool archive_db_init()
+int archive_db_init(void *p)
 {
   DBUG_ENTER("archive_db_init");
+  handlerton *archive_hton;
+
+  archive_hton= (handlerton *)p;
+  archive_hton->state= SHOW_OPTION_YES;
+  archive_hton->db_type= DB_TYPE_ARCHIVE_DB;
+  archive_hton->create= archive_create_handler;
+  archive_hton->flags= HTON_NO_FLAGS;
+  archive_hton->discover= archive_discover;
+
   if (pthread_mutex_init(&archive_mutex, MY_MUTEX_INIT_FAST))
     goto error;
   if (hash_init(&archive_open_tables, system_charset_info, 32, 0, 0,
@@ -204,23 +177,9 @@
   }
   else
   {
-    zoffset_size= 2 << ((zlibCompileFlags() >> 6) & 3);
-    switch (sizeof(z_off_t)) {
-    case 2:
-      max_zfile_size= INT_MAX16;
-      break;
-    case 8:
-      max_zfile_size= (z_off_t) LONGLONG_MAX;
-      break;
-    case 4:
-    default:
-      max_zfile_size= INT_MAX32;
-    }
-    archive_inited= TRUE;
     DBUG_RETURN(FALSE);
   }
 error:
-  have_archive_db= SHOW_OPTION_DISABLED;	// If we couldn't use handler
   DBUG_RETURN(TRUE);
 }
 
@@ -228,145 +187,112 @@
   Release the archive handler.
 
   SYNOPSIS
-    archive_db_end()
+    archive_db_done()
     void
 
   RETURN
     FALSE       OK
 */
 
-bool archive_db_end()
+int archive_db_done(void *p)
 {
-  if (archive_inited)
-  {
-    hash_free(&archive_open_tables);
-    VOID(pthread_mutex_destroy(&archive_mutex));
-  }
-  archive_inited= 0;
-  return FALSE;
+  hash_free(&archive_open_tables);
+  VOID(pthread_mutex_destroy(&archive_mutex));
+
+  return 0;
 }
 
-ha_archive::ha_archive(TABLE *table_arg)
-  :handler(&archive_hton, table_arg), delayed_insert(0), bulk_insert(0)
+
+ha_archive::ha_archive(handlerton *hton, TABLE_SHARE *table_arg)
+  :handler(hton, table_arg), delayed_insert(0), bulk_insert(0)
 {
   /* Set our original buffer from pre-allocated memory */
   buffer.set((char *)byte_buffer, IO_SIZE, system_charset_info);
 
   /* The size of the offset value we will use for position() */
-  ref_length = zoffset_size;
-  DBUG_ASSERT(ref_length <= sizeof(z_off_t));
+  ref_length = sizeof(my_off_t);
 }
 
-/*
-  This method reads the header of a datafile and returns whether or not it was successful.
-*/
-int ha_archive::read_data_header(gzFile file_to_read)
+int archive_discover(handlerton *hton, THD* thd, const char *db, 
+                        const char *name,
+                        const void** frmblob, 
+                        uint* frmlen)
 {
-  uchar data_buffer[DATA_BUFFER_SIZE];
-  DBUG_ENTER("ha_archive::read_data_header");
+  DBUG_ENTER("archive_discover");
+  DBUG_PRINT("archive_discover", ("db: %s, name: %s", db, name)); 
+  azio_stream frm_stream;
+  char az_file[FN_REFLEN];
+  char *frm_ptr;
+  MY_STAT file_stat; 
 
-  if (gzrewind(file_to_read) == -1)
-    DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE);
+  fn_format(az_file, name, db, ARZ, MY_REPLACE_EXT | MY_UNPACK_FILENAME);
 
-  if (gzread(file_to_read, data_buffer, DATA_BUFFER_SIZE) != DATA_BUFFER_SIZE)
-    DBUG_RETURN(errno ? errno : -1);
-  
-  DBUG_PRINT("ha_archive::read_data_header", ("Check %u", data_buffer[0]));
-  DBUG_PRINT("ha_archive::read_data_header", ("Version %u", data_buffer[1]));
-  
-  if ((data_buffer[0] != (uchar)ARCHIVE_CHECK_HEADER) &&  
-      (data_buffer[1] != (uchar)ARCHIVE_VERSION))
-    DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE);
+  if (!(my_stat(az_file, &file_stat, MYF(0))))
+    goto err;
 
-  DBUG_RETURN(0);
-}
+  if (!(azopen(&frm_stream, az_file, O_RDONLY|O_BINARY)))
+  {
+    if (errno == EROFS || errno == EACCES)
+      DBUG_RETURN(my_errno= errno);
+    DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE);
+  }
 
-/*
-  This method writes out the header of a datafile and returns whether or not it was successful.
-*/
-int ha_archive::write_data_header(gzFile file_to_write)
-{
-  uchar data_buffer[DATA_BUFFER_SIZE];
-  DBUG_ENTER("ha_archive::write_data_header");
+  if (frm_stream.frm_length == 0)
+    goto err;
 
-  data_buffer[0]= (uchar)ARCHIVE_CHECK_HEADER;
-  data_buffer[1]= (uchar)ARCHIVE_VERSION;
+  frm_ptr= (char *)my_malloc(sizeof(char) * frm_stream.frm_length, MYF(0));
+  azread_frm(&frm_stream, frm_ptr);
+  azclose(&frm_stream);
 
-  if (gzwrite(file_to_write, &data_buffer, DATA_BUFFER_SIZE) != 
-      DATA_BUFFER_SIZE)
-    goto error;
-  DBUG_PRINT("ha_archive::write_data_header", ("Check %u", (uint)data_buffer[0]));
-  DBUG_PRINT("ha_archive::write_data_header", ("Version %u", (uint)data_buffer[1]));
+  *frmlen= frm_stream.frm_length;
+  *frmblob= frm_ptr;
 
   DBUG_RETURN(0);
-error:
-  DBUG_RETURN(errno);
+err:
+  my_errno= 0;
+  DBUG_RETURN(1);
 }
 
 /*
-  This method reads the header of a meta file and returns whether or not it was successful.
-  *rows will contain the current number of rows in the data file upon success.
+  This method reads the header of a datafile and returns whether or not it was successful.
 */
-int ha_archive::read_meta_file(File meta_file, ha_rows *rows)
+int ha_archive::read_data_header(azio_stream *file_to_read)
 {
-  uchar meta_buffer[META_BUFFER_SIZE];
-  ulonglong check_point;
-
-  DBUG_ENTER("ha_archive::read_meta_file");
-
-  VOID(my_seek(meta_file, 0, MY_SEEK_SET, MYF(0)));
-  if (my_read(meta_file, (byte*)meta_buffer, META_BUFFER_SIZE, 0) != META_BUFFER_SIZE)
-    DBUG_RETURN(-1);
-  
-  /*
-    Parse out the meta data, we ignore version at the moment
-  */
-  *rows= (ha_rows)uint8korr(meta_buffer + 2);
-  check_point= uint8korr(meta_buffer + 10);
-
-  DBUG_PRINT("ha_archive::read_meta_file", ("Check %d", (uint)meta_buffer[0]));
-  DBUG_PRINT("ha_archive::read_meta_file", ("Version %d", (uint)meta_buffer[1]));
-  DBUG_PRINT("ha_archive::read_meta_file", ("Rows %lu", (ulong) *rows));
-  DBUG_PRINT("ha_archive::read_meta_file", ("Checkpoint %lu", (ulong) check_point));
-  DBUG_PRINT("ha_archive::read_meta_file", ("Dirty %d", (int)meta_buffer[18]));
+  int error;
+  unsigned long ret;
+  uchar data_buffer[DATA_BUFFER_SIZE];
+  DBUG_ENTER("ha_archive::read_data_header");
 
-  if ((meta_buffer[0] != (uchar)ARCHIVE_CHECK_HEADER) || 
-      ((bool)meta_buffer[18] == TRUE))
+  if (azrewind(file_to_read) == -1)
     DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE);
 
-  my_sync(meta_file, MYF(MY_WME));
+  if (file_to_read->version >= 3)
+    DBUG_RETURN(0);
+  /* Everything below this is legacy handling for file versions older than 3 */
 
-  DBUG_RETURN(0);
-}
+  DBUG_PRINT("ha_archive", ("Reading legacy data header"));
 
-/*
-  This method writes out the header of a meta file and returns whether or not it was successful.
-  By setting dirty you say whether or not the file represents the actual state of the data file.
-  Upon ::open() we set to dirty, and upon ::close() we set to clean.
-*/
-int ha_archive::write_meta_file(File meta_file, ha_rows rows, bool dirty)
-{
-  uchar meta_buffer[META_BUFFER_SIZE];
-  ulonglong check_point= 0; //Reserved for the future
-
-  DBUG_ENTER("ha_archive::write_meta_file");
-
-  meta_buffer[0]= (uchar)ARCHIVE_CHECK_HEADER;
-  meta_buffer[1]= (uchar)ARCHIVE_VERSION;
-  int8store(meta_buffer + 2, (ulonglong)rows); 
-  int8store(meta_buffer + 10, check_point); 
-  *(meta_buffer + 18)= (uchar)dirty;
-  DBUG_PRINT("ha_archive::write_meta_file", ("Check %d", (uint)ARCHIVE_CHECK_HEADER));
-  DBUG_PRINT("ha_archive::write_meta_file", ("Version %d", (uint)ARCHIVE_VERSION));
-  DBUG_PRINT("ha_archive::write_meta_file", ("Rows %lu", (ulong)rows));
-  DBUG_PRINT("ha_archive::write_meta_file", ("Checkpoint %lu", (ulong) check_point));
-  DBUG_PRINT("ha_archive::write_meta_file", ("Dirty %d", (uint)dirty));
+  ret= azread(file_to_read, data_buffer, DATA_BUFFER_SIZE, &error);
 
-  VOID(my_seek(meta_file, 0, MY_SEEK_SET, MYF(0)));
-  if (my_write(meta_file, (byte *)meta_buffer, META_BUFFER_SIZE, 0) != META_BUFFER_SIZE)
-    DBUG_RETURN(-1);
+  if (ret != DATA_BUFFER_SIZE)
+  {
+    DBUG_PRINT("ha_archive", ("Reading, expected %d got %lu", 
+                              DATA_BUFFER_SIZE, ret));
+    DBUG_RETURN(1);
+  }
+
+  if (error)
+  {
+    DBUG_PRINT("ha_archive", ("Compression error (%d)", error));
+    DBUG_RETURN(1);
+  }
   
-  my_sync(meta_file, MYF(MY_WME));
+  DBUG_PRINT("ha_archive", ("Check %u", data_buffer[0]));
+  DBUG_PRINT("ha_archive", ("Version %u", data_buffer[1]));
+
+  if ((data_buffer[0] != (uchar)ARCHIVE_CHECK_HEADER) &&  
+      (data_buffer[1] != (uchar)ARCHIVE_VERSION))
+    DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE);
 
   DBUG_RETURN(0);
 }
@@ -381,9 +307,7 @@
 */
 ARCHIVE_SHARE *ha_archive::get_share(const char *table_name, int *rc)
 {
-  char meta_file_name[FN_REFLEN];
   uint length;
-  char *tmp_name;
   DBUG_ENTER("ha_archive::get_share");
 
   pthread_mutex_lock(&archive_mutex);
@@ -393,6 +317,9 @@
                                            (byte*) table_name,
                                            length)))
   {
+    char *tmp_name;
+    azio_stream archive_tmp;
+
     if (!my_multi_malloc(MYF(MY_WME | MY_ZEROFILL),
                           &share, sizeof(*share),
                           &tmp_name, length+1,
@@ -408,31 +335,36 @@
     share->table_name= tmp_name;
     share->crashed= FALSE;
     share->archive_write_open= FALSE;
-    fn_format(share->data_file_name,table_name,"",ARZ,
-              MY_REPLACE_EXT|MY_UNPACK_FILENAME);
-    fn_format(meta_file_name,table_name,"",ARM,
-              MY_REPLACE_EXT|MY_UNPACK_FILENAME);
-    strmov(share->table_name,table_name);
+    fn_format(share->data_file_name, table_name, "",
+              ARZ, MY_REPLACE_EXT | MY_UNPACK_FILENAME);
+    strmov(share->table_name, table_name);
+    DBUG_PRINT("ha_archive", ("Data File %s", 
+                        share->data_file_name));
     /*
       We will use this lock for rows.
     */
     VOID(pthread_mutex_init(&share->mutex,MY_MUTEX_INIT_FAST));
-    if ((share->meta_file= my_open(meta_file_name, O_RDWR, MYF(0))) == -1)
-      share->crashed= TRUE;
     
     /*
-      After we read, we set the file to dirty. When we close, we will do the 
-      opposite. If the meta file will not open we assume it is crashed and
-      leave it up to the user to fix.
+      We read the meta file, but do not mark it dirty. Since we are not
+      doing a write we won't mark it dirty (and we won't open it for
+      anything but reading... open it for write and we will generate null
+      compression writes).
     */
-    if (read_meta_file(share->meta_file, &share->rows_recorded))
-      share->crashed= TRUE;
+    if (!(azopen(&archive_tmp, share->data_file_name, O_RDONLY|O_BINARY)))
+    {
+      DBUG_RETURN(NULL);
+    }
+    stats.auto_increment_value= archive_tmp.auto_increment;
+    share->rows_recorded= (ha_rows)archive_tmp.rows;
+    share->crashed= archive_tmp.dirty;
+    azclose(&archive_tmp);
 
     VOID(my_hash_insert(&archive_open_tables, (byte*) share));
     thr_lock_init(&share->lock);
   }
   share->use_count++;
-  DBUG_PRINT("info", ("archive table %.*s has %d open handles now", 
+  DBUG_PRINT("ha_archive", ("archive table %.*s has %d open handles now", 
                       share->table_name_length, share->table_name,
                       share->use_count));
   if (share->crashed)
@@ -451,9 +383,10 @@
 {
   int rc= 0;
   DBUG_ENTER("ha_archive::free_share");
-  DBUG_PRINT("info", ("archive table %.*s has %d open handles on entrance", 
-                      share->table_name_length, share->table_name,
-                      share->use_count));
+  DBUG_PRINT("ha_archive",
+             ("archive table %.*s has %d open handles on entrance", 
+              share->table_name_length, share->table_name,
+              share->use_count));
 
   pthread_mutex_lock(&archive_mutex);
   if (!--share->use_count)
@@ -461,15 +394,18 @@
     hash_delete(&archive_open_tables, (byte*) share);
     thr_lock_delete(&share->lock);
     VOID(pthread_mutex_destroy(&share->mutex));
-    if (share->crashed)
-      (void)write_meta_file(share->meta_file, share->rows_recorded, TRUE);
-    else
-      (void)write_meta_file(share->meta_file, share->rows_recorded, FALSE);
+    /* 
+      We need to make sure we don't reset the crashed state.
+      If we open a crashed file, we need to close it as crashed unless
+      it has been repaired.
+      Since we will close the data down after this, we go on and count
+      the flush on close;
+    */
     if (share->archive_write_open)
-      if (gzclose(share->archive_write) == Z_ERRNO)
+    {
+      if (azclose(&(share->archive_write)))
         rc= 1;
-    if (my_close(share->meta_file, MYF(0)))
-      rc= 1;
+    }
     my_free((gptr) share, MYF(0));
   }
   pthread_mutex_unlock(&archive_mutex);
@@ -480,21 +416,20 @@
 int ha_archive::init_archive_writer()
 {
   DBUG_ENTER("ha_archive::init_archive_writer");
-  (void)write_meta_file(share->meta_file, share->rows_recorded, TRUE);
-
   /* 
     It is expensive to open and close the data files and since you can't have
     a gzip file that can be both read and written we keep a writer open
     that is shared amoung all open tables.
   */
-  if ((share->archive_write= gzopen(share->data_file_name, "ab")) == NULL)
+  if (!(azopen(&(share->archive_write), share->data_file_name, 
+               O_RDWR|O_BINARY)))
   {
+    DBUG_PRINT("ha_archive", ("Could not open archive write file"));
     share->crashed= TRUE;
     DBUG_RETURN(1);
   }
   share->archive_write_open= TRUE;
-  info(HA_STATUS_TIME);
-  share->approx_file_size= (ulong) data_file_length;
+
   DBUG_RETURN(0);
 }
 
@@ -504,7 +439,6 @@
 */
 static const char *ha_archive_exts[] = {
   ARZ,
-  ARM,
   NullS
 };
 
@@ -525,7 +459,7 @@
   int rc= 0;
   DBUG_ENTER("ha_archive::open");
 
-  DBUG_PRINT("info", ("archive table was opened for crash %s", 
+  DBUG_PRINT("ha_archive", ("archive table was opened for crash: %s", 
                       (open_options & HA_OPEN_FOR_REPAIR) ? "yes" : "no"));
   share= get_share(name, &rc);
 
@@ -541,16 +475,29 @@
     DBUG_RETURN(rc);
   }
 
-  thr_lock_data_init(&share->lock,&lock,NULL);
+  DBUG_ASSERT(share);
+
+
+  record_buffer= create_record_buffer(table->s->reclength + 
+                                      ARCHIVE_ROW_HEADER_SIZE);
+
+  if (!record_buffer)
+  {
+    free_share();
+    DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+  }
+
+  thr_lock_data_init(&share->lock, &lock, NULL);
 
-  if ((archive= gzopen(share->data_file_name, "rb")) == NULL)
+  DBUG_PRINT("ha_archive", ("archive data_file_name %s", share->data_file_name));
+  if (!(azopen(&archive, share->data_file_name, O_RDONLY|O_BINARY)))
   {
     if (errno == EROFS || errno == EACCES)
       DBUG_RETURN(my_errno= errno);
     DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE);
   }
 
-  DBUG_PRINT("info", ("archive table was crashed %s", 
+  DBUG_PRINT("ha_archive", ("archive table was crashed %s", 
                       rc == HA_ERR_CRASHED_ON_USAGE ? "yes" : "no"));
   if (rc == HA_ERR_CRASHED_ON_USAGE && open_options & HA_OPEN_FOR_REPAIR)
   {
@@ -583,8 +530,10 @@
   int rc= 0;
   DBUG_ENTER("ha_archive::close");
 
+  destroy_record_buffer(record_buffer);
+
   /* First close stream */
-  if (gzclose(archive) == Z_ERRNO)
+  if (azclose(&archive))
     rc= 1;
   /* then also close share */
   rc|= free_share();
@@ -605,57 +554,120 @@
 int ha_archive::create(const char *name, TABLE *table_arg,
                        HA_CREATE_INFO *create_info)
 {
-  File create_file;  // We use to create the datafile and the metafile
   char name_buff[FN_REFLEN];
+  char linkname[FN_REFLEN];
   int error;
+  azio_stream create_stream;            /* Archive file we are working with */
+  File frm_file;                   /* File handler for readers */
+  MY_STAT file_stat;  // Stat information for the data file
+  byte *frm_ptr;
+
   DBUG_ENTER("ha_archive::create");
 
-  if ((create_file= my_create(fn_format(name_buff,name,"",ARM,
-                                        MY_REPLACE_EXT|MY_UNPACK_FILENAME),0,
-                              O_RDWR | O_TRUNC,MYF(MY_WME))) < 0)
+  stats.auto_increment_value= (create_info->auto_increment_value ?
+                               create_info->auto_increment_value -1 :
+                               (ulonglong) 0);
+
+  for (uint key= 0; key < table_arg->s->keys; key++)
   {
-    error= my_errno;
-    goto error;
+    KEY *pos= table_arg->key_info+key;
+    KEY_PART_INFO *key_part=     pos->key_part;
+    KEY_PART_INFO *key_part_end= key_part + pos->key_parts;
+
+    for (; key_part != key_part_end; key_part++)
+    {
+      Field *field= key_part->field;
+
+      if (!(field->flags & AUTO_INCREMENT_FLAG))
+      {
+        error= -1;
+        DBUG_PRINT("ha_archive", ("Index error in creating archive table"));
+        goto error;
+      }
+    }
   }
-  write_meta_file(create_file, 0, FALSE);
-  my_close(create_file,MYF(0));
 
   /* 
     We reuse name_buff since it is available.
   */
-  if ((create_file= my_create(fn_format(name_buff,name,"",ARZ,
-                                        MY_REPLACE_EXT|MY_UNPACK_FILENAME),0,
-                              O_RDWR | O_TRUNC,MYF(MY_WME))) < 0)
-  {
-    error= my_errno;
-    goto error;
-  }
-  if ((archive= gzdopen(dup(create_file), "wb")) == NULL)
+  if (create_info->data_file_name && create_info->data_file_name[0] != '#')
   {
-    error= errno;
-    goto error2;
+    DBUG_PRINT("ha_archive", ("archive will create stream file %s", 
+                        create_info->data_file_name));
+                        
+    fn_format(name_buff, create_info->data_file_name, "", ARZ,
+              MY_REPLACE_EXT | MY_UNPACK_FILENAME);
+    fn_format(linkname, name, "", ARZ,
+              MY_REPLACE_EXT | MY_UNPACK_FILENAME);
   }
-  if (write_data_header(archive))
+  else
   {
-    error= errno;
-    goto error3;
+    fn_format(name_buff, name, "", ARZ,
+              MY_REPLACE_EXT | MY_UNPACK_FILENAME);
+    linkname[0]= 0;
   }
 
-  if (gzclose(archive))
+  /*
+    There is a chance that the file was "discovered". In this case
+    just use whatever file is there.
+  */
+  if (!(my_stat(name_buff, &file_stat, MYF(0))))
   {
-    error= errno;
-    goto error2;
+    my_errno= 0;
+    if (!(azopen(&create_stream, name_buff, O_CREAT|O_RDWR|O_BINARY)))
+    {
+      error= errno;
+      goto error2;
+    }
+
+    if (linkname[0])
+      my_symlink(name_buff, linkname, MYF(0));
+    fn_format(name_buff, name, "", ".frm",
+              MY_REPLACE_EXT | MY_UNPACK_FILENAME);
+
+    /*
+      Here is where we open up the frm and pass it to archive to store 
+    */
+    if ((frm_file= my_open(name_buff, O_RDONLY, MYF(0))) > 0)
+    {
+      if (!my_fstat(frm_file, &file_stat, MYF(MY_WME)))
+      {
+        frm_ptr= (byte *)my_malloc(sizeof(byte) * file_stat.st_size , MYF(0));
+        if (frm_ptr)
+        {
+          my_read(frm_file, frm_ptr, file_stat.st_size, MYF(0));
+          azwrite_frm(&create_stream, (char *)frm_ptr, file_stat.st_size);
+          my_free((gptr)frm_ptr, MYF(0));
+        }
+      }
+      my_close(frm_file, MYF(0));
+    }
+
+    if (create_info->comment.str)
+      azwrite_comment(&create_stream, create_info->comment.str, 
+                      create_info->comment.length);
+
+    /* 
+      Yes you need to do this, because the starting value 
+      for the autoincrement may not be zero.
+    */
+    create_stream.auto_increment= stats.auto_increment_value;
+    if (azclose(&create_stream))
+    {
+      error= errno;
+      goto error2;
+    }
   }
+  else
+    my_errno= 0;
+
+  DBUG_PRINT("ha_archive", ("Creating File %s", name_buff));
+  DBUG_PRINT("ha_archive", ("Creating Link %s", linkname));
 
-  my_close(create_file, MYF(0));
 
   DBUG_RETURN(0);
 
-error3:
-  /* We already have an error, so ignore results of gzclose. */
-  (void)gzclose(archive);
 error2:
-  my_close(create_file, MYF(0));
   delete_table(name);
 error:
   /* Return error number, if we got one */
@@ -665,51 +677,82 @@
 /*
   This is where the actual row is written out.
 */
-int ha_archive::real_write_row(byte *buf, gzFile writer)
+int ha_archive::real_write_row(byte *buf, azio_stream *writer)
 {
-  z_off_t written, total_row_length;
-  uint *ptr, *end;
+  my_off_t written;
+  unsigned int r_pack_length;
   DBUG_ENTER("ha_archive::real_write_row");
-  total_row_length= table->s->reclength;
-  for (ptr= table->s->blob_field, end= ptr + table->s->blob_fields;
-       ptr != end; ptr++)
-    total_row_length+= ((Field_blob*) table->field[*ptr])->get_length();
-  if (share->approx_file_size > max_zfile_size - total_row_length)
-  {
-    info(HA_STATUS_TIME);
-    share->approx_file_size= (ulong) data_file_length;
-    if (share->approx_file_size > max_zfile_size - total_row_length)
-      DBUG_RETURN(HA_ERR_RECORD_FILE_FULL);
-  }
-  share->approx_file_size+= total_row_length;
-  written= gzwrite(writer, buf, table->s->reclength);
-  DBUG_PRINT("ha_archive::real_write_row", ("Wrote %d bytes expected %lu", (int) written,
-                                            table->s->reclength));
+
+  /* We pack the row for writing */
+  r_pack_length= pack_row(buf);
+
+  written= azwrite(writer, record_buffer->buffer, r_pack_length);
+  if (written != r_pack_length)
+  {
+    DBUG_PRINT("ha_archive", ("Wrote %d bytes expected %d", 
+                                              (uint32) written, 
+                                              (uint32)r_pack_length));
+    DBUG_RETURN(-1);
+  }
+
   if (!delayed_insert || !bulk_insert)
     share->dirty= TRUE;
 
-  if (written != (z_off_t)table->s->reclength)
-    DBUG_RETURN(errno ? errno : -1);
-  /*
-    We should probably mark the table as damagaged if the record is written
-    but the blob fails.
-  */
-  for (ptr= table->s->blob_field, end= ptr + table->s->blob_fields ;
+  DBUG_RETURN(0);
+}
+
+
+/* 
+  Calculate max length needed for row. This includes
+  the bytes required for the length in the header.
+*/
+
+uint32 ha_archive::max_row_length(const byte *buf)
+{
+  uint32 length= (uint32)(table->s->reclength + table->s->fields*2);
+  length+= ARCHIVE_ROW_HEADER_SIZE;
+
+  uint *ptr, *end;
+  for (ptr= table->s->blob_field, end=ptr + table->s->blob_fields ;
        ptr != end ;
        ptr++)
   {
-    char *data_ptr;
-    uint32 size= ((Field_blob*) table->field[*ptr])->get_length();
+      length += 2 + ((Field_blob*)table->field[*ptr])->get_length();
+  }
 
-    if (size)
-    {
-      ((Field_blob*) table->field[*ptr])->get_ptr(&data_ptr);
-      written= gzwrite(writer, data_ptr, (unsigned)size);
-      if (written != (z_off_t)size)
-        DBUG_RETURN(errno ? errno : -1);
-    }
+  return length;
+}
+
+
+unsigned int ha_archive::pack_row(byte *record)
+{
+  byte *ptr;
+
+  DBUG_ENTER("ha_archive::pack_row");
+
+
+  if (fix_rec_buff(max_row_length(record)))
+    DBUG_RETURN(HA_ERR_OUT_OF_MEM); /* purecov: inspected */
+
+  /* Copy null bits */
+  memcpy(record_buffer->buffer+ARCHIVE_ROW_HEADER_SIZE, 
+         record, table->s->null_bytes);
+  ptr= record_buffer->buffer + table->s->null_bytes + ARCHIVE_ROW_HEADER_SIZE;
+
+  for (Field **field=table->field ; *field ; field++)
+  {
+    if (!((*field)->is_null()))
+      ptr=(byte*) (*field)->pack((char*) ptr,
+                                 (char*) record + (*field)->offset(record));
   }
-  DBUG_RETURN(0);
+
+  int4store(record_buffer->buffer, (int)(ptr - record_buffer->buffer -
+                                         ARCHIVE_ROW_HEADER_SIZE)); 
+  DBUG_PRINT("ha_archive",("Pack row length %u", (unsigned int)
+                           (ptr - record_buffer->buffer - 
+                             ARCHIVE_ROW_HEADER_SIZE)));
+
+  DBUG_RETURN((unsigned int) (ptr - record_buffer->buffer));
 }
 
 
@@ -725,50 +768,204 @@
 int ha_archive::write_row(byte *buf)
 {
   int rc;
+  byte *read_buf= NULL;
+  ulonglong temp_auto;
+  byte *record=  table->record[0];
   DBUG_ENTER("ha_archive::write_row");
 
   if (share->crashed)
+    DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE);
+
+  if (!share->archive_write_open)
+    if (init_archive_writer())
       DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE);
 
-  statistic_increment(table->in_use->status_var.ha_write_count, &LOCK_status);
+  ha_statistic_increment(&SSV::ha_write_count);
   if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_INSERT)
     table->timestamp_field->set_time();
   pthread_mutex_lock(&share->mutex);
-  if (!share->archive_write_open)
-    if (init_archive_writer())
-      DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE);
 
-  /*
-    Varchar structures are constant in size but are not cleaned up request
-    to request. The following sets all unused space to null to improve
-    compression.
-  */
-  for (Field **field=table->field ; *field ; field++)
+  if (table->next_number_field && record == table->record[0])
   {
-    DBUG_PRINT("archive",("Pack is %d\n", (*field)->pack_length()));
-    DBUG_PRINT("archive",("MyPack is %d\n", (*field)->data_length((char*) buf + (*field)->offset())));
-    if ((*field)->real_type() == MYSQL_TYPE_VARCHAR) 
+    KEY *mkey= &table->s->key_info[0]; // We only support one key right now
+    update_auto_increment();
+    temp_auto= table->next_number_field->val_int();
+
+    /*
+      We don't support decrementing auto_increment values. Allowing them
+      would make the performance just cry.
+    */
+    if (temp_auto <= share->archive_write.auto_increment && 
+        mkey->flags & HA_NOSAME)
     {
-#ifndef DBUG_OFF
-      uint actual_length= (*field)->data_length((char*) buf + (*field)->offset());
-      uint offset= (*field)->offset() + actual_length + 
-        (actual_length > 255 ? 2 : 1);
-      DBUG_PRINT("archive",("Offset is %d -> %d\n", actual_length, offset));
-#endif
-      /*
-      if ((*field)->pack_length() + (*field)->offset() != offset)
-        bzero(buf + offset, (size_t)((*field)->pack_length() + (actual_length > 255 ? 2 : 1) - (*field)->data_length));
+      rc= HA_ERR_FOUND_DUPP_KEY;
+      goto error;
+    }
+#ifdef DEAD_CODE
+    /*
+      Bad news, this will cause a search for the unique value which is very 
+      expensive since we will have to do a table scan which will lock up 
+      all other writers during this period. This could perhaps be optimized 
+      in the future.
     */
+    {
+      /* 
+        First we create a buffer that we can use for reading rows, and can pass
+        to get_row().
+      */
+      if (!(read_buf= (byte*) my_malloc(table->s->reclength, MYF(MY_WME))))
+      {
+        rc= HA_ERR_OUT_OF_MEM;
+        goto error;
+      }
+       /* 
+         All of the buffer must be written out or we won't see all of the
+         data 
+       */
+      azflush(&(share->archive_write), Z_SYNC_FLUSH);
+      /*
+        Set the position of the local read thread to the beginning position.
+      */
+      if (read_data_header(&archive))
+      {
+        rc= HA_ERR_CRASHED_ON_USAGE;
+        goto error;
+      }
+
+      Field *mfield= table->next_number_field;
+
+      while (!(get_row(&archive, read_buf)))
+      {
+        if (!memcmp(read_buf + mfield->offset(record),
+                    table->next_number_field->ptr,
+                    mfield->max_display_length()))
+        {
+          rc= HA_ERR_FOUND_DUPP_KEY;
+          goto error;
+        }
+      }
+    }
+#endif
+    else
+    {
+      if (temp_auto > share->archive_write.auto_increment)
+        stats.auto_increment_value= share->archive_write.auto_increment= 
+          temp_auto;
     }
   }
 
+  /*
+    Notice that the global auto_increment has been increased.
+    In case of a failed row write, we will never try to reuse the value.
+  */
   share->rows_recorded++;
-  rc= real_write_row(buf, share->archive_write);
+  rc= real_write_row(buf,  &(share->archive_write));
+error:
   pthread_mutex_unlock(&share->mutex);
+  if (read_buf)
+    my_free((gptr) read_buf, MYF(0));
 
   DBUG_RETURN(rc);
 }
 
+
+void ha_archive::get_auto_increment(ulonglong offset, ulonglong increment,
+                                    ulonglong nb_desired_values,
+                                    ulonglong *first_value,
+                                    ulonglong *nb_reserved_values)
+{
+  *nb_reserved_values= 1;
+  *first_value= share->archive_write.auto_increment + 1;
+}
+
+/* Initialized at each key walk (called multiple times unlike rnd_init()) */
+int ha_archive::index_init(uint keynr, bool sorted)
+{
+  DBUG_ENTER("ha_archive::index_init");
+  active_index= keynr;
+  DBUG_RETURN(0);
+}
+
+
+/*
+  We have no real indexes. Since we tell the optimizer that we have unique
+  indexes, any request for an index search is answered with a table scan.
+*/
+int ha_archive::index_read(byte *buf, const byte *key,
+                             uint key_len, enum ha_rkey_function find_flag)
+{
+  int rc;
+  DBUG_ENTER("ha_archive::index_read");
+  rc= index_read_idx(buf, active_index, key, key_len, find_flag);
+  DBUG_RETURN(rc);
+}
+
+
+int ha_archive::index_read_idx(byte *buf, uint index, const byte *key,
+                                 uint key_len, enum ha_rkey_function find_flag)
+{
+  int rc= 0;
+  bool found= 0;
+  KEY *mkey= &table->s->key_info[index];
+  current_k_offset= mkey->key_part->offset;
+  current_key= key;
+  current_key_len= key_len;
+
+
+  DBUG_ENTER("ha_archive::index_read_idx");
+
+  /* 
+    All of the buffer must be written out or we won't see all of the
+    data 
+  */
+  pthread_mutex_lock(&share->mutex);
+  azflush(&(share->archive_write), Z_SYNC_FLUSH);
+  pthread_mutex_unlock(&share->mutex);
+
+  /*
+    Set the position of the local read thread to the beginning position.
+  */
+  if (read_data_header(&archive))
+  {
+    rc= HA_ERR_CRASHED_ON_USAGE;
+    goto error;
+  }
+
+  while (!(get_row(&archive, buf)))
+  {
+    if (!memcmp(current_key, buf + current_k_offset, current_key_len))
+    {
+      found= 1;
+      break;
+    }
+  }
+
+  if (found)
+    DBUG_RETURN(0);
+
+error:
+  DBUG_RETURN(rc ? rc : HA_ERR_END_OF_FILE);
+}
+
+
+int ha_archive::index_next(byte * buf) 
+{ 
+  bool found= 0;
+
+  DBUG_ENTER("ha_archive::index_next");
+
+  while (!(get_row(&archive, buf)))
+  {
+    if (!memcmp(current_key, buf+current_k_offset, current_key_len))
+    {
+      found= 1;
+      break;
+    }
+  }
+
+  DBUG_RETURN(found ? 0 : HA_ERR_END_OF_FILE); 
+}
+
 /*
   All calls that need to scan the table start with this method. If we are told
   that it is a table scan we rewind the file to the beginning, otherwise
@@ -786,26 +983,27 @@
   if (scan)
   {
     scan_rows= share->rows_recorded;
-    DBUG_PRINT("info", ("archive will retrieve %lu rows", (ulong) scan_rows));
-    records= 0;
+    DBUG_PRINT("info", ("archive will retrieve %llu rows", 
+                        (unsigned long long) scan_rows));
+    stats.records= 0;
 
     /* 
       If dirty, we lock, and then reset/flush the data.
-      I found that just calling gzflush() doesn't always work.
+      I found that just calling azflush() doesn't always work.
     */
     if (share->dirty == TRUE)
     {
       pthread_mutex_lock(&share->mutex);
       if (share->dirty == TRUE)
       {
-        DBUG_PRINT("info", ("archive flushing out rows for scan"));
-        gzflush(share->archive_write, Z_SYNC_FLUSH);
+        DBUG_PRINT("ha_archive", ("archive flushing out rows for scan"));
+        azflush(&(share->archive_write), Z_SYNC_FLUSH);
         share->dirty= FALSE;
       }
       pthread_mutex_unlock(&share->mutex);
     }
 
-    if (read_data_header(archive))
+    if (read_data_header(&archive))
       DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE);
   }
 
@@ -817,25 +1015,133 @@
   This is the method that is used to read a row. It assumes that the row is 
   positioned where you want it.
 */
-int ha_archive::get_row(gzFile file_to_read, byte *buf)
+int ha_archive::get_row(azio_stream *file_to_read, byte *buf)
 {
-  int read; // Bytes read, gzread() returns int
-  uint *ptr, *end;
-  char *last;
-  size_t total_blob_length= 0;
+  int rc;
   DBUG_ENTER("ha_archive::get_row");
+  DBUG_PRINT("ha_archive", ("Picking version for get_row() %d -> %d", 
+                            (uchar)file_to_read->version, 
+                            ARCHIVE_VERSION));
+  if (file_to_read->version == ARCHIVE_VERSION)
+    rc= get_row_version3(file_to_read, buf);
+  else
+    rc= get_row_version2(file_to_read, buf);
+
+  DBUG_PRINT("ha_archive", ("Return %d\n", rc));
+
+  DBUG_RETURN(rc);
+}
+
+/*
+  Grow record_buffer with my_realloc() if it holds fewer than `length` bytes.
+  Returns 0 on success; 1 if reallocation fails (record_buffer unmodified).
+*/
+bool ha_archive::fix_rec_buff(unsigned int length)
+{
+  DBUG_ENTER("ha_archive::fix_rec_buff");
+  DBUG_PRINT("ha_archive", ("Fixing %u for %u", 
+                            length, record_buffer->length));
+  DBUG_ASSERT(record_buffer->buffer);
+
+  if (length <= record_buffer->length)
+    DBUG_RETURN(0);                             /* already large enough */
+
+  byte *grown= (byte*) my_realloc((gptr) record_buffer->buffer, length,
+                                  MYF(MY_ALLOW_ZERO_PTR));
+  if (!grown)
+    DBUG_RETURN(1);
+  record_buffer->buffer= grown;
+  record_buffer->length= length;
+  DBUG_ASSERT(length <= record_buffer->length);
+  DBUG_RETURN(0);
+}
 
-  read= gzread(file_to_read, buf, table->s->reclength);
-  DBUG_PRINT("ha_archive::get_row", ("Read %d bytes expected %lu", (int) read,
-                                     table->s->reclength));
+/*
+  Read one packed row from file_to_read and expand it into `record`.
+  Returns 0 on success, HA_ERR_END_OF_FILE when no row header is left,
+  HA_ERR_CRASHED_ON_USAGE on a bad header, HA_ERR_OUT_OF_MEM if the record
+  buffer cannot grow, and -1 when the row body is short or azread() fails.
+*/
+int ha_archive::unpack_row(azio_stream *file_to_read, byte *record)
+{
+  DBUG_ENTER("ha_archive::unpack_row");
+
+  unsigned int read;
+  int error;
+  byte size_buffer[ARCHIVE_ROW_HEADER_SIZE];
+  unsigned int row_len;
+
+  /* First we grab the length stored */
+  read= azread(file_to_read, (byte *)size_buffer, ARCHIVE_ROW_HEADER_SIZE, &error);
 
-  if (read == Z_STREAM_ERROR)
+  if (error == Z_STREAM_ERROR ||  (read && read < ARCHIVE_ROW_HEADER_SIZE))
     DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE);
 
   /* If we read nothing we are at the end of the file */
+  if (read == 0 || read != ARCHIVE_ROW_HEADER_SIZE)
+    DBUG_RETURN(HA_ERR_END_OF_FILE);
+
+  row_len=  uint4korr(size_buffer);
+  DBUG_PRINT("ha_archive",("Unpack row length %u -> %u", row_len, 
+                           (unsigned int)table->s->reclength));
+  /* Without room for row_len bytes the read below would overrun the buffer */
+  if (fix_rec_buff(row_len))
+    DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+
+  read= azread(file_to_read, record_buffer->buffer, row_len, &error);
+  if (read != row_len || error)
+    DBUG_RETURN(-1);
+
+  /* Copy null bits */
+  const char *ptr= (const char*) record_buffer->buffer;
+  memcpy(record, ptr, table->s->null_bytes);
+  ptr+= table->s->null_bytes;
+  for (Field **field=table->field ; *field ; field++)
+    if (!((*field)->is_null()))
+      ptr= (*field)->unpack((char *)record + 
+                            (*field)->offset(table->record[0]), ptr);
+
+  DBUG_RETURN(0);
+}
+
+
+/* Row reader for files at ARCHIVE_VERSION: a thin wrapper on unpack_row(). */
+int ha_archive::get_row_version3(azio_stream *file_to_read, byte *buf)
+{
+  int unpack_rc;
+  DBUG_ENTER("ha_archive::get_row_version3");
+  unpack_rc= unpack_row(file_to_read, buf);
+  DBUG_RETURN(unpack_rc);
+}
+
+
+int ha_archive::get_row_version2(azio_stream *file_to_read, byte *buf)
+{
+  unsigned int read;
+  int error;
+  uint *ptr, *end;
+  char *last;
+  size_t total_blob_length= 0;
+  MY_BITMAP *read_set= table->read_set;
+  DBUG_ENTER("ha_archive::get_row_version2");
+
+  read= azread(file_to_read, (voidp)buf, table->s->reclength, &error);
+
+  /* If we read nothing we are at the end of the file */
   if (read == 0)
     DBUG_RETURN(HA_ERR_END_OF_FILE);
 
+  if (read != table->s->reclength)
+  {
+    DBUG_PRINT("ha_archive::get_row_version2", ("Read %u bytes expected %u", 
+                                                read, 
+                                                (unsigned int)table->s->reclength));
+    DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE);
+  }
+
+  if (error == Z_STREAM_ERROR || error == Z_DATA_ERROR )
+    DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE);
+
   /* 
     If the record is the wrong size, the file is probably damaged, unless 
     we are dealing with a delayed insert or a bulk insert.
@@ -847,7 +1153,11 @@
   for (ptr= table->s->blob_field, end=ptr + table->s->blob_fields ;
        ptr != end ;
        ptr++)
-    total_blob_length += ((Field_blob*) table->field[*ptr])->get_length();
+  {
+    if (bitmap_is_set(read_set,
+                      (((Field_blob*) table->field[*ptr])->field_index)))
+        total_blob_length += ((Field_blob*) table->field[*ptr])->get_length();
+  }
 
   /* Adjust our row buffer if we need be */
   buffer.alloc(total_blob_length);
@@ -861,11 +1171,23 @@
     size_t size= ((Field_blob*) table->field[*ptr])->get_length();
     if (size)
     {
-      read= gzread(file_to_read, last, size);
-      if ((size_t) read != size)
-        DBUG_RETURN(HA_ERR_END_OF_FILE);
-      ((Field_blob*) table->field[*ptr])->set_ptr(size, last);
-      last += size;
+      if (bitmap_is_set(read_set,
+                        ((Field_blob*) table->field[*ptr])->field_index))
+      {
+        read= azread(file_to_read, last, size, &error);
+
+        if (error)
+          DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE);
+
+        if ((size_t) read != size)
+          DBUG_RETURN(HA_ERR_END_OF_FILE);
+        ((Field_blob*) table->field[*ptr])->set_ptr(size, last);
+        last += size;
+      }
+      else
+      {
+        (void)azseek(file_to_read, size, SEEK_CUR);
+      }
     }
   }
   DBUG_RETURN(0);
@@ -889,14 +1211,13 @@
     DBUG_RETURN(HA_ERR_END_OF_FILE);
   scan_rows--;
 
-  statistic_increment(table->in_use->status_var.ha_read_rnd_next_count,
-		      &LOCK_status);
-  current_position= gztell(archive);
-  rc= get_row(archive, buf);
+  ha_statistic_increment(&SSV::ha_read_rnd_next_count);
+  current_position= aztell(&archive);
+  rc= get_row(&archive, buf);
 
 
   if (rc != HA_ERR_END_OF_FILE)
-    records++;
+    stats.records++;
 
   DBUG_RETURN(rc);
 }
@@ -926,12 +1247,11 @@
 int ha_archive::rnd_pos(byte * buf, byte *pos)
 {
   DBUG_ENTER("ha_archive::rnd_pos");
-  statistic_increment(table->in_use->status_var.ha_read_rnd_next_count,
-		      &LOCK_status);
-  current_position= (z_off_t)my_get_ptr(pos, ref_length);
-  (void)gzseek(archive, current_position, SEEK_SET);
+  ha_statistic_increment(&SSV::ha_read_rnd_next_count);
+  current_position= (my_off_t)my_get_ptr(pos, ref_length);
+  (void)azseek(&archive, current_position, SEEK_SET);
 
-  DBUG_RETURN(get_row(archive, buf));
+  DBUG_RETURN(get_row(&archive, buf));
 }
 
 /*
@@ -959,55 +1279,40 @@
 int ha_archive::optimize(THD* thd, HA_CHECK_OPT* check_opt)
 {
   DBUG_ENTER("ha_archive::optimize");
-  int rc;
-  gzFile writer;
+  int rc= 0;
+  azio_stream writer;
   char writer_filename[FN_REFLEN];
 
-  /* Open up the writer if we haven't yet */
-  if (!share->archive_write_open)
-    init_archive_writer();
-
-  /* Flush any waiting data */
-  gzflush(share->archive_write, Z_SYNC_FLUSH);
+  // now we close both our writer and our reader for the rename
+  if (share->archive_write_open)
+  {
+    azclose(&(share->archive_write));
+    share->archive_write_open= FALSE;
+  }
 
   /* Lets create a file to contain the new data */
   fn_format(writer_filename, share->table_name, "", ARN, 
-            MY_REPLACE_EXT|MY_UNPACK_FILENAME);
+            MY_REPLACE_EXT | MY_UNPACK_FILENAME);
 
-  if ((writer= gzopen(writer_filename, "wb")) == NULL)
+  if (!(azopen(&writer, writer_filename, O_CREAT|O_RDWR|O_BINARY)))
     DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE); 
 
   /* 
     An extended rebuild is a lot more effort. We open up each row and re-record it. 
     Any dead rows are removed (aka rows that may have been partially recorded). 
-  */
 
-  if (check_opt->flags == T_EXTEND)
+    As of Archive format 3, this is the only type that is performed, before this
+    version it was just done on T_EXTEND
+  */
+  if (1)
   {
-    byte *buf; 
-
-    /* 
-      First we create a buffer that we can use for reading rows, and can pass
-      to get_row().
-    */
-    if (!(buf= (byte*) my_malloc(table->s->reclength, MYF(MY_WME))))
-    {
-      rc= HA_ERR_OUT_OF_MEM;
-      goto error;
-    }
+    DBUG_PRINT("ha_archive", ("archive extended rebuild"));
 
     /*
       Now we will rewind the archive file so that we are positioned at the 
       start of the file.
     */
-    rc= read_data_header(archive);
-    
-    /*
-      Assuming now error from rewinding the archive file, we now write out the 
-      new header for out data file.
-    */
-    if (!rc)
-      rc= write_data_header(writer);
+    rc= read_data_header(&archive);
 
     /* 
       On success of writing out the new header, we now fetch each row and
@@ -1016,58 +1321,55 @@
     if (!rc)
     {
       share->rows_recorded= 0;
-      while (!(rc= get_row(archive, buf)))
+      stats.auto_increment_value= share->archive_write.auto_increment= 0;
+      my_bitmap_map *org_bitmap= dbug_tmp_use_all_columns(table, table->read_set);
+
+      while (!(rc= get_row(&archive, table->record[0])))
       {
-        real_write_row(buf, writer);
-        share->rows_recorded++;
+        real_write_row(table->record[0], &writer);
+        /*
+          Long term it should be possible to optimize this so that
+          it is not called on each row.
+        */
+        if (table->found_next_number_field)
+        {
+          Field *field= table->found_next_number_field;
+          ulonglong auto_value=
+            (ulonglong) field->val_int((char*)(table->record[0] +
+                                               field->offset(table->record[0])));
+          if (share->archive_write.auto_increment < auto_value)
+            stats.auto_increment_value= share->archive_write.auto_increment=
+              auto_value;
+        }
       }
+
+      dbug_tmp_restore_column_map(table->read_set, org_bitmap);
+      share->rows_recorded= (ha_rows)writer.rows;
     }
-    DBUG_PRINT("info", ("recovered %lu archive rows",
-                        (ulong) share->rows_recorded));
 
-    my_free((char*)buf, MYF(0));
+    DBUG_PRINT("info", ("recovered %llu archive rows", 
+                        (unsigned long long)share->rows_recorded));
+
+    DBUG_PRINT("ha_archive", ("recovered %llu archive rows", 
+                        (unsigned long long)share->rows_recorded));
+
     if (rc && rc != HA_ERR_END_OF_FILE)
       goto error;
   } 
-  else
-  {
-    /* 
-      The quick method is to just read the data raw, and then compress it directly.
-    */
-    int read; // Bytes read, gzread() returns int
-    char block[IO_SIZE];
-    if (gzrewind(archive) == -1)
-    {
-      rc= HA_ERR_CRASHED_ON_USAGE;
-      goto error;
-    }
 
-    while ((read= gzread(archive, block, IO_SIZE)))
-      gzwrite(writer, block, read);
-  }
-
-  gzflush(writer, Z_SYNC_FLUSH);
+  azclose(&writer);
   share->dirty= FALSE;
-  gzclose(share->archive_write);
-  share->archive_write= writer; 
-
-  my_rename(writer_filename,share->data_file_name,MYF(0));
-
-  /*
-    Now we need to reopen our read descriptor since it has changed.
-  */
-  gzclose(archive);
-  if ((archive= gzopen(share->data_file_name, "rb")) == NULL)
-  {
-    rc= HA_ERR_CRASHED_ON_USAGE;
-    goto error;
-  }
+  
+  azclose(&archive);
 
+  // make the file we just wrote be our data file
+  rc = my_rename(writer_filename,share->data_file_name,MYF(0));
 
-  DBUG_RETURN(0); 
 
+  DBUG_RETURN(rc);
 error:
-  gzclose(writer);
+  DBUG_PRINT("ha_archive", ("Failed to recover, error was %d", rc));
+  azclose(&writer);
 
   DBUG_RETURN(rc); 
 }
@@ -1094,8 +1396,8 @@
     */
 
     if ((lock_type >= TL_WRITE_CONCURRENT_INSERT &&
-         lock_type <= TL_WRITE) && !thd->in_lock_tables
-        && !thd->tablespace_op)
+         lock_type <= TL_WRITE) && !thd_in_lock_tables(thd)
+        && !thd_tablespace_op(thd))
       lock_type = TL_WRITE_ALLOW_WRITE;
 
     /* 
@@ -1106,7 +1408,7 @@
       concurrent inserts to t2. 
     */
 
-    if (lock_type == TL_READ_NO_INSERT && !thd->in_lock_tables) 
+    if (lock_type == TL_READ_NO_INSERT && !thd_in_lock_tables(thd)) 
       lock_type = TL_READ;
 
     lock.type=lock_type;
@@ -1117,6 +1419,31 @@
   return to;
 }
 
+/* Report this table's auto-increment value and data file path. */
+void ha_archive::update_create_info(HA_CREATE_INFO *create_info)
+{
+  DBUG_ENTER("ha_archive::update_create_info");
+
+  /* Refresh stats.auto_increment_value before it is read below. */
+  ha_archive::info(HA_STATUS_AUTO);
+  const bool auto_flagged=
+    (create_info->used_fields & HA_CREATE_USED_AUTO) != 0;
+  if (auto_flagged)
+  {
+    /*
+      Archive tracks the last value used, not the next one, so 1 is added
+      to make the output look like MyISAM's. The match is incomplete:
+      "SHOW CREATE TABLE" on MyISAM records the last position, whereas
+      here the original position the table was created with is reported.
+    */
+    create_info->auto_increment_value= stats.auto_increment_value + 1;
+  }
+
+  if (my_readlink(share->real_path, share->data_file_name, MYF(0)) == 0)
+    create_info->data_file_name= share->real_path;
+  DBUG_VOID_RETURN;
+}
+
 
 /*
   Hints for optimizer, see ha_tina for more information
@@ -1128,8 +1455,8 @@
     This should be an accurate number now, though bulk and delayed inserts can
     cause the number to be inaccurate.
   */
-  records= share->rows_recorded;
-  deleted= 0;
+  stats.records= share->rows_recorded;
+  stats.deleted= 0;
   /* Costs quite a bit more to get all information */
   if (flag & HA_STATUS_TIME)
   {
@@ -1137,14 +1464,20 @@
 
     VOID(my_stat(share->data_file_name, &file_stat, MYF(MY_WME)));
 
-    mean_rec_length= table->s->reclength + buffer.alloced_length();
-    data_file_length= file_stat.st_size;
-    create_time= file_stat.st_ctime;
-    update_time= file_stat.st_mtime;
-    max_data_file_length= share->rows_recorded * mean_rec_length;
+    stats.mean_rec_length= table->s->reclength + buffer.alloced_length();
+    stats.data_file_length= file_stat.st_size;
+    stats.create_time= file_stat.st_ctime;
+    stats.update_time= file_stat.st_mtime;
+    stats.max_data_file_length= share->rows_recorded * stats.mean_rec_length;
+  }
+  stats.delete_length= 0;
+  stats.index_file_length=0;
+
+  if (flag & HA_STATUS_AUTO)
+  {
+    azflush(&archive, Z_SYNC_FLUSH);
+    stats.auto_increment_value= archive.auto_increment;
   }
-  delete_length= 0;
-  index_file_length=0;
 
   DBUG_RETURN(0);
 }
@@ -1205,13 +1538,13 @@
 {
   int rc= 0;
   byte *buf; 
-  const char *old_proc_info=thd->proc_info;
+  const char *old_proc_info;
   ha_rows count= share->rows_recorded;
   DBUG_ENTER("ha_archive::check");
 
-  thd->proc_info= "Checking table";
+  old_proc_info= thd_proc_info(thd, "Checking table");
   /* Flush any waiting data */
-  gzflush(share->archive_write, Z_SYNC_FLUSH);
+  azflush(&(share->archive_write), Z_SYNC_FLUSH);
 
   /* 
     First we create a buffer that we can use for reading rows, and can pass
@@ -1225,15 +1558,15 @@
     start of the file.
   */
   if (!rc)
-    read_data_header(archive);
+    read_data_header(&archive);
 
   if (!rc)
-    while (!(rc= get_row(archive, buf)))
+    while (!(rc= get_row(&archive, buf)))
       count--;
 
   my_free((char*)buf, MYF(0));
 
-  thd->proc_info= old_proc_info;
+  thd_proc_info(thd, old_proc_info);
 
   if ((rc && rc != HA_ERR_END_OF_FILE) || count)  
   {
@@ -1258,4 +1591,54 @@
 
   DBUG_RETURN(repair(thd, &check_opt));
 }
-#endif /* HAVE_ARCHIVE_DB */
+
+/*
+  Allocate an archive_record_buffer together with a data area of `length`
+  bytes. Returns NULL if either my_malloc() call fails.
+*/
+archive_record_buffer *ha_archive::create_record_buffer(unsigned int length)
+{
+  DBUG_ENTER("ha_archive::create_record_buffer");
+  archive_record_buffer *rec_buf= (archive_record_buffer*)
+    my_malloc(sizeof(archive_record_buffer), MYF(MY_WME));
+  if (rec_buf == NULL)
+    DBUG_RETURN(NULL); /* purecov: inspected */
+
+  rec_buf->length= (int)length;
+  rec_buf->buffer= (byte*) my_malloc(rec_buf->length, MYF(MY_WME));
+  if (rec_buf->buffer == NULL)
+  {
+    my_free((char*) rec_buf, MYF(MY_ALLOW_ZERO_PTR));
+    DBUG_RETURN(NULL); /* purecov: inspected */
+  }
+  DBUG_RETURN(rec_buf);
+}
+/* Free a record buffer and its data area; a NULL r is tolerated. */
+void ha_archive::destroy_record_buffer(archive_record_buffer *r) 
+{
+  DBUG_ENTER("ha_archive::destroy_record_buffer");
+  my_free((char*) (r ? r->buffer : NULL), MYF(MY_ALLOW_ZERO_PTR));
+  my_free((char*) r, MYF(MY_ALLOW_ZERO_PTR));
+  DBUG_VOID_RETURN;
+}
+
+struct st_mysql_storage_engine archive_storage_engine=
+{ MYSQL_HANDLERTON_INTERFACE_VERSION };
+
+mysql_declare_plugin(archive)
+{
+  MYSQL_STORAGE_ENGINE_PLUGIN,
+  &archive_storage_engine,
+  "ARCHIVE",
+  "Brian Aker, MySQL AB",
+  "Archive storage engine",
+  PLUGIN_LICENSE_GPL,
+  archive_db_init, /* Plugin Init */
+  archive_db_done, /* Plugin Deinit */
+  0x0300 /* 3.0 */,
+  NULL,                       /* status variables                */
+  NULL,                       /* system variables                */
+  NULL                        /* config options                  */
+}
+mysql_declare_plugin_end;
+

--- 1.26/client/mysql_upgrade.c	2007-01-29 02:47:29 +03:00
+++ 1.27/client/mysql_upgrade.c	2007-03-06 19:24:46 +03:00
@@ -171,7 +171,7 @@
       }
 
       d= (extra_default_t *)my_malloc(sizeof(extra_default_t), 
-                           MYF(MY_FAE|MY_ZEROFILL));
+                                      MYF(MY_FAE | MY_ZEROFILL));
       d->id= id;
       d->name= opt->name;
       d->n_len= strlen(opt->name);
@@ -345,15 +345,17 @@
       }
       dynstr_set(&buf, NULL);
     }
-    if (dynstr_append_mem(&buf, "\n", 1)
-       || dynstr_append_mem(&buf, d->name, d->n_len)
-       || (d->v_len && (dynstr_append_mem(&buf, "=", 1)
-       || dynstr_append_mem(&buf, d->value, d->v_len))))
+    if (dynstr_append_mem(&buf, "\n", 1) ||
+        dynstr_append_mem(&buf, d->name, d->n_len) ||
+        (d->v_len && (dynstr_append_mem(&buf, "=", 1) ||
+                      dynstr_append_mem(&buf, d->value, d->v_len))))
     {
       ret= 1;
       goto error;
     }
     my_delete((gptr)d, MYF(0));
+    my_free((gptr) d, MYF(0));
+
     list_pop(extra_defaults);                   /* pop off the head */
   }
   if (my_write(defaults_file, buf.str, buf.length, MYF(MY_FNABP | MY_WME)))
@@ -451,10 +453,10 @@
   char *forced_extra_defaults;
   char *local_defaults_group_suffix;
   const char *script_line;
-  char *upgrade_defaults_path= 0;
+  char *upgrade_defaults_path= NULL;
   char *defaults_to_use= NULL;
   int upgrade_defaults_created= 0;
-  
+  int no_defaults;
   char path[FN_REFLEN];
   DYNAMIC_STRING cmdline;
 
@@ -464,6 +466,10 @@
 #endif
 
   /* Check if we are forced to use specific defaults */
+  no_defaults= 0;
+  if (argc >= 2 && !strcmp(argv[1],"--no-defaults"))
+    no_defaults= 1;
+
   get_defaults_options(argc, argv,
                        &forced_defaults_file, &forced_extra_defaults,
                        &local_defaults_group_suffix);
@@ -578,7 +584,9 @@
   if (defaults_to_use)
   {
     dynstr_append(&cmdline, " ");
-    dynstr_append_os_quoted(&cmdline, "--defaults-extra-file=", 
+    dynstr_append_os_quoted(&cmdline,
+                            (no_defaults ? "--defaults-file=" :
+                             "--defaults-extra-file="),
                             defaults_to_use, NullS);
   }
 
@@ -652,7 +660,9 @@
   if (defaults_to_use)
   {
     dynstr_append(&cmdline, " ");
-    dynstr_append_os_quoted(&cmdline, "--defaults-extra-file=", 
+    dynstr_append_os_quoted(&cmdline,
+                            (no_defaults ? "--defaults-file=" :
+                             "--defaults-extra-file="),
                             defaults_to_use, NullS);
   }
   dynstr_append(&cmdline, " ");
@@ -684,6 +694,7 @@
   if (upgrade_defaults_created)
     my_delete(upgrade_defaults_path, MYF(0));
   
+  my_free(upgrade_defaults_path, MYF(MY_ALLOW_ZERO_PTR));
   my_end(info_flag ? MY_CHECK_ERROR : 0);
   return ret;
 }
Thread
bk commit into 5.1 tree (kostja:1.2452)konstantin6 Mar