List: Maria Storage Engine
From: Guilhem Bichot
Date: January 19 2009 8:26am
Subject: bzr commit into MySQL/Maria:mysql-5.1-maria-2.0 branch (guilhem:2720)
View as plain text  
#At bzr+ssh://bk-internal.mysql.com/bzrroot/server/mysql-5.1-maria-2.0/ based on revid:guilhem@stripped

 2720 Guilhem Bichot	2009-01-19 [merge]
      Merge from 5.1-maria (no conflicts)
added:
  mysql-test/suite/maria/t/maria-preload-master.opt
modified:
  client/mysqltest.c
  include/atomic/generic-msvc.h
  include/lf.h
  include/maria.h
  include/my_pthread.h
  include/waiting_threads.h
  mysql-test/mysql-test-run.pl
  mysql-test/suite/maria/r/maria-big.result
  mysql-test/suite/maria/r/maria-recovery-big.result
  mysql-test/suite/maria/r/maria-recovery3.result
  mysql-test/suite/maria/r/maria_notembedded.result
  mysql-test/suite/maria/t/maria-big.test
  mysql-test/suite/maria/t/maria-recovery-big-master.opt
  mysql-test/suite/maria/t/maria-recovery-big.test
  mysql-test/suite/maria/t/maria-recovery2-master.opt
  mysql-test/suite/maria/t/maria-recovery3.test
  mysql-test/suite/maria/t/maria_notembedded.test
  mysys/lf_alloc-pin.c
  mysys/lf_hash.c
  mysys/my_static.c
  mysys/my_thr_init.c
  mysys/waiting_threads.c
  server-tools/instance-manager/parse.cc
  sql/mysqld.cc
  sql/sql_class.cc
  sql/sql_class.h
  sql/sql_insert.cc
  storage/maria/ha_maria.cc
  storage/maria/ma_blockrec.c
  storage/maria/ma_blockrec.h
  storage/maria/ma_commit.c
  storage/maria/ma_create.c
  storage/maria/ma_delete_all.c
  storage/maria/ma_loghandler.c
  storage/maria/ma_loghandler.h
  storage/maria/ma_recovery.c
  storage/maria/ma_write.c
  storage/maria/trnman.c
  storage/maria/trnman.h
  storage/maria/trnman_public.h
  storage/maria/unittest/trnman-t.c
  storage/myisam/mi_create.c
  unittest/mysys/lf-t.c
  unittest/mysys/my_atomic-t.c
  unittest/mysys/thr_template.c
  unittest/mysys/waiting_threads-t.c

=== modified file 'client/mysqltest.c'
--- a/client/mysqltest.c	2008-10-10 15:28:41 +0000
+++ b/client/mysqltest.c	2009-01-16 22:12:25 +0000
@@ -7189,6 +7189,7 @@ static void init_signal_handling(void)
 #endif
   sigaction(SIGILL, &sa, NULL);
   sigaction(SIGFPE, &sa, NULL);
+  DBUG_VOID_RETURN;
 }
 
 #endif /* !__WIN__ */

=== modified file 'include/atomic/generic-msvc.h'
--- a/include/atomic/generic-msvc.h	2008-08-31 17:00:02 +0000
+++ b/include/atomic/generic-msvc.h	2009-01-15 21:27:36 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -52,11 +52,11 @@ LONG _InterlockedExchangeAdd (LONG volat
 #endif /*_M_IX86*/
 
 #define MY_ATOMIC_MODE "msvc-intrinsics"
-#define IL_EXCHG_ADD32(X,Y) InterlockedExchangeAdd((volatile LONG *)(X),(Y))
-#define IL_COMP_EXCHG32(X,Y,Z) InterlockedCompareExchange((volatile LONG *)(X),(Y),(Z))
-#define IL_COMP_EXCHGptr InterlockedCompareExchangePointer
-#define IL_EXCHG32       InterlockedExchange
-#define IL_EXCHGptr      InterlockedExchangePointer
+#define IL_EXCHG_ADD32(X,Y)     InterlockedExchangeAdd((volatile LONG *)(X),(Y))
+#define IL_COMP_EXCHG32(X,Y,Z)  InterlockedCompareExchange((volatile LONG *)(X),(Y),(Z))
+#define IL_COMP_EXCHGptr        InterlockedCompareExchangePointer
+#define IL_EXCHG32(X,Y)         InterlockedExchange((volatile LONG *)(X),(Y))
+#define IL_EXCHGptr             InterlockedExchangePointer
 #define make_atomic_add_body(S) \
   v= IL_EXCHG_ADD ## S (a, v)
 #define make_atomic_cas_body(S)                                 \

=== modified file 'include/lf.h'
--- a/include/lf.h	2008-07-29 14:10:24 +0000
+++ b/include/lf.h	2009-01-15 21:27:36 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007 MySQL AB
+/* Copyright (C) 2007-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -187,8 +187,8 @@ typedef struct st_lf_allocator {
   uchar * volatile top;
   uint element_size;
   uint32 volatile mallocs;
-  void (*constructor)(uchar *);
-  void (*destructor)(uchar *);
+  void (*constructor)(uchar *); /* called, when an object is malloc()'ed */
+  void (*destructor)(uchar *);  /* called, when an object is free()'d    */
 } LF_ALLOCATOR;
 
 void lf_alloc_init(LF_ALLOCATOR *allocator, uint size, uint free_ptr_offset);
@@ -219,7 +219,7 @@ lock_wrap(lf_alloc_new, void *,
 #define LF_HASH_UNIQUE 1
 
 /* lf_hash overhead per element (that is, sizeof(LF_SLIST) */
-#define LF_HASH_OVERHEAD (sizeof(int*)*4)
+extern const int LF_HASH_OVERHEAD;
 
 typedef struct {
   LF_DYNARRAY array;                    /* hash itself */

=== modified file 'include/maria.h'
--- a/include/maria.h	2008-10-14 21:23:33 +0000
+++ b/include/maria.h	2009-01-15 21:27:36 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -33,6 +33,8 @@ extern "C" {
 #include <myisamchk.h>
 #include <mysql/plugin.h>
 
+#define MARIA_CANNOT_ROLLBACK
+
 /*
   Limit max keys according to HA_MAX_POSSIBLE_KEY; See myisamchk.h for details
 */

=== modified file 'include/my_pthread.h'
--- a/include/my_pthread.h	2008-12-03 04:07:50 +0000
+++ b/include/my_pthread.h	2009-01-15 21:27:36 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2000 MySQL AB
+/* Copyright (C) 2000-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -430,19 +430,19 @@ int my_pthread_mutex_trylock(pthread_mut
 
 /* adapt for two different flavors of struct timespec */
 #ifdef HAVE_TIMESPEC_TS_SEC
-#define TV_sec  ts_sec
-#define TV_nsec ts_nsec
+#define MY_tv_sec  ts_sec
+#define MY_tv_nsec ts_nsec
 #else
-#define TV_sec  tv_sec
-#define TV_nsec tv_nsec
+#define MY_tv_sec  tv_sec
+#define MY_tv_nsec tv_nsec
 #endif /* HAVE_TIMESPEC_TS_SEC */
 
 #ifndef set_timespec_time_nsec
 #define set_timespec_time_nsec(ABSTIME,TIME,NSEC) do {                  \
   ulonglong nsec= (NSEC);                                               \
   ulonglong now= (TIME) + (nsec/100);                                   \
-  (ABSTIME).TV_sec=  (now / ULL(10000000));                             \
-  (ABSTIME).TV_nsec= (now % ULL(10000000) * 100 + (nsec % 100));        \
+  (ABSTIME).MY_tv_sec=  (now / ULL(10000000));                          \
+  (ABSTIME).MY_tv_nsec= (now % ULL(10000000) * 100 + (nsec % 100));     \
 } while(0)
 #endif /* !set_timespec_time_nsec */
 

=== modified file 'include/waiting_threads.h'
--- a/include/waiting_threads.h	2009-01-07 20:50:11 +0000
+++ b/include/waiting_threads.h	2009-01-15 21:27:36 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2008 MySQL AB
+/* Copyright (C) 2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -24,16 +24,18 @@
 C_MODE_START
 
 typedef struct st_wt_resource_id WT_RESOURCE_ID;
+typedef struct st_wt_resource WT_RESOURCE;
 
 typedef struct st_wt_resource_type {
-  int (*compare)(void *a, void *b);
-  const void *(*make_key)(WT_RESOURCE_ID *id, uint *len);
+  my_bool (*compare)(const void *a, const void *b);
+  const void *(*make_key)(const WT_RESOURCE_ID *id, uint *len); /* not used */
 } WT_RESOURCE_TYPE;
 
 struct st_wt_resource_id {
   ulonglong value;
-  WT_RESOURCE_TYPE *type;
+  const WT_RESOURCE_TYPE *type;
 };
+/* the below differs from sizeof(WT_RESOURCE_ID) by the amount of padding */
 #define sizeof_WT_RESOURCE_ID (sizeof(ulonglong)+sizeof(void*))
 
 #define WT_WAIT_STATS  24
@@ -43,93 +45,17 @@ extern uint32    wt_wait_stats[WT_WAIT_S
 extern uint32    wt_cycle_stats[2][WT_CYCLE_STATS+1];
 extern uint32    wt_success_stats;
 
-/*
-  'lock' protects 'owners', 'state', and 'waiter_count'
-  'id' is read-only
-
-  a resource is picked up from a hash in a lock-free manner
-  it's returned pinned, so it cannot be freed at once
-  but it may be freed right after the pin is removed
-  to free a resource it should be
-    1. have no owners
-    2. have no waiters
-
-  two ways to access a resource:
-    1. find it in a hash
-       - it's returned pinned.
-        a) take a lock in exclusive mode
-        b) check the state, it should be ACTIVE
-        c) unpin
-    2. by a direct reference
-       - could only used if a resource cannot be freed
-       e.g. accessing a resource by thd->waiting_for is safe,
-       a resource cannot be freed as there's a thread waiting for it
-*/
-typedef struct st_wt_resource {
-  WT_RESOURCE_ID  id;
-  uint            waiter_count;
-  enum { ACTIVE, FREE } state;
-#ifndef DBUG_OFF
-  pthread_mutex_t  *mutex;
-#endif
-  /*
-    before the 'lock' all elements are mutable, after (and including) -
-    immutable in the sense that lf_hash_insert() won't memcpy() over them.
-    See wt_init().
-  */
-#ifdef WT_RWLOCKS_USE_MUTEXES
-  /*
-    we need a special rwlock-like 'lock' to allow readers bypass
-    waiting writers, otherwise readers can deadlock. For example:
-
-      A waits on resource x, owned by B, B waits on resource y, owned
-      by A, we have a cycle (A->x->B->y->A)
-      Both A and B start deadlock detection:
-
-        A locks x                          B locks y
-        A goes deeper                      B goes deeper
-        A locks y                          B locks x
-
-      with mutexes it would deadlock. With rwlocks it won't, as long
-      as both A and B are taking read locks (and they do).
-      But other threads may take write locks. Assume there's
-      C who wants to start waiting on x, and D who wants to start
-      waiting on y.
-
-        A read-locks x                       B read-locks y
-        A goes deeper                        B goes deeper
-     => C write-locks x (to add a new edge)  D write-locks y
-     .. C is blocked                         D is blocked
-        A read-locks y                       B read-locks x
-
-      Now, if a read lock can bypass a pending wrote lock request, we're fine.
-      If it can not, we have a deadlock.
-
-    writer starvation is technically possible, but unlikely, because
-    the contention is expected to be low.
-  */
-  struct {
-    pthread_cond_t   cond;
-    pthread_mutex_t  mutex;
-    uint readers: 16;
-    uint pending_writers: 15;
-    uint write_locked: 1;
-  } lock;
-#else
-  rw_lock_t lock;
-#endif
-  pthread_cond_t   cond;
-  DYNAMIC_ARRAY    owners;
-} WT_RESOURCE;
-
 typedef struct st_wt_thd {
   /*
     XXX
-    there's no protection (mutex) against concurrent access of
-    the dynarray below. it is assumed that a caller will have it
-    automatically (not to protect this array but to protect its
-    own - caller's - data structures, and we'll get it for free.
-    If not, we'll need to add a mutex
+    there's no protection (mutex) against concurrent access of the
+    dynarray below. it is assumed that a caller will have it anyway
+    (not to protect this array but to protect its own - caller's -
+    data structures), and we'll get it for free. A caller needs to
+    ensure that a blocker won't release a resource before a blocked
+    thread starts waiting, which is usually done with a mutex.
+    
+    If the above assumption is wrong, we'll need to add a mutex here.
   */
   DYNAMIC_ARRAY   my_resources;
   /*
@@ -141,8 +67,10 @@ typedef struct st_wt_thd {
   LF_PINS        *pins;
 
   /* pointers to values */
-  ulong *timeout_short, *deadlock_search_depth_short;
-  ulong *timeout_long, *deadlock_search_depth_long;
+  const ulong *timeout_short;
+  const ulong *deadlock_search_depth_short;
+  const ulong *timeout_long;
+  const ulong *deadlock_search_depth_long;
 
   /*
     weight relates to the desirability of a transaction being killed if it's
@@ -169,13 +97,13 @@ typedef struct st_wt_thd {
   */
   ulong volatile weight;
   /*
-    'killed' is indirectly protected by waiting_for->lock -
-    a killed thread needs to clear its 'waiting_for', and thus needs a lock.
+    'killed' is indirectly protected by waiting_for->lock because
+    a killed thread needs to clear its 'waiting_for' and thus needs a lock.
     That is a thread needs an exclusive lock to read 'killed' reliably.
     But other threads may change 'killed' from 0 to 1, a shared
     lock is enough for that.
    */
-  my_bool volatile killed;
+  my_bool killed;
 #ifndef DBUG_OFF
   const char     *name;
 #endif
@@ -189,13 +117,13 @@ typedef struct st_wt_thd {
 
 void wt_init(void);
 void wt_end(void);
-void wt_thd_lazy_init(WT_THD *, ulong *, ulong *, ulong *, ulong *);
+void wt_thd_lazy_init(WT_THD *, const ulong *, const ulong *, const ulong *, const ulong *);
 void wt_thd_destroy(WT_THD *);
-int wt_thd_will_wait_for(WT_THD *, WT_THD *, WT_RESOURCE_ID *);
+int wt_thd_will_wait_for(WT_THD *, WT_THD *, const WT_RESOURCE_ID *);
 int wt_thd_cond_timedwait(WT_THD *, pthread_mutex_t *);
-void wt_thd_release(WT_THD *, WT_RESOURCE_ID *);
+void wt_thd_release(WT_THD *, const WT_RESOURCE_ID *);
 #define wt_thd_release_all(THD) wt_thd_release((THD), 0)
-int wt_resource_id_memcmp(void *, void *);
+int wt_resource_id_memcmp(const void *, const void *);
 
 C_MODE_END
 

=== modified file 'mysql-test/mysql-test-run.pl'
--- a/mysql-test/mysql-test-run.pl	2008-12-15 12:34:57 +0000
+++ b/mysql-test/mysql-test-run.pl	2009-01-16 19:27:45 +0000
@@ -134,6 +134,7 @@ our $opt_vs_config = $ENV{'MTR_VS_CONFIG
 our $default_vardir;
 
 our $opt_usage;
+our $opt_list_options;
 our $opt_suites;
 our $opt_suites_default= "main,binlog,rpl,rpl_ndb,ndb,maria"; # Default suites to run
 our $opt_script_debug= 0;  # Script debugging, enable with --script-debug
@@ -556,7 +557,7 @@ sub command_line_setup () {
   );
 
   Getopt::Long::Configure("pass_through");
-  GetOptions(
+  my %options=(
              # Control what engine/variation to run
              'embedded-server'          => \$opt_embedded_server,
              'ps-protocol'              => \$opt_ps_protocol,
@@ -694,9 +695,13 @@ sub command_line_setup () {
              (map { $_ => \&warn_about_removed_option } @removed_options),
 
              'help|h'                   => \$opt_usage,
-            ) or usage("Can't read options");
+             'list-options'             => \$opt_list_options,
+            );
+
+  GetOptions(%options) or usage("Can't read options");
 
   usage("") if $opt_usage;
+  list_options(\%options) if $opt_list_options;
 
   $glob_scriptname=  basename($0);
 
@@ -5473,3 +5478,16 @@ HERE
   mtr_exit(1);
 
 }
+
+sub list_options ($) {
+  my $hash= shift;
+
+  for (keys %$hash) {
+    s/(=.*|!)$//;
+    s/\|/\n--/g;
+    print "--$_\n";
+  }
+
+  mtr_exit(1);
+}
+

=== modified file 'mysql-test/suite/maria/r/maria-big.result'
--- a/mysql-test/suite/maria/r/maria-big.result	2008-10-01 12:13:39 +0000
+++ b/mysql-test/suite/maria/r/maria-big.result	2009-01-15 14:29:14 +0000
@@ -1,3 +1,4 @@
+set global max_allowed_packet=400000000;
 set storage_engine=maria;
 affected rows: 0
 set global maria_log_file_size=4294967295;
@@ -61,8 +62,6 @@ count(*)
 affected rows: 1
 drop table t1, t2;
 affected rows: 0
-set @@max_allowed_packet=400000000;
-affected rows: 0
 create table t1 (a int, b longtext);
 affected rows: 0
 insert into t1 values (1,"123456789012345678901234567890"),(2,"09876543210987654321");

=== modified file 'mysql-test/suite/maria/r/maria-recovery-big.result'
--- a/mysql-test/suite/maria/r/maria-recovery-big.result	2008-10-01 12:13:39 +0000
+++ b/mysql-test/suite/maria/r/maria-recovery-big.result	2009-01-15 14:29:14 +0000
@@ -4,7 +4,6 @@ create database mysqltest;
 use mysqltest;
 * TEST of recovery with blobs
 * shut down mysqld, removed logs, restarted it
-set @@max_allowed_packet=32000000;
 create table t1 (a int, b longtext) engine=maria table_checksum=1;
 * copied t1 for feeding_recovery
 insert into t1 values (1,"123456789012345678901234567890"),(2,"09876543210987654321");

=== modified file 'mysql-test/suite/maria/r/maria-recovery3.result'
--- a/mysql-test/suite/maria/r/maria-recovery3.result	2008-12-05 21:11:46 +0000
+++ b/mysql-test/suite/maria/r/maria-recovery3.result	2009-01-16 21:00:39 +0000
@@ -25,5 +25,69 @@ Checksum-check
 ok
 use mysqltest;
 drop table t1;
+* TEST of logging of BLOBs
+CREATE TABLE `t1` (
+`blob` blob,
+`blob_key` blob
+) ENGINE=maria ROW_FORMAT=page
+;
+* copied t1 for feeding_recovery
+* compared t1 to old version
+set global maria_checkpoint_interval=0;
+INSERT INTO `t1` VALUES (NULL,repeat('A',5198));
+INSERT INTO `t1` VALUES (NULL,repeat('B',65535));
+INSERT INTO `t1` VALUES (repeat('K',5198),repeat('L',2325));
+INSERT INTO `t1` VALUES (repeat('C',65535),NULL);
+INSERT INTO `t1` VALUES (NULL,repeat('D',65535));
+INSERT INTO `t1` VALUES (repeat('E',65535),repeat('F',16111));
+INSERT INTO `t1` VALUES (repeat('G',65535),repeat('H',65535));
+INSERT INTO `t1` VALUES (repeat('I',5198),repeat('J',65535));
+check table t1 extended;
+Table	Op	Msg_type	Msg_text
+mysqltest.t1	check	status	OK
+flush table t1;
+* copied t1 for comparison
+* compared t1 to old version
+SET SESSION debug="+d,maria_flush_whole_log,maria_crash";
+* crashing mysqld intentionally
+set global maria_checkpoint_interval=1;
+ERROR HY000: Lost connection to MySQL server during query
+* copied t1 back for feeding_recovery
+* recovery happens
+check table t1 extended;
+Table	Op	Msg_type	Msg_text
+mysqltest.t1	check	status	OK
+* testing that checksum after recovery is as expected
+Checksum-check
+ok
+* compared t1 to old version
+use mysqltest;
+drop table t1;
+create table t1 engine=maria select 1;
+* copied t1 for feeding_recovery
+set global maria_checkpoint_interval=0;
+insert into t1 values(2);
+truncate table t1;
+flush table t1;
+* copied t1 for comparison
+truncate table t1;
+SET SESSION debug="+d,maria_flush_whole_log,maria_crash_create_table";
+* crashing mysqld intentionally
+truncate table t1;
+ERROR HY000: Lost connection to MySQL server during query
+* recovery happens
+check table t1 extended;
+Table	Op	Msg_type	Msg_text
+mysqltest.t1	check	warning	Size of indexfile is: 372      Should be: 8192
+mysqltest.t1	check	status	OK
+* testing that checksum after recovery is as expected
+Checksum-check
+ok
+use mysqltest;
+truncate table t1;
+check table t1 extended;
+Table	Op	Msg_type	Msg_text
+mysqltest.t1	check	status	OK
+drop table t1;
 drop database mysqltest_for_comparison;
 drop database mysqltest;

=== modified file 'mysql-test/suite/maria/r/maria_notembedded.result'
--- a/mysql-test/suite/maria/r/maria_notembedded.result	2008-11-27 15:13:02 +0000
+++ b/mysql-test/suite/maria/r/maria_notembedded.result	2009-01-15 21:27:36 +0000
@@ -30,9 +30,24 @@ insert t1 values (2);
 lock table t1 write concurrent;
 insert t1 values (3);
 insert t1 values (2);
+lock table t1 write concurrent;
+insert t1 values (4);
 insert t1 values (3);
+lock table t1 write concurrent;
+insert t1 values (5);
+insert t1 values (4);
+lock table t1 write concurrent;
+insert t1 values (6);
+insert t1 values (5);
+insert t1 values (6);
 ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
 unlock tables;
 ERROR 23000: Duplicate entry '2' for key 'a'
 unlock tables;
+ERROR 23000: Duplicate entry '3' for key 'a'
+unlock tables;
+ERROR 23000: Duplicate entry '4' for key 'a'
+unlock tables;
+ERROR 23000: Duplicate entry '5' for key 'a'
+unlock tables;
 drop table t1;

=== modified file 'mysql-test/suite/maria/t/maria-big.test'
--- a/mysql-test/suite/maria/t/maria-big.test	2008-10-01 12:13:39 +0000
+++ b/mysql-test/suite/maria/t/maria-big.test	2009-01-15 14:29:14 +0000
@@ -2,6 +2,11 @@
 --source include/have_maria.inc
 --source include/big_test.inc
 
+set global max_allowed_packet=400000000;
+# need new session to use setting above
+connect (root,localhost,root,,test,$MASTER_MYPORT,$MASTER_MYSOCK);
+connection root;
+
 enable_info;
 set storage_engine=maria;
 set global maria_log_file_size=4294967295;
@@ -38,7 +43,6 @@ drop table t1, t2;
 # Test creating a really big blob (up to 16M)
 #
 
-set @@max_allowed_packet=400000000;
 create table t1 (a int, b longtext);
 insert into t1 values (1,"123456789012345678901234567890"),(2,"09876543210987654321");
 

=== added file 'mysql-test/suite/maria/t/maria-preload-master.opt'
--- a/mysql-test/suite/maria/t/maria-preload-master.opt	1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/maria/t/maria-preload-master.opt	2009-01-12 12:08:06 +0000
@@ -0,0 +1 @@
+--skip-safemalloc

=== modified file 'mysql-test/suite/maria/t/maria-recovery-big-master.opt'
--- a/mysql-test/suite/maria/t/maria-recovery-big-master.opt	2008-10-01 12:13:39 +0000
+++ b/mysql-test/suite/maria/t/maria-recovery-big-master.opt	2009-01-15 14:29:14 +0000
@@ -1 +1 @@
---skip-stack-trace --skip-core-file
+--skip-stack-trace --skip-core-file --max_allowed_packet=32000000

=== modified file 'mysql-test/suite/maria/t/maria-recovery-big.test'
--- a/mysql-test/suite/maria/t/maria-recovery-big.test	2008-10-01 12:13:39 +0000
+++ b/mysql-test/suite/maria/t/maria-recovery-big.test	2009-01-15 14:29:14 +0000
@@ -33,7 +33,6 @@ use mysqltest;
 
 --echo * TEST of recovery with blobs
 -- source include/maria_empty_logs.inc
-set @@max_allowed_packet=32000000;
 create table t1 (a int, b longtext) engine=maria table_checksum=1;
 let $mms_tables=1;
 -- source include/maria_make_snapshot_for_feeding_recovery.inc

=== modified file 'mysql-test/suite/maria/t/maria-recovery2-master.opt'
--- a/mysql-test/suite/maria/t/maria-recovery2-master.opt	2008-10-01 12:13:39 +0000
+++ b/mysql-test/suite/maria/t/maria-recovery2-master.opt	2009-01-15 14:29:14 +0000
@@ -1 +1 @@
---skip-stack-trace --skip-core-file --maria-log-dir-path=../tmp
+--skip-stack-trace --skip-core-file --loose-maria-log-dir-path=$MYSQLTEST_VARDIR/tmp

=== modified file 'mysql-test/suite/maria/t/maria-recovery3.test'
--- a/mysql-test/suite/maria/t/maria-recovery3.test	2008-12-05 21:11:46 +0000
+++ b/mysql-test/suite/maria/t/maria-recovery3.test	2009-01-16 21:00:39 +0000
@@ -39,7 +39,6 @@ let $mvr_restore_old_snapshot=0;
 # UNDO phase prevents physical comparison, normally,
 # so we'll only use checksums to compare.
 let $mms_compare_physically=0;
-let $mvr_crash_statement= set global maria_checkpoint_interval=1;
 create table t1(a int primary key) engine=maria;
 insert into t1 values(1);
 -- source include/maria_make_snapshot_for_comparison.inc
@@ -65,6 +64,52 @@ drop table t1;
 # before checkpoint happens, test should still pass (though it won't
 # reproduce the conditions of the bug).
 
+# Test for BUG#41493 Maria: two recovery failures (wrong logging of BLOB pages)
+--echo * TEST of logging of BLOBs
+let $mvr_restore_old_snapshot=1;
+let $mms_compare_physically=1;
+CREATE TABLE `t1` (
+`blob` blob,
+`blob_key` blob
+) ENGINE=maria ROW_FORMAT=page
+;
+-- source include/maria_make_snapshot_for_feeding_recovery.inc
+set global maria_checkpoint_interval=0; # no checkpoints
+INSERT INTO `t1` VALUES (NULL,repeat('A',5198));
+INSERT INTO `t1` VALUES (NULL,repeat('B',65535));
+INSERT INTO `t1` VALUES (repeat('K',5198),repeat('L',2325));
+INSERT INTO `t1` VALUES (repeat('C',65535),NULL);
+INSERT INTO `t1` VALUES (NULL,repeat('D',65535));
+INSERT INTO `t1` VALUES (repeat('E',65535),repeat('F',16111));
+INSERT INTO `t1` VALUES (repeat('G',65535),repeat('H',65535));
+INSERT INTO `t1` VALUES (repeat('I',5198),repeat('J',65535));
+check table t1 extended;
+-- source include/maria_make_snapshot_for_comparison.inc
+-- source include/maria_verify_recovery.inc
+drop table t1;
+
+# Test for BUG#42112 "Maria: recovery failure (pushbuild2) Assertion
+# `rownr == 0 && new_page' failed"
+
+let $mvr_restore_old_snapshot=0;
+let $mms_compare_physically=0;
+create table t1 engine=maria select 1;
+-- source include/maria_make_snapshot_for_feeding_recovery.inc
+set global maria_checkpoint_interval=0; # no checkpoints
+insert into t1 values(2);
+truncate table t1;
+-- source include/maria_make_snapshot_for_comparison.inc
+let $mvr_crash_statement= truncate table t1;
+let $mvr_debug_option="+d,maria_flush_whole_log,maria_crash_create_table";
+truncate table t1;
+-- source include/maria_verify_recovery.inc
+# Table is bad but at least Recovery didn't crash and a new truncate
+# can succeed:
+truncate table t1;
+check table t1 extended;
+drop table t1;
+
+
 # clean up everything
 let $mms_purpose=comparison;
 eval drop database mysqltest_for_$mms_purpose;

=== modified file 'mysql-test/suite/maria/t/maria_notembedded.test'
--- a/mysql-test/suite/maria/t/maria_notembedded.test	2008-11-27 15:13:02 +0000
+++ b/mysql-test/suite/maria/t/maria_notembedded.test	2009-01-15 21:27:36 +0000
@@ -33,27 +33,64 @@ drop table t1;
 #
 create table t1 (a int unique) transactional=1;
 insert t1 values (1);
+
 lock table t1 write concurrent;
 insert t1 values (2);
-connect(con_d,localhost,root,,);
+
+connect(con_a,localhost,root,,);
 lock table t1 write concurrent;
 insert t1 values (3);
 send insert t1 values (2);
+
+connect(con_b,localhost,root,,);
+lock table t1 write concurrent;
+insert t1 values (4);
+send insert t1 values (3);
+
+connect(con_c,localhost,root,,);
+lock table t1 write concurrent;
+insert t1 values (5);
+send insert t1 values (4);
+
+connect(con_d,localhost,root,,);
+lock table t1 write concurrent;
+insert t1 values (6);
+send insert t1 values (5);
+
 connection default;
-let $wait_condition=select count(*) = 1 from information_schema.processlist where state="waiting for a resource";
+let $wait_condition=select count(*) = 4 from information_schema.processlist where state="waiting for a resource";
 --source include/wait_condition.inc
 --error ER_LOCK_DEADLOCK
-insert t1 values (3);
+insert t1 values (6);
 unlock tables;
+
+connection con_a;
+--error ER_DUP_ENTRY
+reap;
+unlock tables;
+disconnect con_a;
+
+connection con_b;
+--error ER_DUP_ENTRY
+reap;
+unlock tables;
+disconnect con_b;
+
+connection con_c;
+--error ER_DUP_ENTRY
+reap;
+unlock tables;
+disconnect con_c;
+
 connection con_d;
 --error ER_DUP_ENTRY
 reap;
 unlock tables;
 disconnect con_d;
+
 connection default;
 drop table t1;
 
-
 --disable_result_log
 --disable_query_log
 eval set session storage_engine=$default_engine;

=== modified file 'mysys/lf_alloc-pin.c'
--- a/mysys/lf_alloc-pin.c	2008-10-07 16:49:01 +0000
+++ b/mysys/lf_alloc-pin.c	2009-01-15 21:27:36 +0000
@@ -1,5 +1,5 @@
 /* QQ: TODO multi-pinbox */
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -330,7 +330,7 @@ static void _lf_pinbox_real_free(LF_PINS
 {
   int npins, alloca_size;
   void *list, **addr;
-  uchar *first, *last= NULL;
+  void *first, *last= NULL;
   LF_PINBOX *pinbox= pins->pinbox;
 
   LINT_INIT(first);

=== modified file 'mysys/lf_hash.c'
--- a/mysys/lf_hash.c	2008-11-03 19:33:34 +0000
+++ b/mysys/lf_hash.c	2009-01-15 21:27:36 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -42,6 +42,8 @@ typedef struct {
   */
 } LF_SLIST;
 
+const int LF_HASH_OVERHEAD= sizeof(LF_SLIST);
+
 /*
   a structure to pass the context (pointers two the three successive elements
   in a list) from lfind to linsert/ldelete
@@ -315,7 +317,6 @@ void lf_hash_init(LF_HASH *hash, uint el
                   uint key_offset, uint key_length, hash_get_key get_key,
                   CHARSET_INFO *charset)
 {
-  compile_time_assert(sizeof(LF_SLIST) == LF_HASH_OVERHEAD);
   lf_alloc_init(&hash->alloc, sizeof(LF_SLIST)+element_size,
                 offsetof(LF_SLIST, key));
   lf_dynarray_init(&hash->array, sizeof(LF_SLIST *));

=== modified file 'mysys/my_static.c'
--- a/mysys/my_static.c	2008-10-10 15:28:41 +0000
+++ b/mysys/my_static.c	2009-01-15 21:27:36 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2000 MySQL AB
+/* Copyright (C) 2000-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -101,6 +101,7 @@ static const char *proc_info_dummy(void 
   return 0;
 }
 
+/* this is to be able to call set_thd_proc_info from the C code */
 const char *(*proc_info_hook)(void *, const char *, const char *, const char *,
                               const unsigned int)= proc_info_dummy;
 

=== modified file 'mysys/my_thr_init.c'
--- a/mysys/my_thr_init.c	2008-12-10 09:02:25 +0000
+++ b/mysys/my_thr_init.c	2009-01-15 21:27:36 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2000 MySQL AB
+/* Copyright (C) 2000-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -332,7 +332,8 @@ my_bool my_thread_init(void)
                         0);
   pthread_cond_init(&tmp->suspend, NULL);
 
-  tmp->stack_ends_here= &tmp + STACK_DIRECTION * my_thread_stack_size;
+  tmp->stack_ends_here= (char*)&tmp +
+                         STACK_DIRECTION * (long)my_thread_stack_size;
 
   pthread_mutex_lock(&THR_LOCK_threads);
   tmp->id= ++thread_id;

=== modified file 'mysys/waiting_threads.c'
--- a/mysys/waiting_threads.c	2009-01-07 20:50:11 +0000
+++ b/mysys/waiting_threads.c	2009-01-15 21:27:36 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2008 MySQL AB
+/* Copyright (C) 2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -13,74 +13,134 @@
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
 
-/*
+/**
+  @file
+
   "waiting threads" subsystem - a unified interface for threads to wait
   on each other, with built-in deadlock detection.
 
   Main concepts
   ^^^^^^^^^^^^^
-    a thread - is represented by a WT_THD structure. One physical thread
-      can have only one WT_THD descriptor.
+  a thread - is represented by a WT_THD structure. One physical thread
+    can have only one WT_THD descriptor at any given moment.
 
-    a resource - a thread does not wait for other threads directly,
-      instead it waits for a "resource", which is "owned" by other threads.
-      It waits, exactly, for all "owners" to "release" a resource.
-      It does not have to correspond to a physical resource. For example, it
-      may be convenient in certain cases to force resource == thread.
-      A resource is represented by a WT_RESOURCE structure. 
+  a resource - a thread does not wait for other threads directly,
+    instead it waits for a "resource", which is "owned" by other threads.
+    It waits, exactly, for all "owners" to "release" a resource.
+    It does not have to correspond to a physical resource. For example, it
+    may be convenient in certain cases to force resource == thread.
+    A resource is represented by a WT_RESOURCE structure. 
 
-    a resource identifier - a pair of {resource type, value}. A value is
-      an ulonglong number. Represented by a WT_RESOURCE_ID structure.
+  a resource identifier - a pair of {resource type, value}. A value is
+    an ulonglong number. Represented by a WT_RESOURCE_ID structure.
 
-    a resource type - a pointer to a statically defined instance of
+  a resource type - a pointer to a statically defined instance of
     WT_RESOURCE_TYPE structure. This structure contains a pointer to
     a function that knows how to compare values of this resource type.
     In the simple case it could be wt_resource_id_memcmp().
 
-   Usage
-   ^^^^^
-   to use the interface one needs to use this thread's WT_THD,
-   call wt_thd_will_wait_for() for every thread it needs to wait on,
-   then call wt_thd_cond_timedwait(). When thread releases a resource
-   it should call wt_thd_release() (or wt_thd_release_all()) - it will
-   notify (send a signal) threads waiting in wt_thd_cond_timedwait(),
-   if appropriate.
-
-   Just like with pthread's cond_wait, there could be spurious
-   wake-ups from wt_thd_cond_timedwait(). A caller is expected to
-   handle that.
-
-   wt_thd_will_wait_for() and wt_thd_cond_timedwait() return either
-   WT_OK or WT_DEADLOCK. Additionally wt_thd_cond_timedwait() can return
-   WT_TIMEOUT. Out of memory and other fatal errors are reported as
-   WT_DEADLOCK - and a transaction must be aborted just the same.
-
-   Configuration
-   ^^^^^^^^^^^^^
-   There are four config variables. Two deadlock search depths - short and
-   long - and two timeouts. Deadlock search is performed with the short
-   depth on every wt_thd_will_wait_for() call. wt_thd_cond_timedwait()
-   waits with a short timeout, performs a deadlock search with the long
-   depth, and waits with a long timeout. As most deadlock cycles are supposed
-   to be short, most deadlocks will be detected at once, and waits will
-   rarely be necessary.
-
-   These config variables are thread-local. Different threads may have
-   different search depth and timeout values.
-
-   Also, deadlock detector supports different killing strategies, the victim
-   in a deadlock cycle is selected based on the "weight". See "weight"
-   description in waiting_threads.h for details. It's up to the caller to
-   set weights accordingly.
-
-   Status
-   ^^^^^^
-   We calculate the number of successfull waits (WT_OK returned from
-   wt_thd_cond_timedwait()), a number of timeouts, a deadlock cycle
-   length distribution - number of deadlocks with every length from
-   1 to WT_CYCLE_STATS, and a wait time distribution - number
-   of waits with a time from 1 us to 1 min in WT_CYCLE_STATS
-   intervals on a log scale.
+  a wait-for graph - a graph that represents "wait-for" relationships.
+    It has two types of nodes - threads and resources. There are directed
+    edges from a thread to a resource it is waiting for (WT_THD::waiting_for),
+    from a thread to resources that it "owns" (WT_THD::my_resources),
+    and from a resource to threads that "own" it (WT_RESOURCE::owners)
+
+  Graph completeness
+  ^^^^^^^^^^^^^^^^^^
+
+  For flawless deadlock detection wait-for graph must be complete.
+  It means that when a thread starts waiting it needs to know *all* its
+  blockers, and call wt_thd_will_wait_for() for every one of them.
+  Otherwise two phenomena should be expected:
+
+  1. Fuzzy timeouts:
+
+    thread A needs to get a lock, and is blocked by a thread B.
+    it waits.
+    Just before the timeout thread B releases the lock.
+    thread A is ready to grab the lock but discovers that it is also
+    blocked by a thread C.
+    It waits and times out.
+
+    As a result thread A has waited two timeout intervals, instead of one.
+
+  2. Unreliable cycle detection:
+
+     Thread A waits for threads B and C
+     Thread C waits for D
+     Thread D wants to start waiting for A
+
+     one can see immediately that thread D creates a cycle, and thus
+     a deadlock is detected.
+
+     But if thread A would only wait for B, and start waiting for C
+     when B would unlock, thread D would be allowed to wait, a deadlock
+     would be only detected when B unlocks or somebody times out.
+
+  These two phenomena don't affect correctness, and strictly speaking,
+  the caller is not required to call wt_thd_will_wait_for() for *all*
+  blockers - it may optimize wt_thd_will_wait_for() calls. But they
+  may be perceived as bugs by users, so it must be understood that such
+  an optimization comes at a price.
+
+  Usage
+  ^^^^^
+
+  First, the wt* subsystem must be initialized by calling
+  wt_init(). In the server you don't need to do it, it's done
+  in mysqld.cc.
+
+  Similarly, wt_end() frees wt* structures, should be called
+  at the end, but in the server mysqld.cc takes care of that.
+
+  Every WT_THD should be initialized with wt_thd_lazy_init().
+  After that they can be used in other wt_thd_* calls.
+  Before discarding, WT_THD should be free'd with
+  wt_thd_destroy(). In the server both are handled in sql_class.cc,
+  it's an error to try to do it manually.
+
+  To use the deadlock detection one needs to use this thread's WT_THD,
+  call wt_thd_will_wait_for() for every thread it needs to wait on,
+  then call wt_thd_cond_timedwait(). When thread releases a resource
+  it should call wt_thd_release() (or wt_thd_release_all()) - it will
+  notify (send a signal) threads waiting in wt_thd_cond_timedwait(),
+  if appropriate.
+
+  Just like with pthread's cond_wait, there could be spurious
+  wake-ups from wt_thd_cond_timedwait(). A caller is expected to
+  handle that (that is, to re-check the blocking criteria).
+
+  wt_thd_will_wait_for() and wt_thd_cond_timedwait() return either
+  WT_OK or WT_DEADLOCK. Additionally wt_thd_cond_timedwait() can return
+  WT_TIMEOUT. Out of memory and other fatal errors are reported as
+  WT_DEADLOCK - and a transaction must be aborted just the same.
+
+  Configuration
+  ^^^^^^^^^^^^^
+  There are four config variables. Two deadlock search depths - short and
+  long - and two timeouts. Deadlock search is performed with the short
+  depth on every wt_thd_will_wait_for() call. wt_thd_cond_timedwait()
+  waits with a short timeout, performs a deadlock search with the long
+  depth, and waits with a long timeout. As most deadlock cycles are supposed
+  to be short, most deadlocks will be detected at once, and waits will
+  rarely be necessary.
+
+  These config variables are thread-local. Different threads may have
+  different search depth and timeout values.
+
+  Also, deadlock detector supports different killing strategies, the victim
+  in a deadlock cycle is selected based on the "weight". See "weight"
+  description in waiting_threads.h for details. It's up to the caller to
+  set weights accordingly.
+
+  Status
+  ^^^^^^
+  We calculate the number of successful waits (WT_OK returned from
+  wt_thd_cond_timedwait()), a number of timeouts, a deadlock cycle
+  length distribution - number of deadlocks with every length from
+  1 to WT_CYCLE_STATS, and a wait time distribution - number
+  of waits with a time from 1 us to 1 min in WT_WAIT_STATS
+  intervals on a log e scale.
 */
 
 /*
@@ -93,10 +153,11 @@
 
       (example A=IX, B=IS, C=S, D=X)
 
-   you need to include lock level in the resource identifier - thread 1
-   waiting for lock A on resource R and thread 2 waiting for lock B
-   on resource R should wait on different WT_RESOURCE structures, on different
-   {lock, resource} pairs. Otherwise the following is possible:
+   you need to include lock level in the resource identifier - a
+   thread waiting for lock of the type A on resource R and another
+   thread waiting for lock of the type B on resource R should wait on
+   different WT_RESOURCE structures, on different {lock, resource}
+   pairs.  Otherwise the following is possible:
 
       thread1> take S-lock on R
       thread2> take IS-lock on R
@@ -113,40 +174,46 @@
 #include <waiting_threads.h>
 #include <m_string.h>
 
-/*
-  status variables:
-    distribution of cycle lengths
-    wait time log distribution
-
-  Note:
-
-    we call deadlock() twice per wait (with different search lengths).
-    it means a deadlock will be counted twice. It's difficult to avoid,
-    as on the second search we could find a *different* deadlock and we
-    *want* to count it too. So we just count all deadlocks - two searches
-    mean two increments on the wt_cycle_stats.
-*/
+/* status variables */
 
+/**
+  preset table of wait intervals
+*/
 ulonglong wt_wait_table[WT_WAIT_STATS];
-uint32    wt_wait_stats[WT_WAIT_STATS+1];
-uint32    wt_cycle_stats[2][WT_CYCLE_STATS+1], wt_success_stats;
+/**
+  wait time distribution (log e scale)
+*/
+uint32 wt_wait_stats[WT_WAIT_STATS+1];
+/**
+  distribution of cycle lengths
+  first column tells whether this was during short or long detection
+*/
+uint32 wt_cycle_stats[2][WT_CYCLE_STATS+1];
+uint32 wt_success_stats;
 
 static my_atomic_rwlock_t cycle_stats_lock, wait_stats_lock, success_stats_lock;
 
+#ifdef SAFE_STATISTICS
+#define incr(VAR, LOCK)                           \
+  do {                                            \
+    my_atomic_rwlock_wrlock(&(LOCK));             \
+    my_atomic_add32(&(VAR), 1);                   \
+    my_atomic_rwlock_wrunlock(&(LOCK));           \
+  } while(0)
+#else
+#define incr(VAR,LOCK)  do { (VAR)++; } while(0)
+#endif
+
 static void increment_success_stats()
 {
-  my_atomic_rwlock_wrlock(&success_stats_lock);
-  my_atomic_add32(&wt_success_stats, 1);
-  my_atomic_rwlock_wrunlock(&success_stats_lock);
+  incr(wt_success_stats, success_stats_lock);
 }
 
 static void increment_cycle_stats(uint depth, uint slot)
 {
   if (depth >= WT_CYCLE_STATS)
     depth= WT_CYCLE_STATS;
-  my_atomic_rwlock_wrlock(&cycle_stats_lock);
-  my_atomic_add32(&wt_cycle_stats[slot][depth], 1);
-  my_atomic_rwlock_wrunlock(&cycle_stats_lock);
+  incr(wt_cycle_stats[slot][depth], cycle_stats_lock);
 }
 
 static void increment_wait_stats(ulonglong waited,int ret)
@@ -155,12 +222,89 @@ static void increment_wait_stats(ulonglo
   if ((ret) == ETIMEDOUT)
     i= WT_WAIT_STATS;
   else
-    for (i=0; i < WT_WAIT_STATS && waited/10 > wt_wait_table[i]; i++) ;
-  my_atomic_rwlock_wrlock(&wait_stats_lock);
-  my_atomic_add32(wt_wait_stats+i, 1);
-  my_atomic_rwlock_wrunlock(&wait_stats_lock);
+    for (i= 0; i < WT_WAIT_STATS && waited/10 > wt_wait_table[i]; i++) ;
+  incr(wt_wait_stats[i], wait_stats_lock);
 }
 
+/*
+  'lock' protects 'owners', 'state', and 'waiter_count'
+  'id' is read-only
+
+  a resource is picked up from a hash in a lock-free manner
+  it's returned pinned, so it cannot be freed at once
+  but it may be freed right after the pin is removed
+  to free a resource it should
+    1. have no owners
+    2. have no waiters
+
+  two ways to access a resource:
+    1. find it in a hash
+       - it's returned pinned.
+        a) take a lock in exclusive mode
+        b) check the state, it should be ACTIVE to be usable
+        c) unpin
+    2. by a direct reference
+       - can only be used if a resource cannot be freed
+       e.g. accessing a resource by thd->waiting_for is safe,
+       a resource cannot be freed as there's a thread waiting for it
+*/
+struct st_wt_resource {
+  WT_RESOURCE_ID  id;
+  uint            waiter_count;
+  enum { ACTIVE, FREE } state;
+#ifndef DBUG_OFF
+  pthread_mutex_t  *cond_mutex; /* a mutex for the 'cond' below */
+#endif
+  /*
+    before the 'lock' all elements are mutable, after (and including) -
+    immutable in the sense that lf_hash_insert() won't memcpy() over them.
+    See wt_init().
+  */
+#ifdef WT_RWLOCKS_USE_MUTEXES
+  /*
+    we need a special rwlock-like 'lock' to allow readers to bypass
+    waiting writers, otherwise readers can deadlock. For example:
+
+      A waits on resource x, owned by B, B waits on resource y, owned
+      by A, we have a cycle (A->x->B->y->A)
+      Both A and B start deadlock detection:
+
+        A locks x                          B locks y
+        A goes deeper                      B goes deeper
+        A locks y                          B locks x
+
+      with mutexes it would deadlock. With rwlocks it won't, as long
+      as both A and B are taking read locks (and they do).
+      But other threads may take write locks. Assume there's
+      C who wants to start waiting on x, and D who wants to start
+      waiting on y.
+
+        A read-locks x                       B read-locks y
+        A goes deeper                        B goes deeper
+     => C write-locks x (to add a new edge)  D write-locks y
+     .. C is blocked                         D is blocked
+        A read-locks y                       B read-locks x
+
+      Now, if a read lock can bypass a pending write lock request, we're fine.
+      If it can not, we have a deadlock.
+
+    writer starvation is technically possible, but unlikely, because
+    the contention is expected to be low.
+  */
+  struct {
+    pthread_cond_t   cond;
+    pthread_mutex_t  mutex;
+    uint readers: 16;
+    uint pending_writers: 15;
+    uint write_locked: 1;
+  } lock;
+#else
+  rw_lock_t lock;
+#endif
+  pthread_cond_t   cond; /* the corresponding mutex is provided by the caller */
+  DYNAMIC_ARRAY    owners;
+};
+
 #ifdef  WT_RWLOCKS_USE_MUTEXES
 static void rc_rwlock_init(WT_RESOURCE *rc)
 {
@@ -169,6 +313,8 @@ static void rc_rwlock_init(WT_RESOURCE *
 }
 static void rc_rwlock_destroy(WT_RESOURCE *rc)
 {
+  DBUG_ASSERT(rc->lock.write_locked == 0);
+  DBUG_ASSERT(rc->lock.readers == 0);
   pthread_cond_destroy(&rc->lock.cond);
   pthread_mutex_destroy(&rc->lock.mutex);
 }
@@ -188,7 +334,7 @@ static void rc_wrlock(WT_RESOURCE *rc)
   pthread_mutex_lock(&rc->lock.mutex);
   while (rc->lock.write_locked || rc->lock.readers)
     pthread_cond_wait(&rc->lock.cond, &rc->lock.mutex);
-  rc->lock.write_locked=1;
+  rc->lock.write_locked= 1;
   pthread_mutex_unlock(&rc->lock.mutex);
   DBUG_PRINT("wt", ("LOCK resid=%ld for WRITE", (ulong)rc->id.value));
 }
@@ -198,7 +344,7 @@ static void rc_unlock(WT_RESOURCE *rc)
   pthread_mutex_lock(&rc->lock.mutex);
   if (rc->lock.write_locked)
   {
-    rc->lock.write_locked=0;
+    rc->lock.write_locked= 0;
     pthread_cond_broadcast(&rc->lock.cond);
   }
   else if (--rc->lock.readers == 0)
@@ -242,12 +388,12 @@ static LF_HASH      reshash;
 /**
   WT_RESOURCE constructor
 
-  It's called from lf_hash and takes an offset to LF_SLIST instance.
+  It's called from lf_hash and takes a pointer to an LF_SLIST instance.
   WT_RESOURCE is located at arg+sizeof(LF_SLIST)
 */
 static void wt_resource_init(uchar *arg)
 {
-  WT_RESOURCE *rc=(WT_RESOURCE*)(arg+LF_HASH_OVERHEAD);
+  WT_RESOURCE *rc= (WT_RESOURCE*)(arg+LF_HASH_OVERHEAD);
   DBUG_ENTER("wt_resource_init");
 
   bzero(rc, sizeof(*rc));
@@ -260,12 +406,12 @@ static void wt_resource_init(uchar *arg)
 /**
   WT_RESOURCE destructor
 
-  It's called from lf_hash and takes an offset to LF_SLIST instance.
+  It's called from lf_hash and takes a pointer to an LF_SLIST instance.
   WT_RESOURCE is located at arg+sizeof(LF_SLIST)
 */
 static void wt_resource_destroy(uchar *arg)
 {
-  WT_RESOURCE *rc=(WT_RESOURCE*)(arg+LF_HASH_OVERHEAD);
+  WT_RESOURCE *rc= (WT_RESOURCE*)(arg+LF_HASH_OVERHEAD);
   DBUG_ENTER("wt_resource_destroy");
 
   DBUG_ASSERT(rc->owners.elements == 0);
@@ -278,6 +424,7 @@ static void wt_resource_destroy(uchar *a
 void wt_init()
 {
   DBUG_ENTER("wt_init");
+  DBUG_ASSERT(reshash.alloc.constructor != wt_resource_init);
 
   lf_hash_init(&reshash, sizeof(WT_RESOURCE), LF_HASH_UNIQUE, 0,
                sizeof_WT_RESOURCE_ID, 0, 0);
@@ -293,15 +440,15 @@ void wt_init()
   reshash.element_size= offsetof(WT_RESOURCE, lock);
   bzero(wt_wait_stats, sizeof(wt_wait_stats));
   bzero(wt_cycle_stats, sizeof(wt_cycle_stats));
-  wt_success_stats=0;
-  { /* initialize wt_wait_table[]. from 1 us to 1 min, log scale */
+  wt_success_stats= 0;
+  { /* initialize wt_wait_table[]. from 1 us to 1 min, log e scale */
     int i;
-    double from=log(1);   /* 1 us */
-    double to=log(60e6);  /* 1 min */
-    for (i=0; i < WT_WAIT_STATS; i++)
+    double from= log(1);   /* 1 us */
+    double to= log(60e6);  /* 1 min */
+    for (i= 0; i < WT_WAIT_STATS; i++)
     {
-      wt_wait_table[i]=(ulonglong)exp((to-from)/(WT_WAIT_STATS-1)*i+from);
-      DBUG_ASSERT(i==0 || wt_wait_table[i-1] != wt_wait_table[i]);
+      wt_wait_table[i]= (ulonglong)exp((to-from)/(WT_WAIT_STATS-1)*i+from);
+      DBUG_ASSERT(i == 0 || wt_wait_table[i-1] != wt_wait_table[i]);
     }
   }
   my_atomic_rwlock_init(&cycle_stats_lock);
@@ -325,7 +472,7 @@ void wt_end()
 /**
   Lazy WT_THD initialization
 
-  Cheap initialization of WT_THD. Only initialized fields that don't require
+  Cheap initialization of WT_THD. Only initialize fields that don't require
   memory allocations - basically, it only does assignments. The rest of the
   WT_THD structure will be initialized on demand, on the first use.
   This allows one to initialize lazily all WT_THD structures, even if some
@@ -335,14 +482,18 @@ void wt_end()
   @param ts     a pointer to deadlock timeout short value
   @param dl     a pointer to deadlock search depth long value
   @param tl     a pointer to deadlock timeout long value
+
+  @note these are pointers to values, and WT_THD stores them as pointers.
+  It allows one later to change search depths and timeouts for existing
+  threads. It also means that the pointers must stay valid for the lifetime
+  of WT_THD.
 */
-void wt_thd_lazy_init(WT_THD *thd, ulong *ds, ulong *ts, ulong *dl, ulong *tl)
+void wt_thd_lazy_init(WT_THD *thd, const ulong *ds, const ulong *ts,
+                                   const ulong *dl, const ulong *tl)
 {
   DBUG_ENTER("wt_thd_lazy_init");
-  thd->waiting_for=0;
-  thd->my_resources.buffer= 0;
-  thd->my_resources.elements= 0;
-  thd->weight=0;
+  thd->waiting_for= 0;
+  thd->weight= 0;
   thd->deadlock_search_depth_short= ds;
   thd->timeout_short= ts;
   thd->deadlock_search_depth_long= dl;
@@ -350,7 +501,7 @@ void wt_thd_lazy_init(WT_THD *thd, ulong
   /* dynamic array is also initialized lazily - without memory allocations */
   my_init_dynamic_array(&thd->my_resources, sizeof(WT_RESOURCE *), 0, 5);
 #ifndef DBUG_OFF
-  thd->name=my_thread_name();
+  thd->name= my_thread_name();
 #endif
   DBUG_VOID_RETURN;
 }
@@ -367,9 +518,9 @@ static int fix_thd_pins(WT_THD *thd)
 {
   if (unlikely(thd->pins == 0))
   {
-    thd->pins=lf_hash_get_pins(&reshash);
+    thd->pins= lf_hash_get_pins(&reshash);
 #ifndef DBUG_OFF
-    thd->name=my_thread_name();
+    thd->name= my_thread_name();
 #endif
   }
   return thd->pins == 0;
@@ -380,12 +531,12 @@ void wt_thd_destroy(WT_THD *thd)
   DBUG_ENTER("wt_thd_destroy");
 
   DBUG_ASSERT(thd->my_resources.elements == 0);
+  DBUG_ASSERT(thd->waiting_for == 0);
 
   if (thd->pins != 0)
     lf_hash_put_pins(thd->pins);
 
   delete_dynamic(&thd->my_resources);
-  thd->waiting_for=0;
   DBUG_VOID_RETURN;
 }
 /**
@@ -394,7 +545,7 @@ void wt_thd_destroy(WT_THD *thd)
   It can be used in WT_RESOURCE_TYPE structures where bytewise
   comparison of values is sufficient.
 */
-int wt_resource_id_memcmp(void *a, void *b)
+int wt_resource_id_memcmp(const void *a, const void *b)
 {
   /* we use the fact that there's no padding in the middle of WT_RESOURCE_ID */
   compile_time_assert(offsetof(WT_RESOURCE_ID, type) == sizeof(ulonglong));
@@ -405,10 +556,10 @@ int wt_resource_id_memcmp(void *a, void 
   arguments for the recursive deadlock_search function
 */
 struct deadlock_arg {
-  WT_THD *thd;          /**< starting point of a search */
-  uint    max_depth;    /**< search depth limit */
-  WT_THD *victim;       /**< a thread to be killed to resolve a deadlock */
-  WT_RESOURCE *rc;      /**< see comment at the end of deadlock_search() */
+  WT_THD * const thd;          /**< starting point of a search */
+  uint const max_depth;        /**< search depth limit */
+  WT_THD *victim;              /**< a thread to be killed to resolve a deadlock */
+  WT_RESOURCE *last_locked_rc; /**< see comment at the end of deadlock_search() */
 };
 
 /**
@@ -421,10 +572,10 @@ static void change_victim(WT_THD* found,
     if (arg->victim != arg->thd)
     {
       rc_unlock(arg->victim->waiting_for); /* release the previous victim */
-      DBUG_ASSERT(arg->rc == found->waiting_for);
+      DBUG_ASSERT(arg->last_locked_rc == found->waiting_for);
     }
     arg->victim= found;
-    arg->rc= 0;
+    arg->last_locked_rc= 0;
   }
 }
 
@@ -444,7 +595,7 @@ static int deadlock_search(struct deadlo
 
   LF_REQUIRE_PINS(1);
 
-  arg->rc= 0;
+  arg->last_locked_rc= 0;
 
   if (depth > arg->max_depth)
   {
@@ -453,7 +604,10 @@ static int deadlock_search(struct deadlo
   }
 
 retry:
-  /* safe dereference as explained in lf_alloc-pin.c */
+  /*
+    safe dereference as explained in lf_alloc-pin.c
+    (in short: protects against lf_alloc_free() in lf_hash_delete())
+  */
   do
   {
     rc= *shared_ptr;
@@ -469,6 +623,7 @@ retry:
   rc_rdlock(rc);
   if (rc->state != ACTIVE || *shared_ptr != rc)
   {
+    /* blocker is not waiting on this resource anymore */
     rc_unlock(rc);
     lf_unpin(arg->thd->pins, 0);
     goto retry;
@@ -480,20 +635,22 @@ retry:
     Below is not a pure depth-first search. It's a depth-first with a
     slightest hint of breadth-first. Depth-first is:
 
-      check(element):
+      check(element, X):
         foreach current in element->nodes[] do:
-          if current == element return error;
-          check(current);
+          if current == X return error;
+          check(current, X);
 
     while we do
 
-      check(element):
+      check(element, X):
         foreach current in element->nodes[] do:
-          if current == element return error;
+          if current == X return error;
         foreach current in element->nodes[] do:
-          check(current);
+          check(current, X);
+
+    preferring shorter deadlocks over longer ones.
   */
-  for (i=0; i < rc->owners.elements; i++)
+  for (i= 0; i < rc->owners.elements; i++)
   {
     cursor= *dynamic_element(&rc->owners, i, WT_THD**);
     /*
@@ -517,7 +674,7 @@ retry:
       goto end;
     }
   }
-  for (i=0; i < rc->owners.elements; i++)
+  for (i= 0; i < rc->owners.elements; i++)
   {
     cursor= *dynamic_element(&rc->owners, i, WT_THD**);
     switch (deadlock_search(arg, cursor, depth+1)) {
@@ -528,20 +685,21 @@ retry:
       break;
     case WT_DEADLOCK:
       ret= WT_DEADLOCK;
-      change_victim(cursor, arg);       /* also sets arg->rc to 0 */
+      change_victim(cursor, arg);       /* also sets arg->last_locked_rc to 0 */
       i= rc->owners.elements;           /* jump out of the loop */
       break;
     default:
       DBUG_ASSERT(0);
     }
-    if (arg->rc)
-      rc_unlock(arg->rc);
+    if (arg->last_locked_rc)
+      rc_unlock(arg->last_locked_rc);
   }
 end:
   /*
     Note that 'rc' is locked in this function, but it's never unlocked here.
-    Instead it's saved in arg->rc and the *caller* is expected to unlock it.
-    It's done to support different killing strategies. This is how it works:
+    Instead it's saved in arg->last_locked_rc and the *caller* is
+    expected to unlock it.  It's done to support different killing
+    strategies. This is how it works:
     Assuming a graph
 
       thd->A->B->C->thd
@@ -552,9 +710,9 @@ end:
     on. Goes down recursively, locks B. Goes down recursively, locks C.
     Notices that C is waiting on thd. Deadlock detected. Sets arg->victim=thd.
     Returns from the last deadlock_search() call. C stays locked!
-    Now it checks whether C is a more appropriate victim then 'thd'.
+    Now it checks whether C is a more appropriate victim than 'thd'.
     If yes - arg->victim=C, otherwise C is unlocked. Returns. B stays locked.
-    Now it checks whether B is a more appropriate victim then arg->victim.
+    Now it checks whether B is a more appropriate victim than arg->victim.
     If yes - old arg->victim is unlocked and arg->victim=B,
     otherwise B is unlocked. Return.
     And so on.
@@ -566,7 +724,7 @@ end:
     is unrolled and we are back to deadlock() function, there are only two
     locks left - on thd and on the victim.
   */
-  arg->rc= rc;
+  arg->last_locked_rc= rc;
   DBUG_PRINT("wt", ("exit: %s",
                     ret == WT_DEPTH_EXCEEDED ? "WT_DEPTH_EXCEEDED" :
                     ret ? "WT_DEADLOCK" : "OK"));
@@ -612,30 +770,31 @@ static int deadlock(WT_THD *thd, WT_THD 
   */
   if (ret == WT_DEADLOCK && depth)
     change_victim(blocker, &arg);
-  if (arg.rc)
+  if (arg.last_locked_rc)
   {
     /*
       Special return code if there's nobody to wait for.
 
       depth == 0 means that we start the search from thd (thd == blocker).
-      ret == WT_OK means that no cycle was found and arg.rc == thd->waiting_for.
-      and arg.rc->owners.elements == 0 means that (applying the rule above)
-      thd->waiting_for->owners.elements == 0, and thd doesn't have anybody to
-      wait for.
+      ret == WT_OK means that no cycle was found and
+        arg.last_locked_rc == thd->waiting_for.
+      and arg.last_locked_rc->owners.elements == 0 means that
+        (applying the rule above) thd->waiting_for->owners.elements == 0,
+        and thd doesn't have anybody to wait for.
     */
-    if (depth == 0 && ret == WT_OK && arg.rc->owners.elements == 0)
+    if (depth == 0 && ret == WT_OK && arg.last_locked_rc->owners.elements == 0)
     {
       DBUG_ASSERT(thd == blocker);
-      DBUG_ASSERT(arg.rc == thd->waiting_for);
+      DBUG_ASSERT(arg.last_locked_rc == thd->waiting_for);
       ret= WT_FREE_TO_GO;
     }
-    rc_unlock(arg.rc);
+    rc_unlock(arg.last_locked_rc);
   }
   /* notify the victim, if appropriate */
   if (ret == WT_DEADLOCK && arg.victim != thd)
   {
     DBUG_PRINT("wt", ("killing %s", arg.victim->name));
-    arg.victim->killed=1;
+    arg.victim->killed= 1;
     pthread_cond_broadcast(&arg.victim->waiting_for->cond);
     rc_unlock(arg.victim->waiting_for);
     ret= WT_OK;
@@ -659,7 +818,7 @@ static int unlock_lock_and_free_resource
 
   if (rc->owners.elements || rc->waiter_count)
   {
-    DBUG_PRINT("wt", ("nothing to do, %d owners, %d waiters",
+    DBUG_PRINT("wt", ("nothing to do, %u owners, %u waiters",
                       rc->owners.elements, rc->waiter_count));
     rc_unlock(rc);
     DBUG_RETURN(0);
@@ -683,12 +842,8 @@ static int unlock_lock_and_free_resource
      2. set the state to FREE
      3. release the lock
      4. remove from the hash
-
-     I *think* it's safe to release the lock while the element is still
-     in the hash. If not, the corrected procedure should be
-     3. pin; 4; remove; 5; release; 6; unpin and it'll need pin[3].
   */
-  rc->state=FREE;
+  rc->state= FREE;
   rc_unlock(rc);
   DBUG_RETURN(lf_hash_delete(&reshash, thd->pins, key, keylen) == -1);
 }
@@ -739,15 +894,19 @@ static int stop_waiting(WT_THD *thd)
 /**
   notify the system that a thread needs to wait for another thread
 
-  called by a *waiter* to declare what resource it will wait for.
+  called by a *waiter* to declare that it (thd) will wait for another
+  thread (blocker) on a specific resource (resid).
   can be called many times, if many blockers own a blocking resource.
   but must always be called with the same resource id - a thread cannot
   wait for more than one resource at a time.
 
+  @return WT_OK or WT_DEADLOCK
+
   As a new edge is added to the wait-for graph, a deadlock detection is
   performed for this new edge.
 */
-int wt_thd_will_wait_for(WT_THD *thd, WT_THD *blocker, WT_RESOURCE_ID *resid)
+int wt_thd_will_wait_for(WT_THD *thd, WT_THD *blocker,
+                         const WT_RESOURCE_ID *resid)
 {
   uint i;
   WT_RESOURCE *rc;
@@ -822,7 +981,7 @@ retry:
 
     /*
       we can safely access the resource here, it's in the hash as it has
-      at least one owner, and non-zero waiter_count
+      non-zero waiter_count
     */
     rc= thd->waiting_for;
     rc_wrlock(rc);
@@ -835,7 +994,11 @@ retry:
       DBUG_RETURN(WT_DEADLOCK);
     }
   }
-  for (i=0; i < rc->owners.elements; i++)
+  /*
+    Another thread could be waiting on this resource for this very 'blocker'.
+    In this case we should not add it to the list for the second time.
+  */
+  for (i= 0; i < rc->owners.elements; i++)
     if (*dynamic_element(&rc->owners, i, WT_THD**) == blocker)
       break;
   if (i >= rc->owners.elements)
@@ -854,19 +1017,21 @@ retry:
   }
   rc_unlock(rc);
 
-  if (deadlock(thd, blocker, 1, *thd->deadlock_search_depth_short))
+  if (deadlock(thd, blocker, 1, *thd->deadlock_search_depth_short) != WT_OK)
   {
     stop_waiting(thd);
     DBUG_RETURN(WT_DEADLOCK);
   }
-  DBUG_RETURN(0);
+  DBUG_RETURN(WT_OK);
 }
 
 /**
-  called by a *waiter* to start waiting
+  called by a *waiter* (thd) to start waiting
 
   It's supposed to be a drop-in replacement for
   pthread_cond_timedwait(), and it takes mutex as an argument.
+
+  @return one of WT_TIMEOUT, WT_DEADLOCK, WT_OK
 */
 int wt_thd_cond_timedwait(WT_THD *thd, pthread_mutex_t *mutex)
 {
@@ -878,10 +1043,10 @@ int wt_thd_cond_timedwait(WT_THD *thd, p
   DBUG_PRINT("wt", ("enter: thd=%s, rc=%p", thd->name, rc));
 
 #ifndef DBUG_OFF
-  if (rc->mutex)
-    DBUG_ASSERT(rc->mutex == mutex);
+  if (rc->cond_mutex)
+    DBUG_ASSERT(rc->cond_mutex == mutex);
   else
-    rc->mutex= mutex;
+    rc->cond_mutex= mutex;
   safe_mutex_assert_owner(mutex);
 #endif
 
@@ -890,20 +1055,27 @@ int wt_thd_cond_timedwait(WT_THD *thd, p
 #ifdef __WIN__
   /*
     only for the sake of Windows we distinguish between
-    'before' and 'starttime'
+    'before' and 'starttime':
+
+    my_getsystime() returns high-resolution value, that cannot be used for
+    waiting (it doesn't follow system clock changes), but is good for time
+    intervals.
+
+    GetSystemTimeAsFileTime() follows system clock, but is low-resolution
+    and will result in lousy intervals.
   */
   GetSystemTimeAsFileTime((PFILETIME)&starttime);
 #endif
 
   rc_wrlock(rc);
-  if (rc->owners.elements == 0 || thd->killed)
+  if (rc->owners.elements == 0)
     ret= WT_OK;
   rc_unlock(rc);
 
   set_timespec_time_nsec(timeout, starttime, (*thd->timeout_short)*ULL(1000));
-  if (ret == WT_TIMEOUT)
+  if (ret == WT_TIMEOUT && !thd->killed)
     ret= pthread_cond_timedwait(&rc->cond, mutex, &timeout);
-  if (ret == WT_TIMEOUT)
+  if (ret == WT_TIMEOUT && !thd->killed)
   {
     int r= deadlock(thd, thd, 0, *thd->deadlock_search_depth_long);
     if (r == WT_FREE_TO_GO)
@@ -935,24 +1107,25 @@ int wt_thd_cond_timedwait(WT_THD *thd, p
   @param resid   a resource to release. 0 to release all resources
 */
 
-void wt_thd_release(WT_THD *thd, WT_RESOURCE_ID *resid)
+void wt_thd_release(WT_THD *thd, const WT_RESOURCE_ID *resid)
 {
   uint i;
   DBUG_ENTER("wt_thd_release");
 
-  for (i=0; i < thd->my_resources.elements; i++)
+  for (i= 0; i < thd->my_resources.elements; i++)
   {
-    uint j;
     WT_RESOURCE *rc= *dynamic_element(&thd->my_resources, i, WT_RESOURCE**);
     if (!resid || (resid->type->compare(&rc->id, resid) == 0))
     {
+      uint j;
+
       rc_wrlock(rc);
       /*
         nobody's trying to free the resource now,
         as its owners[] array is not empty (at least thd must be there)
       */
       DBUG_ASSERT(rc->state == ACTIVE);
-      for (j=0; j < rc->owners.elements; j++)
+      for (j= 0; j < rc->owners.elements; j++)
         if (*dynamic_element(&rc->owners, j, WT_THD**) == thd)
           break;
       DBUG_ASSERT(j < rc->owners.elements);
@@ -961,8 +1134,8 @@ void wt_thd_release(WT_THD *thd, WT_RESO
       {
         pthread_cond_broadcast(&rc->cond);
 #ifndef DBUG_OFF
-        if (rc->mutex)
-          safe_mutex_assert_owner(rc->mutex);
+        if (rc->cond_mutex)
+          safe_mutex_assert_owner(rc->cond_mutex);
 #endif
       }
       unlock_lock_and_free_resource(thd, rc);

=== modified file 'server-tools/instance-manager/parse.cc'
--- a/server-tools/instance-manager/parse.cc	2007-05-10 09:59:39 +0000
+++ b/server-tools/instance-manager/parse.cc	2009-01-15 21:27:36 +0000
@@ -78,7 +78,7 @@ Named_value_arr::Named_value_arr() :
 
 bool Named_value_arr::init()
 {
-  if (my_init_dynamic_array(&arr, sizeof(Named_value), 0, 32))
+  if (my_init_dynamic_array(&arr, sizeof(Named_value), 32, 32))
     return TRUE;
 
   initialized= TRUE;

=== modified file 'sql/mysqld.cc'
--- a/sql/mysqld.cc	2008-12-10 09:02:25 +0000
+++ b/sql/mysqld.cc	2009-01-15 21:27:36 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2000-2003 MySQL AB
+/* Copyright (C) 2000-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -3718,8 +3718,6 @@ static int init_server_components()
   if (table_cache_init() | table_def_init() | hostname_cache_init())
     unireg_abort(1);
 
-  wt_init();
-
   query_cache_result_size_limit(query_cache_limit);
   query_cache_set_min_res_unit(query_cache_min_res_unit);
   query_cache_init();
@@ -3731,6 +3729,7 @@ static int init_server_components()
 #ifdef HAVE_REPLICATION
   init_slave_list();
 #endif
+  wt_init();
 
   /* Setup logs */
 
@@ -7471,7 +7470,7 @@ static void usage(void)
     default_collation_name= (char*) default_charset_info->name;
   print_version();
   puts("\
-Copyright (C) 2000 MySQL AB, by Monty and others\n\
+Copyright (C) 2000-2008 MySQL AB, Monty and others, 2008-2009 Sun Microsystems, Inc.\n\
 This software comes with ABSOLUTELY NO WARRANTY. This is free software,\n\
 and you are welcome to modify and redistribute it under the GPL license\n\n\
 Starts the MySQL database server\n");

=== modified file 'sql/sql_class.cc'
--- a/sql/sql_class.cc	2008-12-10 09:02:25 +0000
+++ b/sql/sql_class.cc	2009-01-15 21:27:36 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2000-2006 MySQL AB
+/* Copyright (C) 2000-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -1123,6 +1123,8 @@ bool THD::store_globals()
   */
   mysys_var->id= thread_id;
   real_id= pthread_self();                      // For debugging
+  mysys_var->stack_ends_here= thread_stack +    // for consistency, see libevent_thread_proc
+                              STACK_DIRECTION * (long)my_thread_stack_size;
 
   /*
     We have to call thr_lock_info_init() again here as THD may have been

=== modified file 'sql/sql_class.h'
--- a/sql/sql_class.h	2008-12-04 00:36:55 +0000
+++ b/sql/sql_class.h	2009-01-15 21:27:36 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2000-2006 MySQL AB
+/* Copyright (C) 2000-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -1417,7 +1417,7 @@ public:
     THD_TRANS stmt;			// Trans for current statement
     bool on;                            // see ha_enable_transaction()
     XID_STATE xid_state;
-    WT_THD wt;
+    WT_THD wt;                          ///< for deadlock detection
     Rows_log_event *m_pending_rows_event;
 
     /*

=== modified file 'sql/sql_insert.cc'
--- a/sql/sql_insert.cc	2009-01-09 04:11:37 +0000
+++ b/sql/sql_insert.cc	2009-01-16 09:38:02 +0000
@@ -3493,6 +3493,12 @@ static TABLE *create_table_from_items(TH
                                     MYSQL_LOCK_IGNORE_FLUSH, &not_used)) ||
         hooks->postlock(&table, 1))
   {
+    /* purecov: begin tested */
+    /*
+      This can happen in innodb when you get a deadlock when using same table
+      in insert and select
+    */
+    my_error(ER_CANT_LOCK, MYF(0), my_errno);
     if (*lock)
     {
       mysql_unlock_tables(thd, *lock);
@@ -3502,6 +3508,7 @@ static TABLE *create_table_from_items(TH
     if (!create_info->table_existed)
       drop_open_table(thd, table, create_table->db, create_table->table_name);
     DBUG_RETURN(0);
+    /* purecov: end */
   }
   DBUG_RETURN(table);
 }

=== modified file 'storage/maria/ha_maria.cc'
--- a/storage/maria/ha_maria.cc	2008-12-22 00:17:37 +0000
+++ b/storage/maria/ha_maria.cc	2009-01-16 16:18:17 +0000
@@ -1,4 +1,5 @@
-/* Copyright (C) 2006,2004 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+/* Copyright (C) 2004-2008 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+   Copyright (C) 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -40,9 +41,11 @@ C_MODE_END
   Note that in future versions, only *transactional* Maria tables can
   rollback, so this flag should be up or down conditionally.
 */
-#define MARIA_CANNOT_ROLLBACK HA_NO_TRANSACTIONS
 #ifdef MARIA_CANNOT_ROLLBACK
+#define CANNOT_ROLLBACK_FLAG HA_NO_TRANSACTIONS
 #define trans_register_ha(A, B, C)  do { /* nothing */ } while(0)
+#else
+#define CANNOT_ROLLBACK_FLAG 0
 #endif
 #define THD_TRN (*(TRN **)thd_ha_data(thd, maria_hton))
 
@@ -716,7 +719,7 @@ handler(hton, table_arg), file(0),
 int_table_flags(HA_NULL_IN_KEY | HA_CAN_FULLTEXT | HA_CAN_SQL_HANDLER |
                 HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE |
                 HA_DUPLICATE_POS | HA_CAN_INDEX_BLOBS | HA_AUTO_PART_KEY |
-                HA_FILE_BASED | HA_CAN_GEOMETRY | MARIA_CANNOT_ROLLBACK |
+                HA_FILE_BASED | HA_CAN_GEOMETRY | CANNOT_ROLLBACK_FLAG |
                 HA_CAN_BIT_FIELD | HA_CAN_RTREEKEYS |
                 HA_HAS_RECORDS | HA_STATS_RECORDS_IS_EXACT),
 can_enable_indexes(1), bulk_insert_single_undo(BULK_INSERT_NONE)
@@ -2261,6 +2264,9 @@ int ha_maria::extra_opt(enum ha_extra_fu
 
 int ha_maria::delete_all_rows()
 {
+  THD *thd= current_thd;
+  (void) translog_log_debug_info(file->trn, LOGREC_DEBUG_INFO_QUERY,
+                                 (uchar*) thd->query, thd->query_length);
   if (file->s->now_transactional &&
       ((table->in_use->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) ||
        table->in_use->locked_tables))
@@ -2277,6 +2283,9 @@ int ha_maria::delete_all_rows()
 
 int ha_maria::delete_table(const char *name)
 {
+  THD *thd= current_thd;
+  (void) translog_log_debug_info(0, LOGREC_DEBUG_INFO_QUERY,
+                                 (uchar*) thd->query, thd->query_length);
   return maria_delete_table(name);
 }
 
@@ -2350,6 +2359,16 @@ int ha_maria::external_lock(THD *thd, in
         DBUG_PRINT("info", ("Disabling logging for table"));
         _ma_tmp_disable_logging_for_table(file, TRUE);
       }
+#ifdef EXTRA_DEBUG
+      if (lock_type == F_WRLCK &&
+          ! (trnman_get_flags(trn) & TRN_STATE_INFO_LOGGED))
+      {
+        trnman_set_flags(trn, trnman_get_flags(trn) | TRN_STATE_INFO_LOGGED |
+                         TRN_STATE_TABLES_CAN_CHANGE);
+        (void) translog_log_debug_info(trn, LOGREC_DEBUG_INFO_QUERY,
+                                       (uchar*) thd->query, thd->query_length);
+      }
+#endif
     }
     else
     {
@@ -2374,9 +2393,10 @@ int ha_maria::external_lock(THD *thd, in
         external lock of the table
       */
       file->state= &file->s->state.state;
-      if (trn && trnman_has_locked_tables(trn))
+      if (trn)
       {
-        if (!trnman_decrement_locked_tables(trn))
+        if (trnman_has_locked_tables(trn) &&
+            !trnman_decrement_locked_tables(trn))
         {
           /*
             OK should not have been sent to client yet (ACID).
@@ -2399,6 +2419,7 @@ int ha_maria::external_lock(THD *thd, in
           }
 #endif
         }
+        trnman_set_flags(trn, trnman_get_flags(trn) & ~ TRN_STATE_INFO_LOGGED);
       }
     }
   } /* if transactional table */
@@ -2433,6 +2454,16 @@ int ha_maria::start_stmt(THD *thd, thr_l
       call to start_stmt().
     */
     trnman_new_statement(trn);
+
+#ifdef EXTRA_DEBUG
+    if (!(trnman_get_flags(trn) & TRN_STATE_INFO_LOGGED) &&
+        trnman_get_flags(trn) & TRN_STATE_TABLES_CAN_CHANGE)
+    {
+      trnman_set_flags(trn, trnman_get_flags(trn) | TRN_STATE_INFO_LOGGED);
+      (void) translog_log_debug_info(trn, LOGREC_DEBUG_INFO_QUERY,
+                                     (uchar*) thd->query, thd->query_length);
+    }
+#endif
   }
   return 0;
 }
@@ -2645,6 +2676,7 @@ int ha_maria::create(const char *name, r
   TABLE_SHARE *share= table_arg->s;
   uint options= share->db_options_in_use;
   enum data_file_type row_type;
+  THD *thd= current_thd;
   DBUG_ENTER("ha_maria::create");
 
   for (i= 0; i < share->keys; i++)
@@ -2709,6 +2741,9 @@ int ha_maria::create(const char *name, r
        ha_create_info->page_checksum ==  HA_CHOICE_YES)
     create_flags|= HA_CREATE_PAGE_CHECKSUM;
 
+  (void) translog_log_debug_info(0, LOGREC_DEBUG_INFO_QUERY,
+                                 (uchar*) thd->query, thd->query_length);
+
   /* TODO: Check that the following fn_format is really needed */
   error=
     maria_create(fn_format(buff, name, "", "",
@@ -2725,6 +2760,9 @@ int ha_maria::create(const char *name, r
 
 int ha_maria::rename_table(const char *from, const char *to)
 {
+  THD *thd= current_thd;
+  (void) translog_log_debug_info(0, LOGREC_DEBUG_INFO_QUERY,
+                                 (uchar*) thd->query, thd->query_length);
   return maria_rename(from, to);
 }
 
@@ -2869,6 +2907,8 @@ static int maria_commit(handlerton *hton
   TRN *trn= THD_TRN;
   DBUG_ENTER("maria_commit");
   trnman_reset_locked_tables(trn, 0);
+  trnman_set_flags(trn, trnman_get_flags(trn) & ~TRN_STATE_INFO_LOGGED);
+
   /* statement or transaction ? */
   if ((thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) && !all)
     DBUG_RETURN(0); // end of statement

=== modified file 'storage/maria/ma_blockrec.c'
--- a/storage/maria/ma_blockrec.c	2009-01-09 04:11:37 +0000
+++ b/storage/maria/ma_blockrec.c	2009-01-15 15:14:47 +0000
@@ -3146,8 +3146,8 @@ static my_bool write_block_record(MARIA_
 
           log_pos= store_page_range(log_pos, tmp_block, block_size,
                                     blob_length, &extents);
-          tmp_block+= tmp_block->sub_blocks;
         }
+        tmp_block+= tmp_block->sub_blocks;
       }
     }
 
@@ -6182,6 +6182,7 @@ err:
                              PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                              LSN_IMPOSSIBLE, 0, FALSE);
   _ma_mark_file_crashed(share);
+  DBUG_ASSERT(0); /* catch recovery errors early */
   DBUG_RETURN((my_errno= error));
 }
 
@@ -6280,6 +6281,7 @@ err:
                            PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                            LSN_IMPOSSIBLE, 0, FALSE);
   _ma_mark_file_crashed(share);
+  DBUG_ASSERT(0);
   DBUG_RETURN((my_errno= error));
 
 }
@@ -6337,6 +6339,7 @@ uint _ma_apply_redo_free_blocks(MARIA_HA
     if (res)
     {
       _ma_mark_file_crashed(share);
+      DBUG_ASSERT(0);
       DBUG_RETURN(res);
     }
   }
@@ -6420,6 +6423,7 @@ uint _ma_apply_redo_free_head_or_tail(MA
 
 err:
   _ma_mark_file_crashed(share);
+  DBUG_ASSERT(0);
   DBUG_RETURN(1);
 }
 
@@ -6431,6 +6435,10 @@ err:
    @parma  lsn             LSN to put on pages
    @param  header          Header (with FILEID)
    @param  redo_lsn        REDO record's LSN
+   @param[out] number_of_blobs Number of blobs found in log record
+   @param[out] number_of_ranges Number of ranges found
+   @param[out] first_page  First page touched
+   @param[out] last_page   Last page touched
 
    @note Write full pages (full head & blob pages)
 
@@ -6441,13 +6449,18 @@ err:
 
 uint _ma_apply_redo_insert_row_blobs(MARIA_HA *info,
                                      LSN lsn, const uchar *header,
-                                     LSN redo_lsn)
+                                     LSN redo_lsn,
+                                     uint * const number_of_blobs,
+                                     uint * const number_of_ranges,
+                                     pgcache_page_no_t * const first_page,
+                                     pgcache_page_no_t * const last_page)
 {
   MARIA_SHARE *share= info->s;
   const uchar *data;
   uint      data_size= FULL_PAGE_SIZE(share->block_size);
   uint      blob_count, ranges;
   uint16    sid;
+  pgcache_page_no_t first_page2= ULONGLONG_MAX, last_page2= 0;
   DBUG_ENTER("_ma_apply_redo_insert_row_blobs");
 
   share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
@@ -6455,9 +6468,9 @@ uint _ma_apply_redo_insert_row_blobs(MAR
 
   sid= fileid_korr(header);
   header+= FILEID_STORE_SIZE;
-  ranges= pagerange_korr(header);
+  *number_of_ranges= ranges= pagerange_korr(header);
   header+= PAGERANGE_STORE_SIZE;
-  blob_count= pagerange_korr(header);
+  *number_of_blobs= blob_count= pagerange_korr(header);
   header+= PAGERANGE_STORE_SIZE;
   DBUG_ASSERT(ranges >= blob_count);
 
@@ -6495,6 +6508,8 @@ uint _ma_apply_redo_insert_row_blobs(MAR
         enum pagecache_page_pin unpin_method;
         uint length;
 
+        set_if_smaller(first_page2, page);
+        set_if_bigger(last_page2, page);
         if (_ma_redo_not_needed_for_page(sid, redo_lsn, page, FALSE))
           continue;
 
@@ -6545,15 +6560,22 @@ uint _ma_apply_redo_insert_row_blobs(MAR
           }
           else
           {
+#ifndef DBUG_OFF
+            uchar found_page_type= (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK);
+#endif
             if (lsn_korr(buff) >= lsn)
             {
               /* Already applied */
+              DBUG_PRINT("info", ("already applied %llu >= %llu",
+                                  lsn_korr(buff), lsn));
               pagecache_unlock_by_link(share->pagecache, page_link.link,
                                        PAGECACHE_LOCK_WRITE_UNLOCK,
                                        PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                                        LSN_IMPOSSIBLE, 0, FALSE);
               continue;
             }
+            DBUG_ASSERT((found_page_type == (uchar) BLOB_PAGE) ||
+                        (found_page_type == (uchar) UNALLOCATED_PAGE));
           }
           unlock_method= PAGECACHE_LOCK_WRITE_UNLOCK;
           unpin_method=  PAGECACHE_UNPIN;
@@ -6595,10 +6617,13 @@ uint _ma_apply_redo_insert_row_blobs(MAR
         goto err;
     }
   }
+  *first_page= first_page2;
+  *last_page=  last_page2;
   DBUG_RETURN(0);
 
 err:
   _ma_mark_file_crashed(share);
+  DBUG_ASSERT(0);
   DBUG_RETURN(1);
 }
 

=== modified file 'storage/maria/ma_blockrec.h'
--- a/storage/maria/ma_blockrec.h	2008-12-05 21:11:46 +0000
+++ b/storage/maria/ma_blockrec.h	2009-01-15 15:14:47 +0000
@@ -236,7 +236,11 @@ uint _ma_apply_redo_free_blocks(MARIA_HA
 uint _ma_apply_redo_free_head_or_tail(MARIA_HA *info, LSN lsn,
                                       const uchar *header);
 uint _ma_apply_redo_insert_row_blobs(MARIA_HA *info, LSN lsn,
-                                     const uchar *header, LSN redo_lsn);
+                                     const uchar *header, LSN redo_lsn,
+                                     uint * const number_of_blobs,
+                                     uint * const number_of_ranges,
+                                     pgcache_page_no_t * const first_page,
+                                     pgcache_page_no_t * const last_page);
 my_bool _ma_apply_redo_bitmap_new_page(MARIA_HA *info, LSN lsn,
                                        const uchar *header);
 my_bool _ma_apply_undo_row_insert(MARIA_HA *info, LSN undo_lsn,

=== modified file 'storage/maria/ma_commit.c'
--- a/storage/maria/ma_commit.c	2008-12-05 21:11:46 +0000
+++ b/storage/maria/ma_commit.c	2009-01-15 21:27:36 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007 MySQL AB
+/* Copyright (C) 2007-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -17,7 +17,7 @@
 #include "trnman.h"
 
 /**
-   @brief writes a COMMIT record to log and commits transaction in memory
+   writes a COMMIT record to log and commits transaction in memory
 
    @param  trn              transaction
 
@@ -82,7 +82,7 @@ int ma_commit(TRN *trn)
 
 
 /**
-   @brief Writes a COMMIT record for a transaciton associated with a file
+   Writes a COMMIT record for a transaction associated with a file
 
    @param  info              Maria handler
 
@@ -98,13 +98,17 @@ int maria_commit(MARIA_HA *info)
 
 
 /**
-   @brief Starts a transaction on a file handle
+   Starts a transaction on a file handle
 
    @param  info              Maria handler
 
    @return Operation status
      @retval 0      ok
      @retval #      Error code.
+
+   @note this can be used only in single-threaded programs (tests),
+   because we create a transaction (trnman_new_trn) with WT_THD=0.
+   XXX it needs to be fixed when we'll start using maria_begin from SQL.
 */
 
 int maria_begin(MARIA_HA *info)

=== modified file 'storage/maria/ma_create.c'
--- a/storage/maria/ma_create.c	2008-10-15 20:00:35 +0000
+++ b/storage/maria/ma_create.c	2009-01-16 21:10:31 +0000
@@ -652,7 +652,8 @@ int maria_create(const char *name, enum 
   /* There are only 16 bits for the total header length. */
   if (info_length > 65535)
   {
-    my_printf_error(0, "Maria table '%s' has too many columns and/or "
+    my_printf_error(HA_WRONG_CREATE_OPTION,
+                    "Maria table '%s' has too many columns and/or "
                     "indexes and/or unique constraints.",
                     MYF(0), name + dirname_length(name));
     my_errno= HA_WRONG_CREATE_OPTION;
@@ -750,6 +751,13 @@ int maria_create(const char *name, enum 
       (via maria_recreate_table()) and it does not have a log.
     */
     sync_dir= MY_SYNC_DIR;
+    /*
+      If crash between _ma_state_info_write_sub() and
+      _ma_update_state_lsns_sub(), table should be ignored by Recovery (or
+      old REDOs would fail), so we cannot let LSNs be 0:
+    */
+    share.state.skip_redo_lsn= share.state.is_of_horizon=
+      share.state.create_rename_lsn= LSN_MAX;
   }
 
   if (datafile_type == DYNAMIC_RECORD)
@@ -1059,11 +1067,21 @@ int maria_create(const char *name, enum 
                                        log_array, NULL, NULL) ||
                  translog_flush(lsn)))
       goto err;
+    share.kfile.file= file;
+    DBUG_EXECUTE_IF("maria_flush_whole_log",
+                    {
+                      DBUG_PRINT("maria_flush_whole_log", ("now"));
+                      translog_flush(translog_get_horizon());
+                    });
+    DBUG_EXECUTE_IF("maria_crash_create_table",
+                    {
+                      DBUG_PRINT("maria_crash_create_table", ("now"));
+                      DBUG_ABORT();
+                    });
     /*
       store LSN into file, needed for Recovery to not be confused if a
       DROP+CREATE happened (applying REDOs to the wrong table).
     */
-    share.kfile.file= file;
     if (_ma_update_state_lsns_sub(&share, lsn, trnman_get_min_safe_trid(),
                                   FALSE, TRUE))
       goto err;

=== modified file 'storage/maria/ma_delete_all.c'
--- a/storage/maria/ma_delete_all.c	2008-06-26 05:18:28 +0000
+++ b/storage/maria/ma_delete_all.c	2009-01-16 21:00:39 +0000
@@ -115,6 +115,12 @@ int maria_delete_all_rows(MARIA_HA *info
       but redo_insert are skipped (dirty pages list is empty).
       To avoid this, we need to set skip_redo_lsn now, and thus need to sync
       files.
+      Also fixes the problem of:
+      bulk insert; insert; delete_all; crash:
+      "bulk insert" is skipped (no REDOs), so if "insert" would not be skipped
+      (if we didn't update skip_redo_lsn below) then "insert" would be tried
+      and fail, saying that it sees that the first page has to be created
+      though the inserted row has rownr>0.
     */
     my_bool error= _ma_state_info_write(share, 1|4) ||
       _ma_update_state_lsns(share, lsn, trnman_get_min_trid(), FALSE, FALSE) ||

=== modified file 'storage/maria/ma_loghandler.c'
--- a/storage/maria/ma_loghandler.c	2008-12-08 10:06:08 +0000
+++ b/storage/maria/ma_loghandler.c	2009-01-16 09:38:02 +0000
@@ -685,6 +685,10 @@ static LOG_DESC INIT_LOGREC_IMPORTED_TAB
 {LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0,
  "imported_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
 
+static LOG_DESC INIT_LOGREC_DEBUG_INFO=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0,
+ "info", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
 const myf log_write_flags= MY_WME | MY_NABP | MY_WAIT_IF_FULL;
 
 void translog_table_init()
@@ -774,6 +778,9 @@ void translog_table_init()
     INIT_LOGREC_REDO_BITMAP_NEW_PAGE;
   log_record_type_descriptor[LOGREC_IMPORTED_TABLE]=
     INIT_LOGREC_IMPORTED_TABLE;
+  log_record_type_descriptor[LOGREC_DEBUG_INFO]=
+    INIT_LOGREC_DEBUG_INFO;
+
   for (i= LOGREC_FIRST_FREE; i < LOGREC_NUMBER_OF_TYPES; i++)
     log_record_type_descriptor[i].rclass= LOGRECTYPE_NOT_ALLOWED;
 #ifndef DBUG_OFF
@@ -8299,6 +8306,46 @@ void translog_set_file_size(uint32 size)
   DBUG_VOID_RETURN;
 }
 
+
+/**
+   Write debug information to log if EXTRA_DEBUG is enabled
+*/
+
+my_bool translog_log_debug_info(TRN *trn __attribute__((unused)),
+                                enum translog_debug_info_type type
+                                __attribute__((unused)),
+                                uchar *info __attribute__((unused)),
+                                size_t length __attribute__((unused)))
+{
+#ifdef EXTRA_DEBUG
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
+  uchar debug_type;
+  LSN lsn;
+
+  if (!trn)
+  {
+    /*
+      We can't log the current transaction because we don't have
+      an active transaction. Use a temporary transaction object instead
+    */
+    trn= &dummy_transaction_object;
+  }
+  debug_type= (uchar) type;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].str= &debug_type;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= 1;
+  log_array[TRANSLOG_INTERNAL_PARTS + 1].str= info;
+  log_array[TRANSLOG_INTERNAL_PARTS + 1].length= length;
+  return translog_write_record(&lsn, LOGREC_DEBUG_INFO,
+                               trn, NULL,
+                               (translog_size_t) (1+ length),
+                               sizeof(log_array)/sizeof(log_array[0]),
+                               log_array, NULL, NULL);
+#else
+  return 0;
+#endif
+}
+
+
 #ifdef MARIA_DUMP_LOG
 #include <my_getopt.h>
 extern void translog_example_table_init();

=== modified file 'storage/maria/ma_loghandler.h'
--- a/storage/maria/ma_loghandler.h	2008-07-09 09:02:27 +0000
+++ b/storage/maria/ma_loghandler.h	2009-01-15 22:25:53 +0000
@@ -144,6 +144,7 @@ enum translog_record_type
   LOGREC_UNDO_BULK_INSERT,
   LOGREC_REDO_BITMAP_NEW_PAGE,
   LOGREC_IMPORTED_TABLE,
+  LOGREC_DEBUG_INFO,
   LOGREC_FIRST_FREE,
   LOGREC_RESERVED_FUTURE_EXTENSION= 63
 };
@@ -167,6 +168,12 @@ enum en_key_op
   KEY_OP_COMPACT_PAGE   /* Compact key page */
 };
 
+
+enum translog_debug_info_type
+{
+  LOGREC_DEBUG_INFO_QUERY
+};
+
 /* Size of log file; One log file is restricted to 4G */
 typedef uint32 translog_size_t;
 
@@ -323,6 +330,9 @@ translog_assign_id_to_share_from_recover
 extern my_bool translog_walk_filenames(const char *directory,
                                        my_bool (*callback)(const char *,
                                                            const char *));
+extern my_bool translog_log_debug_info(TRN *trn,
+                                       enum translog_debug_info_type type,
+                                       uchar *info, size_t length);
 
 enum enum_translog_status
 {

=== modified file 'storage/maria/ma_recovery.c'
--- a/storage/maria/ma_recovery.c	2008-12-09 09:56:02 +0000
+++ b/storage/maria/ma_recovery.c	2009-01-15 22:25:53 +0000
@@ -98,6 +98,7 @@ prototype_redo_exec_hook(UNDO_KEY_DELETE
 prototype_redo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT);
 prototype_redo_exec_hook(COMMIT);
 prototype_redo_exec_hook(CLR_END);
+prototype_redo_exec_hook(DEBUG_INFO);
 prototype_undo_exec_hook(UNDO_ROW_INSERT);
 prototype_undo_exec_hook(UNDO_ROW_DELETE);
 prototype_undo_exec_hook(UNDO_ROW_UPDATE);
@@ -488,6 +489,11 @@ static void display_record_position(cons
          number ? "" : "   ", number, LSN_IN_PARTS(rec->lsn),
          rec->short_trid, log_desc->name, rec->type,
          (ulong)rec->record_length);
+  if (rec->type == LOGREC_DEBUG_INFO)
+  {
+    /* Print some extra information */
+    (*log_desc->record_execute_in_redo_phase)(rec);
+  }
 }
 
 
@@ -1412,6 +1418,9 @@ prototype_redo_exec_hook(REDO_INSERT_ROW
 {
   int error= 1;
   uchar *buff;
+  uint number_of_blobs, number_of_ranges;
+  pgcache_page_no_t first_page, last_page;
+  char llbuf1[22], llbuf2[22];
   MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
   if (info == NULL)
     return 0;
@@ -1426,11 +1435,19 @@ prototype_redo_exec_hook(REDO_INSERT_ROW
   }
   buff= log_record_buffer.str;
   if (_ma_apply_redo_insert_row_blobs(info, current_group_end_lsn,
-                                      buff, rec->lsn))
-    goto end;
+                                      buff, rec->lsn, &number_of_blobs,
+                                      &number_of_ranges,
+                                      &first_page, &last_page))
+    goto end;
+  llstr(first_page, llbuf1);
+  llstr(last_page, llbuf2);
+  tprint(tracef, " %u blobs %u ranges, first page %s last %s",
+         number_of_blobs, number_of_ranges, llbuf1, llbuf2);
+
   error= 0;
 
 end:
+  tprint(tracef, " \n");
   return error;
 }
 
@@ -1993,6 +2010,37 @@ prototype_redo_exec_hook(CLR_END)
 
 
 /**
+   Hook to print debug information (like MySQL query)
+*/
+
+prototype_redo_exec_hook(DEBUG_INFO)
+{
+  enum translog_debug_info_type debug_info;
+
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+      rec->record_length)
+  {
+    eprint(tracef, "Failed to read record debug record");
+    return 1;
+  }
+  /* byte 0 is the info type; the payload after it is NOT NUL-terminated */
+  debug_info= (enum translog_debug_info_type) log_record_buffer.str[0];
+  switch (debug_info) {
+  case LOGREC_DEBUG_INFO_QUERY:
+    tprint(tracef, "Query: %.*s\n", (int) (rec->record_length - 1),
+           (char*) log_record_buffer.str + 1);
+    break;
+  default:
+    DBUG_ASSERT(0);
+  }
+  return 0;
+}
+
+
+/**
   In some cases we have to skip execution of an UNDO record during the UNDO
   phase.
 */
@@ -2350,6 +2398,7 @@ static int run_redo_phase(LSN lsn, enum 
   install_redo_exec_hook(UNDO_BULK_INSERT);
   install_undo_exec_hook(UNDO_BULK_INSERT);
   install_redo_exec_hook(IMPORTED_TABLE);
+  install_redo_exec_hook(DEBUG_INFO);
 
   current_group_end_lsn= LSN_IMPOSSIBLE;
 #ifndef DBUG_OFF
@@ -3392,6 +3441,7 @@ static void print_redo_phase_progress(TR
   }
 }
 
+
 #ifdef MARIA_EXTERNAL_LOCKING
 #error Marias Checkpoint and Recovery are really not ready for it
 #endif

=== modified file 'storage/maria/ma_write.c'
--- a/storage/maria/ma_write.c	2008-12-22 00:17:37 +0000
+++ b/storage/maria/ma_write.c	2009-01-15 21:27:36 +0000
@@ -1,4 +1,5 @@
-/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+/* Copyright (C) 2004-2008 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+   Copyright (C) 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -39,8 +40,8 @@ static uchar *_ma_find_last_pos(MARIA_HA
                                 uchar *page, uchar **after_key);
 static my_bool _ma_ck_write_tree(register MARIA_HA *info, MARIA_KEY *key);
 static my_bool _ma_ck_write_btree(register MARIA_HA *info, MARIA_KEY *key);
-static int _ma_ck_write_btree_with_log(MARIA_HA *info, MARIA_KEY *key,
-                                       my_off_t *root, uint32 comp_flag);
+static int _ma_ck_write_btree_with_log(MARIA_HA *, MARIA_KEY *, my_off_t *,
+                                       uint32);
 static my_bool _ma_log_split(MARIA_HA *info, my_off_t page, const uchar *buff,
                              uint org_length, uint new_length,
                              const uchar *key_pos,
@@ -181,9 +182,8 @@ int maria_write(MARIA_HA *info, uchar *r
       else
       {
         while (keyinfo->ck_insert(info,
-                                  (*keyinfo->make_key)(info, &int_key, i,
-                                                       buff, record, filepos,
-                                                       info->trn->trid)))
+                 (*keyinfo->make_key)(info, &int_key, i, buff, record,
+                                      filepos, info->trn->trid)))
         {
           TRN *blocker;
           DBUG_PRINT("error",("Got error: %d on write",my_errno));
@@ -193,10 +193,12 @@ int maria_write(MARIA_HA *info, uchar *r
             below doesn't work for them.
             Also, filter out non-thread maria use, and table modified in
             the same transaction.
+            Finally, filter out non-dup-unique errors.
           */
           if (!local_lock_tree)
             goto err;
-          if (info->dup_key_trid == info->trn->trid)
+          if (info->dup_key_trid == info->trn->trid ||
+              my_errno != HA_ERR_FOUND_DUPP_KEY)
           {
 	    rw_unlock(&keyinfo->root_lock);
             goto err;
@@ -257,6 +259,9 @@ int maria_write(MARIA_HA *info, uchar *r
             }
           }
           rw_wrlock(&keyinfo->root_lock);
+#ifndef MARIA_CANNOT_ROLLBACK
+          keyinfo->version++;
+#endif
         }
       }
 
@@ -671,12 +676,14 @@ static int w_search(register MARIA_HA *i
         When the index will support true versioning - with multiple
         identical values in the UNIQUE index, invisible to each other -
         the following should be changed to "continue inserting keys, at the
-        end (of the row or statement) wait". Until it's done we cannot properly
-        support deadlock timeouts.
+        end (of the row or statement) wait". We need to wait on *all*
+        unique conflicts at once, not one-at-a-time, because we need to
+        know all blockers in advance, otherwise we'll have incomplete wait-for
+        graph.
       */
       /*
-        transaction that has inserted the conflicting key is in progress.
-        wait for it to be committed or aborted.
+        transaction that has inserted the conflicting key may be in progress.
+        the caller will wait for it to be committed or aborted.
       */
       info->dup_key_trid= _ma_trid_from_key(&tmp_key);
       info->dup_key_pos= dup_key_pos;

=== modified file 'storage/maria/trnman.c'
--- a/storage/maria/trnman.c	2008-12-22 00:17:37 +0000
+++ b/storage/maria/trnman.c	2009-01-16 16:18:17 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -42,7 +42,7 @@ static TrID global_trid_generator;
   The default value is used when transaction manager not initialize;
   Probably called from maria_chk
 */
-static TrID trid_min_read_from= ~(TrID) 0;
+static TrID trid_min_read_from= MAX_TRID;
 
 /* the mutex for everything above */
 static pthread_mutex_t LOCK_trn_list;
@@ -59,6 +59,7 @@ static TRN **short_trid_to_active_trn;
 /* locks for short_trid_to_active_trn and pool */
 static my_atomic_rwlock_t LOCK_short_trid_to_trn, LOCK_pool;
 static my_bool default_trnman_end_trans_hook(TRN *, my_bool, my_bool);
+static void trnman_free_trn(TRN *);
 
 my_bool (*trnman_end_trans_hook)(TRN *, my_bool, my_bool)=
   default_trnman_end_trans_hook;
@@ -88,6 +89,19 @@ void trnman_reset_locked_tables(TRN *trn
   trn->locked_tables= locked_tables;
 }
 
+#ifdef EXTRA_DEBUG
+uint16 trnman_get_flags(TRN *trn)
+{
+  return trn->flags;
+}
+
+void trnman_set_flags(TRN *trn, uint16 flags)
+{
+  trn->flags= flags;
+}
+#endif
+
+/** Wake up threads waiting for this transaction */
 static void wt_thd_release_self(TRN *trn)
 {
   if (trn->wt)
@@ -149,12 +163,12 @@ int trnman_init(TrID initial_trid)
   */
 
   active_list_max.trid= active_list_min.trid= 0;
-  active_list_max.min_read_from= ~(TrID) 0;
+  active_list_max.min_read_from= MAX_TRID;
   active_list_max.next= active_list_min.prev= 0;
   active_list_max.prev= &active_list_min;
   active_list_min.next= &active_list_max;
 
-  committed_list_max.commit_trid= ~(TrID) 0;
+  committed_list_max.commit_trid= MAX_TRID;
   committed_list_max.next= committed_list_min.prev= 0;
   committed_list_max.prev= &committed_list_min;
   committed_list_min.next= &committed_list_max;
@@ -198,6 +212,7 @@ void trnman_destroy()
   {
     TRN *trn= pool;
     pool= pool->next;
+    DBUG_ASSERT(trn->wt == NULL);
     pthread_mutex_destroy(&trn->state_lock);
     my_free((void *)trn, MYF(0));
   }
@@ -251,10 +266,12 @@ static uint get_short_trid(TRN *trn)
   return res;
 }
 
-/*
-  DESCRIPTION
-    start a new transaction, allocate and initialize transaction object
-    mutex and cond will be used for lock waits
+/**
+  Allocates and initializes a new TRN object
+
+  @note the 'wt' parameter can only be 0 in single-threaded code (or,
+  generally, where threads cannot block each other), otherwise the
+  first call to the deadlock detector will sigsegv.
 */
 
 TRN *trnman_new_trn(WT_THD *wt)
@@ -338,7 +355,8 @@ TRN *trnman_new_trn(WT_THD *wt)
     trn->min_read_from= trn->trid + 1;
   }
 
-  trn->commit_trid=  ~(TrID)0;
+  /* no other transaction can read changes done by this one */
+  trn->commit_trid=  MAX_TRID;
   trn->rec_lsn= trn->undo_lsn= trn->first_undo_lsn= 0;
   trn->used_tables= 0;
 
@@ -394,6 +412,7 @@ my_bool trnman_end_trn(TRN *trn, my_bool
 
   /* if a rollback, all UNDO records should have been executed */
   DBUG_ASSERT(commit || trn->undo_lsn == 0);
+  DBUG_ASSERT(trn != &dummy_transaction_object);
   DBUG_PRINT("info", ("pthread_mutex_lock LOCK_trn_list"));
 
   pthread_mutex_lock(&LOCK_trn_list);
@@ -429,7 +448,8 @@ my_bool trnman_end_trn(TRN *trn, my_bool
   }
 
   pthread_mutex_lock(&trn->state_lock);
-  trn->commit_trid= global_trid_generator;
+  if (commit)
+    trn->commit_trid= global_trid_generator;
   wt_thd_release_self(trn);
   pthread_mutex_unlock(&trn->state_lock);
 
@@ -502,7 +522,7 @@ my_bool trnman_end_trn(TRN *trn, my_bool
   running. It may even be called automatically on checkpoints if no
   transactions are running.
 */
-void trnman_free_trn(TRN *trn)
+static void trnman_free_trn(TRN *trn)
 {
   /*
      union is to solve strict aliasing issue.
@@ -580,6 +600,16 @@ int trnman_can_read_from(TRN *trn, TrID 
   return can;
 }
 
+/**
+  Finds a TRN by its TrID
+
+  @param trn    current trn. Needed for pinning pointers (see lf_pin)
+  @param trid   trid to search for
+
+  @return found trn or 0
+
+  @note that trn is returned with its state locked!
+*/
 TRN *trnman_trid_to_trn(TRN *trn, TrID trid)
 {
   TRN **found;
@@ -604,7 +634,7 @@ TRN *trnman_trid_to_trn(TRN *trn, TrID t
   lf_hash_search_unpin(trn->pins);
 
   /* Gotcha! */
-  return *found; /* note that TRN is returned locked !!! */
+  return *found;
 }
 
 /* TODO: the stubs below are waiting for savepoints to be implemented */

=== modified file 'storage/maria/trnman.h'
--- a/storage/maria/trnman.h	2008-08-07 20:57:25 +0000
+++ b/storage/maria/trnman.h	2009-01-16 16:18:17 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -22,7 +22,7 @@ C_MODE_START
 #include "trnman_public.h"
 #include "ma_loghandler_lsn.h"
 
-/*
+/**
   trid - 6 uchar transaction identifier. Assigned when a transaction
   is created. Transaction can always be identified by its trid,
   even after transaction has ended.
@@ -33,7 +33,7 @@ C_MODE_START
   when short_id is 0, TRN is not initialized, for all practical purposes
   it could be considered unused.
 
-  when commit_trid is ~(TrID)0 the transaction is running, otherwise it's
+  when commit_trid is MAX_TRID the transaction is running, otherwise it's
   committed.
 
   state_lock mutex protects the state of a TRN, that is whether a TRN
@@ -46,16 +46,18 @@ struct st_ma_transaction
   LF_PINS              *pins;
   WT_THD               *wt;
   pthread_mutex_t      state_lock;
-  void                 *used_tables;  /* Tables used by transaction */
+  void                 *used_tables;  /**< Tables used by transaction */
   TRN                  *next, *prev;
   TrID                 trid, min_read_from, commit_trid;
   LSN		       rec_lsn, undo_lsn;
   LSN_WITH_FLAGS       first_undo_lsn;
   uint                 locked_tables;
   uint16               short_id;
+  uint16               flags;         /**< Various flags */
 };
 
 #define TRANSACTION_LOGGED_LONG_ID ULL(0x8000000000000000)
+#define MAX_TRID (~(TrID)0)
 
 extern WT_RESOURCE_TYPE ma_rc_dup_unique;
 

=== modified file 'storage/maria/trnman_public.h'
--- a/storage/maria/trnman_public.h	2008-12-08 20:09:59 +0000
+++ b/storage/maria/trnman_public.h	2009-01-16 16:18:17 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -44,7 +44,6 @@ my_bool trnman_end_trn(TRN *trn, my_bool
 #define trnman_commit_trn(T) trnman_end_trn(T, TRUE)
 #define trnman_abort_trn(T)  trnman_end_trn(T, FALSE)
 #define trnman_rollback_trn(T)  trnman_end_trn(T, FALSE)
-void trnman_free_trn(TRN *trn);
 int trnman_can_read_from(TRN *trn, TrID trid);
 TRN *trnman_trid_to_trn(TRN *trn, TrID trid);
 void trnman_new_statement(TRN *trn);
@@ -70,5 +69,17 @@ my_bool trnman_exists_active_transaction
 void trnman_lock();
 void trnman_unlock();
 my_bool trman_is_inited();
+#ifdef EXTRA_DEBUG
+uint16 trnman_get_flags(TRN *);
+void trnman_set_flags(TRN *, uint16 flags);
+#else
+#define trnman_get_flags(A) 0
+#define trnman_set_flags(A, B) do { } while (0)
+#endif
+
+/* Flag bits */
+#define TRN_STATE_INFO_LOGGED       1  /* Query is logged */
+#define TRN_STATE_TABLES_CAN_CHANGE 2  /* Things can change during trans. */
+
 C_MODE_END
 #endif

=== modified file 'storage/maria/unittest/trnman-t.c'
--- a/storage/maria/unittest/trnman-t.c	2008-08-30 21:32:27 +0000
+++ b/storage/maria/unittest/trnman-t.c	2009-01-15 21:27:36 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -40,7 +40,8 @@ pthread_handler_t test_trnman(void *arg)
   TRN    *trn[MAX_ITER];
   int    m= (*(int *)arg);
 
-  my_thread_init();
+  if (my_thread_init())
+    BAIL_OUT("my_thread_init failed!");
 
   for (x= ((int)(intptr)(&m)); m > 0; )
   {

=== modified file 'storage/myisam/mi_create.c'
--- a/storage/myisam/mi_create.c	2008-10-10 15:28:41 +0000
+++ b/storage/myisam/mi_create.c	2009-01-15 22:25:53 +0000
@@ -496,7 +496,8 @@ int mi_create(const char *name,uint keys
   /* There are only 16 bits for the total header length. */
   if (info_length > 65535)
   {
-    my_printf_error(0, "MyISAM table '%s' has too many columns and/or "
+    my_printf_error(HA_WRONG_CREATE_OPTION,
+                    "MyISAM table '%s' has too many columns and/or "
                     "indexes and/or unique constraints.",
                     MYF(0), name + dirname_length(name));
     my_errno= HA_WRONG_CREATE_OPTION;

=== modified file 'unittest/mysys/lf-t.c'
--- a/unittest/mysys/lf-t.c	2008-07-29 14:10:24 +0000
+++ b/unittest/mysys/lf-t.c	2009-01-15 21:27:36 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -13,6 +13,12 @@
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
 
+/**
+  @file
+
+  Unit tests for lock-free algorithms of mysys
+*/
+
 #include "thr_template.c"
 
 #include <lf.h>
@@ -47,6 +53,10 @@ pthread_handler_t test_lf_pinbox(void *a
   return 0;
 }
 
+/*
+  thread local data area, allocated using lf_alloc.
+  union is required to enforce the minimum required element size (sizeof(ptr))
+*/
 typedef union {
   int32 data;
   void *not_used;

=== modified file 'unittest/mysys/my_atomic-t.c'
--- a/unittest/mysys/my_atomic-t.c	2008-07-29 14:10:24 +0000
+++ b/unittest/mysys/my_atomic-t.c	2009-01-15 21:27:36 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2006-2008 MySQL AB, 2008 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by

=== modified file 'unittest/mysys/thr_template.c'
--- a/unittest/mysys/thr_template.c	2008-08-29 19:50:04 +0000
+++ b/unittest/mysys/thr_template.c	2009-01-15 21:27:36 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2006-2008 MySQL AB, 2008 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by

=== modified file 'unittest/mysys/waiting_threads-t.c'
--- a/unittest/mysys/waiting_threads-t.c	2008-10-21 19:31:14 +0000
+++ b/unittest/mysys/waiting_threads-t.c	2009-01-15 21:27:36 +0000
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -16,7 +16,6 @@
 #include "thr_template.c"
 #include <waiting_threads.h>
 #include <m_string.h>
-#include <locale.h>
 
 struct test_wt_thd {
   WT_THD thd;
@@ -31,6 +30,7 @@ ulong wt_timeout_long=10000, wt_deadlock
 
 #define reset(ARRAY) bzero(ARRAY, sizeof(ARRAY))
 
+/* see explanation of the kill strategies in waiting_threads.h */
 enum { LATEST, RANDOM, YOUNGEST, LOCKS } kill_strategy;
 
 WT_RESOURCE_TYPE restype={ wt_resource_id_memcmp, 0};
@@ -68,13 +68,14 @@ pthread_handler_t test_wt(void *arg)
 
     res= 0;
 
+    /* prepare for waiting for a random number of random threads */
     for (j= n= (rnd() % THREADS)/10; !res && j >= 0; j--)
     {
 retry:
-      i= rnd() % (THREADS-1);
-      if (i >= id) i++;
+      i= rnd() % (THREADS-1); /* pick a random thread */
+      if (i >= id) i++;   /* with a number from 0 to THREADS-1 excluding ours */
 
-      for (k=n; k >=j; k--)
+      for (k=n; k >=j; k--) /* the one we didn't pick before */
         if (blockers[k] == i)
           goto retry;
       blockers[j]= i;
@@ -121,7 +122,7 @@ retry:
 #define DEL "(deleted)"
     char *x=malloc(strlen(thds[id].thd.name)+sizeof(DEL)+1);
     strxmov(x, thds[id].thd.name, DEL, 0);
-    thds[id].thd.name=x; /* it's a memory leak, go on, shoot me */
+    thds[id].thd.name=x;
   }
 #endif
 
@@ -165,8 +166,8 @@ void do_one_test()
 
 void do_tests()
 {
-  plan(12);
-  compile_time_assert(THREADS >= 3);
+  plan(14);
+  compile_time_assert(THREADS >= 4);
 
   DBUG_PRINT("wt", ("================= initialization ==================="));
 
@@ -206,22 +207,22 @@ void do_tests()
     pthread_mutex_lock(&lock);
     bad= wt_thd_cond_timedwait(& thds[0].thd, &lock);
     pthread_mutex_unlock(&lock);
-    ok(bad == ETIMEDOUT, "timeout test returned %d", bad);
+    ok(bad == WT_TIMEOUT, "timeout test returned %d", bad);
 
     ok_wait(0,1,0);
     ok_wait(1,2,1);
     ok_deadlock(2,0,2);
 
     pthread_mutex_lock(&lock);
-    wt_thd_cond_timedwait(& thds[0].thd, &lock);
-    wt_thd_cond_timedwait(& thds[1].thd, &lock);
+    ok(wt_thd_cond_timedwait(& thds[0].thd, &lock) == WT_TIMEOUT, "as always");
+    ok(wt_thd_cond_timedwait(& thds[1].thd, &lock) == WT_TIMEOUT, "as always");
     wt_thd_release_all(& thds[0].thd);
     wt_thd_release_all(& thds[1].thd);
     wt_thd_release_all(& thds[2].thd);
     wt_thd_release_all(& thds[3].thd);
     pthread_mutex_unlock(&lock);
 
-    for (cnt=0; cnt < 3; cnt++)
+    for (cnt=0; cnt < 4; cnt++)
     {
       wt_thd_destroy(& thds[cnt].thd);
       wt_thd_lazy_init(& thds[cnt].thd,
@@ -261,6 +262,7 @@ void do_tests()
     wt_thd_release_all(& thds[cnt].thd);
     wt_thd_destroy(& thds[cnt].thd);
     pthread_mutex_destroy(& thds[cnt].lock);
+    free(thds[cnt].thd.name);
   }
   pthread_mutex_unlock(&lock);
   wt_end();

Thread
bzr commit into MySQL/Maria:mysql-5.1-maria-2.0 branch (guilhem:2720) Guilhem Bichot19 Jan