List:Commits« Previous MessageNext Message »
From:knielsen Date:March 4 2008 8:39am
Subject:bk commit into 6.0 tree (knielsen:1.2550) WL#2771
View as plain text  
Below is the list of changes that have just been committed into a local
6.0 repository of knielsen.  When knielsen does a push these changes
will be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet@stripped, 2008-03-04 09:39:33+01:00, knielsen@ymer.(none) +15 -0
  WL#2771: Usage of multi_read_range in nested loop join
  
  Re-commit changes from mysql-5.2-wl2771-prototype-r2 as a single patch,
  in preparation for releasing in mysql-6.0-telco alpha.
  
  Exclude execute_count and scan_fetches parts, as I think those are
  incomplete and/or already included in -telco.

  sql/ha_ndbcluster.cc@stripped, 2008-03-04 09:39:24+01:00, knielsen@ymer.(none) +7 -0
    WL#2771: Usage of multi_read_range in nested loop join
    
    Re-commit changes from mysql-5.2-wl2771-prototype-r2 as a single patch,
    in preparation for releasing in mysql-6.0-telco alpha.
    
    Exclude execute_count and scan_fetches parts, as I think those are
    incomplete and/or already included in -telco.

  sql/ha_ndbcluster.h@stripped, 2008-03-04 09:39:25+01:00, knielsen@ymer.(none) +3 -0
    WL#2771: Usage of multi_read_range in nested loop join
    
    Re-commit changes from mysql-5.2-wl2771-prototype-r2 as a single patch,
    in preparation for releasing in mysql-6.0-telco alpha.
    
    Exclude execute_count and scan_fetches parts, as I think those are
    incomplete and/or already included in -telco.

  sql/handler.cc@stripped, 2008-03-04 09:39:25+01:00, knielsen@ymer.(none) +45 -33
    WL#2771: Usage of multi_read_range in nested loop join
    
    Re-commit changes from mysql-5.2-wl2771-prototype-r2 as a single patch,
    in preparation for releasing in mysql-6.0-telco alpha.
    
    Exclude execute_count and scan_fetches parts, as I think those are
    incomplete and/or already included in -telco.

  sql/handler.h@stripped, 2008-03-04 09:39:25+01:00, knielsen@ymer.(none) +45 -0
    WL#2771: Usage of multi_read_range in nested loop join
    
    Re-commit changes from mysql-5.2-wl2771-prototype-r2 as a single patch,
    in preparation for releasing in mysql-6.0-telco alpha.
    
    Exclude execute_count and scan_fetches parts, as I think those are
    incomplete and/or already included in -telco.

  sql/item.cc@stripped, 2008-03-04 09:39:25+01:00, knielsen@ymer.(none) +11 -0
    WL#2771: Usage of multi_read_range in nested loop join
    
    Re-commit changes from mysql-5.2-wl2771-prototype-r2 as a single patch,
    in preparation for releasing in mysql-6.0-telco alpha.
    
    Exclude execute_count and scan_fetches parts, as I think those are
    incomplete and/or already included in -telco.

  sql/item.h@stripped, 2008-03-04 09:39:25+01:00, knielsen@ymer.(none) +2 -0
    WL#2771: Usage of multi_read_range in nested loop join
    
    Re-commit changes from mysql-5.2-wl2771-prototype-r2 as a single patch,
    in preparation for releasing in mysql-6.0-telco alpha.
    
    Exclude execute_count and scan_fetches parts, as I think those are
    incomplete and/or already included in -telco.

  sql/mysql_priv.h@stripped, 2008-03-04 09:39:25+01:00, knielsen@ymer.(none) +1 -0
    WL#2771: Usage of multi_read_range in nested loop join
    
    Re-commit changes from mysql-5.2-wl2771-prototype-r2 as a single patch,
    in preparation for releasing in mysql-6.0-telco alpha.
    
    Exclude execute_count and scan_fetches parts, as I think those are
    incomplete and/or already included in -telco.

  sql/mysqld.cc@stripped, 2008-03-04 09:39:26+01:00, knielsen@ymer.(none) +9 -2
    WL#2771: Usage of multi_read_range in nested loop join
    
    Re-commit changes from mysql-5.2-wl2771-prototype-r2 as a single patch,
    in preparation for releasing in mysql-6.0-telco alpha.
    
    Exclude execute_count and scan_fetches parts, as I think those are
    incomplete and/or already included in -telco.

  sql/opt_range.cc@stripped, 2008-03-04 09:39:26+01:00, knielsen@ymer.(none) +17 -2
    WL#2771: Usage of multi_read_range in nested loop join
    
    Re-commit changes from mysql-5.2-wl2771-prototype-r2 as a single patch,
    in preparation for releasing in mysql-6.0-telco alpha.
    
    Exclude execute_count and scan_fetches parts, as I think those are
    incomplete and/or already included in -telco.

  sql/opt_range.h@stripped, 2008-03-04 09:39:26+01:00, knielsen@ymer.(none) +3 -0
    WL#2771: Usage of multi_read_range in nested loop join
    
    Re-commit changes from mysql-5.2-wl2771-prototype-r2 as a single patch,
    in preparation for releasing in mysql-6.0-telco alpha.
    
    Exclude execute_count and scan_fetches parts, as I think those are
    incomplete and/or already included in -telco.

  sql/sql_class.h@stripped, 2008-03-04 09:39:26+01:00, knielsen@ymer.(none) +38 -0
    WL#2771: Usage of multi_read_range in nested loop join
    
    Re-commit changes from mysql-5.2-wl2771-prototype-r2 as a single patch,
    in preparation for releasing in mysql-6.0-telco alpha.
    
    Exclude execute_count and scan_fetches parts, as I think those are
    incomplete and/or already included in -telco.

  sql/sql_select.cc@stripped, 2008-03-04 09:39:26+01:00, knielsen@ymer.(none) +1084 -228
    WL#2771: Usage of multi_read_range in nested loop join
    
    Re-commit changes from mysql-5.2-wl2771-prototype-r2 as a single patch,
    in preparation for releasing in mysql-6.0-telco alpha.
    
    Exclude execute_count and scan_fetches parts, as I think those are
    incomplete and/or already included in -telco.

  sql/sql_select.h@stripped, 2008-03-04 09:39:27+01:00, knielsen@ymer.(none) +308 -47
    WL#2771: Usage of multi_read_range in nested loop join
    
    Re-commit changes from mysql-5.2-wl2771-prototype-r2 as a single patch,
    in preparation for releasing in mysql-6.0-telco alpha.
    
    Exclude execute_count and scan_fetches parts, as I think those are
    incomplete and/or already included in -telco.

  storage/innobase/handler/ha_innodb.cc@stripped, 2008-03-04 09:39:27+01:00, knielsen@ymer.(none) +1 -0
    WL#2771: Usage of multi_read_range in nested loop join
    
    Re-commit changes from mysql-5.2-wl2771-prototype-r2 as a single patch,
    in preparation for releasing in mysql-6.0-telco alpha.
    
    Exclude execute_count and scan_fetches parts, as I think those are
    incomplete and/or already included in -telco.

  storage/myisam/ha_myisam.cc@stripped, 2008-03-04 09:39:27+01:00, knielsen@ymer.(none) +2 -0
    WL#2771: Usage of multi_read_range in nested loop join
    
    Re-commit changes from mysql-5.2-wl2771-prototype-r2 as a single patch,
    in preparation for releasing in mysql-6.0-telco alpha.
    
    Exclude execute_count and scan_fetches parts, as I think those are
    incomplete and/or already included in -telco.

diff -Nrup a/sql/ha_ndbcluster.cc b/sql/ha_ndbcluster.cc
--- a/sql/ha_ndbcluster.cc	2008-02-26 17:23:46 +01:00
+++ b/sql/ha_ndbcluster.cc	2008-03-04 09:39:24 +01:00
@@ -4009,6 +4009,7 @@ int ha_ndbcluster::info(uint flag)
     if (!thd)
       thd= current_thd;
     DBUG_PRINT("info", ("HA_STATUS_VARIABLE"));
+    stats.mrr_length_per_rec= table_share->reclength + 2*sizeof(void*) + sizeof(uint16);
     if ((flag & HA_STATUS_NO_LOCK) &&
         !thd->variables.ndb_use_exact_count)
     {
@@ -9245,6 +9246,12 @@ int ha_ndbcluster::multi_range_read_init
   mrr_funcs= *seq_funcs;
   mrr_iter= mrr_funcs.init(seq_init_param, n_ranges, mode);
   ranges_in_seq= n_ranges;
+  
+  mrr_need_range_assoc = !test(mode & HA_MRR_NO_ASSOCIATION);
+  if (mrr_need_range_assoc)
+  {
+    ha_statistic_increment(&SSV::ha_multi_range_read_init_count);
+  }
 
   /*
     We do not start fetching here with execute(), rather we defer this to the
diff -Nrup a/sql/ha_ndbcluster.h b/sql/ha_ndbcluster.h
--- a/sql/ha_ndbcluster.h	2008-02-26 17:20:29 +01:00
+++ b/sql/ha_ndbcluster.h	2008-03-04 09:39:25 +01:00
@@ -337,6 +337,9 @@ private:
   uint first_running_range;
   uint first_range_in_batch;
   uint first_unstarted_range;
+  /* TRUE <=> need range association */
+  bool mrr_need_range_assoc;
+
   int multi_range_start_retrievals(uint first_range);
 public:
 
diff -Nrup a/sql/handler.cc b/sql/handler.cc
--- a/sql/handler.cc	2008-02-11 17:05:51 +01:00
+++ b/sql/handler.cc	2008-03-04 09:39:25 +01:00
@@ -3842,8 +3842,14 @@ int DsMrr_impl::dsmrr_init(handler *h, K
   rowids_buf += key->key_length + h->ref_length;
 
   is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
+  semi_join= test(mode & HA_MRR_SEMI_JOIN);
+  DBUG_ASSERT(!semi_join || is_mrr_assoc);
+
+  if (is_mrr_assoc)
+    h->ha_statistic_increment(&SSV::ha_multi_range_read_init_count);
+ 
+
   rowids_buf_end= buf->buffer_end;
-  
   elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
   rowids_buf_last= rowids_buf + 
                       ((rowids_buf_end - rowids_buf)/ elem_size)*
@@ -3954,15 +3960,18 @@ int DsMrr_impl::dsmrr_fill_buffer(handle
   while ((rowids_buf_cur < rowids_buf_end) && 
          !(res= h->handler::multi_range_read_next(&range_info)))
   {
-    /* Put rowid, or {rowid, range_id} pair into the buffer */
-    h->position(h->table->record[0]);
-    memcpy(rowids_buf_cur, h->ref, h->ref_length);
-    rowids_buf_cur += h->ref_length;
-
-    if (is_mrr_assoc)
+    if (!semi_join || *range_info==0)
     {
-      memcpy(rowids_buf_cur, &range_info, sizeof(void*));
-      rowids_buf_cur += sizeof(void*);
+      /* Put rowid, or {rowid, range_id} pair into the buffer */
+      h->position(h->table->record[0]);
+      memcpy(rowids_buf_cur, h->ref, h->ref_length);
+      rowids_buf_cur += h->ref_length;
+
+      if (is_mrr_assoc)
+      {
+        memcpy(rowids_buf_cur, &range_info, sizeof(void*));
+        rowids_buf_cur += sizeof(void*);
+      }
     }
   }
 
@@ -3989,40 +3998,43 @@ int DsMrr_impl::dsmrr_fill_buffer(handle
 int DsMrr_impl::dsmrr_next(handler *h, char **range_info)
 {
   int res;
+  uchar *cur_rowid;
   
   if (use_default_impl)
     return h->handler::multi_range_read_next(range_info);
     
-  if (rowids_buf_cur == rowids_buf_last)
+  do
   {
-    h->table->column_bitmaps_set_no_signal(save_read_set, save_write_set);
-    if (dsmrr_eof)
+    if (rowids_buf_cur == rowids_buf_last)
+    {
+      h->table->column_bitmaps_set_no_signal(save_read_set, save_write_set);
+      if (dsmrr_eof)
+      {
+        res= HA_ERR_END_OF_FILE;
+        goto end;
+      }
+
+      res= dsmrr_fill_buffer(h);
+      h->table->column_bitmaps_set_no_signal(&row_access_bitmap,
+                                             &row_access_bitmap);
+      if (res)
+        goto end;
+    }
+    
+    /* Return EOF if there are no rowids in the buffer after re-fill attempt */
+    if (rowids_buf_cur == rowids_buf_last)
     {
       res= HA_ERR_END_OF_FILE;
       goto end;
     }
-
-    res= dsmrr_fill_buffer(h);
-    h->table->column_bitmaps_set_no_signal(&row_access_bitmap,
-                                           &row_access_bitmap);
-    if (res)
-      goto end;
-  }
-  
-  /* Return EOF if there are no rowids in the buffer after re-fill attempt */
-  if (rowids_buf_cur == rowids_buf_last)
-  {
-    res= HA_ERR_END_OF_FILE;
-    goto end;
-  }
-
-  res= h2->rnd_pos(h->table->record[0], rowids_buf_cur);
-  rowids_buf_cur += h->ref_length;
+    cur_rowid= rowids_buf_cur;
+    rowids_buf_cur += h->ref_length + sizeof(void*) * test(is_mrr_assoc);
+     
+  } while (semi_join && *(cur_rowid + h->ref_length)==0);
+ 
+  res= h2->rnd_pos(h->table->record[0], cur_rowid);
   if (is_mrr_assoc)
-  {
-    memcpy(range_info, rowids_buf_cur, sizeof(void*));
-    rowids_buf_cur += sizeof(void*);
-  }
+    memcpy(range_info, cur_rowid + h->ref_length, sizeof(void*));
 
 end:
   if (res)
diff -Nrup a/sql/handler.h b/sql/handler.h
--- a/sql/handler.h	2008-02-20 08:43:57 +01:00
+++ b/sql/handler.h	2008-03-04 09:39:25 +01:00
@@ -1004,6 +1004,30 @@ typedef struct st_range_seq_if
       1 - No more ranges
   */
   uint (*next) (range_seq_t seq, KEY_MULTI_RANGE *range);
+  /* 
+    The following two fields may have been added temporarrily
+    to neutralize some additional assumptions made by the current NDB code.
+  */
+ 
+  /*
+    Save current iterator position.
+    
+    NOTE
+      The call may be made before the first (*next)() call. 
+      The call may not be made after the (*next) call returned 1.
+    RETURN
+      An opaque value representing current position of the iterator. 
+      
+  */
+  void (*save_current_pos)(range_seq_t seq, uint *range_no, void **pos);
+
+  /*
+    Restore iterator position
+
+    restore_pos()
+      pos  Value previously obtained from save_current_pos call.
+  */
+  void  (*restore_pos)(range_seq_t seq,uint range_no, void *pos);
 } RANGE_SEQ_IF;
 
 class COST_VECT
@@ -1104,6 +1128,16 @@ void get_sweep_read_cost(TABLE *table, h
 */
 #define HA_MRR_NO_NULL_ENDPOINTS 128
 
+/*
+  Flag set <=> We're running a first-match semi-join, and the accessed table
+  is the inner table.
+  MRR implementation may (but doesn't have to) interpret KEY_MULTI_RANGE::ptr
+  as char* need_no_more_matches, if "*need_no_more_matches=1" then the SQL
+  layer does not need any more records from this range.
+
+  HA_MRR_SEMI_JOIN cannot be used together with HA_MRR_NO_ASSOCIATION.
+*/
+#define HA_MRR_SEMI_JOIN 256
 
 class ha_statistics
 {
@@ -1129,6 +1163,11 @@ public:
   time_t check_time;
   time_t update_time;
   uint block_size;			/* index block size */
+  
+  /*
+    Number of buffer bytes that native MRR implementation needs,
+  */
+  uint mrr_length_per_rec; 
 
   ha_statistics():
     data_file_length(0), max_data_file_length(0),
@@ -2231,6 +2270,12 @@ public:
 
   /* TRUE <=> need range association, buffer holds {rowid, range_id} pairs */
   bool is_mrr_assoc; 
+
+  /*
+    TRUE <=> doing a first-match semi-join, check if we need any more records in
+    the scanned range
+  */
+  bool semi_join;
 
   handler *h; /* Owner table handler */
   handler *h2; /* Slave handler for doing rnd_pos(). */
diff -Nrup a/sql/item.cc b/sql/item.cc
--- a/sql/item.cc	2008-02-08 13:52:31 +01:00
+++ b/sql/item.cc	2008-03-04 09:39:25 +01:00
@@ -631,6 +631,17 @@ bool Item_field::collect_item_field_proc
 }
 
 
+bool Item_field::add_field_to_set_processor(uchar *arg)
+{
+  DBUG_ENTER("Item_field::add_field_to_set_processor");
+  DBUG_PRINT("info", ("%s", field->field_name ? field->field_name : "noname"));
+  TABLE *table= (TABLE *) arg;          /* arg is interpreted as a TABLE* */
+  if (field->table == table)            /* fields of other tables are skipped */
+    bitmap_set_bit(&table->tmp_set, field->field_index); /* mark in tmp_set */
+  DBUG_RETURN(FALSE);                   /* always returns FALSE */
+}
+
+
 /**
   Check if an Item_field references some field from a list of fields.
 
diff -Nrup a/sql/item.h b/sql/item.h
--- a/sql/item.h	2007-12-13 13:56:19 +01:00
+++ b/sql/item.h	2008-03-04 09:39:25 +01:00
@@ -890,6 +890,7 @@ public:
   virtual bool remove_fixed(uchar * arg) { fixed= 0; return 0; }
   virtual bool cleanup_processor(uchar *arg);
   virtual bool collect_item_field_processor(uchar * arg) { return 0; }
+  virtual bool add_field_to_set_processor(uchar * arg) { return 0; }
   virtual bool find_item_in_field_list_processor(uchar *arg) { return 0; }
   virtual bool change_context_processor(uchar *context) { return 0; }
   virtual bool reset_query_id_processor(uchar *query_id_arg) { return 0; }
@@ -1484,6 +1485,7 @@ public:
   void update_null_value();
   Item *get_tmp_table_item(THD *thd);
   bool collect_item_field_processor(uchar * arg);
+  bool add_field_to_set_processor(uchar * arg);
   bool find_item_in_field_list_processor(uchar *arg);
   bool register_field_in_read_map(uchar *arg);
   bool check_partition_func_processor(uchar *int_arg) {return FALSE;}
diff -Nrup a/sql/mysql_priv.h b/sql/mysql_priv.h
--- a/sql/mysql_priv.h	2008-02-12 11:30:44 +01:00
+++ b/sql/mysql_priv.h	2008-03-04 09:39:25 +01:00
@@ -543,6 +543,7 @@ enum open_table_mode
 /* @@optimizer_switch flags */
 #define OPTIMIZER_SWITCH_NO_MATERIALIZATION 1
 #define OPTIMIZER_SWITCH_NO_SEMIJOIN 2
+#define OPTIMIZER_SWITCH_NO_BKA 4
 
 
 /*
diff -Nrup a/sql/mysqld.cc b/sql/mysqld.cc
--- a/sql/mysqld.cc	2008-02-19 20:59:45 +01:00
+++ b/sql/mysqld.cc	2008-03-04 09:39:26 +01:00
@@ -302,7 +302,7 @@ TYPELIB sql_mode_typelib= { array_elemen
 
 static const char *optimizer_switch_names[]=
 {
-  "no_materialization", "no_semijoin",
+  "no_materialization", "no_semijoin", "no_bka",
   NullS
 };
 
@@ -310,7 +310,8 @@ static const char *optimizer_switch_name
 static const unsigned int optimizer_switch_names_len[]=
 {
   /*no_materialization*/          19,
-  /*no_semijoin*/                 11
+  /*no_semijoin*/                 11,
+  /*no_bka*/                       6
 };
 
 TYPELIB optimizer_switch_typelib= { array_elements(optimizer_switch_names)-1,"",
@@ -6279,8 +6280,13 @@ log and this option does nothing anymore
    "The size of the buffer that is used for full joins.",
    (uchar**) &global_system_variables.join_buff_size,
    (uchar**) &max_system_variables.join_buff_size, 0, GET_ULONG,
+#if 0
    REQUIRED_ARG, 128*1024L, IO_SIZE*2+MALLOC_OVERHEAD, ULONG_MAX,
    MALLOC_OVERHEAD, IO_SIZE, 0},
+#else
+   REQUIRED_ARG, 128*1024L, 64+MALLOC_OVERHEAD, ULONG_MAX,
+   MALLOC_OVERHEAD, 64, 0},
+#endif
   {"keep_files_on_create", OPT_KEEP_FILES_ON_CREATE,
    "Don't overwrite stale .MYD and .MYI even if no directory is specified.",
    (uchar**) &global_system_variables.keep_files_on_create,
@@ -7119,6 +7125,7 @@ SHOW_VAR status_vars[]= {
   {"Handler_commit",           (char*) offsetof(STATUS_VAR, ha_commit_count), SHOW_LONG_STATUS},
   {"Handler_delete",           (char*) offsetof(STATUS_VAR, ha_delete_count), SHOW_LONG_STATUS},
   {"Handler_discover",         (char*) offsetof(STATUS_VAR, ha_discover_count), SHOW_LONG_STATUS},
+  {"Handler_mrr_init",         (char*) offsetof(STATUS_VAR, ha_multi_range_read_init_count),  SHOW_LONG_STATUS},
   {"Handler_prepare",          (char*) offsetof(STATUS_VAR, ha_prepare_count),  SHOW_LONG_STATUS},
   {"Handler_read_first",       (char*) offsetof(STATUS_VAR, ha_read_first_count), SHOW_LONG_STATUS},
   {"Handler_read_key",         (char*) offsetof(STATUS_VAR, ha_read_key_count), SHOW_LONG_STATUS},
diff -Nrup a/sql/opt_range.cc b/sql/opt_range.cc
--- a/sql/opt_range.cc	2008-02-20 08:43:57 +01:00
+++ b/sql/opt_range.cc	2008-03-04 09:39:26 +01:00
@@ -7416,7 +7416,8 @@ ha_rows check_quick_select(PARAM *param,
                            uint *mrr_flags, uint *bufsize, COST_VECT *cost)
 {
   SEL_ARG_RANGE_SEQ seq;
-  RANGE_SEQ_IF seq_if = {sel_arg_range_seq_init, sel_arg_range_seq_next};
+  RANGE_SEQ_IF seq_if = {sel_arg_range_seq_init, sel_arg_range_seq_next,
+                         NULL, NULL};
   handler *file= param->table->file;
   ha_rows rows;
   uint keynr= param->real_keynr[idx];
@@ -8330,7 +8331,9 @@ int QUICK_RANGE_SELECT::reset()
  
   if (sorted)
      mrr_flags |= HA_MRR_SORTED;
-  RANGE_SEQ_IF seq_funcs= {quick_range_seq_init, quick_range_seq_next};
+  RANGE_SEQ_IF seq_funcs= {quick_range_seq_init, quick_range_seq_next,
+                           quick_range_seq_save_current_pos, 
+                           quick_range_seq_restore_pos};
   error= file->multi_range_read_init(&seq_funcs, (void*)this, ranges.elements,
                                      mrr_flags, mrr_buf_desc? mrr_buf_desc: 
                                                               &empty_buf);
@@ -8406,6 +8409,18 @@ uint quick_range_seq_next(range_seq_t rs
   return 0;
 }
 
+void quick_range_seq_save_current_pos(range_seq_t rseq,
+                                      uint *range_no, void **pos)
+{
+  QUICK_RANGE_SEQ_CTX *ctx= (QUICK_RANGE_SEQ_CTX*)rseq; /* rseq is the ctx */
+  *pos= ctx->cur; /* NOTE(review): *range_no is never written - confirm OK */
+}
+
+void quick_range_seq_restore_pos(range_seq_t rseq, uint range_no, void *pos)
+{
+  QUICK_RANGE_SEQ_CTX *ctx= (QUICK_RANGE_SEQ_CTX*)rseq; /* rseq is the ctx */
+  ctx->cur = (QUICK_RANGE**)pos; /* range_no is ignored; only pos is used */
+}
 
 /*
   Get next possible record using quick-struct.
diff -Nrup a/sql/opt_range.h b/sql/opt_range.h
--- a/sql/opt_range.h	2007-11-07 13:32:02 +01:00
+++ b/sql/opt_range.h	2008-03-04 09:39:26 +01:00
@@ -280,6 +280,9 @@ typedef struct st_quick_range_seq_ctx
 
 range_seq_t quick_range_seq_init(void *init_param, uint n_ranges, uint flags);
 uint quick_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range);
+void quick_range_seq_save_current_pos(range_seq_t rseq, 
+                                       uint *range_no, void **pos);
+void quick_range_seq_restore_pos(range_seq_t rseq, uint range_no, void *pos);
 
 
 /*
diff -Nrup a/sql/sql_class.h b/sql/sql_class.h
--- a/sql/sql_class.h	2008-02-01 15:08:44 +01:00
+++ b/sql/sql_class.h	2008-03-04 09:39:26 +01:00
@@ -384,6 +384,38 @@ struct system_variables
 
 };
 
+/*
+  Running statistics for a stream of values (Welford's online algorithm).
+  Call init() first, feed values with add_val(), then read get_avg() and
+  get_variance() (population variance; 0 when no values were added).
+*/
+class Stat_value
+{
+  /* number of values added since init() */
+  uint n;
+  /* running mean; 'stddev' holds the sum of squared deviations from it */
+  double mean, stddev;
+public:
+  /* Reset all accumulators; counter_arg is currently unused */
+  void init(ulong *counter_arg)
+  {
+    n= 0;
+    mean= stddev= 0.0;
+  }
+
+  void add_val(double val)
+  {
+    double prev_mean= mean;
+    n++;
+    /* n >= 1 here, so the division is always safe */
+    mean+= (val - prev_mean) / n;
+    stddev+= (val - prev_mean) * (val - mean);
+  }
+
+  double get_avg() { return mean; }
+  double get_variance() { return n ? stddev / n : 0.0; }
+};
+
 
 /* per thread status variables */
 
@@ -404,6 +436,12 @@ typedef struct system_status_var
   ulong ha_read_prev_count;
   ulong ha_read_rnd_count;
   ulong ha_read_rnd_next_count;
+  /*
+    This number doesn't include calls to the default implementation and
+    calls made by range access. The intent is to count only calls made by
+    BatchedKeyAccess.
+  */
+  ulong ha_multi_range_read_init_count;
   ulong ha_rollback_count;
   ulong ha_update_count;
   ulong ha_write_count;
diff -Nrup a/sql/sql_select.cc b/sql/sql_select.cc
--- a/sql/sql_select.cc	2008-02-12 11:30:45 +01:00
+++ b/sql/sql_select.cc	2008-03-04 09:39:26 +01:00
@@ -138,8 +138,6 @@ evaluate_join_record(JOIN *join, JOIN_TA
 static enum_nested_loop_state
 evaluate_null_complemented_join_record(JOIN *join, JOIN_TAB *join_tab);
 static enum_nested_loop_state
-flush_cached_records(JOIN *join, JOIN_TAB *join_tab, bool skip_last);
-static enum_nested_loop_state
 end_send(JOIN *join, JOIN_TAB *join_tab, bool end_of_records);
 static enum_nested_loop_state
 end_write(JOIN *join, JOIN_TAB *join_tab, bool end_of_records);
@@ -173,6 +171,10 @@ int join_read_next_same_or_null(READ_REC
 static COND *make_cond_for_table(COND *cond,table_map table,
 				 table_map used_table,
                                  bool exclude_expensive_cond);
+static COND *make_cond_for_table_from_pred(COND *root_cond, COND *cond,
+                                           table_map tables,
+                                           table_map used_table,
+                                           bool exclude_expensive_cond);
 static Item* part_of_refkey(TABLE *form,Field *field);
 uint find_shortest_key(TABLE *table, const key_map *usable_keys);
 static bool test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,
@@ -193,12 +195,6 @@ static int remove_dup_with_hash_index(TH
 				      uint field_count, Field **first_field,
 
 				      ulong key_length,Item *having);
-static int join_init_cache(THD *thd,JOIN_TAB *tables,uint table_count);
-static ulong used_blob_length(CACHE_FIELD **ptr);
-static bool store_record_in_cache(JOIN_CACHE *cache);
-static void reset_cache_read(JOIN_CACHE *cache);
-static void reset_cache_write(JOIN_CACHE *cache);
-static void read_cached_record(JOIN_TAB *tab);
 static bool cmp_buffer_with_ref(JOIN_TAB *tab);
 static bool setup_new_fields(THD *thd, List<Item> &fields,
 			     List<Item> &all_fields, ORDER *new_order);
@@ -232,7 +228,8 @@ void select_describe(JOIN *join, bool ne
 			    bool distinct, const char *message=NullS);
 static Item *remove_additional_cond(Item* conds);
 static void add_group_and_distinct_keys(JOIN *join, JOIN_TAB *join_tab);
-static bool test_if_ref(Item_field *left_item,Item *right_item);
+static bool test_if_ref(COND *root_cond,
+                        Item_field *left_item,Item *right_item);
 
 /*
   This is used to mark equalities that were made from i-th IN-equality.
@@ -847,7 +844,8 @@ void JOIN::remove_subq_pushed_predicates
       ((Item_func *)this->conds)->functype() == Item_func::EQ_FUNC &&
       ((Item_func *)conds)->arguments()[0]->type() == Item::REF_ITEM &&
       ((Item_func *)conds)->arguments()[1]->type() == Item::FIELD_ITEM &&
-      test_if_ref ((Item_field *)((Item_func *)conds)->arguments()[1],
+      test_if_ref (conds,
+                   (Item_field *)((Item_func *)conds)->arguments()[1],
                    ((Item_func *)conds)->arguments()[0]))
   {
     *where= 0;
@@ -1121,7 +1119,10 @@ int setup_semijoin_dups_elimination(JOIN
     {
       if (i != join->const_tables && !(options & SELECT_NO_JOIN_CACHE) &&
           tab->type == JT_ALL && tab->use_quick != 2 && !tab->first_inner &&
-          i <= no_jbuf_after && !dealing_with_jbuf)
+          i <= no_jbuf_after && !dealing_with_jbuf && 
+          emb_sj_map != tab->table->map)
+          // ^ psergey-add: and there is more than one inner table in the
+          // SJ-nest
       {
         /*
           This table uses join buffering, which makes use of FirstMatch or 
@@ -4074,7 +4075,9 @@ make_join_statistics(JOIN *join, TABLE_L
     add_group_and_distinct_keys(join, s);
 
     if (!s->const_keys.is_clear_all() &&
-        !s->table->pos_in_table_list->embedding)
+        (!s->table->pos_in_table_list->embedding ||
+         s->table->pos_in_table_list->embedding &&
+         s->table->pos_in_table_list->embedding->sj_on_expr))
     {
       ha_rows records;
       SQL_SELECT *select;
@@ -7133,7 +7136,9 @@ make_simple_join(JOIN *join,TABLE *tmp_t
   join->row_limit=join->unit->select_limit_cnt;
   join->do_send_rows = (join->row_limit) ? 1 : 0;
 
-  join_tab->cache.buff=0;			/* No caching */
+  join_tab->use_join_cache= FALSE;
+  join_tab->cache= 0;			        /* No caching */
+  join_tab->cache_select= 0;
   join_tab->table=tmp_table;
   join_tab->select=0;
   join_tab->select_cond=0;
@@ -7524,7 +7529,7 @@ make_join_select(JOIN *join,SQL_SELECT *
 
       }
       if (tmp || !cond || tab->type == JT_REF || tab->type == JT_REF_OR_NULL ||
-          tab->type == JT_EQ_REF)
+          tab->type == JT_EQ_REF || tab->first_inner)
       {
 	DBUG_EXECUTE("where",print_where(tmp,tab->table->alias););
 	SQL_SELECT *sel= tab->select= ((SQL_SELECT*)
@@ -7681,10 +7686,10 @@ make_join_select(JOIN *join,SQL_SELECT *
 					 current_map, 0)))
 	    {
 	      DBUG_EXECUTE("where",print_where(tmp,"cache"););
-	      tab->cache.select=(SQL_SELECT*)
+	      tab->cache_select=(SQL_SELECT*)
 		thd->memdup((uchar*) sel, sizeof(SQL_SELECT));
-	      tab->cache.select->cond=tmp;
-	      tab->cache.select->read_tables=join->const_table_map;
+	      tab->cache_select->cond=tmp;
+	      tab->cache_select->read_tables=join->const_table_map;
 	    }
 	  }
 	}
@@ -7774,6 +7779,8 @@ make_join_select(JOIN *join,SQL_SELECT *
             if (!cond_tab->select_cond)
 	      DBUG_RETURN(1);
             cond_tab->select_cond->quick_fix_field();
+            if (cond_tab->select)
+              cond_tab->select->cond= cond_tab->select_cond; 
           }              
         }
         first_inner_tab= first_inner_tab->first_upper;       
@@ -8162,6 +8169,9 @@ make_join_readinfo(JOIN *join, ulonglong
     JOIN_TAB *tab=join->join_tab+i;
     TABLE *table=tab->table;
     bool using_join_cache;
+    JOIN_TAB *first_inner= tab->first_inner;
+    while (first_inner && first_inner->first_upper)
+      first_inner= first_inner->first_upper;
     tab->read_record.table= table;
     tab->read_record.file=table->file;
     tab->next_select=sub_select;		/* normal select */
@@ -8206,6 +8216,41 @@ make_join_readinfo(JOIN *join, ulonglong
       tab->quick=0;
       tab->read_first_record= join_read_key;
       tab->read_record.read_record= join_no_more_records;
+      using_join_cache= FALSE;
+
+#if 0
+#else
+      {
+        bool do_bka= !test(join->thd->variables.optimizer_switch &
+                           OPTIMIZER_SWITCH_NO_BKA);
+        if (do_bka &&
+            i != join->const_tables && !(options & SELECT_NO_JOIN_CACHE) &&
+            tab->use_quick != 2 &&
+            (!first_inner || tab == first_inner ||
+             first_inner->use_join_cache) && 
+            (!tab->emb_sj_nest ||
+             tab->is_single_inner_of_semi_join_with_first_match()) &&
+            i <= no_jbuf_after)
+        {
+          uint flag= HA_MRR_NO_NULL_ENDPOINTS;
+          COST_VECT cost;
+          uint bufsz= 4096;
+          JOIN_CACHE *prev_cache= (tab-1)->cache ? (tab-1)->cache : 0;
+          table->file->multi_range_read_info(tab->ref.key, 10, 20,
+                                             &bufsz, &flag, &cost);  
+	  if (!(flag & HA_MRR_USE_DEFAULT_IMPL) &&
+              ((options & SELECT_DESCRIBE) ||
+               (tab->cache ||
+                (tab->cache= new JOIN_CACHE_BKA(join, tab, prev_cache))) &&
+	       !tab->cache->init()))
+	  {
+            using_join_cache= TRUE;
+	    tab[-1].next_select=sub_select_cache; /* Patch previous */
+	  }
+        }
+      }
+#endif
+      tab->use_join_cache= using_join_cache;
       if (table->covering_keys.is_set(tab->ref.key) &&
 	  !table->no_keyread)
       {
@@ -8213,7 +8258,7 @@ make_join_readinfo(JOIN *join, ulonglong
 	table->file->extra(HA_EXTRA_KEYREAD);
       }
       else
-        push_index_cond(tab, tab->ref.key, TRUE);
+        push_index_cond(tab, tab->ref.key, !using_join_cache);
       break;
     case JT_REF_OR_NULL:
     case JT_REF:
@@ -8225,25 +8270,59 @@ make_join_readinfo(JOIN *join, ulonglong
       }
       delete tab->quick;
       tab->quick=0;
-      if (table->covering_keys.is_set(tab->ref.key) &&
-	  !table->no_keyread)
-      {
-	table->key_read=1;
-	table->file->extra(HA_EXTRA_KEYREAD);
-      }
-      else
-        push_index_cond(tab, tab->ref.key, TRUE);
+      using_join_cache= FALSE;
       if (tab->type == JT_REF)
       {
 	tab->read_first_record= join_read_always_key;
 	tab->read_record.read_record= tab->insideout_match_tab? 
            join_read_next_same_diff : join_read_next_same;
+ 
+#if 0
+#else   
+        bool do_bka= !test(join->thd->variables.optimizer_switch &
+                           OPTIMIZER_SWITCH_NO_BKA);
+        JOIN_CACHE *prev_cache= (tab-1)->cache ? (tab-1)->cache : 0;
+
+        if (do_bka &&
+            i != join->const_tables && !(options & SELECT_NO_JOIN_CACHE) &&
+            tab->use_quick != 2 &&
+            (!first_inner || tab == first_inner ||
+             first_inner->use_join_cache) && 
+            (!tab->emb_sj_nest ||
+             tab->is_single_inner_of_semi_join_with_first_match()) &&
+            i <= no_jbuf_after)
+        {
+          uint flag= HA_MRR_NO_NULL_ENDPOINTS;
+          COST_VECT cost;
+          uint bufsz= 4096;
+          table->file->multi_range_read_info(tab->ref.key, 10, 20,
+                                             &bufsz, &flag, &cost);  
+	  if (!(flag & HA_MRR_USE_DEFAULT_IMPL) &&
+              ((options & SELECT_DESCRIBE) ||
+               (tab->cache ||
+                (tab->cache= new JOIN_CACHE_BKA(join, tab, prev_cache))) &&
+	       !tab->cache->init()))
+	  {
+            using_join_cache= TRUE;
+	    tab[-1].next_select=sub_select_cache; /* Patch previous */
+	  }
+        }
+#endif
       }
       else
       {
 	tab->read_first_record= join_read_always_key_or_null;
 	tab->read_record.read_record= join_read_next_same_or_null;
       }
+      tab->use_join_cache= using_join_cache;
+      if (table->covering_keys.is_set(tab->ref.key) &&
+	  !table->no_keyread)
+      {
+	table->key_read=1;
+	table->file->extra(HA_EXTRA_KEYREAD);
+      }
+      else
+        push_index_cond(tab, tab->ref.key, !using_join_cache);
       break;
     case JT_FT:
       table->status=STATUS_NO_RECORD;
@@ -8257,13 +8336,25 @@ make_join_readinfo(JOIN *join, ulonglong
       */
       table->status=STATUS_NO_RECORD;
       using_join_cache= FALSE;
+#if 0
       if (i != join->const_tables && !(options & SELECT_NO_JOIN_CACHE) &&
           tab->use_quick != 2 && !tab->first_inner && i <= no_jbuf_after &&
           !tab->insideout_match_tab)
+#else
+      if (i != join->const_tables && !(options & SELECT_NO_JOIN_CACHE) &&
+          tab->use_quick != 2 &&
+          (!first_inner || tab == first_inner ||
+           first_inner->use_join_cache) && 
+          (!tab->emb_sj_nest ||
+           tab->is_single_inner_of_semi_join_with_first_match()) &&
+          i <= no_jbuf_after)
+#endif
       {
+        JOIN_CACHE *prev_cache= (tab-1)->cache ? (tab-1)->cache : 0;
 	if ((options & SELECT_DESCRIBE) ||
-	    !join_init_cache(join->thd,join->join_tab+join->const_tables,
-			     i-join->const_tables))
+            (tab->cache ||
+             (tab->cache= new JOIN_CACHE_BNL(join, tab,prev_cache))) &&
+	    !tab->cache->init())
 	{
           using_join_cache= TRUE;
 	  tab[-1].next_select=sub_select_cache; /* Patch previous */
@@ -8336,6 +8427,7 @@ make_join_readinfo(JOIN *join, ulonglong
 	    tab->type=JT_NEXT;		// Read with index_first / index_next
 	  }
 	}
+        tab->use_join_cache= using_join_cache;
         if (tab->select && tab->select->quick &&
             tab->select->quick->index != MAX_KEY && ! tab->table->key_read)
           push_index_cond(tab, tab->select->quick->index, !using_join_cache);
@@ -8395,8 +8487,8 @@ void JOIN_TAB::cleanup()
   select= 0;
   delete quick;
   quick= 0;
-  x_free(cache.buff);
-  cache.buff= 0;
+  if (cache)
+    cache->free();
   limit= 0;
   if (table)
   {
@@ -13152,13 +13244,16 @@ do_select(JOIN *join,List<Item> *fields,
 
 
 enum_nested_loop_state
-sub_select_cache(JOIN *join,JOIN_TAB *join_tab,bool end_of_records)
+sub_select_cache(JOIN *join, JOIN_TAB *join_tab, bool end_of_records)
 {
   enum_nested_loop_state rc;
+  JOIN_CACHE *cache= join_tab->cache;
+  
+  join_tab->cache->reset_join(join);
 
   if (end_of_records)
   {
-    rc= flush_cached_records(join,join_tab,FALSE);
+    rc= cache->join_records(FALSE);
     if (rc == NESTED_LOOP_OK || rc == NESTED_LOOP_NO_MORE_ROWS)
       rc= sub_select(join,join_tab,end_of_records);
     return rc;
@@ -13170,11 +13265,11 @@ sub_select_cache(JOIN *join,JOIN_TAB *jo
   }
   if (join_tab->use_quick != 2 || test_if_quick_select(join_tab) <= 0)
   {
-    if (!store_record_in_cache(&join_tab->cache))
+    if (!cache->put_record())
       return NESTED_LOOP_OK;                     // There is more room in cache
-    return flush_cached_records(join,join_tab,FALSE);
+    return cache->join_records(FALSE);
   }
-  rc= flush_cached_records(join, join_tab, TRUE);
+  rc= cache->join_records(TRUE);
   if (rc == NESTED_LOOP_OK || rc == NESTED_LOOP_NO_MORE_ROWS)
     rc= sub_select(join, join_tab, end_of_records);
   return rc;
@@ -13556,6 +13651,7 @@ evaluate_join_record(JOIN *join, JOIN_TA
         enumerated all the suffixes for current prefix row combination
       */
       return_tab= join_tab->do_firstmatch;
+      join_tab->found= 1; //psergey-add:
     }
 
     /*
@@ -13672,18 +13768,20 @@ evaluate_null_complemented_join_record(J
 }
 
 
-static enum_nested_loop_state
-flush_cached_records(JOIN *join,JOIN_TAB *join_tab,bool skip_last)
+enum_nested_loop_state
+JOIN_CACHE_BNL::join_matching_records(bool skip_last)
 {
+  uint i;
+  bool semi_join_flag= mrr_mode == HA_MRR_SEMI_JOIN;
   enum_nested_loop_state rc= NESTED_LOOP_OK;
   int error;
   READ_RECORD *info;
 
   join_tab->table->null_row= 0;
-  if (!join_tab->cache.records)
+  if (!records)
     return NESTED_LOOP_OK;                      /* Nothing to do */
   if (skip_last)
-    (void) store_record_in_cache(&join_tab->cache); // Must save this for later
+    put_record();                               // Must save this for later
   if (join_tab->use_quick == 2)
   {
     if (join_tab->select->quick)
@@ -13692,10 +13790,9 @@ flush_cached_records(JOIN *join,JOIN_TAB
       join_tab->select->quick=0;
     }
   }
- /* read through all records */
+  /* read through all records */
   if ((error=join_init_read_record(join_tab)))
   {
-    reset_cache_write(&join_tab->cache);
     return error < 0 ? NESTED_LOOP_NO_MORE_ROWS: NESTED_LOOP_ERROR;
   }
 
@@ -13713,43 +13810,45 @@ flush_cached_records(JOIN *join,JOIN_TAB
       join->thd->send_kill_message();
       return NESTED_LOOP_KILLED; // Aborted by user /* purecov: inspected */
     }
-    SQL_SELECT *select=join_tab->select;
     if (rc == NESTED_LOOP_OK &&
-        (!join_tab->cache.select || !join_tab->cache.select->skip_record()))
+        (!select || !select->skip_record()))
     {
-      uint i;
-      reset_cache_read(&join_tab->cache);
-      for (i=(join_tab->cache.records- (skip_last ? 1 : 0)) ; i-- > 0 ;)
-      {
-	read_cached_record(join_tab);
-	if (!select || !select->skip_record())
+      reset(FALSE);
+
+      for (i= records - test(skip_last) ; i-- > 0 ;)
+      { 
+        if (!semi_join_flag || !skip_record_if_match())
         {
-          int res= 0;
-          if (!join_tab->check_weed_out_table || 
-              !(res= do_sj_dups_weedout(join->thd, join_tab->check_weed_out_table)))
+	  get_record();
+          if (check_match(get_record_pos()))
           {
-            rc= (join_tab->next_select)(join,join_tab+1,0);
-            if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
+            int res= 0;
+            if (semi_join_flag)
+              set_match_flag(1);
+            if (!join_tab->check_weed_out_table || 
+                !(res= do_sj_dups_weedout(join->thd,
+                                          join_tab->check_weed_out_table)))
             {
-              reset_cache_write(&join_tab->cache);
-              return rc;
+              rc= (join_tab->next_select)(join, join_tab+1, 0);
+              if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
+              {
+                reset(TRUE);
+                return rc;
+              }
             }
+            if (res == -1)
+              return NESTED_LOOP_ERROR;
           }
-          if (res == -1)
-            return NESTED_LOOP_ERROR;
         }
       }
     }
   } while (!(error=info->read_record(info)));
 
-  if (skip_last)
-    read_cached_record(join_tab);		// Restore current record
-  reset_cache_write(&join_tab->cache);
   if (error > 0)				// Fatal error
     return NESTED_LOOP_ERROR;                   /* purecov: inspected */
   for (JOIN_TAB *tmp2=join->join_tab; tmp2 != join_tab ; tmp2++)
-    tmp2->table->status=tmp2->status;
-  return NESTED_LOOP_OK;
+    tmp2->table->status= tmp2->status;
+  return rc;
 }
 
 
@@ -14858,11 +14957,15 @@ end_write_group(JOIN *join, JOIN_TAB *jo
     1 if right_item is used removable reference key on left_item
 */
 
-static bool test_if_ref(Item_field *left_item,Item *right_item)
+static bool test_if_ref(COND *root_cond, 
+                        Item_field *left_item,Item *right_item)
 {
   Field *field=left_item->field;
+  JOIN_TAB *join_tab= field->table->reginfo.join_tab;
   // No need to change const test. We also have to keep tests on LEFT JOIN
-  if (!field->table->const_table && !field->table->maybe_null)
+  if (!field->table->const_table && join_tab &&
+      (!join_tab->first_inner ||
+       *join_tab->first_inner->on_expr_ref == root_cond))
   {
     Item *ref_item=part_of_refkey(field->table,field);
     if (ref_item && ref_item->eq(right_item,1))
@@ -14938,6 +15041,15 @@ static COND *
 make_cond_for_table(COND *cond, table_map tables, table_map used_table,
                     bool exclude_expensive_cond)
 {
+  return make_cond_for_table_from_pred(cond, cond, tables, used_table,
+                                       exclude_expensive_cond);
+}
+               
+static COND *
+make_cond_for_table_from_pred(COND *root_cond, COND *cond,
+                              table_map tables, table_map used_table,
+                              bool exclude_expensive_cond)
+{
   if (used_table && !(cond->used_tables() & used_table) &&
       /*
         Exclude constant conditions not checked at optimization time if
@@ -14959,8 +15071,9 @@ make_cond_for_table(COND *cond, table_ma
       Item *item;
       while ((item=li++))
       {
-	Item *fix=make_cond_for_table(item,tables,used_table,
-                                      exclude_expensive_cond);
+	Item *fix=make_cond_for_table_from_pred(root_cond, item,
+                                                tables, used_table,
+                                                exclude_expensive_cond);
 	if (fix)
 	  new_cond->argument_list()->push_back(fix);
       }
@@ -14990,7 +15103,9 @@ make_cond_for_table(COND *cond, table_ma
       Item *item;
       while ((item=li++))
       {
-	Item *fix=make_cond_for_table(item,tables,0L, exclude_expensive_cond);
+	Item *fix=make_cond_for_table_from_pred(root_cond, item, 
+                                                tables, 0L,
+                                                exclude_expensive_cond);
 	if (!fix)
 	  return (COND*) 0;			// Always true
 	new_cond->argument_list()->push_back(fix);
@@ -15026,18 +15141,19 @@ make_cond_for_table(COND *cond, table_ma
     Remove equalities that are guaranteed to be true by use of 'ref' access
     method
   */
-  if (((Item_func*) cond)->functype() == Item_func::EQ_FUNC)
+  if (cond->type() == Item::FUNC_ITEM &&
+      ((Item_func*) cond)->functype() == Item_func::EQ_FUNC)
   {
-    Item *left_item=	((Item_func*) cond)->arguments()[0];
-    Item *right_item= ((Item_func*) cond)->arguments()[1];
+    Item *left_item=	((Item_func*) cond)->arguments()[0]->real_item();
+    Item *right_item= ((Item_func*) cond)->arguments()[1]->real_item();
     if (left_item->type() == Item::FIELD_ITEM &&
-	test_if_ref((Item_field*) left_item,right_item))
+	test_if_ref(root_cond, (Item_field*) left_item, right_item))
     {
       cond->marker=3;			// Checked when read
       return (COND*) 0;
     }
     if (right_item->type() == Item::FIELD_ITEM &&
-	test_if_ref((Item_field*) right_item,left_item))
+	test_if_ref(root_cond, (Item_field*) right_item, left_item))
     {
       cond->marker=3;			// Checked when read
       return (COND*) 0;
@@ -16325,176 +16441,441 @@ SORT_FIELD *make_unireg_sortorder(ORDER 
   records
 ******************************************************************************/
 
-static int
-join_init_cache(THD *thd,JOIN_TAB *tables,uint table_count)
+static /* Describe a raw (str, length) byte range in the next free cache field slot. */
+uint add_prefix_addon_field(uchar *str, uint length,
+                            CACHE_FIELD **field, uint *fields)
 {
-  reg1 uint i;
-  uint length, blobs;
-  size_t size;
-  CACHE_FIELD *copy,**blob_ptr;
-  JOIN_CACHE  *cache;
-  JOIN_TAB *join_tab;
-  DBUG_ENTER("join_init_cache");
+  CACHE_FIELD *copy= *field;      /* slot to fill: the descriptor *field points at */
+  copy->str= str;
+  copy->length= length;
+  copy->strip= 0;                 /* no trailing-space stripping for this slot */
+  copy->blob_field= 0;            /* not a blob */
+  copy->field= 0;                 /* no Field object is attached to this slot */
+  copy->ext_key_arg_no= 0;
+  copy->get_rowid= NULL;
+  (*field)++;                     /* advance the caller's descriptor cursor */
+  (*fields)++;                    /* and the caller's counter of such slots */
+  return length;    
+}
+
+static /* Add one cache field descriptor per column of tab->table that is set in field_set. */
+uint add_table_fields_to_join_cache(JOIN_TAB *tab,
+                                    MY_BITMAP *field_set, 
+                                    CACHE_FIELD **field,       /* in/out: next free descriptor */
+                                    CACHE_FIELD ***field_ptr)  /* in/out: next free blob pointer slot */
+{
+  Field **f_ptr;
+  uint used_fields= bitmap_bits_set(field_set);  /* columns still to be described */
+  uint length= 0;
+  for (f_ptr= tab->table->field; used_fields; f_ptr++)  /* stops once every set bit was handled */
+  {
+    if (bitmap_is_set(field_set, (*f_ptr)->field_index))
+    {
+      CACHE_FIELD *copy= *field;
+      used_fields--;
+      length+= (*f_ptr)->fill_cache_field(copy);
+      if (copy->blob_field)
+      {
+        *(*field_ptr)= copy;      /* remember blob descriptors in the pointer array */
+        (*field_ptr)++;
+      }
+      copy->field= *f_ptr;
+      copy->ext_key_arg_no= 0;
+      copy->get_rowid= NULL;
+      (*field)++;
+    }
+  }
+  return length;                  /* sum of the values returned by fill_cache_field() */
+}
+
+bool JOIN_CACHE_BKA::check_emb_key_usage()  /* Decide whether use_emb_key may be set; returns its new value. */
+{
+  uint i;
+  Item *item; 
+  KEY_PART_INFO *key_part;
+  CACHE_FIELD *copy;
+  uint length= 0;
+  TABLE *table= join_tab->table;
+  TABLE_REF *ref= &join_tab->ref;
+  KEY *keyinfo= table->key_info+ ref->key;
+  if (join_tab->type != JT_EQ_REF)
+    return FALSE;
+  if (local_key_arg_fields != 0 && local_key_arg_fields != ref->key_parts)
+    return FALSE;                 /* local key arguments must cover all key parts or none */
+  if (global_key_arg_fields != 0 && ref->key_parts != 1)
+    return FALSE;
+  for (i=0; i < ref->key_parts; i++)  /* every key part must be fed by a plain, matching field */
+  {
+    item= ref->items[i]->real_item();
+    if (item->type() != Item::FIELD_ITEM)
+      return FALSE;
+    key_part= keyinfo->key_part+i;
+    if (key_part->key_part_flag & HA_PART_KEY_SEG)
+      return FALSE;
+    if (!key_part->field->eq_def(((Item_field *) item)->field))
+      return FALSE;
+    if (key_part->field->maybe_null())
+      return FALSE;
+  }
+  for (i= 0, copy= field+prefix_addon_fields;
+       i < local_key_arg_fields;
+       i++, copy++)
+  {
+    if (copy->strip)              /* space-stripped fields are stored with variable length */
+      return FALSE;
+    length+= copy->length;
+  }
+
-  cache= &tables[table_count].cache;
-  cache->fields=blobs=0;
+  emb_key_length= length;

-  join_tab=tables;
-  for (i=0 ; i < table_count ; i++,join_tab++)
+  for (i= 0; i < ref->key_parts; i++)  /* put the local key argument descriptors into key part order */
   {
-    if (!join_tab->used_fieldlength)		/* Not calced yet */
-      calc_used_field_length(thd, join_tab);
-    cache->fields+=join_tab->used_fields;
-    blobs+=join_tab->used_blobs;
+    uint j;                       /* scan position; the outer index i must stay untouched */
+    Item *item= ref->items[i]->real_item();
+    Field *fld= ((Item_field *) item)->field;
+    CACHE_FIELD *init_copy= field+prefix_addon_fields+i; 
+    for (j= i, copy= init_copy; j < local_key_arg_fields;  j++, copy++)
+    {
+      if (fld->eq(copy->field))
+      {
+        if (j != i)               /* found at a later slot: swap it into slot i */
+        {
+          CACHE_FIELD key_part_copy= *copy; // not blob for sure
+          *copy= *init_copy;
+          *init_copy= key_part_copy;
+        }
+        break;
+      }
+    }
+  }
+
+  return ((use_emb_key= TRUE));
+}    
+    
+
+int  /* Returns 0 on success, 1 when the descriptor array or the buffer cannot be allocated. */
+JOIN_CACHE::init()  /* Counts the fields to cache, builds their descriptors and allocates the buffer. */
+{
+  reg1 uint i;
+  uint table_count;
+  size_t size;
+  CACHE_FIELD *copy;
+  CACHE_FIELD **copy_ptr;
+  JOIN_TAB *tab_start;
+  JOIN_TAB *tab;
+  uint gl_key_arg_fields= 0;
+  DBUG_ENTER("JOIN_CACHE::init");
+
+  use_match_flag= FALSE;
+  mrr_mode= 0;
+  if (join_tab->emb_sj_nest && 
+      join_tab->emb_sj_nest->sj_inner_tables == join_tab->table->map &&
+      join_tab->do_firstmatch == join_tab - 1)
+  {
+    use_match_flag= TRUE;         /* join_tab is the only inner table of its semi-join nest */
+    mrr_mode= HA_MRR_SEMI_JOIN;
+  }
+  if (join_tab->first_inner && join_tab->first_inner == join_tab)
+    use_match_flag= TRUE;         /* join_tab is its own first_inner table */
+
+  fields= blobs= 0;
+  prefix_addon_fields= 0;
+  ext_key_arg_fields= 0;
+
+  select= join_tab->cache_select;
+  tab_start= prev_cache ? prev_cache->join_tab :
+                          join->join_tab+join->const_tables;
+  tables= table_count= join_tab-tab_start;  /* tables in [tab_start, join_tab) feed this cache */
+
+  for (i= 0, tab= tab_start; i < table_count ; i++, tab++)
+  {
+    if (!tab->used_fieldlength)		/* Not calculated yet */
+      calc_used_field_length(join->thd, tab);
+    fields+= tab->used_fields;
+    blobs+= tab->used_blobs;

     /* SemiJoinDuplicateElimination: reserve space for rowid */
-    if (join_tab->rowid_keep_flags & JOIN_TAB::KEEP_ROWID)
+    if (tab->rowid_keep_flags & JOIN_TAB::KEEP_ROWID)
     {
-      cache->fields++;
-      join_tab->used_fieldlength += join_tab->table->file->ref_length;
+      fields++;
+      tab->used_fieldlength += tab->table->file->ref_length;
     }
   }
-  if (!(cache->field=(CACHE_FIELD*)
-	sql_alloc(sizeof(CACHE_FIELD)*(cache->fields+table_count*2)+(blobs+1)*
+ 
+  if (is_key_access())
+  {
+    ((JOIN_CACHE_BKA *) this)->set_key_arg_fields(FALSE, 0);
+    ((JOIN_CACHE_BKA *) this)->set_key_arg_fields(TRUE, 0);
+    ((JOIN_CACHE_BKA *) this)->use_emb_key= FALSE;

-		  sizeof(CACHE_FIELD*))))
+    /* Mark all fields that can be used as arguments for this key access */
+ 
+    TABLE_REF *ref= &join_tab->ref;
+    JOIN_CACHE *cache= this;
+    do
+    {
+      for (tab= cache->join_tab-cache->tables; tab < cache->join_tab ; tab++)
+      { 
+        bitmap_clear_all(&tab->table->tmp_set);
+        for (uint j= 0; j < ref->key_parts; j++)
+        {
+          Item *ref_item= ref->items[j]; 
+          if (!(tab->table->map & ref_item->used_tables()))
+	    continue;
+	  ref_item->walk(&Item::add_field_to_set_processor, 1,
+                         (uchar *) tab->table);
+        }
+        uint key_args= bitmap_bits_set(&tab->table->tmp_set);
+        if (key_args)
+          ((JOIN_CACHE_BKA *) this)->incr_key_arg_fields(key_args,
+                                                         cache == this);
+      }
+      cache= cache->prev_cache;   /* walk back through the chain of linked caches */
+    } 
+    while (cache);
+    gl_key_arg_fields= ((JOIN_CACHE_BKA *) this)->get_key_arg_fields(FALSE);
+  }  
+
+
+  if (!(field=(CACHE_FIELD*)
+	sql_alloc(sizeof(CACHE_FIELD)*(fields+table_count*2+1)+
+                  sizeof(CACHE_FIELD*)*(gl_key_arg_fields+blobs+1))))
   {
-    my_free((uchar*) cache->buff,MYF(0));		/* purecov: inspected */
-    cache->buff=0;				/* purecov: inspected */
+    my_free((uchar*) buff,MYF(0));		/* purecov: inspected */
+    buff=0;				        /* purecov: inspected */
     DBUG_RETURN(1);				/* purecov: inspected */
   }
-  copy=cache->field;
-  blob_ptr=cache->blob_ptr=(CACHE_FIELD**)
-    (cache->field+cache->fields+table_count*2);
+  copy= field;
+  copy_ptr= (CACHE_FIELD**) (field+fields+table_count*2+test(use_match_flag));  /* pointer array sits behind the descriptors */
 
   length=0;
-  for (i=0 ; i < table_count ; i++)
-  {
-    uint null_fields=0,used_fields;
-    Field **f_ptr,*field;
-    MY_BITMAP *read_set= tables[i].table->read_set;
-    for (f_ptr=tables[i].table->field,used_fields=tables[i].used_fields ;
-	 used_fields ;
-	 f_ptr++)
+
+  if (use_match_flag)
+    length+= add_prefix_addon_field((uchar*) &join_tab->found,
+                                    sizeof(join_tab->found),
+                                    &copy, &prefix_addon_fields);
+
+  /* Copy all null bits and null_row flag from each table */
+  for (i=0, tab= tab_start; i < table_count ; i++, tab++)
+  {
+    uint null_fields= 0;
+    uint used_fields= tab->used_fields;
+    TABLE *table= tab->table;
+    MY_BITMAP *read_set= table->read_set;
+    for (Field **f_ptr= table->field; used_fields; f_ptr++)
     {
-      field= *f_ptr;
-      if (bitmap_is_set(read_set, field->field_index))
+      if (bitmap_is_set(read_set, (*f_ptr)->field_index))
       {
 	used_fields--;
-	length+=field->fill_cache_field(copy);
-	if (copy->blob_field)
-	  (*blob_ptr++)=copy;
-	if (field->maybe_null())
+	if ((*f_ptr)->maybe_null())
 	  null_fields++;
-        copy->get_rowid= NULL;
-	copy++;
       }
     }
-    /* Copy null bits from table */
-    if (null_fields && tables[i].table->s->null_fields)
+    /* Copy null bits from table if needed */
+    if (null_fields && tab->table->s->null_fields)
     {						/* must copy null bits */
-      copy->str= tables[i].table->null_flags;
-      copy->length= tables[i].table->s->null_bytes;
-      copy->strip=0;
-      copy->blob_field=0;
-      copy->get_rowid= NULL;
-      length+=copy->length;
-      copy++;
-      cache->fields++;
+      length+= add_prefix_addon_field(table->null_flags,
+                                      table->s->null_bytes,
+                                      &copy, &prefix_addon_fields);
     }
     /* If outer join table, copy null_row flag */
-    if (tables[i].table->maybe_null)
+    if (tab->table->maybe_null)
     {
-      copy->str= (uchar*) &tables[i].table->null_row;
-      copy->length=sizeof(tables[i].table->null_row);
-      copy->strip=0;
-      copy->blob_field=0;
-      copy->get_rowid= NULL;
-      length+=copy->length;
-      copy++;
-      cache->fields++;
+      length+= add_prefix_addon_field((uchar*) &table->null_row,
+                                      sizeof(table->null_row),
+                                      &copy, &prefix_addon_fields);
+    }
+  }
+  fields+= prefix_addon_fields;
+
+  if (is_key_access())
+  {
+    /* 
+      Set pointers to the cache fields in previous caches
+      that are used to build keys for this key access.
+    */
+    JOIN_CACHE *cache= this;
+    while (gl_key_arg_fields)
+    {
+      cache= cache->prev_cache;
+      for (tab= cache->join_tab-cache->tables; tab < cache->join_tab ; tab++)
+      { 
+        CACHE_FIELD *cache_copy;
+        MY_BITMAP *key_read_set= &tab->table->tmp_set;
+        if (bitmap_is_clear_all(key_read_set))
+          continue;
+        cache_copy= cache->field+cache->prefix_addon_fields;
+        for (i= cache->prefix_addon_fields; i < cache->fields; i++, cache_copy++)
+        {
+          if (bitmap_is_set(key_read_set, cache_copy->field->field_index))
+          {
+            *copy_ptr++= cache_copy;
+            gl_key_arg_fields--;
+            if (!cache_copy->ext_key_arg_no)
+	      cache->register_ext_key_arg_field(cache_copy);
+          }
+        }
+      }
+    } 
+    
+    /* Now copy fields from ref if any */
+    blob_ptr= copy_ptr;
+    for (i=0, tab= tab_start; i < table_count ; i++, tab++)
+    {
+      length+= add_table_fields_to_join_cache(tab, &tab->table->tmp_set,  
+                                              &copy, &copy_ptr);
+    }
+    ((JOIN_CACHE_BKA *) this)->check_emb_key_usage();
+  }
+  else
+    blob_ptr= copy_ptr;
+   
+  /* Copy the remaining table fields */    
+  for (i=0, tab= tab_start; i < table_count ; i++, tab++)
+  {
+    MY_BITMAP *field_set;
+    TABLE *table= tab->table;
+    if (is_key_access())
+    {                   
+      bitmap_invert(&table->tmp_set);                     /* read_set columns that are not */
+      bitmap_intersect(&table->tmp_set, table->read_set); /* already in tmp_set */
+      field_set= &table->tmp_set;
     }
+    else
+      field_set= table->read_set;
+    length+= add_table_fields_to_join_cache(tab, field_set,
+                                            &copy, &copy_ptr);
+    
     /* SemiJoinDuplicateElimination: Allocate space for rowid if needed */
-    if (tables[i].rowid_keep_flags & JOIN_TAB::KEEP_ROWID)
+    if (tab->rowid_keep_flags & JOIN_TAB::KEEP_ROWID)
     {
-      copy->str= tables[i].table->file->ref;
-      copy->length= tables[i].table->file->ref_length;
-      copy->strip=0;
+      copy->str= table->file->ref;
+      copy->length= table->file->ref_length;      copy->strip=0;
       copy->blob_field=0;
+      copy->field= 0;
+      copy->ext_key_arg_no= 0;
       copy->get_rowid= NULL;
-      if (tables[i].rowid_keep_flags & JOIN_TAB::CALL_POSITION)
+      if (tab->rowid_keep_flags & JOIN_TAB::CALL_POSITION)
       {
         /* We will need to call h->position(): */
-        copy->get_rowid= tables[i].table;
+        copy->get_rowid= tab->table;
         /* And those after us won't have to: */
-        tables[i].rowid_keep_flags &=  ~((int)JOIN_TAB::CALL_POSITION);
+        tab->rowid_keep_flags &=  ~((int)JOIN_TAB::CALL_POSITION);
       }
       copy++;
     }
   }
 
-  cache->length=length+blobs*sizeof(char*);
-  cache->blobs=blobs;
-  *blob_ptr=0;					/* End sequentel */
-  size=max(thd->variables.join_buff_size, cache->length);
-  if (!(cache->buff=(uchar*) my_malloc(size,MYF(0))))
-    DBUG_RETURN(1);				/* Don't use cache */ /* purecov: inspected */
-  cache->end=cache->buff+size;
-  reset_cache_write(cache);
+  length+= blobs*sizeof(char*);
+  *copy_ptr=0;					/* NULL terminates the pointer list */
+  size= max(join->thd->variables.join_buff_size, length);
+  if (!(buff=(uchar*) my_malloc(size,MYF(0))))
+    DBUG_RETURN(1);		/* Don't use cache */ /* purecov: inspected */
+  end_pos= end= buff+size;
+  reset(TRUE);
   DBUG_RETURN(0);
 }
 
 
-static ulong
-used_blob_length(CACHE_FIELD **ptr)
+ulong JOIN_CACHE::used_blob_length()  /* Total length of the current blob values in the blob_ptr list. */
 {
-  uint length,blob_length;
-  for (length=0 ; *ptr ; ptr++)
+  uint len;
+  uint blob_length;
+  CACHE_FIELD **ptr= blob_ptr;
+  for (len= 0; *ptr; ptr++)       /* the list ends at the first NULL entry */
   {
-    (*ptr)->blob_length=blob_length=(*ptr)->blob_field->get_length();
-    length+=blob_length;
+    (*ptr)->blob_length= blob_length= (*ptr)->blob_field->get_length();  /* side effect: caches the length */
+    len+= blob_length;
     (*ptr)->blob_field->get_ptr(&(*ptr)->str);
   }
-  return length;
+  return len;
 }
 
 
-static bool
-store_record_in_cache(JOIN_CACHE *cache)
+uint JOIN_CACHE::write_record_data(bool *is_full)
 {
-  uint length;
-  uchar *pos;
-  CACHE_FIELD *copy,*end_field;
+  uint i;
+  uint len;
+  uchar *data_pos;
+  uchar *rec_len_ptr= 0;
+  CACHE_FIELD *copy;
+  uchar *pos= this->pos;
+  uchar *init_pos= pos;
+  CACHE_FIELD *end_field= field+fields;
   bool last_record;
 
-  pos=cache->pos;
-  end_field=cache->field+cache->fields;
-
-  length=cache->length;
-  if (cache->blobs)
-    length+=used_blob_length(cache->blob_ptr);
-  if ((last_record= (length + cache->length > (size_t) (cache->end - pos))))
-    cache->ptr_record=cache->records;
+  len= pack_length();
+  if (blobs)
+    len+= used_blob_length();
+  /*
+    The current record is considered as the last one written in the cache if
+    there is not enough space left to store its blob values or if we can't
+    guarantee that this space is large enough to hold additionally the parts 
+    of the next record excluding its blob values.
+  */
+  last_record= (len+pack_length()+addon_length() > (uint) (end_pos - pos));
+  
   /*
-    There is room in cache. Put record there
+    This function is called only in the case when there is enough
+    space left in the cache to store at least  non-blob parts of
+    the current record.
   */
-  cache->records++;
-  for (copy=cache->field ; copy < end_field; copy++)
+  if (last_record)
+    ptr_record= records;
+  records++;
+
+  if (with_length_prepanded())
+  {
+    rec_len_ptr= pos;
+    pos+= sizeof(uint32);
+  }
+
+  if (prev_cache)
+  {
+    uchar *prev_rec_pos= prev_cache->get_record_pos();
+    memcpy(pos, &prev_rec_pos, sizeof(uchar *));
+    pos+= sizeof(uchar *);
+  } 
+
+  data_pos= pos;
+
+  if (use_match_flag)
+    *field[0].str= 0;
+
+  /* First write match flag, null bits and null_row flag for each table */
+  for (i= 0, copy= field; i < prefix_addon_fields; i++, copy++)
+  {
+    memcpy(pos, copy->str, copy->length);
+    pos+= copy->length;
+  } 
+
+  /* Now write the remaining fields if needed */ 
+  for ( ; copy < end_field; copy++)
   {
+    if (copy->field && copy->field->maybe_null() && copy->field->is_null())
+    {
+      if (copy->ext_key_arg_no)
+        copy->offset= 0;
+      continue;              // Do not copy a field if its value is null 
+    }
+    if (copy->ext_key_arg_no)
+      copy->offset= pos-data_pos;
     if (copy->blob_field)
     {
       if (last_record)
       {
 	copy->blob_field->get_image(pos, copy->length+sizeof(char*), 
 				    copy->blob_field->charset());
-	pos+=copy->length+sizeof(char*);
+	pos+= copy->length+sizeof(char*);
       }
       else
       {
-	copy->blob_field->get_image(pos, copy->length, // blob length
+        // Copy the length of the blob: 
+	copy->blob_field->get_image(pos, copy->length, 
 				    copy->blob_field->charset());
-	memcpy(pos+copy->length,copy->str,copy->blob_length);  // Blob data
-	pos+=copy->length+copy->blob_length;
+        // Copy the blob data: 
+	memcpy(pos+copy->length, copy->str, copy->blob_length);               
+	pos+= copy->length+copy->blob_length;
       }
     }
     else
@@ -16505,89 +16886,565 @@ store_record_in_cache(JOIN_CACHE *cache)
 
       if (copy->strip)
       {
-	uchar *str,*end;
-	for (str=copy->str,end= str+copy->length;
-	     end > str && end[-1] == ' ' ;
+	uchar *str, *end;
+	for (str= copy->str, end= str+copy->length;
+	     end > str && end[-1] == ' ';
 	     end--) ;
-	length=(uint) (end-str);
-	memcpy(pos+2, str, length);
-        int2store(pos, length);
-	pos+= length+2;
+	len=(uint) (end-str);
+        int2store(pos, len);
+	memcpy(pos+2, str, len);
+	pos+= len+2;
       }
       else
       {
-	memcpy(pos,copy->str,copy->length);
-	pos+=copy->length;
+	memcpy(pos, copy->str, copy->length);
+	pos+= copy->length;
       }
     }
   }
-  cache->pos=pos;
-  return last_record || (size_t) (cache->end - pos) < cache->length;
+  if (ext_key_arg_fields)
+  {
+    for (copy= field+prefix_addon_fields; copy < end_field ; copy++)
+    {
+      if (copy->ext_key_arg_no)
+      {
+        int4store(pos, copy->offset);
+        pos+= sizeof(uint32);
+      }
+    }
+  }
+
+  if (rec_len_ptr)
+    int4store(rec_len_ptr, (uint32) (pos-rec_len_ptr-sizeof(uint32)));
+  this->pos= pos;
+  *is_full= last_record || 
+    (size_t) (end_pos - pos) < pack_length()+addon_length(); 
+  return (uint) (pos-init_pos);
 }
 
 
-static void
-reset_cache_read(JOIN_CACHE *cache)
+void
+JOIN_CACHE::reset(bool is_for_write)  /* rewind the cache to its beginning */
 {
-  cache->record_nr=0;
-  cache->pos=cache->buff;
+  record_nr= 0;                       /* no record has been read yet */
+  pos= buff;                          /* back to the start of the buffer */
+  prev_rec_end= pos;                  /* get_next_key() restarts from here */
+  if (is_for_write)
+  {
+    records= 0;                       /* writing: drop the stored records */
+    ptr_record= (uint) ~0;            /* no record is flagged as the last one */
+  }
 }
 
+/* Copy the prefix addon fields found at 'pos' back to their sources */
+uint JOIN_CACHE::read_prefix_addon_fields()
+{
+  CACHE_FIELD *addon= field, *addon_end= field+prefix_addon_fields;
+  for ( ; addon != addon_end; addon++)
+  {
+    memcpy(addon->str, pos, addon->length);
+    pos+= addon->length;
+  }
+  return prefix_addon_fields;           /* number of fields consumed */
+}
 
-static void reset_cache_write(JOIN_CACHE *cache)
+uint JOIN_CACHE::read_record_field(CACHE_FIELD *copy, bool last_record)
 {
-  reset_cache_read(cache);
-  cache->records= 0;
-  cache->ptr_record= (uint) ~0;
+  uint len;                /* number of bytes consumed from the cache */
+  if (copy->field && copy->field->maybe_null() && copy->field->is_null())
+    return 0;              // Do not copy a field if its value is null 
+  if (copy->blob_field)
+  {
+    if (last_record)       /* image is the length bytes plus a char* pointer */
+    {
+      copy->blob_field->set_image(pos, copy->length+sizeof(char*),
+				  copy->blob_field->charset());
+      len= copy->length+sizeof(char*);
+    }
+    else                   /* the blob is pointed at data inside the cache */
+    {
+      copy->blob_field->set_ptr(pos, pos+copy->length);
+      len= copy->length+copy->blob_field->get_length();
+    }
+  }
+  else
+  {
+    if (copy->strip)       /* stored as a 2-byte length + stripped value */
+    {
+      len= uint2korr(pos);
+      memcpy(copy->str, pos+2, len);
+      memset(copy->str+len, ' ', copy->length-len);  /* pad with spaces again */
+      len+= 2;
+    }
+    else
+    {
+      len= copy->length;
+      memcpy(copy->str, pos, len);
+    }
+  }
+  pos+= len;               /* step over the field just read */
+  return len;
+}
+
+bool JOIN_CACHE::read_ext_key_arg_field(CACHE_FIELD *copy,
+                                        uchar *rec_ptr, 
+                                        bool last_record,
+                                        uint *len)
+{
+  uint offset;
+  if (copy < field || copy >= field+fields)
+    return FALSE;                     /* 'copy' is not a field of this cache */
+  if (!*len)                          /* record length is not known yet */
+  {
+    uchar *len_ptr= rec_ptr;
+    if (prev_cache)
+      len_ptr-= sizeof(uchar *);      /* step back over the link pointer */
+    *len= uint4korr(len_ptr-sizeof(uint32));  /* length precedes the record */
+  }
+  offset= uint4korr(rec_ptr-          /* offsets sit at the end of the record */
+                    (prev_cache ? sizeof(uchar *) : 0)+
+                    (*len)-
+                    sizeof(uint32)*
+                    (ext_key_arg_fields+1-copy->ext_key_arg_no));  
+  bool is_null= FALSE;
+  if (offset == 0 && prefix_addon_fields)  /* offset 0 is stored for NULLs */
+    is_null= TRUE;
+  if (is_null)
+    copy->field->set_null();
+  else
+  {
+    copy->field->set_notnull(); 
+    pos= rec_ptr+offset;              /* value is at this offset in the data */
+    read_record_field(copy, last_record);
+  }
+  return TRUE;                        /* 'copy' belongs to this cache */
 }
+   
+uint JOIN_CACHE::read_record_data()
+{
+  CACHE_FIELD *copy= field;
+  uchar *init_pos= pos;
+  CACHE_FIELD *end_field= field+fields;
+  bool last_record= record_nr == ptr_record;
+  
+  if (record_nr == records)
+    return 0;                         /* every stored record has been read */
+  record_nr++;  
 
+  /* First read the match flag, null bits and null_row flag for each table */
+  copy+= read_prefix_addon_fields();
+ 
+  /* Now read the remaining table fields if needed */
+  for ( ; copy < end_field; copy++)
+    read_record_field(copy, last_record);
 
-static void
-read_cached_record(JOIN_TAB *tab)
+  return (uint) (pos-init_pos);       /* number of bytes read from the cache */
+}
+
+
+bool JOIN_CACHE::set_match_flag_if_none(JOIN_TAB *first_inner,
+                                        uchar *rec_ptr)
 {
-  uchar *pos;
-  uint length;
-  bool last_record;
-  CACHE_FIELD *copy,*end_field;
+  if (!first_inner->cache)            /* no cache: only 'found' is updated */
+  {
+    first_inner->found= 1;
+    return TRUE;
+  }
+  uchar *prev_rec_ptr= rec_ptr;
+  JOIN_CACHE *cache= this;
+  while (cache->join_tab != first_inner)  /* follow links to that cache */
+  {
+    memcpy(&prev_rec_ptr, prev_rec_ptr-sizeof(uchar *), sizeof(uchar *));
+    cache= cache->prev_cache;
+  } 
+  if (*prev_rec_ptr == 0)             /* first byte of the record is the flag */
+  {
+    *prev_rec_ptr= 1;
+    first_inner->found= 1;
+    return TRUE;  
+  }
+  return FALSE;                       /* the flag had been set already */
+}
 
-  last_record=tab->cache.record_nr++ == tab->cache.ptr_record;
-  pos=tab->cache.pos;
-  for (copy=tab->cache.field,end_field=copy+tab->cache.fields ;
-       copy < end_field;
-       copy++)
+
+enum_nested_loop_state
+JOIN_CACHE::join_records(bool skip_last)
+{
+  JOIN_TAB *tab;
+  enum_nested_loop_state rc= NESTED_LOOP_OK;
+  bool is_first_inner= join_tab->first_inner && 
+                       join_tab->first_inner == join_tab;
+  if (is_first_inner)
+    join_tab->not_null_compl= TRUE;
+  if (!join_tab->first_inner || join_tab->first_inner->not_null_compl)
   {
-    if (copy->blob_field)
+    rc= join_matching_records(skip_last);   /* join with the matching rows */
+    if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
     {
-      if (last_record)
+      reset(TRUE);                    /* failure: empty the cache, give up */
+      return rc;
+    }
+    if (is_first_inner)
+    {
+      if (next_cache)
       {
-	copy->blob_field->set_image(pos, copy->length+sizeof(char*),
-				    copy->blob_field->charset());
-	pos+=copy->length+sizeof(char*);
+        rc= next_cache->join_records(skip_last);
+        if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
+        {
+          reset(TRUE);
+          return rc;
+        }
       }
-      else
+      join_tab->not_null_compl= FALSE;  /* null-complemented rows come next */
+      for (tab= join_tab->first_inner; tab <= join_tab->last_inner; tab++)
+        tab->first_unmatched= join_tab->first_inner;
+    }
+  }
+  if (join_tab->first_unmatched && !join_tab->first_unmatched->not_null_compl)
+  {
+    reset(FALSE);                     /* rewind for reading, keep the records */
+    rc= join_null_records(skip_last);   
+    if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
+    {
+      reset(TRUE);
+      return rc;
+    }
+  }
+  if(next_cache)
+  {
+    rc= next_cache->join_records(skip_last);
+    if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
+    {
+      reset(TRUE);
+      return rc;
+    }
+  }
+  if (is_first_inner)
+  {
+    for (tab= join_tab->first_inner; tab <= join_tab->last_inner; tab++)
+      tab->first_unmatched= 0;
+  } 
+ 
+  if (skip_last)
+  {
+    DBUG_ASSERT(!is_key_access());
+    get_record();		                // Restore current record
+  }
+  reset(TRUE);                        /* leave the cache empty for writing */
+  return rc;
+}
+
+inline
+bool JOIN_CACHE::check_match(uchar *rec_ptr)
+{
+  /* Reject the row if the condition attached to this table fails */
+  if (join_tab->select && join_tab->select->skip_record())
+    return FALSE;
+
+  JOIN_TAB *first_inner= join_tab->first_inner;
+  bool is_last_inner= first_inner && first_inner->last_inner == join_tab;
+  if (!is_last_inner)
+    return TRUE;
+
+  /* 
+    join_tab is the last inner table of an outer join, and maybe of
+    embedding outer joins too: go through them from the inside out.
+  */
+  for ( ; first_inner; first_inner= first_inner->first_upper)
+  {
+    if (!set_match_flag_if_none(first_inner, rec_ptr))
+      return TRUE;
+    /*
+      First match for the outer row: set_match_flag_if_none has just
+      turned first_inner->found on, so the conditions pushed down to
+      the inner tables are evaluated again with the flag on.
+    */
+    JOIN_TAB *tab= first_inner;
+    while (tab <= join_tab)
+    {
+      if (tab->select && tab->select->skip_record())
+        return FALSE;
+      tab++;
+    }
+  }
+  return TRUE;
+}
+
+/* 
+  The following implementation of the JOIN_CACHE::check_match
+  employs only one loop but its code is harder to understand
+*/
+#if 0
+inline
+bool JOIN_CACHE::check_match(uchar *rec_ptr)
+{
+  JOIN_TAB *tab= join_tab;
+  JOIN_TAB *first_inner= 0;           /* nest whose match flag is set next */
+  if (tab->first_inner && tab->first_inner->last_inner == tab)
+    first_inner= tab->first_inner;
+  do
+  {
+    if (tab->select && tab->select->skip_record())
+      return FALSE;
+    if (tab == join_tab)              /* all conditions of this level passed */
+    {                     
+      if (first_inner && 
+          !set_match_flag_if_none(first_inner, rec_ptr))
+        break;                        /* the flag was already on */
+      tab= first_inner;               /* re-check from the first inner table */
+      if (first_inner)
+        first_inner= first_inner->first_upper;
+    }
+    else
+      tab++;
+  } while (tab);
+  return TRUE;
+}
+#endif
+
+     
+/* Range sequence 'init' callback: rewind the BKA cache in init_param for reading */
+static range_seq_t bka_range_seq_init(void *init_param, uint n_ranges, uint flags)
+{
+  DBUG_ENTER("bka_range_seq_init");
+  ((JOIN_CACHE_BKA *) init_param)->reset(FALSE);
+  DBUG_RETURN((range_seq_t) init_param);
+}
+
+/* Range sequence 'next' callback: make an equality range from the next key */
+static uint bka_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
+{
+  DBUG_ENTER("bka_range_seq_next");
+  JOIN_CACHE_BKA *bka_cache= (JOIN_CACHE_BKA *) rseq;
+  key_range *key= &range->start_key;
+  key->length= bka_cache->get_next_key((uchar **) &key->key);
+  if (!key->length)
+    DBUG_RETURN(1);                   /* no more keys in the cache */
+  key->keypart_map= (1 << bka_cache->join_tab->ref.key_parts) - 1;
+  key->flag= HA_READ_KEY_EXACT;
+  /* The end key is a copy of the start key apart from the flag */
+  range->end_key= *key;
+  range->end_key.flag= HA_READ_AFTER_KEY;
+  range->ptr= (char *) bka_cache->get_record_pos();
+  range->range_flag= EQ_RANGE;
+  DBUG_RETURN(0);
+}
+
+/* Range sequence callback: report the record number and position of the cache */
+static void bka_save_current_pos(range_seq_t rseq, uint *range_no, void **pos)
+{
+  *range_no= ((JOIN_CACHE_BKA *) rseq)->get_record_nr();
+  *pos= ((JOIN_CACHE_BKA *) rseq)->get_pos();
+}
+
+/* Range sequence callback: put the cache back at a saved number and position */
+static void bka_restore_current_pos(range_seq_t rseq,  uint range_no, void *pos)
+{
+  ((JOIN_CACHE_BKA *) rseq)->set_record_nr(range_no);
+  ((JOIN_CACHE_BKA *) rseq)->set_pos((uchar *) pos);
+}
+
+uint JOIN_CACHE_BKA::get_next_key(uchar ** key)
+{
+  uint len;                           /* length of the key returned in *key */
+  uint32 rec_len;
+  uchar *init_pos;
+  uchar *prev_rec_pos;
+  JOIN_CACHE *cache;
+  CACHE_FIELD *copy= field;
+  bool last_record= record_nr == ptr_record;
+  
+  if (record_nr == records)
+    return 0;                         /* no more records, hence no more keys */
+  record_nr++;
+   
+  pos= prev_rec_end;                  /* continue after the previous record */
+  rec_len= uint4korr(pos);            /* each record starts with its length */
+  pos+= 4;
+  init_pos= pos;
+
+  if (prev_cache)
+  {
+    if (global_key_arg_fields)
+      memcpy(&prev_rec_pos, pos, sizeof(uchar *));
+    pos+= sizeof(uchar *);            /* skip the link to the previous cache */
+  }
+
+  rec_pos= pos;
+
+  copy+= read_prefix_addon_fields();
+ 
+  if (global_key_arg_fields)          /* key arguments kept in other caches */
+  {
+    uint arg_key_count= global_key_arg_fields;
+    CACHE_FIELD **copy_ptr= blob_ptr-arg_key_count;
+    for (cache= prev_cache; ; cache= cache->prev_cache)
+    { 
+      uint len= 0;                    /* shadows the outer 'len' on purpose? */
+      while (!cache->get_ext_key_arg_fields())
       {
-	copy->blob_field->set_ptr(pos, pos+copy->length);
-	pos+=copy->length+copy->blob_field->get_length();
+        memcpy(&prev_rec_pos, prev_rec_pos-sizeof(uchar *), sizeof(uchar *));
+        cache= cache->prev_cache;
       }
+      while (arg_key_count && 
+             cache->read_ext_key_arg_field(*copy_ptr, prev_rec_pos, 
+                                           FALSE, &len)) // now FALSE is a stub
+      {
+        copy_ptr++;
+        --arg_key_count;
+      }
+      if (!arg_key_count)
+        break;                        /* all external arguments were read */
+      memcpy(&prev_rec_pos, prev_rec_pos-sizeof(uchar *), sizeof(uchar *));
     }
-    else
+  }
+      
+  if (use_emb_key)                    /* the key is taken from the cache */
+  {
+    *key= pos;
+    len= emb_key_length;
+  }
+  else
+  {
+    cache= this;                      /* NOTE(review): not read afterwards */
+    
+    for (uint i= 0; i < local_key_arg_fields; i++, copy++)
+      read_record_field(copy, last_record);
+    
+    TABLE_REF *ref= &join_tab->ref;
+    cp_buffer_from_ref(join->thd, join_tab->table, ref);
+    *key= ref->key_buff;
+    len= ref->key_length;
+  }
+
+  prev_rec_end= init_pos+rec_len;     /* where the next record begins */
+
+  return len;
+}  
+
+enum_nested_loop_state
+JOIN_CACHE_BKA::join_matching_records(bool skip_last)
+{
+  bool semi_join_flag= mrr_mode == HA_MRR_SEMI_JOIN;
+  handler *file= join_tab->table->file;
+  enum_nested_loop_state rc= NESTED_LOOP_OK;
+  int error;
+  uchar *rec_ptr;                     /* filled by multi_range_read_next() */
+
+  join_tab->table->null_row= 0;
+  if (!records)
+    return NESTED_LOOP_OK;                      /* Nothing to do */
+
+  DBUG_ASSERT(!skip_last);
+  DBUG_ASSERT(join_tab->use_quick != 2);
+
+  RANGE_SEQ_IF seq_funcs= {bka_range_seq_init, bka_range_seq_next,
+                           bka_save_current_pos, bka_restore_current_pos};
+  init_mrr_buff();                    /* buffer spans end_pos .. end */
+  if (!file->inited)
+    file->ha_index_init(join_tab->ref.key, 1);
+  error= file->multi_range_read_init(&seq_funcs, (void*)this, records,
+                                     mrr_mode, &mrr_buff); /* NOTE(review): result is overwritten below unchecked */
+
+  for (JOIN_TAB *tmp=join->join_tab; tmp != join_tab ; tmp++)
+  {
+    tmp->status=tmp->table->status;   /* save the status of preceding tables */
+    tmp->table->status=0;
+  }
+
+  while (!(error= file->multi_range_read_next((char **) &rec_ptr)))
+  {
+    if (join->thd->killed)
     {
-      if (copy->strip)
+      join->thd->send_kill_message();
+      return NESTED_LOOP_KILLED; // Aborted by user /* purecov: inspected */
+    }
+    if (rc == NESTED_LOOP_OK)         /* otherwise rows are only drained */
+    {
+      if (!semi_join_flag || !get_match_flag_by_pos(rec_ptr))
       {
-        length= uint2korr(pos);
-	memcpy(copy->str, pos+2, length);
-	memset(copy->str+length, ' ', copy->length-length);
-	pos+= 2 + length;
+        get_record_by_pos(rec_ptr);   /* restore the cached record fields */
+        
+        if (check_match(rec_ptr))
+        {    
+          int res= 0;
+          if (semi_join_flag)
+             set_match_flag(1);
+          if (!join_tab->check_weed_out_table || 
+              !(res= do_sj_dups_weedout(join->thd,
+                                        join_tab->check_weed_out_table)))
+          {
+            rc= (join_tab->next_select)(join, join_tab+1, 0);
+            if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
+            {
+              reset(TRUE);
+              return rc;
+            }
+          }
+          if (res == -1)
+            return NESTED_LOOP_ERROR;
+        }
       }
-      else
+    }
+  }
+
+  if (error > 0 && error != HA_ERR_END_OF_FILE)	    // Fatal error
+    return NESTED_LOOP_ERROR;                   /* purecov: inspected */
+  for (JOIN_TAB *tmp2=join->join_tab; tmp2 != join_tab ; tmp2++)
+    tmp2->table->status= tmp2->status;  /* put the saved statuses back */
+  return rc;
+}
+
+
+/* Join the cached records with a null-complemented row of join_tab's table */
+enum_nested_loop_state JOIN_CACHE::join_null_records(bool skip_last)
+{
+  enum_nested_loop_state rc= NESTED_LOOP_OK;
+  bool is_first_inner= join_tab == join_tab->first_unmatched;
+  bool is_last_inner= join_tab == join_tab->first_unmatched->last_inner; 
+  uint count= records - (is_key_access() ? 0 : test(skip_last));
+  DBUG_ASSERT(join_tab->first_inner);
+  for ( ; count; count--)             /* one iteration per cached record */
+  { 
+    if (join->thd->killed)
+    {
+      join->thd->send_kill_message();
+      return NESTED_LOOP_KILLED; // Aborted by user /* purecov: inspected */
+    }
+    if (!is_first_inner || !skip_record_if_match())
+    {
+      get_record();
+      /* The outer row is complemented by nulls for each inner table */
+      restore_record(join_tab->table, s->default_values);
+      mark_as_null_row(join_tab->table);  
+      /* Check all attached conditions for inner table rows. */
+      if (join_tab->select && join_tab->select->skip_record())
+        continue;
+      if (is_last_inner)
+      { 
+        JOIN_TAB *first_upper= join_tab->first_unmatched->first_upper;
+        while (first_upper)
+        {
+          if (!set_match_flag_if_none(first_upper, get_record_pos()))
+            break;
+          for (JOIN_TAB* tab= first_upper; tab <= join_tab; tab++)
+          {
+            if (tab->select && tab->select->skip_record())
+              goto next_record;       /* rejected: go on with the next record */
+          }
+          first_upper= first_upper->first_upper;
+        }
+      }
+      rc= (*join_tab->next_select)(join, join_tab+1, 0);
+      if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
       {
-	memcpy(copy->str,pos,copy->length);
-	pos+=copy->length;
+        reset(TRUE);
+        return rc;
       }
     }
+   next_record: ;  /* kept at the loop end so that count-- still runs */
   }
-  tab->cache.pos=pos;
-  return;
+  return rc;
 }
 
 
@@ -16631,7 +17488,6 @@ cmp_buffer_with_ref(JOIN_TAB *tab)
   return memcmp(tab->ref.key_buff2, tab->ref.key_buff, tab->ref.key_length)
     != 0;
 }
-
 
 bool
 cp_buffer_from_ref(THD *thd, TABLE *table, TABLE_REF *ref)
diff -Nrup a/sql/sql_select.h b/sql/sql_select.h
--- a/sql/sql_select.h	2007-12-21 20:27:47 +01:00
+++ b/sql/sql_select.h	2008-03-04 09:39:27 +01:00
@@ -105,58 +105,12 @@ typedef struct st_table_ref
 
 
 /**
-  CACHE_FIELD and JOIN_CACHE is used on full join to cache records in outer
-  table
-*/
-
-typedef struct st_cache_field {
-  /* 
-    Where source data is located (i.e. this points to somewhere in 
-    tableX->record[0])
-  */
-  uchar *str;
-  uint length; /* Length of data at *str, in bytes */
-  uint blob_length; /* Valid IFF blob_field != 0 */
-  Field_blob *blob_field;
-  bool strip; /* TRUE <=> Strip endspaces ?? */
-
-  TABLE *get_rowid; /* _ != NULL <=> */
-} CACHE_FIELD;
-
-
-typedef struct st_join_cache 
-{
-  uchar *buff;
-  uchar *pos;    /* Start of free space in the buffer */
-  uchar *end;
-  uint records;  /* # of row cominations currently stored in the cache */
-  uint record_nr;
-  uint ptr_record; 
-  /* 
-    Number of fields (i.e. cache_field objects). Those correspond to table
-    columns, and there are also special fields for
-     - table's column null bits
-     - table's null-complementation byte
-     - [new] table's rowid.
-  */
-  uint fields; 
-  uint length; 
-  uint blobs;
-  CACHE_FIELD *field;
-  CACHE_FIELD **blob_ptr;
-  SQL_SELECT *select;
-} JOIN_CACHE;
-
-
-/*
   The structs which holds the join connections and join states
 */
 enum join_type { JT_UNKNOWN,JT_SYSTEM,JT_CONST,JT_EQ_REF,JT_REF,JT_MAYBE_REF,
 		 JT_ALL, JT_RANGE, JT_NEXT, JT_FT, JT_REF_OR_NULL,
 		 JT_UNIQUE_SUBQUERY, JT_INDEX_SUBQUERY, JT_INDEX_MERGE};
 
-class JOIN;
-
 enum enum_nested_loop_state
 {
   NESTED_LOOP_KILLED= -2, NESTED_LOOP_ERROR= -1,
@@ -171,6 +125,7 @@ enum enum_nested_loop_state
 #define TAB_INFO_USING_WHERE 4
 #define TAB_INFO_FULL_SCAN_ON_NULL 8
 
+class JOIN_CACHE;
 class SJ_TMP_TABLE;
 
 typedef enum_nested_loop_state
@@ -256,7 +211,9 @@ typedef struct st_join_table {
   */ 
   ha_rows       limit; 
   TABLE_REF	ref;
-  JOIN_CACHE	cache;
+  bool          use_join_cache;
+  JOIN_CACHE	*cache;
+  SQL_SELECT    *cache_select;
   JOIN		*join;
   /** Bitmap of nested joins this table is part of */
 
@@ -309,7 +266,311 @@ typedef struct st_join_table {
             (select->quick->get_type() ==
              QUICK_SELECT_I::QS_TYPE_GROUP_MIN_MAX));
   }
+  inline bool is_single_inner_of_semi_join_with_first_match()
+  { /* nest holds exactly this table's map; do_firstmatch is the tab before */
+    return emb_sj_nest && 
+           emb_sj_nest->sj_inner_tables == table->map &&
+	   do_firstmatch == this - 1;
+  }
+  inline bool is_single_inner_of_outer_join()
+  { /* TRUE when first_inner is set and is also the last inner table */
+    return first_inner && first_inner == last_inner;
+  } 
 } JOIN_TAB;
+
+
+/*
+  CACHE_FIELD and JOIN_CACHE are used on full join to cache records of the
+  outer table
+*/
+
+typedef struct st_cache_field {
+  /* 
+    Where source data is located (i.e. this points to somewhere in 
+    tableX->record[0])
+  */
+  uchar *str;
+  uint length; /* Length of data at *str, in bytes */
+  uint blob_length; /* Valid IFF blob_field != 0 */
+  Field_blob *blob_field; /* != 0 <=> the field is handled as a blob */
+  bool strip; /* TRUE <=> trailing spaces are stripped in the cache */
+  uint ext_key_arg_no; /* used to build keys for other BKA join caches */
+  uint offset; /* offset of the value in the record data, 0 if it is NULL */
+  Field *field;     /* for some cache fields may be null */
+
+  TABLE *get_rowid; /* _ != NULL <=> */
+} CACHE_FIELD;
+
+class JOIN_CACHE :public Sql_alloc
+{
+protected:
+  CACHE_FIELD *field;    /* descriptors of the cached fields ('fields' items) */
+  CACHE_FIELD **blob_ptr;
+  SQL_SELECT *select;
+  /* 
+    Number of fields (i.e. cache_field objects). Those correspond to table
+    columns, and there are also special fields for
+     - table's column null bits
+     - table's null-complementation byte
+     - [new] table's rowid.
+  */
+  uint tables;
+  uint fields;           /* number of elements in the 'field' array */
+  uint blobs;
+  uint prefix_addon_fields;  /* leading fields: match flag, null bits, ... */
+  uint ext_key_arg_fields;   /* fields added by register_ext_key_arg_field */
+  uint length;           
+  uint records;          /* number of records written into the cache */
+  uint record_nr;        /* number of records read from the cache so far */
+  uint ptr_record;       /* number of the record handled as 'last_record' */
+  uchar *buff;           /* beginning of the cache buffer                */
+  uchar *end;            /* end of the cache buffer                      */
+  uchar *pos;            /* current position of free space in the buffer */
+  uchar *end_pos;        /* end of the free space in the buffer          */
+  JOIN *join;
+  uint write_record_data(bool *is_full);
+  uint read_prefix_addon_fields();
+  uint read_record_field(CACHE_FIELD *copy, bool last_record);
+  uint read_record_data(); 
+  virtual uint pack_length()  /* 'length' plus the link to the previous cache */
+  { return length + (prev_cache ? sizeof(uchar *) : 0); }
+  virtual uint addon_length() /* extra space needed per record; none here */
+  { return 0; }
+  uchar *rec_pos;        /* start of the data of the current record */
+  uchar *prev_rec_end;   /* where get_next_key() starts its next read */
+public:
+  JOIN_TAB *join_tab;
+  JOIN_CACHE *prev_cache;  /* cache whose records the records here link to */
+  JOIN_CACHE *next_cache;  /* set by the constructor of the following cache */
+  bool use_match_flag;
+  uint mrr_mode;
+  virtual ~JOIN_CACHE() {}
+  void reset_join(JOIN *j) { join= j; }
+  uchar *get_pos() { return pos; }
+  void set_pos(uchar *arg) { pos= arg; }
+  uint get_record_nr() { return record_nr; }
+  void set_record_nr(uint arg) { record_nr= arg; }
+  bool with_length_prepanded()  /* TRUE <=> records start with their length */
+  { return is_key_access() || use_match_flag || ext_key_arg_fields; }
+  ulong used_blob_length();
+  void free()                   /* release the cache buffer */
+  { 
+    x_free(buff);
+    buff= 0;
+  }
+  void register_ext_key_arg_field(CACHE_FIELD *copy)
+  {
+    copy->ext_key_arg_no= ++ext_key_arg_fields;  /* numbering starts at 1 */
+    length+= sizeof(uint32);    /* room for the offset stored per record */
+  }
+  uint get_ext_key_arg_fields()
+  { 
+    return ext_key_arg_fields;
+  }
+  bool read_ext_key_arg_field(CACHE_FIELD *copy, uchar *rec_ptr, 
+                              bool last_record, uint *len);
+  virtual bool is_key_access() { return FALSE; }
+  virtual int init();
+  virtual void reset(bool is_for_write);    
+  virtual bool put_record()= 0;
+  virtual bool get_record()= 0;
+  virtual void get_record_by_pos(uchar *rec_ptr)
+  { /* read the record at rec_ptr, then restore pos and record_nr */
+    uchar *save_pos= pos;
+    uint save_record_nr= record_nr;
+    record_nr= 0;  /* presumably to pass the end check in read_record_data */
+    if (prev_cache)
+    {
+      uchar *prev_rec_pos;
+      memcpy(&prev_rec_pos, rec_ptr-sizeof(uchar *), sizeof(uchar *));
+      prev_cache->get_record_by_pos(prev_rec_pos);  /* linked record first */
+    }
+    rec_pos= pos= rec_ptr;
+    read_record_data();
+    pos= save_pos;
+    record_nr= save_record_nr;
+  }
+  enum_nested_loop_state join_records(bool skip_last);
+  bool check_match(uchar *rec_ptr);
+  virtual enum_nested_loop_state join_matching_records(bool skip_last)= 0;
+  virtual enum_nested_loop_state join_null_records(bool skip_last);
+  virtual uchar *get_record_pos()
+  {
+    return rec_pos;
+  }
+  
+  void set_match_flag(bool set) { (*get_record_pos())= (uchar)set; }
+  bool set_match_flag_if_none(JOIN_TAB *first_inner, uchar *rec_ptr);
+  bool skip_record_if_match()   /* step over the record at pos if flagged */
+  {
+    DBUG_ASSERT(use_match_flag && with_length_prepanded());
+    uint offset= sizeof(uint32) + (prev_cache ? sizeof(uchar *) : 0);
+    if (test(*(pos+offset)))    /* flag follows the length and the link */
+    {
+      pos+= sizeof(uint32) + uint4korr(pos);  /* jump to the next record */
+      return TRUE;
+    }
+    return FALSE;
+  }      
+    
+};
+
+class JOIN_CACHE_BNL :public JOIN_CACHE
+{
+public:
+  JOIN_CACHE_BNL(JOIN *j, JOIN_TAB *tab)  /* cache not linked to another one */
+  { 
+    join= j;
+    join_tab= tab;
+    prev_cache= next_cache= 0;
+  }
+  JOIN_CACHE_BNL(JOIN *j, JOIN_TAB *tab, JOIN_CACHE *prev)
+  { 
+    join= j;
+    join_tab= tab;
+    prev_cache= prev;
+    next_cache= 0;
+    if (prev)
+      prev->next_cache= this;         /* link both ways */
+  }
+  bool put_record()                   /* returns TRUE when the cache is full */
+  { 
+    bool is_full;
+    write_record_data(&is_full);
+    return is_full;
+  }
+  bool get_record()                   /* read the record stored at pos */
+  { 
+    bool res;
+    if (with_length_prepanded())
+      pos+= sizeof(uint32);           /* skip the record length */
+    if (prev_cache)
+    {
+      uchar *prev_rec_pos;
+      memcpy(&prev_rec_pos, pos, sizeof(uchar *));
+      pos+= sizeof(uchar *);
+      prev_cache->get_record_by_pos(prev_rec_pos);  /* read the linked record */
+    }
+    rec_pos= pos;
+    res= read_record_data() == 0;     /* TRUE <=> no bytes were read */
+    if (ext_key_arg_fields)
+      pos+= sizeof(uint32)*ext_key_arg_fields;  /* skip the stored offsets */
+    return res; 
+  }
+  enum_nested_loop_state join_matching_records(bool skip_last);
+};
+
+class JOIN_CACHE_BKA :public JOIN_CACHE
+{
+protected:
+  uint local_key_arg_fields;   /* NOTE(review): not set by the constructors */
+  uint global_key_arg_fields;  /* NOTE(review): not set by the constructors */
+  HANDLER_BUFFER mrr_buff;     /* filled in by init_mrr_buff() */
+public:
+  bool use_emb_key;            /* TRUE <=> get_next_key() returns cache data */
+  uint emb_key_length;
+  
+  JOIN_CACHE_BKA(JOIN *j, JOIN_TAB *tab)  /* cache not linked to another one */
+  { 
+    join= j;
+    join_tab= tab;
+    prev_cache= next_cache= 0;
+  }
+  JOIN_CACHE_BKA(JOIN *j, JOIN_TAB *tab, JOIN_CACHE* prev)
+  { 
+    join= j;
+    join_tab= tab;
+    prev_cache= prev;
+    next_cache= 0;
+    if (prev)
+      prev->next_cache= this;         /* link both ways */
+  }
+  uint pack_length()                  /* adds the 4-byte record length */
+  { return JOIN_CACHE::pack_length() + sizeof(uint32); }
+  uint addon_length()                 /* space set aside per stored record */
+  { 
+    TABLE_REF *ref= &join_tab->ref;
+    TABLE *tab= join_tab->table;
+    uint len= 0;
+    if (end_pos == end)               /* nothing has been set aside yet */
+      len+= ref->key_length + tab->file->ref_length;
+    uint rec_per_key= tab->key_info[ref->key].rec_per_key[ref->key_parts-1];
+    set_if_bigger(rec_per_key, 1);
+    len+= tab->file->stats.mrr_length_per_rec * rec_per_key;
+    return len;
+  }
+
+  bool put_record()                   /* returns TRUE when the cache is full */
+  { 
+    bool is_full;
+    write_record_data(&is_full);
+    if (!is_full)
+    {
+      end_pos-= addon_length();       /* shrink the space left for records */
+      DBUG_ASSERT(end_pos >= pos);
+    }
+    else
+      end_pos= pos;                   /* the rest is left to init_mrr_buff() */
+    return is_full;
+  }
+  bool get_record()                   /* read the record stored at pos */
+  { 
+    bool res;
+    pos+= sizeof(uint32);             /* skip the record length */
+    if (prev_cache)
+    {
+      uchar *prev_rec_pos;
+      memcpy(&prev_rec_pos, pos, sizeof(uchar *));
+      pos+= sizeof(uchar *);
+      prev_cache->get_record_by_pos(prev_rec_pos);  /* read the linked record */
+    }    
+    rec_pos= pos;
+    res= read_record_data() == 0;     /* TRUE <=> no bytes were read */
+    if (ext_key_arg_fields)
+      pos+= sizeof(uint32)*ext_key_arg_fields;  /* skip the stored offsets */
+    return res; 
+  }
+  bool is_key_access() { return TRUE; }
+  int init()
+  {
+    int res= JOIN_CACHE::init();
+    return res;
+  }
+  void incr_key_arg_fields(uint d, bool is_local_cnt)
+  {
+    if (is_local_cnt)
+      local_key_arg_fields+= d;
+    else
+      global_key_arg_fields+= d;
+  }
+  uint get_key_arg_fields(bool is_local_cnt)
+  {
+    return is_local_cnt ? local_key_arg_fields : global_key_arg_fields;
+  }
+  void set_key_arg_fields(bool is_local_cnt, uint cnt)
+  {
+    if (is_local_cnt) 
+      local_key_arg_fields= cnt;
+    else
+      global_key_arg_fields= cnt;
+  }
+  bool check_emb_key_usage();
+  virtual void reset(bool is_for_write)
+  {
+    JOIN_CACHE::reset(is_for_write);
+    if (is_for_write)
+      end_pos= end;                   /* the whole buffer is free again */
+  }
+  virtual uint get_next_key(uchar **key);    
+  virtual void init_mrr_buff()        /* the MRR buffer spans end_pos .. end */
+  {
+    mrr_buff.buffer= end_pos;
+    mrr_buff.buffer_end= end;
+  }
+  virtual bool get_match_flag_by_pos(uchar *rec_pos)
+  { return test(*rec_pos); }          /* the flag is the first record byte */
+  enum_nested_loop_state join_matching_records(bool skip_last);
+};
 
 enum_nested_loop_state sub_select_cache(JOIN *join, JOIN_TAB *join_tab, bool
                                         end_of_records);
diff -Nrup a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
--- a/storage/innobase/handler/ha_innodb.cc	2007-12-14 06:55:39 +01:00
+++ b/storage/innobase/handler/ha_innodb.cc	2008-03-04 09:39:27 +01:00
@@ -5760,6 +5760,7 @@ ha_innobase::info(
 					* UNIV_PAGE_SIZE;
 		stats.delete_length = 0;
 		stats.check_time = 0;
+	        stats.mrr_length_per_rec= ref_length + sizeof(void*);
 
 		if (stats.records == 0) {
 			stats.mean_rec_length = 0;
diff -Nrup a/storage/myisam/ha_myisam.cc b/storage/myisam/ha_myisam.cc
--- a/storage/myisam/ha_myisam.cc	2008-02-02 13:19:48 +01:00
+++ b/storage/myisam/ha_myisam.cc	2008-03-04 09:39:27 +01:00
@@ -1644,6 +1644,8 @@ int ha_myisam::info(uint flag)
     stats.max_data_file_length=  misam_info.max_data_file_length;
     stats.max_index_file_length= misam_info.max_index_file_length;
     stats.create_time= misam_info.create_time;
+    stats.mrr_length_per_rec= misam_info.reflength + sizeof(void*);
+
     ref_length= misam_info.reflength;
     share->db_options_in_use= misam_info.options;
     stats.block_size= myisam_block_size;        /* record block size */
Thread
bk commit into 6.0 tree (knielsen:1.2550) WL#2771knielsen4 Mar