List:Commits« Previous MessageNext Message »
From:Sergey Petrunia Date:March 18 2006 2:06pm
Subject:bk commit - 5.2 tree (sergefp:1.2154) WL#2474
View as plain text  
# This is a BitKeeper generated diff -Nru style patch.
#
# ChangeSet
#   2006/03/18 14:45:40+03:00 sergefp@stripped 
#   WL#2474 Multi Range Read: Changed the default MRR implementation to implement new MRR interface
# 
# sql/sql_select.cc
#   2006/03/18 14:44:57+03:00 sergefp@stripped +9 -16
#   WL#2474 Multi Range Read: Changed the default MRR implementation to implement new MRR interface
# 
# sql/opt_range.h
#   2006/03/18 14:44:57+03:00 sergefp@stripped +20 -1
#   WL#2474 Multi Range Read: Changed the range optimizer to use new MRR interface
# 
# sql/opt_range.cc
#   2006/03/18 14:44:56+03:00 sergefp@stripped +480 -374
#   WL#2474 Multi Range Read: Changed the range optimizer to use new MRR interface
# 
# sql/handler.h
#   2006/03/18 14:44:55+03:00 sergefp@stripped +125 -9
#   WL#2474 Multi Range Read: Changed the default MRR implementation to implement new MRR interface
# 
# sql/handler.cc
#   2006/03/18 14:44:55+03:00 sergefp@stripped +252 -70
#   WL#2474 Multi Range Read: Changed the default MRR implementation to implement new MRR interface
# 
# sql/ha_ndbcluster.h
#   2006/03/18 14:44:54+03:00 sergefp@stripped +10 -7
#   WL#2474 Multi Range Read: Changed MRR/NDB to provide the new interface, albeit with a help of interface non-compliant
#    mrr_persistent_flag_storage() function.
# 
# sql/ha_ndbcluster.cc
#   2006/03/18 14:44:53+03:00 sergefp@stripped +118 -107
#   WL#2474 Multi Range Read: Changed MRR/NDB to provide the new interface, albeit with a help of interface non-compliant
#    mrr_persistent_flag_storage() function.
# 
# include/my_base.h
#   2006/03/18 14:44:52+03:00 sergefp@stripped +26 -0
#   WL#2474 Multi Range Read: added comments
# 
diff -Nru a/include/my_base.h b/include/my_base.h
--- a/include/my_base.h	2006-03-18 14:54:11 +03:00
+++ b/include/my_base.h	2006-03-18 14:54:11 +03:00
@@ -439,14 +439,40 @@
 
 /* For key ranges */
 
+/* from - inf */
 #define NO_MIN_RANGE	1
+
+/* to +inf */
 #define NO_MAX_RANGE	2
+
+/*  X < key (i.e. not X <= key */
 #define NEAR_MIN	4
+
+/* X > key (i.e. not X>= key */
 #define NEAR_MAX	8
+
+/* 
+  This flag means that index is a unique index, and the interval is 
+  equivalent to "AND(keypart_i = const_i)", where all of const_i are not NULLs.
+*/
 #define UNIQUE_RANGE	16
+
+/* 
+  This flag means that the interval is equivalent to 
+  "AND(keypart_i = const_i)", where not all key parts may be used but all of 
+  const_i are not NULLs.
+*/
 #define EQ_RANGE	32
+
+/*
+  This flag has the same meaning as UNIQUE_RANGE, except that for at least
+  one keypart the condition is "keypart IS NULL". 
+*/
 #define NULL_RANGE	64
+
 #define GEOM_FLAG      128
+
+/* Used only in NDB, at row retrieval stage */
 #define SKIP_RANGE     256
 
 typedef struct st_key_range
diff -Nru a/sql/ha_ndbcluster.cc b/sql/ha_ndbcluster.cc
--- a/sql/ha_ndbcluster.cc	2006-03-18 14:54:11 +03:00
+++ b/sql/ha_ndbcluster.cc	2006-03-18 14:54:11 +03:00
@@ -7017,45 +7017,71 @@
   DBUG_RETURN(error);
 }
 
-int
-ha_ndbcluster::read_multi_range_first(KEY_MULTI_RANGE **found_range_p,
-                                      KEY_MULTI_RANGE *ranges, 
-                                      uint range_count,
-                                      bool sorted, 
+/****************************************************************************
+ * MRR interface implementation
+ ***************************************************************************/
+
+#if 0
+#define DBUG_MULTI_RANGE(x) DBUG_PRINT("info", ("multi_range_read_next: case %d\n", x));
+#else
+#define DBUG_MULTI_RANGE(x)
+#endif
+
+//new
+int ha_ndbcluster::multi_range_read_init(RANGE_SEQ_IF *seq_funcs, 
+                                         void *seq_init_param,
+                                         uint n_ranges, uint mode,
                                       HANDLER_BUFFER *buffer)
 {
-  DBUG_ENTER("ha_ndbcluster::read_multi_range_first");
-  m_write_op= FALSE;
-  
   int res;
-  KEY* key_info= table->key_info + active_index;
-  NDB_INDEX_TYPE index_type= get_index_type(active_index);
-  ulong reclength= table_share->reclength;
-  NdbOperation* op;
+  DBUG_ENTER("ha_ndbcluster::multi_range_read_init");
+  m_write_op= FALSE;
 
-  if (uses_blob_value())
+  if (uses_blob_value() || (seq_funcs->init != quick_range_seq_init))
   {
     /**
      * blobs can't be batched currently
      */
     m_disable_multi_read= TRUE;
-    DBUG_RETURN(handler::read_multi_range_first(found_range_p, 
-                                                ranges, 
-                                                range_count,
-                                                sorted, 
-                                                buffer));
+    DBUG_RETURN(handler::multi_range_read_init(seq_funcs, seq_init_param,
+                                               n_ranges, mode, buffer));
   }
-
   m_disable_multi_read= FALSE;
 
-  /**
-   * Copy arguments into member variables
-   */
-  m_multi_ranges= ranges;
-  multi_range_curr= ranges;
-  multi_range_end= ranges+range_count;
-  multi_range_sorted= sorted;
+  mrr_is_output_sorted= test(mode & HA_MRR_SORTED);
   multi_range_buffer= buffer;
+  mrr_funcs= *seq_funcs;
+  mrr_iter= mrr_funcs.init(seq_init_param, n_ranges, mode);
+  ranges_in_seq= n_ranges;
+
+  res= multi_range_start_retrievals(-1);
+  if (first_unstarted_range == n_ranges)
+  {
+    /**
+     * Mark that we're using entire buffer (even if might not) as
+     *   we haven't read all ranges for some reason
+     * This as we don't want mysqld to reuse the buffer when we read
+     *   the remaining ranges
+     */
+    buffer->end_of_used_area= (byte*)multi_range_buffer->buffer_end;
+  }
+  else
+  {
+    /* Using all buffer */
+  }
+  
+  DBUG_RETURN(res);
+}
+
+
+int ha_ndbcluster::multi_range_start_retrievals(int starting_range)
+{
+  int res, range_res;
+  KEY* key_info= table->key_info + active_index;
+  NDB_INDEX_TYPE index_type= get_index_type(active_index);
+  ulong reclength= table_share->reclength;
+  NdbOperation* op;
+  DBUG_ENTER("multi_range_start_retrievals");
 
   /**
    * read multi range will read ranges as follows (if not ordered)
@@ -7073,8 +7099,8 @@
   /**
    * Variables for loop
    */
-  byte *curr= (byte*)buffer->buffer;
-  byte *end_of_buffer= (byte*)buffer->buffer_end;
+  byte *curr= (byte*)multi_range_buffer->buffer;
+  byte *end_of_buffer= (byte*)multi_range_buffer->buffer_end;
   NdbOperation::LockMode lm= 
     (NdbOperation::LockMode)get_ndb_lock_type(m_lock.type);
   const NDBTAB *tab= (const NDBTAB *) m_table;
@@ -7082,21 +7108,20 @@
   const NDBINDEX *idx= (NDBINDEX *) m_index[active_index].index; 
   const NdbOperation* lastOp= m_active_trans->getLastDefinedOperation();
   NdbIndexScanOperation* scanOp= 0;
-  for (; multi_range_curr<multi_range_end && curr+reclength <= end_of_buffer; 
-       multi_range_curr++)
+
+  int range_no= -1;
+  int mrr_range_no= starting_range;
+
+  while (!(range_res= mrr_funcs.next(mrr_iter, &mrr_cur_range)) &&
+         curr+reclength <= end_of_buffer)
   {
+    range_no++;
+    mrr_range_no++;
     part_id_range part_spec;
     if (m_use_partition_function)
     {
-      get_partition_set(table, curr, active_index,
-                        &multi_range_curr->start_key,
+      get_partition_set(table, curr, active_index, &mrr_cur_range.start_key,
                         &part_spec);
-      DBUG_PRINT("info", ("part_spec.start_part = %u, part_spec.end_part = %u",
-                          part_spec.start_part, part_spec.end_part));
-      /*
-        If partition pruning has found no partition in set
-        we can skip this scan
-      */
       if (part_spec.start_part > part_spec.end_part)
       {
         /*
@@ -7104,22 +7129,23 @@
           partition
         */
         curr += reclength;
-        multi_range_curr->range_flag |= SKIP_RANGE;
+        mrr_persistent_flag_storage(mrr_iter, mrr_range_no) |= SKIP_RANGE;
         continue;
       }
     }
+    
     switch(index_type){
     case PRIMARY_KEY_ORDERED_INDEX:
-      if (!(multi_range_curr->start_key.length == key_info->key_length &&
-          multi_range_curr->start_key.flag == HA_READ_KEY_EXACT))
+      if (!(mrr_cur_range.start_key.length == key_info->key_length &&
+            mrr_cur_range.start_key.flag == HA_READ_KEY_EXACT))
         goto range;
       // else fall through
     case PRIMARY_KEY_INDEX:
     {
-      multi_range_curr->range_flag |= UNIQUE_RANGE;
+      mrr_persistent_flag_storage(mrr_iter, mrr_range_no) |= UNIQUE_RANGE;
       if ((op= m_active_trans->getNdbOperation(tab)) && 
           !op->readTuple(lm) && 
-          !set_primary_key(op, multi_range_curr->start_key.key) &&
+          !set_primary_key(op, mrr_cur_range.start_key.key) &&
           !define_read_attrs(curr, op) &&
           (op->setAbortOption(AO_IgnoreError), TRUE) &&
           (!m_use_partition_function ||
@@ -7129,20 +7155,23 @@
         ERR_RETURN(op ? op->getNdbError() : m_active_trans->getNdbError());
       break;
     }
-    break;
     case UNIQUE_ORDERED_INDEX:
-      if (!(multi_range_curr->start_key.length == key_info->key_length &&
-          multi_range_curr->start_key.flag == HA_READ_KEY_EXACT &&
-          !check_null_in_key(key_info, multi_range_curr->start_key.key,
-                             multi_range_curr->start_key.length)))
+    {
+      if (!(mrr_cur_range.start_key.length == key_info->key_length &&
+            mrr_cur_range.start_key.flag == HA_READ_KEY_EXACT &&
+            !check_null_in_key(key_info, mrr_cur_range.start_key.key,
+                               mrr_cur_range.start_key.length)))
+      {
         goto range;
+      }
       // else fall through
+    }
     case UNIQUE_INDEX:
     {
-      multi_range_curr->range_flag |= UNIQUE_RANGE;
+      mrr_persistent_flag_storage(mrr_iter, mrr_range_no) |= UNIQUE_RANGE;
       if ((op= m_active_trans->getNdbIndexOperation(unique_idx, tab)) && 
           !op->readTuple(lm) && 
-          !set_index_key(op, key_info, multi_range_curr->start_key.key) &&
+          !set_index_key(op, key_info, mrr_cur_range.start_key.key) &&
           !define_read_attrs(curr, op) &&
           (op->setAbortOption(AO_IgnoreError), TRUE))
         curr += reclength;
@@ -7150,15 +7179,16 @@
         ERR_RETURN(op ? op->getNdbError() : m_active_trans->getNdbError());
       break;
     }
-    case ORDERED_INDEX: {
+    case ORDERED_INDEX:
+    {
   range:
-      multi_range_curr->range_flag &= ~(uint)UNIQUE_RANGE;
+      mrr_persistent_flag_storage(mrr_iter, mrr_range_no) &= ~(uint16)UNIQUE_RANGE;
       if (scanOp == 0)
       {
         if (m_multi_cursor)
         {
           scanOp= m_multi_cursor;
-          DBUG_ASSERT(scanOp->getSorted() == sorted);
+          DBUG_ASSERT(scanOp->getSorted() == mrr_is_output_sorted);
           DBUG_ASSERT(scanOp->getLockMode() == 
                       (NdbOperation::LockMode)get_ndb_lock_type(m_lock.type));
           if (scanOp->reset_bounds(m_force_send))
@@ -7167,7 +7197,8 @@
           end_of_buffer -= reclength;
         }
         else if ((scanOp= m_active_trans->getNdbIndexScanOperation(idx, tab)) 
-                 &&!scanOp->readTuples(lm, 0, parallelism, sorted, FALSE, TRUE)
+                 &&!scanOp->readTuples(lm, 0, parallelism,  
+                                       mrr_is_output_sorted, FALSE, TRUE)
                  &&!generate_scan_filter(m_cond_stack, scanOp)
                  &&!define_read_attrs(end_of_buffer-reclength, scanOp))
         {
@@ -7181,10 +7212,9 @@
         }
       }
 
-      const key_range *keys[2]= { &multi_range_curr->start_key, 
-                                  &multi_range_curr->end_key };
-      if ((res= set_bounds(scanOp, active_index, false, keys,
-                           multi_range_curr-ranges)))
+      const key_range *keys[2]= { &mrr_cur_range.start_key, 
+                                  &mrr_cur_range.end_key };
+      if ((res= set_bounds(scanOp, active_index, false, keys, range_no)))
         DBUG_RETURN(res);
       break;
     }
@@ -7195,21 +7225,6 @@
     }
   }
   
-  if (multi_range_curr != multi_range_end)
-  {
-    /**
-     * Mark that we're using entire buffer (even if might not) as
-     *   we haven't read all ranges for some reason
-     * This as we don't want mysqld to reuse the buffer when we read
-     *   the remaining ranges
-     */
-    buffer->end_of_used_area= (byte*)buffer->buffer_end;
-  }
-  else
-  {
-    buffer->end_of_used_area= curr;
-  }
-  
   /**
    * Set first operation in multi range
    */
@@ -7217,40 +7232,36 @@
     lastOp ? lastOp->next() : m_active_trans->getFirstDefinedOperation();
   if (!(res= execute_no_commit_ie(this, m_active_trans)))
   {
-    m_multi_range_defined= multi_range_curr;
-    multi_range_curr= ranges;
-    m_multi_range_result_ptr= (byte*)buffer->buffer;
-    DBUG_RETURN(read_multi_range_next(found_range_p));
+    m_multi_range_result_ptr= (byte*)multi_range_buffer->buffer;
+    first_running_range= first_range_in_batch= starting_range + 1;
+    first_unstarted_range= mrr_range_no + 1;
+    DBUG_RETURN(0);
   }
   ERR_RETURN(m_active_trans->getNdbError());
 }
 
-#if 0
-#define DBUG_MULTI_RANGE(x) DBUG_PRINT("info", ("read_multi_range_next: case %d\n", x));
-#else
-#define DBUG_MULTI_RANGE(x)
-#endif
-
-int
-ha_ndbcluster::read_multi_range_next(KEY_MULTI_RANGE ** multi_range_found_p)
+//new 
+int ha_ndbcluster::multi_range_read_next(char **range_info)
 {
-  DBUG_ENTER("ha_ndbcluster::read_multi_range_next");
+  DBUG_ENTER("ha_ndbcluster::multi_range_read_next");
   if (m_disable_multi_read)
   {
     DBUG_MULTI_RANGE(11);
-    DBUG_RETURN(handler::read_multi_range_next(multi_range_found_p));
+    DBUG_RETURN(handler::multi_range_read_next(range_info));
   }
   
   int res;
   int range_no;
   ulong reclength= table_share->reclength;
   const NdbOperation* op= m_current_multi_operation;
-  for (;multi_range_curr < m_multi_range_defined; multi_range_curr++)
+  
+  //for each range (we should have remembered the number)
+  for (;first_running_range < first_unstarted_range; first_running_range++)
   {
     DBUG_MULTI_RANGE(12);
-    if (multi_range_curr->range_flag & SKIP_RANGE)
+    if (mrr_persistent_flag_storage(mrr_iter, first_running_range) & SKIP_RANGE)
       continue;
-    if (multi_range_curr->range_flag & UNIQUE_RANGE)
+    if (mrr_persistent_flag_storage(mrr_iter, first_running_range) & UNIQUE_RANGE)
     {
       if (op->getNdbError().code == 0)
       {
@@ -7262,7 +7273,7 @@
       m_multi_range_result_ptr += reclength;
       continue;
     } 
-    else if (m_multi_cursor && !multi_range_sorted)
+    else if (m_multi_cursor && !mrr_is_output_sorted)
     {
       DBUG_MULTI_RANGE(1);
       if ((res= fetch_next(m_multi_cursor)) == 0)
@@ -7277,7 +7288,7 @@
         goto close_scan;
       }
     }
-    else if (m_multi_cursor && multi_range_sorted)
+    else if (m_multi_cursor && mrr_is_output_sorted)
     {
       if (m_active_cursor && (res= fetch_next(m_multi_cursor)))
       {
@@ -7286,7 +7297,7 @@
       }
       
       range_no= m_multi_cursor->get_range_no();
-      uint current_range_no= multi_range_curr - m_multi_ranges;
+      uint current_range_no= first_running_range - first_range_in_batch;
       if ((uint) range_no == current_range_no)
       {
         DBUG_MULTI_RANGE(4);
@@ -7310,7 +7321,7 @@
           DBUG_MULTI_RANGE(15);
           goto close_scan;
         }
-        multi_range_curr--; // Will be increased in for-loop
+        first_running_range--; // Will be increased in for-loop
         continue;
       }
     }
@@ -7318,7 +7329,7 @@
     {
       DBUG_MULTI_RANGE(7);
       /**
-       * Corresponds to range 5 in example in read_multi_range_first
+       * Corresponds to range 5 in example in multi_range_start_retrievals
        */
       (void)1;
       continue;
@@ -7340,27 +7351,23 @@
     }
   }
   
-  if (multi_range_curr == multi_range_end)
+  if (first_running_range == ranges_in_seq)
   {
     DBUG_MULTI_RANGE(16);
     DBUG_RETURN(HA_ERR_END_OF_FILE);
   }
   
-  /**
-   * Read remaining ranges
-   */
-  DBUG_RETURN(read_multi_range_first(multi_range_found_p, 
-                                     multi_range_curr,
-                                     multi_range_end - multi_range_curr, 
-                                     multi_range_sorted,
-                                     multi_range_buffer));
+  /* Read remaining ranges */
+  if ((res= multi_range_start_retrievals(first_running_range-1)))
+    DBUG_RETURN(res);
+  DBUG_RETURN(multi_range_read_next(range_info));
   
 found:
   /**
    * Found a record belonging to a scan
    */
   m_active_cursor= m_multi_cursor;
-  * multi_range_found_p= m_multi_ranges + range_no;
+  *range_info= mrr_get_ptr_by_idx(mrr_iter, first_running_range);
   memcpy(table->record[0], m_multi_range_cursor_result_ptr, reclength);
   setup_recattr(m_active_cursor->getFirstRecAttr());
   unpack_record(table->record[0]);
@@ -7372,17 +7379,21 @@
    * Found a record belonging to a pk/index op,
    *   copy result and move to next to prepare for next call
    */
-  * multi_range_found_p= multi_range_curr;
+  *range_info= mrr_get_ptr_by_idx(mrr_iter, first_running_range++);
   memcpy(table->record[0], m_multi_range_result_ptr, reclength);
   setup_recattr(op->getFirstRecAttr());
   unpack_record(table->record[0]);
   table->status= 0;
   
-  multi_range_curr++;
   m_current_multi_operation= m_active_trans->getNextCompletedOperation(op);
   m_multi_range_result_ptr += reclength;
   DBUG_RETURN(0);
 }
+
+
+/*****************************************************************************
+ * MRR implementation ends
+ ****************************************************************************/
 
 int
 ha_ndbcluster::setup_recattr(const NdbRecAttr* curr)
diff -Nru a/sql/ha_ndbcluster.h b/sql/ha_ndbcluster.h
--- a/sql/ha_ndbcluster.h	2006-03-18 14:54:11 +03:00
+++ b/sql/ha_ndbcluster.h	2006-03-18 14:54:11 +03:00
@@ -579,12 +579,17 @@
   int alter_tablespace(st_alter_tablespace *info);
 
   /**
-   * Multi range stuff
+   * Multi Range Read interface
    */
-  int read_multi_range_first(KEY_MULTI_RANGE **found_range_p,
-                             KEY_MULTI_RANGE*ranges, uint range_count,
-                             bool sorted, HANDLER_BUFFER *buffer);
-  int read_multi_range_next(KEY_MULTI_RANGE **found_range_p);
+  int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
+                            uint n_ranges, uint mode, HANDLER_BUFFER *buf);
+  int multi_range_read_next(char **range_info);
+private:
+  uint first_running_range;
+  uint first_range_in_batch;
+  uint first_unstarted_range;
+  int multi_range_start_retrievals(int first_range);
+public:
 
   bool get_error_message(int error, String *buf);
   void info(uint);
@@ -852,8 +857,6 @@
   Ndb_cond_stack *m_cond_stack;
   bool m_disable_multi_read;
   byte *m_multi_range_result_ptr;
-  KEY_MULTI_RANGE *m_multi_ranges;
-  KEY_MULTI_RANGE *m_multi_range_defined;
   const NdbOperation *m_current_multi_operation;
   NdbIndexScanOperation *m_multi_cursor;
   byte *m_multi_range_cursor_result_ptr;
diff -Nru a/sql/handler.cc b/sql/handler.cc
--- a/sql/handler.cc	2006-03-18 14:54:11 +03:00
+++ b/sql/handler.cc	2006-03-18 14:54:11 +03:00
@@ -2742,94 +2742,277 @@
 }
 #endif
 
+
 /*
-  Read the first row of a multi-range set.
+  Calculate cost of 'index only' scan for given index and number of records.
 
   SYNOPSIS
-    read_multi_range_first()
-    found_range_p       Returns a pointer to the element in 'ranges' that
-                        corresponds to the returned row.
-    ranges              An array of KEY_MULTI_RANGE range descriptions.
-    range_count         Number of ranges in 'ranges'.
-    sorted		If result should be sorted per key.
-    buffer              A HANDLER_BUFFER for internal handler usage.
+    index_only_read_time()
+      keynr    key to read
+      records  #of records to read
 
   NOTES
-    Record is read into table->record[0].
-    *found_range_p returns a valid value only if read_multi_range_first()
-    returns 0.
-    Sorting is done within each range. If you want an overall sort, enter
-    'ranges' with sorted ranges.
+    It is assumed that we will read trough the whole key range and that all
+    key blocks are half full (normally things are much better). It is also
+    assumed that each time we read the next key from the index, the handler
+    performs a random seek, thus the cost is proportional to the number of
+    blocks read.
+
+  TODO:
+    Move this to handler->read_time() by adding a flag 'index-only-read' to
+    this call. The reason for doing this is that the current function doesn't
+    handle the case when the row is stored in the b-tree (like in innodb
+    clustered index)
+*/
+
+double handler::index_only_read_time(uint keynr, double records)
+{
+  double read_time;
+  uint keys_per_block= (block_size/2/
+			(table->key_info[keynr].key_length + ref_length) + 1);
+  read_time=((double) (records + keys_per_block-1) /
+             (double) keys_per_block);
+  return read_time;
+}
+
+
+/*
+  Get cost and other information about MRR scan over a known list of ranges
+
+  SYNOPSIS
+    multi_range_read_info_const()
+      keyno           Index number
+      seq             Range sequence to be traversed
+      seq_init_param  First parameter for seq->init()
+      n_ranges        Number of ranges in the sequence
+      bufsz    INOUT  IN:  Size of the buffer available for use
+                      OUT: Size of the buffer that is expected to be actually
+                           used, or 0 if buffer is not needed.
+      flags    INOUT  A combination of HA_MRR_* flags
+      cost     OUT    Estimated cost of MRR access
+
+  DESCRIPTION
+    Obtain the cost and other information about an MRR scan when the list of
+    ranges to be scanned is known.
+
+  NOTE
+    Reasons for having this function:
+    * Let high-roundtrip-cost handlers, like NDB/Federated produce the 
+      estimates in one roundtrip. (One might ask, why don't we use 
+      condition-pushdown-based system for the purpose of estimating number of
+      records that match the table condition?... good question... in any case
+      there is another reason)
+    * We need a specialized cost-of-MRR-over-constant-ranges function.
+      = It must be different from non-MRR-cost function (since MRR is supposed
+        to be better and have a smaller cost)
+      = It must be different from multi_range_read_info() because here we know
+        the ranges.
+    Therefore, we have this function.
 
   RETURN
-    0			OK, found a row
-    HA_ERR_END_OF_FILE	No rows in range
-    #			Error code
+    0 - OK, *cost contains cost of the scan, *bufsz and *flags contain scan
+        parameters.
+    1 - Error or can't perform the requested scan
 */
 
-int handler::read_multi_range_first(KEY_MULTI_RANGE **found_range_p,
-                                    KEY_MULTI_RANGE *ranges, uint range_count,
-                                    bool sorted, HANDLER_BUFFER *buffer)
-{
-  int result= HA_ERR_END_OF_FILE;
-  DBUG_ENTER("handler::read_multi_range_first");
-  multi_range_sorted= sorted;
-  multi_range_buffer= buffer;
-
-  for (multi_range_curr= ranges, multi_range_end= ranges + range_count;
-       multi_range_curr < multi_range_end;
-       multi_range_curr++)
+ha_rows 
+handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
+                                     void *seq_init_param, uint n_ranges,
+                                     uint *bufsz, uint *flags, COST_VECT *cost)
+{
+  KEY_MULTI_RANGE range;
+  range_seq_t seq_it;
+  ha_rows rows, total_rows;
+  *bufsz= 0;
+  total_rows= 0;
+  uint part, max_used_part= 0;
+
+  seq_it= seq->init(seq_init_param, n_ranges, *flags);
+  n_ranges=0;
+  while (!seq->next(seq_it, &range))
   {
-    result= read_range_first(multi_range_curr->start_key.length ?
-                             &multi_range_curr->start_key : 0,
-                             multi_range_curr->end_key.length ?
-                             &multi_range_curr->end_key : 0,
-                             test(multi_range_curr->range_flag & EQ_RANGE),
-                             multi_range_sorted);
-    if (result != HA_ERR_END_OF_FILE)
-      break;
+    n_ranges++;
+    key_range *min_endp= range.start_key.length? &range.start_key : NULL;
+    key_range *max_endp= range.end_key.length? &range.end_key : NULL;
+    if (!(range.range_flag & UNIQUE_RANGE))
+    {
+      if (HA_POS_ERROR == (rows= this->records_in_range(keyno, min_endp, 
+                                                        max_endp)))
+      {
+        /* Can't scan one range => can't do MRR scan at all */
+        total_rows= HA_POS_ERROR;
+        break;
+      }
+    }
+    else
+      rows= 1; // UNIQUE_RANGE
+
+    total_rows += rows;
+    part= (int)range.ptr;
+    if (part > max_used_part)
+      max_used_part= part;
   }
 
-  *found_range_p= multi_range_curr;
-  DBUG_PRINT("exit",("result %d", result));
-  DBUG_RETURN(result);
+  cost->zero();
+  if ((*flags & HA_MRR_INDEX_ONLY) && total_rows > 2)
+    cost->io_count= index_only_read_time(keyno, total_rows);
+  else
+    cost->io_count= read_time(keyno, n_ranges, total_rows);
+  cost->cpu_cost= (double) total_rows / TIME_FOR_COMPARE + 0.01;
+
+  cost->avg_io_cost= 1; // Assume random seeks
+  return total_rows;
 }
 
 
 /*
-  Read the next row of a multi-range set.
+  Get cost and other information about MRR scan over currently unknown ranges
 
   SYNOPSIS
-    read_multi_range_next()
-    found_range_p       Returns a pointer to the element in 'ranges' that
-                        corresponds to the returned row.
+    multi_range_read_info()
+      keyno           Index number
+      n_ranges        Estimated number of ranges (i.e. intervals) in the 
+                      range sequence.
+      n_rows          Estimated total number of records contained within all 
+                      of the ranges
+      bufsz    INOUT  IN:  Size of the buffer available for use
+                      OUT: Size of the buffer that will be actually used, or 
+                           0 if buffer is not needed.
+      flags    INOUT  A combination of HA_MRR_* flags
+      cost     OUT    Estimated cost of MRR access
+
+  DESCRIPTION
+    This function obtains the cost and other info about about the MRR scan.
+    The caller has some information about what kinds of ranges will be scanned
+    (e.g. "1000 {keypart1=const} ranges"}, but the constant themselves are
+    assumed not to be known when this function is called.
+
+    The flags parameter is a combination of those flags: HA_MRR_SORTED, 
+    HA_MRR_INDEX_ONLY, HA_MRR_NO_ASSOCIATION, HA_MRR_LIMITS.
+
+  RETURN
+    0     - OK, *cost contains cost of the scan, *bufsz and *flags contain scan
+            parameters.
+    other - Error or can't perform the requested scan
+*/
+
+int handler::multi_range_read_info(uint keyno, uint n_ranges, uint keys,
+                                   uint *bufsz, uint *flags, COST_VECT *cost)
+{
+  *bufsz= 0;  // No need for buffer
+
+  cost->zero();
+  cost->avg_io_cost= 1; // Assume random seeks
+
+  //TODO: flags & HA_MRR_SINGLEPOINT
+
+  /* Produce the same cost as non-MRR code does */
+  if (*flags & HA_MRR_INDEX_ONLY)
+    cost->io_count= index_only_read_time(keyno, keys);
+  else
+    cost->io_count= read_time(keyno, n_ranges, keys);
+  return 0;
+}
+
+
+/*
+  Initialize the MRR scan
+
+  SYNOPSIS
+    multi_range_read_init()
+      seq             Range sequence to be traversed
+      seq_init_param  First parameter for seq->init()
+      n_ranges        Number of ranges in the sequence
+      mode            Flags, see the description for details
+      buf             INOUT, memory buffer to be used.
+
+  DESCRIPTION
+    Initialize the MRR scan. This function may do heavyweight scan 
+    initialization like row prefetching/sorting/etc (NOTE: but better not do
+    it here as we may not need it, e.g. if we never satisfy WHERE clause on
+    previous tables. For many implementations it would be natural to do such
+    initializations in _next() call)
+
+    mode is a combination of the following flags: HA_MRR_SORTED,
+    HA_MRR_INDEX_ONLY, HA_MRR_NO_ASSOCIATION 
 
   NOTES
-    Record is read into table->record[0].
-    *found_range_p returns a valid value only if read_multi_range_next()
-    returns 0.
+    One must have called index_init() before calling this function. Several
+    multi_range_read_init() calls may be made in course of one query.
+
+    Until WL#2623 is done (see section 3.2), the following will also hold:
+    The caller will guarantee that if "seq->init == mrr_ranges_array_init"
+    then seq_init_param will be an array of KEY_MULTI_RANGE structures (with
+    n_ranges elements). This property will only be used by NDB handler until
+    WL#2623 is done (see section 3.2 again).
+     
+    Buffer memory management is done according to the following scenario:
+    The caller allocates the buffer and provides it to the callee by filling
+    in the members of HANDLER_BUFFER structure (see current code for
+    definition of HANDLER_BUFFER).
+    The callee consumes all or some fraction of the provided buffer space, and
+    sets the HANDLER_BUFFER members accordingly.
+    The callee may use the buffer memory until the next multi_range_read_init()
+    call is made, all records have been read, or until index_end() call is
+    made, whichever comes first.
 
   RETURN
-    0			OK, found a row
-    HA_ERR_END_OF_FILE	No (more) rows in range
-    #			Error code
+    0 - OK
+    1 - Error
 */
 
-int handler::read_multi_range_next(KEY_MULTI_RANGE **found_range_p)
+int
+handler::multi_range_read_init(RANGE_SEQ_IF *seq_funcs, void *seq_init_param,
+                               uint n_ranges, uint mode, HANDLER_BUFFER *buf)
+{
+  DBUG_ENTER("handler::multi_range_read_init");
+  /* For now, do nothing here... just initialize the ranges traversal */
+  mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
+  mrr_funcs= *seq_funcs;
+  mrr_is_output_sorted= test(mode & HA_MRR_SORTED);
+  mrr_have_range= FALSE;
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Get next record in MRR scan
+
+  SYNOPSIS
+    multi_range_read_next()
+       range_info  OUT  If HA_MRR_NO_ASSOCIATION flag is in effect or an error
+                        occured, nothing (but the caller is still allowed to 
+                        store an arbitrary value there)
+                        Otherwise, the opaque value associated with the range
+                        that contains the returned record.
+
+  NOTE
+    It should be possible to process range lists of arbitrary length without
+    any kind of overflows.
+    
+  RETURN
+    0      OK
+    other  Error code
+*/
+
+int handler::multi_range_read_next(char **range_info)
 {
   int result;
-  DBUG_ENTER("handler::read_multi_range_next");
+  int range_res;
+  DBUG_ENTER("handler::multi_range_read_next");
 
-  /* We should not be called after the last call returned EOF. */
-  DBUG_ASSERT(multi_range_curr < multi_range_end);
+  if (!mrr_have_range)
+  {
+    mrr_have_range= TRUE;
+    goto start;
+  }
 
   do
   {
     /* Save a call if there can be only one row in range. */
-    if (multi_range_curr->range_flag != (UNIQUE_RANGE | EQ_RANGE))
+    if (mrr_cur_range.range_flag != (UNIQUE_RANGE | EQ_RANGE))
     {
       result= read_range_next();
-
       /* On success or non-EOF errors jump to the end. */
       if (result != HA_ERR_END_OF_FILE)
         break;
@@ -2843,26 +3026,24 @@
       result= HA_ERR_END_OF_FILE;
     }
 
+start:
     /* Try the next range(s) until one matches a record. */
-    for (multi_range_curr++;
-         multi_range_curr < multi_range_end;
-         multi_range_curr++)
+    while (!(range_res= mrr_funcs.next(mrr_iter, &mrr_cur_range)))
     {
-      result= read_range_first(multi_range_curr->start_key.length ?
-                               &multi_range_curr->start_key : 0,
-                               multi_range_curr->end_key.length ?
-                               &multi_range_curr->end_key : 0,
-                               test(multi_range_curr->range_flag & EQ_RANGE),
-                               multi_range_sorted);
+      result= read_range_first(mrr_cur_range.start_key.length ?
+                                 &mrr_cur_range.start_key : 0,
+                               mrr_cur_range.end_key.length ?
+                                 &mrr_cur_range.end_key : 0,
+                               test(mrr_cur_range.range_flag & EQ_RANGE),
+                               mrr_is_output_sorted);
       if (result != HA_ERR_END_OF_FILE)
         break;
     }
   }
-  while ((result == HA_ERR_END_OF_FILE) &&
-         (multi_range_curr < multi_range_end));
+  while ((result == HA_ERR_END_OF_FILE) && !range_res);
 
-  *found_range_p= multi_range_curr;
-  DBUG_PRINT("exit",("handler::read_multi_range_next: result %d", result));
+  *range_info= mrr_cur_range.ptr;
+  DBUG_PRINT("exit",("handler::multi_range_read_next result %d", result));
   DBUG_RETURN(result);
 }
 
@@ -2889,7 +3070,8 @@
 
 int handler::read_range_first(const key_range *start_key,
 			      const key_range *end_key,
-			      bool eq_range_arg, bool sorted)
+			      bool eq_range_arg,
+                              bool sorted /* ignored */)
 {
   int result;
   DBUG_ENTER("handler::read_range_first");
@@ -2922,7 +3104,7 @@
 
 
 /*
-  Read next row between two ranges.
+  Read next row between two endpoints.
 
   SYNOPSIS
     read_range_next()
diff -Nru a/sql/handler.h b/sql/handler.h
--- a/sql/handler.h	2006-03-18 14:54:11 +03:00
+++ b/sql/handler.h	2006-03-18 14:54:11 +03:00
@@ -700,6 +700,102 @@
 
 typedef struct system_status_var SSV;
 
+
+typedef void *range_seq_t;
+
+typedef struct st_range_seq_if
+{
+  /*
+    Initialize the traversal of range sequence
+    
+    SYNOPSIS
+      init()
+        init_params  The seq_init_param parameter 
+        n_ranges     The number of ranges obtained 
+        flags        HA_MRR_SINGLE_POINT, HA_MRR_FIXED_KEY
+
+    RETURN
+      An opaque value to be used as RANGE_SEQ_IF::next() parameter
+  */
+  range_seq_t (*init)(void *init_params, uint n_ranges, uint flags);
+
+
+  /*
+    Get the next range in the range sequence
+
+    SYNOPSIS
+      next()
+        seq    The value returned by RANGE_SEQ_IF::init()
+        range  OUT
+    
+    RETURN
+      0 - Ok, the range structure filled with info about the next range
+      1 - No more ranges
+  */
+  uint (*next) (range_seq_t seq, KEY_MULTI_RANGE *range);
+} RANGE_SEQ_IF;
+
+uint16 &mrr_persistent_flag_storage(range_seq_t seq, uint idx);
+char* &mrr_get_ptr_by_idx(range_seq_t seq, uint idx);
+
+class COST_VECT
+{ 
+public:
+  double io_count;     /* number of I/O                 */
+  double avg_io_cost;  /* cost of an average I/O oper.  */
+  double cpu_cost;     /* cost of operations in CPU     */
+  double mem_cost;     /* cost of used memory           */ 
+  double import_cost;  /* cost of remote operations     */
+  
+  enum { IO_COEFF=1 };
+  enum { CPU_COEFF=1 };
+  enum { MEM_COEFF=1 };
+  enum { IMPORT_COEFF=1 };
+
+  double total_cost() 
+  {
+    return IO_COEFF*io_count*avg_io_cost + CPU_COEFF * cpu_cost +
+           MEM_COEFF*mem_cost + IMPORT_COEFF*import_cost;
+  }
+
+  void zero()
+  {
+    avg_io_cost= 1.0;
+    io_count= cpu_cost= mem_cost= import_cost= 0.0;
+  }
+};
+
+
+/*
+  The below two are not used (and not handled) in this milestone of this WL
+  entry because there seems to be no use for them at this stage of
+  implementation. (TODO is that so??)
+*/
+#define HA_MRR_SINGLE_POINT 1
+#define HA_MRR_FIXED_KEY  2
+
+/* 
+  Indicates that RANGE_SEQ_IF::next(&range) doesn't need to fill in the
+  'range' parameter.
+*/
+#define HA_MRR_NO_ASSOCIATION 4
+
+/* 
+  The MRR user will provide ranges in key order, and MRR implementation
+  must return rows in key order.
+*/
+#define HA_MRR_SORTED 8
+
+/* MRR implementation doesn't have to retrieve full records */
+#define HA_MRR_INDEX_ONLY 16
+
+/* 
+  The passed memory buffer is of maximum possible size, the caller can't
+  assume larger buffer.
+*/
+#define HA_MRR_LIMITS 32
+
+
 /*
   The handler class is the interface for dynamically loadable
   storage engines. Do not add ifdefs and take care when adding or
@@ -746,12 +842,20 @@
   time_t create_time;			/* When table was created */
   time_t check_time;
   time_t update_time;
+  
+  /* Multi range read implementation-used members: */
+  range_seq_t mrr_iter;    /* MRR range sequence iterator being traversed */
+  RANGE_SEQ_IF mrr_funcs;  /* Saved MRR range sequence traversal functions */
+  HANDLER_BUFFER *multi_range_buffer; /* Saved MRR buffer info */
+  uint ranges_in_seq; /* Total number of ranges in the traversed sequence */
+  /* TRUE <=> source MRR ranges and the output are ordered */
+  bool mrr_is_output_sorted;
+  
+  /* TRUE <=> we're currently traversing a range in mrr_cur_range. */
+  bool mrr_have_range;
+  /* Current range (the one we're now returning rows from) */
+  KEY_MULTI_RANGE mrr_cur_range;
 
-  /* The following are for read_multi_range */
-  bool multi_range_sorted;
-  KEY_MULTI_RANGE *multi_range_curr;
-  KEY_MULTI_RANGE *multi_range_end;
-  HANDLER_BUFFER *multi_range_buffer;
 
   /* The following are for read_range() */
   key_range save_end_range, *end_range;
@@ -826,10 +930,26 @@
     table= table_arg;
     table_share= share;
   }
+  /* Estimates calculation */
   virtual double scan_time()
     { return ulonglong2double(data_file_length) / IO_SIZE + 2; }
   virtual double read_time(uint index, uint ranges, ha_rows rows)
  { return rows2double(ranges+rows); }
+
+  virtual double index_only_read_time(uint keynr, double records);
+  
+  virtual ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
+                                              void *seq_init_param, 
+                                              uint n_ranges, uint *bufsz,
+                                              uint *flags, COST_VECT *cost);
+  virtual int multi_range_read_info(uint keyno, uint n_ranges, uint keys,
+                                    uint *bufsz, uint *flags, COST_VECT *cost);
+  virtual int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
+                                    uint n_ranges, uint mode,
+                                    HANDLER_BUFFER *buf);
+  virtual int multi_range_read_next(char **range_info);
+
+
   virtual const key_map *keys_to_use_for_scanning() { return &key_map_empty; }
   virtual bool has_transactions(){ return 0;}
   virtual uint extra_rec_buf_length() const { return 0; }
@@ -1119,10 +1239,6 @@
   virtual int index_next_same(byte *buf, const byte *key, uint keylen);
   virtual int index_read_last(byte * buf, const byte * key, uint key_len)
    { return (my_errno=HA_ERR_WRONG_COMMAND); }
-  virtual int read_multi_range_first(KEY_MULTI_RANGE **found_range_p,
-                                     KEY_MULTI_RANGE *ranges, uint range_count,
-                                     bool sorted, HANDLER_BUFFER *buffer);
-  virtual int read_multi_range_next(KEY_MULTI_RANGE **found_range_p);
   virtual int read_range_first(const key_range *start_key,
                                const key_range *end_key,
                                bool eq_range, bool sorted);
diff -Nru a/sql/opt_range.cc b/sql/opt_range.cc
--- a/sql/opt_range.cc	2006-03-18 14:54:11 +03:00
+++ b/sql/opt_range.cc	2006-03-18 14:54:11 +03:00
@@ -35,7 +35,7 @@
     
     The lists are returned in form of complicated structure of interlinked
     SEL_TREE/SEL_IMERGE/SEL_ARG objects.
-    See check_quick_keys, find_used_partitions for examples of how to walk 
+    See quick_range_seq_next, find_used_partitions for examples of how to walk 
     this structure.
     All direct "users" of this module are located within this file, too.
 
@@ -212,6 +212,7 @@
   }
   void store_min(uint length,char **min_key,uint min_key_flag)
   {
+    /* "(kp1 > c1) AND (kp2 OP c2) AND ..." -> (kp1 > c1) */
     if ((min_flag & GEOM_FLAG) ||
         (!(min_flag & NO_MIN_RANGE) &&
 	!(min_key_flag & (NO_MIN_RANGE | NEAR_MIN))))
@@ -226,6 +227,16 @@
       (*min_key)+= length;
     }
   }
+
+  /*
+    SYNOPSIS
+      store()
+       length              Key pack length
+       min_key       INOUT store here and advance the pointer
+       min_key_flag        prev-key-part
+       max_key       INOUT store here and advance the pointer
+       max_key_flag        prev-key-part
+  */
   void store(uint length,char **min_key,uint min_key_flag,
 	     char **max_key, uint max_key_flag)
   {
@@ -244,12 +255,20 @@
     }
   }
 
+  /*
+     store_min_key()
+       key             
+       range_key       INOUT
+       range_key_flag  INOUT
+  */
   void store_min_key(KEY_PART *key,char **range_key, uint *range_key_flag)
   {
-    SEL_ARG *key_tree= first();
+    SEL_ARG *key_tree= first(); //find the left-most interval of this.
+    //   left < X < +inf ... and amend with next key part(s) ???
     key_tree->store(key[key_tree->part].store_length,
 		    range_key,*range_key_flag,range_key,NO_MAX_RANGE);
     *range_key_flag|= key_tree->min_flag;
+    
     if (key_tree->next_key_part &&
 	key_tree->next_key_part->part == key_tree->part+1 &&
 	!(*range_key_flag & (NO_MIN_RANGE | NEAR_MIN)) &&
@@ -471,9 +490,6 @@
 
 static bool is_key_scan_ror(PARAM *param, uint keynr, uint8 nparts);
 static ha_rows check_quick_select(PARAM *param,uint index,SEL_ARG *key_tree);
-static ha_rows check_quick_keys(PARAM *param,uint index,SEL_ARG *key_tree,
-				char *min_key,uint min_key_flag,
-				char *max_key, uint max_key_flag);
 
 QUICK_RANGE_SELECT *get_quick_select(PARAM *param,uint index,
                                      SEL_ARG *key_tree,
@@ -497,8 +513,6 @@
 static int get_index_merge_params(PARAM *param, key_map& needed_reg,
                            SEL_IMERGE *imerge, double *read_time,
                            ha_rows* imerge_rows);
-static double get_index_only_read_time(const PARAM* param, ha_rows records,
-                                       int keynr);
 
 #ifndef DBUG_OFF
 static void print_sel_tree(PARAM *param, SEL_TREE *tree, key_map *tree_map,
@@ -1938,9 +1952,10 @@
     if (!head->used_keys.is_clear_all())
     {
       int key_for_use= find_shortest_key(head, &head->used_keys);
-      double key_read_time= (get_index_only_read_time(&param, records,
-                                                     key_for_use) +
-                             (double) records / TIME_FOR_COMPARE);
+      double key_read_time= 
+        param.table->file->index_only_read_time(key_for_use, 
+                                                rows2double(records)) +
+        (double) records / TIME_FOR_COMPARE;
       DBUG_PRINT("info",  ("'all'+'using index' scan will be using key %d, "
                            "read time %g", key_for_use, key_read_time));
       if (key_read_time < read_time)
@@ -2715,7 +2730,7 @@
     {
       /* 
         Partitioning is done by RANGE|INTERVAL(monotonic_expr(fieldX)), and
-        we got "const1 CMP fieldX CMP const2" interval <-- psergey-todo: change
+        we got "const1 CMP fieldX CMP const2" interval
       */
       DBUG_EXECUTE("info", dbug_print_segment_range(key_tree,
                                                     ppar->range_param.
@@ -3518,42 +3533,6 @@
 }
 
 
-/*
-  Calculate cost of 'index only' scan for given index and number of records.
-
-  SYNOPSIS
-    get_index_only_read_time()
-      param    parameters structure
-      records  #of records to read
-      keynr    key to read
-
-  NOTES
-    It is assumed that we will read trough the whole key range and that all
-    key blocks are half full (normally things are much better). It is also
-    assumed that each time we read the next key from the index, the handler
-    performs a random seek, thus the cost is proportional to the number of
-    blocks read.
-
-  TODO:
-    Move this to handler->read_time() by adding a flag 'index-only-read' to
-    this call. The reason for doing this is that the current function doesn't
-    handle the case when the row is stored in the b-tree (like in innodb
-    clustered index)
-*/
-
-static double get_index_only_read_time(const PARAM* param, ha_rows records,
-                                       int keynr)
-{
-  double read_time;
-  uint keys_per_block= (param->table->file->block_size/2/
-			(param->table->key_info[keynr].key_length+
-			 param->table->file->ref_length) + 1);
-  read_time=((double) (records+keys_per_block-1)/
-             (double) keys_per_block);
-  return read_time;
-}
-
-
 typedef struct st_ror_scan_info
 {
   uint      idx;      /* # of used key in param->keys */
@@ -3629,9 +3608,9 @@
     if (bitmap_is_set(&param->needed_fields, key_part->fieldnr))
       bitmap_set_bit(&ror_scan->covered_fields, key_part->fieldnr);
   }
+  double rows= rows2double(param->table->quick_rows[ror_scan->keynr]);
   ror_scan->index_read_cost=
-    get_index_only_read_time(param, param->table->quick_rows[ror_scan->keynr],
-                             ror_scan->keynr);
+    param->table->file->index_only_read_time(ror_scan->keynr, rows);
   DBUG_RETURN(ror_scan);
 }
 
@@ -4398,6 +4377,8 @@
     Best range read plan
     NULL if no plan found or error occurred
 */
+ha_rows check_quick_select(PARAM *param, uint idx, bool index_only, SEL_ARG *tree,
+                               COST_VECT *cost);
 
 static TRP_RANGE *get_key_scans_params(PARAM *param, SEL_TREE *tree,
                                        bool index_read_must_be_used,
@@ -4419,9 +4400,7 @@
                                       "tree scans"););
   tree->ror_scans_map.clear_all();
   tree->n_ror_scans= 0;
-  for (idx= 0,key=tree->keys, end=key+param->keys;
-       key != end ;
-       key++,idx++)
+  for (idx= 0,key=tree->keys, end=key+param->keys; key != end; key++,idx++)
   {
     ha_rows found_records;
     double found_read_time;
@@ -4435,41 +4414,15 @@
       bool read_index_only= index_read_must_be_used ? TRUE :
                             (bool) param->table->used_keys.is_set(keynr);
 
-      found_records= check_quick_select(param, idx, *key);
-      if (param->is_ror_scan)
+      COST_VECT cost;
+      found_records= check_quick_select(param, idx, read_index_only, *key,
+                                             &cost);
+      found_read_time= cost.total_cost();
+      if ((found_records != HA_POS_ERROR) && param->is_ror_scan)
       {
         tree->n_ror_scans++;
         tree->ror_scans_map.set_bit(idx);
       }
-      double cpu_cost= (double) found_records / TIME_FOR_COMPARE;
-      if (found_records != HA_POS_ERROR && found_records > 2 &&
-          read_index_only &&
-          (param->table->file->index_flags(keynr, param->max_key_part,1) &
-           HA_KEYREAD_ONLY) &&
-          !(pk_is_clustered && keynr == param->table->s->primary_key))
-      {
-        /*
-          We can resolve this by only reading through this key. 
-          0.01 is added to avoid races between range and 'index' scan.
-        */
-        found_read_time= get_index_only_read_time(param,found_records,keynr) +
-                         cpu_cost + 0.01;
-      }
-      else
-      {
-        /*
-          cost(read_through_index) = cost(disk_io) + cost(row_in_range_checks)
-          The row_in_range check is in QUICK_RANGE_SELECT::cmp_next function.
-        */
-	found_read_time= param->table->file->read_time(keynr,
-                                                       param->range_count,
-                                                       found_records) +
-			 cpu_cost + 0.01;
-      }
-      DBUG_PRINT("info",("key %s: found_read_time: %g (cur. read_time: %g)",
-                         param->table->key_info[keynr].name, found_read_time,
-                         read_time));
-
       if (read_time > found_read_time && found_records != HA_POS_ERROR
           /*|| read_time == DBL_MAX*/ )
       {
@@ -4477,7 +4430,6 @@
         best_records= found_records;
         key_to_read=  key;
       }
-
     }
   }
 
@@ -6437,212 +6389,233 @@
 
 #endif
 
+/****************************************************************************
+  Implementation of MRR range sequence interface that traverses the SEL_ARG* 
+  tree 
+ ****************************************************************************/
 
-/*
-  Calculate estimate of number records that will be retrieved by a range
-  scan on given index using given SEL_ARG intervals tree.
-  SYNOPSIS
-    check_quick_select
-      param  Parameter from test_quick_select
-      idx    Number of index to use in PARAM::key SEL_TREE::key
-      tree   Transformed selection condition, tree->key[idx] holds intervals
-             tree to be used for scanning.
-  NOTES
-    param->is_ror_scan is set to reflect if the key scan is a ROR (see
-    is_key_scan_ror function for more info)
-    param->table->quick_*, param->range_count (and maybe others) are
-    updated with data of given key scan, see check_quick_keys for details.
+/* MRR range sequence, SEL_ARG* implementation: stack entry */
+typedef struct st_range_seq_entry 
+{
+  /* 
+    pointers in min and max keys. They point to right-after-end of key
+    images. The 0-th entry has these pointing to key tuple start.
+  */
+  char *min_key, *max_key;
+  
+  /* Flags (accumulated with prev. keypart) */
+  uint min_key_flag, max_key_flag;
 
-  RETURN
-    Estimate # of records to be retrieved.
-    HA_POS_ERROR if estimate calculation failed due to table handler problems.
+  SEL_ARG *key_tree;
+} RANGE_SEQ_ENTRY;
 
-*/
 
-static ha_rows
-check_quick_select(PARAM *param,uint idx,SEL_ARG *tree)
+/* MRR range sequence, SEL_ARG* implementation: enumeration context */
+typedef struct st_sel_arg_range_seq
 {
-  ha_rows records;
-  bool    cpk_scan;
-  uint key;
-  DBUG_ENTER("check_quick_select");
+  uint keyno;      /* index of used tree in SEL_TREE structure */
+  uint real_keyno; /* Number of the index in tables */
+  PARAM *param;
 
-  param->is_ror_scan= FALSE;
+  SEL_ARG *start; /* Root node of the traversed SEL_ARG* graph */
+  
+  RANGE_SEQ_ENTRY stack[MAX_REF_PARTS];
+  int i; //todo:psergey:
+  
+  bool at_start; /* TRUE <=> The traversal has just started */
+} SEL_ARG_RANGE_SEQ;
 
-  if (!tree)
-    DBUG_RETURN(HA_POS_ERROR);			// Can't use it
-  param->max_key_part=0;
-  param->range_count=0;
-  key= param->real_keynr[idx];
 
-  if (tree->type == SEL_ARG::IMPOSSIBLE)
-    DBUG_RETURN(0L);				// Impossible select. return
-  if (tree->type != SEL_ARG::KEY_RANGE || tree->part != 0)
-    DBUG_RETURN(HA_POS_ERROR);				// Don't use tree
+/*
+  Range sequence interface, SEL_ARG* implementation: Initialize the traversal
+  SYNOPSIS
+    init()
+      init_params  SEL_ARG tree traversal context
+      n_ranges     [ignored] The number of ranges obtained 
+      flags        [ignored] HA_MRR_SINGLE_POINT, HA_MRR_FIXED_KEY
 
-  enum ha_key_alg key_alg= param->table->key_info[key].algorithm;
-  if ((key_alg != HA_KEY_ALG_BTREE) && (key_alg!= HA_KEY_ALG_UNDEF))
-  {
-    /* Records are not ordered by rowid for other types of indexes. */
-    cpk_scan= FALSE;
-  }
-  else
-  {
-    /*
-      Clustered PK scan is a special case, check_quick_keys doesn't recognize
-      CPK scans as ROR scans (while actually any CPK scan is a ROR scan).
-    */
-    cpk_scan= ((param->table->s->primary_key == param->real_keynr[idx]) &&
-               param->table->file->primary_key_is_clustered());
-    param->is_ror_scan= !cpk_scan;
-  }
+  RETURN
+    Same as what has been passed
+*/
 
-  records=check_quick_keys(param,idx,tree,param->min_key,0,param->max_key,0);
-  if (records != HA_POS_ERROR)
-  {
-    param->table->quick_keys.set_bit(key);
-    param->table->quick_rows[key]=records;
-    param->table->quick_key_parts[key]=param->max_key_part+1;
+range_seq_t sel_arg_range_seq_init(void *init_param, uint n_ranges, uint flags)
+{
+  SEL_ARG_RANGE_SEQ *seq= (SEL_ARG_RANGE_SEQ*)init_param;
+  seq->at_start= TRUE;
 
-    if (cpk_scan)
-      param->is_ror_scan= TRUE;
-  }
-  if (param->table->file->index_flags(key, 0, TRUE) & HA_KEY_SCAN_NOT_ROR)
-    param->is_ror_scan= FALSE;
-  DBUG_PRINT("exit", ("Records: %lu", (ulong) records));
-  DBUG_RETURN(records);
+  seq->stack[0].key_tree= NULL;
+  seq->stack[0].min_key= seq->param->min_key;
+  seq->stack[0].min_key_flag= 0;
+  seq->stack[0].max_key= seq->param->max_key;
+  seq->stack[0].max_key_flag= 0;
+  seq->i= 0;
+  return init_param;
 }
 
 
-/*
-  Recursively calculate estimate of # rows that will be retrieved by
-  key scan on key idx.
-  SYNOPSIS
-    check_quick_keys()
-      param         Parameter from test_quick select function.
-      idx           Number of key to use in PARAM::keys in list of used keys
-                    (param->real_keynr[idx] holds the key number in table)
-      key_tree      SEL_ARG tree being examined.
-      min_key       Buffer with partial min key value tuple
-      min_key_flag
-      max_key       Buffer with partial max key value tuple
-      max_key_flag
-
-  NOTES
-    The function does the recursive descent on the tree via SEL_ARG::left,
-    SEL_ARG::right, and SEL_ARG::next_key_part edges. The #rows estimates
-    are calculated using records_in_range calls at the leaf nodes and then
-    summed.
+static void step_down_to(SEL_ARG_RANGE_SEQ *arg, SEL_ARG *key_tree)
+{
+  RANGE_SEQ_ENTRY *cur= &arg->stack[arg->i+1];
+  RANGE_SEQ_ENTRY *prev= &arg->stack[arg->i];
+  
+  cur->key_tree= key_tree;
+  cur->min_key= prev->min_key;
+  cur->max_key= prev->max_key;
+  
+  key_tree->store(arg->param->key[arg->keyno][key_tree->part].store_length,
+                  &cur->min_key, prev->min_key_flag,
+                  &cur->max_key, prev->max_key_flag);
+  cur->min_key_flag= prev->min_key_flag | key_tree->min_flag;
+  cur->max_key_flag= prev->max_key_flag | key_tree->max_flag;
 
-    param->min_key and param->max_key are used to hold prefixes of key value
-    tuples.
+  (arg->i)++;
+}
 
-    The side effects are:
 
-    param->max_key_part is updated to hold the maximum number of key parts used
-      in scan minus 1.
+/*
+  Range sequence interface, SEL_ARG* implementation: get the next interval
+  
+  SYNOPSIS
+    sel_arg_range_seq_next()
+      rseq        Value returned from sel_arg_range_seq_init
+      range  OUT  Store information about the range here
 
-    param->range_count is incremented if the function finds a range that
-      wasn't counted by the caller.
+  DESCRIPTION
+   This is "get_next" function for Range sequence interface implementation
+   for SEL_ARG* tree.
 
-    param->is_ror_scan is cleared if the function detects that the key scan is
-      not a Rowid-Ordered Retrieval scan ( see comments for is_key_scan_ror
-      function for description of which key scans are ROR scans)
+  RETURN
+    0  Ok
+    1  No more ranges in the sequence
 */
 
-static ha_rows
-check_quick_keys(PARAM *param,uint idx,SEL_ARG *key_tree,
-		 char *min_key,uint min_key_flag, char *max_key,
-		 uint max_key_flag)
+uint sel_arg_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
 {
-  ha_rows records=0, tmp;
-  uint tmp_min_flag, tmp_max_flag, keynr, min_key_length, max_key_length;
-  char *tmp_min_key, *tmp_max_key;
-
-  param->max_key_part=max(param->max_key_part,key_tree->part);
-  if (key_tree->left != &null_element)
+  SEL_ARG_RANGE_SEQ *seq= (SEL_ARG_RANGE_SEQ*)rseq;
+  SEL_ARG *key_tree, *parent;
+  uint tmp_min_flag, tmp_max_flag;
+  if (seq->at_start)
   {
-    /*
-      There are at least two intervals for current key part, i.e. condition
-      was converted to something like
-        (keyXpartY less/equals c1) OR (keyXpartY more/equals c2).
-      This is not a ROR scan if the key is not Clustered Primary Key.
-    */
-    param->is_ror_scan= FALSE;
-    records=check_quick_keys(param,idx,key_tree->left,min_key,min_key_flag,
-			     max_key,max_key_flag);
-    if (records == HA_POS_ERROR)			// Impossible
-      return records;
+    key_tree= seq->start;
+    seq->at_start= FALSE;
+    goto walk_up_right;
   }
 
-  tmp_min_key= min_key;
-  tmp_max_key= max_key;
-  key_tree->store(param->key[idx][key_tree->part].store_length,
-		  &tmp_min_key,min_key_flag,&tmp_max_key,max_key_flag);
-  min_key_length= (uint) (tmp_min_key- param->min_key);
-  max_key_length= (uint) (tmp_max_key- param->max_key);
+  key_tree= seq->stack[seq->i].key_tree;
+  /* Ok, we're at some "full tuple" position in the tree */
+ 
+  /* Step down if we can */
+  if (key_tree->next && key_tree->next != &null_element)
+  {
+    //step down; (update the tuple, we'll step right and stay there)
+    seq->i--;
+    step_down_to(seq, key_tree->next);
+    key_tree= key_tree->next;
+    seq->param->is_ror_scan= FALSE;
+    goto walk_right_up;
+  }
 
-  if (param->is_ror_scan)
+  /* Ok, can't step down, walk left until we can step down */
+  while (1)
   {
-    /*
-      If the index doesn't cover entire key, mark the scan as non-ROR scan.
-      Actually we're cutting off some ROR scans here.
-    */
-    uint16 fieldnr= param->table->key_info[param->real_keynr[idx]].
-                    key_part[key_tree->part].fieldnr - 1;
-    if (param->table->field[fieldnr]->key_length() !=
-        param->key[idx][key_tree->part].length)
-      param->is_ror_scan= FALSE;
+    if (seq->i == 1) // can't step left
+      return 1;
+
+    /* Step left */
+    seq->i--;
+    key_tree= seq->stack[seq->i].key_tree;
+
+    /* Step down if we can */
+    if (key_tree->next && key_tree->next != &null_element)
+    {
+      // Step down; update the tuple
+      seq->i--;
+      step_down_to(seq, key_tree->next);
+      key_tree= key_tree->next;
+      break;
+    }
   }
 
-  if (key_tree->next_key_part &&
-      key_tree->next_key_part->part == key_tree->part+1 &&
-      key_tree->next_key_part->type == SEL_ARG::KEY_RANGE)
-  {						// const key as prefix
-    if (min_key_length == max_key_length &&
-	!memcmp(min_key,max_key, (uint) (tmp_max_key - max_key)) &&
-	!key_tree->min_flag && !key_tree->max_flag)
-    {
-      tmp=check_quick_keys(param,idx,key_tree->next_key_part,
-			   tmp_min_key, min_key_flag | key_tree->min_flag,
-			   tmp_max_key, max_key_flag | key_tree->max_flag);
-      goto end;					// Ugly, but efficient
+  // Ok, we've stepped down from the path to previous tuple.
+  // Walk right-up while we can.
+walk_right_up:
+  // stepping right...
+  while (key_tree->next_key_part && key_tree->next_key_part != &null_element && 
+         key_tree->next_key_part->part == key_tree->part + 1)
+  {
+    // Here: check if current is not "t.field = const"
+    // if it is not, store next keypart as "addon" and break;
+    {
+      uint min_key_length= seq->stack[seq->i].min_key - seq->param->min_key;
+      uint max_key_length= seq->stack[seq->i].max_key - seq->param->max_key;
+      uint len= seq->stack[seq->i].min_key - seq->stack[seq->i-1].min_key;
+      if (!(min_key_length == max_key_length &&
+            !memcmp(seq->stack[seq->i-1].min_key, seq->stack[seq->i-1].max_key, len) &&
+            !key_tree->min_flag && !key_tree->max_flag))
+      {
+        seq->param->is_ror_scan= FALSE;
+        if (!key_tree->min_flag)
+          key_tree->next_key_part->store_min_key(seq->param->key[seq->keyno], 
+                                                 &seq->stack[seq->i].min_key,
+                                                 &seq->stack[seq->i].min_key_flag);
+        if (!key_tree->max_flag)
+          key_tree->next_key_part->store_max_key(seq->param->key[seq->keyno], 
+                                                 &seq->stack[seq->i].max_key,
+                                                 &seq->stack[seq->i].max_key_flag);
+        break;
+      }
     }
-    else
+  
+    // Ok, current is "t.field=const" and there is next_key_part.
+    // step right, and walk up there.
+    key_tree= key_tree->next_key_part;
+
+walk_up_right:
+    while (key_tree->prev && key_tree->prev != &null_element)
     {
-      /* The interval for current key part is not c1 <= keyXpartY <= c1 */
-      param->is_ror_scan= FALSE;
+      /* Step "up" */
+      key_tree= key_tree->prev;
     }
-
-    tmp_min_flag=key_tree->min_flag;
-    tmp_max_flag=key_tree->max_flag;
-    if (!tmp_min_flag)
-      key_tree->next_key_part->store_min_key(param->key[idx], &tmp_min_key,
-					     &tmp_min_flag);
-    if (!tmp_max_flag)
-      key_tree->next_key_part->store_max_key(param->key[idx], &tmp_max_key,
-					     &tmp_max_flag);
-    min_key_length= (uint) (tmp_min_key- param->min_key);
-    max_key_length= (uint) (tmp_max_key- param->max_key);
+    step_down_to(seq, key_tree);
   }
-  else
+
+  /* Ok, we are at full tuple */
+  RANGE_SEQ_ENTRY *cur= &seq->stack[seq->i];
+  uint min_key_length= cur->min_key - seq->param->min_key;
+  uint max_key_length= cur->max_key - seq->param->max_key;
+  
+  range->ptr= (char*)(int)(key_tree->part);
+  if (cur->min_key_flag & GEOM_FLAG)
   {
-    tmp_min_flag=min_key_flag | key_tree->min_flag;
-    tmp_max_flag=max_key_flag | key_tree->max_flag;
-  }
+    range->range_flag= cur->min_key_flag;
 
-  keynr=param->real_keynr[idx];
-  param->range_count++;
-  if (!tmp_min_flag && ! tmp_max_flag &&
-      (uint) key_tree->part+1 == param->table->key_info[keynr].key_parts &&
-      (param->table->key_info[keynr].flags & (HA_NOSAME | HA_END_SPACE_KEY)) ==
-      HA_NOSAME &&
-      min_key_length == max_key_length &&
-      !memcmp(param->min_key,param->max_key,min_key_length))
-    tmp=1;					// Max one record
+    /* Here minimum contains also function code bits, and maximum is +inf */
+    range->start_key.key=    (byte*)seq->param->min_key;
+    range->start_key.length= min_key_length;
+    range->start_key.flag=  (ha_rkey_function) (cur->min_key_flag ^ GEOM_FLAG);
+  }
   else
   {
-    if (param->is_ror_scan)
+    range->range_flag= cur->min_key_flag | cur->max_key_flag;
+    
+    range->start_key.key=    (byte*)seq->param->min_key;
+    range->start_key.length= cur->min_key - seq->param->min_key;
+    range->start_key.flag=   (cur->min_key_flag & NEAR_MIN ? HA_READ_AFTER_KEY : 
+                                                             HA_READ_KEY_EXACT);
+
+    range->end_key.key=    (byte*)seq->param->max_key;
+    range->end_key.length= cur->max_key - seq->param->max_key;
+    range->end_key.flag=   (cur->max_key_flag & NEAR_MAX ? HA_READ_BEFORE_KEY : 
+                                                           HA_READ_AFTER_KEY);
+    if (!cur->min_key_flag && !cur->max_key_flag &&
+        (uint)key_tree->part+1 == seq->param->table->key_info[seq->real_keyno].key_parts &&
+        (seq->param->table->key_info[seq->real_keyno].flags & (HA_NOSAME | HA_END_SPACE_KEY)) ==
+        HA_NOSAME &&
+        range->start_key.length == range->end_key.length &&
+        !memcmp(seq->param->min_key,seq->param->max_key,range->start_key.length))
+      range->range_flag= UNIQUE_RANGE;
+      
+    if (seq->param->is_ror_scan)
     {
       /*
         If we get here, the condition on the key was converted to form
@@ -6653,63 +6626,108 @@
           uncovered "tail" of KeyX parts is either empty or is identical to
           first members of clustered primary key.
       */
-      if (!(min_key_length == max_key_length &&
-            !memcmp(min_key,max_key, (uint) (tmp_max_key - max_key)) &&
-            !key_tree->min_flag && !key_tree->max_flag &&
-            is_key_scan_ror(param, keynr, key_tree->part + 1)))
-        param->is_ror_scan= FALSE;
-    }
-
-    if (tmp_min_flag & GEOM_FLAG)
-    {
-      key_range min_range;
-      min_range.key=    (byte*) param->min_key;
-      min_range.length= min_key_length;
-      /* In this case tmp_min_flag contains the handler-read-function */
-      min_range.flag=   (ha_rkey_function) (tmp_min_flag ^ GEOM_FLAG);
-
-      tmp= param->table->file->records_in_range(keynr, &min_range,
-                                                (key_range*) 0);
-    }
-    else
-    {
-      key_range min_range, max_range;
-
-      min_range.key=    (byte*) param->min_key;
-      min_range.length= min_key_length;
-      min_range.flag=   (tmp_min_flag & NEAR_MIN ? HA_READ_AFTER_KEY :
-                         HA_READ_KEY_EXACT);
-      max_range.key=    (byte*) param->max_key;
-      max_range.length= max_key_length;
-      max_range.flag=   (tmp_max_flag & NEAR_MAX ?
-                         HA_READ_BEFORE_KEY : HA_READ_AFTER_KEY);
-      tmp=param->table->file->records_in_range(keynr,
-                                               (min_key_length ? &min_range :
-                                                (key_range*) 0),
-                                               (max_key_length ? &max_range :
-                                                (key_range*) 0));
+      if (!(!cur->min_key_flag && !cur->max_key_flag &&
+            (range->start_key.length == range->end_key.length) &&
+            !memcmp(range->start_key.key, range->end_key.key, range->start_key.length) &&
+            is_key_scan_ror(seq->param, seq->real_keyno, key_tree->part + 1)))
+        seq->param->is_ror_scan= FALSE;
     }
   }
- end:
-  if (tmp == HA_POS_ERROR)			// Impossible range
-    return tmp;
-  records+=tmp;
-  if (key_tree->right != &null_element)
+  seq->param->range_count++;
+  seq->param->max_key_part=max(seq->param->max_key_part,key_tree->part);
+  return 0;
+}
+
+
+/*
+  Calculate estimate of number records that will be retrieved by a range
+  scan on given index using given SEL_ARG intervals tree.
+  SYNOPSIS
+    check_quick_select
+      param  Parameter from test_quick_select
+      idx    Number of index to use in PARAM::key SEL_TREE::key
+      tree   Transformed selection condition, tree->key[idx] holds intervals
+             tree to be used for scanning.
+  NOTES
+    param->is_ror_scan is set to reflect if the key scan is a ROR (see
+    is_key_scan_ror function for more info)
+    param->table->quick_*, param->range_count (and maybe others) are
+    updated with data of given key scan, see quick_range_seq_next for details.
+
+  RETURN
+    Estimate # of records to be retrieved.
+    HA_POS_ERROR if estimate calculation failed due to table handler problems.
+
+*/
+ha_rows check_quick_select(PARAM *param, uint idx, bool index_only, SEL_ARG *tree, 
+                               COST_VECT *cost)
+{
+  DBUG_ENTER("check_quick_select");
+  SEL_ARG_RANGE_SEQ seq;
+  bool cpk_scan;
+  
+  if (tree->type == SEL_ARG::IMPOSSIBLE)
+    DBUG_RETURN(0L);				// Impossible select. return
+  if (tree->type != SEL_ARG::KEY_RANGE || tree->part != 0)
+    DBUG_RETURN(HA_POS_ERROR);				// Don't use tree
+
+  seq.keyno= idx;
+  seq.real_keyno= param->real_keynr[idx];
+  seq.param= param;
+  seq.start= tree;
+  
+  RANGE_SEQ_IF seq_if = {sel_arg_range_seq_init, sel_arg_range_seq_next};
+  uint bufsize= 0;
+  uint flags= 0;
+  ha_rows rows;
+
+  handler *file= param->table->file;
+  param->range_count=0;
+  param->max_key_part=0;
+  param->is_ror_scan= FALSE;
+  
+  enum ha_key_alg key_alg= param->table->key_info[seq.real_keyno].algorithm;
+  if ((key_alg != HA_KEY_ALG_BTREE) && (key_alg!= HA_KEY_ALG_UNDEF))
+  {
+    /* Records are not ordered by rowid for other types of indexes. */
+    cpk_scan= FALSE;
+  }
+  else
   {
     /*
-      There are at least two intervals for current key part, i.e. condition
-      was converted to something like
-        (keyXpartY less/equals c1) OR (keyXpartY more/equals c2).
-      This is not a ROR scan if the key is not Clustered Primary Key.
+      Clustered PK scan is a special case, quick_range_seq_next doesn't 
+      recognize CPK scans as ROR scans (while actually any CPK scan is a ROR 
+      scan).
     */
-    param->is_ror_scan= FALSE;
-    tmp=check_quick_keys(param,idx,key_tree->right,min_key,min_key_flag,
-			 max_key,max_key_flag);
-    if (tmp == HA_POS_ERROR)
-      return tmp;
-    records+=tmp;
+    cpk_scan= ((param->table->s->primary_key == param->real_keynr[idx]) &&
+               param->table->file->primary_key_is_clustered());
+    param->is_ror_scan= !cpk_scan;
   }
-  return records;
+
+  bool pk_is_clustered= file->primary_key_is_clustered();
+  uint keynr= param->real_keynr[idx];
+  if (index_only && 
+      (file->index_flags(keynr, param->max_key_part, 1) & HA_KEYREAD_ONLY) &&
+      !(pk_is_clustered && keynr == param->table->s->primary_key))
+     flags |= HA_MRR_INDEX_ONLY;
+
+  rows= file->multi_range_read_info_const(param->real_keynr[idx], &seq_if, 
+                                          (void*)&seq,
+                                          0, // todo: n_ranges
+                                          &bufsize, &flags, cost);
+  if (rows != HA_POS_ERROR)
+  {
+    param->table->quick_keys.set_bit(seq.real_keyno);
+    param->table->quick_rows[seq.real_keyno]=rows;
+    param->table->quick_key_parts[seq.real_keyno]=param->max_key_part+1;
+
+    if (cpk_scan)
+      param->is_ror_scan= TRUE;
+  }
+  if (param->table->file->index_flags(seq.real_keyno, 0, TRUE) & HA_KEY_SCAN_NOT_ROR)
+    param->is_ror_scan= FALSE;
+  DBUG_PRINT("exit", ("Records: %lu", (ulong) rows));
+  DBUG_RETURN(rows);
 }
 
 
@@ -6743,7 +6761,7 @@
     Scans on HASH indexes are not ROR scans,
     any range scan on clustered primary key is ROR scan  (3)
 
-    Check (1) is made in check_quick_keys()
+    Check (1) is made in quick_range_seq_next()
     Check (3) is made check_quick_select()
     Check (2) is made by this function.
 
@@ -6759,9 +6777,19 @@
   KEY_PART_INFO *key_part_end= (table_key->key_part +
                                 table_key->key_parts);
   uint pk_number;
+  
+  for (KEY_PART_INFO *kp= table_key->key_part; kp < key_part; kp++)
+  {
+    uint16 fieldnr= param->table->key_info[keynr].
+                    key_part[kp - table_key->key_part].fieldnr - 1;
+    if (param->table->field[fieldnr]->key_length() != kp->length)
+      return FALSE;
+  }
 
   if (key_part == key_part_end)
     return TRUE;
+
+  key_part= table_key->key_part + nparts;
   pk_number= param->table->s->primary_key;
   if (!param->table->file->primary_key_is_clustered() || pk_number == MAX_KEY)
     return FALSE;
@@ -6971,7 +6999,20 @@
 }
 
 
-/* Returns TRUE if any part of the key is NULL */
+
+/*
+  Return TRUE if any part of the key is NULL
+
+  SYNOPSIS
+    null_part_in_key()    
+      key_part  Array of key parts (index description)
+      key       Key values tuple
+      length    Length of key values tuple in bytes.
+
+  RETURN
+    TRUE   The tuple has at least one "keypartX is NULL"
+    FALSE  Otherwise
+*/
 
 static bool null_part_in_key(KEY_PART *key_part, const char *key, uint length)
 {
@@ -7426,6 +7467,7 @@
   DBUG_RETURN(error);
 }
 
+
 int QUICK_RANGE_SELECT::reset()
 {
   uint  mrange_bufsiz;
@@ -7443,7 +7485,7 @@
   if (multi_range_length)
   {
     DBUG_ASSERT(multi_range_length == min(multi_range_count, ranges.elements));
-    DBUG_RETURN(0);
+    goto end;
   }
 
   /* Allocate the ranges array. */
@@ -7500,11 +7542,135 @@
     bzero((char*) mrange_buff, mrange_bufsiz);
 #endif
   }
+end:
+  RANGE_SEQ_IF seq_funcs= {quick_range_seq_init, quick_range_seq_next};
+  file->multi_range_read_init(&seq_funcs, (void*)this, ranges.elements, 
+                              (sorted?HA_MRR_SORTED:0) | HA_MRR_NO_ASSOCIATION, multi_range_buff);
   DBUG_RETURN(0);
 }
 
 
 /*
+  Range sequence interface implementation for array<QUICK_RANGE>: initialize
+  
+  SYNOPSIS
+    quick_range_seq_init()
+      init_param  Caller-opaque paramenter: QUICK_RANGE_SELECT* pointer
+      n_ranges    Number of ranges in the sequence (ignored)
+      flags       MRR flags (currently not used) 
+
+  RETURN
+    Opaque value to be passed to quick_range_seq_next
+*/
+
+range_seq_t quick_range_seq_init(void *init_param, uint n_ranges, uint flags)
+{
+  QUICK_RANGE_SELECT *quick= (QUICK_RANGE_SELECT*)init_param;
+  quick->qr_traversal_ctx.first=  (QUICK_RANGE**)quick->ranges.buffer;
+  quick->qr_traversal_ctx.cur=    (QUICK_RANGE**)quick->ranges.buffer;
+  quick->qr_traversal_ctx.last=   quick->qr_traversal_ctx.cur + 
+                                  quick->ranges.elements;
+  return &quick->qr_traversal_ctx;
+}
+
+
+/*
+  Range sequence interface implementation for array<QUICK_RANGE>: get next
+  
+  SYNOPSIS
+    quick_range_seq_next()
+      rseq        Value returned from quick_range_seq_init
+      range  OUT  Store information about the range here
+
+  RETURN
+    0  Ok
+    1  No more ranges in the sequence
+*/
+
+uint quick_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
+{
+  QUICK_RANGE_SEQ_CTX *ctx= (QUICK_RANGE_SEQ_CTX*)rseq;
+
+  if (ctx->cur == ctx->last)
+    return 1; /* no more ranges */
+
+  QUICK_RANGE *cur= *(ctx->cur);
+  key_range *start_key= &range->start_key;
+  key_range *end_key=   &range->end_key;
+
+  start_key->key=    (const byte*)cur->min_key;
+  start_key->length= cur->min_length;
+  start_key->flag=   ((cur->flag & NEAR_MIN) ? HA_READ_AFTER_KEY :
+                      (cur->flag & EQ_RANGE) ?
+                      HA_READ_KEY_EXACT : HA_READ_KEY_OR_NEXT);
+  end_key->key=      (const byte*)cur->max_key;
+  end_key->length=   cur->max_length;
+  /*
+    We use HA_READ_AFTER_KEY here because if we are reading on a key
+    prefix. We want to find all keys with this prefix.
+  */
+  end_key->flag=     (cur->flag & NEAR_MAX ? HA_READ_BEFORE_KEY :
+                      HA_READ_AFTER_KEY);
+  range->range_flag= cur->flag;
+  ctx->cur++;
+  return 0;
+}
+
+
+/*
+  MRR range sequence interface: array<QUICK_RANGE> impl: utility func for NDB
+
+  SYNOPSIS
+    mrr_persistent_flag_storage()
+      seq  Range sequence being traversed
+      idx  Number of range
+
+  DESCRIPTION
+    MRR/NDB implementation needs to store some bits for each range. This
+    function returns a reference to the "range_flag" associated with the
+    range number idx.
+
+    This function should be removed when we get a proper MRR/NDB 
+    implementation.
+
+  RETURN
+    Reference to range_flag associated with range number #idx
+*/
+
+uint16 &mrr_persistent_flag_storage(range_seq_t seq, uint idx)
+{
+  QUICK_RANGE_SEQ_CTX *ctx= (QUICK_RANGE_SEQ_CTX*)seq;
+  return ctx->first[idx]->flag;
+}
+
+
+/*
+  MRR range sequence interface: array<QUICK_RANGE> impl: utility func for NDB
+
+  SYNOPSIS
+    mrr_get_ptr_by_idx()
+      seq  Range sequence bening traversed
+      idx  Number of range
+
+  DESCRIPTION
+    Current MRR/NDB code traverses the sequence of ranges several times and
+    it accesses range-associated data on the second pass. Multiple range
+    sequence interface traversals are not allowed in the new MRR interface,
+    so this function retu
+
+  RETURN
+    Reference to range-associated data.  
+*/
+
+char* &mrr_get_ptr_by_idx(range_seq_t seq, uint idx)
+{
+  static char *dummy;
+  QUICK_RANGE_SEQ_CTX *ctx= (QUICK_RANGE_SEQ_CTX*)seq;
+  return dummy;
+}
+
+
+/*
   Get next possible record using quick-struct.
 
   SYNOPSIS
@@ -7521,71 +7687,8 @@
 
 int QUICK_RANGE_SELECT::get_next()
 {
-  int             result;
-  KEY_MULTI_RANGE *mrange;
-  key_range       *start_key;
-  key_range       *end_key;
-  DBUG_ENTER("QUICK_RANGE_SELECT::get_next");
-  DBUG_ASSERT(multi_range_length && multi_range &&
-              (cur_range >= (QUICK_RANGE**) ranges.buffer) &&
-              (cur_range <= (QUICK_RANGE**) ranges.buffer + ranges.elements));
-
-  for (;;)
-  {
-    if (in_range)
-    {
-      /* We did already start to read this key. */
-      result= file->read_multi_range_next(&mrange);
-      if (result != HA_ERR_END_OF_FILE)
-      {
-        in_range= ! result;
-	DBUG_RETURN(result);
-      }
-    }
-
-    uint count= min(multi_range_length, ranges.elements -
-                    (cur_range - (QUICK_RANGE**) ranges.buffer));
-    if (count == 0)
-    {
-      /* Ranges have already been used up before. None is left for read. */
-      in_range= FALSE;
-      DBUG_RETURN(HA_ERR_END_OF_FILE);
-    }
-    KEY_MULTI_RANGE *mrange_slot, *mrange_end;
-    for (mrange_slot= multi_range, mrange_end= mrange_slot+count;
-         mrange_slot < mrange_end;
-         mrange_slot++)
-    {
-      start_key= &mrange_slot->start_key;
-      end_key= &mrange_slot->end_key;
-      range= *(cur_range++);
-
-      start_key->key=    (const byte*) range->min_key;
-      start_key->length= range->min_length;
-      start_key->flag=   ((range->flag & NEAR_MIN) ? HA_READ_AFTER_KEY :
-                          (range->flag & EQ_RANGE) ?
-                          HA_READ_KEY_EXACT : HA_READ_KEY_OR_NEXT);
-      end_key->key=      (const byte*) range->max_key;
-      end_key->length=   range->max_length;
-      /*
-        We use HA_READ_AFTER_KEY here because if we are reading on a key
-        prefix. We want to find all keys with this prefix.
-      */
-      end_key->flag=     (range->flag & NEAR_MAX ? HA_READ_BEFORE_KEY :
-                          HA_READ_AFTER_KEY);
-
-      mrange_slot->range_flag= range->flag;
-    }
-
-    result= file->read_multi_range_first(&mrange, multi_range, count,
-                                         sorted, multi_range_buff);
-    if (result != HA_ERR_END_OF_FILE)
-    {
-      in_range= ! result;
-      DBUG_RETURN(result);
-    }
-    in_range= FALSE; /* No matching rows; go to next set of ranges. */
-  }
+  char *dummy;
+  return file->multi_range_read_next(&dummy);
 }
 
 
@@ -8615,8 +8718,11 @@
       cur_index_tree= get_index_range_tree(cur_index, tree, param,
                                            &cur_param_idx);
       /* Check if this range tree can be used for prefix retrieval. */
-      cur_quick_prefix_records= check_quick_select(param, cur_param_idx,
-                                                    cur_index_tree);
+      COST_VECT dummy;
+      cur_quick_prefix_records= check_quick_select(param, cur_param_idx, 
+                                                       FALSE /*don't care*/, 
+                                                       cur_index_tree,
+                                                       &dummy);
     }
     cost_group_min_max(table, cur_index_info, used_key_parts,
                        cur_group_key_parts, tree, cur_index_tree,
diff -Nru a/sql/opt_range.h b/sql/opt_range.h
--- a/sql/opt_range.h	2006-03-18 14:54:11 +03:00
+++ b/sql/opt_range.h	2006-03-18 14:54:11 +03:00
@@ -252,6 +252,22 @@
 class PARAM;
 class SEL_ARG;
 
+
+/*
+  MRR range sequence, array<QUICK_RANGE> implementation: sequence traversal
+  context.
+*/
+typedef struct st_quick_range_seq_ctx
+{
+  QUICK_RANGE **first;
+  QUICK_RANGE **cur;
+  QUICK_RANGE **last;
+} QUICK_RANGE_SEQ_CTX;
+
+range_seq_t quick_range_seq_init(void *init_param, uint n_ranges, uint flags);
+uint quick_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range);
+
+
 /*
   Quick select that does a range scan on a single key. The records are
   returned in key order.
@@ -277,7 +293,7 @@
                                        freed by QUICK_RANGE_SELECT) */
   HANDLER_BUFFER *multi_range_buff; /* the handler buffer (allocated and
                                        freed by QUICK_RANGE_SELECT) */
-
+  QUICK_RANGE_SEQ_CTX qr_traversal_ctx;
 protected:
   friend class TRP_ROR_INTERSECT;
   friend
@@ -295,6 +311,9 @@
   friend class QUICK_SELECT_DESC;
   friend class QUICK_INDEX_MERGE_SELECT;
   friend class QUICK_ROR_INTERSECT_SELECT;
+  friend uint quick_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range);
+  friend range_seq_t quick_range_seq_init(void *init_param,
+                                                  uint n_ranges, uint flags);
 
   DYNAMIC_ARRAY ranges;     /* ordered array of range ptrs */
   QUICK_RANGE **cur_range;  /* current element in ranges  */
diff -Nru a/sql/sql_select.cc b/sql/sql_select.cc
--- a/sql/sql_select.cc	2006-03-18 14:54:10 +03:00
+++ b/sql/sql_select.cc	2006-03-18 14:54:11 +03:00
@@ -3456,12 +3456,10 @@
             if (table->used_keys.is_set(key))
             {
               /* we can use only index tree */
-              uint keys_per_block= table->file->block_size/2/
-                (keyinfo->key_length+table->file->ref_length)+1;
-              tmp = record_count*(tmp+keys_per_block-1)/keys_per_block;
+              tmp= record_count * table->file->index_only_read_time(key, tmp);
             }
             else
-              tmp = record_count*min(tmp,s->worst_seeks);
+              tmp= record_count * min(tmp,s->worst_seeks);
           }
         }
         else
@@ -3546,12 +3544,10 @@
             if (table->used_keys.is_set(key))
             {
               /* we can use only index tree */
-              uint keys_per_block= table->file->block_size/2/
-                (keyinfo->key_length+table->file->ref_length)+1;
-              tmp = record_count*(tmp+keys_per_block-1)/keys_per_block;
+              tmp= record_count * table->file->index_only_read_time(key, tmp);
             }
             else
-              tmp = record_count*min(tmp,s->worst_seeks);
+              tmp = record_count * min(tmp,s->worst_seeks);
           }
           else
             tmp = best_time;                    // Do nothing
@@ -4468,12 +4464,10 @@
 	      if (table->used_keys.is_set(key))
 	      {
 		/* we can use only index tree */
-		uint keys_per_block= table->file->block_size/2/
-		  (keyinfo->key_length+table->file->ref_length)+1;
-		tmp=record_count*(tmp+keys_per_block-1)/keys_per_block;
+                tmp= record_count*table->file->index_only_read_time(key, tmp);
 	      }
 	      else
-		tmp=record_count*min(tmp,s->worst_seeks);
+		tmp= record_count*min(tmp,s->worst_seeks);
 	    }
 	  }
 	  else
@@ -4554,12 +4548,11 @@
 	      if (table->used_keys.is_set(key))
 	      {
 		/* we can use only index tree */
-		uint keys_per_block= table->file->block_size/2/
-		  (keyinfo->key_length+table->file->ref_length)+1;
-		tmp=record_count*(tmp+keys_per_block-1)/keys_per_block;
+                tmp= record_count * table->file->index_only_read_time(key, 
+                                                                      tmp);
 	      }
 	      else
-		tmp=record_count*min(tmp,s->worst_seeks);
+		tmp=record_count * min(tmp,s->worst_seeks);
 	    }
 	    else
 	      tmp=best_time;			// Do nothing

BR
 Sergey
-- 
Sergey Petrunia, Software Developer
MySQL AB, www.mysql.com
Office: N/A

Thread
bk commit - 5.2 tree (sergefp:1.2154) WL#2474Sergey Petrunia18 Mar