From: Ole John Aske Date: March 30 2012 6:52am Subject: bzr push into mysql-trunk branch (ole.john.aske:3876 to 3877) WL#5940 List-Archive: http://lists.mysql.com/commits/143368 Message-Id: <20120330065252.6D05A252@fimafeng09.norway.sun.com> MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit 3877 Ole John Aske 2012-03-30 Patch sets for WL#5940: 'handler extensions for pushed join' Patches are commited to branch: mysql-trunk-wl5940_2 which are based on mysql-trunk, revno 3875 @ sql/abstract_query_plan.cc Add an abstraction layer on top of the Query Execution Plan (QEP) in order to make it simpler for non-optimizer modules to extract information from the QEP @ sql/abstract_query_plan.h Add an abstraction layer on top of the Query Execution Plan (QEP) in order to make it simpler for non-optimizer modules to extract information from the QEP @ sql/ha_ndbcluster.h Remove the temporary declaration of HA_PUSH flags as they are now available from handler.h @ sql/handler.cc Implements the handlerton extensions required by pushed joins @ sql/handler.h Defines the handler extensions required by pushed joins @ sql/opt_explain.cc Add code to produce EXPLAIN text for pushed joins @ sql/opt_explain_format.h Add ET_PUSHED_JOIN explain. @ sql/opt_explain_json.cc Add ET_PUSHED_JOIN explain. @ sql/opt_explain_traditional.cc Add ET_PUSHED_JOIN explain. @ sql/sql_executor.cc Extends pick_table_access_method() to also set up new access methods used for pushed joins. Also implements the pushed join access methods join_read_linked_first() and join_read_linked_next() @ sql/sql_optimizer.cc Call handler extensions to build pushed joins if supported. Also pick_table_access_method() at end of ::optimize() @ sql/sql_select.cc pick_table_access_method() is being moved to end of JOIN::optimize(). Instead of prematurely setting the access methods in make_join_readinfo(), and having to maintain/change them during optimize, we defer this work until ::optimize() has completed. added: sql/abstract_query_plan.cc sql/abstract_query_plan.h modified: sql/CMakeLists.txt sql/ha_ndbcluster.h sql/handler.cc sql/handler.h sql/opt_explain.cc sql/opt_explain_format.h sql/opt_explain_json.cc sql/opt_explain_traditional.cc sql/sql_executor.cc sql/sql_optimizer.cc sql/sql_select.cc 3876 Ole John Aske 2012-03-29 Refreshing branch mysql-trunk-wl5940_2 with a fresh mysql-trunk, revno 3875 as a new baseline for reapplying new patches for WL5940 === modified file 'sql/CMakeLists.txt' --- a/sql/CMakeLists.txt 2012-03-06 14:29:42 +0000 +++ b/sql/CMakeLists.txt 2012-03-30 06:52:25 +0000 @@ -37,6 +37,7 @@ IF(SSL_DEFINES) ENDIF() SET(SQL_SHARED_SOURCES + abstract_query_plan.cc datadict.cc debug_sync.cc derror.cc === added file 'sql/abstract_query_plan.cc' --- a/sql/abstract_query_plan.cc 1970-01-01 00:00:00 +0000 +++ b/sql/abstract_query_plan.cc 2012-03-30 06:52:25 +0000 @@ -0,0 +1,527 @@ +/* + Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include "sql_priv.h" +#include "sql_select.h" +#include "sql_optimizer.h" +#include "abstract_query_plan.h" + + +namespace AQP +{ + + /** + @param join_tab Array of access methods constituting the nested loop join. + @param access_count Length of array. + */ + Join_plan::Join_plan(const JOIN* join) + : m_join_tabs(join->join_tab), + m_access_count(join->tables), + m_table_accesses(NULL) + { + /* + This combination is assumed not to appear. If it does, code must + be written to handle it. + */ + DBUG_ASSERT((m_join_tabs[0].use_quick != 2) + || (m_join_tabs[0].type == JT_ALL) + || (m_join_tabs[0].select == NULL) + || (m_join_tabs[0].select->quick == NULL)); + + m_table_accesses= new Table_access[m_access_count]; + for(uint i= 0; i < m_access_count; i++) + { + m_table_accesses[i].m_join_plan= this; + m_table_accesses[i].m_tab_no= i; + } + } + + Join_plan::~Join_plan() + { + delete[] m_table_accesses; + m_table_accesses= NULL; + } + + /** Get the JOIN_TAB of the n'th table access operation.*/ + const JOIN_TAB* Join_plan::get_join_tab(uint join_tab_no) const + { + DBUG_ASSERT(join_tab_no < m_access_count); + return m_join_tabs + join_tab_no; + } + + /** + Determine join type between this table access and some other table + access that preceeds it in the join plan.. + */ + enum_join_type + Table_access::get_join_type(const Table_access* predecessor) const + { + DBUG_ENTER("get_join_type"); + DBUG_ASSERT(get_access_no() > predecessor->get_access_no()); + + if (get_join_tab()->table->pos_in_table_list->outer_join != 0) + { + /* + This cover unnested outer joins such as + 'select * from t1 left join t2 on t1.attr=t1.pk'. + */ + DBUG_PRINT("info", ("JT_OUTER_JOIN between %s and %s", + predecessor->get_join_tab()->table->alias, + get_join_tab()->table->alias)); + DBUG_RETURN(JT_OUTER_JOIN); + } + + const TABLE_LIST* const child_embedding= + get_join_tab()->table->pos_in_table_list->embedding; + + if (child_embedding == NULL) + { + // 'this' is not on the inner side of any left join. + DBUG_PRINT("info", ("JT_INNER_JOIN between %s and %s", + predecessor->get_join_tab()->table->alias, + get_join_tab()->table->alias)); + DBUG_RETURN(JT_INNER_JOIN); + } + + DBUG_ASSERT(child_embedding->outer_join != 0); + + const TABLE_LIST *predecessor_embedding= + predecessor->get_join_tab()->table->pos_in_table_list->embedding; + + /* + This covers the nested join case, i.e: + LEFT JOIN (). + + TABLE_LIST objects form a tree where TABLE_LIST::emebedding points to + the parent object. Now if child_embedding is non null and not an + ancestor of predecessor_embedding in the embedding tree, then 'this' + must be on the inner side of some left join where 'predecessor' is on + the outer side. + */ + while (true) + { + if (predecessor_embedding == child_embedding) + { + DBUG_PRINT("info", ("JT_INNER_JOIN between %s and %s", + predecessor->get_join_tab()->table->alias, + get_join_tab()->table->alias)); + DBUG_RETURN(JT_INNER_JOIN); + } + else if (predecessor_embedding == NULL) + { + /* + We reached the root of the tree without finding child_embedding, + so it must be in another branch and hence on the inner side of some + left join where 'predecessor' is on the outer side. + */ + DBUG_PRINT("info", ("JT_OUTER_JOIN between %s and %s", + predecessor->get_join_tab()->table->alias, + get_join_tab()->table->alias)); + DBUG_RETURN(JT_OUTER_JOIN); + } + // Iterate through ancestors of predecessor_embedding. + predecessor_embedding = predecessor_embedding->embedding; + } + } + + /** + Get the number of key values for this operation. It is an error + to call this method on an operation that is not an index lookup + operation. + */ + uint Table_access::get_no_of_key_fields() const + { + DBUG_ASSERT(m_access_type == AT_PRIMARY_KEY || + m_access_type == AT_UNIQUE_KEY || + m_access_type == AT_MULTI_PRIMARY_KEY || + m_access_type == AT_MULTI_UNIQUE_KEY || + m_access_type == AT_ORDERED_INDEX_SCAN); // Used as 'range scan' + return get_join_tab()->ref.key_parts; + } + + /** + Get the field_no'th key values for this operation. It is an error + to call this method on an operation that is not an index lookup + operation. + */ + const Item* Table_access::get_key_field(uint field_no) const + { + DBUG_ASSERT(field_no < get_no_of_key_fields()); + return get_join_tab()->ref.items[field_no]; + } + + /** + Get the field_no'th KEY_PART_INFO for this operation. It is an error + to call this method on an operation that is not an index lookup + operation. + */ + const KEY_PART_INFO* Table_access::get_key_part_info(uint field_no) const + { + DBUG_ASSERT(field_no < get_no_of_key_fields()); + const KEY* key= &get_join_tab()->table->key_info[get_join_tab()->ref.key]; + return &key->key_part[field_no]; + } + + /** + Get the table that this operation accesses. + */ + TABLE* Table_access::get_table() const + { + return get_join_tab()->table; + } + + double Table_access::get_fanout() const + { + switch (get_access_type()) + { + case AT_PRIMARY_KEY: + case AT_UNIQUE_KEY: + return 1.0; + + case AT_ORDERED_INDEX_SCAN: + DBUG_ASSERT(get_join_tab()->join->best_positions[m_tab_no].records_read>0.0); + return get_join_tab()->join->best_positions[m_tab_no].records_read; + + case AT_MULTI_PRIMARY_KEY: + case AT_MULTI_UNIQUE_KEY: + case AT_MULTI_MIXED: + DBUG_ASSERT(get_join_tab()->join->best_positions[m_tab_no].records_read>0.0); + return get_join_tab()->join->best_positions[m_tab_no].records_read; + + case AT_TABLE_SCAN: + DBUG_ASSERT(get_join_tab()->table->file->stats.records>0.0); + return static_cast(get_join_tab()->table->file->stats.records); + + default: + return 99999999.0; + } + } + + /** Get the JOIN_TAB object that corresponds to this operation.*/ + const JOIN_TAB* Table_access::get_join_tab() const + { + return m_join_plan->get_join_tab(m_tab_no); + } + + /** Get the Item_equal's set relevant for the specified 'Item_field' */ + Item_equal* + Table_access::get_item_equal(const Item_field* field_item) const + { + DBUG_ASSERT(field_item->type() == Item::FIELD_ITEM); + + COND_EQUAL* const cond_equal = get_join_tab()->join->cond_equal; + if (cond_equal!=NULL) + { + return (field_item->item_equal != NULL) + ? field_item->item_equal + : const_cast(field_item)->find_item_equal(cond_equal); + } + return NULL; + } + + /** + Write an entry in the trace file about the contents of this object. + */ + void Table_access::dbug_print() const + { + DBUG_PRINT("info", ("type:%d", get_join_tab()->type)); + DBUG_PRINT("info", ("ref.key:%d", get_join_tab()->ref.key)); + DBUG_PRINT("info", ("ref.key_parts:%d", get_join_tab()->ref.key_parts)); + DBUG_PRINT("info", ("ref.key_length:%d", get_join_tab()->ref.key_length)); + + DBUG_PRINT("info", ("order:%p", get_join_tab()->join->order.order)); + DBUG_PRINT("info", ("skip_sort_order:%d", + get_join_tab()->join->skip_sort_order)); + DBUG_PRINT("info", ("no_order:%d", get_join_tab()->join->no_order)); + DBUG_PRINT("info", ("simple_order:%d", get_join_tab()->join->simple_order)); + + DBUG_PRINT("info", ("group:%d", get_join_tab()->join->group)); + DBUG_PRINT("info", ("group_list:%p", get_join_tab()->join->group_list.order)); + DBUG_PRINT("info", ("simple_group:%d", get_join_tab()->join->simple_group)); + DBUG_PRINT("info", ("group_optimized_away:%d", + get_join_tab()->join->group_optimized_away)); + + DBUG_PRINT("info", ("full_join:%d", get_join_tab()->join->full_join)); + DBUG_PRINT("info", ("need_tmp:%d", get_join_tab()->join->need_tmp)); + DBUG_PRINT("info", ("select_distinct:%d", + get_join_tab()->join->select_distinct)); + + DBUG_PRINT("info", ("use_quick:%d", get_join_tab()->use_quick)); + DBUG_PRINT("info", ("index:%d", get_join_tab()->index)); + DBUG_PRINT("info", ("quick:%p", get_join_tab()->quick)); + DBUG_PRINT("info", ("select:%p", get_join_tab()->select)); + if (get_join_tab()->select && get_join_tab()->select->quick) + { + DBUG_PRINT("info", ("select->quick->get_type():%d", + get_join_tab()->select->quick->get_type())); + } + } + + + /** + Compute the access type and index (if apliccable) of this operation . + */ + void Table_access::compute_type_and_index() const + { + DBUG_ENTER("Table_access::compute_type_and_index"); + const JOIN_TAB* const join_tab= get_join_tab(); + JOIN* const join= join_tab->join; + + /** + * There are some JOIN arguments we don't fully understand or has + * not yet invested time into exploring pushability of: + */ + if (join->procedure) + { + m_access_type= AT_OTHER; + m_other_access_reason = + "'PROCEDURE'-clause post processing cannot be pushed."; + DBUG_VOID_RETURN; + } + + /** + * OLEJA: I think this restriction can be removed + * now as WL5558 and other changes has cleaned up the + * ORDER/GROUP BY optimize + execute path. + */ + if (join->group_list && !join->tmp_table_param.quick_group) + { + m_access_type= AT_OTHER; + m_other_access_reason = + "GROUP BY cannot be done using index on grouped columns."; + DBUG_VOID_RETURN; + } + + /* Tables below 'const_tables' has been const'ified, or entirely + * optimized away due to 'impossible WHERE/ON' + */ + if (join_tab < join->join_tab+join->const_tables) + { + DBUG_PRINT("info", ("Operation %d is const-optimized.", m_tab_no)); + m_access_type= AT_FIXED; + DBUG_VOID_RETURN; + } + + /* + Identify the type of access operation and the index to use (if any). + */ + switch (join_tab->type) + { + case JT_EQ_REF: + case JT_CONST: + m_index_no= join_tab->ref.key; + + if (m_index_no == static_cast(join_tab->table->s->primary_key)) + { + DBUG_PRINT("info", ("Operation %d is a primary key lookup.", m_tab_no)); + m_access_type= AT_PRIMARY_KEY; + } + else + { + DBUG_PRINT("info", ("Operation %d is a unique index lookup.", + m_tab_no)); + m_access_type= AT_UNIQUE_KEY; + } + break; + + case JT_REF: + { + DBUG_ASSERT(join_tab->ref.key >= 0); + DBUG_ASSERT((uint)join_tab->ref.key < MAX_KEY); + m_index_no= join_tab->ref.key; + + /* + All parts of a key are specified for an unique index -> access is a key lookup. + */ + const KEY *key_info= join_tab->table->s->key_info; + if (key_info[m_index_no].key_parts == join_tab->ref.key_parts && + key_info[m_index_no].flags & HA_NOSAME) + { + m_access_type= + (m_index_no == static_cast(join_tab->table->s->primary_key)) + ? AT_PRIMARY_KEY + : AT_UNIQUE_KEY; + DBUG_PRINT("info", ("Operation %d is an unique key referrence.", m_tab_no)); + } + else + { + DBUG_ASSERT(join_tab->ref.key_parts > 0); + DBUG_ASSERT(join_tab->ref.key_parts <= key_info[m_index_no].key_parts); + m_access_type= AT_ORDERED_INDEX_SCAN; + DBUG_PRINT("info", ("Operation %d is an ordered index scan.", m_tab_no)); + } + break; + } + case JT_INDEX_SCAN: + DBUG_ASSERT(join_tab->index < MAX_KEY); + m_index_no= join_tab->index; + m_access_type= AT_ORDERED_INDEX_SCAN; + DBUG_PRINT("info", ("Operation %d is an ordered index scan.", m_tab_no)); + break; + + case JT_ALL: + if (join_tab->use_quick == 2) + { + /* + use_quick == 2 means that the decision on which access method to use + will be taken late (as rows from the preceeding operation arrive). + This operation is therefor not pushable. + */ + DBUG_PRINT("info", + ("Operation %d has 'use_quick == 2' -> not pushable", + m_tab_no)); + m_access_type= AT_UNDECIDED; + m_index_no= -1; + } + else + { + if (join_tab->select != NULL && + join_tab->select->quick != NULL) + { + QUICK_SELECT_I *quick= join_tab->select->quick; + + /** QUICK_SELECT results in execution of MRR (Multi Range Read). + * Depending on each range, it may require execution of + * either a PK-lookup or a range scan. To cover both of + * these we may need to prepare both a pushed lookup join + * and a pushed range scan. Currently we handle it as + * a range scan and convert e PK lookup to a (closed-) range + * whenever required. + **/ + + const KEY *key_info= join_tab->table->s->key_info; + DBUG_EXECUTE("info", quick->dbug_dump(0, TRUE);); + + // Temporary assert as we are still investigation the relation between + // 'quick->index == MAX_KEY' and the different quick_types + DBUG_ASSERT ((quick->index == MAX_KEY) == + ((quick->get_type() == QUICK_SELECT_I::QS_TYPE_INDEX_MERGE) || + (quick->get_type() == QUICK_SELECT_I::QS_TYPE_ROR_INTERSECT) || + (quick->get_type() == QUICK_SELECT_I::QS_TYPE_ROR_UNION))); + + // JT_INDEX_MERGE: We have a set of qualifying PKs as root of pushed joins + if (quick->index == MAX_KEY) + { + m_index_no= join_tab->table->s->primary_key; + m_access_type= AT_MULTI_PRIMARY_KEY; // Multiple PKs are produced by merge + } + + // Else JT_RANGE: May be both exact PK and/or index scans when sorted index available + else if (quick->index == join_tab->table->s->primary_key) + { + m_index_no= quick->index; + if (key_info[m_index_no].algorithm == HA_KEY_ALG_HASH) + m_access_type= AT_MULTI_PRIMARY_KEY; // MRR w/ multiple PK's + else + m_access_type= AT_MULTI_MIXED; // MRR w/ both range and PKs + } + else + { + m_index_no= quick->index; + if (key_info[m_index_no].algorithm == HA_KEY_ALG_HASH) + m_access_type= AT_MULTI_UNIQUE_KEY; // MRR with multiple unique keys + else + m_access_type= AT_MULTI_MIXED; // MRR w/ both range and unique keys + } + } + else + { + DBUG_PRINT("info", ("Operation %d is a table scan.", m_tab_no)); + m_access_type= AT_TABLE_SCAN; + } + } + break; + + default: + /* + Other join_types either cannot be pushed or the code analyze them is + not yet in place. + */ + DBUG_PRINT("info", + ("Operation %d has join_type %d. -> Not pushable.", + m_tab_no, join_tab->type)); + m_access_type= AT_OTHER; + m_index_no= -1; + m_other_access_reason = "This table access method can not be pushed."; + break; + } + DBUG_VOID_RETURN; + } + // Table_access::compute_type_and_index() + + + Table_access::Table_access() + :m_join_plan(NULL), + m_tab_no(0), + m_access_type(AT_VOID), + m_other_access_reason(NULL), + m_index_no(-1) + {} + + /** + Check if the results from this operation will joined with results + from the next operation using a join buffer (instead of plain nested loop). + @return True if using a join buffer. + */ + bool Table_access::uses_join_cache() const + { + return get_join_tab()->next_select == sub_select_cache; + } + + /** + Check if this table will be presorted to an intermediate record storage + before it is joined with its siblings. + */ + bool Table_access::filesort_before_join() const + { + if (m_access_type == AT_PRIMARY_KEY || + m_access_type == AT_UNIQUE_KEY) + { + return false; + } + + const JOIN_TAB* const join_tab= get_join_tab(); + JOIN* const join= join_tab->join; + + /** + Table will be presorted before joining with child tables, if: + 1) This is the first non-const table + 2) There are more tables to be joined + 3) It is not already decide to write entire join result to temp. + 4a) The GROUP BY is 'simple' and does not match an orderd index + 4b) The ORDER BY is 'simple' and does not match an orderd index + + A 'simple' order/group by contain only column references to + the first non-const table + */ + if (join_tab == join->join_tab+join->const_tables && // First non-const table + join->const_tables < join->tables) // There are more tables + { + if (join->need_tmp) + return false; + else if (join->group_list && join->simple_group) + return (join->ordered_index_usage!=JOIN::ordered_index_group_by); + else if (join->order && join->simple_order) + return (join->ordered_index_usage!=JOIN::ordered_index_order_by); + else + return false; + } + return false; + } + +}; +// namespace AQP === added file 'sql/abstract_query_plan.h' --- a/sql/abstract_query_plan.h 1970-01-01 00:00:00 +0000 +++ b/sql/abstract_query_plan.h 2012-03-30 06:52:25 +0000 @@ -0,0 +1,305 @@ +/* + Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#ifndef ABSTRACT_QUERY_PLAN_H_INCLUDED +#define ABSTRACT_QUERY_PLAN_H_INCLUDED + +struct TABLE; +struct st_join_table; +typedef st_join_table JOIN_TAB; +class JOIN; +class Item; +class Item_field; +class Item_equal_iterator; + +#include "sql_list.h" + +/** + Abstract query plan (AQP) is an interface for examining certain aspects of + query plans without accessing mysqld internal classes (JOIN_TAB, SQL_SELECT + etc.) directly. + + AQP maps join execution plans, as represented by mysqld internals, to a set + of facade classes. Non-join operations such as sorting and aggregation is + currently *not* modelled in the AQP. + + The AQP models an n-way join as a sequence of the n table access operations + that the MySQL server would execute as part of its nested loop join + execution. (Each such table access operation is a scan of a table or index, + or an index lookup.) For each lookup operation, it is possible to examine + the expression that represents each field in the key. + + A storage enging will typically use the AQP for finding sections of a join + execution plan that may be executed in the engine rather than in mysqld. By + using the AQP rather than the mysqld internals directly, the coupling between + the engine and mysqld is reduced. +*/ +namespace AQP +{ + class Table_access; + + /** + This class represents a query plan for an n-way join, in the form a + sequence of n table access operations that will execute as a nested loop + join. + */ + class Join_plan : public Sql_alloc + { + friend class Equal_set_iterator; + friend class Table_access; + public: + + explicit Join_plan(const JOIN* join); + + ~Join_plan(); + + const Table_access* get_table_access(uint access_no) const; + + uint get_access_count() const; + + private: + /** + Array of the JOIN_TABs that are the internal representation of table + access operations. + */ + const JOIN_TAB* const m_join_tabs; + + /** Number of table access operations. */ + const uint m_access_count; + Table_access* m_table_accesses; + + const JOIN_TAB* get_join_tab(uint join_tab_no) const; + + // No copying. + Join_plan(const Join_plan&); + Join_plan& operator=(const Join_plan&); + }; + // class Join_plan + + + /** + This class is an iterator for iterating over sets of fields (columns) that + should have the same value. For example, if the query is + SELECT * FROM T1, T2, T3 WHERE T1.b = T2.a AND T2.a = T3.a + then there would be such a set of {T1.b, T2.a, T3.a}. + */ + class Equal_set_iterator : public Sql_alloc + { + public: + explicit Equal_set_iterator(Item_equal& item_equal) + : m_iterator(item_equal) {} + + const Item_field* next() + { return m_iterator++; } + + private: + /** + This class is implemented in terms of this mysqld internal class. + */ + Item_equal_iterator m_iterator; + + // No copying. + Equal_set_iterator(const Equal_set_iterator&); + Equal_set_iterator& operator=(const Equal_set_iterator&); + }; + // class Equal_set_iterator + + /** The type of a table access operation. */ + enum enum_access_type + { + /** For default initialization.*/ + AT_VOID, + /** Value has already been fetched / determined by optimizer.*/ + AT_FIXED, + /** Do a lookup of a single primary key.*/ + AT_PRIMARY_KEY, + /** Do a lookup of a single unique index key.*/ + AT_UNIQUE_KEY, + /** Scan an ordered index with a single upper and lower bound pair.*/ + AT_ORDERED_INDEX_SCAN, + /** Do a multi range read for a set of primary keys.*/ + AT_MULTI_PRIMARY_KEY, + /** Do a multi range read for a set of unique index keys.*/ + AT_MULTI_UNIQUE_KEY, + /** + Do a multi range read for a mix of ranges (for which there is an + ordered index), and either primary keys or unique index keys. + */ + AT_MULTI_MIXED, + /** Scan a table. (No index is assumed to be used.) */ + AT_TABLE_SCAN, + /** Access method will not be chosen before the execution phase.*/ + AT_UNDECIDED, + /** + The access method has properties that prevents it from being pushed to a + storage engine. + */ + AT_OTHER + }; + + /** The type of join operation require */ + enum enum_join_type + { + JT_OUTER_JOIN, + JT_INNER_JOIN, + JT_SEMI_JOIN + }; + + /** + This class represents an access operation on a table, such as a table + scan, or a scan or lookup via an index. A Table_access object is always + owned by a Join_plan object, such that the life time of the Table_access + object ends when the life time of the owning Join_plan object ends. + */ + class Table_access : public Sql_alloc + { + friend class Join_plan; + friend inline bool equal(const Table_access*, const Table_access*); + public: + + const Join_plan* get_join_plan() const; + + enum_access_type get_access_type() const; + + const char* get_other_access_reason() const; + + enum_join_type get_join_type(const Table_access* parent) const; + + uint get_no_of_key_fields() const; + + const Item* get_key_field(uint field_no) const; + + const KEY_PART_INFO* get_key_part_info(uint field_no) const; + + uint get_access_no() const; + + int get_index_no() const; + + TABLE* get_table() const; + + double get_fanout() const; + + Item_equal* get_item_equal(const Item_field* field_item) const; + + void dbug_print() const; + + bool uses_join_cache() const; + + bool filesort_before_join() const; + + private: + + /** Backref. to the Join_plan which this Table_access is part of */ + const Join_plan* m_join_plan; + + /** This operation corresponds to m_root_tab[m_tab_no].*/ + uint m_tab_no; + + /** The type of this operation.*/ + mutable enum_access_type m_access_type; + + /** + The reason for getting m_access_type==AT_OTHER. Used for explain extended. + */ + mutable const char* m_other_access_reason; + + /** The index to use for this operation (if applicable )*/ + mutable int m_index_no; + + explicit Table_access(); + + const JOIN_TAB* get_join_tab() const; + + void compute_type_and_index() const; + + /** No copying*/ + Table_access(const Table_access&); + Table_access& operator=(const Table_access&); + }; + // class Table_access + + /** + Get the n'th table access operation. + @param access_no The index of the table access operation to fetch. + @return The access_no'th table access operation. + */ + inline const Table_access* Join_plan::get_table_access(uint access_no) const + { + DBUG_ASSERT(access_no < m_access_count); + return m_table_accesses + access_no; + } + + /** + @return The number of table access operations in the nested loop join. + */ + inline uint Join_plan::get_access_count() const + { + return m_access_count; + } + + /** Get the Join_plan that this Table_access belongs to.*/ + inline const Join_plan* Table_access::get_join_plan() const + { + return m_join_plan; + } + + /** Get the type of this operation.*/ + inline enum_access_type Table_access::get_access_type() const + { + if (m_access_type == AT_VOID) + compute_type_and_index(); + return m_access_type; + } + + /** + Get a description of the reason for getting access_type==AT_OTHER. To be + used for informational messages. + @return A string that should be assumed to have the same life time as the + Table_access object. + */ + inline const char* Table_access::get_other_access_reason() const + { + if (m_access_type == AT_VOID) + compute_type_and_index(); + return m_other_access_reason; + } + + /** + @return The number of the index to use for this access operation ( + or -1 for non-index operations). + */ + inline int Table_access::get_index_no() const + { + if (m_access_type == AT_VOID) + compute_type_and_index(); + + return m_index_no; + } + + /** + Get the number of this Table_access within the enclosing Join_plan. + (This number will be in the range 0 to Join_plan::get_access_count() - 1.) + */ + inline uint Table_access::get_access_no() const + { + return m_tab_no; + } + +}; +// namespace AQP + +#endif === modified file 'sql/ha_ndbcluster.h' --- a/sql/ha_ndbcluster.h 2012-03-28 11:25:37 +0000 +++ b/sql/ha_ndbcluster.h 2012-03-30 06:52:25 +0000 @@ -415,13 +415,6 @@ static void set_tabname(const char *path bool maybe_pushable_join(const char*& reason) const; int assign_pushed_join(const ndb_pushed_join* pushed_join); -#ifdef NDB_WITHOUT_JOIN_PUSHDOWN - enum ha_push_flag { - HA_PUSH_BLOCK_CONST_TABLE, - HA_PUSH_MULTIPLE_DEPENDENCY, - HA_PUSH_NO_ORDERED_INDEX - }; -#endif bool test_push_flag(enum ha_push_flag flag) const; uint number_of_pushed_joins() const; === modified file 'sql/handler.cc' --- a/sql/handler.cc 2012-03-28 11:25:37 +0000 +++ b/sql/handler.cc 2012-03-30 06:52:25 +0000 @@ -4423,6 +4423,44 @@ int ha_table_exists_in_engine(THD* thd, DBUG_RETURN(args.err); } +/** + Prepare (sub-) sequences of joins in this statement + which may be pushed to each storage engine for execution. +*/ +struct st_make_pushed_join_args +{ + const AQP::Join_plan* plan; // Query plan provided by optimizer + int err; // Error code to return. +}; + +static my_bool make_pushed_join_handlerton(THD *thd, plugin_ref plugin, + void *arg) +{ + st_make_pushed_join_args *vargs= (st_make_pushed_join_args *)arg; + handlerton *hton= plugin_data(plugin, handlerton *); + + if (hton && hton->make_pushed_join) + { + const int error= hton->make_pushed_join(hton, thd, vargs->plan); + if (unlikely(error)) + { + vargs->err = error; + return TRUE; + } + } + return FALSE; +} + +int ha_make_pushed_joins(THD *thd, const AQP::Join_plan* plan) +{ + DBUG_ENTER("ha_make_pushed_joins"); + st_make_pushed_join_args args= {plan, 0}; + plugin_foreach(thd, make_pushed_join_handlerton, + MYSQL_STORAGE_ENGINE_PLUGIN, &args); + DBUG_PRINT("exit", ("error: %d", args.err)); + DBUG_RETURN(args.err); +} + #ifdef HAVE_NDB_BINLOG /* TODO: change this into a dynamic struct === modified file 'sql/handler.h' --- a/sql/handler.h 2012-03-28 11:25:37 +0000 +++ b/sql/handler.h 2012-03-30 06:52:25 +0000 @@ -406,6 +406,25 @@ typedef ulonglong my_xid; // this line i #define COMPATIBLE_DATA_YES 0 #define COMPATIBLE_DATA_NO 1 +namespace AQP { + class Join_plan; +}; + +/* Flag used for for test_push_flag() */ +enum ha_push_flag { + + /* Handler want to block const table optimization */ + HA_PUSH_BLOCK_CONST_TABLE + + /* Handler reports a pushed join as having multiple dependencies + if its results does not only depend on the root operation: + ie. results from some child operations does not only depend + on results from the root operation and/or other child operations + within this pushed join + */ + ,HA_PUSH_MULTIPLE_DEPENDENCY +}; + /** struct xid_t is binary compatible with the XID structure as in the X/Open CAE Specification, Distributed Transaction Processing: @@ -812,6 +831,8 @@ struct handlerton const char *wild, bool dir, List *files); int (*table_exists_in_engine)(handlerton *hton, THD* thd, const char *db, const char *name); + int (*make_pushed_join)(handlerton *hton, THD* thd, + const AQP::Join_plan* plan); uint32 license; /* Flag for Engine License */ void *data; /* Location for engines to keep personal structures */ }; @@ -2422,6 +2443,39 @@ public: in_range_check_pushed_down= false; } + /** + Reports #tables included in pushed join which this + handler instance is part of. ==0 -> Not pushed + */ + virtual uint number_of_pushed_joins() const + { return 0; } + + /** + If this handler instance is part of a pushed join sequence + returned TABLE instance being root of the pushed query? + */ + virtual const TABLE* root_of_pushed_join() const + { return NULL; } + + /** + If this handler instance is a child in a pushed join sequence + returned TABLE instance being my parent? + */ + virtual const TABLE* parent_of_pushed_join() const + { return NULL; } + + virtual bool test_push_flag(enum ha_push_flag flag) const + { + return FALSE; + } + + virtual int index_read_pushed(uchar * buf, const uchar * key, + key_part_map keypart_map) + { return HA_ERR_WRONG_COMMAND; } + + virtual int index_next_pushed(uchar * buf) + { return HA_ERR_WRONG_COMMAND; } + /** Part of old, deprecated in-place ALTER API. */ @@ -3090,6 +3144,9 @@ int ha_rollback_to_savepoint(THD *thd, S int ha_savepoint(THD *thd, SAVEPOINT *sv); int ha_release_savepoint(THD *thd, SAVEPOINT *sv); +/* Build pushed joins in handlers implementing this feature */ +int ha_make_pushed_joins(THD *thd, const AQP::Join_plan* plan); + /* these are called by storage engines */ void trans_register_ha(THD *thd, bool all, handlerton *ht); === modified file 'sql/opt_explain.cc' --- a/sql/opt_explain.cc 2012-03-21 21:19:11 +0000 +++ b/sql/opt_explain.cc 2012-03-30 06:52:25 +0000 @@ -890,6 +890,46 @@ bool Explain_table_base::explain_extra_c return true; } + const TABLE* pushed_root= table->file->root_of_pushed_join(); + if (pushed_root) + { + char buf[128]; + int len; + int pushed_id= 0; + + for (JOIN_TAB* prev= join->join_tab; prev <= tab; prev++) + { + const TABLE* prev_root= prev->table->file->root_of_pushed_join(); + if (prev_root == prev->table) + { + pushed_id++; + if (prev_root == pushed_root) + break; + } + } + if (pushed_root == table) + { + uint pushed_count= tab->table->file->number_of_pushed_joins(); + len= my_snprintf(buf, sizeof(buf)-1, + "Parent of %d pushed join@%d", + pushed_count, pushed_id); + } + else + { + len= my_snprintf(buf, sizeof(buf)-1, + "Child of '%s' in pushed join@%d", + tab->table->file->parent_of_pushed_join()->alias, + pushed_id); + } + + { + StringBuffer<128> buff(cs); + buff.append(buf,len); + if (push_extra(ET_PUSHED_JOIN, buff)) + return true; + } + } + switch (quick_type) { case QUICK_SELECT_I::QS_TYPE_ROR_UNION: case QUICK_SELECT_I::QS_TYPE_ROR_INTERSECT: === modified file 'sql/opt_explain_format.h' --- a/sql/opt_explain_format.h 2012-02-29 11:17:52 +0000 +++ b/sql/opt_explain_format.h 2012-03-30 06:52:25 +0000 @@ -98,6 +98,7 @@ enum Extra_tag ET_CONST_ROW_NOT_FOUND, ET_UNIQUE_ROW_NOT_FOUND, ET_IMPOSSIBLE_ON_CONDITION, + ET_PUSHED_JOIN, //------------------------------------ ET_total }; === modified file 'sql/opt_explain_json.cc' --- a/sql/opt_explain_json.cc 2012-03-21 21:19:11 +0000 +++ b/sql/opt_explain_json.cc 2012-03-30 06:52:25 +0000 @@ -53,6 +53,7 @@ static const char *json_extra_tags[ET_to "const_row_not_found", // ET_CONST_ROW_NOT_FOUND "unique_row_not_found", // ET_UNIQUE_ROW_NOT_FOUND "impossible_on_condition", // ET_IMPOSSIBLE_ON_CONDITION + NULL // ET_PUSHED_JOIN }; === modified file 'sql/opt_explain_traditional.cc' --- a/sql/opt_explain_traditional.cc 2012-02-29 11:17:52 +0000 +++ b/sql/opt_explain_traditional.cc 2012-03-30 06:52:25 +0000 @@ -52,6 +52,7 @@ static const char *traditional_extra_tag "const row not found", // ET_CONST_ROW_NOT_FOUND "unique row not found", // ET_UNIQUE_ROW_NOT_FOUND "Impossible ON condition" // ET_IMPOSSIBLE_ON_CONDITION + "" // ET_PUSHED_JOIN }; @@ -202,6 +203,7 @@ bool Explain_format_traditional::flush_e break; } if (e->tag != ET_FIRST_MATCH && // for backward compatibility + e->tag != ET_PUSHED_JOIN && buff.append(" ")) return true; if (brackets && buff.append("(")) === modified file 'sql/sql_executor.cc' --- a/sql/sql_executor.cc 2012-03-26 10:29:07 +0000 +++ b/sql/sql_executor.cc 2012-03-30 06:52:25 +0000 @@ -81,6 +81,8 @@ static int join_ft_read_next(READ_RECORD static int join_read_always_key_or_null(JOIN_TAB *tab); static int join_read_next_same_or_null(READ_RECORD *info); static int join_read_record_no_init(JOIN_TAB *tab); +static int join_read_linked_first(JOIN_TAB *tab); +static int join_read_linked_next(READ_RECORD *info); // Create list for using with tempory table static bool change_to_use_tmp_fields(THD *thd, Ref_ptr_array ref_pointer_array, List &new_list1, @@ -2962,6 +2964,76 @@ join_read_key_unlock_row(st_join_table * tab->ref.use_count--; } +/** + Read a table *assumed* to be included in execution of a pushed join. + This is the counterpart of join_read_key() / join_read_always_key() + for child tables in a pushed join. + + When the table access is performed as part of the pushed join, + all 'linked' child colums are prefetched together with the parent row. + The handler will then only format the row as required by MySQL and set + 'table->status' accordingly. + + However, there may be situations where the prepared pushed join was not + executed as assumed. It is the responsibility of the handler to handle + these situation by letting ::index_read_pushed() then effectively do a + plain old' index_read_map(..., HA_READ_KEY_EXACT); + + @param tab Table to read + + @retval + 0 Row was found + @retval + -1 Row was not found + @retval + 1 Got an error (other than row not found) during read +*/ +static int +join_read_linked_first(JOIN_TAB *tab) +{ + TABLE *table= tab->table; + DBUG_ENTER("join_read_linked_first"); + + DBUG_ASSERT(!tab->sorted); // Pushed child can't be sorted + if (!table->file->inited) + table->file->ha_index_init(tab->ref.key, tab->sorted); + + if (cp_buffer_from_ref(tab->join->thd, table, &tab->ref)) + { + table->status=STATUS_NOT_FOUND; + DBUG_RETURN(-1); + } + + // 'read' itself is a NOOP: + // handler::index_read_pushed() only unpack the prefetched row and set 'status' + int error=table->file->index_read_pushed(table->record[0], + tab->ref.key_buff, + make_prev_keypart_map(tab->ref.key_parts)); + if (unlikely(error && error != HA_ERR_KEY_NOT_FOUND && error != HA_ERR_END_OF_FILE)) + DBUG_RETURN(report_error(table, error)); + + table->null_row=0; + int rc= table->status ? -1 : 0; + DBUG_RETURN(rc); +} + +static int +join_read_linked_next(READ_RECORD *info) +{ + TABLE *table= info->table; + DBUG_ENTER("join_read_linked_next"); + + int error=table->file->index_next_pushed(table->record[0]); + if (error) + { + if (unlikely(error != HA_ERR_END_OF_FILE)) + DBUG_RETURN(report_error(table, error)); + table->status= STATUS_GARBAGE; + DBUG_RETURN(-1); + } + DBUG_RETURN(error); +} + /* ref access method implementation: "read_first" function @@ -3043,8 +3115,19 @@ join_read_last_key(JOIN_TAB *tab) /* ARGSUSED */ static int -join_no_more_records(READ_RECORD *info __attribute__((unused))) +join_no_more_records(READ_RECORD *info) { + /** + * When a pushed join completes, and its results did not only depend on + * the key of this root operations: ('tab->ref.key_buff') + * Results from this pushed join can not be reused + * for later queries having the same root key. + * (ref: join_read_key(), join_read_const() & join_read_system() + */ + if (info->table->file->test_push_flag(HA_PUSH_MULTIPLE_DEPENDENCY)) + { + info->table->status= STATUS_GARBAGE; + } return -1; } @@ -3311,6 +3394,36 @@ join_read_next_same_or_null(READ_RECORD void pick_table_access_method(JOIN_TAB *tab) { + /** + Set up modified access function for pushed joins. + */ + uint pushed_joins= tab->table->file->number_of_pushed_joins(); + if (pushed_joins > 0) + { + if (tab->table->file->root_of_pushed_join() != tab->table) + { + /* + Is child of a pushed join operation: + Replace access functions with its linked counterpart. + ... Which is effectively a NOOP as the row is already fetched + together with the root of the linked operation. + */ + DBUG_ASSERT(tab->type != JT_REF_OR_NULL); + tab->read_first_record= join_read_linked_first; + tab->read_record.read_record= join_read_linked_next; + tab->read_record.unlock_row= rr_unlock_row; + return; + } + } + + /** + Already set to some non-default value in sql_select.cc + TODO: Move these settings into pick_table_access_method() also + */ + else if (tab->read_first_record != NULL) + return; + + // Fall through to set default access functions: switch (tab->type) { case JT_REF: === modified file 'sql/sql_optimizer.cc' --- a/sql/sql_optimizer.cc 2012-03-28 13:39:57 +0000 +++ b/sql/sql_optimizer.cc 2012-03-30 06:52:25 +0000 @@ -36,6 +36,7 @@ #include "sql_parse.h" #include "my_bit.h" #include "lock.h" +#include "abstract_query_plan.h" #include "opt_explain_format.h" // Explain_format_flags #include @@ -943,6 +944,31 @@ JOIN::optimize() } } + /** + * Push joins to handler(s) whenever possible. + * The handlers will inspect the QEP through the + * AQP (Abstract Query Plan), and extract from it + * whatewer it might implement of pushed execution. + * It is the responsibility if the handler to store any + * information it need for later execution of pushed queries. + * + * Currently pushed joins are only implemented by NDB. + */ + { + const AQP::Join_plan plan(this); + if (ha_make_pushed_joins(thd, &plan)) + DBUG_RETURN(1); + } + + /** + * Set up access functions for the tables as + * required by the selected access type. + */ + for (uint i= const_tables; i < tables; i++) + { + pick_table_access_method (&join_tab[i]); + } + tmp_having= having; if (!(select_options & SELECT_DESCRIBE)) { @@ -3022,6 +3048,14 @@ make_join_statistics(JOIN *join, TABLE_L */ extract_method= extract_no_table; } + else if (table->file->test_push_flag(HA_PUSH_BLOCK_CONST_TABLE)) + { + /* + Handler implements pushed joins, and prefer const tables to + be pushed together with rest of the pushed query. + */ + extract_method= extract_no_table; + } else if (*s->on_expr_ref) { /* s is the only inner table of an outer join, extract empty tables */ @@ -3198,12 +3232,14 @@ const_table_extraction_done: 3. are part of semi-join, or 4. have an expensive outer join condition. DontEvaluateMaterializedSubqueryTooEarly + 5. are blocked by handler for const table optimize. */ if (eq_part.is_prefix(table->key_info[key].key_parts) && !table->fulltext_searched && // 1 !tl->in_outer_join_nest() && // 2 !(tl->embedding && tl->embedding->sj_on_expr) && // 3 - !(*s->on_expr_ref && (*s->on_expr_ref)->is_expensive())) // 4 + !(*s->on_expr_ref && (*s->on_expr_ref)->is_expensive()) &&// 4 + !table->file->test_push_flag(HA_PUSH_BLOCK_CONST_TABLE)) // 5 { if (table->key_info[key].flags & HA_NOSAME) { === modified file 'sql/sql_select.cc' --- a/sql/sql_select.cc 2012-03-22 08:13:35 +0000 +++ b/sql/sql_select.cc 2012-03-30 06:52:25 +0000 @@ -2636,7 +2636,9 @@ make_join_readinfo(JOIN *join, ulonglong tab->sorted= (tab->type != JT_EQ_REF) ? sorted : false; sorted= false; // only first must be sorted table->status= STATUS_GARBAGE | STATUS_NOT_FOUND; - pick_table_access_method (tab); + tab->read_first_record= NULL; // Access methods not set yet + tab->read_record.read_record= NULL; + tab->read_record.unlock_row= rr_unlock_row; Opt_trace_object trace_refine_table(trace); trace_refine_table.add_utf8_table(table); @@ -3833,7 +3835,6 @@ test_if_skip_sort_order(JOIN_TAB *tab,OR goto use_filesort; DBUG_ASSERT(tab->type != JT_REF_OR_NULL && tab->type != JT_FT); - pick_table_access_method(tab); } else { No bundle (reason: useless for push emails).