From: Ole John Aske Date: May 22 2012 9:04am Subject: bzr push into mysql-trunk branch (ole.john.aske:3900 to 3901) WL#5940 List-Archive: http://lists.mysql.com/commits/143887 Message-Id: <20120522090434.B2230254@fimafeng09.norway.sun.com> MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit 3901 Ole John Aske 2012-05-22 WL#5940: Handler extension and optimizer addaption required for pushed joins. modified: sql/CMakeLists.txt sql/ha_ndbcluster.cc sql/ha_ndbcluster.h sql/handler.cc sql/handler.h sql/opt_explain.cc sql/opt_explain_format.h sql/opt_explain_json.cc sql/opt_explain_traditional.cc sql/sql_executor.cc sql/sql_optimizer.cc sql/sql_select.cc 3900 Hemant Kumar 2012-05-22 Several tests are failing on mysql/daily trunk with "Thread stack overrun" on Solaris.Making these tests experimental since Alik is trying to setup a workaround. modified: mysql-test/collections/default.experimental === modified file 'sql/CMakeLists.txt' --- a/sql/CMakeLists.txt 2012-05-18 14:39:25 +0000 +++ b/sql/CMakeLists.txt 2012-05-22 09:03:56 +0000 @@ -49,6 +49,7 @@ IF(HAVE_VISIBILITY_HIDDEN) ENDIF() SET(SQL_SHARED_SOURCES + abstract_query_plan.cc datadict.cc debug_sync.cc derror.cc === modified file 'sql/ha_ndbcluster.cc' --- a/sql/ha_ndbcluster.cc 2012-04-25 12:30:33 +0000 +++ b/sql/ha_ndbcluster.cc 2012-05-22 09:03:56 +0000 @@ -12345,6 +12345,14 @@ ulonglong ha_ndbcluster::table_flags(voi */ if (thd->variables.binlog_format == BINLOG_FORMAT_STMT) f= (f | HA_BINLOG_STMT_CAPABLE) & ~HA_HAS_OWN_BINLOGGING; + + /** + * To maximize join pushability we want const-table + * optimization blocked if 'ndb_join_pushdown= on' + */ + if (THDVAR(thd, join_pushdown)) + f= f | HA_BLOCK_CONST_TABLE; + return f; } @@ -14504,97 +14512,6 @@ ha_ndbcluster::parent_of_pushed_join() c return NULL; } -bool -ha_ndbcluster::test_push_flag(enum ha_push_flag flag) const -{ - DBUG_ENTER("test_push_flag"); - switch (flag) { - case HA_PUSH_BLOCK_CONST_TABLE: - { - /** - * We don't support join push down if... - * - not LM_CommittedRead - * - uses blobs - */ - THD *thd= current_thd; - if (unlikely(!THDVAR(thd, join_pushdown))) - DBUG_RETURN(false); - - if (table->read_set != NULL && uses_blob_value(table->read_set)) - { - DBUG_RETURN(false); - } - - NdbOperation::LockMode lm= get_ndb_lock_mode(m_lock.type); - - if (lm != NdbOperation::LM_CommittedRead) - { - DBUG_RETURN(false); - } - - DBUG_RETURN(true); - } - case HA_PUSH_MULTIPLE_DEPENDENCY: - /** - * If any child operation within this pushed join refer - * column values (paramValues), the pushed join has dependencies - * in addition to the root operation itself. - */ - if (m_pushed_join_operation==PUSHED_ROOT && - m_pushed_join_member->get_field_referrences_count() > 0) // Childs has field refs - { - DBUG_RETURN(true); - } - DBUG_RETURN(false); - - case HA_PUSH_NO_ORDERED_INDEX: - { - if (m_pushed_join_operation != PUSHED_ROOT) - { - DBUG_RETURN(true); - } - const NdbQueryDef& query_def = m_pushed_join_member->get_query_def(); - const NdbQueryOperationDef::Type root_type= - query_def.getQueryOperation((uint)PUSHED_ROOT)->getType(); - - /** - * Primary key/ unique key lookup is always 'ordered' wrt. itself. - */ - if (root_type == NdbQueryOperationDef::PrimaryKeyAccess || - root_type == NdbQueryOperationDef::UniqueIndexAccess) - { - DBUG_RETURN(false); - } - - /** - * Ordered index scan can be provided as an ordered resultset iff - * it has no child scans. - */ - if (root_type == NdbQueryOperationDef::OrderedIndexScan) - { - for (uint i= 1; i < query_def.getNoOfOperations(); i++) - { - const NdbQueryOperationDef::Type child_type= - query_def.getQueryOperation(i)->getType(); - if (child_type == NdbQueryOperationDef::TableScan || - child_type == NdbQueryOperationDef::OrderedIndexScan) - { - DBUG_RETURN(true); - } - } - DBUG_RETURN(false); - } - DBUG_RETURN(true); - } - - default: - DBUG_ASSERT(0); - DBUG_RETURN(false); - } - DBUG_RETURN(false); -} - - /** @param[in] comment table comment defined by user === modified file 'sql/ha_ndbcluster.h' --- a/sql/ha_ndbcluster.h 2012-04-25 12:30:33 +0000 +++ b/sql/ha_ndbcluster.h 2012-05-22 09:03:56 +0000 @@ -414,15 +414,6 @@ static void set_tabname(const char *path bool maybe_pushable_join(const char*& reason) const; int assign_pushed_join(const ndb_pushed_join* pushed_join); -#ifdef NDB_WITHOUT_JOIN_PUSHDOWN - enum ha_push_flag { - HA_PUSH_BLOCK_CONST_TABLE, - HA_PUSH_MULTIPLE_DEPENDENCY, - HA_PUSH_NO_ORDERED_INDEX - }; -#endif - bool test_push_flag(enum ha_push_flag flag) const; - uint number_of_pushed_joins() const; const TABLE* root_of_pushed_join() const; const TABLE* parent_of_pushed_join() const; === modified file 'sql/handler.cc' --- a/sql/handler.cc 2012-05-21 12:08:05 +0000 +++ b/sql/handler.cc 2012-05-22 09:03:56 +0000 @@ -5069,6 +5069,44 @@ int ha_table_exists_in_engine(THD* thd, DBUG_RETURN(args.err); } +/** + Prepare (sub-) sequences of joins in this statement + which may be pushed to each storage engine for execution. +*/ +struct st_make_pushed_join_args +{ + const AQP::Join_plan* plan; // Query plan provided by optimizer + int err; // Error code to return. +}; + +static my_bool make_pushed_join_handlerton(THD *thd, plugin_ref plugin, + void *arg) +{ + st_make_pushed_join_args *vargs= (st_make_pushed_join_args *)arg; + handlerton *hton= plugin_data(plugin, handlerton *); + + if (hton && hton->make_pushed_join) + { + const int error= hton->make_pushed_join(hton, thd, vargs->plan); + if (unlikely(error)) + { + vargs->err = error; + return TRUE; + } + } + return FALSE; +} + +int ha_make_pushed_joins(THD *thd, const AQP::Join_plan* plan) +{ + DBUG_ENTER("ha_make_pushed_joins"); + st_make_pushed_join_args args= {plan, 0}; + plugin_foreach(thd, make_pushed_join_handlerton, + MYSQL_STORAGE_ENGINE_PLUGIN, &args); + DBUG_PRINT("exit", ("error: %d", args.err)); + DBUG_RETURN(args.err); +} + #ifdef HAVE_NDB_BINLOG /* TODO: change this into a dynamic struct === modified file 'sql/handler.h' --- a/sql/handler.h 2012-05-11 12:05:39 +0000 +++ b/sql/handler.h 2012-05-22 09:03:56 +0000 @@ -231,6 +231,12 @@ enum enum_alter_inplace_result { */ #define HA_READ_OUT_OF_SYNC (LL(1) << 40) +/* + The handler don't want accesses to this table to + be const-table optimized +*/ +#define HA_BLOCK_CONST_TABLE (LL(1) << 41) + /* bits in index_flags(index_number) for what you can do with index */ #define HA_READ_NEXT 1 /* TODO really use this flag */ #define HA_READ_PREV 2 /* supports ::index_prev */ @@ -480,6 +486,10 @@ typedef ulonglong my_xid; // this line i #define COMPATIBLE_DATA_YES 0 #define COMPATIBLE_DATA_NO 1 +namespace AQP { + class Join_plan; +}; + /** struct xid_t is binary compatible with the XID structure as in the X/Open CAE Specification, Distributed Transaction Processing: @@ -886,6 +896,8 @@ struct handlerton const char *wild, bool dir, List *files); int (*table_exists_in_engine)(handlerton *hton, THD* thd, const char *db, const char *name); + int (*make_pushed_join)(handlerton *hton, THD* thd, + const AQP::Join_plan* plan); /** List of all system tables specific to the SE. @@ -2525,6 +2537,34 @@ public: in_range_check_pushed_down= false; } + /** + Reports #tables included in pushed join which this + handler instance is part of. ==0 -> Not pushed + */ + virtual uint number_of_pushed_joins() const + { return 0; } + + /** + If this handler instance is part of a pushed join sequence + returned TABLE instance being root of the pushed query? + */ + virtual const TABLE* root_of_pushed_join() const + { return NULL; } + + /** + If this handler instance is a child in a pushed join sequence + returned TABLE instance being my parent? + */ + virtual const TABLE* parent_of_pushed_join() const + { return NULL; } + + virtual int index_read_pushed(uchar * buf, const uchar * key, + key_part_map keypart_map) + { return HA_ERR_WRONG_COMMAND; } + + virtual int index_next_pushed(uchar * buf) + { return HA_ERR_WRONG_COMMAND; } + /** Part of old, deprecated in-place ALTER API. */ @@ -3195,6 +3235,9 @@ int ha_rollback_to_savepoint(THD *thd, S int ha_savepoint(THD *thd, SAVEPOINT *sv); int ha_release_savepoint(THD *thd, SAVEPOINT *sv); +/* Build pushed joins in handlers implementing this feature */ +int ha_make_pushed_joins(THD *thd, const AQP::Join_plan* plan); + /* these are called by storage engines */ void trans_register_ha(THD *thd, bool all, handlerton *ht); === modified file 'sql/opt_explain.cc' --- a/sql/opt_explain.cc 2012-05-11 19:37:22 +0000 +++ b/sql/opt_explain.cc 2012-05-22 09:03:56 +0000 @@ -904,6 +904,46 @@ bool Explain_table_base::explain_extra_c return true; } + const TABLE* pushed_root= table->file->root_of_pushed_join(); + if (pushed_root) + { + char buf[128]; + int len; + int pushed_id= 0; + + for (JOIN_TAB* prev= join->join_tab; prev <= tab; prev++) + { + const TABLE* prev_root= prev->table->file->root_of_pushed_join(); + if (prev_root == prev->table) + { + pushed_id++; + if (prev_root == pushed_root) + break; + } + } + if (pushed_root == table) + { + uint pushed_count= tab->table->file->number_of_pushed_joins(); + len= my_snprintf(buf, sizeof(buf)-1, + "Parent of %d pushed join@%d", + pushed_count, pushed_id); + } + else + { + len= my_snprintf(buf, sizeof(buf)-1, + "Child of '%s' in pushed join@%d", + tab->table->file->parent_of_pushed_join()->alias, + pushed_id); + } + + { + StringBuffer<128> buff(cs); + buff.append(buf,len); + if (push_extra(ET_PUSHED_JOIN, buff)) + return true; + } + } + switch (quick_type) { case QUICK_SELECT_I::QS_TYPE_ROR_UNION: case QUICK_SELECT_I::QS_TYPE_ROR_INTERSECT: === modified file 'sql/opt_explain_format.h' --- a/sql/opt_explain_format.h 2012-05-11 19:37:22 +0000 +++ b/sql/opt_explain_format.h 2012-05-22 09:03:56 +0000 @@ -98,6 +98,7 @@ enum Extra_tag ET_CONST_ROW_NOT_FOUND, ET_UNIQUE_ROW_NOT_FOUND, ET_IMPOSSIBLE_ON_CONDITION, + ET_PUSHED_JOIN, //------------------------------------ ET_total }; === modified file 'sql/opt_explain_json.cc' --- a/sql/opt_explain_json.cc 2012-04-04 17:22:45 +0000 +++ b/sql/opt_explain_json.cc 2012-05-22 09:03:56 +0000 @@ -53,6 +53,7 @@ static const char *json_extra_tags[ET_to "const_row_not_found", // ET_CONST_ROW_NOT_FOUND "unique_row_not_found", // ET_UNIQUE_ROW_NOT_FOUND "impossible_on_condition", // ET_IMPOSSIBLE_ON_CONDITION + "pushed_join" // ET_PUSHED_JOIN }; === modified file 'sql/opt_explain_traditional.cc' --- a/sql/opt_explain_traditional.cc 2012-05-11 19:37:22 +0000 +++ b/sql/opt_explain_traditional.cc 2012-05-22 09:03:56 +0000 @@ -52,6 +52,7 @@ static const char *traditional_extra_tag "const row not found", // ET_CONST_ROW_NOT_FOUND "unique row not found", // ET_UNIQUE_ROW_NOT_FOUND "Impossible ON condition" // ET_IMPOSSIBLE_ON_CONDITION + "" // ET_PUSHED_JOIN }; @@ -202,6 +203,7 @@ bool Explain_format_traditional::flush_e break; } if (e->tag != ET_FIRST_MATCH && // for backward compatibility + e->tag != ET_PUSHED_JOIN && buff.append(" ")) return true; if (brackets && buff.append("(")) === modified file 'sql/sql_executor.cc' --- a/sql/sql_executor.cc 2012-05-16 08:13:19 +0000 +++ b/sql/sql_executor.cc 2012-05-22 09:03:56 +0000 @@ -81,6 +81,8 @@ static int join_ft_read_next(READ_RECORD static int join_read_always_key_or_null(JOIN_TAB *tab); static int join_read_next_same_or_null(READ_RECORD *info); static int join_read_record_no_init(JOIN_TAB *tab); +static int join_read_linked_first(JOIN_TAB *tab); +static int join_read_linked_next(READ_RECORD *info); // Create list for using with tempory table static bool change_to_use_tmp_fields(THD *thd, Ref_ptr_array ref_pointer_array, List &new_list1, @@ -2972,6 +2974,76 @@ join_read_key_unlock_row(st_join_table * tab->ref.use_count--; } +/** + Read a table *assumed* to be included in execution of a pushed join. + This is the counterpart of join_read_key() / join_read_always_key() + for child tables in a pushed join. + + When the table access is performed as part of the pushed join, + all 'linked' child colums are prefetched together with the parent row. + The handler will then only format the row as required by MySQL and set + 'table->status' accordingly. + + However, there may be situations where the prepared pushed join was not + executed as assumed. It is the responsibility of the handler to handle + these situation by letting ::index_read_pushed() then effectively do a + plain old' index_read_map(..., HA_READ_KEY_EXACT); + + @param tab Table to read + + @retval + 0 Row was found + @retval + -1 Row was not found + @retval + 1 Got an error (other than row not found) during read +*/ +static int +join_read_linked_first(JOIN_TAB *tab) +{ + TABLE *table= tab->table; + DBUG_ENTER("join_read_linked_first"); + + DBUG_ASSERT(!tab->sorted); // Pushed child can't be sorted + if (!table->file->inited) + table->file->ha_index_init(tab->ref.key, tab->sorted); + + if (cp_buffer_from_ref(tab->join->thd, table, &tab->ref)) + { + table->status=STATUS_NOT_FOUND; + DBUG_RETURN(-1); + } + + // 'read' itself is a NOOP: + // handler::index_read_pushed() only unpack the prefetched row and set 'status' + int error=table->file->index_read_pushed(table->record[0], + tab->ref.key_buff, + make_prev_keypart_map(tab->ref.key_parts)); + if (unlikely(error && error != HA_ERR_KEY_NOT_FOUND && error != HA_ERR_END_OF_FILE)) + DBUG_RETURN(report_error(table, error)); + + table->null_row=0; + int rc= table->status ? -1 : 0; + DBUG_RETURN(rc); +} + +static int +join_read_linked_next(READ_RECORD *info) +{ + TABLE *table= info->table; + DBUG_ENTER("join_read_linked_next"); + + int error=table->file->index_next_pushed(table->record[0]); + if (error) + { + if (unlikely(error != HA_ERR_END_OF_FILE)) + DBUG_RETURN(report_error(table, error)); + table->status= STATUS_GARBAGE; + DBUG_RETURN(-1); + } + DBUG_RETURN(error); +} + /* ref access method implementation: "read_first" function @@ -3353,6 +3425,36 @@ join_read_next_same_or_null(READ_RECORD void pick_table_access_method(JOIN_TAB *tab) { + /** + Set up modified access function for pushed joins. + */ + uint pushed_joins= tab->table->file->number_of_pushed_joins(); + if (pushed_joins > 0) + { + if (tab->table->file->root_of_pushed_join() != tab->table) + { + /* + Is child of a pushed join operation: + Replace access functions with its linked counterpart. + ... Which is effectively a NOOP as the row is already fetched + together with the root of the linked operation. + */ + DBUG_ASSERT(tab->type != JT_REF_OR_NULL); + tab->read_first_record= join_read_linked_first; + tab->read_record.read_record= join_read_linked_next; + tab->read_record.unlock_row= rr_unlock_row; + return; + } + } + + /** + Already set to some non-default value in sql_select.cc + TODO: Move these settings into pick_table_access_method() also + */ + else if (tab->read_first_record != NULL) + return; + + // Fall through to set default access functions: switch (tab->type) { case JT_REF: === modified file 'sql/sql_optimizer.cc' --- a/sql/sql_optimizer.cc 2012-04-30 10:06:23 +0000 +++ b/sql/sql_optimizer.cc 2012-05-22 09:03:56 +0000 @@ -36,6 +36,7 @@ #include "sql_parse.h" #include "my_bit.h" #include "lock.h" +#include "abstract_query_plan.h" #include "opt_explain_format.h" // Explain_format_flags #include @@ -948,6 +949,33 @@ JOIN::optimize() } } + /** + * Push joins to handler(s) whenever possible. + * The handlers will inspect the QEP through the + * AQP (Abstract Query Plan), and extract from it + * whatewer it might implement of pushed execution. + * It is the responsibility if the handler to store any + * information it need for later execution of pushed queries. + * + * Currently pushed joins are only implemented by NDB. + * It only make sense to try pushing if > 1 tables. + */ + if ((tables-const_tables) > 1) + { + const AQP::Join_plan plan(this); + if (ha_make_pushed_joins(thd, &plan)) + DBUG_RETURN(1); + } + + /** + * Set up access functions for the tables as + * required by the selected access type. + */ + for (uint i= const_tables; i < tables; i++) + { + pick_table_access_method (&join_tab[i]); + } + tmp_having= having; if (!(select_options & SELECT_DESCRIBE)) { @@ -3203,12 +3231,14 @@ const_table_extraction_done: 3. are part of semi-join, or 4. have an expensive outer join condition. DontEvaluateMaterializedSubqueryTooEarly + 5. are blocked by handler for const table optimize. */ if (eq_part.is_prefix(table->key_info[key].key_parts) && !table->fulltext_searched && // 1 !tl->in_outer_join_nest() && // 2 !(tl->embedding && tl->embedding->sj_on_expr) && // 3 - !(*s->on_expr_ref && (*s->on_expr_ref)->is_expensive())) // 4 + !(*s->on_expr_ref && (*s->on_expr_ref)->is_expensive()) &&// 4 + !(table->file->ha_table_flags() & HA_BLOCK_CONST_TABLE)) // 5 { if (table->key_info[key].flags & HA_NOSAME) { === modified file 'sql/sql_select.cc' --- a/sql/sql_select.cc 2012-05-16 08:13:19 +0000 +++ b/sql/sql_select.cc 2012-05-22 09:03:56 +0000 @@ -1511,7 +1511,8 @@ bool create_ref_for_key(JOIN *join, JOIN j->type= null_ref_key ? JT_REF_OR_NULL : JT_REF; j->ref.null_ref_key= null_ref_key; } - else if (keyuse_uses_no_tables) + else if (keyuse_uses_no_tables && + !(table->file->ha_table_flags() & HA_BLOCK_CONST_TABLE)) { /* This happen if we are using a constant expression in the ON part @@ -2638,7 +2639,9 @@ make_join_readinfo(JOIN *join, ulonglong tab->sorted= (tab->type != JT_EQ_REF) ? sorted : false; sorted= false; // only first must be sorted table->status= STATUS_GARBAGE | STATUS_NOT_FOUND; - pick_table_access_method (tab); + tab->read_first_record= NULL; // Access methods not set yet + tab->read_record.read_record= NULL; + tab->read_record.unlock_row= rr_unlock_row; Opt_trace_object trace_refine_table(trace); trace_refine_table.add_utf8_table(table); @@ -3983,7 +3986,6 @@ check_reverse_order: goto use_filesort; DBUG_ASSERT(tab->type != JT_REF_OR_NULL && tab->type != JT_FT); - pick_table_access_method(tab); } else if (best_key >= 0) { No bundle (reason: useless for push emails).