Martin,
IMO this patch is too risky to go into 5.0. Please consider some
more conservative solution for 5.0, where we e.g. detect the case and
issue some warning/error, and we explain all group_concat limitations
in the docs.
Also, it seems that the patch in some cases delivers the group_concat
result in the wrong order. Please check my comments to the test results.
My remaining comments are inline in the patch below.
Timour
> Below is the list of changes that have just been committed into a local
> 5.0 repository of martin. When martin does a push these changes will
> be propagated to the main repository and, within 24 hours after the
> push, to the public repository.
> For information on how to access the public repository
> see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html
>
> ChangeSet@stripped, 2007-12-06 10:47:13+01:00, mhansson@stripped +4 -0
> Bug#32798: DISTINCT in GROUP_CONCAT clause fails when ordering by a column
> with null values
>
> For queries containing GROUP_CONCAT(DISTINCT fields ORDER BY fields), there
> was a limitation that the DISTINCT fields had to be the same as ORDER BY
> fields, owing to the fact that one single sorted tree was used for keeping
> track of tuples, ordering and uniqueness. Fixed by introducing a second
> structure to handle uniqueness so that the original structure has only to
> order the result.
>
> mysql-test/r/func_gconcat.result@stripped, 2007-12-06 10:47:11+01:00,
> mhansson@stripped +46 -1
> Bug#32798:
> Hunk 1: Wrong test result turned correct after fix.
> Hunk 2: Correct test result
Hmm, I have no idea what is "hunk", please rewrite.
>
> mysql-test/t/func_gconcat.test@stripped, 2007-12-06 10:47:11+01:00,
> mhansson@stripped +50 -0
> Bug#32798: Test case
>
> sql/item_sum.cc@stripped, 2007-12-06 10:47:11+01:00, mhansson@stripped +75 -81
> Bug#32798: Implementation of fix. Dead code removal.
>
> - removed comment describing this bug
> - removed function group_concat_key_cmp_with_distinct
> - removed function group_concat_key_cmp_with_distinct_and_order
> - Added a Unique object to maintain uniqueness of values.
> - Added a comparison method to compare tuples based only on selected fields.
>
> sql/item_sum.h@stripped, 2007-12-06 10:47:11+01:00, mhansson@stripped +38 -3
> Bug#32798: Declarations and comments.
>
> diff -Nrup a/mysql-test/r/func_gconcat.result b/mysql-test/r/func_gconcat.result
> --- a/mysql-test/r/func_gconcat.result 2007-07-19 18:20:41 +02:00
> +++ b/mysql-test/r/func_gconcat.result 2007-12-06 10:47:11 +01:00
> @@ -271,7 +271,7 @@ group_concat(distinct s1 order by s2)
> c,b,a
> select group_concat(distinct s1 order by s2) from t1;
> group_concat(distinct s1 order by s2)
> -c,b,a,c
> +c,b,a
> drop table t1;
> create table t1 (a int, c int);
> insert into t1 values (1, 2), (2, 3), (2, 4), (3, 5);
> @@ -819,4 +819,49 @@ id group_concat(b.name)
> 1 óra,óra
> 2 óra,óra
> drop table t1;
> +CREATE TABLE t1 (
> +a INT,
> +b INT
> +);
Please put all table DDL in one line - makes tests shorter, more fits
on a screen.
> +INSERT INTO t1 VALUES (1, NULL), (2, NULL), (2, 1);
> +SELECT GROUP_CONCAT(DISTINCT a ORDER BY b) FROM t1;
> +GROUP_CONCAT(DISTINCT a ORDER BY b)
> +2,1
The result of the query below is (1,2):
mysql> select distinct a from t1 order by b;
+------+
| a |
+------+
| 1 |
| 2 |
+------+
Therefore the result of group_concat should be "1,2", not "2,1".
> +SELECT GROUP_CONCAT(DISTINCT a) FROM t1;
> +GROUP_CONCAT(DISTINCT a)
> +1,2
> +CREATE TABLE t2 (
> +a INT,
> +b INT
> +);
> +INSERT INTO t2 VALUES (1, 1), (2, 1), (2, 3);
> +SELECT GROUP_CONCAT(DISTINCT a ORDER BY b) FROM t2;
> +GROUP_CONCAT(DISTINCT a ORDER BY b)
> +2,1
Likewise:
mysql> select distinct a from t2 order by b;
+------+
| a |
+------+
| 1 |
| 2 |
+------+
Thus I expect the result of group_concat to be "1,2", not "2,1".
Can you explain these differences?
> +CREATE TABLE t3 (
> +a INT,
> +b INT,
> +c INT,
> +d INT
> +);
> +INSERT INTO t3 VALUES (1,1, 1,1), (1,1, 2,2), (1,2, 2,1), (2,1, 1,2);
> +SELECT GROUP_CONCAT(DISTINCT a, b ORDER BY c, d) FROM t3;
> +GROUP_CONCAT(DISTINCT a, b ORDER BY c, d)
> +11,21,12
> +SELECT GROUP_CONCAT(DISTINCT a, b ORDER BY d, c) FROM t3;
> +GROUP_CONCAT(DISTINCT a, b ORDER BY d, c)
> +11,12,21
The above two queries work as expected:
mysql> select distinct a,b from t3 order by c ,d;
+------+------+
| a | b |
+------+------+
| 1 | 1 |
| 2 | 1 |
| 1 | 2 |
+------+------+
3 rows in set (0.00 sec)
mysql> select distinct a,b from t3 order by d ,c;
+------+------+
| a | b |
+------+------+
| 1 | 1 |
| 1 | 2 |
| 2 | 1 |
+------+------+
> +CREATE TABLE t4 (
> +a INT,
> +b INT,
> +c INT
> +);
> +INSERT INTO t4 VALUES (1, 1, 1), (2, 1, 2), (3, 2, 1);
> +SELECT GROUP_CONCAT(DISTINCT a, b ORDER BY b, c) FROM t4;
> +GROUP_CONCAT(DISTINCT a, b ORDER BY b, c)
> +11,21,32
> +SELECT GROUP_CONCAT(DISTINCT a, b ORDER BY c, b) FROM t4;
> +GROUP_CONCAT(DISTINCT a, b ORDER BY c, b)
> +11,32,21
> +DROP TABLE t1, t2, t3, t4;
> End of 5.0 tests
Please add tests where:
- there is GROUP BY in the query that contains the group_concat,
and group_concat has DISTINCT on some other field(s),
- there is DESC order inside GROUP_CONCAT (together with DISTINCT)
> diff -Nrup a/mysql-test/t/func_gconcat.test b/mysql-test/t/func_gconcat.test
> --- a/mysql-test/t/func_gconcat.test 2007-07-19 18:20:17 +02:00
> +++ b/mysql-test/t/func_gconcat.test 2007-12-06 10:47:11 +01:00
> @@ -562,4 +562,54 @@ insert into t1 (id, name) values (2, "ór
> select b.id, group_concat(b.name) from t1 a, t1 b group by b.id;
> drop table t1;
>
> +#
> +# Bug#32798: DISTINCT in GROUP_CONCAT clause fails when ordering by a column
> +# with null values
> +#
> +CREATE TABLE t1 (
> + a INT,
> + b INT
> +);
> +
> +INSERT INTO t1 VALUES (1, NULL), (2, NULL), (2, 1);
> +
> +SELECT GROUP_CONCAT(DISTINCT a ORDER BY b) FROM t1;
> +SELECT GROUP_CONCAT(DISTINCT a) FROM t1;
> +
> +CREATE TABLE t2 (
> + a INT,
> + b INT
> +);
> +
> +INSERT INTO t2 VALUES (1, 1), (2, 1), (2, 3);
> +
> +SELECT GROUP_CONCAT(DISTINCT a ORDER BY b) FROM t2;
> +
> +
> +CREATE TABLE t3 (
> + a INT,
> + b INT,
> + c INT,
> + d INT
> +);
> +
> +# There is one duplicate in the expression list: 1,10
> +# There is one duplicate in ORDER BY list, but that shouldnt matter: 1,10
> +INSERT INTO t3 VALUES (1,1, 1,1), (1,1, 2,2), (1,2, 2,1), (2,1, 1,2);
> +
> +SELECT GROUP_CONCAT(DISTINCT a, b ORDER BY c, d) FROM t3;
> +SELECT GROUP_CONCAT(DISTINCT a, b ORDER BY d, c) FROM t3;
> +
> +CREATE TABLE t4 (
> + a INT,
> + b INT,
> + c INT
> +);
> +
> +INSERT INTO t4 VALUES (1, 1, 1), (2, 1, 2), (3, 2, 1);
> +
> +SELECT GROUP_CONCAT(DISTINCT a, b ORDER BY b, c) FROM t4;
> +SELECT GROUP_CONCAT(DISTINCT a, b ORDER BY c, b) FROM t4;
> +DROP TABLE t1, t2, t3, t4;
> +
> --echo End of 5.0 tests
> diff -Nrup a/sql/item_sum.cc b/sql/item_sum.cc
> --- a/sql/item_sum.cc 2007-10-14 10:28:11 +02:00
> +++ b/sql/item_sum.cc 2007-12-06 10:47:11 +01:00
> @@ -2831,48 +2831,9 @@ String *Item_sum_udf_str::val_str(String
> concat of values from "group by" operation
>
> BUGS
> - DISTINCT and ORDER BY only works if ORDER BY uses all fields and only fields
> - in expression list
> Blobs doesn't work with DISTINCT or ORDER BY
> *****************************************************************************/
>
> -/*
> - function of sort for syntax:
> - GROUP_CONCAT(DISTINCT expr,...)
> -*/
> -
See my comment towards the end of the patch. I suggest to keep
the signature of this function, replace the body.
> -int group_concat_key_cmp_with_distinct(void* arg, byte* key1,
> - byte* key2)
> -{
> - Item_func_group_concat* grp_item= (Item_func_group_concat*)arg;
> - TABLE *table= grp_item->table;
> - Item **field_item, **end;
> -
> - for (field_item= grp_item->args, end= field_item +
> grp_item->arg_count_field;
> - field_item < end;
> - field_item++)
> - {
> - /*
> - We have to use get_tmp_table_field() instead of
> - real_item()->get_tmp_table_field() because we want the field in
> - the temporary table, not the original field
> - */
> - Field *field= (*field_item)->get_tmp_table_field();
> - /*
> - If field_item is a const item then either get_tp_table_field returns 0
> - or it is an item over a const table.
> - */
> - if (field && !(*field_item)->const_item())
> - {
> - int res;
> - uint offset= field->offset() - table->s->null_bytes;
> - if ((res= field->cmp((char *) key1 + offset, (char *) key2 + offset)))
> - return res;
> - }
> - }
> - return 0;
> -}
> -
>
> /*
> function of sort for syntax:
> @@ -2918,25 +2879,6 @@ int group_concat_key_cmp_with_order(void
>
>
> /*
> - function of sort for syntax:
> - GROUP_CONCAT(DISTINCT expr,... ORDER BY col,... )
> -
> - BUG:
> - This doesn't work in the case when the order by contains data that
> - is not part of the field list because tree-insert will not notice
> - the duplicated values when inserting things sorted by ORDER BY
> -*/
> -
You forgot to remove this function from the list of friends of
Item_func_group_concat.
> -int group_concat_key_cmp_with_distinct_and_order(void* arg,byte* key1,
> - byte* key2)
> -{
> - if (!group_concat_key_cmp_with_distinct(arg,key1,key2))
> - return 0;
> - return(group_concat_key_cmp_with_order(arg,key1,key2));
> -}
> -
> -
> -/*
> Append data from current leaf to item->result
> */
>
> @@ -3020,7 +2962,7 @@ Item_func_group_concat(Name_resolution_c
> bool distinct_arg, List<Item> *select_list,
> SQL_LIST *order_list, String *separator_arg)
> :tmp_table_param(0), warning(0),
> - separator(separator_arg), tree(0), table(0),
> + separator(separator_arg), tree(0), unique_filter(NULL), table(0),
> order(0), context(context_arg),
> arg_count_order(order_list ? order_list->elements : 0),
> arg_count_field(select_list->elements),
> @@ -3075,6 +3017,7 @@ Item_func_group_concat::Item_func_group_
> warning(item->warning),
> separator(item->separator),
> tree(item->tree),
> + unique_filter(item->unique_filter),
> table(item->table),
> order(item->order),
> context(item->context),
> @@ -3125,6 +3068,11 @@ void Item_func_group_concat::cleanup()
> delete_tree(tree);
> tree= 0;
> }
> + if (unique_filter)
> + {
> + delete unique_filter;
> + unique_filter= NULL;
> + }
> if (warning)
> {
> char warn_buff[MYSQL_ERRMSG_SIZE];
> @@ -3154,6 +3102,11 @@ void Item_func_group_concat::clear()
> no_appended= TRUE;
> if (tree)
> reset_tree(tree);
> + if (distinct)
> + {
> + DBUG_ASSERT(unique_filter);
> + unique_filter->reset();
> + }
> /* No need to reset the table as we never call write_row */
> }
>
> @@ -3177,9 +3130,19 @@ bool Item_func_group_concat::add()
> }
>
> null_value= FALSE;
> + bool row_eligible= TRUE;
> +
Add a comment here, e.g.: "Filter duplicate rows ..."
> + if (distinct)
> + {
> + DBUG_ASSERT(unique_filter);
Above - no need for ASSERT - you get a crash anyway if
unique_filter == NULL.
> + uint count= unique_filter->elements_in_tree();
> + unique_filter->unique_add(table->record[0] + table->s->null_bytes);
Above: why don't you check for the result of unique_add? What if
its TRUE?
> + if (count == unique_filter->elements_in_tree())
> + row_eligible= FALSE;
> + }
>
> TREE_ELEMENT *el= 0; // Only for safety
> - if (tree)
> + if (row_eligible && tree)
> el= tree_insert(tree, table->record[0] + table->s->null_bytes, 0,
> tree->custom_arg);
> /*
> @@ -3187,7 +3150,7 @@ bool Item_func_group_concat::add()
> we can dump the row here in case of GROUP_CONCAT(DISTINCT...)
> instead of doing tree traverse later.
> */
> - if (!warning_for_row &&
> + if (row_eligible && !warning_for_row &&
> (!tree || (el->count == 1 && distinct &&
> !arg_count_order)))
> dump_leaf_key(table->record[0] + table->s->null_bytes, 1, this);
>
> @@ -3334,38 +3297,36 @@ bool Item_func_group_concat::setup(THD *
> table->file->extra(HA_EXTRA_NO_ROWS);
> table->no_rows= 1;
>
> + if (!(arg_count_order || distinct))
> + DBUG_RETURN(FALSE);
The above test is not need - if the two tests below fail (equivalent to
above), we return FALSE as well.
>
> - if (distinct || arg_count_order)
> - {
> - /*
> - Need sorting: init tree and choose a function to sort.
> - Don't reserve space for NULLs: if any of gconcat arguments is NULL,
> - the row is not added to the result.
> - */
> - uint tree_key_length= table->s->reclength - table->s->null_bytes;
> + /**
Above replace "**" with "*".
> + Need sorting or uniqueness: init tree and choose a function to sort.
> + Don't reserve space for NULLs: if any of gconcat arguments is NULL,
> + the row is not added to the result.
> + */
> + uint tree_key_length= table->s->reclength - table->s->null_bytes;
>
> + if (arg_count_order)
> + {
> tree= &tree_base;
> - if (arg_count_order)
> - {
> - if (distinct)
> - compare_key= (qsort_cmp2) group_concat_key_cmp_with_distinct_and_order;
> - else
> - compare_key= (qsort_cmp2) group_concat_key_cmp_with_order;
> - }
> - else
> - {
> - compare_key= (qsort_cmp2) group_concat_key_cmp_with_distinct;
> - }
> + compare_key= (qsort_cmp2) group_concat_key_cmp_with_order;
> /*
> Create a tree for sorting. The tree is used to sort and to remove
> duplicate values (according to the syntax of this function). If there
Is the above still correct? Do we still use this tree both to sort and
remove duplicates?
> - is no DISTINCT or ORDER BY clauses, we don't create this tree.
> + is no ORDER BY clause, we don't create this tree.
> */
> init_tree(tree, (uint) min(thd->variables.max_heap_table_size,
> thd->variables.sortbuff_size/16), 0,
> tree_key_length, compare_key, 0, NULL, (void*) this);
> }
>
> + if (distinct)
> + unique_filter= new Unique(&Item_func_group_concat::compare_select_list,
> + (void*)this,
> + tree_key_length,
> + thd->variables.max_heap_table_size);
> +
> DBUG_RETURN(FALSE);
> }
>
> @@ -3434,4 +3395,37 @@ void Item_func_group_concat::print(Strin
> str->append(STRING_WITH_LEN(" separator \'"));
> str->append(*separator);
> str->append(STRING_WITH_LEN("\')"));
> +}
> +
> +
Move the method comment from the class declaration here. Our comment
standard requires that the comment is before the implementation.
> +int Item_func_group_concat::compare_select_list(const void *key1,
> + const void *key2)
> +{
> + for (uint i= 0; i < arg_count_field; i++)
> + {
> + Item *item= args[i];
> + if (!item->const_item())
> + {
Remove one level of nesting by changing the above IF with:
+ if (item->const_item())
+ continue;
> + /*
> + We have to use get_tmp_table_field() instead of
> + real_item()->get_tmp_table_field() because we want the field in the
> + temporary table, not the original field
> + */
> + Field *field= item->get_tmp_table_field();
> + DBUG_ASSERT(field);
> + int res;
> + uint offset= field->offset() - table->s->null_bytes;
> + res= field->cmp((char *) key1 + offset, (char *) key2 + offset);
> + if (res)
> + return res;
Above:
- where do you deal with ASC/DESC order?
- our coding style requires:
if ((res= field->cmp((char *) key1 + offset, (char *) key2 + offset)))
return res;
> + }
> + }
> + return 0;
> +}
> +
> +
> +Item_func_group_concat::~Item_func_group_concat()
> +{
> + if (unique_filter)
> + delete unique_filter;
> }
> diff -Nrup a/sql/item_sum.h b/sql/item_sum.h
> --- a/sql/item_sum.h 2007-06-29 09:39:15 +02:00
> +++ b/sql/item_sum.h 2007-12-06 10:47:11 +01:00
> @@ -1173,11 +1173,22 @@ class Item_func_group_concat : public It
> String *separator;
> TREE tree_base;
> TREE *tree;
> +
> + /**
> + If DISTINCT is used with this GROUP_CONCAT, this member is used to filter
> + out duplicates.
> + @see Item_func_group_concat::setup
> + @see Item_func_group_concat::add
> + @see Item_func_group_concat::clear
> + */
> + Unique *unique_filter;
> TABLE *table;
> ORDER **order;
> Name_resolution_context *context;
> - uint arg_count_order; // total count of ORDER BY items
> - uint arg_count_field; // count of arguments
> + /** The number of ORDER BY items. */
> + uint arg_count_order;
> + /** The number of selected items, aka the expr list. */
> + uint arg_count_field;
> uint count_cut_values;
> bool distinct;
> bool warning_for_row;
> @@ -1207,7 +1218,7 @@ public:
> SQL_LIST *is_order, String *is_separator);
>
> Item_func_group_concat(THD *thd, Item_func_group_concat *item);
> - ~Item_func_group_concat() {}
> + ~Item_func_group_concat();
> void cleanup();
>
> enum Sumfunctype sum_func () const {return GROUP_CONCAT_FUNC;}
> @@ -1252,4 +1263,28 @@ public:
> void print(String *str);
> virtual bool change_context_processor(byte *cntx)
> { context= (Name_resolution_context *)cntx; return FALSE; }
Below:
IMO it will be more natural to create a new (static) function
like group_concat_key_cmp_with_distinct, that does the cast
inside itself, and that has the signature of qsort_cmp2. Then
make it a friend of Item_func_group_concat. Now we end up with
two different ways of defining sort/distinct functions.
It would be best to just replace the body of
group_concat_key_cmp_with_distinct.
> +
> + /**
> + Compares the values for fields in expr list of GROUP_CONCAT.
> +
> + GROUP_CONCAT([DISTINCT] expr [,expr ...]
> + [ORDER BY {unsigned_integer | col_name | expr}
> + [ASC | DESC] [,col_name ...]]
> + [SEPARATOR str_val])
> +
> + @return
> + - -1 : key1 < key2
> + - 0 : key1 = key2
> + - 1 : key1 > key2
> + */
> + int compare_select_list(const void *key1, const void *key2);
> +
> + /**
> + Adheres to the qsort_cmp2 signature.
> + */
> + static int compare_select_list(void* instance, const void *key1,
> + const void *key2)
> + {
> + return ((Item_func_group_concat*)instance)->compare_select_list(key1, key2);
> + }
> };
>