Hi Alexander,
some preliminary notes to the code changes.
I still need some time to understand the logic of the patch.
On 11.11.10 09.44, Alexander Barkov wrote:
> #At file:///home/bar/mysql-bzr/mysql-5.5-bugteam.b57737/ based on
> revid:dmitry.shulga@stripped
>
> 3114 Alexander Barkov 2010-11-11
> Bug#57737 Character sets: search fails with like, contraction, index
>
> Problem: LIKE over an indexed column optimized away good results,
> because my_like_range_utf32/utf16 returned wrong ranges for contractions.
>
> Fix: Make my_like_range_utf32/utf16 return correct ranges for contractions.
>
> Note, this code presens in my_like_range_mb and my_like_range_ucs2.
> It was forgotten in utf32/utf16 versions (during mysql-6.0 push/revert mess).
>
>
> added:
> @ mysql-test/include/ctype_czech.inc
> Adding shared test code
>
> modified:
> @ include/m_ctype.h
> Adding helper functions for contractions.
>
> @ mysql-test/r/ctype_uca.result
> @ mysql-test/r/ctype_utf16_uca.result
> @ mysql-test/r/ctype_utf32_uca.result
> @ mysql-test/t/ctype_uca.test
> @ mysql-test/t/ctype_utf16_uca.test
> @ mysql-test/t/ctype_utf32_uca.test
> Adding tests.
>
> @ strings/ctype-mb.c
> Pad function did not put the last character.
>
> @ strings/ctype-ucs2.c
> - my_fill_mb2 did not put the high byte, as previously
> it was used to put only characters in ASCII range.
> Now it puts high byte as well
> (needed to pupulate cs->max_sort_char correctly).
>
> - Adding contraction handling into my_like_range_utf16
> - Removing legacy "Temporary fix" from my_like_range_utf16
> - Using my_fill_mb2 instead of loop (duplicate code eliminations)
> - Adding contraction handling into my_like_range_utf32
>
> added:
> mysql-test/include/ctype_czech.inc
> modified:
> include/m_ctype.h
> mysql-test/r/ctype_uca.result
> mysql-test/r/ctype_utf16_uca.result
> mysql-test/r/ctype_utf32_uca.result
> mysql-test/t/ctype_uca.test
> mysql-test/t/ctype_utf16_uca.test
> mysql-test/t/ctype_utf32_uca.test
> strings/ctype-mb.c
> strings/ctype-ucs2.c
> === modified file 'include/m_ctype.h'
> --- a/include/m_ctype.h 2010-09-28 15:15:58 +0000
> +++ b/include/m_ctype.h 2010-11-11 08:39:03 +0000
> @@ -356,6 +356,32 @@ extern CHARSET_INFO my_charset_utf8mb4_u
> #define MY_UTF8MB4 "utf8mb4"
>
>
> +/* Helper functions to handle contraction */
> +static inline my_bool
> +my_cs_have_contractions(CHARSET_INFO *cs)
> +{
> + return cs->contractions != NULL;
> +}
> +
> +static inline my_bool
> +my_cs_can_be_contraction_head(CHARSET_INFO *cs, my_wc_t wc)
> +{
> + return ((const char *)cs->contractions)[0x40*0x40 + (wc& 0xFF)];
> +}
> +
> +static inline my_bool
> +my_cs_can_be_contraction_tail(CHARSET_INFO *cs, my_wc_t wc)
> +{
> + return ((const char *)cs->contractions)[0x40*0x40 + (wc& 0xFF)];
> +}
> +
> +static inline uint16*
> +my_cs_contraction2_weight(CHARSET_INFO *cs, my_wc_t wc1, my_wc_t wc2)
> +{
> + return&cs->contractions[(wc1 - 0x40) * 0x40 + wc2 - 0x40];
> +}
> +
> +
> /* declarations for simple charsets */
> extern size_t my_strnxfrm_simple(CHARSET_INFO *, uchar *, size_t,
> const uchar *, size_t);
>
> === added file 'mysql-test/include/ctype_czech.inc'
> --- a/mysql-test/include/ctype_czech.inc 1970-01-01 00:00:00 +0000
> +++ b/mysql-test/include/ctype_czech.inc 2010-11-11 08:39:03 +0000
> @@ -0,0 +1,12 @@
> +SELECT @@collation_connection;
> +--echo #
> +--echo # Bug#57737 Character sets: search fails with like, contraction, index
> +--echo #
> +CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0;
> +INSERT INTO t1 VALUES ('c'),('ce'),('cé'),('ch');
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +ALTER TABLE t1 ADD KEY s1 (s1);
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +ALTER TABLE t1 DROP KEY s1, ADD KEY(s1(1));
> +SELECT * FROM t1 WHERE s1 LIKE 'ch';
> +DROP TABLE t1;
>
> === modified file 'mysql-test/r/ctype_uca.result'
> --- a/mysql-test/r/ctype_uca.result 2008-03-26 09:51:16 +0000
> +++ b/mysql-test/r/ctype_uca.result 2010-11-11 08:39:03 +0000
> @@ -2888,3 +2888,63 @@ a hex(b) c
> DROP TABLE t1;
> set names utf8;
> End for 5.0 tests
> +#
> +# Start of 5.5 tests
> +#
> +SET collation_connection=utf8_czech_ci;
> +SELECT @@collation_connection;
> +@@collation_connection
> +utf8_czech_ci
> +#
> +# Bug#57737 Character sets: search fails with like, contraction, index
> +#
> +CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0;
> +INSERT INTO t1 VALUES ('c'),('ce'),('cé'),('ch');
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +s1
> +c
> +ce
> +cé
> +ch
> +ALTER TABLE t1 ADD KEY s1 (s1);
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +s1
> +c
> +ce
> +cé
> +ch
> +ALTER TABLE t1 DROP KEY s1, ADD KEY(s1(1));
> +SELECT * FROM t1 WHERE s1 LIKE 'ch';
> +s1
> +ch
> +DROP TABLE t1;
> +SET collation_connection=ucs2_czech_ci;
> +SELECT @@collation_connection;
> +@@collation_connection
> +ucs2_czech_ci
> +#
> +# Bug#57737 Character sets: search fails with like, contraction, index
> +#
> +CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0;
> +INSERT INTO t1 VALUES ('c'),('ce'),('cé'),('ch');
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +s1
> +c
> +ce
> +cé
> +ch
> +ALTER TABLE t1 ADD KEY s1 (s1);
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +s1
> +c
> +ce
> +cé
> +ch
> +ALTER TABLE t1 DROP KEY s1, ADD KEY(s1(1));
> +SELECT * FROM t1 WHERE s1 LIKE 'ch';
> +s1
> +ch
> +DROP TABLE t1;
> +#
> +# End of 5.5 tests
> +#
>
> === modified file 'mysql-test/r/ctype_utf16_uca.result'
> --- a/mysql-test/r/ctype_utf16_uca.result 2010-02-24 09:15:34 +0000
> +++ b/mysql-test/r/ctype_utf16_uca.result 2010-11-11 08:39:03 +0000
> @@ -2368,6 +2368,33 @@ NULL
> NULL
> NULL
> drop table t1;
> +SET collation_connection=utf16_czech_ci;
> +SELECT @@collation_connection;
> +@@collation_connection
> +utf16_czech_ci
> +#
> +# Bug#57737 Character sets: search fails with like, contraction, index
> +#
> +CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0;
> +INSERT INTO t1 VALUES ('c'),('ce'),('cé'),('ch');
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +s1
> +c
> +ce
> +cé
> +ch
> +ALTER TABLE t1 ADD KEY s1 (s1);
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +s1
> +c
> +ce
> +cé
> +ch
> +ALTER TABLE t1 DROP KEY s1, ADD KEY(s1(1));
> +SELECT * FROM t1 WHERE s1 LIKE 'ch';
> +s1
> +ch
> +DROP TABLE t1;
> #
> # End of 5.5 tests
> #
>
> === modified file 'mysql-test/r/ctype_utf32_uca.result'
> --- a/mysql-test/r/ctype_utf32_uca.result 2010-02-24 09:15:34 +0000
> +++ b/mysql-test/r/ctype_utf32_uca.result 2010-11-11 08:39:03 +0000
> @@ -2368,6 +2368,33 @@ NULL
> NULL
> NULL
> drop table t1;
> +SET collation_connection=utf32_czech_ci;
> +SELECT @@collation_connection;
> +@@collation_connection
> +utf32_czech_ci
> +#
> +# Bug#57737 Character sets: search fails with like, contraction, index
> +#
> +CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0;
> +INSERT INTO t1 VALUES ('c'),('ce'),('cé'),('ch');
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +s1
> +c
> +ce
> +cé
> +ch
> +ALTER TABLE t1 ADD KEY s1 (s1);
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +s1
> +c
> +ce
> +cé
> +ch
> +ALTER TABLE t1 DROP KEY s1, ADD KEY(s1(1));
> +SELECT * FROM t1 WHERE s1 LIKE 'ch';
> +s1
> +ch
> +DROP TABLE t1;
> #
> # End of 5.5 tests
> #
>
> === modified file 'mysql-test/t/ctype_uca.test'
> --- a/mysql-test/t/ctype_uca.test 2008-02-20 18:49:26 +0000
> +++ b/mysql-test/t/ctype_uca.test 2010-11-11 08:39:03 +0000
> @@ -545,3 +545,17 @@ set collation_connection=ucs2_unicode_ci
> set names utf8;
>
> -- echo End for 5.0 tests
> +
> +--echo #
> +--echo # Start of 5.5 tests
> +--echo #
> +#
> +# Test my_like_range and contractions
> +#
> +SET collation_connection=utf8_czech_ci;
> +--source include/ctype_czech.inc
> +SET collation_connection=ucs2_czech_ci;
> +--source include/ctype_czech.inc
> +--echo #
> +--echo # End of 5.5 tests
> +--echo #
>
> === modified file 'mysql-test/t/ctype_utf16_uca.test'
> --- a/mysql-test/t/ctype_utf16_uca.test 2010-02-24 09:15:34 +0000
> +++ b/mysql-test/t/ctype_utf16_uca.test 2010-11-11 08:39:03 +0000
> @@ -284,6 +284,12 @@ DROP TABLE IF EXISTS t1;
> set collation_connection=utf16_unicode_ci;
> --source include/ctype_regex.inc
>
> +#
> +# Test my_like_range and contractions
> +#
> +SET collation_connection=utf16_czech_ci;
> +--source include/ctype_czech.inc
> +
>
> --echo #
> --echo # End of 5.5 tests
>
> === modified file 'mysql-test/t/ctype_utf32_uca.test'
> --- a/mysql-test/t/ctype_utf32_uca.test 2010-02-24 09:15:34 +0000
> +++ b/mysql-test/t/ctype_utf32_uca.test 2010-11-11 08:39:03 +0000
> @@ -286,6 +286,13 @@ set collation_connection=utf32_unicode_c
> --source include/ctype_regex.inc
>
>
> +#
> +# Test my_like_range and contractions
> +#
> +SET collation_connection=utf32_czech_ci;
> +--source include/ctype_czech.inc
> +
> +
> --echo #
> --echo # End of 5.5 tests
> --echo #
>
> === modified file 'strings/ctype-mb.c'
> --- a/strings/ctype-mb.c 2010-07-26 06:47:03 +0000
> +++ b/strings/ctype-mb.c 2010-11-11 08:39:03 +0000
> @@ -636,7 +636,7 @@ static void pad_max_char(CHARSET_INFO *c
> DBUG_ASSERT(buflen> 0);
> do
> {
> - if ((str + buflen)< end)
> + if ((str + buflen)<= end)
> {
> /* Enough space for the characer */
> memcpy(str, buf, buflen);
>
> === modified file 'strings/ctype-ucs2.c'
> --- a/strings/ctype-ucs2.c 2010-10-20 19:02:59 +0000
> +++ b/strings/ctype-ucs2.c 2010-11-11 08:39:03 +0000
> @@ -903,7 +903,7 @@ static void
> my_fill_mb2(CHARSET_INFO *cs __attribute__((unused)),
> char *s, size_t l, int fill)
Type of argument 'fill' should be uint16.
> {
> - for ( ; l>= 2; s[0]= 0, s[1]= fill, s+= 2, l-= 2);
> + for ( ; l>= 2; s[0]= (fill>> 8), s[1]= (fill& 0xFF), s+= 2, l-=
> 2);
> }
>
>
> @@ -1590,14 +1590,19 @@ my_like_range_utf16(CHARSET_INFO *cs,
> char *min_str,char *max_str,
> size_t *min_length,size_t *max_length)
Can you please document arguments cs, w_one, w_many, min_length and max_length?
And likewise for my_like_range_utf32().
The 'escape' argument is a single byte character. Is that correct in the general
case? If so, I think the fact should be documented.
> {
> - const char *end=ptr+ptr_length;
> - char *min_org=min_str;
> - char *min_end=min_str+res_length;
> + const char *end= ptr + ptr_length;
> + char *min_org= min_str;
min_org can be 'const char *'.
> + char *min_end= min_str + res_length;
> + char *max_end= max_str + res_length;
> size_t charlen= res_length / cs->mbmaxlen;
> + my_bool have_contractions= my_cs_have_contractions(cs);
For symmetry, you may add here DBUG_ASSERT((res_length % 2) == 0);
>
> for ( ; ptr + 1< end&& min_str + 1< min_end&&
> charlen> 0
> ; ptr+=2, charlen--)
> {
> + my_wc_t wc2;
> + int res;
The 'res' local variable is redundant (never tested).
> +
> if (ptr[0] == '\0'&& ptr[1] == escape&& ptr + 1< end)
> {
> ptr+=2; /* Skip escape */
> @@ -1623,34 +1628,35 @@ my_like_range_utf16(CHARSET_INFO *cs,
> *min_length= ((cs->state& MY_CS_BINSORT) ? (size_t) (min_str -
> min_org) :
> res_length);
> *max_length= res_length;
> - do {
> - *min_str++ = 0;
> - *min_str++ = 0;
> - *max_str++ = (char) (cs->max_sort_char>> 8);
> - *max_str++ = (char) (cs->max_sort_char& 255);
> - } while (min_str + 1< min_end);
> - return FALSE;
> + goto pad_min_max;
> + }
> + if (have_contractions&& ptr + 3< end&&
> + ptr[0] == '\0'&&
> + my_cs_can_be_contraction_head(cs, (uchar) ptr[1])&&
The cast of ptr[1] is not according to the prototype of
my_cs_can_be_contraction_head().
> + (res= my_utf16_uni(cs,&wc2, (uchar*) ptr + 2, (uchar*) end))> 0)
> + {
> + /*
> + Check contraction head followed by a wildcard,
> + or contraction that does not fit to result
> + */
> + if ((wc2 == (my_wc_t) w_one || wc2 == (my_wc_t) w_many) ||
> + (charlen == 1&&
> + my_cs_can_be_contraction_tail(cs, wc2)&&
> + my_cs_contraction2_weight(cs, (uchar) ptr[1], wc2)))
> + {
> + *min_length= *max_length= res_length;
> + goto pad_min_max;
> + }
> }
> *min_str++= *max_str++ = ptr[0];
> *min_str++= *max_str++ = ptr[1];
> }
>
> - /* Temporary fix for handling w_one at end of string (key compression) */
> - {
> - char *tmp;
> - for (tmp= min_str ; tmp-1> min_org&& tmp[-1] == '\0'&&
> tmp[-2]=='\0';)
> - {
> - *--tmp=' ';
> - *--tmp='\0';
> - }
> - }
> -
> - *min_length= *max_length = (size_t) (min_str - min_org);
> - while (min_str + 1< min_end)
> - {
> - *min_str++ = *max_str++ = '\0';
> - *min_str++ = *max_str++ = ' '; /* Because if key compression */
> - }
> + *min_length= *max_length= (size_t) (min_str - min_org);
> +
> +pad_min_max:
> + my_fill_mb2(cs, min_str, min_end - min_str, cs->min_sort_char);
> + my_fill_mb2(cs, max_str, max_end - max_str, cs->max_sort_char);
> return FALSE;
> }
>
> @@ -2583,12 +2589,13 @@ my_like_range_utf32(CHARSET_INFO *cs,
> char *min_end= min_str + res_length;
> char *max_end= max_str + res_length;
> size_t charlen= res_length / cs->mbmaxlen;
> + my_bool have_contractions= my_cs_have_contractions(cs);
>
> DBUG_ASSERT((res_length % 4) == 0);
>
> for ( ; charlen> 0; ptr+= 4, charlen--)
> {
> - my_wc_t wc;
> + my_wc_t wc, wc2;
> int res;
The 'res' local variable is redundant.
> if ((res= my_utf32_uni(cs,&wc, (uchar*) ptr, (uchar*) end))< 0)
> {
> @@ -2640,6 +2647,24 @@ my_like_range_utf32(CHARSET_INFO *cs,
> goto pad_min_max;
> }
>
> + if (have_contractions&&
> + my_cs_can_be_contraction_head(cs, wc)&&
> + (res= my_utf32_uni(cs,&wc2, (uchar*) ptr + 4, (uchar*) end))> 0)
> + {
> + /*
> + Check contraction head followed by a wildcard
> + or contraction which does not fit to result
> + */
> + if ((wc2 == (my_wc_t) w_one || wc2 == (my_wc_t) w_many) ||
> + (charlen == 1&&
> + my_cs_can_be_contraction_tail(cs, wc2)&&
> + my_cs_contraction2_weight(cs, wc, wc2)))
> + {
> + *min_length= *max_length= res_length;
> + goto pad_min_max;
> + }
> + }
> +
> /* Normal character */
> if (my_uni_utf32(cs, wc, (uchar*) min_str, (uchar*) min_end) != 4 ||
> my_uni_utf32(cs, wc, (uchar*) max_str, (uchar*) max_end) != 4)
>
In escape handling, this function performs:
*min_str++= 4;
*max_str++= 4;
Is that correct?
The return values in the two functions seem to be inconsistent - There are no
return TRUE in my_like_range_utf16() but two in my_like_range_utf32().
Thanks,
Roy