From: Roy Lyseng Date: November 19 2010 1:49pm Subject: Re: bzr commit into mysql-5.5-bugteam branch (bar:3114) Bug#57737 List-Archive: http://lists.mysql.com/commits/124446 Message-Id: <4CE6807E.8000808@oracle.com> MIME-Version: 1.0 Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: 8bit Hi Alexander, some preliminary notes to the code changes. I still need some time to understand the logic of the patch. On 11.11.10 09.44, Alexander Barkov wrote: > #At file:///home/bar/mysql-bzr/mysql-5.5-bugteam.b57737/ based on revid:dmitry.shulga@stripped > > 3114 Alexander Barkov 2010-11-11 > Bug#57737 Character sets: search fails with like, contraction, index > > Problem: LIKE over an indexed column optimized away good results, > because my_like_range_utf32/utf16 returned wrong ranges for contractions. > > Fix: Make my_like_range_utf32/utf16 return correct ranges for contractions. > > Note, this code presens in my_like_range_mb and my_like_range_ucs2. > It was forgotten in utf32/utf16 versions (during mysql-6.0 push/revert mess). > > > added: > @ mysql-test/include/ctype_czech.inc > Adding shared test code > > modified: > @ include/m_ctype.h > Adding helper functions for contractions. > > @ mysql-test/r/ctype_uca.result > @ mysql-test/r/ctype_utf16_uca.result > @ mysql-test/r/ctype_utf32_uca.result > @ mysql-test/t/ctype_uca.test > @ mysql-test/t/ctype_utf16_uca.test > @ mysql-test/t/ctype_utf32_uca.test > Adding tests. > > @ strings/ctype-mb.c > Pad function did not put the last character. > > @ strings/ctype-ucs2.c > - my_fill_mb2 did not put the high byte, as previously > it was used to put only characters in ASCII range. > Now it puts high byte as well > (needed to pupulate cs->max_sort_char correctly). > > - Adding contraction handling into my_like_range_utf16 > - Removing legacy "Temporary fix" from my_like_range_utf16 > - Using my_fill_mb2 instead of loop (duplicate code eliminations) > - Adding contraction handling into my_like_range_utf32 > > added: > mysql-test/include/ctype_czech.inc > modified: > include/m_ctype.h > mysql-test/r/ctype_uca.result > mysql-test/r/ctype_utf16_uca.result > mysql-test/r/ctype_utf32_uca.result > mysql-test/t/ctype_uca.test > mysql-test/t/ctype_utf16_uca.test > mysql-test/t/ctype_utf32_uca.test > strings/ctype-mb.c > strings/ctype-ucs2.c > === modified file 'include/m_ctype.h' > --- a/include/m_ctype.h 2010-09-28 15:15:58 +0000 > +++ b/include/m_ctype.h 2010-11-11 08:39:03 +0000 > @@ -356,6 +356,32 @@ extern CHARSET_INFO my_charset_utf8mb4_u > #define MY_UTF8MB4 "utf8mb4" > > > +/* Helper functions to handle contraction */ > +static inline my_bool > +my_cs_have_contractions(CHARSET_INFO *cs) > +{ > + return cs->contractions != NULL; > +} > + > +static inline my_bool > +my_cs_can_be_contraction_head(CHARSET_INFO *cs, my_wc_t wc) > +{ > + return ((const char *)cs->contractions)[0x40*0x40 + (wc& 0xFF)]; > +} > + > +static inline my_bool > +my_cs_can_be_contraction_tail(CHARSET_INFO *cs, my_wc_t wc) > +{ > + return ((const char *)cs->contractions)[0x40*0x40 + (wc& 0xFF)]; > +} > + > +static inline uint16* > +my_cs_contraction2_weight(CHARSET_INFO *cs, my_wc_t wc1, my_wc_t wc2) > +{ > + return&cs->contractions[(wc1 - 0x40) * 0x40 + wc2 - 0x40]; > +} > + > + > /* declarations for simple charsets */ > extern size_t my_strnxfrm_simple(CHARSET_INFO *, uchar *, size_t, > const uchar *, size_t); > > === added file 'mysql-test/include/ctype_czech.inc' > --- a/mysql-test/include/ctype_czech.inc 1970-01-01 00:00:00 +0000 > +++ b/mysql-test/include/ctype_czech.inc 2010-11-11 08:39:03 +0000 > @@ -0,0 +1,12 @@ > +SELECT @@collation_connection; > +--echo # > +--echo # Bug#57737 Character sets: search fails with like, contraction, index > +--echo # > +CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0; > +INSERT INTO t1 VALUES ('c'),('ce'),('cé'),('ch'); > +SELECT * FROM t1 WHERE s1 LIKE 'c%'; > +ALTER TABLE t1 ADD KEY s1 (s1); > +SELECT * FROM t1 WHERE s1 LIKE 'c%'; > +ALTER TABLE t1 DROP KEY s1, ADD KEY(s1(1)); > +SELECT * FROM t1 WHERE s1 LIKE 'ch'; > +DROP TABLE t1; > > === modified file 'mysql-test/r/ctype_uca.result' > --- a/mysql-test/r/ctype_uca.result 2008-03-26 09:51:16 +0000 > +++ b/mysql-test/r/ctype_uca.result 2010-11-11 08:39:03 +0000 > @@ -2888,3 +2888,63 @@ a hex(b) c > DROP TABLE t1; > set names utf8; > End for 5.0 tests > +# > +# Start of 5.5 tests > +# > +SET collation_connection=utf8_czech_ci; > +SELECT @@collation_connection; > +@@collation_connection > +utf8_czech_ci > +# > +# Bug#57737 Character sets: search fails with like, contraction, index > +# > +CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0; > +INSERT INTO t1 VALUES ('c'),('ce'),('cé'),('ch'); > +SELECT * FROM t1 WHERE s1 LIKE 'c%'; > +s1 > +c > +ce > +cé > +ch > +ALTER TABLE t1 ADD KEY s1 (s1); > +SELECT * FROM t1 WHERE s1 LIKE 'c%'; > +s1 > +c > +ce > +cé > +ch > +ALTER TABLE t1 DROP KEY s1, ADD KEY(s1(1)); > +SELECT * FROM t1 WHERE s1 LIKE 'ch'; > +s1 > +ch > +DROP TABLE t1; > +SET collation_connection=ucs2_czech_ci; > +SELECT @@collation_connection; > +@@collation_connection > +ucs2_czech_ci > +# > +# Bug#57737 Character sets: search fails with like, contraction, index > +# > +CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0; > +INSERT INTO t1 VALUES ('c'),('ce'),('cé'),('ch'); > +SELECT * FROM t1 WHERE s1 LIKE 'c%'; > +s1 > +c > +ce > +cé > +ch > +ALTER TABLE t1 ADD KEY s1 (s1); > +SELECT * FROM t1 WHERE s1 LIKE 'c%'; > +s1 > +c > +ce > +cé > +ch > +ALTER TABLE t1 DROP KEY s1, ADD KEY(s1(1)); > +SELECT * FROM t1 WHERE s1 LIKE 'ch'; > +s1 > +ch > +DROP TABLE t1; > +# > +# End of 5.5 tests > +# > > === modified file 'mysql-test/r/ctype_utf16_uca.result' > --- a/mysql-test/r/ctype_utf16_uca.result 2010-02-24 09:15:34 +0000 > +++ b/mysql-test/r/ctype_utf16_uca.result 2010-11-11 08:39:03 +0000 > @@ -2368,6 +2368,33 @@ NULL > NULL > NULL > drop table t1; > +SET collation_connection=utf16_czech_ci; > +SELECT @@collation_connection; > +@@collation_connection > +utf16_czech_ci > +# > +# Bug#57737 Character sets: search fails with like, contraction, index > +# > +CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0; > +INSERT INTO t1 VALUES ('c'),('ce'),('cé'),('ch'); > +SELECT * FROM t1 WHERE s1 LIKE 'c%'; > +s1 > +c > +ce > +cé > +ch > +ALTER TABLE t1 ADD KEY s1 (s1); > +SELECT * FROM t1 WHERE s1 LIKE 'c%'; > +s1 > +c > +ce > +cé > +ch > +ALTER TABLE t1 DROP KEY s1, ADD KEY(s1(1)); > +SELECT * FROM t1 WHERE s1 LIKE 'ch'; > +s1 > +ch > +DROP TABLE t1; > # > # End of 5.5 tests > # > > === modified file 'mysql-test/r/ctype_utf32_uca.result' > --- a/mysql-test/r/ctype_utf32_uca.result 2010-02-24 09:15:34 +0000 > +++ b/mysql-test/r/ctype_utf32_uca.result 2010-11-11 08:39:03 +0000 > @@ -2368,6 +2368,33 @@ NULL > NULL > NULL > drop table t1; > +SET collation_connection=utf32_czech_ci; > +SELECT @@collation_connection; > +@@collation_connection > +utf32_czech_ci > +# > +# Bug#57737 Character sets: search fails with like, contraction, index > +# > +CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0; > +INSERT INTO t1 VALUES ('c'),('ce'),('cé'),('ch'); > +SELECT * FROM t1 WHERE s1 LIKE 'c%'; > +s1 > +c > +ce > +cé > +ch > +ALTER TABLE t1 ADD KEY s1 (s1); > +SELECT * FROM t1 WHERE s1 LIKE 'c%'; > +s1 > +c > +ce > +cé > +ch > +ALTER TABLE t1 DROP KEY s1, ADD KEY(s1(1)); > +SELECT * FROM t1 WHERE s1 LIKE 'ch'; > +s1 > +ch > +DROP TABLE t1; > # > # End of 5.5 tests > # > > === modified file 'mysql-test/t/ctype_uca.test' > --- a/mysql-test/t/ctype_uca.test 2008-02-20 18:49:26 +0000 > +++ b/mysql-test/t/ctype_uca.test 2010-11-11 08:39:03 +0000 > @@ -545,3 +545,17 @@ set collation_connection=ucs2_unicode_ci > set names utf8; > > -- echo End for 5.0 tests > + > +--echo # > +--echo # Start of 5.5 tests > +--echo # > +# > +# Test my_like_range and contractions > +# > +SET collation_connection=utf8_czech_ci; > +--source include/ctype_czech.inc > +SET collation_connection=ucs2_czech_ci; > +--source include/ctype_czech.inc > +--echo # > +--echo # End of 5.5 tests > +--echo # > > === modified file 'mysql-test/t/ctype_utf16_uca.test' > --- a/mysql-test/t/ctype_utf16_uca.test 2010-02-24 09:15:34 +0000 > +++ b/mysql-test/t/ctype_utf16_uca.test 2010-11-11 08:39:03 +0000 > @@ -284,6 +284,12 @@ DROP TABLE IF EXISTS t1; > set collation_connection=utf16_unicode_ci; > --source include/ctype_regex.inc > > +# > +# Test my_like_range and contractions > +# > +SET collation_connection=utf16_czech_ci; > +--source include/ctype_czech.inc > + > > --echo # > --echo # End of 5.5 tests > > === modified file 'mysql-test/t/ctype_utf32_uca.test' > --- a/mysql-test/t/ctype_utf32_uca.test 2010-02-24 09:15:34 +0000 > +++ b/mysql-test/t/ctype_utf32_uca.test 2010-11-11 08:39:03 +0000 > @@ -286,6 +286,13 @@ set collation_connection=utf32_unicode_c > --source include/ctype_regex.inc > > > +# > +# Test my_like_range and contractions > +# > +SET collation_connection=utf32_czech_ci; > +--source include/ctype_czech.inc > + > + > --echo # > --echo # End of 5.5 tests > --echo # > > === modified file 'strings/ctype-mb.c' > --- a/strings/ctype-mb.c 2010-07-26 06:47:03 +0000 > +++ b/strings/ctype-mb.c 2010-11-11 08:39:03 +0000 > @@ -636,7 +636,7 @@ static void pad_max_char(CHARSET_INFO *c > DBUG_ASSERT(buflen> 0); > do > { > - if ((str + buflen)< end) > + if ((str + buflen)<= end) > { > /* Enough space for the characer */ > memcpy(str, buf, buflen); > > === modified file 'strings/ctype-ucs2.c' > --- a/strings/ctype-ucs2.c 2010-10-20 19:02:59 +0000 > +++ b/strings/ctype-ucs2.c 2010-11-11 08:39:03 +0000 > @@ -903,7 +903,7 @@ static void > my_fill_mb2(CHARSET_INFO *cs __attribute__((unused)), > char *s, size_t l, int fill) Type of argument 'fill' should be uint16. > { > - for ( ; l>= 2; s[0]= 0, s[1]= fill, s+= 2, l-= 2); > + for ( ; l>= 2; s[0]= (fill>> 8), s[1]= (fill& 0xFF), s+= 2, l-= 2); > } > > > @@ -1590,14 +1590,19 @@ my_like_range_utf16(CHARSET_INFO *cs, > char *min_str,char *max_str, > size_t *min_length,size_t *max_length) Can you please document arguments cs, w_one, w_many, min_length and max_length? And likewise for my_like_range_utf32(). The 'escape' argument is a single byte character. Is that correct in the general case? If so, I think the fact should be documented. > { > - const char *end=ptr+ptr_length; > - char *min_org=min_str; > - char *min_end=min_str+res_length; > + const char *end= ptr + ptr_length; > + char *min_org= min_str; min_org can be 'const char *'. > + char *min_end= min_str + res_length; > + char *max_end= max_str + res_length; > size_t charlen= res_length / cs->mbmaxlen; > + my_bool have_contractions= my_cs_have_contractions(cs); For symmetry, you may add here DBUG_ASSERT((res_length % 2) == 0); > > for ( ; ptr + 1< end&& min_str + 1< min_end&& charlen> 0 > ; ptr+=2, charlen--) > { > + my_wc_t wc2; > + int res; The 'res' local variable is redundant (never tested). > + > if (ptr[0] == '\0'&& ptr[1] == escape&& ptr + 1< end) > { > ptr+=2; /* Skip escape */ > @@ -1623,34 +1628,35 @@ my_like_range_utf16(CHARSET_INFO *cs, > *min_length= ((cs->state& MY_CS_BINSORT) ? (size_t) (min_str - min_org) : > res_length); > *max_length= res_length; > - do { > - *min_str++ = 0; > - *min_str++ = 0; > - *max_str++ = (char) (cs->max_sort_char>> 8); > - *max_str++ = (char) (cs->max_sort_char& 255); > - } while (min_str + 1< min_end); > - return FALSE; > + goto pad_min_max; > + } > + if (have_contractions&& ptr + 3< end&& > + ptr[0] == '\0'&& > + my_cs_can_be_contraction_head(cs, (uchar) ptr[1])&& The cast of ptr[1] is not according to the prototype of my_cs_can_be_contraction_head(). > + (res= my_utf16_uni(cs,&wc2, (uchar*) ptr + 2, (uchar*) end))> 0) > + { > + /* > + Check contraction head followed by a wildcard, > + or contraction that does not fit to result > + */ > + if ((wc2 == (my_wc_t) w_one || wc2 == (my_wc_t) w_many) || > + (charlen == 1&& > + my_cs_can_be_contraction_tail(cs, wc2)&& > + my_cs_contraction2_weight(cs, (uchar) ptr[1], wc2))) > + { > + *min_length= *max_length= res_length; > + goto pad_min_max; > + } > } > *min_str++= *max_str++ = ptr[0]; > *min_str++= *max_str++ = ptr[1]; > } > > - /* Temporary fix for handling w_one at end of string (key compression) */ > - { > - char *tmp; > - for (tmp= min_str ; tmp-1> min_org&& tmp[-1] == '\0'&& tmp[-2]=='\0';) > - { > - *--tmp=' '; > - *--tmp='\0'; > - } > - } > - > - *min_length= *max_length = (size_t) (min_str - min_org); > - while (min_str + 1< min_end) > - { > - *min_str++ = *max_str++ = '\0'; > - *min_str++ = *max_str++ = ' '; /* Because if key compression */ > - } > + *min_length= *max_length= (size_t) (min_str - min_org); > + > +pad_min_max: > + my_fill_mb2(cs, min_str, min_end - min_str, cs->min_sort_char); > + my_fill_mb2(cs, max_str, max_end - max_str, cs->max_sort_char); > return FALSE; > } > > @@ -2583,12 +2589,13 @@ my_like_range_utf32(CHARSET_INFO *cs, > char *min_end= min_str + res_length; > char *max_end= max_str + res_length; > size_t charlen= res_length / cs->mbmaxlen; > + my_bool have_contractions= my_cs_have_contractions(cs); > > DBUG_ASSERT((res_length % 4) == 0); > > for ( ; charlen> 0; ptr+= 4, charlen--) > { > - my_wc_t wc; > + my_wc_t wc, wc2; > int res; The 'res' local variable is redundant. > if ((res= my_utf32_uni(cs,&wc, (uchar*) ptr, (uchar*) end))< 0) > { > @@ -2640,6 +2647,24 @@ my_like_range_utf32(CHARSET_INFO *cs, > goto pad_min_max; > } > > + if (have_contractions&& > + my_cs_can_be_contraction_head(cs, wc)&& > + (res= my_utf32_uni(cs,&wc2, (uchar*) ptr + 4, (uchar*) end))> 0) > + { > + /* > + Check contraction head followed by a wildcard > + or contraction which does not fit to result > + */ > + if ((wc2 == (my_wc_t) w_one || wc2 == (my_wc_t) w_many) || > + (charlen == 1&& > + my_cs_can_be_contraction_tail(cs, wc2)&& > + my_cs_contraction2_weight(cs, wc, wc2))) > + { > + *min_length= *max_length= res_length; > + goto pad_min_max; > + } > + } > + > /* Normal character */ > if (my_uni_utf32(cs, wc, (uchar*) min_str, (uchar*) min_end) != 4 || > my_uni_utf32(cs, wc, (uchar*) max_str, (uchar*) max_end) != 4) > In escape handling, this function performs: *min_str++= 4; *max_str++= 4; Is that correct? The return values in the two functions seem to be inconsistent - There are no return TRUE in my_like_range_utf16() but two in my_like_range_utf32(). Thanks, Roy