List:Commits« Previous MessageNext Message »
From:Roy Lyseng Date:November 22 2010 12:01pm
Subject:Re: bzr commit into mysql-5.5-bugteam branch (bar:3114) Bug#57737
View as plain text  
Hi Alexander,

Patch is approved, but please see a couple of inline suggestions, in addition to 
the suggestions provided on 2010-11-19.

The function my_like_range_utf32() has a slightly simpler logic than 
my_like_range_utf16() because the former extracts a character early in the loop 
and performs tests on this, instead of testing the high byte and low byte 
separately.

So, for readability, you may consider rewriting the latter function the same way.

Thanks,
Roy

On 11.11.10 09.44, Alexander Barkov wrote:
> #At file:///home/bar/mysql-bzr/mysql-5.5-bugteam.b57737/ based on
> revid:dmitry.shulga@stripped
>
>   3114 Alexander Barkov	2010-11-11
>        Bug#57737 Character sets: search fails with like, contraction, index
>
>        Problem: LIKE over an indexed column optimized away good results,
>        because my_like_range_utf32/utf16 returned wrong ranges for contractions.
>
>        Fix: Make my_like_range_utf32/utf16 return correct ranges for contractions.
>
>        Note, this code presens in my_like_range_mb and my_like_range_ucs2.
>        It was forgotten in utf32/utf16 versions (during mysql-6.0 push/revert mess).
>
>
>        added:
>          @ mysql-test/include/ctype_czech.inc
>          Adding shared test code
>
>        modified:
>          @ include/m_ctype.h
>          Adding helper functions for contractions.
>
>          @ mysql-test/r/ctype_uca.result
>          @ mysql-test/r/ctype_utf16_uca.result
>          @ mysql-test/r/ctype_utf32_uca.result
>          @ mysql-test/t/ctype_uca.test
>          @ mysql-test/t/ctype_utf16_uca.test
>          @ mysql-test/t/ctype_utf32_uca.test
>          Adding tests.
>
>          @ strings/ctype-mb.c
>          Pad function did not put the last character.
>
>          @ strings/ctype-ucs2.c
>          - my_fill_mb2 did not put the high byte, as previously
>            it was used to put only characters in ASCII range.
>            Now it puts high byte as well
>            (needed to pupulate cs->max_sort_char correctly).
>
>          - Adding contraction handling into my_like_range_utf16
>          - Removing legacy "Temporary fix" from my_like_range_utf16
>          - Using my_fill_mb2 instead of loop (duplicate code eliminations)
>          - Adding contraction handling into my_like_range_utf32
>
>      added:
>        mysql-test/include/ctype_czech.inc
>      modified:
>        include/m_ctype.h
>        mysql-test/r/ctype_uca.result
>        mysql-test/r/ctype_utf16_uca.result
>        mysql-test/r/ctype_utf32_uca.result
>        mysql-test/t/ctype_uca.test
>        mysql-test/t/ctype_utf16_uca.test
>        mysql-test/t/ctype_utf32_uca.test
>        strings/ctype-mb.c
>        strings/ctype-ucs2.c
> === modified file 'include/m_ctype.h'
> --- a/include/m_ctype.h	2010-09-28 15:15:58 +0000
> +++ b/include/m_ctype.h	2010-11-11 08:39:03 +0000
> @@ -356,6 +356,32 @@ extern CHARSET_INFO my_charset_utf8mb4_u
>   #define MY_UTF8MB4                 "utf8mb4"
>
>
> +/* Helper functions to handle contraction */
> +static inline my_bool
> +my_cs_have_contractions(CHARSET_INFO *cs)
> +{
> +  return cs->contractions != NULL;
> +}
> +
> +static inline my_bool
> +my_cs_can_be_contraction_head(CHARSET_INFO *cs, my_wc_t wc)
> +{
> +  return ((const char *)cs->contractions)[0x40*0x40 + (wc&  0xFF)];
> +}
> +
> +static inline my_bool
> +my_cs_can_be_contraction_tail(CHARSET_INFO *cs, my_wc_t wc)
> +{
> +  return ((const char *)cs->contractions)[0x40*0x40 + (wc&  0xFF)];
> +}
> +
> +static inline uint16*
> +my_cs_contraction2_weight(CHARSET_INFO *cs, my_wc_t wc1, my_wc_t wc2)
> +{
> +  return&cs->contractions[(wc1 - 0x40) * 0x40 + wc2 - 0x40];
> +}
> +
> +
>   /* declarations for simple charsets */
>   extern size_t my_strnxfrm_simple(CHARSET_INFO *, uchar *, size_t,
>                                    const uchar *, size_t);
>
> === added file 'mysql-test/include/ctype_czech.inc'
> --- a/mysql-test/include/ctype_czech.inc	1970-01-01 00:00:00 +0000
> +++ b/mysql-test/include/ctype_czech.inc	2010-11-11 08:39:03 +0000
> @@ -0,0 +1,12 @@
> +SELECT @@collation_connection;
> +--echo #
> +--echo # Bug#57737 Character sets: search fails with like, contraction, index
> +--echo #
> +CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0;
> +INSERT INTO t1 VALUES ('c'),('ce'),('cé'),('ch');
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +ALTER TABLE t1 ADD KEY s1 (s1);
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +ALTER TABLE t1 DROP KEY s1, ADD KEY(s1(1));
> +SELECT * FROM t1 WHERE s1 LIKE 'ch';
> +DROP TABLE t1;
>
> === modified file 'mysql-test/r/ctype_uca.result'
> --- a/mysql-test/r/ctype_uca.result	2008-03-26 09:51:16 +0000
> +++ b/mysql-test/r/ctype_uca.result	2010-11-11 08:39:03 +0000
> @@ -2888,3 +2888,63 @@ a	hex(b)	c
>   DROP TABLE t1;
>   set names utf8;
>   End for 5.0 tests
> +#
> +# Start of 5.5 tests
> +#
> +SET collation_connection=utf8_czech_ci;
> +SELECT @@collation_connection;
> +@@collation_connection
> +utf8_czech_ci
> +#
> +# Bug#57737 Character sets: search fails with like, contraction, index
> +#
> +CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0;
> +INSERT INTO t1 VALUES ('c'),('ce'),('cé'),('ch');
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +s1
> +c
> +ce
> +cé
> +ch
> +ALTER TABLE t1 ADD KEY s1 (s1);
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +s1
> +c
> +ce
> +cé
> +ch
> +ALTER TABLE t1 DROP KEY s1, ADD KEY(s1(1));
> +SELECT * FROM t1 WHERE s1 LIKE 'ch';
> +s1
> +ch
> +DROP TABLE t1;
> +SET collation_connection=ucs2_czech_ci;
> +SELECT @@collation_connection;
> +@@collation_connection
> +ucs2_czech_ci
> +#
> +# Bug#57737 Character sets: search fails with like, contraction, index
> +#
> +CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0;
> +INSERT INTO t1 VALUES ('c'),('ce'),('cé'),('ch');
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +s1
> +c
> +ce
> +cé
> +ch
> +ALTER TABLE t1 ADD KEY s1 (s1);
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +s1
> +c
> +ce
> +cé
> +ch
> +ALTER TABLE t1 DROP KEY s1, ADD KEY(s1(1));
> +SELECT * FROM t1 WHERE s1 LIKE 'ch';
> +s1
> +ch
> +DROP TABLE t1;
> +#
> +# End of 5.5 tests
> +#
>
> === modified file 'mysql-test/r/ctype_utf16_uca.result'
> --- a/mysql-test/r/ctype_utf16_uca.result	2010-02-24 09:15:34 +0000
> +++ b/mysql-test/r/ctype_utf16_uca.result	2010-11-11 08:39:03 +0000
> @@ -2368,6 +2368,33 @@ NULL
>   NULL
>   NULL
>   drop table t1;
> +SET collation_connection=utf16_czech_ci;
> +SELECT @@collation_connection;
> +@@collation_connection
> +utf16_czech_ci
> +#
> +# Bug#57737 Character sets: search fails with like, contraction, index
> +#
> +CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0;
> +INSERT INTO t1 VALUES ('c'),('ce'),('cé'),('ch');
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +s1
> +c
> +ce
> +cé
> +ch
> +ALTER TABLE t1 ADD KEY s1 (s1);
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +s1
> +c
> +ce
> +cé
> +ch
> +ALTER TABLE t1 DROP KEY s1, ADD KEY(s1(1));
> +SELECT * FROM t1 WHERE s1 LIKE 'ch';
> +s1
> +ch
> +DROP TABLE t1;
>   #
>   # End of 5.5 tests
>   #
>
> === modified file 'mysql-test/r/ctype_utf32_uca.result'
> --- a/mysql-test/r/ctype_utf32_uca.result	2010-02-24 09:15:34 +0000
> +++ b/mysql-test/r/ctype_utf32_uca.result	2010-11-11 08:39:03 +0000
> @@ -2368,6 +2368,33 @@ NULL
>   NULL
>   NULL
>   drop table t1;
> +SET collation_connection=utf32_czech_ci;
> +SELECT @@collation_connection;
> +@@collation_connection
> +utf32_czech_ci
> +#
> +# Bug#57737 Character sets: search fails with like, contraction, index
> +#
> +CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0;
> +INSERT INTO t1 VALUES ('c'),('ce'),('cé'),('ch');
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +s1
> +c
> +ce
> +cé
> +ch
> +ALTER TABLE t1 ADD KEY s1 (s1);
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +s1
> +c
> +ce
> +cé
> +ch
> +ALTER TABLE t1 DROP KEY s1, ADD KEY(s1(1));
> +SELECT * FROM t1 WHERE s1 LIKE 'ch';
> +s1
> +ch
> +DROP TABLE t1;
>   #
>   # End of 5.5 tests
>   #
>
> === modified file 'mysql-test/t/ctype_uca.test'
> --- a/mysql-test/t/ctype_uca.test	2008-02-20 18:49:26 +0000
> +++ b/mysql-test/t/ctype_uca.test	2010-11-11 08:39:03 +0000
> @@ -545,3 +545,17 @@ set collation_connection=ucs2_unicode_ci
>   set names utf8;
>
>   -- echo End for 5.0 tests
> +
> +--echo #
> +--echo # Start of 5.5 tests
> +--echo #
> +#
> +# Test my_like_range and contractions
> +#
> +SET collation_connection=utf8_czech_ci;
> +--source include/ctype_czech.inc
> +SET collation_connection=ucs2_czech_ci;
> +--source include/ctype_czech.inc
> +--echo #
> +--echo # End of 5.5 tests
> +--echo #
>
> === modified file 'mysql-test/t/ctype_utf16_uca.test'
> --- a/mysql-test/t/ctype_utf16_uca.test	2010-02-24 09:15:34 +0000
> +++ b/mysql-test/t/ctype_utf16_uca.test	2010-11-11 08:39:03 +0000
> @@ -284,6 +284,12 @@ DROP TABLE IF EXISTS t1;
>   set collation_connection=utf16_unicode_ci;
>   --source include/ctype_regex.inc
>
> +#
> +# Test my_like_range and contractions
> +#
> +SET collation_connection=utf16_czech_ci;
> +--source include/ctype_czech.inc
> +
>
>   --echo #
>   --echo # End of 5.5 tests
>
> === modified file 'mysql-test/t/ctype_utf32_uca.test'
> --- a/mysql-test/t/ctype_utf32_uca.test	2010-02-24 09:15:34 +0000
> +++ b/mysql-test/t/ctype_utf32_uca.test	2010-11-11 08:39:03 +0000
> @@ -286,6 +286,13 @@ set collation_connection=utf32_unicode_c
>   --source include/ctype_regex.inc
>
>
> +#
> +# Test my_like_range and contractions
> +#
> +SET collation_connection=utf32_czech_ci;
> +--source include/ctype_czech.inc
> +
> +
>   --echo #
>   --echo # End of 5.5 tests
>   --echo #
>
> === modified file 'strings/ctype-mb.c'
> --- a/strings/ctype-mb.c	2010-07-26 06:47:03 +0000
> +++ b/strings/ctype-mb.c	2010-11-11 08:39:03 +0000
> @@ -636,7 +636,7 @@ static void pad_max_char(CHARSET_INFO *c
>     DBUG_ASSERT(buflen>  0);
>     do
>     {
> -    if ((str + buflen)<  end)
> +    if ((str + buflen)<= end)
>       {
>         /* Enough space for the characer */
>         memcpy(str, buf, buflen);
>
> === modified file 'strings/ctype-ucs2.c'
> --- a/strings/ctype-ucs2.c	2010-10-20 19:02:59 +0000
> +++ b/strings/ctype-ucs2.c	2010-11-11 08:39:03 +0000
> @@ -903,7 +903,7 @@ static void
>   my_fill_mb2(CHARSET_INFO *cs __attribute__((unused)),
>               char *s, size_t l, int fill)
>   {
> -  for ( ; l>= 2; s[0]= 0, s[1]= fill, s+= 2, l-= 2);
> +  for ( ; l>= 2; s[0]= (fill>>  8), s[1]= (fill&  0xFF), s+= 2, l-=
> 2);
>   }
>
>
> @@ -1590,14 +1590,19 @@ my_like_range_utf16(CHARSET_INFO *cs,
>                       char *min_str,char *max_str,
>                       size_t *min_length,size_t *max_length)
>   {
> -  const char *end=ptr+ptr_length;
> -  char *min_org=min_str;
> -  char *min_end=min_str+res_length;
> +  const char *end= ptr + ptr_length;
> +  char *min_org= min_str;
> +  char *min_end= min_str + res_length;
> +  char *max_end= max_str + res_length;

min_end and max_end may also be "const char *".

>     size_t charlen= res_length / cs->mbmaxlen;
> +  my_bool have_contractions= my_cs_have_contractions(cs);

have_contractions may also be "const".


>
>     for ( ; ptr + 1<  end&&  min_str + 1<  min_end&& 
> charlen>  0
>           ; ptr+=2, charlen--)
>     {
> +    my_wc_t wc2;
> +    int res;
> +
>       if (ptr[0] == '\0'&&  ptr[1] == escape&&  ptr + 1<  end)
>       {
>         ptr+=2;                                        /* Skip escape */
> @@ -1623,34 +1628,35 @@ my_like_range_utf16(CHARSET_INFO *cs,
>         *min_length= ((cs->state&  MY_CS_BINSORT) ? (size_t) (min_str -
> min_org) :
>                       res_length);
>         *max_length= res_length;
> -      do {
> -        *min_str++ = 0;
> -        *min_str++ = 0;
> -        *max_str++ = (char) (cs->max_sort_char>>  8);
> -        *max_str++ = (char) (cs->max_sort_char&  255);
> -      } while (min_str + 1<  min_end);
> -      return FALSE;
> +      goto pad_min_max;
> +    }
> +    if (have_contractions&&  ptr + 3<  end&&
> +        ptr[0] == '\0'&&
> +        my_cs_can_be_contraction_head(cs, (uchar) ptr[1])&&
> +        (res= my_utf16_uni(cs,&wc2, (uchar*) ptr + 2, (uchar*) end))>  0)
> +    {
> +      /*
> +         Check contraction head followed by a wildcard,
> +         or contraction that does not fit to result
> +      */
> +      if ((wc2 == (my_wc_t) w_one || wc2 == (my_wc_t) w_many) ||
> +          (charlen == 1&&
> +           my_cs_can_be_contraction_tail(cs, wc2)&&
> +           my_cs_contraction2_weight(cs, (uchar) ptr[1], wc2)))
> +      {
> +        *min_length= *max_length= res_length;
> +        goto pad_min_max;
> +      }
>       }
>       *min_str++= *max_str++ = ptr[0];
>       *min_str++= *max_str++ = ptr[1];
>     }
>
> -  /* Temporary fix for handling w_one at end of string (key compression) */
> -  {
> -    char *tmp;
> -    for (tmp= min_str ; tmp-1>  min_org&&  tmp[-1] == '\0'&& 
> tmp[-2]=='\0';)
> -    {
> -      *--tmp=' ';
> -      *--tmp='\0';
> -    }
> -  }
> -
> -  *min_length= *max_length = (size_t) (min_str - min_org);
> -  while (min_str + 1<  min_end)
> -  {
> -    *min_str++ = *max_str++ = '\0';
> -    *min_str++ = *max_str++ = ' ';      /* Because if key compression */
> -  }
> +  *min_length= *max_length= (size_t) (min_str - min_org);
> +
> +pad_min_max:
> +  my_fill_mb2(cs, min_str, min_end - min_str, cs->min_sort_char);
> +  my_fill_mb2(cs, max_str, max_end - max_str, cs->max_sort_char);
>     return FALSE;
>   }
>
> @@ -2583,12 +2589,13 @@ my_like_range_utf32(CHARSET_INFO *cs,
>     char *min_end= min_str + res_length;
>     char *max_end= max_str + res_length;
>     size_t charlen= res_length / cs->mbmaxlen;
> +  my_bool have_contractions= my_cs_have_contractions(cs);
>
>     DBUG_ASSERT((res_length % 4) == 0);
>
>     for ( ; charlen>  0; ptr+= 4, charlen--)
>     {
> -    my_wc_t wc;
> +    my_wc_t wc, wc2;
>       int res;
>       if ((res= my_utf32_uni(cs,&wc, (uchar*) ptr, (uchar*) end))<  0)
>       {
> @@ -2640,6 +2647,24 @@ my_like_range_utf32(CHARSET_INFO *cs,
>         goto pad_min_max;
>       }
>
> +    if (have_contractions&&
> +        my_cs_can_be_contraction_head(cs, wc)&&
> +        (res= my_utf32_uni(cs,&wc2, (uchar*) ptr + 4, (uchar*) end))>  0)
> +    {
> +      /*
> +         Check contraction head followed by a wildcard
> +         or contraction which does not fit to result
> +      */
> +      if ((wc2 == (my_wc_t) w_one || wc2 == (my_wc_t) w_many) ||
> +          (charlen == 1&&
> +           my_cs_can_be_contraction_tail(cs, wc2)&&
> +           my_cs_contraction2_weight(cs, wc, wc2)))
> +      {
> +        *min_length= *max_length= res_length;
> +        goto pad_min_max;
> +      }
> +    }
> +
>       /* Normal character */
>       if (my_uni_utf32(cs, wc, (uchar*) min_str, (uchar*) min_end) != 4 ||
>           my_uni_utf32(cs, wc, (uchar*) max_str, (uchar*) max_end) != 4)
>
>
>
>


Thread
bzr commit into mysql-5.5-bugteam branch (bar:3114) Bug#57737Alexander Barkov11 Nov
  • Re: bzr commit into mysql-5.5-bugteam branch (bar:3114) Bug#57737Roy Lyseng19 Nov
    • Re: bzr commit into mysql-5.5-bugteam branch (bar:3114) Bug#57737Alexander Barkov22 Nov
      • Re: bzr commit into mysql-5.5-bugteam branch (bar:3114) Bug#57737Roy Lyseng22 Nov
        • Re: bzr commit into mysql-5.5-bugteam branch (bar:3114) Bug#57737Alexander Barkov23 Nov
  • Re: bzr commit into mysql-5.5-bugteam branch (bar:3114) Bug#57737Roy Lyseng22 Nov