List:Commits« Previous MessageNext Message »
From:Roy Lyseng Date:November 19 2010 1:49pm
Subject:Re: bzr commit into mysql-5.5-bugteam branch (bar:3114) Bug#57737
View as plain text  
Hi Alexander,

some preliminary notes to the code changes.

I still need some time to understand the logic of the patch.

On 11.11.10 09.44, Alexander Barkov wrote:
> #At file:///home/bar/mysql-bzr/mysql-5.5-bugteam.b57737/ based on
> revid:dmitry.shulga@stripped
>
>   3114 Alexander Barkov	2010-11-11
>        Bug#57737 Character sets: search fails with like, contraction, index
>
>        Problem: LIKE over an indexed column optimized away good results,
>        because my_like_range_utf32/utf16 returned wrong ranges for contractions.
>
>        Fix: Make my_like_range_utf32/utf16 return correct ranges for contractions.
>
>        Note, this code presens in my_like_range_mb and my_like_range_ucs2.
>        It was forgotten in utf32/utf16 versions (during mysql-6.0 push/revert mess).
>
>
>        added:
>          @ mysql-test/include/ctype_czech.inc
>          Adding shared test code
>
>        modified:
>          @ include/m_ctype.h
>          Adding helper functions for contractions.
>
>          @ mysql-test/r/ctype_uca.result
>          @ mysql-test/r/ctype_utf16_uca.result
>          @ mysql-test/r/ctype_utf32_uca.result
>          @ mysql-test/t/ctype_uca.test
>          @ mysql-test/t/ctype_utf16_uca.test
>          @ mysql-test/t/ctype_utf32_uca.test
>          Adding tests.
>
>          @ strings/ctype-mb.c
>          Pad function did not put the last character.
>
>          @ strings/ctype-ucs2.c
>          - my_fill_mb2 did not put the high byte, as previously
>            it was used to put only characters in ASCII range.
>            Now it puts high byte as well
>            (needed to pupulate cs->max_sort_char correctly).
>
>          - Adding contraction handling into my_like_range_utf16
>          - Removing legacy "Temporary fix" from my_like_range_utf16
>          - Using my_fill_mb2 instead of loop (duplicate code eliminations)
>          - Adding contraction handling into my_like_range_utf32
>
>      added:
>        mysql-test/include/ctype_czech.inc
>      modified:
>        include/m_ctype.h
>        mysql-test/r/ctype_uca.result
>        mysql-test/r/ctype_utf16_uca.result
>        mysql-test/r/ctype_utf32_uca.result
>        mysql-test/t/ctype_uca.test
>        mysql-test/t/ctype_utf16_uca.test
>        mysql-test/t/ctype_utf32_uca.test
>        strings/ctype-mb.c
>        strings/ctype-ucs2.c
> === modified file 'include/m_ctype.h'
> --- a/include/m_ctype.h	2010-09-28 15:15:58 +0000
> +++ b/include/m_ctype.h	2010-11-11 08:39:03 +0000
> @@ -356,6 +356,32 @@ extern CHARSET_INFO my_charset_utf8mb4_u
>   #define MY_UTF8MB4                 "utf8mb4"
>
>
> +/* Helper functions to handle contraction */
> +static inline my_bool
> +my_cs_have_contractions(CHARSET_INFO *cs)
> +{
> +  return cs->contractions != NULL;
> +}
> +
> +static inline my_bool
> +my_cs_can_be_contraction_head(CHARSET_INFO *cs, my_wc_t wc)
> +{
> +  return ((const char *)cs->contractions)[0x40*0x40 + (wc&  0xFF)];
> +}
> +
> +static inline my_bool
> +my_cs_can_be_contraction_tail(CHARSET_INFO *cs, my_wc_t wc)
> +{
> +  return ((const char *)cs->contractions)[0x40*0x40 + (wc&  0xFF)];
> +}
> +
> +static inline uint16*
> +my_cs_contraction2_weight(CHARSET_INFO *cs, my_wc_t wc1, my_wc_t wc2)
> +{
> +  return&cs->contractions[(wc1 - 0x40) * 0x40 + wc2 - 0x40];
> +}
> +
> +
>   /* declarations for simple charsets */
>   extern size_t my_strnxfrm_simple(CHARSET_INFO *, uchar *, size_t,
>                                    const uchar *, size_t);
>
> === added file 'mysql-test/include/ctype_czech.inc'
> --- a/mysql-test/include/ctype_czech.inc	1970-01-01 00:00:00 +0000
> +++ b/mysql-test/include/ctype_czech.inc	2010-11-11 08:39:03 +0000
> @@ -0,0 +1,12 @@
> +SELECT @@collation_connection;
> +--echo #
> +--echo # Bug#57737 Character sets: search fails with like, contraction, index
> +--echo #
> +CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0;
> +INSERT INTO t1 VALUES ('c'),('ce'),('cé'),('ch');
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +ALTER TABLE t1 ADD KEY s1 (s1);
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +ALTER TABLE t1 DROP KEY s1, ADD KEY(s1(1));
> +SELECT * FROM t1 WHERE s1 LIKE 'ch';
> +DROP TABLE t1;
>
> === modified file 'mysql-test/r/ctype_uca.result'
> --- a/mysql-test/r/ctype_uca.result	2008-03-26 09:51:16 +0000
> +++ b/mysql-test/r/ctype_uca.result	2010-11-11 08:39:03 +0000
> @@ -2888,3 +2888,63 @@ a	hex(b)	c
>   DROP TABLE t1;
>   set names utf8;
>   End for 5.0 tests
> +#
> +# Start of 5.5 tests
> +#
> +SET collation_connection=utf8_czech_ci;
> +SELECT @@collation_connection;
> +@@collation_connection
> +utf8_czech_ci
> +#
> +# Bug#57737 Character sets: search fails with like, contraction, index
> +#
> +CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0;
> +INSERT INTO t1 VALUES ('c'),('ce'),('cé'),('ch');
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +s1
> +c
> +ce
> +cé
> +ch
> +ALTER TABLE t1 ADD KEY s1 (s1);
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +s1
> +c
> +ce
> +cé
> +ch
> +ALTER TABLE t1 DROP KEY s1, ADD KEY(s1(1));
> +SELECT * FROM t1 WHERE s1 LIKE 'ch';
> +s1
> +ch
> +DROP TABLE t1;
> +SET collation_connection=ucs2_czech_ci;
> +SELECT @@collation_connection;
> +@@collation_connection
> +ucs2_czech_ci
> +#
> +# Bug#57737 Character sets: search fails with like, contraction, index
> +#
> +CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0;
> +INSERT INTO t1 VALUES ('c'),('ce'),('cé'),('ch');
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +s1
> +c
> +ce
> +cé
> +ch
> +ALTER TABLE t1 ADD KEY s1 (s1);
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +s1
> +c
> +ce
> +cé
> +ch
> +ALTER TABLE t1 DROP KEY s1, ADD KEY(s1(1));
> +SELECT * FROM t1 WHERE s1 LIKE 'ch';
> +s1
> +ch
> +DROP TABLE t1;
> +#
> +# End of 5.5 tests
> +#
>
> === modified file 'mysql-test/r/ctype_utf16_uca.result'
> --- a/mysql-test/r/ctype_utf16_uca.result	2010-02-24 09:15:34 +0000
> +++ b/mysql-test/r/ctype_utf16_uca.result	2010-11-11 08:39:03 +0000
> @@ -2368,6 +2368,33 @@ NULL
>   NULL
>   NULL
>   drop table t1;
> +SET collation_connection=utf16_czech_ci;
> +SELECT @@collation_connection;
> +@@collation_connection
> +utf16_czech_ci
> +#
> +# Bug#57737 Character sets: search fails with like, contraction, index
> +#
> +CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0;
> +INSERT INTO t1 VALUES ('c'),('ce'),('cé'),('ch');
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +s1
> +c
> +ce
> +cé
> +ch
> +ALTER TABLE t1 ADD KEY s1 (s1);
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +s1
> +c
> +ce
> +cé
> +ch
> +ALTER TABLE t1 DROP KEY s1, ADD KEY(s1(1));
> +SELECT * FROM t1 WHERE s1 LIKE 'ch';
> +s1
> +ch
> +DROP TABLE t1;
>   #
>   # End of 5.5 tests
>   #
>
> === modified file 'mysql-test/r/ctype_utf32_uca.result'
> --- a/mysql-test/r/ctype_utf32_uca.result	2010-02-24 09:15:34 +0000
> +++ b/mysql-test/r/ctype_utf32_uca.result	2010-11-11 08:39:03 +0000
> @@ -2368,6 +2368,33 @@ NULL
>   NULL
>   NULL
>   drop table t1;
> +SET collation_connection=utf32_czech_ci;
> +SELECT @@collation_connection;
> +@@collation_connection
> +utf32_czech_ci
> +#
> +# Bug#57737 Character sets: search fails with like, contraction, index
> +#
> +CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0;
> +INSERT INTO t1 VALUES ('c'),('ce'),('cé'),('ch');
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +s1
> +c
> +ce
> +cé
> +ch
> +ALTER TABLE t1 ADD KEY s1 (s1);
> +SELECT * FROM t1 WHERE s1 LIKE 'c%';
> +s1
> +c
> +ce
> +cé
> +ch
> +ALTER TABLE t1 DROP KEY s1, ADD KEY(s1(1));
> +SELECT * FROM t1 WHERE s1 LIKE 'ch';
> +s1
> +ch
> +DROP TABLE t1;
>   #
>   # End of 5.5 tests
>   #
>
> === modified file 'mysql-test/t/ctype_uca.test'
> --- a/mysql-test/t/ctype_uca.test	2008-02-20 18:49:26 +0000
> +++ b/mysql-test/t/ctype_uca.test	2010-11-11 08:39:03 +0000
> @@ -545,3 +545,17 @@ set collation_connection=ucs2_unicode_ci
>   set names utf8;
>
>   -- echo End for 5.0 tests
> +
> +--echo #
> +--echo # Start of 5.5 tests
> +--echo #
> +#
> +# Test my_like_range and contractions
> +#
> +SET collation_connection=utf8_czech_ci;
> +--source include/ctype_czech.inc
> +SET collation_connection=ucs2_czech_ci;
> +--source include/ctype_czech.inc
> +--echo #
> +--echo # End of 5.5 tests
> +--echo #
>
> === modified file 'mysql-test/t/ctype_utf16_uca.test'
> --- a/mysql-test/t/ctype_utf16_uca.test	2010-02-24 09:15:34 +0000
> +++ b/mysql-test/t/ctype_utf16_uca.test	2010-11-11 08:39:03 +0000
> @@ -284,6 +284,12 @@ DROP TABLE IF EXISTS t1;
>   set collation_connection=utf16_unicode_ci;
>   --source include/ctype_regex.inc
>
> +#
> +# Test my_like_range and contractions
> +#
> +SET collation_connection=utf16_czech_ci;
> +--source include/ctype_czech.inc
> +
>
>   --echo #
>   --echo # End of 5.5 tests
>
> === modified file 'mysql-test/t/ctype_utf32_uca.test'
> --- a/mysql-test/t/ctype_utf32_uca.test	2010-02-24 09:15:34 +0000
> +++ b/mysql-test/t/ctype_utf32_uca.test	2010-11-11 08:39:03 +0000
> @@ -286,6 +286,13 @@ set collation_connection=utf32_unicode_c
>   --source include/ctype_regex.inc
>
>
> +#
> +# Test my_like_range and contractions
> +#
> +SET collation_connection=utf32_czech_ci;
> +--source include/ctype_czech.inc
> +
> +
>   --echo #
>   --echo # End of 5.5 tests
>   --echo #
>
> === modified file 'strings/ctype-mb.c'
> --- a/strings/ctype-mb.c	2010-07-26 06:47:03 +0000
> +++ b/strings/ctype-mb.c	2010-11-11 08:39:03 +0000
> @@ -636,7 +636,7 @@ static void pad_max_char(CHARSET_INFO *c
>     DBUG_ASSERT(buflen>  0);
>     do
>     {
> -    if ((str + buflen)<  end)
> +    if ((str + buflen)<= end)
>       {
>         /* Enough space for the characer */
>         memcpy(str, buf, buflen);
>
> === modified file 'strings/ctype-ucs2.c'
> --- a/strings/ctype-ucs2.c	2010-10-20 19:02:59 +0000
> +++ b/strings/ctype-ucs2.c	2010-11-11 08:39:03 +0000
> @@ -903,7 +903,7 @@ static void
>   my_fill_mb2(CHARSET_INFO *cs __attribute__((unused)),
>               char *s, size_t l, int fill)

Type of argument 'fill' should be uint16.
>   {
> -  for ( ; l>= 2; s[0]= 0, s[1]= fill, s+= 2, l-= 2);
> +  for ( ; l>= 2; s[0]= (fill>>  8), s[1]= (fill&  0xFF), s+= 2, l-=
> 2);
>   }
>
>
> @@ -1590,14 +1590,19 @@ my_like_range_utf16(CHARSET_INFO *cs,
>                       char *min_str,char *max_str,
>                       size_t *min_length,size_t *max_length)

Can you please document arguments cs, w_one, w_many, min_length and max_length?
And likewise for my_like_range_utf32().

The 'escape' argument is a single byte character. Is that correct in the general 
case? If so, I think the fact should be documented.

>   {
> -  const char *end=ptr+ptr_length;
> -  char *min_org=min_str;
> -  char *min_end=min_str+res_length;
> +  const char *end= ptr + ptr_length;
> +  char *min_org= min_str;

min_org can be 'const char *'.

> +  char *min_end= min_str + res_length;
> +  char *max_end= max_str + res_length;
>     size_t charlen= res_length / cs->mbmaxlen;
> +  my_bool have_contractions= my_cs_have_contractions(cs);

For symmetry, you may add here DBUG_ASSERT((res_length % 2) == 0);

>
>     for ( ; ptr + 1<  end&&  min_str + 1<  min_end&& 
> charlen>  0
>           ; ptr+=2, charlen--)
>     {
> +    my_wc_t wc2;
> +    int res;

The 'res' local variable is redundant (never tested).

> +
>       if (ptr[0] == '\0'&&  ptr[1] == escape&&  ptr + 1<  end)
>       {
>         ptr+=2;                                        /* Skip escape */
> @@ -1623,34 +1628,35 @@ my_like_range_utf16(CHARSET_INFO *cs,
>         *min_length= ((cs->state&  MY_CS_BINSORT) ? (size_t) (min_str -
> min_org) :
>                       res_length);
>         *max_length= res_length;
> -      do {
> -        *min_str++ = 0;
> -        *min_str++ = 0;
> -        *max_str++ = (char) (cs->max_sort_char>>  8);
> -        *max_str++ = (char) (cs->max_sort_char&  255);
> -      } while (min_str + 1<  min_end);
> -      return FALSE;
> +      goto pad_min_max;
> +    }
> +    if (have_contractions&&  ptr + 3<  end&&
> +        ptr[0] == '\0'&&
> +        my_cs_can_be_contraction_head(cs, (uchar) ptr[1])&&

The cast of ptr[1] is not according to the prototype of 
my_cs_can_be_contraction_head().

> +        (res= my_utf16_uni(cs,&wc2, (uchar*) ptr + 2, (uchar*) end))>  0)
> +    {
> +      /*
> +         Check contraction head followed by a wildcard,
> +         or contraction that does not fit to result
> +      */
> +      if ((wc2 == (my_wc_t) w_one || wc2 == (my_wc_t) w_many) ||
> +          (charlen == 1&&
> +           my_cs_can_be_contraction_tail(cs, wc2)&&
> +           my_cs_contraction2_weight(cs, (uchar) ptr[1], wc2)))
> +      {
> +        *min_length= *max_length= res_length;
> +        goto pad_min_max;
> +      }
>       }
>       *min_str++= *max_str++ = ptr[0];
>       *min_str++= *max_str++ = ptr[1];
>     }
>
> -  /* Temporary fix for handling w_one at end of string (key compression) */
> -  {
> -    char *tmp;
> -    for (tmp= min_str ; tmp-1>  min_org&&  tmp[-1] == '\0'&& 
> tmp[-2]=='\0';)
> -    {
> -      *--tmp=' ';
> -      *--tmp='\0';
> -    }
> -  }
> -
> -  *min_length= *max_length = (size_t) (min_str - min_org);
> -  while (min_str + 1<  min_end)
> -  {
> -    *min_str++ = *max_str++ = '\0';
> -    *min_str++ = *max_str++ = ' ';      /* Because if key compression */
> -  }
> +  *min_length= *max_length= (size_t) (min_str - min_org);
> +
> +pad_min_max:
> +  my_fill_mb2(cs, min_str, min_end - min_str, cs->min_sort_char);
> +  my_fill_mb2(cs, max_str, max_end - max_str, cs->max_sort_char);
>     return FALSE;
>   }
>
> @@ -2583,12 +2589,13 @@ my_like_range_utf32(CHARSET_INFO *cs,
>     char *min_end= min_str + res_length;
>     char *max_end= max_str + res_length;
>     size_t charlen= res_length / cs->mbmaxlen;
> +  my_bool have_contractions= my_cs_have_contractions(cs);
>
>     DBUG_ASSERT((res_length % 4) == 0);
>
>     for ( ; charlen>  0; ptr+= 4, charlen--)
>     {
> -    my_wc_t wc;
> +    my_wc_t wc, wc2;
>       int res;

The 'res' local variable is redundant.

>       if ((res= my_utf32_uni(cs,&wc, (uchar*) ptr, (uchar*) end))<  0)
>       {
> @@ -2640,6 +2647,24 @@ my_like_range_utf32(CHARSET_INFO *cs,
>         goto pad_min_max;
>       }
>
> +    if (have_contractions&&
> +        my_cs_can_be_contraction_head(cs, wc)&&
> +        (res= my_utf32_uni(cs,&wc2, (uchar*) ptr + 4, (uchar*) end))>  0)
> +    {
> +      /*
> +         Check contraction head followed by a wildcard
> +         or contraction which does not fit to result
> +      */
> +      if ((wc2 == (my_wc_t) w_one || wc2 == (my_wc_t) w_many) ||
> +          (charlen == 1&&
> +           my_cs_can_be_contraction_tail(cs, wc2)&&
> +           my_cs_contraction2_weight(cs, wc, wc2)))
> +      {
> +        *min_length= *max_length= res_length;
> +        goto pad_min_max;
> +      }
> +    }
> +
>       /* Normal character */
>       if (my_uni_utf32(cs, wc, (uchar*) min_str, (uchar*) min_end) != 4 ||
>           my_uni_utf32(cs, wc, (uchar*) max_str, (uchar*) max_end) != 4)
>

In escape handling, this function performs:
   *min_str++= 4;
   *max_str++= 4;
Is that correct?

The return values in the two functions seem to be inconsistent - There are no 
return TRUE in my_like_range_utf16() but two in my_like_range_utf32().

Thanks,
Roy

Thread
bzr commit into mysql-5.5-bugteam branch (bar:3114) Bug#57737Alexander Barkov11 Nov
  • Re: bzr commit into mysql-5.5-bugteam branch (bar:3114) Bug#57737Roy Lyseng19 Nov
    • Re: bzr commit into mysql-5.5-bugteam branch (bar:3114) Bug#57737Alexander Barkov22 Nov
      • Re: bzr commit into mysql-5.5-bugteam branch (bar:3114) Bug#57737Roy Lyseng22 Nov
        • Re: bzr commit into mysql-5.5-bugteam branch (bar:3114) Bug#57737Alexander Barkov23 Nov
  • Re: bzr commit into mysql-5.5-bugteam branch (bar:3114) Bug#57737Roy Lyseng22 Nov