#At file:///home/bar/mysql-bzr/mysql-5.5-bugteam.b57737/ based on revid:dmitry.shulga@strippednqay5s8s959l8x
3114 Alexander Barkov 2010-11-11
Bug#57737 Character sets: search fails with like, contraction, index
Problem: LIKE over an indexed column optimized away good results,
because my_like_range_utf32/utf16 returned wrong ranges for contractions.
Fix: Make my_like_range_utf32/utf16 return correct ranges for contractions.
Note, this code presens in my_like_range_mb and my_like_range_ucs2.
It was forgotten in utf32/utf16 versions (during mysql-6.0 push/revert mess).
added:
@ mysql-test/include/ctype_czech.inc
Adding shared test code
modified:
@ include/m_ctype.h
Adding helper functions for contractions.
@ mysql-test/r/ctype_uca.result
@ mysql-test/r/ctype_utf16_uca.result
@ mysql-test/r/ctype_utf32_uca.result
@ mysql-test/t/ctype_uca.test
@ mysql-test/t/ctype_utf16_uca.test
@ mysql-test/t/ctype_utf32_uca.test
Adding tests.
@ strings/ctype-mb.c
Pad function did not put the last character.
@ strings/ctype-ucs2.c
- my_fill_mb2 did not put the high byte, as previously
it was used to put only characters in ASCII range.
Now it puts high byte as well
(needed to pupulate cs->max_sort_char correctly).
- Adding contraction handling into my_like_range_utf16
- Removing legacy "Temporary fix" from my_like_range_utf16
- Using my_fill_mb2 instead of loop (duplicate code eliminations)
- Adding contraction handling into my_like_range_utf32
added:
mysql-test/include/ctype_czech.inc
modified:
include/m_ctype.h
mysql-test/r/ctype_uca.result
mysql-test/r/ctype_utf16_uca.result
mysql-test/r/ctype_utf32_uca.result
mysql-test/t/ctype_uca.test
mysql-test/t/ctype_utf16_uca.test
mysql-test/t/ctype_utf32_uca.test
strings/ctype-mb.c
strings/ctype-ucs2.c
=== modified file 'include/m_ctype.h'
--- a/include/m_ctype.h 2010-09-28 15:15:58 +0000
+++ b/include/m_ctype.h 2010-11-11 08:39:03 +0000
@@ -356,6 +356,32 @@ extern CHARSET_INFO my_charset_utf8mb4_u
#define MY_UTF8MB4 "utf8mb4"
+/* Helper functions to handle contraction */
+static inline my_bool
+my_cs_have_contractions(CHARSET_INFO *cs)
+{
+ return cs->contractions != NULL;
+}
+
+static inline my_bool
+my_cs_can_be_contraction_head(CHARSET_INFO *cs, my_wc_t wc)
+{
+ return ((const char *)cs->contractions)[0x40*0x40 + (wc & 0xFF)];
+}
+
+static inline my_bool
+my_cs_can_be_contraction_tail(CHARSET_INFO *cs, my_wc_t wc)
+{
+ return ((const char *)cs->contractions)[0x40*0x40 + (wc & 0xFF)];
+}
+
+static inline uint16*
+my_cs_contraction2_weight(CHARSET_INFO *cs, my_wc_t wc1, my_wc_t wc2)
+{
+ return &cs->contractions[(wc1 - 0x40) * 0x40 + wc2 - 0x40];
+}
+
+
/* declarations for simple charsets */
extern size_t my_strnxfrm_simple(CHARSET_INFO *, uchar *, size_t,
const uchar *, size_t);
=== added file 'mysql-test/include/ctype_czech.inc'
--- a/mysql-test/include/ctype_czech.inc 1970-01-01 00:00:00 +0000
+++ b/mysql-test/include/ctype_czech.inc 2010-11-11 08:39:03 +0000
@@ -0,0 +1,12 @@
+SELECT @@collation_connection;
+--echo #
+--echo # Bug#57737 Character sets: search fails with like, contraction, index
+--echo #
+CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0;
+INSERT INTO t1 VALUES ('c'),('ce'),('cé'),('ch');
+SELECT * FROM t1 WHERE s1 LIKE 'c%';
+ALTER TABLE t1 ADD KEY s1 (s1);
+SELECT * FROM t1 WHERE s1 LIKE 'c%';
+ALTER TABLE t1 DROP KEY s1, ADD KEY(s1(1));
+SELECT * FROM t1 WHERE s1 LIKE 'ch';
+DROP TABLE t1;
=== modified file 'mysql-test/r/ctype_uca.result'
--- a/mysql-test/r/ctype_uca.result 2008-03-26 09:51:16 +0000
+++ b/mysql-test/r/ctype_uca.result 2010-11-11 08:39:03 +0000
@@ -2888,3 +2888,63 @@ a hex(b) c
DROP TABLE t1;
set names utf8;
End for 5.0 tests
+#
+# Start of 5.5 tests
+#
+SET collation_connection=utf8_czech_ci;
+SELECT @@collation_connection;
+@@collation_connection
+utf8_czech_ci
+#
+# Bug#57737 Character sets: search fails with like, contraction, index
+#
+CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0;
+INSERT INTO t1 VALUES ('c'),('ce'),('cé'),('ch');
+SELECT * FROM t1 WHERE s1 LIKE 'c%';
+s1
+c
+ce
+cé
+ch
+ALTER TABLE t1 ADD KEY s1 (s1);
+SELECT * FROM t1 WHERE s1 LIKE 'c%';
+s1
+c
+ce
+cé
+ch
+ALTER TABLE t1 DROP KEY s1, ADD KEY(s1(1));
+SELECT * FROM t1 WHERE s1 LIKE 'ch';
+s1
+ch
+DROP TABLE t1;
+SET collation_connection=ucs2_czech_ci;
+SELECT @@collation_connection;
+@@collation_connection
+ucs2_czech_ci
+#
+# Bug#57737 Character sets: search fails with like, contraction, index
+#
+CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0;
+INSERT INTO t1 VALUES ('c'),('ce'),('cé'),('ch');
+SELECT * FROM t1 WHERE s1 LIKE 'c%';
+s1
+c
+ce
+cé
+ch
+ALTER TABLE t1 ADD KEY s1 (s1);
+SELECT * FROM t1 WHERE s1 LIKE 'c%';
+s1
+c
+ce
+cé
+ch
+ALTER TABLE t1 DROP KEY s1, ADD KEY(s1(1));
+SELECT * FROM t1 WHERE s1 LIKE 'ch';
+s1
+ch
+DROP TABLE t1;
+#
+# End of 5.5 tests
+#
=== modified file 'mysql-test/r/ctype_utf16_uca.result'
--- a/mysql-test/r/ctype_utf16_uca.result 2010-02-24 09:15:34 +0000
+++ b/mysql-test/r/ctype_utf16_uca.result 2010-11-11 08:39:03 +0000
@@ -2368,6 +2368,33 @@ NULL
NULL
NULL
drop table t1;
+SET collation_connection=utf16_czech_ci;
+SELECT @@collation_connection;
+@@collation_connection
+utf16_czech_ci
+#
+# Bug#57737 Character sets: search fails with like, contraction, index
+#
+CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0;
+INSERT INTO t1 VALUES ('c'),('ce'),('cé'),('ch');
+SELECT * FROM t1 WHERE s1 LIKE 'c%';
+s1
+c
+ce
+cé
+ch
+ALTER TABLE t1 ADD KEY s1 (s1);
+SELECT * FROM t1 WHERE s1 LIKE 'c%';
+s1
+c
+ce
+cT * FROM t1 WHERE s1 LIKE 'ch';
+s1
+ch
+DROP TABLE t1;
#
# End of 5.5 tests
#
=== modified file 'mysql-test/r/ctype_utf32_uca.result'
--- a/mysql-test/r/ctype_utf32_uca.result 2010-02-24 09:15:34 +0000
+++ b/mysql-test/r/ctype_utf32_uca.result 2010-11-11 08:39:03 +0000
@@ -2368,6 +2368,33 @@ NULL
NULL
NULL
drop table t1;
+SET collation_connection=utf32_czech_ci;
+SELECT @@collation_connection;
+@@collation_connection
+utf32_czech_ci
+#
+# Bug#57737 Character sets: search fails with like, contraction, index
+#
+CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0;
+INSERT INTO t1 VALUES ('c'),('ce'),('cé'),('ch');
+SELECT * FROM t1 WHERE s1 LIKE 'c%';
+s1
+c
+ce
+cé
+ch
+ALTER TABLE t1 ADD KEY s1 (s1);
+SELECT * FROM t1 WHERE s1 LIKE 'c%';
+s1
+c
+ce
+cé
+ch
+ALTER TABLE t1 DROP KEY s1, ADD KEY(s1(1));
+SELECT * FROM t1 WHERE s1 LIKE 'ch';
+s1
+ch
+DROP TABLE t1;
#
# End of 5.5 tests
#
=== modified file 'mysql-test/t/ctype_uca.test'
--- a/mysql-test/t/ctype_uca.test 2008-02-20 18:49:26 +0000
+++ b/mysql-test/t/ctype_uca.test 2010-11-11 08:39:03 +0000
@@ -545,3 +545,17 @@ set collation_connection=ucs2_unicode_ci
set names utf8;
-- echo End for 5.0 tests
+
+--echo #
+--echo # Start of 5.5 tests
+--echo #
+#
+# Test my_like_range and contractions
+#
+SET collation_connection=utf8_czech_ci;
+--source include/ctype_czech.inc
+SET collation_connection=ucs2_czech_ci;
+--source include/ctype_czech.inc
+--echo #
+--echo # End of 5.5 tests
+--echo #
=== modified file 'mysql-test/t/ctype_utf16_uca.test'
--- a/mysql-test/t/ctype_utf16_uca.test 2010-02-24 09:15:34 +0000
+++ b/mysql-test/t/ctype_utf16_uca.test 2010-11-11 08:39:03 +0000
@@ -284,6 +284,12 @@ DROP TABLE IF EXISTS t1;
set collation_connection=utf16_unicode_ci;
--source include/ctype_regex.inc
+#
+# Test my_like_range and contractions
+#
+SET collation_connection=utf16_czech_ci;
+--source include/ctype_czech.inc
+
--echo #
--echo # End of 5.5 tests
=== modified file 'mysql-test/t/ctype_utf32_uca.test'
--- a/mysql-test/t/ctype_utf32_uca.test 2010-02-24 09:15:34 +0000
+++ b/mysql-test/t/ctype_utf32_uca.test 2010-11-11 08:39:03 +0000
@@ -286,6 +286,13 @@ set collation_connection=utf32_unicode_c
--source include/ctype_regex.inc
+#
+# Test my_like_range and contractions
+#
+SET collation_connection=utf32_czech_ci;
+--source include/ctype_czech.inc
+
+
--echo #
--echo # End of 5.5 tests
--echo #
=== modified file 'strings/ctype-mb.c'
--- a/strings/ctype-mb.c 2010-07-26 06:47:03 +0000
+++ b/strings/ctype-mb.c 2010-11-11 08:39:03 +0000
@@ -636,7 +636,7 @@ static void pad_max_char(CHARSET_INFO *c
DBUG_ASSERT(buflen > 0);
do
{
- if ((str + buflen) < end)
+ if ((str + buflen) <= end)
{
/* Enough space for the characer */
memcpy(str, buf, buflen);
=== modified file 'strings/ctype-ucs2.c'
--- a/strings/ctype-ucs2.c 2010-10-20 19:02:59 +0000
+++ b/strings/ctype-ucs2.c 2010-11-11 08:39:03 +0000
@@ -903,7 +903,7 @@ static void
my_fill_mb2(CHARSET_INFO *cs __attribute__((unused)),
char *s, size_t l, int fill)
{
- for ( ; l >= 2; s[0]= 0, s[1]= fill, s+= 2, l-= 2);
+ for ( ; l >= 2; s[0]= (fill >> 8), s[1]= (fill & 0xFF), s+= 2, l-= 2);
}
@@ -1590,14 +1590,19 @@ my_like_range_utf16(CHARSET_INFO *cs,
char *min_str,char *max_str,
size_t *min_length,size_t *max_length)
{
- const char *end=ptr+ptr_length;
- char *min_org=min_str;
- char *min_end=min_str+res_length;
+ const char *end= ptr + ptr_length;
+ char *min_org= min_str;
+ char *min_end= min_str + res_length;
+ char *max_end= max_str + res_length;
size_t charlen= res_length / cs->mbmaxlen;
+ my_bool have_contractions= my_cs_have_contractions(cs);
for ( ; ptr + 1 < end && min_str + 1 < min_end && charlen > 0
; ptr+=2, charlen--)
{
+ my_wc_t wc2;
+ int res;
+
if (ptr[0] == '\0' && ptr[1] == escape && ptr + 1 < end)
{
ptr+=2; /* Skip escape */
@@ -1623,34 +1628,35 @@ my_like_range_utf16(CHARSET_INFO *cs,
*min_length= ((cs->state & MY_CS_BINSORT) ? (size_t) (min_str - min_org) :
res_length);
*max_length= res_length;
- do {
- *min_str++ = 0;
- *min_str++ = 0;
- *max_str++ = (char) (cs->max_sort_char >> 8);
- *max_str++ = (char) (cs->max_sort_char & 255);
- } while (min_str + 1 < min_end);
- return FALSE;
+ goto pad_min_max;
+ }
+ if (have_contractions && ptr + 3 < end &&
+ ptr[0] == '\0' &&
+ my_cs_can_be_contraction_head(cs, (uchar) ptr[1]) &&
+ (res= my_utf16_uni(cs, &wc2, (uchar*) ptr + 2, (uchar*) end)) > 0)
+ {
+ /*
+ Check contraction head followed by a wildcard,
+ or contraction that does not fit to result
+ */
+ if ((wc2 == (my_wc_t) w_one || wc2 == (my_wc_t) w_many) ||
+ (charlen == 1 &&
+ my_cs_can_be_contraction_tail(cs, wc2) &&
+ my_cs_contraction2_weight(cs, (uchar) ptr[1], wc2)))
+ {
+ *min_length= *max_length= res_length;
+ goto pad_min_max;
+ }
}
*min_str++= *max_str++ = ptr[0];
*min_str++= *max_str++ = ptr[1];
}
- /* Temporary fix for handling w_one at end of string (key compression) */
- {
- char *tmp;
- for (tmp= min_str ; tmp-1 > min_org && tmp[-1] == '\0' && tmp[-2]=='\0';)
- {
- *--tmp=' ';
- *--tmp='\0';
- }
- }
-
- *min_length= *max_length = (size_t) (min_str - min_org);
- while (min_str + 1 < min_end)
- {
- *min_str++ = *max_str++ = '\0';
- *min_str++ = *max_str++ = ' '; /* Because if key compression */
- }
+ *min_length= *max_length= (size_t) (min_str - min_org);
+
+pad_min_max:
+ my_fill_mb2(cs, min_str, min_end - min_str, cs->min_sort_char);
+ my_fill_mb2(cs, max_str, max_end - max_str, cs->max_sort_char);
return FALSE;
}
@@ -2583,12 +2589,13 @@ my_like_range_utf32(CHARSET_INFO *cs,
char *min_end= min_str + res_length;
char *max_end= max_str + res_length;
size_t charlen= res_length / cs->mbmaxlen;
+ my_bool have_contractions= my_cs_have_contractions(cs);
DBUG_ASSERT((res_length % 4) == 0);
for ( ; charlen > 0; ptr+= 4, charlen--)
{
- my_wc_t wc;
+ my_wc_t wc, wc2;
int res;
if ((res= my_utf32_uni(cs, &wc, (uchar*) ptr, (uchar*) end)) < 0)
{
@@ -2640,6 +2647,24 @@ my_like_range_utf32(CHARSET_INFO *cs,
goto pad_min_max;
}
+ if (have_contractions &&
+ my_cs_can_be_contraction_head(cs, wc) &&
+ (res= my_utf32_uni(cs, &wc2, (uchar*) ptr + 4, (uchar*) end)) > 0)
+ {
+ /*
+ Check contraction head followed by a wildcard
+ or contraction which does not fit to result
+ */
+ if ((wc2 == (my_wc_t) w_one || wc2 == (my_wc_t) w_many) ||
+ (charlen == 1 &&
+ my_cs_can_be_contraction_tail(cs, wc2) &&
+ my_cs_contraction2_weight(cs, wc, wc2)))
+ {
+ *min_length= *max_length= res_length;
+ goto pad_min_max;
+ }
+ }
+
/* Normal character */
if (my_uni_utf32(cs, wc, (uchar*) min_str, (uchar*) min_end) != 4 ||
my_uni_utf32(cs, wc, (uchar*) max_str, (uchar*) max_end) != 4)
Attachment: [text/bzr-bundle] bzr/bar@mysql.com-20101111083903-1hu3527l3vsqsjlx.bundle