List:Commits« Previous MessageNext Message »
From:Alexander Barkov Date:August 31 2010 12:00pm
Subject:bzr commit into mysql-5.5-bugfixing branch (bar:3195) Bug#55980
View as plain text  
#At file:///home/bar/mysql-bzr/mysql-5.5-bugfixing.b55980/ based on revid:alik@stripped

 3195 Alexander Barkov	2010-08-31
      Bug#55980 Character sets: supplementary character _bin ordering is wrong
      
      Problem:
      - ORDER BY for utf8mb4_bin, utf16_bin and utf32_bin returned
        results in a wrong order, because old functions
        (supporting only BMP range) were used to handle these collations.
      - Additionally, utf16_bin did not sort supplementary characters
        between U+D700 and U+E000, as WL#1213 specification specified.
      
        mysql-test/include/ctype_filesort2.inc
        Adding a new shared test file
      
        include/m_ctype.h
        Adding prototypes
      
        mysql-test/r/ctype_utf16.result
        mysql-test/r/ctype_utf32.result
        mysql-test/r/ctype_utf8mb4.result
        mysql-test/t/ctype_utf16.test
        mysql-test/t/ctype_utf32.test
        mysql-test/t/ctype_utf8mb4.test
        Adding tests
      
        strings/ctype-ucs2.c
        - Fixing my_strncoll[sp]_utf16_bin to compare
          binary representation instead of code points,
          to make columns with indexes sort correct.
        - Fixing my_collation_handler_utf32_bin and
          my_collation_handler_utf16_bin to use new
          functions
       
        strings/ctype-utf8.c
        - Adding my_strnxfrm[len]_unicode_fill_bin()
          to handle utf8mb4_bin, utf16_bin and utf32_bin,
          using 3 bytes per weight.
          This function also performs special reordering in case of utf16_bin.
        - Fixing my_collation_utf8mb4_bin handler to use the
          new function.

    added:
      mysql-test/include/ctype_filesort2.inc
    modified:
      include/m_ctype.h
      mysql-test/r/ctype_utf16.result
      mysql-test/r/ctype_utf32.result
      mysql-test/r/ctype_utf8mb4.result
      mysql-test/t/ctype_utf16.test
      mysql-test/t/ctype_utf32.test
      mysql-test/t/ctype_utf8mb4.test
      strings/ctype-ucs2.c
      strings/ctype-utf8.c
=== modified file 'include/m_ctype.h'
--- a/include/m_ctype.h	2010-03-31 14:05:33 +0000
+++ b/include/m_ctype.h	2010-08-31 11:48:33 +0000
@@ -539,6 +539,11 @@ size_t my_strnxfrm_unicode(CHARSET_INFO 
                            uchar *dst, size_t dstlen,
                            const uchar *src, size_t srclen);
 
+size_t my_strnxfrm_unicode_full_bin(CHARSET_INFO *,
+                                    uchar *dst, size_t dstlen,
+                                    const uchar *src, size_t srclen);
+size_t  my_strnxfrmlen_unicode_full_bin(CHARSET_INFO *, size_t); 
+
 int my_wildcmp_unicode(CHARSET_INFO *cs,
                        const char *str, const char *str_end,
                        const char *wildstr, const char *wildend,

=== added file 'mysql-test/include/ctype_filesort2.inc'
--- a/mysql-test/include/ctype_filesort2.inc	1970-01-01 00:00:00 +0000
+++ b/mysql-test/include/ctype_filesort2.inc	2010-08-31 11:48:33 +0000
@@ -0,0 +1,16 @@
+#
+# Testing filesort for full Unicode character sets
+# with supplementary characters.
+#
+
+--echo #
+--echo # Bug#55980 Character sets: supplementary character _bin ordering is wrong
+--echo #
+CREATE TABLE t1 AS SELECT REPEAT('a',1) AS a LIMIT 0;
+SHOW CREATE TABLE t1;
+INSERT INTO t1 VALUES (_utf8mb4 0xEFBE9D),(_utf8mb4 0xF0908E84);
+INSERT INTO t1 VALUES (_utf8mb4 0xCE85),(_utf8mb4 0xF4808080);
+SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
+ALTER TABLE t1 ADD KEY(a);
+SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
+DROP TABLE IF EXISTS t1;

=== modified file 'mysql-test/r/ctype_utf16.result'
--- a/mysql-test/r/ctype_utf16.result	2010-06-02 12:23:50 +0000
+++ b/mysql-test/r/ctype_utf16.result	2010-08-31 11:48:33 +0000
@@ -611,6 +611,31 @@ utf16_bin	00610009
 utf16_bin	0061
 utf16_bin	00610020
 drop table t1;
+#
+# Bug#55980 Character sets: supplementary character _bin ordering is wrong
+#
+CREATE TABLE t1 AS SELECT REPEAT('a',1) AS a LIMIT 0;
+SHOW CREATE TABLE t1;
+Table	Create Table
+t1	CREATE TABLE `t1` (
+  `a` varchar(1) CHARACTER SET utf16 COLLATE utf16_bin NOT NULL DEFAULT ''
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+INSERT INTO t1 VALUES (_utf8mb4 0xEFBE9D),(_utf8mb4 0xF0908E84);
+INSERT INTO t1 VALUES (_utf8mb4 0xCE85),(_utf8mb4 0xF4808080);
+SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
+HEX(a)	HEX(CONVERT(a USING utf8mb4))
+0385	CE85
+D800DF84	F0908E84
+DBC0DC00	F4808080
+FF9D	EFBE9D
+ALTER TABLE t1 ADD KEY(a);
+SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
+HEX(a)	HEX(CONVERT(a USING utf8mb4))
+0385	CE85
+D800DF84	F0908E84
+DBC0DC00	F4808080
+FF9D	EFBE9D
+DROP TABLE IF EXISTS t1;
 select @@collation_connection;
 @@collation_connection
 utf16_bin

=== modified file 'mysql-test/r/ctype_utf32.result'
--- a/mysql-test/r/ctype_utf32.result	2010-08-26 12:36:33 +0000
+++ b/mysql-test/r/ctype_utf32.result	2010-08-31 11:48:33 +0000
@@ -610,6 +610,31 @@ utf32_bin	0000006100000009
 utf32_bin	00000061
 utf32_bin	0000006100000020
 drop table t1;
+#
+# Bug#55980 Character sets: supplementary character _bin ordering is wrong
+#
+CREATE TABLE t1 AS SELECT REPEAT('a',1) AS a LIMIT 0;
+SHOW CREATE TABLE t1;
+Table	Create Table
+t1	CREATE TABLE `t1` (
+  `a` varchar(1) CHARACTER SET utf32 COLLATE utf32_bin NOT NULL DEFAULT ''
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+INSERT INTO t1 VALUES (_utf8mb4 0xEFBE9D),(_utf8mb4 0xF0908E84);
+INSERT INTO t1 VALUES (_utf8mb4 0xCE85),(_utf8mb4 0xF4808080);
+SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
+HEX(a)	HEX(CONVERT(a USING utf8mb4))
+00000385	CE85
+0000FF9D	EFBE9D
+00010384	F0908E84
+00100000	F4808080
+ALTER TABLE t1 ADD KEY(a);
+SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
+HEX(a)	HEX(CONVERT(a USING utf8mb4))
+00000385	CE85
+0000FF9D	EFBE9D
+00010384	F0908E84
+00100000	F4808080
+DROP TABLE IF EXISTS t1;
 select @@collation_connection;
 @@collation_connection
 utf32_bin

=== modified file 'mysql-test/r/ctype_utf8mb4.result'
--- a/mysql-test/r/ctype_utf8mb4.result	2010-06-02 12:23:50 +0000
+++ b/mysql-test/r/ctype_utf8mb4.result	2010-08-31 11:48:33 +0000
@@ -987,6 +987,31 @@ utf8mb4_bin	6109
 utf8mb4_bin	61
 utf8mb4_bin	6120
 drop table t1;
+#
+# Bug#55980 Character sets: supplementary character _bin ordering is wrong
+#
+CREATE TABLE t1 AS SELECT REPEAT('a',1) AS a LIMIT 0;
+SHOW CREATE TABLE t1;
+Table	Create Table
+t1	CREATE TABLE `t1` (
+  `a` varchar(1) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL DEFAULT ''
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+INSERT INTO t1 VALUES (_utf8mb4 0xEFBE9D),(_utf8mb4 0xF0908E84);
+INSERT INTO t1 VALUES (_utf8mb4 0xCE85),(_utf8mb4 0xF4808080);
+SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
+HEX(a)	HEX(CONVERT(a USING utf8mb4))
+CE85	CE85
+EFBE9D	EFBE9D
+F0908E84	F0908E84
+F4808080	F4808080
+ALTER TABLE t1 ADD KEY(a);
+SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
+HEX(a)	HEX(CONVERT(a USING utf8mb4))
+CE85	CE85
+EFBE9D	EFBE9D
+F0908E84	F0908E84
+F4808080	F4808080
+DROP TABLE IF EXISTS t1;
 select @@collation_connection;
 @@collation_connection
 utf8mb4_bin

=== modified file 'mysql-test/t/ctype_utf16.test'
--- a/mysql-test/t/ctype_utf16.test	2010-06-02 12:23:50 +0000
+++ b/mysql-test/t/ctype_utf16.test	2010-08-31 11:48:33 +0000
@@ -326,6 +326,7 @@ SET collation_connection='utf16_general_
 SET NAMES latin1;
 SET collation_connection='utf16_bin';
 -- source include/ctype_filesort.inc
+-- source include/ctype_filesort2.inc
 -- source include/ctype_like_escape.inc
 
 #

=== modified file 'mysql-test/t/ctype_utf32.test'
--- a/mysql-test/t/ctype_utf32.test	2010-08-26 12:36:33 +0000
+++ b/mysql-test/t/ctype_utf32.test	2010-08-31 11:48:33 +0000
@@ -328,6 +328,7 @@ SET collation_connection='utf32_general_
 SET NAMES latin1;
 SET collation_connection='utf32_bin';
 -- source include/ctype_filesort.inc
+-- source include/ctype_filesort2.inc
 -- source include/ctype_like_escape.inc
 
 #

=== modified file 'mysql-test/t/ctype_utf8mb4.test'
--- a/mysql-test/t/ctype_utf8mb4.test	2010-06-02 12:23:50 +0000
+++ b/mysql-test/t/ctype_utf8mb4.test	2010-08-31 11:48:33 +0000
@@ -733,6 +733,7 @@ SET collation_connection='utf8mb4_genera
 -- source include/ctype_german.inc
 SET collation_connection='utf8mb4_bin';
 -- source include/ctype_filesort.inc
+-- source include/ctype_filesort2.inc
 -- source include/ctype_like_escape.inc
 
 #

=== modified file 'strings/ctype-ucs2.c'
--- a/strings/ctype-ucs2.c	2010-07-23 20:09:27 +0000
+++ b/strings/ctype-ucs2.c	2010-08-31 11:48:33 +0000
@@ -1469,7 +1469,7 @@ my_strnncoll_utf16_bin(CHARSET_INFO *cs,
     }
     if (s_wc != t_wc)
     {
-      return  s_wc > t_wc ? 1 : -1;
+      return  my_bincmp(s, s + s_res, t, t + t_res);
     }
     
     s+= s_res;
@@ -1511,7 +1511,7 @@ my_strnncollsp_utf16_bin(CHARSET_INFO *c
 
     if (s_wc != t_wc)
     {
-      return s_wc > t_wc ? 1 : -1;
+      return my_bincmp(s, s + s_res, t, t + t_res);
     }
 
     s+= s_res;
@@ -1684,8 +1684,8 @@ static MY_COLLATION_HANDLER my_collation
   NULL,                /* init */
   my_strnncoll_utf16_bin,
   my_strnncollsp_utf16_bin,
-  my_strnxfrm_unicode,
-  my_strnxfrmlen_simple,
+  my_strnxfrm_unicode_full_bin,
+  my_strnxfrmlen_unicode_full_bin,
   my_like_range_utf16,
   my_wildcmp_utf16_bin,
   my_strcasecmp_mb2_or_mb4,
@@ -2711,8 +2711,8 @@ static MY_COLLATION_HANDLER my_collation
   NULL, /* init */
   my_strnncoll_utf32_bin,
   my_strnncollsp_utf32_bin,
-  my_strnxfrm_unicode,
-  my_strnxfrmlen_utf32,
+  my_strnxfrm_unicode_full_bin,
+  my_strnxfrmlen_unicode_full_bin,
   my_like_range_utf32,
   my_wildcmp_utf32_bin,
   my_strcasecmp_mb2_or_mb4,

=== modified file 'strings/ctype-utf8.c'
--- a/strings/ctype-utf8.c	2010-03-04 11:00:32 +0000
+++ b/strings/ctype-utf8.c	2010-08-31 11:48:33 +0000
@@ -1893,7 +1893,13 @@ my_wildcmp_unicode(CHARSET_INFO *cs,
 
 
 /*
-  This function is shared between utf8mb3/utf8mb4/ucs2/utf16/utf32
+  Store sorting weights using 2 bytes per character.
+
+  This function is shared between
+  - utf8mb3_general_ci, utf8_bin, ucs2_general_ci, ucs2_bin
+    which support BMP only (U+0000..U+FFFF).
+  - utf8mb4_general_ci, utf16_general_ci, utf32_general_ci,
+    which map all supplementary characters to weight 0xFFFF.
 */
 size_t
 my_strnxfrm_unicode(CHARSET_INFO *cs,
@@ -1937,6 +1943,70 @@ my_strnxfrm_unicode(CHARSET_INFO *cs,
 }
 
 
+/*
+  Store sorting weights using 3 bytes per character.
+  This function is shared between utf8mb4_bin, utf16_bin, utf32_bin.
+*/
+size_t
+my_strnxfrm_unicode_full_bin(CHARSET_INFO *cs,
+                             uchar *dst, size_t dstlen,
+                             const uchar *src, size_t srclen)
+{
+  my_wc_t wc;
+  uchar *de= dst + dstlen;
+  uchar *de_beg= de - 2; /* The beginning of the last chunk */
+  const uchar *se = src + srclen;
+
+  LINT_INIT(wc);
+  DBUG_ASSERT(src);
+  DBUG_ASSERT(cs->state & MY_CS_BINSORT);
+
+  while (dst < de_beg)
+  {
+    int res;
+    if ((res= cs->cset->mb_wc(cs, &wc, src, se)) <= 0)
+      break;
+    src+= res;
+    if (cs->mbminlen == 2) /* utf16_bin */
+    {
+      /*
+        Reorder code points to weights as follows:
+        U+0000..U+D7FF    -> [00][00][00]..[00][D7][FF] BMP part #1
+        U+10000..U+10FFFF -> [10][00][00]..[10][FF][FF] Supplementary
+        U+E000..U+FFFF    -> [20][E0][00]..[20][FF][FF] BMP part #2
+      */
+      if (wc >= 0xE000 && wc <= 0xFFFF)
+        wc+= 0x200000;
+    }
+    *dst++= (uchar) (wc >> 16);
+    *dst++= (uchar) ((wc >> 8) & 0xFF);
+    *dst++= (uchar) (wc & 0xFF);
+  }
+
+  while (dst < de_beg) /* Fill the tail with keys for space character */
+  {
+    *dst++= 0x00;
+    *dst++= 0x00;
+    *dst++= 0x20;
+  }
+
+  /* Clear the last one or two bytes, if "dstlen" was not divisible by 3 */
+  if (dst < de)
+  {
+    *dst++= 0x00;
+    if (dst < de)
+      *dst= 0x00;
+  }
+
+  return dstlen;
+}
+
+
+size_t
+my_strnxfrmlen_unicode_full_bin(CHARSET_INFO *cs, size_t len)
+{
+  return ((len + 3) / cs->mbmaxlen) * 3;
+}
 #endif /* HAVE_UNIDATA */
 
 
@@ -5067,8 +5137,8 @@ static MY_COLLATION_HANDLER my_collation
     NULL,		/* init */
     my_strnncoll_mb_bin,
     my_strnncollsp_mb_bin,
-    my_strnxfrm_unicode,
-    my_strnxfrmlen_utf8mb4,
+    my_strnxfrm_unicode_full_bin,
+    my_strnxfrmlen_unicode_full_bin,
     my_like_range_mb,
     my_wildcmp_mb_bin,
     my_strcasecmp_mb_bin,


Attachment: [text/bzr-bundle] bzr/bar@mysql.com-20100831114833-xpxsafdfe69xh6es.bundle
Thread
bzr commit into mysql-5.5-bugfixing branch (bar:3195) Bug#55980Alexander Barkov31 Aug