List: Commits « Previous Message Next Message »
From: Alexander Nozdrin Date: August 31 2010 2:22pm
Subject: bzr commit into mysql-5.5 branch (alik:3086) Bug#55980
View as plain text  
#At file:///mnt/raid/alik/MySQL/bzr/00.builds/mysql-5.5.6-m3-release/ based on revid:alik@stripped

 3086 Alexander Nozdrin	2010-08-31
      Cherry-picking patch for Bug#55980.
      Original changeset:
      ------------------------------------------------------------
      revno: 3197
      revision-id: alik@stripped
      parent: magnus.blaudd@stripped
      committer: Alexander Nozdrin <alik@stripped>
      branch nick: mysql-5.5-bugfixing
      timestamp: Tue 2010-08-31 17:54:26 +0400
      message:
        Bug#55980 Character sets: supplementary character _bin ordering is wrong
        
        Problem:
        - ORDER BY for utf8mb4_bin, utf16_bin and utf32_bin returned
          results in a wrong order, because old functions
          (supporting only BMP range) were used to handle these collations.
        - Additionally, utf16_bin did not sort supplementary characters
          between U+D7FF and U+E000, as the WL#1213 specification requires.
      ------------------------------------------------------------

    added:
      mysql-test/include/ctype_filesort2.inc
    modified:
      include/m_ctype.h
      mysql-test/r/ctype_utf16.result
      mysql-test/r/ctype_utf32.result
      mysql-test/r/ctype_utf8mb4.result
      mysql-test/t/ctype_utf16.test
      mysql-test/t/ctype_utf32.test
      mysql-test/t/ctype_utf8mb4.test
      strings/ctype-ucs2.c
      strings/ctype-utf8.c
=== modified file 'include/m_ctype.h'
--- a/include/m_ctype.h	2010-03-31 14:05:33 +0000
+++ b/include/m_ctype.h	2010-08-31 14:22:03 +0000
@@ -539,6 +539,11 @@ size_t my_strnxfrm_unicode(CHARSET_INFO 
                            uchar *dst, size_t dstlen,
                            const uchar *src, size_t srclen);
 
+size_t my_strnxfrm_unicode_full_bin(CHARSET_INFO *,
+                                    uchar *dst, size_t dstlen,
+                                    const uchar *src, size_t srclen);
+size_t  my_strnxfrmlen_unicode_full_bin(CHARSET_INFO *, size_t); 
+
 int my_wildcmp_unicode(CHARSET_INFO *cs,
                        const char *str, const char *str_end,
                        const char *wildstr, const char *wildend,

=== added file 'mysql-test/include/ctype_filesort2.inc'
--- a/mysql-test/include/ctype_filesort2.inc	1970-01-01 00:00:00 +0000
+++ b/mysql-test/include/ctype_filesort2.inc	2010-08-31 14:22:03 +0000
@@ -0,0 +1,16 @@
+#
+# Testing filesort for full Unicode character sets
+# with supplementary characters.
+#
+
+--echo #
+--echo # Bug#55980 Character sets: supplementary character _bin ordering is wrong
+--echo #
+CREATE TABLE t1 AS SELECT REPEAT('a',1) AS a LIMIT 0;
+SHOW CREATE TABLE t1;
+INSERT INTO t1 VALUES (_utf8mb4 0xEFBE9D),(_utf8mb4 0xF0908E84);
+INSERT INTO t1 VALUES (_utf8mb4 0xCE85),(_utf8mb4 0xF4808080);
+SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
+ALTER TABLE t1 ADD KEY(a);
+SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
+DROP TABLE IF EXISTS t1;

=== modified file 'mysql-test/r/ctype_utf16.result'
--- a/mysql-test/r/ctype_utf16.result	2010-06-02 12:23:50 +0000
+++ b/mysql-test/r/ctype_utf16.result	2010-08-31 14:22:03 +0000
@@ -611,6 +611,31 @@ utf16_bin	00610009
 utf16_bin	0061
 utf16_bin	00610020
 drop table t1;
+#
+# Bug#55980 Character sets: supplementary character _bin ordering is wrong
+#
+CREATE TABLE t1 AS SELECT REPEAT('a',1) AS a LIMIT 0;
+SHOW CREATE TABLE t1;
+Table	Create Table
+t1	CREATE TABLE `t1` (
+  `a` varchar(1) CHARACTER SET utf16 COLLATE utf16_bin NOT NULL DEFAULT ''
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+INSERT INTO t1 VALUES (_utf8mb4 0xEFBE9D),(_utf8mb4 0xF0908E84);
+INSERT INTO t1 VALUES (_utf8mb4 0xCE85),(_utf8mb4 0xF4808080);
+SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
+HEX(a)	HEX(CONVERT(a USING utf8mb4))
+0385	CE85
+D800DF84	F0908E84
+DBC0DC00	F4808080
+FF9D	EFBE9D
+ALTER TABLE t1 ADD KEY(a);
+SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
+HEX(a)	HEX(CONVERT(a USING utf8mb4))
+0385	CE85
+D800DF84	F0908E84
+DBC0DC00	F4808080
+FF9D	EFBE9D
+DROP TABLE IF EXISTS t1;
 select @@collation_connection;
 @@collation_connection
 utf16_bin

=== modified file 'mysql-test/r/ctype_utf32.result'
--- a/mysql-test/r/ctype_utf32.result	2010-08-20 11:14:11 +0000
+++ b/mysql-test/r/ctype_utf32.result	2010-08-31 14:22:03 +0000
@@ -610,6 +610,31 @@ utf32_bin	0000006100000009
 utf32_bin	00000061
 utf32_bin	0000006100000020
 drop table t1;
+#
+# Bug#55980 Character sets: supplementary character _bin ordering is wrong
+#
+CREATE TABLE t1 AS SELECT REPEAT('a',1) AS a LIMIT 0;
+SHOW CREATE TABLE t1;
+Table	Create Table
+t1	CREATE TABLE `t1` (
+  `a` varchar(1) CHARACTER SET utf32 COLLATE utf32_bin NOT NULL DEFAULT ''
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+INSERT INTO t1 VALUES (_utf8mb4 0xEFBE9D),(_utf8mb4 0xF0908E84);
+INSERT INTO t1 VALUES (_utf8mb4 0xCE85),(_utf8mb4 0xF4808080);
+SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
+HEX(a)	HEX(CONVERT(a USING utf8mb4))
+00000385	CE85
+0000FF9D	EFBE9D
+00010384	F0908E84
+00100000	F4808080
+ALTER TABLE t1 ADD KEY(a);
+SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
+HEX(a)	HEX(CONVERT(a USING utf8mb4))
+00000385	CE85
+0000FF9D	EFBE9D
+00010384	F0908E84
+00100000	F4808080
+DROP TABLE IF EXISTS t1;
 select @@collation_connection;
 @@collation_connection
 utf32_bin

=== modified file 'mysql-test/r/ctype_utf8mb4.result'
--- a/mysql-test/r/ctype_utf8mb4.result	2010-06-02 12:23:50 +0000
+++ b/mysql-test/r/ctype_utf8mb4.result	2010-08-31 14:22:03 +0000
@@ -987,6 +987,31 @@ utf8mb4_bin	6109
 utf8mb4_bin	61
 utf8mb4_bin	6120
 drop table t1;
+#
+# Bug#55980 Character sets: supplementary character _bin ordering is wrong
+#
+CREATE TABLE t1 AS SELECT REPEAT('a',1) AS a LIMIT 0;
+SHOW CREATE TABLE t1;
+Table	Create Table
+t1	CREATE TABLE `t1` (
+  `a` varchar(1) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL DEFAULT ''
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+INSERT INTO t1 VALUES (_utf8mb4 0xEFBE9D),(_utf8mb4 0xF0908E84);
+INSERT INTO t1 VALUES (_utf8mb4 0xCE85),(_utf8mb4 0xF4808080);
+SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
+HEX(a)	HEX(CONVERT(a USING utf8mb4))
+CE85	CE85
+EFBE9D	EFBE9D
+F0908E84	F0908E84
+F4808080	F4808080
+ALTER TABLE t1 ADD KEY(a);
+SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
+HEX(a)	HEX(CONVERT(a USING utf8mb4))
+CE85	CE85
+EFBE9D	EFBE9D
+F0908E84	F0908E84
+F4808080	F4808080
+DROP TABLE IF EXISTS t1;
 select @@collation_connection;
 @@collation_connection
 utf8mb4_bin

=== modified file 'mysql-test/t/ctype_utf16.test'
--- a/mysql-test/t/ctype_utf16.test	2010-06-02 12:23:50 +0000
+++ b/mysql-test/t/ctype_utf16.test	2010-08-31 14:22:03 +0000
@@ -326,6 +326,7 @@ SET collation_connection='utf16_general_
 SET NAMES latin1;
 SET collation_connection='utf16_bin';
 -- source include/ctype_filesort.inc
+-- source include/ctype_filesort2.inc
 -- source include/ctype_like_escape.inc
 
 #

=== modified file 'mysql-test/t/ctype_utf32.test'
--- a/mysql-test/t/ctype_utf32.test	2010-08-20 11:14:11 +0000
+++ b/mysql-test/t/ctype_utf32.test	2010-08-31 14:22:03 +0000
@@ -328,6 +328,7 @@ SET collation_connection='utf32_general_
 SET NAMES latin1;
 SET collation_connection='utf32_bin';
 -- source include/ctype_filesort.inc
+-- source include/ctype_filesort2.inc
 -- source include/ctype_like_escape.inc
 
 #

=== modified file 'mysql-test/t/ctype_utf8mb4.test'
--- a/mysql-test/t/ctype_utf8mb4.test	2010-06-02 12:23:50 +0000
+++ b/mysql-test/t/ctype_utf8mb4.test	2010-08-31 14:22:03 +0000
@@ -733,6 +733,7 @@ SET collation_connection='utf8mb4_genera
 -- source include/ctype_german.inc
 SET collation_connection='utf8mb4_bin';
 -- source include/ctype_filesort.inc
+-- source include/ctype_filesort2.inc
 -- source include/ctype_like_escape.inc
 
 #

=== modified file 'strings/ctype-ucs2.c'
--- a/strings/ctype-ucs2.c	2010-07-23 20:09:27 +0000
+++ b/strings/ctype-ucs2.c	2010-08-31 14:22:03 +0000
@@ -1469,7 +1469,7 @@ my_strnncoll_utf16_bin(CHARSET_INFO *cs,
     }
     if (s_wc != t_wc)
     {
-      return  s_wc > t_wc ? 1 : -1;
+      return  my_bincmp(s, s + s_res, t, t + t_res);
     }
     
     s+= s_res;
@@ -1511,7 +1511,7 @@ my_strnncollsp_utf16_bin(CHARSET_INFO *c
 
     if (s_wc != t_wc)
     {
-      return s_wc > t_wc ? 1 : -1;
+      return my_bincmp(s, s + s_res, t, t + t_res);
     }
 
     s+= s_res;
@@ -1684,8 +1684,8 @@ static MY_COLLATION_HANDLER my_collation
   NULL,                /* init */
   my_strnncoll_utf16_bin,
   my_strnncollsp_utf16_bin,
-  my_strnxfrm_unicode,
-  my_strnxfrmlen_simple,
+  my_strnxfrm_unicode_full_bin,
+  my_strnxfrmlen_unicode_full_bin,
   my_like_range_utf16,
   my_wildcmp_utf16_bin,
   my_strcasecmp_mb2_or_mb4,
@@ -2711,8 +2711,8 @@ static MY_COLLATION_HANDLER my_collation
   NULL, /* init */
   my_strnncoll_utf32_bin,
   my_strnncollsp_utf32_bin,
-  my_strnxfrm_unicode,
-  my_strnxfrmlen_utf32,
+  my_strnxfrm_unicode_full_bin,
+  my_strnxfrmlen_unicode_full_bin,
   my_like_range_utf32,
   my_wildcmp_utf32_bin,
   my_strcasecmp_mb2_or_mb4,

=== modified file 'strings/ctype-utf8.c'
--- a/strings/ctype-utf8.c	2010-03-04 11:00:32 +0000
+++ b/strings/ctype-utf8.c	2010-08-31 14:22:03 +0000
@@ -1893,7 +1893,13 @@ my_wildcmp_unicode(CHARSET_INFO *cs,
 
 
 /*
-  This function is shared between utf8mb3/utf8mb4/ucs2/utf16/utf32
+  Store sorting weights using 2 bytes per character.
+
+  This function is shared between
+  - utf8mb3_general_ci, utf8_bin, ucs2_general_ci, ucs2_bin
+    which support BMP only (U+0000..U+FFFF).
+  - utf8mb4_general_ci, utf16_general_ci, utf32_general_ci,
+    which map all supplementary characters to weight 0xFFFD.
 */
 size_t
 my_strnxfrm_unicode(CHARSET_INFO *cs,
@@ -1937,6 +1943,70 @@ my_strnxfrm_unicode(CHARSET_INFO *cs,
 }
 
 
+/*
+  Store sort keys as 3 big-endian bytes per character, space-padded to dstlen;
+  always returns dstlen. Shared between utf8mb4_bin, utf16_bin, utf32_bin.
+*/
+size_t
+my_strnxfrm_unicode_full_bin(CHARSET_INFO *cs,
+                             uchar *dst, size_t dstlen,
+                             const uchar *src, size_t srclen)
+{
+  my_wc_t wc;
+  uchar *de= dst + dstlen;
+  uchar *de_beg= de - 2; /* Loop bound: dst < de_beg means 3 more bytes fit */
+  const uchar *se = src + srclen;
+
+  LINT_INIT(wc);
+  DBUG_ASSERT(src);
+  DBUG_ASSERT(cs->state & MY_CS_BINSORT);
+
+  while (dst < de_beg)
+  {
+    int res;
+    if ((res= cs->cset->mb_wc(cs, &wc, src, se)) <= 0)
+      break; /* mb_wc returned <= 0: stop; remaining room is padded below */
+    src+= res;
+    if (cs->mbminlen == 2) /* utf16_bin */
+    {
+      /*
+        Reorder code points to weights (supplementary sorts before U+E000):
+        U+0000..U+D7FF    -> [00][00][00]..[00][D7][FF] BMP part #1
+        U+10000..U+10FFFF -> [01][00][00]..[10][FF][FF] Supplementary
+        U+E000..U+FFFF    -> [20][E0][00]..[20][FF][FF] BMP part #2
+      */
+      if (wc >= 0xE000 && wc <= 0xFFFF)
+        wc+= 0x200000;
+    }
+    *dst++= (uchar) (wc >> 16);
+    *dst++= (uchar) ((wc >> 8) & 0xFF);
+    *dst++= (uchar) (wc & 0xFF);
+  }
+
+  while (dst < de_beg) /* Pad the tail with the 3-byte weight of U+0020 */
+  {
+    *dst++= 0x00;
+    *dst++= 0x00;
+    *dst++= 0x20;
+  }
+
+  /* Zero the last one or two bytes that cannot hold a full 3-byte weight */
+  if (dst < de)
+  {
+    *dst++= 0x00;
+    if (dst < de)
+      *dst= 0x00;
+  }
+
+  return dstlen; /* The whole destination buffer has been filled */
+}
+
+
+size_t
+my_strnxfrmlen_unicode_full_bin(CHARSET_INFO *cs, size_t len)
+{
+  return ((len + 3) / cs->mbmaxlen) * 3; /* 3 key bytes per char; +3 rounds up only if mbmaxlen is 4 (TODO confirm) */
+}
 #endif /* HAVE_UNIDATA */
 
 
@@ -5067,8 +5137,8 @@ static MY_COLLATION_HANDLER my_collation
     NULL,		/* init */
     my_strnncoll_mb_bin,
     my_strnncollsp_mb_bin,
-    my_strnxfrm_unicode,
-    my_strnxfrmlen_utf8mb4,
+    my_strnxfrm_unicode_full_bin,
+    my_strnxfrmlen_unicode_full_bin,
     my_like_range_mb,
     my_wildcmp_mb_bin,
     my_strcasecmp_mb_bin,


Attachment: [text/bzr-bundle] bzr/alik@sun.com-20100831142203-l6g7roxkxpumsoxn.bundle
Thread
bzr commit into mysql-5.5 branch (alik:3086) Bug#55980 - Alexander Nozdrin, 31 Aug