MySQL Lists are EOL. Please join:

List:Commits« Previous MessageNext Message »
From:bar Date:April 1 2008 3:03pm
Subject:bk commit into 6.0 tree (bar:1.2614) BUG#32914
View as plain text  
Below is the list of changes that have just been committed into a local
6.0 repository of bar.  When bar does a push these changes
will be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet@stripped, 2008-04-01 20:03:44+05:00, bar@stripped +6 -0
  Bug#32914 Character sets: illegal characters in utf8 and utf32 columns
  Problem: it was possible to insert Unicode values higher than U+10FFFF
  into utf32 and utf8 columns.
  Fix:
  - my_mb_wc_utf8mb4() was not strict enough. Adding stricter rules.
  - well_formed_copy_nchars() didn't check whether left ZERO PADDING
    produced an invalid character. Adding an extra check for the leftmost
    (padded) character.

  mysql-test/r/ctype_utf32.result@stripped, 2008-04-01 20:03:41+05:00, bar@stripped +22 -0
    Adding tests

  mysql-test/r/ctype_utf8.result@stripped, 2008-04-01 20:03:41+05:00, bar@stripped +33 -0
    Adding tests

  mysql-test/t/ctype_utf32.test@stripped, 2008-04-01 20:03:41+05:00, bar@stripped +17 -0
    Adding tests

  mysql-test/t/ctype_utf8.test@stripped, 2008-04-01 20:03:41+05:00, bar@stripped +24 -0
    Adding tests

  sql/sql_string.cc@stripped, 2008-04-01 20:03:41+05:00, bar@stripped +18 -0
    Bug#32914 Character sets: illegal characters in utf8 and utf32 columns
    Check if left ZERO PADDING creates an incorrect character.

  strings/ctype-utf8.c@stripped, 2008-04-01 20:03:41+05:00, bar@stripped +25 -4
    Bug#32914 Character sets: illegal characters in utf8 and utf32 columns
    Adding stricter validation of UTF-8 four-byte sequences.

diff -Nrup a/mysql-test/r/ctype_utf32.result b/mysql-test/r/ctype_utf32.result
--- a/mysql-test/r/ctype_utf32.result	2007-12-06 11:42:16 +04:00
+++ b/mysql-test/r/ctype_utf32.result	2008-04-01 20:03:41 +05:00
@@ -912,6 +912,28 @@ select hex(a) from t1;
 hex(a)
 0010FFFF
 drop table t1;
+create table t1 (utf32 varchar(2) character set utf32);
+Wrong character with pad
+insert into t1 values (0x110000);
+Warnings:
+Warning	1366	Incorrect string value: '\x11\x00\x00' for column 'utf32' at row 1
+Wrong chsaracter without pad
+insert into t1 values (0x00110000);
+Warnings:
+Warning	1366	Incorrect string value: '\x00\x11\x00\x00' for column 'utf32' at row 1
+Wrong character with pad followed by another wrong character
+insert into t1 values (0x11000000110000);
+Warnings:
+Warning	1366	Incorrect string value: '\x11\x00\x00\x00\x11\x00...' for column 'utf32' at row 1
+Good character with pad followed by bad character
+insert into t1 values (0x10000000110000);
+Warnings:
+Warning	1366	Incorrect string value: '\x00\x11\x00\x00' for column 'utf32' at row 1
+Good character without pad followed by bad character
+insert into t1 values (0x0010000000110000);
+Warnings:
+Warning	1366	Incorrect string value: '\x00\x11\x00\x00' for column 'utf32' at row 1
+drop table t1;
 create table t1 (a char(10)) character set utf32;
 insert into t1 values ('a   ');
 select hex(a) from t1;
diff -Nrup a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result
--- a/mysql-test/r/ctype_utf8.result	2007-11-15 17:41:32 +04:00
+++ b/mysql-test/r/ctype_utf8.result	2008-04-01 20:03:41 +05:00
@@ -1813,6 +1813,39 @@ select hex(_utf8 B'001111111111');
 ERROR HY000: Invalid utf8 character string: 'FF'
 select (_utf8 X'616263FF');
 ERROR HY000: Invalid utf8 character string: 'FF'
+create table t1 (utf8 char(1) character set utf8);
+Testing [F0][90..BF][80..BF][80..BF]
+insert into t1 values (0xF0908080);
+insert into t1 values (0xF0BFBFBF);
+insert into t1 values (0xF08F8080);
+Warnings:
+Warning	1366	Incorrect string value: '\xF0\x8F\x80\x80' for column 'utf8' at row 1
+select hex(utf8) from t1;
+hex(utf8)
+F0908080
+F0BFBFBF
+
+delete from t1;
+Testing [F2..F3][80..BF][80..BF][80..BF]
+insert into t1 values (0xF2808080);
+insert into t1 values (0xF2BFBFBF);
+select hex(utf8) from t1;
+hex(utf8)
+F2808080
+F2BFBFBF
+delete from t1;
+Testing [F4][80..8F][80..BF][80..BF]
+insert into t1 values (0xF4808080);
+insert into t1 values (0xF48F8080);
+insert into t1 values (0xF4908080);
+Warnings:
+Warning	1366	Incorrect string value: '\xF4\x90\x80\x80' for column 'utf8' at row 1
+select hex(utf8) from t1;
+hex(utf8)
+F4808080
+F48F8080
+
+drop table t1;
 DROP TABLE IF EXISTS t1;
 CREATE TABLE t1 (
 predicted_order int NOT NULL,
diff -Nrup a/mysql-test/t/ctype_utf32.test b/mysql-test/t/ctype_utf32.test
--- a/mysql-test/t/ctype_utf32.test	2007-12-06 11:42:16 +04:00
+++ b/mysql-test/t/ctype_utf32.test	2008-04-01 20:03:41 +05:00
@@ -597,6 +597,22 @@ select hex(a) from t1;
 drop table t1;
 
 #
+# Bug#32914 Character sets: illegal characters in utf8 and utf32 columns
+#
+create table t1 (utf32 varchar(2) character set utf32);
+--echo Wrong character with pad
+insert into t1 values (0x110000);
+--echo Wrong chsaracter without pad
+insert into t1 values (0x00110000);
+--echo Wrong character with pad followed by another wrong character
+insert into t1 values (0x11000000110000);
+--echo Good character with pad followed by bad character
+insert into t1 values (0x10000000110000);
+--echo Good character without pad followed by bad character
+insert into t1 values (0x0010000000110000);
+drop table t1;
+
+#
 # Testing cs->cset->lengthsp()
 #
 create table t1 (a char(10)) character set utf32;
@@ -716,5 +732,6 @@ drop table t1;
 set collation_connection=utf32_general_ci;
 --source include/ctype_regex.inc
 set names latin1;
+
 
 # TODO: add tests for all engines
diff -Nrup a/mysql-test/t/ctype_utf8.test b/mysql-test/t/ctype_utf8.test
--- a/mysql-test/t/ctype_utf8.test	2007-11-15 17:41:39 +04:00
+++ b/mysql-test/t/ctype_utf8.test	2008-04-01 20:03:41 +05:00
@@ -1440,6 +1440,30 @@ select hex(_utf8 B'001111111111');
 select (_utf8 X'616263FF');
 
 #
+# Bug#32914 Character sets: illegal characters in utf8 and utf32 columns
+#
+create table t1 (utf8 char(1) character set utf8);
+--echo Testing [F0][90..BF][80..BF][80..BF]
+insert into t1 values (0xF0908080);
+insert into t1 values (0xF0BFBFBF);
+insert into t1 values (0xF08F8080);
+select hex(utf8) from t1;
+delete from t1;
+
+--echo Testing [F2..F3][80..BF][80..BF][80..BF]
+insert into t1 values (0xF2808080);
+insert into t1 values (0xF2BFBFBF);
+select hex(utf8) from t1;
+delete from t1;
+
+--echo Testing [F4][80..8F][80..BF][80..BF]
+insert into t1 values (0xF4808080);
+insert into t1 values (0xF48F8080);
+insert into t1 values (0xF4908080);
+select hex(utf8) from t1;
+drop table t1;
+
+#
 # Bug#26474: Add Sinhala script (Sri Lanka) collation to MySQL
 #
 --disable_warnings
diff -Nrup a/sql/sql_string.cc b/sql/sql_string.cc
--- a/sql/sql_string.cc	2008-03-07 21:25:00 +04:00
+++ b/sql/sql_string.cc	2008-04-01 20:03:41 +05:00
@@ -975,6 +975,24 @@ well_formed_copy_nchars(CHARSET_INFO *to
         uint pad_length= to_cs->mbminlen - from_offset;
         bzero(to, pad_length);
         memmove(to + pad_length, from, from_offset);
+        /*
+          In some cases left zero-padding can create an incorrect character.
+          For example:
+            INSERT INTO t1 (utf32_column) VALUES (0x110000);
+          We'll pad the value to 0x00110000, which is a wrong UTF32 sequence!
+          The valid characters range is limited to 0x00000000..0x0010FFFF.
+          
+          Make sure we didn't pad to an incorrect character.
+        */
+        if (to_cs->cset->well_formed_len(to_cs,
+                                         to, to + to_cs->mbminlen, 1,
+                                         &well_formed_error) !=
+                                         to_cs->mbminlen)
+        {
+          *from_end_pos= *well_formed_error_pos= from;
+          *cannot_convert_error_pos= NULL;
+          return 0;
+        }
         nchars--;
         from+= from_offset;
         from_length-= from_offset;
diff -Nrup a/strings/ctype-utf8.c b/strings/ctype-utf8.c
--- a/strings/ctype-utf8.c	2007-10-22 16:43:29 +05:00
+++ b/strings/ctype-utf8.c	2008-04-01 20:03:41 +05:00
@@ -2022,15 +2022,35 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attr
            (my_wc_t) (s[2] ^ 0x80);
     return 3;
   }
-  else if (c < 0xf8)
+  else if (c < 0xf5)
   {
     if (s + 4 > e) /* We need 4 characters */
       return MY_CS_TOOSMALL4;
 
+    /*
+      UTF-8 quick four-byte mask:
+      11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+      This form can encode U+00010000..U+001FFFFF
+      
+      The maximum character defined in the Unicode standard is U+0010FFFF.
+      Higher characters U+00110000..U+001FFFFF are not used.
+      
+      11110000.10010000.10000000.10000000 == F0.90.80.80 == U+00010000 (min)
+      11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max)
+      
+      Valid codes:
+      [F0][90..BF][80..BF][80..BF]
+      [F1][80..BF][80..BF][80..BF]
+      [F2][80..BF][80..BF][80..BF]
+      [F3][80..BF][80..BF][80..BF]
+      [F4][80..8F][80..BF][80..BF]
+    */
+
     if (!((s[1] ^ 0x80) < 0x40 &&
           (s[2] ^ 0x80) < 0x40 &&
           (s[3] ^ 0x80) < 0x40 &&
-          (c >= 0xf1 || s[1] >= 0x90)))
+          (c >= 0xf1 || s[1] >= 0x90) &&
+          (c <= 0xf3 || s[1] <= 0x8F)))
       return MY_CS_ILSEQ;
     *pwc = ((my_wc_t) (c & 0x07) << 18)    |
            ((my_wc_t) (s[1] ^ 0x80) << 12) |
@@ -2083,12 +2103,13 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *
 
     return 3;
   }
-  else if (c < 0xf8)
+  else if (c < 0xf5)
   {
     if (!((s[1] ^ 0x80) < 0x40 &&
           (s[2] ^ 0x80) < 0x40 &&
           (s[3] ^ 0x80) < 0x40 &&
-          (c >= 0xf1 || s[1] >= 0x90)))
+          (c >= 0xf1 || s[1] >= 0x90) &&
+          (c <= 0xf3 || s[1] <= 0x8F)))
       return MY_CS_ILSEQ;
     *pwc = ((my_wc_t) (c & 0x07) << 18)    |
            ((my_wc_t) (s[1] ^ 0x80) << 12) |
Thread
bk commit into 6.0 tree (bar:1.2614) BUG#32914bar1 Apr