From: bar Date: April 1 2008 3:03pm Subject: bk commit into 6.0 tree (bar:1.2614) BUG#32914 List-Archive: http://lists.mysql.com/commits/44738 X-Bug: 32914 Message-Id: <200804011503.m31F3ns6018888@bar.myoffice.izhnet.ru> Below is the list of changes that have just been committed into a local 6.0 repository of bar. When bar does a push these changes will be propagated to the main repository and, within 24 hours after the push, to the public repository. For information on how to access the public repository see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html ChangeSet@stripped, 2008-04-01 20:03:44+05:00, bar@stripped +6 -0 Bug#32914 Character sets: illegal characters in utf8 and utf32 columns Problem: it was possible to insert Unicode values higher than U+10FFFF (the highest character defined by the Unicode standard) into utf32 and utf8 columns. Fix: - my_mb_wc_utf8mb4() was not strict enough. Adding stricter rules: F5..F7 are no longer accepted as four-byte lead bytes, and after an F4 lead byte the second byte must not exceed 8F. - well_formed_copy_nchars() did not check whether left zero-padding produced an invalid character. Adding an extra check for the leftmost (padded) character. mysql-test/r/ctype_utf32.result@stripped, 2008-04-01 20:03:41+05:00, bar@stripped +22 -0 Adding tests mysql-test/r/ctype_utf8.result@stripped, 2008-04-01 20:03:41+05:00, bar@stripped +33 -0 Adding tests mysql-test/t/ctype_utf32.test@stripped, 2008-04-01 20:03:41+05:00, bar@stripped +17 -0 Adding tests mysql-test/t/ctype_utf8.test@stripped, 2008-04-01 20:03:41+05:00, bar@stripped +24 -0 Adding tests sql/sql_string.cc@stripped, 2008-04-01 20:03:41+05:00, bar@stripped +18 -0 Bug#32914 Character sets: illegal characters in utf8 and utf32 columns Check whether left zero-padding creates an incorrect character. strings/ctype-utf8.c@stripped, 2008-04-01 20:03:41+05:00, bar@stripped +25 -4 Bug#32914 Character sets: illegal characters in utf8 and utf32 columns Adding stricter validation of UTF-8 four-byte sequences. 
diff -Nrup a/mysql-test/r/ctype_utf32.result b/mysql-test/r/ctype_utf32.result --- a/mysql-test/r/ctype_utf32.result 2007-12-06 11:42:16 +04:00 +++ b/mysql-test/r/ctype_utf32.result 2008-04-01 20:03:41 +05:00 @@ -912,6 +912,28 @@ select hex(a) from t1; hex(a) 0010FFFF drop table t1; +create table t1 (utf32 varchar(2) character set utf32); +Wrong character with pad +insert into t1 values (0x110000); +Warnings: +Warning 1366 Incorrect string value: '\x11\x00\x00' for column 'utf32' at row 1 +Wrong chsaracter without pad +insert into t1 values (0x00110000); +Warnings: +Warning 1366 Incorrect string value: '\x00\x11\x00\x00' for column 'utf32' at row 1 +Wrong character with pad followed by another wrong character +insert into t1 values (0x11000000110000); +Warnings: +Warning 1366 Incorrect string value: '\x11\x00\x00\x00\x11\x00...' for column 'utf32' at row 1 +Good character with pad followed by bad character +insert into t1 values (0x10000000110000); +Warnings: +Warning 1366 Incorrect string value: '\x00\x11\x00\x00' for column 'utf32' at row 1 +Good character without pad followed by bad character +insert into t1 values (0x0010000000110000); +Warnings: +Warning 1366 Incorrect string value: '\x00\x11\x00\x00' for column 'utf32' at row 1 +drop table t1; create table t1 (a char(10)) character set utf32; insert into t1 values ('a '); select hex(a) from t1; diff -Nrup a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result --- a/mysql-test/r/ctype_utf8.result 2007-11-15 17:41:32 +04:00 +++ b/mysql-test/r/ctype_utf8.result 2008-04-01 20:03:41 +05:00 @@ -1813,6 +1813,39 @@ select hex(_utf8 B'001111111111'); ERROR HY000: Invalid utf8 character string: 'FF' select (_utf8 X'616263FF'); ERROR HY000: Invalid utf8 character string: 'FF' +create table t1 (utf8 char(1) character set utf8); +Testing [F0][90..BF][80..BF][80..BF] +insert into t1 values (0xF0908080); +insert into t1 values (0xF0BFBFBF); +insert into t1 values (0xF08F8080); +Warnings: +Warning 1366 Incorrect 
string value: '\xF0\x8F\x80\x80' for column 'utf8' at row 1 +select hex(utf8) from t1; +hex(utf8) +F0908080 +F0BFBFBF + +delete from t1; +Testing [F2..F3][80..BF][80..BF][80..BF] +insert into t1 values (0xF2808080); +insert into t1 values (0xF2BFBFBF); +select hex(utf8) from t1; +hex(utf8) +F2808080 +F2BFBFBF +delete from t1; +Testing [F4][80..8F][80..BF][80..BF] +insert into t1 values (0xF4808080); +insert into t1 values (0xF48F8080); +insert into t1 values (0xF4908080); +Warnings: +Warning 1366 Incorrect string value: '\xF4\x90\x80\x80' for column 'utf8' at row 1 +select hex(utf8) from t1; +hex(utf8) +F4808080 +F48F8080 + +drop table t1; DROP TABLE IF EXISTS t1; CREATE TABLE t1 ( predicted_order int NOT NULL, diff -Nrup a/mysql-test/t/ctype_utf32.test b/mysql-test/t/ctype_utf32.test --- a/mysql-test/t/ctype_utf32.test 2007-12-06 11:42:16 +04:00 +++ b/mysql-test/t/ctype_utf32.test 2008-04-01 20:03:41 +05:00 @@ -597,6 +597,22 @@ select hex(a) from t1; drop table t1; # +# Bug#32914 Character sets: illegal characters in utf8 and utf32 columns +# +create table t1 (utf32 varchar(2) character set utf32); +--echo Wrong character with pad +insert into t1 values (0x110000); +--echo Wrong chsaracter without pad +insert into t1 values (0x00110000); +--echo Wrong character with pad followed by another wrong character +insert into t1 values (0x11000000110000); +--echo Good character with pad followed by bad character +insert into t1 values (0x10000000110000); +--echo Good character without pad followed by bad character +insert into t1 values (0x0010000000110000); +drop table t1; + +# # Testing cs->cset->lengthsp() # create table t1 (a char(10)) character set utf32; @@ -716,5 +732,6 @@ drop table t1; set collation_connection=utf32_general_ci; --source include/ctype_regex.inc set names latin1; + # TODO: add tests for all engines diff -Nrup a/mysql-test/t/ctype_utf8.test b/mysql-test/t/ctype_utf8.test --- a/mysql-test/t/ctype_utf8.test 2007-11-15 17:41:39 +04:00 +++ 
b/mysql-test/t/ctype_utf8.test 2008-04-01 20:03:41 +05:00 @@ -1440,6 +1440,30 @@ select hex(_utf8 B'001111111111'); select (_utf8 X'616263FF'); # +# Bug#32914 Character sets: illegal characters in utf8 and utf32 columns +# +create table t1 (utf8 char(1) character set utf8); +--echo Testing [F0][90..BF][80..BF][80..BF] +insert into t1 values (0xF0908080); +insert into t1 values (0xF0BFBFBF); +insert into t1 values (0xF08F8080); +select hex(utf8) from t1; +delete from t1; + +--echo Testing [F2..F3][80..BF][80..BF][80..BF] +insert into t1 values (0xF2808080); +insert into t1 values (0xF2BFBFBF); +select hex(utf8) from t1; +delete from t1; + +--echo Testing [F4][80..8F][80..BF][80..BF] +insert into t1 values (0xF4808080); +insert into t1 values (0xF48F8080); +insert into t1 values (0xF4908080); +select hex(utf8) from t1; +drop table t1; + +# # Bug#26474: Add Sinhala script (Sri Lanka) collation to MySQL # --disable_warnings diff -Nrup a/sql/sql_string.cc b/sql/sql_string.cc --- a/sql/sql_string.cc 2008-03-07 21:25:00 +04:00 +++ b/sql/sql_string.cc 2008-04-01 20:03:41 +05:00 @@ -975,6 +975,24 @@ well_formed_copy_nchars(CHARSET_INFO *to uint pad_length= to_cs->mbminlen - from_offset; bzero(to, pad_length); memmove(to + pad_length, from, from_offset); + /* + In some cases left zero-padding can create an incorrect character. + For example: + INSERT INTO t1 (utf32_column) VALUES (0x110000); + We'll pad the value to 0x00110000, which is a wrong UTF32 sequence! + The valid characters range is limited to 0x00000000..0x0010FFFF. + + Make sure we didn't pad to an incorrect character. 
+ */ + if (to_cs->cset->well_formed_len(to_cs, + to, to + to_cs->mbminlen, 1, + &well_formed_error) != + to_cs->mbminlen) + { + *from_end_pos= *well_formed_error_pos= from; + *cannot_convert_error_pos= NULL; + return 0; + } nchars--; from+= from_offset; from_length-= from_offset; diff -Nrup a/strings/ctype-utf8.c b/strings/ctype-utf8.c --- a/strings/ctype-utf8.c 2007-10-22 16:43:29 +05:00 +++ b/strings/ctype-utf8.c 2008-04-01 20:03:41 +05:00 @@ -2022,15 +2022,35 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attr (my_wc_t) (s[2] ^ 0x80); return 3; } - else if (c < 0xf8) + else if (c < 0xf5) { if (s + 4 > e) /* We need 4 characters */ return MY_CS_TOOSMALL4; + /* + UTF-8 quick four-byte mask: + 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + Encoding allows to encode U+00010000..U+001FFFFF + + The maximum character defined in the Unicode standard is U+0010FFFF. + Higher characters U+00110000..U+001FFFFF are not used. + + 11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min) + 11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max) + + Valid codes: + [F0][90..BF][80..BF][80..BF] + [F1][80..BF][80..BF][80..BF] + [F2][80..BF][80..BF][80..BF] + [F3][80..BF][80..BF][80..BF] + [F4][80..8F][80..BF][80..BF] + */ + if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (s[3] ^ 0x80) < 0x40 && - (c >= 0xf1 || s[1] >= 0x90))) + (c >= 0xf1 || s[1] >= 0x90) && + (c <= 0xf3 || s[1] <= 0x8F))) return MY_CS_ILSEQ; *pwc = ((my_wc_t) (c & 0x07) << 18) | ((my_wc_t) (s[1] ^ 0x80) << 12) | @@ -2083,12 +2103,13 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO * return 3; } - else if (c < 0xf8) + else if (c < 0xf5) { if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (s[3] ^ 0x80) < 0x40 && - (c >= 0xf1 || s[1] >= 0x90))) + (c >= 0xf1 || s[1] >= 0x90) && + (c <= 0xf3 || s[1] <= 0x8F))) return MY_CS_ILSEQ; *pwc = ((my_wc_t) (c & 0x07) << 18) | ((my_wc_t) (s[1] ^ 0x80) << 12) |