From: Date: December 12 2005 6:42pm Subject: bk commit into 4.1 tree (bar:1.2488) BUG#15375 List-Archive: http://lists.mysql.com/commits/76 X-Bug: 15375 Message-Id: <200512121742.jBCHgGb6003963@bar.intranet.mysql.r18.ru> Below is the list of changes that have just been committed into a local 4.1 repository of bar. When bar does a push these changes will be propagated to the main repository and, within 24 hours after the push, to the public repository. For information on how to access the public repository see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html ChangeSet 1.2488 05/12/12 21:42:09 bar@stripped +21 -0 Bug#15375 Unassigned multibyte codes are broken into parts when converting to Unicode. m_ctype.h: Reorganizing mb_wc return codes to be able to return "an unassigned N-byte-long character". sql_string.cc: Adding code to detect and properly handle unassigned characters (i.e. those characters which are correctly formed according to the character specifications, but don't have a Unicode mapping). Many files: Fixing conversion function to return new codes. ctype_ujis.test, ctype_gbk.test, ctype_big5.test: Adding a test case. ctype_ujis.result, ctype_gbk.result, ctype_big5.result: Fixing results accordingly. strings/ctype-utf8.c 1.89 05/12/12 21:39:22 bar@stripped +7 -7 Fixing conversion function to return new codes. strings/ctype-ujis.c 1.65 05/12/12 21:39:20 bar@stripped +13 -13 Fixing conversion function to return new codes. strings/ctype-ucs2.c 1.46 05/12/12 21:39:18 bar@stripped +2 -2 Fixing conversion function to return new codes. strings/ctype-tis620.c 1.84 05/12/12 21:39:16 bar@stripped +2 -2 Fixing conversion function to return new codes. strings/ctype-sjis.c 1.82 05/12/12 21:39:14 bar@stripped +4 -4 Fixing conversion function to return new codes. strings/ctype-simple.c 1.68 05/12/12 21:39:13 bar@stripped +2 -2 Fixing conversion function to return new codes. 
strings/ctype-latin1.c 1.44 05/12/12 21:39:11 bar@stripped +2 -2 Fixing conversion function to return new codes. strings/ctype-gbk.c 1.73 05/12/12 21:39:09 bar@stripped +4 -4 Fixing conversion function to return new codes. strings/ctype-gb2312.c 1.58 05/12/12 21:39:07 bar@stripped +4 -4 Fixing conversion function to return new codes. strings/ctype-euc_kr.c 1.61 05/12/12 21:39:06 bar@stripped +4 -4 Fixing conversion function to return new codes. strings/ctype-cp932.c 1.8 05/12/12 21:39:04 bar@stripped +3 -3 Fixing conversion function to return new codes. strings/ctype-bin.c 1.59 05/12/12 21:38:59 bar@stripped +1 -1 Fixing conversion function to return new codes. strings/ctype-big5.c 1.80 05/12/12 21:38:37 bar@stripped +3 -3 Fixing conversion function to return new codes. sql/sql_string.cc 1.94 05/12/12 21:37:01 bar@stripped +11 -1 Adding code to detect and properly handle unassigned characters (i.e. those characters which are correctly formed according to the character specifications, but don't have a Unicode mapping). mysql-test/t/ctype_ujis.test 1.16 05/12/12 21:36:56 bar@stripped +15 -0 Adding a test case. mysql-test/t/ctype_gbk.test 1.4 05/12/12 21:36:52 bar@stripped +10 -0 Adding a test case. mysql-test/t/ctype_big5.test 1.11 05/12/12 21:36:47 bar@stripped +10 -0 Adding a test case. mysql-test/r/ctype_ujis.result 1.13 05/12/12 21:36:44 bar@stripped +6 -0 Fixing results accordingly. mysql-test/r/ctype_gbk.result 1.3 05/12/12 21:36:39 bar@stripped +3 -0 Fixing results accordingly. mysql-test/r/ctype_big5.result 1.11 05/12/12 21:36:21 bar@stripped +3 -0 Fixing results accordingly. include/m_ctype.h 1.106 05/12/12 21:32:42 bar@stripped +13 -4 Reorganizing mb_wc return codes to be able to return "an unassigned N-byte long character". Bug#15375 Unassigned multibyte codes are broken into parts when converting to Unicode. # This is a BitKeeper patch. What follows are the unified diffs for the # set of deltas contained in the patch. 
The rest of the patch, the part # that BitKeeper cares about, is below these diffs. # User: bar # Host: bar.intranet.mysql.r18.ru # Root: /usr/home/bar/mysql-4.1.b15376 --- 1.105/include/m_ctype.h 2005-08-17 13:26:26 +05:00 +++ 1.106/include/m_ctype.h 2005-12-12 21:32:42 +04:00 @@ -44,10 +44,19 @@ uint16 sort; } MY_UNICASE_INFO; -#define MY_CS_ILSEQ 0 -#define MY_CS_ILUNI 0 -#define MY_CS_TOOSMALL -1 -#define MY_CS_TOOFEW(n) (-1-(n)) + +/* wm_wc and wc_mb return codes */ +#define MY_CS_ILSEQ 0 /* Wrong by sequence: wb_wc */ +#define MY_CS_ILUNI 0 /* Cannot encode Unicode to charset: wc_mb */ +#define MY_CS_TOOSMALL -101 /* Need at least one byte: wc_mb and mb_wc */ +#define MY_CS_TOOSMALL2 -102 /* Need at least two bytes: wc_mb and mb_wc */ +#define MY_CS_TOOSMALL3 -103 /* Need at least three bytes: wc_mb and mb_wc */ +/* These following three are currently not really used */ +#define MY_CS_TOOSMALL4 -104 /* Need at least 4 bytes: wc_mb and mb_wc */ +#define MY_CS_TOOSMALL5 -105 /* Need at least 5 bytes: wc_mb and mb_wc */ +#define MY_CS_TOOSMALL6 -106 /* Need at least 6 bytes: wc_mb and mb_wc */ +/* A helper macros for "need at least n bytes" */ +#define MY_CS_TOOSMALLN(n) (-100-(n)) #define MY_SEQ_INTTAIL 1 #define MY_SEQ_SPACES 2 --- 1.93/sql/sql_string.cc 2005-06-05 22:38:42 +05:00 +++ 1.94/sql/sql_string.cc 2005-12-12 21:37:01 +04:00 @@ -806,8 +806,18 @@ from++; wc= '?'; } + else if (cnvres > MY_CS_TOOSMALL) + { + /* + A correct multibyte sequence detected + But it doesn't have Unicode mapping. + */ + error_count++; + from+= (-cnvres); + wc= '?'; + } else - break; // Impossible char. 
+ break; // Not enough characters outp: if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0) --- 1.79/strings/ctype-big5.c 2005-10-05 19:19:19 +05:00 +++ 1.80/strings/ctype-big5.c 2005-12-12 21:38:37 +04:00 @@ -6259,7 +6259,7 @@ int hi=s[0]; if (s >= e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL; if (hi<0x80) { @@ -6268,10 +6268,10 @@ } if (s+2>e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL2; if (!(pwc[0]=func_big5_uni_onechar((hi<<8)+s[1]))) - return MY_CS_ILSEQ; + return -2; return 2; } --- 1.60/strings/ctype-euc_kr.c 2005-08-17 13:26:28 +05:00 +++ 1.61/strings/ctype-euc_kr.c 2005-12-12 21:39:06 +04:00 @@ -8601,7 +8601,7 @@ return MY_CS_ILUNI; if (s+2>e) - return MY_CS_TOOSMALL; + return MY_CS_TOOSMALL2; s[0]=code>>8; s[1]=code&0xFF; @@ -8617,7 +8617,7 @@ int hi=s[0]; if (s >= e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL; if (hi<0x80) { @@ -8626,10 +8626,10 @@ } if (s+2>e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL2; if (!(pwc[0]=func_ksc5601_uni_onechar((hi<<8)+s[1]))) - return MY_CS_ILSEQ; + return -2; return 2; } --- 1.57/strings/ctype-gb2312.c 2005-08-17 13:26:28 +05:00 +++ 1.58/strings/ctype-gb2312.c 2005-12-12 21:39:07 +04:00 @@ -5651,7 +5651,7 @@ return MY_CS_ILUNI; if (s+2>e) - return MY_CS_TOOSMALL; + return MY_CS_TOOSMALL2; code|=0x8080; s[0]=code>>8; @@ -5668,7 +5668,7 @@ hi=(int) s[0]; if (s >= e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL; if (hi<0x80) { @@ -5677,10 +5677,10 @@ } if (s+2>e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL2; if (!(pwc[0]=func_gb2312_uni_onechar(((hi<<8)+s[1])&0x7F7F))) - return MY_CS_ILSEQ; + return -2; return 2; } --- 1.72/strings/ctype-gbk.c 2005-09-21 22:12:12 +05:00 +++ 1.73/strings/ctype-gbk.c 2005-12-12 21:39:09 +04:00 @@ -9889,7 +9889,7 @@ return MY_CS_ILUNI; if (s+2>e) - return MY_CS_TOOSMALL; + return MY_CS_TOOSMALL2; s[0]=code>>8; s[1]=code&0xFF; @@ -9903,7 +9903,7 @@ int hi; if (s >= e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL; hi=s[0]; @@ -9914,10 
+9914,10 @@ } if (s+2>e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL2; if (!(pwc[0]=func_gbk_uni_onechar( (hi<<8) + s[1]))) - return MY_CS_ILSEQ; + return -2; return 2; --- 1.81/strings/ctype-sjis.c 2005-09-21 22:12:18 +05:00 +++ 1.82/strings/ctype-sjis.c 2005-12-12 21:39:14 +04:00 @@ -4501,7 +4501,7 @@ mb: if (s+2>e) - return MY_CS_TOOSMALL; + return MY_CS_TOOSMALL2; s[0]=code>>8; s[1]=code&0xFF; @@ -4515,7 +4515,7 @@ int hi=s[0]; if (s >= e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL; if (hi < 0x80) { @@ -4530,10 +4530,10 @@ } if (s+2>e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL2; if (!(pwc[0]=func_sjis_uni_onechar((hi<<8)+s[1]))) - return MY_CS_ILSEQ; + return -2; return 2; } --- 1.83/strings/ctype-tis620.c 2005-10-13 13:25:00 +05:00 +++ 1.84/strings/ctype-tis620.c 2005-12-12 21:39:16 +04:00 @@ -820,10 +820,10 @@ const unsigned char *end __attribute__((unused))) { if (str >= end) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL; *wc=cs_to_uni[*str]; - return (!wc[0] && str[0]) ? MY_CS_ILSEQ : 1; + return (!wc[0] && str[0]) ? -1 : 1; } static --- 1.64/strings/ctype-ujis.c 2005-08-17 13:26:29 +05:00 +++ 1.65/strings/ctype-ujis.c 2005-12-12 21:39:20 +04:00 @@ -242,7 +242,7 @@ const uchar *e __attribute__((unused))) { wc[0]=tab_jisx0201_uni[*s]; - return (!wc[0] && s[0]) ? MY_CS_ILSEQ : 1; + return (!wc[0] && s[0]) ? 
-1 : 1; } @@ -8341,7 +8341,7 @@ int c1,c2,c3; if (s >= e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL; c1=s[0]; @@ -8353,7 +8353,7 @@ } if (s+2>e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL2; c2=s[1]; @@ -8368,7 +8368,7 @@ { pwc[0]=my_jisx0208_uni_onechar( ((c1-0x80) << 8) + (c2-0x80)); if (!pwc[0]) - return MY_CS_ILSEQ; + return -2; } else { @@ -8388,7 +8388,7 @@ ret = my_mb_wc_jisx0201(cs,pwc,s+1,e); if (ret!=1) - return ret; + return -2; return 2; } @@ -8399,7 +8399,7 @@ return MY_CS_ILSEQ; if (s+3>e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL3; c3=s[2]; if (c3 < 0xA1 || c3>=0xFF) @@ -8408,8 +8408,8 @@ if (c2<0xF5) { pwc[0]=my_jisx0212_uni_onechar((c2-0x80)*256 + (c3-0x80)); - if (!pwc) - return MY_CS_ILSEQ; + if (!pwc[0]) + return -3; } else { @@ -8440,7 +8440,7 @@ if ((jp=my_uni_jisx0208_onechar(wc))) { if (s+2>e) - return MY_CS_TOOSMALL; + return MY_CS_TOOSMALL2; jp+=0x8080; s[0]=jp>>8; @@ -8452,7 +8452,7 @@ if (my_wc_mb_jisx0201(c,wc,s,e) == 1) { if (s+2>e) - return MY_CS_TOOSMALL; + return MY_CS_TOOSMALL2; s[1]= s[0]; s[0]= 0x8E; return 2; @@ -8462,7 +8462,7 @@ if ((jp=my_uni_jisx0212_onechar(wc))) { if (s+3>e) - return MY_CS_TOOSMALL; + return MY_CS_TOOSMALL3; jp+=0x8080; s[0]=0x8F; @@ -8476,7 +8476,7 @@ if (wc>=0xE000 && wc<0xE3AC) { if (s+2>e) - return MY_CS_TOOSMALL; + return MY_CS_TOOSMALL2; c1=((unsigned)(wc-0xE000)/94)+0xF5; s[0]=c1; @@ -8490,7 +8490,7 @@ if (wc>=0xE3AC && wc<0xE758) { if (s+3>e) - return MY_CS_TOOSMALL; + return MY_CS_TOOSMALL3; s[0]=0x8F; c1=((unsigned)(wc-0xE3AC)/94)+0xF5; --- 1.10/mysql-test/r/ctype_big5.result 2005-10-05 19:20:19 +05:00 +++ 1.11/mysql-test/r/ctype_big5.result 2005-12-12 21:36:21 +04:00 @@ -189,3 +189,6 @@ hex(a) E5ABBA drop table t1; +select hex(convert(_big5 0xC84041 using ucs2)); +hex(convert(_big5 0xC84041 using ucs2)) +003F0041 --- 1.10/mysql-test/t/ctype_big5.test 2005-10-05 19:20:08 +05:00 +++ 1.11/mysql-test/t/ctype_big5.test 2005-12-12 21:36:47 +04:00 @@ -53,4 +53,14 @@ select hex(a) 
from t1 where a = _big5 0xF9DC; drop table t1; +# +# Bugs#15375: Unassigned multibyte codes are broken +# into parts when converting to Unicode. +# This query should return 0x003F0041. I.e. it should +# scan unassigned double-byte character 0xC840, convert +# it as QUESTION MARK 0x003F and then scan the next +# character, which is a single byte character 0x41. +# +select hex(convert(_big5 0xC84041 using ucs2)); + # End of 4.1 tests --- 1.58/strings/ctype-bin.c 2005-08-17 13:26:27 +05:00 +++ 1.59/strings/ctype-bin.c 2005-12-12 21:38:59 +04:00 @@ -220,7 +220,7 @@ const unsigned char *end __attribute__((unused))) { if (str >= end) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL; *wc=str[0]; return 1; --- 1.43/strings/ctype-latin1.c 2005-09-16 14:19:45 +05:00 +++ 1.44/strings/ctype-latin1.c 2005-12-12 21:39:11 +04:00 @@ -363,10 +363,10 @@ const unsigned char *end __attribute__((unused))) { if (str >= end) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL; *wc=cs_to_uni[*str]; - return (!wc[0] && str[0]) ? MY_CS_ILSEQ : 1; + return (!wc[0] && str[0]) ? -1 : 1; } static --- 1.45/strings/ctype-ucs2.c 2005-10-18 20:03:21 +05:00 +++ 1.46/strings/ctype-ucs2.c 2005-12-12 21:39:18 +04:00 @@ -95,7 +95,7 @@ my_wc_t * pwc, const uchar *s, const uchar *e) { if (s+2 > e) /* Need 2 characters */ - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL2; *pwc= ((unsigned char)s[0]) * 256 + ((unsigned char)s[1]); return 2; @@ -105,7 +105,7 @@ my_wc_t wc, uchar *r, uchar *e) { if ( r+2 > e ) - return MY_CS_TOOSMALL; + return MY_CS_TOOSMALL2; r[0]= (uchar) (wc >> 8); r[1]= (uchar) (wc & 0xFF); --- 1.67/strings/ctype-simple.c 2005-10-18 20:03:20 +05:00 +++ 1.68/strings/ctype-simple.c 2005-12-12 21:39:13 +04:00 @@ -207,10 +207,10 @@ const unsigned char *end __attribute__((unused))) { if (str >= end) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL; *wc=cs->tab_to_uni[*str]; - return (!wc[0] && str[0]) ? MY_CS_ILSEQ : 1; + return (!wc[0] && str[0]) ? 
-1 : 1; } int my_wc_mb_8bit(CHARSET_INFO *cs,my_wc_t wc, --- 1.88/strings/ctype-utf8.c 2005-08-17 13:26:29 +05:00 +++ 1.89/strings/ctype-utf8.c 2005-12-12 21:39:22 +04:00 @@ -1765,7 +1765,7 @@ unsigned char c; if (s >= e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL; c= s[0]; if (c < 0x80) @@ -1778,7 +1778,7 @@ else if (c < 0xe0) { if (s+2 > e) /* We need 2 characters */ - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL2; if (!((s[1] ^ 0x80) < 0x40)) return MY_CS_ILSEQ; @@ -1789,7 +1789,7 @@ else if (c < 0xf0) { if (s+3 > e) /* We need 3 characters */ - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL3; if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (c >= 0xe1 || s[1] >= 0xa0))) return MY_CS_ILSEQ; @@ -1804,7 +1804,7 @@ else if (c < 0xf8 && sizeof(my_wc_t)*8 >= 32) { if (s+4 > e) /* We need 4 characters */ - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL4; if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && @@ -1822,7 +1822,7 @@ else if (c < 0xfc && sizeof(my_wc_t)*8 >= 32) { if (s+5 >e) /* We need 5 characters */ - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL5; if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && @@ -1841,7 +1841,7 @@ else if (c < 0xfe && sizeof(my_wc_t)*8 >= 32) { if ( s+6 >e ) /* We need 6 characters */ - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL6; if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && @@ -1892,7 +1892,7 @@ Because of it (r+count > e), not (r+count-1 >e ) */ if ( r+count > e ) - return MY_CS_TOOSMALL; + return MY_CS_TOOSMALLN(count); switch (count) { /* Fall through all cases!!! 
*/ --- 1.2/mysql-test/r/ctype_gbk.result 2005-09-21 22:17:25 +05:00 +++ 1.3/mysql-test/r/ctype_gbk.result 2005-12-12 21:36:39 +04:00 @@ -165,3 +165,6 @@ A1A1 A3A0 DROP TABLE t1; +select hex(convert(_gbk 0xA14041 using ucs2)); +hex(convert(_gbk 0xA14041 using ucs2)) +003F0041 --- 1.3/mysql-test/t/ctype_gbk.test 2005-09-21 22:13:32 +05:00 +++ 1.4/mysql-test/t/ctype_gbk.test 2005-12-12 21:36:52 +04:00 @@ -31,4 +31,14 @@ SELECT hex(a) FROM t1 ORDER BY a; DROP TABLE t1; +# +# Bugs#15375: Unassigned multibyte codes are broken +# into parts when converting to Unicode. +# This query should return 0x003F0041. I.e. it should +# scan unassigned double-byte character 0xA140, convert +# it as QUESTION MARK 0x003F and then scan the next +# character, which is a single byte character 0x41. +# +select hex(convert(_gbk 0xA14041 using ucs2)); + # End of 4.1 tests --- 1.12/mysql-test/r/ctype_ujis.result 2005-09-21 22:17:30 +05:00 +++ 1.13/mysql-test/r/ctype_ujis.result 2005-12-12 21:36:44 +04:00 @@ -2307,3 +2307,9 @@ c2h ab_def drop table t1; +select hex(convert(_ujis 0xA5FE41 using ucs2)); +hex(convert(_ujis 0xA5FE41 using ucs2)) +003F0041 +select hex(convert(_ujis 0x8FABF841 using ucs2)); +hex(convert(_ujis 0x8FABF841 using ucs2)) +003F0041 --- 1.15/mysql-test/t/ctype_ujis.test 2005-09-21 22:13:37 +05:00 +++ 1.16/mysql-test/t/ctype_ujis.test 2005-12-12 21:36:56 +04:00 @@ -1152,4 +1152,19 @@ -- source include/ctype_innodb_like.inc -- source include/ctype_like_escape.inc +# +# Bugs#15375: Unassigned multibyte codes are broken +# into parts when converting to Unicode. +# This query should return 0x003F0041. I.e. it should +# scan unassigned double-byte character 0xA5FE, convert +# it as QUESTION MARK 0x003F and then scan the next +# character, which is a single byte character 0x41. 
+# +select hex(convert(_ujis 0xA5FE41 using ucs2)); +# This one should return 0x003F0041: +# scan unassigned three-byte character 0x8FABF8, +# convert it as QUESTION MARK 0x003F and then scan +# the next character, which is a single byte character 0x41. +select hex(convert(_ujis 0x8FABF841 using ucs2)); + # End of 4.1 tests --- 1.7/strings/ctype-cp932.c 2005-09-21 22:11:48 +05:00 +++ 1.8/strings/ctype-cp932.c 2005-12-12 21:39:04 +04:00 @@ -5355,7 +5355,7 @@ int hi=s[0]; if (s >= e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL; if (hi < 0x80) { @@ -5370,10 +5370,10 @@ } if (s+2>e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL2; if (!(pwc[0]=func_cp932_uni_onechar((hi<<8)+s[1]))) - return MY_CS_ILSEQ; + return -2; return 2; }