From: Date: March 28 2007 3:57pm Subject: bk commit into 5.0 tree (bar:1.2477) BUG#22638 List-Archive: http://lists.mysql.com/commits/23158 X-Bug: 22638 Message-Id: <200703281357.l2SDvZaZ014705@bar.myoffice.izhnet.ru> Below is the list of changes that have just been committed into a local 5.0 repository of bar. When bar does a push these changes will be propagated to the main repository and, within 24 hours after the push, to the public repository. For information on how to access the public repository see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html ChangeSet 1.2477 07/03/28 18:57:30 bar@stripped +5 -0 Bug#22638 SOUNDEX broken for international characters Problem: SOUNDEX returned an invalid string for international characters in multi-byte character sets. For example: for a Chinese/Japanese 3-byte long character _utf8 0xE99885 it took only the very first byte 0xE9, put it into the outout string and then appended with three DIGIT ZERO characters, so the result was 0xE9303030 - which is an invalide utf8 string. Fix: make SOUNDEX() multi-byte aware and - put only complete characters into result, thus return only valid strings. This patch also makes SOUNDEX() compatible with UCS2. sql/item_strfunc.cc 1.299 07/03/28 18:57:27 bar@stripped +105 -28 Making soundex multi-byte aware. mysql-test/t/ctype_utf8.test 1.95 07/03/28 18:57:27 bar@stripped +8 -0 Adding tests mysql-test/t/ctype_ucs.test 1.50 07/03/28 18:57:27 bar@stripped +14 -0 Adding tests mysql-test/r/ctype_utf8.result 1.104 07/03/28 18:57:27 bar@stripped +12 -0 Adding tests mysql-test/r/ctype_ucs.result 1.51 07/03/28 18:57:27 bar@stripped +18 -0 Adding tests # This is a BitKeeper patch. What follows are the unified diffs for the # set of deltas contained in the patch. The rest of the patch, the part # that BitKeeper cares about, is below these diffs. # User: bar # Host: bar.myoffice.izhnet.ru # Root: /home/bar/mysql-5.0.b22638 --- 1.298/sql/item_strfunc.cc 2007-03-10 20:56:09 +04:00 +++ 1.299/sql/item_strfunc.cc 2007-03-28 18:57:27 +05:00 @@ -1805,7 +1805,8 @@ { collation.set(args[0]->collation); max_length=args[0]->max_length; - set_if_bigger(max_length,4); + set_if_bigger(max_length, 4 * collation.collation->mbminlen); + tmp_value.set_charset(collation.collation); } @@ -1815,14 +1816,15 @@ else return 0 */ -static char soundex_toupper(char ch) +static int soundex_toupper(int ch) { return (ch >= 'a' && ch <= 'z') ? ch - 'a' + 'A' : ch; } -static char get_scode(char *ptr) + +static char get_scode(int wc) { - uchar ch= soundex_toupper(*ptr); + int ch= soundex_toupper(wc); if (ch < 'A' || ch > 'Z') { // Thread extended alfa (country spec) @@ -1832,46 +1834,121 @@ } +static bool my_uni_isalpha(int wc) +{ + /* + Return true for all Basic Latin letters: a..z A..Z. + Return true for all Unicode characters with code higher than U+00C0: + - characters between 'z' and U+00C0 are controls and punctuations. + - "U+00C0 LATIN CAPITAL LETTER A WITH GRAVE" is the first letter after 'z'. + */ + return (wc >= 'a' && wc <= 'z') || + (wc >= 'A' && wc <= 'Z') || + (wc >= 0xC0); +} + + String *Item_func_soundex::val_str(String *str) { DBUG_ASSERT(fixed == 1); String *res =args[0]->val_str(str); char last_ch,ch; CHARSET_INFO *cs= collation.collation; + my_wc_t wc; + uint nchars; + int rc; - if ((null_value=args[0]->null_value)) + if ((null_value= args[0]->null_value)) return 0; /* purecov: inspected */ - if (tmp_value.alloc(max(res->length(),4))) + if (tmp_value.alloc(max(res->length(), 4 * cs->mbminlen))) return str; /* purecov: inspected */ char *to= (char *) tmp_value.ptr(); - char *from= (char *) res->ptr(), *end=from+res->length(); - tmp_value.set_charset(cs); + char *to_end= to + tmp_value.alloced_length(); + char *from= (char *) res->ptr(), *end= from + res->length(); - while (from != end && !my_isalpha(cs,*from)) // Skip pre-space - from++; /* purecov: inspected */ - if (from == end) - return &my_empty_string; // No alpha characters. - *to++ = soundex_toupper(*from); // Copy first letter - last_ch = get_scode(from); // code of the first letter - // for the first 'double-letter check. - // Loop on input letters until - // end of input (null) or output - // letter code count = 3 - for (from++ ; from < end ; from++) + for ( ; ; ) /* Skip pre-space */ { - if (!my_isalpha(cs,*from)) - continue; - ch=get_scode(from); + if ((rc= cs->cset->mb_wc(cs, &wc, (uchar*) from, (uchar*) end)) <= 0) + return &my_empty_string; /* EOL or invalid byte sequence */ + + if (rc == 1 && cs->ctype) + { + /* Single byte letter found */ + if (my_isalpha(cs, *from)) + { + last_ch= get_scode(*from); // Code of the first letter + *to++= soundex_toupper(*from++); // Copy first letter + break; + } + from++; + } + else + { + from+= rc; + if (my_uni_isalpha(wc)) + { + /* Multibyte letter found */ + wc= soundex_toupper(wc); + last_ch= get_scode(wc); // Code of the first letter + if ((rc= cs->cset->wc_mb(cs, wc, (uchar*) to, (uchar*) to_end)) <= 0) + { + /* Extra safety - should not really happen */ + DBUG_ASSERT(false); + return &my_empty_string; + } + to+= rc; + break; + } + } + } + + /* + last_ch is now set to the first 'double-letter' check. + loop on input letters until end of input + */ + for (nchars= 1 ; ; ) + { + if ((rc= cs->cset->mb_wc(cs, &wc, (uchar*) from, (uchar*) end)) <= 0) + break; /* EOL or invalid byte sequence */ + + if (rc == 1 && cs->ctype) + { + if (!my_isalpha(cs, *from++)) + continue; + } + else + { + from+= rc; + if (!my_uni_isalpha(wc)) + continue; + } + + ch= get_scode(wc); if ((ch != '0') && (ch != last_ch)) // if not skipped or double { - *to++ = ch; // letter, copy to output - last_ch = ch; // save code of last input letter - } // for next double-letter check + // letter, copy to output + if ((rc= cs->cset->wc_mb(cs, (my_wc_t) ch, + (uchar*) to, (uchar*) to_end)) <= 0) + { + // Extra safety - should not really happen + DBUG_ASSERT(false); + break; + } + to+= rc; + nchars++; + last_ch= ch; // save code of last input letter + } // for next double-letter check } - for (end=(char*) tmp_value.ptr()+4 ; to < end ; to++) - *to = '0'; - *to=0; // end string + + /* Pad up to 4 characters with DIGIT ZERO, if the string is shorter */ + if (nchars < 4) + { + uint nbytes= (4 - nchars) * cs->mbminlen; + cs->cset->fill(cs, to, nbytes, '0'); + to+= nbytes; + } + tmp_value.length((uint) (to-tmp_value.ptr())); return &tmp_value; } --- 1.50/mysql-test/r/ctype_ucs.result 2006-10-03 14:16:21 +05:00 +++ 1.51/mysql-test/r/ctype_ucs.result 2007-03-28 18:57:27 +05:00 @@ -839,6 +839,24 @@ river drop table t1; deallocate prepare stmt; +set names latin1; +set character_set_connection=ucs2; +select soundex(''),soundex('he'),soundex('hello all folks'),soundex('#3556 in bugdb'); +soundex('') soundex('he') soundex('hello all folks') soundex('#3556 in bugdb') + H000 H4142 I51231 +select hex(soundex('')),hex(soundex('he')),hex(soundex('hello all folks')),hex(soundex('#3556 in bugdb')); +hex(soundex('')) hex(soundex('he')) hex(soundex('hello all folks')) hex(soundex('#3556 in bugdb')) + 0048003000300030 00480034003100340032 004900350031003200330031 +select 'mood' sounds like 'mud'; +'mood' sounds like 'mud' +1 +select hex(soundex(_ucs2 0x041004110412)); +hex(soundex(_ucs2 0x041004110412)) +0410003000300030 +select hex(soundex(_ucs2 0x00BF00C0)); +hex(soundex(_ucs2 0x00BF00C0)) +00C0003000300030 +set names latin1; create table t1(a blob, b text charset utf8, c text charset ucs2); select data_type, character_octet_length, character_maximum_length from information_schema.columns where table_name='t1'; --- 1.103/mysql-test/r/ctype_utf8.result 2006-11-20 17:57:53 +04:00 +++ 1.104/mysql-test/r/ctype_utf8.result 2007-03-28 18:57:27 +05:00 @@ -854,6 +854,18 @@ id a 1 Test drop table t1; +select soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB); +soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB) +阅000 +select hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB)); +hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB)) +E99885303030 +select soundex(_utf8 0xD091D092D093); +soundex(_utf8 0xD091D092D093) +Б000 +select hex(soundex(_utf8 0xD091D092D093)); +hex(soundex(_utf8 0xD091D092D093)) +D091303030 SET collation_connection='utf8_general_ci'; create table t1 select repeat('a',4000) a; delete from t1; --- 1.49/mysql-test/t/ctype_ucs.test 2006-11-21 00:46:47 +04:00 +++ 1.50/mysql-test/t/ctype_ucs.test 2007-03-28 18:57:27 +05:00 @@ -573,6 +573,20 @@ deallocate prepare stmt; # +# Bug#22638 SOUNDEX broken for international characters +# +set names latin1; +set character_set_connection=ucs2; +select soundex(''),soundex('he'),soundex('hello all folks'),soundex('#3556 in bugdb'); +select hex(soundex('')),hex(soundex('he')),hex(soundex('hello all folks')),hex(soundex('#3556 in bugdb')); +select 'mood' sounds like 'mud'; +# Cyrillic A, BE, VE +select hex(soundex(_ucs2 0x041004110412)); +# Make sure that "U+00BF INVERTED QUESTION MARK" is not considered as letter +select hex(soundex(_ucs2 0x00BF00C0)); +set names latin1; + +# # Bug #14290: character_maximum_length for text fields # create table t1(a blob, b text charset utf8, c text charset ucs2); --- 1.94/mysql-test/t/ctype_utf8.test 2006-11-20 14:57:25 +04:00 +++ 1.95/mysql-test/t/ctype_utf8.test 2007-03-28 18:57:27 +05:00 @@ -702,6 +702,14 @@ select * from t1 where soundex(a) = soundex('test'); drop table t1; +# +# Bug#22638 SOUNDEX broken for international characters +# +select soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB); +select hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB)); +select soundex(_utf8 0xD091D092D093); +select hex(soundex(_utf8 0xD091D092D093)); + SET collation_connection='utf8_general_ci'; -- source include/ctype_filesort.inc