Below is the list of changes that have just been committed into a local
5.0 repository of bar. When bar does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html
ChangeSet
1.2477 07/03/28 18:57:30 bar@stripped +5 -0
Bug#22638 SOUNDEX broken for international characters
Problem: SOUNDEX returned an invalid string for international
characters in multi-byte character sets.
For example: for a Chinese/Japanese 3-byte long character
_utf8 0xE99885 it took only the very first byte 0xE9,
put it into the output string and then appended three
DIGIT ZERO characters, so the result was 0xE9303030 - which
is an invalid utf8 string.
Fix: make SOUNDEX() multi-byte aware and put only complete
characters into the result, so that only valid strings are returned.
This patch also makes SOUNDEX() compatible with UCS2.
sql/item_strfunc.cc
1.299 07/03/28 18:57:27 bar@stripped +105 -28
Making soundex multi-byte aware.
mysql-test/t/ctype_utf8.test
1.95 07/03/28 18:57:27 bar@stripped +8 -0
Adding tests
mysql-test/t/ctype_ucs.test
1.50 07/03/28 18:57:27 bar@stripped +14 -0
Adding tests
mysql-test/r/ctype_utf8.result
1.104 07/03/28 18:57:27 bar@stripped +12 -0
Adding tests
mysql-test/r/ctype_ucs.result
1.51 07/03/28 18:57:27 bar@stripped +18 -0
Adding tests
# This is a BitKeeper patch. What follows are the unified diffs for the
# set of deltas contained in the patch. The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User: bar
# Host: bar.myoffice.izhnet.ru
# Root: /home/bar/mysql-5.0.b22638
--- 1.298/sql/item_strfunc.cc 2007-03-10 20:56:09 +04:00
+++ 1.299/sql/item_strfunc.cc 2007-03-28 18:57:27 +05:00
@@ -1805,7 +1805,8 @@
{
collation.set(args[0]->collation);
max_length=args[0]->max_length;
- set_if_bigger(max_length,4);
+ set_if_bigger(max_length, 4 * collation.collation->mbminlen);
+ tmp_value.set_charset(collation.collation);
}
@@ -1815,14 +1816,15 @@
else return 0
*/
-static char soundex_toupper(char ch)
+static int soundex_toupper(int ch)
{
return (ch >= 'a' && ch <= 'z') ? ch - 'a' + 'A' : ch;
}
-static char get_scode(char *ptr)
+
+static char get_scode(int wc)
{
- uchar ch= soundex_toupper(*ptr);
+ int ch= soundex_toupper(wc);
if (ch < 'A' || ch > 'Z')
{
// Thread extended alfa (country spec)
@@ -1832,46 +1834,121 @@
}
+static bool my_uni_isalpha(int wc)
+{
+ /*
+ Return true for all Basic Latin letters: a..z A..Z.
+ Return true for all Unicode characters with code higher than U+00C0:
+ - characters between 'z' and U+00C0 are controls and punctuations.
+ - "U+00C0 LATIN CAPITAL LETTER A WITH GRAVE" is the first letter after 'z'.
+ */
+ return (wc >= 'a' && wc <= 'z') ||
+ (wc >= 'A' && wc <= 'Z') ||
+ (wc >= 0xC0);
+}
+
+
String *Item_func_soundex::val_str(String *str)
{
DBUG_ASSERT(fixed == 1);
String *res =args[0]->val_str(str);
char last_ch,ch;
CHARSET_INFO *cs= collation.collation;
+ my_wc_t wc;
+ uint nchars;
+ int rc;
- if ((null_value=args[0]->null_value))
+ if ((null_value= args[0]->null_value))
return 0; /* purecov: inspected */
- if (tmp_value.alloc(max(res->length(),4)))
+ if (tmp_value.alloc(max(res->length(), 4 * cs->mbminlen)))
return str; /* purecov: inspected */
char *to= (char *) tmp_value.ptr();
- char *from= (char *) res->ptr(), *end=from+res->length();
- tmp_value.set_charset(cs);
+ char *to_end= to + tmp_value.alloced_length();
+ char *from= (char *) res->ptr(), *end= from + res->length();
- while (from != end && !my_isalpha(cs,*from)) // Skip pre-space
- from++; /* purecov: inspected */
- if (from == end)
- return &my_empty_string; // No alpha characters.
- *to++ = soundex_toupper(*from); // Copy first letter
- last_ch = get_scode(from); // code of the first letter
- // for the first 'double-letter check.
- // Loop on input letters until
- // end of input (null) or output
- // letter code count = 3
- for (from++ ; from < end ; from++)
+ for ( ; ; ) /* Skip pre-space */
{
- if (!my_isalpha(cs,*from))
- continue;
- ch=get_scode(from);
+ if ((rc= cs->cset->mb_wc(cs, &wc, (uchar*) from, (uchar*) end)) <= 0)
+ return &my_empty_string; /* EOL or invalid byte sequence */
+
+ if (rc == 1 && cs->ctype)
+ {
+ /* Single byte letter found */
+ if (my_isalpha(cs, *from))
+ {
+ last_ch= get_scode(*from); // Code of the first letter
+ *to++= soundex_toupper(*from++); // Copy first letter
+ break;
+ }
+ from++;
+ }
+ else
+ {
+ from+= rc;
+ if (my_uni_isalpha(wc))
+ {
+ /* Multibyte letter found */
+ wc= soundex_toupper(wc);
+ last_ch= get_scode(wc); // Code of the first letter
+ if ((rc= cs->cset->wc_mb(cs, wc, (uchar*) to, (uchar*) to_end)) <= 0)
+ {
+ /* Extra safety - should not really happen */
+ DBUG_ASSERT(false);
+ return &my_empty_string;
+ }
+ to+= rc;
+ break;
+ }
+ }
+ }
+
+ /*
+ last_ch is now set to the first 'double-letter' check.
+ loop on input letters until end of input
+ */
+ for (nchars= 1 ; ; )
+ {
+ if ((rc= cs->cset->mb_wc(cs, &wc, (uchar*) from, (uchar*) end)) <= 0)
+ break; /* EOL or invalid byte sequence */
+
+ if (rc == 1 && cs->ctype)
+ {
+ if (!my_isalpha(cs, *from++))
+ continue;
+ }
+ else
+ {
+ from+= rc;
+ if (!my_uni_isalpha(wc))
+ continue;
+ }
+
+ ch= get_scode(wc);
if ((ch != '0') && (ch != last_ch)) // if not skipped or double
{
- *to++ = ch; // letter, copy to output
- last_ch = ch; // save code of last input letter
- } // for next double-letter check
+ // letter, copy to output
+ if ((rc= cs->cset->wc_mb(cs, (my_wc_t) ch,
+ (uchar*) to, (uchar*) to_end)) <= 0)
+ {
+ // Extra safety - should not really happen
+ DBUG_ASSERT(false);
+ break;
+ }
+ to+= rc;
+ nchars++;
+ last_ch= ch; // save code of last input letter
+ } // for next double-letter check
}
- for (end=(char*) tmp_value.ptr()+4 ; to < end ; to++)
- *to = '0';
- *to=0; // end string
+
+ /* Pad up to 4 characters with DIGIT ZERO, if the string is shorter */
+ if (nchars < 4)
+ {
+ uint nbytes= (4 - nchars) * cs->mbminlen;
+ cs->cset->fill(cs, to, nbytes, '0');
+ to+= nbytes;
+ }
+
tmp_value.length((uint) (to-tmp_value.ptr()));
return &tmp_value;
}
--- 1.50/mysql-test/r/ctype_ucs.result 2006-10-03 14:16:21 +05:00
+++ 1.51/mysql-test/r/ctype_ucs.result 2007-03-28 18:57:27 +05:00
@@ -839,6 +839,24 @@
river
drop table t1;
deallocate prepare stmt;
+set names latin1;
+set character_set_connection=ucs2;
+select soundex(''),soundex('he'),soundex('hello all folks'),soundex('#3556 in bugdb');
+soundex('') soundex('he') soundex('hello all folks') soundex('#3556 in bugdb')
+ H000 H4142 I51231
+select hex(soundex('')),hex(soundex('he')),hex(soundex('hello all folks')),hex(soundex('#3556 in bugdb'));
+hex(soundex('')) hex(soundex('he')) hex(soundex('hello all folks')) hex(soundex('#3556 in bugdb'))
+ 0048003000300030 00480034003100340032 004900350031003200330031
+select 'mood' sounds like 'mud';
+'mood' sounds like 'mud'
+1
+select hex(soundex(_ucs2 0x041004110412));
+hex(soundex(_ucs2 0x041004110412))
+0410003000300030
+select hex(soundex(_ucs2 0x00BF00C0));
+hex(soundex(_ucs2 0x00BF00C0))
+00C0003000300030
+set names latin1;
create table t1(a blob, b text charset utf8, c text charset ucs2);
select data_type, character_octet_length, character_maximum_length
from information_schema.columns where table_name='t1';
--- 1.103/mysql-test/r/ctype_utf8.result 2006-11-20 17:57:53 +04:00
+++ 1.104/mysql-test/r/ctype_utf8.result 2007-03-28 18:57:27 +05:00
@@ -854,6 +854,18 @@
id a
1 Test
drop table t1;
+select soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB);
+soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB)
+阅000
+select hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB));
+hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB))
+E99885303030
+select soundex(_utf8 0xD091D092D093);
+soundex(_utf8 0xD091D092D093)
+Б000
+select hex(soundex(_utf8 0xD091D092D093));
+hex(soundex(_utf8 0xD091D092D093))
+D091303030
SET collation_connection='utf8_general_ci';
create table t1 select repeat('a',4000) a;
delete from t1;
--- 1.49/mysql-test/t/ctype_ucs.test 2006-11-21 00:46:47 +04:00
+++ 1.50/mysql-test/t/ctype_ucs.test 2007-03-28 18:57:27 +05:00
@@ -573,6 +573,20 @@
deallocate prepare stmt;
#
+# Bug#22638 SOUNDEX broken for international characters
+#
+set names latin1;
+set character_set_connection=ucs2;
+select soundex(''),soundex('he'),soundex('hello all folks'),soundex('#3556 in bugdb');
+select hex(soundex('')),hex(soundex('he')),hex(soundex('hello all folks')),hex(soundex('#3556 in bugdb'));
+select 'mood' sounds like 'mud';
+# Cyrillic A, BE, VE
+select hex(soundex(_ucs2 0x041004110412));
+# Make sure that "U+00BF INVERTED QUESTION MARK" is not considered as letter
+select hex(soundex(_ucs2 0x00BF00C0));
+set names latin1;
+
+#
# Bug #14290: character_maximum_length for text fields
#
create table t1(a blob, b text charset utf8, c text charset ucs2);
--- 1.94/mysql-test/t/ctype_utf8.test 2006-11-20 14:57:25 +04:00
+++ 1.95/mysql-test/t/ctype_utf8.test 2007-03-28 18:57:27 +05:00
@@ -702,6 +702,14 @@
select * from t1 where soundex(a) = soundex('test');
drop table t1;
+#
+# Bug#22638 SOUNDEX broken for international characters
+#
+select soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB);
+select hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB));
+select soundex(_utf8 0xD091D092D093);
+select hex(soundex(_utf8 0xD091D092D093));
+
SET collation_connection='utf8_general_ci';
-- source include/ctype_filesort.inc
| Thread |
|---|
| • bk commit into 5.0 tree (bar:1.2477) BUG#22638 | bar | 28 Mar |