List:Commits« Previous MessageNext Message »
From:bar Date:March 28 2007 1:57pm
Subject:bk commit into 5.0 tree (bar:1.2477) BUG#22638
View as plain text  
Below is the list of changes that have just been committed into a local
5.0 repository of bar. When bar does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet
  1.2477 07/03/28 18:57:30 bar@stripped +5 -0
  Bug#22638 SOUNDEX broken for international characters
  Problem: SOUNDEX returned an invalid string for international
  characters in multi-byte character sets.
  For example: for a Chinese/Japanese 3-byte long character
  _utf8 0xE99885 it took only the very first byte 0xE9,
  put it into the output string and then appended three
  DIGIT ZERO characters, so the result was 0xE9303030 - which
  is an invalid utf8 string.
  Fix: make SOUNDEX() multi-byte aware and put only complete
  characters into the result, thus returning only valid strings.
  This patch also makes SOUNDEX() compatible with UCS2.

  sql/item_strfunc.cc
    1.299 07/03/28 18:57:27 bar@stripped +105 -28
    Making soundex multi-byte aware.

  mysql-test/t/ctype_utf8.test
    1.95 07/03/28 18:57:27 bar@stripped +8 -0
    Adding tests

  mysql-test/t/ctype_ucs.test
    1.50 07/03/28 18:57:27 bar@stripped +14 -0
    Adding tests

  mysql-test/r/ctype_utf8.result
    1.104 07/03/28 18:57:27 bar@stripped +12 -0
    Adding tests

  mysql-test/r/ctype_ucs.result
    1.51 07/03/28 18:57:27 bar@stripped +18 -0
    Adding tests

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User:	bar
# Host:	bar.myoffice.izhnet.ru
# Root:	/home/bar/mysql-5.0.b22638

--- 1.298/sql/item_strfunc.cc	2007-03-10 20:56:09 +04:00
+++ 1.299/sql/item_strfunc.cc	2007-03-28 18:57:27 +05:00
@@ -1805,7 +1805,8 @@
 {
   collation.set(args[0]->collation);
   max_length=args[0]->max_length;
-  set_if_bigger(max_length,4);
+  set_if_bigger(max_length, 4 * collation.collation->mbminlen);
+  tmp_value.set_charset(collation.collation);
 }
 
 
@@ -1815,14 +1816,15 @@
   else return 0
 */
 
-static char soundex_toupper(char ch)
+static int soundex_toupper(int ch)
 {
   return (ch >= 'a' && ch <= 'z') ? ch - 'a' + 'A' : ch;
 }
 
-static char get_scode(char *ptr)
+
+static char get_scode(int wc)
 {
-  uchar ch= soundex_toupper(*ptr);
+  int ch= soundex_toupper(wc);
   if (ch < 'A' || ch > 'Z')
   {
 					// Thread extended alfa (country spec)
@@ -1832,46 +1834,121 @@
 }
 
 
+static bool my_uni_isalpha(int wc)
+{
+  /*
+    Return true for all Basic Latin letters: a..z A..Z.
+    Return true for all Unicode characters with code higher than U+00C0:
+    - characters between 'z' and U+00C0 are controls and punctuations.
+    - "U+00C0 LATIN CAPITAL LETTER A WITH GRAVE" is the first letter after 'z'.
+  */
+  return (wc >= 'a' && wc <= 'z') ||
+         (wc >= 'A' && wc <= 'Z') ||
+         (wc >= 0xC0);
+}
+
+
 String *Item_func_soundex::val_str(String *str)
 {
   DBUG_ASSERT(fixed == 1);
   String *res  =args[0]->val_str(str);
   char last_ch,ch;
   CHARSET_INFO *cs= collation.collation;
+  my_wc_t wc;
+  uint nchars;
+  int rc;
 
-  if ((null_value=args[0]->null_value))
+  if ((null_value= args[0]->null_value))
     return 0; /* purecov: inspected */
 
-  if (tmp_value.alloc(max(res->length(),4)))
+  if (tmp_value.alloc(max(res->length(), 4 * cs->mbminlen)))
     return str; /* purecov: inspected */
   char *to= (char *) tmp_value.ptr();
-  char *from= (char *) res->ptr(), *end=from+res->length();
-  tmp_value.set_charset(cs);
+  char *to_end= to + tmp_value.alloced_length();
+  char *from= (char *) res->ptr(), *end= from + res->length();
   
-  while (from != end && !my_isalpha(cs,*from)) // Skip pre-space
-    from++; /* purecov: inspected */
-  if (from == end)
-    return &my_empty_string;		// No alpha characters.
-  *to++ = soundex_toupper(*from);	// Copy first letter
-  last_ch = get_scode(from);		// code of the first letter
-					// for the first 'double-letter check.
-					// Loop on input letters until
-					// end of input (null) or output
-					// letter code count = 3
-  for (from++ ; from < end ; from++)
+  for ( ; ; ) /* Skip pre-space */
   {
-    if (!my_isalpha(cs,*from))
-      continue;
-    ch=get_scode(from);
+    if ((rc= cs->cset->mb_wc(cs, &wc, (uchar*) from, (uchar*) end)) <= 0)
+      return &my_empty_string; /* EOL or invalid byte sequence */
+    
+    if (rc == 1 && cs->ctype)
+    {
+      /* Single byte letter found */
+      if (my_isalpha(cs, *from))
+      {
+        last_ch= get_scode(*from);       // Code of the first letter
+        *to++= soundex_toupper(*from++); // Copy first letter
+        break;
+      }
+      from++;
+    }
+    else
+    {
+      from+= rc;
+      if (my_uni_isalpha(wc))
+      {
+        /* Multibyte letter found */
+        wc= soundex_toupper(wc);
+        last_ch= get_scode(wc);     // Code of the first letter
+        if ((rc= cs->cset->wc_mb(cs, wc, (uchar*) to, (uchar*) to_end)) <= 0)
+        {
+          /* Extra safety - should not really happen */
+          DBUG_ASSERT(false);
+          return &my_empty_string;
+        }
+        to+= rc;
+        break;
+      }
+    }
+  }
+  
+  /*
+     last_ch is now set to the first 'double-letter' check.
+     loop on input letters until end of input
+  */
+  for (nchars= 1 ; ; )
+  {
+    if ((rc= cs->cset->mb_wc(cs, &wc, (uchar*) from, (uchar*) end)) <= 0)
+      break; /* EOL or invalid byte sequence */
+
+    if (rc == 1 && cs->ctype)
+    {
+      if (!my_isalpha(cs, *from++))
+        continue;
+    }
+    else
+    {
+      from+= rc;
+      if (!my_uni_isalpha(wc))
+        continue;
+    }
+    
+    ch= get_scode(wc);
     if ((ch != '0') && (ch != last_ch)) // if not skipped or double
     {
-       *to++ = ch;			// letter, copy to output
-       last_ch = ch;			// save code of last input letter
-    }					// for next double-letter check
+      // letter, copy to output
+      if ((rc= cs->cset->wc_mb(cs, (my_wc_t) ch,
+                               (uchar*) to, (uchar*) to_end)) <= 0)
+      {
+        // Extra safety - should not really happen
+        DBUG_ASSERT(false);
+        break;
+      }
+      to+= rc;
+      nchars++;
+      last_ch= ch;  // save code of last input letter
+    }               // for next double-letter check
   }
-  for (end=(char*) tmp_value.ptr()+4 ; to < end ; to++)
-    *to = '0';
-  *to=0;				// end string
+  
+  /* Pad up to 4 characters with DIGIT ZERO, if the string is shorter */
+  if (nchars < 4) 
+  {
+    uint nbytes= (4 - nchars) * cs->mbminlen;
+    cs->cset->fill(cs, to, nbytes, '0');
+    to+= nbytes;
+  }
+
   tmp_value.length((uint) (to-tmp_value.ptr()));
   return &tmp_value;
 }

--- 1.50/mysql-test/r/ctype_ucs.result	2006-10-03 14:16:21 +05:00
+++ 1.51/mysql-test/r/ctype_ucs.result	2007-03-28 18:57:27 +05:00
@@ -839,6 +839,24 @@
 river
 drop table t1;
 deallocate prepare stmt;
+set names latin1;
+set character_set_connection=ucs2;
+select soundex(''),soundex('he'),soundex('hello all folks'),soundex('#3556 in bugdb');
+soundex('')	soundex('he')	soundex('hello all folks')	soundex('#3556 in bugdb')
+	H000	H4142	I51231
+select hex(soundex('')),hex(soundex('he')),hex(soundex('hello all folks')),hex(soundex('#3556 in bugdb'));
+hex(soundex(''))	hex(soundex('he'))	hex(soundex('hello all folks'))	hex(soundex('#3556 in bugdb'))
+	0048003000300030	00480034003100340032	004900350031003200330031
+select 'mood' sounds like 'mud';
+'mood' sounds like 'mud'
+1
+select hex(soundex(_ucs2 0x041004110412));
+hex(soundex(_ucs2 0x041004110412))
+0410003000300030
+select hex(soundex(_ucs2 0x00BF00C0));
+hex(soundex(_ucs2 0x00BF00C0))
+00C0003000300030
+set names latin1;
 create table t1(a blob, b text charset utf8, c text charset ucs2);
 select data_type, character_octet_length, character_maximum_length
 from information_schema.columns where table_name='t1';

--- 1.103/mysql-test/r/ctype_utf8.result	2006-11-20 17:57:53 +04:00
+++ 1.104/mysql-test/r/ctype_utf8.result	2007-03-28 18:57:27 +05:00
@@ -854,6 +854,18 @@
 id	a
 1	Test
 drop table t1;
+select soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB);
+soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB)
+阅000
+select hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB));
+hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB))
+E99885303030
+select soundex(_utf8 0xD091D092D093);
+soundex(_utf8 0xD091D092D093)
+Б000
+select hex(soundex(_utf8 0xD091D092D093));
+hex(soundex(_utf8 0xD091D092D093))
+D091303030
 SET collation_connection='utf8_general_ci';
 create table t1 select repeat('a',4000) a;
 delete from t1;

--- 1.49/mysql-test/t/ctype_ucs.test	2006-11-21 00:46:47 +04:00
+++ 1.50/mysql-test/t/ctype_ucs.test	2007-03-28 18:57:27 +05:00
@@ -573,6 +573,20 @@
 deallocate prepare stmt;
 
 #
+# Bug#22638 SOUNDEX broken for international characters
+#
+set names latin1;
+set character_set_connection=ucs2;
+select soundex(''),soundex('he'),soundex('hello all folks'),soundex('#3556 in bugdb');
+select hex(soundex('')),hex(soundex('he')),hex(soundex('hello all folks')),hex(soundex('#3556 in bugdb'));
+select 'mood' sounds like 'mud';
+# Cyrillic A, BE, VE
+select hex(soundex(_ucs2 0x041004110412));
+# Make sure that "U+00BF INVERTED QUESTION MARK" is not considered as letter
+select hex(soundex(_ucs2 0x00BF00C0));
+set names latin1;
+
+#
 # Bug #14290: character_maximum_length for text fields
 #
 create table t1(a blob, b text charset utf8, c text charset ucs2);

--- 1.94/mysql-test/t/ctype_utf8.test	2006-11-20 14:57:25 +04:00
+++ 1.95/mysql-test/t/ctype_utf8.test	2007-03-28 18:57:27 +05:00
@@ -702,6 +702,14 @@
 select * from t1 where soundex(a) = soundex('test');
 drop table t1;
 
+#
+# Bug#22638 SOUNDEX broken for international characters
+#
+select soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB);
+select hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB));
+select soundex(_utf8 0xD091D092D093);
+select hex(soundex(_utf8 0xD091D092D093));
+
 
 SET collation_connection='utf8_general_ci';
 -- source include/ctype_filesort.inc
Thread
bk commit into 5.0 tree (bar:1.2477) BUG#22638bar28 Mar