List: Commits « Previous Message | Next Message »
From: Sergey Vojtovich  Date: May 29 2006 1:48pm
Subject:bk commit into 5.1 tree (svoj:1.2187) BUG#19580
View as plain text  
Below is the list of changes that have just been committed into a local
5.1 repository of svoj. When svoj does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet
  1.2187 06/05/29 16:46:46 svoj@stripped +5 -0
  BUG#19580 - FULLTEXT search produces wrong results on UTF-8 columns
  
  The problem was that MySQL did not have a true ctype implementation.
  As a result, many multibyte punctuation/whitespace characters were
  treated as word characters.
  
  This fix uses the recently added CTYPE table for Unicode character
  sets (WL1386) to detect Unicode punctuation/whitespace characters
  correctly.
  
  Note: this is an incompatible change, since it changes parser
  behavior. One will have to use the REPAIR TABLE statement to rebuild
  fulltext indexes.

  storage/myisam/ftdefs.h
    1.37 06/05/29 16:45:09 svoj@stripped +3 -2
    Use WL1386 "CTYPE table for unicode character sets" functionality.
    
    Rework the true_word_char macro so that it accepts a ctype instead
    of a charset as its first parameter. It no longer uses my_isalnum,
    but checks the ctype directly.
    Remove the obsolete word_char macro.

  storage/myisam/ft_update.c
    1.43 06/05/29 16:45:08 svoj@stripped +0 -5
    Use WL1386 "CTYPE table for unicode character sets" functionality.
    
    Revert the fix for BUG#16489 "utf8 + fulltext leads to corrupt index
    file.". It is no longer needed, since we now have a true ctype
    implementation.

  storage/myisam/ft_parser.c
    1.53 06/05/29 16:45:08 svoj@stripped +22 -10
    Use WL1386 "CTYPE table for unicode character sets" functionality.

  mysql-test/t/fulltext2.test
    1.15 06/05/29 16:45:08 svoj@stripped +10 -0
    Testcase for BUG#19580.

  mysql-test/r/fulltext2.result
    1.14 06/05/29 16:45:08 svoj@stripped +8 -0
    Testcase for BUG#19580.

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User:	svoj
# Host:	may.pils.ru
# Root:	/home/svoj/devel/mysql/BUG19580/mysql-5.1-new

--- 1.52/storage/myisam/ft_parser.c	2006-04-06 19:02:51 +05:00
+++ 1.53/storage/myisam/ft_parser.c	2006-05-29 16:45:08 +05:00
@@ -114,6 +114,7 @@
                  FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param)
 {
   byte *doc=*start;
+  int ctype;
   uint mwc, length, mbl;
 
   param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
@@ -122,9 +123,11 @@
 
   while (doc<end)
   {
-    for (;doc<end;doc++)
+    for (; doc < end; doc+= (mbl > 0 ? mbl : 1))
     {
-      if (true_word_char(cs,*doc)) break;
+      mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
+      if (true_word_char(ctype, *doc))
+        break;
       if (*doc == FTB_RQUOT && param->quot)
       {
         param->quot=doc;
@@ -158,14 +161,16 @@
     }
 
     mwc=length=0;
-    for (word->pos=doc; doc<end; length++, mbl=my_mbcharlen(cs, *(uchar *)doc), doc+=(mbl ? mbl : 1))
-      if (true_word_char(cs,*doc))
+    for (word->pos= doc; doc < end; length++, doc+= (mbl > 0 ? mbl : 1))
+    {
+      mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
+      if (true_word_char(ctype, *doc))
         mwc=0;
       else if (!misc_word_char(*doc) || mwc)
         break;
       else
         mwc++;
-
+    }
     param->prev='A'; /* be sure *prev is true_word_char */
     word->len= (uint)(doc-word->pos) - mwc;
     if ((param->trunc=(doc<end && *doc == FTB_TRUNC)))
@@ -200,24 +205,31 @@
 {
   byte *doc= *start;
   uint mwc, length, mbl;
+  int ctype;
   DBUG_ENTER("ft_simple_get_word");
 
   do
   {
-    for (;; doc++)
+    for (;; doc+= (mbl > 0 ? mbl : 1))
     {
-      if (doc >= end) DBUG_RETURN(0);
-      if (true_word_char(cs, *doc)) break;
+      if (doc >= end)
+        DBUG_RETURN(0);
+      mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
+      if (true_word_char(ctype, *doc))
+        break;
     }
 
     mwc= length= 0;
-    for (word->pos=doc; doc<end; length++, mbl=my_mbcharlen(cs, *(uchar *)doc), doc+=(mbl ? mbl : 1))
-      if (true_word_char(cs,*doc))
+    for (word->pos= doc; doc < end; length++, doc+= (mbl > 0 ? mbl : 1))
+    {
+      mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
+      if (true_word_char(ctype, *doc))
         mwc= 0;
       else if (!misc_word_char(*doc) || mwc)
         break;
       else
         mwc++;
+    }
 
     word->len= (uint)(doc-word->pos) - mwc;
 

--- 1.42/storage/myisam/ft_update.c	2006-04-06 19:02:51 +05:00
+++ 1.43/storage/myisam/ft_update.c	2006-05-29 16:45:08 +05:00
@@ -174,11 +174,6 @@
   FT_SEG_ITERATOR ftsi1, ftsi2;
   CHARSET_INFO *cs=info->s->keyinfo[keynr].seg->charset;
   DBUG_ENTER("_mi_ft_cmp");
-#ifndef MYSQL_HAS_TRUE_CTYPE_IMPLEMENTATION
-  if (cs->mbmaxlen > 1)
-    DBUG_RETURN(THOSE_TWO_DAMN_KEYS_ARE_REALLY_DIFFERENT);
-#endif
-
   _mi_ft_segiterator_init(info, keynr, rec1, &ftsi1);
   _mi_ft_segiterator_init(info, keynr, rec2, &ftsi2);
 

--- 1.36/storage/myisam/ftdefs.h	2006-04-06 19:02:51 +05:00
+++ 1.37/storage/myisam/ftdefs.h	2006-05-29 16:45:09 +05:00
@@ -24,9 +24,10 @@
 #include <queues.h>
 #include <mysql/plugin.h>
 
-#define true_word_char(s,X)	(my_isalnum(s,X) || (X)=='_')
+#define true_word_char(ctype, character) \
+                      ((ctype) & (_MY_U | _MY_L | _MY_NMR) || \
+                       (character) == '_')
 #define misc_word_char(X)	0
-#define word_char(s,X)		(true_word_char(s,X) || misc_word_char(X))
 
 #define FT_MAX_WORD_LEN_FOR_SORT 31
 

--- 1.13/mysql-test/r/fulltext2.result	2006-05-12 21:40:15 +05:00
+++ 1.14/mysql-test/r/fulltext2.result	2006-05-29 16:45:08 +05:00
@@ -241,3 +241,11 @@
 a
 drop table t1;
 set names latin1;
+SET NAMES utf8;
+CREATE TABLE t1(a VARCHAR(255), FULLTEXT(a)) ENGINE=MyISAM DEFAULT CHARSET=utf8;
+INSERT INTO t1 VALUES('„MySQL“');
+SELECT a FROM t1 WHERE MATCH a AGAINST('“MySQL„' IN BOOLEAN MODE);
+a
+„MySQL“
+DROP TABLE t1;
+SET NAMES latin1;

--- 1.14/mysql-test/t/fulltext2.test	2006-05-12 21:26:46 +05:00
+++ 1.15/mysql-test/t/fulltext2.test	2006-05-29 16:45:08 +05:00
@@ -221,3 +221,13 @@
 set names latin1;
 
 # End of 4.1 tests
+
+#
+# BUG#19580 - FULLTEXT search produces wrong results on UTF-8 columns
+#
+SET NAMES utf8;
+CREATE TABLE t1(a VARCHAR(255), FULLTEXT(a)) ENGINE=MyISAM DEFAULT CHARSET=utf8;
+INSERT INTO t1 VALUES('„MySQL“');
+SELECT a FROM t1 WHERE MATCH a AGAINST('“MySQL„' IN BOOLEAN MODE);
+DROP TABLE t1;
+SET NAMES latin1;
Thread
bk commit into 5.1 tree (svoj:1.2187) BUG#19580 - Sergey Vojtovich, 29 May