From: Date: May 25 2006 8:13pm Subject: bk commit into 5.1 tree (svoj:1.2187) BUG#19580 List-Archive: http://lists.mysql.com/commits/6883 X-Bug: 19580 Message-Id: <200605251813.k4PIDoKo017703@may.pils.ru> Below is the list of changes that have just been committed into a local 5.1 repository of svoj. When svoj does a push these changes will be propagated to the main repository and, within 24 hours after the push, to the public repository. For information on how to access the public repository see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html ChangeSet 1.2187 06/05/25 23:13:43 svoj@stripped +5 -0 BUG#19580 - FULLTEXT search produces wrong results on UTF-8 columns The problem was that MySQL did not have a true ctype implementation. As a result, many multibyte punctuation/whitespace characters were treated as word characters. This fix uses the recently added CTYPE table for unicode character sets (WL1386) to detect Unicode punctuation/whitespace characters correctly. Note: this is an incompatible change since it changes parser behavior. One will have to use the REPAIR TABLE statement to rebuild fulltext indexes. storage/myisam/ftdefs.h 1.37 06/05/25 23:13:37 svoj@stripped +3 -2 Use WL1386 "CTYPE table for unicode character sets" functionality. Rework the true_word_char macro so that it accepts a ctype instead of a charset as its first parameter. It doesn't use my_isalnum anymore, but instead checks the ctype directly. The obsolete word_char macro is removed. storage/myisam/ft_update.c 1.43 06/05/25 23:13:37 svoj@stripped +0 -5 Use WL1386 "CTYPE table for unicode character sets" functionality. Revert the fix for BUG#16489 "utf8 + fulltext leads to corrupt index file.". It is not needed anymore, since we now have a true ctype implementation. storage/myisam/ft_parser.c 1.53 06/05/25 23:13:37 svoj@stripped +30 -10 Use WL1386 "CTYPE table for unicode character sets" functionality. mysql-test/t/fulltext2.test 1.15 06/05/25 23:13:37 svoj@stripped +10 -0 Testcase for BUG#19580. 
mysql-test/r/fulltext2.result 1.14 06/05/25 23:13:37 svoj@stripped +8 -0 Testcase for BUG#19580. # This is a BitKeeper patch. What follows are the unified diffs for the # set of deltas contained in the patch. The rest of the patch, the part # that BitKeeper cares about, is below these diffs. # User: svoj # Host: may.pils.ru # Root: /home/svoj/devel/mysql/BUG19580/mysql-5.1-new --- 1.52/storage/myisam/ft_parser.c 2006-04-06 19:02:51 +05:00 +++ 1.53/storage/myisam/ft_parser.c 2006-05-25 23:13:37 +05:00 @@ -114,6 +114,7 @@ FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param) { byte *doc=*start; + int ctype; uint mwc, length, mbl; param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0); @@ -122,9 +123,13 @@ while (doccset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end); + if (!ctype) + goto ret; + if (true_word_char(ctype, *doc)) + break; if (*doc == FTB_RQUOT && param->quot) { param->quot=doc; @@ -158,14 +163,18 @@ } mwc=length=0; - for (word->pos=doc; docpos= doc; doc < end; length++, doc+= mbl) + { + mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end); + if (!ctype) + goto ret; + else if (true_word_char(ctype, *doc)) mwc=0; else if (!misc_word_char(*doc) || mwc) break; else mwc++; - + } param->prev='A'; /* be sure *prev is true_word_char */ word->len= (uint)(doc-word->pos) - mwc; if ((param->trunc=(doc= end) DBUG_RETURN(0); - if (true_word_char(cs, *doc)) break; + if (doc >= end) + DBUG_RETURN(0); + mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end); + if (!ctype) + DBUG_RETURN(0); /* Return EOF if bad ctype */ + if (true_word_char(ctype, *doc)) + break; } mwc= length= 0; - for (word->pos=doc; docpos= doc; doc < end; length++, doc+= mbl) + { + mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end); + if (!ctype) + DBUG_RETURN(0); + else if (true_word_char(ctype, *doc)) mwc= 0; else if (!misc_word_char(*doc) || mwc) break; else mwc++; + } word->len= (uint)(doc-word->pos) - mwc; --- 1.42/storage/myisam/ft_update.c 2006-04-06 19:02:51 +05:00 +++ 
1.43/storage/myisam/ft_update.c 2006-05-25 23:13:37 +05:00 @@ -174,11 +174,6 @@ FT_SEG_ITERATOR ftsi1, ftsi2; CHARSET_INFO *cs=info->s->keyinfo[keynr].seg->charset; DBUG_ENTER("_mi_ft_cmp"); -#ifndef MYSQL_HAS_TRUE_CTYPE_IMPLEMENTATION - if (cs->mbmaxlen > 1) - DBUG_RETURN(THOSE_TWO_DAMN_KEYS_ARE_REALLY_DIFFERENT); -#endif - _mi_ft_segiterator_init(info, keynr, rec1, &ftsi1); _mi_ft_segiterator_init(info, keynr, rec2, &ftsi2); --- 1.36/storage/myisam/ftdefs.h 2006-04-06 19:02:51 +05:00 +++ 1.37/storage/myisam/ftdefs.h 2006-05-25 23:13:37 +05:00 @@ -24,9 +24,10 @@ #include #include -#define true_word_char(s,X) (my_isalnum(s,X) || (X)=='_') +#define true_word_char(ctype, character) \ + ((ctype) & (_MY_U | _MY_L | _MY_NMR) || \ + (character) == '_') #define misc_word_char(X) 0 -#define word_char(s,X) (true_word_char(s,X) || misc_word_char(X)) #define FT_MAX_WORD_LEN_FOR_SORT 31 --- 1.13/mysql-test/r/fulltext2.result 2006-05-12 21:40:15 +05:00 +++ 1.14/mysql-test/r/fulltext2.result 2006-05-25 23:13:37 +05:00 @@ -241,3 +241,11 @@ a drop table t1; set names latin1; +SET NAMES utf8; +CREATE TABLE t1(a VARCHAR(255), FULLTEXT(a)) ENGINE=MyISAM DEFAULT CHARSET=utf8; +INSERT INTO t1 VALUES('„MySQL“'); +SELECT a FROM t1 WHERE MATCH a AGAINST('“MySQL„' IN BOOLEAN MODE); +a +„MySQL“ +DROP TABLE t1; +SET NAMES latin1; --- 1.14/mysql-test/t/fulltext2.test 2006-05-12 21:26:46 +05:00 +++ 1.15/mysql-test/t/fulltext2.test 2006-05-25 23:13:37 +05:00 @@ -221,3 +221,13 @@ set names latin1; # End of 4.1 tests + +# +# BUG#19580 - FULLTEXT search produces wrong results on UTF-8 columns +# +SET NAMES utf8; +CREATE TABLE t1(a VARCHAR(255), FULLTEXT(a)) ENGINE=MyISAM DEFAULT CHARSET=utf8; +INSERT INTO t1 VALUES('„MySQL“'); +SELECT a FROM t1 WHERE MATCH a AGAINST('“MySQL„' IN BOOLEAN MODE); +DROP TABLE t1; +SET NAMES latin1;