Below is the list of changes that have just been committed into a local
5.1 repository of svoj. When svoj does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html
ChangeSet
1.2187 06/05/29 16:46:46 svoj@stripped +5 -0
BUG#19580 - FULLTEXT search produces wrong results on UTF-8 columns
The problem was that MySQL did not have a true ctype implementation.
As a result, many multibyte punctuation/whitespace characters were
treated as word characters.
This fix uses the recently added CTYPE table for Unicode character sets
(WL1386) to detect Unicode punctuation/whitespace characters
correctly.
Note: this is an incompatible change, since it changes parser behavior.
Users will have to run the REPAIR TABLE statement to rebuild fulltext
indexes.
storage/myisam/ftdefs.h
1.37 06/05/29 16:45:09 svoj@stripped +3 -2
Use WL1386 "CTYPE table for unicode character sets" functionality.
Rework the true_word_char macro so that it accepts a ctype instead of a
charset as its first parameter. It no longer uses my_isalnum, but instead
checks the ctype directly.
The obsolete word_char macro is removed.
storage/myisam/ft_update.c
1.43 06/05/29 16:45:08 svoj@stripped +0 -5
Use WL1386 "CTYPE table for unicode character sets" functionality.
Revert the fix for BUG#16489 "utf8 + fulltext leads to corrupt index
file". It is no longer needed, since we now have a true ctype
implementation.
storage/myisam/ft_parser.c
1.53 06/05/29 16:45:08 svoj@stripped +22 -10
Use WL1386 "CTYPE table for unicode character sets" functionality.
mysql-test/t/fulltext2.test
1.15 06/05/29 16:45:08 svoj@stripped +10 -0
Testcase for BUG#19580.
mysql-test/r/fulltext2.result
1.14 06/05/29 16:45:08 svoj@stripped +8 -0
Testcase for BUG#19580.
# This is a BitKeeper patch. What follows are the unified diffs for the
# set of deltas contained in the patch. The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User: svoj
# Host: may.pils.ru
# Root: /home/svoj/devel/mysql/BUG19580/mysql-5.1-new
--- 1.52/storage/myisam/ft_parser.c 2006-04-06 19:02:51 +05:00
+++ 1.53/storage/myisam/ft_parser.c 2006-05-29 16:45:08 +05:00
@@ -114,6 +114,7 @@
FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param)
{
byte *doc=*start;
+ int ctype;
uint mwc, length, mbl;
param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
@@ -122,9 +123,11 @@
while (doc<end)
{
- for (;doc<end;doc++)
+ for (; doc < end; doc+= (mbl > 0 ? mbl : 1))
{
- if (true_word_char(cs,*doc)) break;
+ mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
+ if (true_word_char(ctype, *doc))
+ break;
if (*doc == FTB_RQUOT && param->quot)
{
param->quot=doc;
@@ -158,14 +161,16 @@
}
mwc=length=0;
- for (word->pos=doc; doc<end; length++, mbl=my_mbcharlen(cs, *(uchar *)doc),
doc+=(mbl ? mbl : 1))
- if (true_word_char(cs,*doc))
+ for (word->pos= doc; doc < end; length++, doc+= (mbl > 0 ? mbl : 1))
+ {
+ mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
+ if (true_word_char(ctype, *doc))
mwc=0;
else if (!misc_word_char(*doc) || mwc)
break;
else
mwc++;
-
+ }
param->prev='A'; /* be sure *prev is true_word_char */
word->len= (uint)(doc-word->pos) - mwc;
if ((param->trunc=(doc<end && *doc == FTB_TRUNC)))
@@ -200,24 +205,31 @@
{
byte *doc= *start;
uint mwc, length, mbl;
+ int ctype;
DBUG_ENTER("ft_simple_get_word");
do
{
- for (;; doc++)
+ for (;; doc+= (mbl > 0 ? mbl : 1))
{
- if (doc >= end) DBUG_RETURN(0);
- if (true_word_char(cs, *doc)) break;
+ if (doc >= end)
+ DBUG_RETURN(0);
+ mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
+ if (true_word_char(ctype, *doc))
+ break;
}
mwc= length= 0;
- for (word->pos=doc; doc<end; length++, mbl=my_mbcharlen(cs, *(uchar *)doc),
doc+=(mbl ? mbl : 1))
- if (true_word_char(cs,*doc))
+ for (word->pos= doc; doc < end; length++, doc+= (mbl > 0 ? mbl : 1))
+ {
+ mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
+ if (true_word_char(ctype, *doc))
mwc= 0;
else if (!misc_word_char(*doc) || mwc)
break;
else
mwc++;
+ }
word->len= (uint)(doc-word->pos) - mwc;
--- 1.42/storage/myisam/ft_update.c 2006-04-06 19:02:51 +05:00
+++ 1.43/storage/myisam/ft_update.c 2006-05-29 16:45:08 +05:00
@@ -174,11 +174,6 @@
FT_SEG_ITERATOR ftsi1, ftsi2;
CHARSET_INFO *cs=info->s->keyinfo[keynr].seg->charset;
DBUG_ENTER("_mi_ft_cmp");
-#ifndef MYSQL_HAS_TRUE_CTYPE_IMPLEMENTATION
- if (cs->mbmaxlen > 1)
- DBUG_RETURN(THOSE_TWO_DAMN_KEYS_ARE_REALLY_DIFFERENT);
-#endif
-
_mi_ft_segiterator_init(info, keynr, rec1, &ftsi1);
_mi_ft_segiterator_init(info, keynr, rec2, &ftsi2);
--- 1.36/storage/myisam/ftdefs.h 2006-04-06 19:02:51 +05:00
+++ 1.37/storage/myisam/ftdefs.h 2006-05-29 16:45:09 +05:00
@@ -24,9 +24,10 @@
#include <queues.h>
#include <mysql/plugin.h>
-#define true_word_char(s,X) (my_isalnum(s,X) || (X)=='_')
+#define true_word_char(ctype, character) \
+ ((ctype) & (_MY_U | _MY_L | _MY_NMR) || \
+ (character) == '_')
#define misc_word_char(X) 0
-#define word_char(s,X) (true_word_char(s,X) || misc_word_char(X))
#define FT_MAX_WORD_LEN_FOR_SORT 31
--- 1.13/mysql-test/r/fulltext2.result 2006-05-12 21:40:15 +05:00
+++ 1.14/mysql-test/r/fulltext2.result 2006-05-29 16:45:08 +05:00
@@ -241,3 +241,11 @@
a
drop table t1;
set names latin1;
+SET NAMES utf8;
+CREATE TABLE t1(a VARCHAR(255), FULLTEXT(a)) ENGINE=MyISAM DEFAULT CHARSET=utf8;
+INSERT INTO t1 VALUES('„MySQL“');
+SELECT a FROM t1 WHERE MATCH a AGAINST('“MySQL„' IN BOOLEAN MODE);
+a
+„MySQL“
+DROP TABLE t1;
+SET NAMES latin1;
--- 1.14/mysql-test/t/fulltext2.test 2006-05-12 21:26:46 +05:00
+++ 1.15/mysql-test/t/fulltext2.test 2006-05-29 16:45:08 +05:00
@@ -221,3 +221,13 @@
set names latin1;
# End of 4.1 tests
+
+#
+# BUG#19580 - FULLTEXT search produces wrong results on UTF-8 columns
+#
+SET NAMES utf8;
+CREATE TABLE t1(a VARCHAR(255), FULLTEXT(a)) ENGINE=MyISAM DEFAULT CHARSET=utf8;
+INSERT INTO t1 VALUES('„MySQL“');
+SELECT a FROM t1 WHERE MATCH a AGAINST('“MySQL„' IN BOOLEAN MODE);
+DROP TABLE t1;
+SET NAMES latin1;
| Thread |
|---|
| • bk commit into 5.1 tree (svoj:1.2187) BUG#19580 | Sergey Vojtovich | 29 May |