#At file:///home/bar/mysql-bzr/mysql-6.0-b32391/
2823 Alexander Barkov 2008-12-25
Bug#32391 Character sets: crash with --character-set-server
Problem:
Crashed on initialization of the built-in stopwords
when started with --character-set-server=utf16.
ft_init_stopwords() mistakenly compared the built-in
stopwords using "utf16" as character set, which led
to exit on DBUG_ASSERT((slen % 2) == 0) in
my_strnncollsp_utf16() when comparing a word with an odd length
(for example word="about", len=5).
Fix:
- using latin1 when initializing the built-in stopwords.
- adding conversion from latin1 to "real" multi-byte character sets
(UCS2, UTF16, UTF32) when searching stopwords.
- additional fix: stopwords are now searched according
to the column collation, that is when the built-in
stopwords are in use, the word "ABOUT" is treated as
a stopword only in a case insensitive collation.
Changeset:
- New files added to test --character-set-server=utf16:
mysql-test/r/ctype_utf16_def.result
mysql-test/t/ctype_utf16_def-master.opt
mysql-test/t/ctype_utf16_def.test
- Moving the character set conversion function
from /sql to /strings. Adding the function prototype:
include/m_ctype.h
strings/ctype.c
- Adding tests for stopword case sensitivity:
mysql-test/r/fulltext.result
mysql-test/t/fulltext.test
- Moving most of the conversion code to /strings:
sql/sql_string.cc
- The main fix: splitting code into more separate functions,
loading stopwords into two trees (for case sensitive and
case insensitive searches), adding conversion code:
storage/myisam/ft_parser.c
storage/myisam/ft_stopwords.c
storage/myisam/ftdefs.h
removed:
sql/MSG00001.bin
added:
mysql-test/r/ctype_utf16_def.result
mysql-test/t/ctype_utf16_def-master.opt
mysql-test/t/ctype_utf16_def.test
modified:
include/m_ctype.h
mysql-test/r/fulltext.result
mysql-test/t/fulltext.test
sql/sql_string.cc
storage/myisam/ft_parser.c
storage/myisam/ft_stopwords.c
storage/myisam/ftdefs.h
strings/ctype.c
=== modified file 'include/m_ctype.h'
--- a/include/m_ctype.h 2008-12-23 14:34:03 +0000
+++ b/include/m_ctype.h 2008-12-25 10:31:47 +0000
@@ -579,6 +579,11 @@ size_t my_strxfrm_pad_desc_and_reverse(C
my_bool my_charset_is_ascii_compatible(CHARSET_INFO *cs);
+size_t
+my_charset_convert(CHARSET_INFO *to_cs, char *to, size_t to_length,
+ CHARSET_INFO *from_cs, const char *from, size_t from_length,
+ size_t *errors);
+
#define _MY_U 01 /* Upper case */
#define _MY_L 02 /* Lower case */
#define _MY_NMR 04 /* Numeral (digit) */
=== added file 'mysql-test/r/ctype_utf16_def.result'
--- a/mysql-test/r/ctype_utf16_def.result 1970-01-01 00:00:00 +0000
+++ b/mysql-test/r/ctype_utf16_def.result 2008-12-25 10:31:47 +0000
@@ -0,0 +1,46 @@
+DROP TABLE IF EXISTS t1;
+SHOW VARIABLES LIKE 'collation_server';
+Variable_name Value
+collation_server utf16_general_ci
+SHOW VARIABLES LIKE 'character_set_server';
+Variable_name Value
+character_set_server utf16
+SHOW VARIABLES LIKE 'ft%';
+Variable_name Value
+ft_boolean_syntax + -><()~*:""&|
+ft_max_word_len 84
+ft_min_word_len 4
+ft_query_expansion_limit 20
+ft_stopword_file (built-in)
+CREATE TABLE t1 (a text character set utf16 collate utf16_general_ci);
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `a` text
+) ENGINE=MyISAM DEFAULT CHARSET=utf16
+INSERT INTO t1 VALUES ('about'),('ABOUT'),('word');
+SELECT * FROM t1 WHERE MATCH a AGAINST ('about' IN BOOLEAN MODE);
+a
+SELECT * FROM t1 WHERE MATCH a AGAINST ('ABOUT' IN BOOLEAN MODE);
+a
+SELECT * FROM t1 WHERE MATCH a AGAINST ('word' IN BOOLEAN MODE);
+a
+word
+DROP TABLE t1;
+CREATE TABLE t1 (a text character set utf16 collate utf16_bin);
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `a` text CHARACTER SET utf16 COLLATE utf16_bin
+) ENGINE=MyISAM DEFAULT CHARSET=utf16
+INSERT INTO t1 VALUES ('about'),('ABOUT'),('word');
+SELECT * FROM t1 WHERE MATCH a AGAINST ('about' IN BOOLEAN MODE);
+a
+SELECT * FROM t1 WHERE MATCH a AGAINST ('ABOUT' IN BOOLEAN MODE);
+a
+ABOUT
+SELECT * FROM t1 WHERE MATCH a AGAINST ('word' IN BOOLEAN MODE);
+a
+word
+DROP TABLE t1;
+End of 6.0 tests
=== modified file 'mysql-test/r/fulltext.result'
--- a/mysql-test/r/fulltext.result 2008-12-09 10:16:06 +0000
+++ b/mysql-test/r/fulltext.result 2008-12-25 10:31:47 +0000
@@ -531,3 +531,34 @@ SELECT MATCH(a) AGAINST('aaa1* aaa14 aaa
MATCH(a) AGAINST('aaa1* aaa14 aaa15 aaa16' IN BOOLEAN MODE)
2
DROP TABLE t1;
+CREATE TABLE t1 (a text character set latin1 collate latin1_bin);
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `a` text CHARACTER SET latin1 COLLATE latin1_bin
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+INSERT INTO t1 VALUES ('about'),('ABOUT'),('word');
+SELECT * FROM t1 WHERE MATCH a AGAINST ('about' IN BOOLEAN MODE);
+a
+SELECT * FROM t1 WHERE MATCH a AGAINST ('ABOUT' IN BOOLEAN MODE);
+a
+ABOUT
+SELECT * FROM t1 WHERE MATCH a AGAINST ('word' IN BOOLEAN MODE);
+a
+word
+DROP TABLE t1;
+CREATE TABLE t1 (a text character set latin1 collate latin1_general_ci);
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `a` text CHARACTER SET latin1 COLLATE latin1_general_ci
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+INSERT INTO t1 VALUES ('about'),('ABOUT'),('word');
+SELECT * FROM t1 WHERE MATCH a AGAINST ('about' IN BOOLEAN MODE);
+a
+SELECT * FROM t1 WHERE MATCH a AGAINST ('ABOUT' IN BOOLEAN MODE);
+a
+SELECT * FROM t1 WHERE MATCH a AGAINST ('word' IN BOOLEAN MODE);
+a
+word
+DROP TABLE t1;
=== added file 'mysql-test/t/ctype_utf16_def-master.opt'
--- a/mysql-test/t/ctype_utf16_def-master.opt 1970-01-01 00:00:00 +0000
+++ b/mysql-test/t/ctype_utf16_def-master.opt 2008-12-25 10:31:47 +0000
@@ -0,0 +1 @@
+--default-collation=utf16_general_ci --default-character-set=utf16,latin1
=== added file 'mysql-test/t/ctype_utf16_def.test'
--- a/mysql-test/t/ctype_utf16_def.test 1970-01-01 00:00:00 +0000
+++ b/mysql-test/t/ctype_utf16_def.test 2008-12-25 10:31:47 +0000
@@ -0,0 +1,37 @@
+-- source include/have_utf16.inc
+
+--disable_warnings
+DROP TABLE IF EXISTS t1;
+--enable_warnings
+
+#
+# Bug #32391 Character sets: crash with --character-set-server
+#
+SHOW VARIABLES LIKE 'collation_server';
+SHOW VARIABLES LIKE 'character_set_server';
+SHOW VARIABLES LIKE 'ft%';
+
+#
+# Test that stopwords are searched according to the collation
+# That is case sensitively in a case sensitive collation
+# and case insensitively in a case insensitive collation.
+# The word "ABOUT" is treated as a stopword only in a case insensitive
+# collation.
+#
+CREATE TABLE t1 (a text character set utf16 collate utf16_general_ci);
+SHOW CREATE TABLE t1;
+INSERT INTO t1 VALUES ('about'),('ABOUT'),('word');
+SELECT * FROM t1 WHERE MATCH a AGAINST ('about' IN BOOLEAN MODE);
+SELECT * FROM t1 WHERE MATCH a AGAINST ('ABOUT' IN BOOLEAN MODE);
+SELECT * FROM t1 WHERE MATCH a AGAINST ('word' IN BOOLEAN MODE);
+DROP TABLE t1;
+
+CREATE TABLE t1 (a text character set utf16 collate utf16_bin);
+SHOW CREATE TABLE t1;
+INSERT INTO t1 VALUES ('about'),('ABOUT'),('word');
+SELECT * FROM t1 WHERE MATCH a AGAINST ('about' IN BOOLEAN MODE);
+SELECT * FROM t1 WHERE MATCH a AGAINST ('ABOUT' IN BOOLEAN MODE);
+SELECT * FROM t1 WHERE MATCH a AGAINST ('word' IN BOOLEAN MODE);
+DROP TABLE t1;
+
+--echo End of 6.0 tests
=== modified file 'mysql-test/t/fulltext.test'
--- a/mysql-test/t/fulltext.test 2008-12-09 09:27:46 +0000
+++ b/mysql-test/t/fulltext.test 2008-12-25 10:31:47 +0000
@@ -454,3 +454,25 @@ INSERT INTO t1 VALUES('aaa15');
SELECT MATCH(a) AGAINST('aaa1* aaa14 aaa16' IN BOOLEAN MODE) FROM t1;
SELECT MATCH(a) AGAINST('aaa1* aaa14 aaa15 aaa16' IN BOOLEAN MODE) FROM t1;
DROP TABLE t1;
+
+#
+# Test that stopwords are searched according to the collation
+# That is case sensitively in a case sensitive collation
+# and case insensitively in a case insensitive collation.
+# The word "ABOUT" is treated as a stopword only in a case insensitive
+# collation.
+CREATE TABLE t1 (a text character set latin1 collate latin1_bin);
+SHOW CREATE TABLE t1;
+INSERT INTO t1 VALUES ('about'),('ABOUT'),('word');
+SELECT * FROM t1 WHERE MATCH a AGAINST ('about' IN BOOLEAN MODE);
+SELECT * FROM t1 WHERE MATCH a AGAINST ('ABOUT' IN BOOLEAN MODE);
+SELECT * FROM t1 WHERE MATCH a AGAINST ('word' IN BOOLEAN MODE);
+DROP TABLE t1;
+CREATE TABLE t1 (a text character set latin1 collate latin1_general_ci);
+SHOW CREATE TABLE t1;
+INSERT INTO t1 VALUES ('about'),('ABOUT'),('word');
+SELECT * FROM t1 WHERE MATCH a AGAINST ('about' IN BOOLEAN MODE);
+SELECT * FROM t1 WHERE MATCH a AGAINST ('ABOUT' IN BOOLEAN MODE);
+SELECT * FROM t1 WHERE MATCH a AGAINST ('word' IN BOOLEAN MODE);
+DROP TABLE t1;
+
=== removed file 'sql/MSG00001.bin'
Binary files a/sql/MSG00001.bin 2008-10-23 13:28:53 +0000 and b/sql/MSG00001.bin 1970-01-01 00:00:00 +0000 differ
=== modified file 'sql/sql_string.cc'
--- a/sql/sql_string.cc 2008-07-18 13:30:53 +0000
+++ b/sql/sql_string.cc 2008-12-25 10:31:47 +0000
@@ -734,121 +734,26 @@ String *copy_if_not_alloced(String *to,S
RETURN
length of bytes copied to 'to'
+
+
+ Note, my_charset_convert() is 64-bit compatible
+ and can convert strings longer than 4Gb on a 64-bit machine.
+ This function is a 32-bit wrapper for convenience:
+ all String functions use uint32.
*/
-static uint32
-copy_and_convert_extended(char *to, uint32 to_length, CHARSET_INFO *to_cs,
- const char *from, uint32 from_length,
- CHARSET_INFO *from_cs,
- uint *errors)
-{
- int cnvres;
- my_wc_t wc;
- const uchar *from_end= (const uchar*) from+from_length;
- char *to_start= to;
- uchar *to_end= (uchar*) to+to_length;
- my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;
- my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
- uint error_count= 0;
-
- while (1)
- {
- if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from,
- from_end)) > 0)
- from+= cnvres;
- else if (cnvres == MY_CS_ILSEQ)
- {
- error_count++;
- from++;
- wc= '?';
- }
- else if (cnvres > MY_CS_TOOSMALL)
- {
- /*
- A correct multibyte sequence detected
- But it doesn't have Unicode mapping.
- */
- error_count++;
- from+= (-cnvres);
- wc= '?';
- }
- else
- break; // Not enough characters
-
-outp:
- if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0)
- to+= cnvres;
- else if (cnvres == MY_CS_ILUNI && wc != '?')
- {
- error_count++;
- wc= '?';
- goto outp;
- }
- else
- break;
- }
- *errors= error_count;
- return (uint32) (to - to_start);
-}
-
-
-/*
- Optimized for quick copying of ASCII characters in the range 0x00..0x7F.
-*/
uint32
copy_and_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
const char *from, uint32 from_length, CHARSET_INFO *from_cs,
uint *errors)
{
- /*
- If any of the character sets is not ASCII compatible,
- immediately switch to slow mb_wc->wc_mb method.
- */
- if ((to_cs->state | from_cs->state) & MY_CS_NONASCII)
- return copy_and_convert_extended(to, to_length, to_cs,
- from, from_length, from_cs, errors);
-
- uint32 length= min(to_length, from_length), length2= length;
-
-#if defined(__i386__)
- /*
- Special loop for i386, it allows to refer to a
- non-aligned memory block as UINT32, which makes
- it possible to copy four bytes at once. This
- gives about 10% performance improvement comparing
- to byte-by-byte loop.
- */
- for ( ; length >= 4; length-= 4, from+= 4, to+= 4)
- {
- if ((*(uint32*)from) & 0x80808080)
- break;
- *((uint32*) to)= *((const uint32*) from);
- }
-#endif
-
- for (; ; *to++= *from++, length--)
- {
- if (!length)
- {
- *errors= 0;
- return length2;
- }
- if (*((unsigned char*) from) > 0x7F) /* A non-ASCII character */
- {
- uint32 copied_length= length2 - length;
- to_length-= copied_length;
- from_length-= copied_length;
- return copied_length + copy_and_convert_extended(to, to_length,
- to_cs,
- from, from_length,
- from_cs,
- errors);
- }
- }
-
- DBUG_ASSERT(FALSE); // Should never get to here
- return 0; // Make compiler happy
+ size_t error_count; /* must be size_t: my_charset_convert() takes size_t *errors */
+ uint32 res= (uint32) my_charset_convert(to_cs, to, to_length,
+ from_cs, from, from_length,
+ &error_count);
+ *errors= (uint) error_count; /* narrowed to the uint type of the 'errors' out-parameter */
+ return res;
}
=== modified file 'storage/myisam/ft_parser.c'
--- a/storage/myisam/ft_parser.c 2008-07-09 07:12:43 +0000
+++ b/storage/myisam/ft_parser.c 2008-12-25 10:31:47 +0000
@@ -175,7 +175,7 @@ uchar ft_get_word(CHARSET_INFO *cs, ucha
if ((param->trunc=(doc<end && *doc == FTB_TRUNC)))
doc++;
- if (((length >= ft_min_word_len && !is_stopword((char*) word->pos,
+ if (((length >= ft_min_word_len && !is_stopword(cs, (char*) word->pos,
word->len))
|| param->trunc) && length < ft_max_word_len)
{
@@ -238,7 +238,7 @@ uchar ft_simple_get_word(CHARSET_INFO *c
if (skip_stopwords == FALSE ||
(length >= ft_min_word_len && length < ft_max_word_len &&
- !is_stopword((char*) word->pos, word->len)))
+ !is_stopword(cs, (char*) word->pos, word->len)))
{
*start= doc;
DBUG_RETURN(1);
=== modified file 'storage/myisam/ft_stopwords.c'
--- a/storage/myisam/ft_stopwords.c 2008-04-28 16:24:05 +0000
+++ b/storage/myisam/ft_stopwords.c 2008-12-25 10:31:47 +0000
@@ -24,16 +24,29 @@ typedef struct st_ft_stopwords
uint len;
} FT_STOPWORD;
-static TREE *stopwords3=NULL;
+static TREE *stopwords3_cs= NULL;
+static TREE *stopwords3_ci= NULL;
+
+static inline CHARSET_INFO*
+tree_charset(TREE *tree)
+{
+ return (CHARSET_INFO*) tree->custom_arg;
+}
+
+
+static int stopwords_are_pure_ascii_8bit= 0;
static int FT_STOPWORD_cmp(void* cmp_arg __attribute__((unused)),
- FT_STOPWORD *w1, FT_STOPWORD *w2)
+ FT_STOPWORD *w1, FT_STOPWORD *w2)
{
- return ha_compare_text(default_charset_info,
+ DBUG_ASSERT(cmp_arg != NULL);
+
+ return ha_compare_text((CHARSET_INFO*) cmp_arg,
(uchar *)w1->pos,w1->len,
(uchar *)w2->pos,w2->len,0,0);
}
+
static void FT_STOPWORD_free(FT_STOPWORD *w, TREE_FREE action,
void *arg __attribute__((unused)))
{
@@ -41,66 +54,83 @@ static void FT_STOPWORD_free(FT_STOPWORD
my_free((uchar*) w->pos, MYF(0));
}
-static int ft_add_stopword(const char *w)
+static int ft_add_stopword(TREE *tree, const FT_WORD *w)
{
FT_STOPWORD sw;
- return !w ||
- (((sw.len= (uint) strlen(sw.pos=w)) >= ft_min_word_len) &&
- (tree_insert(stopwords3, &sw, 0, stopwords3->custom_arg)==NULL));
+ sw.pos= w->pos;
+ return !w->pos ||
+ (((sw.len= (uint) w->len) >= ft_min_word_len) &&
+ (tree_insert(tree, &sw, 0, tree_charset(tree))==NULL));
}
-int ft_init_stopwords()
-{
- DBUG_ENTER("ft_init_stopwords");
- if (!stopwords3)
- {
- if (!(stopwords3=(TREE *)my_malloc(sizeof(TREE),MYF(0))))
- DBUG_RETURN(-1);
- init_tree(stopwords3,0,0,sizeof(FT_STOPWORD),(qsort_cmp2)&FT_STOPWORD_cmp,
- 0,
- (ft_stopword_file ? (tree_element_free)&FT_STOPWORD_free : 0),
- NULL);
- }
- if (ft_stopword_file)
+static int
+ft_stopwords_init_tree(TREE **tree, CHARSET_INFO *cs)
+{
+ if (!(*tree= (TREE*) my_malloc(sizeof(TREE), MYF(0))))
+ return -1;
+ init_tree(*tree,
+ 0, /* default_alloc_size*/
+ 0, /*memory_limit */
+ sizeof(FT_STOPWORD),
+ (qsort_cmp2) &FT_STOPWORD_cmp,
+ 0, /* with_delete */
+ ft_stopword_file ? (tree_element_free) &FT_STOPWORD_free : 0,
+ cs);
+ return 0;
+}
+
+
+static int
+ft_stopwords_load_file(TREE *tree, const char* stopword_file)
+{
+ File fd;
+ uint len;
+ uchar *buffer, *start, *end;
+ FT_WORD w;
+ int error= -1;
+
+ DBUG_ENTER("ft_stopwords_load_file");
+
+ if (!*stopword_file)
+ DBUG_RETURN(0);
+
+ if ((fd= my_open(stopword_file, O_RDONLY, MYF(MY_WME))) == -1)
+ DBUG_RETURN(-1);
+ len= (uint) my_seek(fd, 0L, MY_SEEK_END, MYF(0));
+ my_seek(fd, 0L, MY_SEEK_SET, MYF(0));
+ if (!(start= buffer= my_malloc(len + 1, MYF(MY_WME))))
+ goto err0;
+ len= my_read(fd, buffer, len, MYF(MY_WME));
+ end= start + len;
+ while (ft_simple_get_word(default_charset_info, &start, end, &w, TRUE))
{
- File fd;
- uint len;
- uchar *buffer, *start, *end;
- FT_WORD w;
- int error=-1;
-
- if (!*ft_stopword_file)
- DBUG_RETURN(0);
-
- if ((fd=my_open(ft_stopword_file, O_RDONLY, MYF(MY_WME))) == -1)
- DBUG_RETURN(-1);
- len=(uint)my_seek(fd, 0L, MY_SEEK_END, MYF(0));
- my_seek(fd, 0L, MY_SEEK_SET, MYF(0));
- if (!(start=buffer=my_malloc(len+1, MYF(MY_WME))))
- goto err0;
- len=my_read(fd, buffer, len, MYF(MY_WME));
- end=start+len;
- while (ft_simple_get_word(default_charset_info, &start, end, &w, TRUE))
- {
- if (ft_add_stopword(my_strndup((char*) w.pos, w.len, MYF(0))))
- goto err1;
- }
- error=0;
+ w.pos= my_memdup(w.pos, w.len, MYF(0));
+ if (ft_add_stopword(tree, &w))
+ goto err1;
+ }
+ error= 0;
err1:
- my_free(buffer, MYF(0));
+ my_free(buffer, MYF(0));
err0:
- my_close(fd, MYF(MY_WME));
- DBUG_RETURN(error);
- }
- else
+ my_close(fd, MYF(MY_WME));
+ DBUG_RETURN(error);
+}
+
+
+static int
+ft_stopwords_load_built_in(TREE *tree, const char **precompiled_stopwords)
+{
+ DBUG_ENTER("ft_stopwords_load_built_int");
{
/* compatibility mode: to be removed */
- char **sws=(char **)ft_precompiled_stopwords;
-
- for (;*sws;sws++)
+ char **sws= (char **) precompiled_stopwords;
+ for ( ; *sws; sws++)
{
- if (ft_add_stopword(*sws))
+ FT_WORD w;
+ w.pos= *sws;
+ w.len= strlen(w.pos);
+ if (ft_add_stopword(tree, &w))
DBUG_RETURN(-1);
}
ft_stopword_file="(built-in)"; /* for SHOW VARIABLES */
@@ -109,12 +139,79 @@ err0:
}
-int is_stopword(char *word, uint len)
+int ft_init_stopwords()
+{
+ int rc;
+
+ DBUG_ENTER("ft_init_stopwords");
+
+ if ((rc= ft_stopwords_init_tree(&stopwords3_ci,
+ ft_stopword_file ?
+ default_charset_info :
+ &my_charset_latin1)) ||
+ (rc= ft_stopwords_init_tree(&stopwords3_cs, &my_charset_bin)))
+ goto ret;
+
+ if (ft_stopword_file)
+ {
+ rc= ft_stopwords_load_file(stopwords3_ci, ft_stopword_file) ||
+ ft_stopwords_load_file(stopwords3_cs, ft_stopword_file);
+ }
+ else
+ {
+ /* compatibility mode: to be removed */
+ stopwords_are_pure_ascii_8bit= 1;
+ rc= ft_stopwords_load_built_in(stopwords3_ci, ft_precompiled_stopwords) ||
+ ft_stopwords_load_built_in(stopwords3_cs, ft_precompiled_stopwords);
+ ft_stopword_file="(built-in)"; /* for SHOW VARIABLES */
+ }
+ret:
+ DBUG_RETURN(rc);
+}
+
+
+int is_stopword(CHARSET_INFO *cs, char *word, uint len)
{
FT_STOPWORD sw;
- sw.pos=word;
- sw.len=len;
- return tree_search(stopwords3,&sw, stopwords3->custom_arg) != NULL;
+ char buf[HA_FT_MAXBYTELEN];
+ size_t errors;
+ TREE *tree= (cs->state & (MY_CS_BINSORT|MY_CS_CSSORT)) ?
+ stopwords3_cs : stopwords3_ci;
+
+ if ((cs->mbminlen == 1 && stopwords_are_pure_ascii_8bit) ||
+ my_charset_same(cs, tree_charset(stopwords3_ci)))
+ {
+ /*
+ Both character sets are single byte and all stopwords are pure ASCII,
+ or the character sets are the same.
+ Conversion is not required.
+ */
+ sw.pos= word;
+ sw.len= len;
+ }
+ else /* Conversion required */
+ {
+ uint cnvlen= (uint) my_charset_convert((CHARSET_INFO*)
+ stopwords3_ci->custom_arg,
+ buf, sizeof(buf),
+ cs, word, (size_t) len,
+ &errors);
+ sw.pos= buf;
+ sw.len= cnvlen;
+ }
+ return tree_search(tree, &sw, tree->custom_arg) != NULL;
+}
+
+
+static void
+ft_stopwords_tree_free(TREE **tree)
+{
+ if (*tree)
+ {
+ delete_tree(*tree); /* purecov: inspected */
+ my_free((char*) *tree, MYF(0));
+ *tree= 0;
+ }
}
@@ -122,12 +219,9 @@ void ft_free_stopwords()
{
DBUG_ENTER("ft_free_stopwords");
- if (stopwords3)
- {
- delete_tree(stopwords3); /* purecov: inspected */
- my_free((char*) stopwords3,MYF(0));
- stopwords3=0;
- }
+ ft_stopwords_tree_free(&stopwords3_ci);
+ ft_stopwords_tree_free(&stopwords3_cs);
+
ft_stopword_file= 0;
DBUG_VOID_RETURN;
}
=== modified file 'storage/myisam/ftdefs.h'
--- a/storage/myisam/ftdefs.h 2008-04-09 05:41:40 +0000
+++ b/storage/myisam/ftdefs.h 2008-12-25 10:31:47 +0000
@@ -101,7 +101,7 @@ typedef struct st_ft_word {
double weight;
} FT_WORD;
-int is_stopword(char *word, uint len);
+int is_stopword(CHARSET_INFO *cs, char *word, uint len);
uint _ft_make_key(MI_INFO *, uint , uchar *, FT_WORD *, my_off_t);
=== modified file 'strings/ctype.c'
--- a/strings/ctype.c 2008-12-23 14:34:03 +0000
+++ b/strings/ctype.c 2008-12-25 10:31:47 +0000
@@ -428,3 +428,137 @@ my_charset_is_ascii_compatible(CHARSET_I
}
return 1;
}
+
+
+/*
+ Convert a string between character sets
+
+ SYNOPSIS
+ copy_and_convert()
+ to Store result here
+ to_cs Character set of result string
+ from Copy from here
+ from_length Length of from string
+ from_cs From character set
+
+ NOTES
+ 'to' must be at least as big as from_length * to_cs->mbmaxlen
+
+ RETURN
+ length of bytes copied to 'to'
+*/
+
+
+static size_t
+copy_and_convert_extended(CHARSET_INFO *to_cs,
+ char *to, size_t to_length,
+ CHARSET_INFO *from_cs,
+ const char *from, size_t from_length,
+ size_t *errors)
+{
+ int cnvres;
+ my_wc_t wc;
+ const uchar *from_end= (const uchar*) from+from_length;
+ char *to_start= to;
+ uchar *to_end= (uchar*) to + to_length;
+ my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;
+ my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
+ size_t error_count= 0;
+
+ while (1) /* one character per iteration: decode with mb_wc, encode with wc_mb */
+ {
+ if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, from_end)) > 0)
+ from+= cnvres;
+ else if (cnvres == MY_CS_ILSEQ) /* count an error, skip one byte, substitute '?' */
+ {
+ error_count++;
+ from++;
+ wc= '?';
+ }
+ else if (cnvres > MY_CS_TOOSMALL)
+ {
+ /*
+ A correct multibyte sequence detected
+ But it doesn't have Unicode mapping.
+ */
+ error_count++;
+ from+= (-cnvres);
+ wc= '?';
+ }
+ else
+ break; /* Not enough characters */
+
+outp:
+ if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0)
+ to+= cnvres;
+ else if (cnvres == MY_CS_ILUNI && wc != '?') /* count an error and retry once with '?' */
+ {
+ error_count++;
+ wc= '?';
+ goto outp;
+ }
+ else
+ break; /* any other wc_mb result ends the conversion */
+ }
+ *errors= error_count;
+ return (size_t) (to - to_start); /* number of bytes written to 'to' */
+}
+
+
+/*
+ Convert 'from' (in from_cs) into 'to' (in to_cs); bytes in the range 0x00..0x7F are copied as-is on a fast path.
+*/
+size_t
+my_charset_convert(CHARSET_INFO *to_cs, char *to, size_t to_length,
+ CHARSET_INFO *from_cs, const char *from, size_t from_length,
+ size_t *errors)
+{
+ /*
+ If any of the character sets is not ASCII compatible,
+ immediately switch to slow mb_wc->wc_mb method.
+ */
+ if ((to_cs->state | from_cs->state) & MY_CS_NONASCII)
+ return copy_and_convert_extended(to_cs, to, to_length,
+ from_cs, from, from_length, errors);
+
+ size_t length= min(to_length, from_length), length2= length;
+
+#if defined(__i386__)
+ /*
+ Special loop for i386, it allows to refer to a
+ non-aligned memory block as UINT32, which makes
+ it possible to copy four bytes at once. This
+ gives about 10% performance improvement comparing
+ to byte-by-byte loop.
+ */
+ for ( ; length >= 4; length-= 4, from+= 4, to+= 4)
+ {
+ if ((*(uint32*)from) & 0x80808080) /* some byte of the four has its high bit set */
+ break;
+ *((uint32*) to)= *((const uint32*) from);
+ }
+#endif
+
+ for (; ; *to++= *from++, length--)
+ {
+ if (!length) /* all min(to_length, from_length) bytes were <= 0x7F and copied */
+ {
+ *errors= 0;
+ return length2;
+ }
+ if (*((unsigned char*) from) > 0x7F) /* A non-ASCII character */
+ {
+ size_t copied_length= length2 - length;
+ to_length-= copied_length;
+ from_length-= copied_length;
+ return copied_length + copy_and_convert_extended(to_cs,
+ to, to_length,
+ from_cs,
+ from, from_length,
+ errors);
+ }
+ }
+
+ DBUG_ASSERT(FALSE); /* Should never get to here */
+ return 0; /* Make compiler happy */
+}