List:Commits« Previous MessageNext Message »
From:Alexander Barkov Date:December 25 2008 10:38am
Subject:bzr commit into mysql-6.0-bugteam branch (bar:2823) Bug#32391
View as plain text  
#At file:///home/bar/mysql-bzr/mysql-6.0-b32391/

 2823 Alexander Barkov	2008-12-25
      Bug#32391 Character sets: crash with --character-set-server
      
      Problem:
      The server crashed on initialization of the built-in stopwords
      when started with --character-set-server=utf16.
      ft_init_stopwords() mistakenly compared the built-in
      stopwords using "utf16" as the character set, which led
      to an abort on DBUG_ASSERT((slen % 2) == 0) in
      my_strnncollsp_utf16() when comparing a word with an odd length
      (for example word="about", len=5).
      
      Fix:
      - using latin1 when initializing the built-in stopwords.
      - adding conversion from latin1 to "real" multi-byte character sets
        (UCS2, UTF16, UTF32) when searching stopwords.
      - additional fix: stopwords are now searched according
        to the column collation, that is when the built-in
        stopwords are in use, the word "ABOUT" is treated as
        a stopword only in a case insensitive collation.
      
      Changeset:
      
      - New files added to test --character-set-server=utf16:
      
        mysql-test/r/ctype_utf16_def.result
        mysql-test/t/ctype_utf16_def-master.opt
        mysql-test/t/ctype_utf16_def.test
      
      - Moving the character set conversion function
        from /sql to /strings. Adding the function prototype:
      
        include/m_ctype.h
        strings/ctype.c
      
      -  Adding tests for stopword case sensitivity:
      
        mysql-test/r/fulltext.result
        mysql-test/t/fulltext.test
      
      
      - Moving most of the conversion code to /strings:
        sql/sql_string.cc
      
      - The main fix: splitting code into more separate functions,
        loading stopwords into two trees (for case sensitive and 
        case insensitive searches), adding conversion code:
        
        storage/myisam/ft_parser.c
        storage/myisam/ft_stopwords.c
        storage/myisam/ftdefs.h
removed:
  sql/MSG00001.bin
added:
  mysql-test/r/ctype_utf16_def.result
  mysql-test/t/ctype_utf16_def-master.opt
  mysql-test/t/ctype_utf16_def.test
modified:
  include/m_ctype.h
  mysql-test/r/fulltext.result
  mysql-test/t/fulltext.test
  sql/sql_string.cc
  storage/myisam/ft_parser.c
  storage/myisam/ft_stopwords.c
  storage/myisam/ftdefs.h
  strings/ctype.c

=== modified file 'include/m_ctype.h'
--- a/include/m_ctype.h	2008-12-23 14:34:03 +0000
+++ b/include/m_ctype.h	2008-12-25 10:31:47 +0000
@@ -579,6 +579,11 @@ size_t my_strxfrm_pad_desc_and_reverse(C
 
 my_bool my_charset_is_ascii_compatible(CHARSET_INFO *cs);
 
+size_t
+my_charset_convert(CHARSET_INFO *to_cs, char *to, size_t to_length,
+                   CHARSET_INFO *from_cs, const char *from, size_t from_length,
+                   size_t *errors);
+
 #define	_MY_U	01	/* Upper case */
 #define	_MY_L	02	/* Lower case */
 #define	_MY_NMR	04	/* Numeral (digit) */

=== added file 'mysql-test/r/ctype_utf16_def.result'
--- a/mysql-test/r/ctype_utf16_def.result	1970-01-01 00:00:00 +0000
+++ b/mysql-test/r/ctype_utf16_def.result	2008-12-25 10:31:47 +0000
@@ -0,0 +1,46 @@
+DROP TABLE IF EXISTS t1;
+SHOW VARIABLES LIKE 'collation_server';
+Variable_name	Value
+collation_server	utf16_general_ci
+SHOW VARIABLES LIKE 'character_set_server';
+Variable_name	Value
+character_set_server	utf16
+SHOW VARIABLES LIKE 'ft%';
+Variable_name	Value
+ft_boolean_syntax	+ -><()~*:""&|
+ft_max_word_len	84
+ft_min_word_len	4
+ft_query_expansion_limit	20
+ft_stopword_file	(built-in)
+CREATE TABLE t1 (a text character set utf16 collate utf16_general_ci);
+SHOW CREATE TABLE t1;
+Table	Create Table
+t1	CREATE TABLE `t1` (
+  `a` text
+) ENGINE=MyISAM DEFAULT CHARSET=utf16
+INSERT INTO t1 VALUES ('about'),('ABOUT'),('word');
+SELECT * FROM t1 WHERE MATCH a AGAINST ('about' IN BOOLEAN MODE);
+a
+SELECT * FROM t1 WHERE MATCH a AGAINST ('ABOUT' IN BOOLEAN MODE);
+a
+SELECT * FROM t1 WHERE MATCH a AGAINST ('word' IN BOOLEAN MODE);
+a
+word
+DROP TABLE t1;
+CREATE TABLE t1 (a text character set utf16 collate utf16_bin);
+SHOW CREATE TABLE t1;
+Table	Create Table
+t1	CREATE TABLE `t1` (
+  `a` text CHARACTER SET utf16 COLLATE utf16_bin
+) ENGINE=MyISAM DEFAULT CHARSET=utf16
+INSERT INTO t1 VALUES ('about'),('ABOUT'),('word');
+SELECT * FROM t1 WHERE MATCH a AGAINST ('about' IN BOOLEAN MODE);
+a
+SELECT * FROM t1 WHERE MATCH a AGAINST ('ABOUT' IN BOOLEAN MODE);
+a
+ABOUT
+SELECT * FROM t1 WHERE MATCH a AGAINST ('word' IN BOOLEAN MODE);
+a
+word
+DROP TABLE t1;
+End of 6.0 tests

=== modified file 'mysql-test/r/fulltext.result'
--- a/mysql-test/r/fulltext.result	2008-12-09 10:16:06 +0000
+++ b/mysql-test/r/fulltext.result	2008-12-25 10:31:47 +0000
@@ -531,3 +531,34 @@ SELECT MATCH(a) AGAINST('aaa1* aaa14 aaa
 MATCH(a) AGAINST('aaa1* aaa14 aaa15 aaa16' IN BOOLEAN MODE)
 2
 DROP TABLE t1;
+CREATE TABLE t1 (a text character set latin1 collate latin1_bin);
+SHOW CREATE TABLE t1;
+Table	Create Table
+t1	CREATE TABLE `t1` (
+  `a` text CHARACTER SET latin1 COLLATE latin1_bin
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+INSERT INTO t1 VALUES ('about'),('ABOUT'),('word');
+SELECT * FROM t1 WHERE MATCH a AGAINST ('about' IN BOOLEAN MODE);
+a
+SELECT * FROM t1 WHERE MATCH a AGAINST ('ABOUT' IN BOOLEAN MODE);
+a
+ABOUT
+SELECT * FROM t1 WHERE MATCH a AGAINST ('word' IN BOOLEAN MODE);
+a
+word
+DROP TABLE t1;
+CREATE TABLE t1 (a text character set latin1 collate latin1_general_ci);
+SHOW CREATE TABLE t1;
+Table	Create Table
+t1	CREATE TABLE `t1` (
+  `a` text CHARACTER SET latin1 COLLATE latin1_general_ci
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+INSERT INTO t1 VALUES ('about'),('ABOUT'),('word');
+SELECT * FROM t1 WHERE MATCH a AGAINST ('about' IN BOOLEAN MODE);
+a
+SELECT * FROM t1 WHERE MATCH a AGAINST ('ABOUT' IN BOOLEAN MODE);
+a
+SELECT * FROM t1 WHERE MATCH a AGAINST ('word' IN BOOLEAN MODE);
+a
+word
+DROP TABLE t1;

=== added file 'mysql-test/t/ctype_utf16_def-master.opt'
--- a/mysql-test/t/ctype_utf16_def-master.opt	1970-01-01 00:00:00 +0000
+++ b/mysql-test/t/ctype_utf16_def-master.opt	2008-12-25 10:31:47 +0000
@@ -0,0 +1 @@
+--default-collation=utf16_general_ci --default-character-set=utf16,latin1

=== added file 'mysql-test/t/ctype_utf16_def.test'
--- a/mysql-test/t/ctype_utf16_def.test	1970-01-01 00:00:00 +0000
+++ b/mysql-test/t/ctype_utf16_def.test	2008-12-25 10:31:47 +0000
@@ -0,0 +1,37 @@
+-- source include/have_utf16.inc
+
+--disable_warnings
+DROP TABLE IF EXISTS t1;
+--enable_warnings
+
+#
+# Bug #32391  	Character sets: crash with --character-set-server
+# 
+SHOW VARIABLES LIKE 'collation_server';
+SHOW VARIABLES LIKE 'character_set_server';
+SHOW VARIABLES LIKE 'ft%';
+
+#
+# Test that stopwords are searched according to the collation,
+# that is, case sensitively in a case sensitive collation
+# and case insensitively in a case insensitive collation.
+# The word "ABOUT" is treated as a stopword only in a case insensitive
+# collation.
+#
+CREATE TABLE t1 (a text character set utf16 collate utf16_general_ci);
+SHOW CREATE TABLE t1;
+INSERT INTO t1 VALUES ('about'),('ABOUT'),('word');
+SELECT * FROM t1 WHERE MATCH a AGAINST ('about' IN BOOLEAN MODE);
+SELECT * FROM t1 WHERE MATCH a AGAINST ('ABOUT' IN BOOLEAN MODE);
+SELECT * FROM t1 WHERE MATCH a AGAINST ('word' IN BOOLEAN MODE);
+DROP TABLE t1;
+
+CREATE TABLE t1 (a text character set utf16 collate utf16_bin);
+SHOW CREATE TABLE t1;
+INSERT INTO t1 VALUES ('about'),('ABOUT'),('word');
+SELECT * FROM t1 WHERE MATCH a AGAINST ('about' IN BOOLEAN MODE);
+SELECT * FROM t1 WHERE MATCH a AGAINST ('ABOUT' IN BOOLEAN MODE);
+SELECT * FROM t1 WHERE MATCH a AGAINST ('word' IN BOOLEAN MODE);
+DROP TABLE t1;
+
+--echo End of 6.0 tests

=== modified file 'mysql-test/t/fulltext.test'
--- a/mysql-test/t/fulltext.test	2008-12-09 09:27:46 +0000
+++ b/mysql-test/t/fulltext.test	2008-12-25 10:31:47 +0000
@@ -454,3 +454,25 @@ INSERT INTO t1 VALUES('aaa15');
 SELECT MATCH(a) AGAINST('aaa1* aaa14 aaa16' IN BOOLEAN MODE) FROM t1;
 SELECT MATCH(a) AGAINST('aaa1* aaa14 aaa15 aaa16' IN BOOLEAN MODE) FROM t1;
 DROP TABLE t1;
+
+#
+# Test that stopwords are searched according to the collation,
+# that is, case sensitively in a case sensitive collation
+# and case insensitively in a case insensitive collation.
+# The word "ABOUT" is treated as a stopword only in a case insensitive
+# collation.
+CREATE TABLE t1 (a text character set latin1 collate latin1_bin);
+SHOW CREATE TABLE t1;
+INSERT INTO t1 VALUES ('about'),('ABOUT'),('word');
+SELECT * FROM t1 WHERE MATCH a AGAINST ('about' IN BOOLEAN MODE);
+SELECT * FROM t1 WHERE MATCH a AGAINST ('ABOUT' IN BOOLEAN MODE);
+SELECT * FROM t1 WHERE MATCH a AGAINST ('word' IN BOOLEAN MODE);
+DROP TABLE t1;
+CREATE TABLE t1 (a text character set latin1 collate latin1_general_ci);
+SHOW CREATE TABLE t1;
+INSERT INTO t1 VALUES ('about'),('ABOUT'),('word');
+SELECT * FROM t1 WHERE MATCH a AGAINST ('about' IN BOOLEAN MODE);
+SELECT * FROM t1 WHERE MATCH a AGAINST ('ABOUT' IN BOOLEAN MODE);
+SELECT * FROM t1 WHERE MATCH a AGAINST ('word' IN BOOLEAN MODE);
+DROP TABLE t1;
+

=== removed file 'sql/MSG00001.bin'
Binary files a/sql/MSG00001.bin	2008-10-23 13:28:53 +0000 and b/sql/MSG00001.bin	1970-01-01 00:00:00 +0000 differ

=== modified file 'sql/sql_string.cc'
--- a/sql/sql_string.cc	2008-07-18 13:30:53 +0000
+++ b/sql/sql_string.cc	2008-12-25 10:31:47 +0000
@@ -734,121 +734,26 @@ String *copy_if_not_alloced(String *to,S
 
   RETURN
     length of bytes copied to 'to'
+  
+  
+  Note, my_charset_convert() is 64-bit compatible
+  and can convert strings longer than 4Gb on a 64-bit machine.
+  This function is a 32-bit wrapper for convenience:
+  all String functions use uint32.
 */
 
 
-static uint32
-copy_and_convert_extended(char *to, uint32 to_length, CHARSET_INFO *to_cs, 
-                          const char *from, uint32 from_length,
-                          CHARSET_INFO *from_cs,
-                          uint *errors)
-{
-  int         cnvres;
-  my_wc_t     wc;
-  const uchar *from_end= (const uchar*) from+from_length;
-  char *to_start= to;
-  uchar *to_end= (uchar*) to+to_length;
-  my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;
-  my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
-  uint error_count= 0;
-
-  while (1)
-  {
-    if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from,
-				      from_end)) > 0)
-      from+= cnvres;
-    else if (cnvres == MY_CS_ILSEQ)
-    {
-      error_count++;
-      from++;
-      wc= '?';
-    }
-    else if (cnvres > MY_CS_TOOSMALL)
-    {
-      /*
-        A correct multibyte sequence detected
-        But it doesn't have Unicode mapping.
-      */
-      error_count++;
-      from+= (-cnvres);
-      wc= '?';
-    }
-    else
-      break;  // Not enough characters
-
-outp:
-    if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0)
-      to+= cnvres;
-    else if (cnvres == MY_CS_ILUNI && wc != '?')
-    {
-      error_count++;
-      wc= '?';
-      goto outp;
-    }
-    else
-      break;
-  }
-  *errors= error_count;
-  return (uint32) (to - to_start);
-}
-
-
-/*
-  Optimized for quick copying of ASCII characters in the range 0x00..0x7F.
-*/
 uint32
 copy_and_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs, 
                  const char *from, uint32 from_length, CHARSET_INFO *from_cs,
                  uint *errors)
 {
-  /*
-    If any of the character sets is not ASCII compatible,
-    immediately switch to slow mb_wc->wc_mb method.
-  */
-  if ((to_cs->state | from_cs->state) & MY_CS_NONASCII)
-    return copy_and_convert_extended(to, to_length, to_cs,
-                                     from, from_length, from_cs, errors);
-
-  uint32 length= min(to_length, from_length), length2= length;
-
-#if defined(__i386__)
-  /*
-    Special loop for i386, it allows to refer to a
-    non-aligned memory block as UINT32, which makes
-    it possible to copy four bytes at once. This
-    gives about 10% performance improvement comparing
-    to byte-by-byte loop.
-  */
-  for ( ; length >= 4; length-= 4, from+= 4, to+= 4)
-  {
-    if ((*(uint32*)from) & 0x80808080)
-      break;
-    *((uint32*) to)= *((const uint32*) from);
-  }
-#endif
-
-  for (; ; *to++= *from++, length--)
-  {
-    if (!length)
-    {
-      *errors= 0;
-      return length2;
-    }
-    if (*((unsigned char*) from) > 0x7F) /* A non-ASCII character */
-    {
-      uint32 copied_length= length2 - length;
-      to_length-= copied_length;
-      from_length-= copied_length;
-      return copied_length + copy_and_convert_extended(to, to_length,
-                                                       to_cs,
-                                                       from, from_length,
-                                                       from_cs,
-                                                       errors);
-    }
-  }
-
-  DBUG_ASSERT(FALSE); // Should never get to here
-  return 0;           // Make compiler happy
+  uint32 error_count;
+  uint32 res= (uint32) my_charset_convert(to_cs, to, to_length,
+                                          from_cs, from, from_length,
+                                          &error_count);
+  *errors= (uint) error_count;
+  return res;
 }
 
 

=== modified file 'storage/myisam/ft_parser.c'
--- a/storage/myisam/ft_parser.c	2008-07-09 07:12:43 +0000
+++ b/storage/myisam/ft_parser.c	2008-12-25 10:31:47 +0000
@@ -175,7 +175,7 @@ uchar ft_get_word(CHARSET_INFO *cs, ucha
     if ((param->trunc=(doc<end && *doc == FTB_TRUNC)))
       doc++;
 
-    if (((length >= ft_min_word_len && !is_stopword((char*) word->pos,
+    if (((length >= ft_min_word_len && !is_stopword(cs, (char*) word->pos,
                                                     word->len))
          || param->trunc) && length < ft_max_word_len)
     {
@@ -238,7 +238,7 @@ uchar ft_simple_get_word(CHARSET_INFO *c
 
     if (skip_stopwords == FALSE ||
         (length >= ft_min_word_len && length < ft_max_word_len &&
-         !is_stopword((char*) word->pos, word->len)))
+         !is_stopword(cs, (char*) word->pos, word->len)))
     {
       *start= doc;
       DBUG_RETURN(1);

=== modified file 'storage/myisam/ft_stopwords.c'
--- a/storage/myisam/ft_stopwords.c	2008-04-28 16:24:05 +0000
+++ b/storage/myisam/ft_stopwords.c	2008-12-25 10:31:47 +0000
@@ -24,16 +24,29 @@ typedef struct st_ft_stopwords
   uint   len;
 } FT_STOPWORD;
 
-static TREE *stopwords3=NULL;
+static TREE *stopwords3_cs= NULL;
+static TREE *stopwords3_ci= NULL;
+
+static inline CHARSET_INFO*
+tree_charset(TREE *tree)
+{
+  return (CHARSET_INFO*) tree->custom_arg;
+}
+
+
+static int stopwords_are_pure_ascii_8bit= 0;
 
 static int FT_STOPWORD_cmp(void* cmp_arg __attribute__((unused)),
-			   FT_STOPWORD *w1, FT_STOPWORD *w2)
+                           FT_STOPWORD *w1, FT_STOPWORD *w2)
 {
-  return ha_compare_text(default_charset_info,
+  DBUG_ASSERT(cmp_arg != NULL);
+  
+  return ha_compare_text((CHARSET_INFO*) cmp_arg,
 			 (uchar *)w1->pos,w1->len,
 			 (uchar *)w2->pos,w2->len,0,0);
 }
 
+
 static void FT_STOPWORD_free(FT_STOPWORD *w, TREE_FREE action,
                              void *arg __attribute__((unused)))
 {
@@ -41,66 +54,83 @@ static void FT_STOPWORD_free(FT_STOPWORD
     my_free((uchar*) w->pos, MYF(0));
 }
 
-static int ft_add_stopword(const char *w)
+static int ft_add_stopword(TREE *tree, const FT_WORD *w)
 {
   FT_STOPWORD sw;
-  return !w ||
-         (((sw.len= (uint) strlen(sw.pos=w)) >= ft_min_word_len) &&
-          (tree_insert(stopwords3, &sw, 0, stopwords3->custom_arg)==NULL));
+  sw.pos= w->pos;
+  return !w->pos ||
+         (((sw.len= (uint) w->len) >= ft_min_word_len) &&
+          (tree_insert(tree, &sw, 0, tree_charset(tree))==NULL));
 }
 
-int ft_init_stopwords()
-{
-  DBUG_ENTER("ft_init_stopwords");
-  if (!stopwords3)
-  {
-    if (!(stopwords3=(TREE *)my_malloc(sizeof(TREE),MYF(0))))
-      DBUG_RETURN(-1);
-    init_tree(stopwords3,0,0,sizeof(FT_STOPWORD),(qsort_cmp2)&FT_STOPWORD_cmp,
-              0,
-              (ft_stopword_file ? (tree_element_free)&FT_STOPWORD_free : 0),
-              NULL);
-  }
 
-  if (ft_stopword_file)
+static int
+ft_stopwords_init_tree(TREE **tree, CHARSET_INFO *cs)
+{
+  if (!(*tree= (TREE*) my_malloc(sizeof(TREE), MYF(0))))
+    return -1;
+  init_tree(*tree,
+            0, /* default_alloc_size*/
+            0, /*memory_limit */
+            sizeof(FT_STOPWORD),
+            (qsort_cmp2) &FT_STOPWORD_cmp,
+            0, /* with_delete */
+            ft_stopword_file ? (tree_element_free) &FT_STOPWORD_free : 0,
+            cs);
+  return 0;
+}
+
+
+static int
+ft_stopwords_load_file(TREE *tree, const char* stopword_file)
+{
+  File fd;
+  uint len;
+  uchar *buffer, *start, *end;
+  FT_WORD w;
+  int error= -1;
+
+  DBUG_ENTER("ft_stopwords_load_file");
+
+  if (!*stopword_file)
+    DBUG_RETURN(0);
+
+  if ((fd= my_open(stopword_file, O_RDONLY, MYF(MY_WME))) == -1)
+    DBUG_RETURN(-1);
+  len= (uint) my_seek(fd, 0L, MY_SEEK_END, MYF(0));
+  my_seek(fd, 0L, MY_SEEK_SET, MYF(0));
+  if (!(start= buffer= my_malloc(len + 1, MYF(MY_WME))))
+    goto err0;
+  len= my_read(fd, buffer, len, MYF(MY_WME));
+  end= start + len;
+  while (ft_simple_get_word(default_charset_info, &start, end, &w, TRUE))
   {
-    File fd;
-    uint len;
-    uchar *buffer, *start, *end;
-    FT_WORD w;
-    int error=-1;
-
-    if (!*ft_stopword_file)
-      DBUG_RETURN(0);
-
-    if ((fd=my_open(ft_stopword_file, O_RDONLY, MYF(MY_WME))) == -1)
-      DBUG_RETURN(-1);
-    len=(uint)my_seek(fd, 0L, MY_SEEK_END, MYF(0));
-    my_seek(fd, 0L, MY_SEEK_SET, MYF(0));
-    if (!(start=buffer=my_malloc(len+1, MYF(MY_WME))))
-      goto err0;
-    len=my_read(fd, buffer, len, MYF(MY_WME));
-    end=start+len;
-    while (ft_simple_get_word(default_charset_info, &start, end, &w, TRUE))
-    {
-      if (ft_add_stopword(my_strndup((char*) w.pos, w.len, MYF(0))))
-        goto err1;
-    }
-    error=0;
+    w.pos= my_memdup(w.pos, w.len, MYF(0));
+    if (ft_add_stopword(tree, &w))
+      goto err1;
+  }
+  error= 0;
 err1:
-    my_free(buffer, MYF(0));
+  my_free(buffer, MYF(0));
 err0:
-    my_close(fd, MYF(MY_WME));
-    DBUG_RETURN(error);
-  }
-  else
+  my_close(fd, MYF(MY_WME));
+  DBUG_RETURN(error);
+}
+
+
+static int
+ft_stopwords_load_built_in(TREE *tree, const char **precompiled_stopwords)
+{
+  DBUG_ENTER("ft_stopwords_load_built_int");
   {
     /* compatibility mode: to be removed */
-    char **sws=(char **)ft_precompiled_stopwords;
-
-    for (;*sws;sws++)
+    char **sws= (char **) precompiled_stopwords;
+    for ( ; *sws; sws++)
     {
-      if (ft_add_stopword(*sws))
+      FT_WORD w;
+      w.pos= *sws;
+      w.len= strlen(w.pos);
+      if (ft_add_stopword(tree, &w))
         DBUG_RETURN(-1);
     }
     ft_stopword_file="(built-in)"; /* for SHOW VARIABLES */
@@ -109,12 +139,79 @@ err0:
 }
 
 
-int is_stopword(char *word, uint len)
+int ft_init_stopwords()
+{
+  int rc;
+  
+  DBUG_ENTER("ft_init_stopwords");
+  
+  if ((rc= ft_stopwords_init_tree(&stopwords3_ci,
+                                  ft_stopword_file ?
+                                  default_charset_info :
+                                  &my_charset_latin1)) ||
+      (rc= ft_stopwords_init_tree(&stopwords3_cs, &my_charset_bin)))
+    goto ret;
+
+  if (ft_stopword_file)
+  {
+    rc= ft_stopwords_load_file(stopwords3_ci, ft_stopword_file) ||
+        ft_stopwords_load_file(stopwords3_cs, ft_stopword_file);
+  }
+  else
+  {
+    /* compatibility mode: to be removed */
+    stopwords_are_pure_ascii_8bit= 1;
+    rc= ft_stopwords_load_built_in(stopwords3_ci, ft_precompiled_stopwords) ||
+        ft_stopwords_load_built_in(stopwords3_cs, ft_precompiled_stopwords);
+    ft_stopword_file="(built-in)"; /* for SHOW VARIABLES */
+  }
+ret:
+  DBUG_RETURN(rc);
+}
+
+
+int is_stopword(CHARSET_INFO *cs, char *word, uint len)
 {
   FT_STOPWORD sw;
-  sw.pos=word;
-  sw.len=len;
-  return tree_search(stopwords3,&sw, stopwords3->custom_arg) != NULL;
+  char buf[HA_FT_MAXBYTELEN];
+  size_t errors;
+  TREE *tree= (cs->state & (MY_CS_BINSORT|MY_CS_CSSORT)) ?
+              stopwords3_cs : stopwords3_ci;
+
+  if ((cs->mbminlen == 1 && stopwords_are_pure_ascii_8bit) ||
+      my_charset_same(cs, tree_charset(stopwords3_ci)))
+  {
+    /*
+      Both character sets are single byte and all stopwords are pure ASCII,
+      or the character sets are the same.
+      Conversion is not required.
+    */
+    sw.pos= word;
+    sw.len= len;
+  }
+  else /* Conversion required */
+  {
+    uint cnvlen= (uint) my_charset_convert((CHARSET_INFO*)
+                                           stopwords3_ci->custom_arg,
+                                           buf, sizeof(buf),
+                                           cs, word, (size_t) len,
+                                           &errors);
+    sw.pos= buf;
+    sw.len= cnvlen;
+  }
+  return tree_search(tree, &sw, tree->custom_arg) != NULL;
+}
+
+
+static void
+ft_stopwords_tree_free(TREE **tree)
+{
+  if (*tree)
+  {
+    delete_tree(*tree); /* purecov: inspected */
+    my_free((char*) *tree, MYF(0));
+    *tree= 0;
+  }
 }
 
 
@@ -122,12 +219,9 @@ void ft_free_stopwords()
 {
   DBUG_ENTER("ft_free_stopwords");
 
-  if (stopwords3)
-  {
-    delete_tree(stopwords3); /* purecov: inspected */
-    my_free((char*) stopwords3,MYF(0));
-    stopwords3=0;
-  }
+  ft_stopwords_tree_free(&stopwords3_ci);
+  ft_stopwords_tree_free(&stopwords3_cs);
+  
   ft_stopword_file= 0;
   DBUG_VOID_RETURN;
 }

=== modified file 'storage/myisam/ftdefs.h'
--- a/storage/myisam/ftdefs.h	2008-04-09 05:41:40 +0000
+++ b/storage/myisam/ftdefs.h	2008-12-25 10:31:47 +0000
@@ -101,7 +101,7 @@ typedef struct st_ft_word {
   double weight;
 } FT_WORD;
 
-int is_stopword(char *word, uint len);
+int is_stopword(CHARSET_INFO *cs, char *word, uint len);
 
 uint _ft_make_key(MI_INFO *, uint , uchar *, FT_WORD *, my_off_t);
 

=== modified file 'strings/ctype.c'
--- a/strings/ctype.c	2008-12-23 14:34:03 +0000
+++ b/strings/ctype.c	2008-12-25 10:31:47 +0000
@@ -428,3 +428,137 @@ my_charset_is_ascii_compatible(CHARSET_I
   }
   return 1;
 }
+
+
+/*
+  Convert a string between character sets
+  
+  SYNOPSIS
+    copy_and_convert()
+    to			Store result here
+    to_cs		Character set of result string
+    from		Copy from here
+    from_length		Length of from string
+    from_cs		From character set
+
+  NOTES
+    'to' must be at least as big as from_length * to_cs->mbmaxlen
+
+  RETURN
+    length of bytes copied to 'to'
+*/
+
+
+static size_t
+copy_and_convert_extended(CHARSET_INFO *to_cs,
+                          char *to, size_t to_length, 
+                          CHARSET_INFO *from_cs,
+                          const char *from, size_t from_length,
+                          size_t *errors)
+{
+  int         cnvres;
+  my_wc_t     wc;
+  const uchar *from_end= (const uchar*) from+from_length;
+  char *to_start= to;
+  uchar *to_end= (uchar*) to + to_length;
+  my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;
+  my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
+  size_t error_count= 0;
+
+  while (1)
+  {
+    if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, from_end)) > 0)
+      from+= cnvres;
+    else if (cnvres == MY_CS_ILSEQ)
+    {
+      error_count++;
+      from++;
+      wc= '?';
+    }
+    else if (cnvres > MY_CS_TOOSMALL)
+    {
+      /*
+        A correct multibyte sequence detected
+        But it doesn't have Unicode mapping.
+      */
+      error_count++;
+      from+= (-cnvres);
+      wc= '?';
+    }
+    else
+      break;  // Not enough characters
+
+outp:
+    if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0)
+      to+= cnvres;
+    else if (cnvres == MY_CS_ILUNI && wc != '?')
+    {
+      error_count++;
+      wc= '?';
+      goto outp;
+    }
+    else
+      break;
+  }
+  *errors= error_count;
+  return (size_t) (to - to_start);
+}
+
+
+/*
+  Optimized for quick copying of ASCII characters in the range 0x00..0x7F.
+*/
+size_t
+my_charset_convert(CHARSET_INFO *to_cs, char *to, size_t to_length, 
+                   CHARSET_INFO *from_cs, const char *from, size_t from_length, 
+                   size_t *errors)
+{
+  /*
+    If any of the character sets is not ASCII compatible,
+    immediately switch to slow mb_wc->wc_mb method.
+  */
+  if ((to_cs->state | from_cs->state) & MY_CS_NONASCII)
+    return copy_and_convert_extended(to_cs, to, to_length,
+                                     from_cs, from, from_length, errors);
+
+  size_t length= min(to_length, from_length), length2= length;
+
+#if defined(__i386__)
+  /*
+    Special loop for i386, it allows to refer to a
+    non-aligned memory block as UINT32, which makes
+    it possible to copy four bytes at once. This
+    gives about 10% performance improvement comparing
+    to byte-by-byte loop.
+  */
+  for ( ; length >= 4; length-= 4, from+= 4, to+= 4)
+  {
+    if ((*(uint32*)from) & 0x80808080)
+      break;
+    *((uint32*) to)= *((const uint32*) from);
+  }
+#endif
+
+  for (; ; *to++= *from++, length--)
+  {
+    if (!length)
+    {
+      *errors= 0;
+      return length2;
+    }
+    if (*((unsigned char*) from) > 0x7F) /* A non-ASCII character */
+    {
+      size_t copied_length= length2 - length;
+      to_length-= copied_length;
+      from_length-= copied_length;
+      return copied_length + copy_and_convert_extended(to_cs,
+                                                       to, to_length,
+                                                       from_cs,
+                                                       from, from_length,
+                                                       errors);
+    }
+  }
+
+  DBUG_ASSERT(FALSE); // Should never get to here
+  return 0;           // Make compiler happy
+}

Thread
bzr commit into mysql-6.0-bugteam branch (bar:2823) Bug#32391Alexander Barkov25 Dec
  • Re: bzr commit into mysql-6.0-bugteam branch (bar:2823) Bug#32391Sergei Golubchik8 Jan