List:Internals« Previous MessageNext Message »
From:konstantin Date:October 27 2005 12:25am
Subject:bk commit into 5.0-fulltext tree (konstantin:1.2019)
View as plain text  
Below is the list of changes that have just been committed into a local
5.0-fulltext repository of kostja. When kostja does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet
  1.2019 05/10/27 02:25:02 konstantin@stripped +3 -0
  More comments for the fulltext parser plugin and plugin framework.
  Look up UDFs in plugin_dir, where all plugins should now be located.

  sql/sql_udf.cc
    1.56 05/10/27 02:24:46 konstantin@stripped +10 -5
    - make sure the server uses plugin_dir to locate UDFs as well as plugins

  plugin/fulltext/cnet_parser.c
    1.9 05/10/27 02:24:46 konstantin@stripped +129 -8
    - add first comments
    - coding style
    - change min_word_len semantics to reflect its name

  include/plugin.h
    1.9 05/10/27 02:24:46 konstantin@stripped +41 -0
    Add some preliminary comments.

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User:	konstantin
# Host:	dragonfly.local
# Root:	/opt/local/work/mysql-5.0-cnet

--- 1.55/sql/sql_udf.cc	2005-10-14 16:40:15 +04:00
+++ 1.56/sql/sql_udf.cc	2005-10-27 02:24:46 +04:00
@@ -190,10 +190,13 @@
     void *dl = find_udf_dl(tmp->dl);
     if (dl == NULL)
     {
-      if (!(dl = dlopen(tmp->dl, RTLD_NOW)))
+      char dlpath[FN_REFLEN];
+      strxnmov(dlpath, sizeof(dlpath) - 1, opt_plugin_dir, "/", tmp->dl,
+               NullS);
+      if (!(dl= dlopen(dlpath, RTLD_NOW)))
       {
 	/* Print warning to log */
-	sql_print_error(ER(ER_CANT_OPEN_LIBRARY), tmp->dl,errno,dlerror());
+	sql_print_error(ER(ER_CANT_OPEN_LIBRARY), dlpath, errno, dlerror());
 	/* Keep the udf in the hash so that we can remove it later */
 	continue;
       }
@@ -409,12 +412,14 @@
   }
   if (!(dl = find_udf_dl(udf->dl)))
   {
-    if (!(dl = dlopen(udf->dl, RTLD_NOW)))
+    char dlpath[FN_REFLEN];
+    strxnmov(dlpath, sizeof(dlpath) - 1, opt_plugin_dir, "/", udf->dl, NullS);
+    if (!(dl = dlopen(dlpath, RTLD_NOW)))
     {
       DBUG_PRINT("error",("dlopen of %s failed, error: %d (%s)",
-			  udf->dl,errno,dlerror()));
+			  dlpath, errno, dlerror()));
       my_error(ER_CANT_OPEN_LIBRARY, MYF(0),
-                      udf->dl, errno, dlerror());
+                      dlpath, errno, dlerror());
       goto err;
     }
     new_dl=1;

--- 1.8/include/plugin.h	2005-09-14 15:34:28 +04:00
+++ 1.9/include/plugin.h	2005-10-27 02:24:46 +04:00
@@ -54,6 +54,15 @@
   FT_CHUNK_TYPE_STOPWORD= 4
 };
 
+/*
+  This structure is used in boolean search mode only. It conveys to
+  MySQL search engine boolean-mode metadata for every word in the
+  search query. A valid instance of this structure must be filled
+  in by the plugin parser and passed as an argument in call to
+  mysql_add_word (the function from structure MYSQL_FTPARSER_PARAM)
+  when a query is parsed in boolean mode.
+*/
+
 typedef struct st_mysql_ftparser_boolean_info
 {
   enum enum_ft_chunk_type type;
@@ -66,16 +75,48 @@
   byte *quot;
 } MYSQL_FTPARSER_BOOLEAN_INFO;
 
+
+/*
+  An argument of the fulltext parser plugin. This structure is
+  filled by MySQL server and passed to the parsing function of the
+  plugin as an in/out parameter.
+*/
+
 typedef struct st_mysql_ftparser_param
 {
+  /*
+    A fallback pointer to the built-in parser implementation
+    of the server. It's set by the server and can be used
+    by the parser plugin to invoke the default parser, e.g.
+    in case when the role of the plugin is just to extract
+    textual data from .doc, .pdf or .xml.
+  */
   int (*mysql_parse)(void *param, byte *doc, uint doc_len);
+  /*
+    A server callback to add a new word.
+    When parsing a document, the server sets this to point at
+    a function that adds the word to MySQL fulltext index.
+    When parsing a search query, this function will
+    add the new word to the list of words to search for.
+    boolean_info can be NULL for all cases except the search in
+    boolean mode.
+  */
   int (*mysql_add_word)(void *param, byte *word, uint word_len,
                         MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info);
+  /* A pointer to the parser local state. This is an inout parameter. */
   void *ftparser_state;
   void *mysql_ftparam;
+  /* Character set of the document or the query */
   CHARSET_INFO *cs;
+  /* A pointer to the document or the query to be parsed */
   byte *doc;
+  /* Document/query length */
   uint length;
+  /*
+    Parsing mode: with boolean operators, in natural language mode,
+    parsing a document or a query. A combination of MYSQL_FTPARSER_*
+    flags defined earlier in this file.
+  */
   int flags;
 } MYSQL_FTPARSER_PARAM;
 

--- 1.8/plugin/fulltext/cnet_parser.c	2005-10-25 22:05:27 +04:00
+++ 1.9/plugin/fulltext/cnet_parser.c	2005-10-27 02:24:46 +04:00
@@ -18,23 +18,56 @@
 #include <m_ctype.h>
 #include <plugin.h>
 
+/*
+  Define to a path to the always-index words file. Every line in this
+  file is treated as an always-index word. If the path is not
+  absolute, it is relative to the current working directory of the
+  server (the value of --datadir)
+*/
 
 #define CNET_WORD_CASES_PATH "ordinary-dict.txt"
 
+/*
+  A buffer where all the words from the always-index words
+  dictionary are allocated.
+*/
 static char *words_buf;
+/*
+  The always-index words dictionary represented as an array of
+  char pointers.
+*/
 static const char **words;
+/* The number of always-index words. */
 static size_t nwords;
+/*
+  If the word is not an always-index word and is shorter than
+  min_word_len, it's skipped during parsing and, consequently,
+  not added to the index.
+*/
 static int min_word_len= 3;
-static char *synonyms[][2]=
+/*
+  The synonyms dictionary: the parser conveys information
+  about a synonym match to the indexing engine.
+*/
+static const char *synonyms[][2]=
 {
-  { (char *)"Wi-Fi", (char *)"wifi" },
-  { (char *)"C++", (char *)"cplusplus" },
-  { (char *)"E-mail", (char *)"email" },
-  { (char *)"Mozilla", (char *)"browser" },
+  { "Wi-Fi", "wifi" },
+  { "C++", "cplusplus" },
+  { "E-mail", "email" },
+  { "Mozilla", "browser" },
   { 0, 0 }
 };
 
-static int insert_word (const char *word)
+
+/*
+  Append a word to the always-index words dictionary.
+
+  SYNOPSIS
+    insert_word()
+      word               pointer to the null-terminated word
+*/
+
+static int insert_word(const char *word)
 {
   const char **tmp= realloc(words, sizeof(char *) * nwords + 1);
   if (! tmp)
@@ -45,6 +78,20 @@
   return(1);
 }
 
+/*
+  Initialize the parser plugin at server start or plugin installation.
+
+  SYNOPSIS
+    cnet_parser_init_once()
+
+  DESCRIPTION
+    Attempt to open and load the always-index words dictionary.
+
+  RETURN VALUE
+    0                    success
+    1                    the file with always-index words was not
+                         found, can't be opened, or out of memory
+*/
 
 static int cnet_parser_init_once(void)
 {
@@ -67,7 +114,8 @@
   {
     if (*end == '\n')
     {
-      if (end - start > 1) insert_word(start);
+      if (end - start > 1)
+        insert_word(start);
       *end= 0;
       end++;
       start= end;
@@ -91,6 +139,20 @@
 }
 
 
+/*
+  Free the memory and resources of the parser plugin at server shutdown
+  or plugin deinstallation.
+
+  SYNOPSIS
+    cnet_parser_deinit_once()
+
+  DESCRIPTION
+    Frees the always-index words dictionary.
+
+  RETURN VALUE
+    0                    never fails
+*/
+
 static int cnet_parser_deinit_once(void)
 {
   free(words_buf);
@@ -99,6 +161,25 @@
 }
 
 
+/*
+  Look up a match to a word from the always-index words dictionary.
+
+  SYNOPSIS
+    check_in_array()
+      start              start of the word
+      bytes_left         maximum possible length of the match
+
+  DESCRIPTION
+    This function is used by the parser to check if there is
+    a match of [a part of] the tail of the parsed document to one
+    of the always-index words.
+    It doesn't look for a match with maximum possible length,
+    instead it returns when the first match is found.
+
+  RETURN VALUE
+    0 if no match is found, otherwise the length of the match.
+*/
+
 static size_t check_in_array(const char *start, size_t bytes_left)
 {
   size_t i;
@@ -111,6 +192,23 @@
   return(0);
 }
 
+/*
+  Add a word to the fulltext index.
+
+  SYNOPSIS
+    add_word()
+      param              parsing context of the plugin
+      word               a word
+      length             word length
+
+  DESCRIPTION
+    Fill in boolean metadata of the word and add the word to MySQL
+    fulltext index; strictly speaking we don't have to do that for
+    anything but search in boolean mode, but it's easier to always
+    fill in the structure.
+    In case of a search in boolean mode, also look up synonyms of
+    the given word and add them to the search query.
+*/
 
 static void add_word(MYSQL_FTPARSER_PARAM *param, char *word,
                      size_t length)
@@ -126,6 +224,12 @@
       if (! strncmp(word, synonyms[i][0], length))
       {
         boolean_info.weight_adjust= -1;
+        /*
+          Inform the search engine that this is a synonym and not
+          a compulsory word: if this word is not present, don't
+          filter out the document. Documents with synonyms
+          will simply get higher relevance score.
+        */
         boolean_info.yesno= 0;
         param->mysql_add_word(param->mysql_ftparam, synonyms[i][1],
             strlen(synonyms[i][1]), &boolean_info);
@@ -142,6 +246,23 @@
 }
 
 
+/*
+  Parse a document or a search query.
+
+  SYNOPSIS
+    cnet_parser_parse()
+      param              parsing context
+
+  DESCRIPTION
+    This is the main plugin function which is called to parse
+    a document or a search query. The call mode is set in
+    param->flags.
+    When parsing a document, it simply splits the text into words
+    and passes every word to MySQL fulltext indexing engine.
+    When parsing a query, and if in boolean mode, it will also
+    look up synonyms and add them to the search query.
+*/
+
 int cnet_parser_parse(MYSQL_FTPARSER_PARAM *param)
 {
  char *end, *start, *docend= param->doc + param->length;
@@ -165,7 +286,7 @@
     }
     else if (! isalnum(*end))
     {
-      if (end - start > min_word_len)
+      if (end - start >= min_word_len)
         add_word(param, start, end - start);
       start= end + 1;
     }
Thread
bk commit into 5.0-fulltext tree (konstantin:1.2019)konstantin27 Oct