List:Internals« Previous MessageNext Message »
From:svoj Date:October 18 2005 11:51am
Subject:bk commit into 5.0 tree (svoj:1.2008)
View as plain text  
Below is the list of changes that have just been committed into a local
5.0 repository of svoj. When svoj does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html

ChangeSet
  1.2008 05/10/18 14:50:51 svoj@stripped +1 -0
  WL#2671 - Fulltext: example UDF for weighting
  Added comments for weighting UDF

  plugin/fulltext/cnet_weight.c
    1.3 05/10/18 14:50:39 svoj@stripped +118 -4
    Added comments.

# This is a BitKeeper patch.  What follows are the unified diffs for the
# set of deltas contained in the patch.  The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User:	svoj
# Host:	svoj-laptop.mysql.com
# Root:	/home/svoj/devel/mysql/CNET/mysql-5.0

--- 1.2/plugin/fulltext/cnet_weight.c	2005-10-14 17:40:14 +05:00
+++ 1.3/plugin/fulltext/cnet_weight.c	2005-10-18 14:50:39 +05:00
@@ -13,8 +13,11 @@
 #include <plugin.h>
 
 
+/* This function will be used to parse query and document */
 extern int cnet_parser_parse(MYSQL_FTPARSER_PARAM *param);
 
+
+/* CNET_STRING holds a word from the query. */
 typedef struct
 {
   char *str;
@@ -25,13 +28,34 @@
 
 typedef struct
 {
-  CNET_STRING *query;
-  uint nwords;
-  double weight;
-  uint proximity;
+  CNET_STRING *query; /* query broken into words */
+  uint nwords;        /* number of words in query */
+  double weight;      /* calculated weight of the document */
+  uint proximity;     /* last match was `proximity` words ago */
 } CNET_WEIGHT_PARAM;
 
 
+/*
+  A routine used by the parser while parsing a query.
+  SYNOPSIS
+    query_add_word()
+    param - pointer to CNET_WEIGHT_PARAM
+    word - pointer to word
+    word_len - length of word
+    boolean_info - additional word information, currently only
+                   boolean_info->weight_adjust is used.
+  RETURN VALUE
+    Always returns 0
+  DESCRIPTION
+    This function adds a word from the query into the
+    CNET_WEIGHT_PARAM->query structure. It also sets the initial
+    weight for the current word as passed by the parser. Currently
+    the parser passes the following weight_adjust values:
+      1 for the word in the query (weight= 0.1)
+     -1 for the synonym (case sensitive match, weight= -0.1)
+     -2 for the synonym (case insensitive match, weight= -0.2)
+*/
+
 static int query_add_word(void *param, byte *word, uint word_len,
     MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info)
 {
@@ -45,6 +69,29 @@
 }
 
 
+/*
+  cnet_weight UDF init routine, called at the beginning of the
+  query.
+  SYNOPSIS
+    cnet_weight_init()
+    initid - used to communicate information between
+             cnet_weight_init/cnet_weight_deinit/cnet_weight
+    args - arguments passed to UDF
+    message - buffer for error message
+    A detailed description of UDFs is available in the MySQL manual,
+    in the `Adding a New User-defined Function` section.
+  RETURN VALUE
+    0 - OK
+    1 - ERR
+  DESCRIPTION
+    This function checks how cnet_weight was called:
+    - if it has two arguments
+    - if both arguments are strings
+    - if second argument is constant
+    This function also parses the second argument (query) into
+    words and stores them in initid->ptr.
+*/
+
 my_bool cnet_weight_init(UDF_INIT *initid, UDF_ARGS *args,
     char *message)
 {
@@ -81,6 +128,19 @@
 }
 
 
+/*
+  Frees memory allocated by cnet_weight_init(), called at the
+  end of the query.
+  SYNOPSIS
+    cnet_weight_deinit()
+    initid - holds memory allocated by cnet_weight_init (initid->ptr)
+  RETURN VALUE
+    none
+  DESCRIPTION
+    Frees memory allocated by cnet_weight_init(), that is
+    CNET_WEIGHT_PARAM and CNET_STRING array.
+*/
+
 void cnet_weight_deinit(UDF_INIT *initid)
 {
   CNET_WEIGHT_PARAM *a= (CNET_WEIGHT_PARAM *)initid->ptr;
@@ -89,32 +149,86 @@
 }
 
 
+/*
+  A routine used by the parser while parsing a document.
+  SYNOPSIS
+    param - holds CNET_WEIGHT_PARAM
+    word - next word in the document
+    word_len - length of the word
+    boolean_info - unused
+  RETURN VALUE
+    Always returns 0
+  DESCRIPTION
+    This function performs relevancy calculation.
+*/
+
 static int document_add_word(void *param, byte *word, uint word_len,
     MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info __attribute__((unused)))
 {
   CNET_WEIGHT_PARAM *weight_param= (CNET_WEIGHT_PARAM *)param;
   uint idx, match= 0;
+  /* Increase proximity */
   weight_param->proximity++;
+  /*
+    Compare each word from the query with current word from the
+    document.
+  */ 
   for (idx= 0; idx < weight_param->nwords; idx++)
   {
     CNET_STRING *qwrd= &weight_param->query[idx];
+    /* Skip this word if the lengths aren't equal */
     if (qwrd->length != word_len)
       continue;
     if (! strncmp(qwrd->str, word, word_len))
     {
+      /*
+        Word from the query matched word from the document
+        in case sensitive fashion. Increase document weight.
+        Lower distance between words gives higher weight
+        (2.0 / weight_param->proximity). Also qwrd->weight,
+        which was calculated in cnet_weight_init function
+        affects weight. So synonyms have a lower weight than
+        words that are present in the query.
+      */
       weight_param->weight+= 2.0 / weight_param->proximity + qwrd->weight;
       match= 1;
     }
     else if (! strncasecmp(qwrd->str, word, word_len))
     {
+      /*
+        Word from the query matched word from the document
+        in case insensitive fashion. Increase document weight.
+        Weight formula is almost the same as above, but
+        it has lower factor (1.0).
+      */
       weight_param->weight+= 1.0 / weight_param->proximity + qwrd->weight;
       match= 1;
     }
   }
+  /*
+    If word from the query matched current word from the document
+    reset proximity.
+  */
   if (match) weight_param->proximity= 0;
   return(0);
 }
 
+
+/*
+  User Defined Function itself.
+  SYNOPSIS
+    cnet_weight()
+    initid - holds the query broken into words
+    args - only first argument is used, which is document
+    is_null - unused
+    error - unused
+  RETURN VALUE
+    document weight
+  DESCRIPTION
+    This function initializes parser variables and calls
+    parser with first argument (args->args[0]) which is
+    document.
+*/
 
 double cnet_weight(UDF_INIT *initid, UDF_ARGS *args,
     char *is_null, char *error)
Thread
bk commit into 5.0 tree (svoj:1.2008)svoj18 Oct