Below is the list of changes that have just been committed into a local
5.0 repository of svoj. When svoj does a push these changes will
be propagated to the main repository and, within 24 hours after the
push, to the public repository.
For information on how to access the public repository
see http://dev.mysql.com/doc/mysql/en/installing-source-tree.html
ChangeSet
1.2006 05/10/10 13:53:14 svoj@stripped +3 -0
WL#2671 - Fulltext: example UDF for weighting
- Proximity
- Higher rating for exact-case matches
- Word synonyms
- Word frequency
plugin/fulltext/cnet_weight.c
1.1 05/10/10 13:53:10 svoj@stripped +137 -0
New BitKeeper file ``plugin/fulltext/cnet_weight.c''
plugin/fulltext/cnet_weight.c
1.0 05/10/10 13:53:10 svoj@stripped +0 -0
BitKeeper file /home/svoj/devel/mysql/CNET/mysql-5.0/plugin/fulltext/cnet_weight.c
plugin/fulltext/cnet_parser.c
1.4 05/10/10 13:53:10 svoj@stripped +41 -7
Extend query with synonyms in boolean mode.
plugin/fulltext/Makefile.am
1.3 05/10/10 13:53:10 svoj@stripped +2 -3
Added cnet_weight.c
cnet_parser.so renamed to libcnet.so
# This is a BitKeeper patch. What follows are the unified diffs for the
# set of deltas contained in the patch. The rest of the patch, the part
# that BitKeeper cares about, is below these diffs.
# User: svoj
# Host: svoj-laptop.mysql.com
# Root: /home/svoj/devel/mysql/CNET/mysql-5.0
--- 1.2/plugin/fulltext/Makefile.am 2005-09-07 02:40:46 +05:00
+++ 1.3/plugin/fulltext/Makefile.am 2005-10-10 13:53:10 +05:00
@@ -1,4 +1,3 @@
INCLUDES= -I$(top_builddir)/include
-pkglib_LTLIBRARIES= cnet_parser.la
-cnet_parser_la_SOURCES= cnet_parser.c
-cnet_parser_la_LDFLAGS= -module
+pkglib_LTLIBRARIES= libcnet.la
+libcnet_la_SOURCES= cnet_parser.c cnet_weight.c
--- 1.3/plugin/fulltext/cnet_parser.c 2005-09-17 22:25:47 +05:00
+++ 1.4/plugin/fulltext/cnet_parser.c 2005-10-10 13:53:10 +05:00
@@ -18,7 +18,14 @@
static const char **words;
static size_t nwords;
static int min_word_len= 3;
-
+static char *synonyms[][2]=
+{
+ { (char *)"Wi-Fi", (char *)"wifi" },
+ { (char *)"C++", (char *)"cplusplus" },
+ { (char *)"E-mail", (char *)"email" },
+ { (char *)"Mozilla", (char *)"browser" },
+ { 0, 0 }
+};
static int insert_word (const char *word)
{
@@ -97,11 +104,39 @@
}
-static int cnet_parser_parse(MYSQL_FTPARSER_PARAM *param)
+static void add_word(MYSQL_FTPARSER_PARAM *param, char *word,
+ size_t length)
{
MYSQL_FTPARSER_BOOLEAN_INFO boolean_info=
{ FT_CHUNK_TYPE_WORD, 1, 0, 0, 0, ' ', 0 };
- char *end, *start, *docend= param->doc + param->length;
+ param->mysql_add_word(param->mysql_ftparam, word, length, &boolean_info);
+ if (param->flags & MYSQL_FTPARSER_WITH_BOOLEAN_OPERATORS)
+ {
+ size_t i;
+ for (i= 0; synonyms[i][0]; i++)
+ {
+ if (! strncmp(word, synonyms[i][0], length))
+ {
+ boolean_info.weight_adjust= -1;
+ boolean_info.yesno= 0;
+ param->mysql_add_word(param->mysql_ftparam, synonyms[i][1],
+ strlen(synonyms[i][1]), &boolean_info);
+ }
+ else if (! strncasecmp(word, synonyms[i][0], length))
+ {
+ boolean_info.weight_adjust= -2;
+ boolean_info.yesno= 0;
+ param->mysql_add_word(param->mysql_ftparam, synonyms[i][1],
+ strlen(synonyms[i][1]), &boolean_info);
+ }
+ }
+ }
+}
+
+
+int cnet_parser_parse(MYSQL_FTPARSER_PARAM *param)
+{
+ char *end, *start, *docend= param->doc + param->length;
size_t l;
for (end= start= param->doc;; end++)
@@ -109,12 +144,12 @@
if (end == docend)
{
if (end - start > min_word_len)
- param->mysql_add_word(param->mysql_ftparam, start, end - start, 0);
+ add_word(param, start, end - start);
break;
}
else if ((l= check_in_array(end, docend - end)))
{
- param->mysql_add_word(param->mysql_ftparam, end, l, &boolean_info);
+ add_word(param, end, l);
end+= l;
if (end == docend)
break;
@@ -123,8 +158,7 @@
else if (! isalnum(*end))
{
if (end - start > min_word_len)
- param->mysql_add_word(param->mysql_ftparam, start, end - start,
- &boolean_info);
+ add_word(param, start, end - start);
start= end + 1;
}
}
--- New file ---
+++ plugin/fulltext/cnet_weight.c 05/10/10 13:53:10
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <stdio.h>
#include <my_global.h>
#include <mysql.h>
#include <m_ctype.h>
#include <plugin.h>
extern int cnet_parser_parse(MYSQL_FTPARSER_PARAM *param);
/*
  One word of the parsed query, as recorded by query_add_word().
*/
typedef struct
{
/* Start of the word; set from the pointer handed to the parser callback. */
char *str;
/* Per-word adjustment; set to boolean_info->weight_adjust * 0.1. */
double weight;
/* Length of the word as reported by the parser callback. */
uint length;
} CNET_STRING;
/*
  State shared between the UDF entry points and the parser callbacks;
  allocated in cnet_weight_init() and stored in initid->ptr.
*/
typedef struct
{
/* Array of query words, grown one element at a time with realloc(). */
CNET_STRING *query;
/* Number of valid elements in query. */
uint nwords;
/* Weight accumulated for the current document; reset by cnet_weight(). */
double weight;
/* Document words seen since the last match (or since the start). */
uint proximity;
} CNET_WEIGHT_PARAM;
/*
  Parser callback used while parsing the query string: appends one word
  to the query word list kept in CNET_WEIGHT_PARAM.

  param         CNET_WEIGHT_PARAM that collects the query words
  word          start of the word; the pointer is stored, not copied
  word_len      length of the word
  boolean_info  supplies weight_adjust, stored scaled by 0.1; when NULL
                the word gets a zero adjustment

  Returns 0 on success, 1 if the list could not be grown. On failure the
  existing list and nwords are left untouched.
*/
static int query_add_word(void *param, byte *word, uint word_len,
                          MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info)
{
  CNET_WEIGHT_PARAM *a= (CNET_WEIGHT_PARAM *)param;
  CNET_STRING *grown;

  /*
    Keep the old pointer until realloc() has succeeded, otherwise a
    failed call would leak the list and crash on the store below.
  */
  grown= realloc(a->query, (a->nwords + 1) * sizeof(CNET_STRING));
  if (! grown)
    return(1);
  a->query= grown;

  a->query[a->nwords].str= (char *)word;
  a->query[a->nwords].length= word_len;
  a->query[a->nwords].weight= boolean_info ?
                              boolean_info->weight_adjust * 0.1 : 0.0;
  a->nwords++;
  return(0);
}
/*
  UDF initialisation for cnet_weight(document, query).

  Validates that exactly two string arguments were given and that the
  second one is available at init time, allocates the CNET_WEIGHT_PARAM
  state in initid->ptr, and parses the query string once so that
  cnet_weight() only has to parse the document on each call.

  initid   receives the state pointer, decimals (5) and max_length (10)
  args     UDF arguments; args[1] is the query string
  message  receives an error text on failure

  Returns 0 on success, 1 on error (message is filled in).
*/
my_bool cnet_weight_init(UDF_INIT *initid, UDF_ARGS *args,
                         char *message)
{
  MYSQL_FTPARSER_PARAM param;

  if (args->arg_count != 2 || args->arg_type[0] != STRING_RESULT ||
      args->arg_type[1] != STRING_RESULT)
  {
    strcpy(message, "cnet_weight() requires two strings");
    return(1);
  }
  /*
    A NULL value here is reported as a non-constant argument.
    NOTE(review): relies on the UDF convention that only constant
    arguments have a value at init time -- confirm.
  */
  if (! args->args[1])
  {
    strcpy(message, "Second argument must be constant");
    return(1);
  }
  if (! (initid->ptr= calloc(1, sizeof(CNET_WEIGHT_PARAM))))
  {
    strcpy(message, "Not enough memory");
    return(1);
  }
  initid->decimals= 5;
  initid->max_length= 10;

  /* Zero first so that no field reaches the parser uninitialised. */
  memset(&param, 0, sizeof(param));
  param.mysql_parse= 0;
  param.mysql_add_word= query_add_word;
  param.ftparser_state= 0;
  param.mysql_ftparam= initid->ptr;
  param.cs= 0;
  param.doc= args->args[1];
  param.length= args->lengths[1];
  param.flags= MYSQL_FTPARSER_BOOLEAN_MODE;
  /* Every query word is delivered to query_add_word(). */
  cnet_parser_parse(&param);
  return(0);
}
/*
  UDF cleanup: releases the query word list and the state structure
  that cnet_weight_init() stored in initid->ptr.
*/
void cnet_weight_deinit(UDF_INIT *initid)
{
  CNET_WEIGHT_PARAM *state= (CNET_WEIGHT_PARAM *)initid->ptr;

  /* free(NULL) is a no-op, so an empty query list needs no guard. */
  free(state->query);
  free(state);
}
/*
  Parser callback used while parsing a document: scores one document
  word against every query word.

  The proximity counter is incremented for each document word. A query
  word of the same length that matches exactly contributes
  2.0 / proximity plus its own weight; one that matches only when case
  is ignored contributes 1.0 / proximity plus its own weight. After any
  match the proximity counter is reset to 0.

  Always returns 0.
*/
static int document_add_word(void *param, byte *word, uint word_len,
                             MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info __attribute__((unused)))
{
  CNET_WEIGHT_PARAM *ctx= (CNET_WEIGHT_PARAM *)param;
  CNET_STRING *q;
  CNET_STRING *q_end;
  uint hit= 0;

  ctx->proximity++;
  q_end= ctx->query + ctx->nwords;
  for (q= ctx->query; q < q_end; q++)
  {
    double base;

    if (q->length != word_len)
      continue;

    /* The exact-case comparison is tried first and scores higher. */
    if (strncmp(q->str, word, word_len) == 0)
      base= 2.0;
    else if (strncasecmp(q->str, word, word_len) == 0)
      base= 1.0;
    else
      continue;

    ctx->weight+= base / ctx->proximity + q->weight;
    hit= 1;
  }

  if (hit)
    ctx->proximity= 0;
  return(0);
}
/*
  UDF body: computes the weight of one document against the query that
  cnet_weight_init() parsed.

  Resets the accumulated weight and the proximity counter, then runs the
  parser over args[0]; every document word is delivered to
  document_add_word(), which adds to the weight.

  initid   carries the CNET_WEIGHT_PARAM state in initid->ptr
  args     UDF arguments; args[0] is the document
  is_null  not modified
  error    not modified

  Returns the accumulated weight; 0.0 when the document pointer is NULL.
*/
double cnet_weight(UDF_INIT *initid, UDF_ARGS *args,
                   char *is_null, char *error)
{
  MYSQL_FTPARSER_PARAM param;
  CNET_WEIGHT_PARAM *weight_param= (CNET_WEIGHT_PARAM *)initid->ptr;

  (void) is_null;
  (void) error;

  weight_param->weight= 0.0;
  weight_param->proximity= 0;

  /*
    Nothing to parse: skip the parser rather than hand it a NULL
    document pointer.  The result is the just-reset weight of 0.0.
  */
  if (! args->args[0])
    return(weight_param->weight);

  /* Zero first so that no field reaches the parser uninitialised. */
  memset(&param, 0, sizeof(param));
  param.mysql_parse= 0;
  param.mysql_add_word= document_add_word;
  param.ftparser_state= 0;
  param.mysql_ftparam= weight_param;
  param.cs= 0;
  param.doc= args->args[0];
  param.length= args->lengths[0];
  param.flags= MYSQL_FTPARSER_DOCUMENT_MODE;
  cnet_parser_parse(&param);
  return(weight_param->weight);
}
| Thread |
|---|
| • bk commit into 5.0 tree (svoj:1.2006) | svoj | 10 Oct |