307 lines
7.4 KiB
C++

/* Copyright (c) 2014, 2018 Oracle and/or its affiliates. All rights reserved.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
#include <fts0tokenize.h>
/* We are following InnoDB coding guidelines. */
/** Ngram token size in characters, by default bigram. Bound to the
read-only `token_size` plugin system variable declared later in this file;
ngram_parse() emits a token each time this many characters have been
accumulated. */
static int ngram_token_size;
/** Return from the enclosing function with the given status if it is
non-zero. Wrapped in do { } while (0) so that the macro behaves as a single
statement (safe inside an unbraced if/else), and the argument is evaluated
exactly once. */
#define RETURN_IF_ERROR(ret)					\
	do {							\
		const int ngram_status_ = (ret);		\
		if (ngram_status_ != 0) {			\
			return ngram_status_;			\
		}						\
	} while (0)
/** Split a document into overlapping ngrams of ngram_token_size characters
and pass each one to param->mysql_add_word().
A single-byte space, or a single-byte character rejected by
true_word_char(), acts as a separator: it is never part of a token and
restarts the ngram being built. Scanning stops at the first character whose
reported length is zero or would run past the end of the document.
@param[in]	param		plugin parser param
@param[in]	doc		document to parse
@param[in]	len		document length in bytes
@param[in,out]	bool_info	boolean info
@return 0 on success, otherwise the non-zero value returned by
param->mysql_add_word(). */
static
int
ngram_parse(
	MYSQL_FTPARSER_PARAM*		param,
	const char*			doc,
	int				len,
	MYSQL_FTPARSER_BOOLEAN_INFO*	bool_info)
{
	const CHARSET_INFO*	cs = param->cs;
	char*			gram_begin = const_cast<char*>(doc);
	char*			cursor = gram_begin;
	char* const		doc_end = gram_begin + len;
	int			gram_chars = 0;
	bool			nothing_emitted = true;

	DBUG_ASSERT(cs->mbminlen == 1);

	while (cursor < doc_end) {
		const int	char_len = my_mbcharlen_ptr(
			cs, cursor, doc_end);

		/* An invalid or truncated char ends the scan; the rest of
		the document is ignored. */
		if (char_len == 0 || cursor + char_len > doc_end) {
			break;
		}

		int	ctype;
		cs->cset->ctype(
			cs, &ctype,
			reinterpret_cast<uchar*>(cursor),
			reinterpret_cast<uchar*>(doc_end));

		/* SPACE, ",", "." and the like are not words. */
		const bool	is_separator = char_len == 1
			&& (*cursor == ' '
			    || !true_word_char(ctype, *cursor));

		if (is_separator) {
			/* Drop whatever was collected so far and restart
			right after the separator. */
			gram_begin = cursor + 1;
			cursor = gram_begin;
			gram_chars = 0;
			continue;
		}

		cursor += char_len;
		++gram_chars;

		if (gram_chars != ngram_token_size) {
			continue;
		}

		/* A complete ngram lies in [gram_begin, cursor). */
		bool_info->position = gram_begin - doc;

		const int	rc = param->mysql_add_word(
			param, gram_begin, cursor - gram_begin, bool_info);

		if (rc != 0) {
			return(rc);
		}

		/* Slide the window forward by one character. */
		gram_begin += my_mbcharlen_ptr(cs, gram_begin, doc_end);
		gram_chars = ngram_token_size - 1;
		nothing_emitted = false;
	}

	/* A leftover shorter than ngram_token_size is still reported in two
	cases:
	1. BOOLEAN MODE: a phrase search such as ('"a bc"');
	2. STOPWORD MODE: unigrams are needed when matching a phrase.
	Note: this only happens when no full ngram was produced at all. */
	const bool	wants_short_token =
		param->mode == MYSQL_FTPARSER_FULL_BOOLEAN_INFO
		|| param->mode == MYSQL_FTPARSER_WITH_STOPWORDS;

	if (!wants_short_token || gram_chars <= 0 || !nothing_emitted) {
		return(0);
	}

	DBUG_ASSERT(cursor > gram_begin);
	DBUG_ASSERT(gram_chars < ngram_token_size);

	return(param->mysql_add_word(
		param, gram_begin, cursor - gram_begin, bool_info));
}
/** Get token char size by charset.
Counting stops at the first character for which my_mbcharlen_ptr() reports
a zero length: ngram_parse() treats that as an invalid character, and
without the check the cursor would never advance and the loop would not
terminate.
@param[in]	cs	charset
@param[in]	token	token
@param[in]	len	token length in bytes
@return number of characters counted before the end of the token or the
first invalid character */
static
int
ngram_get_token_size(
	const CHARSET_INFO*	cs,
	const char*		token,
	int			len)
{
	const char*	start = token;
	const char*	end = start + len;
	int		size = 0;

	while (start < end) {
		const int	char_len = my_mbcharlen_ptr(cs, start, end);

		/* Invalid char: stop instead of spinning on the same byte. */
		if (char_len <= 0) {
			break;
		}

		size++;
		start += char_len;
	}

	return(size);
}
/** Convert term into phrase and handle wildcard.
On return bool_info->type is always FT_TOKEN_WORD again. If any step of the
phrase conversion fails, bool_info->quot is also reset to NULL so that the
caller sees the same state on the error path as on the success path.
@param[in]	param		plugin parser param
@param[in]	token		token to parse
@param[in]	len		token length in bytes
@param[in,out]	bool_info	boolean info
@return 0 on success, otherwise the non-zero value returned by
param->mysql_add_word() or ngram_parse(). */
static
int
ngram_term_convert(
	MYSQL_FTPARSER_PARAM*		param,
	const char*			token,
	int				len,
	MYSQL_FTPARSER_BOOLEAN_INFO*	bool_info)
{
	MYSQL_FTPARSER_BOOLEAN_INFO	token_info =
		{ FT_TOKEN_WORD, 0, 0, 0, 0, 0, ' ', 0};
	const CHARSET_INFO*	cs = param->cs;
	int			token_size;
	int			ret = 0;

	DBUG_ASSERT(bool_info->type == FT_TOKEN_WORD);
	DBUG_ASSERT(bool_info->quot == NULL);
	DBUG_ASSERT(cs->mbminlen == 1);

	/* Convert rules:
	1. if term with wildcard and term length is less than
	ngram_token_size, we keep it as normal term search.
	2. otherwise, term is converted to phrase and wildcard is ignored.
	e.g. 'abc' and 'abc*' are both equivalent to '"ab bc"'. */
	token_size = ngram_get_token_size(cs, token, len);

	if (bool_info->trunc && token_size < ngram_token_size) {
		return(param->mysql_add_word(
			param, const_cast<char*>(token), len, bool_info));
	}

	/* Open the phrase: a left paren with a non-NULL quot marker. */
	bool_info->type = FT_TOKEN_LEFT_PAREN;
	bool_info->quot = reinterpret_cast<char*>(1);

	ret = param->mysql_add_word(param, NULL, 0, bool_info);

	if (ret == 0) {
		/* The ngrams of the term become the words of the phrase. */
		ret = ngram_parse(param, token, len, &token_info);
	}

	if (ret == 0) {
		/* Close the phrase. */
		bool_info->type = FT_TOKEN_RIGHT_PAREN;
		ret = param->mysql_add_word(param, NULL, 0, bool_info);

		/* On success the callback is expected to have cleared
		the quot marker. */
		DBUG_ASSERT(ret != 0 || bool_info->quot == NULL);
	}

	if (ret != 0) {
		/* Error path: the phrase was not closed successfully, so
		undo the marker ourselves; the caller asserts
		quot == NULL before it inspects the return value. */
		bool_info->quot = NULL;
	}

	bool_info->type = FT_TOKEN_WORD;

	return(ret);
}
/** Entry point of the ngram parser: tokenize param->doc according to
param->mode.
In simple and stopword mode the whole document is handed to ngram_parse().
In boolean mode the query is first split into words by fts_get_word();
words are then turned into ngrams, while every other token is forwarded to
param->mysql_add_word() as is. Any other mode does nothing.
@param[in]	param	plugin parser param
@return 0 on success, otherwise the first non-zero value returned by
ngram_parse(), ngram_term_convert() or param->mysql_add_word(). */
static
int
ngram_parser_parse(
	MYSQL_FTPARSER_PARAM*	param)
{
	MYSQL_FTPARSER_BOOLEAN_INFO	info =
		{ FT_TOKEN_WORD, 0, 0, 0, 0, 0, ' ', 0};

	if (param->mode == MYSQL_FTPARSER_SIMPLE_MODE
	    || param->mode == MYSQL_FTPARSER_WITH_STOPWORDS) {
		return(ngram_parse(param, param->doc, param->length, &info));
	}

	if (param->mode != MYSQL_FTPARSER_FULL_BOOLEAN_INFO) {
		return(0);
	}

	/* Ngram parser cannot handle a query in boolean mode directly, so
	the query is first split into words with boolean info, and each
	word is then parsed into ngrams. */
	const CHARSET_INFO*	cs = param->cs;
	uchar**			cursor = reinterpret_cast<uchar**>(&param->doc);
	uchar* const		query_end = *cursor + param->length;
	FT_WORD			word = {NULL, 0, 0};

	while (fts_get_word(cs, cursor, query_end, &word, &info)) {
		char*	text = reinterpret_cast<char*>(word.pos);
		int	rc;

		if (info.type != FT_TOKEN_WORD) {
			/* Operators, parens etc. are passed through. */
			rc = param->mysql_add_word(
				param, text, word.len, &info);
		} else if (info.quot != NULL) {
			/* Phrase search */
			rc = ngram_parse(param, text, word.len, &info);
		} else {
			/* Term search */
			rc = ngram_term_convert(
				param, text, word.len, &info);

			DBUG_ASSERT(info.quot == NULL);
			DBUG_ASSERT(info.type == FT_TOKEN_WORD);
		}

		if (rc != 0) {
			return(rc);
		}
	}

	return(0);
}
/** Fulltext ngram parser descriptor: the parser interface version followed
by the parse callback, ngram_parser_parse(). The two trailing members are
left as 0.
NOTE(review): these are presumably the optional init/deinit hooks of
st_mysql_ftparser -- confirm against the plugin header. */
static struct st_mysql_ftparser ngram_parser_descriptor =
{
MYSQL_FTPARSER_INTERFACE_VERSION,
ngram_parser_parse,
0,
0
};
/** Declares the `token_size` system variable, backed by ngram_token_size
and flagged read-only.
NOTE(review): the trailing arguments are presumably the check callback,
the update callback, the default (2), the minimum (1), the maximum (10) and
the block size (0) -- confirm against the MYSQL_SYSVAR_INT definition. */
static MYSQL_SYSVAR_INT(token_size, ngram_token_size,
PLUGIN_VAR_READONLY,
"InnoDB ngram full text plugin parser token size in characters",
NULL, NULL, 2, 1, 10, 0);
/** Ngram plugin system variables: a NULL-terminated list whose only entry
is `token_size`. Referenced from the plugin descriptor below. */
static struct st_mysql_sys_var* ngram_system_variables[] =
{
MYSQL_SYSVAR(token_size),
NULL
};
/** Ngram plugin descriptor: registers the parser under the name "ngram" as
a full-text parser plugin, with no init/deinit functions and no status
variables. */
mysql_declare_plugin(ngram_parser)
{
MYSQL_FTPARSER_PLUGIN, /*!< type */
&ngram_parser_descriptor, /*!< descriptor */
"ngram", /*!< name */
"Oracle Corp", /*!< author */
"Ngram Full-Text Parser", /*!< description*/
PLUGIN_LICENSE_GPL, /*!< license */
NULL, /*!< init function (when loaded)*/
NULL, /*!< deinit function (when unloaded)*/
0x0001, /*!< version */
NULL, /*!< status variables */
ngram_system_variables, /*!< system variables */
NULL, /*!< NOTE(review): presumably a reserved field -- confirm against st_mysql_plugin */
0, /*!< NOTE(review): presumably plugin flags -- confirm against st_mysql_plugin */
}
mysql_declare_plugin_end;