/* Copyright (c) 2014, 2018 Oracle and/or its affiliates. All rights reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include /* We are following InnoDB coding guidelines. */ /** Ngram token size, by default bigram. */ static int ngram_token_size; #define RETURN_IF_ERROR(ret) if (ret != 0) return ret; /** Parse a document into ngram. @param[in] param plugin parser param @param[in] doc document to parse @param[in] len document length in bytes @param[in,out] bool_info boolean info @retval 0 on success @retval 1 on failure. */ static int ngram_parse( MYSQL_FTPARSER_PARAM* param, const char* doc, int len, MYSQL_FTPARSER_BOOLEAN_INFO* bool_info) { const CHARSET_INFO* cs = param->cs; char* start; char* next; char* end; int char_len; int n_chars; int ret = 0; bool is_first = true; DBUG_ASSERT(cs->mbminlen == 1); start = const_cast(doc); next = start; end = start + len; n_chars = 0; while (next < end) { char_len = my_mbcharlen_ptr(cs, next, end); /* Skip the rest of the doc if invalid char. */ if (next + char_len > end || char_len == 0) { break; } else { /* Skip SPACE or ","/"." etc as they are not words*/ int ctype; cs->cset->ctype( cs, &ctype, (uchar*) next, (uchar*) end); if (char_len == 1 && (*next == ' ' || !true_word_char(ctype, *next))) { start = next + 1; next = start; n_chars = 0; continue; } next += char_len; n_chars++; } if (n_chars == ngram_token_size) { /* Add a ngram */ bool_info->position = start - doc; ret = param->mysql_add_word( param, start, next - start, bool_info); RETURN_IF_ERROR(ret); /* Move a char forward */ start += my_mbcharlen_ptr(cs, start, end); n_chars = ngram_token_size - 1; is_first = false; } } /* We handle unigram in cases below: 1. BOOLEAN MODE: suppose we have phrase search like ('"a bc"'); 2. STOPWORD MODE: we should handle unigram when matching phrase. Note: only when the document char len is less than ngram_token_size. */ switch (param->mode) { case MYSQL_FTPARSER_FULL_BOOLEAN_INFO: case MYSQL_FTPARSER_WITH_STOPWORDS: if (n_chars > 0 && is_first) { DBUG_ASSERT(next > start); DBUG_ASSERT(n_chars < ngram_token_size); ret = param->mysql_add_word( param, start, next - start, bool_info); } break; default: break; } return(ret); } /** Get token char size by charset @param[in] cs charset @param[in] token token @param[in] len token length in bytes @retval size in number of chars */ static int ngram_get_token_size( const CHARSET_INFO* cs, const char* token, int len) { const char* start; const char* end; int size = 0; int char_len; start = token; end = start + len; while (start < end) { char_len = my_mbcharlen_ptr(cs, start, end); size++; start += char_len; } return(size); } /** Convert term into phrase and handle wildcard. @param[in] param plugin parser param @param[in] token token to parse @param[in] len token length in bytes @param[in,out] bool_info boolean info @retval 0 on success @retval 1 on failure. */ static int ngram_term_convert( MYSQL_FTPARSER_PARAM* param, const char* token, int len, MYSQL_FTPARSER_BOOLEAN_INFO* bool_info) { MYSQL_FTPARSER_BOOLEAN_INFO token_info = { FT_TOKEN_WORD, 0, 0, 0, 0, 0, ' ', 0}; const CHARSET_INFO* cs = param->cs; int token_size; int ret = 0; DBUG_ASSERT(bool_info->type == FT_TOKEN_WORD); DBUG_ASSERT(bool_info->quot == NULL); DBUG_ASSERT(cs->mbminlen == 1); /* Convert rules: 1. if term with wildcard and term length is less than ngram_token_size, we keep it as normal term search. 2. otherwise, term is converted to phrase and wildcard is ignored. e.g. 'abc' and 'abc*' are both equivalent to '"ab bc"'. */ token_size = ngram_get_token_size(cs, token, len); if (bool_info->trunc && token_size < ngram_token_size) { ret = param->mysql_add_word(param, const_cast(token), len, bool_info); } else { bool_info->type = FT_TOKEN_LEFT_PAREN; bool_info->quot = reinterpret_cast(1); ret = param->mysql_add_word(param, NULL, 0, bool_info); RETURN_IF_ERROR(ret); ret = ngram_parse(param, token, len, &token_info); RETURN_IF_ERROR(ret); bool_info->type = FT_TOKEN_RIGHT_PAREN; ret = param->mysql_add_word(param, NULL, 0, bool_info); DBUG_ASSERT(bool_info->quot == NULL); bool_info->type = FT_TOKEN_WORD; } return(ret); } /** Ngram parser parse document. @param[in] param plugin parser param @retval 0 on success @retval 1 on failure. */ static int ngram_parser_parse( MYSQL_FTPARSER_PARAM* param) { MYSQL_FTPARSER_BOOLEAN_INFO bool_info = { FT_TOKEN_WORD, 0, 0, 0, 0, 0, ' ', 0}; const CHARSET_INFO* cs = param->cs; uchar** start = reinterpret_cast(¶m->doc); uchar* end = *start + param->length; FT_WORD word = {NULL, 0, 0}; int ret = 0; switch (param->mode) { case MYSQL_FTPARSER_SIMPLE_MODE: case MYSQL_FTPARSER_WITH_STOPWORDS: ret = ngram_parse(param, param->doc, param->length, &bool_info); break; case MYSQL_FTPARSER_FULL_BOOLEAN_INFO: /* Ngram parser cannot handle query in boolean mode, so we first parse query into words with boolean info, then we parse the words into ngram. */ while (fts_get_word(cs, start, end, &word, &bool_info)) { if (bool_info.type == FT_TOKEN_WORD) { if (bool_info.quot != NULL) { /* Phrase search */ ret = ngram_parse( param, reinterpret_cast(word.pos), word.len, &bool_info); } else { /* Term serach */ ret = ngram_term_convert( param, reinterpret_cast(word.pos), word.len, &bool_info); DBUG_ASSERT(bool_info.quot == NULL); DBUG_ASSERT(bool_info.type == FT_TOKEN_WORD); } } else { ret = param->mysql_add_word( param, reinterpret_cast(word.pos), word.len, &bool_info); } RETURN_IF_ERROR(ret); } break; } return(ret); } /** Fulltext ngram parser */ static struct st_mysql_ftparser ngram_parser_descriptor = { MYSQL_FTPARSER_INTERFACE_VERSION, ngram_parser_parse, 0, 0 }; static MYSQL_SYSVAR_INT(token_size, ngram_token_size, PLUGIN_VAR_READONLY, "InnoDB ngram full text plugin parser token size in characters", NULL, NULL, 2, 1, 10, 0); /** Ngram plugin system variables */ static struct st_mysql_sys_var* ngram_system_variables[] = { MYSQL_SYSVAR(token_size), NULL }; /** Ngram plugin descriptor */ mysql_declare_plugin(ngram_parser) { MYSQL_FTPARSER_PLUGIN, /*!< type */ &ngram_parser_descriptor, /*!< descriptor */ "ngram", /*!< name */ "Oracle Corp", /*!< author */ "Ngram Full-Text Parser", /*!< description*/ PLUGIN_LICENSE_GPL, NULL, /*!< init function (when loaded)*/ NULL, /*!< deinit function (when unloaded)*/ 0x0001, /*!< version */ NULL, /*!< status variables */ ngram_system_variables, /*!< system variables */ NULL, 0, } mysql_declare_plugin_end;