Full-Text Parser PluginsでMeCabを使った全文検索
「mysql5.1にテキスト解析プラグインとしてMeCabを組み込んでみる」を参考にして、MySQL 5.1から導入されたFull-Text Parser PluginsでMeCabを使った日本語全文検索を試してみた。
適当なsrc.rpm をダウンロードしてインストールする。
# wget ftp://rpmfind.net/linux/fedora/development/source/SRPMS/mecab-0.96-2.fc9.1.src.rpm # wget ftp://rpmfind.net/linux/fedora/development/source/SRPMS/mecab-ipadic-2.7.0.20070801-1.fc8.src.rpm # rpm -ivh mecab-0.96-2.fc9.1.src.rpm mecab-ipadic-2.7.0.20070801-1.fc8.src.rpm
RPMをビルドしてインストールする。
# cd /usr/src/redhat/SPECS/ # rpmbuild -bb ./mecab.spec # cd /usr/src/redhat/RPMS/i386/ # rpm -ivh mecab-0.96-2.1.i386.rpm mecab-devel-0.96-2.1.i386.rpm
# cd /usr/src/redhat/SPECS/ # rpmbuild -bb mecab-ipadic.spec # cd /usr/src/redhat/RPMS/i386/ # rpm -ivh mecab-ipadic-2.7.0.20070801-1.i386.rpm
「mysql5.1にテキスト解析プラグインとしてMeCabを組み込んでみる」をそのまんま適用しております。
/* Copyright (C) 2006 MySQL AB This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include <stdlib.h> #include <ctype.h> #include <mysql/plugin.h> #include <mecab.h> #if !defined(__attribute__) && (defined(__cplusplus) || !defined(__GNUC__) || __GNUC__ == 2 && __GNUC_MINOR__ < 8) #define __attribute__(A) #endif static long number_of_calls= 0; /* for SHOW STATUS, see below */ /* Simple full-text parser plugin that acts as a replacement for the built-in full-text parser: - All non-whitespace characters are significant and are interpreted as "word characters." - Whitespace characters are space, tab, CR, LF. - There is no minimum word length. Non-whitespace sequences of one character or longer are words. - Stopwords are used in non-boolean mode, not used in boolean mode. */ /* simple_parser interface functions: Plugin declaration functions: - simple_parser_plugin_init() - simple_parser_plugin_deinit() Parser descriptor functions: - simple_parser_parse() - simple_parser_init() - simple_parser_deinit() */ /* Initialize the parser plugin at server start or plugin installation. SYNOPSIS simple_parser_plugin_init() DESCRIPTION Does nothing. RETURN VALUE 0 success 1 failure (cannot happen) */ static int simple_parser_plugin_init(void *arg __attribute__((unused))) { return(0); } /* Terminate the parser plugin at server shutdown or plugin deinstallation. SYNOPSIS simple_parser_plugin_deinit() Does nothing. 
RETURN VALUE 0 success 1 failure (cannot happen) */ static int simple_parser_plugin_deinit(void *arg __attribute__((unused))) { return(0); } /* Initialize the parser on the first use in the query SYNOPSIS simple_parser_init() DESCRIPTION Does nothing. RETURN VALUE 0 success 1 failure (cannot happen) */ static int simple_parser_init(MYSQL_FTPARSER_PARAM *param __attribute__((unused))) { return(0); } /* Terminate the parser at the end of the query SYNOPSIS simple_parser_deinit() DESCRIPTION Does nothing. RETURN VALUE 0 success 1 failure (cannot happen) */ static int simple_parser_deinit(MYSQL_FTPARSER_PARAM *param __attribute__((unused))) { return(0); } /* Pass a word back to the server. SYNOPSIS add_word() param parsing context of the plugin word a word len word length DESCRIPTION Fill in boolean metadata for the word (if parsing in boolean mode) and pass the word to the server. The server adds the word to a full-text index when parsing for indexing, or adds the word to the list of search terms when parsing a search string. */ static void add_word(MYSQL_FTPARSER_PARAM *param, char *word, size_t len) { MYSQL_FTPARSER_BOOLEAN_INFO bool_info= { FT_TOKEN_WORD, 0, 0, 0, 0, ' ', 0 }; param->mysql_add_word(param, word, len, &bool_info); } /* Parse a document or a search query. SYNOPSIS simple_parser_parse() param parsing context DESCRIPTION This is the main plugin function which is called to parse a document or a search query. The call mode is set in param->mode. This function simply splits the text into words and passes every word to the MySQL full-text indexing engine. 
*/ static int simple_parser_parse(MYSQL_FTPARSER_PARAM *param) { number_of_calls++; mecab_t *mecab; mecab_node_t *node; mecab = mecab_new(0, 0); node = mecab_sparse_tonode2(mecab, param->doc, param->length); for(; node; node = node->next) { if (node->stat == MECAB_BOS_NODE || node->stat == MECAB_EOS_NODE) { continue; } add_word(param, node->surface, node->length); } mecab_destroy(mecab); return(0); } /* Plugin type-specific descriptor */ static struct st_mysql_ftparser simple_parser_descriptor= { MYSQL_FTPARSER_INTERFACE_VERSION, /* interface version */ simple_parser_parse, /* parsing function */ simple_parser_init, /* parser init function */ simple_parser_deinit /* parser deinit function */ }; /* Plugin status variables for SHOW STATUS */ static struct st_mysql_show_var simple_status[]= { {"static", (char *)"just a static text", SHOW_CHAR}, {"called", (char *)&number_of_calls, SHOW_LONG}, {0,0,0} }; /* Plugin system variables. */ static long sysvar_one_value; static char *sysvar_two_value; static MYSQL_SYSVAR_LONG(simple_sysvar_one, sysvar_one_value, PLUGIN_VAR_RQCMDARG, "Simple fulltext parser example system variable number one. Give a number.", NULL, NULL, 77L, 7L, 777L, 0); static MYSQL_SYSVAR_STR(simple_sysvar_two, sysvar_two_value, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC, "Simple fulltext parser example system variable number two. Give a string.", NULL, NULL, "simple sysvar two default"); static MYSQL_THDVAR_LONG(simple_thdvar_one, PLUGIN_VAR_RQCMDARG, "Simple fulltext parser example thread variable number one. Give a number.", NULL, NULL, 88L, 8L, 888L, 0); static MYSQL_THDVAR_STR(simple_thdvar_two, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC, "Simple fulltext parser example thread variable number two. 
Give a string.", NULL, NULL, "simple thdvar two default"); static struct st_mysql_sys_var* simple_system_variables[]= { MYSQL_SYSVAR(simple_sysvar_one), MYSQL_SYSVAR(simple_sysvar_two), MYSQL_SYSVAR(simple_thdvar_one), MYSQL_SYSVAR(simple_thdvar_two), NULL }; /* Plugin library descriptor */ mysql_declare_plugin(ftexample) { MYSQL_FTPARSER_PLUGIN, /* type */ &simple_parser_descriptor, /* descriptor */ "simple_parser", /* name */ "MySQL AB", /* author */ "Simple Full-Text Parser", /* description */ PLUGIN_LICENSE_GPL, simple_parser_plugin_init, /* init function (when loaded) */ simple_parser_plugin_deinit,/* deinit function (when unloaded) */ 0x0001, /* version */ simple_status, /* status variables */ simple_system_variables, /* system variables */ NULL } mysql_declare_plugin_end;
# gcc -Wall -DMYSQL_DYNAMIC_PLUGIN -shared -fPIC `mecab-config --cflags` `mecab-config --libs` -o plugin_example.so plugin_example.c
- プラグインを組み込む
# cp plugin_example.so /usr/lib/mysql/.
mysql> install plugin simple_parser soname 'plugin_example.so'; mysql> show plugin;
- テスト
mysql> CREATE TABLE t ( id integer auto_increment, c text, FULLTEXT INDEX (c) WITH PARSER simple_parser, primary key(id) ) ENGINE=MyISAM DEFAULT CHARSET=Latin1;
mysql> INSERT INTO t SET c = '本日は晴天なり。'; mysql> INSERT INTO t SET c = '本日は曇天なり。'; mysql> INSERT INTO t SET c = '本日は快晴なり。'; mysql> SELECT * FROM t WHERE match(c) against('曇天' in boolean mode ); +----+--------------------------+ | id | c | +----+--------------------------+ | 2 | 本日は曇天なり。 | +----+--------------------------+ 1 row in set (0.01 sec)
できた。
- 備考
テーブルをutf8で作成すると、正しくマッチしない(検索語を含まない行までヒットしてしまう)。
mysql> CREATE TABLE t ( id integer auto_increment, c text, FULLTEXT INDEX (c) WITH PARSER simple_parser, primary key(id) ) ENGINE=MyISAM DEFAULT CHARSET=utf8;
なぜか、どれもマッチしてしまう。
mysql> INSERT INTO t SET c = '本日は晴天なり。'; mysql> INSERT INTO t SET c = '本日は曇天なり。'; mysql> INSERT INTO t SET c = '本日は快晴なり。'; mysql> SELECT * FROM t WHERE match(c) against('曇天' in boolean mode ); +----+--------------------------+ | id | c | +----+--------------------------+ | 1 | 本日は晴天なり。 | | 2 | 本日は曇天なり。 | | 3 | 本日は快晴なり。 | +----+--------------------------+ 3 rows in set (0.01 sec)