Sphinx 之 Coreseek、Sphinx-for-chinese、Sphinx+Scws 評測

Sphinx是一個基於SQL的全文檢索引擎;廣泛使用於不少網站;但因爲中英文的差別,其本身對中文的支持並不好。
主要體現在對一段話斷詞;英文只需按照空格對其分詞即可;但對於博大精深的中文來說,卻是件困難的事情。

分詞在兩個地方會用到;
1、索引時,根據分詞索引原始數據
2、搜索時,對用戶輸入分詞,到索引中查詢

本文提供了三種目前最常用的方案Coreseek、Sphinx-for-chinese、Sphinx+Scws,並對其簡單對比;在合適的場景,選擇合適的方案,得出最優的解。

評測:
1、Coreseek 爲國人基於Sphinx開發的方案,目前最穩定版,是基於經典的Sphinx0.9.9版

優勢:有成熟的文檔、以及社區;其分詞mmseg爲目前國內最爲好用的分詞,索引和搜索分詞都可以用到;
缺點:深度開發、版本更新較慢;索引較慢
策略:一個詞庫管理後臺,維護詞庫;定期生成字典;此套件會自動分詞索引;
適用場景:普通青年、搭建差不多的搜索,適用於大多數網站

2、Sphinx-for-chinese 爲國人基於經典的Sphinx0.9.9版開發的擴展版
優勢:部署簡單,易操作,內嵌分詞和詞庫,索引和搜索分詞都可以用到;
缺點:版本更新較慢;分詞較弱;索引相對較慢
策略:同方案一
適用場景:普通青年、快速搭建搜索的小站

3、Sphinx+Scws 爲兩套獨立系統,單獨部署,所謂高內聚低耦合,強烈推薦
優勢:兩套系統,相對獨立,各自單獨Server;分詞可以做其他用途;版本更新較快;
缺點:部署稍複雜,使用稍複雜;索引分詞只能用一元分詞,數據量較大
策略:詞庫管理外;使用時,先調用分詞服務,後調用搜索
適用場景:文藝青年、搭建像樣點的搜索;好吧文藝青年

因爲本人強烈推薦方案三,故對方案三有詳細描述,其他方案僅提供簡單部署方式。

三種方案具體實現如下:

1、coreseek
=======================================================================================================

官網 http://www.coreseek.cn

基礎庫 http://ftp.gnu.org/gnu/automake/

1、安裝

1.0 基礎庫

#設置路徑和中文環境:
export PATH=/usr/local/bin:$PATH
export LC_ALL=zh_CN.UTF-8
export LANG=zh_CN.UTF-8

#依賴包安裝順序
m4、autoconf、automake、libtool

./configure --prefix=/usr/local
make && make install

1.1 安裝mmseg

./bootstrap
./configure --prefix=/opt/server/mmseg
make
make install

1.2 安裝csft

sh buildconf.sh

./configure --prefix=/opt/server/coreseek --without-unixodbc --with-mmseg=/opt/server/mmseg --with-mmseg-includes=/opt/server/mmseg/include/mmseg/ --with-mmseg-libs=/opt/server/mmseg/lib/ --with-mysql=/opt/server/mysql
make
make install

1.3 配置

mv csft_mysql.conf /opt/server/coreseek/etc

配置 /opt/server/coreseek/etc/csft_mysql.conf 參數 ;注意編碼問題;

#源定義
source mysql
{
    type                    = mysql

    sql_host                = 127.0.0.1
    sql_user                = root
    sql_pass                = 123456
    sql_db                  = test
    sql_port                = 3306
    sql_query_pre            = SET NAMES utf8

    sql_query                = SELECT id, group_id, UNIX_TIMESTAMP(date_added) AS date_added, title, content FROM documents
                                                              #sql_query第一列id需爲整數
                                                              #title、content作爲字符串/文本字段,被全文索引
    sql_attr_uint            = group_id           #從SQL讀取到的值必須爲整數
    sql_attr_timestamp        = date_added #從SQL讀取到的值必須爲整數,作爲時間屬性

    sql_query_info_pre      = SET NAMES utf8                                        #命令行查詢時,設置正確的字符集
    sql_query_info            = SELECT * FROM documents WHERE id=$id #命令行查詢時,從數據庫讀取原始數據信息
}

#index定義
index mysql
{
    source            = mysql             #對應的source名稱
    path            = /opt/server/coreseek/var/data/mysql #請修改爲實際使用的絕對路徑,例如:/usr/local/coreseek/var/...
    docinfo            = extern
    mlock            = 0
    morphology        = none
    min_word_len        = 1
    html_strip                = 0

    #中文分詞配置,詳情請查看:http://www.coreseek.cn/products-install/coreseek_mmseg/
    charset_dictpath = /opt/server/mmseg/etc/ #BSD、Linux環境下設置,/符號結尾
    #charset_dictpath = etc/                             #Windows環境下設置,/符號結尾,最好給出絕對路徑,例如:C:/usr/local/coreseek/etc/...
    charset_type        = zh_cn.utf-8
}

#全局index定義
indexer
{
    mem_limit            = 128M
}

#searchd服務定義
searchd
{
    listen              =   9312
    read_timeout        = 5
    max_children        = 30
    max_matches         = 1000
    seamless_rotate        = 0
    preopen_indexes        = 0
    unlink_old            = 1
    pid_file = /opt/server/coreseek/var/log/searchd_mysql.pid
    log = /opt/server/coreseek/var/log/searchd_mysql.log
    query_log = /opt/server/coreseek/var/log/query_mysql.log
}

  

1.4測試

/opt/server/coreseek/bin/indexer -c /opt/server/coreseek/etc/csft_mysql.conf --all
/opt/server/coreseek/bin/search -c /opt/server/coreseek/etc/csft_mysql.conf 網絡搜索

1.5詞庫管理

詞庫文件
/opt/server/mmseg/bin/mmseg -u /opt/server/mmseg/etc/unigram.txt

unigram.txt.uni 改名爲 uni.lib 放入對應字典目錄

2、sphinx-for-chinese
=======================================================================================================

官網 http://www.sphinx-search.com

1、安裝
./configure --prefix=/opt/server/sphinx-for-chinese --with-mysql=/opt/server/mysql

make
make install

2、中文支持

2.1 生成字典
/opt/server/sphinx-for-chinese/bin/mkdict xdict_1.1.txt xdict

2.2 配置使用字典
mv xdict /opt/server/sphinx-for-chinese/etc/xdict

配置 sphinx.conf 參數

加入
charset_type = utf-8
chinese_dictionary = /opt/server/sphinx-for-chinese/etc/xdict

相關配置詳見 /opt/server/sphinx-for-chinese/etc/sphinx.conf;注意編碼問題;

#
# Minimal Sphinx configuration sample (clean, simple, functional)
#

source src1
{
	type					= mysql

	sql_host				= 127.0.0.1
	sql_user				= root
	sql_pass				= 123456
	sql_db					= mocube
	sql_port				= 3306	# optional, default is 3306
	sql_query_pre 				= SET NAMES UTF8
	sql_query				= SELECT id, title, content FROM mb_activity_detail

	sql_attr_uint				= id

	sql_query_info				= SELECT * FROM mb_activity_detail WHERE id=$id
}


index test1
{
	source					= src1
	path					= /opt/server/sphinx-for-chinese/var/data/test1
	docinfo					= extern
	charset_type 				= utf-8
	chinese_dictionary 			= /opt/server/sphinx-for-chinese/etc/xdict
}


indexer
{
	mem_limit				= 256M
}


searchd
{
	port					= 9312
	log					= /opt/server/sphinx-for-chinese/var/log/searchd.log
	query_log				= /opt/server/sphinx-for-chinese/var/log/query.log
	read_timeout				= 5
	max_children				= 30
	pid_file				= /opt/server/sphinx-for-chinese/var/log/searchd.pid
	max_matches				= 1000
	seamless_rotate				= 1
	preopen_indexes				= 0
	unlink_old				= 1
}

  

3、測試
/opt/server/sphinx-for-chinese/bin/search -c /opt/server/sphinx-for-chinese/etc/sphinx.conf 分享身邊的精彩

3、sphinx+Scws
=======================================================================================================

Scws官網:http://www.ftphp.com/scws/

1、安裝

./configure --prefix=/opt/server/sphinx --with-mysql=/opt/server/mysql

make
make install

2、配置

見/opt/server/sphinx/etc/sphinx.conf

#
#  基礎索引方案
#

source src1
{
	type					= mysql

	sql_host				= 127.0.0.1
	sql_user				= root
	sql_pass				= 123456
	sql_db					= mocube
	sql_port				= 3306
	sql_query_pre 			= SET NAMES UTF8
	sql_query				= SELECT id, title, content FROM mb_activity_detail

	sql_attr_uint			= id

	sql_query_info			= SELECT * FROM mb_activity_detail WHERE id=$id
}


index test1
{
	source			= src1
	path			= /opt/server/sphinx/var/data/test1
	docinfo			= extern
	mlock			= 0
	stopwords		=
	min_prefix_len	= 0
	min_infix_len	= 0
	min_word_len	= 1
	charset_type	= utf-8
	charset_table = U+FF10..U+FF19->0..9, 0..9, U+FF41..U+FF5A->a..z, U+FF21..U+FF3A->a..z,A..Z->a..z, a..z, U+0149, U+017F, U+0138, U+00DF, U+00FF, U+00C0..U+00D6->U+00E0..U+00F6,U+00E0..U+00F6, U+00D8..U+00DE->U+00F8..U+00FE, U+00F8..U+00FE, U+0100->U+0101, U+0101,U+0102->U+0103, U+0103, U+0104->U+0105, U+0105, U+0106->U+0107, U+0107, U+0108->U+0109,U+0109, U+010A->U+010B, U+010B, U+010C->U+010D, U+010D, U+010E->U+010F, U+010F,U+0110->U+0111, U+0111, U+0112->U+0113, U+0113, U+0114->U+0115, U+0115, U+0116->U+0117,U+0117, U+0118->U+0119, U+0119, U+011A->U+011B, U+011B, U+011C->U+011D, U+011D,U+011E->U+011F, U+011F, U+0130->U+0131, U+0131, U+0132->U+0133, U+0133, U+0134->U+0135,U+0135, U+0136->U+0137, U+0137, U+0139->U+013A, U+013A, U+013B->U+013C, U+013C,U+013D->U+013E, U+013E, U+013F->U+0140, U+0140, U+0141->U+0142, U+0142, U+0143->U+0144,U+0144, U+0145->U+0146, U+0146, U+0147->U+0148, U+0148, U+014A->U+014B, U+014B,U+014C->U+014D, U+014D, U+014E->U+014F, U+014F, U+0150->U+0151, U+0151, U+0152->U+0153,U+0153, U+0154->U+0155, U+0155, U+0156->U+0157, U+0157, U+0158->U+0159, U+0159,U+015A->U+015B, U+015B, U+015C->U+015D, U+015D, U+015E->U+015F, U+015F, U+0160->U+0161,U+0161, U+0162->U+0163, U+0163, U+0164->U+0165, U+0165, U+0166->U+0167, U+0167,U+0168->U+0169, U+0169, U+016A->U+016B, U+016B, U+016C->U+016D, U+016D, U+016E->U+016F,U+016F, U+0170->U+0171, U+0171, U+0172->U+0173, U+0173, U+0174->U+0175, U+0175,U+0176->U+0177, U+0177, U+0178->U+00FF, U+00FF, U+0179->U+017A, U+017A, U+017B->U+017C,U+017C, U+017D->U+017E, U+017E, U+0410..U+042F->U+0430..U+044F, U+0430..U+044F,U+05D0..U+05EA, U+0531..U+0556->U+0561..U+0586, U+0561..U+0587, U+0621..U+063A, U+01B9,U+01BF, U+0640..U+064A, U+0660..U+0669, U+066E, U+066F, U+0671..U+06D3, U+06F0..U+06FF,U+0904..U+0939, U+0958..U+095F, U+0960..U+0963, U+0966..U+096F, U+097B..U+097F,U+0985..U+09B9, U+09CE, U+09DC..U+09E3, U+09E6..U+09EF, U+0A05..U+0A39, U+0A59..U+0A5E,U+0A66..U+0A6F, U+0A85..U+0AB9, U+0AE0..U+0AE3, U+0AE6..U+0AEF, 
U+0B05..U+0B39,U+0B5C..U+0B61, U+0B66..U+0B6F, U+0B71, U+0B85..U+0BB9, U+0BE6..U+0BF2, U+0C05..U+0C39,U+0C66..U+0C6F, U+0C85..U+0CB9, U+0CDE..U+0CE3, U+0CE6..U+0CEF, U+0D05..U+0D39, U+0D60,U+0D61, U+0D66..U+0D6F, U+0D85..U+0DC6, U+1900..U+1938, U+1946..U+194F, U+A800..U+A805,U+A807..U+A822, U+0386->U+03B1, U+03AC->U+03B1, U+0388->U+03B5, U+03AD->U+03B5,U+0389->U+03B7, U+03AE->U+03B7, U+038A->U+03B9, U+0390->U+03B9, U+03AA->U+03B9,U+03AF->U+03B9, U+03CA->U+03B9, U+038C->U+03BF, U+03CC->U+03BF, U+038E->U+03C5,U+03AB->U+03C5, U+03B0->U+03C5, U+03CB->U+03C5, U+03CD->U+03C5, U+038F->U+03C9,U+03CE->U+03C9, U+03C2->U+03C3, U+0391..U+03A1->U+03B1..U+03C1,U+03A3..U+03A9->U+03C3..U+03C9, U+03B1..U+03C1, U+03C3..U+03C9, U+0E01..U+0E2E,U+0E30..U+0E3A, U+0E40..U+0E45, U+0E47, U+0E50..U+0E59, U+A000..U+A48F, U+4E00..U+9FBF,U+3400..U+4DBF, U+20000..U+2A6DF, U+F900..U+FAFF, U+2F800..U+2FA1F, U+2E80..U+2EFF,U+2F00..U+2FDF, U+3100..U+312F, U+31A0..U+31BF, U+3040..U+309F, U+30A0..U+30FF,U+31F0..U+31FF, U+AC00..U+D7AF, U+1100..U+11FF, U+3130..U+318F, U+A000..U+A48F,U+A490..U+A4CF
	ngram_len = 1 
	ngram_chars = U+4E00..U+9FBF, U+3400..U+4DBF, U+20000..U+2A6DF, U+F900..U+FAFF,U+2F800..U+2FA1F, U+2E80..U+2EFF, U+2F00..U+2FDF, U+3100..U+312F, U+31A0..U+31BF,U+3040..U+309F, U+30A0..U+30FF, U+31F0..U+31FF, U+AC00..U+D7AF, U+1100..U+11FF,U+3130..U+318F, U+A000..U+A48F, U+A490..U+A4CF
}


index testrt
{
	type			= rt
	rt_mem_limit		= 32M

	path			= /opt/server/sphinx/var/data/testrt
	charset_type		= utf-8

	rt_field		= title
	rt_field		= content
	rt_attr_uint		= id
}


indexer
{
	mem_limit		= 32M
}


searchd
{
	listen			= 9312
	log			= /opt/server/sphinx/var/log/searchd.log
	query_log		= /opt/server/sphinx/var/log/query.log
	read_timeout		= 5
	max_children		= 30
	pid_file		= /opt/server/sphinx/var/log/searchd.pid
	max_matches		= 1000
	seamless_rotate		= 1
	preopen_indexes		= 1
	unlink_old		= 1
	workers			= threads # for RT to work
	binlog_path		= /opt/server/sphinx/var/data
}

  

3、啓動服務

/opt/server/sphinx/bin/searchd

4、php分詞 scws

官網 http://www.ftphp.com/scws/

4.1 安裝
./configure --prefix=/opt/server/scws
make
make install

4.2 詞庫
scws-dict-chs-utf8.tar.bz2 解壓放入 /opt/server/scws/etc

詞庫 dict.utf-8.xdb
規則 rules.utf-8.ini

4.3 php 擴展

源碼在 phpext下

/opt/server/php/bin/phpize
./configure --with-scws=/opt/server/scws --with-php-config=/opt/server/php/bin/php-config
make
make install

vi php.ini
[scws]
extension = scws.so
scws.default.charset = utf-8
scws.default.fpath = /opt/server/scws/etc

查看
/opt/server/php/bin/php -m |grep scws

4.4 分詞測試

官方測試地址:http://www.ftphp.com/scws/docs.php

詳見測試文件 test_all.php

<?php
/**
 * test_all.php - end-to-end check of the SCWS + Sphinx pipeline.
 *
 * Reads the search phrase from the GET parameter "k", segments it into
 * words with the scws extension, then sends the space-joined words to
 * searchd through the Sphinx PHP API.
 *
 * Output (HTML): a start timestamp, the raw input, the segmented words,
 * a var_dump() of the Sphinx result, and an end timestamp.
 */

	echo '<p>'.microtime(true).'</p>';

	// Search phrase. Fall back to '' when "k" is missing or not a string
	// (e.g. ?k[]=x) so trim() never sees an undefined index or an array.
	$key = (isset($_GET['k']) && is_string($_GET['k'])) ? trim($_GET['k']) : '';

//======================================== word segmentation

	$so = scws_new();
	$so->set_charset('utf-8');
	// Default dictionary
	$so->add_dict(ini_get('scws.default.fpath') . '/dict.utf-8.xdb');
	// Custom plain-text dictionary
	$so->add_dict('./dd.txt',SCWS_XDICT_TXT);
	// Default rule set
	$so->set_rule(ini_get('scws.default.fpath') . '/rules.utf-8.ini');

	// Whether to drop some special punctuation from the segmentation result
	$so->set_ignore(true);

	// Whether to use compound segmentation, e.g. "中國人" yields the three
	// words "中國" + "人" + "中國人".
	// The mode is a bitwise OR of 1 | 2 | 4 | 8, meaning:
	//   short words | duality | main single characters | all single characters
	// which correspond to the constants SCWS_MULTI_SHORT, SCWS_MULTI_DUALITY,
	// SCWS_MULTI_ZMAIN and SCWS_MULTI_ZALL.
	$so->set_multi(false);

	// Whether loose single characters are automatically merged in pairs
	$so->set_duality(false);

	// Text to segment
	$so->send_text($key);

	// get_result() hands back the words in batches; keep calling it until it
	// returns false, otherwise only the first batch of a long input is used.
	$word_list = array();
	while ($batch = $so->get_result())
	{
		foreach($batch as $v)
		{
			$word_list[] = $v['word'];
		}
	}
	$words = implode(' ', $word_list);

	$so->close();


	// Escape before echoing: both values derive from user input.
	echo '<p>輸入:'.htmlspecialchars($key, ENT_QUOTES, 'UTF-8').'</p>';
	echo '<p>分詞:'.htmlspecialchars($words, ENT_QUOTES, 'UTF-8').'</p>';

	//var_dump($words);
	//exit;

//======================================== search

	require_once 'sphinxapi.php';

	$sc = new SphinxClient();

	$sc->SetServer('127.0.0.1',9312);

	// Every segmented word must match
	$sc->SetMatchMode(SPH_MATCH_ALL);

	// Return matches as a plain list instead of a map keyed by document id
	$sc->SetArrayResult(TRUE);

	$res = $sc->Query($words);

	// Query() returns false on failure; surface the reason instead of only
	// dumping "bool(false)".
	if ($res === false)
	{
		echo '<p>error: '.htmlspecialchars($sc->GetLastError(), ENT_QUOTES, 'UTF-8').'</p>';
	}

	var_dump($res);


	echo '<p>'.microtime(true).'</p>';


	exit;

?>

  

其他高階操作
=======================================================================================================
5 索引

//索引某個索引
/opt/server/sphinx/bin/indexer test1

//searchd 運行時重建某個索引
/opt/server/sphinx/bin/indexer test1 --rotate

//指定索引搜索
/opt/server/sphinx/bin/search -i test1 '逗她男'

5.1 增量索引方案

//建立表記錄偏移
CREATE TABLE search_counter
(
counterid INTEGER PRIMARY KEY NOT NULL,
max_doc_id INTEGER NOT NULL
);

//增量索引
/opt/server/sphinx/bin/indexer test1stemmed --rotate

//合併索引
/opt/server/sphinx/bin/indexer --merge test1 test1stemmed --rotate

索引策略

1、搜索時,同時從主索引和增量索引取數據
2、每5分鐘,運行一次增量索引;滿足新數據搜索需求
3、每晚,運行一次主索引,同時會更新索引標識;再運行增量索引,實質爲清空增量索引,避免與主索引重複
4、好處:避開合併索引,合併索引效率較差
5、如數據量特別大,可考慮合併索引的方案

索引策略shell

//add.sh
#!/bin/sh
/opt/server/sphinx/bin/indexer test1stemmed --rotate >> /opt/server/sphinx/var/log/add.sh.log

//all.sh
#!/bin/sh
/opt/server/sphinx/bin/indexer test1 --rotate >> /opt/server/sphinx/var/log/all.sh.log
/opt/server/sphinx/bin/indexer test1stemmed --rotate >> /opt/server/sphinx/var/log/add.sh.log

詳情配置

/opt/server/sphinx/etc/sphinx.conf

#
# 增量索引方案
#

source src1
{
	type					= mysql

	sql_host				= 127.0.0.1
	sql_user				= root
	sql_pass				= 123456
	sql_db					= mocube
	sql_port				= 3306
	sql_query_pre				= SET NAMES UTF8
	sql_query_pre				= REPLACE INTO search_counter SELECT 1,MAX(uid) FROM mb_renren  #建立主索引前更改標識位置
	sql_query				= SELECT uid,email,url,nickname,is_access FROM mb_renren
	sql_attr_uint				= is_access	

	sql_query_info				= SELECT * FROM mb_renren WHERE uid=$id
}


source src1throttled
{
	type					= mysql

	sql_host				= 127.0.0.1
	sql_user				= root
	sql_pass				= 123456
	sql_db					= mocube
	sql_port				= 3306
	sql_query_pre 				= SET NAMES UTF8
	sql_query				= SELECT uid,email,url,nickname,is_access FROM mb_renren WHERE uid>(SELECT max_doc_id FROM search_counter WHERE counterid=1) #增量索引是id大於標識位置的部分
	sql_attr_uint                           = is_access
	sql_query_info				= SELECT * FROM mb_renren WHERE uid=$id
}


index test1
{
	source			= src1
	path			= /opt/server/sphinx/var/data/test1
	docinfo			= extern
	mlock			= 0
	stopwords		=
	min_prefix_len	= 0
	min_infix_len	= 0
	min_word_len	= 1
	charset_type	= utf-8
	charset_table = U+FF10..U+FF19->0..9, 0..9, U+FF41..U+FF5A->a..z, U+FF21..U+FF3A->a..z,A..Z->a..z, a..z, U+0149, U+017F, U+0138, U+00DF, U+00FF, U+00C0..U+00D6->U+00E0..U+00F6,U+00E0..U+00F6, U+00D8..U+00DE->U+00F8..U+00FE, U+00F8..U+00FE, U+0100->U+0101, U+0101,U+0102->U+0103, U+0103, U+0104->U+0105, U+0105, U+0106->U+0107, U+0107, U+0108->U+0109,U+0109, U+010A->U+010B, U+010B, U+010C->U+010D, U+010D, U+010E->U+010F, U+010F,U+0110->U+0111, U+0111, U+0112->U+0113, U+0113, U+0114->U+0115, U+0115, U+0116->U+0117,U+0117, U+0118->U+0119, U+0119, U+011A->U+011B, U+011B, U+011C->U+011D, U+011D,U+011E->U+011F, U+011F, U+0130->U+0131, U+0131, U+0132->U+0133, U+0133, U+0134->U+0135,U+0135, U+0136->U+0137, U+0137, U+0139->U+013A, U+013A, U+013B->U+013C, U+013C,U+013D->U+013E, U+013E, U+013F->U+0140, U+0140, U+0141->U+0142, U+0142, U+0143->U+0144,U+0144, U+0145->U+0146, U+0146, U+0147->U+0148, U+0148, U+014A->U+014B, U+014B,U+014C->U+014D, U+014D, U+014E->U+014F, U+014F, U+0150->U+0151, U+0151, U+0152->U+0153,U+0153, U+0154->U+0155, U+0155, U+0156->U+0157, U+0157, U+0158->U+0159, U+0159,U+015A->U+015B, U+015B, U+015C->U+015D, U+015D, U+015E->U+015F, U+015F, U+0160->U+0161,U+0161, U+0162->U+0163, U+0163, U+0164->U+0165, U+0165, U+0166->U+0167, U+0167,U+0168->U+0169, U+0169, U+016A->U+016B, U+016B, U+016C->U+016D, U+016D, U+016E->U+016F,U+016F, U+0170->U+0171, U+0171, U+0172->U+0173, U+0173, U+0174->U+0175, U+0175,U+0176->U+0177, U+0177, U+0178->U+00FF, U+00FF, U+0179->U+017A, U+017A, U+017B->U+017C,U+017C, U+017D->U+017E, U+017E, U+0410..U+042F->U+0430..U+044F, U+0430..U+044F,U+05D0..U+05EA, U+0531..U+0556->U+0561..U+0586, U+0561..U+0587, U+0621..U+063A, U+01B9,U+01BF, U+0640..U+064A, U+0660..U+0669, U+066E, U+066F, U+0671..U+06D3, U+06F0..U+06FF,U+0904..U+0939, U+0958..U+095F, U+0960..U+0963, U+0966..U+096F, U+097B..U+097F,U+0985..U+09B9, U+09CE, U+09DC..U+09E3, U+09E6..U+09EF, U+0A05..U+0A39, U+0A59..U+0A5E,U+0A66..U+0A6F, U+0A85..U+0AB9, U+0AE0..U+0AE3, U+0AE6..U+0AEF, 
U+0B05..U+0B39,U+0B5C..U+0B61, U+0B66..U+0B6F, U+0B71, U+0B85..U+0BB9, U+0BE6..U+0BF2, U+0C05..U+0C39,U+0C66..U+0C6F, U+0C85..U+0CB9, U+0CDE..U+0CE3, U+0CE6..U+0CEF, U+0D05..U+0D39, U+0D60,U+0D61, U+0D66..U+0D6F, U+0D85..U+0DC6, U+1900..U+1938, U+1946..U+194F, U+A800..U+A805,U+A807..U+A822, U+0386->U+03B1, U+03AC->U+03B1, U+0388->U+03B5, U+03AD->U+03B5,U+0389->U+03B7, U+03AE->U+03B7, U+038A->U+03B9, U+0390->U+03B9, U+03AA->U+03B9,U+03AF->U+03B9, U+03CA->U+03B9, U+038C->U+03BF, U+03CC->U+03BF, U+038E->U+03C5,U+03AB->U+03C5, U+03B0->U+03C5, U+03CB->U+03C5, U+03CD->U+03C5, U+038F->U+03C9,U+03CE->U+03C9, U+03C2->U+03C3, U+0391..U+03A1->U+03B1..U+03C1,U+03A3..U+03A9->U+03C3..U+03C9, U+03B1..U+03C1, U+03C3..U+03C9, U+0E01..U+0E2E,U+0E30..U+0E3A, U+0E40..U+0E45, U+0E47, U+0E50..U+0E59, U+A000..U+A48F, U+4E00..U+9FBF,U+3400..U+4DBF, U+20000..U+2A6DF, U+F900..U+FAFF, U+2F800..U+2FA1F, U+2E80..U+2EFF,U+2F00..U+2FDF, U+3100..U+312F, U+31A0..U+31BF, U+3040..U+309F, U+30A0..U+30FF,U+31F0..U+31FF, U+AC00..U+D7AF, U+1100..U+11FF, U+3130..U+318F, U+A000..U+A48F,U+A490..U+A4CF
	ngram_len = 1 
	ngram_chars = U+4E00..U+9FBF, U+3400..U+4DBF, U+20000..U+2A6DF, U+F900..U+FAFF,U+2F800..U+2FA1F, U+2E80..U+2EFF, U+2F00..U+2FDF, U+3100..U+312F, U+31A0..U+31BF,U+3040..U+309F, U+30A0..U+30FF, U+31F0..U+31FF, U+AC00..U+D7AF, U+1100..U+11FF,U+3130..U+318F, U+A000..U+A48F, U+A490..U+A4CF
}

index test1stemmed
{
	source			= src1throttled
	path			= /opt/server/sphinx/var/data/test1stemmed
	docinfo			= extern
	mlock			= 0
	stopwords		=
	min_prefix_len	= 0
	min_infix_len	= 0
	min_word_len	= 1
	charset_type	= utf-8
	charset_table = U+FF10..U+FF19->0..9, 0..9, U+FF41..U+FF5A->a..z, U+FF21..U+FF3A->a..z,A..Z->a..z, a..z, U+0149, U+017F, U+0138, U+00DF, U+00FF, U+00C0..U+00D6->U+00E0..U+00F6,U+00E0..U+00F6, U+00D8..U+00DE->U+00F8..U+00FE, U+00F8..U+00FE, U+0100->U+0101, U+0101,U+0102->U+0103, U+0103, U+0104->U+0105, U+0105, U+0106->U+0107, U+0107, U+0108->U+0109,U+0109, U+010A->U+010B, U+010B, U+010C->U+010D, U+010D, U+010E->U+010F, U+010F,U+0110->U+0111, U+0111, U+0112->U+0113, U+0113, U+0114->U+0115, U+0115, U+0116->U+0117,U+0117, U+0118->U+0119, U+0119, U+011A->U+011B, U+011B, U+011C->U+011D, U+011D,U+011E->U+011F, U+011F, U+0130->U+0131, U+0131, U+0132->U+0133, U+0133, U+0134->U+0135,U+0135, U+0136->U+0137, U+0137, U+0139->U+013A, U+013A, U+013B->U+013C, U+013C,U+013D->U+013E, U+013E, U+013F->U+0140, U+0140, U+0141->U+0142, U+0142, U+0143->U+0144,U+0144, U+0145->U+0146, U+0146, U+0147->U+0148, U+0148, U+014A->U+014B, U+014B,U+014C->U+014D, U+014D, U+014E->U+014F, U+014F, U+0150->U+0151, U+0151, U+0152->U+0153,U+0153, U+0154->U+0155, U+0155, U+0156->U+0157, U+0157, U+0158->U+0159, U+0159,U+015A->U+015B, U+015B, U+015C->U+015D, U+015D, U+015E->U+015F, U+015F, U+0160->U+0161,U+0161, U+0162->U+0163, U+0163, U+0164->U+0165, U+0165, U+0166->U+0167, U+0167,U+0168->U+0169, U+0169, U+016A->U+016B, U+016B, U+016C->U+016D, U+016D, U+016E->U+016F,U+016F, U+0170->U+0171, U+0171, U+0172->U+0173, U+0173, U+0174->U+0175, U+0175,U+0176->U+0177, U+0177, U+0178->U+00FF, U+00FF, U+0179->U+017A, U+017A, U+017B->U+017C,U+017C, U+017D->U+017E, U+017E, U+0410..U+042F->U+0430..U+044F, U+0430..U+044F,U+05D0..U+05EA, U+0531..U+0556->U+0561..U+0586, U+0561..U+0587, U+0621..U+063A, U+01B9,U+01BF, U+0640..U+064A, U+0660..U+0669, U+066E, U+066F, U+0671..U+06D3, U+06F0..U+06FF,U+0904..U+0939, U+0958..U+095F, U+0960..U+0963, U+0966..U+096F, U+097B..U+097F,U+0985..U+09B9, U+09CE, U+09DC..U+09E3, U+09E6..U+09EF, U+0A05..U+0A39, U+0A59..U+0A5E,U+0A66..U+0A6F, U+0A85..U+0AB9, U+0AE0..U+0AE3, U+0AE6..U+0AEF, 
U+0B05..U+0B39,U+0B5C..U+0B61, U+0B66..U+0B6F, U+0B71, U+0B85..U+0BB9, U+0BE6..U+0BF2, U+0C05..U+0C39,U+0C66..U+0C6F, U+0C85..U+0CB9, U+0CDE..U+0CE3, U+0CE6..U+0CEF, U+0D05..U+0D39, U+0D60,U+0D61, U+0D66..U+0D6F, U+0D85..U+0DC6, U+1900..U+1938, U+1946..U+194F, U+A800..U+A805,U+A807..U+A822, U+0386->U+03B1, U+03AC->U+03B1, U+0388->U+03B5, U+03AD->U+03B5,U+0389->U+03B7, U+03AE->U+03B7, U+038A->U+03B9, U+0390->U+03B9, U+03AA->U+03B9,U+03AF->U+03B9, U+03CA->U+03B9, U+038C->U+03BF, U+03CC->U+03BF, U+038E->U+03C5,U+03AB->U+03C5, U+03B0->U+03C5, U+03CB->U+03C5, U+03CD->U+03C5, U+038F->U+03C9,U+03CE->U+03C9, U+03C2->U+03C3, U+0391..U+03A1->U+03B1..U+03C1,U+03A3..U+03A9->U+03C3..U+03C9, U+03B1..U+03C1, U+03C3..U+03C9, U+0E01..U+0E2E,U+0E30..U+0E3A, U+0E40..U+0E45, U+0E47, U+0E50..U+0E59, U+A000..U+A48F, U+4E00..U+9FBF,U+3400..U+4DBF, U+20000..U+2A6DF, U+F900..U+FAFF, U+2F800..U+2FA1F, U+2E80..U+2EFF,U+2F00..U+2FDF, U+3100..U+312F, U+31A0..U+31BF, U+3040..U+309F, U+30A0..U+30FF,U+31F0..U+31FF, U+AC00..U+D7AF, U+1100..U+11FF, U+3130..U+318F, U+A000..U+A48F,U+A490..U+A4CF
	ngram_len = 1 
	ngram_chars = U+4E00..U+9FBF, U+3400..U+4DBF, U+20000..U+2A6DF, U+F900..U+FAFF,U+2F800..U+2FA1F, U+2E80..U+2EFF, U+2F00..U+2FDF, U+3100..U+312F, U+31A0..U+31BF,U+3040..U+309F, U+30A0..U+30FF, U+31F0..U+31FF, U+AC00..U+D7AF, U+1100..U+11FF,U+3130..U+318F, U+A000..U+A48F, U+A490..U+A4CF
}


index testrt
{
        type                    = rt
        rt_mem_limit            = 32M

        path                    = /opt/server/sphinx/var/data/testrt
        charset_type            = utf-8

        rt_field                = email
        rt_field                = url
	rt_field                = nickname
        rt_attr_uint            = is_access
}


indexer
{
	mem_limit		= 32M
}


searchd
{
	listen			= 9312
	log			= /opt/server/sphinx/var/log/searchd.log
	query_log		= /opt/server/sphinx/var/log/query.log
	read_timeout		= 5
	max_children		= 30
	pid_file		= /opt/server/sphinx/var/log/searchd.pid
	max_matches		= 1000
	seamless_rotate		= 1
	preopen_indexes		= 1
	unlink_old		= 1
	workers			= threads # for RT to work
	binlog_path		= /opt/server/sphinx/var/data
}

  

5.2 實時索引

策略:

在插入MySQL數據的同時,將需要索引的數據,同時往sphinx 虛擬MySQL數據庫寫入一份,實時索引

應用場景:

對實時搜索要求非常高的應用;一般不推薦此方式,尚且不穩定

方案:

編寫相關PHP類,在MySQL插入數據的類中,同時插入索引數據,前端使用者無需關心邏輯

詳情配置

/opt/server/sphinx/etc/sphinx.conf

#
# 實時索引方案
#

source src1
{
	type					= mysql

	sql_host				= 127.0.0.1
	sql_user				= root
	sql_pass				= 123456
	sql_db					= test
	sql_port				= 3306
	sql_query_pre 			= SET NAMES UTF8
    sql_query                = SELECT id, group_id, UNIX_TIMESTAMP(date_added) AS date_added, title, content FROM documents
                                                              #sql_query第一列id需爲整數
                                                              #title、content作爲字符串/文本字段,被全文索引
    sql_attr_uint            = group_id           #從SQL讀取到的值必須爲整數
    sql_attr_timestamp        = date_added #從SQL讀取到的值必須爲整數,作爲時間屬性
    sql_query_info            = SELECT * FROM documents WHERE id=$id #命令行查詢時,從數據庫讀取原始數據信息
}


index test1
{
	source			= src1
	path			= /opt/server/sphinx/var/data/test1
	docinfo			= extern
	mlock			= 0
	stopwords		=
	min_prefix_len	= 0
	min_infix_len	= 0
	min_word_len	= 1
	charset_type	= utf-8
	charset_table = U+FF10..U+FF19->0..9, 0..9, U+FF41..U+FF5A->a..z, U+FF21..U+FF3A->a..z,A..Z->a..z, a..z, U+0149, U+017F, U+0138, U+00DF, U+00FF, U+00C0..U+00D6->U+00E0..U+00F6,U+00E0..U+00F6, U+00D8..U+00DE->U+00F8..U+00FE, U+00F8..U+00FE, U+0100->U+0101, U+0101,U+0102->U+0103, U+0103, U+0104->U+0105, U+0105, U+0106->U+0107, U+0107, U+0108->U+0109,U+0109, U+010A->U+010B, U+010B, U+010C->U+010D, U+010D, U+010E->U+010F, U+010F,U+0110->U+0111, U+0111, U+0112->U+0113, U+0113, U+0114->U+0115, U+0115, U+0116->U+0117,U+0117, U+0118->U+0119, U+0119, U+011A->U+011B, U+011B, U+011C->U+011D, U+011D,U+011E->U+011F, U+011F, U+0130->U+0131, U+0131, U+0132->U+0133, U+0133, U+0134->U+0135,U+0135, U+0136->U+0137, U+0137, U+0139->U+013A, U+013A, U+013B->U+013C, U+013C,U+013D->U+013E, U+013E, U+013F->U+0140, U+0140, U+0141->U+0142, U+0142, U+0143->U+0144,U+0144, U+0145->U+0146, U+0146, U+0147->U+0148, U+0148, U+014A->U+014B, U+014B,U+014C->U+014D, U+014D, U+014E->U+014F, U+014F, U+0150->U+0151, U+0151, U+0152->U+0153,U+0153, U+0154->U+0155, U+0155, U+0156->U+0157, U+0157, U+0158->U+0159, U+0159,U+015A->U+015B, U+015B, U+015C->U+015D, U+015D, U+015E->U+015F, U+015F, U+0160->U+0161,U+0161, U+0162->U+0163, U+0163, U+0164->U+0165, U+0165, U+0166->U+0167, U+0167,U+0168->U+0169, U+0169, U+016A->U+016B, U+016B, U+016C->U+016D, U+016D, U+016E->U+016F,U+016F, U+0170->U+0171, U+0171, U+0172->U+0173, U+0173, U+0174->U+0175, U+0175,U+0176->U+0177, U+0177, U+0178->U+00FF, U+00FF, U+0179->U+017A, U+017A, U+017B->U+017C,U+017C, U+017D->U+017E, U+017E, U+0410..U+042F->U+0430..U+044F, U+0430..U+044F,U+05D0..U+05EA, U+0531..U+0556->U+0561..U+0586, U+0561..U+0587, U+0621..U+063A, U+01B9,U+01BF, U+0640..U+064A, U+0660..U+0669, U+066E, U+066F, U+0671..U+06D3, U+06F0..U+06FF,U+0904..U+0939, U+0958..U+095F, U+0960..U+0963, U+0966..U+096F, U+097B..U+097F,U+0985..U+09B9, U+09CE, U+09DC..U+09E3, U+09E6..U+09EF, U+0A05..U+0A39, U+0A59..U+0A5E,U+0A66..U+0A6F, U+0A85..U+0AB9, U+0AE0..U+0AE3, U+0AE6..U+0AEF, 
U+0B05..U+0B39,U+0B5C..U+0B61, U+0B66..U+0B6F, U+0B71, U+0B85..U+0BB9, U+0BE6..U+0BF2, U+0C05..U+0C39,U+0C66..U+0C6F, U+0C85..U+0CB9, U+0CDE..U+0CE3, U+0CE6..U+0CEF, U+0D05..U+0D39, U+0D60,U+0D61, U+0D66..U+0D6F, U+0D85..U+0DC6, U+1900..U+1938, U+1946..U+194F, U+A800..U+A805,U+A807..U+A822, U+0386->U+03B1, U+03AC->U+03B1, U+0388->U+03B5, U+03AD->U+03B5,U+0389->U+03B7, U+03AE->U+03B7, U+038A->U+03B9, U+0390->U+03B9, U+03AA->U+03B9,U+03AF->U+03B9, U+03CA->U+03B9, U+038C->U+03BF, U+03CC->U+03BF, U+038E->U+03C5,U+03AB->U+03C5, U+03B0->U+03C5, U+03CB->U+03C5, U+03CD->U+03C5, U+038F->U+03C9,U+03CE->U+03C9, U+03C2->U+03C3, U+0391..U+03A1->U+03B1..U+03C1,U+03A3..U+03A9->U+03C3..U+03C9, U+03B1..U+03C1, U+03C3..U+03C9, U+0E01..U+0E2E,U+0E30..U+0E3A, U+0E40..U+0E45, U+0E47, U+0E50..U+0E59, U+A000..U+A48F, U+4E00..U+9FBF,U+3400..U+4DBF, U+20000..U+2A6DF, U+F900..U+FAFF, U+2F800..U+2FA1F, U+2E80..U+2EFF,U+2F00..U+2FDF, U+3100..U+312F, U+31A0..U+31BF, U+3040..U+309F, U+30A0..U+30FF,U+31F0..U+31FF, U+AC00..U+D7AF, U+1100..U+11FF, U+3130..U+318F, U+A000..U+A48F,U+A490..U+A4CF
	ngram_len = 1 
	ngram_chars = U+4E00..U+9FBF, U+3400..U+4DBF, U+20000..U+2A6DF, U+F900..U+FAFF,U+2F800..U+2FA1F, U+2E80..U+2EFF, U+2F00..U+2FDF, U+3100..U+312F, U+31A0..U+31BF,U+3040..U+309F, U+30A0..U+30FF, U+31F0..U+31FF, U+AC00..U+D7AF, U+1100..U+11FF,U+3130..U+318F, U+A000..U+A48F, U+A490..U+A4CF
}


index testrt
{
	type			= rt
	rt_mem_limit		= 32M
	path			= /opt/server/sphinx/var/data/testrt

	docinfo            = extern
	mlock            = 0
	morphology        = none
	min_word_len        = 1
	html_strip                = 0

	charset_type	= utf-8
	charset_table = U+FF10..U+FF19->0..9, 0..9, U+FF41..U+FF5A->a..z, U+FF21..U+FF3A->a..z,A..Z->a..z, a..z, U+0149, U+017F, U+0138, U+00DF, U+00FF, U+00C0..U+00D6->U+00E0..U+00F6,U+00E0..U+00F6, U+00D8..U+00DE->U+00F8..U+00FE, U+00F8..U+00FE, U+0100->U+0101, U+0101,U+0102->U+0103, U+0103, U+0104->U+0105, U+0105, U+0106->U+0107, U+0107, U+0108->U+0109,U+0109, U+010A->U+010B, U+010B, U+010C->U+010D, U+010D, U+010E->U+010F, U+010F,U+0110->U+0111, U+0111, U+0112->U+0113, U+0113, U+0114->U+0115, U+0115, U+0116->U+0117,U+0117, U+0118->U+0119, U+0119, U+011A->U+011B, U+011B, U+011C->U+011D, U+011D,U+011E->U+011F, U+011F, U+0130->U+0131, U+0131, U+0132->U+0133, U+0133, U+0134->U+0135,U+0135, U+0136->U+0137, U+0137, U+0139->U+013A, U+013A, U+013B->U+013C, U+013C,U+013D->U+013E, U+013E, U+013F->U+0140, U+0140, U+0141->U+0142, U+0142, U+0143->U+0144,U+0144, U+0145->U+0146, U+0146, U+0147->U+0148, U+0148, U+014A->U+014B, U+014B,U+014C->U+014D, U+014D, U+014E->U+014F, U+014F, U+0150->U+0151, U+0151, U+0152->U+0153,U+0153, U+0154->U+0155, U+0155, U+0156->U+0157, U+0157, U+0158->U+0159, U+0159,U+015A->U+015B, U+015B, U+015C->U+015D, U+015D, U+015E->U+015F, U+015F, U+0160->U+0161,U+0161, U+0162->U+0163, U+0163, U+0164->U+0165, U+0165, U+0166->U+0167, U+0167,U+0168->U+0169, U+0169, U+016A->U+016B, U+016B, U+016C->U+016D, U+016D, U+016E->U+016F,U+016F, U+0170->U+0171, U+0171, U+0172->U+0173, U+0173, U+0174->U+0175, U+0175,U+0176->U+0177, U+0177, U+0178->U+00FF, U+00FF, U+0179->U+017A, U+017A, U+017B->U+017C,U+017C, U+017D->U+017E, U+017E, U+0410..U+042F->U+0430..U+044F, U+0430..U+044F,U+05D0..U+05EA, U+0531..U+0556->U+0561..U+0586, U+0561..U+0587, U+0621..U+063A, U+01B9,U+01BF, U+0640..U+064A, U+0660..U+0669, U+066E, U+066F, U+0671..U+06D3, U+06F0..U+06FF,U+0904..U+0939, U+0958..U+095F, U+0960..U+0963, U+0966..U+096F, U+097B..U+097F,U+0985..U+09B9, U+09CE, U+09DC..U+09E3, U+09E6..U+09EF, U+0A05..U+0A39, U+0A59..U+0A5E,U+0A66..U+0A6F, U+0A85..U+0AB9, U+0AE0..U+0AE3, U+0AE6..U+0AEF, 
U+0B05..U+0B39,U+0B5C..U+0B61, U+0B66..U+0B6F, U+0B71, U+0B85..U+0BB9, U+0BE6..U+0BF2, U+0C05..U+0C39,U+0C66..U+0C6F, U+0C85..U+0CB9, U+0CDE..U+0CE3, U+0CE6..U+0CEF, U+0D05..U+0D39, U+0D60,U+0D61, U+0D66..U+0D6F, U+0D85..U+0DC6, U+1900..U+1938, U+1946..U+194F, U+A800..U+A805,U+A807..U+A822, U+0386->U+03B1, U+03AC->U+03B1, U+0388->U+03B5, U+03AD->U+03B5,U+0389->U+03B7, U+03AE->U+03B7, U+038A->U+03B9, U+0390->U+03B9, U+03AA->U+03B9,U+03AF->U+03B9, U+03CA->U+03B9, U+038C->U+03BF, U+03CC->U+03BF, U+038E->U+03C5,U+03AB->U+03C5, U+03B0->U+03C5, U+03CB->U+03C5, U+03CD->U+03C5, U+038F->U+03C9,U+03CE->U+03C9, U+03C2->U+03C3, U+0391..U+03A1->U+03B1..U+03C1,U+03A3..U+03A9->U+03C3..U+03C9, U+03B1..U+03C1, U+03C3..U+03C9, U+0E01..U+0E2E,U+0E30..U+0E3A, U+0E40..U+0E45, U+0E47, U+0E50..U+0E59, U+A000..U+A48F, U+4E00..U+9FBF,U+3400..U+4DBF, U+20000..U+2A6DF, U+F900..U+FAFF, U+2F800..U+2FA1F, U+2E80..U+2EFF,U+2F00..U+2FDF, U+3100..U+312F, U+31A0..U+31BF, U+3040..U+309F, U+30A0..U+30FF,U+31F0..U+31FF, U+AC00..U+D7AF, U+1100..U+11FF, U+3130..U+318F, U+A000..U+A48F,U+A490..U+A4CF
	ngram_len = 1 
	ngram_chars = U+4E00..U+9FBF, U+3400..U+4DBF, U+20000..U+2A6DF, U+F900..U+FAFF,U+2F800..U+2FA1F, U+2E80..U+2EFF, U+2F00..U+2FDF, U+3100..U+312F, U+31A0..U+31BF,U+3040..U+309F, U+30A0..U+30FF, U+31F0..U+31FF, U+AC00..U+D7AF, U+1100..U+11FF,U+3130..U+318F, U+A000..U+A48F, U+A490..U+A4CF

    #全文索引字段
    rt_field                  = title               #全文索引字段
    rt_field                  = content         #全文索引字段

    #屬性字段
    rt_attr_uint            = groupid
    rt_attr_bigint         = biguid
    rt_attr_float           = score
    rt_attr_timestamp  = date_added

    #存儲內容字段
    rt_attr_string          = author          #存儲author的內容

    #已設置全文索引,並須要同時存儲內容的字段
    rt_attr_string          = title              #同時存儲title的內容
    rt_attr_string          = content        #同時存儲content的內容
}


indexer
{
	mem_limit		= 32M
}


searchd
{
	listen			= 9312
	listen			= 9306:mysql41
	log			= /opt/server/sphinx/var/log/searchd.log
	query_log		= /opt/server/sphinx/var/log/query.log
	read_timeout		= 5
	max_children		= 30
	pid_file		= /opt/server/sphinx/var/log/searchd.pid
	max_matches		= 1000
	seamless_rotate		= 1
	preopen_indexes		= 1
	unlink_old		= 1
	workers			= threads # for RT to work
	binlog_path		= /opt/server/sphinx/var/data
}

  

/opt/server/mysql/bin/mysql -h127.0.0.1 -P9306
show tables;
desc testrt;

插入數據
INSERT INTO testrt VALUES ( 1, 'first record', 'test one', 123 );
INSERT INTO testrt VALUES ( 2, 'second record', 'test two', 234 );
INSERT INTO testrt VALUES ( 4, '沒有什麼不得了', '真的麼,就不告訴test', 6789 );

檢索
SELECT * FROM testrt WHERE MATCH('test');
SELECT * FROM testrt WHERE MATCH('真的麼');

5.3 多個表獨立索引方案

場景:若有用戶搜索、商品搜索等多個索引需求

策略:

配置一個多索引方案,每個表單獨建立索引

前端根據不同類型選擇不同的查詢索引;全部,即選擇所有索引

相關文章
相關標籤/搜索