Sphinx是一個基於SQL的全文檢索引擎,可以結合MySQL、PostgreSQL做全文搜索,它可以提供比數據庫本身更專業的搜索功能,使得應用程序更容易實現專業化的全文檢索。Sphinx特別爲一些腳本語言設計了搜索API接口,如PHP、Python、Perl、Ruby等,同時爲MySQL也設計了一個存儲引擎插件。
Sphinx 單一索引最大可包含1億條記錄,在1千萬條記錄情況下的查詢速度爲0.x秒(毫秒級)。Sphinx建立索引的速度爲:建立100萬條記錄的索引只需3~4分鐘,建立1000萬條記錄的索引可以在50分鐘內完成,而只包含最新10萬條記錄的增量索引,重建一次只需幾十秒。
由於sphinx默認不支持中文索引及檢索,而coreseek基於sphinx開發了coreseek全文檢索服務器,它提供了爲sphinx設計的中文分詞包libmmseg(包含mmseg中文分詞),是目前用得最多的sphinx中文檢索方案。在沒有sphinx之前,mysql數據庫要對海量的文章中的詞進行全文索引,一般用的語句例如:SELECT *** WHERE *** LIKE '%word%';這樣的LIKE查詢,並且再結合通配符%,是使用不到mysql本身的索引的,需要全表掃描,速度非常慢!
如果用到sphinx,全文索引交給sphinx來做,sphinx返回含有該word的ID號,然後用該ID號直接去數據庫準確定位那些數據,整個過程如下圖:
1. cd /data/program
2. tar zxvf coreseek-4.1-beta.tar.gz
3. cd coreseek-4.1-beta
4. cd mmseg-3.2.14
5. ./bootstrap
6. ./configure --prefix=/usr/local/mmseg3
7. make && make install
8.
9. 遇到的問題:
10. error: cannot find input file: src/Makefile.in
11. 或者遇到其他類似error錯誤時...
12.
13. 解決方案:
14. 依次執行下面的命令
15. yum -y install libtool
16.
17. aclocal
18. libtoolize --force
19. automake --add-missing
20. autoconf
21. autoheader
22. make clean
安裝好'libtool'後,繼續從'aclocal'開始執行上面提到的一串命令,執行完後再運行最開始的安裝流程即可。
1. ##安裝coreseek
2. $ cd csft-3.2.14 或者 cd csft-4.0.1 或者 cd csft-4.1
3. $ sh buildconf.sh #輸出的warning信息可以忽略,如果出現error則需要解決
4. 如無法編譯
5. 1. 在 csft-4.1/buildconf.sh 文件中,查找
6. && aclocal \ 後加上
7. && automake --add-missing \
8. 2. 在 csft-4.1/configure.ac 文件中,
9. 查找:
10. AM_INIT_AUTOMAKE([-Wall -Werror foreign])改成:
11. AM_INIT_AUTOMAKE([-Wall foreign])
12. 查找:
13. AC_PROG_RANLIB 後面加上:
14. AM_PROG_AR
15. 3. 最後,在 csft-4.1/src/sphinxexpr.cpp 文件中, 替換所有:
16. T val = ExprEval ( this->m_pArg, tMatch );爲:
17. T val = this->ExprEval ( this->m_pArg, tMatch );
無錯誤後繼續執行下面命令:
18. $ ./configure --prefix=/usr/local/coreseek --without-unixodbc --with-mmseg --with-mmseg-includes=/usr/local/mmseg3/include/mmseg/ --with-mmseg-libs=/usr/local/mmseg3/lib/ --with-mysql
19. ##如果提示mysql問題,可以查看MySQL數據源安裝說明 http://www.coreseek.cn/product_install/install_on_bsd_linux/#mysql
20. $ make && make install
21. $ cd ..
22.
23.
24. ##命令行測試mmseg分詞,coreseek搜索(需要預先設置好字符集爲zh_CN.UTF-8,確保正確顯示中文)
25. $ cd testpack
26. $ cat var/test/test.xml #此時應該正確顯示中文
27. $ /usr/local/mmseg3/bin/mmseg -d /usr/local/mmseg3/etc var/test/test.xml
28. $ /usr/local/coreseek/bin/indexer -c etc/csft.conf --all
29. $ /usr/local/coreseek/bin/search -c etc/csft.conf 網絡搜索
如出現 xmlpipe2 support NOT compiled in. To use xmlpipe2, install missing XML libraries, reconfigure, and rebuild Sphinx 這個錯誤,執行以下命令:
1. yum -y install expat expat-devel
安裝完成後,重新編譯coreseek,然後再生成索引,就可以通過了。
測試結果以下:
1. Coreseek Fulltext 4.1 [ Sphinx 2.0.2-dev (r2922)]
2. Copyright (c) 2007-2011,
3. Beijing Choice Software Technologies Inc (http://www.coreseek.com)
4.
5. using config file 'etc/csft.conf'...
6. index 'xml': query '網絡搜索 ': returned 1 matches of 1 total in 0.000 sec
7.
8. displaying matches:
9. 1. document=1, weight=1590, published=Thu Apr 1 07:20:07 2010, author_id=1
10.
11. words:
12. 1. '網絡': 1 documents, 1 hits
建立配置sphinx與mysql的配置文件
1. # vi /usr/local/coreseek/etc/csft_mysql.conf
1. #MySQL數據源配置,詳情請查看:http://www.coreseek.cn/docs/coreseek_4.1-sphinx_2.0.1-beta.html#conf-reference
2.
2. #源定義
3. source threads
4. {
5. type = mysql
6.
7. sql_host = localhost
8. sql_user = root
9. sql_pass = root
10. sql_db = ultrax
11. sql_port = 3306
12. sql_query_pre = SET NAMES utf8 #(這裏如果數據庫是utf8的就設置爲utf8,下面所有SET NAMES設置同這裏)
13. # sql_query_pre = SET SESSION query_cache_type=OFF
14. sql_query_pre = CREATE TABLE IF NOT EXISTS pre_common_sphinxcounter ( indexid INTEGER PRIMARY KEY NOT NULL,maxid INTEGER NOT NULL)
15. sql_query_pre = REPLACE INTO pre_common_sphinxcounter SELECT 1, MAX(tid)-10 FROM pre_forum_thread
16.
17. sql_query = SELECT t.tid AS id,t.tid,t.subject,t.digest,t.displayorder,t.authorid,t.lastpost,t.special \
18. FROM pre_forum_thread AS t \
19. WHERE t.tid>=$start AND t.tid<=$end
20.
21. sql_query_range = SELECT (SELECT MIN(tid) FROM pre_forum_thread),maxid FROM pre_common_sphinxcounter WHERE indexid=1
22. sql_range_step = 4096
23.
24. sql_attr_uint = tid
25. sql_attr_uint = digest
26. sql_attr_uint = displayorder
27. sql_attr_uint = authorid
28. sql_attr_uint = special
29.
30. sql_attr_timestamp =lastpost
31.
32. sql_query_info = SELECT * FROM pre_forum_thread WHERE tid=$id
33. }
34.
35. #threads
36. index threads
37. {
38. source = threads
39. path = /var/data/threads #windows下最好用全路徑
40. docinfo = extern
41. mlock = 0
42. morphology = none
43. min_word_len = 1
44. html_strip = 0
45. charset_dictpath = /usr/local/mmseg3/etc/ #BSD、Linux環境下設置,/符號結尾
46. charset_type = zh_cn.utf-8
47. #charset_debug = 0
48. ngram_len = 0
49. }
50.
51. #threads_minute
52. source threads_minute : threads
53. {
54. sql_query_pre =
55. sql_query_pre = SET NAMES utf8
56. # sql_query_pre = SET SESSION query_cache_type=OFF
57. sql_query_range = SELECT maxid+1,(SELECT MAX(tid) FROM pre_forum_thread) FROM pre_common_sphinxcounter WHERE indexid=1
58. }
59.
60. #threads_minute
61. index threads_minute : threads
62. {
63. source = threads_minute
64. path = /var/data/threads_minute #windows下最好用全路徑
65. }
66.
67. ##################################################
68.
69. #posts
70. source posts
71. {
72. type = mysql
73.
74. sql_host = localhost
75. sql_user = root
76. sql_pass = root
77. sql_db = ultrax
78. sql_port = 3306
79. sql_query_pre =
80. sql_query_pre = SET NAMES utf8
81. # sql_query_pre = SET SESSION query_cache_type=OFF
82. sql_query_pre = REPLACE INTO pre_common_sphinxcounter SELECT 2, MAX(pid)-2 FROM pre_forum_post
83. sql_query = SELECT p.pid AS id,p.tid,p.subject,p.message,t.digest,t.displayorder,t.authorid,t.lastpost,t.special \
84. FROM pre_forum_post AS p LEFT JOIN pre_forum_thread AS t USING(tid) \
85. WHERE p.pid>=$start AND p.pid<=$end
86.
87. sql_query_range = SELECT (SELECT MIN(pid) FROM pre_forum_post),maxid FROM pre_common_sphinxcounter WHERE indexid=2
88. sql_range_step = 4096
89.
90. sql_attr_uint = tid
91. sql_attr_uint = digest
92. sql_attr_uint = displayorder
93. sql_attr_uint = authorid
94. sql_attr_uint = special
95.
96. sql_attr_timestamp =lastpost
97.
98. sql_query_info = SELECT * FROM pre_forum_post WHERE pid=$id
99. }
100.
101. #posts
102. index posts
103. {
104. source = posts
105. path = /var/data/posts #windows下最好用全路徑
106. docinfo = extern
107. mlock = 0
108. morphology = none
109. min_word_len = 1
110. html_strip = 0
111. charset_dictpath = /usr/local/mmseg3/etc/ #BSD、Linux環境下設置,/符號結尾
112. charset_type = zh_cn.utf-8
113. #charset_debug = 0
114. ngram_len = 0
115. }
116.
117. #posts_minute
118. source posts_minute : posts
119. {
120. sql_query_pre =
121. sql_query_pre = SET NAMES utf8
122. # sql_query_pre = SET SESSION query_cache_type=OFF
123. sql_query_range = SELECT maxid+1,(SELECT MAX(pid) FROM pre_forum_post) FROM pre_common_sphinxcounter WHERE indexid=2
124. }
125.
126. #posts_minute
127. index posts_minute : posts
128. {
129. source = posts_minute
130. path = /var/data/posts_minute #windows下最好用全路徑
131. }
132.
133. #全局indexer定義
134. indexer
135. {
136. mem_limit = 256M
137. }
138.
139. #searchd服務定義
140. searchd
141. {
142. listen = 3312
143. read_timeout = 5
144. max_children = 30
145. max_matches = 10000
146. seamless_rotate = 0
147. preopen_indexes = 0
148. unlink_old = 1
149. pid_file = /usr/local/coreseek/var/log/searchd_discuzx.pid #windows下最好用全路徑
150. log = /usr/local/coreseek/var/log/searchd_discuzx.log #windows下最好用全路徑
151. query_log = /usr/local/coreseek/var/log/query_discuzx.log #windows下最好用全路徑
152. }
啓動後臺服務:(必須開啓;若尚未建立過任何索引,請先執行一次不帶 --rotate 的索引命令,否則searchd會因沒有可用索引而無法啓動)
1. # /usr/local/coreseek/bin/searchd -c /usr/local/coreseek/etc/csft_mysql.conf
執行索引:(查詢、測試前必須執行一次)
1. /usr/local/coreseek/bin/indexer -c /usr/local/coreseek/etc/csft_mysql.conf --all --rotate
執行增量索引: (delta替換爲具體索引名)
1. /usr/local/coreseek/bin/indexer -c /usr/local/coreseek/etc/csft_mysql.conf delta --rotate
合併索引:(main、delta替換爲具體索引名,一次只能兩個索引進行合併)
1. /usr/local/coreseek/bin/indexer -c /usr/local/coreseek/etc/csft_mysql.conf --merge main delta --rotate --merge-dst-range deleted 0 0
(爲了防止多個關鍵字指向同一個文檔加上--merge-dst-range deleted 0 0)
後臺服務測試:
1. # /usr/local/coreseek/bin/search -c /usr/local/coreseek/etc/csft_mysql.conf aaa
關閉後臺服務:
1. # /usr/local/coreseek/bin/searchd -c /usr/local/coreseek/etc/csft_mysql.conf --stop
自動化命令:(每隔一分鐘執行一遍增量索引,每五分鐘執行一遍合併索引,每天1:30執行整體索引;main、delta需替換爲具體索引名。indexer是可執行程序而非shell腳本,直接調用即可,不要經由/bin/sh執行。)
1. crontab -e
2. */1 * * * * /usr/local/coreseek/bin/indexer -c /usr/local/coreseek/etc/csft_mysql.conf delta --rotate
3. */5 * * * * /usr/local/coreseek/bin/indexer -c /usr/local/coreseek/etc/csft_mysql.conf --merge main delta --rotate --merge-dst-range deleted 0 0
4. 30 1 * * * /usr/local/coreseek/bin/indexer -c /usr/local/coreseek/etc/csft_mysql.conf --all --rotate
至此所有配置工作完成,只要在discuz後臺配置開啓sphinx即可。