一個檢索系統,在歸併拉鍊並得到摘要數據以後,必不可少的環節是飄紅截斷。
對於整個架構來講,檢索索引以及rank等後端通常使用c/c++來實現,真正的展示ui可使用php/python等腳本語言來實現。對於飄紅而言,能夠放在ui端使用php截斷飄紅,也能夠放在後端經過c/c++來飄紅截斷。文本編碼能夠用gbk也能夠用utf-8。對於存儲而言,若是使用gbk編碼能夠比utf-8更節省空間(一個中文字符在gbk中佔2個字節,在utf-8中通常佔3個字節)。
使用php的優勢:操做簡單,有大量現成的庫,不須要關注gbk等具體實現,經過mb_str庫能夠搞定全部事情;缺點是在文本較長的時候性能極其低下。顯而易見,使用C/C++的優點是性能極高,可是缺點是須要本身去關注字符編碼。
由於排期較緊,我起初使用了php來進行飄紅截斷,一個文本(字數大概在5k左右)飄紅大概花了200+ms,這個性能是不能忍受的,以後我只好改爲了c/c++來進行飄紅截斷,性能獲得顯著提高,耗時只有0.01ms。性能提高達到萬倍。
如今描述下怎麼使用c/c++來飄紅截斷文本。
首先簡要介紹一下gbk(gb2312編碼)。
GBK是在國家標準GB2312基礎上擴容後兼容GB2312的標準,包含全部的中文字符。一個中文須要2個字節,第一個字節的最高位爲1,因此第一個字節大於0x80。此外字符編碼還有utf-8。gbk和utf-8之間能夠經過Unicode編碼進行轉換。gbk、utf-8、Unicode之間的關係若是有不瞭解的請本身Google或者百度。
須要截斷文本content
須要飄紅的詞組hi_words,用"|"進行分割
須要截斷的字數 len
優先保證截斷字節數目必定;優先截取飄紅詞右邊的整句;若是整句不到截斷字數,再拿飄紅詞左邊的句子進行填充。
這個使用strtok_r輕輕鬆鬆搞定(注意不要使用非線程安全的strtok)。
/**
 * Splits a '|'-separated keyword list into individual keywords.
 *
 * Empty tokens (leading, trailing or doubled '|') are skipped, which is the
 * same behaviour strtok_r has. The input is scanned through a std::string
 * copy, so arbitrarily long lists are handled; a fixed 1024-byte scratch
 * buffer would silently truncate the list and could cut a double-byte
 * character in half.
 *
 * @param high_words NUL-terminated keyword list; NULL is treated as empty.
 *                   The buffer is not modified.
 * @param words      Receives the keywords, appended in input order.
 * @return Always 0.
 */
int getHighWord(char* high_words, std::vector<std::string>& words) {
    if (high_words == NULL) {
        return 0;
    }

    const std::string all(high_words);
    std::string::size_type start = 0;
    while (start < all.size()) {
        std::string::size_type stop = all.find('|', start);
        if (stop == std::string::npos) {
            stop = all.size();
        }
        if (stop > start) {  // skip empty tokens, as strtok_r does
            words.push_back(all.substr(start, stop - start));
        }
        start = stop + 1;
    }
    return 0;
}
sep_china是中文句子分隔符,sep_uni是英文分割符,他們都是一個整句。
vector<string> sep_china; vector<string> sep_uni; sep_china.push_back(","); sep_china.push_back("。"); sep_china.push_back(";"); sep_china.push_back(":"); sep_china.push_back("!"); sep_uni.push_back(","); sep_uni.push_back("."); sep_uni.push_back("?"); sep_uni.push_back(";"); sep_uni.push_back(":");
string str_cnt(content); //首先判斷有沒有這個word int pos_word = -1; string min_word; int i; for(i=0; i<words.size(); i++) { pos_word = str_cnt.find(words[i]); if(pos_word > 0) { min_word = words[i]; break; } } //若是沒有找到,直接截斷返回 if( pos_word < 0) { if( num <= getExtraWord(content, hi_word, 0, num) ) { strcat(hi_word,"..."); return 0; } }
//還須要多少字節 int left = num - min_word.size(); //前面有多少字節 int before = pos_word; //後面還有多少字節 int after = str_cnt.size() - min_word.size() - pos_word; //得到前一個標點的位置 int pos_quoto_china = -1; int pos_quoto_uni = -1; string quoto_str = str_cnt.substr(0, pos_word); pos_quoto_china = findMaxPos(quoto_str, sep_china) + 2; //一個標點2個字符 pos_quoto_uni = findMaxPos(quoto_str,sep_uni) +1 ; //一個標點一個字符 int quoto_pos = pos_quoto_uni > pos_quoto_china ? pos_quoto_uni : pos_quoto_china; //得到前一個標點有多少字節 int quoto = quoto_pos > 0 ? pos_word - quoto_pos : 0;
/**
 * Chooses the truncation strategy for a highlighted snippet.
 *
 * @param before Bytes in front of the keyword (not consulted by the decision).
 * @param after  Bytes behind the keyword.
 * @param quoto  Bytes between the previous separator and the keyword.
 * @param left   Byte budget remaining after the keyword itself.
 * @return 0 when quoto alone reaches the budget,
 *         1 when quoto plus the trailing text exceeds the budget,
 *         2 otherwise (all trailing text fits; the rest is taken from the front).
 */
int getStatus(int before, int after, int quoto, int left) {
    if (quoto >= left) {
        return 0;
    }
    return (quoto + after > left) ? 1 : 2;
}
char before_word[1024]; char after_word[1024]; int left_cnt; int right_cnt; before_word[0] = after_word[0] = '0'; switch(status) { case 0: getExtraWord(content, hi_word, pos_word, left); strcat(hi_word, "..."); break; case 1: left_cnt = getExtraWord(content, before_word, pos_word -1, -1 * quoto); right_cnt = getExtraWord(content, after_word, pos_word + min_word.size(), left- quoto); if( left_cnt < pos_word) { strcpy(hi_word, "..."); } strcat(hi_word, before_word); strcat(hi_word, min_word.c_str() ); strcat(hi_word, after_word); strcat(hi_word, "..."); break; case 2: right_cnt = getExtraWord(content, before_word, pos_word + min_word.size() , after); left_cnt = getExtraWord(content, after_word, pos_word-1 , after- left); if( left_cnt < pos_word) { strcpy(hi_word, "..."); } strcat(hi_word, before_word); strcat(hi_word, min_word.c_str() ); strcat(hi_word, after_word); strcat(hi_word, "..."); break; default: getExtraWord(content, hi_word, 0, num); break; }
其中最爲核心的爲截斷函數,以下所示:從word的begin位置截取length個字符(length小於0表示從左截斷),0x80是判斷一個字節是否屬於中文字符的標準。
int getExtraWord(const char* word, char* res, int begin, int length) { if(length == 0 ) { res[0] = '\0'; return 0; } int i; int flag; const char *str = word + begin; const char *chech_vaild = (length >0 )? str : str+1; if( false == checkVaild(chech_vaild) ) { cout << "not vaild\n"; //盡力修復下吧 begin ++; str++; } //若是從後截斷 word沒有 length,直接返回 if( (int)strlen(str) <= length ) { strcpy(res, str); return strlen(str); } //若是從前截斷 word沒有lenght長 if( begin <= -1 * length) { strncpy(res, word, begin); res[begin ] = '\0'; return begin; } flag = length > 0 ? 1 : -1; int num = 0; while( *str ) { //若是是中文 if((unsigned char)*str > 0x80) { num += 2; str += flag * 2; } else { num ++; str += flag; } if( num >= flag * length) { break; } } res[0] = '\0'; i = 0; str = (length > 0) ? word + begin : word + begin - num + 1; while(*str) { res[i++] = *(str++); if(i==num) { break; } } res[i] = '\0'; return i; }
int main() { const char* content = "啊。雞 從到信陽,而後在火車站有直達雞公山的汽車,只收10元。約一個小時就到雞公山腳下,之前票價是63,自從港中旅進駐雞公山後,如今好像改爲123了,固然了,港中旅把雞公山搞得更加漂亮和特點了。 在雞公山門口有去山頂的旅遊大巴,單程15元,往返的20元,不過建議你們作單程上去,而後步行從爬山古道或者長生谷下山,會更加有趣味。和朋友在爬山古道的入口處景區處於全監控狀態,因此至關安全。美齡舞廳外面的招牌很顯眼啊。頤廬是雞公山上最有名的建築,儘管世界各國都曾有在山上建房子,但唯獨中國人靳"; vector<string> words; char* hi = "信陽"; getHighWord(hi, words); for(int i=0; i<words.size(); i++) { cout << words[i] << endl; } char hi_word[1024]; hi_word[0] = '\0'; getHilight(content, hi_word, words, 200); cout << "hi_word is " << hi_word << endl; }
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <string>
#include <vector>
using namespace std;

/**
 * Heuristic check that a string consists of whole double-byte characters.
 *
 * Every byte greater than 0x80 toggles a flag, so the result is true exactly
 * when the number of such bytes is even.
 * NOTE(review): this assumes both bytes of a double-byte character are
 * greater than 0x80 (GB2312-style); confirm for the full GBK range.
 */
bool checkVaild(const char* word) {
    bool vaild = true;
    while (*word) {
        if (static_cast<unsigned char>(*word++) > 0x80) {
            vaild = !vaild;
        }
    }
    return vaild;
}

/**
 * Copies about |length| bytes of `word` into `res` without cutting a
 * double-byte character in half. Any byte greater than 0x80 is treated as
 * one half of a 2-byte character.
 *
 * length > 0: copy forward, starting at word[begin].
 * length < 0: copy backward, i.e. the bytes that END at word[begin] (inclusive).
 * length == 0: produce an empty string.
 *
 * If checkVaild reports that the text at the starting point is not made of
 * whole characters, the start is moved forward by one byte as a best-effort
 * repair.
 *
 * @param word   NUL-terminated source text.
 * @param res    Destination buffer; the caller must provide at least
 *               |length| + 2 bytes (one extra byte may be copied to finish a
 *               double-byte character, plus the NUL).
 * @param begin  Byte offset into `word` (see above for its meaning).
 * @param length Signed byte budget.
 * @return Number of bytes written to `res`, excluding the terminating NUL.
 */
int getExtraWord(const char* word, char* res, int begin, int length) {
    if (length == 0) {
        res[0] = '\0';
        return 0;
    }

    const char* str = word + begin;
    const char* check_from = (length > 0) ? str : str + 1;
    if (!checkVaild(check_from)) {
        // Best-effort repair: skip what looks like a dangling half character.
        begin++;
        str++;
    }

    if (length > 0) {
        // Forward: fewer than `length` bytes remain, so take them all.
        const int remaining = static_cast<int>(strlen(str));
        if (remaining <= length) {
            strcpy(res, str);
            return remaining;
        }
    } else if (begin + 1 <= -length) {
        // Backward: the whole prefix word[0..begin] fits. `begin` is an
        // inclusive index, so begin + 1 bytes are available; copying only
        // `begin` bytes would drop the last byte and split a character.
        const int available = (begin + 1 > 0) ? begin + 1 : 0;
        strncpy(res, word, available);
        res[available] = '\0';
        return available;
    }

    // Walk whole characters in the requested direction until the budget is met.
    const int step = (length > 0) ? 1 : -1;
    const int wanted = step * length;
    int num = 0;
    while (*str) {
        if (static_cast<unsigned char>(*str) > 0x80) {
            num += 2;
            str += step * 2;
        } else {
            num++;
            str += step;
        }
        if (num >= wanted) {
            break;
        }
    }

    // Copy the num bytes that were covered by the walk.
    const char* from = (length > 0) ? word + begin : word + begin - num + 1;
    int copied = 0;
    while (copied < num && from[copied] != '\0') {
        res[copied] = from[copied];
        ++copied;
    }
    res[copied] = '\0';
    return copied;
}

/**
 * Chooses the truncation strategy for a highlighted snippet.
 *
 * @param before Bytes in front of the keyword (not consulted by the decision).
 * @param after  Bytes behind the keyword.
 * @param quoto  Bytes between the sentence start and the keyword.
 * @param left   Byte budget remaining after the keyword itself.
 * @return 0 when quoto alone reaches the budget,
 *         1 when quoto plus the trailing text exceeds the budget,
 *         2 otherwise (all trailing text fits; the rest is taken from the front).
 */
int getStatus(int before, int after, int quoto, int left) {
    (void)before;
    if (quoto >= left) {
        return 0;
    }
    return (quoto + after > left) ? 1 : 2;
}

/**
 * Returns the greatest byte offset at which any of `quotos` last occurs in
 * `content`, or -1 when none of them occurs.
 */
int findMaxPos(const string& content, const vector<string>& quotos) {
    int pos = -1;
    for (size_t i = 0; i < quotos.size(); i++) {
        const string::size_type found = content.rfind(quotos[i]);
        if (found != string::npos && static_cast<int>(found) > pos) {
            pos = static_cast<int>(found);
        }
    }
    return pos;
}

/**
 * Returns the offset just past the last separator found in `prefix`, or 0
 * when the prefix contains no separator. The width of each separator comes
 * from the literal itself, so 1-byte and multi-byte punctuation both work.
 */
static int findSentenceStart(const string& prefix, const vector<string>& separators) {
    int start = 0;
    for (size_t i = 0; i < separators.size(); i++) {
        const string::size_type found = prefix.rfind(separators[i]);
        if (found == string::npos) {
            continue;
        }
        const int sep_end = static_cast<int>(found + separators[i].size());
        if (sep_end > start) {
            start = sep_end;
        }
    }
    return start;
}

/**
 * Builds a snippet of about `num` bytes around the first keyword found.
 *
 * Preference order: keep the byte budget; take the text to the right of the
 * keyword first; if that does not fill the budget, top it up with text to the
 * left. "..." marks the places where text was cut off.
 *
 * @param content NUL-terminated source text.
 * @param hi_word Output buffer; the caller must provide at least num + 8
 *                bytes (snippet, up to two "..." markers, NUL), or
 *                strlen(content) + 1 bytes when the content fits entirely.
 * @param words   Candidate keywords; the first one (in list order) that
 *                occurs in `content` is used. Empty keywords are ignored.
 * @param num     Byte budget for the snippet.
 * @return Always 0.
 */
int getHilight(const char* content, char* hi_word, const vector<string>& words, int num) {
    // Short enough already: return the content unchanged.
    if (strlen(content) <= static_cast<size_t>(num)) {
        strcpy(hi_word, content);
        return 0;
    }

    // Sentence separators: the "Chinese" punctuation list and the ASCII one.
    static const char* const kChinaSeparators[] = { ",", "。", "?", ";", ":", "!" };
    static const char* const kUniSeparators[] = { ",", ".", "?", ";", ":" };
    const vector<string> sep_china(
        kChinaSeparators,
        kChinaSeparators + sizeof(kChinaSeparators) / sizeof(kChinaSeparators[0]));
    const vector<string> sep_uni(
        kUniSeparators,
        kUniSeparators + sizeof(kUniSeparators) / sizeof(kUniSeparators[0]));

    const string str_cnt(content);

    // Find the first keyword that occurs in the content. A hit at offset 0
    // is a valid hit, so compare against npos rather than "> 0".
    int pos_word = -1;
    string min_word;
    for (size_t i = 0; i < words.size(); i++) {
        if (words[i].empty()) {
            continue;  // find("") matches everywhere; never treat it as a keyword
        }
        const string::size_type found = str_cnt.find(words[i]);
        if (found != string::npos) {
            pos_word = static_cast<int>(found);
            min_word = words[i];
            break;
        }
    }

    hi_word[0] = '\0';

    // No keyword present: return the leading num bytes. Always return here;
    // continuing with pos_word == -1 would index in front of the buffer.
    if (pos_word < 0) {
        getExtraWord(content, hi_word, 0, num);
        strcat(hi_word, "...");
        return 0;
    }

    const int word_len = static_cast<int>(min_word.size());
    // Bytes still available once the keyword itself is accounted for.
    const int left = num - word_len;
    // Bytes in front of / behind the keyword.
    const int before = pos_word;
    const int after = static_cast<int>(str_cnt.size()) - word_len - pos_word;

    // Start of the sentence containing the keyword, and the number of bytes
    // between that start and the keyword.
    const string quoto_str = str_cnt.substr(0, pos_word);
    const int start_china = findSentenceStart(quoto_str, sep_china);
    const int start_uni = findSentenceStart(quoto_str, sep_uni);
    const int quoto_pos = start_uni > start_china ? start_uni : start_china;
    const int quoto = pos_word - quoto_pos;

    const int status = getStatus(before, after, quoto, left);

    // Scratch buffers sized from the budget instead of a fixed 1024 bytes.
    vector<char> before_word(num + 8, '\0');
    vector<char> after_word(num + 8, '\0');
    int left_cnt = 0;

    switch (status) {
    case 0:
        // The left-hand sentence part alone uses up the budget: start at the
        // keyword. If the budget is smaller than the keyword, fall back to
        // num bytes so a non-positive length is never passed on.
        getExtraWord(content, hi_word, pos_word, left > 0 ? left : num);
        strcat(hi_word, "...");
        break;
    case 1:
        // Left-hand sentence part, keyword, then as much trailing text as fits.
        left_cnt = getExtraWord(content, &before_word[0], pos_word - 1, -1 * quoto);
        getExtraWord(content, &after_word[0], pos_word + word_len, left - quoto);
        if (left_cnt < pos_word) {
            strcpy(hi_word, "...");  // text in front of the snippet was cut off
        }
        strcat(hi_word, &before_word[0]);
        strcat(hi_word, min_word.c_str());
        strcat(hi_word, &after_word[0]);
        strcat(hi_word, "...");
        break;
    case 2:
        // All trailing text fits; fill the remaining budget from the front.
        // The left context goes in before_word and the right context in
        // after_word so the pieces are emitted in reading order.
        getExtraWord(content, &after_word[0], pos_word + word_len, after);
        left_cnt = getExtraWord(content, &before_word[0], pos_word - 1, after - left);
        if (left_cnt < pos_word) {
            strcpy(hi_word, "...");
        }
        strcat(hi_word, &before_word[0]);
        strcat(hi_word, min_word.c_str());
        strcat(hi_word, &after_word[0]);
        strcat(hi_word, "...");
        break;
    default:
        getExtraWord(content, hi_word, 0, num);
        break;
    }
    return 0;
}

/**
 * Splits a '|'-separated keyword list into individual keywords.
 *
 * Empty tokens are skipped, matching strtok_r. The list is scanned through a
 * std::string copy, so it is not truncated at a fixed buffer size.
 *
 * @param high_words NUL-terminated keyword list; NULL is treated as empty.
 * @param words      Receives the keywords, appended in input order.
 * @return Always 0.
 */
int getHighWord(char* high_words, vector<string>& words) {
    if (high_words == NULL) {
        return 0;
    }

    const string all(high_words);
    string::size_type start = 0;
    while (start < all.size()) {
        string::size_type stop = all.find('|', start);
        if (stop == string::npos) {
            stop = all.size();
        }
        if (stop > start) {
            words.push_back(all.substr(start, stop - start));
        }
        start = stop + 1;
    }
    return 0;
}

/**
 * Demo driver: splits a keyword list, prints each keyword, then prints the
 * highlighted snippet (100-byte budget) built from a sample text.
 */
int main() {
    const char* content = "啊。雞 從到信陽,而後在火車站有直達雞公山的汽車,只收10元。約一個小時就到雞公山腳下,之前票價是63,自從港中旅進駐雞公山後,如今好像改爲123了,固然了,港中旅把雞公山搞得更加漂亮和特點了。 在雞公山門口有去山頂的旅遊大巴,單程15元,往返的20元,不過建議你們作單程上去,而後步行從爬山古道或者長生谷下山,會更加有趣味。和朋友在爬山古道的入口處景區處於全監控狀態,因此至關安全。美齡舞廳外面的招牌很顯眼啊。頤廬是雞公山上最有名的建築,儘管世界各國都曾有在山上建房子,但唯獨中國人靳";

    vector<string> words;
    // A writable array: binding a string literal to char* is ill-formed since C++11.
    char hi[] = "信陽";
    getHighWord(hi, words);
    for (size_t i = 0; i < words.size(); i++) {
        cout << words[i] << endl;
    }

    char hi_word[1024];
    hi_word[0] = '\0';
    getHilight(content, hi_word, words, 100);
    cout << "hi_word is " << hi_word << endl;
    return 0;
}