gbk編碼的簡介以及針對gbk文本飄紅截斷原理以及實現

時間 2019-11-11

原文鏈接

一個檢索系統，在歸併拉鍊並得到摘要數據以後，必不可少的環節是飄紅截斷。

對於整個架構來講，檢索索引以及rank等後端通常使用c/c++來實現，真正的展示ui可使用php/python等腳本語言來實現。對於飄紅而言，能夠放在ui端使用php截斷飄紅，也能夠放在後端經過c/c++來飄紅截斷。文本編碼能夠用gbk也能夠用utf-8。對於存儲而言，若是使用gbk編碼能夠比utf-8節省存儲空間（一個中文字符在gbk中佔2個字節，在utf-8中一般佔3個字節）。

使用php的優勢：操作簡單，有大量現成的庫，不須要關注gbk等具體實現，經過mb_*系列函數（mbstring擴展）能夠搞定全部事情；缺點是在文本較長的時候性能極其低下。顯而易見，使用C/C++的優點是性能極高，可是缺點是須要自己去關注字符編碼。

由於排期較緊，我起初使用了php來進行飄紅截斷，一個文本（字數大概在5k左右）飄紅大概花了200+ms，這個性能是不能忍受的，以後我只好改爲了c/c++來進行飄紅截斷，性能獲得顯著提高，耗時只有0.01ms。性能提高達到萬倍。

如今描述下怎麼來使用c/c++來飄紅截斷文本。

一、 gbk簡介

首先簡要介紹一下gbk編碼（GB2312的擴展）。

GBK是在國家標準GB2312基礎上擴容後兼容GB2312的標準，包含全部的中文字符。一個中文字符須要2個字節，首字節的最高位爲1，因此第一個字節大於0x80。此外字符編碼還有utf-8。gbk和utf-8之間能夠經過Unicode編碼進行轉換。gbk，utf-8，Unicode之間的關係若是有不瞭解的請自行Google或者百度。

二、飄紅需求

2.1、輸入：

須要截斷文本content
須要飄紅的詞組hi_words，用"|"進行分割
須要截斷的字數 len（代碼中對應參數num，按字節計）

2.2、飄紅截斷規則

截斷的字節數目一定；優先截取飄紅詞右邊的整句；若是整句不到截斷字數，再拿飄紅詞左邊的句子進行填充。

三、實現

3.1 把輸入的飄紅詞語放入vector.

這個使用strtok_r輕輕鬆鬆搞定(注意不要使用非線程安全的strtok).

/**
 * Split a '|'-separated keyword list and append every keyword to `words`.
 *
 * Tokenising is done with strtok_r, so consecutive '|' characters are
 * collapsed and empty keywords are never produced. The input is copied into
 * a scratch buffer sized to fit it: `high_words` itself is left untouched and
 * lists of any length are handled without truncation.
 *
 * @param high_words NUL-terminated list such as "foo|bar"; a null pointer is
 *                   treated as an empty list.
 * @param words      receives the keywords, appended in input order.
 * @return always 0.
 */
int getHighWord(char* high_words, vector<string>& words)
{
	if( high_words == NULL )
	{
		return 0;
	}
	// strtok_r writes NUL bytes into the buffer it scans, so tokenise a private,
	// exactly-sized copy (terminator included) instead of a fixed 1024-byte array
	vector<char> keyword(high_words, high_words + strlen(high_words) + 1);
	char *ptok = NULL;
	for( char *part = strtok_r(&keyword[0], "|", &ptok);
	     part != NULL;
	     part = strtok_r(NULL, "|", &ptok) )
	{
		words.push_back(string(part));
	}
	return 0;
}

3.2、把句子分隔符放入到vector中。

sep_china是中文句子分隔符，sep_uni是英文分隔符，它們都標誌着一個整句的結束。

// Sentence delimiters used to locate the start of the sentence that contains
// the keyword: sep_china holds the full-width marks, sep_uni the ASCII ones.
vector<string> sep_china;
vector<string> sep_uni;
sep_china.push_back("，");
sep_china.push_back("。");
sep_china.push_back("；");
sep_china.push_back("：");
sep_china.push_back("！");

// ASCII counterparts, one byte each
sep_uni.push_back(",");
sep_uni.push_back(".");
sep_uni.push_back("?");
sep_uni.push_back(";");
sep_uni.push_back(":");

3.3、具體處理流程

3.4 判斷有沒有這些words，返回pos_word

// Work on a std::string copy of the text so find() can be used.
string str_cnt(content);
	// First check whether any of the keywords occurs in the text; the first
	// keyword (in list order) that is found wins.
	int pos_word = -1;
	string min_word;
	int i;
	for(i=0; i<words.size(); i++)
	{
		pos_word = str_cnt.find(words[i]);
		// NOTE(review): `> 0` also rejects a keyword that matches at offset 0
		if(pos_word > 0)
		{
			min_word = words[i];
			break;
		}
	}

	// No keyword found: truncate from the start of the text and return.
	// NOTE(review): when getExtraWord returns fewer than num bytes, control
	// falls through to the code below with pos_word still negative.
	if( pos_word < 0)
	{
		if( num <= getExtraWord(content, hi_word, 0, num) )
		{
			strcat(hi_word,"...");
			return 0;
		}
	}

3.5 計算word前面字符數before，後面字符串after,還須要截斷字符數left，以及word前面第一個標點的偏移位置quoto(須要考慮中文和英文)。

// Bytes of budget left once the keyword itself is accounted for
	int left = num - min_word.size();
	// Number of bytes in front of the keyword
	int before = pos_word;
	// Number of bytes behind the keyword
	int after = str_cnt.size() - min_word.size() - pos_word;
	// Locate the end of the last delimiter in front of the keyword
	int pos_quoto_china = -1;
	int pos_quoto_uni = -1;
	string quoto_str = str_cnt.substr(0, pos_word);
	// NOTE(review): findMaxPos starts from string::npos; if it reports "not found"
	// as -1, adding 2 yields 1, which passes the `> 0` test below — confirm.
	pos_quoto_china = findMaxPos(quoto_str, sep_china) + 2; // a full-width mark is counted as 2 bytes
	pos_quoto_uni = findMaxPos(quoto_str,sep_uni) +1 ; // an ASCII mark is 1 byte
	int quoto_pos = pos_quoto_uni > pos_quoto_china ? pos_quoto_uni : pos_quoto_china;
	// Bytes between the end of that delimiter and the keyword (0 if none was found)
	int quoto = quoto_pos > 0 ? pos_word - quoto_pos : 0;

3.6 根據before, after, quoto, left計算目前status

/**
 * Choose how the snippet around the keyword is assembled.
 *
 * @param before bytes in front of the keyword (not used by the decision).
 * @param after  bytes behind the keyword.
 * @param quoto  bytes between the previous sentence delimiter and the keyword.
 * @param left   byte budget remaining once the keyword is accounted for.
 * @return 0 when the sentence prefix alone already uses up the budget,
 *         1 when the prefix fits but prefix + trailing text exceeds the budget,
 *         2 when prefix + trailing text fit within the budget.
 */
int getStatus(int before, int after, int quoto, int left)
{
	// the sentence prefix by itself is at least as long as the budget
	if( left <= quoto )
	{
		return 0;
	}
	// prefix fits, but the text behind the keyword has to be cut
	if( left < quoto + after )
	{
		return 1;
	}
	// everything behind the keyword fits; the remainder is filled from the front
	return 2;
}

3.7 根據返回的status進行處理

// Scratch buffers for the text in front of / behind the keyword.
char before_word[1024];
	char after_word[1024];
	int left_cnt;
	int right_cnt;
	// NOTE(review): '0' is the digit character, not the NUL terminator '\0'
	before_word[0] = after_word[0] = '0';

	switch(status)
	{
		// Start at the keyword and copy `left` bytes forward.
		case 0:
			getExtraWord(content, hi_word, pos_word, left);
			strcat(hi_word, "...");
			break;
		// Sentence prefix (quoto bytes, copied backward) + keyword + left-quoto bytes behind it.
		case 1:
			left_cnt = getExtraWord(content, before_word, pos_word -1, -1 * quoto);
			right_cnt = getExtraWord(content, after_word, pos_word + min_word.size(), left- quoto);
			// fewer bytes copied than exist in front of the keyword: mark the cut
			if( left_cnt < pos_word)
			{
				strcpy(hi_word, "...");
			}
			strcat(hi_word, before_word);
			strcat(hi_word, min_word.c_str() );
			strcat(hi_word, after_word);
			strcat(hi_word, "...");
			break;
		// Everything behind the keyword, plus left-after bytes from in front of it.
		// NOTE(review): here the text BEHIND the keyword is written to before_word and
		// the text in FRONT of it to after_word, yet they are concatenated in the same
		// before/keyword/after order as case 1.
		case 2:
			right_cnt = getExtraWord(content, before_word, pos_word + min_word.size() , after);
			left_cnt = getExtraWord(content, after_word, pos_word-1 , after- left);
			if( left_cnt < pos_word)
			{
				strcpy(hi_word, "...");
			}
			strcat(hi_word, before_word);
			strcat(hi_word, min_word.c_str() );
			strcat(hi_word, after_word);
			strcat(hi_word, "...");
			break;
		// Any other status: plain truncation from the start of the text.
		default:
			getExtraWord(content, hi_word, 0, num);
			break;
	}

3.8 截斷函數

其中最爲核心的爲截斷函數，以下所示，從word的begin位置截取length個字節（length小於0表示從begin位置向左截取）。判斷一個字節是否屬於中文字符的標準是其值大於0x80。

int getExtraWord(const char* word, char* res, int begin, int length)
{
	if(length == 0 )
	{
		res[0] = '\0';
		return 0;
	}
	int i;
	int flag;
	const char *str = word + begin;
	const char *chech_vaild = (length >0 )? str : str+1;

	if( false == checkVaild(chech_vaild) )
	{
		cout << "not vaild\n";
		//盡力修復下吧
		begin ++;
		str++;
	}
	//若是從後截斷 word沒有 length，直接返回
	if( (int)strlen(str)  <= length  )
	{
		strcpy(res, str);
		return strlen(str);
	}

	//若是從前截斷 word沒有lenght長
	if( begin <= -1 * length) 
	{
		strncpy(res, word, begin);
		res[begin ] =  '\0';
		return begin;
	}

	flag = length > 0 ? 1 : -1;

	int num = 0;
	
	while( *str )
	{
		//若是是中文
		if((unsigned char)*str > 0x80)
		{
			num += 2;
			str += flag * 2;
		}
		else
		{
			num ++;
			str += flag;
		}

		if( num >= flag * length)
		{
			break;
		}
	}
	res[0] = '\0';
	i = 0;
	str = (length  > 0) ? word + begin : word + begin - num + 1;
	while(*str)
	{
		res[i++] =  *(str++);
		if(i==num)
		{
			break;
		}
	}
	res[i] = '\0';
	return i;
}

四、結果

4.1 測試case：

int main()
{
	const char* content = "啊。雞 從到信陽，而後在火車站有直達雞公山的汽車，只收10元。約一個小時就到雞公山腳下，之前票價是63，自從港中旅進駐雞公山後，如今好像改爲123了，固然了，港中旅把雞公山搞得更加漂亮和特點了。 在雞公山門口有去山頂的旅遊大巴，單程15元，往返的20元，不過建議你們作單程上去，而後步行從爬山古道或者長生谷下山，會更加有趣味。和朋友在爬山古道的入口處景區處於全監控狀態，因此至關安全。美齡舞廳外面的招牌很顯眼啊。頤廬是雞公山上最有名的建築，儘管世界各國都曾有在山上建房子，但唯獨中國人靳";

	vector<string> words;
	char* hi = "信陽";
	getHighWord(hi, words);
	for(int i=0; i<words.size(); i++)
	{
		cout << words[i] << endl;
	}
	char hi_word[1024];
	hi_word[0] = '\0';
	getHilight(content, hi_word, words, 200);
	cout << "hi_word is " << hi_word << endl;
}

4.2 結果:

五、所有源代碼

#include <stdio.h>
#include <string.h>
#include <iostream>
#include <string>
#include <vector>
using namespace std;


/** 查看一句話是不是完整的gbk語句 */
/**
 * Parity check used to tell whether `word` starts on a character boundary.
 *
 * Counts the bytes whose value is greater than 0x80 and reports whether that
 * count is even. For text in which both bytes of every double-byte character
 * are above 0x80 (the GB2312 range) an odd count means one character has
 * been cut in half. A double-byte character whose second byte is 0x80 or
 * lower contributes only one such byte and is therefore reported as invalid.
 *
 * @param word NUL-terminated byte string.
 * @return true when the number of bytes above 0x80 is even.
 */
bool checkVaild(const char* word)
{
	unsigned int high_bytes = 0;
	for( const char *cursor = word; *cursor != '\0'; ++cursor )
	{
		if( static_cast<unsigned char>(*cursor) > 0x80 )
		{
			++high_bytes;
		}
	}
	return (high_bytes % 2) == 0;
}

int getExtraWord(const char* word, char* res, int begin, int length)
{
	if(length == 0 )
	{
		res[0] = '\0';
		return 0;
	}
	int i;
	int flag;
	const char *str = word + begin;
	const char *chech_vaild = (length >0 )? str : str+1;

	if( false == checkVaild(chech_vaild) )
	{
		cout << "not vaild\n";
		//盡力修復下吧
		begin ++;
		str++;
	}
	//若是從後截斷 word沒有 length，直接返回
	if( (int)strlen(str)  <= length  )
	{
		strcpy(res, str);
		return strlen(str);
	}

	//若是從前截斷 word沒有lenght長
	if( begin <= -1 * length) 
	{
		strncpy(res, word, begin);
		res[begin ] =  '\0';
		return begin;
	}

	flag = length > 0 ? 1 : -1;

	int num = 0;
	
	while( *str )
	{
		//若是是中文
		if((unsigned char)*str > 0x80)
		{
			num += 2;
			str += flag * 2;
		}
		else
		{
			num ++;
			str += flag;
		}

		if( num >= flag * length)
		{
			break;
		}
	}
	res[0] = '\0';
	i = 0;
	str = (length  > 0) ? word + begin : word + begin - num + 1;
	while(*str)
	{
		res[i++] =  *(str++);
		if(i==num)
		{
			break;
		}
	}
	res[i] = '\0';
	return i;
}

/**
 * Choose how the snippet around the keyword is assembled.
 *
 * @param before bytes in front of the keyword (not used by the decision).
 * @param after  bytes behind the keyword.
 * @param quoto  bytes between the previous sentence delimiter and the keyword.
 * @param left   byte budget remaining once the keyword is accounted for.
 * @return 0 when the sentence prefix alone already uses up the budget,
 *         1 when the prefix fits but prefix + trailing text exceeds the budget,
 *         2 when prefix + trailing text fit within the budget.
 */
int getStatus(int before, int after, int quoto, int left)
{
	// the sentence prefix by itself is at least as long as the budget
	if( left <= quoto )
	{
		return 0;
	}
	// prefix fits, but the text behind the keyword has to be cut
	if( left < quoto + after )
	{
		return 1;
	}
	// everything behind the keyword fits; the remainder is filled from the front
	return 2;
}

/**
 * Find the right-most occurrence of any delimiter in `content`.
 *
 * @param content text to search.
 * @param quotos  delimiter strings to look for.
 * @return the largest byte offset at which one of the delimiters starts (as
 *         reported by string::rfind), or string::npos converted to int when
 *         none of them occurs.
 */
int findMaxPos(string content, vector<string>quotos)
{
	int best = static_cast<int>(string::npos);
	for( vector<string>::const_iterator sep = quotos.begin(); sep != quotos.end(); ++sep )
	{
		// rfind yields npos for a miss, which converts to the same value as `best` starts with
		const int candidate = static_cast<int>(content.rfind(*sep));
		if( best < candidate )
		{
			best = candidate;
		}
	}
	return best;
}

int getHilight(const char* content, char* hi_word,  vector<string> words, int num)
{

	//若是不夠長
	if( strlen(content) <= num )
	{
		strcpy(hi_word, content);
		return 0;
	}

	//首先分詞
	vector<string> sep_china;
	vector<string> sep_uni;
	sep_china.push_back("，");
	sep_china.push_back("。");
	sep_china.push_back("？");
	sep_china.push_back("；");
	sep_china.push_back("：");
	sep_china.push_back("！");

	sep_uni.push_back(",");
	sep_uni.push_back(".");
	sep_uni.push_back("?");
	sep_uni.push_back(";");
	sep_uni.push_back(":");
	sep_uni.push_back(";");
	//sep_uni.push_back(" ");

	//內容
	string str_cnt(content);
	//首先判斷有沒有這個word
	int pos_word = -1;
	string min_word;
	int i;
	for(i=0; i<words.size(); i++)
	{
		pos_word = str_cnt.find(words[i]);
		if(pos_word > 0)
		{
			min_word = words[i];
			break;
		}
	}

	//若是沒有找到,直接截斷返回
	if( pos_word < 0)
	{
		if( num <= getExtraWord(content, hi_word, 0, num) )
		{
			strcat(hi_word,"...");
			return 0;
		}
	}

	//還須要多少字節
	int left = num - min_word.size();
	//前面有多少字節
	int before = pos_word;
	//後面還有多少字節
	int after = str_cnt.size() - min_word.size() - pos_word;
	//得到前一個標點的位置
	int pos_quoto_china = -1;
	int pos_quoto_uni = -1;
	string quoto_str = str_cnt.substr(0, pos_word);
	pos_quoto_china = findMaxPos(quoto_str, sep_china) + 2; //一個標點2個字符 
	pos_quoto_uni = findMaxPos(quoto_str,sep_uni) +1 ; //一個標點一個字符
	int quoto_pos = pos_quoto_uni > pos_quoto_china ? pos_quoto_uni : pos_quoto_china;
	//得到前一個標點有多少字節
	int quoto = quoto_pos > 0 ? pos_word - quoto_pos : 0;
	int status = getStatus(before, after, quoto, left);
	
	char before_word[1024];
	char after_word[1024];
	int left_cnt;
	int right_cnt;
	before_word[0] = after_word[0] = '0';

	switch(status)
	{
		case 0:
			getExtraWord(content, hi_word, pos_word, left);
			strcat(hi_word, "...");
			break;
		case 1:
			left_cnt = getExtraWord(content, before_word, pos_word -1, -1 * quoto);
			right_cnt = getExtraWord(content, after_word, pos_word + min_word.size(), left- quoto);
			if( left_cnt < pos_word)
			{
				strcpy(hi_word, "...");
			}
			strcat(hi_word, before_word);
			strcat(hi_word, min_word.c_str() );
			strcat(hi_word, after_word);
			strcat(hi_word, "...");
			break;
		case 2:
			right_cnt = getExtraWord(content, before_word, pos_word + min_word.size() , after);
			left_cnt = getExtraWord(content, after_word, pos_word-1 , after- left);
			if( left_cnt < pos_word)
			{
				strcpy(hi_word, "...");
			}
			strcat(hi_word, before_word);
			strcat(hi_word, min_word.c_str() );
			strcat(hi_word, after_word);
			strcat(hi_word, "...");
			break;
		default:
			getExtraWord(content, hi_word, 0, num);
			break;
	}
	return 0;
}


/**
 * Split a '|'-separated keyword list and append every keyword to `words`.
 *
 * Tokenising is done with strtok_r, so consecutive '|' characters are
 * collapsed and empty keywords are never produced. The input is copied into
 * a scratch buffer sized to fit it: `high_words` itself is left untouched and
 * lists of any length are handled without truncation.
 *
 * @param high_words NUL-terminated list such as "foo|bar"; a null pointer is
 *                   treated as an empty list.
 * @param words      receives the keywords, appended in input order.
 * @return always 0.
 */
int getHighWord(char* high_words, vector<string>& words)
{
	if( high_words == NULL )
	{
		return 0;
	}
	// strtok_r writes NUL bytes into the buffer it scans, so tokenise a private,
	// exactly-sized copy (terminator included) instead of a fixed 1024-byte array
	vector<char> keyword(high_words, high_words + strlen(high_words) + 1);
	char *ptok = NULL;
	for( char *part = strtok_r(&keyword[0], "|", &ptok);
	     part != NULL;
	     part = strtok_r(NULL, "|", &ptok) )
	{
		words.push_back(string(part));
	}
	return 0;
}



int main()
{
	const char* content = "啊。雞 從到信陽，而後在火車站有直達雞公山的汽車，只收10元。約一個小時就到雞公山腳下，之前票價是63，自從港中旅進駐雞公山後，如今好像改爲123了，固然了，港中旅把雞公山搞得更加漂亮和特點了。 在雞公山門口有去山頂的旅遊大巴，單程15元，往返的20元，不過建議你們作單程上去，而後步行從爬山古道或者長生谷下山，會更加有趣味。和朋友在爬山古道的入口處景區處於全監控狀態，因此至關安全。美齡舞廳外面的招牌很顯眼啊。頤廬是雞公山上最有名的建築，儘管世界各國都曾有在山上建房子，但唯獨中國人靳";

	vector<string> words;
	char* hi = "信陽";
	getHighWord(hi, words);
	for(int i=0; i<words.size(); i++)
	{
		cout << words[i] << endl;
	}
	char hi_word[1024];
	hi_word[0] = '\0';
	getHilight(content, hi_word, words, 100);
	cout << "hi_word is " << hi_word << endl;
}