編程珠璣 - 算法優化 - 過濾敏感詞 - 第七步:瘋狂測試

到第六步為止,咱們只是在理論上探討了優化的步驟;最後,咱們進行集體測試——使用的敏感詞越多,優化效果越明顯(Java 代碼以下):

package test;

import static util.PrintUtil.print;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;

/**
 * Benchmark harness that compares six ways of detecting a sensitive keyword
 * inside a piece of text.
 *
 * <p>{@link #init(String[])} builds the lookup tables (first characters,
 * prefix -> next-character tables, and the set of complete keywords);
 * {@code test1} .. {@code test6} are the competing search implementations and
 * {@link #main(String[])} times each of them.
 *
 * <p>All state is static and unsynchronized. Calling {@code init} more than
 * once adds to the existing tables instead of replacing them.
 */
public class Test {
	static int key_max = 0; // length of the longest keyword seen by init()
	// Keyword list scanned by test1() (and therefore by test2()).
	static String[] keys = {"辦證", "氣槍出售", "裸聊", "裸表演", "土槍賣"};
	static String tContent = "再辦證頂"; // text that main() searches
	// Distinct first characters of all keywords, in insertion order.
	static ArrayList<String> first = new ArrayList<String>();
	static String[] sortFirst; // `first` as a sorted array (for binary search)
	static char[] charFirst;   // `first` as a sorted char array (for binary search)
	// Proper keyword prefix -> the characters that may follow that prefix.
	static HashMap<String, ArrayList<String>> map = new HashMap<String, ArrayList<String>>();
	// Same table as `map`, values stored as sorted String arrays.
	static HashMap<String, String[]> sortMap = new HashMap<String, String[]>();
	// Same table as `map`, values stored as sorted char arrays.
	static HashMap<String, char[]> charMap = new HashMap<String, char[]>();
	// Complete keywords; lets test3..test6 tell a full keyword from a mere prefix.
	static HashSet<String> keySet = new HashSet<String>();

	// Characters that test6() skips while scanning the text.
	static char[] filters = ",.~!@#$%^&*(){}[];':\"".toCharArray();
	static { Arrays.sort(filters); /* sorted so it can be binary-searched */ }
	// Regular expression main() uses to strip special characters for test1..test5.
	static String regexp = ",|\\.|\\(|\\)|\\*|&|\\^|%|\\$";

	/**
	 * Builds every lookup table used by {@code test2} .. {@code test6}.
	 *
	 * <p>{@code null} and empty entries are ignored (an empty entry appears when
	 * the keyword file contains two adjacent separators).
	 *
	 * @param keys the sensitive keywords to index
	 */
	static void init(String[] keys) {
		for (String k : keys) {
			if (k == null || k.length() == 0) {
				continue; // nothing to index; substring(0, 1) would throw on ""
			}
			keySet.add(k);

			String head = k.substring(0, 1);
			if (!first.contains(head)) {
				first.add(head);
			}

			int length = k.length();
			if (length > key_max) {
				key_max = length;
			}

			// Register every proper prefix together with the character following it.
			for (int i = 1; i < length; i++) {
				String key = k.substring(0, i);
				String value = k.substring(i, i + 1);
				ArrayList<String> nexts = map.get(key);
				if (nexts == null) {
					nexts = new ArrayList<String>();
					map.put(key, nexts);
				}
				if (!nexts.contains(value)) {
					nexts.add(value);
				}
			}
		}

		sortFirst = first.toArray(new String[first.size()]);
		Arrays.sort(sortFirst);

		charFirst = new char[first.size()];
		for (int i = 0; i < charFirst.length; i++) {
			charFirst[i] = first.get(i).charAt(0);
		}
		Arrays.sort(charFirst);

		// Derive the sorted-array variants of `map` in a single pass.
		for (Map.Entry<String, ArrayList<String>> entry : map.entrySet()) {
			ArrayList<String> nexts = entry.getValue();

			String[] sortValue = nexts.toArray(new String[nexts.size()]);
			Arrays.sort(sortValue);
			sortMap.put(entry.getKey(), sortValue);

			char[] charValue = new char[nexts.size()];
			for (int i = 0; i < charValue.length; i++) {
				charValue[i] = nexts.get(i).charAt(0);
			}
			Arrays.sort(charValue);
			charMap.put(entry.getKey(), charValue);
		}
	}

	/**
	 * Baseline: tries every entry of the static {@code keys} array with
	 * {@code indexOf}.
	 *
	 * @param content text to search
	 * @return the first entry of {@code keys} (in array order) that occurs in
	 *         {@code content}, or {@code null} if none does
	 */
	public final static String test1(String content) {
		for (String k : keys) {
			if (content.indexOf(k) > -1) {
				return k;
			}
		}
		return null;
	}

	/**
	 * Optimization 1: runs {@link #test1(String)} only when the text contains
	 * at least one character that starts some keyword.
	 *
	 * @param content text to search
	 * @return the result of {@code test1(content)}, or {@code null} when no
	 *         keyword first character occurs in {@code content}
	 */
	public final static String test2(String content) {
		int length = content.length();
		for (int i = 0; i < length; i++) {
			if (first.contains(content.substring(i, i + 1))) {
				return test1(content);
			}
		}
		return null;
	}

	/**
	 * Optimization 2: walks the prefix table {@code map}, using
	 * {@code ArrayList.contains} for every lookup.
	 *
	 * @param content text to search
	 * @return the longest keyword beginning at the earliest position where any
	 *         keyword begins, or {@code null} if the text contains no keyword
	 */
	public final static String test3(String content) {
		String c = content, prefix, hit;
		ArrayList<String> temps;
		int length = c.length();
		for (int i = 0; i < length; i++) {
			if (!first.contains(c.substring(i, i + 1))) {
				continue;
			}
			hit = null;
			for (int j = i + 1; j <= length; j++) {
				prefix = c.substring(i, j);
				// Only a complete keyword counts; a prefix cut off by the end of the text does not.
				if (keySet.contains(prefix)) {
					hit = prefix;
				}
				if (j == length) {
					break; // no further character to extend with
				}
				temps = map.get(prefix);
				if (temps == null || !temps.contains(c.substring(j, j + 1))) {
					break; // the next character cannot continue any keyword
				}
			}
			if (hit != null) {
				return hit;
			}
		}
		return null;
	}

	/**
	 * Optimization 3: same walk as {@link #test3(String)}, but every lookup is a
	 * binary search over the sorted String arrays ({@code sortFirst},
	 * {@code sortMap}).
	 *
	 * @param content text to search
	 * @return the longest keyword beginning at the earliest position where any
	 *         keyword begins, or {@code null} if the text contains no keyword
	 */
	public final static String test4(String content) {
		String c = content, prefix, hit;
		String[] temps;
		int length = c.length();
		for (int i = 0; i < length; i++) {
			if (Arrays.binarySearch(sortFirst, c.substring(i, i + 1)) < 0) {
				continue;
			}
			hit = null;
			for (int j = i + 1; j <= length; j++) {
				prefix = c.substring(i, j);
				if (keySet.contains(prefix)) {
					hit = prefix;
				}
				if (j == length) {
					break; // no further character to extend with
				}
				temps = sortMap.get(prefix);
				if (temps == null || Arrays.binarySearch(temps, c.substring(j, j + 1)) < 0) {
					break; // the next character cannot continue any keyword
				}
			}
			if (hit != null) {
				return hit;
			}
		}
		return null;
	}

	/**
	 * Optimization 4: same walk as {@link #test4(String)}, but single characters
	 * are compared as {@code char} against the sorted char arrays
	 * ({@code charFirst}, {@code charMap}) instead of one-character strings.
	 *
	 * @param content text to search
	 * @return the longest keyword beginning at the earliest position where any
	 *         keyword begins, or {@code null} if the text contains no keyword
	 */
	public final static String test5(String content) {
		String c = content, prefix, hit;
		char[] temps;
		int length = c.length();
		for (int i = 0; i < length; i++) {
			if (Arrays.binarySearch(charFirst, c.charAt(i)) < 0) {
				continue;
			}
			hit = null;
			for (int j = i + 1; j <= length; j++) {
				prefix = c.substring(i, j);
				if (keySet.contains(prefix)) {
					hit = prefix;
				}
				if (j == length) {
					break; // no further character to extend with
				}
				temps = charMap.get(prefix);
				if (temps == null || Arrays.binarySearch(temps, c.charAt(j)) < 0) {
					break; // the next character cannot continue any keyword
				}
			}
			if (hit != null) {
				return hit;
			}
		}
		return null;
	}

	/**
	 * Optimization 5: like {@link #test5(String)}, but characters listed in
	 * {@code filters} are skipped during the scan, so the caller does not have
	 * to strip them with a regular expression first.
	 *
	 * <p>The returned keyword is assembled from the non-filtered characters
	 * only, so it never contains a filtered character even when the text had
	 * some in between.
	 *
	 * @param content text to search
	 * @return the longest keyword beginning at the earliest position where any
	 *         keyword begins (ignoring filtered characters), or {@code null} if
	 *         none is found
	 */
	public final static String test6(String content) {
		String c = content, prefix, hit;
		char g;
		char[] temps;
		// Collects the non-filtered characters matched so far. It cannot overflow:
		// a character is appended only when some keyword continues with it, and no
		// keyword is longer than key_max.
		char[] matched = new char[key_max];
		int length = c.length(), index;
		for (int i = 0; i < length; i++) {
			g = c.charAt(i);
			if (Arrays.binarySearch(filters, g) >= 0) {
				continue; // special character, ignore
			}
			if (Arrays.binarySearch(charFirst, g) < 0) {
				continue; // no keyword starts here
			}
			index = 0;
			matched[index++] = g;
			prefix = String.valueOf(matched, 0, index);
			hit = keySet.contains(prefix) ? prefix : null;
			for (int j = i + 1; j < length; j++) {
				g = c.charAt(j);
				if (Arrays.binarySearch(filters, g) >= 0) {
					continue; // special character, ignore
				}
				temps = charMap.get(prefix);
				if (temps == null || Arrays.binarySearch(temps, g) < 0) {
					break; // the next character cannot continue any keyword
				}
				matched[index++] = g;
				prefix = String.valueOf(matched, 0, index);
				if (keySet.contains(prefix)) {
					hit = prefix;
				}
			}
			if (hit != null) {
				return hit;
			}
		}
		return null;
	}

	/**
	 * Reads a whole text file into a buffer.
	 *
	 * <p>Lines are concatenated without any separator, because
	 * {@code readLine()} strips the line terminator and nothing is appended in
	 * its place. The file is decoded with {@code FileReader}'s default charset.
	 *
	 * @param file path of the file to read
	 * @return the file content with line terminators removed
	 * @throws IOException if the file cannot be opened or read
	 */
	public static StringBuffer read(String file) throws IOException {
		BufferedReader in = new BufferedReader(new FileReader(file));
		try {
			StringBuffer buffer = new StringBuffer();
			String line;
			while ((line = in.readLine()) != null) {
				buffer.append(line);
			}
			return buffer;
		} finally {
			in.close(); // always release the file handle, even when reading fails
		}
	}

	/**
	 * Loads the keyword list and the test text from disk, builds the lookup
	 * tables, and prints the time each implementation needs for 1000 runs.
	 *
	 * <p>For test1..test5 the regular-expression stripping of special
	 * characters is part of the timed loop; test6 filters inline and is timed
	 * on the raw text.
	 *
	 * @param args unused
	 * @throws IOException if either data file cannot be read
	 */
	public static void main(String[] args) throws IOException {
		// Store the loaded list in the static field (not in a local variable) so
		// that test1()/test2() are measured against the same keywords as the rest.
		keys = read("data/keyword1").toString().split("@");
		tContent = read("data/test1").toString(); // text to search
		init(keys); // build the lookup tables

		long time1;
		int max = 1000;

		String newContent;
		time1 = System.currentTimeMillis();
		for (int i = 0; i < max; i++) {
			newContent = tContent.replaceAll(regexp, "");
			test1(newContent);
		}
		print("test1 time:" + (System.currentTimeMillis() - time1));

		time1 = System.currentTimeMillis();
		for (int i = 0; i < max; i++) {
			newContent = tContent.replaceAll(regexp, "");
			test2(newContent);
		}
		print("test2 time:" + (System.currentTimeMillis() - time1));

		time1 = System.currentTimeMillis();
		for (int i = 0; i < max; i++) {
			newContent = tContent.replaceAll(regexp, "");
			test3(newContent);
		}
		print("test3 time:" + (System.currentTimeMillis() - time1));

		time1 = System.currentTimeMillis();
		for (int i = 0; i < max; i++) {
			newContent = tContent.replaceAll(regexp, "");
			test4(newContent);
		}
		print("test4 time:" + (System.currentTimeMillis() - time1));

		time1 = System.currentTimeMillis();
		for (int i = 0; i < max; i++) {
			newContent = tContent.replaceAll(regexp, "");
			test5(newContent);
		}
		print("test5 time:" + (System.currentTimeMillis() - time1));

		time1 = System.currentTimeMillis();
		for (int i = 0; i < max; i++) {
			// no regular-expression pass here: test6 skips special characters itself
			test6(tContent);
		}
		print("test6 time:" + (System.currentTimeMillis() - time1));
	}
}

代碼如有不妥之處,歡迎指出 ^_^。(標籤:正則表達式)

相關文章
相關標籤/搜索