在上篇 AC 多模式匹配的最後,提到了下面的冗餘轉移;這篇探討確定性有限自動機(DFA)多模式匹配算法。
已知 g(4,e) = 5;假設 M 當前狀態爲 4,且下一個字符不是 'e',這時候 M 會調用 f(4)=1。其實這時候我們已經不需要去查找狀態 1 以 'e' 爲外向邊的狀態了,因爲下一個字符肯定不是 'e';如果沒有 "his" 模式,我們可以直接從狀態 4 跳到狀態 0(跳過中間的狀態 1);而前面的代碼會去做這個多餘的查找動作。這可以用確定性有限自動機來避免。
確定性有限自動機用空間換時間:DFA 由有限狀態集 S 和 Next 函數 δ 組成,對於每一個狀態 s 和輸入字符 a,δ(s,a) 都是有限狀態集 S 的元素;可以說 DFA 使得每一個狀態轉移變得唯一。
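構造 DFA 的 Next 函數僞代碼(g 爲 goto 函數,f 爲 failure 函數,參見 references <1> 的 Algorithm 4):

    queue ← 空
    for 每個字符 a do
        δ(0,a) ← g(0,a)
        if g(0,a) ≠ 0 then 將 g(0,a) 加入 queue
    while queue 非空 do
        從 queue 取出下一個狀態 r
        for 每個字符 a do
            if g(r,a) = s ≠ fail then
                將 s 加入 queue;δ(r,a) ← s
            else
                δ(r,a) ← δ(f(r),a)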
本例的 Next 表結果:
完整實現代碼:
#include<iostream>
#include<string.h>
#include<malloc.h>
#include <queue>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
using namespace std;

/* reallocation step for AC_NODE_t.matched_patterns */
#define REALLOC_CHUNK_MATCHED_PATTERN 1
/* reallocation step for AC_NODE_t.outgoing array */
#define REALLOC_CHUNK_OUTGOING 2

/* Number of rows (states) available in the DFA transition table. */
#define AC_DFA_MAX_STATES 255
/* Number of columns in the DFA transition table: one per possible byte value. */
#define AC_DFA_ALPHABET_SIZE 256

struct ac_edge;

typedef struct AC_PATTERN_s
{
    const char * astring;   /* String of alphabets */
    unsigned int length;    /* Length of pattern */
} AC_PATTERN_t;

typedef struct node
{
    int id;                         /* Node ID : just for debugging purpose */
    unsigned short depth;           /* depth: distance between this node and the root */
    struct node *parent;            /* parent node, for compute failure function */
    struct node *failure_node;      /* The failure node of this node */
    short int final;                /* 0: no ; 1: yes, it is a final node */

    /* Matched patterns */
    AC_PATTERN_t * matched_patterns;        /* Array of matched patterns */
    unsigned short matched_patterns_num;    /* Number of matched patterns at this node */
    unsigned short matched_patterns_max;    /* Max capacity of allocated memory for matched_patterns */

    /* Outgoing Edges */
    struct ac_edge* outgoing_edge;  /* Array of outgoing character edges */
    unsigned short outgoing_num;    /* Number of outgoing character edges */
    unsigned short outgoing_max;    /* Max capacity of allocated memory for outgoing character edges */
} AC_NODE_t;

/* The Outgoing Edge of the Node */
struct ac_edge
{
    char alpha;         /* Edge alpha */
    AC_NODE_t * next;   /* Target of the edge */
};

static void node_assign_id (AC_NODE_t * thiz);
static AC_NODE_t * node_find_next(AC_NODE_t * pAc_node, char ch);

/*
 * DFA tables, filled by ac_automata_compute_deterministic_transition():
 *   delta[s][c]        id of the state reached from state s on byte c
 *   dfa_state_nodes[s] node whose id is s (NULL when unknown)
 *   dfa_start_state    id of the root the tables were built for
 */
static int delta[AC_DFA_MAX_STATES][AC_DFA_ALPHABET_SIZE] = {{0}};
static AC_NODE_t *dfa_state_nodes[AC_DFA_MAX_STATES] = {0};
static int dfa_start_state = 0;

/******************************************************************************
 * Checked (re)allocation: behaves like realloc(), but terminates the program
 * with a diagnostic instead of returning NULL, so callers never lose the old
 * block or dereference a failed allocation.
 ******************************************************************************/
static void *ac_xrealloc(void *pOld_block, size_t bytes)
{
    void *pFresh = realloc(pOld_block, bytes);
    if (pFresh == NULL)
    {
        fprintf(stderr, "ac: out of memory (%lu bytes requested)\n", (unsigned long)bytes);
        exit(EXIT_FAILURE);
    }
    return pFresh;
}

/******************************************************************************
 * Forget every node pointer cached in dfa_state_nodes.
 ******************************************************************************/
static void dfa_forget_nodes(void)
{
    for (int i = 0; i < AC_DFA_MAX_STATES; i++)
    {
        dfa_state_nodes[i] = NULL;
    }
}

/******************************************************************************
 * Create node
 * return : a zero-initialised node with empty pattern/edge arrays and a fresh id
 ******************************************************************************/
AC_NODE_t *node_create()
{
    AC_NODE_t *pNode = (AC_NODE_t *)ac_xrealloc(NULL, sizeof(*pNode));
    memset(pNode, 0, sizeof(*pNode));
    pNode->failure_node = NULL;
    pNode->parent = NULL;
    pNode->final = 0;

    /* init matched pattern */
    pNode->matched_patterns_max = REALLOC_CHUNK_MATCHED_PATTERN;
    pNode->matched_patterns = (AC_PATTERN_t *)ac_xrealloc(NULL,
            pNode->matched_patterns_max * sizeof(AC_PATTERN_t));

    /* init outgoing character edges */
    pNode->outgoing_max = REALLOC_CHUNK_OUTGOING;
    pNode->outgoing_edge = (struct ac_edge *)ac_xrealloc(NULL,
            pNode->outgoing_max * sizeof(struct ac_edge));

    node_assign_id(pNode);
    return pNode;
}

/******************************************************************************
 * assign a unique ID to the node (used for debugging purpose and as the row
 * index of the DFA table). IDs come from a process-wide counter.
 ******************************************************************************/
static void node_assign_id (AC_NODE_t * thiz)
{
    static int unique_id = 0;
    thiz->id = unique_id ++;
}

/******************************************************************************
 * Establish an new edge between two nodes
 ******************************************************************************/
void node_add_outgoing (AC_NODE_t * thiz, AC_NODE_t * next, char alpha)
{
    if (thiz->outgoing_num >= thiz->outgoing_max)
    {
        thiz->outgoing_max += REALLOC_CHUNK_OUTGOING;
        thiz->outgoing_edge = (struct ac_edge *)ac_xrealloc(thiz->outgoing_edge,
                thiz->outgoing_max * sizeof(struct ac_edge));
    }
    thiz->outgoing_edge[thiz->outgoing_num].alpha = alpha;
    thiz->outgoing_edge[thiz->outgoing_num].next = next;
    thiz->outgoing_num++;
}

/******************************************************************************
 * Create a next node with the given alpha.
 * return : the new node, or NULL when an edge labeled alpha already exists
 ******************************************************************************/
AC_NODE_t * node_create_next (AC_NODE_t * pCur_node, char alpha)
{
    if (node_find_next(pCur_node, alpha) != NULL)
    {
        /* The (labeled alpha) edge already exists */
        return NULL;
    }
    /* Otherwise add new edge (node) */
    AC_NODE_t *pNext_node = node_create();
    node_add_outgoing(pCur_node, pNext_node, alpha);
    return pNext_node;
}

/******************************************************************************
 * Find out the next node for a given Alpha to move. The edge array is not
 * sorted, so it uses linear search.
 * return : target of the edge labeled ch, or NULL (also for a NULL node)
 ******************************************************************************/
static AC_NODE_t * node_find_next(AC_NODE_t * pAc_node, char ch)
{
    if (NULL == pAc_node)
    {
        return NULL;
    }
    for (int i = 0; i < pAc_node->outgoing_num; i++)
    {
        if (pAc_node->outgoing_edge[i].alpha == ch)
        {
            return pAc_node->outgoing_edge[i].next;
        }
    }
    return NULL;
}

/******************************************************************************
 * Determine if a final node contains a pattern in its accepted pattern list or not.
 * Two patterns are equal when they have the same length and the same bytes.
 * return : 1 = it had, 0 = it hadn't
 ******************************************************************************/
int node_had_match_ptn (AC_NODE_t * pAc_node, AC_PATTERN_t * pAc_ptn)
{
    for (unsigned int i = 0; i < pAc_node->matched_patterns_num; i++)
    {
        const AC_PATTERN_t *pTmp_ptn = &pAc_node->matched_patterns[i];
        if (pTmp_ptn->length != pAc_ptn->length)
        {
            continue;
        }
        /* Same length: the contents decide. */
        if (memcmp(pTmp_ptn->astring, pAc_ptn->astring, pTmp_ptn->length) == 0)
        {
            return 1;
        }
    }
    return 0;
}

/******************************************************************************
 * Adds the pattern to the list of accepted pattern. Only the pointer and the
 * length are stored; the pattern text itself is not copied.
 ******************************************************************************/
void node_register_match_ptn (AC_NODE_t * pAc_node, AC_PATTERN_t * pAc_ptn)
{
    /* Check if the new pattern already exists in the node's matched patterns list */
    if (node_had_match_ptn(pAc_node, pAc_ptn))
    {
        return;
    }

    /* Manage memory */
    if (pAc_node->matched_patterns_num >= pAc_node->matched_patterns_max)
    {
        pAc_node->matched_patterns_max += REALLOC_CHUNK_MATCHED_PATTERN;
        pAc_node->matched_patterns = (AC_PATTERN_t *)ac_xrealloc(pAc_node->matched_patterns,
                pAc_node->matched_patterns_max * sizeof(AC_PATTERN_t));
    }

    pAc_node->matched_patterns[pAc_node->matched_patterns_num].astring = pAc_ptn->astring;
    pAc_node->matched_patterns[pAc_node->matched_patterns_num].length = pAc_ptn->length;
    pAc_node->matched_patterns_num++;
}

/******************************************************************************
 * add parent node's all leaf node(outgoing node) into queue
 * return : always 0
 ******************************************************************************/
int queue_add_leaf_node(AC_NODE_t *parent, queue<AC_NODE_t*> &ac_node_queue)
{
    for (int i = 0; i < parent->outgoing_num; i++)
    {
        ac_node_queue.push(parent->outgoing_edge[i].next);
    }
    return 0;
}

/******************************************************************************
 * Initialize automata; allocate memories and add patterns into automata.
 * pattern      : array of NUL-terminated patterns; the nodes keep pointers
 *                into it, so it must stay alive while the automata is used
 * patterns_num : number of rows in pattern
 * return       : root of the keyword tree (goto function + own outputs)
 ******************************************************************************/
AC_NODE_t * ac_automata_create(char pattern[][255], int patterns_num)
{
    AC_NODE_t *root = node_create();
    if (pattern == NULL)
    {
        return root;
    }

    for (int iPattern_index = 0; iPattern_index < patterns_num; iPattern_index++)
    {
        AC_PATTERN_t pCur_ptn;
        pCur_ptn.astring = pattern[iPattern_index];
        pCur_ptn.length = (unsigned int)strlen(pattern[iPattern_index]);
        if (pCur_ptn.length == 0)
        {
            /* An empty pattern would make the root final and "match" at every position. */
            continue;
        }

        /* Walk/extend the tree along the characters of this pattern. */
        AC_NODE_t *pCur_node = root;
        for (unsigned int iChar_index = 0; iChar_index < pCur_ptn.length; iChar_index++)
        {
            char alpha = pCur_ptn.astring[iChar_index];
            AC_NODE_t *pNext_node = node_find_next(pCur_node, alpha);
            if (NULL == pNext_node)
            {
                pNext_node = node_create_next(pCur_node, alpha);
                pNext_node->parent = pCur_node;
                pNext_node->depth = (unsigned short)(pCur_node->depth + 1);
            }
            pCur_node = pNext_node;
        }
        pCur_node->final = 1;
        node_register_match_ptn(pCur_node, &pCur_ptn);
    }
    return root;
}

/******************************************************************************
 * find failure node for all node, actually failure function maps a state into a new state.
 * the failure function is consulted whenever the goto function reports fail.
 * For a child reached from `parent` over character a, follow the failure chain
 * starting at f(parent) until a state with an outgoing edge a is found; the
 * target of that edge is the child's failure node, or the root when the chain
 * runs out. The root's own failure_node stays NULL.
 * return : 0 on success, -1 when root is NULL
 ******************************************************************************/
int ac_automata_setfailure(AC_NODE_t * root)
{
    if (root == NULL)
    {
        return -1;
    }

    queue<AC_NODE_t*> ac_node_queue;

    /* f(s) = 0 for all states s of depth 1 */
    for (int i = 0; i < root->outgoing_num; i++)
    {
        root->outgoing_edge[i].next->failure_node = root;
    }
    queue_add_leaf_node(root, ac_node_queue);

    /* Breadth-first: every state on a failure chain is shallower, hence already done. */
    while (!ac_node_queue.empty())
    {
        AC_NODE_t *parent = ac_node_queue.front();
        ac_node_queue.pop();

        for (int i = 0; i < parent->outgoing_num; i++)
        {
            char edge_ch = parent->outgoing_edge[i].alpha;
            AC_NODE_t *pCur_node = parent->outgoing_edge[i].next;

            AC_NODE_t *pState = parent->failure_node;
            AC_NODE_t *pTarget = NULL;
            while (pState != NULL && (pTarget = node_find_next(pState, edge_ch)) == NULL)
            {
                pState = pState->failure_node;
            }
            pCur_node->failure_node = (pTarget != NULL) ? pTarget : root;

            ac_node_queue.push(pCur_node);
        }
    }
    return 0;
}

/******************************************************************************
 * Collect accepted patterns of the node. the accepted patterns consist of the
 * node's own accepted pattern plus accepted patterns of its failure node.
 * Nodes are visited breadth-first, so a node's failure node (always shallower)
 * already holds the union of its whole failure chain when it is consulted.
 ******************************************************************************/
void ac_automata_union_match_ptn(AC_NODE_t * root)
{
    if (root == NULL)
    {
        return;
    }

    queue<AC_NODE_t*> ac_node_queue;
    ac_node_queue.push(root);

    while (!ac_node_queue.empty())
    {
        AC_NODE_t *pCur_node = ac_node_queue.front();
        ac_node_queue.pop();

        AC_NODE_t *pFail_node = pCur_node->failure_node;
        if (pFail_node != NULL)
        {
            for (unsigned int i = 0; i < pFail_node->matched_patterns_num; i++)
            {
                node_register_match_ptn(pCur_node, &(pFail_node->matched_patterns[i]));
            }
            if (pFail_node->final)
            {
                pCur_node->final = 1;
            }
        }

        for (int i = 0; i < pCur_node->outgoing_num; i++)
        {
            ac_node_queue.push(pCur_node->outgoing_edge[i].next);
        }
    }
}

#if 1
/******************************************************************************
 * Breadth-first lookup of a node by its id.
 * return : the node with the given id, or root when no such node exists
 ******************************************************************************/
AC_NODE_t * node_find_by_id(AC_NODE_t * root, int node_id)
{
    if (root == NULL)
    {
        return NULL;
    }

    queue<AC_NODE_t*> ac_node_queue;
    ac_node_queue.push(root);

    while (!ac_node_queue.empty())
    {
        AC_NODE_t *pCur_node = ac_node_queue.front();
        ac_node_queue.pop();
        if (node_id == pCur_node->id)
        {
            return pCur_node;
        }
        for (int i = 0; i < pCur_node->outgoing_num; i++)
        {
            ac_node_queue.push(pCur_node->outgoing_edge[i].next);
        }
    }
    return root;
}

/******************************************************************************
 * 1 when the node exists and its id is a valid row index of delta, else 0.
 ******************************************************************************/
static int dfa_state_in_table(const AC_NODE_t *pNode)
{
    return pNode != NULL && pNode->id >= 0 && pNode->id < AC_DFA_MAX_STATES;
}

/******************************************************************************
 * Fill the delta row of pState for every byte value.
 *   - a goto edge wins and its target is queued for processing;
 *   - otherwise the entry is copied from pFallback's (already built) row;
 *   - with no fallback (the root itself) the machine stays in iDefault_state.
 ******************************************************************************/
static void dfa_build_row(AC_NODE_t *pState, const AC_NODE_t *pFallback,
                          int iDefault_state, queue<AC_NODE_t*> &pending)
{
    for (int c = 0; c < AC_DFA_ALPHABET_SIZE; c++)
    {
        AC_NODE_t *pTarget = node_find_next(pState, (char)c);
        if (dfa_state_in_table(pTarget))
        {
            delta[pState->id][c] = pTarget->id;
            dfa_state_nodes[pTarget->id] = pTarget;
            pending.push(pTarget);
            continue;
        }
        if (pTarget != NULL)
        {
            /* The state exists but does not fit into the table: drop the edge loudly. */
            fprintf(stderr, "ac: state %d exceeds the DFA table (%d states), edge ignored\n",
                    pTarget->id, AC_DFA_MAX_STATES);
        }
        delta[pState->id][c] = (pFallback != NULL) ? delta[pFallback->id][c] : iDefault_state;
    }
}

/******************************************************************************
 * Build the deterministic transition table (delta) from the goto and failure
 * functions:  delta(r,a) = g(r,a) if defined, else delta(f(r),a).
 * The table is built for every byte value, once per state, so it is valid for
 * any text. txt/txt_len are kept for interface compatibility and are not read.
 * ac_automata_setfailure() must have been called before.
 ******************************************************************************/
void ac_automata_compute_deterministic_transition(AC_NODE_t * root, char * txt, int txt_len)
{
    (void)txt;
    (void)txt_len;

    if (!dfa_state_in_table(root))
    {
        return;
    }

    queue<AC_NODE_t*> ac_node_queue;

    dfa_forget_nodes();
    dfa_start_state = root->id;
    dfa_state_nodes[root->id] = root;

    /* Root row: follow the goto edge, otherwise stay in the root. */
    dfa_build_row(root, NULL, root->id, ac_node_queue);

    while (!ac_node_queue.empty())
    {
        AC_NODE_t *pCur_node = ac_node_queue.front();
        ac_node_queue.pop();

        /* Use the failure node's row only when that row has really been built. */
        const AC_NODE_t *pFallback = pCur_node->failure_node;
        if (!dfa_state_in_table(pFallback) || dfa_state_nodes[pFallback->id] != pFallback)
        {
            pFallback = root;
        }
        dfa_build_row(pCur_node, pFallback, root->id, ac_node_queue);
    }
}

/******************************************************************************
 * For every character of text, print the state the start state moves to on
 * that character (one id per line).
 ******************************************************************************/
void ac_automata_display_deterministic_transition(char* text, int txt_len)
{
    if (text == NULL)
    {
        return;
    }
    for (int i = 0; i < txt_len; i++)
    {
        cout << delta[dfa_start_state][(unsigned char)text[i]] << endl;
    }
}

/******************************************************************************
 * Search in the input text using the deterministic transition table: exactly
 * one table lookup per input character. Prints "<start offset>\t\t<pattern>"
 * for every occurrence. `pattern` is unused and kept for interface
 * compatibility.
 * return : 0 on success, -1 on invalid arguments
 ******************************************************************************/
int ac_automata_deterministic_search(AC_NODE_t * root, char* text, int txt_len, char pattern[][255])
{
    (void)pattern;

    if (text == NULL || !dfa_state_in_table(root))
    {
        return -1;
    }

    int node_id = root->id;
    for (int position = 0; position < txt_len; )
    {
        node_id = delta[node_id][(unsigned char)text[position]];
        position++;

        /* O(1) lookup; fall back to the tree walk when the id is not cached. */
        AC_NODE_t *pCur_node = dfa_state_nodes[node_id];
        if (pCur_node == NULL)
        {
            pCur_node = node_find_by_id(root, node_id);
        }

        if (pCur_node->final == 1)  /* some pattern matched */
        {
            for (int i = 0; i < pCur_node->matched_patterns_num; i++)
            {
                cout << position - pCur_node->matched_patterns[i].length << '\t' << '\t'
                     << pCur_node->matched_patterns[i].astring << endl;
            }
        }
    }
    return 0;
}
#endif

/******************************************************************************
 * Search in the input text using the given automata (goto + failure functions).
 * Matches are reported only after a character has been consumed, so following
 * failure links never reports the same occurrence twice. `pattern` is unused
 * and kept for interface compatibility.
 * return : 0 on success, -1 on invalid arguments
 ******************************************************************************/
int ac_automata_search(AC_NODE_t * root, char* text, int txt_len, char pattern[][255])
{
    (void)pattern;

    if (root == NULL || text == NULL)
    {
        return -1;
    }

    AC_NODE_t *pCur_node = root;
    for (int position = 0; position < txt_len; )
    {
        char alpha = text[position];
        AC_NODE_t *pNext_node = NULL;

        /* Follow failure links until some state accepts alpha or the root is reached. */
        while ((pNext_node = node_find_next(pCur_node, alpha)) == NULL && pCur_node != root)
        {
            pCur_node = (pCur_node->failure_node != NULL) ? pCur_node->failure_node : root;
        }
        if (pNext_node != NULL)
        {
            pCur_node = pNext_node;
        }
        position++;

        if (pCur_node->final == 1)  /* some pattern matched */
        {
            for (int i = 0; i < pCur_node->matched_patterns_num; i++)
            {
                cout << position - pCur_node->matched_patterns[i].length << '\t' << '\t'
                     << pCur_node->matched_patterns[i].astring << endl;
            }
        }
    }
    return 0;
}

/******************************************************************************
 * Prints the automata to output in human readable form.
 ******************************************************************************/
void ac_automata_display (AC_NODE_t * root)
{
    if (root == NULL)
    {
        return;
    }

    printf("---------------------------------\n");

    queue<AC_NODE_t*> ac_node_queue;
    ac_node_queue.push(root);

    while (!ac_node_queue.empty())
    {
        AC_NODE_t *pCur_node = ac_node_queue.front();
        ac_node_queue.pop();

        printf("NODE(%3d)/----fail----> NODE(%3d)\n", pCur_node->id,
               (pCur_node->failure_node) ? pCur_node->failure_node->id : 0);

        for (unsigned int i = 0; i < pCur_node->outgoing_num; i++)
        {
            struct ac_edge *pEdge = &pCur_node->outgoing_edge[i];
            /* <ctype.h> functions require a value representable as unsigned char. */
            unsigned char label = (unsigned char)pEdge->alpha;

            ac_node_queue.push(pEdge->next);

            printf(" |----(");
            if (isgraph(label))
                printf("%c)---", pEdge->alpha);
            else
                printf("0x%x)", (unsigned int)label);
            printf("--> NODE(%3d)\n", pEdge->next->id);
        }
        printf("---------------------------------\n");
    }
}

/******************************************************************************
 * Release all allocated memories to the automata: every node together with
 * its matched-pattern and outgoing-edge arrays. Node pointers cached for the
 * DFA search are dropped as well.
 * return : always 0
 ******************************************************************************/
int ac_automata_release(AC_NODE_t * root)
{
    if (root == NULL)
    {
        return 0;
    }

    queue<AC_NODE_t*> ac_node_queue;
    ac_node_queue.push(root);

    while (!ac_node_queue.empty())
    {
        AC_NODE_t *pCur_node = ac_node_queue.front();
        ac_node_queue.pop();

        /* Queue the children before the edge array that names them is freed. */
        for (int i = 0; i < pCur_node->outgoing_num; i++)
        {
            ac_node_queue.push(pCur_node->outgoing_edge[i].next);
        }
        free(pCur_node->outgoing_edge);
        free(pCur_node->matched_patterns);
        free(pCur_node);
    }

    dfa_forget_nodes();
    return 0;
}

int main()
{
    unsigned int i = 0;
    char haystack[] = "ushers";
    char needle[4][255]={"he","she", "his","hers"};
    int haystack_len = (int)strlen(haystack);

    /* 1. create ac finite state automata match machine, compute goto and output func*/
    AC_NODE_t *root = ac_automata_create(needle, sizeof(needle)/sizeof(needle[0]));

    /* 2. compute failure function*/
    ac_automata_setfailure( root );

    /* 3. patterns that are suffixes of other patterns: merge the outputs reachable over failure transitions */
    ac_automata_union_match_ptn( root );

#if 1
    ac_automata_compute_deterministic_transition(root, haystack, haystack_len);
    ac_automata_display_deterministic_transition(haystack, haystack_len);

    cout << endl << "haystack : " << haystack << endl;
    cout << "needles : ";
    for(i = 0; i<sizeof(needle)/sizeof(needle[0]); i++)
    {
        cout << needle[i] << " ";
    }
    cout << endl << endl;
    cout << "match result : " << endl << "position\t" << "pattern" << endl;

    ac_automata_deterministic_search(root, haystack, haystack_len, needle);
#else
    /* 3. Display automata (if you are interested)*/
    ac_automata_display( root );

    cout << endl << "haystack : " << haystack << endl;
    cout << "needles : ";
    for(i = 0; i<sizeof(needle)/sizeof(needle[0]); i++)
    {
        cout << needle[i] << " ";
    }
    cout << endl << endl;
    cout << "match result : " << endl << "position\t" << "node_id\t\t" << "pattern" << endl;

    /* 3. searching multi patterns use automata*/
    ac_automata_search(root, haystack, haystack_len, needle);
#endif

    /* 4. Release the automata */
    ac_automata_release ( root );

    return 0;
}
後記:
一、上一篇的 AC 算法以及這篇的 DFA 都有個缺點:查找某個節點的下一個字符分支時,需要遍歷該節點的出邊,比較耗費時間。針對這一點的改進可以使用 Trie 字典樹,進一步利用空間來換取時間效能(網上有很多用 Trie 實現多模式匹配的文章)。
二、同時瞭解到正則表達式裏會用到 NFA 以及 DFA。本人對正則表達式不是很熟,看了一下不甚瞭解,目前瞭解到:
一、NFA 不需要讀入字符也可以移動到下個狀態;並且讀入同一個字符可以轉移到多個狀態(沒明白應用到哪裏);
二、確定有限狀態自動機在計算能力上等價於非確定有限狀態自動機;
三、NFA 和 DFA 之間可以互相轉換;
等等。不確定這篇的確定性有限自動機和正則表達式裏的 DFA 是否一樣,熟悉的 XDJM 可以指教一下。
有任何問題,還請不吝賜教~
references:
<1>、Efficient String Matching: An Aid to Bibliographic Search.pdf(june 1975)
<2>、精通正則表達式