snort 中的字串查找:以下代碼出自 snort-2.9.6.0,是 snort 中用於字串匹配的底層接口,主要參考 Boyer-Moore 算法實現。
關於 Boyer-Moore 的文章可以參考: http://my.oschina.net/u/572632/blog/283380
本文主要是對該部分代碼作一分析總結。
數組
/*
 * bmh.h
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License Version 2 as
 * published by the Free Software Foundation.  You may not use, modify or
 * distribute this program under any other version of the GNU General
 * Public License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 * Copyright (C) 2014 Cisco and/or its affiliates. All rights reserved.
 * Copyright (C) 2005-2013 Sourcefire, Inc.
 *
 * Author: Marc Norton
 *
 * Date: 5/2005
 *
 * Boyer-Moore-Horspool for small pattern groups
 *
 */
#ifndef BOYER_MOORE_HORSPOOL
#define BOYER_MOORE_HORSPOOL

/* Linkage qualifier applied to every function below; expands to nothing here. */
#define HBM_STATIC

/*
 * Prepared search state for one pattern.
 */
typedef struct
{
    unsigned char *P;     /* original pattern */
    unsigned char *Pnc;   /* pattern converted entirely to upper case */
    int M;                /* pattern length */
    int bcShift[256];     /* bad-character table: distance from a byte to the pattern tail */
    int nocase;           /* case-insensitive flag */
} HBM_STRUCT;

HBM_STATIC HBM_STRUCT *hbm_prep(unsigned char *pat, int m, int nocase);
HBM_STATIC int hbm_prepx(HBM_STRUCT *p, unsigned char *pat, int m, int nocase);
HBM_STATIC const unsigned char *hbm_match(HBM_STRUCT *p, const unsigned char *text, int n);
HBM_STATIC void hbm_free(HBM_STRUCT *p);

#endif
/* * bmh.c * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License Version 2 as * published by the Free Software Foundation. You may not use, modify or * distribute this program under any other version of the GNU General * Public License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * * Copyright (C) 2014 Cisco and/or its affiliates. All rights reserved. * Copyright (C) 2005-2013 Sourcefire, Inc. * * Author: Marc Norton * * Date: 5/2005 * * Boyer-Moore-Horsepool for small pattern groups * */ #include <stdio.h> #include <stdlib.h> #include <string.h> #include <ctype.h> #ifdef HAVE_CONFIG_H #include "config.h" #endif #include "sf_types.h" #include "bmh.h" #include "sf_dynamic_engine.h" HBM_STATIC int hbm_prepx (HBM_STRUCT *p, unsigned char * pat, int m, int nocase ) { int i,k; unsigned char *t; if( !m ) return 0; if( !p ) return 0; p->P = pat; p->M = m; p->nocase = nocase; if( nocase ) /* nocase 是爲了處理大小寫不敏感的匹配方式 */ { t = (unsigned char*)malloc(m); if ( !t ) return 0; memcpy(t,pat,m); for(i=0;i<m;i++) { t[i] = (unsigned char)toupper(t[i]); } p->Pnc = t; } else { p->Pnc = 0; } /* Compute normal Boyer-Moore Bad Character Shift */ /** 構建壞字符跳轉表, 數組中的索引爲字符的值, 其中數據爲該字符在模式串中最後一次出現與模式串尾部的距離*/ for(k = 0; k < 256; k++) p->bcShift[k] = m; if( nocase ) { for(k = 0; k < m; k++) p->bcShift[ p->Pnc[k] ] = m - k - 1; } else { for(k = 0; k < m; k++) p->bcShift[ p->P[k] ] = m - k - 1; } return 1; } /** * 分配空間, 不用關注太多 */ HBM_STATIC HBM_STRUCT * hbm_prep(unsigned char * pat, int m, int nocase) { HBM_STRUCT *p; p = 
(HBM_STRUCT*)malloc(sizeof(HBM_STRUCT)); if (!p) { DynamicEngineFatalMessage("Failed to allocate memory for pattern matching."); } if( !hbm_prepx( p, pat, m, nocase) ) { DynamicEngineFatalMessage("Error initializing pattern matching. Check arguments."); } return p; } /* * 釋放空間,不用關注太多 */ HBM_STATIC void hbm_free( HBM_STRUCT *p ) { if(p) { if( p->Pnc )free(p->Pnc); free(p); } } /* * Boyer-Moore Horspool * Does NOT use Sentinel Byte(s) * Scan and Match Loops are unrolled and separated * Optimized for 1 byte patterns as well * */ HBM_STATIC const unsigned char * hbm_match(HBM_STRUCT * px, const unsigned char * text, int n) { const unsigned char *pat, *t, *et, *q; int m1, k; int *bcShift; if( px->nocase ) { pat = px->Pnc; } else { pat = px->P; } m1 = px->M-1; bcShift= px->bcShift; //printf("bmh_match: pattern=%.*s, %d bytes \n",px->M,pat,px->M); t = text + m1; et = text + n; /** * 1. t 指向將模式串與text串頭部對齊後, 與模式串尾部對齊的text中的字符/ * 2. et 指向text串的尾部 */ /* Handle 1 Byte patterns - it's a faster loop */ if( !m1 ) /*模式串長度爲1時直接匹配*/ { if( !px->nocase ) { for( ;t<et; t++ ) if( *t == *pat ) return t; } else { for( ;t<et; t++ ) if( toupper(*t) == *pat ) return t; } return 0; } if( !px->nocase ) { /* Handle MultiByte Patterns */ while( t < et ) { /* Scan Loop - Bad Character Shift */ /** * 這裏代碼的目地是不斷匹配,找到可以與模式串有一個末尾字符匹配的字串 * 1.與模式串尾部有一位字符出現匹配時中止 * 2.整個輸入字符掃描完後任然沒法匹配就匹配失敗 */ do { /*同時完成多個操做,充分利用cpu流水線的並行處理,減小操做數*/ t += bcShift[*t]; if( t >= et )return 0; t += (k=bcShift[*t]); if( t >= et )return 0; } while( k ); /* Unrolled Match Loop */ k = m1; q = t - m1; while( k >= 4 ) { /*同時完成多個操做,充分利用cpu流水線的並行處理,減小操做數*/ if( pat[k] != q[k] )goto NoMatch; k--; if( pat[k] != q[k] )goto NoMatch; k--; if( pat[k] != q[k] )goto NoMatch; k--; if( pat[k] != q[k] )goto NoMatch; k--; } /* Finish Match Loop */ while( k >= 0 ) { if( pat[k] != q[k] )goto NoMatch; k--; } /* If matched - return 1st char of pattern in text */ return q; NoMatch: t++; } } else /* NoCase - convert input string to upper case as we 
process it */ { /* Handle MultiByte Patterns */ while( t < et ) { /* Scan Loop - Bad Character Shift */ do { t += bcShift[toupper(*t)]; if( t >= et )return 0;; t += (k=bcShift[toupper(*t)]); if( t >= et )return 0; } while( k ); /* Unrolled Match Loop */ k = m1; q = t - m1; while( k >= 4 ) { if( pat[k] != toupper(q[k]) )goto NoMatchNC; k--; if( pat[k] != toupper(q[k]) )goto NoMatchNC; k--; if( pat[k] != toupper(q[k]) )goto NoMatchNC; k--; if( pat[k] != toupper(q[k]) )goto NoMatchNC; k--; } /* Finish Match Loop */ while( k >= 0 ) { if( pat[k] != toupper(q[k]) )goto NoMatchNC; k--; } /* If matched - return 1st char of pattern in text */ return q; NoMatchNC: t++; } } return 0; } #endif
該段代碼主要利用了 BM 算法的壞字符規則,而未使用好後綴規則;一旦找到與模式串末尾字符匹配的位置,就改用樸素匹配的方式逐字符比較。
在循環中包含多步處理,雖然相互之間有依賴,但能通過 CPU 的並行處理減少開銷。