基於DFA算法實現的敏感詞過濾

時間 2019-12-10

標籤基於 dfa 算法實現敏感過濾简体版

原文原文鏈接

本文轉自《淺析敏感詞過濾算法(C++)》，本人也在其基礎上根據自己的情況做了一點修改。

https://blog.csdn.net/u012755940/article/details/51689401?utm_source=appios

爲了提升查找效率，這裏將敏感詞用樹形結構存儲，每一個節點有一個map成員，其映射關係爲一個string對應一個WordNode。
比如敏感詞庫裏面有「槍手」、「手槍」這幾個詞，讀入後就變成了如下圖所示的樹狀結構。

std::map 按照 operator< 的比較結果來判斷元素是否相同以及元素的大小，然後選擇合適的位置插入到樹中。
下面主要實現了 CWordNode 類，進行節點的插入以及查詢。

WordNode.h

#ifndef __WORDNODE_H__
#define __WORDNODE_H__

// Width, in bytes, of the key fragment stored at each tree level: the tree
// code slices keywords with substr(0, PACE), and CWordNode only keeps a
// character whose size() equals PACE.
#define PACE       1

#include <string>
#include <map>
#include <stdio.h>

/**
 * One node of the keyword tree.
 *
 * A node stores its own key fragment (m_character) and a map from the next
 * key fragment to the corresponding child node. CWordTree is a friend and
 * reads m_map directly.
 *
 * NOTE(review): m_map is a std::map whose mapped type is the still-incomplete
 * CWordNode. Common standard libraries accept this, but the C++ standard does
 * not guarantee it for std::map.
 */
class CWordNode
{
public:
    /// Builds a node for `character`; the definition lives in WordNode.cpp.
    CWordNode(std::string character);

    /// Builds a node with an empty key fragment (used for the tree root).
    CWordNode() {}

    ~CWordNode();

    /// Returns a copy of this node's key fragment.
    std::string getCharacter() const { return m_character; }

    /// Returns the child keyed by `nextCharacter`, or a null pointer.
    CWordNode* findChild(std::string& nextCharacter);

    /// Adds a child keyed by `nextCharacter`; see WordNode.cpp for the result.
    CWordNode* insertChild(std::string& nextCharacter);

private:
    friend class CWordTree;
    typedef std::map<std::string, CWordNode> _TreeMap;
    typedef std::map<std::string, CWordNode>::iterator _TreeMapIterator;

    std::string m_character;   // key fragment represented by this node
    _TreeMap m_map;            // children, keyed by their key fragment

    // Never assigned by the visible code. It is initialized here so that the
    // node copies made when inserting into m_map do not read an
    // indeterminate pointer value.
    CWordNode* m_parent = nullptr;
};

#endif

WordNode.cpp

#include "WordNode.h"

using namespace std;

// Nothing to release by hand: the string and the child map clean up themselves.
CWordNode::~CWordNode() = default;

/**
 * Builds a node for the given key fragment.
 *
 * The fragment is stored only when it is exactly PACE bytes long; any other
 * length leaves m_character empty.
 *
 * @param character key fragment this node represents
 */
CWordNode::CWordNode(std::string character)
    : m_parent(nullptr)   // was left uninitialized, yet nodes are copied into m_map
{
    if (character.size() == PACE)
    {
        m_character.assign(character);
    }
}


/**
 * Looks up the direct child keyed by `nextCharacter`.
 *
 * @param nextCharacter key fragment of the wanted child
 * @return pointer to the child stored in m_map, or NULL when there is none
 */
CWordNode* CWordNode::findChild(std::string& nextCharacter)
{
    const _TreeMapIterator hit = m_map.find(nextCharacter);
    return (hit != m_map.end()) ? &hit->second : NULL;
}

/**
 * Adds a child keyed by `nextCharacter` if no such child exists yet.
 *
 * @param nextCharacter key fragment of the child to add
 * @return pointer to the newly inserted child, or NULL when a child with that
 *         key was already present (nothing is changed in that case)
 */
CWordNode* CWordNode::insertChild(std::string& nextCharacter)
{
    // std::map::insert already reports whether the key was new, so one tree
    // lookup replaces the previous findChild + insert + find sequence.
    std::pair<_TreeMapIterator, bool> inserted =
        m_map.insert(std::make_pair(nextCharacter, CWordNode(nextCharacter)));
    return inserted.second ? &inserted.first->second : NULL;
}

另外，

#define PACE 1

這裏的PACE原本是2，因爲一個GBK漢字佔兩個字節，並且原文中也說了如果需要考慮英文或中英文結合的情況，要將PACE改爲1。
不過我試過之後，覺得不管是中文、英文還是中英文，PACE爲 1 都適用，結果都沒錯，只不過中文的情況下每個節點的string都不再是一個完整的漢字，而是漢字的一個字節。

接下來實現這個tree。在建立WordNode樹時，以parent爲根節點建立，一開始parent爲m_emptyRoot，然後keyword按照規則添加到樹中。假設一開始m_emptyRoot爲空，keyword爲「敏感詞」，則會以「敏感詞」爲一條分支建立一根樹枝 '敏'->'感'->'詞'；此後，若想再添加「敏感度」，由於「敏感詞」與「敏感度」的前兩個字相同，則會在 '敏'->'感'->'詞' 的基礎上，從字 '感' 開始新生長出一根分支，即 '敏'->'感'->'度'，這兩根分支共用 '敏'->'感'。

下面代碼實現了CWordTree類，進行樹的構成及查詢。

WordTree.h

#ifndef __WORDTREE_H__
#define __WORDTREE_H__

#include "WordNode.h"

class CWordTree
{
public:
    CWordTree();
    ~CWordTree();

    int nCount;
    CWordNode* insert(std::string &keyWord);
    CWordNode* insert(const char* keyword);
    CWordNode* find(std::string& keyword);
private:
    CWordNode m_emptyRoot;
    int m_pace;
    CWordNode* insert(CWordNode* parent, std::string& keyword);
    CWordNode* insertBranch(CWordNode* parent, std::string& keyword);
    CWordNode* find(CWordNode* parent, std::string& keyword);
};

#endif // __WORDTREE_H__

WordTree.cpp

#include "WordTree.h"

// Starts with an empty tree and a zero match counter. m_pace is not read by
// the visible code, but it is given a defined value so the object never
// carries an indeterminate int.
CWordTree::CWordTree()
    : nCount(0)
    , m_pace(PACE)
{
}

// The root node owns its children by value, so no manual cleanup is needed.
CWordTree::~CWordTree() = default;

/**
 * Adds `keyWord` to the tree, starting from the root node.
 *
 * @param keyWord word to store
 * @return whatever the private insert(parent, keyWord) overload returns
 */
CWordNode* CWordTree::insert(std::string &keyWord)
{
    CWordNode* const root = &m_emptyRoot;
    return insert(root, keyWord);
}

/**
 * Adds a NUL-terminated C string to the tree.
 *
 * @param keyWord word to store; a null pointer is ignored
 * @return NULL for a null pointer, otherwise the result of
 *         insert(std::string&)
 */
CWordNode* CWordTree::insert(const char* keyWord)
{
    // Constructing std::string from a null pointer is undefined behaviour.
    if (keyWord == NULL)
    {
        return NULL;
    }
    std::string wordstr(keyWord);
    return insert(wordstr);
}

/**
 * Adds `keyWord` below `parent`.
 *
 * Existing nodes are followed for as long as the leading PACE-byte fragment
 * is already a child of the current node. The first fragment that is missing
 * hands the remaining text to insertBranch(), which creates the rest.
 *
 * @param parent  node to start from
 * @param keyWord text still to be stored
 * @return NULL when the text runs out on existing nodes, otherwise the result
 *         of insertBranch()
 */
CWordNode* CWordTree::insert(CWordNode* parent, std::string& keyWord)
{
    CWordNode* cursor = parent;
    std::string pending(keyWord);

    while (pending.size() != 0)
    {
        std::string head = pending.substr(0, PACE);
        CWordNode* existing = cursor->findChild(head);
        if (existing == NULL)
        {
            // Nothing stored for this fragment yet: grow a fresh branch here.
            return insertBranch(cursor, pending);
        }
        // Fragment already present: drop it and continue one level deeper.
        pending = pending.substr(PACE, pending.size());
        cursor = existing;
    }
    return NULL;
}

/**
 * Checks whether a stored branch is a prefix of `keyWord`.
 *
 * @param keyWord text to walk, starting at its first byte
 * @return the node where the matched branch ends, or NULL when nothing
 *         matches; on success nCount holds the number of levels descended
 *         below the first matched node
 */
CWordNode* CWordTree::find(std::string& keyWord)
{
    // The walk only ever increments nCount, so a value left over from an
    // earlier lookup would inflate the match length reported by this one.
    nCount = 0;
    return find(&m_emptyRoot, keyWord);
}

/**
 * Walks the tree from `parent` along the PACE-byte fragments of `keyWord`.
 *
 * A match is reported when the walk reaches a node without children. Because
 * of that rule, a stored word that is a strict prefix of another stored word
 * is not reported on its own.
 *
 * nCount is incremented once per level descended and reset to 0 on every
 * failure path, so a failed walk never leaves a partial count behind.
 *
 * @param parent  node to start from
 * @param keyWord text to match; only a prefix needs to match
 * @return the childless node that ended the match, or NULL on failure
 */
CWordNode* CWordTree::find(CWordNode* parent, std::string& keyWord)
{
    const std::string::size_type step = PACE;
    std::string::size_type offset = 0;   // always <= keyWord.size()
    CWordNode* node = parent;

    for (;;)
    {
        std::string unit = keyWord.substr(offset, step);
        CWordNode* child = node->findChild(unit);
        if (child == NULL)
        {
            nCount = 0;
            return NULL;
        }
        if (child->m_map.empty())
        {
            return child;
        }
        if (keyWord.size() - offset <= step)
        {
            // Input ran out in the middle of a stored branch: no match.
            nCount = 0;
            return NULL;
        }
        nCount++;
        offset += step;
        node = child;
    }
}

/**
 * Creates a chain of new nodes below `parent`, one per PACE-byte fragment of
 * `keyWord`.
 *
 * The chain stops early if insertChild() reports that a fragment already
 * exists.
 *
 * @param parent  node under which the branch is created
 * @param keyWord text to store as a new branch
 * @return always NULL
 */
CWordNode* CWordTree::insertBranch(CWordNode* parent, std::string& keyWord)
{
    CWordNode* cursor = parent;
    std::string pending(keyWord);

    do
    {
        std::string head = pending.substr(0, PACE);
        cursor = cursor->insertChild(head);
        if (cursor == NULL)
        {
            break;
        }
        pending = pending.substr(PACE, pending.size());
    } while (!pending.empty());

    return NULL;
}

最後就是利用上述的Tree來實現敏感詞過濾，CWordFilter::censorStr(std::string &source) 函數用來進行敏感詞過濾，source即輸入的字符串，如果source包含敏感詞，則用「**」替代掉。

CWordFilter::loadFile(const char* filepath) 函數通過文件載入敏感詞，並構建WordTree，這裏我用的是txt文件。

下面實現了WordFilter類。

WordFilter.h

#ifndef __WORDFILTER_H__
#define __WORDFILTER_H__

#include "WordTree.h"
#include "base/CCRef.h"

USING_NS_CC;

/**
 * Sensitive-word filter with a single shared instance.
 *
 * Words are loaded from a text file into a CWordTree and then looked up in
 * input strings.
 */
class CWordFilter : public Ref
{
public:
    /// Returns the shared instance, creating it on first use.
    static CWordFilter* getInstance();

    /// Destroys the shared instance, if any.
    static void release();

    ~CWordFilter();

    /// Reads one word per line from `filepath`; false if the file cannot be opened.
    bool loadFile(const char* filepath);

    /// Replaces the first sensitive word found in `source` with "**";
    /// returns whether one was found.
    bool censorStr(std::string &source);

    /// Same check on a filtered copy of `source`; `source` is not modified.
    bool censorStrWithOutSymbol(const std::string &source);

private:
    // Private so that instances are created through getInstance() only.
    CWordFilter();

    // NOTE(review): no definition of these two converters is visible here.
    std::string string_To_UTF8(const std::string & str);
    std::string UTF8_To_string(const std::string & str);

    static CWordFilter* m_pInstance;   // the shared instance, or null
    CWordTree m_WordTree;              // the loaded word list
};



#endif // __WORDFILTER_H__

WordFilter.cpp

#include "WordFilter.h"
#include <ctype.h>
#include <algorithm>
#include <iostream>
#include <fstream>
#include <istream>

using namespace std;

USING_NS_CC;

// The shared instance; stays null until getInstance() creates it.
CWordFilter* CWordFilter::m_pInstance = nullptr;

// Nothing to set up beyond default-constructing the word tree.
CWordFilter::CWordFilter() = default;

// The word tree is a by-value member and cleans up after itself.
CWordFilter::~CWordFilter() = default;

/**
 * Returns the shared filter, creating it on the first call.
 *
 * @return pointer to the shared instance
 */
CWordFilter* CWordFilter::getInstance()
{
    if (m_pInstance != NULL)
    {
        return m_pInstance;
    }
    m_pInstance = new CWordFilter();
    return m_pInstance;
}

/**
 * Destroys the shared filter and clears the instance pointer, so the next
 * getInstance() call creates a fresh one.
 */
void CWordFilter::release()
{
    // Deleting a null pointer is a no-op, so no explicit check is needed.
    delete m_pInstance;
    m_pInstance = NULL;
}

namespace
{
    // Removes line-ending residue ('\r', '\n' or '\0') from the end of `line`.
    // std::getline only consumes '\n', so a CRLF file leaves a '\r' behind on
    // platforms that do not translate line endings.
    void trimTrailingLineBreak(std::string& line)
    {
        while (!line.empty())
        {
            const char last = line[line.size() - 1];
            if (last != '\r' && last != '\n' && last != '\0')
            {
                break;
            }
            line.erase(line.size() - 1);
        }
    }
}

/**
 * Loads sensitive words from a text file, one word per line, into the tree.
 *
 * Trailing line-ending residue is removed from every line on every platform.
 * Only actual residue is stripped, so a line without any (for example a last
 * line with no newline) keeps all of its bytes. Empty lines are skipped.
 *
 * @param filepath path of the word list; a null pointer is rejected
 * @return false if the path is null or the file cannot be opened, true
 *         otherwise
 */
bool CWordFilter::loadFile(const char* filepath)
{
    if (filepath == NULL)
    {
        return false;
    }

    std::ifstream infile(filepath, std::ios::in);
    if (!infile)
    {
        return false;
    }

    std::string line;
    while (std::getline(infile, line))
    {
        trimTrailingLineBreak(line);
        if (!line.empty())
        {
            m_WordTree.insert(line);
        }
    }

    // The stream closes itself when it goes out of scope.
    return true;
}

bool CWordFilter::censorStr(string &source)
{
    int lenght = source.size();
    for (int i = 0; i < lenght; i += 1)
    {
        string substring = source.substr(i, lenght - i);
        if (m_WordTree.find(substring) != NULL)
        {
            source.replace(i, (m_WordTree.nCount + 1), "**");
            lenght = source.size();
            return true;
        }
    }
    return false;
}

/**
 * Runs censorStr() on a copy of `source` that keeps only the byte sequences
 * whose lead byte has its top three bits set (UTF-8 sequences of three or
 * more bytes).
 *
 * Single-byte characters and two-byte sequences are dropped. Scanning stops
 * at the first NUL byte or at the end of the string, whichever comes first.
 *
 * @param source text to check; it is not modified
 * @return the result of censorStr() on the filtered copy
 */
bool CWordFilter::censorStrWithOutSymbol(const std::string &source)
{
    std::string sourceWithOutSymbol;

    const std::string::size_type size = source.size();
    std::string::size_type i = 0;

    // Bounding by size() keeps a truncated multi-byte sequence at the end of
    // the input from pushing the index past the string.
    while (i < size && source[i] != 0)
    {
        const unsigned char lead = static_cast<unsigned char>(source[i]);
        if ((lead & 0xE0) == 0xE0)
        {
            // 1111xxxx is copied as four bytes, 1110xxxx as three.
            const std::string::size_type byteCount = (lead & 0x10) ? 4 : 3;
            // append(str, pos, n) clamps n to the bytes actually available.
            sourceWithOutSymbol.append(source, i, byteCount);
            i += byteCount;
        }
        else if ((lead & 0xC0) == 0xC0)
        {
            i += 2;   // two-byte sequence: skipped
        }
        else
        {
            i += 1;   // single byte or stray continuation byte: skipped
        }
    }
    return censorStr(sourceWithOutSymbol);
}

這裏說明一點，本人是做Cocos2d-x手遊客戶端開發的，程序要移植到安卓或者iOS平臺上。當逐行讀取txt文件中的敏感詞並構成樹的時候，getline(infile, read)函數得到的read字符串末尾會多出一個字符（看上去像結束符，比如「槍手\0」；若詞庫文件以Windows的CRLF換行保存，這個多餘字符更可能是getline沒有去掉的 '\r'），這時跟我們需要檢測的字符串「…槍手…」就明顯不符，因此檢測不出來。這種情況我目前只知道在安卓或者iOS平臺上存在，而在Windows環境下的VS中不會出現這種問題。所以我對讀取到的字符串做了處理，把最後一個字符去掉，再進行下一步操作。

而我使用的是lua，lua發送給C++的字符串都是用utf-8編碼的，所以在去除字符的時候並不能簡單地使用(a & 0x80)來判斷。

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。