python爬蟲學習(11) —— 也寫個AC自動機

0. 寫在前面

本文記錄了一個AC自動機的誕生!

以前看過有人用C++寫過AC自動機,也有用C#寫的,還有一個用nodejs寫的。

感覺他們的代碼過於冗長,而且AC率也不是很理想。
恰好在回宿舍的路上和學弟聊起這個事,
隨意想了想思路,覺得還是蠻簡單的,就順手寫了一個,效果還可以接受。

先上個圖吧:

zhenAC

rank

最後應該還可以繼續刷,如果修改代碼或者再添加一下其他搜索引擎,可以AC更多題,
不過我有意控制在3000這個AC量,也有意跟在五虎上將之後。

1. 爬蟲思路

思路其實非常清晰:

  1. 模擬登陸HDU
  2. 針對某一道題目
    • 搜索AC代碼
      • 通過正則表達式進行代碼的提取
      • 通過htmlparser進行代碼的處理
    • 提交
      • 若AC,返回第2步(處理下一道題)
      • 否則,繼續提交代碼(這裏最多只提交10份代碼)
      • 10次提交後還未AC,放棄此題

2. 簡單粗暴的代碼

# -*- coding: utf-8 -*-
import requests, re, os, HTMLParser, time, getpass

# Root of the HDU online judge; login() fetches it before posting credentials.
host_url = 'http://acm.hdu.edu.cn'
# Endpoint that login() posts the username/password form to.
post_url = 'http://acm.hdu.edu.cn/userloginex.php?action=login'
# Endpoint that search_csdn() posts candidate solutions to.
sub_url = 'http://acm.hdu.edu.cn/submit.php?action=submit'
# CSDN search endpoint queried for blog posts about a problem.
csdn_url = 'http://so.csdn.net/so/search/s.do'
# Browser-like User-Agent sent when fetching blog pages and when submitting code.
head = { 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36' }
# Shared parser instance; parser_code() uses it to unescape HTML entities.
html_parser = HTMLParser.HTMLParser()
# One session shared by login() and the submit request, so the submission is
# sent from the same session that logged in (presumably carrying its cookies).
s = requests.session()

def login(usr,psw):
    """Sign in to HDU with the shared session ``s``.

    usr -- HDU account name.
    psw -- password for that account.

    The response is not inspected, so a rejected login is not reported here.
    """
    # Visit the home page first with the same session (presumably so the
    # server hands out its session cookie before the login form is posted).
    s.get(host_url)
    form = {
        'username': usr,
        'userpass': psw,
        'login': 'Sign In',
    }
    s.post(post_url, data=form)

def check_lan(lan):
    """Pick the value for the submit form's ``language`` field.

    lan -- the ``class`` attribute text of the blog's code block.

    Returns '5' when ``lan`` mentions 'java', otherwise '0'.
    """
    return '5' if 'java' in lan else '0'

def parser_code(code):
    """Turn HTML-escaped source text into plain UTF-8 encoded source.

    code -- text captured from a blog page's code block, still containing
            HTML entities.

    Returns the unescaped text encoded as UTF-8.
    """
    unescaped = html_parser.unescape(code)
    return unescaped.encode('utf-8')

def is_ac(pid,usr):
    """Report whether problem ``pid`` appears in ``usr``'s solved list on HDU.

    pid -- problem id as a string, e.g. '1000'.
    usr -- HDU account name whose status page is fetched.

    Returns True when ``pid`` occurs in the "List of solved problems" script
    on the user's status page, False otherwise. False is also returned when
    that section cannot be located, instead of failing with AttributeError
    on a missing match.
    """
    # Pass the user name via ``params`` so requests URL-encodes it.
    tmp = requests.get('http://acm.hdu.edu.cn/userstatus.php', params={'user': usr}).text
    accept = re.search('List of solved problems</font></h3>.*?<p align=left><script language=javascript>(.*?)</script><br></p>',tmp,re.S)
    if accept is None:
        # Page layout differs from what the pattern expects (unknown user,
        # error page, ...): treat the problem as not solved.
        print('solved list not found for %s' % usr)
        return False
    # NOTE(review): this is a plain substring test against the script text, so
    # a pid could also match digits that belong to another number -- confirm
    # against the real page format if exact matching matters.
    if pid in accept.group(1):
        print('%s was solved' % pid)
        return True
    return False

def search_csdn(PID,usr):
    """Search CSDN blogs for a solution to problem ``PID`` and submit candidates.

    PID -- problem id as a string, e.g. '1000'.
    usr -- HDU account name, used to check whether a submission was accepted.

    Every link on the search result page is fetched in turn. A page is used
    only if its <title> contains both ``PID`` and 'hdu', it has a code block,
    and that code looks like C/C++ ('include') or Java ('import java').
    Each such candidate is submitted through the shared session ``s``; the
    loop stops as soon as is_ac() reports the problem as solved.
    Returns None.
    """
    get_data = { 'q':'HDU ' + PID,  't':'blog', 'o':'', 's':'', 'l':'null'  }
    search_html = requests.get(csdn_url,params=get_data).text
    linklist = re.findall('<dd class="search-link"><a href="(.*?)" target="_blank">',search_html,re.S)
    for link in linklist:
        print(link)
        tm_html = requests.get(link,headers=head).text
        title_match = re.search('<title>(.*?)</title>',tm_html,re.S)
        if title_match is None:
            # No <title> at all (error page, redirect stub): skip this link
            # instead of crashing on a missing match.
            continue
        title = title_match.group(1).lower()
        if PID not in title or 'hdu' not in title:
            continue
        tmp = re.search('name="code" class="(.*?)">(.*?)</pre>',tm_html,re.S)
        if tmp is None:
            print('code not find')
            continue
        LAN = check_lan(tmp.group(1))
        CODE = parser_code(tmp.group(2))
        # Only submit text that looks like real source code.
        if 'include' not in CODE and 'import java' not in CODE:
            continue
        print('%s %s' % (PID, LAN))
        print('--------------')
        submit_data = { 'check':'0', 'problemid':PID, 'language':LAN, 'usercode':CODE }
        s.post(sub_url,headers=head,data=submit_data)
        # Wait before checking the status page (presumably to let the judge
        # finish the submission).
        time.sleep(5)
        if is_ac(PID,usr):
            break
     
if __name__ == '__main__':
    # Ask for credentials, sign in, then walk problem ids 1000..5679 and
    # look for a solution for every problem that is not solved yet.
    usr = raw_input('input your username:')
    psw = getpass.getpass('input your password:')
    login(usr, psw)
    for pro_cnt in range(1000, 5680):
        PID = str(pro_cnt)
        if not is_ac(PID, usr):
            search_csdn(PID, usr)

代碼不長,僅僅只有78行,是的,就是這樣!

yanhsi

3. TODO

目前沒有打算完善這篇博客,也不推薦去研究這個東西,推薦的是去學習真正的算法,哈哈!

很久很久以前自己寫過的AC自動機,貼一發:

#include <cstdio>
#include <cstring>
#include <algorithm>
#include <queue>
using namespace std;
// Fill the whole array `a` with byte value `b`. `a` must be a real array
// (not a pointer) so that sizeof(a) covers all of it.
#define clr( a, b ) memset( a, b, sizeof(a) )
const int SIGMA_SIZE = 26;          // alphabet size; letters are mapped with c - 'a'
const int NODE_SIZE = 500000 + 10;  // capacity of the per-node arrays below

// Aho-Corasick automaton over the letters 'a'..'z'.
//
// Usage order: init(), insert() for every pattern, getfail() once, then work().
// Characters outside 'a'..'z' and more than NODE_SIZE trie nodes are not
// checked for; both index outside the arrays.
struct ac_automaton{
    int ch[ NODE_SIZE ][ SIGMA_SIZE ];  // ch[u][c]: node reached from u on letter c (0 = none, before getfail)
    int f[ NODE_SIZE ];                 // failure link of each node
    int val[ NODE_SIZE ];               // number of inserted patterns ending at this node
    int last[ NODE_SIZE ];              // next node on the failure chain with val != 0 (0 if none)
    int sz;                             // number of nodes in use; node 0 is the root

    // Reset to an empty trie that contains only the root.
    void init(){
        sz = 1;
        clr( ch[0], 0 ), clr( val, 0 );
    }

    // Add the pattern `s` to the trie. Must be called before getfail(),
    // because getfail() overwrites the empty child slots this relies on.
    void insert( const char *s ){
        int u = 0, i = 0;
        for( ; s[i]; ++i ){
            int c = s[i] - 'a';
            if( !ch[u][c] ){
                // Take a fresh node from the pool and clear its state.
                clr( ch[sz], 0 );
                val[sz] = 0;
                ch[u][c] = sz++;
            }
            u = ch[u][c];
        }
        val[u]++;
    }

    // Compute failure links `f` and output links `last` by BFS, and fill
    // every missing transition ch[r][c] with the transition of f[r], so
    // work() can follow ch[][] directly without walking failure links.
    void getfail(){
        queue<int> q;
        f[0] = 0;
        last[0] = 0;   // the root has no output link; set it explicitly
        for( int c = 0; c < SIGMA_SIZE; ++c ){
            int u = ch[0][c];
            if( u ) f[u] = 0, q.push(u), last[u] = 0;
        }
        while( !q.empty() ){
            int r = q.front(); q.pop();
            for( int c = 0; c < SIGMA_SIZE; ++c ){
                int u = ch[r][c];
                if( !u ){
                    // No child on c: borrow the transition of the failure node.
                    ch[r][c] = ch[ f[r] ][c];
                    continue;
                }
                q.push( u );
                int v = f[r];
                while( v && !ch[v][c] ) v = f[v];
                f[u] = ch[v][c];
                // Nearest node on u's failure chain that ends a pattern.
                last[u] = val[ f[u] ] ? f[u] : last[ f[u] ];
            }
        }
    }

    // Scan the text `s` and return how many inserted patterns occur in it.
    // Each inserted pattern is counted at most once (duplicates inserted
    // several times count once per insertion). Counting sets val[] to 0 for
    // every matched node, so the automaton must be rebuilt before it is
    // used on another text.
    int work( const char* s ){
        int res = 0;
        int u = 0, i = 0, e;
        for( ; s[i]; ++i ){
            int c = s[i] - 'a';
            u = ch[u][c];
            // Start from u itself only if a pattern ends here; otherwise jump
            // to the first pattern-ending node on the failure chain. Starting
            // at u unconditionally would miss patterns that are proper
            // suffixes of the current prefix when u is not a pattern end.
            e = val[u] ? u : last[u];
            // A node whose val is already 0 was counted earlier, and so was
            // the rest of its chain, so the walk can stop there.
            while( val[e] ){
                res += val[e];
                val[e] = 0;
                e = last[e];
            }
        }
        return res;
    }
}ac;
相關文章
相關標籤/搜索