Python爬蟲之HDU提交數據

時間 2019-11-13

標籤 python 爬蟲 hdu 提交數據欄目 Python 简体版

原文原文鏈接

前一篇 http://www.cnblogs.com/liyinggang/p/6094338.html 使用了爬蟲爬取 hdu 的代碼,今天實現了將數據向 hdu 提交的功能,接下來就是需要將兩個功能合併了.

這裏感謝綦大神的博客,不只 ACM 玩得厲害,而且還精通各類語言.我輩楷模,我從他這裏學會了怎麼使用 chrome 抓包.按 F12,然後找到 Network 就好了.然後就可以看到各類信息.

比如在 hdu 的登錄界面我們就可以看到如下信息:

然後可以根據這些信息確定這個網頁是需要 POST 方法還是 GET 方法,還有 header 的信息,以及發送數據的格式等等.

我們總共有三個網頁需要進行解析:

登錄:

網頁：http://acm.hdu.edu.cn/userloginex.php?action=login
數據: username=用戶名&userpass=密碼&login=Sign+In

提交:

網頁:http://acm.hdu.edu.cn/submit.php?action=submit
數據:problemid=pid&language=lang&usercode=code&check=1

然後獲取狀態的頁面,下載好了然後再用正則表達式去匹配.這裏對於每一個網頁的下載,特別是這個 status 頁面,我用自己寫的 Download 函數下不下來,可能是 HDU 做了什麼防爬蟲的手段...這裏關於傳輸數據、下載,在網上參考了別人寫的代碼才搞定,可是它的正則表達式是有問題的...說說這個正則表達式吧,真的弄了我很久,因為我一直想很貪心地把自己的提交記錄一下子就給匹配到,這樣反而做不好 (反正我是弄不好用一句話去匹配,老是匹配多了 = =) 後來我直接先把所有的 <tr ** > </tr> 標籤弄出來,然後到每個裏面找我的提交記錄,這樣分開處理要好多了,然後找狀態就很簡單了.這裏網頁裏面如果有 \n 符,用 .*? 是匹配不到的,因為 . 匹配的是不包括換行符的所有字符,所以要用 [\s\S] 或者 [\d\D] 這種.

/**這一段是更新,不看也無妨**/

/*************更新*******************************/
這裏的話有另外一種的方法能夠獲得咱們所需的表單提交時所需的信息，咱們可以利用 lxml.html 的 cssselect 進行解析。
/*********************************************/
#coding:utf-8
import urllib2

import lxml.html


__author__ = 'liyinggang'


def getInputFromForm(html):
    '''Collect the named <input> fields found inside forms of an HTML page.

    Returns a dict mapping each input's ``name`` attribute to its ``value``
    attribute (``None`` when the tag has no ``value``). Inputs without a
    name are skipped; when a name occurs more than once the last one wins.
    '''
    fields = {}
    # The CSS selector 'form input' picks every <input> nested in a <form>.
    inputs = lxml.html.fromstring(html).cssselect('form input')
    for node in inputs:
        name = node.get('name')
        if not name:
            continue
        fields[name] = node.get('value')
    return fields

if __name__ == '__main__':
    # Demo: download the HDU login page and extract its form inputs.
    login_page = urllib2.urlopen('http://acm.hdu.edu.cn/userloginex.php').read()
    getInputFromForm(login_page)

View Code

/*********************/

附上 code (^_^ 希望大家學習爬蟲不要不停地刷 hdu 界面,那樣的話對大家都不好,hdu 的維護靠大家~):

#coding:utf-8
'''
Created on 2016年11月29日

@author: liyinggang
'''
import cookielib, logging
from time import sleep
import urllib2, urllib, re


# Base URL of the HDU Online Judge; the paths below are appended to it.
seed_url = "http://acm.hdu.edu.cn"
# Path the login form is POSTed to.
login_url = "/userloginex.php?action=login"
# Path a solution is POSTed to.
submit_url = "/submit.php?action=submit"
# Path of the judge status listing that is polled for the verdict.
status_url ="/status.php"
class HDU:
    """Small client for the HDU Online Judge: log in, submit, poll the verdict.

    All requests are POSTs to ``seed_url`` plus one of the module-level paths
    (``login_url``, ``submit_url``, ``status_url``). The session cookie is
    kept in a cookie jar that is installed as urllib2's global opener.
    """

    # Verdicts meaning the judge has not finished with the submission yet.
    WAIT_STATUSES = ('Compiling', 'Queuing', 'Running')
    # Polling stops after this many waits even if the verdict is still pending.
    MAX_POLLS = 50
    # Seconds to sleep between two polls of the status page.
    POLL_INTERVAL = 10

    def __init__(self, username, password):
        """Remember the credentials and install a cookie-aware URL opener."""
        self.username = username
        self.password = password
        self.code = None    # source code of the most recent submit()
        self.pid = 1000     # problem id of the most recent submit()
        self.retry = False  # True once a Compilation Error has been retried
        cj = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj), urllib2.HTTPHandler)
        # urllib2.urlopen() always goes through the globally installed opener.
        # Without install_opener() the cookie jar above would never be used and
        # the login session would be lost between requests.
        urllib2.install_opener(opener)
        self.headers = {"User-Agent":"Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36"}

    def _open(self, path, fields):
        """POST the form ``fields`` to ``seed_url + path``; return the response.

        The caller is responsible for closing the returned response.
        """
        request = urllib2.Request(seed_url + path, urllib.urlencode(fields), self.headers)
        return urllib2.urlopen(request, timeout=10)

    @staticmethod
    def _parse_status(html, username):
        """Extract the verdict of ``username``'s first row on a status page.

        The rows are taken from the second ``<table>`` of the page. Returns the
        text of the first ``<font>`` element in the first row that contains
        ``username`` literally, or ``'UNKNOWN ERROR'`` when no row does.

        Raises:
            IndexError: the page has fewer than two tables, or the matching
                row has no ``<font>`` element.
        """
        tables = re.findall(r'<table[^>]+>([\s\S]*?)</table>', html)
        rows = re.findall(r'<tr[^>]+>([\s\S]*?)</tr>', tables[1])
        # Plain substring test: the username must not be treated as a regex,
        # otherwise characters such as '.' would match other users' rows.
        needle = str(username)
        for row in rows:
            if needle in row:
                return re.findall(r'<font[^>]+>(.*?)</font>', row)[0]
        return 'UNKNOWN ERROR'

    def login(self):
        """Log in with the stored credentials.

        Returns:
            True when the response page contains 'Sign Out', False otherwise
            (including any error while performing the request).
        """
        fields = {'username': self.username,
                  'userpass': self.password,
                  'login': 'Sign In'}
        try:
            response = self._open(login_url, fields)
            try:
                html = response.read()
            finally:
                response.close()
        except Exception:
            logging.exception('login failed')
            return False
        if 'Sign Out' not in html:
            logging.error('login failed')
            return False
        print('login success!')
        return True

    def getstatus(self):
        """Poll the status page until the latest submission has a verdict.

        While the verdict is one of ``WAIT_STATUSES`` the page is polled again
        after ``POLL_INTERVAL`` seconds, at most ``MAX_POLLS`` times. On the
        first 'Compilation Error' the stored code is resubmitted once with
        language id 2 (C++ according to the original author) and polled again.

        Returns:
            True if the (possibly resubmitted) solution is 'Accepted',
            False otherwise, including when fetching or parsing fails.
        """
        fields = {'user': self.username,
                  'lang': 0,
                  'first': '',
                  'pid': '',
                  'status': 0}
        status = ''
        polls = 0
        while True:
            try:
                response = self._open(status_url, fields)
                try:
                    html = response.read()
                finally:
                    response.close()
                status = self._parse_status(html, self.username)
            except Exception:
                logging.exception('failed to fetch or parse the status page')
                print('程序發生錯誤終止')
                return False
            if status not in self.WAIT_STATUSES or polls >= self.MAX_POLLS:
                break
            polls += 1
            sleep(self.POLL_INTERVAL)
        print('hduoj problem ' + str(self.pid) + ':' + status)
        if status == 'Compilation Error' and not self.retry:
            # Try exactly once more with a different compiler.
            self.retry = True
            self.submit(pid=self.pid, lang=2, code=self.code)
            if self.getstatus():
                return True
        return status == 'Accepted'

    def submit(self, pid, lang, code):
        """Submit ``code`` for problem ``pid`` using language id ``lang``.

        The problem id and code are remembered on the instance so that
        getstatus() can report and, if needed, resubmit them.

        Returns:
            True when the request completed with HTTP status 200 or 302,
            False otherwise (including any error while performing it).
        """
        fields = {'problemid': pid,
                  'language': lang,
                  'usercode': code,
                  'check': '1'}
        self.code = code
        self.pid = pid
        try:
            response = self._open(submit_url, fields)
            try:
                http_code = response.code
            finally:
                response.close()
        except Exception:
            logging.exception('submit fail!')
            return False
        sleep(1)
        if http_code not in (200, 302):
            logging.error('submit fail!')
            return False
        print('submit success!')
        return True

在 blog 裏面提取想要的代碼 (開頭的鏈接已經有這部分功能了,不過整合一下.如果有大佬能提出修改意見,幫我提供更好的正則表達式當然再感謝不過.寶寶心裏苦,博客園的代碼解析出來有時候帶有行號,所以乾脆不要了,心塞塞（/TДT)/ ):

def getcode(url):
    '''Download a blog page and return the first code snippet found in it.

    Candidate snippets are the text of every ``<pre>`` element followed by
    every ``p > textarea.cpp`` element. A candidate counts as code when it
    starts with ``#include`` (C/C++) or with ``import`` / ``#import`` (Java)
    and contains ``main`` followed by at least one more character.

    Returns:
        ``[code, lang]`` where ``lang`` is the language id to submit with
        (0 is G++, 5 is Java), or ``None`` when no candidate qualifies.
    '''
    D = Downloader(user_agent='lyg')
    html = D(url)
    tree = lxml.html.fromstring(html)
    candidates = tree.cssselect('pre')
    candidates.extend(tree.cssselect('p > textarea.cpp'))
    # Real code is expected to define a main function. Only the name is
    # matched so that 'main()', 'main(void)' and 'main(String[] args)' all
    # qualify.
    cpp_pattern = re.compile(r'^(#include[\s\S]*main[\d\D]+)')
    # Java imports are written without '#'; '#import' is still accepted so
    # that everything matched previously keeps matching.
    java_pattern = re.compile(r'^(#?import[\s\S]*main[\d\D]+)')
    for candidate in candidates:
        text = candidate.text_content()
        found = cpp_pattern.search(text)
        if found:
            return [found.group(1), 0]
        found = java_pattern.search(text)
        if found:
            return [found.group(1), 5]
    return None

測試代碼:

#coding:utf-8
'''
Created on 2016年11月29日

@author: admin
'''
from HDU import HDU
from time import sleep
# Problem 1000 (A + B), submitted with language id 0.
pid = 1000
lang = 0
# C solution to submit; '\\n' keeps a literal backslash-n inside the C source.
code = '''
#include<stdio.h>
int main()
{
 int a,b;
 while(scanf("%d%d",&a,&b)!=EOF)
 printf("%d\\n",a+b);
 return 0;
}
'''
# Replace the placeholders with real credentials before running.
hdu = HDU('***','***')
# Only submit after a successful login, and only poll after a successful
# submit; the short pause precedes the first status query.
if hdu.login() and hdu.submit(pid, lang, code):
    sleep(2)
    hdu.getstatus()