Python 爬蟲的初步實踐

簡介

我的 sister 想要我爬一些試題給她。一共有 80 套,她不想手工逐一點擊,因此由我來寫爬蟲。網站比較簡單,所以沒有費很大的力氣。期間參考了一系列的網站,但都沒有記錄下來。

code

#!/usr/bin/env python
#coding=utf-8

import pdfkit
import time
import requests
import sys
import urllib2
import re

def get_hiddenvalue(url):
    """Download ``url`` and return its hidden form token values.

    The page is fetched with ``urllib2`` and the ``value`` attributes of the
    hidden ``__VIEWSTATE`` and ``__EVENTVALIDATION`` ``<input>`` fields are
    extracted with case-insensitive regular expressions.

    Args:
        url: Address of the page to download.

    Returns:
        A ``(viewstate, eventvalidation)`` tuple containing the first match
        found for each field.

    Raises:
        IndexError: If either hidden field is not present in the page.
        urllib2.URLError: If the page cannot be fetched.
    """
    request = urllib2.Request(url)
    reponse = urllib2.urlopen(request)
    try:
        resu = reponse.read()
    finally:
        # Release the connection even if read() fails. try/finally is used
        # because this Python 2 response object cannot be used in a
        # ``with`` statement.
        reponse.close()
    VIEWSTATE = re.findall(r'<input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="(.*?)" />', resu, re.I)
    EVENTVALIDATION = re.findall(r'input type="hidden" name="__EVENTVALIDATION" id="__EVENTVALIDATION" value="(.*?)" />', resu, re.I)
    return VIEWSTATE[0], EVENTVALIDATION[0]

def get_hiddenvalue_string(myStr):
    """Extract the hidden form token values from an HTML string.

    Args:
        myStr: HTML text to search.

    Returns:
        A ``(viewstate, eventvalidation)`` tuple holding the ``value``
        attribute of the first ``__VIEWSTATE`` and the first
        ``__EVENTVALIDATION`` hidden input found (matched case-insensitively).

    Raises:
        IndexError: If either hidden field is missing from ``myStr``.
    """
    viewstate_pattern = r'<input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="(.*?)" />'
    validation_pattern = r'input type="hidden" name="__EVENTVALIDATION" id="__EVENTVALIDATION" value="(.*?)" />'
    # Indexing the findall() result with [0] keeps only the first occurrence
    # and raises IndexError when a field is absent.
    first_matches = [
        re.findall(pattern, myStr, re.I)[0]
        for pattern in (viewstate_pattern, validation_pattern)
    ]
    return tuple(first_matches)

# Python 2 workaround: switch the interpreter's default encoding to UTF-8 so
# that implicit str <-> unicode conversions (e.g. writing the unicode response
# text through a plain open() file) do not fail on non-ASCII characters.
reload(sys)
sys.setdefaultencoding( "utf-8" )

# Form fields sent with every POST. 'cid' and 'pid' are overwritten for each
# exam inside the loop below; the token and button fields are added there too.
data = {
    'cid': '1',
    'pid': '5',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
}
# Options handed to pdfkit (wkhtmltopdf) when rendering each PDF.
options = {
    'page-size': 'A4',
    'margin-top': '0mm',
    'margin-right': '0mm',
    'margin-bottom': '0mm',
    'margin-left': '0mm',
    # 'orientation': 'Landscape',  # landscape layout
    'encoding': "UTF-8",
    'no-outline': None,
    # 'footer-right': '[page]',  # add page numbers
}
# Output file-name prefixes; entry k is used for cid == k + 1.
myMap=["UniversityPsychology","HigherEducationRegulations","HigherEducation","TeacherEthics"]
confg = pdfkit.configuration(wkhtmltopdf='/usr/local/bin/wkhtmltopdf')

# (form field name, submitted value) of the three answer buttons, in the order
# in which they are submitted for every exam.
buttons = (
    ('Button1', '提交併查看單選題答案'),
    ('Button2', '提交併查看多選題答案'),
    ('Button3', '提交併查看判斷題答案'),
)

for cid, subject in enumerate(myMap, 1):
    for pid in range(1, 21):
        data['cid'] = str(cid)
        data['pid'] = str(pid)
        url = 'http://zjzx.zjnu.edu.cn/test/Default.aspx?cid=' + str(cid) + '&pid=' + str(pid)
        myStr = subject + '_' + 'exam' + str(pid) + '.pdf'
        myStr1 = subject + '_' + 'exam' + str(pid) + '.html'

        data['drop1'] = '1'
        data['drop2'] = '1'

        # Original author's note: look up __VIEWSTATE and __EVENTVALIDATION;
        # one carries the encoded state of all the page's controls, the other
        # is (per the author) used for encryption. The first pair comes from a
        # plain GET of the page, later pairs from the previous POST response.
        tokens = get_hiddenvalue(url)
        for step, (button, label) in enumerate(buttons):
            if step:
                # Pause between consecutive submissions, then pick up the
                # fresh tokens returned by the previous one.
                time.sleep(3)
                tokens = get_hiddenvalue_string(responseReturn)
            data['__VIEWSTATE'], data['__EVENTVALIDATION'] = tokens
            data[button] = label
            response = requests.post(url=url, data=data, headers=headers)
            responseReturn = response.text
            # Only one button field may be present in any single submission.
            del data[button]

        # Mode 'w' (not 'a'): re-running the script must replace the saved
        # page instead of appending a second copy to it.
        with open(myStr1, 'w') as html_file:
            html_file.write(responseReturn)

        # Render the PDF from the text already in memory rather than reading
        # the file that was just written back in.
        pdfkit.from_string(responseReturn, myStr, configuration=confg, options=options)
相關文章
相關標籤/搜索