pycurl 參考文檔:http://pycurl.io/docs/latest/index.html 。雖然是英文文檔,但看起來並不特別吃力,跟著做問題不大。
# coding=utf-8
import io
import re
import StringIO

import pycurl
# Python 2 rule of thumb: never call encode() on a str and never call
# decode() on a unicode object.  If s is a str encoded as code_A, convert it
# with s.decode('code_A').encode('code_B').


class PySpider(object):
    """Minimal HTTP GET/POST helper built on a single reusable pycurl handle.

    The same ``pycurl.Curl`` object (``self.c``) is used for every request,
    so any option set for one request stays in effect for the next one
    unless it is explicitly overwritten.  ``GetData`` and ``PostData``
    therefore always set the request method and the header-output flag
    themselves.
    """

    _USER_AGENT = ('User-Agent:Mozilla/5.0 (Windows NT 6.1; WOW64; rv:32.0) '
                   'Gecko/20100101 Firefox/32.0')

    # Matches ``charset=NAME`` in both HTTP headers and HTML <meta> tags.
    # An optional opening quote is skipped and only charset-name characters
    # are captured, so the match cannot run on into the following header
    # lines or markup.  Bytes pattern: the page buffer holds raw bytes.
    _CHARSET_RE = re.compile(
        br"""charset\s*=\s*["']?([A-Za-z0-9][A-Za-z0-9_.:+-]*)""",
        re.IGNORECASE,
    )

    def __init__(self):
        """Create the Curl handle and apply the options shared by all requests."""
        self.c = pycurl.Curl()
        # Cookies: COOKIEFILE is the file cookies are loaded from, COOKIEJAR
        # the file they are written to; both point at the same file.
        self.c.setopt(pycurl.COOKIEFILE, "cookie_file_name")
        self.c.setopt(pycurl.COOKIEJAR, "cookie_file_name")
        # Redirects: follow them automatically, at most 5 hops.
        self.c.setopt(pycurl.FOLLOWLOCATION, 1)
        self.c.setopt(pycurl.MAXREDIRS, 5)
        # Timeouts in seconds: connection phase / whole transfer.
        self.c.setopt(pycurl.CONNECTTIMEOUT, 60)
        self.c.setopt(pycurl.TIMEOUT, 120)
        # Proxy: uncomment and fill in suitable values if one is needed.
        # self.c.setopt(pycurl.PROXY, 'http://11.11.11.11:8080')
        # self.c.setopt(pycurl.PROXYUSERPWD, 'aaa:aaa')

    @staticmethod
    def _find_charsets(page):
        """Return every charset name declared in *page* (raw bytes), in order."""
        return PySpider._CHARSET_RE.findall(page)

    def _fetch(self, url, headers):
        """Run the configured request against *url* and collect the response.

        Returns ``(page, charsets)``; the buffer is released even when
        ``perform()`` raises ``pycurl.error``.
        """
        buf = io.BytesIO()
        try:
            self.c.setopt(pycurl.WRITEFUNCTION, buf.write)
            self.c.setopt(pycurl.URL, url)
            self.c.setopt(pycurl.HTTPHEADER, headers)
            self.c.perform()
            the_page = buf.getvalue()
        finally:
            buf.close()
        return the_page, self._find_charsets(the_page)

    # ----------------------------------- GET -----------------------------------#
    def GetData(self, url):
        """Fetch *url* with HTTP GET.

        Returns:
            ``(the_page, charset)`` - the raw response body and the list of
            charset names found in it (empty list when none is declared).

        Raises:
            pycurl.error: if the transfer fails.
        """
        headers = ['Accept:*/*', self._USER_AGENT]
        # The handle is shared with PostData: switch the method back to GET
        # and stop copying response headers into the body, otherwise a GET
        # issued after a POST would silently inherit both settings.
        self.c.setopt(pycurl.HTTPGET, 1)
        self.c.setopt(pycurl.HEADER, 0)
        return self._fetch(url, headers)

    # ----------------------------------- POST ----------------------------------#
    def PostData(self, url, data):
        """Send *data* to *url* with HTTP POST.

        Args:
            url: target URL.
            data: request body, already encoded as a string.

        Returns:
            ``(the_page, charset)`` - the raw response, which includes the
            response headers because HEADER is enabled for POST requests,
            and the list of charset names found in it.

        Raises:
            pycurl.error: if the transfer fails.
        """
        headers = [self._USER_AGENT]
        self.c.setopt(pycurl.POST, 1)
        self.c.setopt(pycurl.POSTFIELDS, data)
        # Include the response headers in the output so a charset announced
        # only in Content-Type is still detected.
        self.c.setopt(pycurl.HEADER, 1)
        return self._fetch(url, headers)
稍微封裝了一下,就變成上面這玩意了。保存上面代碼到 Spider.py 文件中,用法如下:
# Usage example for the PySpider wrapper saved as Spider.py.
import json

from Spider import PySpider

spider = PySpider()

# GET: GetData returns a (page, charsets) pair, so unpack both values.
html, charset = spider.GetData('http://www.baidu.com')
# Single-argument print form behaves the same as a statement and as a function.
print("%s %s" % (html, charset))

# POST: build the request body, then pass it as PostData's second argument.
postdata = json.dumps({
    'cityListName': 'gz',
    'trade': '',
})
html, charset = spider.PostData('http://qy.m.58.com/m_entlist/ajax_listinfo/2', postdata)
print(html)