# Headers sent with the request: a session cookie plus browser-like
# User-Agent and Accept values.
# NOTE(review): the Cookie value looked truncated in the original snippet (its
# closing quote was missing); it is terminated here after the last visible pair.
kv = {
    'Cookie': 'ccpassport=ec081bd592c086d4057c3442b43b7998; wzwsconfirm=52277a99b139398795c925c264b5cf54; wzwstemplate=OQ==; wzwschallenge=-1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
}

# NOTE(review): HTTPAdapter appears to capture its default retry count when the
# module is imported, so this assignment may not change retry behaviour.
# Mounting HTTPAdapter(max_retries=5) on a Session is the documented way - confirm.
requests.adapters.DEFAULT_RETRIES = 5

# NOTE(review): this sets an attribute on a throwaway Session object that the
# requests.get() call below never uses, so it likely has no effect - confirm.
requests.session().keep_alive = False

# Fetch the page; `url` must be defined by the surrounding code.
r = requests.get(url, headers=kv, timeout=60)

# Raise an exception on a 4xx/5xx response instead of continuing silently.
r.raise_for_status()

# Decode r.text with the encoding detected from the response body.
r.encoding = r.apparent_encoding
設置重連次數:requests.adapters.DEFAULT_RETRIES
設置鏈接活躍狀態: requests.session().keep_alive = False
添加參數,params={}
查看http返回狀態碼:r.raise_for_status()
設置返回數據的編碼:r.encoding
獲取返回數據的文本:r.text;若返回數據爲json,可經過r.json()獲取解析後的對象
# POST counterpart of the GET call: same headers and timeout, with the
# (here empty) form payload passed through the `data` keyword.
r = requests.post(
    url,
    headers=kv,
    timeout=60,
    data={},
)
與get請求相似。注意,post請求的參數名稱爲data
二者更具體的用法見:requests快速上手、requests高級用法、requests開發接口
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# Make the headless PhantomJS browser identify itself as desktop Firefox.
fire_fox_user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0"
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = fire_fox_user_agent

brower = webdriver.PhantomJS(desired_capabilities=dcap)
try:
    # Give slow pages up to 180 seconds before the load is aborted.
    brower.set_page_load_timeout(180)
    brower.get(url)
    brower.maximize_window()
    # NOTE(review): Selenium writes PNG data and may warn about a non-.png
    # file name - confirm whether 'my.jpg' is intended.
    path = 'my.jpg'
    brower.save_screenshot(path)
finally:
    # quit() ends the WebDriver session and the PhantomJS process even when a
    # step above raises; close() would only close the current window.
    brower.quit()
更多用法見:Python+Selenium WebDriver API:瀏覽器及元素的經常使用函數及變量整理總結、 Selenium API文件
import json

# Serialise a dictionary to my.json as UTF-8, keeping non-ASCII characters
# readable (ensure_ascii=False). The `with` block closes the file even if
# json.dump raises.
# `dict` is the placeholder name used by this snippet for the dictionary to
# save; bind your own data to it (or substitute your variable) before running.
# NOTE(review): mode 'a+' appends, so running this more than once leaves several
# JSON documents in one file, which json.load cannot read back in a single call.
with open('my.json', 'a+', encoding='utf-8') as fp:
    json.dump(dict, fp, ensure_ascii=False)
此外,json.dumps(dict)可將字典轉爲字符串
import json

# Read my.json (UTF-8) back into a Python object. The `with` block closes the
# file even if parsing fails.
# NOTE: the result is bound to the name `dict`, as in the original snippet;
# this shadows the builtin `dict` for the rest of the module.
with open('my.json', 'r', encoding="utf-8") as fp:
    dict = json.load(fp)
此外,json.loads()可將字符串轉爲字典。字符串的內容必須是合法的JSON,即鍵和字符串值都必須使用雙引號,例如'{"xx":"c","f":"v"}'(Python字面量的外層用單引號包裹便於書寫);若內部使用單引號則會報錯。
# Download the resource at `url` and save its raw bytes as my.pdf.
# The request is made before the file is opened, so a failed download does not
# leave an empty or partial my.pdf behind.
pdf_response = requests.get(url, timeout=60)

# Fail loudly on a 4xx/5xx response rather than saving an error page as a PDF.
pdf_response.raise_for_status()

# Binary mode: the body is written byte-for-byte; `with` guarantees the close.
with open("my.pdf", 'wb') as f:
    f.write(pdf_response.content)
import logging

# Configure the root logger: records at INFO level and above go to
# crawlLog.log, each line formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    filename='crawlLog.log',
)

# Sample records at two severities.
logging.info("info")
logging.error("error ")
更具體的用法見:Python之日誌處理(logging模塊)
import time

import schedule


def job():
    """Example task run by the scheduler; prints a heartbeat message."""
    print("I'm working...")


# Register the same task on several different timetables.
schedule.every(10).minutes.do(job)
schedule.every().hour.do(job)
schedule.every().day.at("10:30").do(job)
schedule.every().monday.do(job)
schedule.every().wednesday.at("13:15").do(job)

# Poll forever, running whichever jobs are due. The one-second sleep stops the
# loop from spinning a CPU core at 100% between runs.
while True:
    schedule.run_pending()
    time.sleep(1)
schedule API見:https://schedule.readthedocs.io/en/stable/api.html
另,爬蟲加密:簡談-Python爬蟲破解JS加密的Cookie