I've opened a WeChat official account, 智能製造專欄 (Intelligent Manufacturing Column); future technical articles will be posted there.
Python makes writing crawlers straightforward. I recently read the articles on xlzd.me; they are well written and lay out a good approach, but parts of the code are omitted. Below are three code snippets based on those articles:
The code targets Python 3; under Python 2 you would need to adjust the input function and the print calls. The code for this article is available on GitHub.
Some of the code has been modified, and the scraped data is written to an Excel file. For more on reading and writing Excel data, see my other article.
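Since that article isn't reproduced here, here is a minimal sketch of the openpyxl pattern the scripts below rely on (the filename and rows are placeholder data):

```python
from openpyxl import Workbook, load_workbook

# writing: one append() call per row (placeholder file name and data)
wb = Workbook()
ws = wb.active
ws.title = "demo"
ws.append(['title', 'score'])
ws.append(['Movie A', '9.6'])
wb.save('demo.xlsx')

# reading it back to verify
wb2 = load_workbook('demo.xlsx')
for row in wb2.active.iter_rows(values_only=True):
    print(row)
```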
Libraries used: requests, BeautifulSoup (bs4), and openpyxl.
```python
#!/usr/bin/env python
# encoding=utf-8
import re

import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook

wb = Workbook()
dest_filename = '電影.xlsx'
ws1 = wb.active
ws1.title = "電影top250"

DOWNLOAD_URL = 'http://movie.douban.com/top250/'


def download_page(url):
    """Fetch the page content at the given URL."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
    }
    data = requests.get(url, headers=headers).content
    return data


def get_li(doc):
    soup = BeautifulSoup(doc, 'html.parser')
    ol = soup.find('ol', class_='grid_view')
    name = []       # movie titles
    star_con = []   # number of ratings
    score = []      # scores
    info_list = []  # one-line reviews
    for i in ol.find_all('li'):
        detail = i.find('div', attrs={'class': 'hd'})
        movie_name = detail.find('span', attrs={'class': 'title'}).get_text()  # title
        level_star = i.find('span', attrs={'class': 'rating_num'}).get_text()  # score
        star = i.find('div', attrs={'class': 'star'})
        star_num = star.find(text=re.compile('评价'))  # rating count; the page text is simplified Chinese
        info = i.find('span', attrs={'class': 'inq'})  # one-line review
        if info:  # not every movie has a one-line review
            info_list.append(info.get_text())
        else:
            info_list.append('無')
        score.append(level_star)
        name.append(movie_name)
        star_con.append(star_num)
    page = soup.find('span', attrs={'class': 'next'}).find('a')  # link to the next page
    if page:
        return name, star_con, score, info_list, DOWNLOAD_URL + page['href']
    return name, star_con, score, info_list, None


def main():
    url = DOWNLOAD_URL
    name = []
    star_con = []
    score = []
    info = []
    while url:
        doc = download_page(url)
        movie, star, level_num, info_list, url = get_li(doc)
        name = name + movie
        star_con = star_con + star
        score = score + level_num
        info = info + info_list
    # write one row per movie; enumerate avoids the duplicate-title bug
    # that looking rows up with name.index() would cause
    for row, (i, m, o, p) in enumerate(zip(name, star_con, score, info), start=1):
        ws1['A%s' % row] = i
        ws1['B%s' % row] = m
        ws1['C%s' % row] = o
        ws1['D%s' % row] = p
    wb.save(filename=dest_filename)


if __name__ == '__main__':
    main()
```
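Before scraping all ten pages, it can help to smoke-test a single page. This snippet assumes it runs in the same module as the script above, since it reuses download_page and get_li:

```python
# hypothetical smoke test: parse only the first page
doc = download_page(DOWNLOAD_URL)
names, stars, scores, infos, next_url = get_li(doc)
print(len(names), next_url)  # expect 25 titles and a '?start=25...' next link
```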
The result:
The job listings are stored in JSON: fetch the JSON object, then walk it to pull out the company name, location, salary, and so on.
```python
import requests
from openpyxl import Workbook


def get_json(url, page, lang_name):
    """POST the query form and pull the job list out of the returned JSON."""
    data = {'first': 'true', 'pn': page, 'kd': lang_name}
    json = requests.post(url, data).json()
    list_con = json['content']['positionResult']['result']
    info_list = []
    for i in list_con:
        info = []
        info.append(i['companyShortName'])
        info.append(i['companyName'])
        info.append(i['salary'])
        info.append(i['city'])
        info.append(i['education'])
        info_list.append(info)
    return info_list


def main():
    lang_name = input('職位名:')
    page = 1
    url = 'http://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    info_result = []
    while page < 31:  # fetch pages 1 through 30
        info = get_json(url, page, lang_name)
        info_result = info_result + info
        page += 1
    wb = Workbook()
    ws1 = wb.active
    ws1.title = lang_name
    for row in info_result:
        ws1.append(row)  # one job per row
    wb.save('職位信息.xlsx')


if __name__ == '__main__':
    main()
```
The output:
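Lagou has since tightened its anti-crawler checks, so the bare POST above may return an error payload instead of listings. A hedged variation is to send browser-like headers and pace the requests; the header values here are assumptions, not something the original article specifies:

```python
import time

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    # assumed: the Ajax endpoint expects a Referer from the job-list page
    'Referer': 'http://www.lagou.com/jobs/list_Python',
}

def get_json_polite(url, page, lang_name):
    data = {'first': 'true', 'pn': page, 'kd': lang_name}
    resp = requests.post(url, data=data, headers=headers)
    time.sleep(1)  # pause between pages to avoid throttling
    return resp.json()['content']['positionResult']['result']
```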
Use the browser's developer tools to capture the data sent in the POST request.
```python
import time

import requests
from bs4 import BeautifulSoup


def get_captcha(data):
    """Save the captcha image locally and ask the user to type it in."""
    with open('captcha.gif', 'wb') as fp:
        fp.write(data)
    return input('輸入驗證碼:')


def login(username, password, oncaptcha):
    sessiona = requests.Session()
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'}
    # the sign-in page embeds an _xsrf token that must be sent back with the form
    _xsrf = BeautifulSoup(
        sessiona.get('https://www.zhihu.com/#signin', headers=headers).content,
        'html.parser'
    ).find('input', attrs={'name': '_xsrf'}).get('value')
    # fetch the captcha image within the same session
    captcha_content = sessiona.get(
        'https://www.zhihu.com/captcha.gif?r=%d&type=login' % (time.time() * 1000),
        headers=headers
    ).content
    data = {
        "_xsrf": _xsrf,
        "email": username,
        "password": password,
        "remember_me": True,
        "captcha": oncaptcha(captcha_content)
    }
    resp = sessiona.post('https://www.zhihu.com/login/email', data, headers=headers).content
    print(resp)
    return resp


if __name__ == "__main__":
    login('your_email', 'your_password', get_captcha)
```
After running, the captcha image is saved to the working directory:
After you type in the captcha, a response like the one below indicates the login succeeded.
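Rather than eyeballing the printed bytes, the response can be checked programmatically. This is a minimal sketch assuming the endpoint returns JSON shaped like {"r": 0, "msg": "..."} with r == 0 on success; the schema is inferred from responses at the time, not documented:

```python
import json

def login_succeeded(resp_bytes):
    """Hypothetical success check for the bytes returned by login()."""
    try:
        result = json.loads(resp_bytes.decode('utf-8'))
    except (ValueError, UnicodeDecodeError):
        return False
    return result.get('r') == 0  # assumed: r == 0 means the login succeeded

# usage:
# resp = login('your_email', 'your_password', get_captcha)
# print(login_succeeded(resp))
```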