Python爬蟲爬取拉勾網職位信息,生成Excel並通過郵件發送

此代碼包含了Python爬蟲、Python生成Excel和Python發送郵件3部分主要功能。

利用Python,可以爬取拉勾網的職位信息。首先,通過瀏覽器的開發者工具,打開Network選項卡,篩選XHR類型的請求,我們可以找到拉勾網Ajax異步請求的url地址,也就是圖中紅框標記的位置。

然後觀察post參數的值,可以發現傳遞了3個參數:first、pn和kd,其中kd爲搜索的關鍵字,pn爲頁碼,見圖中紅框。

 

再看返回的是json格式的數據,包含了列表頁的職位信息:

打開詳情頁,觀察頁面的URL地址,發現前面部分是固定的,後面是返回的json數據中職位的positionId.html。

於是可以開始爬取了。根據功能需求,分如下4個部分:第一部分是根據Ajax調用地址獲取當前頁返回數據的方法;第二部分是根據返回的職位信息生成Excel,其中調用了爬取詳情頁的方法用於獲取工作地址,傳入的參數爲該職位的positionId;第三部分是將得到的Excel發送郵件;第四部分是main方法調用。

#1、根據url發送post請求,得到返回的json格式的數據,代碼如下:
import requests
import openpyxl
from openpyxl.styles import Font
import time
import os
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.header import Header
import math

def get_one_page(kw, city, pageNo, timeout=10):
    """Fetch one page of job-search results from the Lagou Ajax endpoint.

    Args:
        kw: Job search keyword, sent as the ``kd`` form field.
        city: City name, sent as the ``city`` query parameter.
        pageNo: Page number, sent as the ``pn`` form field.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body, or ``None`` when the request fails, the
        status code is not 200, or the body is not valid JSON.
    """
    url = "https://www.lagou.com/jobs/positionAjax.json"
    # Passed via ``params`` so that requests escapes the city name; a raw
    # concatenation breaks on values containing '&', '#' or similar.
    params = {
        "px": "default",
        "city": city,
        "needAddtionalResult": "false",
    }
    # NOTE(review): the Cookie below is a fixed value copied into the source;
    # presumably it comes from a browser session and may need refreshing.
    headers = {
        "Cookie": "WEBTJ-ID=2018-9-30085456-16627f89d092b2-0aaa3c837f5b6a-4d045769-2073600-16627f89d0a35; user_trace_token=20180930085238-207c3c8d-c44b-11e8-bb68-5254005c3644; LGUID=20180930085238-207c40e7-c44b-11e8-bb68-5254005c3644; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=ABAAABAAADEAAFI227067DA2A0D7AD4FDE28079864FCB1E; _gat=1; PRE_UTM=; PRE_HOST=www.hao123.com; PRE_SITE=https%3A%2F%2Fwww.hao123.com%2F; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; TG-TRACK-CODE=index_search; _gid=GA1.2.234281082.1538268897; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1538269092,1538273622,1538273640,1538290427; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1538290438; _ga=GA1.2.631515347.1538268897; LGSID=20180930145128-414a5532-c47d-11e8-a84f-525400f775ce; LGRID=20180930145138-47a12048-c47d-11e8-a84f-525400f775ce; SEARCH_ID=14da97091c164a25bcac51d60de7c782",
        "Referer": "https://www.lagou.com/jobs/list_python?px=default&city=%E7%83%9F%E5%8F%B0",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0",
    }
    data = {
        "first": "true",
        "pn": pageNo,
        "kd": kw,
    }
    try:
        # Without a timeout a stalled connection would block forever.
        rsp = requests.post(url, params=params, data=data, headers=headers,
                            timeout=timeout)
    except requests.RequestException as ex:
        print(ex)
        return None
    if rsp.status_code != 200:
        print("get_one_page: unexpected HTTP status {0}".format(rsp.status_code))
        return None
    try:
        return rsp.json()
    except ValueError as ex:
        # Raised when the body is not valid JSON.
        print(ex)
        return None

# Fetch the HTML of a job's detail page
def get_detail_page(positionId, timeout=10):
    """Download the detail page of one job posting.

    Args:
        positionId: Identifier of the posting; inserted into the URL
            ``https://www.lagou.com/jobs/<positionId>.html``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page body decoded as UTF-8 text, or ``None`` when the request
        fails, the status code is not 200, or the body cannot be decoded.
    """
    url = "https://www.lagou.com/jobs/{0}.html".format(positionId)
    # NOTE(review): the Cookie below is a fixed value copied into the source;
    # presumably it comes from a browser session and may need refreshing.
    headers = {
        "Cookie": "WEBTJ-ID=2018-9-30085456-16627f89d092b2-0aaa3c837f5b6a-4d045769-2073600-16627f89d0a35; user_trace_token=20180930085238-207c3c8d-c44b-11e8-bb68-5254005c3644; LGUID=20180930085238-207c40e7-c44b-11e8-bb68-5254005c3644; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=ABAAABAAADEAAFI227067DA2A0D7AD4FDE28079864FCB1E; _gat=1; PRE_UTM=; PRE_HOST=www.hao123.com; PRE_SITE=https%3A%2F%2Fwww.hao123.com%2F; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; TG-TRACK-CODE=index_search; _gid=GA1.2.234281082.1538268897; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1538269092,1538273622,1538273640,1538290427; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1538290438; _ga=GA1.2.631515347.1538268897; LGSID=20180930145128-414a5532-c47d-11e8-a84f-525400f775ce; LGRID=20180930145138-47a12048-c47d-11e8-a84f-525400f775ce; SEARCH_ID=14da97091c164a25bcac51d60de7c782",
        "Referer": "https://www.lagou.com/jobs/list_python?px=default&city=%E7%83%9F%E5%8F%B0",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0",
    }
    try:
        # Without a timeout a stalled connection would block forever.
        rsp = requests.get(url=url, headers=headers, timeout=timeout)
    except requests.RequestException as ex:
        print(ex)
        return None
    if rsp.status_code != 200:
        print("get_detail_page: unexpected HTTP status {0}".format(rsp.status_code))
        return None
    try:
        return rsp.content.decode()
    except UnicodeDecodeError as ex:
        print(ex)
        return None


# 2. Build an Excel workbook from the job postings taken from the JSON data:
def to_excel(json_data, filename):
    """Write job postings to ``<filename>.xlsx``, one posting per row.

    Columns 1-11 are copied from each posting dict; column 12 (work
    address) is scraped from the posting's detail page, fetched with
    ``get_detail_page(item["positionId"])``.

    Args:
        json_data: Iterable of posting dicts (the ``result`` list of the
            search response).
        filename: Output path without the ``.xlsx`` extension.
    """
    # (header text, key in the posting dict) for columns 1-11, in order.
    columns = [
        ("公司名稱", "companyFullName"),
        ("公司標籤", "companyLabelList"),
        ("公司簡稱", "companyShortName"),
        ("建立時間", "createTime"),
        ("地區", "district"),
        ("學歷", "education"),
        ("職位標籤", "industryLables"),
        ("職位誘惑", "positionAdvantage"),
        ("職位名稱", "positionName"),
        ("薪水", "salary"),
        ("工做年限", "workYear"),
    ]
    address_header = "工做地址"
    address_column = len(columns) + 1
    # These keys hold lists of strings that are concatenated into one cell.
    joined_keys = ("companyLabelList", "industryLables")

    workbook = openpyxl.Workbook()
    sheet = workbook.active
    ft = Font(name="宋體", size=12)

    def write_cell(row, column, value):
        """Set both the font and the value of one cell."""
        cell = sheet.cell(row=row, column=column)
        cell.font = ft
        cell.value = value

    # Header row
    for column, (title, _) in enumerate(columns, start=1):
        write_cell(1, column, title)
    write_cell(1, address_column, address_header)

    # One row per posting, starting right below the header
    for row, item in enumerate(json_data, start=2):
        for column, (_, key) in enumerate(columns, start=1):
            value = item[key]
            if key in joined_keys:
                # ``or []`` keeps a null label list from raising in join().
                value = "".join(value or [])
            write_cell(row, column, value)
        html = get_detail_page(item["positionId"])
        write_cell(row, address_column, _extract_work_address(html))

    workbook.save(filename=filename + '.xlsx')


def _extract_work_address(html):
    """Return the work address found in a detail page's HTML, or "" if absent."""
    if not html:
        # get_detail_page returns None on failure; leave the cell blank
        # instead of aborting the whole export.
        return ""
    # Imported locally: the file's top-level imports do not bring
    # BeautifulSoup into scope even though this code relies on it.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html.replace("\n", ""), 'lxml')
    addr_node = soup.find(class_="work_addr")
    if addr_node is None:
        return ""
    links = addr_node.find_all(rel="nofollow")
    # Every rel="nofollow" link except the last contributes its text.
    prefix = "".join(
        link.string for link in links[:-1] if link.string is not None
    )
    children = addr_node.contents
    # The third node from the end supplies the remaining address text.
    tail = children[-3].string if len(children) >= 3 else None
    return prefix + (tail.strip() if tail is not None else "")
# 3. E-mail the job-posting Excel file as an attachment:

# Sender mailbox account and password -- replace with your own
sender = "xxx@qq.com"
sender_pwd = 'xxx'
# Recipients; may be your QQ mailbox or any other mailbox
receivers = ["xxx@qq.com"]
# SMTP server address
smtp_srv = 'smtp.qq.com'
# Send an e-mail carrying file attachments
def sendMIMEMultipartEmail(fileList):
    """E-mail the given files as attachments.

    Uses the module-level ``sender``, ``sender_pwd``, ``receivers`` and
    ``smtp_srv`` settings. Delivery failures are printed, not raised.

    Args:
        fileList: Paths of the files to attach. Each attachment is named
            after the base name of its path.

    Raises:
        OSError: If one of the files cannot be read.
    """
    from email.mime.application import MIMEApplication

    # Multipart container: text body plus attachments
    message = MIMEMultipart()
    message['From'] = Header(sender)
    message['To'] = Header("拉勾網職位信息", 'utf-8')
    subject = '拉勾網職位信息'
    message['Subject'] = Header(subject, 'utf-8')
    # Body text
    message.attach(MIMEText('拉勾網招聘職位信息', 'plain', 'utf-8'))

    for item in fileList:
        # ``with`` closes the file promptly. MIMEApplication emits a single
        # application/octet-stream header and base64-encodes the bytes,
        # where MIMEText plus a second Content-Type assignment left two
        # conflicting Content-Type headers on the part.
        with open(item, 'rb') as attachment_file:
            att = MIMEApplication(attachment_file.read(), _subtype='octet-stream')
        # Name each attachment after its own file so several attachments
        # do not all show up under the same name.
        att.add_header('Content-Disposition', 'attachment',
                       filename=os.path.basename(item))
        message.attach(att)

    try:
        # The context manager closes the connection even when login or
        # sending fails.
        # NOTE(review): port 25 with no STARTTLS call -- the login appears to
        # go over an unencrypted connection; confirm this is acceptable.
        with smtplib.SMTP(smtp_srv, 25, timeout=30) as smtp:
            smtp.login(sender, sender_pwd)
            smtp.sendmail(sender, receivers, message.as_string())
        print("郵件發送成功")
    except (smtplib.SMTPException, OSError) as ex:
        # OSError also covers connection-level failures (refused, DNS, timeout).
        print("Error: 沒法發送郵件")
        print(ex)

# 4. Entry point:
def _position_result(json_data):
    """Return the ``content.positionResult`` dict of a response, or None if absent."""
    try:
        return json_data["content"]["positionResult"]
    except (KeyError, TypeError):
        # TypeError covers json_data being None (failed request).
        return None


def main():
    """Crawl every result page for one search, export to Excel and e-mail it.

    This example searches for the keyword "Python" in the city 濟南. The
    first call requests page 1 and derives the page count as
    ceil(totalCount / resultSize); the remaining pages are then fetched
    starting from page 2.
    """
    keyword = "Python"
    city = "濟南"

    first_page = _position_result(get_one_page(keyword, city, 1))
    if first_page is None:
        print("main: could not fetch the first result page; nothing exported")
        return

    positionList = list(first_page["result"])
    page_size = first_page["resultSize"]
    # A page size of 0 would otherwise raise ZeroDivisionError.
    count = math.ceil(first_page["totalCount"] / page_size) if page_size else 1

    for i in range(2, count + 1):
        page = _position_result(get_one_page(keyword, city, i))
        if page is None:
            print("main: skipping page {0}, request failed".format(i))
            continue
        positionList.extend(page["result"])

    to_excel(positionList, "result")
    # NOTE(review): pause kept from the original flow; to_excel has already
    # returned by this point, so it may be unnecessary -- confirm.
    time.sleep(3)
    # os.path.join instead of a hard-coded backslash so the path is valid on
    # every platform; to_excel saved "result.xlsx" into the working directory.
    fileList = [os.path.join(os.getcwd(), "result.xlsx")]
    sendMIMEMultipartEmail(fileList)


if __name__ == "__main__":
    main()

完整代碼:https://gitee.com/ZiSeWuDao/Python/blob/master/Spider/lagouStandAlone.py

相關文章
相關標籤/搜索