Python爬蟲爬取拉勾網職位信息，生成Excel並經過郵件發送

時間 2019-11-13

標籤 python 爬蟲拉勾職位信息生成 excel 並經郵件發送欄目 Python 简体版

原文原文鏈接

此代碼包含了Python爬蟲、Python生成Excel和Python發送郵件3部分主要功能。

利用Python，可以爬取拉勾網的職位信息。首先，通過瀏覽器的開發者工具，打開Network選項卡，篩選XHR類型的請求，我們可以找到拉勾網Ajax異步請求的url地址，也就是圖中紅框標記的位置。

然後觀察post參數的值，可以發現傳遞了3個參數：first、pn和kd，其中kd爲搜索的關鍵字，pn爲頁碼，見圖中紅框。

再看返回的是json格式的數據，包含了列表頁的職位信息：

打開詳情頁，觀察頁面的URL地址，發現前面部分是固定的，後面是返回的json數據中職位的positionId加上.html。

於是可以開始爬取了。根據功能需求，分爲如下4個部分：第一部分是根據Ajax調用地址獲取當前頁返回數據的方法；第二部分是根據返回的職位信息生成Excel，其中調用了爬取詳情頁的方法用於獲取工作地址，傳入的參數爲該職位的positionId；第三部分是將得到的Excel通過郵件發送；第四部分是main方法調用。

#一、根據url發送post請求，獲得返回的json格式的數據，代碼以下：
import math
import os
import smtplib
import time
from email.header import Header
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

import openpyxl
import requests
from bs4 import BeautifulSoup
from openpyxl.styles import Font

"""
    kw: 職位搜索關鍵字
    city: 城市
    pageNo: 第幾頁
"""
def get_one_page(kw, city, pageNo, timeout=10):
    """Fetch one page of Lagou job-search results from the Ajax endpoint.

    Args:
        kw: Job search keyword (sent as the ``kd`` form field).
        city: City name (sent as the ``city`` query parameter).
        pageNo: Page number to request (sent as the ``pn`` form field).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None`` (network error, non-200 status or invalid JSON).
        The reason for a ``None`` result is printed.
    """
    url = "https://www.lagou.com/jobs/positionAjax.json"
    # Passing the query string via ``params`` lets requests URL-encode the
    # city name instead of concatenating it into the URL by hand.
    params = {
        "px": "default",
        "city": city,
        "needAddtionalResult": "false",
    }
    headers = {
        "Cookie":"WEBTJ-ID=2018-9-30085456-16627f89d092b2-0aaa3c837f5b6a-4d045769-2073600-16627f89d0a35; user_trace_token=20180930085238-207c3c8d-c44b-11e8-bb68-5254005c3644; LGUID=20180930085238-207c40e7-c44b-11e8-bb68-5254005c3644; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=ABAAABAAADEAAFI227067DA2A0D7AD4FDE28079864FCB1E; _gat=1; PRE_UTM=; PRE_HOST=www.hao123.com; PRE_SITE=https%3A%2F%2Fwww.hao123.com%2F; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; TG-TRACK-CODE=index_search; _gid=GA1.2.234281082.1538268897; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1538269092,1538273622,1538273640,1538290427; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1538290438; _ga=GA1.2.631515347.1538268897; LGSID=20180930145128-414a5532-c47d-11e8-a84f-525400f775ce; LGRID=20180930145138-47a12048-c47d-11e8-a84f-525400f775ce; SEARCH_ID=14da97091c164a25bcac51d60de7c782",
        "Referer":"https://www.lagou.com/jobs/list_python?px=default&city=%E7%83%9F%E5%8F%B0",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
    }
    data = {
        "first": "true",
        "pn": pageNo,
        "kd": kw
    }
    try:
        # Without a timeout requests may block forever on a stalled server.
        rsp = requests.post(url, params=params, data=data, headers=headers,
                            timeout=timeout)
    except requests.RequestException as ex:
        print(ex)
        return None
    if rsp.status_code != 200:
        print("Unexpected HTTP status {0} for {1}".format(rsp.status_code, url))
        return None
    try:
        return rsp.json()
    except ValueError as ex:
        # The body was not valid JSON (e.g. an HTML error/anti-crawler page).
        print(ex)
        return None

def get_detail_page(positionId, timeout=10):
    """Download the HTML of a job's detail page.

    Args:
        positionId: The ``positionId`` of the job; the detail page URL is
            ``https://www.lagou.com/jobs/<positionId>.html``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body decoded to ``str`` when the server answers with
        HTTP 200, otherwise ``None`` (network error, non-200 status or a body
        that cannot be decoded). The reason for a ``None`` result is printed.
    """
    url = "https://www.lagou.com/jobs/{0}.html".format(positionId)
    headers = {
        "Cookie": "WEBTJ-ID=2018-9-30085456-16627f89d092b2-0aaa3c837f5b6a-4d045769-2073600-16627f89d0a35; user_trace_token=20180930085238-207c3c8d-c44b-11e8-bb68-5254005c3644; LGUID=20180930085238-207c40e7-c44b-11e8-bb68-5254005c3644; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=ABAAABAAADEAAFI227067DA2A0D7AD4FDE28079864FCB1E; _gat=1; PRE_UTM=; PRE_HOST=www.hao123.com; PRE_SITE=https%3A%2F%2Fwww.hao123.com%2F; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; TG-TRACK-CODE=index_search; _gid=GA1.2.234281082.1538268897; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1538269092,1538273622,1538273640,1538290427; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1538290438; _ga=GA1.2.631515347.1538268897; LGSID=20180930145128-414a5532-c47d-11e8-a84f-525400f775ce; LGRID=20180930145138-47a12048-c47d-11e8-a84f-525400f775ce; SEARCH_ID=14da97091c164a25bcac51d60de7c782",
        "Referer": "https://www.lagou.com/jobs/list_python?px=default&city=%E7%83%9F%E5%8F%B0",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
    }
    try:
        # Without a timeout requests may block forever on a stalled server.
        rsp = requests.get(url=url, headers=headers, timeout=timeout)
    except requests.RequestException as ex:
        print(ex)
        return None
    if rsp.status_code != 200:
        print("Unexpected HTTP status {0} for {1}".format(rsp.status_code, url))
        return None
    try:
        return rsp.content.decode()
    except UnicodeDecodeError as ex:
        print(ex)
        return None



#二、將獲取到json數據獲得的職位信息生成Excel：

def to_excel(json_data, filename):
    """Write job postings to ``<filename>.xlsx``, one row per posting.

    Row 1 holds the column titles; each following row holds one posting.
    The last column (work address) is not part of the list data: it is
    scraped from the posting's detail page via ``get_detail_page``.

    Args:
        json_data: Iterable of posting dicts as returned in the ``result``
            list of the search response. Each dict must provide the keys
            read below (``companyFullName`` ... ``positionId``).
        filename: Output path without the ``.xlsx`` extension.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    ft = Font(name="宋體", size=12)

    # Header row.
    titles = (
        "公司名稱",
        "公司標籤",
        "公司簡稱",
        "建立時間",
        "地區",
        "學歷",
        "職位標籤",
        "職位誘惑",
        "職位名稱",
        "薪水",
        "工做年限",
        "工做地址",
    )
    for column, title in enumerate(titles, start=1):
        cell = sheet.cell(row=1, column=column)
        cell.font = ft
        cell.value = title

    # Data rows start at row 2, directly below the header.
    for row, item in enumerate(json_data, start=2):
        values = (
            item["companyFullName"],
            # ``or []`` keeps a null label list from crashing ``join``.
            "".join(item["companyLabelList"] or []),
            item["companyShortName"],
            item["createTime"],
            item["district"],
            item["education"],
            "".join(item["industryLables"] or []),
            item["positionAdvantage"],
            item["positionName"],
            item["salary"],
            item["workYear"],
            _extract_work_address(get_detail_page(item["positionId"])),
        )
        for column, value in enumerate(values, start=1):
            cell = sheet.cell(row=row, column=column)
            cell.font = ft
            cell.value = value

    workbook.save(filename=filename + '.xlsx')


def _extract_work_address(html):
    """Return the work address found in a detail page, or "" if unavailable.

    Args:
        html: Detail page markup, or ``None`` when the download failed.
    """
    if not html:
        return ""
    soup = BeautifulSoup(html.replace("\n", ""), 'lxml')
    work_addr = soup.find(class_="work_addr")
    if work_addr is None:
        # Page without the expected address node (layout change, block page).
        return ""

    links = work_addr.find_all(rel="nofollow")
    # The last rel="nofollow" link is deliberately skipped.
    # NOTE(review): presumably it is the "view map" link -- confirm on a page.
    parts = [link.string for link in links[:-1] if link.string is not None]

    # The free-text part of the address is read from the third-from-last
    # child node of the address element.
    contents = work_addr.contents
    tail = contents[-3].string if len(contents) >= 3 else None
    if tail is not None:
        parts.append(tail.strip())
    return "".join(parts)

#三、將職位信息的Excel做爲附件發送郵件：

# Mail settings used by the email step below.

# Sender mailbox account and password -- replace with your own.
sender = "xxx@qq.com"
sender_pwd = "xxx"

# Recipient addresses; may be your QQ mailbox or any other mailbox.
receivers = ["xxx@qq.com"]

# Address of the SMTP server.
smtp_srv = "smtp.qq.com"
def sendMIMEMultipartEmail(fileList):
    """Email the given files as attachments to the configured receivers.

    Uses the module-level ``sender``, ``sender_pwd``, ``receivers`` and
    ``smtp_srv`` settings. SMTP-level failures are reported on stdout
    instead of being raised.

    Args:
        fileList: Paths of the files to attach. Each attachment is named
            after the base name of its path.
    """
    message = MIMEMultipart()
    message['From'] = Header(sender)
    message['To'] = Header("拉勾網職位信息", 'utf-8')
    subject = '拉勾網職位信息'
    message['Subject'] = Header(subject, 'utf-8')
    # Plain-text body of the mail.
    message.attach(MIMEText('拉勾網招聘職位信息', 'plain', 'utf-8'))

    for item in fileList:
        # ``with`` closes the file even if reading fails.
        with open(item, 'rb') as attachment_file:
            # MIMEApplication base64-encodes the bytes and emits a single
            # application/octet-stream Content-Type header.
            att = MIMEApplication(attachment_file.read(), 'octet-stream')
        att.add_header('Content-Disposition', 'attachment',
                       filename=os.path.basename(item))
        message.attach(att)

    try:
        # The context manager sends QUIT and closes the socket on exit.
        with smtplib.SMTP() as smtp:
            smtp.connect(smtp_srv, 25)
            smtp.login(sender, sender_pwd)
            smtp.sendmail(sender, receivers, message.as_string())
            print("郵件發送成功")
    except smtplib.SMTPException as ex:
        print("Error: 沒法發送郵件")
        print(ex)

#四、main方法調用：

if __name__ == "__main__":
    # Crawl every result page for one keyword/city, export the postings to
    # result.xlsx in the current directory, then mail that file.
    #
    # The first request (page 1) also yields the total page count:
    #   ceil(positionResult["totalCount"] / positionResult["resultSize"])
    # The remaining pages are fetched starting from page 2.
    keyword = "Python"
    city = "濟南"

    json_data = get_one_page(keyword, city, 1)
    if not json_data or "content" not in json_data:
        # Nothing to export when the very first page cannot be fetched.
        raise SystemExit("Failed to fetch the first result page: {0!r}".format(json_data))

    first_page = json_data["content"]["positionResult"]
    positionList = list(first_page["result"])
    page_size = first_page["resultSize"]
    # A page size of 0 means an empty result set; avoid dividing by zero.
    count = math.ceil(first_page["totalCount"] / page_size) if page_size else 1

    for i in range(2, count + 1):
        json_data = get_one_page(keyword, city, i)
        if not json_data or "content" not in json_data:
            # Skip a failed page instead of losing everything fetched so far.
            print("Skipping page {0}: no usable response".format(i))
            continue
        positionList += json_data["content"]["positionResult"]["result"]

    to_excel(positionList, "result")
    # NOTE(review): pause kept from the original script; workbook.save() is
    # synchronous, so this is presumably unnecessary -- confirm and remove.
    time.sleep(3)
    # os.path.join keeps the path valid on non-Windows systems too.
    fileList = [os.path.join(os.getcwd(), "result.xlsx")]
    sendMIMEMultipartEmail(fileList)