Python爬蟲——Python 崗位分析報告

時間 2019-11-09

原文鏈接

前兩篇咱們分別爬取了糗事百科和妹子圖網站，學習了 Requests, Beautiful Soup 的基本使用。不過前兩篇都是從靜態 HTML 頁面中來篩選出咱們需要的信息。這一篇咱們來學習下如何來獲取 Ajax 請求返回的結果。

歡迎關注公號【智能製造社區】學習更多原創智能製造及編程知識。

Python 爬蟲入門(二)——爬取妹子圖
Python 爬蟲入門(一)——爬取糗百

本篇以拉勾網爲例來說明一下如何獲取 Ajax 請求內容。

本文目標

獲取 Ajax 請求,解析 JSON 中所需字段
數據保存到 Excel 中
數據保存到 MySQL, 方便分析

簡單分析

五個城市 Python 崗位平均薪資水平

Python 崗位要求學歷分佈

Python 行業領域分佈

Python 公司規模分佈

查看頁面結構

咱們輸入查詢條件以 Python 爲例，其餘條件默認不選，點擊查詢，就能看到全部 Python 的崗位了，而後咱們打開控制檯，點擊網絡標籤能夠看到以下請求：

從響應結果來看，這個請求正是咱們需要的內容。後面咱們直接請求這個地址就行了。從圖中能夠看出 result 下面就是各個崗位信息。

到這裏咱們知道了從哪裏請求數據，從哪裏獲取結果。可是 result 列表中只有第一頁 15 條數據，其餘頁面數據怎麼獲取呢？

分析請求參數

咱們點擊參數選項卡，以下：

發現提交了三個表單數據，很明顯看出來 kd 就是咱們搜索的關鍵詞，pn 就是當前頁碼。first 默認就好了，不用管它。剩下的事情就是構造請求，來下載 30 個頁面的數據了。

構造請求，並解析數據

構造請求很簡單，咱們仍是用 requests 庫來搞定。首先咱們構造出表單數據 data = {'first': 'false', 'pn': page, 'kd': lang_name} 以後用 requests 來請求 url 地址，解析獲得的 JSON 數據就算大功告成了。因爲拉勾對爬蟲限制比較嚴格，咱們需要把瀏覽器中 headers 字段所有加上，並且把爬蟲間隔調大一點，我後面設置的爲 10-20s，而後就能正常獲取數據了。

import requests

def get_json(url, page, lang_name, timeout=30):
    """Fetch one page of Lagou search results and return it as table rows.

    Posts the search form (``first``, ``pn``, ``kd``) to *url* and reads the
    job postings from ``content.positionResult.result`` in the JSON reply.

    Args:
        url: Address of the ``positionAjax.json`` endpoint, including any
            query string.
        page: Page number, sent as the ``pn`` form field.
        lang_name: Search keyword, sent as the ``kd`` form field.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole scrape.

    Returns:
        A list with one 7-item list per posting, in this order: company
        short name, company full name, industry field, company size,
        salary, city, education. A field missing from a posting is
        replaced by '無'.

    Raises:
        KeyError: If the reply does not contain
            ``content.positionResult.result``; the message lists the
            top-level keys that were received instead.
    """
    # Browser-like headers copied from a real request. Content-Length is
    # intentionally not set here: requests derives it from the encoded form
    # body, and a fixed value would be wrong for other keywords or pages.
    headers = {
        'Host': 'www.lagou.com',
        'Connection': 'keep-alive',
        'Origin': 'https://www.lagou.com',
        'X-Anit-Forge-Code': '0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
        'X-Anit-Forge-Token': 'None',
        'Referer': 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7'
    }
    form = {'first': 'false', 'pn': page, 'kd': lang_name}
    response = requests.post(url, data=form, headers=headers, timeout=timeout)
    payload = response.json()
    try:
        postings = payload['content']['positionResult']['result']
    except KeyError as err:
        # Report what actually came back instead of a bare KeyError.
        raise KeyError(
            'response has no content.positionResult.result; '
            'top-level keys: {}'.format(sorted(payload))
        ) from err

    # Column order of every returned row.
    fields = (
        'companyShortName',
        'companyFullName',
        'industryField',
        'companySize',
        'salary',
        'city',
        'education',
    )
    return [[job.get(field, '無') for field in fields] for job in postings]

獲取全部數據

瞭解了如何解析數據，剩下的就是連續請求全部頁面了，咱們構造一個函數來請求全部 30 頁的數據。

def main():
    """Scrape 30 result pages for each of five cities into MySQL and Excel.

    Every row returned by ``get_json`` is inserted into the database and
    appended to the workbook's active sheet. The database connection is
    always closed, and the rows collected so far are always written to
    ``<lang_name>職位信息.xlsx``, even when a request or insert fails
    partway through; the failure itself is still raised to the caller.
    """
    # Imported here because this excerpt only imports requests at the top.
    import random
    import time

    lang_name = 'python'
    wb = Workbook()
    # All cities are written to the workbook's single active sheet.
    ws1 = wb.active
    ws1.title = lang_name
    conn = get_conn()
    try:
        for city in ['北京', '上海', '廣州', '深圳', '杭州']:
            url = 'https://www.lagou.com/jobs/positionAjax.json?city={}&needAddtionalResult=false'.format(city)
            for page in range(1, 31):
                info = get_json(url, page, lang_name)
                # Lagou is strict about crawlers: wait 10-20 s between requests.
                time.sleep(random.randint(10, 20))
                for row in info:
                    insert(conn, tuple(row))
                    ws1.append(row)
    finally:
        try:
            conn.close()
        finally:
            # Keep whatever was collected, even after a failure midway.
            wb.save('{}職位信息.xlsx'.format(lang_name))

# Start the scrape only when this file is executed as a script.
if __name__ == '__main__':
    main()

完整代碼

import random
import time

import requests
from openpyxl import Workbook
import pymysql.cursors


def get_conn():
    """Open a connection to the local MySQL database named ``python``.

    Connects to ``localhost`` as ``root`` with the ``utf8mb4`` charset and
    ``DictCursor`` as the cursor class.

    Returns:
        The connection object returned by ``pymysql.connect``.
    """
    settings = {
        'host': 'localhost',
        'user': 'root',
        'password': 'root',
        'db': 'python',
        'charset': 'utf8mb4',
        'cursorclass': pymysql.cursors.DictCursor,
    }
    return pymysql.connect(**settings)


def insert(conn, info):
    """Insert one job posting into the ``python`` table and commit it.

    Args:
        conn: Open database connection providing ``cursor()`` and
            ``commit()``.
        info: Seven values in column order: shortname, fullname,
            industryfield, companySize, salary, city, education.
    """
    statement = (
        "INSERT INTO `python` "
        "(`shortname`, `fullname`, `industryfield`, `companySize`, "
        "`salary`, `city`, `education`) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s)"
    )
    # The values are passed separately from the statement so the driver
    # substitutes them for the %s placeholders.
    with conn.cursor() as cur:
        cur.execute(statement, info)
    # Commit after the cursor is released, one transaction per row.
    conn.commit()


def get_json(url, page, lang_name, timeout=30):
    """Fetch one page of Lagou search results and return it as table rows.

    Posts the search form (``first``, ``pn``, ``kd``) to *url* and reads the
    job postings from ``content.positionResult.result`` in the JSON reply.

    Args:
        url: Address of the ``positionAjax.json`` endpoint, including any
            query string.
        page: Page number, sent as the ``pn`` form field.
        lang_name: Search keyword, sent as the ``kd`` form field.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole scrape.

    Returns:
        A list with one 7-item list per posting, in this order: company
        short name, company full name, industry field, company size,
        salary, city, education. A field missing from a posting is
        replaced by '無'.

    Raises:
        KeyError: If the reply does not contain
            ``content.positionResult.result``; the message lists the
            top-level keys that were received instead.
    """
    # Browser-like headers copied from a real request. Content-Length is
    # intentionally not set here: requests derives it from the encoded form
    # body, and a fixed value would be wrong for other keywords or pages.
    headers = {
        'Host': 'www.lagou.com',
        'Connection': 'keep-alive',
        'Origin': 'https://www.lagou.com',
        'X-Anit-Forge-Code': '0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
        'X-Anit-Forge-Token': 'None',
        'Referer': 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7'
    }
    form = {'first': 'false', 'pn': page, 'kd': lang_name}
    response = requests.post(url, data=form, headers=headers, timeout=timeout)
    payload = response.json()
    try:
        postings = payload['content']['positionResult']['result']
    except KeyError as err:
        # Report what actually came back instead of a bare KeyError.
        raise KeyError(
            'response has no content.positionResult.result; '
            'top-level keys: {}'.format(sorted(payload))
        ) from err

    # Column order of every returned row.
    fields = (
        'companyShortName',   # company short name
        'companyFullName',    # company full name
        'industryField',      # industry field
        'companySize',        # company size
        'salary',             # salary
        'city',               # city
        'education',          # education
    )
    return [[job.get(field, '無') for field in fields] for job in postings]


def main():
    """Scrape 30 result pages for each of five cities into MySQL and Excel.

    Every row returned by ``get_json`` is inserted into the database and
    appended to the workbook's active sheet. The database connection is
    always closed, and the rows collected so far are always written to
    ``<lang_name>職位信息.xlsx``, even when a request or insert fails
    partway through; the failure itself is still raised to the caller.

    To skip the database, remove the ``get_conn``, ``insert`` and
    ``conn.close`` calls.
    """
    lang_name = 'python'
    wb = Workbook()  # new Excel workbook
    # All cities are written to the workbook's single active sheet.
    ws1 = wb.active
    ws1.title = lang_name
    conn = get_conn()  # open the database connection
    try:
        for city in ['北京', '上海', '廣州', '深圳', '杭州']:   # five cities
            url = 'https://www.lagou.com/jobs/positionAjax.json?city={}&needAddtionalResult=false'.format(city)
            for page in range(1, 31):   # 30 pages per city
                info = get_json(url, page, lang_name)
                # Lagou is strict about crawlers: wait 10-20 s between requests.
                time.sleep(random.randint(10, 20))
                for row in info:
                    insert(conn, tuple(row))  # store the row in the database
                    ws1.append(row)
    finally:
        try:
            conn.close()  # close the database connection
        finally:
            # Keep whatever was collected, even after a failure midway.
            wb.save('{}職位信息.xlsx'.format(lang_name))

# Start the scrape only when this file is executed as a script.
if __name__ == '__main__':
    main()

GitHub 地址：https://github.com/injetlee/Python/tree/master/%E7%88%AC%E8%99%AB%E9%9B%86%E5%90%88

若是你想要爬蟲獲取的崗位信息，請關注公號【智能製造社區】後臺留言發送 "python崗位"。