用requests爬取一個招聘網站

import requests
import re

session = requests.session()
第一步:訪問登錄頁,拿到X_Anti_Forge_Token,X_Anti_Forge_Code
# 一、請求url:https://passport.lagou.com/login/login.html
# 二、請求方法:GET 由於是get請求不須要請求體
# 三、請求頭:User-agent
代碼如下:
# Step 1: load the login page with a plain GET (no request body). Its HTML
# carries the two anti-forgery values that are extracted right below.
r1 = session.get(
    'https://passport.lagou.com/login/login.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    },
)

# re.findall returns a list of captured groups; index 0 is the first match.
login_page_html = r1.text
X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", login_page_html, re.S)[0]
X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", login_page_html, re.S)[0]

第二步:登錄

# 一、請求url:https://passport.lagou.com/login/login.json
# 二、請求方法:POST
# 三、請求頭:包含:cookie,User-agent,Referer,X-Anit-Forge-Code,X-Anit-Forge-Token
# 四、請求體包含以下:
# isValidate:true
# username:18611453110
# password:70621c64832c4d4d66a47be6150b4a8e
# request_form_verifyCode:''
# submit:''
代碼如下:
# Step 2: submit the login form as an XMLHttpRequest-style POST.
# X_Anti_Forge_Code / X_Anti_Forge_Token are the values assigned earlier in
# this script and are sent back as request headers.
login_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Referer': 'https://passport.lagou.com/login/login.html',
    'X-Anit-Forge-Code': X_Anti_Forge_Code,
    'X-Anit-Forge-Token': X_Anti_Forge_Token,
    'X-Requested-With': 'XMLHttpRequest',
}

# NOTE(review): the account name and password value are hard-coded here;
# consider loading them from configuration instead of the source file.
login_form = {
    "isValidate": True,
    'username': '18611453110',  # login user name
    'password': '70621c64832c4d4d66a47be6150b4a8e',  # password in its encrypted form
    'request_form_verifyCode': '',
    'submit': '',
}

r2 = session.post(
    'https://passport.lagou.com/login/login.json',
    headers=login_headers,
    data=login_form,
)

第三步:授權

# 一、請求url:https://passport.lagou.com/grantServiceTicket/grant.html
# 二、請求方法:GET
# 三、請求頭:包含:User-agent,Referer
# Step 3: GET the service-ticket grant page on the same session, presenting
# the login page as the Referer.
grant_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Referer': 'https://passport.lagou.com/login/login.html',
}
r3 = session.get(
    'https://passport.lagou.com/grantServiceTicket/grant.html',
    headers=grant_headers,
)
第四步:驗證是否登錄成功:
# Step 4: fetch the "my resume" page with the same session so the response
# can be inspected to see whether the login took effect.
resume_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}
r4 = session.get(
    'https://www.lagou.com/resume/myresume.html',
    headers=resume_headers,
)
# print('18611453110' in r4.text)  # check whether the login succeeded
 

第五步:篩選職位信息

# 請求url:https://www.lagou.com/jobs/list_java%E9%AB%98%E7%BA%A7%E5%BC%80%E5%8F%91
# 請求方法:GET
# 請求頭:
# User-Agent
# 請求參數:
# gj:3年及如下
# px:default
# yx:25k-50k
# city:北京
from urllib.parse import quote

# Percent-encode the search keyword (UTF-8) so it can be embedded in the URL
# path. quote() does this directly; the previous urlencode(...).split('=')
# round-trip built a throwaway "k=<value>" pair only to discard the key.
res = quote('java高級開發', encoding='utf-8')
url = 'https://www.lagou.com/jobs/list_' + res
# r5 = session.get(url,
#                  headers={
#                      'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
#                  },
#                  params={
#                      'gj': '3年及如下',
#                      'px': 'default',
#                      'yx': '25k-50k',
#                      'city': '北京'
#                  }
#                  )
#
# print(r5.text)

沒有取到數據,因為數據是通過ajax發送的,所以我們換另一種方法解決(第六步):

# Step 6: query the position-list JSON endpoint.
# Request URL: https://www.lagou.com/jobs/positionAjax.json
# Method: POST
# Headers: Referer (the job-list page URL built earlier in this script), User-Agent
# Form body:
#   first: true
#   pn: 1
#   kd: java高級開發
# Query-string parameters:
#   gj: 3年及如下
#   px: default
#   yx: 25k-50k
#   city: 北京
#   needAddtionalResult: False
#   isSchoolJob: 0
search_headers = {
    'Referer': url,
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}
search_form = {
    'first': True,
    'pn': 1,
    'kd': 'java高級開發',
}
search_query = {
    'gj': '3年及如下',
    'px': 'default',
    'yx': '25k-50k',
    'city': '北京',
    'needAddtionalResult': False,
    'isSchoolJob': 0,
}
r6 = session.post(
    'https://www.lagou.com/jobs/positionAjax.json',
    headers=search_headers,
    data=search_form,
    params=search_query,
)
# Walk every position in the search response, print a summary, then open its
# detail page (step 7) and submit a resume for it (step 8).
# Steps 7 and 8 must run once per position, so they live inside the loop body;
# they were previously indented by 1 and 3 spaces, which Python rejects with
# an IndentationError.
comapines_list = r6.json()['content']['positionResult']['result']
for comapiny in comapines_list:
    positionId = comapiny['positionId']
    company_link = 'https://www.lagou.com/jobs/{pos_id}.html'.format(pos_id=positionId)
    companyShortName = comapiny['companyShortName']
    positionName = comapiny['positionName']
    salary = comapiny['salary']
    print('''
    詳情鏈接:%s
    公司名:%s
    職位名:%s
    薪資:%s
    ''' %(company_link,companyShortName,positionName,salary))

    # Step 7: visit the detail page and read X_Anti_Forge_Token / X_Anti_Forge_Code
    # Request URL: the detail page address
    # Method: GET
    # Headers: User-Agent
    r7 = session.get(company_link,
                     headers={
                         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                     }
                     )
    # findall(...)[0] takes the first match; it raises IndexError if the page
    # does not contain the expected assignment.
    X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r7.text, re.S)[0]
    X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r7.text, re.S)[0]

    # Step 8: submit the resume
    # Request URL: https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json
    # Method: POST
    # Headers:
    #   Referer: the detail page address
    #   User-agent
    #   X-Anit-Forge-Code: e.g. 53165984
    #   X-Anit-Forge-Token: e.g. 3b6a2f62-80f0-428b-8efb-ef72fc100d78
    #   X-Requested-With: XMLHttpRequest
    # Form body:
    #   positionId: the position ID
    #   type: 1
    #   force: true
    session.post('https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
                 headers={
                     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                     'Referer': company_link,
                     'X-Anit-Forge-Code': X_Anti_Forge_Code,
                     'X-Anit-Forge-Token': X_Anti_Forge_Token,
                     'X-Requested-With': 'XMLHttpRequest'
                 },
                 data={
                     'positionId': positionId,
                     'type': 1,
                     'force': True
                 }
                 )
    # The response of the POST above is not inspected; this message is printed
    # unconditionally.
    print('%s 投遞成功' %(companyShortName))

 

第7步,8步是並列的,放在第六步的裏面。第六步找到一個公司,進入詳情頁,而後投遞簡歷。
相關文章
相關標籤/搜索