爬蟲投遞簡歷小示例

1、流程分析

第一步:獲取登陸頁,獲取X_Anti_Forge_Token,X_Anti_Forge_Code
    1、請求url:https://passport.lagou.com/login/login.html
    2、請求方式:get
    3、請求頭:
           - cookie:用session處理了
           - User-Agent:Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36Name
第二步:登陸
    1、請求url:https://passport.lagou.com/login/login.json
    2、請求方式:post
    3、請求頭:
        cookie
        User-agent
        Referer:https://passport.lagou.com/login/login.html
        X-Anit-Forge-Code:53165984
        X-Anit-Forge-Token:3b6a2f62-80f0-428b-8efb-ef72fc100d78
        X-Requested-With:XMLHttpRequest
    4、請求體
        isValidate:true
        username:18611453110
        password:70621c64832c4d4d66a47be6150b4a8e
        request_form_verifyCode:''
        submit:''
第三步:授權
        1、請求url:https://passport.lagou.com/grantServiceTicket/grant.html
        2、請求方法:GET
        3、請求頭:
           User-agent
           Referer:https://passport.lagou.com/login/login.html
第四步:驗證(GET https://www.lagou.com/resume/myresume.html,請求頭帶User-Agent,響應內容中包含登陸用的用戶名即表示登陸成功)
第五步:篩選職位信息
    請求url:https://www.lagou.com/jobs/list_java%E9%AB%98%E7%BA%A7%E5%BC%80%E5%8F%91
    請求方法:GET
    請求頭:
        User-Agent
    請求參數:
        gj:3年及以下
        px:default
        yx:25k-50k
        city:北京
第六步:訪問詳情頁,拿到X_Anti_Forge_Token,X_Anti_Forge_Code
    請求url:詳情頁地址
    請求方式:GET
    請求頭:User-Agent
第七步:投遞簡歷
    請求url:https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json
    請求方式:POST
    請求頭:
        Referer:詳情頁地址
        User-agent
        X-Anit-Forge-Code:53165984
        X-Anit-Forge-Token:3b6a2f62-80f0-428b-8efb-ef72fc100d78
        X-Requested-With:XMLHttpRequest
    請求體:
    positionId:職位ID
    type:1
    force:true

2、代碼實現

  # Lagou resume-delivery crawler: log in, search positions, then apply to every result.
import re
from pprint import pprint
from urllib.parse import urlencode

import requests

# User-Agent sent during the passport login flow and the position search.
# NOTE(review): the trailing "Name" looks like copy/paste residue; it is kept
# unchanged because the original requests sent exactly this value.
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36Name"
# User-Agent sent when opening a position's detail page and delivering the resume.
DETAIL_USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

# Account used for the login request; replace with your own values.
# NOTE(review): the password looks like a pre-hashed value -- confirm what
# login.json actually expects before changing it.
USERNAME = '18611453110'
PASSWORD = '70621c64832c4d4d66a47be6150b4a8e'

# Search keyword, used both in the Referer URL and in the search form body.
KEYWORD = "java高級開發"


def _extract_forge_tokens(html):
    """Return ``(code, token)`` read from a page's anti-forgery assignments.

    The page is expected to contain ``X_Anti_Forge_Code = '...'`` and
    ``X_Anti_Forge_Token = '...'``.

    Raises:
        ValueError: if either assignment is missing from *html*.
    """
    code = re.search(r"X_Anti_Forge_Code = '(.*?)'", html, re.S)
    token = re.search(r"X_Anti_Forge_Token = '(.*?)'", html, re.S)
    if code is None or token is None:
        raise ValueError("X_Anti_Forge_Code / X_Anti_Forge_Token not found in page")
    return code.group(1), token.group(1)


# One session for the whole run so cookies carry over between requests.
session = requests.session()

# Step 1: fetch the login page and read the anti-forgery code/token from it.
login_page = session.get(
    "https://passport.lagou.com/login/login.html",
    headers={"User-Agent": USER_AGENT},
)
# Header values must be plain strings, so extract single values rather than
# passing the list that re.findall returns.
login_code, login_token = _extract_forge_tokens(login_page.text)

# Step 2: log in. The header names really are spelled "X-Anit-...".
session.post(
    "https://passport.lagou.com/login/login.json",
    headers={
        "User-Agent": USER_AGENT,
        "Referer": "https://passport.lagou.com/login/login.html",
        "X-Anit-Forge-Code": login_code,
        "X-Anit-Forge-Token": login_token,
        "X-Requested-With": "XMLHttpRequest",
    },
    data={
        "isValidate": True,
        'username': USERNAME,
        'password': PASSWORD,
        'request_form_verifyCode': '',
        'submit': '',
    },
)

# Step 3: request the service ticket grant page.
session.get(
    "https://passport.lagou.com/grantServiceTicket/grant.html",
    headers={
        "User-Agent": USER_AGENT,
        'Referer': 'https://passport.lagou.com/login/login.html',
    },
)

# Step 4: verify the login -- prints True when the resume page mentions the account.
resume_page = session.get(
    'https://www.lagou.com/resume/myresume.html',
    headers={"User-Agent": USER_AGENT},
)
print(USERNAME in resume_page.text)

# Step 5: search positions. A plain GET of the list page returns no data
# because the page loads its results through an AJAX call, so that AJAX
# endpoint is called directly, with the list page as Referer.
encoded_keyword = urlencode({"k": KEYWORD}, encoding="utf-8").split("=")[-1]
list_url = "https://www.lagou.com/jobs/list_" + encoded_keyword
search = session.post(
    'https://www.lagou.com/jobs/positionAjax.json',
    headers={
        'Referer': list_url,
        "User-Agent": USER_AGENT,
    },
    data={
        "first": True,
        "pn": 1,
        "kd": KEYWORD,
    },
    params={
        "gj": "3年及以下",
        "px": "default",
        "yx": "15k-25k",
        "city": "北京",
        # NOTE(review): parameter name kept as in the original; it may be a
        # misspelling of the site's real parameter -- confirm against the site.
        "needAddtionResult": False,
        "isSchoolJob": 0,
    },
)

positions = search.json()['content']['positionResult']['result']
for position in positions:
    position_id = position['positionId']
    detail_url = 'https://www.lagou.com/jobs/{pos_id}.html'.format(pos_id=position_id)
    company_short_name = position['companyShortName']
    position_name = position['positionName']
    salary = position['salary']
    print('''
    詳情鏈接:%s
    公司名:%s
    職位名:%s
    薪資:%s
    ''' % (detail_url, company_short_name, position_name, salary))

    # Step 6: each detail page carries its own anti-forgery code/token.
    detail_page = session.get(
        detail_url,
        headers={'User-Agent': DETAIL_USER_AGENT},
    )
    forge_code, forge_token = _extract_forge_tokens(detail_page.text)

    # Step 7: deliver the resume for this position.
    session.post(
        'https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
        headers={
            'User-Agent': DETAIL_USER_AGENT,
            'Referer': detail_url,
            'X-Anit-Forge-Code': forge_code,
            'X-Anit-Forge-Token': forge_token,
            'X-Requested-With': 'XMLHttpRequest',
        },
        data={
            'positionId': position_id,
            'type': 1,
            'force': True,
        },
    )
    # The response is not inspected, so this is printed after every request.
    print('%s 投遞成功' % (company_short_name))
View Code
相關文章
相關標籤/搜索