源碼:python
"""Scrape Python job postings from hr.tencent.com and store them in MySQL.

The script walks the paginated search results, follows every
``position_detail`` link it finds, extracts a handful of fields from each
detail page with regular expressions, and inserts one row per posting.
"""
import random
import re
import time

import requests

from my_mysql import MysqlConnect

BASE_URL = 'https://hr.tencent.com/'

# Seconds to wait for a response; without a timeout ``requests`` can block
# forever on a stalled connection.
REQUEST_TIMEOUT = 10

# Matches one bullet inside an already-extracted ``<ul>`` fragment.
_LIST_ITEM_RE = re.compile(r'<li>(.*?)</li>')


def get_urls(page, headers):
    """Return the detail-page links found on one search-result page.

    Args:
        page: Value sent as the ``start`` query parameter (result offset).
        headers: HTTP headers passed to ``requests.get``.

    Returns:
        A list of ``bytes`` objects, each a relative ``position_detail...``
        URL exactly as it appears in an ``href`` attribute of the page.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # The ``{}`` placeholder is what actually puts the offset into the URL;
    # without it ``str.format`` silently ignores its argument and every call
    # would fetch the same page.
    url = ('https://hr.tencent.com/position.php'
           '?lid=&tid=&keywords=python&start={}'.format(page))
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    pat = r'href="(position_detail.*?)">'
    return re.findall(pat.encode('utf-8'), response.content)


def _extract_text(pattern, html_bytes, field, url):
    """Return group 1 of ``pattern`` in ``html_bytes``, decoded as UTF-8.

    Raises:
        ValueError: If the pattern does not match. ``field`` and ``url`` are
            only used to make the error message actionable.
    """
    match = re.search(pattern.encode('utf-8'), html_bytes)
    if match is None:
        raise ValueError(
            'could not find {!r} in page {}'.format(field, url))
    return match.group(1).decode('utf-8')


def _extract_list(pattern, html_bytes, field, url):
    """Return the ``<li>`` items of the matched ``<ul>``, one per line."""
    fragment = _extract_text(pattern, html_bytes, field, url)
    return '\n'.join(_LIST_ITEM_RE.findall(fragment))


def get_info(url, headers):
    """Download one job-detail page and extract its fields.

    Args:
        url: Absolute URL of the detail page.
        headers: HTTP headers passed to ``requests.get``.

    Returns:
        A 6-tuple of strings
        ``(title, address, types, counts, duty, requires)``; ``duty`` and
        ``requires`` contain their bullet points joined with newlines.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If any of the expected fields is missing from the page.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    html_bytes = response.content

    # NOTE(review): the Chinese labels below are matched literally against
    # the downloaded page. They are kept exactly as found in this file, but
    # they look like they may have gone through a character-set conversion;
    # verify them against the live page markup.
    title = _extract_text(
        r'id="sharetitle">(.*?)</td>', html_bytes, 'title', url)
    address = _extract_text(
        r'工做地點:</span>(.*?)</td>', html_bytes, 'address', url)
    types = _extract_text(
        r'職位類別:</span>(.*?)</td>', html_bytes, 'types', url)
    counts = _extract_text(
        r'招聘人數:</span>(.*?)</td>', html_bytes, 'counts', url)
    duty = _extract_list(
        r'工做職責.*?<ul class="squareli">(.*?)</ul>',
        html_bytes, 'duty', url)
    requires = _extract_list(
        r'工做要求.*?<ul class="squareli">(.*?)</ul>',
        html_bytes, 'requires', url)
    return title, address, types, counts, duty, requires


def main():
    """Crawl result offsets 0, 10, ..., 190 and insert every posting found."""
    # NOTE(review): connection credentials are hard-coded here; consider
    # loading them from configuration or the environment.
    mc = MysqlConnect('127.0.0.1', 'root', '123456', 'homework')
    sql = ("insert into tencentzp(title,address,types,counts,duty,requires) "
           "values(%s,%s,%s,%s,%s,%s)")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/67.0.3396.99 Safari/537.36'
    }
    for page in range(0, 200, 10):
        for relative_url in get_urls(page, headers):
            url = BASE_URL + relative_url.decode('utf-8')
            info = get_info(url, headers)
            print(info)
            mc.exec_data(sql, info)
            # Random pause of up to five seconds between detail requests.
            time.sleep(random.random() * 5)


if __name__ == '__main__':
    main()