爬取百度招聘上關於IT職業的有關信息進行分析。目前只對常見職業進行分析;未解決異步加載問題,只能顯示第一頁,且每頁最多顯示100條信息。爬蟲二次升級:若要爬取所有職業和城市,則需將數據寫入數據庫。
#-*- coding:UTF-8 -*-
#@author:若鳥
# Scrape IT-job listings from Baidu Zhaopin (zhaopin.baidu.com) and dump them
# to an Excel sheet.  Only common job titles are covered; async paging is not
# handled, so only the first page (max 100 rows per query) is fetched.
import requests
import re
import xlwt
from bs4 import BeautifulSoup
from Get_jobList import GetJobList


def getHtml(url):
    """Fetch *url* and return its decoded HTML text, or None on any failure.

    Callers must handle a None return (network error, HTTP error status).
    """
    try:
        # timeout guards against a single hung connection stalling the crawl
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        # let requests guess the real charset from the body, not the headers
        r.encoding = r.apparent_encoding
        return r.text
    except Exception as e:
        print(url, "爬取失敗,失敗信息:", e)
        return None


def getJoblist():
    """Build the list of search URLs: one per (city, job-title) pair.

    Returns a list of zhaopin.baidu.com query URLs.  rn=100 exploits the
    site's JS paging so each page carries up to 100 rows.
    """
    citys = ['肥城', '泰安', '北京', '上海', '深圳', '杭州', '濟南']
    jobs = GetJobList()[:50]  # 獲取部分職位名稱,爬取自拉勾網 (subset of titles scraped from Lagou)
    url_job = []
    for city in citys:
        for job in jobs:
            url_job.append(
                "http://zhaopin.baidu.com/quanzhi?query={}&city={}&rn=100".format(job, city)
            )
    return url_job


def getJobData(urls):
    """Scrape each URL in *urls* and return rows of
    [job, addr, company, salary, source].

    Pages that fail to download are skipped (getHtml returns None).
    Fields are regex-extracted; zip() pairs them positionally and safely
    truncates if any field list is shorter than the others.
    """
    count = 0
    job_data = []  # e.g. <p class="area line-clamp1">濟南 | 濟南宅客信息科技有限公司</p>
    for url in urls:
        html = getHtml(url)
        if html is None:
            # fetch failed — skip instead of crashing re.findall on None
            count += 1
            print(count)
            continue
        job = re.findall(r'<p class="title-h3 line-clamp1">(.+)</p>', html)
        addr = re.findall(r'<p class="area line-clamp1">(.+) \| .+</p>', html)
        company = re.findall(r'<p class="area line-clamp1">.+ \| (.+)</p>', html)
        salary = re.findall(r'<p class="salary">(.+)</p>', html)
        source = re.findall(r'<p>.+ \| 來自(.+)</p>', html)
        # zip truncates to the shortest list, so a card missing one field
        # cannot raise IndexError the way parallel indexing would
        for row in zip(job, addr, company, salary, source):
            job_data.append(list(row))
        count += 1
        print(count)
    return job_data


def writeXls(data_list):
    """Write *data_list* (list of row lists) to JobData.xls, one cell per field.

    NOTE(review): xlwt's .xls format caps a sheet at 65536 rows — enough for
    this first-page-only crawl, but a full crawl would need a database.
    """
    f = xlwt.Workbook()
    sheet1 = f.add_sheet('sheet1', cell_overwrite_ok=True)
    for i, row in enumerate(data_list):
        for x, value in enumerate(row):
            sheet1.write(i, x, value)
    f.save("JobData.xls")


if __name__ == '__main__':
    job_data = getJobData(getJoblist())
    writeXls(job_data)