"""Scrape job listings from search.51job.com and export them to Excel.

Run as a script, this fetches the first two result pages of a fixed search,
extracts (name, web, company, location, salary) for each listing with a
regular expression, prints every row and writes them all to ``data.xlsx``.

Alternative sinks (JSON-lines text file, MongoDB, CSV) are provided as
helpers but are not used by the default entry point.
"""
import csv
import json
import re
import time

import requests
import xlwt
from openpyxl import load_workbook
from openpyxl import workbook
from pymongo import MongoClient
from requests.exceptions import RequestException

# Column names, in the same order as the capture groups of _JOB_PATTERN.
FIELDNAMES = ['name', 'web', 'company', 'location', 'salary']

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 10

# Headers copied from a browser session.
# NOTE(review): the cookie is a captured session value and is probably stale;
# confirm whether the site still requires it.
REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
    'Host': 'search.51job.com',
    'Cookie': 'partner=www_baidu_com; guid=992c8a3fa4140d299ad06533b8965bdd; 51job=cenglish%3D0%26%7C%26; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; search=jobarea%7E%60040000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60040000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FAC%2B%2B%BF%AA%B7%A2%B9%A4%B3%CC%CA%A6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60040000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA04%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FAC%2B%2B%BF%AA%B7%A2%B9%A4%B3%CC%CA%A6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch2%7E%60040000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA05%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FAC%2B%2B%BF%AA%B7%A2%B9%A4%B3%CC%CA%A6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch3%7E%60030200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FAC%2B%2B%BF%AA%B7%A2%B9%A4%B3%CC%CA%A6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21collapse_expansion%7E%601%7C%21',
}

# Search-result URL; ``{page}`` is the 1-based result page number.
# The ``degreefrom`` parameter had been corrupted to "°reefrom" (the "&deg"
# prefix was decoded as an HTML entity), which silently merged it into the
# ``cotype`` value; it is restored here.
#
# Previously used searches, kept for reference:
# 'https://search.51job.com/list/040000,000000,0000,00,9,99,C%252B%252B%25E5%25BC%2580%25E5%258F%2591%25E5%25B7%25A5%25E7%25A8%258B%25E5%25B8%2588,2,'+str(page)+'.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
# 'https://search.51job.com/list/010000,000000,0000,00,9,99,%25E5%259F%25B9%25E8%25AE%25AD%25E6%259C%25BA%25E6%259E%2584,2,'+str(page)+'.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
SEARCH_URL_TEMPLATE = (
    'https://search.51job.com/list/010000,000000,0000,00,9,09,'
    '%25E5%259F%25B9%25E8%25AE%25AD%25E6%259C%25BA%25E6%259E%2584,2,'
    '{page}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
    '&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
)

# One match per listing; groups are (title, link, company, t3 cell, t4 cell).
# Raw strings keep the regex text identical while avoiding invalid-escape
# warnings for ``\s`` / ``\S``.
_JOB_PATTERN = re.compile(
    r'<a target="_blank" title="(.*?)"[.\s\S]*?<span class="t2"><a target="_blank" title=.*?href="(.*?)">'
    r'(.*?)</a></span>[.\s\S]*?<span class="t3">(.*?)</span>[.\s\S]*?<span class="t4">(.*?)</span>',
    re.S,
)


def get_one_page(url):
    """Fetch *url* and return the page text decoded as GBK.

    Returns ``None`` when the request fails, times out, or the server
    answers with anything other than HTTP 200.
    """
    try:
        response = requests.get(
            url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT
        )
    except RequestException:
        return None
    if response.status_code != 200:
        return None
    # The response is decoded as GBK rather than the encoding guessed by
    # requests.
    response.encoding = 'gbk'
    return response.text


def parse_one_page(html):
    """Extract job rows from a search-result page.

    Returns a list of 5-tuples of strings ordered like ``FIELDNAMES``.
    An empty list is returned when *html* is ``None`` or empty (for example
    after a failed download) instead of raising ``TypeError``.
    """
    if not html:
        return []
    return _JOB_PATTERN.findall(html)


def _as_record(result):
    """Return *result* as a dict keyed by ``FIELDNAMES``.

    Dicts are passed through unchanged; sequences (such as the tuples
    produced by ``parse_one_page``) are zipped with the column names.
    """
    if isinstance(result, dict):
        return result
    return dict(zip(FIELDNAMES, result))


def write_to_file(content):
    """Append *content* to ``jobs.txt`` as one JSON document per line."""
    # The ``with`` block closes the file; no explicit close() is needed.
    with open('jobs.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


client = MongoClient()
db = client['jobs']
collections = db['jobs']


def save_to_mongo(result):
    """Insert one job row into the ``jobs.jobs`` MongoDB collection.

    *result* may be a dict or a row tuple from ``parse_one_page``; tuples
    are converted to a document keyed by ``FIELDNAMES``.
    """
    # Collection.insert() no longer exists in PyMongo 4; insert_one() is the
    # supported single-document call.
    if collections.insert_one(_as_record(result)).acknowledged:
        print('Saved to Mongo')


def save_to_csv(result):
    """Append one job row to ``data.csv``.

    The header row is written only when the file is new or empty, so
    repeated calls no longer interleave headers with data. *result* may be
    a dict or a row tuple from ``parse_one_page``.
    """
    # newline='' lets the csv module control line endings itself.
    with open('data.csv', 'a', encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=FIELDNAMES)
        # In append mode the stream starts at end-of-file, so position 0
        # means nothing has been written yet.
        if csvfile.tell() == 0:
            writer.writeheader()
        writer.writerow(_as_record(result))


def save_to_excel(result):
    """Append one job row to the module-level worksheet ``ws``.

    ``ws`` is created by the script entry point below; this function must
    not be called before it exists.
    """
    ws.append(result)


def main(page):
    """Download result page number *page*, print each row and add it to Excel."""
    html = get_one_page(SEARCH_URL_TEMPLATE.format(page=page))
    if html is None:
        # Nothing to parse; report it rather than failing inside the regex.
        print('Failed to fetch page {}'.format(page))
        return
    for content in parse_one_page(html):
        print(content)
        save_to_excel(content)


if __name__ == '__main__':
    wb = workbook.Workbook()
    ws = wb.active
    ws.append(FIELDNAMES)
    for i in range(1, 3):
        main(i)
        # Pause between requests so consecutive pages are not fetched
        # back-to-back.
        time.sleep(1)
    wb.save('data.xlsx')