#! -*-coding:utf-8 -*-
"""Multithreaded scraper for Lagou's positionAjax.json job-search endpoint.

Workers pull page numbers from a shared queue, POST a search request per
page, and extract the fields listed in ``tag`` from the JSON response.
"""
from urllib import request, parse
import json
import datetime
import threading
import queue
import time

url = "http://www.lagou.com/jobs/positionAjax.json"

# JSON keys extracted from each job posting: company name, short name,
# position, education, salary, finance stage, size, industry, labels, years.
tag = ['companyName', 'companyShortName', 'positionName', 'education',
       'salary', 'financeStage', 'companySize', 'industryField',
       'companyLabelList', 'workYear']
# Human-readable column headers matching ``tag``, in the same order.
tag_name = ['公司名稱', '公司簡稱', '職位名稱', '所需學歷', '工資',
            '公司資質', '公司規模', '所屬類別', '公司介紹', '工做年限']


def read_page(url, page_num, keyword):
    """POST one page of search results and return the response body as text.

    Parameters: the endpoint URL, the 1-based page number, and the search
    keyword.  Returns the decoded UTF-8 JSON string.
    """
    page_header = {
        'Host': 'www.lagou.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36',
        'Connection': 'keep-alive'
    }
    # The endpoint expects a lowercase JS-style boolean; Python's
    # str(True)/str(False) would be sent capitalised by urlencode.
    first_flag = 'true' if page_num == 1 else 'false'
    page_data = parse.urlencode([
        ('first', first_flag),
        ('pn', page_num),
        ('kd', keyword),
    ])
    # Bug fix: the original built a Request carrying the headers but then
    # called urlopen(url, ...) with the bare URL, so the headers (needed to
    # avoid the site's bot filter) were never actually sent.
    req = request.Request(url, headers=page_header)
    page = request.urlopen(req, data=page_data.encode('utf-8')).read()
    return page.decode('utf-8')


def read_max_page(page):
    """Return the server-reported total page count, capped at 30.

    Lagou only serves the first 30 result pages, so anything beyond that
    would just 404/repeat.
    """
    page_json = json.loads(page)
    max_page_num = page_json['content']['totalPageCount']
    return min(max_page_num, 30)


def read_tag(page, tag):
    """Extract the fields named in *tag* from one page of results.

    Returns a list of rows, one per job posting, with values ordered like
    *tag*.  The ``companyLabelList`` entry (a list in the JSON) is flattened
    to a comma-separated string.
    """
    results = json.loads(page)['content']['result']
    # Locate the label-list column by name instead of hardcoding index 8.
    label_idx = tag.index('companyLabelList') if 'companyLabelList' in tag else -1
    page_result = []
    # Iterate the actual number of results; the original assumed exactly 15
    # per page and raised IndexError on a shorter final page.
    for item in results:
        row = [item.get(field) for field in tag]
        if label_idx >= 0 and isinstance(row[label_idx], list):
            row[label_idx] = ','.join(row[label_idx])
        page_result.append(row)
    print(page_result)
    # Bug fix: the original commented out this return, so callers could
    # never collect the extracted rows.
    return page_result


class MyThread(threading.Thread):
    """Worker that drains page numbers from a shared queue and scrapes them."""

    def __init__(self, records, page_num, keyword):
        # Thread names must be strings; page_num is an int.
        threading.Thread.__init__(self, name=str(page_num))
        self._page_num = page_num
        self._records = records
        self._keyword = keyword

    def run(self):
        while True:
            try:
                # get_nowait() avoids the qsize()/get() race in the original:
                # another worker could empty the queue between the size check
                # and the blocking get(), hanging this thread forever.
                page_num = self._records.get_nowait()
            except queue.Empty:
                break
            page = read_page(url, page_num, self._keyword)
            read_tag(page, tag)
            # Small pause to be polite to the server.
            time.sleep(0.01)


def reptile(records, threadNum, keyword):
    """Spawn *threadNum* workers over the *records* queue and wait for them.

    The original ignored threadNum (hardcoded 9 threads) and "waited" by
    appending still-alive tasks back onto the list it was iterating — a
    potential infinite loop.  join() is the correct wait.
    """
    tasks = []
    for page_num in range(1, threadNum + 1):
        worker = MyThread(records, page_num, keyword)
        worker.daemon = False  # setDaemon() is deprecated since 3.10
        worker.start()
        tasks.append(worker)
    for task in tasks:
        task.join()


if __name__ == '__main__':
    myQueue = queue.Queue()
    keyword = input('請輸入你想要查詢的崗位語言')
    starttime = datetime.datetime.now()
    page = read_page(url, 1, keyword)
    max_page_num = read_max_page(page)
    # Queue every page including the last one; the original's
    # range(1, max_page_num) silently dropped the final page.
    for row in range(1, max_page_num + 1):
        myQueue.put(row)
    reptile(myQueue, max_page_num, keyword)
    endtime = datetime.datetime.now()
    # Renamed from 'time': the original shadowed the time module here.
    elapsed = (endtime - starttime).seconds
    print('總共用時:%s s' % elapsed)