#!/usr/bin/python3
#coding:utf8
"""Scrape the lagou.com Java listing pages and dump the matches to a file.

Each listing page is downloaded in its own thread, matched against a
regular expression, and every match is appended as one line to the output
file ``Java`` in the current working directory.
"""
import re
from threading import Lock
from threading import Thread

from bs4 import BeautifulSoup  # not used below; kept so the module's names stay unchanged
import requests


class main(object):
    """Downloads listing pages and writes the extracted fields to ``self.w``.

    Attributes:
        url: Default page URL used by ``getpage`` when no URL is passed.
        w: Output file object (opened for writing as UTF-8 text).
        headers: HTTP request headers sent with every page request.
    """

    # Seconds to wait for the server before giving up on a page, so that a
    # stalled connection cannot hang the whole run.
    TIMEOUT = 30

    # One match per listing entry. The eight capture groups are, in order:
    #   0: href of the "position_link" anchor (without the leading "//")
    #   1: content of the <h2> element
    #   2: content of the <em> element
    #   3: content of the "format-time" span
    #   4: content of the "money" span
    #   5: text between the "<!--<i></i>-->" marker and the closing </div>
    #   6: content of the anchor inside the "company_name" div
    #   7: content of the "industry" div
    # re.S lets ".*?" span line breaks in the page source.
    PATTERN = re.compile(
        '<div.*?class="list_item_top">.*?'
        '<a class="position_link" href="//(.*?)".*?'
        '<h2>(.*?)</h2>.*?'
        '<em>(.*?)</em>.*?'
        '<span class="format-time">(.*?)</span>.*?'
        '<div class="li_b_l">.*?'
        '<span class="money">(.*?)</span>.*?<!--<i></i>-->(.*?)</div>.*?'
        '<div class="company">.*?'
        '<div class="company_name">.*?<a.*?>(.*?)</a>.*?'
        '<div class="industry">(.*?)</div>.*?',
        re.S,
    )

    def __init__(self):
        self.url = 'https://www.lagou.com/zhaopin/Java/1/'
        # Explicit UTF-8: the scraped text is not ASCII, and the locale's
        # default encoding may be unable to represent it.
        self.w = open('Java', 'w', encoding='utf-8')
        # Serializes writes to self.w, which is shared by all worker threads.
        self._lock = Lock()
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'www.lagou.com',
            'Referer': 'https://www.lagou.com/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        }

    def getpage(self, url=None):
        """Download one listing page and append its matches to ``self.w``.

        Args:
            url: Page to download. Defaults to ``self.url``. Callers running
                several downloads concurrently should pass the URL explicitly
                rather than mutate ``self.url``, which every thread shares.

        Raises:
            requests.RequestException: If the request fails or times out.
        """
        if url is None:
            url = self.url
        page = requests.get(url, headers=self.headers, timeout=self.TIMEOUT)
        rows = self.PATTERN.findall(page.text)
        if not rows:
            return
        # Build the page's output first and write it in a single call under
        # the lock, so lines from concurrent pages cannot interleave.
        text = "".join("\t,".join(row) + "\n" for row in rows)
        with self._lock:
            self.w.write(text)


if __name__ == "__main__":
    m = main()
    try:
        workers = []
        # Pages 1 through 29, one thread per page.
        for n in range(1, 30):
            page_url = 'https://www.lagou.com/zhaopin/Java/' + str(n)
            # Hand each thread its own URL; storing it on the shared
            # instance would race with threads that have not read it yet.
            worker = Thread(target=m.getpage, args=(page_url,))
            worker.start()
            workers.append(worker)
        for worker in workers:
            worker.join()
    finally:
        # Close (and flush) the output even if starting or joining fails.
        m.w.close()