#!/usr/bin/env python
# coding: utf-8
# Python 2 script: collect job-detail URLs from zhaopin.com search pages,
# then scrape the title, requirements and description from each page.

import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import urllib2
from bs4 import BeautifulSoup

user_agent = ('Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36')
header = {
    'User-Agent': user_agent,
}
url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8D%97%E6%98%8C&kw=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90&sm=0&p='

# Walk the first two result pages and collect every job's detail URL.
url_list = []
i = 1
while i < 3:
    full_url = url + str(i)
    request = urllib2.Request(full_url, headers=header)
    response = urllib2.urlopen(request)
    soup = BeautifulSoup(response, 'lxml', from_encoding='utf-8')
    # <td class="zwmc" style="width: 250px;"> holds the job-title link
    links = soup.find_all('td', class_='zwmc')
    for link in links:
        new_url = link.find('a')['href']
        print new_url
        url_list.append(new_url)
    i += 1
print url_list

# Visit each detail page and save the fields we want to work.txt.
filename = open('work.txt', 'w')
while len(url_list) != 0:
    new_url = url_list.pop()
    request = urllib2.Request(new_url, headers=header)
    response = urllib2.urlopen(request)
    soup = BeautifulSoup(response, 'lxml', from_encoding='utf-8')
    # <div class="inner-left fl"><h1>商品專員/數據分析員</h1> -- job title
    title = soup.find('div', class_='inner-left fl').find('h1')
    # <ul class="terminal-ul clearfix"> -- salary, location, requirements
    clearfix = soup.find('ul', class_='terminal-ul clearfix')
    # <div class="tab-inner-cont"> -- job description
    cont = soup.find('div', class_='tab-inner-cont')

    filename.write(new_url + '\n')
    filename.write(title.get_text())
    filename.write(clearfix.get_text())
    filename.write(cont.get_text())
filename.close()  # close once, after the loop, not inside it
print url_list    # empty by now, since pop() drained the list
Shortcomings:
1. The code that fetches a page could be factored out and reused; I didn't write it that way here. Mostly laziness. (A sketch of the fix follows this list.)
2. Still no object-oriented programming (I tried during the day, got stuck on a few things, and gave up). See the class sketch after this list.
3. The data isn't saved in the format I actually wanted.
4. Duplicate pages may get scraped, because I used a list instead of a set (also covered in the sketch below).
5. The detail pages are scraped starting from the last item, which isn't great either (also covered in the sketch below).
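For items 1, 4 and 5, here is a minimal sketch of one way to fix them together: a single reusable fetch helper, a set so duplicates can't accumulate, and a plain in-order loop instead of pop(). The helper name fetch_soup and this overall structure are my own assumptions, not code I have run against the site:

# Sketch for items 1, 4 and 5 (fetch_soup is a hypothetical helper name).
import urllib2
from bs4 import BeautifulSoup

HEADER = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 '
                        '(KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36'}
BASE_URL = ('http://sou.zhaopin.com/jobs/searchresult.ashx'
            '?jl=%E5%8D%97%E6%98%8C&kw=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90&sm=0&p=')

def fetch_soup(page_url):
    # Build the request, open it, parse the HTML -- one place to reuse
    # for both the result pages and the detail pages (item 1).
    request = urllib2.Request(page_url, headers=HEADER)
    return BeautifulSoup(urllib2.urlopen(request), 'lxml',
                         from_encoding='utf-8')

url_set = set()  # a set silently drops duplicate URLs (item 4)
for i in range(1, 3):
    soup = fetch_soup(BASE_URL + str(i))
    for td in soup.find_all('td', class_='zwmc'):
        url_set.add(td.find('a')['href'])

for new_url in sorted(url_set):  # fixed front-to-back order, no pop() (item 5)
    title = fetch_soup(new_url).find('div', class_='inner-left fl').find('h1')
    print title.get_text().encode('utf-8')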
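And for item 2, a rough guess at what an object-oriented version could look like, reusing fetch_soup and BASE_URL from the sketch above. The class name JobSpider and the way it splits into methods are hypothetical, just one possible shape for the refactor:

class JobSpider(object):
    # Hypothetical OOP shape for item 2: shared state lives on the
    # instance and each crawl step becomes a method. Reuses fetch_soup
    # and BASE_URL from the previous sketch.
    def __init__(self, base_url):
        self.base_url = base_url
        self.urls = set()

    def collect_urls(self, pages):
        # Gather detail-page URLs from the first `pages` result pages.
        for i in range(1, pages + 1):
            soup = fetch_soup(self.base_url + str(i))
            for td in soup.find_all('td', class_='zwmc'):
                self.urls.add(td.find('a')['href'])

    def save_jobs(self, path):
        # Write each job's URL and title, in a fixed order.
        with open(path, 'w') as f:
            for new_url in sorted(self.urls):
                soup = fetch_soup(new_url)
                f.write(new_url + '\n')
                title = soup.find('div', class_='inner-left fl').find('h1')
                f.write(title.get_text().encode('utf-8') + '\n')

spider = JobSpider(BASE_URL)
spider.collect_urls(2)
spider.save_jobs('work.txt')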
我怎麼感受寫的不足愈來愈多了啊 ,加了好幾條了,(⊙﹏⊙)b,算了不寫了,就這樣吧,在寫下去都沒有信心了!xml
Still, overall the script does what I wanted: it scrapes every job's URL and then uses those URLs to fetch the information I'm after!
Some progress, at least: the code got a little longer.