Scraping yaozh.com (藥智網) data with the BeautifulSoup module
Tips: 1. When crawling multiple pages, test with a single page first; otherwise your IP can easily get banned (see the single-page sketch after this list).
2. A method I often use for cleaning the scraped data:

reg = re.compile('pattern')
data = reg.sub('replacement string', data)
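For example, here is a minimal sketch of that pattern used to normalize whitespace in a scraped field; the sample string and the regex are made up for illustration:

# encoding=utf-8
import re

data = 'Aspirin \t Tablet\n\n 100mg'
reg = re.compile(r'\s+')     # compile once, reuse across many rows
data = reg.sub(' ', data)    # collapse runs of whitespace into one space
print data                   # -> 'Aspirin Tablet 100mg'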
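And for tip 1, a single-page dry run might look like the sketch below before the full 12-page loop is turned on. It reuses the URL and User-Agent from the crawler that follows; the row-count print is just one plausible sanity check:

# encoding=utf-8
import urllib2
from bs4 import BeautifulSoup

# Fetch only page 1 to confirm the request and parsing work before looping
headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
request = urllib2.Request('http://db.yaozh.com/zhuce?p=1', headers=headers)
html = urllib2.urlopen(request).read()

soup = BeautifulSoup(html, "html.parser")
print len(soup.find_all('tr'))   # expect one header row plus data rows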
The code (there really isn't much of it):
# encoding=utf-8
from bs4 import BeautifulSoup
import urllib2
import time

class YBZC():
    def __init__(self):
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': self.user_agent}

    def getHtml(self, pageIndex):
        try:
            url = 'http://db.yaozh.com/zhuce?p=' + str(pageIndex)
            request = urllib2.Request(url, headers=self.headers)
            response = urllib2.urlopen(request)
            html = response.read()
            return html
        except urllib2.URLError, e:
            if hasattr(e, 'reason'):
                print u"Connection failed:", e.reason
            return None

    def getItems(self):
        for i in range(1, 13):
            html = self.getHtml(i)  # bug fix: the page index was not being passed
            if html is None:        # skip a page whose request failed
                continue
            soup = BeautifulSoup(html, "html.parser")
            tr_list = soup.find_all('tr')
            # Table header: only written once, from the first page
            if i == 1:
                for item in tr_list[0]:
                    if item not in ['\n', '\t', ' ']:
                        with open('yaopinzhuce1030.txt', 'a') as f:
                            f.write(item.get_text(strip=True).encode('utf-8') + '|')
            # =========================2015-10-30=========================
            # The first version pulled everything down and only filtered
            # while writing to the file; now the filtering happens before
            # writing. Back then I had not thought of, or understood,
            # get_text(), which is why the code was so verbose...
            # =============================================================
            # list_tit = []
            # for ths in tr_list[0]:
            #     if ths.find('a'):
            #         for item in ths:
            #             if type(item) != unicode:
            #                 list_tit.append(item.string)
            #     else:
            #         list_tit.append(ths.get_text(strip=True))
            # for item in list_tit:
            #     if item not in ['', ' ', '\n', '\t']:
            #         with open('yaopinzhuce_new.txt', 'a') as f:
            #             f.write(item.encode('utf-8') + '|')
            # Table body: one pipe-delimited row per <tr>
            f = open('yaopinzhuce1030.txt', 'a')
            for tr in tr_list[1:]:
                f.write('\n')
                for item in tr:
                    if item not in ['', ' ', '\n']:
                        if item.string is None:
                            f.write('None' + '|')
                        else:
                            f.write(item.string.encode('utf-8') + '|')
            f.close()
            print 'sleeping... pageloading %d/12' % i
            time.sleep(5)

spider = YBZC()
spider.getItems()
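As the 2015-10-30 comment in getItems() notes, get_text(strip=True) is what made the header-writing code so much shorter. A minimal sketch of the difference, on a made-up table cell (not taken from yaozh.com):

# encoding=utf-8
from bs4 import BeautifulSoup

# A hypothetical header cell with nested markup, like the <th> cells above
cell = BeautifulSoup('<th> <a href="#">Drug name</a> </th>', "html.parser").th

print cell.string                # None: .string gives up on mixed/nested content
print cell.get_text(strip=True)  # 'Drug name': all nested text, stripped

This is also why the table-body loop falls back to writing 'None' when item.string is None: a cell with nested tags has no single .string.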