It's been a long time since I updated this blog. I've been teaching myself Python lately, and as my first crawler I wrote one that searches Baidu for solutions to algorithm problems. It's purely for practice.
Taking it slow..
First, build the Baidu search request and kick things off — main.py:

#coding:utf-8
'''
Created on 2016-11-22
@author: liyinggang
'''
from link_crawler_baidu import link_crawler_baidu
import urllib

baseUrl = 'http://www.baidu.com/s'
page = 1  # which page of search results
ojname = 'hdu'
problemnum = '1006'
word = ojname + problemnum
data = {'wd': word, 'pn': str(page - 1) + '0', 'tn': 'baidurt', 'ie': 'utf-8', 'bsst': '1'}
data = urllib.urlencode(data)
url = baseUrl + '?' + data
print url
link_crawler_baidu(url, 'lyg', ojname, problemnum)
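Baidu's pn parameter is effectively a result offset (ten results per page), which is why the script builds it as str(page-1)+'0'. A minimal sketch of how the query string changes across pages (the exact parameter order produced by urlencode may differ):

#coding:utf-8
import urllib

baseUrl = 'http://www.baidu.com/s'
word = 'hdu' + '1006'
for page in range(1, 4):
    # str(page-1)+'0' yields '00', '10', '20' for pages 1, 2, 3
    data = urllib.urlencode({'wd': word, 'pn': str(page - 1) + '0'})
    print baseUrl + '?' + data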
Next, filter the cnblogs and CSDN links out of the search results (the encoding issues were a real pain; in the end I settled for a clumsy workaround I found on Baidu).
#coding:utf-8
'''
Created on 2016-11-22
@author: liyinggang
'''
import re, lxml.html, urlparse
from download import download
import sys
from link_crawler_cnblogs import link_crawler_cnblogs
from link_crawler_csdn import link_crawler_csdn
reload(sys)

def link_crawler_baidu(seed_url, user_agent=None, ojname='hdu', problemnum='1001'):
    html = download(seed_url, user_agent=user_agent)
    url1 = "http://www.cnblogs.com/"
    url2 = "http://blog.csdn.net/"
    regex1 = re.compile(url1)
    regex2 = re.compile(url2)
    cnt1 = 0
    cnt2 = 0
    # keep only the result links that point to cnblogs or CSDN
    links = [link for link in get_links(html)
             if re.match(regex1, link) or re.match(regex2, link)]
    for link in links:
        link = union(seed_url, link)
        html = download(link, user_agent=user_agent)
        html = unicode(html, "utf-8")  # the clumsy workaround for the encoding problem
        tree = lxml.html.fromstring(html)
        text = tree.cssselect('title')[0].text_content()
        # a matching page title contains the OJ name, the problem number, and the site suffix
        regex1 = re.compile(u'%s([\s\S]*)%s([\s\S]*)博客园' % (ojname, problemnum), re.IGNORECASE)
        regex2 = re.compile(u'%s([\s\S]*)%s([\s\S]*)CSDN.NET' % (ojname, problemnum), re.IGNORECASE)
        if re.search(regex1, text):
            filename = ojname + problemnum + '_' + str(cnt1)
            if link_crawler_cnblogs(link, user_agent=user_agent, filename=filename):
                cnt1 += 1
        if re.search(regex2, text):
            filename = ojname + problemnum + '_' + str(cnt2)
            if link_crawler_csdn(link, user_agent=user_agent, filename=filename):
                cnt2 += 1

def union(seed_url, link):
    """ Join link onto seed_url """
    link, _ = urlparse.urldefrag(link)  # split link into (url without fragment, fragment) and drop the fragment
    return urlparse.urljoin(seed_url, link)

def get_links(html):
    """ Extract the outbound links from html """
    webpage_regex = re.compile('href=["\'](.*?)["\']', re.IGNORECASE)  # case-insensitive
    # return all links as a list
    return webpage_regex.findall(html)
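As a quick sanity check of the two helpers, this is what get_links() and union() return on a tiny hand-written snippet (the HTML and URLs below are made up purely for illustration):

#coding:utf-8
from link_crawler_baidu import get_links, union

snippet = '<a href="http://www.cnblogs.com/foo/p/1.html#top">A</a> <a href="/s?wd=hdu1006">B</a>'
print get_links(snippet)
# ['http://www.cnblogs.com/foo/p/1.html#top', '/s?wd=hdu1006']
print union('http://www.baidu.com/s', '/s?wd=hdu1006')
# 'http://www.baidu.com/s?wd=hdu1006'  (relative link resolved against the seed URL)
print union('http://www.baidu.com/s', 'http://www.cnblogs.com/foo/p/1.html#top')
# 'http://www.cnblogs.com/foo/p/1.html'  (fragment stripped)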
Grabbing the code from a cnblogs post:
#coding:utf-8
'''
Created on 2016-11-22
@author: liyinggang
'''
from download import download
import lxml.html
import re

def link_crawler_cnblogs(url, user_agent=None, filename='filename'):
    html = download(url, user_agent=user_agent)
    html = unicode(html, "utf-8")
    tree = lxml.html.fromstring(html)
    texts = tree.cssselect('pre')  # the solution code usually sits inside <pre> blocks
    regex = re.compile('^(#include|import)([\s\S]*)main\(\)')  # real solution code should contain a main() function
    flag = False
    for text in texts:
        text = text.text_content()
        if re.search(regex, text):
            flag = True
            f = open("D:\\download\\cnblogs\\%s.txt" % filename, "w")
            f.write(text.encode('utf-8'))  # encode back to utf-8 before writing to the file
            f.close()
            break
    return flag
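Note that open() will fail if the target folder does not exist yet, so it may be worth creating the download directories once up front; a minimal sketch using the paths from the code above:

#coding:utf-8
import os

# create the output folders used by the cnblogs and CSDN crawlers if they are missing
for d in ('D:\\download\\cnblogs', 'D:\\download\\csdn'):
    if not os.path.isdir(d):
        os.makedirs(d)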
Grabbing the code from a CSDN post:
#coding:utf-8
'''
Created on 2016-11-22
@author: liyinggang
'''
from download import download
import lxml.html
import re

def link_crawler_csdn(url, user_agent=None, filename='filename'):
    html = download(url, user_agent=user_agent)
    html = unicode(html, "utf-8")
    tree = lxml.html.fromstring(html)
    texts = tree.cssselect('pre')
    texts.extend(tree.cssselect('p > textarea.cpp'))  # some CSDN posts put code in a textarea with class "cpp" rather than <pre>
    flag = False
    regex = re.compile('^(#include|import)([\s\S]*)main\(\)')  # real solution code should contain a main() function
    for text in texts:
        text = text.text_content()
        if re.search(regex, text):
            flag = True
            f = open("D:\\download\\csdn\\%s.txt" % filename, "w")
            f.write(text.encode('utf-8'))  # encode back to utf-8 before writing to the file
            f.close()
            break
    return flag
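Both crawlers rely on the same heuristic: a block counts as source code only if it starts with #include or import and mentions main() somewhere after that. A quick check of the regex on made-up strings:

#coding:utf-8
import re

regex = re.compile('^(#include|import)([\s\S]*)main\(\)')

# a C/C++ solution: starts with #include and contains main()
print bool(re.search(regex, '#include <cstdio>\nint main(){ return 0; }'))  # True
# ordinary prose from a blog post is rejected
print bool(re.search(regex, 'This post only explains the idea, no code here.'))  # False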
The code that downloads a web page:
#coding:utf-8
'''
Created on 2016-11-20
@author: admin
'''
import urllib2
import urlparse

def download(url, user_agent='lyg', proxy=None, retest=5):
    # user_agent sets the request's User-Agent, proxy optionally enables a proxy, retest is how many retries are allowed
    """ This method is to download the website source code """
    print 'Downloading:', url
    headers = {'User-agent': user_agent}  # the default Python-urllib/2.7 agent may get blocked by some sites
    request = urllib2.Request(url, headers=headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
        if retest > 0:
            if hasattr(e, 'code') and 500 <= e.code <= 600:
                # retry only on 5xx server errors; a 404 is not worth retrying
                return download(url, user_agent=user_agent, proxy=proxy, retest=retest - 1)
    return html
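A tiny usage check of download() (the URL here is just an example):

#coding:utf-8
from download import download

# fetch a page with a custom User-Agent; returns the raw HTML string, or None on failure
html = download('http://www.baidu.com', user_agent='lyg')
if html is not None:
    print len(html)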
The result: