Python practice: a web crawler

I haven't updated this blog in a long time. I've recently been teaching myself Python, so I wrote a crawler that searches Baidu for solutions to algorithm problems. It's my first crawler, purely for practice.

Taking it one step at a time..

#coding:utf-8
'''
Created on 2016-11-22

@author: liyinggang
'''
from link_crawler_baidu import link_crawler_baidu
import urllib
baseUrl = 'http://www.baidu.com/s'
page = 1 # which results page to fetch
ojname = 'hdu'
problemnum = '1006'
word = ojname + problemnum
data = {'wd':word,'pn':str(page-1)+'0','tn':'baidurt','ie':'utf-8','bsst':'1'}
data = urllib.urlencode(data)
url = baseUrl+'?'+data
print url
link_crawler_baidu(url,'lyg',ojname,problemnum)

First, entering Baidu's search page; that's main.py above.
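
If you are curious what that request actually looks like, here is a throwaway sketch (not part of the original script) that prints the query string urllib.urlencode builds; dict key order is arbitrary in Python 2, so the parameter order may differ.

#coding:utf-8
# Throwaway sketch: show the query string main.py sends to Baidu (parameter order may vary).
import urllib
data = {'wd': 'hdu1006', 'pn': '00', 'tn': 'baidurt', 'ie': 'utf-8', 'bsst': '1'}
print 'http://www.baidu.com/s?' + urllib.urlencode(data)
# e.g. http://www.baidu.com/s?wd=hdu1006&tn=baidurt&ie=utf-8&pn=00&bsst=1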

Then I filter out the cnblogs and CSDN links from the results (the encoding issue was really hard to sort out; in the end a Baidu search turned up a clumsy workaround).

#coding:utf-8
'''
Created on 2016-11-22

@author: liyinggang
'''
import re,lxml.html,urlparse
from download import download
import sys
from link_crawler_cnblogs import link_crawler_cnblogs
from link_crawler_csdn import link_crawler_csdn
reload(sys)  # has no practical effect here; the encoding is handled below with unicode(html, "utf-8")
def link_crawler_baidu(seed_url,user_agent=None,ojname='hdu',problemnum='1001'):
    html = download(seed_url,user_agent=user_agent)
    url1 = "http://www.cnblogs.com/";
    url2 = "http://blog.csdn.net/";
    regex1 = re.compile(url1)
    regex2 = re.compile(url2)
    cnt1 = 0
    cnt2 = 0
    links = [link for link in get_links(html) if \
             re.match(regex1,link) or re.match(regex2, link)]
    for link in links:
        link = union(seed_url,link)
        html = download(link,user_agent=user_agent)
        html = unicode(html, "utf-8")  # crude workaround for the encoding problem
        tree = lxml.html.fromstring(html)
        text = tree.cssselect('title')[0].text_content()
        regex1 = re.compile(u'%s([\s\S]*)%s([\s\S]*)博客園'%(ojname,problemnum),re.IGNORECASE)
        regex2 = re.compile(u'%s([\s\S]*)%s([\s\S]*)CSDN.NET'%(ojname,problemnum),re.IGNORECASE)
        if(re.search(regex1,text)):
            filename = ojname+problemnum+'_'+str(cnt1) 
            if(link_crawler_cnblogs(link,user_agent=user_agent,filename=filename)):
                cnt1+=1
        if(re.search(regex2,text)): 
            filename = ojname+problemnum+'_'+str(cnt2) 
            if(link_crawler_csdn(link,user_agent=user_agent,filename=filename)):
                cnt2+=1
        
def union(seed_url,link):
    """
    Join seed_url and link into one absolute URL.
    """
    link, _ = urlparse.urldefrag(link)  # split link into the URL without its fragment and the discarded fragment
    return urlparse.urljoin(seed_url, link)

def get_links(html):
    """
    Extract all outgoing links from the html source.
    """
    webpage_regex = re.compile('href=["\'](.*?)["\']', re.IGNORECASE)  # case-insensitive
    # return the list of all matched links
    return webpage_regex.findall(html)
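
One note on the encoding workaround: unicode(html, "utf-8") above will blow up as soon as a result page is not actually UTF-8. A minimal alternative sketch, assuming the third-party chardet package is installed, detects the encoding before decoding:

#coding:utf-8
# Alternative sketch (not in the original crawler): guess the page encoding with chardet
# instead of assuming UTF-8. Assumes chardet is installed (pip install chardet).
import chardet

def to_unicode(raw_html):
    if isinstance(raw_html, unicode):    # already decoded, nothing to do
        return raw_html
    guess = chardet.detect(raw_html)     # e.g. {'encoding': 'utf-8', 'confidence': 0.99, ...}
    encoding = guess.get('encoding') or 'utf-8'
    return raw_html.decode(encoding, 'replace')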

Fetching the code from cnblogs:

#coding:utf-8
'''
Created on 2016-11-22

@author: liyinggang
'''
from download import download
import lxml.html
import re
def link_crawler_cnblogs(url,user_agent=None,filename='filename'):
    html = download(url,user_agent=user_agent)
    html = unicode(html,"utf-8")
    tree = lxml.html.fromstring(html)
    texts = tree.cssselect('pre')
    regex = re.compile('^(#include|import)([\s\S]*)main\(\)')  # if the block really is code, it should contain a main() function
    flag = False
    for text in texts:
        text = text.text_content()
        if(re.search(regex, text)):
            flag = True
            f = open("D:\\download\\cnblogs\\%s.txt"%filename,"w")
            f.write(text)
            f.close()
            break
    return flag
    
    

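A quick throwaway check (not part of the original code) of what that main()-based heuristic accepts and rejects:

#coding:utf-8
# Throwaway sketch: exercise the heuristic regex used above to decide whether a <pre> block is code.
import re
regex = re.compile('^(#include|import)([\s\S]*)main\(\)')
print bool(regex.search('#include <stdio.h>\nint main()\n{ return 0; }'))  # True
print bool(regex.search('Here is my analysis of hdu 1006 ...'))            # False
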
Fetching the code from CSDN:

#coding:utf-8
'''
Created on 2016-11-22

@author: liyinggang
'''
from download import download
import lxml.html
import re
def link_crawler_csdn(url,user_agent=None,filename='filename'):
    html = download(url,user_agent=user_agent)
    html = unicode(html,"utf-8")
    tree = lxml.html.fromstring(html)
    texts = tree.cssselect('pre')
    texts.extend(tree.cssselect('p > textarea.cpp'))
    flag = False
    regex = re.compile('^(#include|import)([\s\S]*)main\(\)')  # if the block really is code, it should contain a main() function
    for text in texts:
        text = text.text_content()
        if(re.search(regex, text)):
            flag = True
            f = open("D:\\download\\csdn\\%s.txt"%filename,"w")
            f.write(text)
            f.close()
            break
    return flag
    

The code that downloads a web page:

#coding:utf-8
'''
Created on 2016-11-20

@author: admin
'''
import urllib2
import urlparse
def download(url,user_agent='lyg',proxy=None,retest=5):
    """
    Download the page source of url.
    user_agent sets the User-Agent header, proxy is an optional proxy,
    and retest is how many times to retry on server errors.
    """
    print 'Downloading:',url
    headers = {'User-agent':user_agent}  # set the User-Agent; the default Python-urllib/2.7 may be blocked by some sites
    request = urllib2.Request(url,headers=headers)
    
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:    
        html = opener.open(request).read()
    except urllib2.URLError as e:
        print 'Download error:',e.reason
        html = None
        if retest > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry only on 5xx server errors; errors such as 404 are not worth retrying
                return download(url, user_agent, proxy, retest-1)
    return html
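
For reference, a minimal usage sketch of download() on its own (the URL is just an example):

#coding:utf-8
# Usage sketch for download(); assumes download.py is on the import path.
from download import download

html = download('http://www.cnblogs.com/', user_agent='lyg', retest=3)
if html:
    print 'fetched %d bytes' % len(html)
else:
    print 'download failed'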

The result:
