Crawler: Downloading Pages

Introduction

A crawler that downloads a page's HTML.

Code

Simple download

#!/usr/bin/env python
# coding=utf-8
import urllib2

def download(url):
    # download a page and return its HTML, or None on failure
    print 'Downloading:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
    return html

if __name__ == '__main__':
    download('http://www.baidu.com')

It seems that Baidu's HTML was not actually downloaded.
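A quick way to check whether the request really succeeded is to inspect the return value; a minimal sketch, assuming the download() function above:

html = download('http://www.baidu.com')
if html is not None:
    # the length of the response body confirms the page was fetched
    print 'Received %d bytes' % len(html)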

Retrying downloads on 5XX server errors and setting a user agent

Many websites do not like being visited by crawlers but cannot block them completely, so they put anti-scraping measures in place. One example is the User-Agent (UA). The User-Agent is sent in the request headers, and the server inspects it to work out who is making the request.
Different browsers send different User-Agent strings; a crawler that does not set one is easy to identify and may have its access restricted. The usual practice is to collect many different User-Agent strings and pick one at random for each request (a sketch of this follows the code below).
def download(url, user_agent='wswp', num_retries=2):
    print 'Downloading:', url
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry on 5XX HTTP errors
                return download(url, user_agent, num_retries - 1)
    return html
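As mentioned above, a common tactic is to rotate through a pool of User-Agent strings instead of using a fixed one. A minimal sketch that builds on the download() function above; the strings in USER_AGENTS are purely illustrative:

import random

# illustrative pool of User-Agent strings (not from the original article)
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6)',
    'Mozilla/5.0 (X11; Linux x86_64)',
]

def download_random_ua(url, num_retries=2):
    # pick a different User-Agent for each request, then delegate to download()
    return download(url, user_agent=random.choice(USER_AGENTS), num_retries=num_retries)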

Using the sitemap to download the relevant pages

import re

def crawl_sitemap(url):
    # download the sitemap file
    sitemap = download(url)
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = download(link)
        print link
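For example, assuming the site exposes its sitemap at the conventional /sitemap.xml path (an assumption, not stated in the original):

if __name__ == '__main__':
    crawl_sitemap('http://example.webscraping.com/sitemap.xml')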

The site may ignore the leading part of the URL path, so the pages can be crawled using only the trailing numeric ID.

import itertools

def crawl_string():
    # iterate over the numeric IDs until a download fails
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/view/-%d' % page
        html = download(url)
        if html is None:
            break
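One weakness of this loop is that a single missing ID ends the crawl. A sketch that tolerates a few consecutive failures instead; the max_errors threshold is an assumption, not part of the original code:

def crawl_ids(max_errors=5):
    # stop only after several consecutive failed downloads,
    # so a gap in the ID sequence does not end the crawl early
    num_errors = 0
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/view/-%d' % page
        html = download(url)
        if html is None:
            num_errors += 1
            if num_errors == max_errors:
                break
        else:
            num_errors = 0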

Crawling the site by following the links found on each page

import re
import urlparse

def get_links(html):
    # return every href value found in the page
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)

def link_crawler(seed_url, link_regex):
    crawl_queue = [seed_url]
    # keep track of which URLs have been seen before
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        if html is None:
            continue
        print 'getlinks', get_links(html)
        for link in get_links(html):
            if re.match(link_regex, link):
                # resolve relative links against the seed URL
                link = urlparse.urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)

if __name__ == '__main__':
    link_crawler('http://example.webscraping.com', '/places/default/(index|view)')

Adding support for parsing robots.txt

import robotparser

def link_crawler(seed_url, link_regex):
    rp = robotparser.RobotFileParser()
    rp.set_url(seed_url + '/robots.txt')
    rp.read()
    crawl_queue = [seed_url]
    # keep track of which URLs have been seen before
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        user_agent = 'wswp'
        if rp.can_fetch(user_agent, url):
            html = download(url)
            if html is None:
                continue
            print 'getlinks', get_links(html)
            for link in get_links(html):
                if re.match(link_regex, link):
                    link = urlparse.urljoin(seed_url, link)
                    if link not in seen:
                        seen.add(link)
                        crawl_queue.append(link)
        else:
            print 'Blocked by robots.txt:', url
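To see what can_fetch is consulting, robotparser can also be exercised on its own. A minimal sketch; the rules quoted in the comment are a generic example, not the site's actual robots.txt:

import robotparser

# if the site's robots.txt contained, for example:
#   User-agent: BadCrawler
#   Disallow: /
# then can_fetch would return False for 'BadCrawler' and True for other agents
rp = robotparser.RobotFileParser()
rp.set_url('http://example.webscraping.com/robots.txt')
rp.read()
print rp.can_fetch('wswp', 'http://example.webscraping.com/places/default/index')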

Proxy

def link_crawler(seed_url, link_regex, proxy=False):
    if proxy:  # proxy support is not working yet
        proxy_info = {
            'host': '106.12.38.133',
            'port': 22
        }
        # create a handler for the proxy
        proxy_support = urllib2.ProxyHandler({'http': 'http://%(host)s:%(port)d' % proxy_info})
        # create an opener which uses this handler
        opener = urllib2.build_opener(proxy_support)
        # install this opener as the default opener for urllib2
        urllib2.install_opener(opener)

        # if the proxy requires authentication (credentials below are placeholders)
        proxy_info = {
            'host': '106.12.38.133',
            'port': 20,
            'user': 'proxy_user',
            'pass': 'proxy_password'
        }
        proxy_support = urllib2.ProxyHandler({'http': 'http://%(user)s:%(pass)s@%(host)s:%(port)d' % proxy_info})
        opener = urllib2.build_opener(proxy_support)
        urllib2.install_opener(opener)
        # htmlpage = urllib2.urlopen("http://sebsauvage.net/").read(200000)

    rp = robotparser.RobotFileParser()
    rp.set_url(seed_url + '/robots.txt')
    rp.read()
    crawl_queue = [seed_url]
    # keep track of which URLs have been seen before
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        user_agent = 'wswp'
        if rp.can_fetch(user_agent, url):
            html = download(url)
            if html is None:
                continue
            print 'getlinks', get_links(html)
            for link in get_links(html):
                if re.match(link_regex, link):
                    link = urlparse.urljoin(seed_url, link)
                    if link not in seen:
                        seen.add(link)
                        crawl_queue.append(link)
        else:
            print 'Blocked by robots.txt:', url
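Instead of installing a global opener, the proxy can also be applied per request inside the download function. A minimal sketch, not the original implementation; the proxy argument (a string such as 'http://host:port') is an assumption:

def download_with_proxy(url, user_agent='wswp', proxy=None, num_retries=2):
    print 'Downloading:', url
    request = urllib2.Request(url, headers={'User-agent': user_agent})
    opener = urllib2.build_opener()
    if proxy:
        # route only this opener's HTTP traffic through the proxy
        opener.add_handler(urllib2.ProxyHandler({'http': proxy}))
    try:
        html = opener.open(request).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            # retry on 5XX HTTP errors
            return download_with_proxy(url, user_agent, proxy, num_retries - 1)
    return html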

Download throttling

import time
import datetime
import urlparse

class Throttle:
    """Add a delay between downloads to the same domain; call wait() before each download."""
    def __init__(self, delay):
        self.delay = delay
        # timestamp of the last access to each domain
        self.domains = {}

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.datetime.now()
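A sketch of how the throttle plugs into the crawler, assuming the download() function above; the 1-second delay is an example value:

throttle = Throttle(delay=1)

def throttled_download(url, user_agent='wswp', num_retries=2):
    # wait until the per-domain delay has elapsed, then download as before
    throttle.wait(url)
    return download(url, user_agent, num_retries)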

References

https://tieba.baidu.com/p/5832236970
