Crawler: downloading a page's HTML
#!/usr/bin/env python
# coding=utf-8
import urllib2

def download(url):
    print('Download:', url)
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print('Download error:', e.reason)
        html = None
    return html

if __name__ == '__main__':
    download('http://www.baidu.com')
Yet it seems that Baidu's HTML is not actually being downloaded.
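A quick way to see what actually came back is to inspect the return value of download(); this is just a minimal diagnostic sketch, not part of the original script:

# Hypothetical sanity check: inspect what download() actually returned.
html = download('http://www.baidu.com')
if html is None:
    print('Nothing was downloaded')
else:
    print('Downloaded %d bytes' % len(html))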
Many websites dislike being visited by crawlers but have no way to block them completely, so they set up anti-crawling measures instead. One example is the User Agent (UA). The User Agent is carried in the request headers, and the server identifies who is visiting by inspecting it. Different browsers send different User Agents; if a crawler does not set one, it is easy to detect and its access may be restricted. The usual countermeasure is to collect many different User Agents and pick one at random for each request.
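As a minimal sketch of that idea (the User-Agent strings and the helper name below are only illustrative assumptions), one might keep a small pool and choose from it at random:

import random
import urllib2

# Example pool of User-Agent strings (illustrative values only).
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6)',
    'Mozilla/5.0 (X11; Linux x86_64)',
]

def build_request(url):
    # Attach a randomly chosen User-Agent so requests look less uniform.
    headers = {'User-agent': random.choice(USER_AGENTS)}
    return urllib2.Request(url, headers=headers)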
def download(url, user_agent='wswp', num_retries=2):
    print 'Downloading:', url
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                return download(url, user_agent, num_retries-1)
    return html
import re

def crawl_sitemap(url):
    # download the sitemap file
    sitemap = download(url)
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = download(link)
        print link
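Assuming the site exposes its sitemap at the conventional path (the URL below is an assumption for illustration), the crawl could be started like this:

# Hypothetical entry point; the sitemap URL is assumed, not taken from the site.
if __name__ == '__main__':
    crawl_sitemap('http://example.webscraping.com/sitemap.xml')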
import itertools

def crawl_string():
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/view/-%d' % page
        html = download(url)
        if html is None:
            break
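One drawback of this loop is that a single missing ID aborts the whole crawl. A common refinement, sketched here under the assumption that gaps in the ID sequence are small, is to give up only after several consecutive failures:

import itertools

def crawl_ids(max_errors=5):
    # Stop only after max_errors consecutive download failures,
    # so an occasional gap in the ID sequence does not end the crawl.
    num_errors = 0
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/view/-%d' % page
        html = download(url)
        if html is None:
            num_errors += 1
            if num_errors == max_errors:
                break
        else:
            num_errors = 0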
import re
import urlparse

def get_links(html):
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)

def link_crawler(seed_url, link_regex):
    crawl_queue = [seed_url]
    # keep track of which URLs have been seen before
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        print "getlinks", get_links(html)
        for link in get_links(html):
            if re.match(link_regex, link):
                link = urlparse.urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)

if __name__ == '__main__':
    link_crawler('http://example.webscraping.com', '/places/default/(index|view)')
import robotparser

def link_crawler(seed_url, link_regex):
    rp = robotparser.RobotFileParser()
    rp.set_url(seed_url + '/robots.txt')
    rp.read()
    crawl_queue = [seed_url]
    # keep track of which URLs have been seen before
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        user_agent = 'wswp'
        if rp.can_fetch(user_agent, url):
            html = download(url)
            print "getlinks", get_links(html)
            for link in get_links(html):
                if re.match(link_regex, link):
                    link = urlparse.urljoin(seed_url, link)
                    if link not in seen:
                        seen.add(link)
                        crawl_queue.append(link)
        else:
            print 'Blocked by robots.txt:', url
def link_crawler(seed_url, link_regex, proxy=False):
    if proxy:
        # the proxy is not working for now
        proxy_info = {
            'host': '106.12.38.133',
            'port': 22
        }
        # We create a handler for the proxy
        proxy_support = urllib2.ProxyHandler({"http": "http://%(host)s:%(port)d" % proxy_info})
        # We create an opener which uses this handler:
        opener = urllib2.build_opener(proxy_support)
        # Then we install this opener as the default opener for urllib2:
        urllib2.install_opener(opener)

        # If the proxy requires authentication
        proxy_info = {
            'host': '106.12.38.133',
            'port': 20,
            'user': 'root',
            'pass': 'Woaini7758258!'
        }
        proxy_support = urllib2.ProxyHandler({"http": "http://%(user)s:%(pass)s@%(host)s:%(port)d" % proxy_info})
        opener = urllib2.build_opener(proxy_support)
        urllib2.install_opener(opener)
        # htmlpage = urllib2.urlopen("http://sebsauvage.net/").read(200000)

    rp = robotparser.RobotFileParser()
    rp.set_url(seed_url + '/robots.txt')
    rp.read()
    crawl_queue = [seed_url]
    # keep track of which URLs have been seen before
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        user_agent = 'wswp'
        if rp.can_fetch(user_agent, url):
            html = download(url)
            print "getlinks", get_links(html)
            for link in get_links(html):
                if re.match(link_regex, link):
                    link = urlparse.urljoin(seed_url, link)
                    if link not in seen:
                        seen.add(link)
                        crawl_queue.append(link)
        else:
            print 'Blocked by robots.txt:', url
import time
import datetime
import urlparse

class Throttle:
    """Add a delay between downloads to the same domain.

    Call wait() before each download.
    """
    def __init__(self, delay):
        self.delay = delay
        # timestamp of the last access to each domain
        self.domains = {}

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.datetime.now()
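Here is a minimal sketch of how the throttle might be wired in front of download(); the 1-second delay and the wrapper name are assumptions, not part of the original crawler:

# Hypothetical integration: throttle each request before calling download().
throttle = Throttle(delay=1)

def throttled_download(url):
    throttle.wait(url)   # sleep if this domain was hit too recently
    return download(url)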
https://tieba.baidu.com/p/5832236970