It occurred to me: what if cnblogs goes down one day? What happens to all the posts I've written there? Couldn't I just download them and keep a local copy? I had just come across a library called requests, so why not give it a try.
With requests and BeautifulSoup the code is actually quite simple. The only thing to watch out for is not to call requests.get too frequently to fetch pages, otherwise the requests start failing. Most sites presumably have some self-protection mechanism like this to keep crawlers from hammering them to death.
import requests
from BeautifulSoup import BeautifulSoup
import re
import os
import time

URL = 'http://www.cnblogs.com/fangwenyu/p/'
URL_PATTERN = 'http://www.cnblogs.com/fangwenyu/p|archive'
pattern = re.compile(URL_PATTERN)
DIRECTORY = os.path.dirname(__file__)
ESCAPE_CHARS = '/\:*?"<>|'  # These characters are not allowed in file names on Windows.
tbl = {ord(char): u'' for char in ESCAPE_CHARS}

# Get the total page number.
page_count = 0
resp = requests.get(URL)
if resp.status_code == requests.codes.ok:
    soup = BeautifulSoup(resp.content)
    attr = {'class': 'Pager'}
    result = soup.find('div', attr)
    page_count = int(result.getText()[1:2])

with open(os.path.join(DIRECTORY, 'blog_archive.txt'), 'w') as blog_archive:
    for page in range(1, page_count + 1):
        param = {'page': page}
        resp = requests.get(URL, params=param)
        soup = BeautifulSoup(resp.content, convertEntities=BeautifulSoup.HTML_ENTITIES)
        blog_list = [(a.getText(), a.get('href'))
                     for a in soup.findAll('a', id=True, href=pattern)]
        for title, link in blog_list:
            norm_title = title.translate(tbl)
            item = '%s |[%s]| %s ' % (title, norm_title, link)
            blog_archive.write(item.encode('utf-8'))
            blog_archive.write('\n')
            with open(os.path.join(DIRECTORY, norm_title + '.html'), 'w') as f:
                f.write(requests.get(link).content)
            # Sleep for a while; accessing cnblogs too frequently makes the server stop responding,
            # with errors like:
            # requests.exceptions.ConnectionError: ('Connection aborted.', error(10060, 'A connection attempt failed
            # because the connected party did not properly respond after a period of time, or established connection failed
            # because connected host has failed to respond'))
            time.sleep(5)
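The script above targets Python 2 and the old BeautifulSoup 3 API (findAll, convertEntities). For readers on Python 3, a rough equivalent using beautifulsoup4 might look like the sketch below; the blog URL, the Pager div, and the link-filtering regex are carried over from the original, while the fetch helper, the 'html.parser' choice, and the variable names are my own assumptions rather than the original code.

# A minimal Python 3 / beautifulsoup4 sketch of the same idea (not the original script).
import os
import re
import time

import requests
from bs4 import BeautifulSoup

URL = 'http://www.cnblogs.com/fangwenyu/p/'
LINK_PATTERN = re.compile('http://www.cnblogs.com/fangwenyu/p|archive')
DIRECTORY = os.path.dirname(os.path.abspath(__file__))
ESCAPE_CHARS = '/\\:*?"<>|'  # not allowed in Windows file names
TBL = {ord(c): '' for c in ESCAPE_CHARS}


def fetch(url, **kwargs):
    # Pause before every request so cnblogs is not hit too frequently (5 seconds is an assumption).
    time.sleep(5)
    return requests.get(url, **kwargs)


resp = fetch(URL)
resp.raise_for_status()
soup = BeautifulSoup(resp.content, 'html.parser')
pager = soup.find('div', {'class': 'Pager'})
# Same single-digit page-count assumption as the original script.
page_count = int(pager.get_text()[1:2]) if pager else 1

with open(os.path.join(DIRECTORY, 'blog_archive.txt'), 'w', encoding='utf-8') as blog_archive:
    for page in range(1, page_count + 1):
        soup = BeautifulSoup(fetch(URL, params={'page': page}).content, 'html.parser')
        links = [(a.get_text(), a['href'])
                 for a in soup.find_all('a', id=True, href=LINK_PATTERN)]
        for title, link in links:
            norm_title = title.translate(TBL)
            blog_archive.write('%s |[%s]| %s\n' % (title, norm_title, link))
            # Save the raw HTML of each post next to the archive index.
            with open(os.path.join(DIRECTORY, norm_title + '.html'), 'wb') as f:
                f.write(fetch(link).content)

Putting the delay inside a single fetch helper keeps the throttling in one place, so every request to cnblogs is rate-limited the same way instead of relying on a sleep at the end of the loop.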