from urllib.request import urlopen

# Fetch the exercise page and print its raw bytes.  The ``with`` block
# closes the HTTP response (and its underlying socket) once the body has
# been read, instead of leaving it open until interpreter exit.
with urlopen("http://www.pythonscraping.com/exercises/exercise1.html") as html:
    print(html.read())
$ python3 1-basicExample.py b'<html>\n<head>\n<title>A Useful Page</title>\n</head>\n<body>\n<h1>An Interesting Title</h1>\n<div>\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n</div>\n</body>\n</html>\n'
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Download the page and parse it with the lxml parser.  The response is
# closed as soon as its body has been handed to BeautifulSoup.
with urlopen("http://www.pythonscraping.com/pages/page1.html") as html:
    bsObj = BeautifulSoup(html.read(), 'lxml')

# Print the first <h1> tag found in the document.
print(bsObj.h1)
$ python3 2-beautifulSoup.py <h1>An Interesting Title</h1>
• html → <html><head>...</head><body>...</body></html> — head → <head><title>A Useful Page</title></head> — title → <title>A Useful Page</title> — body → <body><h1>An Int...</h1><div>Lorem ip...</div></body> — h1 → <h1>An Interesting Title</h1> — div → <div>Lorem Ipsum dolor...</div>
bsObj.html.body.h1 bsObj.body.h1 bsObj.html.h1
•在服務器上找不到該頁面(或獲取錯誤), 404或者500
from urllib.error import HTTPError
from urllib.request import urlopen

try:
    html = urlopen("http://www.pythonscraping.com/pages/page1.html")
except HTTPError as e:
    # The server answered with an HTTP error status (e.g. 404 or 500).
    print(e)
    # return None, break, or do some other "Plan B"
else:
    # program continues. Note: If you return or break in the
    # exception catch, you do not need to use the "else" statement
    pass  # placeholder so the empty "else" suite is valid Python
本文摘要自Web Scraping with Python - 2015
推薦的python基礎教程: http://www.diveintopython.net
交流:python開發自動化測試羣291184506 PythonJava單元白盒測試羣144081101
# section 1 User-agent: BadCrawler Disallow: / # section 2 User-agent: * Crawl-delay: 5 Disallow: /trap # section 3 Sitemap: http://example.webscraping.com/sitemap.xml
更多關於web機器人的介紹參見 http://www.robotstxt.org。
Sitemap的協議: http://www.sitemaps.org/protocol.html,好比:
<?xml version="1.0" encoding="UTF-8"?> <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> <url><loc>http://example.webscraping.com/view/Afghanistan-1 </loc></url> <url><loc>http://example.webscraping.com/view/Aland-Islands-2 </loc></url> <url><loc>http://example.webscraping.com/view/Albania-3</loc> </url> ... </urlset>
經過google的site查詢 好比:site:automationtesting.sinaapp.com
# pip install builtwith # ipython In [1]: import builtwith In [2]: builtwith.parse('http://automationtesting.sinaapp.com/') Out[2]: {u'issue-trackers': [u'Trac'], u'javascript-frameworks': [u'jQuery'], u'programming-languages': [u'Python'], u'web-servers': [u'Nginx']}
# pip install python-whois # ipython In [1]: import whois In [2]: print whois.whois('http://automationtesting.sinaapp.com') { "updated_date": "2016-01-07 00:00:00", "status": [ "serverDeleteProhibited https://www.icann.org/epp#serverDeleteProhibited", "serverTransferProhibited https://www.icann.org/epp#serverTransferProhibited", "serverUpdateProhibited https://www.icann.org/epp#serverUpdateProhibited" ], "name": null, "dnssec": null, "city": null, "expiration_date": "2021-06-29 00:00:00", "zipcode": null, "domain_name": "SINAAPP.COM", "country": null, "whois_server": "whois.paycenter.com.cn", "state": null, "registrar": "XIN NET TECHNOLOGY CORPORATION", "referral_url": "http://www.xinnet.com", "address": null, "name_servers": [ "NS1.SINAAPP.COM", "NS2.SINAAPP.COM", "NS3.SINAAPP.COM", "NS4.SINAAPP.COM" ], "org": null, "creation_date": "2009-06-29 00:00:00", "emails": null }
import urllib2 def download(url): print 'Downloading:', url try: html = urllib2.urlopen(url).read() except urllib2.URLError as e: print 'Download error:', e.reason html = None return html
import urllib2 def download(url, num_retries=2): print 'Downloading:', url try: html = urllib2.urlopen(url).read() except urllib2.URLError as e: print 'Download error:', e.reason html = None if num_retries > 0: if hasattr(e, 'code') and 500 <= e.code < 600: # recursively retry 5xx HTTP errors return download(url, num_retries-1) return html
http://httpstat.us/500 會返回500,能夠用它來測試下:
>>> download('http://httpstat.us/500') Downloading: http://httpstat.us/500 Download error: Internal Server Error Downloading: http://httpstat.us/500 Download error: Internal Server Error Downloading: http://httpstat.us/500 Download error: Internal Server Error
設置 user agent:
urllib2默認的user agent是「Python-urllib/2.7」,不少網站會對此進行攔截, 推薦使用接近真實的agent,好比
Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0
爲此咱們增長user agent設置:
import urllib2 def download(url, user_agent='Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0', num_retries=2): print 'Downloading:', url headers = {'User-agent': user_agent} request = urllib2.Request(url, headers=headers) try: html = urllib2.urlopen(request).read() except urllib2.URLError as e: print 'Download error:', e.reason html = None if num_retries > 0: if hasattr(e, 'code') and 500 <= e.code < 600: # recursively retry 5xx HTTP errors return download(url, num_retries-1) return html
def crawl_sitemap(url):
    """Download every page listed in the sitemap found at *url*.

    The sitemap is fetched with download(); each ``<loc>`` entry is then
    downloaded in turn.  If the sitemap itself cannot be downloaded
    (download() returned None) the function returns without crawling.
    """
    # download the sitemap file
    sitemap = download(url)
    if sitemap is None:
        # re.findall() raises TypeError on None, so stop here; download()
        # has already printed the error.
        return
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = download(link)
        # scrape html here
        # ...
• http://example.webscraping.com/view/Afghanistan-1
• http://example.webscraping.com/view/Australia-2
• http://example.webscraping.com/view/Brazil-3
http://example.webscraping.com/view/1 ,這樣咱們就能夠根據數據庫的id抓取網頁。
# Walk the database IDs 1, 2, 3, ... and stop at the first page that
# fails to download.
for page in itertools.count(1):
    url = 'http://example.webscraping.com/view/-%d' % page
    html = download(url)
    if html is not None:
        # success - can scrape the result
        continue
    # download failed: stop iterating
    break
# maximum number of consecutive download errors allowed
max_errors = 5
# current number of consecutive download errors
num_errors = 0

# Walk the database IDs 1, 2, 3, ... and tolerate failed downloads until
# max_errors of them occur in a row.
for page in itertools.count(1):
    url = 'http://example.webscraping.com/view/-%d' % page
    html = download(url)
    if html is not None:
        # success - can scrape the result
        # ...
        num_errors = 0
        continue
    # received an error trying to download this webpage
    num_errors += 1
    if num_errors == max_errors:
        # reached maximum number of consecutive errors so exit
        break
In [1]: import re In [2]: import common In [3]: url = 'http://example.webscraping.com/view/UnitedKingdom-239' In [4]: html = common.download(url) Downloading: http://example.webscraping.com/view/UnitedKingdom-239 In [5]: re.findall('<td class="w2p_fw">(.*?)</td>', html) Out[5]: ['<img src="/places/static/images/flags/gb.png" />', '244,820 square kilometres', '62,348,447', 'GB', 'United Kingdom', 'London', '<a href="/continent/EU">EU</a>', '.uk', 'GBP', 'Pound', '44', '@# #@@|@## #@@|@@# #@@|@@## #@@|@#@ #@@|@@#@ #@@|GIR0AA', '^(([A-Z]\\d{2}[A-Z]{2})|([A-Z]\\d{3}[A-Z]{2})|([A-Z]{2}\\d{2}[A-Z]{2})|([A-Z]{2}\\d{3}[A-Z]{2})|([A-Z]\\d[A-Z]\\d[A-Z]{2})|([A-Z]{2}\\d[A-Z]\\d[A-Z]{2})|(GIR0AA))$', 'en-GB,cy-GB,gd', '<div><a href="/iso/IE">IE </a></div>'] In [6]: re.findall('<td class="w2p_fw">(.*?)</td>', html)[1] Out[6]: '244,820 square kilometres'
Beautiful Soup:
In [7]: from bs4 import BeautifulSoup In [8]: broken_html = '<ul class=country><li>Area<li>Population</ul>' In [9]: # parse the HTML In [10]: soup = BeautifulSoup(broken_html, 'html.parser') In [11]: fixed_html = soup.prettify() In [12]: print fixed_html <ul class="country"> <li> Area <li> Population </li> </li> </ul> In [13]: ul = soup.find('ul', attrs={'class':'country'}) In [14]: ul.find('li') # returns just the first match Out[14]: <li>Area<li>Population</li></li> In [15]: ul.find_all('li') # returns all matches Out[15]: [<li>Area<li>Population</li></li>, <li>Population</li>]
In [1]: from bs4 import BeautifulSoup In [2]: url = 'http://example.webscraping.com/places/view/United-Kingdom-239' In [3]: import common In [5]: html = common.download(url) Downloading: http://example.webscraping.com/places/view/United-Kingdom-239 In [6]: soup = BeautifulSoup(html) /usr/lib/python2.7/site-packages/bs4/__init__.py:166: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system ("lxml"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently. To get rid of this warning, change this: BeautifulSoup([your markup]) to this: BeautifulSoup([your markup], "lxml") markup_type=markup_type)) In [7]: # locate the area row In [8]: tr = soup.find(attrs={'id':'places_area__row'}) In [9]: td = tr.find(attrs={'class':'w2p_fw'}) # locate the area tag In [10]: area = td.text # extract the text from this tag In [11]: print area 244,820 square kilometres
Lxml基於 libxml2(c語言實現),更快速,可是有時更難安裝。網址:http://lxml.de/installation.html。
In [1]: import lxml.html In [2]: broken_html = '<ul class=country><li>Area<li>Population</ul>' In [3]: tree = lxml.html.fromstring(broken_html) # parse the HTML In [4]: fixed_html = lxml.html.tostring(tree, pretty_print=True) In [5]: print fixed_html <ul class="country"> <li>Area</li> <li>Population</li> </ul>
In [1]: import common In [2]: import lxml.html In [3]: url = 'http://example.webscraping.com/places/view/United-Kingdom-239' In [4]: html = common.download(url) Downloading: http://example.webscraping.com/places/view/United-Kingdom-239 In [5]: tree = lxml.html.fromstring(html) In [6]: td = tree.cssselect('tr#places_area__row > td.w2p_fw')[0] In [7]: area = td.text_content() In [8]: print area 244,820 square kilometres
在 CSS 中,選擇器是一種模式,用於選擇需要添加樣式的元素。
"CSS" 列指示該屬性是在哪一個 CSS 版本中定義的。(CSS1、CSS2 還是 CSS3。)
選擇器 | 例子 | 例子描述 | CSS |
.class | .intro | 選擇 class="intro" 的全部元素。 | 1 |
#id | #firstname | 選擇 id="firstname" 的全部元素。 | 1 |
* | * | 選擇全部元素。 | 2 |
element | p | 選擇全部 <p> 元素。 | 1 |
element,element | div,p | 選擇全部 <div> 元素和全部 <p> 元素。 | 1 |
element element | div p | 選擇 <div> 元素內部的全部 <p> 元素。 | 1 |
element>element | div>p | 選擇父元素爲 <div> 元素的全部 <p> 元素。 | 2 |
element+element | div+p | 選擇緊接在 <div> 元素以後的全部 <p> 元素。 | 2 |
[attribute] | [target] | 選擇帶有 target 屬性的全部元素。 | 2 |
[attribute=value] | [target=_blank] | 選擇 target="_blank" 的全部元素。 | 2 |
[attribute~=value] | [title~=flower] | 選擇 title 屬性包含單詞 "flower" 的全部元素。 | 2 |
[attribute|=value] | [lang|=en] | 選擇 lang 屬性值以 "en" 開頭的全部元素。 | 2 |
:link | a:link | 選擇全部未被訪問的連接。 | 1 |
:visited | a:visited | 選擇全部已被訪問的連接。 | 1 |
:active | a:active | 選擇活動連接。 | 1 |
:hover | a:hover | 選擇鼠標指針位於其上的連接。 | 1 |
:focus | input:focus | 選擇得到焦點的 input 元素。 | 2 |
:first-letter | p:first-letter | 選擇每一個 <p> 元素的首字母。 | 1 |
:first-line | p:first-line | 選擇每一個 <p> 元素的首行。 | 1 |
:first-child | p:first-child | 選擇屬於父元素的第一個子元素的每一個 <p> 元素。 | 2 |
:before | p:before | 在每一個 <p> 元素的內容以前插入內容。 | 2 |
:after | p:after | 在每一個 <p> 元素的內容以後插入內容。 | 2 |
:lang(language) | p:lang(it) | 選擇帶有以 "it" 開頭的 lang 屬性值的每一個 <p> 元素。 | 2 |
element1~element2 | p~ul | 選擇前面有 <p> 元素的每一個 <ul> 元素。 | 3 |
[attribute^=value] | a[src^="https"] | 選擇其 src 屬性值以 "https" 開頭的每一個 <a> 元素。 | 3 |
[attribute$=value] | a[src$=".pdf"] | 選擇其 src 屬性以 ".pdf" 結尾的全部 <a> 元素。 | 3 |
[attribute*=value] | a[src*="abc"] | 選擇其 src 屬性中包含 "abc" 子串的每一個 <a> 元素。 | 3 |
:first-of-type | p:first-of-type | 選擇屬於其父元素的首個 <p> 元素的每一個 <p> 元素。 | 3 |
:last-of-type | p:last-of-type | 選擇屬於其父元素的最後 <p> 元素的每一個 <p> 元素。 | 3 |
:only-of-type | p:only-of-type | 選擇屬於其父元素惟一的 <p> 元素的每一個 <p> 元素。 | 3 |
:only-child | p:only-child | 選擇屬於其父元素的惟一子元素的每一個 <p> 元素。 | 3 |
:nth-child(n) | p:nth-child(2) | 選擇屬於其父元素的第二個子元素的每一個 <p> 元素。 | 3 |
:nth-last-child(n) | p:nth-last-child(2) | 同上,從最後一個子元素開始計數。 | 3 |
:nth-of-type(n) | p:nth-of-type(2) | 選擇屬於其父元素第二個 <p> 元素的每一個 <p> 元素。 | 3 |
:nth-last-of-type(n) | p:nth-last-of-type(2) | 同上,可是從最後一個子元素開始計數。 | 3 |
:last-child | p:last-child | 選擇屬於其父元素最後一個子元素的每一個 <p> 元素。 | 3 |
:root | :root | 選擇文檔的根元素。 | 3 |
:empty | p:empty | 選擇沒有子元素的每一個 <p> 元素(包括文本節點)。 | 3 |
:target | #news:target | 選擇當前活動的 #news 元素。 | 3 |
:enabled | input:enabled | 選擇每一個啓用的 <input> 元素。 | 3 |
:disabled | input:disabled | 選擇每一個禁用的 <input> 元素。 | 3 |
:checked | input:checked | 選擇每一個被選中的 <input> 元素。 | 3 |
:not(selector) | :not(p) | 選擇非 <p> 元素的每一個元素。 | 3 |
::selection | ::selection | 選擇被用戶選取的元素部分。 | 3 |
CSS 選擇器參見:http://www.w3school.com.cn/cssref/css_selectors.ASP 和 https://pythonhosted.org/cssselect/#supported-selectors。
import urllib2 import itertools import re from bs4 import BeautifulSoup import lxml.html import time FIELDS = ('area', 'population', 'iso', 'country', 'capital', 'continent', 'tld', 'currency_code', 'currency_name', 'phone', 'postal_code_format', 'postal_code_regex', 'languages', 'neighbours') def download(url, user_agent='Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0', num_retries=2): print 'Downloading:', url headers = {'User-agent': user_agent} request = urllib2.Request(url, headers=headers) try: html = urllib2.urlopen(request).read() except urllib2.URLError as e: print 'Download error:', e.reason html = None if num_retries > 0: if hasattr(e, 'code') and 500 <= e.code < 600: # recursively retry 5xx HTTP errors return download(url, num_retries-1) return html def re_scraper(html): results = {} for field in FIELDS: results[field] = re.search(r'places_%s__row.*?w2p_fw">(.*?)</td>' % field, html.replace('\n','')).groups()[0] return results def bs_scraper(html): soup = BeautifulSoup(html, 'html.parser') results = {} for field in FIELDS: results[field] = soup.find('table').find('tr',id='places_%s__row' % field).find('td',class_='w2p_fw').text return results def lxml_scraper(html): tree = lxml.html.fromstring(html) results = {} for field in FIELDS: results[field] = tree.cssselect('table > tr#places_%s__row> td.w2p_fw' % field)[0].text_content() return results NUM_ITERATIONS = 1000 # number of times to test each scraper html = download('http://example.webscraping.com/places/view/United-Kingdom-239') for name, scraper in [('Regular expressions', re_scraper),('BeautifulSoup', bs_scraper),('Lxml', lxml_scraper)]: # record start time of scrape start = time.time() for i in range(NUM_ITERATIONS): if scraper == re_scraper: re.purge() result = scraper(html) # check scraped result is as expected assert(result['area'] == '244,820 square kilometres') # record end time of scrape and output the total end = time.time() print '%s: %.2f seconds' % (name, end - 
start)
Downloading: http://example.webscraping.com/places/view/United-Kingdom-239 Regular expressions: 11.63 seconds BeautifulSoup: 92.80 seconds Lxml: 7.25 seconds
Downloading: http://example.webscraping.com/places/view/United-Kingdom-239 Regular expressions: 3.09 seconds BeautifulSoup: 29.40 seconds Lxml: 4.25 seconds
其中 re.purge() 用於清除正則表達式的緩存。