HTMLParser版:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import HTMLParser
class UrlParser(HTMLParser.HTMLParser):
    """Collect the href attribute of every <a> tag fed to the parser.

    Usage: create an instance, call feed(html) one or more times, then
    read the accumulated URLs with geturls().
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.urls = []  # hrefs in document order

    def handle_starttag(self, tag, attrs):
        # HTMLParser lowercases tag/attribute names, so compare against
        # 'a'/'href' exactly (the scraped original compared against
        # ' a '/' href ' with padding spaces, which can never match).
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    self.urls.append(value)

    def geturls(self):
        """Return the list of URLs collected so far."""
        return self.urls
if __name__ == '__main__':
    # Demo: extract both hrefs from a small HTML snippet.
    urls = []
    parser = UrlParser()
    parser.feed('1111111111<a href="http://www.bccn.net">BCCN</a>'
                '2222222<a href="http://bbs.bccn.net">BCCN.BBS</a>333333333')
    urls += parser.geturls()
    # print() form works on both Python 2 and 3 for a single argument.
    print(urls)
class UrlParser(HTMLParser.HTMLParser):
    """SAX-style URL collector: records the href of every <a> start tag."""

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # Accumulates hrefs in the order they appear in the document.
        self.urls = []

    def handle_starttag(self, tag, attrs):
        # Tag and attribute names arrive lowercased and unpadded, so the
        # comparisons must use bare 'a' and 'href' (the scraped original
        # used space-padded literals that never matched anything).
        if tag != 'a':
            return
        for name, value in attrs:
            if name == 'href':
                self.urls.append(value)

    def geturls(self):
        """Return all URLs seen so far."""
        return self.urls
if __name__ == '__main__':
    # Demo run over a fixed snippet containing two anchors.
    collected = []
    p = UrlParser()
    p.feed('1111111111<a href="http://www.bccn.net">BCCN</a>'
           '2222222<a href="http://bbs.bccn.net">BCCN.BBS</a>333333333')
    collected += p.geturls()
    print(collected)
pyquery版:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from pyquery import PyQuery as pq


class UrlParser:
    """Collect the href of every <a> element in HTML passed to feed()."""

    def __init__(self):
        self.urls = []  # hrefs in document order

    def feed(self, data):
        doc = pq(data)
        if doc.find('a'):
            # doc('a').attr('href') only returns the FIRST match, so map
            # over every <a> element and take each element's own href.
            hrefs = doc('a').map(lambda i, e: pq(e).attr('href'))
            for href in hrefs:
                self.urls.append(href)

    def geturls(self):
        """Return the list of URLs collected so far."""
        return self.urls
if __name__ == '__main__':
    # Demo: extract both hrefs from a small HTML snippet.
    urls = []
    parser = UrlParser()
    parser.feed('1111111111<a href="http://www.bccn.net">BCCN</a>'
                '2222222<a href="http://bbs.bccn.net">BCCN.BBS</a>333333333')
    urls += parser.geturls()
    print(urls)
class UrlParser:
    """pyquery-based extractor: gathers every <a> href from fed HTML."""

    def __init__(self):
        # URLs accumulate across multiple feed() calls.
        self.urls = []

    def feed(self, data):
        doc = pq(data)
        if not doc.find('a'):
            return
        # .attr('href') on a multi-element selection yields only the first
        # href; .map() visits each matched <a> so all hrefs are captured.
        for href in doc('a').map(lambda i, e: pq(e).attr('href')):
            self.urls.append(href)

    def geturls(self):
        """Return all URLs seen so far."""
        return self.urls
if __name__ == '__main__':
    # Demo run over a fixed snippet containing two anchors.
    collected = []
    p = UrlParser()
    p.feed('1111111111<a href="http://www.bccn.net">BCCN</a>'
           '2222222<a href="http://bbs.bccn.net">BCCN.BBS</a>333333333')
    collected += p.geturls()
    print(collected)
正則表達式版:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import re
class UrlParser:
    """Extract href URLs from <a> tags using a regular expression."""

    # Matches <a ... href="URL" ...>; capture group 7 is the URL itself.
    # NOTE: the scraped original had a space before '<a' and after '>'
    # inside the raw triple-quoted pattern, which required literal spaces
    # around every anchor tag and so matched nothing in typical HTML.
    _HREF_RE = re.compile(
        r'''<a(\s*)(.*?)(\s*)href(\s*)=(\s*)([\"\s]*)([^\"\']+?)([\"\s]+)(.*?)>''',
        re.S | re.I)

    def __init__(self):
        self.urls = []  # hrefs in document order

    def feed(self, data):
        for groups in self._HREF_RE.findall(data):
            # groups is the 9-tuple of capture groups; index 6 is the URL.
            self.urls.append(groups[6])

    def geturls(self):
        """Return the list of URLs collected so far."""
        return self.urls
if __name__ == '__main__':
    # Demo: extract both hrefs from a small HTML snippet.
    urls = []
    parser = UrlParser()
    parser.feed('1111111111<a href="http://www.bccn.net">BCCN</a>'
                '2222222<a href="http://bbs.bccn.net">BCCN.BBS</a>333333333')
    urls += parser.geturls()
    print(urls)
class UrlParser:
    """Regex-based <a href=...> extractor; URLs accumulate across feeds."""

    def __init__(self):
        self.urls = []

    def feed(self, data):
        # Group 7 (index 6 of the findall tuple) captures the URL between
        # the opening quote/whitespace and the closing quote/whitespace.
        # The pattern must NOT carry leading/trailing literal spaces (the
        # scraped original did, which prevented any match).
        pattern = (r'''<a(\s*)(.*?)(\s*)href(\s*)=(\s*)'''
                   r'''([\"\s]*)([^\"\']+?)([\"\s]+)(.*?)>''')
        matches = re.findall(pattern, data, re.S | re.I)
        self.urls.extend(m[6] for m in matches)

    def geturls(self):
        """Return all URLs seen so far."""
        return self.urls
if __name__ == '__main__':
    # Demo run over a fixed snippet containing two anchors.
    collected = []
    p = UrlParser()
    p.feed('1111111111<a href="http://www.bccn.net">BCCN</a>'
           '2222222<a href="http://bbs.bccn.net">BCCN.BBS</a>333333333')
    collected += p.geturls()
    print(collected)
速度比較:正則表達式 > pyquery > HTMLParser
測試的時候遍歷大約1000個頁面,正則表達式佔絕對優點,這3個速度比例大約是 8:2:1
HTMLParser最慢,pyquery速度大約是它的2倍,正則的速度是它的8倍,看來之後如非必要再也不考慮HTMLParser了,用起來也不如pyquery方便,正則速度卻是很快,功能也強大,前二者能提取的內容用正則所有都能實現,而正則能實現的功能前二者就不一定能實現了。只是正則的可讀性很差。之後遇到數據量大的用正則表達式,數據量不大、不考慮時間因素但邏輯複雜的用pyquery,之後維護起來方便。

(本文標題:Python抓取頁面中超連接(URL)的三種方法比較(HTMLParser、pyquery、正則表達式))