對html的解析是網頁抓取的基礎,分析抓取的結果找到本身想要的內容或標籤以達到抓取的目的。 html
HTMLParser是python用來解析html的模塊。它能夠分析出html裏面的標籤、數據等等,是一種處理html的簡便途徑。 HTMLParser採用的是一種事件驅動的模式,當HTMLParser找到一個特定的標記時,它會去調用一個用戶定義的函數,以此來通知程序處理。它主要的用戶回調函數的命名都是以handler_開頭的,都是HTMLParser的成員函數。當咱們使用時,就從HTMLParser派生出新的類,而後從新定義這幾個以handler_開頭的函數便可。這幾個函數包括: java
1. 基本解析,找到開始和結束標籤 python
from html.parser import HTMLParser class MyHTMLParser(HTMLParser): def handle_starttag(self, tag, attrs): print("Encountered the beginning of a %s tag" %(tag)) def handle_endtag(self, tag): print ("Encountered the end of a %s tag" %(tag)) if __name__ == '__main__': a = '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"><html><head><!--insert javaScript here!--><title>test</title><body><a href="http: //www.163.com">連接到163</a></body></html>' m = MyHTMLParser() #傳入要分析的html模塊 m.feed(a)
2. 解析html的超連接和連接顯示的內容 app
from html.parser import HTMLParser class MyHTMLParser(HTMLParser): def __init__(self): HTMLParser.__init__(self) self.flag=None def handle_starttag(self, tag, attrs): # 這裏從新定義了處理開始標籤的函數 if tag == 'a': # 判斷標籤<a>的屬性 self.flag='a' for name,value in attrs: if name == 'href': print("href:"+value) def handle_data(self,data): if self.flag == 'a': print("data:"+data) if __name__ == '__main__': a = '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"><html><head><!--insert javaScript here!--><title>test</title><body><a href="http: //www.163.com">連接到163</a></body></html>' my = MyHTMLParser() my.feed(a)
3. 完整解析html元素,拼接後原樣輸出 ide
from html.parser import HTMLParser import html.entities class BaseHTMLProcessor(HTMLParser): def reset(self): # extend (called by HTMLParser.__init__) self.pieces = [] HTMLParser.reset(self) def handle_starttag(self, tag, attrs): # called for each start tag # attrs is a list of (attr, value) tuples # e.g. for <pre class="screen">, tag="pre", attrs=[("class", "screen")] # Ideally we would like to reconstruct original tag and attributes, but # we may end up quoting attribute values that weren't quoted in the source # document, or we may change the type of quotes around the attribute value # (single to double quotes). # Note that improperly embedded non-HTML code (like client-side Javascript) # may be parsed incorrectly by the ancestor, causing runtime script errors. # All non-HTML code must be enclosed in HTML comment tags (<!-- code -->) # to ensure that it will pass through this parser unaltered (in handle_comment). strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs]) self.pieces.append("<%(tag)s%(strattrs)s>" % locals()) def handle_endtag(self, tag): # called for each end tag, e.g. for </pre>, tag will be "pre" # Reconstruct the original end tag. self.pieces.append("</%(tag)s>" % locals()) def handle_charref(self, ref): # called for each character reference, e.g. for " ", ref will be "160" # Reconstruct the original character reference. self.pieces.append("&#%(ref)s;" % locals()) def handle_entityref(self, ref): # called for each entity reference, e.g. for "©", ref will be "copy" # Reconstruct the original entity reference. self.pieces.append("&%(ref)s" % locals()) # standard HTML entities are closed with a semicolon; other entities are not if entities.entitydefs.has_key(ref): self.pieces.append(";") def handle_data(self, text): # called for each block of plain text, i.e. outside of any tag and # not containing any character or entity references # Store the original text verbatim. self.pieces.append(text) def handle_comment(self, text): # called for each HTML comment, e.g. <!-- insert Javascript code here --> # Reconstruct the original comment. # It is especially important that the source document enclose client-side # code (like Javascript) within comments so it can pass through this # processor undisturbed; see comments in unknown_starttag for details. self.pieces.append("<!--%(text)s-->" % locals()) def handle_pi(self, text): # called for each processing instruction, e.g. <?instruction> # Reconstruct original processing instruction. self.pieces.append("<?%(text)s>" % locals()) def handle_decl(self, text): # called for the DOCTYPE, if present, e.g. # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" # "http://www.w3.org/TR/html4/loose.dtd"> # Reconstruct original DOCTYPE self.pieces.append("<!%(text)s>" % locals()) def output(self): """Return processed HTML as a single string""" return "".join(self.pieces) if __name__ == '__main__': a = '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"><html><head><!--insert javaScript here!--><title>test</title><body><a href="http: //www.163.com">連接到163</a></body></html>' bhp =BaseHTMLProcessor() bhp.feed(a) print(bhp.output())