# coding: utf-8
# BeautifulSoup's main job is parsing and extracting data from HTML.
# Common extraction tools: re, lxml, bs4

# pip install beautifulsoup4 lxml

# from bs4 import BeautifulSoup

html = '''
<html><head><title>The Dormouse's story</title></head>

<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>

'''
#############################################################################
# BeautifulSoup
#############################################################################

# soup = BeautifulSoup(html, 'lxml')

# The four object types: Tag, NavigableString, BeautifulSoup, Comment

# print(soup.a)             # get the first <a> tag
# print(soup.a.get('href')) # get an attribute of the <a> tag, here the link
# print(soup.a.text)        # all text under <a>, including any child tags
# print(soup.a.string)      # the tag's single string; None when the tag has
#                           # more than one child


# Searching the document: find / find_all match against a filter condition

# Filter by string
# print(soup.find_all('a'))                       # every <a> tag in the document
# print(soup.find_all(attrs={'class': 'title'}))  # tags whose class is "title"

# Filter by regular expression
# import re
# print(soup.find_all(re.compile('^p')))  # tags whose name starts with "p"
# print(soup.find_all(re.compile('y$')))  # tags whose name ends with "y"
# print(soup.find_all(re.compile('t')))   # tags whose name contains "t"

# Filter by list
# for tag in soup.find_all(['a', 'b']):   # match <a> tags and <b> tags
#     print(tag)

# for tag in soup.find_all('p', class_='story'):  # <p> tags with class="story"
#     print(tag)

# Filter by function: pass a callable to find_all as the filter condition
# def has_class_but_no_id(tag):
#     """Return True for tags that have a class attribute but no id."""
#     return tag.has_attr('class') and not tag.has_attr('id')
#
# for tag in soup.find_all(has_class_but_no_id):
#     print(tag)


# CSS selectors
# print(soup.select('title'))     # look up by tag name
# print(soup.select('.sister'))   # look up by class name
# print(soup.select('#link1'))    # look up by id
# print(soup.select('p #link2'))  # combined: the element with id="link2" inside a <p>

# ">" only matches direct children, one level at a time
# print(soup.select('body > p .sister'))  # .sister elements inside <p> tags
#                                         # that are direct children of <body>


# Search Baidu for "python" and pick apart the returned page by attributes
# import requests
# url = 'http://www.baidu.com/s?wd=python'
# response = requests.get(url)  # raw page source, before any JS rendering
#
# soup = BeautifulSoup(response.text, 'lxml')

# Find the search results in the returned page
# items = soup.find_all('div', class_='result c-container ')

# Print the search results
# for item in items:
#     print(item.select('h3 > a')[0].get('href'))  # the result link
#     print(item.select('h3 > a')[0].get_text())   # the result title
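
# A minimal, runnable sketch of the four object types named above and of the
# .text / .string difference, using the sample `html` string at the top of
# this file. The variable names (demo, frag, story) are illustrative only;
# uncomment to try it (requires beautifulsoup4 and lxml, per the pip line above).
# from bs4 import BeautifulSoup
# demo = BeautifulSoup(html, 'lxml')
# print(type(demo))           # <class 'bs4.BeautifulSoup'> - the whole document
# print(type(demo.a))         # <class 'bs4.element.Tag'>
# print(type(demo.a.string))  # <class 'bs4.element.NavigableString'>
# frag = BeautifulSoup('<b><!--a comment--></b>', 'lxml')
# print(type(frag.b.string))  # <class 'bs4.element.Comment'>
# story = demo.find('p', class_='story')
# print(story.text)    # all nested text: the whole sentence with the names
# print(story.string)  # None - this <p> has several children, so .string fails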
#############################################################################
# XPath
# Path syntax: /  //  @  .  ..  *
# /  selects from the current node     //  searches the whole document
# @  selects an attribute              *   is a wildcard
#############################################################################
html = '''
<html><head><title>The Dormouse's story</title></head>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
'''
# from lxml import etree
# e = etree.HTML(html)
# for i in e.xpath('//p'):           # search the whole document for <p> tags
#     # print(i.xpath('string(.)'))  # all text under the tag, including text
#     #                              # inside nested child tags
#     print(i.text)                  # only the text before the first child
#                                    # element; child-tag text is excluded

"""
# for i in e.xpath('//p/@class'):           # select the class attribute of <p> tags
# for i in e.xpath('//p[@class="title"]'):  # match <p> tags with class="title"
//title[@*]  matches <title> tags that have at least one attribute
"""
# Search Baidu for "python" and pick apart the returned page with XPath
import requests
from lxml import etree

url = 'http://www.baidu.com/s?wd=python'
response = requests.get(url)  # raw page source, before any JS rendering
tree = etree.HTML(response.text)

# Find the search results in the returned page
items = tree.xpath('//div[@class="result c-container "]')
for item in items:
    # print(item.xpath('h3/a/@href'))                # the result link
    print(item.xpath('h3/a')[0].xpath('string(.)'))  # the result title
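
# A short offline sketch of the XPath pieces noted above (@, predicates, "..",
# string(.)), run against the local `html` string instead of a live page.
# The variable name `doc` is illustrative; etree is already imported above.
doc = etree.HTML(html)
print(doc.xpath('//p/@class'))                    # ['title', 'story', 'story']
print(doc.xpath('//p[@class="title"]/b/text()'))  # ["The Dormouse's story"]
print(doc.xpath('//a[@id="link2"]/../@class'))    # ".." climbs to the parent <p>: ['story']
print(doc.xpath('//title[@*]'))                   # [] - the sample <title> has no attributes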