Web Page Parsing -- BeautifulSoup Practice

# coding=utf-8
# BeautifulSoup's main job is parsing HTML and extracting data from it
# common parsing options: re, lxml, bs4

# pip install beautifulsoup4

# from bs4 import BeautifulSoup

html = '''
<html><head><title>The Dormouse's story</title></head>

<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>

'''
############################################################################
# BeautifulSoup section
############################################################################

# soup = BeautifulSoup(html, 'lxml')

# The four object types: Tag, NavigableString, BeautifulSoup, Comment

# print(soup.a)  # get the first <a> tag
# print(soup.a.get('href'))   # read the tag's href attribute, i.e. the link target
# print(soup.a.text)  # all text under <a>, including text inside any child tags
# print(soup.a.string)  # the tag's single text node; None if the tag has more than one child
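
# A minimal runnable sketch of the .text / .string difference, reusing the
# html string defined above:
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')
story_p = soup.find('p', class_='story')
print(story_p.text)       # the full sentence, including the three link texts
print(story_p.string)     # None -- this <p> has several children
print(soup.title.string)  # "The Dormouse's story" -- a single text child, so it works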


# Searching the document: find / find_all match nodes against a given filter

# By string
# print(soup.find_all('a'))  # every <a> tag in the document
# print(soup.find_all(attrs={'class': 'title'}))  # tags whose class is "title"

# By regular expression
# import re
# print(soup.find_all(re.compile('^p')))  # tags whose name starts with p
# print(soup.find_all(re.compile('y$')))  # tags whose name ends with y
# print(soup.find_all(re.compile('t')))   # tags whose name contains t

# By list
# for tag in soup.find_all(['a', 'b']):  # match <a> and <b> tags
#     print(tag)

# for tag in soup.find_all('p', class_='story'):  # <p> tags with class="story"
#     print(tag)

# By function: pass find_all a callable as the filter condition
# def has_class_but_no_id(tag):
#     """
#     Filter that keeps tags which have a class attribute but no id attribute.
#     """
#     return tag.has_attr('class') and not tag.has_attr('id')
#
# for tag in soup.find_all(has_class_but_no_id):
#     print(tag)
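
# A runnable sketch of the four filter kinds above (string, regex, list,
# function), again reusing the html string from the top of this file:
import re
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')
print(len(soup.find_all('a')))                            # string filter: the 3 links
print([t.name for t in soup.find_all(re.compile('^b'))])  # regex filter: names starting with b
print([t.name for t in soup.find_all(['a', 'b'])])        # list filter: <a> and <b> tags
print(len(soup.find_all(lambda t: t.has_attr('class') and not t.has_attr('id'))))  # function filter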


# CSS selectors
# print(soup.select('title'))   # by tag name
# print(soup.select('.sister')) # by class name
# print(soup.select('#link1'))  # by id
# print(soup.select('p #link2'))  # combined: the element with id "link2" inside a <p>

# the > combinator only matches direct children, one level at a time
# print(soup.select('body > p .sister')) # .sister elements inside a <p> that is a direct child of <body>
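
# A runnable sketch of the selector styles above; select() always returns a
# list, while select_one() returns the first match (or None):
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')
print(soup.select_one('title').get_text())          # by tag name
print([a['href'] for a in soup.select('.sister')])  # by class name
print(soup.select('p > #link1'))                    # direct-child combinator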


# Search Baidu for "python", then query the returned page by attribute
# import requests
# url = 'http://www.baidu.com/s?wd=python'
# response = requests.get(url)  # the raw page source, before any js rendering
#
# soup = BeautifulSoup(response.text, 'lxml')

# Find the search results in the returned page
# items = soup.find_all('div', class_='result c-container ')

# Print the search results
# for item in items:
#     print(item.select('h3 > a')[0].get('href'))  # the result link
#     print(item.select('h3 > a')[0].get_text())   # the result title
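
# Note: Baidu may return a stripped-down or verification page to requests'
# default User-Agent, leaving the result list empty. A common workaround is
# to send a browser-like header (the value below is only a minimal example):
# headers = {'User-Agent': 'Mozilla/5.0'}
# response = requests.get(url, headers=headers)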

#################################################################################
# xpath section
# Path syntax:  /  //  @  *  .  ..
#   / matches relative to the current node, // matches anywhere in the document,
#   @ selects an attribute, * is a wildcard, . is the current node, .. its parent
#################################################################################
html = '''
<html><head><title>The Dormouse's story</title></head>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
'''
# from lxml import etree
# e = etree.HTML(html)
# for i in e.xpath('//p'):  # every <p> in the document
#     # print(i.xpath('string(.)'))  # all text under the tag, including text in nested child tags
#     print(i.text)  # only the tag's own text, up to its first child tag
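
# A runnable contrast of the two calls above, reusing the html string: for
# the "story" <p>, .text stops at the first child tag, while string(.)
# concatenates all of the descendant text.
from lxml import etree

e = etree.HTML(html)
p = e.xpath('//p[@class="story"]')[0]
print(repr(p.text))          # only the text before the first <a> child
print(p.xpath('string(.)'))  # the whole sentence, link texts included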

"""
# for i in e.xpath('//p/@class'):  # select the class attribute of every <p>
# for i in e.xpath('//p[@class="title"]'):  # <p> tags whose class is "title" (the value must be quoted)
//title[@*]  matches every <title> tag that has at least one attribute
"""
# Search Baidu for "python" and extract the results with xpath
import requests
from lxml import etree

url = 'http://www.baidu.com/s?wd=python'
response = requests.get(url)  # the raw page source
tree = etree.HTML(response.text)

# Find the search results in the returned page
items = tree.xpath('//div[@class="result c-container "]')
for item in items:
    # print(item.xpath('h3/a/@href'))  # the result link
    print(item.xpath('h3/a')[0].xpath('string(.)'))  # the result title text