BeautifulSoup的基本用法

時間 2020-06-14

標籤 beautifulsoup 基本用法简体版

原文原文鏈接

 1 # -*- coding:UTF-8 -*-
 2 from urllib import request  3 from bs4 import BeautifulSoup  4 import re  5 import sys  6 
 7 if __name__ == "__main__":  8     #建立txt文件
 9     file = open('一念永恆.txt', 'w', encoding='utf-8') 10     #一念永恆小說目錄地址
11     target_url = 'http://www.biqukan.com/1_1094/'
12     #User-Agent
13     head = {} 14     head['User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'
15     target_req = request.Request(url = target_url, headers = head) 16     target_response = request.urlopen(target_req) 17     target_html = target_response.read().decode('gbk','ignore') 18     #建立BeautifulSoup對象
19     listmain_soup = BeautifulSoup(target_html,'lxml') 20 
21     #搜索文檔樹,找出div標籤中class爲listmain的全部子標籤
22     chapters = listmain_soup.find_all('div',class_ = 'listmain') 23     #使用查詢結果再建立一個BeautifulSoup對象,對其繼續進行解析
24     download_soup = BeautifulSoup(str(chapters), 'lxml') 25     #計算章節個數
26     numbers = (len(download_soup.dl.contents) - 1) / 2 - 8
27     index = 1
28     #開始記錄內容標誌位,只要正文卷下面的連接,最新章節列表連接剔除
29     begin_flag = False 30     #遍歷dl標籤下全部子節點
31     for child in download_soup.dl.children: 32         #濾除回車
33         if child != '\n': 34             #找到《一念永恆》正文卷,使能標誌位
35             if child.string == u"《一念永恆》正文卷": 36                 begin_flag = True 37             #爬取連接並下載連接內容
38             if begin_flag == True and child.a != None: 39                 download_url = "http://www.biqukan.com" + child.a.get('href') 40                 download_req = request.Request(url = download_url, headers = head) 41                 download_response = request.urlopen(download_req) 42                 download_html = download_response.read().decode('gbk','ignore') 43                 download_name = child.string 44                 soup_texts = BeautifulSoup(download_html, 'lxml') 45                 texts = soup_texts.find_all(id = 'content', class_ = 'showtxt') 46                 soup_text = BeautifulSoup(str(texts), 'lxml') 47                 write_flag = True 48                 file.write(download_name + '\n\n') 49                 #將爬取內容寫入文件
50                 for each in soup_text.div.text.replace('\xa0',''): 51                     if each == 'h': 52                         write_flag = False 53                     if write_flag == True and each != ' ': 54  file.write(each) 55                     if write_flag == True and each == '\r': 56                         file.write('\n') 57                 file.write('\n\n') 58                 #打印爬取進度
59                 sys.stdout.write("已下載:%.3f%%" % float(index/numbers) + '\r') 60  sys.stdout.flush() 61                 index += 1
62     file.close()

>>> for link in soup.find_all('a'):
...     print(link.get('href'))
# 用於爬取全部 a 標籤的連接

Beautiful Soup 4.4.0 文檔連接：http://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/

以下文章來自博客園大佬：http://www.cnblogs.com/sakura3/p/8460224.html（爲了複習方便，搬一下，謝謝）

爬小說：

 1 #!/usr/bin/python
 2 # -*- coding: UTF-8 -*-
 3 import requests  4 from bs4 import BeautifulSoup  5 # get_url_list 獲取全部章節的URL，在一個list裏
 6 def get_url_list(url):  7     content = requests.get(url).content           #獲取頁面內容
 8     soup = BeautifulSoup(content,'lxml')          #Beautifulsoup 實例化對象
 9     url_list = []                                #空的url_list 數組
10     # urls = soup.find('div',{'id':'list'}).find('dl').find_all('dd')
11     urls = soup.select('#list > dl > dd > a')    # 根據頁面選擇到URL ，還能夠urls = soup.find('div',{'id':'list'}).find('dl').find_all('dd')
12     for i in urls:          #遍歷裏面的每一章的URL
13         i = i.get('href')   #獲取URL
14         # print(i) 
15         i = 'http://www.biquge.com.tw' + i   #分析文章組成，造成最終的URL
16         url_list.append(i)    #添加到url_list 裏面去
17     # print (url_list)
18     return url_list 19 # 獲取這一章的內容
def get_data(url, out_path=r'C:\Users\HBX\Documents\staudy\HMXX.txt', timeout=10):
    """Download one chapter and append its title and text to *out_path*.

    The title is read from ``<div class="bookname"><h1>`` and the body from
    ``<div id="content">``.

    :param url: address of the chapter page.
    :param out_path: UTF-8 text file the chapter is appended to.
    :param timeout: seconds to wait for the server before giving up.
    :raises requests.HTTPError: if the server answers with an error status.
    :raises ValueError: if the page lacks the title or content element.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    # Parse everything before touching the output file, so a malformed page
    # never leaves a half-written chapter behind.
    title_box = soup.find('div', {'class': 'bookname'})
    title_tag = title_box.find('h1') if title_box is not None else None
    # The chapter body could also be selected with soup.select('#content').
    content_tag = soup.find('div', {'id': 'content'})
    if title_tag is None or content_tag is None:
        raise ValueError('chapter title or content not found at ' + url)

    # Plain '\n' is used as the separator: the file is opened in text mode,
    # which already translates '\n' to the platform line ending, so writing
    # '\r\n' would end up as '\r\r\n' on Windows.
    book = title_tag.text + '\n' + content_tag.get_text()

    # Without an explicit UTF-8 encoding the write can fail on non-ASCII text.
    with open(out_path, 'a+', encoding='utf-8') as f:
        f.write(book + '\n')
34 
35 
36 
if __name__ == '__main__':
    # Table-of-contents page of the novel on biquge.
    url = 'http://www.biquge.com.tw/18_18049/'
    # Gather every chapter URL first, then download the chapters in order.
    url_list = get_url_list(url)
    for chapter_url in url_list:
        get_data(chapter_url)

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。