Lately I have been writing nothing but Selenium crawlers, so I wanted to brush up on scraping site content with requests + BeautifulSoup.
First, the overall approach: open the site and collect the novels shown on its homepage --> check whether the title the user types in is among them, and if it is, visit that novel's page --> open the page holding the novel's table of contents and match chapter names to URLs --> loop over the chapters, fetch each one's text, clean it up, and write it to a text file.
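Before the full listing, here is the bare fetch-and-parse pattern that every one of those steps repeats (a minimal sketch only; it reuses the homepage URL and the a.msgBorder selector from the full script below and assumes the site still serves that markup):

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}
resp = requests.get('http://www.quanshuwang.com/', timeout=30, headers=headers)
resp.encoding = resp.apparent_encoding      # let requests guess the charset instead of assuming UTF-8
soup = BeautifulSoup(resp.text, 'html.parser')
for a in soup.select('a.msgBorder'):        # hot-book links on the homepage
    print(a['title'], a['href'])

Every page fetch in the script below is this same pattern with a different URL and selector.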
Full code:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import datetime
import time

import requests
from bs4 import BeautifulSoup

# One shared User-Agent header for every request
HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'}


def book_info():
    """Scrape the homepage and return a dict of {book title: book URL}."""
    url = 'http://www.quanshuwang.com/'
    html = requests.get(url, timeout=30, headers=HEADERS)
    time.sleep(2)
    html.encoding = html.apparent_encoding
    soup = BeautifulSoup(html.text, 'html.parser')
    # Hot books on the homepage
    hot_list = soup.select('a.msgBorder')
    hot_book = {}
    for x in hot_list:
        hot_book[x['title']] = x['href']
    # Recommended ("wonderful") books
    wonderful_list = soup.find_all(name='a', attrs={'class': 'clearfix stitle'})
    wonderful_book = {}
    for y in wonderful_list:
        wonderful_book[y['title']] = y['href']
    # Merge both groups into a single dict
    book_dict = {}
    book_dict.update(hot_book)
    book_dict.update(wonderful_book)
    return book_dict


def search_book(book_name, book_dict):
    """Return the book's URL if it is on the homepage, otherwise an error message."""
    if book_name in book_dict:
        return book_dict[book_name]
    return '對不起,您要查詢的書籍沒有找到。'


def down_book(url_1):
    """Open the book's page and return a dict of {chapter title: chapter URL}."""
    html2 = requests.get(url_1, timeout=30, headers=HEADERS)
    html2.encoding = html2.apparent_encoding
    soup2 = BeautifulSoup(html2.text, 'html.parser')
    # The book page has a "start reading" button; follow its link to reach the table of contents
    read_url = soup2.select('a.reader')[0]['href']
    html3 = requests.get(read_url, timeout=30, headers=HEADERS)
    html3.encoding = html3.apparent_encoding
    soup3 = BeautifulSoup(html3.text, 'html.parser')
    info_list = soup3.select('div[class="clearfix dirconone"] a')
    catalog_dict = {}
    for x in info_list:
        catalog_dict[x['title']] = x['href']
    return catalog_dict
# e.g. a = down_book(search_book('盜墓筆記', book_info())); print(a)


def write_book(book_name, dicts):
    """Download every chapter and write it to <book_name>.txt."""
    with open('%s.txt' % book_name, 'w+', encoding='utf-8') as w_b:
        for k, v in dicts.items():
            w_b.write('\n\n\n%s \n\n\n' % k)
            html4 = requests.get(v, timeout=30, headers=HEADERS)
            html4.encoding = html4.apparent_encoding
            soup4 = BeautifulSoup(html4.text, 'html.parser')
            text_list = soup4.select('div.mainContenr')
            sss = ''
            for xabc in text_list:
                # Strip spaces and the JS snippets the site injects into the chapter body
                abcd = xabc.text.replace(' ', '').replace('style5();', '').replace('style6();', '')
                sss += abcd
            print(sss)
            w_b.write(sss)
    # the with-block closes the file, so no explicit close() is needed


start_time = datetime.datetime.now()
bbb = input('請輸入你要查詢的書籍:')
result = search_book(bbb, book_info())      # fetch the homepage once and look the title up
if result != '對不起,您要查詢的書籍沒有找到。':
    write_book(bbb, down_book(result))
else:
    print(result)
end_time = datetime.datetime.now()
cha = (end_time - start_time).seconds
print('這次運行耗時%s秒.' % cha)
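One fragile spot worth noting: the replace() chain in write_book only strips the exact 'style5();' / 'style6();' strings this particular site injects. A slightly more general cleanup (a sketch only; clean_chapter is a hypothetical helper, not part of the script above) would drop the <script> tags before extracting the text:

from bs4 import BeautifulSoup

def clean_chapter(html_text):
    """Return the chapter text with injected <script> content removed."""
    soup = BeautifulSoup(html_text, 'html.parser')
    parts = []
    for div in soup.select('div.mainContenr'):
        for script in div.find_all('script'):
            script.decompose()                        # drops the injected JS along with its tag
        parts.append(div.get_text().replace('\xa0', ' ').strip())  # normalise non-breaking spaces
    return '\n'.join(parts)

Because decompose() removes the <script> tags from the parsed tree, get_text() no longer picks up their JavaScript, which is less brittle than listing every snippet by hand.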
The code is commented throughout; if anything is unclear, please leave a comment below the article.
Thanks for reading!
---------------- by sniper-huohuo ----------------
------------ Knowing shame leads to courage --------------