# Import the required packages.
# This script scrapes the novel 完美世界 (Perfect World) from 頂點小說 as an
# example. aa.text and bb.text are plain-text files created for this script:
# aa.text stores the chapter text, bb.text stores the URLs already saved.
import requests
from bs4 import BeautifulSoup

# Target URL: the novel's chapter index page.
url = 'https://www.x23us.com/html/42/42377/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/67.0.3393.4 Safari/537.36'
}


# Helper that reads bb.text, the file holding the URLs of chapters
# that have already been saved.
def open_href():
    # Return an empty list on the first run, before bb.text exists.
    try:
        with open('bb.text', 'r', encoding='utf-8') as f:
            # Read every saved URL and return them as a list
            # (each entry keeps its trailing '\n').
            return f.readlines()
    except FileNotFoundError:
        return []


# Request the index page and return its HTML source.
def page_index():
    html = requests.get(url, headers=headers)
    if html.status_code == 200:
        return html.text


# Parse the index page and collect every chapter's title and URL.
def page_list(html):
    if html:
        a = []
        html_bs4 = BeautifulSoup(html, 'lxml')
        # The chapter links sit inside elements with class "L".
        html_b = html_bs4.select('.L a')
        for i in html_b:
            title = i.get_text()
            # The hrefs are relative, so prepend the index URL.
            href = url + i.get('href')
            a.append({'title': title, 'href': href})
        return a


# Append a chapter's title and content to aa.text.
def text_cun_html(title, html):
    if html:
        with open('aa.text', 'a+', encoding='utf-8') as f:
            f.write(title + '\n' + html + '\n')
        print('Chapter saved!')
        return 'yes'


# Append the chapter's URL to bb.text, marking it as saved so the
# next run can skip it.
def text_cun_href(href):
    if href:
        with open('bb.text', 'a+', encoding='utf-8') as f:
            f.write(href + '\n')
        print('URL saved!')
        return 'ok'


# Request a chapter URL, parse it, and save its content.
def html_list_index(title, href):
    if href:
        html = requests.get(url=href, headers=headers)
        if html.status_code == 200:
            bs = BeautifulSoup(html.text, 'lxml')
            # The chapter body lives in the element with id "contents".
            for item in bs.select('#contents'):
                a = text_cun_html(title, item.get_text())
                # text_cun_html returns 'yes' once the content is on disk;
                # only then record the URL, so a crash between the two writes
                # re-scrapes the chapter instead of silently skipping it.
                if a == 'yes':
                    text_cun_href(href)


def main():
    # First load the list of already-scraped URLs from bb.text.
    number = open_href()
    print(number)
    html = page_index()
    data = page_list(html) or []  # empty list if the index request failed
    for i in data:
        title = i.get('title')
        href = i.get('href')
        print(href)
        # After a crash, filter out URLs already scraped when restarting.
        # readlines() keeps the trailing newline, hence the '\n' here.
        if href + '\n' not in number:
            html_list_index(title, href)


if __name__ == '__main__':
    main()

# A simple take on resumable ("breakpoint resume") scraping.
# Discussion and improvements are welcome.
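
# ----------------------------------------------------------------------
# The resume trick above boils down to: keep a log of finished URLs and
# skip anything already in it. Below is a minimal, self-contained sketch
# of that idea in isolation, assuming a hypothetical log file "done.txt"
# and hypothetical helpers load_done()/mark_done(); it is not part of the
# scraper above. Loading the log into a set strips the trailing newlines
# once, so membership tests no longer need the `href + '\n'` comparison
# and run in O(1) instead of scanning a list.
import os


def load_done(path='done.txt'):
    """Return the set of URLs already scraped; empty set on the first run."""
    if not os.path.exists(path):
        return set()
    with open(path, 'r', encoding='utf-8') as f:
        return {line.strip() for line in f if line.strip()}


def mark_done(href, path='done.txt'):
    """Append one finished URL so the next run can skip it."""
    with open(path, 'a', encoding='utf-8') as f:
        f.write(href + '\n')


# Usage sketch (scrape() is a stand-in for the real chapter fetcher):
#   done = load_done()
#   for href in all_chapter_urls:
#       if href in done:
#           continue
#       scrape(href)
#       mark_done(href)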