Python 爬取網站的小說

時間 2020-10-23

標籤 html 正則表達式 url spa code htm blog utf-8 get requests 欄目 Python 简体版

原文 原文鏈接

截圖

源代碼

# re provides regular-expression matching
import re
from urllib.parse import urljoin

import requests

# URL of the novel's index page (the page that lists every chapter).
url = 'http://www.shujy.com/5200/244309/'

# Seconds to wait for the server; without a timeout requests can hang forever.
REQUEST_TIMEOUT = 10

# Patterns are compiled once here instead of being looked up per chapter.
_TITLE_RE = re.compile(r'<meta property="og:novel:book_name" content="(.*?)"/>')
_CHAPTER_LIST_RE = re.compile(r'<div id="list">(.*?)</div>', re.S)
_CHAPTER_LINK_RE = re.compile(r'<a href="(.*?)">(.*?)</a>')
_CONTENT_RE = re.compile(r'<div id="content">(.*?)</div>', re.S)

# Markup fragments removed from a chapter body, applied in this order.
# NOTE(review): '&amp;t' and '&t;' look like entity names that were mangled
# when the snippet was published; they are kept exactly as found -- confirm.
_JUNK_FRAGMENTS = ('&nbsp;', '<br />', '&amp;t', '&emsp;', '&t;', '\n')

# Characters that cannot appear in a file name on common platforms.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


def _first_group(pattern, text, what):
    """Return group 1 of the first match of *pattern* in *text*.

    Raises:
        ValueError: if *pattern* does not match; *what* names the missing
            element so the failure is understandable.
    """
    match = pattern.search(text)
    if match is None:
        raise ValueError('could not find %s in the downloaded page' % what)
    return match.group(1)


def fetch_html(page_url):
    """Download *page_url* and return its body decoded as UTF-8.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            4xx/5xx response (via ``raise_for_status``).
    """
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    # Fail here rather than trying to parse an error page further down.
    response.raise_for_status()
    response.encoding = 'utf-8'
    return response.text


def extract_title(html):
    """Return the book title from the ``og:novel:book_name`` meta tag."""
    return _first_group(_TITLE_RE, html, 'the book title')


def parse_chapter_list(html):
    """Return ``(href, chapter_title)`` pairs found inside ``<div id="list">``."""
    listing = _first_group(_CHAPTER_LIST_RE, html, 'the chapter list')
    return _CHAPTER_LINK_RE.findall(listing)


def build_chapter_url(base_url, href):
    """Resolve a chapter *href* against *base_url*.

    Spaces are removed from the href first (the original script stripped
    them from the final URL). ``urljoin`` also handles root-relative and
    absolute hrefs, which plain string concatenation would corrupt.
    """
    return urljoin(base_url, href.replace(' ', ''))


def extract_chapter_content(chapter_html):
    """Return the raw inner markup of ``<div id="content">``."""
    return _first_group(_CONTENT_RE, chapter_html, 'the chapter content')


def clean_chapter_text(raw):
    """Strip the known markup fragments and newlines from a chapter body."""
    for fragment in _JUNK_FRAGMENTS:
        raw = raw.replace(fragment, '')
    return raw


def safe_filename(title):
    """Return ``<title>.txt`` with characters illegal in file names replaced."""
    cleaned = _UNSAFE_FILENAME_CHARS.sub('_', title).strip()
    # Fall back to a fixed name if nothing usable is left of the title.
    return '%s.txt' % (cleaned or 'novel')


def main():
    """Download every chapter of the novel at ``url`` into ``<title>.txt``.

    The text file is created in the current working directory.
    """
    html = fetch_html(url)
    print(html)

    print('*' * 100)
    title = extract_title(html)
    print(title)

    chapter_info_list = parse_chapter_list(html)

    print("*" * 100)
    print("獲取章節列表")

    # The context manager guarantees the file is flushed and closed even if
    # a chapter download or parse fails part-way through.
    with open(safe_filename(title), 'w', encoding='utf-8') as fb:
        for chapter_href, chapter_title in chapter_info_list:
            chapter_url = build_chapter_url(url, chapter_href)
            print(chapter_url)

            # Download the chapter and reduce it to plain text.
            chapter_html = fetch_html(chapter_url)
            chapter_content = clean_chapter_text(
                extract_chapter_content(chapter_html)
            )
            print(chapter_content)

            # Append the chapter heading followed by its text.
            fb.write('%s\n%s\n' % (chapter_title, chapter_content))
            print(chapter_url, chapter_title)


if __name__ == '__main__':
    main()

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。