從精彩閱讀網上爬取《凡人修仙傳》

 1 #!/usr/bin/env python
 2 # -*- coding: utf-8 -*-
 3 # __Author__: yunrui
 4 # __Date__:   2019/1/18
 5 
 6 #從精彩閱讀網上爬取《凡人修仙傳》
 7 
 8 import requests
 9 import re
# Scrape every chapter of one novel from jingcaiyuedu.com and write them,
# in listing order, to a single UTF-8 text file named after the book title.

BASE_URL = 'http://www.jingcaiyuedu.com'
BOOK_ID = '20278'

# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"}
# Chapter index page of the book.
url = '%s/book/%s/list.html' % (BASE_URL, BOOK_ID)

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30

# Line written after every chapter to visually separate it from the next one.
SEPARATOR = '********************************************'

# Markup fragments removed from the chapter body, applied in this order.
# '\xa0' and '&nbsp;' are additions: the plain-space entry was presumably an
# HTML non-breaking space originally -- TODO confirm against a live page.
MARKUP_TOKENS = ('<br>', '</br>', ' ', '\xa0', '&nbsp;', '<br />', '<br/>')

TITLE_RE = re.compile(
    r'<a href="/book/%s\.html">(.*?)</a>' % re.escape(BOOK_ID))
CHAPTER_LINK_RE = re.compile(
    r'<dd class="col-md-3"><a href="(.*?)">(.*?)<', re.S)
# re.S lets the body span several lines; the full id is matched here so the
# leftover 'ent">' no longer has to be stripped from the captured text.
CHAPTER_BODY_RE = re.compile(
    r'<div class="panel-body" id="htmlContent"[^>]*>(.*?)</div>', re.S)


def fetch_html(page_url):
    """Download *page_url* and return its body decoded as UTF-8.

    Raises ``requests.HTTPError`` on a non-2xx response instead of handing
    an error page to the parsers.
    """
    response = requests.get(url=page_url, headers=headers,
                            timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    response.encoding = 'utf-8'
    return response.text


def parse_title(index_html):
    """Return the book title found on the chapter index page.

    Raises ``ValueError`` when the title link is not present.
    """
    match = TITLE_RE.search(index_html)
    if match is None:
        raise ValueError('book title not found on the index page')
    return match.group(1)


def parse_chapter_list(index_html):
    """Return ``(href, chapter_title)`` tuples in the order they are listed."""
    return CHAPTER_LINK_RE.findall(index_html)


def extract_chapter_content(chapter_html):
    """Return the chapter text with line-break markup and spaces removed.

    Every carriage return becomes a newline followed by a tab.
    Raises ``ValueError`` when the content container is not present.
    """
    match = CHAPTER_BODY_RE.search(chapter_html)
    if match is None:
        raise ValueError('chapter content not found on the page')
    content = match.group(1)
    for token in MARKUP_TOKENS:
        content = content.replace(token, '')
    return content.replace('\r', '\n\t')


def safe_filename(name):
    """Replace characters that are not allowed in file names with '_'."""
    return re.sub(r'[\\/:*?"<>|]', '_', name)


def format_chapter(chapter_title, chapter_content):
    """Return the text block written to the output file for one chapter."""
    return '%s\n%s\n%s\n' % (chapter_title, chapter_content, SEPARATOR)


def main():
    """Download all listed chapters into '<book title>.txt'."""
    index_html = fetch_html(url)
    title = parse_title(index_html)
    chapter_info_list = parse_chapter_list(index_html)

    # The context manager closes (and flushes) the file even if a download
    # fails part-way through the book.
    with open('%s.txt' % safe_filename(title), 'w', encoding='utf-8') as fb:
        for chapter_path, chapter_title in chapter_info_list:
            # Chapter hrefs are appended to the site root, as in the
            # original script; they are assumed to be site-relative paths.
            chapter_html = fetch_html('%s%s' % (BASE_URL, chapter_path))
            chapter_content = extract_chapter_content(chapter_html)
            fb.write(format_chapter(chapter_title, chapter_content))
            print('%s 已經下載完畢' % chapter_title)


if __name__ == '__main__':
    main()
相關文章
相關標籤/搜索