#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# __Author__: yunrui
# __Date__: 2019/1/18

"""Scrape the novel 《凡人修仙傳》 from jingcaiyuedu.com into one text file.

The script downloads the book's chapter index page, then fetches every
chapter page in index order and appends its cleaned-up text to
``<book title>.txt`` (UTF-8) in the current working directory.
"""

import re
import sys

import requests

headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
    ),
}
url = 'http://www.jingcaiyuedu.com/book/20278/list.html'

# Chapter links in the index page are site-relative; this prefix makes them absolute.
_SITE_ROOT = "http://www.jingcaiyuedu.com"

# requests never times out by default; (connect, read) seconds keep one
# stalled connection from hanging the whole run.
_REQUEST_TIMEOUT = (10, 30)

# Compiled once because the content pattern is applied to every chapter page.
_TITLE_RE = re.compile(r'<a href="/book/20278.html">(.*?)</a>')
_CHAPTER_LINK_RE = re.compile(
    r'<dd class="col-md-3"><a href="(.*?)">(.*?)<', re.S)
# No re.S here: "." does not cross "\n", so a match must sit on one line.
_CONTENT_RE = re.compile(r'<div class="panel-body" id="htmlCont(.*?)</div>')

# Fragments removed from the captured chapter body, applied in this order.
# 'ent">' is presumably the tail of the id attribute left over because
# _CONTENT_RE starts capturing right after "htmlCont" -- verify against the
# live page before tightening the pattern.
# NOTE(review): the ' ' entry strips every space; it may originally have been
# a non-breaking space / "&nbsp;" -- confirm.
_STRIP_FRAGMENTS = ('<br>', '</br>', ' ', '<br />', 'ent">', '<br/>')

_CHAPTER_SEPARATOR = '********************************************'

# Characters that are path separators or illegal in common filesystems.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def _fetch_html(session, page_url):
    """Return the body of *page_url* decoded as UTF-8.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status.
    """
    response = session.get(url=page_url, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    # Force UTF-8 instead of relying on the charset requests would infer.
    response.encoding = 'utf-8'
    return response.text


def _clean_chapter_text(raw):
    """Strip the markup fragments from *raw* and turn "\\r" into "\\n\\t"."""
    for fragment in _STRIP_FRAGMENTS:
        raw = raw.replace(fragment, '')
    # Each carriage return becomes a line break followed by a tab indent.
    return raw.replace('\r', '\n\t')


def _safe_filename(name):
    """Replace characters that cannot appear in a file name with "_"."""
    return _UNSAFE_FILENAME_CHARS.sub('_', name)


def main():
    """Download every chapter listed on the index page into ``<title>.txt``.

    A chapter that cannot be fetched or parsed is reported on stderr and
    skipped, so one bad page does not abort the remaining downloads.

    Raises:
        requests.RequestException: if the index page cannot be fetched.
        SystemExit: if the book title cannot be found in the index page.
    """
    # A single session reuses the TCP connection across all chapter requests.
    with requests.Session() as session:
        session.headers.update(headers)

        index_html = _fetch_html(session, url)
        title_match = _TITLE_RE.search(index_html)
        if title_match is None:
            raise SystemExit('book title not found in index page: %s' % url)
        title = title_match.group(1)

        chapter_info_list = _CHAPTER_LINK_RE.findall(index_html)

        # The context manager flushes and closes the file even when a later
        # chapter raises, so already-downloaded text is never lost.
        out_path = '%s.txt' % _safe_filename(title)
        with open(out_path, 'w', encoding='utf-8') as book:
            for chapter_path, chapter_title in chapter_info_list:
                chapter_url = "%s%s" % (_SITE_ROOT, chapter_path)
                try:
                    chapter_html = _fetch_html(session, chapter_url)
                except requests.RequestException as exc:
                    print('skipped %s (%s): %s'
                          % (chapter_title, chapter_url, exc),
                          file=sys.stderr)
                    continue

                content_match = _CONTENT_RE.search(chapter_html)
                if content_match is None:
                    print('skipped %s (%s): chapter body not found'
                          % (chapter_title, chapter_url),
                          file=sys.stderr)
                    continue
                chapter_content = _clean_chapter_text(content_match.group(1))

                print('%s 已經下載完畢' % chapter_title)

                # One batched write per chapter: bracketed title, body,
                # then a separator line.
                book.write(''.join((
                    '【', chapter_title, '】', '\n',
                    chapter_content, '\n',
                    _CHAPTER_SEPARATOR, '\n',
                )))


if __name__ == "__main__":
    main()