1. 主題
爬取小說網站的《全職高手》小說第一章
2. 代碼
導入包
import random
import re

import jieba
import matplotlib.pyplot as plt
import numpy as np
import requests
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS
取出所需要的標題和正文函數
# Download both pages of the first chapter and split the body text into lines.
# NOTE: `header` is not defined in this section; indices 0-4 of it are used,
# so it is expected to be an indexable collection of request-header dicts
# with at least five entries — confirm against its definition.
req1 = requests.get('http://www.biqukan.cc/book/11/10358.html', headers=header[random.randint(0, 4)])  # send a GET request to the target site
req2 = requests.get('http://www.biqukan.cc/book/11/10358_2.html', headers=header[random.randint(0, 4)])

# The raw response bytes are decoded as GBK.
result1 = req1.content
result1 = result1.decode('gbk')
result2 = req2.content
result2 = result2.decode('gbk')

title_re = re.compile(r' <li class="active">(.*?)</li>')  # captures the chapter title
text_re = re.compile(r'<br><br>([\s\S]*?)</div>')  # captures the chapter body

title = re.findall(title_re, result1)  # title candidates
text1 = re.findall(text_re, result1)  # body of the first page
text2 = re.findall(text_re, result2)  # body of the second page

# Only the first match is used; an IndexError here means the pattern did not
# match the downloaded page.
title = title[0]
print(title)

# Merge both pages, then re-split on CRLF so that `text1` ends up as a flat
# list of individual lines.
text1.append(text2[0])
text1 = '\r\n'.join(text1)
text1 = text1.split('\r\n')

# Receives the cleaned lines produced by the sentence-cleaning loop.
text_1 = []
定義一個獲取全部章節 URL 的函數
def get_url(url):
    """Return the chapter URLs linked from a chapter index page.

    Args:
        url: Address of the index page. It is fetched with a GET request and
            the response body is decoded as GBK.

    Returns:
        A list with one entry per matched ``<dd class="col-md-3">`` link, each
        built by prefixing the link's href with
        ``'http://www.biqukan.cc/book/11/'``.
    """
    # NOTE: `header` is not defined in this function; indices 0-4 of it are
    # used, so it is expected to be an indexable collection of request-header
    # dicts with at least five entries — confirm against its definition.
    req = requests.get(url, headers=header[random.randint(0, 4)])
    result = req.content.decode('gbk')

    res = r'<dd class="col-md-3"><a href=(.*?) title='
    base_url = 'http://www.biqukan.cc/book/11/'

    # The captured href still carries its surrounding quote characters.
    # Testing for '"''"' (adjacent literals, i.e. two consecutive double
    # quotes) never matches a normally quoted href, so every link would be
    # dropped; strip either quote style from both ends instead.
    return [base_url + href.strip('"\'') for href in re.findall(res, result)]
去掉句子中多餘的部分
# Clean every raw line in `text1` and collect the result in `text_1`.
# The removals are applied independently and in this order on purpose:
# the page-break marker and '<br />' both contain a space, so they must be
# removed before spaces are stripped, otherwise they can no longer match.
# NOTE(review): the marker text is Traditional Chinese while the pages are
# decoded as GBK elsewhere in this file — confirm it matches the site markup.
# NOTE(review): the ' ' literal may originally have been an HTML entity such
# as '&nbsp;' — confirm against the page source.
for sentence in text1:
    sentence = sentence.strip()
    sentence = sentence.replace('-->><p class="text-danger text-center mg0">本章未完,點擊下一頁繼續閱讀</p>', '')
    sentence = sentence.replace('<br />', '')
    sentence = sentence.replace(' ', '')
    text_1.append(sentence)
將數據放入 txt 文本文件
# Scrape every chapter listed on the index page while the output file is open.
# The `with` block closes the file even if a request or parse step raises.
# NOTE(review): `get_txt` is not defined in this section; presumably it writes
# the chapter text to the module-level `fo` handle — confirm.
with open("qzgs.txt", "wb") as fo:
    for url_txt in get_url('http://www.biqukan.cc/book/11/'):
        get_txt(url_txt)
讀取要生成詞雲的文件和生成形狀的圖片
# Read the scraped text (decoded as GBK); the context manager closes the file
# handle instead of leaving it to the garbage collector.
with open('qzgs.txt', encoding='gbk') as text_file:
    text_from_file_with_apath = text_file.read()

# Load the picture as an array; it is later passed to WordCloud as `mask`.
abel_mask = np.array(Image.open("qzgs.jpg"))
進行分隔
# Segment the text with jieba (cut_all=True) and join the resulting tokens
# with single spaces, the form handed to WordCloud.generate below.
wordlist_after_jieba = jieba.cut(text_from_file_with_apath, cut_all=True)
wl_space_split = " ".join(wordlist_after_jieba)
設置詞雲生成圖片的樣式
# Configure the word cloud and render it from the space-separated tokens.
wordcloud = WordCloud(
    background_color='white',
    mask=abel_mask,  # array loaded from qzgs.jpg
    max_words=80,
    max_font_size=150,
    random_state=30,
    scale=.5,  # the comma after this argument was missing (SyntaxError)
    stopwords={'nbsp', 'br'},  # leftover HTML fragments to ignore
    font_path='C:/Users/Windows/fonts/simkai.ttf',
).generate(wl_space_split)

# Colour generator built from the same mask image; it is not used anywhere in
# the visible part of this script.
image_colors = ImageColorGenerator(abel_mask)
顯示詞雲生成的圖片
# Display the generated word cloud without axes. The object is bound to
# `wordcloud` where it is created; `my_wordcloud` is never defined in the
# visible script.
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
3.數據截圖
4.遇到的問題及解決方法
詞雲一直安裝失敗
解決方法:去百度上下載了詞雲的安裝包,然後再安裝,才安裝成功
5.總結
使用了 Python 後發現 Python 的用途很廣,不少地方都需要用到,是一門要好好學的語言