爬蟲大做業

import requests
import re
from bs4 import BeautifulSoup
import json
import urllib
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
import xlwt
import jieba.analyse
from PIL import Image,ImageSequence

url='https://juejin.im/search?query=前端'
res = requests.get(url)
res.encoding = "utf-8"
soup = BeautifulSoup(res.text,"html.parser")


ajaxUrlBegin='https://search-merger-ms.juejin.im/v1/search?query=%E5%89%8D%E7%AB%AF&page='
ajaxUrlLast='&raw_result=false&src=web'
for i in range(0,25):
    ajaxUrl=ajaxUrlBegin+str(i)+ajaxUrlLast;

response=urllib.request.urlopen(ajaxUrl)
ajaxres=response.read().decode('utf-8')
json_str = json.dumps(ajaxres) 
strdata = json.loads(json_str)  
data=eval(strdata) #str轉換爲dict

for i in range(0,25):
    ajaxUrl = ajaxUrlBegin + str(i) + ajaxUrlLast;
    for i in range(0,19):
        result=[]
        result=data['d'][i]['title']
        print(result+'\n')
        f = open('finally.txt', 'a', encoding='utf-8')
        f.write(result)
        f.close()

f = open('finally.txt', 'r', encoding='utf-8')
str = f.read()
stringList = list(jieba.cut(str))
symbol = {"/", "(", ")", " ", "", "", "", "","+","?"," ","","","","","","","","","",""}
stringSet = set(stringList) - symbol
title_dict = {}
for i in stringSet:
    title_dict[i] = stringList.count(i)
print(title_dict)

di = title_dict
wbk = xlwt.Workbook(encoding='utf-8')
sheet = wbk.add_sheet("wordCount")  
k = 0
for i in di.items():
    sheet.write(k, 0, label=i[0])
    sheet.write(k, 1, label=i[1])
    k = k + 1
wbk.save('前端數據.xls')  

font = r'C:\Windows\Fonts\simhei.ttf'
content = ' '.join(title_dict.keys())
image = np.array(Image.open('test.jpg'))
wordcloud = WordCloud(background_color='white', font_path=font, mask=image, width=1000, height=860, margin=2).generate(content)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
wordcloud.to_file('c-cool.jpg')

生成詞雲html

 

相關文章
相關標籤/搜索
本站公眾號
   歡迎關注本站公眾號,獲取更多信息