貼一段Lyrichu的代碼:
(運行環境爲 Python 2.7)
# -*- coding: utf-8 -*-
# @Time : 2017/3/28 8:46
# @Author : Lyrichu
# @Email : 919987476@qq.com
# @File : NetCloud_spider3.py
'''
@Description:
網易雲音樂評論爬蟲,可以完整爬取整個評論
部分參考了@平胸小仙女的文章(地址:https://www.zhihu.com/question/36081767)
post加密部分也給出了,可以參考原帖:
作者:平胸小仙女
鏈接:https://www.zhihu.com/question/36081767/answer/140287795
來源:知乎
'''
from Crypto.Cipher import AES
import base64
import requests
import json
import codecs
import time
# HTTP request headers sent with every POST issued by get_json().
# NOTE(review): the Cookie below is a hard-coded browser session captured by the
# original author; it presumably expires and probably should not live in source
# control -- confirm before reuse.
headers = {
'Host':"music.163.com",
'Accept-Language':"zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
'Accept-Encoding':"gzip, deflate",
'Content-Type':"application/x-www-form-urlencoded",
'Cookie':"_ntes_nnid=754361b04b121e078dee797cdb30e0fd,1486026808627; _ntes_nuid=754361b04b121e078dee797cdb30e0fd; JSESSIONID-WYYY=yfqt9ofhY%5CIYNkXW71TqY5OtSZyjE%2FoswGgtl4dMv3Oa7%5CQ50T%2FVaee%2FMSsCifHE0TGtRMYhSPpr20i%5CRO%2BO%2B9pbbJnrUvGzkibhNqw3Tlgn%5Coil%2FrW7zFZZWSA3K9gD77MPSVH6fnv5hIT8ms70MNB3CxK5r3ecj3tFMlWFbFOZmGw%5C%3A1490677541180; _iuqxldmzr_=32; vjuids=c8ca7976.15a029d006a.0.51373751e63af8; vjlast=1486102528.1490172479.21; __gads=ID=a9eed5e3cae4d252:T=1486102537:S=ALNI_Mb5XX2vlkjsiU5cIy91-ToUDoFxIw; vinfo_n_f_l_n3=411a2def7f75a62e.1.1.1486349441669.1486349607905.1490173828142; P_INFO=m15527594439@163.com|1489375076|1|study|00&99|null&null&null#hub&420100#10#0#0|155439&1|study_client|15527594439@163.com; NTES_CMT_USER_INFO=84794134%7Cm155****4439%7Chttps%3A%2F%2Fsimg.ws.126.net%2Fe%2Fimg5.cache.netease.com%2Ftie%2Fimages%2Fyun%2Fphoto_default_62.png.39x39.100.jpg%7Cfalse%7CbTE1NTI3NTk0NDM5QDE2My5jb20%3D; usertrack=c+5+hljHgU0T1FDmA66MAg==; Province=027; City=027; _ga=GA1.2.1549851014.1489469781; __utma=94650624.1549851014.1489469781.1490664577.1490672820.8; __utmc=94650624; __utmz=94650624.1490661822.6.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; playerid=81568911; __utmb=94650624.23.10.1490672820",
'Connection':"keep-alive",
'Referer':'http://music.163.com/'
}
# Proxy servers handed to requests.post() in get_json().
# requests selects a proxy by URL scheme, so the keys must be exactly 'http'
# and 'https'. The previous keys carried a trailing colon ('http:' / 'https:'),
# never matched any request, and the proxies were silently ignored.
# NOTE(review): the proxy URLs carry no explicit port -- confirm they are
# still reachable before relying on them.
proxies = {
    'http': 'http://121.232.146.184',
    'https': 'https://144.255.48.197',
}
# offset is (comment page - 1) * 20; total is "true" for the first page and "false" for every other page
# first_param = '{rid:"", offset:"0", total:"true", limit:"20", csrf_token:""}' # first parameter (built per page inside get_params)
second_param = "010001" # second parameter (not referenced by the code visible in this file)
# third parameter (not referenced by the code visible in this file)
third_param = "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7"
# fourth parameter; get_params() uses it as the key of the first AES pass
forth_param = "0CoJUm6Qyw8W8jud"
# Build the encrypted "params" form field for one comment page
def get_params(page):  # page is the 1-based comment page number
    """Return the twice-encrypted request payload for ``page``.

    The plain-text payload carries the paging offset ((page - 1) * 20) and a
    ``total`` flag that is "true" only for the first page. It is passed through
    AES_encrypt twice: first keyed with ``forth_param``, then with a key of
    sixteen 'F' characters, both times with the same IV.
    """
    iv = "0102030405060708"
    first_key = forth_param
    second_key = 'F' * 16
    on_first_page = (page == 1)
    offset = "0" if on_first_page else str((page - 1) * 20)
    total = "true" if on_first_page else 'false'
    first_param = '{rid:"", offset:"%s", total:"%s", limit:"20", csrf_token:""}' % (offset, total)
    inner = AES_encrypt(first_param, first_key, iv)
    return AES_encrypt(inner, second_key, iv)
# Provide the encSecKey form field
def get_encSecKey():
    """Return the constant ``encSecKey`` string posted together with ``params``."""
    return "257348aecb5e556c066de214e531faadd1c55d814f9be95fd06d6bff9f4c7a41f831f6394d5a3fd2e3881736d94a02ca919d952872e7d0a50ebfa1769a7a62d512f5f1ca21aec60bc3819a9c3ffca5eca9a0dba6d6f7249b06f5965ecfff3695b54e1c28f3f624750ed39e7de08fc8493242e26dbc4484a01c76f739e135637c"
# AES encryption step (this routine encrypts; it does not decrypt)
def AES_encrypt(text, key, iv):
    """Encrypt ``text`` with AES in CBC mode and return it base64-encoded.

    The text is first extended to a multiple of 16 characters by appending
    ``pad`` copies of ``chr(pad)``, where ``pad`` is the number of characters
    added (a full block of 16 when the length is already aligned).
    """
    pad = 16 - len(text) % 16
    padded = text + chr(pad) * pad
    cipher = AES.new(key, AES.MODE_CBC, iv)
    return base64.b64encode(cipher.encrypt(padded))
# Fetch the raw comment JSON
def get_json(url, params, encSecKey):
    """POST the encrypted form to ``url`` and return the raw response body.

    The request uses the module-level ``headers`` and ``proxies``.
    """
    form = {"params": params, "encSecKey": encSecKey}
    reply = requests.post(url, headers=headers, data=form, proxies=proxies)
    return reply.content
# Fetch the hot comments and return them as a list of text lines
def get_hot_comments(url):
    """Return the hot comments of the song at ``url`` as unicode lines.

    The first element is a header line; each following element describes one
    comment as "user id, nickname, avatar URL, timestamp, like count, content"
    separated by single spaces and terminated by a newline.

    Fixes over the previous version, aligned with get_all_comments():
    * the user id is read from ``'userId'`` (the key get_all_comments() uses)
      instead of ``'userID'``;
    * numeric fields are converted with unicode() before concatenation, which
      previously raised TypeError when mixing numbers and strings.
    """
    hot_comments_list = [u"用戶ID 用戶暱稱 用戶頭像地址 評論時間 點贊總數 評論內容\n"]
    params = get_params(1)  # hot comments are requested with the first page
    encSecKey = get_encSecKey()
    json_dict = json.loads(get_json(url, params, encSecKey))
    hot_comments = json_dict['hotComments']
    print("共有%d條熱門評論!" % len(hot_comments))
    for item in hot_comments:
        user = item['user']
        comment_info = (
            unicode(user['userId']) + u" " + user['nickname'] + u" " +
            user['avatarUrl'] + u" " + unicode(item['time']) + u" " +
            unicode(item['likedCount']) + u" " + item['content'] + u"\n"
        )
        hot_comments_list.append(comment_info)
    return hot_comments_list
# Fetch every comment of one song
def get_all_comments(url):
    """Return all comments of the song at ``url`` as a list of unicode lines.

    The first element is a header line; each following element describes one
    comment as "user id, nickname, avatar URL, timestamp, like count, content".

    The first page is requested once: its ``total`` field gives the number of
    comments (20 per page) and the same response is then reused for page 1,
    instead of downloading that page a second time. The page count uses
    ceiling integer division so it does not depend on how ``/`` behaves.
    """
    all_comments_list = [u"用戶ID 用戶暱稱 用戶頭像地址 評論時間 點贊總數 評論內容\n"]  # header line
    encSecKey = get_encSecKey()
    json_dict = json.loads(get_json(url, get_params(1), encSecKey))
    comments_num = int(json_dict['total'])
    page = (comments_num + 19) // 20  # ceil(comments_num / 20)
    print("共有%d頁評論!" % page)
    for i in range(page):  # fetch page by page
        if i == 0:
            # Page 1 is already in json_dict from the request above.
            print("共有%d條評論!" % comments_num)  # total number of comments
        else:
            json_dict = json.loads(get_json(url, get_params(i + 1), encSecKey))
        for item in json_dict['comments']:
            user = item['user']
            comment_info = (
                unicode(user['userId']) + u" " + user['nickname'] + u" " +
                user['avatarUrl'] + u" " + unicode(item['time']) + u" " +
                unicode(item['likedCount']) + u" " + item['content'] + u"\n"
            )
            all_comments_list.append(comment_info)
        print("第%d頁抓取完畢!" % (i + 1))
    return all_comments_list
# Append the comment lines to a text file
def save_to_file(list, filename):
    """Append every string in ``list`` to ``filename`` encoded as UTF-8.

    The file is opened in append mode, so repeated calls accumulate output.
    """
    with codecs.open(filename, 'a', encoding='utf-8') as out:
        out.writelines(list)
    print("寫入文件成功!")
if __name__ == "__main__":
    # Scrape every comment of one hard-coded song, store it, and report the elapsed time.
    start_time = time.time()  # start of the run
    url = "http://music.163.com/weapi/v1/resource/comments/R_SO_4_453185824?csrf_token="
    filename = u"On_My_Way.txt"
    save_to_file(get_all_comments(url), filename)
    elapsed = time.time() - start_time
    print("程序耗時%f秒." % elapsed)
其中AES需要安裝pycrypto庫,若安裝時報錯,點擊more,可以找到需要安裝的C類庫(直接複製相應的網址,下載並安裝即可,好像是VCForPython27.msi)
結果如下:
代碼文件:
#!/usr/bin/env python
# encoding=utf-8
from __future__ import print_function
import os
import requests
import re
import time
import xml.dom.minidom
import json
import sys
import math
import subprocess
import ssl
import threading
import urllib, urllib2
# Module-level configuration and session state shared by the functions below.
DEBUG = False  # when True, responseState() always prints and raw responses are dumped to JSON files
MAX_GROUP_NUM = 2 # members per group
INTERFACE_CALLING_INTERVAL = 5 # seconds between API calls; too short an interval tends to trigger "operation too frequent" and a block of roughly half an hour
MAX_PROGRESS_LEN = 50
QRImagePath = os.path.join(os.getcwd(), 'qrcode.jpg')  # where showQRImage() stores the login QR code
tip = 0  # sent as the "tip" query parameter by waitForLogin(); showQRImage() sets it to 1
uuid = ''  # filled in by getUUID()
base_uri = ''  # the next three are filled in by waitForLogin()
redirect_uri = ''
push_uri = ''
skey = ''  # the next four are filled in by login()
wxsid = ''
wxuin = ''
pass_ticket = ''
deviceId = 'e000000000000000'
BaseRequest = {}  # rebuilt by login() from the values above
ContactList = []  # the next three are replaced by webwxinit() with values from the server response
My = []
SyncKey = []
# On Python 2 make ``range`` lazy by aliasing it to ``xrange``.
# Only NameError (raised when ``xrange`` does not exist, i.e. Python 3) is
# expected here, so catch exactly that instead of using a bare ``except``.
try:
    range = xrange
except NameError:
    # python 3: the built-in range is already lazy
    pass
def responseState(func, BaseResponse):
    """Report whether a server ``BaseResponse`` signals success.

    Returns True when ``BaseResponse['Ret']`` is 0 and False otherwise. The
    return code and error message are printed, labelled with ``func``, when
    the call failed or when DEBUG is enabled.
    """
    ErrMsg, Ret = BaseResponse['ErrMsg'], BaseResponse['Ret']
    failed = Ret != 0
    if DEBUG or failed:
        print('func: %s, Ret: %d, ErrMsg: %s' % (func, Ret, ErrMsg))
    return not failed
def getUUID():
    """Request a login uuid and store it in the module-level ``uuid``.

    Returns True when the response reports code 200, False otherwise. A
    response that does not have the expected shape now also yields False
    (and leaves ``uuid`` untouched) instead of raising AttributeError on the
    failed regex match.
    """
    global uuid
    url = 'https://login.weixin.qq.com/jslogin'
    params = {
        'appid': 'wx782c26e4c19acffb',
        'fun': 'new',
        'lang': 'zh_CN',
        '_': int(time.time()),
    }
    r = myRequests.get(url=url, params=params)
    r.encoding = 'utf-8'
    data = r.text
    # Expected body:
    # window.QRLogin.code = 200; window.QRLogin.uuid = "oZwt_bFfRg==";
    regx = r'window.QRLogin.code = (\d+); window.QRLogin.uuid = "(\S+?)"'
    pm = re.search(regx, data)
    if pm is None:
        return False
    code = pm.group(1)
    uuid = pm.group(2)
    return code == '200'
def showQRImage():
    """Download the login QR code, save it to ``QRImagePath`` and open it.

    Sets the module-level ``tip`` to 1. The image is opened with ``open`` on
    macOS and with ``os.startfile`` where that function exists (Windows). On
    other platforms, where ``os.startfile`` is missing and the call used to
    raise AttributeError, it falls back to ``xdg-open``.
    """
    global tip
    url = 'https://login.weixin.qq.com/qrcode/' + uuid
    params = {
        't': 'webwx',
        '_': int(time.time()),
    }
    r = myRequests.get(url=url, params=params)
    tip = 1
    # The context manager closes the file even if the write fails.
    with open(QRImagePath, 'wb') as f:
        f.write(r.content)
    time.sleep(1)
    if sys.platform.find('darwin') >= 0:
        subprocess.call(['open', QRImagePath])
    elif hasattr(os, 'startfile'):
        os.startfile(QRImagePath)
    else:
        subprocess.call(['xdg-open', QRImagePath])
    print('請使用微信掃描二維碼以登陸')
def waitForLogin():
    """Poll the login endpoint once and return the status code as a string.

    * '201' (QR code scanned): prints a hint and resets ``tip`` to 0.
    * '200' (logged in): stores ``redirect_uri``, ``base_uri`` and the
      matching ``push_uri`` in the module globals; on macOS it also asks
      Preview to quit so the QR image window closes.
    * any other code (for example '408', timeout): returned unchanged.
    """
    global tip, base_uri, redirect_uri, push_uri
    url = 'https://login.weixin.qq.com/cgi-bin/mmwebwx-bin/login?tip=%s&uuid=%s&_=%s' % (
        tip, uuid, int(time.time()))
    r = myRequests.get(url=url)
    r.encoding = 'utf-8'
    data = r.text
    # Expected body fragment: window.code=500;
    code = re.search(r'window.code=(\d+);', data).group(1)

    if code == '201':  # scanned
        print('成功掃描,請在手機上點擊確認以登陸')
        tip = 0
        return code
    if code != '200':  # '408' (timeout) and everything else: nothing to do
        return code

    # logged in
    print('正在登陸...')
    matched = re.search(r'window.redirect_uri="(\S+?)";', data)
    redirect_uri = matched.group(1) + '&fun=new'
    base_uri = redirect_uri[:redirect_uri.rfind('/')]
    # Mapping from base_uri host to push host; order matters because the
    # first entry whose search string occurs in base_uri wins.
    services = [
        ('wx2.qq.com', 'webpush2.weixin.qq.com'),
        ('qq.com', 'webpush.weixin.qq.com'),
        ('web1.wechat.com', 'webpush1.wechat.com'),
        ('web2.wechat.com', 'webpush2.wechat.com'),
        ('wechat.com', 'webpush.wechat.com'),
        ('web1.wechatapp.com', 'webpush1.wechatapp.com'),
    ]
    push_uri = base_uri
    for searchUrl, pushUrl in services:
        if searchUrl in base_uri:
            push_uri = 'https://%s/cgi-bin/mmwebwx-bin' % pushUrl
            break
    # close the QR image viewer
    if 'darwin' in sys.platform:  # for OSX with Preview
        os.system("osascript -e 'quit app \"Preview\"'")
    return code
def login():
    """Fetch ``redirect_uri`` and extract the session credentials from its XML.

    Stores ``skey``, ``wxsid``, ``wxuin`` and ``pass_ticket`` in the module
    globals. Returns False when any of them is still empty; otherwise builds
    ``BaseRequest`` from them and returns True.
    """
    global skey, wxsid, wxuin, pass_ticket, BaseRequest
    r = myRequests.get(url=redirect_uri)
    r.encoding = 'utf-8'
    document = xml.dom.minidom.parseString(r.text)
    for node in document.documentElement.childNodes:
        tag = node.nodeName
        if tag == 'skey':
            skey = node.childNodes[0].data
        elif tag == 'wxsid':
            wxsid = node.childNodes[0].data
        elif tag == 'wxuin':
            wxuin = node.childNodes[0].data
        elif tag == 'pass_ticket':
            pass_ticket = node.childNodes[0].data
    if not (skey and wxsid and wxuin and pass_ticket):
        return False
    BaseRequest = {
        'Uin': int(wxuin),
        'Sid': wxsid,
        'Skey': skey,
        'DeviceID': deviceId,
    }
    return True
def webwxinit():
    """Initialise the web session and cache the data returned by the server.

    Posts ``BaseRequest`` to ``/webwxinit`` and stores ``ContactList``,
    ``User`` (as ``My``) and ``SyncKey`` from the JSON reply in the module
    globals. Returns the result of responseState() for the reply.

    When DEBUG is set the raw reply is dumped to ``webwxinit.json``; the dump
    uses a context manager so the file is closed even if the write fails.
    """
    global ContactList, My, SyncKey
    url = (base_uri +
           '/webwxinit?pass_ticket=%s&skey=%s&r=%s' % (
               pass_ticket, skey, int(time.time())))
    params = {'BaseRequest': BaseRequest}
    headers = {'content-type': 'application/json; charset=UTF-8'}
    r = myRequests.post(url=url, data=json.dumps(params), headers=headers)
    r.encoding = 'utf-8'
    data = r.json()
    if DEBUG:
        with open(os.path.join(os.getcwd(), 'webwxinit.json'), 'wb') as f:
            f.write(r.content)
    ContactList = data['ContactList']
    My = data['User']
    SyncKey = data['SyncKey']
    return responseState('webwxinit', data['BaseResponse'])
def webwxgetcontact():
    """Download the contact list and return only ordinary friends.

    Entries are dropped when they are official/service accounts
    (``VerifyFlag & 8``), built-in special accounts, group chats (user name
    containing '@@') or the logged-in user (``My['UserName']``).

    The list is filtered in a single pass instead of calling ``list.remove``
    once per dropped entry (which rescans the list every time), and the DEBUG
    dump uses a context manager so the file is always closed.
    """
    url = (base_uri +
           '/webwxgetcontact?pass_ticket=%s&skey=%s&r=%s' % (
               pass_ticket, skey, int(time.time())))
    headers = {'content-type': 'application/json; charset=UTF-8'}
    r = myRequests.post(url=url, headers=headers)
    r.encoding = 'utf-8'
    data = r.json()
    if DEBUG:
        with open(os.path.join(os.getcwd(), 'webwxgetcontact.json'), 'wb') as f:
            f.write(r.content)
    MemberList = data['MemberList']
    SpecialUsers = frozenset([
        "newsapp", "fmessage", "filehelper", "weibo", "qqmail", "tmessage", "qmessage", "qqsync",
        "floatbottle", "lbsapp", "shakeapp", "medianote", "qqfriend", "readerapp", "blogapp", "facebookapp",
        "masssendapp",
        "meishiapp", "feedsapp", "voip", "blogappweixin", "weixin", "brandsessionholder", "weixinreminder",
        "wxid_novlwrv3lqwv11", "gh_22b87fa7cb3c", "officialaccounts", "notification_messages", "wxitil",
        "userexperience_alarm",
    ])

    def is_ordinary_friend(Member):
        # Same checks, in the same order, as the former if/elif chain.
        if Member['VerifyFlag'] & 8 != 0:  # official / service account
            return False
        if Member['UserName'] in SpecialUsers:  # special account
            return False
        if Member['UserName'].find('@@') != -1:  # group chat
            return False
        if Member['UserName'] == My['UserName']:  # the user themselves
            return False
        return True

    # Slice assignment keeps the same list object that came from the reply.
    MemberList[:] = [Member for Member in MemberList if is_ordinary_friend(Member)]
    return MemberList
def syncKey():
    """Serialise the module-level ``SyncKey`` as ``Key_Val`` pairs joined by '|'."""
    pairs = []
    for entry in SyncKey['List']:
        pairs.append('%s_%s' % (entry['Key'], entry['Val']))
    return '|'.join(pairs)
def syncCheck():
    """Query ``/synccheck`` on the push host and return the ``selector`` string.

    The reply is expected to look like
    ``window.synccheck={retcode:"0",selector:"2"}``; the retcode is parsed but
    only the selector is returned.
    """
    query = {
        'skey': BaseRequest['Skey'],
        'sid': BaseRequest['Sid'],
        'uin': BaseRequest['Uin'],
        'deviceId': BaseRequest['DeviceID'],
        'synckey': syncKey(),
        'r': int(time.time()),
    }
    r = myRequests.get(url=push_uri + '/synccheck?', params=query)
    r.encoding = 'utf-8'
    pattern = r'window.synccheck={retcode:"(\d+)",selector:"(\d+)"}'
    found = re.search(pattern, r.text)
    retcode = found.group(1)
    selector = found.group(2)
    return selector
def webwxsync():
    """Pull pending updates from ``/webwxsync`` and refresh ``SyncKey``.

    Returns the result of responseState() for the reply.

    The JSON content-type header was previously built but never passed to the
    request; it is now sent, matching how webwxinit() posts its JSON body.
    """
    global SyncKey
    url = base_uri + '/webwxsync?lang=zh_CN&skey=%s&sid=%s&pass_ticket=%s' % (
        BaseRequest['Skey'], BaseRequest['Sid'], urllib.quote_plus(pass_ticket))
    params = {
        'BaseRequest': BaseRequest,
        'SyncKey': SyncKey,
        'rr': ~int(time.time()),
    }
    headers = {'content-type': 'application/json; charset=UTF-8'}
    r = myRequests.post(url=url, data=json.dumps(params), headers=headers)
    r.encoding = 'utf-8'
    data = r.json()
    SyncKey = data['SyncKey']
    return responseState('webwxsync', data['BaseResponse'])
def heartBeatLoop():
    """Poll syncCheck() once per second forever, calling webwxsync() whenever the selector is not '0'."""
    while True:
        if syncCheck() != '0':
            webwxsync()
        time.sleep(1)
def main():
    """Log in to WeChat web via QR code and print one line per friend.

    Steps: create the shared requests session, obtain a uuid, show the QR
    code, wait until the login is confirmed, initialise the session, download
    the contact list, start the heartbeat and print the contacts.

    Fixes over the previous version:
    * the heartbeat thread was created but never started; it is now started
      as a daemon thread so it cannot keep the process alive on exit;
    * the unused locals ``d`` and ``imageIndex`` (left over from a disabled
      avatar download) were removed.
    """
    global myRequests
    # Disable HTTPS certificate verification for the stdlib default context
    # where the interpreter offers that hook.
    if hasattr(ssl, '_create_unverified_context'):
        ssl._create_default_https_context = ssl._create_unverified_context
    headers = {
        'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.125 Safari/537.36'}
    myRequests = requests.Session()
    myRequests.headers.update(headers)
    if not getUUID():
        print('獲取uuid失敗')
        return
    print('正在獲取二維碼圖片...')
    showQRImage()
    while waitForLogin() != '200':
        pass
    os.remove(QRImagePath)
    if not login():
        print('登陸失敗')
        return
    if not webwxinit():
        print('初始化失敗')
        return
    MemberList = webwxgetcontact()

    heartbeat = threading.Thread(target=heartBeatLoop)
    heartbeat.daemon = True
    heartbeat.start()

    MemberCount = len(MemberList)
    print('通信錄共%s位好友' % MemberCount)
    for Member in MemberList:
        # Replace empty fields with a placeholder so every column is visible.
        city = Member['City'] or 'nocity' if Member['City'] == '' else Member['City']
        name = 'noname' if Member['NickName'] == '' else Member['NickName']
        sign = 'nosign' if Member['Signature'] == '' else Member['Signature']
        remark = 'noremark' if Member['RemarkName'] == '' else Member['RemarkName']
        alias = 'noalias' if Member['Alias'] == '' else Member['Alias']
        nick = 'nonick' if Member['NickName'] == '' else Member['NickName']
        print(name, '|||', city, '|||', Member['Sex'], '|||', Member['StarFriend'], '|||', sign,
              '|||', remark, '|||', alias, '|||', nick)
if __name__ == '__main__':
    main()
    print('回車鍵退出...')
    # On Python 2, input() evaluates what is typed, so pressing Enter alone
    # raises SyntaxError; raw_input() is the function that just reads a line.
    # Python 3 has no raw_input, and its input() already behaves that way.
    try:
        wait_for_enter = raw_input
    except NameError:
        wait_for_enter = input
    wait_for_enter()
import json
import os
from urllib.parse import urlencode
import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError
import re
from multiprocessing import Pool
from hashlib import md5
from json.decoder import JSONDecodeError
from spider_basic.config import *
# MongoDB client and database handle used by save_to_mongo().
# MONGO_URL and MONGO_DB are not defined in this file; presumably they come
# from the star import of spider_basic.config -- confirm.
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]
# Fetch one page of search results; offset is the paging offset of the AJAX
# loader and keyword is the search term.
def get_page_index(offset, keyword):
    """Return the body of the search-result request, or None on failure.

    None is returned when the status code is not 200 or when requests raises
    ConnectionError (in which case a short message is printed).
    """
    query = urlencode({
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    })
    url = 'http://www.toutiao.com/search_content/' + '?' + query
    try:
        response = requests.get(url)
        succeeded = response.status_code == 200
        return response.text if succeeded else None
    except ConnectionError:
        print('Error occurred')
        return None
# Download one image
def download_image(url):
    """Fetch ``url`` and hand the body to save_image() when the status is 200.

    Always returns None; a ConnectionError is swallowed.
    """
    print('Downloading', url)
    try:
        response = requests.get(url)
        if response.status_code != 200:
            return None
        save_image(response.content)
    except ConnectionError:
        pass
    return None
# Save one image to disk
def save_image(content):
    """Write ``content`` (bytes) to ``<cwd>/pic/<md5 of content>.jpg``.

    The file name is the MD5 digest of the content, so running the program
    again does not store the same image twice: an existing file is left
    untouched.

    The ``pic`` directory is now created when missing (previously the first
    write failed with FileNotFoundError), and the redundant ``f.close()``
    inside the ``with`` block was dropped.
    """
    file_path = '{0}/{1}/{2}.{3}'.format(os.getcwd(), 'pic', md5(content).hexdigest(), 'jpg')
    print(file_path)
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)
# Parse the search-result JSON (the gallery entries seen in the XHR replies)
def parse_page_index(text):
    """Yield the ``article_url`` of every entry in the search-result JSON.

    Nothing is yielded when ``text`` is empty or None (get_page_index()
    returns None on failure), is not valid JSON, is not a JSON object, or has
    no usable ``data`` list. Entries without an ``article_url`` are skipped
    rather than yielded as None, because callers pass every yielded value
    straight to an HTTP request.
    """
    if not text:
        return
    try:
        data = json.loads(text)
    except JSONDecodeError:
        return
    if not isinstance(data, dict):
        return
    # ``data`` may be missing or null in the reply.
    for item in data.get('data') or []:
        article_url = item.get('article_url') if isinstance(item, dict) else None
        if article_url:
            yield article_url
# Fetch a detail page
def get_page_detail(url):
    """Return the HTML of ``url``, or None on failure.

    None is returned when the status code is not 200 or when requests raises
    ConnectionError (in which case a short message is printed).
    """
    try:
        response = requests.get(url)
        succeeded = response.status_code == 200
        return response.text if succeeded else None
    except ConnectionError:
        print('Error occurred')
        return None
# Parse a detail page
def parse_page_detail(html, url):
    """Extract the title and gallery images of a detail page.

    Returns ``{'title': ..., 'url': ..., 'images': [...]}`` and downloads
    every image as a side effect. Returns None when the page has no usable
    ``var gallery = ...;`` block.

    Added guards:
    * ``html`` may be None or empty (get_page_detail() returns None on
      failure) -- return None instead of crashing in the HTML parser;
    * the non-greedy regex stops at the first ';', so the captured text is
      not always valid JSON -- return None instead of raising;
    * a missing ``sub_images`` list or entries without a ``url`` no longer
      raise or trigger a download of None.
    """
    if not html:
        return None
    soup = BeautifulSoup(html, 'lxml')
    # Title
    result = soup.select('title')
    title = result[0].get_text() if result else ''
    # Image addresses: locate the gallery assignment with a regex
    images_pattern = re.compile('var gallery = (.*?);', re.S)
    result = re.search(images_pattern, html)
    if not result:
        return None
    try:
        data = json.loads(result.group(1))
    except JSONDecodeError:
        return None
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None
    sub_images = data.get('sub_images') or []
    images = [item.get('url') for item in sub_images
              if isinstance(item, dict) and item.get('url')]
    # Download the images
    for image in images:
        download_image(image)
    return {
        'title': title,
        'url': url,
        'images': images,
    }
# Store one result in the database
def save_to_mongo(result):
    """Insert ``result`` into the ``MONGO_TABLE`` collection.

    Returns True when the server acknowledged the write, False otherwise.

    Uses ``insert_one`` instead of ``Collection.insert``, which is deprecated
    in PyMongo 3 and no longer exists in PyMongo 4. The client above is built
    with ``connect=False``, an option that itself needs PyMongo 3 or newer,
    so ``insert_one`` is available.
    """
    outcome = db[MONGO_TABLE].insert_one(result)
    if outcome.acknowledged:
        print('Successfully Saved to Mongo', result)
        return True
    return False
def main(offset):
    """Process one page of search results starting at ``offset``.

    For every gallery URL on the page: fetch the detail page, parse it and
    store the parsed result in MongoDB.

    Failed downloads are now skipped explicitly: get_page_index() and
    get_page_detail() return None on failure, and that None used to be passed
    straight into the parsers.
    """
    text = get_page_index(offset, KEYWORD)
    if not text:
        return
    # The generator yields the URL of every gallery on the result page.
    for url in parse_page_index(text):
        # Fetch the detail page
        html = get_page_detail(url)
        if not html:
            continue
        # Parse the detail page
        result = parse_page_detail(html, url)
        # Save to MongoDB
        if result:
            save_to_mongo(result)
# For a single-process trial run, call main(60) under the guard instead.
# The pool must be created under the ``if __name__ == '__main__':`` guard;
# without it, starting the worker processes produces a pile of
# multiprocessing errors.
if __name__ == '__main__':
    pool = Pool()
    # One offset per group: 20 results per page.
    groups = [20 * group for group in range(GROUP_START, GROUP_END + 1)]
    pool.map(main, groups)
    pool.close()
    pool.join()