A collection of Python spider code (NetEase Cloud Music, WeChat, Taobao, Toutiao)

This post collects some code for crawling NetEase Cloud Music song comments, crawling and analyzing WeChat friend info, crawling Taobao product listings, and crawling Toutiao photo galleries.
Crawling NetEase Cloud Music comments
Open the song page and locate the comment data source in the browser's developer tools (the screenshots are omitted here).
Here is a snippet of Lyrichu's code (it runs under Python 2.7):
# -*- coding: utf-8 -*-
# @Time : 2017/3/28 8:46
# @Author : Lyrichu
# @Email : 919987476@qq.com
# @File : NetCloud_spider3.py
'''
@Description:
NetEase Cloud Music comment spider; it can crawl the complete comment list of a song.
Partly based on the article by @平胸小仙女 (https://www.zhihu.com/question/36081767),
which also derives the encryption of the POST body. See the original answer:
Author: 平胸小仙女
Link: https://www.zhihu.com/question/36081767/answer/140287795
Source: Zhihu
'''
from Crypto.Cipher import AES
import base64
import requests
import json
import codecs
import time

# request headers
headers = {
    'Host': "music.163.com",
    'Accept-Language': "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
    'Accept-Encoding': "gzip, deflate",
    'Content-Type': "application/x-www-form-urlencoded",
    'Cookie': "_ntes_nnid=754361b04b121e078dee797cdb30e0fd,1486026808627; _ntes_nuid=754361b04b121e078dee797cdb30e0fd; JSESSIONID-WYYY=yfqt9ofhY%5CIYNkXW71TqY5OtSZyjE%2FoswGgtl4dMv3Oa7%5CQ50T%2FVaee%2FMSsCifHE0TGtRMYhSPpr20i%5CRO%2BO%2B9pbbJnrUvGzkibhNqw3Tlgn%5Coil%2FrW7zFZZWSA3K9gD77MPSVH6fnv5hIT8ms70MNB3CxK5r3ecj3tFMlWFbFOZmGw%5C%3A1490677541180; _iuqxldmzr_=32; vjuids=c8ca7976.15a029d006a.0.51373751e63af8; vjlast=1486102528.1490172479.21; __gads=ID=a9eed5e3cae4d252:T=1486102537:S=ALNI_Mb5XX2vlkjsiU5cIy91-ToUDoFxIw; vinfo_n_f_l_n3=411a2def7f75a62e.1.1.1486349441669.1486349607905.1490173828142; P_INFO=m15527594439@163.com|1489375076|1|study|00&99|null&null&null#hub&420100#10#0#0|155439&1|study_client|15527594439@163.com; NTES_CMT_USER_INFO=84794134%7Cm155****4439%7Chttps%3A%2F%2Fsimg.ws.126.net%2Fe%2Fimg5.cache.netease.com%2Ftie%2Fimages%2Fyun%2Fphoto_default_62.png.39x39.100.jpg%7Cfalse%7CbTE1NTI3NTk0NDM5QDE2My5jb20%3D; usertrack=c+5+hljHgU0T1FDmA66MAg==; Province=027; City=027; _ga=GA1.2.1549851014.1489469781; __utma=94650624.1549851014.1489469781.1490664577.1490672820.8; __utmc=94650624; __utmz=94650624.1490661822.6.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; playerid=81568911; __utmb=94650624.23.10.1490672820",
    'Connection': "keep-alive",
    'Referer': 'http://music.163.com/'
}
# proxy servers (sample addresses from the original post; replace them with
# working proxies, or drop the proxies argument from the request below)
proxies = {
    'http': 'http://121.232.146.184',
    'https': 'https://144.255.48.197'
}

# offset = (page number - 1) * 20; total is "true" on the first page and "false" on the rest
# first_param = '{rid:"", offset:"0", total:"true", limit:"20", csrf_token:""}' # first parameter
second_param = "010001"  # second parameter
# third parameter
third_param = "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7"
# fourth parameter
forth_param = "0CoJUm6Qyw8W8jud"

# build the encrypted request parameters; page is the page number to fetch
def get_params(page):
    iv = "0102030405060708"
    first_key = forth_param
    second_key = 16 * 'F'
    if page == 1:  # first page
        first_param = '{rid:"", offset:"0", total:"true", limit:"20", csrf_token:""}'
        h_encText = AES_encrypt(first_param, first_key, iv)
    else:
        offset = str((page - 1) * 20)
        first_param = '{rid:"", offset:"%s", total:"%s", limit:"20", csrf_token:""}' % (offset, 'false')
        h_encText = AES_encrypt(first_param, first_key, iv)
    h_encText = AES_encrypt(h_encText, second_key, iv)
    return h_encText

# get encSecKey; because second_key is fixed to 16 * 'F', the RSA-encrypted
# value can be precomputed once and reused for every request
def get_encSecKey():
    encSecKey = "257348aecb5e556c066de214e531faadd1c55d814f9be95fd06d6bff9f4c7a41f831f6394d5a3fd2e3881736d94a02ca919d952872e7d0a50ebfa1769a7a62d512f5f1ca21aec60bc3819a9c3ffca5eca9a0dba6d6f7249b06f5965ecfff3695b54e1c28f3f624750ed39e7de08fc8493242e26dbc4484a01c76f739e135637c"
    return encSecKey


# AES-128-CBC encryption with PKCS7-style padding, base64-encoded
def AES_encrypt(text, key, iv):
    pad = 16 - len(text) % 16
    text = text + pad * chr(pad)
    encryptor = AES.new(key, AES.MODE_CBC, iv)
    encrypt_text = encryptor.encrypt(text)
    encrypt_text = base64.b64encode(encrypt_text)
    return encrypt_text

# POST the encrypted form and return the raw comment JSON
def get_json(url, params, encSecKey):
    data = {
        "params": params,
        "encSecKey": encSecKey
    }
    response = requests.post(url, headers=headers, data=data, proxies=proxies)
    return response.content

# crawl the hot comments; returns them as a list
def get_hot_comments(url):
    hot_comments_list = []
    hot_comments_list.append(u"UserID Nickname AvatarURL Time Likes Comment\n")
    params = get_params(1)  # first page
    encSecKey = get_encSecKey()
    json_text = get_json(url, params, encSecKey)
    json_dict = json.loads(json_text)
    hot_comments = json_dict['hotComments']  # hot comments
    print("Got %d hot comments!" % len(hot_comments))
    for item in hot_comments:
        comment = item['content']  # comment text
        likedCount = item['likedCount']  # number of likes
        comment_time = item['time']  # comment time (timestamp)
        userID = item['user']['userId']  # commenter id (the API field is 'userId', not 'userID')
        nickname = item['user']['nickname']  # nickname
        avatarUrl = item['user']['avatarUrl']  # avatar URL
        comment_info = unicode(userID) + u" " + nickname + u" " + avatarUrl + u" " + unicode(comment_time) + u" " + unicode(likedCount) + u" " + comment + u"\n"
        hot_comments_list.append(comment_info)
    return hot_comments_list

# crawl every comment of a song
def get_all_comments(url):
    all_comments_list = []  # holds all comments
    all_comments_list.append(u"UserID Nickname AvatarURL Time Likes Comment\n")  # header line
    params = get_params(1)
    encSecKey = get_encSecKey()
    json_text = get_json(url, params, encSecKey)
    json_dict = json.loads(json_text)
    comments_num = int(json_dict['total'])
    if comments_num % 20 == 0:
        page = comments_num / 20
    else:
        page = int(comments_num / 20) + 1
    print("%d pages of comments in total!" % page)
    for i in range(page):  # crawl page by page
        params = get_params(i + 1)
        encSecKey = get_encSecKey()
        json_text = get_json(url, params, encSecKey)
        json_dict = json.loads(json_text)
        if i == 0:
            print("%d comments in total!" % comments_num)  # total number of comments
        for item in json_dict['comments']:
            comment = item['content']  # comment text
            likedCount = item['likedCount']  # number of likes
            comment_time = item['time']  # comment time (timestamp)
            userID = item['user']['userId']  # commenter id
            nickname = item['user']['nickname']  # nickname
            avatarUrl = item['user']['avatarUrl']  # avatar URL
            comment_info = unicode(userID) + u" " + nickname + u" " + avatarUrl + u" " + unicode(comment_time) + u" " + unicode(likedCount) + u" " + comment + u"\n"
            all_comments_list.append(comment_info)
        print("Page %d done!" % (i + 1))
    return all_comments_list


# write the comments to a text file
def save_to_file(list, filename):
    with codecs.open(filename, 'a', encoding='utf-8') as f:
        f.writelines(list)
    print("Written to file successfully!")

if __name__ == "__main__":
    start_time = time.time()  # start time
    url = "http://music.163.com/weapi/v1/resource/comments/R_SO_4_453185824?csrf_token="
    filename = u"On_My_Way.txt"
    all_comments_list = get_all_comments(url)
    save_to_file(all_comments_list, filename)
    end_time = time.time()  # end time
    print("Elapsed: %f seconds." % (end_time - start_time))
The AES part needs the pycrypto library. Installing it may fail on Windows; the error message (click "more") points at the C build tools you have to install first. Copy the URL it gives, then download and install them (it appears to be VCForPython27.msi, the Microsoft Visual C++ Compiler for Python 2.7).
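To make the paging scheme concrete, here is a minimal usage sketch (my addition, not from the original post; it assumes the functions above are in scope and the sample proxies are reachable):

# page 3 means offset = (3 - 1) * 20 = 40 and total:"false"
url = "http://music.163.com/weapi/v1/resource/comments/R_SO_4_453185824?csrf_token="
params = get_params(3)        # the double-AES-encrypted request JSON
encSecKey = get_encSecKey()   # reusable because second_key is fixed to 16 * 'F'
print(get_json(url, params, encSecKey))  # raw JSON with 'comments' and 'total'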
******************************♣******************************
Crawling WeChat friend info
The code is as follows:
#!/usr/bin/env python
# encoding=utf-8
from __future__ import print_function

import os
import requests
import re
import time
import xml.dom.minidom
import json
import sys
import math
import subprocess
import ssl
import threading
import urllib, urllib2

DEBUG = False

# the three constants below are left over from a fuller version of the script and are unused here
MAX_GROUP_NUM = 2  # people per group
INTERFACE_CALLING_INTERVAL = 5  # interval between API calls; too short triggers "operation too frequent" and a ban of roughly half an hour
MAX_PROGRESS_LEN = 50

QRImagePath = os.path.join(os.getcwd(), 'qrcode.jpg')

tip = 0
uuid = ''

base_uri = ''
redirect_uri = ''
push_uri = ''

skey = ''
wxsid = ''
wxuin = ''
pass_ticket = ''
deviceId = 'e000000000000000'

BaseRequest = {}

ContactList = []
My = []
SyncKey = []

try:
    xrange
    range = xrange
except:
    # python 3
    pass


def responseState(func, BaseResponse):
    ErrMsg = BaseResponse['ErrMsg']
    Ret = BaseResponse['Ret']
    if DEBUG or Ret != 0:
        print('func: %s, Ret: %d, ErrMsg: %s' % (func, Ret, ErrMsg))

    if Ret != 0:
        return False

    return True


def getUUID():
    global uuid

    url = 'https://login.weixin.qq.com/jslogin'
    params = {
        'appid': 'wx782c26e4c19acffb',
        'fun': 'new',
        'lang': 'zh_CN',
        '_': int(time.time()),
    }

    r = myRequests.get(url=url, params=params)
    r.encoding = 'utf-8'
    data = r.text

    # print(data)

    # window.QRLogin.code = 200; window.QRLogin.uuid = "oZwt_bFfRg==";
    regx = r'window.QRLogin.code = (\d+); window.QRLogin.uuid = "(\S+?)"'
    pm = re.search(regx, data)

    code = pm.group(1)
    uuid = pm.group(2)

    if code == '200':
        return True

    return False


def showQRImage():
    global tip

    url = 'https://login.weixin.qq.com/qrcode/' + uuid
    params = {
        't': 'webwx',
        '_': int(time.time()),
    }

    r = myRequests.get(url=url, params=params)
    tip = 1

    f = open(QRImagePath, 'wb+')
    f.write(r.content)
    f.close()
    time.sleep(1)

    if sys.platform.find('darwin') >= 0:
        subprocess.call(['open', QRImagePath])  # macOS
    else:
        os.startfile(QRImagePath)  # Windows

    print('Please scan the QR code with WeChat to log in')


def waitForLogin():
    global tip, base_uri, redirect_uri, push_uri

    url = 'https://login.weixin.qq.com/cgi-bin/mmwebwx-bin/login?tip=%s&uuid=%s&_=%s' % (
        tip, uuid, int(time.time()))

    r = myRequests.get(url=url)
    r.encoding = 'utf-8'
    data = r.text

    # print(data)

    # window.code=500;
    regx = r'window.code=(\d+);'
    pm = re.search(regx, data)

    code = pm.group(1)

    if code == '201':  # QR code scanned
        print('Scanned successfully; please confirm the login on your phone')
        tip = 0
    elif code == '200':  # logged in
        print('Logging in...')
        regx = r'window.redirect_uri="(\S+?)";'
        pm = re.search(regx, data)
        redirect_uri = pm.group(1) + '&fun=new'
        base_uri = redirect_uri[:redirect_uri.rfind('/')]

        # mapping from base_uri to push_uri (the order matters; yes, it really is this quirky..)
        services = [
            ('wx2.qq.com', 'webpush2.weixin.qq.com'),
            ('qq.com', 'webpush.weixin.qq.com'),
            ('web1.wechat.com', 'webpush1.wechat.com'),
            ('web2.wechat.com', 'webpush2.wechat.com'),
            ('wechat.com', 'webpush.wechat.com'),
            ('web1.wechatapp.com', 'webpush1.wechatapp.com'),
        ]
        push_uri = base_uri
        for (searchUrl, pushUrl) in services:
            if base_uri.find(searchUrl) >= 0:
                push_uri = 'https://%s/cgi-bin/mmwebwx-bin' % pushUrl
                break

        # close the QR image
        if sys.platform.find('darwin') >= 0:  # for OSX with Preview
            os.system("osascript -e 'quit app \"Preview\"'")
    elif code == '408':  # timeout
        pass
    # elif code == '400' or code == '500':

    return code


def login():
    global skey, wxsid, wxuin, pass_ticket, BaseRequest

    r = myRequests.get(url=redirect_uri)
    r.encoding = 'utf-8'
    data = r.text

    # print(data)

    doc = xml.dom.minidom.parseString(data)
    root = doc.documentElement

    for node in root.childNodes:
        if node.nodeName == 'skey':
            skey = node.childNodes[0].data
        elif node.nodeName == 'wxsid':
            wxsid = node.childNodes[0].data
        elif node.nodeName == 'wxuin':
            wxuin = node.childNodes[0].data
        elif node.nodeName == 'pass_ticket':
            pass_ticket = node.childNodes[0].data

    # print('skey: %s, wxsid: %s, wxuin: %s, pass_ticket: %s' % (skey, wxsid,
    # wxuin, pass_ticket))

    if not all((skey, wxsid, wxuin, pass_ticket)):
        return False

    BaseRequest = {
        'Uin': int(wxuin),
        'Sid': wxsid,
        'Skey': skey,
        'DeviceID': deviceId,
    }

    return True


def webwxinit():
    url = (base_uri +
           '/webwxinit?pass_ticket=%s&skey=%s&r=%s' % (
               pass_ticket, skey, int(time.time())))
    params = {'BaseRequest': BaseRequest}
    headers = {'content-type': 'application/json; charset=UTF-8'}

    r = myRequests.post(url=url, data=json.dumps(params), headers=headers)
    r.encoding = 'utf-8'
    data = r.json()

    if DEBUG:
        f = open(os.path.join(os.getcwd(), 'webwxinit.json'), 'wb')
        f.write(r.content)
        f.close()

    # print(data)

    global ContactList, My, SyncKey
    dic = data
    ContactList = dic['ContactList']
    My = dic['User']
    SyncKey = dic['SyncKey']

    state = responseState('webwxinit', dic['BaseResponse'])
    return state


def webwxgetcontact():
    url = (base_uri +
           '/webwxgetcontact?pass_ticket=%s&skey=%s&r=%s' % (
               pass_ticket, skey, int(time.time())))
    headers = {'content-type': 'application/json; charset=UTF-8'}

    r = myRequests.post(url=url, headers=headers)
    r.encoding = 'utf-8'
    data = r.json()

    if DEBUG:
        f = open(os.path.join(os.getcwd(), 'webwxgetcontact.json'), 'wb')
        f.write(r.content)
        f.close()

    dic = data
    MemberList = dic['MemberList']

    # iterate in reverse; removing items while walking forward breaks the traversal..
    SpecialUsers = ["newsapp", "fmessage", "filehelper", "weibo", "qqmail", "tmessage", "qmessage", "qqsync",
                    "floatbottle", "lbsapp", "shakeapp", "medianote", "qqfriend", "readerapp", "blogapp", "facebookapp",
                    "masssendapp",
                    "meishiapp", "feedsapp", "voip", "blogappweixin", "weixin", "brandsessionholder", "weixinreminder",
                    "wxid_novlwrv3lqwv11", "gh_22b87fa7cb3c", "officialaccounts", "notification_messages", "wxitil",
                    "userexperience_alarm"]
    for i in range(len(MemberList) - 1, -1, -1):
        Member = MemberList[i]
        if Member['VerifyFlag'] & 8 != 0:  # official/service accounts
            MemberList.remove(Member)
        elif Member['UserName'] in SpecialUsers:  # special accounts
            MemberList.remove(Member)
        elif Member['UserName'].find('@@') != -1:  # group chats
            MemberList.remove(Member)
        elif Member['UserName'] == My['UserName']:  # yourself
            MemberList.remove(Member)

    return MemberList


def syncKey():
    SyncKeyItems = ['%s_%s' % (item['Key'], item['Val'])
                    for item in SyncKey['List']]
    SyncKeyStr = '|'.join(SyncKeyItems)
    return SyncKeyStr


def syncCheck():
    url = push_uri + '/synccheck?'
    params = {
        'skey': BaseRequest['Skey'],
        'sid': BaseRequest['Sid'],
        'uin': BaseRequest['Uin'],
        'deviceId': BaseRequest['DeviceID'],
        'synckey': syncKey(),
        'r': int(time.time()),
    }

    r = myRequests.get(url=url, params=params)
    r.encoding = 'utf-8'
    data = r.text

    # print(data)

    # window.synccheck={retcode:"0",selector:"2"}
    regx = r'window.synccheck={retcode:"(\d+)",selector:"(\d+)"}'
    pm = re.search(regx, data)

    retcode = pm.group(1)
    selector = pm.group(2)

    return selector


def webwxsync():
    global SyncKey

    url = base_uri + '/webwxsync?lang=zh_CN&skey=%s&sid=%s&pass_ticket=%s' % (
        BaseRequest['Skey'], BaseRequest['Sid'], urllib.quote_plus(pass_ticket))
    params = {
        'BaseRequest': BaseRequest,
        'SyncKey': SyncKey,
        'rr': ~int(time.time()),
    }
    headers = {'content-type': 'application/json; charset=UTF-8'}

    r = myRequests.post(url=url, data=json.dumps(params), headers=headers)
    r.encoding = 'utf-8'
    data = r.json()

    # print(data)

    dic = data
    SyncKey = dic['SyncKey']

    state = responseState('webwxsync', dic['BaseResponse'])
    return state


def heartBeatLoop():
    while True:
        selector = syncCheck()
        if selector != '0':
            webwxsync()
        time.sleep(1)


def main():
    global myRequests

    if hasattr(ssl, '_create_unverified_context'):
        ssl._create_default_https_context = ssl._create_unverified_context

    headers = {
        'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.125 Safari/537.36'}
    myRequests = requests.Session()
    myRequests.headers.update(headers)

    if not getUUID():
        print('Failed to get uuid')
        return

    print('Fetching the QR code image...')
    showQRImage()

    while waitForLogin() != '200':
        pass

    os.remove(QRImagePath)

    if not login():
        print('Login failed')
        return

    if not webwxinit():
        print('Initialization failed')
        return

    MemberList = webwxgetcontact()

    # note: this thread is created but never started; call .start() on it
    # if you want the synccheck heartbeat to keep the session alive
    threading.Thread(target=heartBeatLoop)

    MemberCount = len(MemberList)
    print('The contact list has %s friends' % MemberCount)

    d = {}
    imageIndex = 0
    for Member in MemberList:
        imageIndex = imageIndex + 1
        # name = 'C:\\Users\\Public\\Pictures\\' + str(imageIndex) + '.jpg'
        # imageUrl = 'http://wx2.qq.com' + Member['HeadImgUrl']
        # r = myRequests.get(url=imageUrl, headers=headers)
        # imageContent = (r.content)
        # fileImage = open(name, 'wb')
        # fileImage.write(imageContent)
        # fileImage.close()
        # print('Downloading the avatar of friend no. ' + str(imageIndex))
        d[Member['UserName']] = (Member['NickName'], Member['RemarkName'])
        city = Member['City']
        city = 'nocity' if city == '' else city
        name = Member['NickName']
        name = 'noname' if name == '' else name
        sign = Member['Signature']
        sign = 'nosign' if sign == '' else sign
        remark = Member['RemarkName']
        remark = 'noremark' if remark == '' else remark
        alias = Member['Alias']
        alias = 'noalias' if alias == '' else alias
        nick = Member['NickName']
        nick = 'nonick' if nick == '' else nick
        print(name, '|||', city, '|||', Member['Sex'], '|||', Member['StarFriend'], '|||', sign,
              '|||', remark, '|||', alias, '|||', nick)


if __name__ == '__main__':
    main()
    print('Press Enter to exit...')
    raw_input()  # this script is Python 2 (it imports urllib2); raw_input avoids input()'s eval
While running, the program pops up a QR code, which you scan with WeChat to log in.
The original author wrote this for macOS, so I patched it myself (the red-bold and blue-highlighted markup from the original post is lost here).
subprocess.call(['open', QRImagePath]) opens the file on macOS ('open' is a macOS command; Linux desktops typically use xdg-open instead),
while on Windows you need os.startfile(QRImagePath) (don't ask how I know; the script crashed, I guessed what the call did, and then looked it up).
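Putting the two together, a small cross-platform opener looks something like this (a sketch of my own, not from the original post; it assumes Linux desktops ship xdg-open):

import os
import subprocess
import sys

def open_file(path):
    """Open a file with the platform's default viewer."""
    if sys.platform == 'darwin':            # macOS
        subprocess.call(['open', path])
    elif sys.platform.startswith('win'):    # Windows
        os.startfile(path)
    else:                                   # most Linux desktops ship xdg-open
        subprocess.call(['xdg-open', path])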
(Thanks to my friends for their company, and for their support in hard times.)
Avatars would be saved under C:\\Users\\Public\\Pictures\\; the download block above is commented out, so uncomment it to enable that.
The output, collected into a CSV, looks like this (screenshot omitted).
After a bit of analysis (charted with EasyChart's color schemes; charts omitted):
Amazing, still mostly women? Or is it just that the people (or the scammers) adding me on WeChat lately skew that way?
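That chart boils down to a tally like the following (a sketch, assuming MemberList as returned by webwxgetcontact() above; in the web API, Sex is 1 for male, 2 for female, and 0 if unset):

from collections import Counter

counts = Counter(m['Sex'] for m in MemberList)  # Sex: 1 = male, 2 = female, 0 = not set
print('male: %d, female: %d, unknown: %d' % (counts[1], counts[2], counts[0]))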

References:
Python對微信好友進行簡單統計分析
當Python遇上微信,可以這麼玩

******************************♣******************************
Crawling Taobao product info
The code is as follows:
import re
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
from taobao.config import *
import pymongo

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
wait = WebDriverWait(browser, 10)
# set the window size, so the default (rather small) window does not break interactions
browser.set_window_size(1400, 900)

def search():
    print('Searching')
    try:
        browser.get('https://www.taobao.com')
        # http://selenium-python.readthedocs.io/waits.html#explicit-waits
        # the search input box
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
        )
        # the search submit button
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))

        input.send_keys(KEYWORD)
        submit.click()

        # wait for the result counter, displayed as "共 x 頁"
        total = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
        # total holds the total number of result pages
        get_products()
        return total.text
    except TimeoutException:
        # the waits time out on a slow connection, so retry recursively
        return search()


def next_page(page_number):
    # There are several ways to reach the next page: click a page number,
    # click the "next page" button, or type the page number into the input
    # box and confirm. We use the last one, because the first is prone to
    # getting out of order, and the last is also the easiest to retry
    # recursively when something goes wrong.
    print('Turning to page', page_number)
    try:
        # the page-number input box
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input'))
        )
        # the submit button
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
        # clear the previous page number
        input.clear()
        input.send_keys(page_number)
        submit.click()
        # confirm the page turn succeeded (the highlighted current page number must match)
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page_number)))
        get_products()
    except TimeoutException:
        # on failure, recurse and retry
        next_page(page_number)

# parse the result page
def get_products():
    # wait until the product list has loaded
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
    html = browser.page_source
    doc = pq(html)
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            'deal': item.find('.deal-cnt').text()[:-3],  # strip the trailing "人付款"
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text()
        }
        print(product)
        save_to_mongo(product)

# save to MongoDB
def save_to_mongo(result):
    try:
        # note: newer pymongo versions spell this insert_one()
        if db[MONGO_TABLE].insert(result):
            print('Saved to MongoDB', result)
    except Exception:
        print('Failed to save to MongoDB', result)


def main():
    try:
        total = search()
        # extract the total page count
        total = int(re.compile('(\d+)').search(total).group(1))
        for i in range(2, total + 1):
            # page turning is only needed from page 2 onwards
            next_page(i)
    except Exception:
        print('Something went wrong')
    finally:
        browser.close()

if __name__ == '__main__':
    main()
Note that this one runs under Python 3.
The config file (adjust the `from taobao.config import *` path above to match your own project layout):
MONGO_URL = 'localhost'
MONGO_DB = 'taobao'
MONGO_TABLE = 'product'

# http://phantomjs.org/api/command-line.html
# the first argument disables image loading, which makes the program run faster
# the second enables the disk cache
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']

KEYWORD = '漢服'
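As a quick illustration of how main() turns the pager text into a page count (a minimal sketch; the sample string below is my assumption of the typical "共 x 頁" label):

import re

total_text = u'共 100 頁,'  # hypothetical pager text from the result page
total = int(re.compile('(\d+)').search(total_text).group(1))
print(total)  # 100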

******************************♣******************************
Crawling Toutiao
The code is as follows:

import json
import os
from urllib.parse import urlencode
import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError
import re
from multiprocessing import Pool
from hashlib import md5
from json.decoder import JSONDecodeError
from spider_basic.config import *

# connect=False defers the connection, which avoids trouble when the client is used across processes
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]


# fetch the index page; offset is the AJAX paging offset, keyword is the search term
def get_page_index(offset, keyword):
    data = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,  # tab 3 is the gallery (圖集) tab
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    params = urlencode(data)
    base = 'http://www.toutiao.com/search_content/'
    url = base + '?' + params
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None

# download an image
def download_image(url):
    print('Downloading', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            save_image(response.content)
        return None
    except ConnectionError:
        return None

# save an image to disk
def save_image(content):
    # file name: path + name + extension
    # the name is the md5 of the content, so re-running the program after a
    # crash does not save duplicate images
    # (note: the pic/ directory must already exist under the working directory)
    file_path = '{0}/{1}/{2}.{3}'.format(os.getcwd(), 'pic', md5(content).hexdigest(), 'jpg')
    print(file_path)
    # only save if the file does not exist yet
    # content is the binary body of the response
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)

# parse the search result pages (the gallery XHR responses)
def parse_page_index(text):
    try:
        # parse as JSON
        data = json.loads(text)
        # if the JSON contains the key 'data'
        if data and 'data' in data.keys():
            for item in data.get('data'):
                yield item.get('article_url')
    except JSONDecodeError:
        pass

# fetch a detail page
def get_page_detail(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None

# parse a detail page
def parse_page_detail(html, url):
    soup = BeautifulSoup(html, 'lxml')
    # extract the title
    result = soup.select('title')
    title = result[0].get_text() if result else ''
    # extract the image URLs (define the regex and search with it)
    images_pattern = re.compile('var gallery = (.*?);', re.S)
    result = re.search(images_pattern, html)
    # if the gallery variable was found
    if result:
        # parse it as JSON
        data = json.loads(result.group(1))
        # if the key sub_images holds the image URLs
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            # download the images
            for image in images: download_image(image)
            return {
                'title': title,
                'url': url,
                'images': images
            }

# save to the database
def save_to_mongo(result):
    # note: newer pymongo versions spell this insert_one()
    if db[MONGO_TABLE].insert(result):
        print('Successfully Saved to Mongo', result)
        return True
    return False


def main(offset):
    text = get_page_index(offset, KEYWORD)
    # parse_page_index is a generator yielding the URL of each gallery
    urls = parse_page_index(text)
    # visit every gallery and pull out its details
    for url in urls:
        # fetch the detail page
        html = get_page_detail(url)
        if html is None:  # skip pages that failed to download
            continue
        # parse the detail page
        result = parse_page_detail(html, url)
        # save to MongoDB
        if result: save_to_mongo(result)


# if __name__ == '__main__':
#     main(60)

# note: if the pool is not created inside the `if __name__ == '__main__':`
# guard, multiprocessing throws a pile of errors
if __name__ == '__main__':
    pool = Pool()
    groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
    pool.map(main, groups)
    pool.close()
    pool.join()
The config file:
MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'

GROUP_START = 1
GROUP_END = 20
KEYWORD = '萌寵'
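A quick check of the md5-based de-duplication described in save_image() (a minimal sketch; the byte string is a stand-in for real image content):

from hashlib import md5

content = b'fake image bytes'  # stand-in for a downloaded image body
# identical bytes always hash to the same name, so a re-run that fetches
# the same image again will hit the os.path.exists() check and skip it
print(md5(content).hexdigest() + '.jpg')
assert md5(content).hexdigest() == md5(b'fake image bytes').hexdigest()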
