1. Create the project
scrapy startproject weibo                       # create the project
scrapy genspider -t basic weibo.com weibo.com   # create the spider
Directory structure
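The original post showed the layout as an image. With the commands above, a freshly generated project typically looks like this (the spider file name is assumed from the genspider call; newer Scrapy versions also generate a middlewares.py):

weibo/
    scrapy.cfg
    weibo/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            weibo_com.py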
Define the Items
Edit items.py
import scrapy


class WeiboItem(scrapy.Item):
    # define the fields for your item here like:
    image_urls = scrapy.Field()
    dirname = scrapy.Field()
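For context, the spider shown later fills these two fields with the decoded image URLs of one post and the MD5 of the raw, still URL-encoded URL group. A minimal sketch, with a made-up URL for illustration:

# Minimal sketch of how WeiboItem is populated (the URL is made up).
import hashlib
from weibo.items import WeiboItem

raw = '%2F%2Fwx1.sinaimg.cn%2Fmw690%2Fexample.jpg'      # encoded group as found in the page source
item = WeiboItem(image_urls=['//wx1.sinaimg.cn/mw690/example.jpg'],
                 dirname=hashlib.md5(raw).hexdigest())  # one subdirectory per image group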
Edit pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import hashlib
# from scrapy.contrib.pipeline.images import ImagesPipeline
# from scrapy.http import Request
from scrapy.utils.python import to_bytes
import urllib
import os
import redis

# Scrapy's built-in image downloader handles animated GIFs poorly (cause not tracked down),
# so a custom downloader is defined below to keep GIFs working.
# class WeboPipeline(ImagesPipeline):
#     # def process_item(self, item, spider):
#     #     return item
#     def get_media_requests(self, item, info):
#         for image_url in item['image_urls']:
#             request = Request('http:' + image_url)
#             request.meta['item'] = {'dirname': item['dirname']}
#             yield request
#
#     # Define the storage directory and file extension
#     def file_path(self, request, response=None, info=None):
#         url = request.url
#         item = request.meta['item']
#         image_guid = hashlib.sha1(to_bytes(url)).hexdigest()
#         ext = url.split('.')[-1]
#         url = 'full/%s/%s.%s' % (item['dirname'], image_guid, ext)
#         return url
#
#     def item_completed(self, results, item, info):
#         return item


# The native Scrapy file download feature is not used because GIF images
# came down incomplete and would not animate.
class WeboPipeline(object):
    # Deduplicate with Redis; a bit crude, but simple and effective
    def open_spider(self, spider):
        self.client = redis.Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        file_path = item['dirname']
        # Redis duplicate check
        yn = self.client.get(file_path)
        if yn is not None:
            print 'already downloaded'
            return item
        for image_url in item['image_urls']:
            imageurl = 'http:' + image_url
            savepath = self.get_file_path(file_path, imageurl)
            print imageurl, savepath
            try:
                # Download the image to the given path
                urllib.urlretrieve(imageurl, savepath)
            except Exception as e:
                print str(e)
        # Use the hash of each image-URL group from the post as the unique key
        self.client.set(file_path, 1)
        return item

    def get_file_path(self, dirname, url):
        '''Get the new file name for a downloaded image'''
        image_guid = hashlib.sha1(to_bytes(url)).hexdigest()
        ext = url.split('.')[-1]
        # The storage directory is hard-coded; it could be moved into settings
        file_dir = './full/%s' % (dirname)
        if os.path.exists(file_dir) == False:
            os.makedirs(file_dir)
        return '%s/%s.%s' % (file_dir, image_guid, ext)
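The only state the pipeline keeps is one Redis key per dirname. A minimal sketch of that dedup check, assuming a local Redis on the default port and a made-up key:

# Minimal sketch of the Redis dedup used by WeboPipeline above
# (assumes a local Redis on 127.0.0.1:6379; the key is made up).
import redis

client = redis.Redis(host='127.0.0.1', port=6379)
key = 'example-dirname-md5'          # in the pipeline this is item['dirname']
if client.get(key) is None:
    # ... download the images of this post ...
    client.set(key, 1)               # mark this post's image group as downloaded
else:
    print 'already downloaded'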
Write the spider
spiders/weibo_com.py
# -*- coding: utf-8 -*-
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import os
import time
from collections import defaultdict
from scrapy.selector import Selector
import re
import urllib
from weibo.items import WeiboItem
import hashlib


class WebComSpider(scrapy.Spider):
    name = 'weibo.com'
    allowed_domains = ['weibo.com']
    start_urls = ['https://weibo.com/']
    # Where the cookies are stored
    cookie_file_path = './cookies.json'
    # IDs of the bloggers to crawl
    uids = ['5139152205']

    def saveCookie(self, cookie):
        '''Save the cookie to a file'''
        with open(self.cookie_file_path, 'w') as outputfile:
            json.dump(cookie, outputfile)

    def getCookie(self):
        '''Load the cookie from the file'''
        if os.path.exists(self.cookie_file_path) == False:
            self.cookie = None
            return
        with open(self.cookie_file_path, 'r') as inputfile:
            self.cookie = json.load(inputfile)

    def start_requests(self):
        '''Crawl Weibo'''
        self.getCookie()
        # If there is no cookie yet, simulate a login to obtain one
        if self.cookie is None:
            # Use PhantomJS to simulate a browser
            driver = webdriver.PhantomJS(executable_path='/data/software/phantomjs-2.1.1-linux-x86_64/bin/phantomjs')
            driver.get('https://weibo.com')
            try:
                # Set the window size. This is important: without it the elements below cannot be located
                driver.set_window_size(1920, 1080)
                # Wait until the loginname element is present before continuing
                userElen = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.ID, 'loginname'))
                )
                # Wait a while as an anti-scraping countermeasure
                time.sleep(3)
                print 'sleep 3'
                # Fill in the login user name
                userElen.send_keys('login username')
                print 'sleep 5'
                time.sleep(5)
                pasElen = driver.find_element_by_xpath('//*[@id="pl_login_form"]/div/div[3]/div[2]/div/input')
                # Fill in the login password
                pasElen.send_keys('login password')
                print 'sleep 1'
                time.sleep(1)
                sumbButton = driver.find_element_by_xpath('//*[@id="pl_login_form"]/div/div[3]/div[6]/a')
                # Log in
                sumbButton.click()
                # Once the page title contains 個人首頁 ("My Homepage"), the login succeeded; continue
                element = WebDriverWait(driver, 10).until(
                    EC.title_contains(u'個人首頁')
                )
            except Exception as e:
                print '22222222222222222', str(e)
            # Collect the cookies
            ck = driver.get_cookies()
            self.cookie = defaultdict()
            for item in ck:
                self.cookie[item['name']] = item['value']
            # Save the cookie
            self.saveCookie(self.cookie)
        # Use the obtained cookie to crawl the image posts of the bloggers you want
        for uid in self.uids:
            # The blogger's list of posts with images; adjust the URL to your own needs
            url = 'https://weibo.com/u/%s?profile_ftype=1&is_pic=1#_0' % (uid,)
            request = scrapy.Request(url=url, cookies=self.cookie, callback=self.parse)
            request.meta['item'] = {'uid': uid, 'page_num': 1}
            yield request

    def parse(self, response):
        '''
        Parse the page. Weibo paginates with both full page jumps and Ajax; after each
        page jump there are two Ajax requests that fetch more data. The page content is
        generated entirely by JS from escaped HTML strings, so XPath and CSS selectors
        cannot reach it; regular expressions are the only way to find the wanted content.
        '''
        title = response.xpath('//title/text()').extract()[0]
        print title
        seletor = Selector(text=response.body)
        # Extract the Ajax request parameters
        pageId = seletor.re(r"\$CONFIG\[\'page_id\'\]=\'(\d+)\'")[0]
        domain = seletor.re(r"\$CONFIG\[\'domain\'\]=\'(\d+)\'")[0]
        # Parse the HTML content delivered with the page jump
        for itemObj in self.parse_content(seletor):
            yield itemObj
        # Issue the Ajax data requests
        item = response.meta['item']
        ajaxUrl = 'https://weibo.com/p/aj/v6/mblog/mbloglist?ajwvr=6&domain=%s&profile_ftype=1&is_pic=1&pagebar=%s&pl_name=Pl_Official_MyProfileFeed__21&id=%s&script_uri=/u/%s&feed_type=0&page=%s&pre_page=1&domain_op=%s&__rnd=%s'
        for num in range(0, 2):
            rand = str(int(time.time() * 1000))
            url = ajaxUrl % (domain, num, pageId, item['uid'], item['page_num'], domain, rand)
            print url
            print '------------sleep 10------------'
            time.sleep(10)
            yield scrapy.Request(url=url, cookies=self.cookie, callback=self.parse_ajax)
        item['page_num'] += 1
        nexpage = 'https://weibo.com/u/%s?is_search=0&visible=0&is_pic=1&is_tag=0&profile_ftype=1&page=%s#feedtop' % (item['uid'], item['page_num'])
        request = scrapy.Request(url=nexpage, cookies=self.cookie, callback=self.parse)
        request.meta['item'] = item
        yield request

    def parse_ajax(self, response):
        '''Parse the Ajax content'''
        bodyObj = json.loads(response.body)
        seletor = Selector(text=bodyObj['data'])
        for itemObj in self.parse_content(seletor):
            yield itemObj

    def parse_content(self, seletor):
        '''Extract the image URLs'''
        pre = re.compile(r'clear_picSrc=(.*?)[\&|\\"]')
        imagelist = seletor.re(pre)
        for row in imagelist:
            hs = hashlib.md5()
            hs.update(row)
            row = urllib.unquote(row)
            # Use each group of image URLs as the unique key and as the subdirectory name
            imgset = row.split(',')
            yield WeiboItem(image_urls=imgset, dirname=hs.hexdigest())
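To see what the clear_picSrc regex in parse_content() actually extracts, here is a minimal sketch run against a made-up escaped fragment (the real markup is embedded in Weibo's JS payload and looks similar):

# Minimal sketch of the clear_picSrc extraction; the sample markup is made up.
import re
import urllib

sample = r'action-data=\"isPrivate=0&clear_picSrc=%2F%2Fwx1.sinaimg.cn%2Fmw690%2Fabc.jpg%2C%2F%2Fwx2.sinaimg.cn%2Fmw690%2Fdef.gif&feed_type=0\"'
pre = re.compile(r'clear_picSrc=(.*?)[\&|\\"]')
for row in pre.findall(sample):
    print urllib.unquote(row).split(',')
    # -> ['//wx1.sinaimg.cn/mw690/abc.jpg', '//wx2.sinaimg.cn/mw690/def.gif']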
Modify settings.py
ROBOTSTXT_OBEY = False  # do not obey robots.txt

# Register the pipeline (module 'weibo', class 'WeboPipeline' as defined in pipelines.py)
ITEM_PIPELINES = {
    'weibo.pipelines.WeboPipeline': 300,
}
Run the spider
scrapy crawl weibo.com