1. Create the project
scrapy startproject weibo                       # create the project
scrapy genspider -t basic weibo.com weibo.com   # create the spider
Directory structure
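The original post showed the layout as an image. With the commands above, a freshly generated project typically looks like this (the spider file name is assumed from the genspider call; newer Scrapy versions also generate a middlewares.py):

weibo/
    scrapy.cfg
    weibo/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            weibo_com.py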
Define the Items
Edit items.py
import scrapy


class WeiboItem(scrapy.Item):
    # define the fields for your item here like:
    image_urls = scrapy.Field()
    dirname = scrapy.Field()
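For context, the spider shown later fills these two fields with the decoded image URLs of one post and the MD5 of the raw, still URL-encoded URL group. A minimal sketch, with a made-up URL for illustration:

# Minimal sketch of how WeiboItem is populated (the URL is made up).
import hashlib
from weibo.items import WeiboItem

raw = '%2F%2Fwx1.sinaimg.cn%2Fmw690%2Fexample.jpg'      # encoded group as found in the page source
item = WeiboItem(image_urls=['//wx1.sinaimg.cn/mw690/example.jpg'],
                 dirname=hashlib.md5(raw).hexdigest())  # one subdirectory per image group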
Edit pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import hashlib
# from scrapy.contrib.pipeline.images import ImagesPipeline
# from scrapy.http import Request
from scrapy.utils.python import to_bytes
import urllib
import os
import redis

# Scrapy's built-in image downloader handles animated GIFs poorly (cause not tracked down),
# so a custom downloader is defined below to keep GIFs working.
# class WeboPipeline(ImagesPipeline):
#     # def process_item(self, item, spider):
#     #     return item
#     def get_media_requests(self, item, info):
#         for image_url in item['image_urls']:
#             request = Request('http:' + image_url)
#             request.meta['item'] = {'dirname': item['dirname']}
#             yield request
#
#     # Define the storage directory and file extension
#     def file_path(self, request, response=None, info=None):
#         url = request.url
#         item = request.meta['item']
#         image_guid = hashlib.sha1(to_bytes(url)).hexdigest()
#         ext = url.split('.')[-1]
#         url = 'full/%s/%s.%s' % (item['dirname'], image_guid, ext)
#         return url
#
#     def item_completed(self, results, item, info):
#         return item


# The native Scrapy file download feature is not used because GIF images
# came down incomplete and would not animate.
class WeboPipeline(object):
    # Deduplicate with Redis; a bit crude, but simple and effective
    def open_spider(self, spider):
        self.client = redis.Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        file_path = item['dirname']
        # Redis duplicate check
        yn = self.client.get(file_path)
        if yn is not None:
            print 'already downloaded'
            return item
        for image_url in item['image_urls']:
            imageurl = 'http:' + image_url
            savepath = self.get_file_path(file_path, imageurl)
            print imageurl, savepath
            try:
                # Download the image to the given path
                urllib.urlretrieve(imageurl, savepath)
            except Exception as e:
                print str(e)
        # Use the hash of each image-URL group from the post as the unique key
        self.client.set(file_path, 1)
        return item

    def get_file_path(self, dirname, url):
        '''Get the new file name for a downloaded image'''
        image_guid = hashlib.sha1(to_bytes(url)).hexdigest()
        ext = url.split('.')[-1]
        # The storage directory is hard-coded; it could be moved into settings
        file_dir = './full/%s' % (dirname)
        if os.path.exists(file_dir) == False:
            os.makedirs(file_dir)
        return '%s/%s.%s' % (file_dir, image_guid, ext)
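The only state the pipeline keeps is one Redis key per dirname. A minimal sketch of that dedup check, assuming a local Redis on the default port and a made-up key:

# Minimal sketch of the Redis dedup used by WeboPipeline above
# (assumes a local Redis on 127.0.0.1:6379; the key is made up).
import redis

client = redis.Redis(host='127.0.0.1', port=6379)
key = 'example-dirname-md5'          # in the pipeline this is item['dirname']
if client.get(key) is None:
    # ... download the images of this post ...
    client.set(key, 1)               # mark this post's image group as downloaded
else:
    print 'already downloaded'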
Write the spider
spiders/weibo_com.py
# -*- coding: utf-8 -*-
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import os
import time
from collections import defaultdict
from scrapy.selector import Selector
import re
import urllib
from weibo.items import WeiboItem
import hashlib


class WebComSpider(scrapy.Spider):
    name = 'weibo.com'
    allowed_domains = ['weibo.com']
    start_urls = ['https://weibo.com/']
    # Where the cookies are stored
    cookie_file_path = './cookies.json'
    # IDs of the bloggers to crawl
    uids = ['5139152205']

    def saveCookie(self, cookie):
        '''Save the cookie to a file'''
        with open(self.cookie_file_path, 'w') as outputfile:
            json.dump(cookie, outputfile)

    def getCookie(self):
        '''Load the cookie from the file'''
        if os.path.exists(self.cookie_file_path) == False:
            self.cookie = None
            return
        with open(self.cookie_file_path, 'r') as inputfile:
            self.cookie = json.load(inputfile)

    def start_requests(self):
        '''Crawl Weibo'''
        self.getCookie()
        # If there is no cookie yet, simulate a login to obtain one
        if self.cookie is None:
            # Use PhantomJS to simulate a browser
            driver = webdriver.PhantomJS(executable_path='/data/software/phantomjs-2.1.1-linux-x86_64/bin/phantomjs')
            driver.get('https://weibo.com')
            try:
                # Set the window size. This is important: without it the elements below cannot be located
                driver.set_window_size(1920, 1080)
                # Wait until the loginname element is present before continuing
                userElen = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.ID, 'loginname'))
                )
                # Wait a while as an anti-scraping countermeasure
                time.sleep(3)
                print 'sleep 3'
                # Fill in the login user name
                userElen.send_keys('login username')
                print 'sleep 5'
                time.sleep(5)
                pasElen = driver.find_element_by_xpath('//*[@id="pl_login_form"]/div/div[3]/div[2]/div/input')
                # Fill in the login password
                pasElen.send_keys('login password')
                print 'sleep 1'
                time.sleep(1)
                sumbButton = driver.find_element_by_xpath('//*[@id="pl_login_form"]/div/div[3]/div[6]/a')
                # Log in
                sumbButton.click()
                # Once the page title contains 個人首頁 ("My Homepage"), the login succeeded; continue
                element = WebDriverWait(driver, 10).until(
                    EC.title_contains(u'個人首頁')
                )
            except Exception as e:
                print '22222222222222222', str(e)
            # Collect the cookies
            ck = driver.get_cookies()
            self.cookie = defaultdict()
            for item in ck:
                self.cookie[item['name']] = item['value']
            # Save the cookie
            self.saveCookie(self.cookie)
        # Use the obtained cookie to crawl the image posts of the bloggers you want
        for uid in self.uids:
            # The blogger's list of posts with images; adjust the URL to your own needs
            url = 'https://weibo.com/u/%s?profile_ftype=1&is_pic=1#_0' % (uid,)
            request = scrapy.Request(url=url, cookies=self.cookie, callback=self.parse)
            request.meta['item'] = {'uid': uid, 'page_num': 1}
            yield request

    def parse(self, response):
        '''
        Parse the page. Weibo paginates with both full page jumps and Ajax; after each
        page jump there are two Ajax requests that fetch more data. The page content is
        generated entirely by JS from escaped HTML strings, so XPath and CSS selectors
        cannot reach it; regular expressions are the only way to find the wanted content.
        '''
        title = response.xpath('//title/text()').extract()[0]
        print title
        seletor = Selector(text=response.body)
        # Extract the Ajax request parameters
        pageId = seletor.re(r"\$CONFIG\[\'page_id\'\]=\'(\d+)\'")[0]
        domain = seletor.re(r"\$CONFIG\[\'domain\'\]=\'(\d+)\'")[0]
        # Parse the HTML content delivered with the page jump
        for itemObj in self.parse_content(seletor):
            yield itemObj
        # Issue the Ajax data requests
        item = response.meta['item']
        ajaxUrl = 'https://weibo.com/p/aj/v6/mblog/mbloglist?ajwvr=6&domain=%s&profile_ftype=1&is_pic=1&pagebar=%s&pl_name=Pl_Official_MyProfileFeed__21&id=%s&script_uri=/u/%s&feed_type=0&page=%s&pre_page=1&domain_op=%s&__rnd=%s'
        for num in range(0, 2):
            rand = str(int(time.time() * 1000))
            url = ajaxUrl % (domain, num, pageId, item['uid'], item['page_num'], domain, rand)
            print url
            print '------------sleep 10------------'
            time.sleep(10)
            yield scrapy.Request(url=url, cookies=self.cookie, callback=self.parse_ajax)
        item['page_num'] += 1
        nexpage = 'https://weibo.com/u/%s?is_search=0&visible=0&is_pic=1&is_tag=0&profile_ftype=1&page=%s#feedtop' % (item['uid'], item['page_num'])
        request = scrapy.Request(url=nexpage, cookies=self.cookie, callback=self.parse)
        request.meta['item'] = item
        yield request

    def parse_ajax(self, response):
        '''Parse the Ajax content'''
        bodyObj = json.loads(response.body)
        seletor = Selector(text=bodyObj['data'])
        for itemObj in self.parse_content(seletor):
            yield itemObj

    def parse_content(self, seletor):
        '''Extract the image URLs'''
        pre = re.compile(r'clear_picSrc=(.*?)[\&|\\"]')
        imagelist = seletor.re(pre)
        for row in imagelist:
            hs = hashlib.md5()
            hs.update(row)
            row = urllib.unquote(row)
            # Use each group of image URLs as the unique key and as the subdirectory name
            imgset = row.split(',')
            yield WeiboItem(image_urls=imgset, dirname=hs.hexdigest())
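To see what the clear_picSrc regex in parse_content() actually extracts, here is a minimal sketch run against a made-up escaped fragment (the real markup is embedded in Weibo's JS payload and looks similar):

# Minimal sketch of the clear_picSrc extraction; the sample markup is made up.
import re
import urllib

sample = r'action-data=\"isPrivate=0&clear_picSrc=%2F%2Fwx1.sinaimg.cn%2Fmw690%2Fabc.jpg%2C%2F%2Fwx2.sinaimg.cn%2Fmw690%2Fdef.gif&feed_type=0\"'
pre = re.compile(r'clear_picSrc=(.*?)[\&|\\"]')
for row in pre.findall(sample):
    print urllib.unquote(row).split(',')
    # -> ['//wx1.sinaimg.cn/mw690/abc.jpg', '//wx2.sinaimg.cn/mw690/def.gif']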
Modify settings.py
ROBOTSTXT_OBEY = False  # do not obey robots.txt

# Register the pipeline (module 'weibo', class 'WeboPipeline' as defined in pipelines.py)
ITEM_PIPELINES = {
    'weibo.pipelines.WeboPipeline': 300,
}
Run the spider
scrapy crawl weibo.com