爬取伯樂在線文章（三）爬取全部頁面的文章

時間 2019-11-05

標籤伯樂在線文章全部頁面简体版

原文原文鏈接

爬取全部頁面

之前只是爬取某一篇文章的內容，那麼如何爬取全部文章呢？

修改 start_urls = ['http://blog.jobbole.com/all-posts/']

重新啓動scrapy的shell

parse函數需要做兩件事：

1. 獲取文章列表頁中的文章URL並交給scrapy下載後並解析
2. 獲取下一頁的URL並交給scrapy進行下載，下載完成後交給parse

獲取列表頁中的全部文章URL

# Collect the href attribute of every anchor matched under
# "#archive .floated-thumb .post-thumb" (the article links on the listing page).
post_urls = response.css('#archive .floated-thumb .post-thumb a::attr(href)').extract()

獲取全部URL之後，要將其交給scrapy進行下載並解析。如何交給scrapy進行下載，並在下載完成之後調用我們自己定義的解析函數？這就需要用到scrapy的另一個類Request，它位於scrapy.http裏面。

    def parse(self, response):
        """Handle one article-listing page.

        Intended responsibilities:
        1. Collect the article URLs on the listing page and hand them to
           Scrapy to be downloaded and parsed.
        2. Find the next-page URL and hand it to Scrapy, with ``parse`` as
           the callback once it has been downloaded.

        This version only addresses part of step 1.
        """
        # Every article link found on the listing page.
        article_links = response.css('#archive .floated-thumb .post-thumb a::attr(href)').extract()
        for link in article_links:
            # NOTE: the Request is only constructed here; it is neither yielded
            # nor returned, so this version hands nothing to Scrapy.
            Request(url=link, callback=self.parse_detail)
            print(link)

    def parse_detail(self, response):
        """Extract the individual fields of an article page.

        The XPath expressions are tied to the element ids of post 110287.
        The extracted values are neither returned nor stored; an IndexError
        is raised if any expression matches nothing.
        """
        def first_text(xpath_expr):
            # First matched text node for the given XPath expression.
            return response.xpath(xpath_expr).extract()[0]

        title = first_text('//*[@id="post-110287"]/div[1]/h1/text()')
        # Trim surrounding whitespace, then drop the "·" separator.
        date = first_text('//*[@id="post-110287"]/div[2]/p/text()').strip().replace("·", "")
        praise_num = first_text('//*[@id="110287votetotal"]/text()')
        # The count is taken as the second space-separated token of the text.
        collect_num = first_text('//*[@id="post-110287"]/div[3]/div[9]/span[2]/text()').split(" ")[1]
        comment_num = first_text('//*[@id="post-110287"]/div[3]/div[9]/a/span/text()').split(" ")[1]

可能有些網站獲取的URL裏面只有/114466/，這時就需要將當前的URL和獲取的URL進行拼接，從而形成完整的URL，這就需要用到urllib中的parse模塊（parse.urljoin）。將Request交給scrapy進行下載則使用yield關鍵字。

    def parse(self, response):
        """Handle one article-listing page.

        Intended responsibilities:
        1. Collect the article URLs on the listing page and hand them to
           Scrapy to be downloaded and parsed.
        2. Find the next-page URL and hand it to Scrapy, with ``parse`` as
           the callback once it has been downloaded.

        This version implements step 1 only.
        """
        article_links = response.css('#archive .floated-thumb .post-thumb a::attr(href)').extract()
        for link in article_links:
            # Resolve the href against the current page URL so relative links
            # become absolute before the request is issued.
            absolute_url = parse.urljoin(response.url, link)
            yield Request(url=absolute_url, callback=self.parse_detail)

還需要提取下一頁並交給scrapy進行下載

    def parse(self, response):
        """Handle one article-listing page.

        1. Collect the article URLs on the listing page and yield a request
           for each, to be parsed by ``parse_detail``.
        2. Find the next-page URL and yield a request for it, with ``parse``
           as the callback once it has been downloaded.
        """
        # Step 1: one request per article link on this listing page.
        article_links = response.css('#archive .floated-thumb .post-thumb a::attr(href)').extract()
        for link in article_links:
            absolute_url = parse.urljoin(response.url, link)
            yield Request(url=absolute_url, callback=self.parse_detail)

        # Step 2: follow the "next page" link when there is one.
        next_page = response.css('.next.page-numbers::attr(href)').extract_first()
        if not next_page:
            return
        yield Request(url=parse.urljoin(response.url, next_page), callback=self.parse)

全部代碼如下

# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy.http import Request
from urllib import parse


class JobboleSpider(scrapy.Spider):
    """Spider that walks the Jobbole "all posts" listing and visits each article.

    ``parse`` handles listing pages and ``parse_detail`` handles article pages.
    """

    name = 'jobbole'
    # Domains the spider is allowed to crawl.
    allowed_domains = ['blog.jobbole.com']
    # Entry point of the crawl: the article listing.
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        """Handle one article-listing page.

        1. Collect the article URLs on the listing page and yield a request
           for each, to be parsed by ``parse_detail``.
        2. Find the next-page URL and yield a request for it, with ``parse``
           as the callback once it has been downloaded.
        """
        # Step 1: one request per article link on this listing page.
        post_urls = response.css('#archive .floated-thumb .post-thumb a::attr(href)').extract()
        for post_url in post_urls:
            # urljoin resolves relative hrefs against the current page URL.
            yield Request(url=parse.urljoin(response.url, post_url), callback=self.parse_detail)

        # Step 2: follow the "next page" link when there is one.
        next_url = response.css('.next.page-numbers::attr(href)').extract_first()
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    @staticmethod
    def _first_int(text):
        """Return the first run of digits in *text* as an int, or 0 if none."""
        match = re.search(r'\d+', text)
        return int(match.group()) if match else 0

    def parse_detail(self, response):
        """Extract the fields of one article page and print a summary line.

        Missing elements no longer raise IndexError: text fields fall back to
        an empty string and counters fall back to 0.
        """
        print("目前爬取的URL是："+response.url)

        # Article title.
        title = response.css('.entry-header h1::text').extract_first("")
        # Publication date: drop the "·" separator, then trim whitespace so no
        # stray space is left where the separator used to be.
        date = (
            response.css('.entry-meta .entry-meta-hide-on-mobile::text')
            .extract_first("")
            .replace("·", "")
            .strip()
        )
        # Number of up-votes.
        praise_num = self._first_int(
            response.css('.vote-post-up h10::text').extract_first("")
        )
        # Number of bookmarks; the text may carry no number, which counts as 0.
        collect_num = self._first_int(
            response.css('.post-adds .bookmark-btn::text').extract_first("")
        )
        # Number of comments; same handling as the bookmark count.
        comment_num = self._first_int(
            response.css('.post-adds .hide-on-480::text').extract_first("")
        )

        # Markup of the article body; extracted but not used further here.
        content = response.css('div.entry').extract_first("")

        print(
            "{}\t發佈時間：{}\t{}點贊\t{}收藏\t{}評論".format(
                title, date, praise_num, collect_num, comment_num
            )
        )

View Code

爬取圖片

爬取圖片時，封面圖的URL通過Request的meta參數傳遞給response

#業務邏輯
    def parse(self, response):
        """Handle one article-listing page.

        1. For every article thumbnail link on the page, yield a request for
           the article, parsed by ``parse_detail``. The thumbnail image URL
           travels with the request in ``meta["front_image_url"]``.
        2. Find the next-page URL and yield a request for it, with ``parse``
           as the callback once it has been downloaded.
        """
        # Step 1: select the anchor nodes so both href and the nested <img>
        # src can be read from each one.
        post_nodes = response.css('#archive .floated-thumb .post-thumb a')

        for post_node in post_nodes:
            image_url = post_node.css("img::attr(src)").extract_first("")
            post_url = post_node.css("::attr(href)").extract_first("")
            if not post_url:
                # With an empty href, urljoin would return the listing URL
                # itself and the listing page would be sent to parse_detail.
                continue
            yield Request(
                url=parse.urljoin(response.url, post_url),
                meta={"front_image_url": image_url},
                callback=self.parse_detail,
            )

        # Step 2: follow the "next page" link when there is one.
        next_url = response.css('.next.page-numbers::attr(href)').extract_first()
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

通過response獲取圖片

# Read the "front_image_url" entry from the response's meta dict,
# falling back to an empty string when the key is absent.
front_image = response.meta.get("front_image_url", "")

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。