I recently had a task to scrape information from a logistics company's official website. At first glance the site is mostly static pages and looked easy to scrape, unlike news portals or e-commerce sites with complicated structures, strict anti-scraping measures, and lots of AJAX, so I was quietly congratulating myself. But once I dug in, I found these are not ordinary static pages.
For example, on this page I need to get the distribution of logistics parks across China's major cities, along with the detail information for each park.
The page has a map embedded in it, and each city's logistics information only shows up when you click that city on the map.
https://www.glprop.com.cn/our...html
My first thought was that this might be an AJAX request, but capturing the traffic in Chrome turned up nothing, so I looked at the page source
and found that all of the city information sits inside a script block,
as shown in the figure:
and the information for each park is stored in a variable like park={xx}.
So it was all right there: fetch the page source, match it with regular expressions, and get to work.
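Before wiring this into Scrapy, the idea is easy to verify with a small standalone script. The sketch below is only an illustration of the approach (fetch the page, regex out the var cities = ...; and var parks = ...; assignments, then json.loads them); the variable names and URL come from the spider code further down, while the park fields it prints (name, city_id) are assumptions based on what that spider reads.

# Standalone sketch of the "regex the embedded JS, then json.loads it" idea.
# Assumes the page assigns `var cities = [...];` and `var parks = {...};`
# exactly as the Scrapy spider below expects.
import re
import json
import requests

html = requests.get('https://www.glprop.com.cn/our-network/network-detail.html').text

cities = json.loads(re.search(r'var cities =(.*);', html).group(1))
parks = json.loads(re.search(r'var parks =(.*);', html).group(1))

# Build an id -> name lookup for cities, then walk every park entry.
city_names = {c['id']: c['name'] for c in cities}
for group in parks.values():
    for park in group.values():
        # 'city_id' and 'name' mirror the fields the spider reads; treat them as assumptions here.
        print(city_names.get(park['city_id']), park.get('name'))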
item:
# GLP (普洛斯) items
import scrapy


class PuluosiNewsItem(scrapy.Item):
    newstitle = scrapy.Field()
    newtiems = scrapy.Field()
    newslink = scrapy.Field()


class PuluosiItem(scrapy.Item):
    assetstitle = scrapy.Field()
    assetaddress = scrapy.Field()
    assetgaikuang = scrapy.Field()
    assetpeople = scrapy.Field()
    asseturl = scrapy.Field()
pipelines:
from openpyxl import Workbook

from news.items import PuluosiNewsItem, PuluosiItem


class PuluosiNewsPipeline(object):
    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        # set the header row for the news sheet
        self.ws.append(['普洛斯新聞標題', '新聞發佈時間', '新聞URL'])
        self.wb2 = Workbook()
        self.ws2 = self.wb2.active
        self.ws2.append(['資產標題', '資產地址', '資產概況', '其餘信息', 'URL'])

    def process_item(self, item, spider):
        if isinstance(item, PuluosiNewsItem):
            # pull each field out of the item into one row
            line = [item['newstitle'], item['newtiems'], item['newslink']]
            self.ws.append(line)
            self.wb.save('PuluosiNews.xlsx')  # save the xlsx file
        elif isinstance(item, PuluosiItem):
            line = [item['assetstitle'], item['assetaddress'], item['assetgaikuang'],
                    item['assetpeople'], item['asseturl']]
            self.ws2.append(line)
            self.wb2.save('PuluosiAsset.xlsx')  # save the xlsx file
        return item
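One thing the snippet above does not show: Scrapy only routes items through this pipeline if it is enabled in the project settings. A minimal sketch, assuming the project is called news (as the from news.items import lines suggest) and the class lives in news/pipelines.py:

# settings.py (sketch) -- register the Excel-writing pipeline.
# The dotted path assumes the class above is defined in news/pipelines.py.
ITEM_PIPELINES = {
    'news.pipelines.PuluosiNewsPipeline': 300,
}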
spider:
# -*- coding: utf-8 -*-
import scrapy, re, json
from news.items import PuluosiNewsItem, PuluosiItem
from scrapy.linkextractors import LinkExtractor


class PuluosiSpider(scrapy.Spider):
    name = 'puluosi'
    allowed_domains = ['glprop.com.cn']
    # start_urls = ['https://www.glprop.com.cn/press-releases.html']

    def start_requests(self):
        yield scrapy.Request('https://www.glprop.com.cn/press-releases.html', self.parse1)
        yield scrapy.Request('https://www.glprop.com.cn/in-the-news.html', self.parse2)
        yield scrapy.Request('https://www.glprop.com.cn/proposed-privatization.html', self.parse3)
        yield scrapy.Request('https://www.glprop.com.cn/our-network/network-detail.html', self.parse4)

    def parse1(self, response):
        print('此時啓動的爬蟲爲:puluosi')
        web = response.xpath('//tbody/tr')
        web.pop(0)  # drop the header row
        for node in web:
            item = PuluosiNewsItem()  # create a fresh item per row
            item['newstitle'] = node.xpath('.//a/text()').extract()[0].strip()
            print(item['newstitle'])
            item['newtiems'] = node.xpath('.//td/text()').extract()[0].strip()
            print(item['newtiems'])
            # urljoin builds an absolute URL; needed because the href values in the page are relative
            item['newslink'] = response.urljoin(node.xpath('.//a/@href').extract()[0])
            # print(item['newslink'])
            yield item
        # use try to check whether the current year's news has a next page
        try:
            next_url_tmp = response.xpath('//div[@class="page"]/a[contains(text(),"下一頁")]/@href').extract()[0]
            if next_url_tmp:
                next_url = "https://www.glprop.com.cn" + next_url_tmp
                yield scrapy.Request(next_url, callback=self.parse1)
        except Exception as e:
            print("當前頁面沒有下一頁")
        # follow the per-year links as well
        href = response.xpath('//ul[@class="timeList"]/li/a/@href')
        for nexturl in href:
            url1 = nexturl.extract()
            if url1:
                url = "https://www.glprop.com.cn" + url1
                yield scrapy.Request(url, callback=self.parse1)

    def parse2(self, response):
        web = response.xpath('//tbody/tr')
        web.pop(0)  # drop the header row
        for node in web:
            item = PuluosiNewsItem()
            item['newstitle'] = node.xpath('.//a/text()').extract()[0].strip()
            print(item['newstitle'])
            item['newtiems'] = node.xpath('.//td/text()').extract()[0].strip()
            print(item['newtiems'])
            # urljoin builds an absolute URL; needed because the href values in the page are relative
            item['newslink'] = response.urljoin(node.xpath('.//a/@href').extract()[0])
            print(item['newslink'])
            yield item
        # use try to check whether the current year's news has a next page
        try:
            next_url_tmp = response.xpath('//div[@class="page"]/a[contains(text(),"下一頁")]/@href').extract()[0]
            if next_url_tmp:
                next_url = "https://www.glprop.com.cn" + next_url_tmp
                yield scrapy.Request(next_url, callback=self.parse2)
        except Exception as e:
            print("當前頁面沒有下一頁")
        href = response.xpath('//ul[@class="timeList"]/li/a/@href')
        for nexturl in href:
            url1 = nexturl.extract()
            if url1:
                url = "https://www.glprop.com.cn" + url1
                yield scrapy.Request(url, callback=self.parse2)

    def parse3(self, response):
        web = response.xpath('//tbody/tr')
        web.pop()  # drop the last row
        for node in web:
            item = PuluosiNewsItem()
            item['newstitle'] = node.xpath('.//a/text()').extract()[0].strip()
            print(item['newstitle'])
            item['newtiems'] = node.xpath('.//td/text()').extract()[0].strip()
            print(item['newtiems'])
            # urljoin builds an absolute URL; needed because the href values in the page are relative
            item['newslink'] = response.urljoin(node.xpath('.//a/@href').extract()[0])
            print(item['newslink'])
            yield item

    def parse4(self, response):
        # NOTE: parse4 is defined twice; Python keeps only the later definition,
        # so this LinkExtractor version is shadowed and never called by Scrapy.
        link = LinkExtractor(restrict_xpaths='//div[@class="net_pop1"]//div[@class="city"]')
        links = link.extract_links(response)  # get the links for all cities
        for i in links:
            detailurl = i.url
            yield scrapy.Request(url=detailurl, callback=self.parse5)

    def parse4(self, response):
        # This is the definition Scrapy actually runs: parse the city and park data
        # embedded as JS variables in the page source.
        citycode = re.findall('var cities =(.*);', response.text)
        citycodejson = json.loads("".join(citycode))
        # put each city's id and name into a dict
        dictcity = {}
        for i in citycodejson:
            citycodename = i['name']
            citycodenm = i['id']
            dictcity[citycodenm] = citycodename
        detail = re.findall('var parks =(.*);', response.text)
        jsonBody = json.loads("".join(detail))
        parklist = []
        for key1 in jsonBody:
            for key2 in jsonBody[key1]:
                parklist.append(jsonBody[key1][key2])
        for node in parklist:
            item = PuluosiItem()
            assetaddress = node['city_id']
            item['assetaddress'] = dictcity[assetaddress]
            # print(item['assetaddress'])
            item['assetstitle'] = node['name']
            # print(item['assetstitle'])
            item['assetgaikuang'] = node['detail_single'].strip().replace(' ', '').replace(' ', '')
            # print(item['assetgaikuang'])
            assetpeople = node['description']
            item['assetpeople'] = re.sub(r'<.*?>', '', assetpeople.strip()).replace(' ', '')
            item['asseturl'] = 'https://www.glprop.com.cn/network-city-detail.html?city=' + item['assetaddress']
            # print(item['assetpeople'])
            yield item
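With the items, pipeline, and spider in place, the crawl is normally started with scrapy crawl puluosi from the project root, and the two workbooks PuluosiNews.xlsx and PuluosiAsset.xlsx end up in the working directory. If you prefer launching it from a script, here is a minimal sketch; the import path news.spiders.puluosi is an assumption about where the spider file sits.

# run_puluosi.py (sketch) -- start the crawl programmatically instead of via `scrapy crawl puluosi`.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from news.spiders.puluosi import PuluosiSpider  # assumed module path for the spider file

process = CrawlerProcess(get_project_settings())  # picks up settings.py, including ITEM_PIPELINES
process.crawl(PuluosiSpider)
process.start()  # blocks until the crawl finishes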
While I was at it, I also scraped the news pages on the site.