I recently had a task to scrape information from a logistics company's official website. At first glance the site is mostly static pages and looked easy to scrape, unlike news portals or e-commerce sites with complicated structures, strict anti-scraping measures, and lots of AJAX, so I was quietly congratulating myself. But once I dug in, I found these are not ordinary static pages.
For example, on this page I need to get the distribution of logistics parks across China's major cities, along with the detail information for each park.
The page has a map embedded in it, and each city's logistics information only shows up when you click that city on the map.
https://www.glprop.com.cn/our...html
My first thought was that this might be an AJAX request, but capturing the traffic in Chrome turned up nothing, so I looked at the page source
and found that all of the city information sits inside a script block,
as shown in the figure:
and the information for each park is stored in a variable like park={xx}.
So it was all right there: fetch the page source, match it with regular expressions, and get to work.
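Before wiring this into Scrapy, the idea is easy to verify with a small standalone script. The sketch below is only an illustration of the approach (fetch the page, regex out the var cities = ...; and var parks = ...; assignments, then json.loads them); the variable names and URL come from the spider code further down, while the park fields it prints (name, city_id) are assumptions based on what that spider reads.

# Standalone sketch of the "regex the embedded JS, then json.loads it" idea.
# Assumes the page assigns `var cities = [...];` and `var parks = {...};`
# exactly as the Scrapy spider below expects.
import re
import json
import requests

html = requests.get('https://www.glprop.com.cn/our-network/network-detail.html').text

cities = json.loads(re.search(r'var cities =(.*);', html).group(1))
parks = json.loads(re.search(r'var parks =(.*);', html).group(1))

# Build an id -> name lookup for cities, then walk every park entry.
city_names = {c['id']: c['name'] for c in cities}
for group in parks.values():
    for park in group.values():
        # 'city_id' and 'name' mirror the fields the spider reads; treat them as assumptions here.
        print(city_names.get(park['city_id']), park.get('name'))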
item:
# GLP (普洛斯) items
import scrapy


class PuluosiNewsItem(scrapy.Item):
    newstitle = scrapy.Field()
    newtiems = scrapy.Field()
    newslink = scrapy.Field()


class PuluosiItem(scrapy.Item):
    assetstitle = scrapy.Field()
    assetaddress = scrapy.Field()
    assetgaikuang = scrapy.Field()
    assetpeople = scrapy.Field()
    asseturl = scrapy.Field()
pipelines:
from openpyxl import Workbook

from news.items import PuluosiNewsItem, PuluosiItem


class PuluosiNewsPipeline(object):
    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        # set the header row for the news sheet
        self.ws.append(['普洛斯新聞標題', '新聞發佈時間', '新聞URL'])
        self.wb2 = Workbook()
        self.ws2 = self.wb2.active
        self.ws2.append(['資產標題', '資產地址', '資產概況', '其餘信息', 'URL'])

    def process_item(self, item, spider):
        if isinstance(item, PuluosiNewsItem):
            # pull each field out of the item into one row
            line = [item['newstitle'], item['newtiems'], item['newslink']]
            self.ws.append(line)
            self.wb.save('PuluosiNews.xlsx')  # save the xlsx file
        elif isinstance(item, PuluosiItem):
            line = [item['assetstitle'], item['assetaddress'], item['assetgaikuang'],
                    item['assetpeople'], item['asseturl']]
            self.ws2.append(line)
            self.wb2.save('PuluosiAsset.xlsx')  # save the xlsx file
        return item
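One thing the snippet above does not show: Scrapy only routes items through this pipeline if it is enabled in the project settings. A minimal sketch, assuming the project is called news (as the from news.items import lines suggest) and the class lives in news/pipelines.py:

# settings.py (sketch) -- register the Excel-writing pipeline.
# The dotted path assumes the class above is defined in news/pipelines.py.
ITEM_PIPELINES = {
    'news.pipelines.PuluosiNewsPipeline': 300,
}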
spider:
# -*- coding: utf-8 -*-
import scrapy, re, json
from news.items import PuluosiNewsItem, PuluosiItem
from scrapy.linkextractors import LinkExtractor


class PuluosiSpider(scrapy.Spider):
    name = 'puluosi'
    allowed_domains = ['glprop.com.cn']
    # start_urls = ['https://www.glprop.com.cn/press-releases.html']

    def start_requests(self):
        yield scrapy.Request('https://www.glprop.com.cn/press-releases.html', self.parse1)
        yield scrapy.Request('https://www.glprop.com.cn/in-the-news.html', self.parse2)
        yield scrapy.Request('https://www.glprop.com.cn/proposed-privatization.html', self.parse3)
        yield scrapy.Request('https://www.glprop.com.cn/our-network/network-detail.html', self.parse4)

    def parse1(self, response):
        print('此時啓動的爬蟲爲:puluosi')
        web = response.xpath('//tbody/tr')
        web.pop(0)  # drop the header row
        for node in web:
            item = PuluosiNewsItem()  # create a fresh item per row
            item['newstitle'] = node.xpath('.//a/text()').extract()[0].strip()
            print(item['newstitle'])
            item['newtiems'] = node.xpath('.//td/text()').extract()[0].strip()
            print(item['newtiems'])
            # urljoin builds an absolute URL; needed because the href values in the page are relative
            item['newslink'] = response.urljoin(node.xpath('.//a/@href').extract()[0])
            # print(item['newslink'])
            yield item
        # use try to check whether the current year's news has a next page
        try:
            next_url_tmp = response.xpath('//div[@class="page"]/a[contains(text(),"下一頁")]/@href').extract()[0]
            if next_url_tmp:
                next_url = "https://www.glprop.com.cn" + next_url_tmp
                yield scrapy.Request(next_url, callback=self.parse1)
        except Exception as e:
            print("當前頁面沒有下一頁")
        # follow the per-year links as well
        href = response.xpath('//ul[@class="timeList"]/li/a/@href')
        for nexturl in href:
            url1 = nexturl.extract()
            if url1:
                url = "https://www.glprop.com.cn" + url1
                yield scrapy.Request(url, callback=self.parse1)

    def parse2(self, response):
        web = response.xpath('//tbody/tr')
        web.pop(0)  # drop the header row
        for node in web:
            item = PuluosiNewsItem()
            item['newstitle'] = node.xpath('.//a/text()').extract()[0].strip()
            print(item['newstitle'])
            item['newtiems'] = node.xpath('.//td/text()').extract()[0].strip()
            print(item['newtiems'])
            # urljoin builds an absolute URL; needed because the href values in the page are relative
            item['newslink'] = response.urljoin(node.xpath('.//a/@href').extract()[0])
            print(item['newslink'])
            yield item
        # use try to check whether the current year's news has a next page
        try:
            next_url_tmp = response.xpath('//div[@class="page"]/a[contains(text(),"下一頁")]/@href').extract()[0]
            if next_url_tmp:
                next_url = "https://www.glprop.com.cn" + next_url_tmp
                yield scrapy.Request(next_url, callback=self.parse2)
        except Exception as e:
            print("當前頁面沒有下一頁")
        href = response.xpath('//ul[@class="timeList"]/li/a/@href')
        for nexturl in href:
            url1 = nexturl.extract()
            if url1:
                url = "https://www.glprop.com.cn" + url1
                yield scrapy.Request(url, callback=self.parse2)

    def parse3(self, response):
        web = response.xpath('//tbody/tr')
        web.pop()  # drop the last row
        for node in web:
            item = PuluosiNewsItem()
            item['newstitle'] = node.xpath('.//a/text()').extract()[0].strip()
            print(item['newstitle'])
            item['newtiems'] = node.xpath('.//td/text()').extract()[0].strip()
            print(item['newtiems'])
            # urljoin builds an absolute URL; needed because the href values in the page are relative
            item['newslink'] = response.urljoin(node.xpath('.//a/@href').extract()[0])
            print(item['newslink'])
            yield item

    def parse4(self, response):
        # NOTE: parse4 is defined twice; Python keeps only the later definition,
        # so this LinkExtractor version is shadowed and never called by Scrapy.
        link = LinkExtractor(restrict_xpaths='//div[@class="net_pop1"]//div[@class="city"]')
        links = link.extract_links(response)  # get the links for all cities
        for i in links:
            detailurl = i.url
            yield scrapy.Request(url=detailurl, callback=self.parse5)

    def parse4(self, response):
        # This is the definition Scrapy actually runs: parse the city and park data
        # embedded as JS variables in the page source.
        citycode = re.findall('var cities =(.*);', response.text)
        citycodejson = json.loads("".join(citycode))
        # put each city's id and name into a dict
        dictcity = {}
        for i in citycodejson:
            citycodename = i['name']
            citycodenm = i['id']
            dictcity[citycodenm] = citycodename
        detail = re.findall('var parks =(.*);', response.text)
        jsonBody = json.loads("".join(detail))
        parklist = []
        for key1 in jsonBody:
            for key2 in jsonBody[key1]:
                parklist.append(jsonBody[key1][key2])
        for node in parklist:
            item = PuluosiItem()
            assetaddress = node['city_id']
            item['assetaddress'] = dictcity[assetaddress]
            # print(item['assetaddress'])
            item['assetstitle'] = node['name']
            # print(item['assetstitle'])
            item['assetgaikuang'] = node['detail_single'].strip().replace(' ', '').replace(' ', '')
            # print(item['assetgaikuang'])
            assetpeople = node['description']
            item['assetpeople'] = re.sub(r'<.*?>', '', assetpeople.strip()).replace(' ', '')
            item['asseturl'] = 'https://www.glprop.com.cn/network-city-detail.html?city=' + item['assetaddress']
            # print(item['assetpeople'])
            yield item
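With the items, pipeline, and spider in place, the crawl is normally started with scrapy crawl puluosi from the project root, and the two workbooks PuluosiNews.xlsx and PuluosiAsset.xlsx end up in the working directory. If you prefer launching it from a script, here is a minimal sketch; the import path news.spiders.puluosi is an assumption about where the spider file sits.

# run_puluosi.py (sketch) -- start the crawl programmatically instead of via `scrapy crawl puluosi`.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from news.spiders.puluosi import PuluosiSpider  # assumed module path for the spider file

process = CrawlerProcess(get_project_settings())  # picks up settings.py, including ITEM_PIPELINES
process.crawl(PuluosiSpider)
process.start()  # blocks until the crawl finishes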
While I was at it, I also scraped the news pages on the site.