Crawler frameworks usually scrape Ajax-driven dynamic pages by running the page's JavaScript through a third-party WebKit library and then handing the rendered HTML to the spider for parsing. This article takes a different route: use Fiddler to capture the app's traffic and inspect the actual Ajax requests, find the API URL that returns the JSON data, and call that API directly. This avoids pulling in a Python WebKit library, and since Ajax responses are usually structured data, it also spares us the pain of parsing HTML with XPath.
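As a minimal sketch of the idea (the endpoint and User-Agent below come from the packet capture described next, and should be treated as assumptions that may change as the site evolves), calling the API directly yields parsed JSON with no HTML in sight:

import requests

# Endpoint observed via Fiddler; the User-Agent mimics the Qiushibaike app.
url = "https://m2.qiushibaike.com/article/list/text?page=1"
headers = {"User-Agent": "qiushibalke_10.13.0_WIFI_auto_7"}

resp = requests.get(url, headers=headers, timeout=10)
resp.raise_for_status()
data = resp.json()           # structured data -- no XPath needed
print(len(data["items"]))    # number of posts on this page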
Open the Qiushibaike app on a phone and use Fiddler to capture the JSON traffic. Check whether the API URL you obtain can be accessed normally; if it can, try it in another browser as well, as shown in the figure.
The JSON data it returns is shown in the figure; the JSON-Handle plugin (installable in Chrome) is recommended for viewing it.
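If you would rather not install a browser plugin, a quick alternative (a sketch reusing the assumed endpoint and User-Agent from above) is to pretty-print the response in Python:

import json
import requests

url = "https://m2.qiushibaike.com/article/list/text?page=1"
headers = {"User-Agent": "qiushibalke_10.13.0_WIFI_auto_7"}
data = requests.get(url, headers=headers, timeout=10).json()

# ensure_ascii=False keeps the Chinese text readable instead of \uXXXX escapes
print(json.dumps(data, indent=2, ensure_ascii=False))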
Implementation (crawling 99 pages as an example):
items.py
import scrapy


class QiushibalkeItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    uid = scrapy.Field()
    nickname = scrapy.Field()
    gender = scrapy.Field()
    astrology = scrapy.Field()
    content = scrapy.Field()
    crawl_time = scrapy.Field()
spiders/qiushi.py
# -*- coding: utf-8 -*-
import scrapy
import json
from datetime import datetime

from qiushibalke.items import QiushibalkeItem


class QiushiSpider(scrapy.Spider):
    name = "qiushi"
    allowed_domains = ["m2.qiushibaike.com"]

    def start_requests(self):
        # generate one request per page, pages 1 through 99
        for i in range(1, 100):
            url = "https://m2.qiushibaike.com/article/list/text?page={}".format(i)
            yield scrapy.Request(url, callback=self.parse_item)

    def parse_item(self, response):
        datas = json.loads(response.text)["items"]
        # print(datas)  # debug: inspect the raw JSON structure
        for data in datas:
            item = QiushibalkeItem()
            item["uid"] = data["user"]["uid"]
            item["nickname"] = data["user"]["login"]
            item["gender"] = data["user"]["gender"]
            item["astrology"] = data["user"]["astrology"]
            item["content"] = data["content"]
            item["crawl_time"] = datetime.now()
            yield item
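For reference, the fields the spider reads imply a response shape roughly like the following. This is an illustrative sketch reconstructed from the keys used above, with made-up values, not the API's documented schema:

# Illustrative shape only -- keys match what parse_item reads.
response_json = {
    "items": [
        {
            "content": "...",              # the post text
            "votes": {"up": 123},          # available in the response but unused above
            "user": {
                "uid": 10000001,
                "login": "some_nickname",
                "gender": "M",
                "astrology": "...",
            },
        },
        # ... one dict per post on the page
    ]
}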
pipelines.py
import pymysql


class QiushibalkePipeline(object):
    def process_item(self, item, spider):
        # opening a connection per item is simple but slow; fine for a small crawl
        con = pymysql.connect(host="127.0.0.1", user="your_username",
                              passwd="your_password", db="qiushi", charset="utf8")
        cur = con.cursor()
        sql = ("insert into baike(uid,nickname,gender,astrology,content,crawl_time) "
               "VALUES(%s,%s,%s,%s,%s,%s)")
        lis = (item["uid"], item["nickname"], item["gender"], item["astrology"],
               item["content"], item["crawl_time"])
        cur.execute(sql, lis)
        con.commit()
        cur.close()
        con.close()
        return item
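The pipeline assumes a qiushi database with a baike table already exists. Here is a one-off setup sketch; the column types are guesses inferred from the fields above, not taken from the original post:

import pymysql

# One-off setup script; adjust credentials to match your MySQL server.
con = pymysql.connect(host="127.0.0.1", user="your_username",
                      passwd="your_password", charset="utf8")
cur = con.cursor()
cur.execute("CREATE DATABASE IF NOT EXISTS qiushi DEFAULT CHARACTER SET utf8")
cur.execute("""
    CREATE TABLE IF NOT EXISTS qiushi.baike (
        uid        BIGINT,
        nickname   VARCHAR(64),
        gender     VARCHAR(8),
        astrology  VARCHAR(16),
        content    TEXT,
        crawl_time DATETIME
    ) DEFAULT CHARSET=utf8
""")
con.commit()
cur.close()
con.close()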
settings.py
BOT_NAME = 'qiushibalke'

SPIDER_MODULES = ['qiushibalke.spiders']
NEWSPIDER_MODULE = 'qiushibalke.spiders'

ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 5
COOKIES_ENABLED = False

DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "qiushibalke_10.13.0_WIFI_auto_7",
    # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}

ITEM_PIPELINES = {
    'qiushibalke.pipelines.QiushibalkePipeline': 300,
    # 'scrapy_redis.pipelines.RedisPipeline': 300,
}
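With everything in place, start the crawl with scrapy crawl qiushi from the project root, or from a small launcher script (a common convenience, not part of the original post):

# run.py -- launch the spider without leaving your editor
from scrapy.cmdline import execute

execute(["scrapy", "crawl", "qiushi"])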
The scraped data is shown in the figure: