Scrapy-爬取安智市場app詳情

時間 2020-08-22

標籤 scrapy 市場 app 詳情欄目 Python 简体版

原文鏈接

前言

本篇文章是利用Scrapy扒取安智市場的app詳情頁，如點擊查看和平精英，包括app名、版本號、圖標icon、分類、時間、大小、下載量、作者、簡介、更新說明、軟件截圖、精彩內容等，扒取的圖片資源icon和市場展示圖（app截圖）下載到本地，並將全部數據存儲到數據庫。

考慮的問題：

存儲的數據庫設計
圖片資源鏈接存在重定向
下載app的圖標需爲.png後綴
...

需要先熟悉Scrapy框架的同學：點擊學習

數據庫設計

創建的是mysql數據庫，名稱爲app_anzhigame，表名爲games。安智市場的市場圖限制爲4-5張，簡介等爲1500字以內，圖片均爲相對地址。

# Create the database
CREATE DATABASE app_anzhigame CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;

USE app_anzhigame;
# IF EXISTS: on a freshly created database the table does not exist yet,
# and a plain DROP TABLE would abort the script with "Unknown table".
DROP TABLE IF EXISTS games;

# Create the table.
# One row per crawled app. The image columns hold file paths (up to five
# market screenshots); the long text columns are capped at 1500 characters.
CREATE TABLE games(
  id INTEGER(11)  UNSIGNED AUTO_INCREMENT COLLATE utf8mb4_general_ci,
  name VARCHAR(20) NOT NULL COLLATE utf8mb4_general_ci COMMENT '遊戲名' ,
  versionCode VARCHAR(10) COLLATE utf8mb4_general_ci COMMENT '版本號' NOT NULL DEFAULT 'v1.0',
  icon VARCHAR(100) COLLATE utf8mb4_general_ci COMMENT '遊戲圖標icon' NOT NULL DEFAULT '',
  type VARCHAR(20) COLLATE utf8mb4_general_ci COMMENT '分類' NOT NULL DEFAULT '',
  onlineTime VARCHAR(20) COLLATE utf8mb4_general_ci COMMENT '上線時間',
  size VARCHAR(10) COLLATE utf8mb4_general_ci COMMENT '大小' NOT NULL DEFAULT '0B',
  download VARCHAR(10) COLLATE utf8mb4_general_ci COMMENT '下載量' NOT NULL DEFAULT '0',
  author VARCHAR(20) COLLATE utf8mb4_general_ci COMMENT '做者',
  intro VARCHAR(1500) COLLATE utf8mb4_general_ci COMMENT '簡介',
  updateInfo VARCHAR(1500) COLLATE utf8mb4_general_ci COMMENT '更新說明',
  highlight VARCHAR(1500) COLLATE utf8mb4_general_ci COMMENT '精彩內容',
  image1 VARCHAR(100) COLLATE utf8mb4_general_ci COMMENT '市場圖1',
  image2 VARCHAR(100) COLLATE utf8mb4_general_ci COMMENT '市場圖2',
  image3 VARCHAR(100) COLLATE utf8mb4_general_ci COMMENT '市場圖3',
  image4 VARCHAR(100) COLLATE utf8mb4_general_ci COMMENT '市場圖4',
  image5 VARCHAR(100) COLLATE utf8mb4_general_ci COMMENT '市場圖5',
  link VARCHAR(200) COLLATE utf8mb4_general_ci COMMENT '爬取連接',
  create_time timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '建立時間',
  update_time timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE current_timestamp COMMENT '更新時間',
  PRIMARY KEY (`id`)
)ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci COMMENT '安智市場爬取遊戲列表';

建立item

創建項目scrapy startproject anzhispider，修改items.py：

class AnzhispiderItem(scrapy.Item):
    """Container for everything scraped from one Anzhi app-detail page.

    The spider fills the page-derived fields; ``iconPath`` and ``imagePaths``
    are filled later by the file-download pipeline.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    # URL of the crawled detail page
    link = scrapy.Field()
    # App name
    name = scrapy.Field()
    # Version string
    versionCode = scrapy.Field()
    # Icon address as found in the page (``src`` attribute)
    icon = scrapy.Field()
    # Storage path of the downloaded icon
    iconPath = scrapy.Field()
    # Category
    type = scrapy.Field()
    # Release time
    onlineTime = scrapy.Field()
    # Package size
    size = scrapy.Field()
    # Download count
    download = scrapy.Field()
    # Author
    author = scrapy.Field()
    # Introduction
    intro = scrapy.Field()
    # Update notes
    updateInfo = scrapy.Field()
    # Highlights
    highlight = scrapy.Field()
    # Market screenshots: list of address strings as found in the page
    images = scrapy.Field()
    # Storage paths of the downloaded market screenshots
    imagePaths = scrapy.Field()

建立Spider

在spiders目錄下創建AnzhiSpider.py，並創建class AnzhiSpider，繼承於scrapy.Spider。

class AnzhiSpider(Spider):
    """Scrape one Anzhi market app-detail page into an ``AnzhispiderItem``."""

    name = "AnzhiSpider"
    # Domains this spider is allowed to request.
    allowed_domains = ["www.anzhi.com"]

    start_urls = ["http://www.anzhi.com/pkg/3d81_com.tencent.tmgp.pubgmhd.html"]

    # Alternative seed list covering several detail pages:
    # start_urls = ["http://www.anzhi.com/pkg/3d81_com.tencent.tmgp.pubgmhd.html","http://www.anzhi.com/pkg/84bf_com.sxiaoao.feijidazhan.html","http://www.anzhi.com/pkg/4f41_com.tencent.tmgp.WePop.html"]

    # Label prefix of a detail line -> item field receiving the text after it.
    # NOTE(review): these labels must match the page text exactly; confirm
    # them against the live page (they may have been altered by a
    # Simplified/Traditional script conversion of this source).
    _DETAIL_FIELDS = (
        ("分類：", "type"),
        ("時間：", "onlineTime"),
        ("大小：", "size"),
        ("下載：", "download"),
        ("做者：", "author"),
    )

    # Section title on the page -> item field receiving that section's HTML.
    _SECTION_FIELDS = (
        ("intro", "簡介"),
        ("updateInfo", "更新說明"),
        ("highlight", "精彩內容"),
    )

    # Deletes tabs and line breaks from extracted section markup.
    _LAYOUT_CHARS = str.maketrans('', '', '\t\n\r')

    def parse(self, response):
        """Build a single item from the detail page and yield it.

        Raises ``IndexError`` when the icon, name or version node is missing,
        because those three are read with ``extract()[0]``.
        """
        item = AnzhispiderItem()
        root = response.xpath('.//div[@class="content_left"]')
        description = './/div[@class="app_detail"]/div[@class="detail_description"]'

        # Page URL
        item['link'] = response.url
        # Icon address
        icon_nodes = root.xpath('.//div[@class="app_detail"]/div[@class="detail_icon"]/img/@src')
        item['icon'] = icon_nodes.extract()[0]
        # App name
        name_nodes = root.xpath(description + '/div[@class="detail_line"]/h3/text()')
        item['name'] = name_nodes.extract()[0]
        # Version; the page wraps it in parentheses, which are dropped here
        version_nodes = root.xpath(
            description + '/div[@class="detail_line"]/span[@class="app_detail_version"]/text()')
        version = version_nodes.extract()[0]
        if version.startswith("(") and version.endswith(")"):
            version = version[1:-1]
        item['versionCode'] = version

        # Category, time, size, downloads and author are "label：value" texts
        # found both directly in the <li> elements and in nested <span>s.
        line_items = description + '/ul[@id="detail_line_ul"]/li'
        detail_texts = root.xpath(line_items + '/text()').extract()
        detail_texts += root.xpath(line_items + '/span/text()').extract()
        for text in detail_texts:
            for label, field in self._DETAIL_FIELDS:
                if text.startswith(label):
                    item[field] = text[len(label):]
                    break

        # Introduction, update notes and highlights share one page layout.
        for field, title in self._SECTION_FIELDS:
            item[field] = self._section_html(root, title)

        # Market screenshot addresses
        item['images'] = root.xpath(
            './/div[@class="app_detail_list"][contains(./div[@class="app_detail_title"],"軟件截圖")]//ul/li/img/@src').extract()
        yield item

    @classmethod
    def _section_html(cls, root, title):
        """Return the markup of the section titled *title*, or "" if absent.

        Tabs, newlines and carriage returns are removed from the markup.
        """
        blocks = root.xpath(
            './/div[@class="app_detail_list"][contains(./div[@class="app_detail_title"],"'
            + title
            + '")]/div[@class="app_detail_infor"]').extract()
        if not blocks:
            return ""
        return blocks[0].translate(cls._LAYOUT_CHARS)

下載icon和市場圖

創建ImageResPipeline並繼承FilesPipeline（from scrapy.pipelines.files import FilesPipeline）。不用ImagesPipeline的原因可以查看ImagesPipeline官網的解釋，它的主要功能爲：

將全部下載的圖片轉換成通用的格式（JPG）和模式（RGB）
避免從新下載最近已經下載過的圖片
縮略圖生成
檢測圖像的寬/高，確保它們知足最小限制

劃重點：ImagesPipeline下載的圖片爲jpg格式，而小編需要下載png格式的icon，且圖標需爲無背景的；採用ImagesPipeline時，圖片就算進行類型轉換還是不能去掉背景，這樣會導致圓角圖標的空缺處被白色補滿。

class ImageResPipeline(FilesPipeline):
    """Download the app icon and market screenshots of every item.

    Files are stored below ``FILES_STORE`` as
    ``YYYY/MM/DD/<app name>/icon.png`` and ``YYYY/MM/DD/<app name>/<n>.jpg``.
    The resulting locations are written back to the item as ``iconPath`` and
    ``imagePaths``.
    """

    # Site root against which the addresses found in the page are resolved.
    _BASE_URL = 'http://www.anzhi.com'

    # Characters that cannot appear in a Windows file name (this includes both
    # path separators); they are replaced in the per-app directory name.
    _RESERVED_NAME_CHARS = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})

    def get_media_requests(self, item, info):
        """Yield one download request for the icon and one per screenshot.

        ``meta['index']`` tells ``file_path`` what is being downloaded:
        0 is the icon, 1..n are the screenshots in page order.

        :param item: item carrying ``icon`` and ``images``
        :param info: pipeline bookkeeping object (unused)
        :return: generator of ``scrapy.Request``
        """
        from urllib.parse import urljoin

        # urljoin gives the same URL as plain concatenation for the usual
        # "/path" addresses and also copes with addresses that lack the
        # leading slash or are already absolute.
        yield scrapy.Request(url=urljoin(self._BASE_URL, item['icon']),
                             meta={'item': item, 'index': 0})
        for index, src in enumerate(item['images'], start=1):
            yield scrapy.Request(url=urljoin(self._BASE_URL, src),
                                 meta={'item': item, 'index': index})

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path, relative to ``FILES_STORE``, for a request.

        The path uses "/" between components, which is the form Scrapy's
        filesystem store expects; it is mapped to native separators there.

        :param request: download request; ``meta`` carries ``item``/``index``
        :param response: unused
        :param info: unused
        :param item: item being processed when the caller supplies it;
            otherwise the item is taken from ``request.meta``
        :return: relative path such as ``2019/11/28/<app name>/icon.png``
        """
        if item is None:
            item = request.meta['item']
        index = request.meta['index']

        if index > 0:
            # Screenshot: named after its position. The number has to be
            # turned into a string before it can become part of the path.
            leaf = str(index) + ".jpg"
        else:
            # Icon: kept as .png.
            leaf = "icon.png"

        app_dir = str(item['name']).translate(self._RESERVED_NAME_CHARS)
        day_dir = datetime.date.today().strftime("%Y/%m/%d")
        file_name = "/".join((day_dir, app_dir, leaf))

        # Remove a previous download at the same location
        # (presumably so that the file is always fetched again — confirm).
        stale = os.path.join(FILES_STORE, *file_name.split("/"))
        if os.path.exists(stale):
            os.remove(stale)
        return file_name

    def item_completed(self, results, item, info):
        """Copy the paths of the successful downloads onto the item.

        ``results`` is a list of ``(ok, value)`` pairs, one per request.
        For a successful download ``value`` is a dict with the keys
        ``checksum``, ``path`` and ``url``; otherwise it is a ``Failure``.

        :param results: download results as described above
        :param item: item the downloads belong to
        :param info: unused
        :return: the item with ``iconPath`` / ``imagePaths`` filled in
        :raises DropItem: when not a single file was downloaded
        """
        stored = [value['path'] for ok, value in results if ok]
        if not stored:
            raise DropItem("Item contains no files")

        for path in stored:
            location = os.path.join(FILES_STORE, path)
            if path.endswith("png"):
                # Only the icon is saved with a png suffix.
                item['iconPath'] = location
            else:
                # Screenshot: create the list on first use.
                if 'imagePaths' not in item:
                    item['imagePaths'] = []
                item['imagePaths'].append(location)
        return item

數據庫存儲

連接mysql採用的是PyMySQL==0.9.2，小編新建了一個工具類存放數據庫操作：插入、更新、刪除語句調用update(self, sql)，查詢語句調用query(self, sql)。

class MySQLHelper:
    """Small helper that runs one SQL statement per freshly opened connection.

    ``query`` is for SELECT statements, ``update`` for INSERT/UPDATE/DELETE.
    Both accept optional bound parameters; with PyMySQL the placeholder in
    the SQL text is ``%s``.
    """

    def __init__(self, host="192.168.20.202", user="***", password="****",
                 db="app_anzhigame", port=3306, charset="utf8mb4"):
        """Remember the connection settings; no connection is opened here.

        The defaults reproduce the previously hard-coded settings, so
        ``MySQLHelper()`` behaves as before.
        """
        self._connect_kwargs = {
            "host": host,
            "user": user,
            "password": password,
            "db": db,
            "port": port,
            "use_unicode": True,
            "charset": charset,
        }

    def query(self, sql, args=None):
        """Run a SELECT statement and return all rows.

        :param sql: statement text
        :param args: optional parameters bound to the statement's placeholders
        :return: whatever ``fetchall()`` of the cursor returns
        :raises Exception: any driver error is logged and re-raised
        """
        db = self.conn()
        try:
            cur = db.cursor()
            try:
                self._execute(cur, sql, args)
                return cur.fetchall()
            finally:
                cur.close()
        except Exception as e:
            thread_logger.debug('[mysql]：{} \n\tError SQL： {}'.format(e, sql))
            raise
        finally:
            self.close(db)

    def update(self, sql, args=None):
        """Run a data-modifying statement and commit it.

        :param sql: statement text
        :param args: optional parameters bound to the statement's placeholders
        :return: ``True`` on success; ``False`` when the statement failed
            (the error is logged and the transaction rolled back)
        """
        db = self.conn()
        try:
            cur = db.cursor()
            try:
                self._execute(cur, sql, args)
            finally:
                cur.close()
            db.commit()
            return True
        except Exception as e:
            thread_logger.debug('[mysql]：{} \n\tError SQL： {}'.format(e, sql))
            # Undo the failed statement.
            db.rollback()
            return False
        finally:
            self.close(db)

    @staticmethod
    def _execute(cur, sql, args):
        """Execute *sql* on *cur*, binding *args* only when they were given."""
        if args is None:
            cur.execute(sql)
        else:
            cur.execute(sql, args)

    def conn(self):
        """Open and return a new database connection."""
        return pymysql.connect(**self._connect_kwargs)

    def close(self, db):
        """Close the given connection."""
        db.close()

更改AnzhispiderPipeline，插入數據，部分數據有默認值處理：

class AnzhispiderPipeline(object):
    """Item pipeline that inserts every scraped app into the ``games`` table."""

    def __init__(self):
        # Helper through which the INSERT statement is executed.
        self.mysqlHelper = MySQLHelper()

    def process_item(self, item, spider):
        """Insert *item* into ``games`` and pass it on unchanged.

        Missing or empty fields fall back to defaults. Every value is escaped
        before it is placed in the statement: the text fields contain markup
        scraped from a web page, so quotes or backslashes in them would
        otherwise break the statement or allow SQL injection.

        :param item: scraped item; ``link`` and ``name`` must be present
        :param spider: spider that produced the item (used for logging)
        :return: the item, also when the insert failed
        """
        # Escaping helper from the MySQL driver this project already uses.
        # NOTE(review): assumes the server does not run with
        # NO_BACKSLASH_ESCAPES — confirm.
        from pymysql.converters import escape_string

        values = [
            item['link'],
            item['name'],
            parseProperty(item, "versionCode", "v1.0"),
            parseProperty(item, "iconPath", ""),
            parseProperty(item, "type", ""),
            parseProperty(item, "onlineTime", ""),
            parseProperty(item, "size", "0B"),
            parseProperty(item, "download", "0"),
            parseProperty(item, "author", "未知"),
            parseProperty(item, "intro", "無"),
            parseProperty(item, "updateInfo", "無"),
            parseProperty(item, "highlight", "無"),
        ]
        # Columns image1..image5; absent screenshots are stored as ''.
        values.extend(parseImageList(item, position) for position in range(5))

        quoted = ",".join("'%s'" % escape_string(str(value)) for value in values)
        sql = "INSERT INTO games(link,name,versionCode,icon,type,onlineTime,size,download,author,intro,updateInfo,highlight,image1,image2,image3,image4,image5) " \
              "VALUES (%s)" % quoted

        # update() reports failure through its return value, not an exception.
        if not self.mysqlHelper.update(sql):
            spider.logger.warning("Could not store %s in table games", item['link'])
        return item

def parseProperty(item, property, defaultValue)爲自定義的方法，用於判空獲取默認值；def parseImageList(item, index)用於獲取市場圖：

def parseProperty(item, property, defaultValue):
    """
    Return ``item[property]``, or the default when it is missing or falsy.

    :param item: mapping-like object to read from
    :param property: key to look up
    :param defaultValue: value returned when the key is absent or its value
        is empty (``""``, ``None``, ``0``, empty list, ...)
    """
    value = item[property] if property in item else None
    return value if value else defaultValue


def parseImageList(item, index):
    """
    Return the stored market-screenshot path at position ``index``.

    :param item: mapping-like object that may hold an ``imagePaths`` list
    :param index: zero-based position of the screenshot
    :return: the path, or ``""`` when there are no screenshots or fewer
        than ``index + 1`` of them
    """
    paths = item["imagePaths"] if "imagePaths" in item else None
    if not paths or index >= len(paths):
        return ""
    return paths[index]

配置settings.py

注意增加FILES_STORE用於存儲文件下載的路徑，MEDIA_ALLOW_REDIRECTS爲允許圖片重定向，因爲安智的圖片鏈接是重定向的，不設置會下載失敗。

# Directory in which downloaded files are stored
FILES_STORE = ".\\anzhigames\\"

# Let media downloads follow redirects (optional setting)
MEDIA_ALLOW_REDIRECTS = True

配置pipelines，注意ImageResPipeline的數值需要比AnzhispiderPipeline小，數值範圍爲0-1000，越小優先級越高。

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Lower numbers run first: ImageResPipeline (11) must run before
# AnzhispiderPipeline (300) so that the downloaded file paths are already
# on the item when it is written to the database.
ITEM_PIPELINES = {
   'anzhispider.pipelines.AnzhispiderPipeline': 300,
   'anzhispider.pipelines.ImageResPipeline': 11,
}

至此結束。運行scrapy crawl AnzhiSpider，收工。項目下的.\\anzhigames\\目錄中生成了圖片。