This article uses Scrapy to crawl app detail pages on the Anzhi market (安智市場), such as the 和平精英 detail page, extracting the app name, version, icon, category, release time, size, download count, author, introduction, update notes, market screenshots and highlights. The image resources (the icon and the market screenshots) are downloaded locally, and all the data is stored in a database.
Points considered up front:
Readers who still need to get familiar with the Scrapy framework should learn that first.
A MySQL database named app_anzhigame is created, with a table named games. Note that the Anzhi market shows 4-5 market screenshots per app, the introduction, update notes and highlights stay within 1500 characters, and all image URLs on the page are relative paths.
# Create the database
CREATE DATABASE app_anzhigame CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
USE app_anzhigame;
DROP TABLE IF EXISTS games;

# Create the table
CREATE TABLE games(
    id INTEGER(11) UNSIGNED AUTO_INCREMENT,
    name VARCHAR(20) NOT NULL COMMENT 'game name',
    versionCode VARCHAR(10) NOT NULL DEFAULT 'v1.0' COMMENT 'version',
    icon VARCHAR(100) NOT NULL DEFAULT '' COMMENT 'icon path',
    type VARCHAR(20) NOT NULL DEFAULT '' COMMENT 'category',
    onlineTime VARCHAR(20) COMMENT 'release time',
    size VARCHAR(10) NOT NULL DEFAULT '0B' COMMENT 'size',
    download VARCHAR(10) NOT NULL DEFAULT '0' COMMENT 'download count',
    author VARCHAR(20) COMMENT 'author',
    intro VARCHAR(1500) COMMENT 'introduction',
    updateInfo VARCHAR(1500) COMMENT 'update notes',
    highlight VARCHAR(1500) COMMENT 'highlights',
    image1 VARCHAR(100) COMMENT 'market screenshot 1',
    image2 VARCHAR(100) COMMENT 'market screenshot 2',
    image3 VARCHAR(100) COMMENT 'market screenshot 3',
    image4 VARCHAR(100) COMMENT 'market screenshot 4',
    image5 VARCHAR(100) COMMENT 'market screenshot 5',
    link VARCHAR(200) COMMENT 'crawled URL',
    create_time timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT 'created at',
    update_time timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'updated at',
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci COMMENT 'games crawled from the Anzhi market';
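Before wiring up the spider, the schema can be sanity-checked with a few lines of PyMySQL (a minimal sketch; host and credentials are placeholders):

import pymysql

# placeholders: substitute your own host and credentials
db = pymysql.connect(host="127.0.0.1", user="root", password="****",
                     db="app_anzhigame", charset="utf8mb4")
try:
    with db.cursor() as cur:
        cur.execute("SHOW COLUMNS FROM games")
        for column in cur.fetchall():
            print(column)  # (Field, Type, Null, Key, Default, Extra)
finally:
    db.close()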
Create the project with scrapy startproject anzhispider, then edit items.py:
import scrapy


class AnzhispiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # crawled URL
    link = scrapy.Field()
    # app name
    name = scrapy.Field()
    # version
    versionCode = scrapy.Field()
    # icon URL
    icon = scrapy.Field()
    # local path the icon is saved to
    iconPath = scrapy.Field()
    # category
    type = scrapy.Field()
    # release time
    onlineTime = scrapy.Field()
    # size
    size = scrapy.Field()
    # download count
    download = scrapy.Field()
    # author
    author = scrapy.Field()
    # introduction
    intro = scrapy.Field()
    # update notes
    updateInfo = scrapy.Field()
    # highlights
    highlight = scrapy.Field()
    # market screenshots (list of strings)
    images = scrapy.Field()
    # local paths the screenshots are saved to
    imagePaths = scrapy.Field()
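scrapy.Item instances behave like dicts, with one twist that catches typos early: assigning to a field that was not declared raises a KeyError. A quick illustration:

item = AnzhispiderItem()
item['name'] = '和平精英'
item['images'] = ['/img/1.jpg', '/img/2.jpg']
print(item['name'])    # 和平精英
item['nmae'] = 'oops'  # KeyError: AnzhispiderItem does not support field: nmae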
In the spiders directory, create AnzhiSpider.py and define a class AnzhiSpider that inherits from scrapy.Spider:
from scrapy import Spider

from anzhispider.items import AnzhispiderItem


class AnzhiSpider(Spider):
    name = "AnzhiSpider"
    # domains the spider may visit
    allowed_domains = ["www.anzhi.com"]
    start_urls = ["http://www.anzhi.com/pkg/3d81_com.tencent.tmgp.pubgmhd.html"]
    # start_urls = ["http://www.anzhi.com/pkg/3d81_com.tencent.tmgp.pubgmhd.html",
    #               "http://www.anzhi.com/pkg/84bf_com.sxiaoao.feijidazhan.html",
    #               "http://www.anzhi.com/pkg/4f41_com.tencent.tmgp.WePop.html"]

    def parse(self, response):
        item = AnzhispiderItem()
        root = response.xpath('.//div[@class="content_left"]')
        # crawled URL
        item['link'] = response.url
        # icon
        item['icon'] = root.xpath('.//div[@class="app_detail"]/div[@class="detail_icon"]/img/@src').extract()[0]
        # app name
        item['name'] = root.xpath(
            './/div[@class="app_detail"]/div[@class="detail_description"]/div[@class="detail_line"]/h3/text()').extract()[0]
        # version, displayed as "(x.y.z)", so strip the parentheses
        item['versionCode'] = root.xpath(
            './/div[@class="app_detail"]/div[@class="detail_description"]/div[@class="detail_line"]/span[@class="app_detail_version"]/text()').extract()[0]
        if item['versionCode'] and item['versionCode'].startswith("(") and item['versionCode'].endswith(")"):
            item['versionCode'] = item['versionCode'][1:-1]
        # category, release time, size, download count, author: grab all detail rows first
        details = root.xpath(
            './/div[@class="app_detail"]/div[@class="detail_description"]/ul[@id="detail_line_ul"]/li/text()').extract()
        details_right = root.xpath(
            './/div[@class="app_detail"]/div[@class="detail_description"]/ul[@id="detail_line_ul"]/li/span/text()').extract()
        details.extend(details_right)
        for detailItem in details:
            if detailItem.startswith("分類:"):
                item['type'] = detailItem[3:]
                continue
            if detailItem.startswith("時間:"):
                item['onlineTime'] = detailItem[3:]
                continue
            if detailItem.startswith("大小:"):
                item['size'] = detailItem[3:]
                continue
            if detailItem.startswith("下載:"):
                item['download'] = detailItem[3:]
                continue
            if detailItem.startswith("做者:"):
                item['author'] = detailItem[3:]
                continue
        # introduction
        item['intro'] = root.xpath(
            './/div[@class="app_detail_list"][contains(./div[@class="app_detail_title"],"簡介")]/div[@class="app_detail_infor"]').extract()
        if item['intro']:
            item['intro'] = item['intro'][0].replace('\t', '').replace('\n', '').replace('\r', '')
        else:
            item['intro'] = ""
        # update notes
        item['updateInfo'] = root.xpath(
            './/div[@class="app_detail_list"][contains(./div[@class="app_detail_title"],"更新說明")]/div[@class="app_detail_infor"]').extract()
        if item['updateInfo']:
            item['updateInfo'] = item['updateInfo'][0].replace('\t', '').replace('\n', '').replace('\r', '')
        else:
            item['updateInfo'] = ""
        # highlights
        item['highlight'] = root.xpath(
            './/div[@class="app_detail_list"][contains(./div[@class="app_detail_title"],"精彩內容")]/div[@class="app_detail_infor"]').extract()
        if item['highlight']:
            item['highlight'] = item['highlight'][0].replace('\t', '').replace('\n', '').replace('\r', '')
        else:
            item['highlight'] = ""
        # market screenshot URLs
        item['images'] = root.xpath(
            './/div[@class="app_detail_list"][contains(./div[@class="app_detail_title"],"軟件截圖")]//ul/li/img/@src').extract()
        yield item
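The XPath expressions are the fragile part of the spider; when the page layout shifts, the quickest way to re-verify them is an interactive scrapy shell session before touching the code (a sketch of a debugging session; the output depends on the live page):

scrapy shell "http://www.anzhi.com/pkg/3d81_com.tencent.tmgp.pubgmhd.html"
>>> root = response.xpath('.//div[@class="content_left"]')
>>> root.xpath('.//div[@class="app_detail"]/div[@class="detail_icon"]/img/@src').extract_first()
>>> root.xpath('.//ul[@id="detail_line_ul"]/li/text()').extract()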
Next comes ImageResPipeline, which inherits from scrapy.pipelines.files.FilesPipeline rather than from ImagesPipeline. The reason can be found in the official ImagesPipeline documentation: ImagesPipeline re-encodes every download as JPEG. The key point here is that the icon must be saved as a PNG with a transparent background; once ImagesPipeline has written it as JPEG, no amount of format conversion brings the transparency back, so the rounded icon's corners end up filled with white.
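To see why, this is roughly what happens inside ImagesPipeline when it meets an RGBA PNG (a paraphrased Pillow sketch, not the exact Scrapy source): the alpha channel is composited onto an opaque background and the result is written as JPEG, a format with no alpha channel at all.

from PIL import Image

icon = Image.open("icon.png")          # RGBA, transparent rounded corners
if icon.mode == "RGBA":
    background = Image.new("RGBA", icon.size, (255, 255, 255))
    background.paste(icon, mask=icon)  # the alpha channel acts as the paste mask
    icon = background.convert("RGB")   # former transparency is now solid white
icon.save("icon.jpg", "JPEG")          # JPEG cannot store transparency

FilesPipeline, by contrast, writes the downloaded bytes to disk untouched: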
import datetime
import os

import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.files import FilesPipeline

from anzhispider.settings import FILES_STORE


class ImageResPipeline(FilesPipeline):

    def get_media_requests(self, item, info):
        """Issue one request per file URL; meta['index'] tells the icon (0) apart from the market screenshots (1..n)."""
        # icon download
        yield scrapy.Request(url='http://www.anzhi.com' + item['icon'], meta={'item': item, 'index': 0})
        # market screenshot downloads
        for i in range(0, len(item['images'])):
            yield scrapy.Request(url='http://www.anzhi.com' + item['images'][i], meta={'item': item, 'index': (i + 1)})

    def file_path(self, request, response=None, info=None):
        """Custom save path.

        By default FilesPipeline stores everything under a full/ directory inside
        FILES_STORE, as extensionless files. Here files go to
        FILES_STORE/YYYY/MM/dd/<app name>/ instead (e.g. 2019/11/28/和平精英),
        with a .png extension for the icon and .jpg for the market screenshots.
        """
        item = request.meta['item']
        index = request.meta['index']
        today = str(datetime.date.today())
        out_dir = today[0:4] + "/" + today[5:7] + "/" + today[8:] + "/" + item['name'] + "/"
        if index > 0:
            # index > 0 is a market screenshot, named <index>.jpg.
            # Note: the numeric name must be converted to a string, otherwise
            # the download fails without reporting a useful reason!
            file_name = out_dir + str(index) + ".jpg"
        else:
            # index == 0 is the icon download, which needs the .png extension
            file_name = out_dir + "icon.png"
        # remove a previously downloaded copy, if any
        if os.path.exists(FILES_STORE + out_dir) and os.path.exists(FILES_STORE + file_name):
            os.remove(FILES_STORE + file_name)
        return file_name

    def item_completed(self, results, item, info):
        """Collect the download results into the item.

        results has the form:
        [(True, {'checksum': '2b00042f7481c7b056c4b410d28f33cf',
                 'path': 'full/7d97e98f8af710c7e7fe703abc8f639e0ee507c4.jpg',
                 'url': 'http://www.example.com/images/product1.jpg'}),
         (True, {'checksum': 'b9628c4ab9b595f72f280b90c4fd093d',
                 'path': 'full/1ca5879492b8fd606df1964ea3c1e2f4520f076f.jpg',
                 'url': 'http://www.example.com/images/product2.jpg'}),
         (False, Failure(...))]
        """
        file_paths = [x['path'] for ok, x in results if ok]
        if not file_paths:
            raise DropItem("Item contains no files")
        for file_path in file_paths:
            if file_path.endswith("png"):
                # the icon's local path goes to iconPath
                item['iconPath'] = FILES_STORE + file_path
            else:
                # screenshot paths are collected in imagePaths; create the list if absent
                if 'imagePaths' not in item:
                    item['imagePaths'] = []
                item['imagePaths'].append(FILES_STORE + file_path)
        return item
The MySQL connection uses PyMySQL==0.9.2. I put the access code into a small utility class: inserts, updates and deletes go through update(self, sql), queries go through query(self, sql):
import pymysql


class MySQLHelper:
    def __init__(self):
        pass

    def query(self, sql):
        """Run a SELECT and return every matching row."""
        # open the database connection
        db = self.conn()
        # get a cursor
        cur = db.cursor()
        try:
            cur.execute(sql)          # run the statement
            results = cur.fetchall()  # fetch all records
            return results
        except Exception as e:
            # thread_logger is the project-wide logger, configured elsewhere
            thread_logger.debug('[mysql]:{} \n\tError SQL: {}'.format(e, sql))
            raise e
        finally:
            self.close(db)  # always release the connection

    def update(self, sql):
        """Run an INSERT/UPDATE/DELETE and commit."""
        db = self.conn()
        cur = db.cursor()
        try:
            cur.execute(sql)
            db.commit()
            return True
        except Exception as e:
            thread_logger.debug('[mysql]:{} \n\tError SQL: {}'.format(e, sql))
            # roll back on error
            db.rollback()
            return False
        finally:
            self.close(db)

    def conn(self):
        """Open a connection."""
        db = pymysql.connect(host="192.168.20.202", user="***", password="****",
                             db="app_anzhigame", port=3306, use_unicode=True, charset="utf8mb4")
        return db

    def close(self, db):
        """Close a connection."""
        db.close()
Now change AnzhispiderPipeline to insert the data; some fields fall back to defaults:
class AnzhispiderPipeline(object):
    """Store the item in the database."""

    def __init__(self):
        # open the database helper
        self.mysqlHelper = MySQLHelper()

    def process_item(self, item, spider):
        # build the INSERT statement; optional fields fall back to defaults
        sql = "INSERT INTO games(link,name,versionCode,icon,type,onlineTime,size,download,author,intro,updateInfo,highlight,image1,image2,image3,image4,image5) " \
              "VALUES ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" % (
                  item['link'], item['name'],
                  parseProperty(item, "versionCode", "v1.0"),
                  parseProperty(item, "iconPath", ""),
                  parseProperty(item, "type", ""),
                  parseProperty(item, "onlineTime", ""),
                  parseProperty(item, "size", "0B"),
                  parseProperty(item, "download", "0"),
                  parseProperty(item, "author", "未知"),
                  parseProperty(item, "intro", "無"),
                  parseProperty(item, "updateInfo", "無"),
                  parseProperty(item, "highlight", "無"),
                  parseImageList(item, 0), parseImageList(item, 1),
                  parseImageList(item, 2), parseImageList(item, 3),
                  parseImageList(item, 4))
        # run the insert
        self.mysqlHelper.update(sql)
        return item
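One caveat with building the statement through string interpolation: a single quote inside any field (app intros often have them) breaks the SQL. A safer variant, sketched here under the assumption that MySQLHelper.update is extended with a params argument, lets the driver do the escaping:

# hypothetical extension of MySQLHelper.update that accepts parameters
def update(self, sql, params=None):
    db = self.conn()
    cur = db.cursor()
    try:
        cur.execute(sql, params)  # PyMySQL escapes every parameter itself
        db.commit()
        return True
    except Exception:
        db.rollback()
        return False
    finally:
        self.close(db)

# in process_item (column list abbreviated for the example):
sql = "INSERT INTO games(link, name, versionCode) VALUES (%s, %s, %s)"
self.mysqlHelper.update(sql, (item['link'], item['name'],
                              parseProperty(item, "versionCode", "v1.0")))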
def parseProperty(item, property, defaultValue) is a custom helper that returns a default when a field is missing or empty, and def parseImageList(item, index) returns the market screenshot path at a given index:
def parseProperty(item, property, defaultValue):
    """Return item[property] if the field exists and is non-empty, else the default.

    :param item: the item (dict-like)
    :param property: field name
    :param defaultValue: fallback value
    """
    if property in item and item[property]:
        return item[property]
    else:
        return defaultValue


def parseImageList(item, index):
    """Return the market screenshot path at the given index, or '' if there is none.

    :param item:
    :param index:
    :return:
    """
    if "imagePaths" in item and item["imagePaths"]:
        # were enough screenshots downloaded?
        if len(item["imagePaths"]) >= index + 1:
            return item["imagePaths"][index]
        else:
            return ""
    else:
        return ""
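For illustration, this is how the two helpers behave on a partially filled item (values made up for the example):

item = {'size': '', 'imagePaths': ['.\\anzhigames\\2019/11/28/和平精英/1.jpg']}
parseProperty(item, 'size', '0B')     # -> '0B' (empty string counts as missing)
parseProperty(item, 'download', '0')  # -> '0'  (absent keys fall back too)
parseImageList(item, 0)               # -> the stored screenshot path
parseImageList(item, 3)               # -> ''   (index past the end of the list)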
注意增長FILES_STORE
用於存儲文件下載的路徑,MEDIA_ALLOW_REDIRECTS
爲容許圖片重定向,由於安智的圖片連接爲重定向的,不設置會下載失敗。
# where downloaded files are stored
FILES_STORE = ".\\anzhigames\\"
# allow redirects for media downloads (optional)
MEDIA_ALLOW_REDIRECTS = True
Configure the pipelines. ImageResPipeline's number has to be smaller than AnzhispiderPipeline's so that the files are downloaded, and iconPath/imagePaths filled in, before the database insert runs; the numbers range from 0 to 1000, and smaller means higher priority.
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'anzhispider.pipelines.AnzhispiderPipeline': 300,
    'anzhispider.pipelines.ImageResPipeline': 11,
}
That's it. Run scrapy crawl AnzhiSpider and we're done: the images land under .\anzhigames\ in the project directory, and each app ends up as a row in the games table.
For the project source code, follow the link in the original post.