Project address: BookSpider
This post covers fetching all the book data under a category and writing it to MySQL.
Dependencies: Python 3.6, Scrapy, Twisted, MySQLdb, etc.
scrapy startproject BookSpider              # create the project
scrapy genspider douban book.douban.com     # create the Douban spider
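After these two commands the generated project looks roughly like this (exact files vary slightly by Scrapy version); douban.py is the spider skeleton that genspider creates:

BookSpider/
├── scrapy.cfg
└── BookSpider/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── douban.py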
Create a main.py (e.g. at the project root) so the spider can be run and debugged from an IDE:

from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'douban'])
Then adjust BookSpider/settings.py:

USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'  # browser User-Agent
ROBOTSTXT_OBEY = False  # do not obey douban.com's robots.txt
start_urls = ['https://book.douban.com/tag/神经网络']  # for testing, crawl only the 神经网络 (neural networks) tag
from scrapy.http import Request
from urllib.parse import urljoin

def parse(self, response):
    get_nodes = response.xpath('//div[@id="subject_list"]/ul/li/div[@class="pic"]/a')
    for node in get_nodes:
        url = node.xpath("@href").get()
        img_url = node.xpath('img/@src').get()
        # pass img_url along in meta; parse_book is the callback that parses the detail page
        yield Request(url=url, meta={"img_url": img_url}, callback=self.parse_book)
    # grab the URL of the next page and crawl it with this same parse method
    next_url = response.xpath('//div[@class="paginator"]/span[@class="next"]/a/@href').get()
    if next_url:
        yield Request(url=urljoin(response.url, next_url), callback=self.parse)
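For orientation, the snippets above and the parse_book method below all live in spiders/douban.py; a minimal sketch of the class (the name and allowed_domains are the genspider defaults):

import scrapy

class DoubanSpider(scrapy.Spider):
    name = 'douban'
    allowed_domains = ['book.douban.com']
    start_urls = ['https://book.douban.com/tag/神经网络']

    def parse(self, response):
        ...  # list page: yield detail-page requests and follow pagination, as above

    def parse_book(self, response):
        ...  # detail page: populate and yield a BookspiderItem, as below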
class BookspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    author = scrapy.Field()
    publish = scrapy.Field()
    page_num = scrapy.Field()
    isbm = scrapy.Field()
    binding = scrapy.Field()
    publish_date = scrapy.Field()
    price = scrapy.Field()
    rate = scrapy.Field()
    img_url = scrapy.Field()
    image_path = scrapy.Field()
import re
from BookSpider.items import BookspiderItem

def parse_book(self, response):
    BookItem = BookspiderItem()
    BookItem['name'] = response.xpath('//span[@property="v:itemreviewed"]/text()').get("").strip()
    BookItem['author'] = response.xpath('//span[contains(text(), "作者")]/following-sibling::a[1]/text()').get("").split()[-1]
    BookItem['publish'] = response.xpath('//span[contains(text(), "出版社")]/following-sibling::text()').get("").strip()
    page_num = response.xpath('//span[contains(text(), "页数")]/following-sibling::text()').get("").strip()
    BookItem['page_num'] = 0 if page_num == '' else page_num
    BookItem['isbm'] = response.xpath('//span[contains(text(), "ISBN")]/following-sibling::text()').get("").strip()
    BookItem['binding'] = response.xpath('//span[contains(text(), "装帧")]/following-sibling::text()').get("").strip()
    BookItem['publish_date'] = response.xpath('//span[contains(text(), "出版年")]/following-sibling::text()').get("").strip()
    price = response.xpath('//span[contains(text(), "定价")]/following-sibling::text()').get("").strip()
    BookItem['price'] = '' if len(price) == 0 else re.findall(r'\d+\.?\d*', price)[0]
    BookItem['rate'] = response.xpath('//div[contains(@class, "rating_self ")]/strong/text()').get("").strip()
    BookItem['img_url'] = [response.meta.get('img_url')]  # the image field must be a list
    yield BookItem
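Before wiring up the pipelines, the XPath expressions can be sanity-checked interactively with scrapy shell (run it inside the project so settings.py's USER_AGENT applies):

scrapy shell 'https://book.douban.com/tag/神经网络'
>>> response.xpath('//div[@id="subject_list"]/ul/li/div[@class="pic"]/a/@href').getall()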
Next, download the book cover images.

1. Create an images folder (next to settings.py, which is where IMAGES_STORE below points)
2. Configure BookSpider/settings.py (note that Scrapy's ImagesPipeline requires the Pillow library)
import os

ITEM_PIPELINES = {
    'BookSpider.pipelines.ImageStorePipeline': 1,  # the number is the pipeline's priority
}
IMAGES_URLS_FIELD = "img_url"  # must match the item field name
IMAGES_STORE = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'images')
3. Create the ImageStorePipeline class (BookSpider/pipelines.py)
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request

class ImageStorePipeline(ImagesPipeline):
    default_headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',  # this one is required
    }

    def get_media_requests(self, item, info):
        for image_url in item['img_url']:
            self.default_headers['referer'] = image_url
            yield Request(image_url, headers=self.default_headers)

    def item_completed(self, results, item, info):
        image_path = [x['path'] for ok, x in results if ok]
        if not image_path:
            raise DropItem("Item contains no images")
        item['image_path'] = image_path
        return item
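For reference, the results argument that item_completed receives is a list of (success, info) tuples, one per image request; Scrapy saves each file under IMAGES_STORE as full/<sha1>.<ext>. With a single cover image it looks roughly like this (values illustrative):

results = [(True, {'url': 'https://img1.doubanio.com/....jpg',
                   'path': 'full/0a1b2c....jpg',   # relative to IMAGES_STORE
                   'checksum': '...'})]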
To write the results to MySQL:

1. Configure BookSpider/settings.py
# database settings
MYSQL_HOST = ""
MYSQL_DBNAME = ""
MYSQL_USER = ""
MYSQL_PASSWORD = ""

ITEM_PIPELINES = {
    'BookSpider.pipelines.ImageStorePipeline': 1,
    'BookSpider.pipelines.MysqlTwistedPipeline': 30,
}
2. Create the MysqlTwistedPipeline class (BookSpider/pipelines.py)
import MySQLdb.cursors
from twisted.enterprise import adbapi

class MysqlTwistedPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod  # Scrapy calls from_settings first, so self.dbpool is set before items arrive
    def from_settings(cls, settings):
        dbpool = adbapi.ConnectionPool(
            "MySQLdb",
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWORD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        return cls(dbpool)

    def process_item(self, item, spider):
        # run the insert on Twisted's thread pool so it does not block the crawl
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)
        return item

    def do_insert(self, cursor, item):
        insert_sql = """
            insert into douban(name, author, publish, page_num, isbm, binding,
                publish_date, price, rate, img_url, image_path)
            values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        cursor.execute(insert_sql, (item['name'], item['author'], item['publish'],
                                    item['page_num'], item['isbm'], item['binding'],
                                    item['publish_date'], item['price'], item['rate'],
                                    item['img_url'], item['image_path']))

    def handle_error(self, failure, item, spider):
        print(failure)
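The insert assumes a douban table already exists in MYSQL_DBNAME. A minimal schema sketch matching the columns above (the column types are assumptions, not from the original post):

CREATE TABLE douban (
    id INT AUTO_INCREMENT PRIMARY KEY,
    name VARCHAR(255),
    author VARCHAR(255),
    publish VARCHAR(255),
    page_num VARCHAR(32),
    isbm VARCHAR(32),
    binding VARCHAR(64),
    publish_date VARCHAR(64),
    price VARCHAR(32),
    rate VARCHAR(16),
    img_url VARCHAR(512),
    image_path VARCHAR(512)
) DEFAULT CHARSET = utf8;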
Finally:

1. Run the main.py file created earlier to start the crawl.