# Install the Scrapy package with pip.
pip install scrapy
# Create a new Scrapy project named "book_project".
scrapy startproject book_project
import scrapy


class BookItem(scrapy.Item):
    """Scrapy item holding the fields collected for a single book."""

    # Book title.
    title = scrapy.Field()
    # ISBN of the book; the spider uses it as the Amazon search keyword.
    isbn = scrapy.Field()
    # Price string for the book (filled in by the spider's price callback).
    price = scrapy.Field()
import scrapy

from book_project.items import BookItem


class BookInfoSpider(scrapy.Spider):
    """Spider that walks the allitebooks.com "security" listing pages."""

    name = "bookinfo"
    allowed_domains = ["allitebooks.com", "amazon.com"]
    start_urls = [
        "http://www.allitebooks.com/security/",
    ]

    def parse(self, response):
        """Schedule one request per listing page of the category.

        The number of pages is read from the text of the pagination link
        whose ``title`` attribute contains "Last Page →".

        Args:
            response: Response for the category start URL.

        Yields:
            scrapy.Request: One request per listing page, handled by
            ``self.parse_page``.
        """
        last_page_text = response.xpath(
            '//a[contains(@title, "Last Page →")]/text()'
        ).extract_first()
        try:
            num_pages = int(last_page_text)
        except (TypeError, ValueError):
            # No (or non-numeric) "Last Page" link: treat the category as a
            # single page instead of crashing on int(None).
            self.logger.warning(
                "Could not determine the page count on %s; assuming 1 page",
                response.url,
            )
            num_pages = 1

        base_url = "http://www.allitebooks.com/security/page/{0}/"
        # range() excludes its upper bound, so add 1 to include the last page.
        for page in range(1, num_pages + 1):
            yield scrapy.Request(
                base_url.format(page),
                dont_filter=True,
                callback=self.parse_page,
            )
'//a' 的意思是選取全部的 a 標籤;
'//a[contains(@title, "Last Page →")]' 的意思是在全部的 a 標籤中,選取 title 屬性包含 "Last Page →" 的 a 標籤;
extract() 方法解析並返回所有符合條件的節點數據;extract_first() 則只返回第一個符合條件的結果。
def parse_page(self, response):
    """Follow every book link found on a listing page.

    Method of ``BookInfoSpider``; used as the callback for listing pages.

    Args:
        response: Response for one listing page.

    Yields:
        scrapy.Request: One request per book detail page, handled by
        ``self.parse_book_info``.
    """
    for article in response.xpath('//div/article'):
        book_detail_url = article.xpath('div/header/h2/a/@href').extract_first()
        if not book_detail_url:
            # Article without a title link: nothing to follow.
            continue
        # urljoin leaves absolute URLs untouched and resolves relative ones.
        yield scrapy.Request(
            response.urljoin(book_detail_url),
            callback=self.parse_book_info,
        )


def parse_book_info(self, response):
    """Extract title and ISBN, then request an Amazon search for the price.

    Method of ``BookInfoSpider``; used as the callback for book detail pages.
    The partially filled item is handed to ``self.parse_price`` through
    ``meta['item']``.

    Args:
        response: Response for one book detail page.

    Yields:
        scrapy.Request: Amazon search request keyed on the ISBN. Nothing is
        yielded when the page has no ISBN.
    """
    from urllib.parse import quote_plus

    title = response.css('.single-title').xpath('text()').extract_first()
    raw_isbn = response.xpath('//dd[2]/text()').extract_first()
    isbn = raw_isbn.strip() if raw_isbn else ''
    if not isbn:
        # Without an ISBN there is no search keyword; skip instead of
        # failing on string concatenation with None.
        self.logger.warning("No ISBN found on %s; skipping", response.url)
        return

    item = BookItem()
    item['title'] = title
    item['isbn'] = isbn

    # Encode the keyword so whitespace or punctuation cannot break the query.
    amazon_search_url = (
        'https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords='
        + quote_plus(isbn)
    )
    yield scrapy.Request(
        amazon_search_url,
        callback=self.parse_price,
        meta={'item': item},
    )
def parse_price(self, response):
    """Attach the first dollar price found on the page to the item.

    Method of ``BookInfoSpider``; used as the callback for the Amazon search
    request. The item is read from ``response.meta['item']``.

    Args:
        response: Response for the Amazon search results page.

    Yields:
        The item with ``price`` set to the first ``$<digits>.<2 digits>``
        match among the ``<span>`` texts, or ``None`` when nothing matches.
    """
    item = response.meta['item']
    # re_first returns None when nothing matches, so a page without a price
    # no longer raises IndexError and loses the whole item.
    item['price'] = response.xpath('//span/text()').re_first(r'\$[0-9]+\.[0-9]{2}?')
    yield item
# Run the "bookinfo" spider and write the scraped items to books.csv.
scrapy crawl bookinfo -o books.csv
大數據,大數據分析、BeautifulSoup,Beautiful Soup入門,Scrapy, Scrapy爬蟲,數據挖掘,數據分析,數據處理,pandas,網絡爬蟲,web scraper,python excel,python寫入excel數據,python處理csv文件 Scrapy csv, python操做Excel,excel讀寫 Scrapy框架 Scrapy框架入門大數據,大數據分析、BeautifulSoup,Beautiful Soup入門,Scrapy, Scrapy爬蟲,數據挖掘,數據分析,數據處理,pandas,網絡爬蟲,web scraper,python excel,python寫入excel數據,python處理csv文件 Scrapy csv, python操做Excel,excel讀寫 Scrapy框架 Scrapy框架入門 大數據,大數據分析、BeautifulSoup,Beautiful Soup入門,Scrapy, Scrapy爬蟲,數據挖掘,數據分析,數據處理,pandas,網絡爬蟲,web scraper,python excel,python寫入excel數據,python處理csv文件 Scrapy csv, python操做Excel,excel讀寫 Scrapy框架 Scrapy框架入門 大數據,大數據分析、BeautifulSoup,Beautiful Soup入門,Scrapy, Scrapy爬蟲,數據挖掘,數據分析,數據處理,pandas,網絡爬蟲,web scraper,python excel,python寫入excel數據,python處理csv文件 Scrapy csv, python操做Excel,excel讀寫 Scrapy框架 Scrapy框架入門 大數據,大數據分析、BeautifulSoup,Beautiful Soup入門,Scrapy, Scrapy爬蟲,數據挖掘,數據分析,數據處理,pandas,網絡爬蟲,web scraper,python excel,python寫入excel數據,python處理csv文件 Scrapy csv, python操做Excel,excel讀寫 Scrapy框架 Scrapy框架入門html