In the DOS window, enter:
scrapy startproject quote
cd quote
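For orientation, startproject generates the usual Scrapy skeleton; the files edited below (items.py, the spider, pipelines.py, settings.py) all live inside it. Roughly:

    quote/
        scrapy.cfg
        quote/
            __init__.py
            items.py
            middlewares.py
            pipelines.py
            settings.py
            spiders/
                __init__.py
                myspider.py        # created later by "scrapy genspider"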
items.py:

import scrapy


class QuoteItem(scrapy.Item):
    # define the fields for your item here, like:
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
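A QuoteItem behaves much like a dict, which is why the pipelines below can simply call dict(item). A minimal sketch of using it on its own (the values here are made up):

    from quote.items import QuoteItem

    item = QuoteItem()
    item['text'] = '"Some quote"'    # fields are read and written with dict-style access
    item['author'] = 'Somebody'
    item['tags'] = 'life/books'
    print(dict(item))                # plain dict, as used by the pipelines below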
In the DOS window, enter:
scrapy genspider myspider quotes.toscrape.com
spiders/myspider.py:

# -*- coding: utf-8 -*-
import scrapy
from quote.items import QuoteItem


class MyspiderSpider(scrapy.Spider):
    name = 'myspider'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        for each in response.xpath('//div[@class="quote"]'):
            item = QuoteItem()
            item['text'] = each.xpath('./span/text()').extract()[0]
            item['author'] = each.xpath('.//small/text()').extract()[0]
            # A list cannot be stored in MySQL directly, so join it into a single string
            tag_list = each.xpath('.//a[@class="tag"]/text()').extract()
            item['tags'] = '/'.join(tag_list)
            yield item
        # extract_first() returns None on the last page instead of raising IndexError
        next_page = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_page:
            url = response.urljoin(next_page)
            yield scrapy.Request(url=url, callback=self.parse)
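The XPath expressions are easy to try out before running the whole spider: scrapy shell loads a page and drops you into a Python prompt with the response already available. A quick sketch:

    scrapy shell http://quotes.toscrape.com/
    >>> response.xpath('//div[@class="quote"]/span/text()').extract_first()    # text of the first quote
    >>> response.xpath('//li[@class="next"]/a/@href').extract_first()          # relative link to the next page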
pipelines.py (writing the items into MySQL):

import pymysql


class QuotePipeline(object):
    def __init__(self):
        self.connect = pymysql.connect(
            host='localhost',
            user='root',
            password='',
            database='quotes',
            charset='utf8',
        )
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        item = dict(item)
        sql = 'insert into quote(text,author,tags) values(%s,%s,%s)'
        self.cursor.execute(sql, (item['text'], item['author'], item['tags']))
        self.connect.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()
A more generic version of the same pipeline, which builds the INSERT statement from whatever fields the item carries and rolls back on failure:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class QuotePipeline(object):
    def __init__(self):
        self.connect = pymysql.connect(
            host='localhost',
            user='root',
            password='',
            database='quotes',
            charset='utf8',
        )
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        item = dict(item)
        table = 'quote'
        keys = ','.join(item.keys())
        values = ','.join(['%s'] * len(item))
        sql = 'insert into {table}({keys}) values({values})'.format(
            table=table, keys=keys, values=values)
        try:
            if self.cursor.execute(sql, tuple(item.values())):
                self.connect.commit()
                print("Successful!")
        except Exception:
            print("Failed!")
            self.connect.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()
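Both versions of the pipeline assume the quotes database and the quote table already exist. A one-off setup sketch, with assumed column types (adjust lengths and charset to your needs):

    import pymysql

    connect = pymysql.connect(host='localhost', user='root', password='', charset='utf8')
    cursor = connect.cursor()
    cursor.execute("CREATE DATABASE IF NOT EXISTS quotes DEFAULT CHARACTER SET utf8")
    cursor.execute("USE quotes")
    cursor.execute(
        "CREATE TABLE IF NOT EXISTS quote ("
        "  id INT AUTO_INCREMENT PRIMARY KEY,"
        "  text VARCHAR(1000),"
        "  author VARCHAR(100),"
        "  tags VARCHAR(255)"
        ") DEFAULT CHARSET=utf8"
    )
    cursor.close()
    connect.close()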
To store the items in MongoDB instead, two changes are needed.

1. Set two properties in the settings file (and register the pipeline):
MONGO_URI = 'localhost'
MONGO_DB = 'study'

# register the MongoDB pipeline
ITEM_PIPELINES = {
    # 'quote.pipelines.QuotePipeline': 300,
    'quote.pipelines.MongoPipeline': 300,
}
2. pipelines.py:
import pymongo


class MongoPipeline(object):
    # collection (table) name
    collection = 'student'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # insert the item into MongoDB (insert_one replaces the deprecated insert)
        self.db[self.collection].insert_one(dict(item))
        return item
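To confirm the documents actually arrived, a small pymongo check against the same MONGO_URI, MONGO_DB, and collection values used above can be run after the crawl. A sketch:

    import pymongo

    client = pymongo.MongoClient('localhost')
    db = client['study']                         # MONGO_DB from settings
    for doc in db['student'].find().limit(3):    # collection used by the pipeline
        print(doc)
    client.close()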
The remaining tweaks all go in settings.py.

robots.txt protocol:
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
Default request headers:
DEFAULT_REQUEST_HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}
Item pipelines:
ITEM_PIPELINES = {
    'quote.pipelines.QuotePipeline': 300,
}
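Both pipelines can also be enabled at once if the items should go to MySQL and MongoDB in the same run; the number is the priority, and lower values run first. A sketch (the value 400 for the Mongo pipeline is an arbitrary choice):

    ITEM_PIPELINES = {
        'quote.pipelines.QuotePipeline': 300,   # MySQL
        'quote.pipelines.MongoPipeline': 400,   # MongoDB
    }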
In the DOS window, enter:
scrapy crawl myspider
Run result:
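As a side note, Scrapy's built-in feed export can dump the scraped items to a file without any pipeline at all, which is handy for a quick check of what the spider extracts:

    scrapy crawl myspider -o quotes.json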