爬取豆瓣電影 Top250(movie.douban.com/top250)的電影數據,並保存在 MongoDB 中。
class DoubanspiderItem(scrapy.Item):
    """Scrapy item holding the fields collected for a single movie."""

    # Movie title
    title = scrapy.Field()

    # Movie rating
    score = scrapy.Field()

    # Movie information
    content = scrapy.Field()

    # Short introduction
    info = scrapy.Field()
import scrapy

from doubanSpider.items import DoubanspiderItem


class DoubanSpider(scrapy.Spider):
    """Spider that walks the paginated Douban Top 250 movie list.

    Each listing page is requested as ``url + str(start) + end``; ``start``
    is advanced by 25 after every parsed page.
    """

    name = "douban"
    allowed_domains = ["movie.douban.com"]
    # Offset placed in the ``start=`` query parameter of the listing URL.
    start = 0
    url = 'https://movie.douban.com/top250?start='
    end = '&filter='
    start_urls = [url + str(start) + end]

    def parse(self, response):
        """Yield one item per movie block, then request the next page.

        Args:
            response: The downloaded listing page.

        Yields:
            A ``DoubanspiderItem`` for every ``div[@class='info']`` node,
            followed by a ``scrapy.Request`` for the next offset while
            offsets remain.
        """
        movies = response.xpath("//div[@class='info']")
        for each in movies:
            # Build a fresh item per movie: re-yielding one shared instance
            # lets later iterations overwrite fields of items already handed
            # to the engine.
            item = DoubanspiderItem()

            # extract_first() with a default replaces the original ``[0]``
            # indexing, which raised IndexError (aborting the whole page and
            # its pagination) whenever a node was missing.
            title = each.xpath(
                'div[@class="hd"]/a/span[@class="title"]/text()'
            ).extract_first(default='')
            content = each.xpath('div[@class="bd"]/p/text()').extract()
            score = each.xpath(
                'div[@class="bd"]/div[@class="star"]'
                '/span[@class="rating_num"]/text()'
            ).extract_first(default='')
            info = each.xpath(
                'div[@class="bd"]/p[@class="quote"]/span/text()'
            ).extract_first(default='')

            item['title'] = title
            # Join every text node of the description into one string,
            # using ';' as the separator.
            item['content'] = ';'.join(content)
            item['score'] = score
            item['info'] = info

            # Hand the item over to the pipelines.
            yield item

        # Offsets run 0, 25, ..., 225 for a 250-entry list read 25 at a time.
        # Strict '<' stops after offset 225; the original '<=' issued one
        # extra request at start=250.
        if self.start < 225:
            self.start += 25
            yield scrapy.Request(
                self.url + str(self.start) + self.end,
                callback=self.parse,
            )
# NOTE(review): scrapy.conf is reported as deprecated in newer Scrapy
# releases - confirm the target Scrapy version before relying on it.
from scrapy.conf import settings
import pymongo


class DoubanspiderPipeline(object):
    """Item pipeline that writes every scraped item into MongoDB."""

    def __init__(self):
        """Open the MongoDB connection described by the project settings.

        The method must be named ``__init__`` (the source text had lost the
        underscores); otherwise it never runs and ``self.post`` is undefined
        when ``process_item`` is called.
        """
        # Read host name, port number and database name from the settings.
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        dbname = settings['MONGODB_DBNAME']

        # Create the MongoDB client connection.
        client = pymongo.MongoClient(host=host, port=port)

        # Select the configured database.
        mdb = client[dbname]

        # Collection that receives the scraped documents.
        self.post = mdb[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        """Insert ``item`` into the collection and pass it on unchanged.

        Args:
            item: The scraped item; it is converted with ``dict(item)``.
            spider: The spider that produced the item (unused here).

        Returns:
            The same ``item``, so that later pipelines still receive it.
        """
        data = dict(item)
        # insert_one() replaces Collection.insert(), which is deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        self.post.insert_one(data)
        return item
# Scrapy project settings for the doubanSpider project.

BOT_NAME = 'doubanSpider'

SPIDER_MODULES = ['doubanSpider.spiders']
NEWSPIDER_MODULE = 'doubanSpider.spiders'

# Enabled item pipelines, mapped to the integer 300.
ITEM_PIPELINES = {
    'doubanSpider.pipelines.DoubanspiderPipeline': 300,
}

# Crawl responsibly by identifying yourself (and your website) on the
# user-agent.
# The literal below is the browser string with the stray spaces removed
# ("Apple WebKit" -> "AppleWebKit", "Safari/537.3 6" -> "Safari/537.36").
USER_AGENT = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/48.0.2564.116 Safari/537.36'
)

# MongoDB host: the loopback address 127.0.0.1
MONGODB_HOST = '127.0.0.1'
# Port number; the default is 27017
MONGODB_PORT = 27017
# Name of the database
MONGODB_DBNAME = 'DouBan'
# Name of the collection that stores the data of this crawl
MONGODB_DOCNAME = 'DouBanMovies'
啓動 MongoDB 數據庫須要兩個命令: mongod:是 MongoDB 數據庫進程本身 mongo:是命令行 shell 客戶端 sudo mongod # 首先啓動數據庫服務,再執行 Scrapy sudo mongo # 啓動數據庫 shell 在 mongo shell 下使用命令: # 查看當前數據庫 > db # 列出全部的數據庫 > show dbs # 連接 DouBan 數據庫 > use DouBan # 列出全部表 > show collections # 查看表裡的數據 > db.DouBanMovies.find() #複製並訪問連接:https://item.taobao.com/item.htm?spm=a2oq0.12575281.0.0.4ace1deb26SVlT&ft=t&id=619117901939 獲取全套完整python視頻課程
小夥伴,怎麼樣,運行後什麼結果?是和下面同樣的嗎?
web