豆瓣電影TOP 250爬取 --> 數據保存到MongoDB
豆瓣電影TOP 250網址: https://movie.douban.com/top250
要求:
1. 爬取豆瓣top 250電影名字、演員列表、評分和簡介
2. 設置隨機UserAgent和Proxy
3. 爬取到的數據保存到MongoDB數據庫
items.py
# -*- coding: utf-8 -*-
import scrapy


class DoubanItem(scrapy.Item):
    """One Douban Top 250 movie record scraped by the spider."""
    # Movie title
    title = scrapy.Field()
    # Info line from the page body (director / cast / year)
    bd = scrapy.Field()
    # Rating score
    star = scrapy.Field()
    # One-line summary quote (may be absent for some movies)
    quote = scrapy.Field()
doubanmovie.py
# -*- coding: utf-8 -*-
import scrapy

from douban.items import DoubanItem


class DoubamovieSpider(scrapy.Spider):
    """Crawl the Douban movie Top 250 list, one 25-movie page at a time."""
    name = "doubanmovie"
    allowed_domains = ["movie.douban.com"]
    offset = 0
    url = "https://movie.douban.com/top250?start="
    start_urls = (
        url + str(offset),
    )

    def parse(self, response):
        """Yield one DoubanItem per movie on the page, then follow the next page."""
        movies = response.xpath("//div[@class='info']")
        for each in movies:
            # BUG FIX: build a fresh item per movie. The original reused a
            # single item instance across the loop, so a movie with no quote
            # silently inherited the previous movie's 'quote' value.
            item = DoubanItem()
            # Title (first <span class="title"> only; later ones are aliases)
            item['title'] = each.xpath(".//span[@class='title'][1]/text()").extract()[0]
            # Info line (director / cast / year)
            item['bd'] = each.xpath(".//div[@class='bd']/p/text()").extract()[0]
            # Rating score
            item['star'] = each.xpath(".//div[@class='star']/span[@class='rating_num']/text()").extract()[0]
            # Quote is optional: some movies have no one-line summary.
            quote = each.xpath(".//p[@class='quote']/span/text()").extract()
            if quote:
                item['quote'] = quote[0]
            yield item

        # Pages are start=0, 25, ..., 225 — ten pages for 250 movies.
        if self.offset < 225:
            self.offset += 25
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
pipelines.py
# -*- coding: utf-8 -*-
import pymongo

# NOTE(review): `scrapy.conf` is deprecated and removed in modern Scrapy;
# with a current version, read settings via a `from_crawler` classmethod
# (crawler.settings) instead — confirm the Scrapy version in use.
from scrapy.conf import settings


class DoubanPipeline(object):
    """Persist each scraped item into a MongoDB collection.

    Connection parameters come from the project settings:
    MONGODB_HOST, MONGODB_PORT, MONGODB_DBNAME, MONGODB_SHEETNAME.
    """

    def __init__(self):
        host = settings["MONGODB_HOST"]
        port = settings["MONGODB_PORT"]
        dbname = settings["MONGODB_DBNAME"]
        sheetname = settings["MONGODB_SHEETNAME"]
        # Open the MongoDB connection once; keep only the collection handle.
        client = pymongo.MongoClient(host=host, port=port)
        mydb = client[dbname]
        self.sheet = mydb[sheetname]

    def process_item(self, item, spider):
        """Insert the item as a plain dict, then pass it down the pipeline."""
        data = dict(item)
        # BUG FIX: Collection.insert() was deprecated in pymongo 3.x and
        # removed in 4.x; insert_one() is the supported single-document call.
        self.sheet.insert_one(data)
        return item
settings.py
# Scrapy settings for the douban project.

# Throttle requests to avoid hammering douban.com.
DOWNLOAD_DELAY = 2.5
# Do not send cookies, so requests look less like one tracked session.
COOKIES_ENABLED = False

# Downloader middlewares: pick a random User-Agent first (100),
# then attach a random proxy (200).
DOWNLOADER_MIDDLEWARES = {
    'douban.middlewares.RandomUserAgent': 100,
    'douban.middlewares.RandomProxy': 200,
}

# Pool of User-Agent strings chosen at random per request.
USER_AGENTS = [
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
    'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
    'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
    'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
    'Mozilla/5.0 (Linux; U; Android 4.0.3; zh-cn; M032 Build/IML74K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',
    'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
]

# Proxy pool; "user_passwd" is the credential string for authenticated proxies.
PROXIES = [
    {"ip_port": "121.42.140.113:16816", "user_passwd": "****"},
    # {"ip_port": "121.42.140.113:16816", "user_passwd": ""},
    # {"ip_port": "121.42.140.113:16816", "user_passwd": ""},
    # {"ip_port": "121.42.140.113:16816", "user_passwd": ""},
]

# Item pipeline: store scraped items in MongoDB.
ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 300,
}

# MongoDB connection settings used by DoubanPipeline.
MONGODB_HOST = "127.0.0.1"
MONGODB_PORT = 27017
MONGODB_DBNAME = "Douban"
MONGODB_SHEETNAME = "doubanmovies"
middlewares.py
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import base64
import random

# NOTE(review): a bare `from settings import ...` only works when run from the
# package directory on Python 2; in a proper package this should likely be
# `from douban.settings import ...` — confirm against the project layout.
from settings import USER_AGENTS
from settings import PROXIES


class RandomUserAgent(object):
    """Downloader middleware: attach a random User-Agent to each request."""

    def process_request(self, request, spider):
        useragent = random.choice(USER_AGENTS)
        # setdefault keeps any User-Agent a request explicitly set already.
        request.headers.setdefault("User-Agent", useragent)


class RandomProxy(object):
    """Downloader middleware: route each request through a random proxy."""

    def process_request(self, request, spider):
        proxy = random.choice(PROXIES)
        # BUG FIX: PROXIES entries use strings ("****" or ""), never None,
        # so the original `is None` test could never take the
        # unauthenticated branch; treat any empty/missing credential as
        # "no auth required".
        if not proxy.get('user_passwd'):
            # Proxy without credentials.
            request.meta['proxy'] = "http://" + proxy['ip_port']
        else:
            # BUG FIX: on Python 3, b64encode requires bytes and the result
            # must be decoded back to str before header concatenation.
            creds = base64.b64encode(
                proxy['user_passwd'].encode('utf-8')).decode('ascii')
            # Basic auth credential in the proxy handshake header.
            request.headers['Proxy-Authorization'] = 'Basic ' + creds
            request.meta['proxy'] = "http://" + proxy['ip_port']