目錄
pip install Scrapy
# Create the Scrapy project
scrapy startproject douban

# Change into the project's spiders directory and generate the spider
cd spiders
scrapy genspider douban_spider movie.douban.com
在 items.py 下定義想要爬取的字段:
# -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # https://doc.scrapy.org/en/latest/topics/items.html import scrapy class DoubanItem(scrapy.Item): # define the fields for your item here like: # 序號 serial_number = scrapy.Field() # 名稱 movie_name = scrapy.Field() # 介紹 introduce = scrapy.Field() # 星級 star = scrapy.Field() # 評論 evaluate = scrapy.Field() # 描述 describe = scrapy.Field()
使用 XPath 進行 HTML 選擇。能夠藉助 Google 瀏覽器擴展 XPath Helper 進行選擇器編寫。
# -*- coding: utf-8 -*-
import scrapy
from douban.items import DoubanItem


class DoubanSpiderSpider(scrapy.Spider):
    """Crawl the Douban Top 250 movie list into DoubanItem objects."""

    # Spider name used by `scrapy crawl`.
    name = 'douban_spider'
    # Only follow links on this domain.
    allowed_domains = ['movie.douban.com']
    # Entry URL, handed by the scheduler to the downloader.
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        """Parse one listing page: yield one item per movie, then follow
        the rel="next" pagination link if present."""
        movie_list = response.xpath("//ol[@class='grid_view']//li")
        for movie in movie_list:
            douban_item = DoubanItem()
            douban_item['serial_number'] = movie.xpath(
                ".//div[1]//div[1]//em[1]/text()").extract_first()
            douban_item['movie_name'] = movie.xpath(
                ".//div[1]//div[2]//div[1]//a[1]//span[@class='title']/text()"
            ).extract_first()
            # The introduction is split across several text nodes.  Strip
            # the whitespace from each fragment and keep ALL of them.
            # (The original loop overwrote `introduce` on every pass, so
            # only the last fragment survived.)
            content = movie.xpath(
                ".//div[1]//div[2]//div[2]//p[1]/text()").extract()
            douban_item['introduce'] = ";".join(
                "".join(part.split()) for part in content)
            douban_item['star'] = movie.xpath(
                ".//span[@class='rating_num']/text()").extract_first()
            douban_item['evaluate'] = movie.xpath(
                ".//div[@class='star']//span[4]/text()").extract_first()
            douban_item['describe'] = movie.xpath(
                ".//p[@class='quote']//span[@class='inq']/text()"
            ).extract_first()
            yield douban_item
        # Follow the pagination link, if any.  extract_first() replaces the
        # manual extract()/[0] dance, and response.urljoin() resolves the
        # relative href against the current page instead of a hard-coded URL.
        next_link = response.xpath("//link[@rel='next']/@href").extract_first()
        if next_link:
            yield scrapy.Request(response.urljoin(next_link),
                                 callback=self.parse)
運行項目:
scrapy crawl douban_spider
Note 若是出現報錯,能夠在 settings.py 裏面設置頭部信息,如 USER_AGENT 等。
爲了方便運行,能夠建立 main.py:
from scrapy import cmdline

if __name__ == '__main__':
    # Run the spider programmatically so the project can be launched from
    # an IDE.  The __main__ guard prevents the crawl from firing as a side
    # effect if this module is ever imported.
    cmdline.execute('scrapy crawl douban_spider'.split())
scrapy crawl douban_spider -o test.json
scrapy crawl douban_spider -o test.csv
Note 用 Excel 打開剛生成的 csv 時,可能出現亂碼,須要將文件轉碼成 utf8 帶 bom 的形式。
# MongoDB connection settings consumed by pipelines.DoubanPipeline.
# The lower-case names are kept exactly as-is because pipelines.py
# imports them by these names.
mongo_host = 'localhost'
mongo_port = 27017
# Database that receives the scraped movies.
mongo_db_name = 'douban'
# Collection inside that database.
mongo_db_collection = 'douban_movie'
並取消註釋:
# Enable the MongoDB pipeline; 300 is its priority (lower values run first).
ITEM_PIPELINES = {'douban.pipelines.DoubanPipeline': 300}
pip install pymongo
# -*- coding: utf-8 -*-
import pymongo

from douban.settings import (mongo_host, mongo_port, mongo_db_name,
                             mongo_db_collection)

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class DoubanPipeline(object):
    """Persist every scraped item into a MongoDB collection."""

    def __init__(self):
        # Connect once when the pipeline is instantiated and keep the
        # collection handle for the lifetime of the crawl.
        client = pymongo.MongoClient(host=mongo_host, port=mongo_port)
        self.post = client[mongo_db_name][mongo_db_collection]

    def process_item(self, item, spider):
        """Insert the item into MongoDB and pass it to the next stage."""
        # Collection.insert() was deprecated in pymongo 3 and removed in
        # pymongo 4 — insert_one() is the supported single-document API.
        self.post.insert_one(dict(item))
        return item
import base64


class my_proxy(object):
    """Downloader middleware that routes every request through the Abuyun
    HTTP proxy with Basic authentication.

    The class name stays lower-case because settings.py registers it as
    'douban.middlewares.my_proxy'.
    """

    def process_request(self, request, spider):
        # The proxy URL must carry a scheme: Scrapy's HttpProxyMiddleware
        # parses meta['proxy'] with urlparse, and a bare host:port string
        # is misinterpreted (the host ends up parsed as the path).
        request.meta['proxy'] = 'http://http-pro.abuyun.com:9010'
        # Basic auth credential: base64("user:password") in the
        # Proxy-Authorization header.
        proxy_name_pass = b'HwSSD7VC73K55YT1P:F121AA9B2DDA7C35'
        encoded = base64.b64encode(proxy_name_pass)
        request.headers['Proxy-Authorization'] = 'Basic ' + encoded.decode()
# ...
# Register the proxy middleware (543 is its priority slot).
DOWNLOADER_MIDDLEWARES = {
    # 'douban.middlewares.DoubanDownloaderMiddleware': 543,
    'douban.middlewares.my_proxy': 543,
}
# ...
import random


class my_useragent(object):
    """Downloader middleware that sets a random User-Agent header on
    every outgoing request.

    The class name stays lower-case because settings.py registers it as
    'douban.middlewares.my_useragent'.
    """

    # Hoisted to a class attribute so the list is built once, not rebuilt
    # on every single request.
    USER_AGENT_LIST = [
        'MSIE (MSIE 6.0; X11; Linux; i686) Opera 7.23',
        'Opera/9.20 (Macintosh; Intel Mac OS X; U; en)',
        'Opera/9.0 (Macintosh; PPC Mac OS X; U; en)',
        'iTunes/9.0.3 (Macintosh; U; Intel Mac OS X 10_6_2; en-ca)',
        'Mozilla/4.76 [en_jp] (X11; U; SunOS 5.8 sun4u)',
        'iTunes/4.2 (Macintosh; U; PPC Mac OS X 10.2)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:5.0) Gecko/20100101 Firefox/5.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0',
        'Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)',
        'Mozilla/4.8 [en] (X11; U; SunOS; 5.7 sun4u)',
    ]

    def process_request(self, request, spider):
        # Pick a random UA per request to reduce the chance of being
        # blocked for a repeated fingerprint.
        request.headers['User-Agent'] = random.choice(self.USER_AGENT_LIST)
# ...
# Register both custom middlewares; lower numbers run earlier.
DOWNLOADER_MIDDLEWARES = {
    # 'douban.middlewares.DoubanDownloaderMiddleware': 543,
    'douban.middlewares.my_proxy': 543,       # proxy routing
    'douban.middlewares.my_useragent': 544,   # random User-Agent
}
# ...