Scrapy Basics Notes


Preface

reference: https://www.tutorialspoint.com/scrapy/scrapy_quick_guide.htm

official doc: http://doc.scrapy.org/en/1.0/intro/tutorial.html

Installation

reference: http://doc.scrapy.org/en/1.0/intro/install.html#intro-install

  • Start a container and install Scrapy in it (this takes quite a while)
root@ubuntu:/home/vickey# docker run -itd --name test-scrapy ubuntu
root@ubuntu:/home/vickey# docker exec -it test-scrapy /bin/bash
root@8b825656f58b:/# apt-get update
...
root@8b825656f58b:/# apt-get install python-dev python-pip libxml2-dev libxslt1-dev zlib1g-dev libffi-dev libssl-dev
...
root@8b825656f58b:/# pip install scrapy
...
root@8b825656f58b:/# scrapy -v 
Scrapy 1.6.0 - no active project
...
  • Alternatively, pull my pre-built image directly: vickeywu/scrapy-python3
root@ubuntu:/home/vickey# docker pull vickeywu/scrapy-python3
Using default tag: latest
latest: Pulling from vickeywu/scrapy-python3
Digest: sha256:e1bdf37f93ac7ced9168a7a697576ce905e73fb4775f7cb80de196fa2df5a549
Status: Downloaded newer image for vickeywu/scrapy-python3:latest
root@ubuntu:/home/vickey# docker run -itd --name test-scrapy vickeywu/scrapy-python3

Common Commands

  • Create a project: scrapy startproject scrapy_project_name
  • Create a spider in the project (run from inside the scrapy_project_name directory): scrapy genspider spider_name domain_name.com
  • Run a project spider (also from inside the scrapy_project_name directory): scrapy crawl spider_name
  • Use scrapy -h to see the full command list, shown below (a short scrapy shell example follows it):
root@2fb0da64a933:/home/test_scrapy# scrapy -h
Scrapy 1.5.0 - project: test_scrapy

Usage:
  scrapy <command> [options] [args]

Available commands:
  bench         Run quick benchmark test
  check         Check spider contracts
  crawl         Run a spider
  edit          Edit spider
  fetch         Fetch a URL using the Scrapy downloader
  genspider     Generate new spider using pre-defined templates
  list          List available spiders
  parse         Parse URL (using its spider) and print the results
  runspider     Run a self-contained spider (without creating a project)
  settings      Get settings values
  shell         Interactive scraping console
  startproject  Create new project
  version       Print Scrapy version
  view          Open URL in browser, as seen by Scrapy

Use "scrapy <command> -h" to see more info about a command

Creating a Project

reference: http://doc.scrapy.org/en/1.0/intro/tutorial.html#creating-a-project

root@ubuntu:/home/vickey# docker exec -it test-scrapy /bin/bash
root@2fb0da64a933:/# cd /home
root@2fb0da64a933:/home# scrapy startproject test_scrapy
New Scrapy project 'test_scrapy', using template directory '/usr/local/lib/python2.7/dist-packages/scrapy/templates/project', created in:
    /home/test_scrapy

You can start your first spider with:
    cd test_scrapy
    scrapy genspider example example.com

Creating a Project Spider

root@2fb0da64a933:/home/test_scrapy# cd test_scrapy/
root@2fb0da64a933:/home/test_scrapy/test_scrapy# scrapy genspider test_spider baidu.com
Created spider 'test_spider' using template 'basic' in module:
  test_scrapy.spiders.test_spider

Project and Spider Files

  • Overview
root@8b825656f58b:/home# tree -L 2 test_scrapy/
test_scrapy/                                            # project root created by startproject
|-- scrapy.cfg                                          # deploy configuration file
`-- test_scrapy                                         # the project's Python module
    |-- __init__.py
    |-- items.py                                        # the project's items file
    |-- middlewares.py                                  # the project's middlewares file
    |-- pipelines.py                                    # the project's pipelines file
    |-- settings.py                                     # the project's settings file
    `-- spiders                                         # all spiders for the project live here
        |-- __init__.py
        `-- test_spider.py                              # the spider created above

2 directories, 6 files
  • scrapy.cfg
root@2fb0da64a933:/home# cd test_scrapy/                # enter the created project
root@2fb0da64a933:/home/test_scrapy# ls
scrapy.cfg  test_scrapy
root@2fb0da64a933:/home/test_scrapy# cat scrapy.cfg 
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = test_scrapy.settings                          # default = <project_name>.settings

[deploy]
#url = http://localhost:6800/
project = test_scrapy                                   # project = <project_name>
root@2fb0da64a933:/home/test_scrapy# cd test_scrapy/
root@2fb0da64a933:/home/test_scrapy/test_scrapy# ls     # files created by default with the project
__init__.py  __init__.pyc  items.py  middlewares.py  pipelines.py  settings.py  settings.pyc  spiders
  • items.py

    This is where you define the fields for the data you scrape (e.g., your database columns); a short example follows the generated template below.

root@2fb0da64a933:/home/test_scrapy/test_scrapy# cat items.py 
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class TestScrapyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
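
For example, a hypothetical pair of fields (these names are illustrative, not part of the generated template) might be declared like this:

import scrapy


class TestScrapyItem(scrapy.Item):
    # illustrative fields; rename them to match the data you actually scrape
    title = scrapy.Field()    # e.g. a page or movie title
    link = scrapy.Field()     # e.g. the URL it was found at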
  • middlewares.py (skipped for now)
root@2fb0da64a933:/home/test_scrapy/test_scrapy# cat middlewares.py 
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class TestScrapySpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    ...


class TestScrapyDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.
    ...
  • pipelines.py

    Database connections, write operations, and the like go here (look at the generated template first; a concrete sketch follows it).

root@2fb0da64a933:/home/test_scrapy/test_scrapy# cat pipelines.py 
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class TestScrapyPipeline(object):
    def process_item(self, item, spider):
        return item
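
As a minimal sketch (assumed code, not the project's real implementation), a pipeline that appends each item to a JSON-lines file could look like the following; real database connect/write calls would go in the same three hooks:

import json


class JsonLinesPipeline(object):
    def open_spider(self, spider):
        # called once when the spider starts; acquire the resource here
        self.file = open('items.jl', 'w')

    def close_spider(self, spider):
        # called once when the spider finishes; release the resource here
        self.file.close()

    def process_item(self, item, spider):
        # write one JSON object per line, then pass the item on unchanged
        self.file.write(json.dumps(dict(item)) + '\n')
        return item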
  • settings.py
root@2fb0da64a933:/home/test_scrapy/test_scrapy# cat settings.py|grep -v ^# |grep -v ^$
BOT_NAME = 'test_scrapy'
SPIDER_MODULES = ['test_scrapy.spiders']
NEWSPIDER_MODULE = 'test_scrapy.spiders'
ROBOTSTXT_OBEY = True
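A pipeline only runs after it is registered in ITEM_PIPELINES in settings.py. Using the hypothetical JsonLinesPipeline sketched above (the integer orders pipelines; lower values run first):

ITEM_PIPELINES = {
    'test_scrapy.pipelines.JsonLinesPipeline': 300,
}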
  • The spider file

reference: https://docs.scrapy.org/en/latest/topics/spiders.html?highlight=filter#scrapy-spider

root@2fb0da64a933:/home/test_scrapy/test_scrapy# cd spiders/
root@2fb0da64a933:/home/test_scrapy/test_scrapy/spiders# ls
__init__.py test_spider.py                              # test_spider.py is the spider file created above; all spiders for this project live in this directory
root@2fb0da64a933:/home/test_scrapy/test_scrapy/spiders# cat test_spider.py 
# -*- coding: utf-8 -*-
import scrapy


class TestSpiderSpider(scrapy.Spider):                  # class name is the spider name plus "Spider"
    name = 'test_spider'                                # spider name given when it was created
    allowed_domains = ['baidu.com']                     # domain(s) the spider is allowed to crawl, set at creation
    start_urls = ['http://baidu.com/']                  # root URL(s) the spider starts from; a list

    def parse(self, response):
        pass
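
The generated parse() is only a stub. As an assumed illustration (the selector and field are mine, not the author's), filling it in to extract the page title into the item sketched earlier might look like:

# -*- coding: utf-8 -*-
import scrapy

from test_scrapy.items import TestScrapyItem    # the illustrative item defined earlier


class TestSpiderSpider(scrapy.Spider):
    name = 'test_spider'
    allowed_domains = ['baidu.com']
    start_urls = ['http://baidu.com/']

    def parse(self, response):
        # pull the <title> text out of the response with a CSS selector
        item = TestScrapyItem()
        item['title'] = response.css('title::text').extract_first()
        yield item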

Running the Project Spider

  • Running the spider without arguments

The official docs say to run the spider from the project's top-level directory, but in practice that doesn't seem necessary; anywhere inside the project directory works.

root@2fb0da64a933:/home/test_scrapy/test_scrapy/spiders# scrapy crawl test_spider
2019-06-26 07:02:52 [scrapy.utils.log] INFO: Scrapy 1.5.0 started (bot: test_scrapy)
......
2019-06-26 07:02:53 [scrapy.core.engine] INFO: Spider closed (finished)
  • Running the spider with arguments

The prerequisite is that the spider accepts the passed argument in __init__ first (an alternative that needs no custom __init__ is sketched after the run output below).

root@2fb0da64a933:/home/test_scrapy/test_scrapy/spiders# cat test_spider.py
# -*- coding: utf-8 -*-
import scrapy


class TestSpiderSpider(scrapy.Spider):
    name = 'test_spider'
    allowed_domains = ['baidu.com']
    start_urls = ['http://baidu.com/']

    def __init__(self, group, *args, **kwargs):
        super(TestSpiderSpider, self).__init__(*args, **kwargs)
        self.start_urls = ['http://www.example.com/group/%s' % group]

    def parse(self, response):
        pass
root@2fb0da64a933:/home/test_scrapy/test_scrapy/spiders# scrapy crawl test_spider -a group=aa
2019-06-27 03:11:35 [scrapy.utils.log] INFO: Scrapy 1.5.0 started (bot: test_scrapy)
......
2019-06-27 03:11:35 [scrapy.core.engine] INFO: Spider closed (finished)
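
Note that Scrapy's default Spider.__init__ also copies every -a argument onto the spider as an attribute, so the custom __init__ above is optional. A sketch relying on that behavior (start_requests is the standard hook Scrapy calls instead of reading start_urls):

# -*- coding: utf-8 -*-
import scrapy


class TestSpiderSpider(scrapy.Spider):
    name = 'test_spider'
    allowed_domains = ['example.com']

    def start_requests(self):
        # self.group is filled in automatically from: scrapy crawl test_spider -a group=aa
        yield scrapy.Request('http://www.example.com/group/%s' % self.group)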

電影天堂 Spider in Practice

Too much content for this post; it will go in the next set of notes.
