配置scrapy-splash+python爬取醫院信息(利用了scrapy-splash)

北京艾麗斯婦科醫院(http://fuke.fuke120.com/)

首先說一下如何配置splash

1.利用pip安裝scrapy-splash庫

pip install scrapy-splash

2.現在就要用到另外一個神器(Docker)

Docker下載地址:https://www.docker.com/community-edition#/windows

3.安裝好Docker後,啓動Docker並拉取鏡像

docker pull scrapinghub/splash

4.利用Docker運行splash

docker run -p 8050:8050 scrapinghub/splash

(運行以後,可以在瀏覽器輸入 http://192.168.99.100:8050 檢查Splash是否正常運行)

5.settings.py配置

# Address of the Splash HTTP API.
# Author's note: this is the biggest pitfall -- the address has to be
# 192.168.99.100 here; pointing it at the machine's own IP never worked.
# NOTE(review): presumably this is the address of the Docker VM that runs
# Splash (the same one opened in the browser earlier) -- confirm for your
# own Docker setup.
SPLASH_URL = 'http://192.168.99.100:8050'

# Downloader middlewares required by scrapy-splash.
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

# Spider middleware required by scrapy-splash.
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

# Splash-aware duplicate filter and HTTP cache storage.
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'

HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

# Author's note: some sites can be crawled with True, for others this has to
# be changed to False.
ROBOTSTXT_OBEY = True

 爬蟲的py文件1.py

# -*- coding: utf-8 -*-
import re
from urllib.request import urlopen
from scrapy.http import Request
# from urllib.request import Request
from bs4 import BeautifulSoup
from lxml import etree
import pymongo
import scrapy
from scrapy.selector import HtmlXPathSelector

# MongoDB connection: database "Health", collection "Healthclass".
client = pymongo.MongoClient(host="127.0.0.1")
db = client["Health"]
collection = db["Healthclass"]

import redis  # Redis client library

# Redis connection the spider pushes its category URL records to.
r = redis.Redis(db=0, port=6379, host="127.0.0.1")


# Running sequence number for the plain "item" categories; it is written as
# the third field of the corresponding 'healthclassurl' records.
ii = 0


class healthcareClassSpider(scrapy.Spider):
    """Collect the top-level categories from the fuke120 home page.

    Every category link found under ``div#allsort`` is inserted into the
    module-level MongoDB ``collection`` and a ``"<id>,<url>,<n>"`` record is
    pushed onto the Redis list ``healthclassurl``.
    """

    name = "HealthCare"
    allowed_domains = ["fuke120.com"]  # domains the spider may visit
    start_urls = [
        "http://fuke.fuke120.com/",
    ]

    def parse(self, response):
        """Store every category link of the downloaded page.

        Links under ``div.item`` get an increasing sequence number (the
        module-level ``ii``); links under ``div.item born`` always get ``0``.
        """
        global ii
        for link in response.xpath(
                '//div[@id="allsort"]/div[@class="item"]/span/a'):
            ii += 1
            self._store_category(response, link, ii)
        for link in response.xpath(
                '//div[@id="allsort"]/div[@class="item born"]/span/a'):
            self._store_category(response, link, 0)

    def _store_category(self, response, link, num):
        """Save one category link to MongoDB and queue its URL in Redis."""
        href = link.xpath("@href").extract_first()
        if not href:
            # An anchor without href cannot be followed; skip it instead of
            # failing the whole page with an IndexError.
            return
        # urljoin resolves relative hrefs and leaves absolute ones intact.
        url = response.urljoin(href)
        # The name is stored as the list of text nodes, as before.
        name = link.xpath("text()").extract()

        print(url)
        print(name)
        # insert_one() replaces the Collection.insert() call that newer
        # PyMongo releases no longer provide.
        classid = collection.insert_one(
            {'healthclass': name, 'pid': None}).inserted_id
        r.lpush('healthclassurl', '%s,%s,%s' % (classid, url, num))

  2.py

# -*- coding: utf-8 -*-
import re
from urllib.request import urlopen
from urllib.request import Request
from bs4 import BeautifulSoup
from lxml import etree
import pymongo
import scrapy
from scrapy.selector import HtmlXPathSelector
from bson.objectid import ObjectId
# from scrapy.http import Request
# from urllib.request import urlopen
from scrapy.http import Request
# from hello.items import ZhaopinItem
# from scrapy.spiders import CrawlSpider, Rule
# from scrapy.linkextractors import LinkExtractor
from urllib.request import Request,ProxyHandler
from urllib.request import build_opener
client = pymongo.MongoClient(host="127.0.0.1")
db = client.Health            # database name: Health
collection = db.Diseaseclass          # collection name: Diseaseclass


import redis        # Redis client library

# ``encoding`` is the supported spelling of the old ``charset`` keyword, which
# redis-py deprecated and newer releases no longer accept.
# NOTE(review): the first spider pushes its records to Redis on 127.0.0.1
# while this one reads from 192.168.60.112 -- confirm both are the same server.
r = redis.Redis(host='192.168.60.112', port=6379, db=0, encoding='utf-8')
class healthcareClassSpider(scrapy.Spider):
    """Collect the disease sub-categories of every category queued in Redis.

    The start URLs come from the Redis list ``healthclassurl``, whose entries
    look like ``"<parent id>,<url>,<num>"``.  Every link found on a category
    page is inserted into the module-level MongoDB ``collection`` and a
    ``"<id>,<url>,<parent id>"`` record is pushed onto the Redis list
    ``diseaseclassurl``.
    """

    name = "HealthCare1"
    allowed_domains = ["fuke120.com"]  # domains the spider may visit
    # Class-level defaults kept for compatibility; __init__ gives every
    # instance its own containers so state is not shared between instances.
    dict = {}
    start_urls = []

    def __init__(self, *args, **kwargs):
        """Load the queued category records from Redis.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``
        so the spider can be created with spider arguments.
        """
        super().__init__(*args, **kwargs)
        self.dict = {}
        self.start_urls = []
        for raw in r.lrange('healthclassurl', 0, -1):
            fields = self._split_entry(raw)
            if fields is None:
                self.logger.warning(
                    "Skipping malformed healthclassurl entry: %r", raw)
                continue
            pid, url, num = fields
            self.start_urls.append(url)
            self.dict[url] = {"pid": pid, "num": num}

    @staticmethod
    def _split_entry(raw):
        """Split a ``"<id>,<url>,<extra>"`` record into its three fields.

        The URL sits in the middle and may itself contain commas, so the id
        is cut from the left and the last field from the right.  Returns
        ``None`` when the record does not have all three parts.
        """
        text = raw.decode('utf-8') if isinstance(raw, bytes) else raw
        head, first_sep, rest = text.partition(',')
        url, last_sep, tail = rest.rpartition(',')
        if not (first_sep and last_sep and head and url):
            return None
        return head, url, tail

    def start_requests(self):
        """Request every queued URL, carrying its record in ``meta``."""
        for url in self.start_urls:
            # ``scrapy.Request`` is spelled out because the bare name
            # ``Request`` is also imported from urllib in this module.
            # dont_filter=True matches what the default start_requests does.
            yield scrapy.Request(url, callback=self.parse, dont_filter=True,
                                 meta={'class_info': self.dict.get(url)})

    def parse(self, response):
        """Store every disease link found on a category page."""
        # Prefer the record attached to the request: response.url may differ
        # from the queued URL (e.g. after a redirect), which would make a
        # plain lookup by URL fail.
        info = response.meta.get('class_info') or self.dict.get(response.url)
        if info is None:
            self.logger.warning("No parent record for %s", response.url)
            return
        pid = ObjectId(info['pid'])
        for link in response.xpath(
                '//div[@class="x_con02_2"]/div[@class="x_con02_3"]/ul/li/p/a'):
            href = link.xpath("@href").extract_first()
            if not href:
                # Anchor without href: nothing to follow.
                continue
            url = response.urljoin(href)
            # The name is stored as the list of text nodes, as before.
            name = link.xpath("text()").extract()
            print(url)
            print(name)
            # insert_one() replaces the Collection.insert() call that newer
            # PyMongo releases no longer provide.
            classid = collection.insert_one(
                {'Diseaseclass': name, 'pid': pid}).inserted_id
            r.lpush('diseaseclassurl', '%s,%s,%s' % (classid, url, pid))

  3.py

# -*- coding: utf-8 -*-
import re
from urllib.request import urlopen
from urllib.request import Request
from bs4 import BeautifulSoup
from lxml import etree
import pymongo
import scrapy
from scrapy_splash import SplashMiddleware
from scrapy.http import Request, HtmlResponse
from scrapy_splash import SplashRequest
from scrapy.selector import Selector
from scrapy.selector import HtmlXPathSelector
from bson.objectid import ObjectId
# from diseaseHealth.diseaseHealth.spiders.SpiderJsDynamic import phantomjs1
# from scrapy.http import Request
# from urllib.request import urlopen
from scrapy.http import Request

client = pymongo.MongoClient(host="127.0.0.1")
db = client.Health  # database name: Health
collection = db.Treatclass  # collection name: Treatclass

import redis  # Redis client library

# ``encoding`` is the supported spelling of the old ``charset`` keyword, which
# redis-py deprecated and newer releases no longer accept.
r = redis.Redis(host='192.168.60.112', port=6379, db=0, encoding='utf-8')


class healthcareClassSpider(scrapy.Spider):
    """Collect the treatment categories of every disease page queued in Redis.

    The start URLs come from the Redis list ``diseaseclassurl``, whose
    entries look like ``"<id>,<url>,<extra>"``.  Pages are fetched through
    Splash; every link found is inserted into the module-level MongoDB
    ``collection`` and a ``"<id>,<url>,<parent id>"`` record is pushed onto
    the Redis list ``treatclassurl``.
    """

    name = "HealthCare2"
    allowed_domains = ["fuke120.com"]  # domains the spider may visit
    # Class-level defaults kept for compatibility; __init__ gives every
    # instance its own containers so state is not shared between instances.
    dict = {}
    start_urls = []

    def __init__(self, *args, **kwargs):
        """Load the queued disease records from Redis.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``
        so the spider can be created with spider arguments.
        """
        super().__init__(*args, **kwargs)
        self.dict = {}
        self.start_urls = []
        for raw in r.lrange('diseaseclassurl', 0, -1):
            fields = self._split_entry(raw)
            if fields is None:
                self.logger.warning(
                    "Skipping malformed diseaseclassurl entry: %r", raw)
                continue
            pid, url, num = fields
            self.start_urls.append(url)
            self.dict[url] = {"pid": pid, "num": num}

    @staticmethod
    def _split_entry(raw):
        """Split a ``"<id>,<url>,<extra>"`` record into its three fields.

        The URL sits in the middle and may itself contain commas, so the id
        is cut from the left and the last field from the right.  Returns
        ``None`` when the record does not have all three parts.
        """
        text = raw.decode('utf-8') if isinstance(raw, bytes) else raw
        head, first_sep, rest = text.partition(',')
        url, last_sep, tail = rest.rpartition(',')
        if not (first_sep and last_sep and head and url):
            return None
        return head, url, tail

    def start_requests(self):
        """Request every queued URL through Splash, with its record in meta."""
        for url in self.start_urls:
            yield SplashRequest(url, self.parse, args={'wait': 0.5},
                                meta={'class_info': self.dict.get(url)})

    def parse(self, response):
        """Store every treatment link found on a rendered disease page."""
        # Prefer the record attached to the request; fall back to the lookup
        # by URL for responses that do not carry it.
        info = response.meta.get('class_info') or self.dict.get(response.url)
        if info is None:
            self.logger.warning("No parent record for %s", response.url)
            return
        pid = ObjectId(info['pid'])
        num = info['num']
        print(num)
        print(pid)
        for link in response.xpath(
                '//div[@class="dh01"]/ul[@class="ul_bg01"]/li/a'):
            href = link.xpath("@href").extract_first()
            if not href:
                # Anchor without href: nothing to follow.
                continue
            c = response.urljoin(href)
            # The name is stored as the list of text nodes, as before.
            name = link.xpath("text()").extract()
            print(c)
            print(name)
            # insert_one() replaces the Collection.insert() call that newer
            # PyMongo releases no longer provide.
            classid = collection.insert_one(
                {'Treatclass': name, 'pid': pid}).inserted_id
            r.lpush('treatclassurl', '%s,%s,%s' % (classid, c, pid))

  大功告成,主要還是爲了使用scrapy-splash。

相關文章
相關標籤/搜索