爬取目標:北京艾麗斯婦科醫院(http://fuke.fuke120.com/)
首先說一下如何配置 Splash
1.利用pip安裝scrapy-splash庫
pip install scrapy-splash
2.現在就要用到另外一個神器(Docker)
Docker下載地址:https://www.docker.com/community-edition#/windows
3.安裝好Docker後,啓動Docker並拉取鏡像
docker pull scrapinghub/splash
4.利用Docker運行splash
docker run -p 8050:8050 scrapinghub/splash(運行以後,大家可以在瀏覽器輸入 http://192.168.99.100:8050 檢查Docker是否正常運行)
5.settings.py配置
# Address of the running Splash instance.
# IMPORTANT (a common pitfall): this must be the address at which the Splash
# container is reachable -- 192.168.99.100 in this setup -- and NOT the host
# machine's own IP; using the host's own IP never worked for the author.
# NOTE(review): 192.168.99.100 is presumably the Docker VM address of this
# particular installation -- confirm for your environment.
SPLASH_URL = 'http://192.168.99.100:8050'

# Downloader middlewares required by scrapy-splash.
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

# Spider middleware required by scrapy-splash.
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

# Splash-aware duplicate filter and HTTP cache storage.
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

# Some sites can be crawled with True; for others this has to be set to False.
ROBOTSTXT_OBEY = True
爬蟲的py文件:1.py
# -*- coding: utf-8 -*-
"""Spider 1: collect the category links from the fuke.fuke120.com landing page.

Every category link found under ``div#allsort`` is inserted into the MongoDB
collection ``Health.Healthclass`` and pushed onto the Redis list
``healthclassurl`` as the comma-separated record ``"<mongo id>,<url>,<index>"``.
"""
import re
from urllib.request import urlopen

import pymongo
import redis
import scrapy
from bs4 import BeautifulSoup
from lxml import etree
from scrapy.http import Request

# MongoDB target: database "Health", collection "Healthclass".
client = pymongo.MongoClient(host="127.0.0.1")
db = client.Health
collection = db.Healthclass

# Redis connection; the list "healthclassurl" receives one record per category.
r = redis.Redis(host='127.0.0.1', port=6379, db=0)


class healthcareClassSpider(scrapy.Spider):
    """Scrape the category menu of the landing page."""

    name = "HealthCare"
    allowed_domains = ["fuke120.com"]  # domains the spider may visit
    start_urls = [
        "http://fuke.fuke120.com/",
    ]

    def parse(self, response):
        """Store every category link of the landing page.

        Links inside ``div.item`` blocks are numbered 1, 2, 3, ... in document
        order; links inside ``div.item born`` blocks are always recorded with
        index 0.
        """
        regular_links = response.xpath(
            '//div[@id="allsort"]/div[@class="item"]/span/a')
        born_links = response.xpath(
            '//div[@id="allsort"]/div[@class="item born"]/span/a')

        for index, link in enumerate(regular_links, start=1):
            self._store_category(response, link, index)
        for link in born_links:
            self._store_category(response, link, 0)

    def _store_category(self, response, link, index):
        """Persist one category ``<a>`` element to MongoDB and queue it in Redis."""
        href = link.xpath("@href").extract_first()
        if not href:
            # An anchor without href cannot be followed; skip it instead of
            # failing on an empty result list.
            return
        url = response.urljoin(href)
        # Kept as the list of text nodes, which is the shape stored so far.
        name = link.xpath("text()").extract()
        self.logger.info("category %s -> %s", name, url)

        class_id = collection.insert_one(
            {'healthclass': name, 'pid': None}).inserted_id
        r.lpush('healthclassurl', '%s,%s,%s' % (class_id, url, index))
2.py
# -*- coding: utf-8 -*-
"""Spider 2: collect the disease classes below every queued category page.

Start URLs are read from the Redis list ``healthclassurl`` (records of the form
``"<mongo id>,<url>,<index>"``).  Every disease link found is inserted into
the MongoDB collection ``Health.Diseaseclass`` with ``pid`` pointing at the
parent category, and pushed onto the Redis list ``diseaseclassurl`` as
``"<mongo id>,<url>,<parent id>"``.
"""
import re
# urllib.request.Request is deliberately not imported: it has the same name as
# scrapy.http.Request and would shadow it.
from urllib.request import ProxyHandler, build_opener, urlopen

import pymongo
import redis
import scrapy
from bs4 import BeautifulSoup
from bson.objectid import ObjectId
from lxml import etree
from scrapy.http import Request

# MongoDB target: database "Health", collection "Diseaseclass".
client = pymongo.MongoClient(host="127.0.0.1")
db = client.Health
collection = db.Diseaseclass

# NOTE(review): 1.py writes "healthclassurl" to the Redis server on 127.0.0.1
# while this spider reads it from 192.168.60.112 -- confirm both addresses
# refer to the same server.
r = redis.Redis(host='192.168.60.112', port=6379, db=0, encoding='utf-8')


def _split_record(raw):
    """Split a ``"<id>,<url>,<extra>"`` Redis record into its three fields.

    The first and the last comma are used as separators so that a comma inside
    the URL does not break the record.  Raises ValueError when the record has
    fewer than three fields.
    """
    text = raw.decode('utf-8') if isinstance(raw, bytes) else raw
    record_id, rest = text.split(',', 1)
    url, extra = rest.rsplit(',', 1)
    return record_id, url, extra


class healthcareClassSpider(scrapy.Spider):
    """Scrape the disease-class links of every category queued in Redis."""

    name = "HealthCare1"
    allowed_domains = ["fuke120.com"]  # domains the spider may visit

    def __init__(self, *args, **kwargs):
        """Load the start URLs and their parent ids from Redis."""
        super().__init__(*args, **kwargs)
        # Per-instance state (not shared class attributes).
        self.start_urls = []
        self.dict = {}  # url -> {"pid": parent id, "num": index}
        for raw in r.lrange('healthclassurl', 0, -1):
            try:
                pid, url, num = _split_record(raw)
            except ValueError:
                self.logger.warning("skipping malformed record %r", raw)
                continue
            self.start_urls.append(url)
            self.dict[url] = {"pid": pid, "num": num}

    def start_requests(self):
        """Request every start URL, carrying its parent info in ``meta``.

        Passing the data with the request keeps it available in ``parse`` even
        when the response URL differs from the requested one (e.g. redirect).
        """
        for url in self.start_urls:
            info = self.dict[url]
            yield scrapy.Request(
                url,
                callback=self.parse,
                meta={'pid': info['pid'], 'num': info['num']},
                dont_filter=True,
            )

    def parse(self, response):
        """Store every disease-class link found on a category page."""
        parent_id = ObjectId(response.meta['pid'])
        links = response.xpath(
            '//div[@class="x_con02_2"]/div[@class="x_con02_3"]/ul/li/p/a')
        for link in links:
            href = link.xpath("@href").extract_first()
            if not href:
                continue  # anchor without a target
            url = response.urljoin(href)
            # Kept as the list of text nodes, which is the shape stored so far.
            name = link.xpath("text()").extract()
            self.logger.info("disease class %s -> %s", name, url)

            class_id = collection.insert_one(
                {'Diseaseclass': name, 'pid': parent_id}).inserted_id
            r.lpush('diseaseclassurl',
                    '%s,%s,%s' % (class_id, url, parent_id))
3.py
# -*- coding: utf-8 -*-
"""Spider 3: collect the treatment classes of every queued disease page.

Start URLs are read from the Redis list ``diseaseclassurl`` (records of the
form ``"<mongo id>,<url>,<parent id>"``).  The pages are fetched through Splash.
Every link found is inserted into the MongoDB collection ``Health.Treatclass``
with ``pid`` pointing at the disease class, and pushed onto the Redis list
``treatclassurl`` as ``"<mongo id>,<url>,<parent id>"``.
"""
import re
from urllib.request import urlopen

import pymongo
import redis
import scrapy
from bs4 import BeautifulSoup
from bson.objectid import ObjectId
from lxml import etree
from scrapy.http import HtmlResponse, Request
from scrapy.selector import Selector
from scrapy_splash import SplashMiddleware, SplashRequest

# MongoDB target: database "Health", collection "Treatclass".
client = pymongo.MongoClient(host="127.0.0.1")
db = client.Health
collection = db.Treatclass

# Redis connection.  The spider reads "diseaseclassurl" and writes
# "treatclassurl", so the client must exist at module level.
r = redis.Redis(host='192.168.60.112', port=6379, db=0, encoding='utf-8')


def _split_record(raw):
    """Split a ``"<id>,<url>,<extra>"`` Redis record into its three fields.

    The first and the last comma are used as separators so that a comma inside
    the URL does not break the record.  Raises ValueError when the record has
    fewer than three fields.
    """
    text = raw.decode('utf-8') if isinstance(raw, bytes) else raw
    record_id, rest = text.split(',', 1)
    url, extra = rest.rsplit(',', 1)
    return record_id, url, extra


class healthcareClassSpider(scrapy.Spider):
    """Scrape the treatment-class links of every disease page queued in Redis."""

    name = "HealthCare2"
    allowed_domains = ["fuke120.com"]  # domains the spider may visit

    def __init__(self, *args, **kwargs):
        """Load the start URLs and their record data from Redis."""
        super().__init__(*args, **kwargs)
        # Per-instance state (not shared class attributes).
        self.start_urls = []
        # url -> {"pid": id of the disease record, "num": third record field}
        self.dict = {}
        for raw in r.lrange('diseaseclassurl', 0, -1):
            try:
                pid, url, num = _split_record(raw)
            except ValueError:
                self.logger.warning("skipping malformed record %r", raw)
                continue
            self.start_urls.append(url)
            self.dict[url] = {"pid": pid, "num": num}

    def start_requests(self):
        """Fetch every start URL through Splash, waiting 0.5 s for rendering.

        The record data travels with the request in ``meta`` so that ``parse``
        does not depend on the response URL matching the requested one.
        """
        for url in self.start_urls:
            info = self.dict[url]
            yield SplashRequest(
                url,
                self.parse,
                args={'wait': 0.5},
                meta={'pid': info['pid'], 'num': info['num']},
            )

    def parse(self, response):
        """Store every treatment-class link found on a rendered disease page."""
        parent_id = ObjectId(response.meta['pid'])
        self.logger.info("num=%s pid=%s", response.meta['num'], parent_id)

        links = response.xpath(
            '//div[@class="dh01"]/ul[@class="ul_bg01"]/li/a')
        for link in links:
            href = link.xpath("@href").extract_first()
            if not href:
                continue  # anchor without a target
            url = response.urljoin(href)
            # Kept as the list of text nodes, which is the shape stored so far.
            name = link.xpath("text()").extract()
            self.logger.info("treat class %s -> %s", name, url)

            class_id = collection.insert_one(
                {'Treatclass': name, 'pid': parent_id}).inserted_id
            r.lpush('treatclassurl',
                    '%s,%s,%s' % (class_id, url, parent_id))
大功告成,主要還是爲了使用scrapy-splash。