Scraping 大衆養生網 (ys137.com) with Scrapy

(1)
Open the homepage of 養生之道網 (ys137.com) and analyze its layout.
The homepage is divided into five top-level categories: 食療養生 (dietary therapy), 養生人羣 (health by demographic), 運動養生 (exercise), 醫學健康 (medical health), and 糖尿病 (diabetes).
We follow each of these five category links, crawl the sub-categories underneath, and store each sub-category's link and name, together with its parent category's name, into Redis and MongoDB. This is implemented in Health.py.

#-----------Health.py------------
import scrapy
from scrapy.selector import HtmlXPathSelector  # legacy selector API; response.xpath() is the modern equivalent
from scrapy.http import Request
from bson.objectid import ObjectId
import pymongo

client = pymongo.MongoClient(host="127.0.0.1")
db = client.publicHealth  # database: publicHealth
collection = db.healthClass

import redis
r = redis.Redis(host='127.0.0.1', port=6379, db=0)


class healthClassSpider(scrapy.Spider):
    name = "health"
    allowed_domains = ["ys137.com"]  # domains the spider may visit
    start_urls = [
        "https://www.ys137.com/lvyou/",
    ]

    # parse() is called back for every downloaded page
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        hxsObj = hxs.select('//div[@class="container-fluid top-nav"]/div[@class="container main clearfix"]/table[@class="pull-left"]/tr/th/a')
        for secItem in hxsObj:
            className = secItem.select('text()').extract()
            classUrl = secItem.select('@href').extract()
            print(className[0])
            print(classUrl[0])
            print('----------------------------------')
            # top-level category: no parent, so pid is None
            classid = self.insertMongo(className[0], None)
            # bind classid per iteration via a default argument so each
            # callback receives its own pid (see the note below)
            request = Request(classUrl[0], callback=lambda response, pid=str(classid): self.parse_subClass(response, pid))
            yield request
        print("======================")

    def parse_subClass(self, response, pid):
        hxs = HtmlXPathSelector(response)
        hxsObj = hxs.select('//div[@class="channel-sons pull-left"]/a')
        for secItem in hxsObj:
            className2 = secItem.select('text()').extract()
            classUrl2 = secItem.select('@href').extract()
            print(className2)
            print('----------------------------')
            print(classUrl2)
            classid = self.insertMongo(className2[0], ObjectId(pid))
            self.pushRedis(classid, pid, classUrl2[0])

    def insertMongo(self, classname, pid):
        classid = collection.insert({'classname': classname, 'pid': pid})
        return classid

    def pushRedis(self, classid, pid, url):
        healthurl = '%s,%s,%s' % (classid, pid, url)
        r.lpush('healthurl', healthurl)
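One detail in Health.py worth pausing on: the callback is a lambda whose pid=str(classid) is a default argument. Defaults are evaluated when the lambda is created, so each Request carries its own category id; a plain closure over classid would leave every callback reading the loop's final value. A minimal sketch of the pitfall, in plain Python with no Scrapy required:

# Plain closures capture the variable, not its value at loop time:
callbacks_bad = [lambda: i for i in range(3)]
print([f() for f in callbacks_bad])  # [2, 2, 2] - every lambda sees the last i

# A default argument freezes the value per iteration, which is what
# callback=lambda response, pid=str(classid): ... does in Health.py:
callbacks_ok = [lambda i=i: i for i in range(3)]
print([f() for f in callbacks_ok])   # [0, 1, 2]

Scrapy's own idiom for passing data to a callback is Request(url, callback=self.parse_subClass, meta={'pid': pid}), reading response.meta['pid'] inside the callback, which avoids the lambda entirely.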

(2)
Pop the sub-category links from Redis one by one, open each sub-category page, crawl the article titles and links on it, and store those titles and links into Redis and MongoDB as well. This is implemented in Health2.py.

 

#-----------Health2.py------------
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from time import sleep
import pymongo
from bson.objectid import ObjectId

client = pymongo.MongoClient(host="127.0.0.1")
db = client.publicHealth  # database: publicHealth
collection = db.healthTitle

import redis
r = redis.Redis(host='127.0.0.1', port=6379, db=0)

ii = 0


class healthClassSpider(scrapy.Spider):
    name = "health2"
    allowed_domains = ["ys137.com"]  # domains the spider may visit

    def __init__(self):
        # build start_urls from the 'healthurl' records queued in Redis
        start_urls = []
        urlList = r.lrange('healthurl', 0, 1)
        self.dict = {}
        for item in urlList:
            itemStr = str(item, encoding="utf-8")
            arr = itemStr.split(',')
            classid = arr[0]
            pid = arr[1]
            url = arr[2]
            start_urls.append(url)
            # remember, per URL: its class ids, the listing's base URL,
            # and how many pages have been followed so far
            self.dict[url] = {"classid": classid, "pid": pid, "urls": url, "num": 0}
        print(start_urls)
        self.start_urls = start_urls

    def parse(self, response):
        classInfo = self.dict[response.url]
        objectid = classInfo['classid']
        pid = classInfo['pid']
        headurl = classInfo['urls']
        num = classInfo['num']
        if num > 3:  # follow at most a few pages per listing
            return None
        hxs = HtmlXPathSelector(response)
        hxsObj = hxs.select('//div[@class="arc-infos clearfix"]/h2/a')
        for secItem in hxsObj:
            className = secItem.select('text()').extract()
            classUrl = secItem.select('@href').extract()
            print(className[0])
            print(classUrl)
            # classid = self.insertMongo(className[0], ObjectId(objectid))
            # self.pushRedis(classid, objectid, classUrl[0])
        # ---- follow the next page directly instead of via a second method ----
        nextPages = hxs.select('//ul[@class="pagination"]/li/a/@href')
        print(len(nextPages))
        nextPage = nextPages.extract()[len(nextPages) - 1]
        nextPage = headurl + nextPage
        classInfo['num'] += 1
        self.dict[nextPage] = classInfo
        request = Request(nextPage, callback=self.parse)
        yield request
        print('--------end--------------')

    '''
    def insertMongo(self, className, pid):
        classid = collection.insert({'classname': className, 'pid': pid})
        return classid

    def pushRedis(self, classid, pid, classUrl):
        titlename = '%s,%s,%s,' % (classid, pid, classUrl)
        r.lpush('titlenameurl', titlename)
    '''
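The handoff between the two spiders is a plain comma-joined string per sub-category. A minimal sketch of the round trip (assuming a local Redis on 127.0.0.1:6379; the id values below are placeholders):

import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=0)

# Health.py side: push one 'classid,pid,url' record per sub-category
r.lpush('healthurl', '%s,%s,%s' % ('5a1b...', '5a0f...', 'https://www.ys137.com/lvyou/'))

# Health2.py side: read the raw bytes back, decode, and split into fields
for item in r.lrange('healthurl', 0, 1):
    classid, pid, url = str(item, encoding='utf-8').split(',')
    print(classid, pid, url)

Note that split(',') only works while none of the fields contain a comma; ObjectId hex strings never do, but a URL with a query string could, so a JSON payload would be a safer encoding.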

  

(3)
Fetch each category's title links from the database, crawl the article body under each title, and update it by id into the matching record of the title table. This is implemented in Health3.py.

#--------------Health3.py---------------------
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from time import sleep
from lxml import etree
import pymongo
from bson.objectid import ObjectId

client = pymongo.MongoClient(host="127.0.0.1")
db = client.publicHealth  # database: publicHealth
collection = db.healthTitle

import redis
r = redis.Redis(host='127.0.0.1', port=6379, db=0)
ii = 0

class healthClassSpider(scrapy.Spider):
    name = "health3"
    allowed_domains = ["ys137.com"]  # domains the spider may visit

    def __init__(self):
        # build start_urls from the 'titlenameurl' records queued in Redis
        start_urls = []
        urlList = r.lrange('titlenameurl', 0, 1)
        ii = 0
        self.dict = {}
        for item in urlList:
            itemStr = str(item, encoding="utf-8")
            arr = itemStr.split(',')
            classid = arr[0]
            pid = arr[1]
            url = arr[2]
            print(arr[2])
            start_urls.append(url)
            self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
            ii += 1
            if ii > 1:
                break
        print(start_urls)
        self.start_urls = start_urls

    def parse(self, response):
        classInfo = self.dict[response.url]
        objectid = classInfo['classid']
        objectid2 = ObjectId(objectid)
        pid = classInfo['pid']
        num = classInfo['num']
        ii = ""
        #==================================================================================
        hxs = HtmlXPathSelector(response)
        hxsObj = hxs.select('//div[@class="article-content"]/table/tr/td/p')
        for secItem in hxsObj:
            healthTitleContent = secItem.select('text()').extract()
            # print(healthTitleContent[0])
            if healthTitleContent:  # skip paragraphs with no direct text
                ii = ii + healthTitleContent[0]
        print(ii)
        # write the collected body text back onto the matching title document
        # db.healthTitle.update({"_id": objectid2}, {"$set": {'healthTitleContent': ii}})
        # sleep(0.3)
        print('------------------------------------------------------')
        '''
        html = response.body.decode('gbk')
        selector = etree.HTML(html)
        Name = selector.xpath('//div[@class="article-content"]/table/tr/td/h2/text()')
        arr = []
        for i in range(len(Name)):
            print(Name[i])
            arr.append(Name[i] + '\n')
        # db.healthTitle.update({"_id": objectid2}, {"$set": {'healthTitlechapter': Name}})

        classname = selector.xpath('//div[@class="article-content"]/table/tr/td/p/text()')
        for a in range(len(classname)):
            print(classname[a])

        # db.healthTitle.update({"_id": objectid2}, {"$set": {'healthTitleContent': classname}})
        '''
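For reference, the update that Health3.py is meant to perform (left commented out above) writes the concatenated body text back onto the title document created by Health2.py. A minimal sketch using the current pymongo API; objectid2 and ii stand in for the values computed in parse():

import pymongo
from bson.objectid import ObjectId

client = pymongo.MongoClient(host="127.0.0.1")
collection = client.publicHealth.healthTitle

objectid2 = ObjectId("5a1b2c3d4e5f6a7b8c9d0e1f")  # placeholder title id
ii = "...article body text..."

# legacy form, as written in the spider:
# collection.update({"_id": objectid2}, {"$set": {"healthTitleContent": ii}})
# pymongo >= 3 equivalent:
collection.update_one({"_id": objectid2}, {"$set": {"healthTitleContent": ii}})

The same applies to the collection.insert calls in Health.py and Health2.py, whose modern equivalent is insert_one.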