1. Installation
pip install Scrapy
# the command (DOS) window must be run as administrator
conda install scrapy
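The spiders below also rely on pymongo, redis, lxml and BeautifulSoup, so if they are not already present you can install them alongside Scrapy and then confirm the install (a minimal sketch; package names assume pip is available):

pip install pymongo redis lxml beautifulsoup4
# check that the scrapy command is on the PATH
scrapy version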
2. Create the project
scrapy startproject novelhtml
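For reference, startproject generates roughly the following layout (the exact files vary slightly with the Scrapy version); the spider files created in the steps below go under the spiders/ directory:

novelhtml/
    scrapy.cfg            # deploy/config file
    novelhtml/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py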
3. Create a qidianClass4.py file that crawls the first-level and second-level novel categories (names and links) and stores them in the corresponding MongoDB collection and Redis list.
import scrapy
from scrapy.selector import HtmlXPathSelector  # HtmlXPathSelector/.select() are the old Scrapy selector API; recent versions use response.xpath()
from scrapy.http import Request
from bson.objectid import ObjectId
import pymongo

client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel                 # MongoDB database: novel
collection = db.novelclass        # collection for the category tree

import redis                      # Redis hands the crawled URLs to the next spider
r = redis.Redis(host='127.0.0.1', port=6379, db=0)


class qidianClassSpider(scrapy.Spider):
    name = "qidianClass4"
    allowed_domains = ["qidian.com"]   # domains the spider is allowed to visit
    start_urls = [
        "https://www.qidian.com/all",
    ]

    # parse() is called back for every page that has been downloaded
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        hxsObj = hxs.select('//div[@class="work-filter type-filter"]/ul[@type="category"]/li[@class=""]/a')
        for secItem in hxsObj:
            className = secItem.select('text()').extract()
            classUrl = secItem.select('@href').extract()
            classUrl = 'https:' + classUrl[0]
            print(className[0])
            print(classUrl)
            # first-level category: insert with pid=None, then crawl its sub-categories
            classid = self.insertMongo(className[0], None)
            request = Request(classUrl,
                              callback=lambda response, pid=str(classid): self.parse_subClass(response, pid))
            yield request
            print("======================")

    def parse_subClass(self, response, pid):
        hxs = HtmlXPathSelector(response)
        hxsObj = hxs.select('//div[@class="sub-type"]/dl[@class=""]/dd[@class=""]/a')
        for secItem in hxsObj:
            className2 = secItem.select('text()').extract()
            classUrl2 = secItem.select('@href').extract()
            print(className2)
            print('----------------------------')
            classUrl2 = 'https:' + classUrl2[0]
            print(classUrl2)
            # second-level category: pid points back to the first-level category document
            classid = self.insertMongo(className2[0], ObjectId(pid))
            self.pushRedis(classid, pid, classUrl2)

    def insertMongo(self, classname, pid):
        # insert() is the old pymongo API; insert_one() replaces it in pymongo 3+
        classid = collection.insert({'classname': classname, 'pid': pid})
        return classid

    def pushRedis(self, classid, pid, url):
        # each entry is "classid,pid,url"; the next spider reads this list
        novelurl = '%s,%s,%s' % (classid, pid, url)
        r.lpush('novelurl', novelurl)
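Each entry that pushRedis writes to the novelurl list is a comma-separated classid,pid,url string. A minimal sketch (assuming the same local Redis instance) to spot-check what this spider stored:

import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=0)
for item in r.lrange('novelurl', 0, 4):        # first few entries
    classid, pid, url = item.decode('utf-8').split(',')
    print(classid, pid, url)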
4. Create a qidianNovel.py file that crawls novel names and links and stores them in the corresponding MongoDB collection and Redis list.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
import pymongo

client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel                 # MongoDB database: novel
collection = db.novelname         # collection for novel names

import redis
r = redis.Redis(host='127.0.0.1', port=6379, db=0)


class qidianNovelSpider(scrapy.Spider):
    name = "qidianNovel"
    allowed_domains = ["qidian.com"]   # domains the spider is allowed to visit

    def __init__(self):
        # build start_urls from the category URLs that qidianClass4 pushed to Redis ('novelurl')
        start_urls = []
        urlList = r.lrange('novelurl', 0, -1)
        self.dict = {}
        for item in urlList:
            itemStr = str(item, encoding="utf-8")
            arr = itemStr.split(',')
            classid = arr[0]
            pid = arr[1]
            url = arr[2]
            start_urls.append(url)
            # remember which category each URL belongs to and how many pages have been followed
            self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
        print(start_urls)
        self.start_urls = start_urls

    def parse(self, response):
        classInfo = self.dict[response.url]
        objectid = classInfo['classid']
        pid = classInfo['pid']
        num = classInfo['num']
        if num > 3:                    # only follow the first few pages of each category
            return None
        hxs = HtmlXPathSelector(response)
        hxsObj = hxs.select('//div[@class="book-mid-info"]/h4/a')
        for secItem in hxsObj:
            className = secItem.select('text()').extract()
            classUrl = secItem.select('@href').extract()
            classUrl = 'https:' + classUrl[0]
            print(className[0])
            print(classUrl)
            classid = self.insertMongo(className[0], objectid)
            self.pushRedis(classid, objectid, classUrl)
        nextPage = self.nextUrl(response)
        if nextPage:                   # follow pagination only when a next-page link exists
            classInfo['num'] += 1
            self.dict[nextPage] = classInfo
            request = Request(nextPage, callback=self.parse)
            yield request
        print('--------end--------------')

    # =================== get the link of the next page ===================
    def nextUrl(self, response):
        hxs = HtmlXPathSelector(response)
        nextPage = hxs.select('//a[@class="lbf-pagination-next "]')
        if len(nextPage) == 1:
            nextPage = nextPage.select('@href').extract()
            nextPage = "https:" + nextPage[0]
            print('==============' + nextPage + '====================')
            return nextPage
        return None

    def insertMongo(self, className, pid):
        classid = collection.insert({'classname': className, 'pid': pid})
        return classid

    def pushRedis(self, classid, pid, classUrl):
        novelnameurl = '%s,%s,%s,' % (classid, pid, classUrl)
        r.lpush('novelnameurl', novelnameurl)
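Note that pushRedis here appends a trailing comma, so each novelnameurl entry splits into four fields with an empty last element; the later spiders only read the first three. A quick check, under the same local-Redis assumption as above:

import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=0)
item = r.lindex('novelnameurl', 0)        # head of the list (lpush puts the newest entry first)
if item:
    parts = item.decode('utf-8').split(',')
    print(parts)  # ['classid', 'pid', 'url', ''] -- empty string from the trailing comma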
5. Create a qidianNovelChapterInfo.py file that crawls the chapter titles and links under each novel and stores them in the corresponding MongoDB collection and Redis list.
# -*- coding: utf-8 -*-
import scrapy
from lxml import etree
import pymongo

client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel                 # MongoDB database: novel
collection = db.novelChapterInfo  # collection for chapter titles/links

import redis
r = redis.Redis(host='127.0.0.1', port=6379, db=0)


class qidianNovelSpider(scrapy.Spider):
    name = "qidianNovelChapterInfo"
    allowed_domains = ["qidian.com"]   # domains the spider is allowed to visit

    def __init__(self):
        # build start_urls from the novel URLs that qidianNovel pushed to Redis ('novelnameurl')
        start_urls = []
        urlList = r.lrange('novelnameurl', 0, -1)
        self.dict = {}
        for item in urlList:
            itemStr = str(item, encoding="utf-8")
            arr = itemStr.split(',')
            classid = arr[0]
            pid = arr[1]
            url = arr[2]
            start_urls.append(url)
            self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
        print(start_urls)
        self.start_urls = start_urls

    def parse(self, response):
        classInfo = self.dict[response.url]
        objectid = classInfo['classid']
        pid = classInfo['pid']
        # parse the chapter list with lxml instead of the Scrapy selectors
        html = response.body.decode('utf-8')
        selector = etree.HTML(html)
        novelChapters = selector.xpath('//ul[@class="cf"]/li/a')
        for item in novelChapters:
            novelChapter = item.text
            print(item.text)
            novelChapterUrl = 'https:' + item.get('href')
            print(novelChapterUrl)
            classid = self.insertMongo(novelChapter, objectid)
            self.pushRedis(classid, objectid, novelChapterUrl)

    def insertMongo(self, novelChapter, pid):
        classid = collection.insert({'novelChapter': novelChapter, 'pid': pid})
        return classid

    def pushRedis(self, classid, pid, novelChapterUrl):
        # each entry is "classid,pid,chapterUrl"; the chapter-content spider reads this list
        novelChapterUrl = '%s,%s,%s' % (classid, pid, novelChapterUrl)
        r.lpush('novelChapterUrl', novelChapterUrl)
6. Create a qidianNovelWorksInfo.py file that crawls each novel's basic information and updates the existing novel-name collection in the MongoDB novel database.
# -*- coding: utf-8 -*-
import scrapy
from lxml import etree
import pymongo
from bson.objectid import ObjectId

client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel                 # MongoDB database: novel
collection = db.novelname         # collection written by qidianNovel; it is updated in place here

import redis
r = redis.Redis(host='127.0.0.1', port=6379, db=0)


class qidianNovelSpider(scrapy.Spider):
    name = "qidianNovelWorksInfo"
    allowed_domains = ["qidian.com"]   # domains the spider is allowed to visit

    def __init__(self):
        # build start_urls from the novel URLs in Redis ('novelnameurl')
        start_urls = []
        urlList = r.lrange('novelnameurl', 0, -1)
        self.dict = {}
        for item in urlList:
            itemStr = str(item, encoding="utf-8")
            arr = itemStr.split(',')
            classid = arr[0]
            pid = arr[1]
            url = arr[2]
            start_urls.append(url)
            self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
        print(start_urls)
        self.start_urls = start_urls

    def parse(self, response):
        classInfo = self.dict[response.url]
        objectid = classInfo['classid']
        objectid2 = ObjectId(objectid)
        pid = classInfo['pid']
        html = response.body.decode('utf-8')
        selector = etree.HTML(html)
        workName = selector.xpath('//div[@class="book-info "]/h1/span/a[@class="writer"]/text()')
        novelName = selector.xpath('//div[@class="book-info "]/h1/em/text()')
        novelState = selector.xpath('//div[@class="book-info "]/p[@class="tag"]/span[@class="blue"]/text()')
        novelClass = selector.xpath('//div[@class="book-info "]/p[@class="tag"]/a[@class="red"]/text()')
        objClass = novelClass[0]
        sonClass = novelClass[1]
        print("Novel: " + novelName[0])
        print("Author: " + workName[0])
        print("Status: " + novelState[0])
        print("Category: " + objClass)
        print("Sub-category: " + sonClass)
        # update the document that qidianNovel inserted, keyed by its ObjectId
        db.novelname.update({"_id": objectid2},
                            {"$set": {'workName': workName, 'novelName': novelName, 'novelState': novelState,
                                      'objClass': objClass, 'sonClass': sonClass}})
        print('--------end--------------')
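To verify the update, you can query the novelname collection for documents that now carry the extra fields (a minimal check script, assuming the same local MongoDB):

import pymongo

client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel
for doc in db.novelname.find({'workName': {'$exists': True}}).limit(3):
    print(doc.get('classname'), doc.get('novelState'), doc.get('objClass'), doc.get('sonClass'))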
7. Create a qidianNovelChapterContent.py file that crawls the chapter content and updates the existing chapter collection in the MongoDB novel database.
# -*- coding: utf-8 -*-
import scrapy
from lxml import etree
import pymongo
from bson.objectid import ObjectId

client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel                 # MongoDB database: novel
collection = db.novelChapterInfo  # chapter documents created by qidianNovelChapterInfo

import redis
r = redis.Redis(host='127.0.0.1', port=6379, db=0)


class qidianNovelSpider(scrapy.Spider):
    name = "qidianNovelChapterContent"
    allowed_domains = ["qidian.com"]   # domains the spider is allowed to visit

    def __init__(self):
        # build start_urls from the chapter URLs that qidianNovelChapterInfo pushed to Redis ('novelChapterUrl')
        start_urls = []
        urlList = r.lrange('novelChapterUrl', 0, -1)
        self.dict = {}
        for item in urlList:
            itemStr = str(item, encoding="utf-8")
            arr = itemStr.split(',')
            classid = arr[0]
            pid = arr[1]
            url = arr[2]
            start_urls.append(url)
            self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
        self.start_urls = start_urls

    def parse(self, response):
        classInfo = self.dict[response.url]
        objectid = classInfo['classid']
        objectid2 = ObjectId(objectid)
        pid = classInfo['pid']
        ii = ""
        html = response.body.decode('utf-8')
        selector = etree.HTML(html)
        novelChaptersContents = selector.xpath('//div[@class ="read-content j_readContent"]/p')
        for item in novelChaptersContents:
            novelChaptersContent = item.text
            ii = ii + novelChaptersContent   # concatenate the <p> paragraphs in page order
        # write the chapter text back onto the chapter document, keyed by its ObjectId
        db.novelChapterInfo.update({"_id": objectid2}, {"$set": {'novelChaptersContent': ii}})
        print('------------------------------------------------------')
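Likewise, a quick check that the chapter text ended up on the chapter documents (a sketch against the same local MongoDB):

import pymongo

client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel
doc = db.novelChapterInfo.find_one({'novelChaptersContent': {'$exists': True}})
if doc:
    print(doc['novelChapter'])
    print(doc['novelChaptersContent'][:200])  # first 200 characters of the chapter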
8. Run. From the project root, execute in a DOS window:
scrapy crawl dmoz    (replace dmoz with the name="..." defined in the corresponding spider file)
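For this project that means running the five spiders in the order the data flows, since each one reads the URLs that the previous one pushed to Redis:

scrapy crawl qidianClass4
scrapy crawl qidianNovel
scrapy crawl qidianNovelChapterInfo
scrapy crawl qidianNovelWorksInfo
scrapy crawl qidianNovelChapterContent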
I have been busy with the project at hand lately and have not had time to tidy this up properly; sorry.