1. Create the reptileWuxia.py file, using the BeautifulSoup module
Environment setup:
1) Install Python 3.6.1
2) Configure the environment variables
Test: python --version
3) Install BeautifulSoup
pip install beautifulsoup4
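A quick way to confirm the install worked is to parse a tiny HTML snippet in the interpreter. This is only a sanity check; the markup below is made up for illustration:

# Sanity check: beautifulsoup4 imports and parses a small HTML string.
from bs4 import BeautifulSoup

soup = BeautifulSoup('<html><body><h3>Chapter 1</h3></body></html>', 'html.parser')
print(soup.find('h3').get_text())  # prints: Chapter 1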
2. Code implementation
The crawler uses multithreading (thread pools from concurrent.futures).
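Before the full listing, here is a minimal sketch of the ThreadPoolExecutor / as_completed pattern the crawler relies on. The crawl_one function and the URLs are placeholders for illustration only, not part of the real script:

# Minimal sketch of the thread-pool pattern used in the crawler below.
# crawl_one and the URL list are placeholders.
from concurrent import futures

def crawl_one(url):
    return 'done: ' + url  # a real task would download and parse the page

urls = ['https://example.com/a', 'https://example.com/b']
with futures.ThreadPoolExecutor(max_workers=10) as executor:
    futureList = [executor.submit(crawl_one, u) for u in urls]
    for future in futures.as_completed(futureList):
        print(future.result())  # results arrive as each task finishes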
# coding=utf-8
import urllib.request
import re
import time
import os
import threading
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from urllib.parse import urljoin
from threading import Thread
from concurrent import futures
#from concurrent.futures import ThreadPoolExecutor


class BookProperty:
    def __init__(self, name, url, worksDict):
        self.name = name
        self.url = url
        self.worksDict = worksDict


class OutputContent:
    def createDirectory(self, directory):
        localDir = os.path.join(os.getcwd(), directory)  # dirname(__file__)
        if not os.path.exists(localDir):
            os.makedirs(localDir)
        return localDir

    def createFile(self, newfile):
        # truncate (or create) the target file
        f = open(newfile, 'w', encoding='utf-8')
        f.close()

    def writeContent(self, fileName, chapterList):
        try:
            self.createFile(fileName)
            with open(fileName, 'a', encoding='utf-8') as f:
                f.writelines(chapterList)
        except Exception as e:
            print('save file error.' + str(e))


class ParserPage:
    # Load a page and return a BeautifulSoup object
    def loadPage(self, url):
        html = None
        soup = None
        try:
            request = urllib.request.urlopen(url)
            html = request.read().decode('gb2312', 'ignore')
        except Exception as e:
            print(e)
        try:
            soup = BeautifulSoup(html, 'html.parser')  # build the BeautifulSoup object
        except Exception as ex:
            print(ex)
        return soup

    def __urlHandle(self, *url):
        if len(url) > 1:
            return urljoin(url[0], url[1])
        else:
            result = urlparse(url[0])
            return result.scheme + '://' + result.netloc

    def __parsetAuthorWorks(self, url, soup):
        worksDict = {}
        linkList = soup.find_all(class_=re.compile('style2|style3', re.IGNORECASE))  # case-insensitive
        for linkTag in linkList:
            aTag = linkTag.contents
            if len(aTag) > 0 and aTag[0].name == 'a' and aTag[0].get_text() != '':
                href = self.__urlHandle(url, aTag[0].get('href'))
                worksDict.update({href: aTag[0].get_text()})  # url -> author name (or book title)
        return worksDict

    # Get every work of one author together with its entry URL
    def parserOneAuthorWorks(self, url):
        soup = self.loadPage(url)
        if soup is None:
            return
        dirName = 'Novel'
        authorName = dirName  # fallback directory name if the author name is missing
        navList = soup.select('.LinkPath')  # the author name
        if len(navList) > 1:
            authorName = navList[1].get_text()
        worksDict = self.__parsetAuthorWorks(url, soup)
        return {'authorName': authorName, 'worksDict': worksDict}

    # Get every author and the entry URL of the author page, returned as a dict
    def parserAllAuthorName(self, url, authorName):
        soup = self.loadPage(url)
        if soup is None:
            return
        authorDict = self.__parsetAuthorWorks(url, soup)
        return {'authorName': authorName, 'url': url, 'worksDict': authorDict}

    # Parse the table-of-contents page and get the chapter URLs
    def parserCatalogue(self, url):
        soup = self.loadPage(url)
        if soup is None:
            return
        domain = self.__urlHandle(url)
        # title = soup.select(".STYLE17")[0].get_text()  # book title; differs on every page, no longer used
        aList = soup.find_all("a", {'class': '1'})
        urls = []
        for aTag in aList:
            urls.append(domain + aTag.attrs['href'])
        return urls

    # Parse a chapter page and get the URL of its next page
    def parserOnePage(self, url):
        soup = self.loadPage(url)
        if soup is None:
            return
        content = self.__parserPageContent(soup)
        nextUrl = self.__isNextPage(soup, url)
        return {'content': content[0], 'nextUrl': nextUrl}

    def parsetOnePageNotCatalog(self, url):
        soup = self.loadPage(url)
        if soup is None:
            return
        content = self.__parserPageContent(soup)
        nextUrl = self.__isNextPage(soup, url, content[1])
        return {'content': content[0], 'nextUrl': nextUrl}

    def __parserPageContent(self, soup):
        h3Tag = soup.find('h3')
        spanTag = soup.find("span")
        chapterData = chapterName = None
        if h3Tag is not None:
            chapterName = h3Tag.get_text()  # read the chapter title
            chapterData = chapterName + '\n'
        if spanTag is not None:
            chapterContent = spanTag.get_text()  # read the body text
            if chapterContent is not None:
                chapterContent = "".join(chapterContent.split())  # strip whitespace
                if chapterData is not None:
                    chapterData = chapterData + chapterContent + '\n'
                else:
                    chapterData = chapterContent + '\n'
        return chapterData, chapterName

    # Special-case check for books that have no table-of-contents page
    def __isNextPage(self, *args):
        nextUrl = None
        # next-page link; the TOC does not list every page and the href carries no domain
        nextATag = args[0].find('a', {'class': 'LinkNextArticle'})
        if nextATag is not None:
            domain = self.__urlHandle(args[1])
            nextUrl = domain + nextATag.attrs['href']
            if len(args) > 2 and args[2] is not None:
                nextText = "".join(nextATag.get_text().split())
                chapterName = "".join(args[2].split())
                if nextText[0:2] != chapterName[0:2]:
                    nextUrl = None
        return nextUrl


class ReptileManager:
    def __init__(self, url):
        self.url = url
        self.parser = ParserPage()
        self.output = OutputContent()

    # Crawl one book
    def reptileBook(self, url, fileName):
        urls = self.parser.parserCatalogue(url)
        if urls is None:
            return
        contentList = []
        if len(urls) > 0:
            nextUrl = None
            for url in urls:
                result = self.parser.parserOnePage(url)
                if result is None:
                    continue
                nextUrl = result['nextUrl']
                contentList.append(result['content'])
            while nextUrl:
                result = self.parser.parserOnePage(nextUrl)
                if result is None:
                    break
                nextUrl = result['nextUrl']
                contentList.append(result['content'])
        else:
            result = self.parser.parsetOnePageNotCatalog(url)
            if result is None:
                return
            contentList.append(result['content'])
            nextUrl = result['nextUrl']
            while nextUrl:
                result = self.parser.parsetOnePageNotCatalog(nextUrl)
                if result is None:
                    break
                contentList.append(result['content'])
                nextUrl = result['nextUrl']
        if contentList is not None:
            self.output.writeContent(fileName, contentList)  # write to file
        return fileName

    # Crawl every work of a single author
    def reptileOneAuthorWorksBooks(self):
        print('Parse start time: ' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        works = self.parser.parserOneAuthorWorks(self.url)
        self.__reptileMuchBooks(works)
        print('Parse finish time: ' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

    # Collect the names and URLs of every author's works, returned as a list of dicts
    def reptileAllAuthorAllWorks(self, url):
        worksList = []
        futureList = []
        result = self.parser.parserAllAuthorName(url, '')
        with futures.ThreadPoolExecutor(max_workers=10) as executor:
            for k, v in result['worksDict'].items():
                future = executor.submit(self.parser.parserAllAuthorName, k, v)
                futureList.append(future)
            for future in futures.as_completed(futureList):
                result = future.result()
                worksList.append(result)
        # for data in executor.map(parserAllAuthorName, authorDict.keys(), authorDict.values()):
        return worksList

    # Crawl every work of every author
    def reptileAllAuthorBoos(self):
        print('Start time: ' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        worksList = self.reptileAllAuthorAllWorks(self.url)
        # with futures.ThreadPoolExecutor(max_workers=5) as executor:
        i = 0
        print(len(worksList))
        for works in worksList:
            i += 1
            if i > 89:  # note: skips the first 89 authors
                self.__reptileMuchBooks(works)
        print('End time: ' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

    def __reptileMuchBooks(self, works):
        a = 0
        futureList = []
        filePath = self.output.createDirectory(works['authorName'])
        with futures.ThreadPoolExecutor(max_workers=10) as executor:
            for k, v in works['worksDict'].items():
                a += 1
                fileName = os.path.join(filePath, str(a) + v + '.txt')
                future = executor.submit(self.reptileBook, k, fileName)
                futureList.append(future)
            for future in futures.as_completed(futureList):
                result = future.result()
                print(result)


if __name__ == '__main__':
    reptile = ReptileManager('this is URL')
    reptile.reptileAllAuthorBoos()
References:
https://beautifulsoup.readthedocs.io/zh_CN/latest/#
https://www.jianshu.com/p/62145aed2d49
https://www.jianshu.com/p/b9b3d66aa0be
https://github.com/yanbober/SmallReptileTraining/tree/master/ConcurrentSpider
https://www.gulongbbs.com/wuxia/ (site used for testing)