Python Web Crawler Implementation

1. Create a reptileWuxia.py file; the crawler uses the BeautifulSoup module.

I. Environment setup

1) Install Python 3.6.1.

2) Configure the environment variables.

Test: python --version

3) Install BeautifulSoup:

pip install beautifulsoup4
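
A quick way to confirm the install works (a throwaway snippet, not part of the crawler; the one-line HTML string is made up):

# check_bs4.py - verify that beautifulsoup4 imports and parses a snippet
from bs4 import BeautifulSoup

html = "<a class='1' href='/chapter1.html'>Chapter 1</a>"
soup = BeautifulSoup(html, 'html.parser')
print(soup.find('a', {'class': '1'}).get_text())   # prints: Chapter 1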

2. Code implementation

The implementation uses multithreading; a short sketch of the concurrency pattern it relies on is shown below, followed by the full listing.
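
The listing fans page downloads out through concurrent.futures. A minimal sketch of the submit / as_completed pattern it uses (standalone and illustrative; fetchHead and the example URLs are placeholders, not part of the crawler):

# sketch of the thread-pool pattern used in the listing below
from concurrent import futures
import urllib.request

def fetchHead(url):
    # download a page and return its first 60 bytes
    with urllib.request.urlopen(url) as resp:
        return url, resp.read(60)

urls = ['http://example.com/', 'http://example.org/']  # placeholder URLs
with futures.ThreadPoolExecutor(max_workers=10) as executor:
    futureList = [executor.submit(fetchHead, u) for u in urls]
    for future in futures.as_completed(futureList):
        url, head = future.result()        # re-raises here if the worker threw
        print(url, head)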

# coding=utf-8

import urllib.request
import re
import time
import os
import threading
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from urllib.parse import urljoin
from threading import Thread
from concurrent import futures
#from concurrent.futures import ThreadPoolExecutor

class BookProperty:
    # Simple value holder for an author/book name, its URL and a dict of works
    # (defined for completeness; not referenced elsewhere in this listing)
    def __init__(self, name, url, worksDict):
        self.name = name
        self.url = url
        self.worksDict = worksDict

class OutputContent:
    def createDirectory(self,directory):
        localDir = os.path.join(os.getcwd(),directory) #dirname(__file__)
        if not os.path.exists(localDir): #if os.path.exists(save_dir) is False:
            os.makedirs(localDir)
        return localDir
        
    def createFile(self,newfile):
        #if not os.path.exists(newfile):
        f = open(newfile,'w',encoding='utf-8')
        f.close()

    def writeContent(self,fileName,chapterList):
        try:
            self.createFile(fileName)
            #list = [chapterTitle,"\n",chapterContent,"\n"]
            with open(fileName,'a',encoding='utf-8') as f:
                f.writelines(chapterList)
        except Exception as e:
            print('save file error.'+str(e))

class ParserPage:
    # Load a page and return a BeautifulSoup object (None on failure)
    def loadPage(self,url):
        html = None; soup = None
        try:
            request = urllib.request.urlopen(url)
            html = request.read().decode('gb2312','ignore') # the site pages are GB2312-encoded
        except Exception as e:
            print(e)
        try:
            soup = BeautifulSoup(html,'html.parser') # build the BeautifulSoup parse tree
        except Exception as ex:
            print(ex)
            #raise ex.reason
        return soup

    # With two arguments, join a relative href onto its base URL;
    # with one argument, return scheme://netloc of that URL
    def __urlHandle(self,*url):
        if len(url) > 1 :
            return urljoin(url[0], url[1])
        else:
            result = urlparse(url[0])
            return result.scheme + '://' + result.netloc

    def __parsetAuthorWorks(self,url,soup):
        worksDict = {}
        linkList = soup.find_all(class_=re.compile('style2|style3',re.IGNORECASE)) # case-insensitive class match
        for linkTag in linkList:
            aTag = linkTag.contents
            if len(aTag) > 0 and aTag[0].name == 'a' and aTag[0].get_text() != '':
                href = self.__urlHandle(url,aTag[0].get('href'))
                worksDict.update({href:aTag[0].get_text()}) # url -> author name (or book title)
        return worksDict
   
    # Get all works of one author and their entry URLs
    def parserOneAuthorWorks(self,url):
        soup = self.loadPage(url)
        if soup is None: return
        authorName = ''
        navList = soup.select('.LinkPath') # author name
        if len(navList) > 1:
            authorName = navList[1].get_text()
        worksDict = self.__parsetAuthorWorks(url,soup)
        return {'authorName':authorName,'worksDict':worksDict}

    # Get every author and the entry URL to their works; returns a dict
    def parserAllAuthorName(self,url,authorName):
        soup = self.loadPage(url)
        if soup is None: return
        authorDict = self.__parsetAuthorWorks(url,soup)
        return {'authorName':authorName,'url':url,'worksDict':authorDict}
    

    # Parse the table-of-contents page and collect the chapter URLs
    def parserCatalogue(self,url):
        soup = self.loadPage(url)
        if soup is None: return
        domain = self.__urlHandle(url)
        #title = soup.select(".STYLE17")[0].get_text() # book title; differs on every page, so not used
        aList = soup.find_all("a",{'class' : '1'})
        urls = []
        for aTag in aList:
            urls.append(domain + aTag.attrs['href'])
        return urls

    # Parse one chapter page and find the URL of its next page
    def parserOnePage(self,url):
        soup = self.loadPage(url)
        if soup is None: return
        content = self.__parserPageContent(soup)
        nextUrl = self.__isNextPage(soup,url)
        return {'content':content[0],'nextUrl':nextUrl}

    # Same as above, but for books that have no table-of-contents page
    def parsetOnePageNotCatalog(self,url):
        soup = self.loadPage(url)
        if soup is None: return
        content = self.__parserPageContent(soup)
        nextUrl = self.__isNextPage(soup,url,content[1])
        return {'content':content[0],'nextUrl':nextUrl}
        
    def __parserPageContent(self,soup):
        h3Tag = soup.find('h3')
        spanTag = soup.find("span")
        chapterData = chapterName = None
        if h3Tag is not None:
            chapterName = h3Tag.get_text()            # chapter title
            chapterData = chapterName+'\n'
        if spanTag is not None:
            chapterContent = spanTag.get_text()       # chapter body text
            if chapterContent is not None:
                chapterContent = "".join(chapterContent.split())  # strip whitespace
                if chapterData is not None:
                    chapterData = chapterData+chapterContent+'\n'
                else:
                    chapterData = chapterContent+'\n'
        return chapterData,chapterName

    # Special-case check for books without a table-of-contents page
    def __isNextPage(self,*args):
        nextUrl = None
        nextATag = args[0].find('a',{'class':'LinkNextArticle'}) # next-page link; the catalogue does not list every page, and the href carries no domain
        if nextATag is not None:
            domain = self.__urlHandle(args[1])
            nextUrl = domain + nextATag.attrs['href']
            if len(args) > 2 and args[2] is not None:
                nextText = nextATag.get_text()
                nextText = "".join(nextText.split())
                chapterName = "".join(args[2].split())
                if nextText[0:2] != chapterName[0:2]:
                    nextUrl = None
        return nextUrl
                    
class ReptileManager:
    def __init__(self, url):
        self.url = url
        self.parser = ParserPage()
        self.output = OutputContent()

    # Crawl one book
    def reptileBook(self,url,fileName):
        urls = self.parser.parserCatalogue(url)
        if urls is None: return
        contentList = []
        if len(urls) > 0:
            nextUrl = None
            for url in urls:
                result = self.parser.parserOnePage(url)
                if result is None: continue
                nextUrl = result['nextUrl']
                contentList.append(result['content'])
            # follow "next page" links beyond the last catalogue entry
            while nextUrl:
                result = self.parser.parserOnePage(nextUrl)
                if result is None: break
                nextUrl = result['nextUrl']
                contentList.append(result['content'])
        else:
            # no catalogue page: walk the chapters through the next-page links only
            result = self.parser.parsetOnePageNotCatalog(url)
            if result is None: return
            contentList.append(result['content'])
            nextUrl = result['nextUrl']
            while nextUrl:
                result = self.parser.parsetOnePageNotCatalog(nextUrl)
                if result is None: break
                contentList.append(result['content'])
                nextUrl = result['nextUrl']
        if contentList:
            self.output.writeContent(fileName,contentList) # write the book to file
        return fileName
 
    # Crawl all works of a single author
    def reptileOneAuthorWorksBooks(self):
        print('Parse start time: ' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        works = self.parser.parserOneAuthorWorks(self.url)
        self.__reptileMuchBooks(works)
        print('Parse finish time: ' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
         
    # Get the names and URLs of every author's works; returns a list of dicts
    def reptileAllAuthorAllWorks(self,url):
        worksList = []
        futureList = []
        result = self.parser.parserAllAuthorName(url,'')
        with futures.ThreadPoolExecutor(max_workers=10) as executor:
            for k,v in result['worksDict'].items():
                future = executor.submit(self.parser.parserAllAuthorName,k,v)
                futureList.append(future)
            for future in futures.as_completed(futureList):
                result = future.result()
                worksList.append(result)
            #for data in executor.map(parserAllAuthorName, authorDict.keys(),authorDict.values()):
        return worksList

    # Crawl every work of every author
    def reptileAllAuthorBooks(self):
        print('Start time: ' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        worksList = self.reptileAllAuthorAllWorks(self.url)
        #with futures.ThreadPoolExecutor(max_workers=5) as executor:
        i = 0
        print(len(worksList))
        for works in worksList:
            i += 1
            if i > 89: # skips the first 89 authors (apparently a resume point from an earlier run)
                self.__reptileMuchBooks(works)
        print('End time: ' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

    # Crawl all books of one author concurrently: one thread-pool task per book
    def __reptileMuchBooks(self,works):
        a = 0
        futureList = []
        filePath = self.output.createDirectory(works['authorName'])
        with futures.ThreadPoolExecutor(max_workers=10) as executor:   
            for k,v in works['worksDict'].items():
                a += 1
                fileName = os.path.join(filePath,str(a)+v+'.txt')
                future = executor.submit(self.reptileBook,k,fileName)
                futureList.append(future)
            for future in futures.as_completed(futureList):
                result = future.result()
                print(result)

if __name__ == '__main__':
    reptile = ReptileManager('this is URL') # replace with the start URL of the target site
    reptile.reptileAllAuthorBooks()
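
For a smaller test run, the manager can also be pointed at a single book instead of the whole site (illustrative only; the URLs and file name below are placeholders):

manager = ReptileManager('http://example.com/author.html')               # start URL (placeholder)
manager.reptileBook('http://example.com/book/index.html', 'book.txt')    # catalogue URL and output file (placeholders)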

References:

https://beautifulsoup.readthedocs.io/zh_CN/latest/#
https://www.jianshu.com/p/62145aed2d49
https://www.jianshu.com/p/b9b3d66aa0be
https://github.com/yanbober/SmallReptileTraining/tree/master/ConcurrentSpider
https://www.gulongbbs.com/wuxia/ (the site used for testing)
