Python爬蟲入門教程： All IT eBooks多線程爬取

時間 2019-12-10

標籤 python 爬蟲入門教程 ebooks 多線程欄目 Python 简体版

原文鏈接

All IT eBooks多線程爬取-寫在前面

對一個爬蟲愛好者來說，或多或少都有這麼一點點的收集癖～發現好的圖片、好的書籍，發現各種能存放在電腦上的東西，都喜歡把它們批量地爬取下來。然後放着，是的，就這麼放着……然後慢慢地遺忘掉……

All IT eBooks多線程爬取-爬蟲分析

打開網址 http://www.allitebooks.com/ ，發現是個特別清晰的小頁面，一看就好爬。

再點擊一本圖書進入詳情頁，發現下載連結也很明顯地展現在了我們面前，小激動一把，這麼清晰無廣告的網站很少見了。

All IT eBooks多線程爬取-擼代碼

此次我採用了一個新的模塊 requests-html，這個模塊的作者之前開發了 requests，你應該非常熟悉了；線程控制採用的是 queue。
安裝 requests-html 模塊：

pip install requests-html

關於這個模塊的使用，你只需要用搜索引擎搜索一下這個模塊名稱，相關文章也是很多的，對於能讀到這篇博客的你來說，是很簡單的啦～

我們編寫一下核心的內容：

import os
import random
import threading
import time
from queue import Empty
from queue import Queue

import requests
from requests_html import HTMLSession
# Shared stop flags polled by the worker threads' run() loops. The main script
# sets each one to True after the corresponding queue has been emptied.
# ("CARWL" is the spelling used everywhere the flag is referenced.)
CARWL_EXIT = False  # read by ThreadCrawl.run
DOWN_EXIT = False  # read by ThreadDown.run

#####
# Other code
####
if __name__ == '__main__':
    # Crawl stage: hand the listing-page numbers to a pool of crawler threads,
    # which put every download link they find onto data_queue.

    page_queue = Queue(5)
    for i in range(1,6):
        page_queue.put(i)  # store the page numbers in page_queue

    # Collected results (book download links)
    data_queue = Queue()

    # Keep track of the started threads
    thread_crawl = []
    # Start 5 crawler threads
    craw_list = ["採集線程1號","採集線程2號","採集線程3號","採集線程4號","採集線程5號"]

    for thread_name in craw_list:
        c_thread = ThreadCrawl(thread_name, page_queue, data_queue)
        c_thread.start()
        thread_crawl.append(c_thread)

    # Wait until every page number has been claimed. Sleeping between polls
    # avoids a 100%-CPU spin that would compete with the workers for the GIL,
    # and the is_alive() check prevents waiting forever if all workers have
    # already stopped while pages are still queued.
    while not page_queue.empty() and any(t.is_alive() for t in thread_crawl):
        time.sleep(0.1)

    # Nothing left to hand out: tell the crawler threads to leave their loop
    CARWL_EXIT = True
    for thread in thread_crawl:
        thread.join()
        print("抓取線程結束")

上面就是爬取圖書詳情頁面的線程了，我開啓了 5 個線程爬取，頁碼也只爬取了 5 頁，如果你需要更多的，只需要修改：

    # Both the queue capacity (5) and the page range (1-5) bound the crawl;
    # raise them together to crawl more pages.
    page_queue = Queue(5)
    for i in range(1,6):
        page_queue.put(i)  # store the page numbers in page_queue

下面我們把 ThreadCrawl 類編寫完畢：

# One HTMLSession shared by ThreadCrawl.get_list and ThreadCrawl.get_book_url.
session = HTMLSession()

# User-Agent strings to pick from when downloading. The author plans to host
# this list on a server later so it can be fetched remotely; the full list has
# many more entries - look them up in the original source code.
USER_AGENTS = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20"
]
class ThreadCrawl(threading.Thread):
    """Worker thread that collects book download links.

    Each worker repeatedly takes a page number from ``page_queue``, fetches
    that listing page, visits every book detail page linked from it and puts
    the first download link found on each detail page onto ``data_queue``.
    """

    def __init__(self, thread_name, page_queue, data_queue):
        """Store the worker's label and the two queues it works with.

        Args:
            thread_name: Label printed when the worker starts.
            page_queue: Queue of listing-page numbers still to be crawled.
            data_queue: Queue that receives the download links found.
        """
        super().__init__()
        self.thread_name = thread_name
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.page_url = "http://www.allitebooks.com/page/{}"  # URL template

    def run(self):
        """Crawl pages until ``page_queue`` is empty or CARWL_EXIT is set."""
        print(self.thread_name+" 啓動*********")

        while not CARWL_EXIT:
            try:
                page = self.page_queue.get(block=False)
            except Empty:
                # No page numbers left: this worker is done.
                break

            try:
                self.get_list(self.page_url.format(page))
            except Exception as e:
                # A single failed page must not kill the worker: the pages
                # still queued would never be processed. Report and move on.
                print(e)

    def get_list(self, url):
        """Fetch one listing page and process every book linked from it.

        A failure while fetching the listing page itself propagates to the
        caller. A failure on an individual book is printed and skipped so the
        remaining books on the page are still processed.

        Args:
            url: Address of the listing page.
        """
        response = session.get(url)

        # Every book title on the listing page links to its detail page.
        all_link = response.html.find('.entry-title>a')

        for link in all_link:
            try:
                self.get_book_url(link.attrs['href'])
            except Exception as e:
                print(e)

    def get_book_url(self, url):
        """Fetch a book detail page and queue its download link, if any.

        Args:
            url: Address of the book detail page.

        Raises:
            Any exception raised while requesting the page propagates.
        """
        response = session.get(url)
        download_url = response.html.find('.download-links a', first=True)

        if download_url is None:
            # Page offers no download link; nothing to queue.
            return

        link = download_url.attrs['href']
        # Hand the link over to the download stage.
        self.data_queue.put(link)
        print("抓取到{}".format(link))

上述代碼一個非常重要的內容就是把圖書的下載連結存儲到了 data_queue 裏面，這些數據是後面下載線程所需的最基本的數據。

下面開始編寫圖書下載的類和方法。

我開啓了 4 個線程，操作和上面的非常相似。

class ThreadDown(threading.Thread):
    """Worker thread that downloads the books whose links are on ``data_queue``."""

    def __init__(self, thread_name, data_queue):
        """Store the worker's label and the queue of download links.

        Args:
            thread_name: Label printed when the worker starts.
            data_queue: Queue of book download URLs to fetch.
        """
        super().__init__()
        self.thread_name = thread_name
        self.data_queue = data_queue

    def run(self):
        """Download queued links until DOWN_EXIT is set."""
        print(self.thread_name + ' 啓動************')
        while not DOWN_EXIT:
            try:
                # Block briefly instead of spinning on an empty queue; the
                # timeout lets the loop re-check DOWN_EXIT regularly.
                book_link = self.data_queue.get(timeout=0.5)
            except Empty:
                continue

            try:
                self.download(book_link)
            except Exception as e:
                # Keep the worker alive, but do not hide the failure.
                print("{} 下載失敗: {}".format(book_link, e))

    def download(self, url):
        """Save the book at ``url`` into the ``book`` directory.

        Only URLs containing ``.pdf`` or ``.epub`` are downloaded; any other
        URL is ignored.

        Args:
            url: Download link of the book.

        Raises:
            Errors from the HTTP request (including a non-success status) or
            from writing the file propagate to the caller.
        """
        if '.pdf' not in url and '.epub' not in url:
            return

        # The last path segment of the link is used as the file name.
        filename = url.split('/')[-1]
        # Random browser User-Agent
        headers = {"User-Agent": random.choice(USER_AGENTS)}

        # The target directory is fixed; create it if it is missing.
        os.makedirs('book', exist_ok=True)
        path = 'book/' + filename

        print("正在下載 {}".format(filename))
        # Request first and check the status before opening the file, so a
        # failed request neither leaves an empty file nor stores an error page.
        with requests.get(url, stream=True, headers=headers) as response:
            response.raise_for_status()
            with open(path, 'wb') as f:
                for data in response.iter_content(chunk_size=4096):
                    f.write(data)

        print("{}下載完成".format(filename))

if __name__ == '__main__':
    # Download stage. The rest of the script (shown earlier) comes first
    # inside this same guard; data_queue is created there.
    thread_image = []
    image_list = ['下載線程1號', '下載線程2號', '下載線程3號', '下載線程4號']
    for thread_name in image_list:
        d_thread = ThreadDown(thread_name, data_queue)
        d_thread.start()
        thread_image.append(d_thread)

    # Wait until every link has been claimed. Sleeping between polls avoids a
    # 100%-CPU spin that would compete with the download threads, and the
    # is_alive() check prevents waiting forever if no worker is left running.
    while not data_queue.empty() and any(t.is_alive() for t in thread_image):
        time.sleep(0.1)

    # Nothing left to hand out: tell the download threads to leave their loop
    DOWN_EXIT = True
    for thread in thread_image:
        thread.join()
        print("下載線程結束")