I've recently been learning web scraping with Python and have been having a lot of fun with it, so I'm writing this blog post to record and share what I've done.
You will need to install the following modules first.
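Of the imports in the script below, only requests and beautifulsoup4 (imported as bs4) are third-party packages; concurrent.futures, argparse, re, and os ship with the standard library, so installing those two with pip is normally enough.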
from concurrent.futures import ThreadPoolExecutor
import requests, argparse, re, os
from bs4 import BeautifulSoup as Soup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:54.0) Gecko/20100101 Firefox/54.0'
}


## Set up the command-line arguments
def setArgs():
    parser = argparse.ArgumentParser(description="Download the PDFs linked from a page")
    parser.add_argument("url", help="target url")
    parser.add_argument("-t", "--thread", help="maximum number of threads (default: 3)", default=3, type=int)
    parser.add_argument("-f", "--filedir",
                        help="directory to save files in (default: a 'downloads' folder under the current directory, created automatically if it does not exist)",
                        default="downloads")
    return parser.parse_args()


## Collect the URLs of all PDFs on the page
def getPdfUrl(root_url):
    response = requests.get(root_url, headers=headers)
    ## If requests did not get a character encoding from the page, fall back to utf-8
    if "charset" not in response.headers:
        response.encoding = "utf-8"
    bsObj = Soup(response.text, "html.parser")
    pdfs = bsObj.find_all("a", {"href": re.compile(r'\.pdf$')})
    ## Build a dict whose keys are the full PDF urls and whose values are the PDF names
    url_pdfName = {root_url[:root_url.rfind("/") + 1] + pdf["href"]: pdf.string for pdf in pdfs}
    return url_pdfName


## Show the name of the PDF currently being downloaded
def showPdf(pdf_name):
    print(pdf_name + "...")


## Download one PDF
def savePdf(url, pdf_name):
    response = requests.get(url, headers=headers, stream=True)
    ## Create the target directory if it does not exist yet
    if not os.path.exists(FILE_DIR):
        os.makedirs(FILE_DIR)
    ## os.path.join(a, b, ...) inserts the separator (\\ on Windows) if a does not already end with one
    with open(os.path.join(FILE_DIR, pdf_name), "wb") as pdf_file:
        for content in response.iter_content():
            pdf_file.write(content)


## Everything needed to download one PDF; this is the unit of work for each thread
def downOne(url, pdf_name):
    showPdf(pdf_name)
    savePdf(url, pdf_name)
    print(pdf_name + " has been downloaded!!")


## Start the thread pool
def downPdf(root_url, max_thread):
    url_pdfName = getPdfUrl(root_url)
    with ThreadPoolExecutor(max_thread) as executor:
        executor.map(downOne, url_pdfName.keys(), url_pdfName.values())


def main():
    ## Parse the arguments
    args = setArgs()
    ## If the required argument is missing, exit and show the brief help message
    try:
        global FILE_DIR
        FILE_DIR = args.filedir
        downPdf(args.url, args.thread)
    except:
        exit()


if __name__ == "__main__":
    main()
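The script is invoked from the command line with the page URL plus the optional -t/--thread and -f/--filedir flags defined in setArgs(). The core download step in savePdf relies on requests' streaming mode so that large PDFs are written to disk chunk by chunk instead of being held in memory all at once. Here is a minimal, self-contained sketch of that pattern on its own; the URL, file path, and function name are placeholders of mine, not part of the original script:

import os
import requests

def stream_download(url, dest_path, chunk_size=8192):
    """Download url to dest_path without loading the whole body into memory."""
    # Make sure the destination directory exists (no-op if it already does).
    os.makedirs(os.path.dirname(dest_path) or ".", exist_ok=True)
    with requests.get(url, stream=True, timeout=30) as response:
        response.raise_for_status()
        with open(dest_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)

# Placeholder URL and path, for illustration only:
# stream_download("https://example.com/paper.pdf", "downloads/paper.pdf")

Passing an explicit chunk_size to iter_content is just a throughput tweak; the blog's version, which iterates byte by byte, works the same way but more slowly.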
Examples:
with ThreadPoolExecutor(max_thread) as executor:
    executor.map(downOne, url_pdfName.keys(), url_pdfName.values())
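Like the built-in map, executor.map accepts several iterables and passes one element from each to the function per call, so downOne receives matching (url, pdf_name) pairs while the pool spreads the calls across its worker threads. A small standalone sketch of that behaviour, with illustrative names of my own rather than the script's:

from concurrent.futures import ThreadPoolExecutor

def greet(name, city):
    return f"{name} lives in {city}"

names = ["Ada", "Linus"]
cities = ["London", "Helsinki"]

with ThreadPoolExecutor(max_workers=2) as executor:
    # Pairs names[i] with cities[i], just as downOne receives (url, pdf_name) pairs.
    for result in executor.map(greet, names, cities):
        print(result)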
global FILE_DIR
FILE_DIR = args.filedir
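The global statement is needed because assigning to a name inside a function would otherwise create a new local variable; declaring FILE_DIR global makes the assignment rebind the module-level name that savePdf later reads. A minimal illustration, with variable and function names that are mine, not the script's:

FILE_DIR = "downloads"   # module-level default

def set_dir_wrong(path):
    FILE_DIR = path      # creates a local variable; the module-level value is untouched

def set_dir_right(path):
    global FILE_DIR
    FILE_DIR = path      # rebinds the module-level name

set_dir_wrong("other")
print(FILE_DIR)          # downloads
set_dir_right("other")
print(FILE_DIR)          # other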