The code in the article linked above explains the approach very clearly, and my basic idea is the same. The code in this post only adds some exception handling and tidies up the log output; I am mainly writing it down as a note so it is easy to look up later. The changes are as follows:
1. Exception handling: the relevant spots are called out separately in the code below.
2. The multi-threaded version uses the multiprocessing library, so freeze_support() must be called at the start of main; otherwise process creation fails at runtime after the script is packaged into an exe (a minimal sketch follows this list).
3. The multi-threaded version also adds a command-line option for setting the number of workers.
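To make points 2 and 3 concrete, here is a minimal standalone sketch rather than an excerpt of the full script below: `worker` is only a placeholder function, and the worker count is read from the first command-line argument, e.g. `python crawler.py 4`.

```python
# Minimal sketch of points 2 and 3: freeze_support() plus a command-line worker count.
# `worker` is a placeholder; the real script hands its Download function to the pool instead.
import sys
from multiprocessing import Pool, freeze_support

def worker(n):
    return n * n

if __name__ == '__main__':
    freeze_support()  # must run first, or the packaged exe fails when spawning processes
    count = int(sys.argv[1]) if len(sys.argv) >= 2 else 1  # e.g. `python crawler.py 4`
    with Pool(count) as pool:
        print(pool.map(worker, range(10)))
```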
```python
#coding=utf-8
import requests
from bs4 import BeautifulSoup
import os

all_url = 'http://www.mzitu.com'

# HTTP request headers
Hostreferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://www.mzitu.com'
}
# This Referer header bypasses the site's hotlink protection
Picreferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://i.meizitu.net'
}

start_html = requests.get(all_url, headers=Hostreferer)

# Save location
path = os.getcwd() + '/mzitu/'

# Find the maximum number of list pages
soup = BeautifulSoup(start_html.text, "html.parser")
page = soup.find_all('a', class_='page-numbers')
max_page = page[-2].text

same_url = 'http://www.mzitu.com/page/'
for n in range(1, int(max_page) + 1):  # iterate over the list pages
    ul = same_url + str(n)
    start_html = requests.get(ul, headers=Hostreferer)
    soup = BeautifulSoup(start_html.text, "html.parser")
    all_a = soup.find('div', class_='postlist').find_all('a', target='_blank')
    for a in all_a:  # every gallery on this page
        title = a.get_text()  # extract the title text
        if title != '':
            print("Preparing to fetch: " + title)

            # Windows cannot create directories containing '?'
            if os.path.exists(path + title.strip().replace('?', '')):
                # print('Directory already exists')
                flag = 1
            else:
                os.makedirs(path + title.strip().replace('?', ''))
                flag = 0
            os.chdir(path + title.strip().replace('?', ''))
            href = a['href']
            html = requests.get(href, headers=Hostreferer)
            mess = BeautifulSoup(html.text, "html.parser")
            pic_max = mess.find_all('span')
            pic_max = pic_max[10].text  # number of pictures in this gallery
            if flag == 1 and len(os.listdir(path + title.strip().replace('?', ''))) >= int(pic_max):
                print('Already downloaded, skipping')
                continue
            for num in range(1, int(pic_max) + 1):  # every picture in the gallery
                pic = href + '/' + str(num)
                html = requests.get(pic, headers=Hostreferer)
                mess = BeautifulSoup(html.text, "html.parser")
                pic_url = mess.find('img', alt=title)

                # some img tags have no src attribute, which would raise an exception; filter them out here
                if 'src' not in pic_url.attrs:
                    continue
                print(pic_url['src'])
                html = requests.get(pic_url['src'], headers=Picreferer)
                file_name = pic_url['src'].split(r'/')[-1]
                with open(file_name, 'wb') as f:
                    f.write(html.content)
            print('Done')
    print('Page', n, 'finished')
```
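The exception handling in this script is limited to skipping img tags without a src attribute. If you also want the network calls themselves to survive timeouts and transient errors, a hedged sketch of a retry wrapper is shown below; safe_get is a hypothetical helper that is not part of the original script, and you would call it in place of the bare requests.get lines.

```python
# Hypothetical retry wrapper around requests.get; not part of the original script.
import time
import requests

def safe_get(url, headers, retries=3, timeout=10):
    """Return the response, or None if every attempt fails."""
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            resp.raise_for_status()  # treat HTTP error codes as failures too
            return resp
        except requests.RequestException as exc:
            print(f'Request failed ({attempt + 1}/{retries}): {exc}')
            time.sleep(1)  # brief pause before retrying
    return None
```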
```python
#coding=utf-8
import requests
from bs4 import BeautifulSoup
import os
from multiprocessing import Pool
from multiprocessing import freeze_support
import sys

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36',
    'Referer': 'http://www.mzitu.com'
}
Picreferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://i.meizitu.net'
}

def find_MaxPage():
    all_url = 'http://www.mzitu.com'
    start_html = requests.get(all_url, headers=header)
    # Find the maximum number of list pages
    soup = BeautifulSoup(start_html.text, "html.parser")
    page = soup.find_all('a', class_='page-numbers')
    max_page = page[-2].text
    return max_page

def Download(href, title, path):
    html = requests.get(href, headers=header)
    soup = BeautifulSoup(html.text, 'html.parser')
    pic_max = soup.find_all('span')
    pic_max = pic_max[10].text  # number of pictures in this gallery
    if (os.path.exists(path + title.strip().replace('?', ''))
            and len(os.listdir(path + title.strip().replace('?', ''))) >= int(pic_max)):
        print('Already downloaded, moving on to the next gallery: ' + title)
        return 1
    print(f"Found {pic_max} pictures, preparing: " + title)
    # exist_ok=True so a partially downloaded directory does not raise FileExistsError
    os.makedirs(path + title.strip().replace('?', ''), exist_ok=True)
    os.chdir(path + title.strip().replace('?', ''))
    for num in range(1, int(pic_max) + 1):
        pic = href + '/' + str(num)
        html = requests.get(pic, headers=header)
        mess = BeautifulSoup(html.text, "html.parser")
        pic_url = mess.find('img', alt=title)
        # some img tags have no src attribute, which would raise an exception; filter them out here
        if 'src' not in pic_url.attrs:
            continue
        print(f"{title}: {pic_url['src']}")
        html = requests.get(pic_url['src'], headers=Picreferer)  # use the anti-hotlink Referer for the image itself
        file_name = pic_url['src'].split(r'/')[-1]
        with open(file_name, 'wb') as f:
            f.write(html.content)
    print('Gallery finished: ' + title)

if __name__ == '__main__':
    freeze_support()  # prevents process-creation failures when the script is packaged as an exe

    # number of worker processes in the pool
    count = 1
    if len(sys.argv) >= 2:
        count = int(sys.argv[1])

    pool = Pool(count)
    print(f'Starting {count} download worker(s)')

    # Save location
    path = os.getcwd() + '/mzitu_mutil/'
    max_page = find_MaxPage()  # total number of list pages to walk
    print(f'Found {max_page} pages, please wait for the download to finish')
    same_url = 'http://www.mzitu.com/page/'

    for n in range(1, int(max_page) + 1):
        each_url = same_url + str(n)
        start_html = requests.get(each_url, headers=header)  # request one list page
        soup = BeautifulSoup(start_html.text, "html.parser")
        all_a = soup.find('div', class_='postlist').find_all('a', target='_blank')
        for a in all_a:  # every gallery on this page
            title = a.get_text()  # extract the title text
            if title != '':
                href = a['href']  # the gallery's URL
                pool.apply_async(Download, args=(href, title, path))
    pool.close()
    pool.join()
    print('All downloads finished')
```
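To run the multi-process version with, say, four workers, pass the count as the first command-line argument, for example `python mzitu_multi.py 4` (that filename is only an assumption; use whatever you saved the script as). Without an argument it falls back to a single worker. Note that Pool spawns worker processes rather than threads, so the "thread count" mentioned above really means the number of worker processes.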
Resource download: Python mzitu image crawler, single-threaded and multi-threaded versions
Reprint notice: unless otherwise stated, all articles on this site are original and all rights are reserved. Please credit 朝十晚八 when reprinting.