import requests
from bs4 import BeautifulSoup as bs
import re
import os
import threading


def url_open(url):
    # Fetch a URL with a browser-like User-Agent and a 20-second timeout.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER'}
    res = requests.get(url, headers=headers, timeout=20)
    return res


def save(url):  # handles one thread-page link
    res = url_open(url)
    res.encoding = 'gbk'
    soup = bs(res.text, 'lxml')
    # Thread title, used as a filename prefix; strip characters that are
    # illegal in filenames so open() does not fail.
    title = soup.find('title').text.split('-')[0].strip()
    title = re.sub(r'[\\/:*?"<>|]', '_', title)
    # os.mkdir(title)
    # os.chdir(title)
    temp = soup.find_all('tr', class_='tr3')
    imgs = re.findall(r'data-src="(.*?jpg)" type', str(temp))

    for each in imgs:
        filename = each.split('/')[-1]
        img = url_open(each)
        print('saving... %s' % filename)
        with open(title + filename, 'wb') as f:
            f.write(img.content)
    # os.chdir('..')


if __name__ == '__main__':
    os.makedirs('1024', exist_ok=True)
    os.chdir('1024')
    # Only the first listing page is crawled by default (mind your health,
    # after all); if you need multiple pages, adding a for loop yourself is
    # no big deal. See the sketch at the end of the file.
    url = 'https://cl.e7s.win/thread0806.php?fid=16&search=&page=1'
    # Links parsed from the listing are relative, so this prefix must be
    # prepended before they can be opened. Experience says the prefix changes
    # from time to time; if the script stops working one day, check whether
    # it has changed.
    urlhead = 'https://cl.e7s.win/'
    res = url_open(url)
    res.encoding = 'gbk'

    # Find every thread link on the listing page.
    soup = bs(res.text, 'lxml')
    temp = soup.find_all('td', class_='tal')
    links = []
    for each in temp:
        links.append(urlhead + each.h3.a.get('href'))
    # del links[0:10]

    # Download each thread in its own thread, then wait for all of them.
    downloads = []
    for each in links:
        print(each)
        down = threading.Thread(target=save, args=(each,))
        downloads.append(down)
        down.start()
    for each in downloads:
        each.join()
    print('Done')
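

# --- Sketch: crawling more than one listing page ---
# The __main__ block above fetches only page=1, and the comment there suggests
# adding a for loop yourself for more pages. Below is a minimal sketch of that
# loop, assuming every listing page uses the same markup as page 1.
# crawl_pages is a hypothetical helper (not part of the original script), and
# the default page range is an arbitrary example.
def crawl_pages(urlhead, first_page=1, last_page=3):
    links = []
    for page in range(first_page, last_page + 1):
        page_url = urlhead + 'thread0806.php?fid=16&search=&page=%d' % page
        res = url_open(page_url)
        res.encoding = 'gbk'
        soup = bs(res.text, 'lxml')
        for td in soup.find_all('td', class_='tal'):
            if td.h3 and td.h3.a:  # skip cells without a thread link
                links.append(urlhead + td.h3.a.get('href'))
    return links
# Usage (replacing the single-page parsing in __main__):
#     links = crawl_pages(urlhead, 1, 3)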