因為有些論壇不登入驗證就無法查看帖子的內容和附件,所以需要登入驗證。如果在程式碼中加入登入驗證的流程,就會增加程式碼的複雜度,因此可以結合 cookie 來登入,然後爬取需要的資料(Python):
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
Download the attachments linked from a forum thread page.

Every request carries a hard-coded ``Cookie`` header (see ``headers``)
instead of performing a login inside the script.

@author:Aiker Zhao
@file:jianli.py
@time:10:50 PM
"""
import os
import re
from hashlib import md5

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

# NOTE(review): the Cookie value looks like a live session credential that is
# committed to source; consider loading it from the environment instead.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    'Cookie': 'PHPSESSID=ui7unwxc3yf4glbdaqmat2ee00; __cfduid=de4fa38a23ad640f0bcdb4313560af62e1543723208; ape__Session=ui7uxxxxdfd4glbdaqmat2ee00; _ga=GA1.2.176343230234.1552443854'
}

# Seconds to wait on the server before a request is abandoned; without a
# timeout a stalled connection would block the script forever.
REQUEST_TIMEOUT = 30

# Default directory that downloaded attachments are written to.
SAVE_DIR = 'z:\\jianli2\\'

# Matches an anchor whose first attribute is an href under the attachment
# path; captures the link target and the anchor's inner markup.
# NOTE(review): the host here differs from the page URL in get_content();
# presumably one of them was redacted - confirm before running.
ATTACHMENT_LINK = re.compile(r'<a\shref="(http://ask.apelearn.com/file.*?)".*?>(.*?)</a>', re.S)

# Characters stripped from attachment names before they are used as file
# names: the ones Windows forbids plus the punctuation the script always removed.
UNSAFE_NAME_CHARS = re.compile(r'[\\/:*?"<>|!【】]')


def get_content():
    """Yield one ``{'url': ..., 'name': ...}`` dict per attachment link found.

    Fetches the thread page, parses it with BeautifulSoup and runs
    ``ATTACHMENT_LINK`` over the serialized form of every ``<a>`` tag.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched.
    """
    url = 'http://ask.xxxx.com/question/xxxx'  # thread page to scrape
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # The paperclip icon markup is removed before parsing, presumably so it
    # does not end up inside the captured attachment name.
    html = response.text.replace('<i class="fa fa-paperclip"></i>', '')
    soup = BeautifulSoup(html, 'lxml')
    for anchor in soup.find_all('a'):
        for link, name in ATTACHMENT_LINK.findall(str(anchor)):
            yield {
                'url': link,
                'name': name
            }


def download_doc(url, name):
    """Download ``url`` and hand the body to ``save_doc`` under ``name``.

    Request failures and non-200 responses are reported on stdout and
    otherwise ignored so that one bad link does not stop the whole run.

    Returns:
        None, always.
    """
    print('正在下載', name, url)
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except RequestException:
        print('請求文檔出錯', url)
        return None
    if response.status_code == 200:
        save_doc(response.content, name)
    else:
        # Previously dropped silently; report it so missing files are explainable.
        print('請求文檔出錯', url, response.status_code)
    return None


def save_doc(content, name, directory=SAVE_DIR):
    """Write ``content`` to ``directory`` under a name derived from ``name``.

    The file name is ``<stem><md5 of content><extension>``, so identical
    content downloaded twice maps to the same path and is written only once.

    Args:
        content: raw bytes of the downloaded document.
        name: attachment name as captured from the page; nothing is written
            when it is empty.
        directory: target directory, created if missing.
    """
    if not name:
        return
    # Sanitise the whole name (extension included) before splitting it, and
    # split on the last dot only so "a.b.doc" keeps its full stem and a name
    # without any dot gets no bogus extension.
    safe_name = UNSAFE_NAME_CHARS.sub('', name).strip()
    stem, extension = os.path.splitext(safe_name)
    file_name = stem + md5(content).hexdigest() + extension
    file_path = os.path.join(directory, file_name)
    try:
        os.makedirs(directory, exist_ok=True)
        if not os.path.exists(file_path):
            with open(file_path, 'wb') as f:
                f.write(content)
    except OSError as err:
        # Best effort: keep going with the next attachment, but say what failed.
        print('保存文檔出錯', file_path, err)


def main():
    """Download every attachment that ``get_content`` finds."""
    for attachment in get_content():
        download_doc(attachment['url'], attachment['name'])


if __name__ == '__main__':
    main()