#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/7/10 22:34
# @Author : chenxiaowei
# @Email : chen1020xiaowei@163.com
# @File : parse_meinv.py
### Scrape the beauty-wallpaper images from *** by matching the page HTML with regular expressions; save the related metadata to MongoDB and store the images on the local disk ###
8 from parse_config import *
9 import requests 10 import re 11 import json 12 import time 13 import pymongo 14 from requests.exceptions import RequestException 15 import hashlib 16
db_client = pymongo.MongoClient(mongo_url)  # create the MongoDB client for the configured server address
db = db_client[mongo_database]  # handle to the configured database, used by save_to_mongo()
19
20
def get_responses(url, timeout=10):
    """Fetch ``url`` and return the response body as text.

    The request is sent with the module-level ``headers``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection, which would hang the whole crawl.

    Returns:
        The decoded body when the server answers with status 200, otherwise
        ``None`` (non-200 status, or any ``RequestException`` such as a
        connection error or timeout).
    """
    try:
        responses = requests.get(url, headers=headers, timeout=timeout)
        if responses.status_code == 200:  # only a 200 answer counts as success
            return responses.text
        return None
    except RequestException:  # base class of every requests error, timeouts included
        print('error1')
        return None
32
def get_image_content(url, timeout=10):
    """Download ``url`` and return the raw response bytes.

    The request is sent with the module-level ``headers``.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as ``bytes`` when the server answers with status
        200, otherwise ``None`` (non-200 status, or any ``RequestException``
        such as a connection error or timeout).
    """
    try:
        responses = requests.get(url, headers=headers, timeout=timeout)
        if responses.status_code == 200:
            return responses.content  # .content is the undecoded binary payload
        return None
    except RequestException:  # base class of every requests error, timeouts included
        print('error2')
        return None
44
def download_image(content):
    """Write downloaded image bytes to the local image directory.

    The file is named after the MD5 hex digest of ``content`` with a ``jpg``
    extension, so identical payloads map to the same file name. The target
    path is ``path_image`` formatted with that name.

    Args:
        content: Raw image bytes, or ``None`` when the download failed.

    Returns:
        None. Nothing is written when ``content`` is ``None``.
    """
    if content is None:
        # get_image_content() reports a failed download as None; hashing None
        # would raise TypeError and abort the crawl.
        return
    filename = '{0}.{1}'.format(hashlib.md5(content).hexdigest(), 'jpg')
    # 'wb': the payload is binary; the with-block closes the file on exit.
    with open(path_image.format(filename), 'wb') as f:
        f.write(content)
    print(filename, '下載成功!')
52
# One match per <li> entry; the five groups are, in order: link href, image
# src, <span> text, "IcoList" text and "IcoTime" text. re.S lets '.' span
# line breaks inside an entry.
_ITEM_PATTERN = re.compile(
    '<li>.*?<a.*?href="(.*?)".*?class="TypeBigPics".*?src="(.*?)".*?<span>(.*?)</span>'
    '.*?class="IcoList">(.*?)</em>.*?class="IcoTime">(.*?)</em>', re.S)


def get_url_items(html):
    """Yield one record per list entry found in ``html`` and download its image.

    This is a generator: the image belonging to a record is downloaded only
    when the consumer asks for the next record (the code after ``yield`` runs
    on resumption).

    Args:
        html: Source of a listing page, or ``None``/empty when the page could
            not be fetched.

    Yields:
        dict with the keys '名稱', '壁紙', '網址', '發佈日期' and '查看次數',
        filled from the regex groups.
    """
    if not html:
        # get_responses() returns None on failure; searching None would raise TypeError.
        return
    for item in _ITEM_PATTERN.findall(html):
        yield {
            '名稱': item[2],
            '壁紙': item[1],
            '網址': item[0],
            '發佈日期': item[4],
            # drops the first three characters — presumably a label prefix; confirm against the page markup
            '查看次數': item[3][3:],
        }
        content = get_image_content(item[1])
        if content is not None:  # skip images whose download failed
            download_image(content)
68
def save_to_file(filename, file_type, text):
    """Append ``text`` as one JSON line to a local file.

    The target path is ``path_txt`` + ``filename`` + ``file_type``.

    Args:
        filename: Base name of the target file.
        file_type: Suffix appended directly after ``filename``.
        text: JSON-serialisable object to record.
    """
    target = '{}{}{}'.format(path_txt, filename, file_type)
    # Append mode with UTF-8 encoding; the with-block closes the file, so no
    # explicit close() is needed.
    with open(target, 'a', encoding='utf-8') as wf:
        # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes
        wf.write(json.dumps(text, ensure_ascii=False) + '\n')
    print(text, '寫入到本地成功!')
75
def save_to_mongo(text):
    """Insert ``text`` as one document into the configured MongoDB collection.

    The collection is ``db[mongo_table]``.

    Args:
        text: Document (dict) to store.

    Returns:
        True when the server acknowledged the insert, False otherwise.
    """
    # Collection.insert() was deprecated in PyMongo 3.0 and removed in 4.0;
    # insert_one() is the supported single-document replacement.
    result = db[mongo_table].insert_one(text)
    if result.acknowledged:
        print(text, '寫入Mongo成功!')
        return True
    return False
82
def main(filename, page):
    """Scrape one listing page and persist every item found on it.

    Args:
        filename: Base name of the local text file the items are appended to.
        page: Page number substituted into the listing URL.
    """
    url = 'http://www.***/bizhitupian/meinvbizhi/{}.htm'.format(page)
    html = get_responses(url)
    if html is None:
        # get_responses() returns None on a failed request; skip the page
        # instead of letting the regex search raise TypeError on None.
        print('page {} could not be fetched, skipped'.format(page))
        return
    for item in get_url_items(html):
        # File first, Mongo second — presumably because the Mongo insert adds
        # an '_id' value that json.dumps cannot serialise; keep this order.
        save_to_file(filename, file_type, item)
        save_to_mongo(item)
91
if __name__ == '__main__':
    # Crawl every listing page in the configured range, both ends inclusive.
    for page in range(start_page, end_page + 1):
        # Pass the extension-less ``filename``: save_to_file() appends
        # ``file_type`` itself, so passing ``file`` (which already ends in
        # '.txt') produced a '.txt.txt' output file.
        main(filename, page)
        time.sleep(15)  # wait 15 seconds between pages to avoid being flagged as a bot
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/7/10 22:35
# @Author : chenxiaowei
# @Email : chen1020xiaowei@163.com
# @File : parse_config.py
mongo_url = 'localhost'  # MongoDB server address handed to pymongo.MongoClient
mongo_database = 'youmeiwang'  # name of the database to use
mongo_table = 'meinv'  # name of the collection the scraped items are inserted into
# HTTP headers sent with every request made by the scraper
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.15 Safari/537.36'
}
file = '***美女圖片.txt'  # output file name including its extension
path_image = 'H:/Python_download/20180710/image/{}'  # template; {} is replaced by the image file name
path_txt = 'H:/Python_download/20180710/file/'  # prefix prepended to the text output file name
filename = '***美女圖片'  # output file name without extension
file_type = '.txt'  # suffix appended after the file name when writing the text output
start_page = 1  # first listing page to crawl
end_page = 44  # last listing page to crawl (inclusive)