import os
import shutil

import requests
from bs4 import BeautifulSoup
def parse_page(url):
    """Download the page at *url* and parse it into a soup object.

    :param url: page URL to fetch
    :return: BeautifulSoup object built from the response body with the
        built-in ``html.parser`` backend
    """
    page = requests.get(url).content
    return BeautifulSoup(page, 'html.parser')
def parse_page_num(soup):
    """Parse a gallery page and return the total number of pages.

    The pager renders as ``<div class="page"><span>current / total</span>``;
    the total is the second field of the span text split on ``' / '``.

    :param soup: BeautifulSoup object of a gallery page
    :return: total page count, or 0 when the pager is missing or malformed
    """
    total_page_num = 0
    page_div = soup.find('div', attrs={'class': 'page'})
    if page_div and page_div.span:
        page_span_str = page_div.span.string
        page_num_list = page_span_str.split(' / ')
        if len(page_num_list) == 2:
            total_page_num = int(page_num_list[1])
    return total_page_num
def parse_pic_names(soup):
    """Parse a gallery page and return the picture names it lists.

    Each ``<a class="mark">`` carries an href like ``/photo/<name>?...``;
    the picture name is the last path component after stripping the query
    string.

    :param soup: BeautifulSoup object of a gallery page
    :return: list of picture name strings (possibly empty)
    """
    pic_names = []
    for pic_a in soup.find_all('a', attrs={'class': 'mark'}):
        # Drop the query string, then keep the final path component.
        pic_url = pic_a['href'].split('?')[0]
        pic_names.append(pic_url.split('/')[-1])
    return pic_names
def main():
    """Crawler entry point.

    Walks every gallery page of bing.ioliu.cn, collects all picture names,
    then downloads each picture at 1920x1080 into ``./gallery/``.
    """
    print('---------- Crawling Start ----------')
    base_page_url = 'https://bing.ioliu.cn'
    # Placeholders: picture name, resolution. The underscore between them was
    # lost in the original paste (markdown stripped it); restored to match the
    # '<name>_<resolution>.jpg' file-name scheme used below.
    base_pic_url = 'http://h1.ioliu.cn/bing/%s_%s.jpg'
    all_pic_names = []
    soup = parse_page(base_page_url)
    # Total number of gallery pages.
    total_page_num = parse_page_num(soup)
    for page in range(total_page_num):
        print('Processing page: %s' % (page + 1))
        page_url = base_page_url + '/?p=' + str(page + 1)
        soup = parse_page(page_url)
        # Collect all picture names on the current page.
        pic_names = parse_pic_names(soup)
        all_pic_names.extend(pic_names)
    # Iterate over every picture name, fetch and save the image.
    resolution = '1920x1080'
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    }
    # Create the save directory; exist_ok makes re-runs safe (plain os.mkdir
    # raised FileExistsError the second time).
    file_dir = './gallery/'
    os.makedirs(file_dir, exist_ok=True)
    for pic_name in all_pic_names:
        img_url = base_pic_url % (pic_name, resolution)
        pic_file_name = pic_name + '_' + resolution + '.jpg'
        print('Get %s from %s' % (pic_file_name, img_url))
        img_stream = requests.get(img_url, stream=True, headers=headers)
        if img_stream.status_code == 200:
            # Stream the raw response straight to disk without buffering
            # the whole image in memory.
            with open(file_dir + pic_file_name, 'wb') as fw:
                shutil.copyfileobj(img_stream.raw, fw)
    # Crawl finished.
    print('---------- Crawling End ----------')
# Script entry guard; the double underscores were stripped by the paste that
# produced "if name == 'main':".
if __name__ == '__main__':
    main()