【Python爬蟲案例學習】分析Ajax請求並抓取今日頭條街拍圖片

1.抓取索引頁內容
利用requests請求目標站點,獲得索引網頁HTML代碼,返回結果。

from urllib.parse import urlencode
from requests.exceptions import RequestException
import requests
'''
遇到不懂的問題?Python學習交流羣:821460695知足你的需求,資料都已經上傳羣文件,能夠自行下載!
'''
def get_page_index(offset, keyword):
    """Fetch one page of the Toutiao search-index Ajax endpoint.

    Args:
        offset: pagination offset (multiples of 20, one page = 20 items).
        keyword: search keyword to query.

    Returns:
        The response body as text on HTTP 200, otherwise None
        (non-200 status or a requests exception).
    """
    headers = { 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' }
    data = {
        'format': 'json',
        'offset': offset,
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1,
        'from': 'search_tab',
        'pd': 'synthesis',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        # BUG FIX: the request itself must live inside the try block.
        # The original issued requests.get() before entering try, so a
        # RequestException was never actually caught.
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('請求索引頁失敗')
        return None

def main():
    """Fetch the first index page for the keyword and print the raw body."""
    print(get_page_index(0, '街拍'))

# Script entry point: fetch and print the first index page.
if __name__=='__main__':
    main()

2.抓取詳情頁內容
解析返回結果,獲得詳情頁的連接,並進一步抓取詳情頁的信息。

  • 獲取頁面網址:
def parse_page_index(html):
  """Yield each article_url found in the index-page JSON payload.

  Args:
      html: JSON text returned by get_page_index(), or None.

  Yields:
      The 'article_url' of every entry under the top-level 'data' key.
      Yields nothing when html is None/empty or has no 'data' list.
  """
  # BUG FIX: get_page_index() can return None; json.loads(None) raises
  # TypeError, so bail out early on a falsy payload.
  if not html:
    return
  data = json.loads(html)
  if data and 'data' in data.keys():
    for item in data.get('data'):
      yield item.get('article_url')
  • 單個頁面代碼:
def get_page_detail(url):
  """Fetch an article detail page.

  Args:
      url: absolute URL of the detail page.

  Returns:
      The page HTML on HTTP 200, otherwise None.
  """
  headers = { 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' }
  try:
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
      return response.text
    return None
  except RequestException:
    # Fixed doubled character in the original message ('詳情頁頁' -> '詳情頁').
    print('請求詳情頁失敗')
    return None
  • 圖片地址
def parse_page_detail(html, url):
  """Extract the title and gallery image URLs from a detail page.

  Side effect: every discovered image URL is downloaded via download_image().

  Args:
      html: detail-page HTML.
      url: the page's URL, echoed back in the result.

  Returns:
      {'title', 'images', 'url'} when a gallery payload is found,
      otherwise None (implicit).
  """
  soup = BeautifulSoup(html, 'lxml')
  title = soup.select('title')[0].get_text()
  # FIX: use a raw string so backslashes reach the regex engine intact,
  # and escape the dots in "JSON.parse" so they match literally.
  images_pattern = re.compile(r'gallery: JSON\.parse\((.*?)\)', re.S)
  result = re.search(images_pattern, html)
  if result:
    # The gallery payload is a JSON string that itself encodes JSON,
    # so it has to be decoded twice to reach the dict.
    data = json.loads(result.group(1))
    data = json.loads(data)
    if data and 'sub_images' in data.keys():
      sub_images = data.get('sub_images')
      images = [item.get('url') for item in sub_images]
      for image in images:
        download_image(image)
      return {
        'title': title,
        'images': images,
        'url': url,
      }

3.下載圖片與保存數據庫
將圖片下載到本地,並把頁面信息及圖片URL保存到MongoDB。

# Persist to the database
def save_to_mongo(result):
  """Insert one result document into MongoDB; return True on success.

  Uses insert_one(): Collection.insert() was deprecated in pymongo 3
  and removed in pymongo 4.
  """
  if db[MONGO_TABLE].insert_one(result):
    print('存儲到MongoDb成功', result)
    return True
  return False

# Download one image
def download_image(url):
  """Download one image and hand its bytes to save_image().

  Always returns None; request failures are logged, not raised.
  """
  print('正在下載', url)
  # FIX: the original UA string contained stray whitespace ("537.    36")
  # from a bad paste; restored the intended "537.36" token.
  headers = { 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' }
  try:
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
      save_image(response.content)
    return None
  except RequestException:
    print('請求圖片失敗', url)
    return None

def save_image(content):
  """Write image bytes to <cwd>/<md5>.jpg.

  Naming the file by content digest dedupes identical images; an existing
  file with the same digest is left untouched.
  """
  # FIX: os.path.join instead of a hand-built '{0}/{1}' path, so the
  # separator is correct on every platform.
  file_path = os.path.join(os.getcwd(), md5(content).hexdigest() + '.jpg')
  if not os.path.exists(file_path):
    with open(file_path, 'wb') as f:
      f.write(content)

4.開啓循環及多線程
對多頁內容遍歷,開啓多線程提升抓取速度。

groups = [x*20 for x in range(GROUP_START, GROUP_END+1)]
pool = Pool()
pool.map(main, groups)

完整代碼:

from urllib.parse import urlencode
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
from hashlib import md5
from multiprocessing import Pool
from config import *
import pymongo
import requests
import json
import re
import os
'''
遇到不懂的問題?Python學習交流羣:821460695知足你的需求,資料都已經上傳羣文件,能夠自行下載!
'''
# Module-level MongoDB handle shared by save_to_mongo().
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

def get_page_index(offset, keyword):
  """Request one page of the Toutiao search-index Ajax API.

  Returns the response body text on HTTP 200, or None when the status
  differs or the request raises.
  """
  params = {
      'format': 'json',
      'offset': offset,
      'keyword': keyword,
      'autoload': 'true',
      'count': 20,
      'cur_tab': 1,
      'from': 'search_tab',
      'pd': 'synthesis',
  }
  ua = { 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' }
  index_url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
  try:
    response = requests.get(index_url, headers=ua)
  except RequestException:
    print('請求索引頁失敗')
    return None
  return response.text if response.status_code == 200 else None

def parse_page_index(html):
  """Yield each article_url found in the index-page JSON payload.

  Args:
      html: JSON text returned by get_page_index(), or None.

  Yields:
      The 'article_url' of every entry under the top-level 'data' key.
      Yields nothing when html is None/empty or has no 'data' list.
  """
  # BUG FIX: get_page_index() can return None; json.loads(None) raises
  # TypeError, so bail out early on a falsy payload.
  if not html:
    return
  data = json.loads(html)
  if data and 'data' in data.keys():
    for item in data.get('data'):
      yield item.get('article_url')

def get_page_detail(url):
  """Fetch an article detail page.

  Args:
      url: absolute URL of the detail page.

  Returns:
      The page HTML on HTTP 200, otherwise None.
  """
  headers = { 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' }
  try:
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
      return response.text
    return None
  except RequestException:
    # Fixed doubled character in the original message ('詳情頁頁' -> '詳情頁').
    print('請求詳情頁失敗')
    return None

def parse_page_detail(html, url):
  """Extract the title and gallery image URLs from a detail page.

  Side effect: every discovered image URL is downloaded via download_image().

  Args:
      html: detail-page HTML.
      url: the page's URL, echoed back in the result.

  Returns:
      {'title', 'images', 'url'} when a gallery payload is found,
      otherwise None (implicit).
  """
  soup = BeautifulSoup(html, 'lxml')
  title = soup.select('title')[0].get_text()
  # FIX: use a raw string so backslashes reach the regex engine intact,
  # and escape the dots in "JSON.parse" so they match literally.
  images_pattern = re.compile(r'gallery: JSON\.parse\((.*?)\)', re.S)
  result = re.search(images_pattern, html)
  if result:
    # The gallery payload is a JSON string that itself encodes JSON,
    # so it has to be decoded twice to reach the dict.
    data = json.loads(result.group(1))
    data = json.loads(data)
    if data and 'sub_images' in data.keys():
      sub_images = data.get('sub_images')
      images = [item.get('url') for item in sub_images]
      for image in images:
        download_image(image)
      return {
        'title': title,
        'images': images,
        'url': url,
      }

def save_to_mongo(result):
  """Insert one result document into MongoDB; return True on success.

  Uses insert_one(): Collection.insert() was deprecated in pymongo 3
  and removed in pymongo 4.
  """
  if db[MONGO_TABLE].insert_one(result):
    print('存儲到MongoDb成功', result)
    return True
  return False

def download_image(url):
  """Download one image and hand its bytes to save_image().

  Always returns None; request failures are logged, not raised.
  """
  print('正在下載', url)
  # FIX: the original UA string contained stray whitespace ("537.    36")
  # from a bad paste; restored the intended "537.36" token.
  headers = { 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' }
  try:
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
      save_image(response.content)
    return None
  except RequestException:
    print('請求圖片失敗', url)
    return None

def save_image(content):
  """Write image bytes to <cwd>/<md5>.jpg.

  Naming the file by content digest dedupes identical images; an existing
  file with the same digest is left untouched.
  """
  # FIX: os.path.join instead of a hand-built '{0}/{1}' path, so the
  # separator is correct on every platform.
  file_path = os.path.join(os.getcwd(), md5(content).hexdigest() + '.jpg')
  if not os.path.exists(file_path):
    with open(file_path, 'wb') as f:
      f.write(content)

def main(offset):
  """Crawl one index page: collect article URLs, scrape each detail page,
  and persist every dict result to MongoDB.
  """
  index_html = get_page_index(offset, KEYWORD)
  for article_url in parse_page_index(index_html):
    detail_html = get_page_detail(article_url)
    if not detail_html:
      continue
    parsed = parse_page_detail(detail_html, article_url)
    if isinstance(parsed, dict):
      save_to_mongo(parsed)
  
if __name__=='__main__':
    # Offsets are multiples of 20 (one index page = 20 items).
    groups = [x*20 for x in range(GROUP_START, GROUP_END+1)]
    # FIX: use the pool as a context manager so worker processes are
    # terminated and joined deterministically instead of being leaked
    # to the garbage collector.
    with Pool() as pool:
        pool.map(main, groups)

config.py

# MongoDB connection settings.
MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'    # database name
MONGO_TABLE = 'jiepai'  # collection name

# Pagination range: crawled offsets run from GROUP_START*20 to GROUP_END*20.
GROUP_START = 1 
GROUP_END = 20

# Search keyword ("street snap").
KEYWORD = '街拍'
~
相關文章
相關標籤/搜索