(1):分析網頁html
分析ajax的請求網址,和須要的參數。經過不斷向下拉動滾動條,發現請求的參數中offset一直在變化,因此每次請求經過offset來控制新的ajax請求。
(2)上代碼
a、經過ajax請求獲取頁面數據
def get_page_index(offset, keyword):
    """Fetch one page of the search-feed Ajax endpoint.

    The query parameters were captured from the page's own Ajax request;
    ``offset`` advances as the page is scrolled. Returns the response body
    text, or None on a non-200 status or a request error.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab',
    }
    # Serialize the parameter dict into a URL query string.
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        resp = requests.get(url, headers=headers)
    except RequestException:
        print('請求索引頁錯誤')
        return None
    return resp.text if resp.status_code == 200 else None
b、分析ajax請求的返回結果,獲取圖片集的url
def parse_page_index(html):
    """Yield the article URL of every gallery in an index-page JSON body.

    Robustness fixes over the original:
    - tolerates a None/empty body (get_page_index returns None on failure;
      the original crashed in json.loads(None));
    - tolerates a present-but-null 'data' value;
    - skips feed entries without an 'article_url' (e.g. ads) — the original
      yielded None for them, which broke the downstream detail fetch.
    """
    if not html:
        return
    data = json.loads(html)  # parse the returned JSON payload
    if data and 'data' in data:
        for item in data.get('data') or []:
            article_url = item.get('article_url') if item else None
            if article_url:
                yield article_url
c、獲得圖集url後獲取圖集的內容
def get_page_detail(url):
    """Fetch a gallery detail page; return its HTML text, or None on failure."""
    try:
        resp = requests.get(url, headers=headers)
    except RequestException:
        print('詳情頁頁錯誤', url)
        return None
    if resp.status_code != 200:
        return None
    return resp.text
d、其餘看完整代碼
完整代碼:
# -*- coding: utf-8 -*-
# @Author : FELIX
# @Date   : 2018/4/4 12:49
#
# Crawler for Toutiao "street snap" image galleries: pages through the
# search-feed Ajax endpoint, downloads every image, and stores gallery
# metadata in MongoDB.
import json
import os
import re
from hashlib import md5
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup  # noqa: F401 -- kept from the original file
from requests.exceptions import RequestException

MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'
GROUP_START = 1
GROUP_END = 20
KEYWORD = '街拍'

client = pymongo.MongoClient(MONGO_URL)  # connect to MongoDB
db = client[MONGO_DB]  # database is created lazily on first write

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
}


def get_page_index(offset, keyword):
    """Fetch one page of the search-feed Ajax endpoint.

    The query parameters were captured from the page's own Ajax request;
    ``offset`` advances by 20 per page. Returns the body text, or None on
    a non-200 response or request error.
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab',
    }
    # Serialize the dict into a URL query string.
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('請求索引頁錯誤')
        return None


def parse_page_index(html):
    """Yield the article URL of every gallery in an index-page JSON body.

    Tolerates a None/empty body (get_page_index returns None on failure;
    the original crashed in json.loads) and skips feed entries without an
    'article_url' (e.g. ads) -- the original yielded None for those, which
    broke the downstream detail fetch.
    """
    if not html:
        return
    data = json.loads(html)
    if data and 'data' in data:
        for item in data.get('data') or []:
            article_url = item.get('article_url') if item else None
            if article_url:
                yield article_url


def get_page_detail(url):
    """Fetch a gallery detail page; return its HTML text, or None on failure."""
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('詳情頁頁錯誤', url)
        return None


def parse_page_detail(html, url):
    """Extract the title and image URLs from a detail page's inline JS blob.

    Downloads every matched image as a side effect. Returns a dict with
    'title', 'url' and 'images', or None when the page doesn't match.
    """
    images_pattern = re.compile('articleInfo:.*?title: \'(.*?)\'.*?content.*?\'(.*?)\'', re.S)
    result = re.search(images_pattern, html)
    if not result:
        return None
    title = result.group(1)
    url_pattern = re.compile('"(http:.*?)"')
    img_url = re.findall(url_pattern, str(result.group(2)))
    if not img_url:
        return None
    for img in img_url:
        download_img(img)  # download as we go
    return {
        'title': title,
        'url': url,
        'images': img_url,
    }


def save_to_mongo(result):
    """Insert one result dict into MongoDB; return True on success."""
    if result:
        # insert() was removed in PyMongo 4.x; insert_one is the supported call.
        if db[MONGO_TABLE].insert_one(result).acknowledged:
            print('存儲成功', result)
            return True
    return False


def download_img(url):
    """Download a single image and hand its bytes to save_img."""
    print('正在下載', url)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            save_img(response.content)
        else:
            return None
    except RequestException:
        print('下載圖片錯誤', url)
        return None


def save_img(content):
    """Write image bytes under ./imgs, named by MD5 so duplicates collapse."""
    # The original crashed with FileNotFoundError when ./imgs was absent.
    os.makedirs(os.path.join(os.getcwd(), 'imgs'), exist_ok=True)
    file_path = '{}/imgs/{}.{}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def main(offset):
    """Process one index page: fetch, parse, download and persist every gallery."""
    html = get_page_index(offset, KEYWORD)
    if not html:  # index request failed -- original crashed in json.loads(None)
        return
    for url in parse_page_index(html):
        detail = get_page_detail(url)
        print(url, '++++++++++++++++++++++++++++++++++++++++++++++++')
        print(detail)
        if detail:
            save_to_mongo(parse_page_detail(detail, url))


if __name__ == '__main__':
    groups = [i * 20 for i in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    pool.map(main, groups)
    pool.close()  # the original leaked the worker pool
    pool.join()