爬蟲——requests.get爬蟲模塊參數

地址和請求頭參數--url和header

res = requests.get(url,headers=headers)  向網站發起請求,並獲取響應對象html

參數

  • url :須要抓取的URL地址
  • headers : 請求頭
  • timeout : 超時時間,超過期間會拋出異常

響應對象(res)屬性

  • encoding :響應字符編碼 res.encoding = 'utf-8'
  • text :字符串 網站源碼
  • content :字節流 字符串網站源碼
  • status_code :HTTP響應碼
  • url :實際數據的URL地址
import requests

url = 'http://www.baidu.com/'    # demo target: the Baidu home page
# BUG FIX: the original wrote this literal with a backslash line continuation,
# which embedded the next line's indentation spaces inside the User-Agent value.
# Adjacent string literals concatenate cleanly instead.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 '
                         '(KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1'}

res = requests.get(url, headers=headers)
print(res.encoding)     # encoding declared by the server, e.g. ISO-8859-1

# text attribute: response body decoded to str (set res.encoding first)
res = requests.get(url, headers=headers)
res.encoding = 'utf-8'
html = res.text

# content attribute: raw response bytes; decode manually
res = requests.get(url, headers=headers)
html = res.content.decode('utf-8')

print(res.status_code)      # HTTP status code, e.g. 200
print(res.url)              # final URL after redirects, e.g. https://www.baidu.com/

非結構化數據的保存方式

像壓縮文件zip、圖片文件等均可以使用非結構化數據的保存方式

with open('xxx.jpg','wb') as f:
    f.write(res.content)

示例:保存趙麗穎圖片到本地

import requests

# Direct link to the target image plus a minimal request header.
url = 'http://dingyue.nosdn.127.net/lL1JH2YdpAWrzEhfp8BrJ8lTHa1602AEX9E7qpTpH5NzW1535203788506compressflag.jpg'
headers = {'User-Agent': 'Mozilla/5.0'}

# Download the raw image bytes.
response = requests.get(url, headers=headers)
html = response.content

# Persist the bytes to a local file ('wb' because this is binary data).
with open('趙麗穎.jpg', 'wb') as f:
    f.write(html)

百度貼吧圖片抓取

目標:抓取指定貼吧全部圖片

思路

  1. 獲取貼吧主頁URL,下一頁,找到不一樣頁的URL規律
  2. 獲取1頁中全部帖子URL地址: [帖子連接1,帖子連接2,...]
  3. 對每一個帖子連接發請求,獲取圖片URL
  4. 向圖片的URL發請求,以wb方式寫入本地文件

貼吧URL規律:http://tieba.baidu.com/f?kw=??&pn=50

xpath表達式

一、帖子連接xpath,這裏爲何屬性選擇class,由於相同的元素他們要相同的樣式

//div[@class="t_con cleafix"]/div/div/div/a/@href

二、圖片連接xpath

//div[@class="d_post_content j_d_post_content  clearfix"]/img[@class="BDE_Image"]/@src

三、視頻連接xpath

//div[@class="video_src_wrapper"]/embed/@data-video

# 注意: 此處視頻連接前端對響應內容作了處理,須要查看網頁源代碼來查看,複製HTML代碼在線格式化

百度貼吧視頻抓取反爬機制(對響應內容作處理)

網頁源代碼是:

<div class="video_src_wrapper">
   <embed data-video="http://tb-video.bdstatic.com/tieba-smallvideo-transcode-cae/2754153_8fcd225842344de9901c1489e49defbe_0_cae.mp4"

F12調試定位到的代碼是:

<div class="video_src_wrapper">
    <div class="video_src_wrap_main">
        <video src="http://tb-video.bdstatic.com/tie-cae/f2358e8_0_cae.mp4" "></video>
    </div>
</div>

若是經過F12定位的位置,寫xpath,會爬取不到,由於咱們requests爬取的是網頁代碼,最後仍是要以網頁源代碼爲主。

import requests
from lxml import etree
import random
import time
from urllib import parse


class BaiduImageSpider(object):
    """Crawl a Baidu Tieba forum and save every image/video from its posts to disk."""

    def __init__(self):
        # Listing-page template: kw = URL-encoded forum name, pn = (page-1)*50.
        self.url = 'http://tieba.baidu.com/f?kw={}&pn={}'
        # Pool of User-Agent strings; one is chosen at random per request.
        # BUG FIX: the original third entry used a backslash continuation inside
        # the literal, which embedded the indentation whitespace into the header.
        self.ua_list = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
            ('Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; '
             '.NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)'),
        ]

    # Fetch one page and return its decoded HTML text.
    def get_html(self, url):
        headers = {'User-Agent': random.choice(self.ua_list)}
        # 'ignore' drops undecodable bytes rather than raising UnicodeDecodeError.
        html = requests.get(url=url, headers=headers).content.decode('utf-8', 'ignore')
        return html

    # Run an XPath expression against an HTML string and return the match list.
    def xpath_func(self, html, xpath_bds):
        parse_html = etree.HTML(html)
        r_list = parse_html.xpath(xpath_bds)
        return r_list

    # Crawl one listing page: extract every post link and harvest its media.
    def parse_html(self, one_url):
        html = self.get_html(one_url)
        # NOTE: 'cleafix' is Tieba's own (misspelled) CSS class name -- keep as-is.
        xpath_bds = '//div[@class="t_con cleafix"]/div/div/div/a/@href'
        r_list = self.xpath_func(html, xpath_bds)  # relative post links, e.g. ['/p/32323', ...]
        for r in r_list:
            t_url = 'http://tieba.baidu.com' + r  # build the absolute post URL
            self.get_image(t_url)  # save every image in this post
            time.sleep(random.uniform(0, 2))  # polite random pause between posts

    # Given one post URL, save all of its images/videos to the local directory.
    def get_image(self, t_url):
        html = self.get_html(t_url)

        # XPath union (|): image sources plus embedded video sources.
        xpath_bds = '//div[@class="d_post_content j_d_post_content  clearfix"]/img[@class="BDE_Image"]/@src | //div[@class="video_src_wrapper"]/embed/@data-video'
        img_list = self.xpath_func(html, xpath_bds)  # e.g. ['http://xxx.jpg', ...]
        print(img_list)
        for img in img_list:
            html_bytes = requests.get(url=img, headers={'User-Agent': random.choice(self.ua_list)}).content
            self.save_img(html_bytes, img)

    # Write one downloaded media file to disk, named after the URL's tail.
    def save_img(self, html_bytes, img):
        # Last 10 characters of the URL serve as a (mostly) unique filename.
        filename = img[-10:]
        with open(filename, 'wb') as f:
            f.write(html_bytes)
            print('%s下載成功' % filename)

    # Interactive entry point: ask for forum name and page range, then crawl.
    def main(self):
        name = input('請輸入貼吧名:')
        begin = int(input('請輸入起始頁:'))
        end = int(input('請輸入終止頁:'))
        # URL-encode the forum name for use as a query parameter.
        kw = parse.quote(name)
        for page in range(begin, end + 1):
            pn = (page - 1) * 50  # Tieba paginates in steps of 50 posts
            url = self.url.format(kw, pn)
            self.parse_html(url)


if __name__ == '__main__':
    # Entry point: build the spider and start the interactive crawl.
    BaiduImageSpider().main()

查詢參數-params

res = requests.get(url,params=params,headers=headers)

url爲基準的url地址,不包含查詢參數,該方法會自動對params字典編碼,而後和url拼接

參數類型:字典,字典中鍵值對做爲查詢參數

import requests

baseurl = 'http://tieba.baidu.com/f?'
# Query parameters: requests URL-encodes this dict and appends it to baseurl.
params = {
    'kw': '趙麗穎吧',
    'pn': '50'}
# BUG FIX: the original wrote the UA with backslash continuations inside the
# literal, embedding large runs of indentation whitespace into the header value.
headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2'
                         '; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR '
                         '3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)'}
# params is encoded and joined onto the base URL automatically before the request.
res = requests.get(baseurl, headers=headers, params=params)
res.encoding = 'utf-8'
print(res.text)

Web客戶端驗證參數-auth

res = requests.get(url, headers=headers, auth=('username','password'))

針對於須要web客戶端用戶名密碼認證的網站,auth = ('username','password')

達內課程筆記

import requests
from lxml import etree
import random
import os


class CodeSpider(object):
    """Download .zip/.rar archives from a basic-auth-protected directory listing."""

    def __init__(self):
        self.url = 'http://code.tarena.com.cn/AIDCode/aid1904/14-redis/'
        # HTTP basic-auth credentials issued by the site (not a personal account).
        self.auth = ('tarenacode', 'code_2013')
        # BUG FIX: the original third UA used a backslash continuation inside the
        # literal, which embedded the indentation whitespace into the header value.
        self.ua_list = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
            ('Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; '
             '.NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)'),
        ]

    def parse_html(self):
        """Fetch the directory listing and download every archive link in it."""
        html = requests.get(url=self.url, headers={'User-Agent': random.choice(self.ua_list)}, auth=self.auth)
        html = html.content.decode('utf-8', 'ignore')

        parse_html = etree.HTML(html)
        # All hrefs in the listing, e.g. ['../', 'day01', 'day02', 'redis_day01.zip']
        r_list = parse_html.xpath('//a/@href')
        for r in r_list:
            # endswith accepts a tuple of suffixes -- one call covers both types.
            if r.endswith(('.zip', '.rar')):
                self.save_files(r)

    def save_files(self, r):
        """Download one archive file and save it under the target directory."""
        directory = '/home/tarena/AID/redis/'
        if not os.path.exists(directory):
            os.makedirs(directory)

        # Listing hrefs are relative, so join onto the listing URL.
        url = self.url + r
        filename = directory + r  # e.g. /home/tarena/AID/redis/xxx.zip
        html = requests.get(url=url, headers={'User-Agent': random.choice(self.ua_list)}, auth=self.auth).content

        with open(filename, 'wb') as f:
            f.write(html)
            print('%s下載成功' % r)


if __name__ == '__main__':
    # Entry point: crawl the listing and download all archives.
    CodeSpider().parse_html()

SSL證書認證參數-verify

response = requests.get(url=url,params=params,headers=headers,verify=False)

  SSL證書認證參適用於沒有通過 證書認證機構認證的https類型網站,通常這種網站會拋出 SSLError 異常則考慮使用此參數

verify:True(默認)檢查證書認證;False(經常使用)忽略證書認證

代理參數-proxies

定義:代替你原來的IP地址去對接網絡的IP地址。隱藏自身真實IP,避免被封

普通代理

獲取代理IP網站:西刺代理、快代理、全網代理、代理精靈、... ...

語法結構

proxies = {'協議':'協議://IP:端口號'}

# http和https是相同的
proxies = {
  'http':'http://59.172.27.6:38380',
  'https':'https://59.172.27.6:38380'
}

使用免費普通代理IP訪問測試網站: http://httpbin.org/get

import requests

url = 'http://httpbin.org/get'
headers = {'User-Agent': 'Mozilla/5.0'}
# Free proxy entry found on a public proxy-IP site; http and https share it.
proxies = {
    'http': 'http://309435365:szayclhp@43.226.164.156:16818',
    'https': 'https://309435365:szayclhp@43.226.164.156:16818'}
# Route the request through the proxy; httpbin echoes the caller's IP back.
response = requests.get(url, proxies=proxies, headers=headers, timeout=5)
html = response.text
print(html)

IP池

從西刺代理上面爬取IP,迭代測試可否使用,創建一個本身的代理IP池,隨時更新用來抓取網站數據

import requests
from lxml import etree
import time
import random
from fake_useragent import UserAgent


class GetProxyIP(object):
    """Harvest free proxy IPs from xicidaili.com, test each, and save working ones."""

    def __init__(self):
        # BUG FIX: the listing URL needs a {} page placeholder -- the original had
        # none, so main()'s self.url.format(i) fetched the same first page 1000x.
        self.url = 'https://www.xicidaili.com/nn/{}'
        # Known-working proxy used to reach the proxy-listing site itself.
        self.proxies = {
            'http': 'http://163.204.247.219:9999',
            'https': 'http://163.204.247.219:9999'}

    # Return a random real-world User-Agent string.
    def get_random_ua(self):
        ua = UserAgent()        # fake_useragent generator
        useragent = ua.random
        return useragent

    # Fetch one listing page and test every ip:port row found on it.
    def get_ip_file(self, url):
        headers = {'User-Agent': self.get_random_ua()}
        html = requests.get(url=url, proxies=self.proxies, headers=headers, timeout=5).content.decode('utf-8', 'ignore')
        parse_html = etree.HTML(html)
        # Each proxy entry is one <tr>; the first row is the table header.
        tr_list = parse_html.xpath('//tr')
        for tr in tr_list[1:]:
            ip = tr.xpath('./td[2]/text()')[0]
            port = tr.xpath('./td[3]/text()')[0]
            # Probe whether ip:port actually works before keeping it.
            self.test_proxy_ip(ip, port)

    # Try the candidate proxy against baidu; append working ones to proxies.txt.
    def test_proxy_ip(self, ip, port):
        proxies = {
            'http': 'http://{}:{}'.format(ip, port),
            'https': 'https://{}:{}'.format(ip, port), }
        test_url = 'http://www.baidu.com/'
        try:
            res = requests.get(url=test_url, proxies=proxies, timeout=8)
            if res.status_code == 200:
                print(ip, ":", port, 'Success')
                with open('proxies.txt', 'a') as f:
                    f.write(ip + ':' + port + '\n')
        except Exception:
            # Any connection/timeout error means the proxy is unusable.
            print(ip, port, 'Failed')

    # Walk listing pages 1..1000 with a polite random pause between pages.
    def main(self):
        for i in range(1, 1001):
            url = self.url.format(i)
            self.get_ip_file(url)
            time.sleep(random.randint(5, 10))


if __name__ == '__main__':
    # Entry point: start harvesting and validating proxies.
    GetProxyIP().main()

從IP池中取IP

從文件中隨機獲取代理IP寫爬蟲

import random
import requests


class BaiduSpider(object):
    """Fetch a page through a random proxy from proxies.txt, retrying up to 3 times."""

    def __init__(self):
        self.url = 'http://www.baidu.com/'
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        # Retry counter; attribute name kept for backward compatibility
        # (presumably a typo of "flag" in the original).
        self.blag = 1

    def get_proxies(self):
        """Pick a random 'ip:port' line from proxies.txt and build a proxies dict."""
        with open('proxies.txt', 'r') as f:
            result = f.readlines()  # one 'ip:port' entry per line
        # BUG FIX: the original dropped the newline with [:-1], which chops the
        # final character off a line with no trailing newline. strip() is safe.
        proxy_ip = random.choice(result).strip()
        L = proxy_ip.split(':')
        proxy_ip = {
            'http': 'http://{}:{}'.format(L[0], L[1]),
            'https': 'https://{}:{}'.format(L[0], L[1])
        }
        return proxy_ip

    def get_html(self):
        """Fetch self.url via a random proxy; on failure retry with a new proxy (max 3)."""
        proxies = self.get_proxies()
        if self.blag <= 3:
            try:
                html = requests.get(url=self.url, proxies=proxies, headers=self.headers, timeout=5).text
                print(html)
            except Exception:
                print('Retry')
                self.blag += 1     # count this failed attempt
                self.get_html()    # recurse with a freshly chosen proxy


if __name__ == '__main__':
    # Entry point: fetch the page through a proxy from the local pool.
    BaiduSpider().get_html()

收費代理API

寫一個獲取收費開放API代理的接口

# 獲取開放代理的接口
import requests
from fake_useragent import UserAgent

# Build one random User-Agent header, shared by every request in this script.
ua = UserAgent()  # create the User-Agent generator (fake_useragent)
useragent = ua.random  # a random real-world UA string
headers = {'User-Agent': useragent}


def ip_test(ip):
    """Return True if the 'ip:port' proxy candidate can fetch baidu with HTTP 200.

    `ip` is a string of the form 'host:port'. Uses the module-level `headers`.
    NOTE(review): a dead proxy raises a requests exception rather than returning
    False -- callers should be prepared for that; behavior kept from the original.
    """
    url = 'http://www.baidu.com/'
    ip_port = ip.split(':')
    proxies = {
        'http': 'http://{}:{}'.format(ip_port[0], ip_port[1]),
        'https': 'https://{}:{}'.format(ip_port[0], ip_port[1]),
    }
    res = requests.get(url=url, headers=headers, proxies=proxies, timeout=5)
    # The comparison already yields the boolean result -- no if/else needed.
    return res.status_code == 200


# 提取代理IP
def get_ip_list():
    """Fetch open proxies from the kuaidaili API and record the working ones.

    Appends each validated 'ip:port' line to proxy_ip.txt.
    """
    # kuaidaili docs: https://www.kuaidaili.com/doc/product/dps/
    api_url = 'http://dev.kdlapi.com/api/getproxy/?orderid=946562662041898&num=100&protocol=1&method=2&an_an=1&an_ha=1&sep=2'
    html = requests.get(api_url).content.decode('utf-8', 'ignore')
    ip_port_list = html.split('\n')

    # BUG FIX: open the output file once instead of re-opening it per candidate,
    # and skip blank entries (a trailing newline makes split('\n') yield an empty
    # string, which would crash ip_test with an IndexError).
    with open('proxy_ip.txt', 'a') as f:
        for ip in ip_port_list:
            if ip.strip() and ip_test(ip):
                f.write(ip + '\n')


# Script entry point: harvest and validate open proxies when run directly.
if __name__ == '__main__':
    get_ip_list()

私密代理

一、語法結構

proxies = {
'協議':'協議://用戶名:密碼@IP:端口號'
}
proxies = {
    'http':'http://用戶名:密碼@IP:端口號',
    'https':'https://用戶名:密碼@IP:端口號'
}
proxies = {
    'http': 'http://309435365:szayclhp@106.75.71.140:16816',
    'https':'https://309435365:szayclhp@106.75.71.140:16816',
} 

用戶名和密碼會在給你API_URL的時候給你。不是你的帳號和帳號密碼。

# 獲取開放代理的接口
import requests
from fake_useragent import UserAgent

# Build one random User-Agent header, shared by every request in this script.
ua = UserAgent()  # create the User-Agent generator (fake_useragent)
useragent = ua.random  # a random real-world UA string
headers = {'User-Agent': useragent}


def ip_test(ip):
    """Test one private-proxy candidate ('ip:port'); print and return the verdict."""
    url = 'https://blog.csdn.net/qq_34218078/article/details/90901602/'
    parts = ip.split(':')
    # Private proxies embed the vendor-issued username:password in the proxy URL.
    proxies = {
        'http': 'http://1786088386:b95djiha@{}:{}'.format(parts[0], parts[1]),
        'https': 'http://1786088386:b95djiha@{}:{}'.format(parts[0], parts[1]),
    }

    res = requests.get(url=url, headers=headers, proxies=proxies, timeout=5)
    ok = res.status_code == 200
    if ok:
        print("OK")
    else:
        print(res.status_code)
        print("錯誤")
    return ok


# 提取代理IP
def get_ip_list():
    """Fetch private proxies from the kuaidaili API and record the working ones.

    Appends each validated 'ip:port' line to proxy_ip.txt.
    """
    # kuaidaili docs: https://www.kuaidaili.com/doc/product/dps/
    api_url = 'http://dps.kdlapi.com/api/getdps/?orderid=986603271748760&num=1000&signature=z4a5b2rpt062iejd6h7wvox16si0f7ct&pt=1&sep=2'
    html = requests.get(api_url).content.decode('utf-8', 'ignore')
    ip_port_list = html.split('\n')

    # BUG FIX: open the output file once instead of re-opening it per candidate,
    # and skip blank entries (a trailing newline makes split('\n') yield an empty
    # string, which would crash ip_test with an IndexError).
    with open('proxy_ip.txt', 'a') as f:
        for ip in ip_port_list:
            if ip.strip() and ip_test(ip):
                f.write(ip + '\n')


# Script entry point: harvest and validate private proxies when run directly.
if __name__ == '__main__':
    get_ip_list()
相關文章
相關標籤/搜索