Mimic the behavior of a browser to crawl information from web pages.
Example without parameters:

import requests

ret = requests.get('https://github.com/timeline.json')
print(ret.text)

Example with parameters:

import requests

ret = requests.get("http://httpbin.org/get", params={'key1': 'value1', 'key2': 'value2'})
print(ret.text)
import requests
import json

url = 'https://api.github.com/some/endpoint'
payload = {'v1': 'k1'}
headers = {'content-type': 'application/json'}

ret = requests.post(url, data=json.dumps(payload), headers=headers)
print(ret.text)
requests.get(url, params=None, **kwargs)
requests.post(url, data=None, json=None, **kwargs)
requests.put(url, data=None, **kwargs)
requests.head(url, **kwargs)
requests.delete(url, **kwargs)
requests.patch(url, data=None, **kwargs)
requests.options(url, **kwargs)

# All of the methods above are built on top of this one
requests.request(method, url, **kwargs)
def param_method_url():
    ret = requests.request(method='get', url='http://127.0.0.1:8000/test/')
    ret = requests.request(method='post', url='http://127.0.0.1:8000/test/')
import requests

requests.get(url='http://127.0.0.1:8000/test/', params={'k1': 'v1', 'k2': 'v2'})
# This is essentially the same as requests.get(url='xxxxx?k1=v1&k2=v2')
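A quick way to confirm that equivalence is to look at the URL requests actually built. A minimal sketch (it uses the public httpbin.org echo service purely for illustration; any reachable URL would do):

import requests

ret = requests.get('http://httpbin.org/get', params={'k1': 'v1', 'k2': 'v2'})
# requests encodes the params dict into the query string for you
print(ret.url)  # http://httpbin.org/get?k1=v1&k2=v2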
# data can be a dict
# data can be a string
# data can be bytes
# data can be a file object

# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  data={'k1': 'v1', 'k2': '水電費'})

# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  data="k1=v1; k2=v2; k3=v3; k3=v4"
#                  )

# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  data="k1=v1;k2=v2;k3=v3;k3=v4",
#                  headers={'Content-Type': 'application/x-www-form-urlencoded'}
#                  )

# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  data=open('data_file.py', mode='r', encoding='utf-8'),  # file content: k1=v1;k2=v2;k3=v3;k3=v4
#                  headers={'Content-Type': 'application/x-www-form-urlencoded'}
#                  )
# If the request body should be a JSON payload, pass it via the json parameter
requests.request(method='POST',
                 url='http://127.0.0.1:8000/test/',
                 json={'k1': 'v1', 'k2': '水電費'})
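To see the practical difference between data= and json=, the sketch below posts the same dict both ways and inspects what is echoed back (httpbin.org is an assumption here, used only because it reflects the request body):

import requests

# data= sends an application/x-www-form-urlencoded body
r1 = requests.post('http://httpbin.org/post', data={'k1': 'v1'})
print(r1.json()['form'])   # {'k1': 'v1'}

# json= serializes the dict and sets Content-Type: application/json
r2 = requests.post('http://httpbin.org/post', json={'k1': 'v1'})
print(r2.json()['json'])   # {'k1': 'v1'}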
ret1 = requests.get(
    url='https://dig.chouti.com/',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    }
)
ret1_cookies = ret1.cookies.get_dict()
# ret1.cookies is the cookies object returned when visiting the URL
# get_dict() converts it into a plain dict of cookies
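If you would rather not carry the cookie dict around by hand, requests.Session keeps cookies across requests automatically. A minimal sketch along the lines of the example above (the shortened User-Agent is just a placeholder):

import requests

session = requests.Session()
# any Set-Cookie headers in the response are stored on the session
session.get(
    url='https://dig.chouti.com/',
    headers={'User-Agent': 'Mozilla/5.0'}
)
# later requests made through the same session send those cookies automatically
print(session.cookies.get_dict())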
headers
# Send request headers to the server
requests.request(method='POST',
                 url='http://127.0.0.1:8000/test/',
                 json={'k1': 'v1', 'k2': '水電費'},
                 headers={'Content-Type': 'application/x-www-form-urlencoded'}
                 )
# Which request headers are required depends on the server
files
# Send a file
# file_dict = {
#     'f1': open('readme', 'rb')
# }
# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  files=file_dict)

# Send a file with a custom filename
# file_dict = {
#     'f1': ('test.txt', open('readme', 'rb'))
# }
# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  files=file_dict)

# Send a file with a custom filename and inline content
# file_dict = {
#     'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf")
# }
# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  files=file_dict)

# Send a file with a custom filename, inline content, content type and extra headers
# file_dict = {
#     'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf", 'application/text', {'k1': '0'})
# }
# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  files=file_dict)

pass
Set a timeout: if the request takes longer than the timeout, stop waiting.

# ret = requests.get('http://google.com/', timeout=1)
# print(ret)

# ret = requests.get('http://google.com/', timeout=(5, 1))
# print(ret)

pass
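For reference, a single number is used as both the connect and read timeout, while a tuple is (connect timeout, read timeout); when either is exceeded, requests raises requests.exceptions.Timeout. A small sketch of how you might catch it (google.com is just the placeholder host from the example above):

import requests

try:
    # up to 5 seconds to connect, then up to 1 second to read the response
    ret = requests.get('http://google.com/', timeout=(5, 1))
    print(ret.status_code)
except requests.exceptions.Timeout:
    print('request timed out')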
# Whether to follow redirects; the default is True
ret = requests.get('http://127.0.0.1:8000/test/', allow_redirects=False)
print(ret.text)
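With allow_redirects=False, requests hands back the 3xx response itself instead of following it, so you can inspect the Location header yourself. A minimal sketch (httpbin.org/redirect/1 is used only because it is guaranteed to redirect; the local test URL above may or may not):

import requests

ret = requests.get('http://httpbin.org/redirect/1', allow_redirects=False)
print(ret.status_code)           # 302
print(ret.headers['Location'])   # where the server wanted to send us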
This module parses the HTML or XML it receives and lets you quickly find the tags you want by working with objects.
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
    ...
</body>
</html>
"""

soup = BeautifulSoup(html_doc, features="lxml")
# tag = soup.find('a')
# name = tag.name  # get
# print(name)
# tag.name = 'span'  # set
# tag = soup.find('a')
# attrs = tag.attrs  # get
# print(attrs)
# tag.attrs = {'ik': 123}  # set
# tag.attrs['id'] = 'iiiii'  # set
# body = soup.find('body')
# v = body.children
# body = soup.find('body')
# v = body.descendants
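The difference between the two: children yields only a tag's direct children, while descendants walks every nested node, including text nodes. A throwaway sketch to illustrate, independent of the html_doc above:

from bs4 import BeautifulSoup

demo = BeautifulSoup("<body><div><a>link</a></div></body>", 'html.parser')
body = demo.find('body')

# children: only the direct children of <body>
print([child.name for child in body.children])    # ['div']

# descendants: every nested node, however deep (text nodes have name None)
print([node.name for node in body.descendants])   # ['div', 'a', None]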
# tag = soup.find('body')
# tag.clear()
# print(soup)
# body = soup.find('body')
# v = body.extract()
# print(soup)
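The practical difference: clear() keeps the tag in the tree but empties its contents, while extract() detaches the tag from the tree and returns it. A throwaway sketch:

from bs4 import BeautifulSoup

demo = BeautifulSoup("<div><p>hello</p></div>", 'html.parser')
removed = demo.find('p').extract()   # <p> is detached and returned
print(demo)                          # <div></div>
print(removed)                       # <p>hello</p>

demo2 = BeautifulSoup("<div><p>hello</p></div>", 'html.parser')
demo2.find('p').clear()              # <p> stays, but its contents are removed
print(demo2)                         # <div><p></p></div>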
# tag = soup.find('a')
# print(tag)
# tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tag)
# tags = soup.find_all('a')
# print(tags)

# tags = soup.find_all('a', limit=1)
# print(tags)

# tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# # tags = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tags)

# ####### list of values #######
# v = soup.find_all(name=['a', 'div'])
# print(v)
# v = soup.find_all(class_=['sister0', 'sister'])
# print(v)
# tag = soup.find('a')
# v = tag.has_attr('id')
# print(v)
import requests
from bs4 import BeautifulSoup  # this module parses HTML

# Download the page
ret = requests.get(url='https://www.autohome.com.cn/news/')
# print(ret.apparent_encoding)  # detect the page's encoding
# print(ret.content)
# ret.encoding = 'gbk'
ret.encoding = ret.apparent_encoding
# print(ret.text)

# Parse the page and pull out the content we want
soup = BeautifulSoup(ret.text, features='html.parser')  # in production, lxml is common (must be installed separately)

# find returns the first successful match
div = soup.find(name='div', id='auto-channel-lazyload-article')
# When matching with a class as well:
# div = soup.find(name='div', attrs={'class': 'dddd', 'id': 'dfa'})

li_list = div.find_all(name='li')  # find_all returns a list, so you cannot call .find on it
# print(li_list)

for row in li_list:
    h3 = row.find(name='h3')
    if not h3:
        continue
    a = row.find(name='a')
    print(a.get('href'))
    p = row.find(name='p')
    print(p.text)
    li_img = row.find(name='img')
    src = li_img.get('src')
    file_name = src.rsplit('__', maxsplit=1)[1]
    ret_img = requests.get('https:' + src)
    with open(file_name, 'wb') as f:
        f.write(ret_img.content)
import requests
from bs4 import BeautifulSoup

# The first visit returns an unauthorized cookie
ret1 = requests.get(
    url='https://dig.chouti.com/',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    })
ret1_cookies = ret1.cookies.get_dict()

# After a successful login, that cookie becomes authorized
ret = requests.post(
    url='https://dig.chouti.com/login',
    data={
        'phone': '8613612201458',
        'password': 'wo3384451',
        'oneMonth': '1'
    },
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    },
    cookies=ret1_cookies,
)

for num_page in range(2, 10):
    ret_index = requests.get(
        url='https://dig.chouti.com/all/hot/recent/%s' % (num_page),
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
        },
    )
    soup = BeautifulSoup(ret_index.text, 'html.parser')
    div = soup.find(name='div', id='content-list')
    item_list = div.find_all(attrs={'class': 'part2'})
    for item in item_list:
        num = item.get('share-linkid')
        # Upvote while carrying the now-authorized cookie
        ret3 = requests.post(
            url='https://dig.chouti.com/link/vote?linksId=%s' % (num),
            # data={'linksId': '%s' % (num)},
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
            },
            cookies=ret1_cookies
        )
        print(ret3.text)
import requests
import re
from bs4 import BeautifulSoup


class Github(object):
    def __init__(self, username=None, password=None):
        self.username = username
        self.password = password
        self.all_cookies = {}
        self.process()

    def process(self):
        if not (self.username and self.password):
            raise Exception('Please provide a username and password')
        self.get_login_key()

    def get_login_key(self):
        # Fetch the authenticity_token from the login page
        login_result = requests.get(
            url='https://github.com/login',
            headers={
                'Host': 'github.com',
            }
        )
        auth_key = BS4xpath.get_auth_key(login_result.text)
        self.all_cookies = login_result.cookies.get_dict()
        self.login(auth_key)

    def login(self, auth_key):
        # Log in and collect the authenticated cookies
        login_result = requests.post(
            url='https://github.com/session',
            headers={
                'Upgrade-Insecure-Requests': '1',
                'Host': 'github.com',
            },
            data={
                'utf8': '✓',
                'authenticity_token': auth_key,
                'login': self.username,
                'password': self.password,
                'commit': 'Sign in'
            },
            cookies=self.all_cookies
        )
        self.all_cookies.update(login_result.cookies.get_dict())
        if self.all_cookies['logged_in'] == 'no':
            raise Exception('Invalid username or password')

    def get_msg(self):
        msg_obj = requests.get(
            url='https://github.com/settings/profile',
            headers={
                'Host': 'github.com',
                'Referer': 'https://github.com/',
            },
            cookies=self.all_cookies
        )
        msg = BS4xpath.get_msg_dict(msg_obj.text)
        return msg


class BS4xpath(object):
    @classmethod
    def get_auth_key(cls, text):
        soup = BeautifulSoup(text, 'html.parser')
        auth_key = soup.find(name='input', attrs={'name': 'authenticity_token'}).get('value')
        return auth_key

    @classmethod
    def get_msg_dict(cls, text):
        response = {}
        ret2_data = BeautifulSoup(text, 'html.parser')
        div = ret2_data.find(name='div', attrs={'class': "column two-thirds"})
        dl_list = div.find_all(name='dl', attrs={'class': "form-group"})
        for row in dl_list:
            rowname = row.find('label').text
            dd_input = row.find('input')
            if dd_input:
                response[rowname] = dd_input.get('value')
        return response


obj = Github(username='a3384451', password='wo3384451')
ret = obj.get_msg()
print(ret)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
import requests

all_cookie = {}

# ############### 1. View the login page ###############
r1 = requests.get(
    url='https://passport.lagou.com/login/login.html',
    headers={
        'Host': 'passport.lagou.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    }
)
all_cookie.update(r1.cookies.get_dict())
X_Anti_Forge_Token = re.findall(r"window.X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
X_Anti_Forge_Code = re.findall(r"window.X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]

# ############### 2. Log in with username and password ###############
r2 = requests.post(
    url='https://passport.lagou.com/login/login.json',
    headers={
        'Host': 'passport.lagou.com',
        'Referer': 'https://passport.lagou.com/login/login.html',
        'X-Anit-Forge-Code': X_Anti_Forge_Code,
        'X-Anit-Forge-Token': X_Anti_Forge_Token,
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    },
    data={
        'isValidate': True,
        'username': '15131255089',
        'password': 'ab18d270d7126ea65915cc22c0d',
        'request_form_verifyCode': '',
        'submit': '',
    },
    cookies=r1.cookies.get_dict()
)
all_cookie.update(r2.cookies.get_dict())

# ############### 3. User authorization ###############
r3 = requests.get(
    url='https://passport.lagou.com/grantServiceTicket/grant.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    allow_redirects=False,
    cookies=all_cookie
)
all_cookie.update(r3.cookies.get_dict())

# ############### 4. User authentication: follow the redirect chain by hand ###############
r4 = requests.get(
    url=r3.headers['Location'],
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    allow_redirects=False,
    cookies=all_cookie
)
all_cookie.update(r4.cookies.get_dict())

r5 = requests.get(
    url=r4.headers['Location'],
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    allow_redirects=False,
    cookies=all_cookie
)
all_cookie.update(r5.cookies.get_dict())

r6 = requests.get(
    url=r5.headers['Location'],
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    allow_redirects=False,
    cookies=all_cookie
)
all_cookie.update(r6.cookies.get_dict())

r7 = requests.get(
    url=r6.headers['Location'],
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    allow_redirects=False,
    cookies=all_cookie
)
all_cookie.update(r7.cookies.get_dict())

# ############### 5. View the personal page ###############
r5 = requests.get(
    url='https://www.lagou.com/resume/myresume.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    cookies=all_cookie
)
print('武沛齊' in r5.text)

# ############### 6. View account info ###############
r6 = requests.get(
    url='https://gate.lagou.com/v1/neirong/account/users/0/',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'X-L-REQ-HEADER': "{deviceType:1}",
        'Origin': 'https://account.lagou.com',
        'Host': 'gate.lagou.com',
    },
    cookies=all_cookie
)
r6_json = r6.json()
all_cookie.update(r6.cookies.get_dict())

# ############### 7. Update the personal profile ###############
r7 = requests.put(
    url='https://gate.lagou.com/v1/neirong/account/users/0/',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Origin': 'https://account.lagou.com',
        'Host': 'gate.lagou.com',
        'X-Anit-Forge-Code': r6_json['submitCode'],
        'X-Anit-Forge-Token': r6_json['submitToken'],
        'X-L-REQ-HEADER': "{deviceType:1}",
    },
    cookies=all_cookie,
    json={"userName": "wupeiqi888", "sex": "MALE", "portrait": "images/myresume/default_headpic.png",
          "positionName": '...', "introduce": '....'}
)
print(r7.text)
from bs4 import BeautifulSoup


class XSSFilter(object):
    __instance = None

    def __init__(self):
        # XSS whitelist: allowed tags and, for each tag, its allowed attributes
        self.valid_tags = {
            "font": ['color', 'size', 'face', 'style'],
            'b': [],
            'div': [],
            "span": [],
            "table": ['border', 'cellspacing', 'cellpadding'],
            'th': ['colspan', 'rowspan'],
            'td': ['colspan', 'rowspan'],
            "a": ['href', 'target', 'name'],
            "img": ['src', 'alt', 'title'],
            'p': ['align'],
            "pre": ['class'],
            "hr": ['class'],
            'strong': []
        }

    def __new__(cls, *args, **kwargs):
        # Singleton: only one filter instance is ever created
        if not cls.__instance:
            obj = object.__new__(cls)
            cls.__instance = obj
        return cls.__instance

    def process(self, content):
        soup = BeautifulSoup(content, 'html.parser')
        # Walk every HTML tag
        for tag in soup.find_all():
            # If the tag name is not in the whitelist, hide it and wipe its
            # contents (except for the html/body wrappers themselves)
            if tag.name not in self.valid_tags:
                tag.hidden = True
                if tag.name not in ['html', 'body']:
                    tag.hidden = True
                    tag.clear()
                continue
            # Attribute whitelist for the current tag
            attr_rules = self.valid_tags[tag.name]
            keys = list(tag.attrs.keys())
            for key in keys:
                if key not in attr_rules:
                    del tag[key]
        return soup.decode()  # the decoded string is the filtered content


content = """
<p class='c1' id='i1'>
    asdfaa<span style="font-family:NSimSun;" class='c1'>sdf<a>a</a>sdf</span>sdf
</p>
<p>
    <strong class='c2' id='i2'>asdf</strong>
    <script>alert(123)</script>
</p>
<h2>
    asdf
</h2>
"""

content = XSSFilter().process(content)
print('content', content)
Reference: http://www.cnblogs.com/wupeiqi/articles/6283017.html
Official documentation: http://cn.python-requests.org/zh_CN/latest/user/quickstart.html#id4