Summary and Review of Important Web Crawler Case Studies

I. Crawling cosmetics production license data

import requests
import time

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}


def get_ID(pages):
    url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'

    ID_list = []

    for every_page in range(1, pages + 1):

        params = {
            "on": "true",
            "page": str(every_page),
            "pageSize": "15",
            "productName": "",
            "conditionType": "1",
            "applyname": "",
            "applysn": "",
        }

        data = requests.post(url=url, params=params, headers=headers).json()

        for each_dict in data["list"]:
            ID_list.append(each_dict['ID'])

        time.sleep(0.1)

    return ID_list


def get_all_detail(ID_list):
    url = "http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById"

    with open('化妝品生產許可證信息.txt', 'a', encoding='utf-8') as f:
        for ID in ID_list:
            params = {
                "id": ID
            }

            data = requests.post(url=url, params=params, headers=headers).text

            f.write(data + '\n')

            time.sleep(0.1)

    print("數據寫入文件成功!")


ID_list = get_ID(10)
get_all_detail(ID_list)

II. Crawling KFC restaurant locations for a specified place

import requests

# UA spoofing
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}

# Target URL
url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'

# Build the request parameters
area = input('請輸入一個地名: ')
params = {
    "cname": "",
    "pid": "",
    "keyword": area,
    "pageIndex": "1",
    "pageSize": "10",
}

# Send the request and get the response data (JSON)
data = requests.get(url=url, params=params, headers=headers).json()

print(data)

III. Regex: crawling images from qiushibaike.com

import re
import os
import time
import requests
from urllib import request

if not os.path.exists('./qiutu'):
    os.mkdir('./qiutu')

url = "https://www.qiushibaike.com/pic/"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}

page_text = requests.get(url=url, headers=headers).text

img_url_list = re.findall('<div class="thumb">.*?<img src="(.*?)" alt.*?</div>', page_text, re.S)

for img_url in img_url_list:
    img_url = 'https:' + img_url
    img_name = img_url.split('/')[-1]
    img_path = './qiutu/' + img_name
    request.urlretrieve(img_url, img_path)
    print(img_path, '下載成功!')

    time.sleep(0.1)

IV. bs4: crawling the novel Romance of the Three Kingdoms from shicimingju.com

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
}


def parse_content(url):
    # Fetch the chapter detail page
    page_text = requests.get(url, headers=headers).text
    soup = BeautifulSoup(page_text, 'lxml')
    # Locate the tag that holds the chapter text
    ele = soup.find('div', class_='chapter_content')
    content = ele.text  # Extract the text from the tag
    return content

if __name__ == "__main__":
     url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
     reponse = requests.get(url=url,headers=headers)
     page_text = reponse.text

     #建立soup對象
     soup = BeautifulSoup(page_text,'lxml')
     #解析數據
     a_eles = soup.select('.book-mulu > ul > li > a')
     print(a_eles)
     cap = 1
     for ele in a_eles:
         print('開始下載第%d章節'%cap)
         cap+=1
         title = ele.string
         content_url = 'http://www.shicimingju.com'+ele['href']
         content = parse_content(content_url)

         with open('./sanguo.txt','w') as fp:
             fp.write(title+":"+content+'\n\n\n\n\n')
             print('結束下載第%d章節'%cap)

V. XPath parsing examples

1. Parsing second-hand housing data from 58.com

import requests
from lxml import etree

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}

url = 'https://bj.58.com/ershoufang/?PGTID=0d200001-0000-1376-eb9f-25ca6cacedce&ClickID=1'

page_text = requests.get(url=url, headers=headers).text

# Parse the data
tree = etree.HTML(page_text)

li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')

if __name__ == '__main__':

    for li in li_list:
        title = li.xpath('./div[2]/h2/a/text()')[0].strip()
        print(title)

2. Downloading images from pic.netbian.com: fixing garbled Chinese filenames

import requests, os
from lxml import etree
from urllib import request

# Create an empty folder to store the image data
if not os.path.exists('./images'):
    os.mkdir('./images')

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}

url = 'http://pic.netbian.com/4kmeinv/'

# Get the response data
page_text = requests.get(url=url, headers=headers).text

# Instantiate the etree object
tree = etree.HTML(page_text)
# XPath parsing
li_list = tree.xpath('//div[@class="slist"]/ul/li')

for li in li_list:
    img_name = li.xpath('./a/img/@alt')[0]
    # Fix garbled Chinese: re-encode as ISO-8859-1, then decode as GBK
    img_name = img_name.encode('ISO-8859-1').decode('gbk')

    img_url = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
    img_path = './images/' + img_name + '.jpg'
    request.urlretrieve(url=img_url, filename=img_path)
    print("下載完成!!!")

3. Downloading images from jandan.net (the image URLs are encoded)

from lxml import etree
from urllib import request
import requests
import base64
import os

if not os.path.exists('./jiandan'):
    os.mkdir('./jiandan')

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}

url = 'http://jandan.net/ooxx/page-62'

page_text = requests.get(url=url, headers=headers).text

tree = etree.HTML(page_text)

img_hash_list = tree.xpath('//span[@class="img-hash"]/text()')

for img_hash in img_hash_list:
    # The real image URL (needs base64 decoding)
    img_url = "http:" + base64.b64decode(img_hash).decode('utf8')

    # Build the local path for the image
    img_path = './jiandan/' + img_url.split('/')[-1]

    # Persist to disk
    request.urlretrieve(url=img_url, filename=img_path)
    print("下載完成!!", img_url)

print('over!')

4. Downloading résumé templates from sc.chinaz.com

import requests
import random
import os
from lxml import etree

if not os.path.exists('./jianli'):
    os.mkdir('./jianli')

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}

for i in range(1, 6):
    # 1. Build the URL
    if i == 1:
        url = 'http://sc.chinaz.com/jianli/free.html'
    else:
        url = f'http://sc.chinaz.com/jianli/free_{i}.html'
    # 2. Send the request
    response = requests.get(url=url, headers=headers)
    # 2.1 Set the response encoding
    response.encoding = 'utf8'
    # 3. Get the response text
    page_text = response.text
    # 4. Instantiate an etree object and load the page source into it
    tree = etree.HTML(page_text)
    # 5. Locate elements with XPath
    a_list = tree.xpath('//a[@class="title_wl"]')

    for a in a_list:
        jianli_name = a.xpath('./text()')[0]
        jianli_url = a.xpath('./@href')[0]
        print(jianli_name)
        print(jianli_url)
        print('----------------------------------------------')

        response2 = requests.get(url=jianli_url, headers=headers)
        response2.encoding = 'utf8'
        each_jianli_text = response2.text
        tree2 = etree.HTML(each_jianli_text)
        # All download links
        download_url_list = tree2.xpath('//div[@class="clearfix mt20 downlist"]/ul/li/a/@href')
        # Pick one download link at random
        download_url = random.choice(download_url_list)
        # Fetch the file data
        res = requests.get(url=download_url, headers=headers).content
        # Persist to disk
        filepath = './jianli/' + jianli_name + '.rar'
        with open(filepath, 'wb') as f:
            f.write(res)
        print(jianli_name, '下載完成!')

print('over!')

5. Parsing all city names

"""
解析全部城市名稱
https://www.aqistudy.cn/historydata/
"""

import requests
from lxml import etree

url = 'https://www.aqistudy.cn/historydata/'

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}

response = requests.get(url=url, headers=headers)
response.encoding = 'utf8'
page_text = response.text
tree = etree.HTML(page_text)

hot_city_list = tree.xpath('//div[@class="hot"]/div[2]/ul/li/a/text()')
all_city_list = tree.xpath('//div[@class="all"]/div[2]/ul/div[2]/li/a/text()')
# The two expressions above can be combined with the XPath union operator |
cityname_list = tree.xpath(
    '//div[@class="hot"]/div[2]/ul/li/a/text() | //div[@class="all"]/div[2]/ul/div[2]/li/a/text()')

print('-----------------------------------------------------------')
print(hot_city_list)

print('***********************************************************')
print(all_city_list)

print('###########################################################')
print(cityname_list)

VI. Image lazy loading

"""
圖片懶加載概念:
    - 圖片懶加載是一種網頁優化技術.圖片做爲一種網絡資源,
    在被請求時也與普通靜態資源同樣,將佔用網絡資源,
    而一次性將整個頁面的全部圖片加載完,
    將大大增長頁面的首屏加載時間.爲了解決這種問題,經過先後端配合,
    使圖片僅在瀏覽器當前視窗內出現時才加載該圖片,
    達到減小首屏圖片請求數的技術就被稱爲"圖片懶加載".

網站通常如何實現圖片懶加載技術呢?
    - 在網頁源碼中,在img標籤中首先會使用一個"僞屬性"(一般使用src2,original...)
    去存放真正的圖片連接而並不是是直接存放在src屬性中.當圖片出現到頁面的可視化區域中,
    會動態將僞屬性替換成src屬性,完成圖片的加載.
"""

import os
import requests
from urllib import request
from lxml import etree

if not os.path.exists('./images'):
    os.mkdir('./images')

url = 'http://sc.chinaz.com/tupian/'

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}

response = requests.get(url=url, headers=headers)
response.encoding = 'utf8'
page_text = response.text
tree = etree.HTML(page_text)
img_list = tree.xpath('//div[@class="box picblock col3"]/div/a/img')

for img in img_list:
    img_name = img.xpath('./@alt')[0]
    img_url = img.xpath('./@src2')[0]

    file_path = './images/' + img_name + '.jpg'
    request.urlretrieve(img_url, file_path)
    print("下載完成!!!", img_name)

print('over!')

"""
站長素材案例後續分析:
    - 經過細緻觀察頁面的結構後發現,網頁中圖片的連接是存儲在了src2這個僞屬性中
"""

VII. Recognizing CAPTCHAs with the Yundama coding platform

ydmhttp.py:

import http.client, mimetypes, urllib, json, time, requests


class YDMHttp:
    apiurl = 'http://api.yundama.com/api.php'
    username = ''
    password = ''
    appid = ''
    appkey = ''

    def __init__(self, username, password, appid, appkey):
        self.username = username
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def request(self, fields, files=[]):
        response = self.post_url(self.apiurl, fields, files)
        response = json.loads(response)
        return response

    def balance(self):
        data = {'method': 'balance', 'username': self.username, 'password': self.password, 'appid': self.appid,
                'appkey': self.appkey}
        response = self.request(data)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['balance']
        else:
            return -9001

    def login(self):
        data = {'method': 'login', 'username': self.username, 'password': self.password, 'appid': self.appid,
                'appkey': self.appkey}
        response = self.request(data)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['uid']
        else:
            return -9001

    def upload(self, filename, codetype, timeout):
        data = {'method': 'upload', 'username': self.username, 'password': self.password, 'appid': self.appid,
                'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)}
        file = {'file': filename}
        response = self.request(data, file)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['cid']
        else:
            return -9001

    def result(self, cid):
        data = {'method': 'result', 'username': self.username, 'password': self.password, 'appid': self.appid,
                'appkey': self.appkey, 'cid': str(cid)}
        response = self.request(data)
        return response and response['text'] or ''

    def decode(self, filename, codetype, timeout):
        cid = self.upload(filename, codetype, timeout)
        if (cid > 0):
            for i in range(0, timeout):
                result = self.result(cid)
                if (result != ''):
                    return cid, result
                else:
                    time.sleep(1)
            return -3003, ''
        else:
            return cid, ''

    def report(self, cid):
        data = {'method': 'report', 'username': self.username, 'password': self.password, 'appid': self.appid,
                'appkey': self.appkey, 'cid': str(cid), 'flag': '0'}
        response = self.request(data)
        if (response):
            return response['ret']
        else:
            return -9001

    def post_url(self, url, fields, files=[]):
        for key in files:
            files[key] = open(files[key], 'rb');
        res = requests.post(url, files=files, data=fields)
        return res.text

# This function returns the recognized CAPTCHA text
def getCodeData(username, password, filename, codetype, timeout):
    # Username
    username = username

    # Password
    password = password

    # Software ID, a required parameter for developer revenue sharing. Obtain it from [My Software] in the developer console.
    appid = 1234

    # Software key, a required parameter for developer revenue sharing. Obtain it from [My Software] in the developer console.
    appkey = 'xxx'

    # Image file
    filename = filename

    # CAPTCHA type. Example: 1004 means 4 alphanumeric characters. Prices differ by type; fill it in accurately or recognition accuracy suffers. Full list of types: http://www.yundama.com/price.html
    codetype = codetype

    # Timeout in seconds
    timeout = timeout

    # Sanity check; result defaults to '' so the final return never fails
    result = ''
    if (username == 'username'):
        print('請設置好相關參數再測試')
    else:
        # Initialize
        yundama = YDMHttp(username, password, appid, appkey)

        # Log in to Yundama
        uid = yundama.login()
        print('uid: %s' % uid)

        # Check the balance
        balance = yundama.balance()
        print('balance: %s' % balance)

        # Start recognition: image path, CAPTCHA type ID, timeout (seconds); returns cid and the result
        cid, result = yundama.decode(filename, codetype, timeout)
        print('cid: %s, result: %s' % (cid, result))
    return result

VIII. Simulating login to Renren and crawling the personal-center page data

"""
cookie的處理:
1. 手動處理
    - cookie封裝到headers
2. 自動處理
    - (1)獲取一個session對象
    - (2)使用session對象進行請求的發送
    - (3)做用: 在使用session進行請求發送的過程當中若是產生了cookie,
            則cookie會被自動存儲到session對象中.
"""


from ydmhttp import getCodeData  # used to recognize the Renren CAPTCHA image
import requests
from urllib import request
from lxml import etree

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}
url = 'http://www.renren.com'

page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
code_img_url = tree.xpath('//*[@id="verifyPic_login"]/@src')[0]

if code_img_url:
    request.urlretrieve(url=code_img_url, filename='./code.jpg')
    # Recognize the CAPTCHA image; 2004 means 4 Chinese characters, see the Yundama docs for other type codes
    # CAPTCHA type list: http://www.yundama.com/price.html
    code_data = getCodeData('username', 'password', './code.jpg', 2004, 30)
    print(code_data)  # code_data is the recognition result
else:
    print('不須要識別驗證碼')
    code_data = ''

# URL of the login request
login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2019241516668'
data = {    # the fields below can be captured with the Fiddler packet-capture tool after loading the page
    "email": "xxx",     # 你的email
    "icode": code_data,
    "origURL": "http://www.renren.com/home",
    "domain": "renren.com",
    "key_id": "1",
    "captcha_type": "web_login",
    "password": "xxx",  # 你的password密文
    "rkey": "xxx",
    "f": "http%3A%2F%2Fwww.renren.com%2F970153909"
}

# Create a session object
session = requests.Session()
# Send the login request through the session: the cookie it produces is stored on the session automatically
session.post(url=login_url, data=data, headers=headers)
# URL of the personal home page
url = 'http://www.renren.com/970153909/profile'
# Send the request through the session (carrying the cookie) and get the response
page_text = session.get(url=url, headers=headers).text
# Persist to disk
with open('renren.html', 'w', encoding='utf8') as f:
    f.write(page_text)

IX. Simulating login to gushiwen.org

"""
cookie的處理:
    1. 手動處理:
        - 把cookie封裝到headers中
    2. 自動處理:
        - (1)獲取一個session對象
        - (2)使用session對象進行請求的發送
        - (3)做用: 在使用session進行請求發送的過程當中,
                若是產生了cookie,cookie就會被自動存儲到session對象中.
"""


from ydmhttp import getCodeData  # used to recognize the CAPTCHA image (see Section VII)
from urllib import request
from lxml import etree
import requests, os, uuid

# Create a directory for downloaded resources
if not os.path.exists('./sources'):
    os.mkdir('./sources')

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}

# Instantiate a session object
session = requests.Session()

# URL of the login page
url = 'https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx'
# Get the HTML of the login page
page_text = requests.get(url=url, headers=headers).text
# Create an etree object
tree = etree.HTML(page_text)

# Get the CAPTCHA image URL from the login page
code_img_url = 'https://so.gushiwen.org' + tree.xpath('//*[@id="imgCode"]/@src')[0]
# Persist the CAPTCHA image
filepath = f'./sources/{uuid.uuid4()}'
filename = filepath + '.jpg'
# Note: the CAPTCHA image must be fetched through the session, otherwise the CAPTCHA will not match the session being logged in
img_data = session.get(url=code_img_url, headers=headers).content
with open(filename, 'wb') as fp:
    fp.write(img_data)

# Recognize the CAPTCHA. CAPTCHA type list: http://www.yundama.com/price.html
# Use the username and password of your ordinary Yundama user account here, not the developer account; 1004 is the CAPTCHA type, 50 is the timeout
code_data = getCodeData('username', 'password', filename, 1004, 50)

# URL of the login request
login_url = 'https://so.gushiwen.org/user/login.aspx?from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx'

# Logging in to this site requires dynamic parameters read from the login page and included in the form data
__VIEWSTATE = tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
__VIEWSTATEGENERATOR = tree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')[0]
data = {
    "__VIEWSTATE": __VIEWSTATE,
    "__VIEWSTATEGENERATOR": __VIEWSTATEGENERATOR,
    "from": "http://so.gushiwen.org/user/collect.aspx",
    "email": "xxx",     # 你的email
    "pwd": "xxx",       # 你的密碼
    "code": code_data,
    "denglu": "登陸",
}

# Simulate the login and get the post-login page data
index_text = session.post(url=login_url, data=data, headers=headers).content

# Persist to disk
filename2 = filepath + '.html'
with open(filename2, 'wb') as f:
    f.write(index_text)

print('下載成功!!!')

X. Crawling video data from pearvideo.com with a thread pool

import requests, re, os
from lxml import etree
from uuid import uuid4

# Import the thread pool module
from multiprocessing.dummy import Pool

# Create a pool of 10 threads
pool = Pool(10)
"""
When to use a thread pool: apply it to any time-consuming (blocking) operation.
"""

# Create a directory for downloaded resources
if not os.path.exists('./sources'):
    os.mkdir('./sources')

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}

url = 'https://www.pearvideo.com/category_1'
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//*[@id="listvideoListUl"]/li')
video_url_list = []  # will hold every video link
for li in li_list:
    detail_url = 'https://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
    print(detail_url)
    detail_page_text = requests.get(url=detail_url, headers=headers).text
    # The video link is embedded in a JS snippet, so it cannot be reached with XPath; use a regex instead
    video_url = re.findall('ldUrl="",srcUrl="(.*?)",vdoUrl=srcUrl', detail_page_text, re.S)[0]
    video_url_list.append(video_url)
print(video_url_list)


def getVideoData(url):
    video_data = requests.get(url=url, headers=headers).content
    return video_data


def saveVideoData(data):
    filename = f'./sources/{uuid4()}.mp4'
    with open(filename, 'wb') as f:
        f.write(data)
    print('下載成功!')


# Request each video link and fetch the video data
# video_data_list holds the binary data of every video
video_data_list = pool.map(getVideoData, video_url_list)

# Use the thread pool to persist the video data
pool.map(saveVideoData, video_data_list)

XI. Using Selenium

Blog post: https://www.cnblogs.com/bobo-zhang/p/9685362.html

ChromeDriver download: http://chromedriver.storage.googleapis.com/index.html

The driver you download must match your browser version; the version mapping table at http://blog.csdn.net/huilan_same/article/details/51896672 shows which driver corresponds to which browser.

PhantomJS download: https://pan.baidu.com/s/11KMIKitILGpVU33oxxzcJA  # extraction code: og8o

1. Entering "中國" into Baidu's search box

"""
selenium: 可讓瀏覽器完成相關自動化的操做
環境安裝:
    - pip install selenium
編碼流程:
    - 導包
    - 建立某一款瀏覽器對象
    - 制定相關的行爲動做
"""

from selenium import webdriver
import time, os

if not os.path.exists('./sources'):
    os.mkdir('./sources')

# Configure the browser
browser = webdriver.Chrome(executable_path=r'F:\chromedriver.exe')
time.sleep(3)

browser.get('https://www.baidu.com/')
time.sleep(3)

# The find_* family of functions locates the relevant elements
text_input = browser.find_element_by_id('kw')
# Type a keyword into the text box
text_input.send_keys('中國')
time.sleep(3)

btn = browser.find_element_by_id('su')
btn.click()
time.sleep(3)

# Get the page source currently rendered in the browser (including dynamically loaded data)
page_text = browser.page_source

# Persist to disk
with open('./sources/zhongguo.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)

time.sleep(3)

browser.quit()

2. Crawling more movie detail data (Douban)

from selenium import webdriver
import time, os

if not os.path.exists('./sources'):
    os.mkdir('./sources')

# Configure the browser
browser = webdriver.Chrome(executable_path=r'F:\chromedriver.exe')

browser.get('https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85&type=13&interval_id=100:90&action=')
time.sleep(3)

browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
time.sleep(3)
browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
time.sleep(3)
browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
time.sleep(3)

# Get the browser's current page source
page_text = browser.page_source

# Persist to disk
with open('./sources/douban.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)

time.sleep(3)

browser.quit()

3. Crawling more movie detail data (Douban) with the PhantomJS browser

from selenium import webdriver
import time, os

if not os.path.exists('./sources'):
    os.mkdir('./sources')

# Configure the PhantomJS browser
browser = webdriver.PhantomJS(
    executable_path=r'F:\phantomjs-2.1.1-windows\bin\phantomjs.exe')

browser.get('https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85&type=13&interval_id=100:90&action=')
time.sleep(3)

browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
time.sleep(3)
browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
time.sleep(3)
browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
time.sleep(3)

# Get the browser's current page source
page_text = browser.page_source

# Persist to disk
with open('./sources/douban2.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)

time.sleep(3)

browser.quit()

4. Crawling data with headless Chrome

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from time import sleep

# These configuration lines are required for headless mode
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# Pass chrome_options to the driver
bro = webdriver.Chrome(executable_path=r'C:\Users\chromedriver.exe', chrome_options=chrome_options)
sleep(3)

bro.get('https://www.baidu.com/')
sleep(3)

# The find_* family of functions locates the relevant elements
text_input = bro.find_element_by_id('kw')
# Type a keyword into the text box
text_input.send_keys('中國')
sleep(3)

btn = bro.find_element_by_id('su')
btn.click()
sleep(3)

# Get the page source currently rendered in the browser (including dynamically loaded data)
page_text = bro.page_source
print(page_text)

bro.quit()

5. Logging in to QQzone and crawling the home-page data

from selenium import webdriver
from lxml import etree
import time, os

if not os.path.exists('./sources'):
    os.mkdir('./sources')

# Configure the browser
browser = webdriver.Chrome(executable_path=r'F:\chromedriver.exe')

# Open the page
browser.get('https://qzone.qq.com/')
time.sleep(5)

"""
在web應用中常常會遇到frame嵌套頁面的應用,
使用WebDriver每次只能在一個頁面上識別元素,
對於frame嵌套內的頁面上的元素,
直接定位是定位不到的.
這個時候就須要經過switch_to.frame()方法
將當前定位的主體切換到frame裏
"""

# Switch into the iframe with id="login_frame"
browser.switch_to.frame('login_frame')

# Click the element with id="switcher_plogin" (switch to account/password login)
browser.find_element_by_id('switcher_plogin').click()
time.sleep(1)

# 給id="u"的標籤設置值
browser.find_element_by_id("u").send_keys("username")   # 你的用戶名
# time.sleep(1)

# 給id="p"的標籤設置值
browser.find_element_by_id("p").send_keys("password")   # 你的密碼
# time.sleep(3)

# 點擊id="login_button"的標籤設置值
browser.find_element_by_id("login_button").click()
time.sleep(1)

browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
time.sleep(1)
browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
time.sleep(1)

# Get the full page source
page_text = browser.page_source

# Persist to disk
with open('./sources/QQzone.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)

tree = etree.HTML(page_text)

li_list = tree.xpath('//*[@id="feed_friend_list"]/li')

for li in li_list:
    text_list = li.xpath('.//div[@class="f-info"]//text() | .//div[@class="f-info qz_info_cut"]//text()')
    text = ''.join(text_list)
    print(text + '\n\n\n')

browser.close()