Pyppeteer 模塊

Pyppeteer 模塊:

安裝:

pip3 install pyppeteer 
pip install tushare --upgrade

案例:

import asyncio
from pyppeteer import launch
from lxml import etree

async def main():
    """Load the JS-rendered quotes page and print how many quotes it has.

    A browser is launched, the page is opened in a new tab, and the rendered
    HTML is parsed with lxml; the number of ``div.quote`` nodes is printed.
    The browser is closed even when navigation or parsing fails.
    """
    browser = await launch()  # start a new browser instance
    try:
        page = await browser.newPage()  # open a new tab in the browser
        # Navigate the tab to the URL (the counterpart of selenium's get()).
        await page.goto('http://quotes.toscrape.com/js/')
        page_text = await page.content()  # HTML source of the loaded page
        tree = etree.HTML(page_text)
        div_list = tree.xpath('//div[@class="quote"]')
        print(len(div_list))
    finally:
        # Always shut the browser down so a failure above does not leave
        # an orphaned browser process behind.
        await browser.close()

# Create the event loop explicitly: asyncio.get_event_loop() is deprecated
# when no loop is running and raises RuntimeError on Python 3.14+.
# The loop is intentionally left open, since pyppeteer's exit-time cleanup
# may still need it.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(main())

執行js程序:

import asyncio
from pyppeteer import launch
# Viewport dimensions passed to page.setViewport() in main() below.
width = 1366
height = 768

async def main():
    """Open the douban ranking page, scroll to the bottom via JS, print the result.

    Runs a visible (non-headless) browser, sizes the viewport from the
    module-level ``width``/``height``, executes a scroll script with
    ``page.evaluate`` and prints whatever that call returns. The browser is
    closed even if a step fails.
    """
    browser = await launch(headless=False)
    try:
        page = await browser.newPage()
        await page.setViewport({'width': width, 'height': height})
        await page.goto('https://movie.douban.com/typerank?type_name=%E5%8A%A8%E4%BD%9C&type=5&interval_id=100:90&action=')
        await asyncio.sleep(3)

        # evaluate() hands back the return value of the executed JS.
        # NOTE(review): window.scrollTo() presumably returns undefined, so
        # this is expected to print None.
        dimensions = await page.evaluate('window.scrollTo(0,document.body.scrollHeight)')
        await asyncio.sleep(3)
        print(dimensions)
    finally:
        # Close the browser on every path so a failed step cannot leak it.
        await browser.close()
 
# Create the event loop explicitly: asyncio.get_event_loop() is deprecated
# when no loop is running and raises RuntimeError on Python 3.14+.
# The loop is intentionally left open, since pyppeteer's exit-time cleanup
# may still need it.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(main())

規避檢測:

import asyncio
from pyppeteer import launch

async def main():
    """Open the Taobao login page with ``navigator.webdriver`` masked.

    The override script is registered with ``evaluateOnNewDocument`` *before*
    navigating, so it is installed ahead of the page's own scripts. Injecting
    it with ``evaluate`` after ``goto`` (as before) only takes effect once the
    page has already loaded. The browser stays open for ten seconds and is
    then closed, even if a step fails.
    """
    browser = await launch(headless=False, args=['--disable-infobars'])
    try:
        page = await browser.newPage()
        # Must be registered before goto() so it applies to the login page.
        await page.evaluateOnNewDocument('''() =>{ Object.defineProperties(navigator,{ webdriver:{ get: () => false } }) }''')
        await page.goto('https://login.taobao.com/member/login.jhtml?redirectURL=https://www.taobao.com/')
        await asyncio.sleep(10)
    finally:
        # The original never closed the browser; release it explicitly.
        await browser.close()

# Create the event loop explicitly: asyncio.get_event_loop() is deprecated
# when no loop is running and raises RuntimeError on Python 3.14+.
# The loop is intentionally left open, since pyppeteer's exit-time cleanup
# may still need it.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(main())

UA僞裝:

# Set the User-Agent string used by this page. 'xxx' is presumably a
# placeholder for a real UA string. This is a fragment: the await must run
# inside a coroutine of an object that holds the page as self.page.
await self.page.setUserAgent('xxx')

節點交互:
import asyncio
from pyppeteer import launch
async def main():
    """Search Baidu by typing into the page, then click the fourth result tab.

    Demonstrates node interaction: typing into an input, clicking a button
    by selector, and clicking an element handle obtained from
    ``querySelectorAll``. The browser is closed even if a step fails.
    """
    # headless=False runs the browser with a visible window.
    browser = await launch(headless=False)
    try:
        page = await browser.newPage()
        # Set the size of the page viewport.
        await page.setViewport(viewport={'width': 1280, 'height': 800})

        await page.goto('https://www.baidu.com/')
        # Node interaction: type the query (with a per-keystroke delay),
        # then press the search button.
        await page.type('#kw','周杰倫',{'delay': 1000})
        await asyncio.sleep(3)
        await page.click('#su')
        await asyncio.sleep(3)
        # Select the tab links with a CSS selector and click the fourth one.
        alist = await page.querySelectorAll('.s_tab_inner > a')
        if len(alist) > 3:
            await alist[3].click()
            await asyncio.sleep(3)
        else:
            # The page layout may differ; avoid an IndexError on alist[3].
            print('expected at least 4 tab links, found', len(alist))
    finally:
        # Close the browser on every path so a failed step cannot leak it.
        await browser.close()
    
# Create the event loop explicitly: asyncio.get_event_loop() is deprecated
# when no loop is running and raises RuntimeError on Python 3.14+.
# The loop is intentionally left open, since pyppeteer's exit-time cleanup
# may still need it.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(main())

爬取頭條/網易:

import asyncio
from pyppeteer import launch
from lxml import etree

async def _fetch_scrolled_html(browser, url, settle_seconds=0):
    """Open *url* in a new tab, scroll to the bottom, and return the page HTML.

    If *settle_seconds* is non-zero, wait that long after scrolling before
    reading the page content.
    """
    page = await browser.newPage()
    # Set the size of the page viewport.
    await page.setViewport(viewport={'width': 1280, 'height': 800})
    await page.goto(url)
    await page.evaluate('window.scrollTo(0,document.body.scrollHeight)')
    if settle_seconds:
        await asyncio.sleep(settle_seconds)
    return await page.content()


async def main():
    """Fetch the Toutiao and NetEase news pages and return their HTML.

    Returns:
        A dict with keys ``'wangyi'`` (HTML of the 163.com domestic news
        page) and ``'toutiao'`` (HTML of the Toutiao front page).

    The browser is closed even if fetching either page fails.
    """
    # headless=False runs the browser with a visible window.
    browser = await launch(headless=False)
    try:
        # Toutiao gets a 2-second pause after scrolling; the 163.com page is
        # read immediately after its scroll, as before.
        page_text = await _fetch_scrolled_html(
            browser, 'https://www.toutiao.com/', settle_seconds=2)
        page_text1 = await _fetch_scrolled_html(
            browser, 'https://news.163.com/domestic/')
    finally:
        # Close the browser on every path so a failed fetch cannot leak it.
        await browser.close()

    return {'wangyi':page_text1,'toutiao':page_text}

def parse(task):
    """Done-callback: extract and print headlines from both fetched pages.

    Args:
        task: a finished task whose ``result()`` is a dict with the keys
            ``'wangyi'`` and ``'toutiao'``, each holding page HTML.

    For each site the number of matched nodes is printed, followed by one
    line per headline. Nodes without any title text are skipped instead of
    raising IndexError.
    """
    content_dic = task.result()
    wangyi = content_dic['wangyi']
    toutiao = content_dic['toutiao']

    tree = etree.HTML(toutiao)
    a_list = tree.xpath('//div[@class="title-box"]/a')
    print("頭條新聞爬取數量: ", len(a_list))
    for a in a_list:
        texts = a.xpath('./text()')
        if texts:  # an anchor may have no direct text node
            print('toutiao:', texts[0])

    tree = etree.HTML(wangyi)
    div_list = tree.xpath('//div[@class="data_row news_article clearfix "]')
    print("網易新聞爬取數量: ", len(div_list))
    for div in div_list:
        texts = div.xpath('.//div[@class="news_title"]/h3/a/text()')
        if texts:  # a row may lack a title link
            print('wangyi:', texts[0])
 
# Create the event loop explicitly: scheduling a coroutine with
# asyncio.ensure_future() / asyncio.get_event_loop() while no loop is running
# relies on a deprecated implicit loop and fails on Python 3.14+.
# The loop is intentionally left open, since pyppeteer's exit-time cleanup
# may still need it.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)

tasks = []
task1 = loop.create_task(main())
# parse() runs as soon as main() finishes and receives the finished task.
task1.add_done_callback(parse)
tasks.append(task1)

loop.run_until_complete(asyncio.wait(tasks))
相關文章
相關標籤/搜索