Crawler Challenge: GlidedSky Basic Problems

Website: http://glidedsky.com

Problem 1: Sum all the numbers on the page

Register, log in, and you can see the challenge page.

Click through to the page to be crawled and you'll find it is nothing but numbers. This first problem really is simple; there isn't much to explain.

Problem 2: The same task, but across 1000 requests

This problem is essentially the same: the simplest approach is to take the problem-1 code, tweak it a bit, and it will work, but that way it's far too slow. You can try optimizing it yourself, either by adding threads or by using coroutines directly; both work well. I'd guess coroutines should be a bit faster, but I haven't benchmarked it.

Running the directly modified version, without any threads or coroutines, takes quite a while. It's very basic, but it also comes down to how you optimize it.

It could probably be optimized further; I took a lazy shortcut and wrote my version with coroutines.
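
One note before the code: it relies on a self-written tools module whose headers_to_dict helper isn't included in the post. A minimal sketch of what that helper might look like, assuming it simply turns the header block copied from the browser's devtools into a dict (my guess, not the author's actual implementation):

# tools.py (hypothetical) -- the original tools module is not part of the post
def headers_to_dict(raw_headers: str) -> dict:
    """Convert a header block copied from the browser into a dict."""
    headers = {}
    for line in raw_headers.splitlines():
        line = line.strip()
        if not line:
            continue
        # split only on the first colon so values containing ':' (URLs, cookies) stay intact
        key, _, value = line.partition(':')
        headers[key.strip()] = value.strip()
    return headers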

#!/usr/bin/env python 
# -*- coding: utf-8 -*- 
# @Time : 2019/8/18 0:33 
# @Author : zhao.jia
# @Site :  
# @File : glide_test.py 
# @Software: PyCharm

import requests
import tools
from lxml import etree
import aiohttp
import asyncio
import datetime
import time
from requests.adapters import HTTPAdapter


class TestGlidedsky:

    def __init__(self):

        self.headers = """
            Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
            Accept-Encoding: gzip, deflate
            Accept-Language: zh-CN,zh;q=0.9
            Cache-Control: max-age=0
            Connection: keep-alive
            Cookie: _ga=GA1.2.1425271689.1566058842; _gid=GA1.2.586445152.1566058842; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1566058842,1566106841; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=1566129989; _gat_gtag_UA_75859356_3=1; XSRF-TOKEN=eyJpdiI6IjM4SmpWMlwvaWxPQklreFVaMDFXVFhRPT0iLCJ2YWx1ZSI6IjdoMUFJaVF6YUVvUUNDZU1TaERsN0FVK0dRdTdORW9QUlwvNDlMXC9uXC9IdjdCZ2JCQVhiMXNEV2JKQnI5UXVIMHAiLCJtYWMiOiIyMWMyYzc1MzM3MWQyZTMxNDQwZjA5ZTUxNDZkOThmNTAyOWQwYTQzZDQyZTc4M2Q4YjNlZTI3YjYzZjgwNzA1In0%3D; glidedsky_session=eyJpdiI6Ik1rRUMrXC8yMlVkOEZlSEZja24zdmJRPT0iLCJ2YWx1ZSI6IjRoWG84K1MrM3NLbnlRVytrUVRHd1ZqWWtkdkdyeUtwOTBKdDFWTnl4THdkS1hcL2dmRzA1c1JJRDZSaHk2NlhKIiwibWFjIjoiNmQ2MmJhNWFlNzZiOWEwY2NiMDM1ZTBkZGE2MmNiNGQwNWU4OGJmOTU2OWQxNmU2NmM1MjE1ZmI0NGQ3MjllNyJ9
            Host: glidedsky.com
            Referer: http://glidedsky.com/login
            Upgrade-Insecure-Requests: 1
            User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36
        """
        self.sess = requests.session()
        self.sess.headers = tools.headers_to_dict(self.headers)
        self.sum_count_2 = 0
        self.sess.mount('http://', HTTPAdapter(max_retries=3))
        self.sess.mount('https://', HTTPAdapter(max_retries=3))
        self.sess.verify = False


    def basic_one(self):
        sum_count = 0
        res = self.sess.get(url="http://glidedsky.com/level/web/crawler-basic-1")
        res_html = etree.HTML(res.text)
        nums = res_html.xpath('//div[@class="col-md-1"]/text()')
        for num in nums:
            sum_count += int(num.strip())
        print("sum=" + sum_count)

    # Problem 2
    def basic_two(self):
        count = 1
        sum_count = 0
        while True:
            res = self.sess.get(f"http://glidedsky.com/level/web/crawler-basic-2?page={count}")
            res_html = etree.HTML(res.text)
            nums = res_html.xpath('//div[@class="col-md-1"]/text()')
            for num in nums:
                sum_count += int(num.strip())
            count += 1
            if count == 1001:
                break
        print(sum_count)

    async def basic_two_2(self, url):
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=tools.headers_to_dict(self.headers)) as resp:
                res = await resp.text()
                res_html = etree.HTML(res)
                nums = res_html.xpath('//div[@class="col-md-1"]/text()')
                for num in nums:
                    self.sum_count_2 += int(num.strip())

    def sum_async_count(self):
        loop = asyncio.get_event_loop()
        tasks = [asyncio.ensure_future(
            self.basic_two_2(f"http://glidedsky.com/level/web/crawler-basic-2?page={i}")) for i in
                 range(1, 500)]
        loop.run_until_complete(asyncio.gather(*tasks))
        tasks = [asyncio.ensure_future(
            self.basic_two_2(f"http://glidedsky.com/level/web/crawler-basic-2?page={i}")) for i in
            range(500, 1001)]
        loop.run_until_complete(asyncio.gather(*tasks))
        print(self.sum_count_2)


if __name__ == '__main__':
    # Problem 2, sequential version
    # starttime = datetime.datetime.now()
    # TestGlidedsky().basic_two()
    # endtime = datetime.datetime.now()
    # count_time_1 = (endtime - starttime).seconds
    # print(count_time_1)
    # Problem 2, coroutine version
    # starttime_2 = datetime.datetime.now()
    # TestGlidedsky().sum_async_count()
    # endtime_2 = datetime.datetime.now()
    # count_time_2 = (endtime_2 - starttime_2).seconds
    # print(count_time_2)
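
The post mentions that adding threads would also work, but only shows the coroutine version. Below is a minimal thread-pool sketch of the same idea; it is not the author's code, and it assumes the same page structure, the same copied headers string, and the hypothetical headers_to_dict helper sketched earlier. Reusing one requests session across threads generally works for simple GETs like these; for anything more involved, one session per thread is safer.

# Thread-pool variant of basic_two (a sketch, not the author's code)
from concurrent.futures import ThreadPoolExecutor

import requests
from lxml import etree

import tools  # the author's helper module; a possible implementation is sketched above


def fetch_page_sum(session, page):
    """Fetch one page of the challenge and return the sum of the numbers on it."""
    res = session.get(f"http://glidedsky.com/level/web/crawler-basic-2?page={page}")
    nums = etree.HTML(res.text).xpath('//div[@class="col-md-1"]/text()')
    return sum(int(n.strip()) for n in nums)


def basic_two_threads(raw_headers, workers=20):
    session = requests.session()
    session.headers = tools.headers_to_dict(raw_headers)
    with ThreadPoolExecutor(max_workers=workers) as pool:
        page_sums = pool.map(lambda page: fetch_page_sum(session, page), range(1, 1001))
        print(sum(page_sums))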

Problem 3: Still summing numbers,

but this time the site blocks IPs: each IP can only visit once, which makes the problem rather nasty. The only option is to use proxy IPs; free ones are fine, just make sure you retry aggressively.

#!/usr/bin/env python 
# -*- coding: utf-8 -*- 
# @Time : 2019/8/27 11:00 
# @Author : Andrew
# @Site :  
# @File : python-abu.py 
# @Software: PyCharm

from urllib import request
import base64
from lxml import etree
import time
import requests
from requests.adapters import HTTPAdapter


class test:

    def __init__(self):
        self.sess = requests.session()
        self.sess.mount('http://', HTTPAdapter(max_retries=3))
        self.sess.mount('https://', HTTPAdapter(max_retries=3))
        self.sess.verify = False

    def abu_test(self):

        # Proxy server
        proxyHost = "proxy.abuyun.com"
        proxyPort = "9020"

        # Proxy tunnel authentication credentials
        proxyUser = "H2T*****22WD"
        proxyPass = "7****10526D3F"
        proxy_dict = {'http': "http-dyn.abuyun.com:9020"}
        auth = f"{proxyUser}:{proxyPass}"
        auth = base64.b64encode(auth.encode('utf8'))
        proxy_header = {"Proxy-Authorization": 'Basic ' + auth.decode()}
        self.get_html(proxy_dict, proxy_header)

    def get_html(self, proxy_dict, proxy_header):
        count = 1
        sum_count = 0

        headers = """
                Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
                Accept-Encoding: gzip, deflate
                Accept-Language: zh-CN,zh;q=0.9
                Cache-Control: max-age=0
                Cookie: _ga=GA1.2.1251062763.1566609395; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1566609396,1566627265; _gid=GA1.2.1809641921.1566875827; _gat_gtag_UA_75859356_3=1; XSRF-TOKEN=eyJpdiI6IkNpMHk0SHlDSXIrWHU4MTBIaW96blE9PSIsInZhbHVlIjoiMXpzXC9GRmZGekxQYW5wcUt0ZU0xQ0l0MWVnNHdKWHo5XC9JNTRnZ0c0UWJlYjZlaDVhU1BNRGxENGNoWjBpdkE0IiwibWFjIjoiYTVjYmJjMzY3OTNiNTJjMDE5MjZhNmEzNDIwNGFmZDYwYzk5Yjg5ZjViYmExMzQwMjVkMTkzNDcyMmJjZmYxMyJ9; glidedsky_session=eyJpdiI6ImJ4aHA3QllGZE9PTlRnbTByZnNNOFE9PSIsInZhbHVlIjoiMGt6bUdqbDBcL2JSRERXbVFyMEdHNDArZmtOTHdQOFRidVlRUTFvMXRWajAzNUlja3gyN3JmV1U1QkVHUHBVU3UiLCJtYWMiOiI0OTY1ZGZmZDgwMTU4YTliNjM0NWVhZTU5MzRhNGQwYmMwM2YzNDc2ZGRkZjVmZDg0ZjQwMGUwODkyNjUwMmY3In0%3D; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=1566875832
                Host: glidedsky.com
                Proxy-Connection: keep-alive
                Referer: http://glidedsky.com/login
                Upgrade-Insecure-Requests: 1
                User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36
                """
        import tools
        headers = tools.headers_to_dict(headers)
        headers.update(proxy_header)
        # print(headers)
        while True:
            # if count == 37 or count == 38:
            #     continue
            try:
                res = self.sess.get(f"http://glidedsky.com/level/web/crawler-ip-block-1?page={count}", headers=headers,
                                   proxies=proxy_dict, timeout=10)
            except Exception as e:
                print("異常")
                print(e)
                continue
            file_name = f'glidedsky_{count}.html'
            if res.status_code == 200:
                with open(file_name, 'w', encoding='utf8') as f:
                    f.write(res.text)
                res_html = etree.HTML(res.text)
                nums = res_html.xpath('//div[@class="col-md-1"]/text()')
                if nums:
                    print("zhaodao")
                    # with open(file_name, 'w', encoding='utf8') as f:
                    #     f.write(res.text)
                    for num in nums:
                        sum_count += int(num.strip())
                    count += 1
                    print(sum_count)
                    if count == 1001:
                        return sum_count
            # time.sleep(3)

    def parse_html(self):
        count = 1
        sum_count = 0
        while True:
            file_name = f'glidedsky_{count}.html'
            with open(file_name, 'r', encoding='utf8') as f:
                content = f.read()
            res_html = etree.HTML(content)
            nums = res_html.xpath('//div[@class="col-md-1"]/text()')
            if nums:
                for num in nums:
                    sum_count += int(num.strip())

                print("次數綜合", count, sum_count)
                # page 1000 is the last file written by get_html, so stop there
                if count == 1000:
                    break
            else:
                # the saved file will never change, so skip it instead of retrying forever
                print("no content in", file_name)
            count += 1
        print("總和", sum_count)


if __name__ == '__main__':
    # test().abu_test()
    test().parse_html()

Result: (screenshot)
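
The text recommends free proxy IPs plus lots of retries, while the code above actually goes through a paid Abuyun tunnel. Here is a minimal sketch of the free-proxy rotation the text describes; the proxy list below is made up and would have to be filled in from a real free-proxy source, and each page is retried across proxies until the number divs actually show up:

# Free-proxy rotation sketch for crawler-ip-block-1 (not the author's code)
import random

import requests
from lxml import etree

# hypothetical free proxies -- replace with addresses scraped from a free-proxy site
FREE_PROXIES = [
    "http://1.2.3.4:8080",
    "http://5.6.7.8:3128",
]


def fetch_with_free_proxies(page, headers, max_tries=50):
    """Keep trying random free proxies until the page returns its numbers."""
    for _ in range(max_tries):
        proxy = random.choice(FREE_PROXIES)
        try:
            res = requests.get(
                f"http://glidedsky.com/level/web/crawler-ip-block-1?page={page}",
                headers=headers,
                proxies={"http": proxy},
                timeout=10,
            )
        except requests.RequestException:
            continue  # dead or blocked proxy, try another one
        html = etree.HTML(res.text)
        nums = html.xpath('//div[@class="col-md-1"]/text()') if html is not None else []
        if nums:
            return sum(int(n.strip()) for n in nums)
    raise RuntimeError(f"page {page} failed after {max_tries} proxy attempts")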

