網站:http://glidedsky.com
第一題:計算網頁上全部數字的和
註冊進去就看到了html
點進去待爬取的網站,發現全是數字……這個第一題確實簡單,沒啥可講的。
第二題:一樣題,請求1000次
這個題也是同樣的,最簡單的做法就是把上面寫的代碼改改就可以了,不過這樣的話速度太慢,可以試着自己優化一下,加線程或者直接使用協程,都是很不錯的。當然我以爲協程應該能更快一點,沒做具體測試。
運行的結果:這樣是直接改的,沒有加任何線程或協程,時間有點長。很基礎,但是也要看你怎麼優化了。
應該還可以更優化,我寫的代碼偷懶了,是使用協程寫的。
#!/usr/bin/env python # -*- coding: utf-8 -*- # @Time : 2019/8/18 0:33 # @Author : zhao.jia # @Site : # @File : glide_test.py # @Software: PyCharm import requests import tools from lxml import etree import aiohttp import asyncio import datetime import time from requests.adapters import HTTPAdapter class TestGlidedsky: def __init__(self): self.headers = """ Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3 Accept-Encoding: gzip, deflate Accept-Language: zh-CN,zh;q=0.9 Cache-Control: max-age=0 Connection: keep-alive Cookie: _ga=GA1.2.1425271689.1566058842; _gid=GA1.2.586445152.1566058842; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1566058842,1566106841; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=1566129989; _gat_gtag_UA_75859356_3=1; XSRF-TOKEN=eyJpdiI6IjM4SmpWMlwvaWxPQklreFVaMDFXVFhRPT0iLCJ2YWx1ZSI6IjdoMUFJaVF6YUVvUUNDZU1TaERsN0FVK0dRdTdORW9QUlwvNDlMXC9uXC9IdjdCZ2JCQVhiMXNEV2JKQnI5UXVIMHAiLCJtYWMiOiIyMWMyYzc1MzM3MWQyZTMxNDQwZjA5ZTUxNDZkOThmNTAyOWQwYTQzZDQyZTc4M2Q4YjNlZTI3YjYzZjgwNzA1In0%3D; glidedsky_session=eyJpdiI6Ik1rRUMrXC8yMlVkOEZlSEZja24zdmJRPT0iLCJ2YWx1ZSI6IjRoWG84K1MrM3NLbnlRVytrUVRHd1ZqWWtkdkdyeUtwOTBKdDFWTnl4THdkS1hcL2dmRzA1c1JJRDZSaHk2NlhKIiwibWFjIjoiNmQ2MmJhNWFlNzZiOWEwY2NiMDM1ZTBkZGE2MmNiNGQwNWU4OGJmOTU2OWQxNmU2NmM1MjE1ZmI0NGQ3MjllNyJ9 Host: glidedsky.com Referer: http://glidedsky.com/login Upgrade-Insecure-Requests: 1 User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36 """ self.sess = requests.session() self.sess.headers = tools.headers_to_dict(self.headers) self.sum_count_2 = 0 self.sess.mount('http://', HTTPAdapter(max_retries=3)) self.sess.mount('https://', HTTPAdapter(max_retries=3)) self.sess.verify = False def basic_one(self): sum_count = 0 res = self.sess.get(url="http://glidedsky.com/level/web/crawler-basic-1") res_html = etree.HTML(res.text) nums = res_html.xpath('//div[@class="col-md-1"]/text()') 
for num in nums: sum_count += int(num.strip()) print("sum=" + sum_count) # 第二題 def basic_two(self): count = 1 sum_count = 0 while True: res = self.sess.get(f"http://glidedsky.com/level/web/crawler-basic-2?page={count}") res_html = etree.HTML(res.text) nums = res_html.xpath('//div[@class="col-md-1"]/text()') for num in nums: sum_count += int(num.strip()) count += 1 if count == 1001: break print(sum_count) async def basic_two_2(self, url): async with aiohttp.ClientSession() as session: async with session.get(url, headers=tools.headers_to_dict(self.headers)) as resp: res = await resp.text() res_html = etree.HTML(res) nums = res_html.xpath('//div[@class="col-md-1"]/text()') for num in nums: self.sum_count_2 += int(num.strip()) def sum_async_count(self): loop = asyncio.get_event_loop() tasks = [asyncio.ensure_future( self.basic_two_2(f"http://glidedsky.com/level/web/crawler-basic-2?page={i}")) for i in range(1, 500)] loop.run_until_complete(asyncio.gather(*tasks)) tasks = [asyncio.ensure_future( self.basic_two_2(f"http://glidedsky.com/level/web/crawler-basic-2?page={i}")) for i in range(500, 1001)] loop.run_until_complete(asyncio.gather(*tasks)) print(self.sum_count_2) if __name__ == '__main__': # 第二題 # starttime = datetime.datetime.now() # TestGlidedsky().basic_two() # endtime = datetime.datetime.now() # count_time_1 = (endtime - starttime).seconds # print(count_time_1) # 第二題 # starttime_2 = datetime.datetime.now() # TestGlidedsky().sum_async_count() # endtime_2 = datetime.datetime.now() # count_time_2 = (endtime_2 - starttime_2).seconds # print(count_time_2)
第三題:仍是求和,
不過此次封禁 IP,每個 IP 只能訪問一次,這個題就有點噁心了,只能去找代理 IP 了,找免費的就行,想辦法多重試。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/8/27 11:00
# @Author  : Andrew
# @Site    :
# @File    : python-abu.py
# @Software: PyCharm
from urllib import request
import base64
from lxml import etree
import time
import requests
from requests.adapters import HTTPAdapter


class test:
    """Solver for challenge 3 (IP block): fetch each page through a rotating
    proxy, cache the HTML to disk, then sum the numbers from the cached files."""

    def __init__(self):
        self.sess = requests.session()
        # Retry transient connection failures up to 3 times per scheme.
        self.sess.mount('http://', HTTPAdapter(max_retries=3))
        self.sess.mount('https://', HTTPAdapter(max_retries=3))
        self.sess.verify = False

    def abu_test(self):
        """Build the Abuyun dynamic-proxy settings and start crawling."""
        # Proxy server (Abuyun dynamic tunnel)
        proxyHost = "proxy.abuyun.com"
        proxyPort = "9020"
        # Proxy tunnel credentials (redacted in the published article)
        proxyUser = "H2T*****22WD"
        proxyPass = "7****10526D3F"
        proxy_dict = {'http': "http-dyn.abuyun.com:9020"}
        # Basic auth is sent in a Proxy-Authorization header rather than in the URL.
        auth = f"{proxyUser}:{proxyPass}"
        auth = base64.b64encode(auth.encode('utf8'))
        proxy_header = {"Proxy-Authorization": 'Basic ' + auth.decode()}
        self.get_html(proxy_dict, proxy_header)

    def get_html(self, proxy_dict, proxy_header):
        """Fetch pages 1..1000 through the proxy, caching each page to
        glidedsky_<n>.html. A page is retried (new proxy exit IP each time)
        until it yields numbers; returns the running sum.

        :param proxy_dict: requests-style ``proxies`` mapping.
        :param proxy_header: dict with the Proxy-Authorization header.
        """
        count = 1
        sum_count = 0
        headers = """
            Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
            Accept-Encoding: gzip, deflate
            Accept-Language: zh-CN,zh;q=0.9
            Cache-Control: max-age=0
            Cookie: _ga=GA1.2.1251062763.1566609395; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1566609396,1566627265; _gid=GA1.2.1809641921.1566875827; _gat_gtag_UA_75859356_3=1; XSRF-TOKEN=eyJpdiI6IkNpMHk0SHlDSXIrWHU4MTBIaW96blE9PSIsInZhbHVlIjoiMXpzXC9GRmZGekxQYW5wcUt0ZU0xQ0l0MWVnNHdKWHo5XC9JNTRnZ0c0UWJlYjZlaDVhU1BNRGxENGNoWjBpdkE0IiwibWFjIjoiYTVjYmJjMzY3OTNiNTJjMDE5MjZhNmEzNDIwNGFmZDYwYzk5Yjg5ZjViYmExMzQwMjVkMTkzNDcyMmJjZmYxMyJ9; glidedsky_session=eyJpdiI6ImJ4aHA3QllGZE9PTlRnbTByZnNNOFE9PSIsInZhbHVlIjoiMGt6bUdqbDBcL2JSRERXbVFyMEdHNDArZmtOTHdQOFRidVlRUTFvMXRWajAzNUlja3gyN3JmV1U1QkVHUHBVU3UiLCJtYWMiOiI0OTY1ZGZmZDgwMTU4YTliNjM0NWVhZTU5MzRhNGQwYmMwM2YzNDc2ZGRkZjVmZDg0ZjQwMGUwODkyNjUwMmY3In0%3D; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=1566875832
            Host: glidedsky.com
            Proxy-Connection: keep-alive
            Referer: http://glidedsky.com/login
            Upgrade-Insecure-Requests: 1
            User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36
        """
        import tools
        headers = tools.headers_to_dict(headers)
        headers.update(proxy_header)
        while True:
            try:
                res = self.sess.get(
                    f"http://glidedsky.com/level/web/crawler-ip-block-1?page={count}",
                    headers=headers, proxies=proxy_dict, timeout=10)
            except Exception as e:
                # Proxy exit IPs are flaky; log and retry the same page.
                print("異常")
                print(e)
                continue
            file_name = f'glidedsky_{count}.html'
            if res.status_code == 200:
                # Cache the raw HTML so parse_html() can recompute offline.
                with open(file_name, 'w', encoding='utf8') as f:
                    f.write(res.text)
                res_html = etree.HTML(res.text)
                nums = res_html.xpath('//div[@class="col-md-1"]/text()')
                if nums:
                    print("zhaodao")
                    for num in nums:
                        sum_count += int(num.strip())
                    # Only advance to the next page once this one yielded numbers.
                    count += 1
                    print(sum_count)
                    if count == 1001:
                        return sum_count

    def parse_html(self):
        """Sum the numbers from the locally cached glidedsky_<n>.html files (1..1000)."""
        sum_count = 0
        # BUG FIX: the original `while True` loop (a) `continue`d without
        # advancing `count` when a page had no numbers — an infinite loop —
        # and (b) only stopped at count == 1001, but get_html() writes files
        # 1..1000, so it crashed with FileNotFoundError on glidedsky_1001.html.
        # A bounded range fixes both.
        for count in range(1, 1001):
            file_name = f'glidedsky_{count}.html'
            with open(file_name, 'r', encoding='utf8') as f:
                content = f.read()
            res_html = etree.HTML(content)
            nums = res_html.xpath('//div[@class="col-md-1"]/text()')
            if nums:
                for num in nums:
                    sum_count += int(num.strip())
                print("次數綜合", count, sum_count)
            else:
                print("沒有內容", file_name)
        print("總和", sum_count)


if __name__ == '__main__':
    # test().abu_test()
    test().parse_html()
結果:
本篇文章由一文多發平臺ArtiPub自動發佈