python generator

時間 2019-11-29

原文原文鏈接

1 異步IO模型

# Illustrative event-loop skeleton: get_event_loop() and process_event()
# are not defined in this snippet.
loop = get_event_loop()
while True:
    # Take one event from the loop ...
    event = loop.get_event()
    # ... and handle it before asking for the next one.
    process_event(event)

loop 是一個事件集合，程序不斷循環「取出一個事件—處理一個事件」。

一個線程在處理某個事件時可能會阻塞；阻塞時，會將當前的「狀態」保存在 loop 中，然後進入下一次循環，以此類推。

2 事件循環+回調

在事件循環的過程中，如果一個 task 執行完畢，就可以通過 callback 將 result 返回給另一個正在等待處理的 task2。

3 基於python generator的協程異步

python 的 generator 不僅可以按需生成數據，還可以把某件事情先執行一部分，其餘部分等某個事件發生後（callback）再繼續執行，從而實現異步。

3.1 生成器基本語法：
　　通過 (...) 生成器表達式創建生成器

　　通過在函數中使用 yield 關鍵字創建生成器

3.2 生成器中的return:
　　在一個生成器中，如果沒有 return，則默認執行到函數結束時拋出 StopIteration；

　　如果在執行過程中遇到 return，則直接拋出 StopIteration 終止迭代；

　　如果 return 後面帶有一個值（僅 Python 3.3+ 支持；在 Python 2 的生成器中 return 帶值是語法錯誤），這個值會保存在 StopIteration 異常的 value 屬性中，而不會作爲 next()/send() 的返回值。

3.3 生成器中的send(self, value)

　　生成器函數最大的特點是可以接收外部通過 send() 傳入的值，並根據該值計算結果後再通過 yield 返回。

　　首次啓動生成器時必須調用 gen.send(None)：對尚未啓動的生成器，send 的參數必須是 None，否則會拋出 TypeError。另外 gen.next()（Python 3 中爲 next(gen)）等價於 gen.send(None)。

def foo():
    """Running-total generator driven by ``send()``.

    Yields the current total, which starts at 5. Every value passed in with
    ``send()`` is added to the total, the new total is printed, and that
    same total is what the ``send()`` call returns.

    The generator must first be started with ``send(None)`` or ``next()``;
    that call returns the initial 5. The loop never ends on its own.
    """
    num = 5
    while True:
        # Execution pauses here; `s` receives whatever the caller sends next.
        s = yield num
        num = num + s
        # Single-argument parenthesised print is valid on Python 2 and 3.
        print(num)


a = foo()
# a.send(None) (or next(a)) starts the generator and runs it up to the first
# `yield`. At that point `yield num` has handed back the initial value 5, but
# nothing has been assigned to `s` yet; that only happens when a later
# send() resumes the generator.
print(a.send(None))
a.send(100)  # foo prints 105
a.send(100)  # foo prints 205

[out:]
5 # 輸出的初始值
105
205

4 基於generator 生成器調度的crawler

# coding=utf-8
from collections import deque
import requests
import re
import time

# Post ids that Crawler substitutes into the cnblogs article URL.
p_list = [
    7647647,
    7620172,
    7591696,
]


class Crawler(object):
    """Fetch one cnblogs post and print its title link, one stage at a time.

    ``run()`` is a generator, so a scheduler can interleave several crawlers:
    each ``next()`` call performs a single stage of the work.
    """

    # Headers sent with every page request. The two literals are joined with
    # a space so the User-Agent reads "AppleWebKit/537.36 (KHTML, ...".
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36',
    }
    # Title anchor of a post page. Non-greedy so the match stops at the
    # anchor's own closing tag even if more anchors follow on the same line.
    TITLE_RE = re.compile(
        r'<a id="cb_post_title_url" class="postTitle2".*?</a>')

    def __init__(self, p):
        """Build the post URL for the numeric post id *p*."""
        # cnblogs is simply used as the test site here.
        self.url = 'http://www.cnblogs.com/fuzzier/p/%d.html' % p

    def get_html(self):
        """Download ``self.url`` and return the response body as text."""
        html = requests.get(self.url, headers=self.HEADERS).text
        time.sleep(0.5)  # pause after each request, as in the original
        return html

    def parser_html(self, html):
        """Return the title ``<a ...>...</a>`` element found in *html*.

        Returns None when the page contains no title anchor, rather than
        failing with AttributeError on a missing match.
        """
        match = self.TITLE_RE.search(html)
        return match.group() if match else None

    def run(self):
        """Generator: announce, fetch, then parse; one stage per ``next()``.

        Prints the title element, or ``None`` when no title was found.
        """
        print('start crawler ' + self.url)
        yield
        html = self.get_html()
        yield
        result = self.parser_html(html)
        print(result)


class Runner(object):
    """Minimal scheduler that steps generator-based tasks in rotation."""

    def __init__(self, tasks):
        """Queue *tasks*, an iterable of generators (or other iterators)."""
        self.tasks = deque(tasks)

    def my_pop(self):
        """Remove and return the task at the right-hand end of the queue."""
        return self.tasks.pop()

    def run(self):
        """Advance the queued tasks one step at a time until all are done.

        Each pass pops one task and calls ``next()`` on it. A task that
        yields is pushed back on the left end of the queue; a task that
        raises StopIteration is dropped and the number of tasks still
        queued is printed.
        """
        while self.tasks:
            task = self.my_pop()
            try:
                next(task)
            except StopIteration:
                # This task is exhausted: do not re-queue it, just report
                # how many tasks remain.
                print(len(self.tasks))
            else:
                # next() succeeded, so the task is only paused at a yield;
                # keep it in the queue so it gets another turn later.
                self.tasks.appendleft(task)


# Create one run() generator per post id and let the Runner step them.
tasks = [Crawler(post_id).run() for post_id in p_list]
Runner(tasks).run()

[out:]

start crawler http://www.cnblogs.com/fuzzier/p/7591696.html
start crawler http://www.cnblogs.com/fuzzier/p/7620172.html
start crawler http://www.cnblogs.com/fuzzier/p/7647647.html
<a id="cb_post_title_url" class="postTitle2" href="http://www.cnblogs.com/fuzzier/p/7591696.html">Beautifulsoup模塊的一些細節說明</a>
2
<a id="cb_post_title_url" class="postTitle2" href="http://www.cnblogs.com/fuzzier/p/7620172.html">requests源碼框架淺析</a>
1
<a id="cb_post_title_url" class="postTitle2" href="http://www.cnblogs.com/fuzzier/p/7647647.html">flask0.1版本源碼淺析——請求上下文</a>
0

View Code

基於generator的半協程的Crawler

# coding=utf-8
from collections import deque
import requests
import re
import time

# Post ids that Crawler substitutes into the cnblogs article URL.
p_list = [
    7647647,
    7620172,
    7591696,
]


class Crawler(object):
    """Fetch one cnblogs post and hand the HTML to a parser coroutine.

    ``get_html()`` is a generator meant to be stepped by a scheduler;
    ``parser_html()`` is a generator used as a coroutine that waits for the
    page text to be sent to it.
    """

    def __init__(self, p):
        """Build the post URL for post id *p* and create the parser coroutine."""
        self.url = 'http://www.cnblogs.com/fuzzier/p/%d.html' % p
        self.p = self.parser_html()  # acts as a coroutine; started in get_html()

    def get_html(self):
        """Generator: start the parser, announce, then fetch and forward the page.

        The first ``next()`` starts the parser coroutine, prints the GET line
        and pauses. The second ``next()`` downloads the page, sleeps two
        seconds and sends the HTML to the parser, after which the generator
        ends.
        """
        # The two literals are joined with a space so the User-Agent reads
        # "AppleWebKit/537.36 (KHTML, ...".
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                                 '(KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36'}
        self.p.send(None)  # the first send() to a fresh generator must be None
        print('GET ' + self.url)
        yield
        html = requests.get(self.url, headers=headers).text
        time.sleep(2)
        try:
            self.p.send(html)
        except StopIteration:
            # parser_html() returns once it has handled the page, so this
            # send() always ends with StopIteration. Letting that escape from
            # inside a generator is turned into RuntimeError on Python 3.7+
            # (PEP 479), so it is absorbed here and this generator simply
            # finishes normally.
            pass

    def parser_html(self):
        """Coroutine: wait for one HTML string and print its title anchor.

        Nothing is printed when the value sent is empty or when the page
        contains no title anchor.
        """
        html = yield
        if html:
            # Non-greedy so the match stops at the anchor's own closing tag.
            match = re.search(
                r'<a id="cb_post_title_url" class="postTitle2".*?</a>', html)
            if match:
                print(match.group())


class Runner(object):
    """Minimal scheduler that steps generator-based tasks in rotation."""

    def __init__(self, tasks):
        """Queue *tasks*, an iterable of generators (or other iterators)."""
        self.tasks = deque(tasks)

    def my_pop(self):
        """Remove and return the task at the right-hand end of the queue."""
        return self.tasks.pop()

    def run(self):
        """Advance the queued tasks one step at a time until all are done.

        Each pass pops one task and calls ``next()`` on it. A task that
        yields is pushed back on the left end of the queue; a task that
        raises StopIteration is dropped and the number of tasks still
        queued is printed.
        """
        while self.tasks:
            task = self.my_pop()
            try:
                next(task)
            except StopIteration:
                # This task is exhausted: do not re-queue it, just report
                # how many tasks remain.
                print(len(self.tasks))
            else:
                # next() succeeded, so the task is only paused at a yield;
                # keep it in the queue so it gets another turn later.
                self.tasks.appendleft(task)


# Create one get_html() generator per post id and let the Runner step them.
tasks = [Crawler(post_id).get_html() for post_id in p_list]
Runner(tasks).run()

[out:]

GET http://www.cnblogs.com/fuzzier/p/7591696.html
GET http://www.cnblogs.com/fuzzier/p/7620172.html
GET http://www.cnblogs.com/fuzzier/p/7647647.html
<a id="cb_post_title_url" class="postTitle2" href="http://www.cnblogs.com/fuzzier/p/7591696.html">Beautifulsoup模塊的一些細節說明</a>
2
<a id="cb_post_title_url" class="postTitle2" href="http://www.cnblogs.com/fuzzier/p/7620172.html">requests源碼框架淺析</a>
1
<a id="cb_post_title_url" class="postTitle2" href="http://www.cnblogs.com/fuzzier/p/7647647.html">flask0.1版本源碼淺析——請求上下文</a>
0

View Code

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。