改寫《python基礎教程》中的一個例子

時間 2019-12-14

原文原文鏈接

1、前言

初學python，看《python基礎教程》，第20章實現了將文本轉化成html的功能。因爲本人以前DIY過一個markdown轉html的算法，因此對這個例子有興趣。可仔細一看，發現很難看懂，一個功能分散在幾個文件中，各個類的耦合很緊。雖然自己有幾年的c++開發經驗，但初看這個python代碼也覺得頭暈。

2、原版

以下是其源碼

 1 from __future__ import generators
 2 
 3 
 4 def lines(file):
 5     for line in file:
 6         yield line
 7     yield '\n'
 8 
 9 
10 def blocks(file):
11     block = []
12     for line in lines(file):
13         if line.strip():
14             block.append(line)
15         elif block:
16             yield ''.join(block).strip()
17             block = []

util.py

# This Python file uses the following encoding: utf-8
class Rule:
    """
    Common ancestor of every rule.

    The ``type`` attribute read by action() is not defined here; it
    has to be provided by the subclass (or the instance).
    """
    def action(self, block, handler):
        """
        Pass the block to the handler, bracketed by start/end
        notifications carrying this rule's type.

        Always returns True.
        """
        kind = self.type
        handler.start(kind)
        handler.feed(block)
        handler.end(kind)
        return True


class HeadingRule(Rule):
    """
    A heading is a single, non-empty line that is at most 70
    characters long and that doesn't end with a colon.
    """
    type = 'heading'

    def condition(self, block):
        """
        Return True when the block qualifies as a heading.

        An empty block is rejected up front instead of raising
        IndexError on the last-character check.
        """
        if not block:
            return False
        return ('\n' not in block
                and len(block) <= 70
                and not block.endswith(':'))


class TitleRule(HeadingRule):
    """
    The title is the very first block this rule is asked about,
    provided that block also passes the heading test.
    """
    type = 'title'
    first = True  # flipped to False by the first condition() call

    def condition(self, block):
        """
        Apply the heading test to the first block only.

        The first call clears the ``first`` flag and returns the
        heading test's result; every later call returns False.
        """
        if self.first:
            self.first = False
            return HeadingRule.condition(self, block)
        return False


class ListItemRule(Rule):
    """
    A list item is a paragraph that begins with a hyphen. As part of
    the formatting, the hyphen is removed.
    """
    type = 'listitem'

    def condition(self, block):
        """
        Return True when the block starts with a hyphen.

        startswith() is used so that an empty block yields False
        rather than raising IndexError.
        """
        return block.startswith('-')

    def action(self, block, handler):
        """
        Emit the block without its leading hyphen (and without the
        surrounding whitespace), bracketed by start/end
        notifications for this rule's type.

        Returns True, the same value as Rule.action.
        """
        handler.start(self.type)
        handler.feed(block[1:].strip())
        handler.end(self.type)
        return True


# start ListRule {
class ListRule(ListItemRule):
    """
    Tracks the boundaries of a run of consecutive list items.

    The list is opened when a list item follows a block that is not
    one, and closed when the first non-item block follows a list
    item.

    NOTE: closing only happens in action(), i.e. when a later
    non-item block arrives; nothing in this class closes a list that
    is still open when no further block is processed.
    """
    type = 'list'
    inside = False  # True while the most recent block was a list item

    def condition(self, block):
        """
        Always return True, because every block has to be inspected.
        """
        return True

    def action(self, block, handler):
        """
        Notify the handler when a list starts or ends at this block.

        Always returns False so that the remaining rules still get
        to process the block.
        """
        is_item = ListItemRule.condition(self, block)
        if is_item and not self.inside:
            # First item after a non-item block: open the list.
            handler.start(self.type)
            self.inside = True
        elif self.inside and not is_item:
            # First non-item block after a list item: close the list.
            handler.end(self.type)
            self.inside = False
        return False
# end ListRule }


class ParagraphRule(Rule):
    """
    Catch-all rule: its condition accepts every block, so any block
    that reaches it is emitted as a paragraph.
    """
    type = 'paragraph'

    def condition(self, block):
        """
        Accept any block unconditionally.
        """
        return True

rules.py

 1 # start Handler {
 2 class Handler:
 3     """
 4     An object that handles method calls from the Parser.
 5 
 6     The Parser will call the start() and end() methods at the
 7     beginning of each block, with the proper block name as
 8     parameter. The sub() method will be used in regular expression
 9     substitution. When called with a name such as 'emphasis', it will
10     return a proper substitution function.
11     """
12     def callback(self, prefix, name, *args):
13         method = getattr(self, prefix+name, None)
14         if callable(method):
15             return method(*args)
16 
17     def start(self, name):
18         self.callback('start_', name)
19 
20     def end(self, name):
21         self.callback('end_', name)
22 
23     def sub(self, name):
24         return lambda match: \
25                 self.callback('sub_', name, match) or match.group(0)
26 # end Handler }
27 
28 
29 # start HTMLHandler {
30 class HTMLHandler(Handler):
31     """
32     A specific handler used for rendering HTML.
33 
34     The methods in HTMLHandler are accessed from the superclass
35     Handler's start(), end(), and sub() methods. They implement basic
36     markup as used in HTML documents.
37     """
38     def start_document(self):
39         print '<html><head><title>...</title></head><body>'
40 
41     def end_document(self):
42         print '</body></html>'
43 
44     def start_paragraph(self):
45         print '<p>'
46 
47     def end_paragraph(self):
48         print '</p>'
49 
50     def start_title(self):
51         print '<h1>'
52 
53     def end_title(self):
54         print '</h1>'
55 
56     def start_heading(self):
57         print '<h2>'
58 
59     def end_heading(self):
60         print '</h2>'
61 
62     def start_list(self):
63         print '<ul>'
64 
65     def end_list(self):
66         print '</ul>'
67 
68     def start_listitem(self):
69         print '<li>'
70 
71     def end_listitem(self):
72         print '</li>'
73 
74     def sub_emphasis(self, match):
75         return '<em>%s</em>' % match.group(1)
76 
77     def sub_url(self, match):
78         return '<a href="%s">%s</a>' % (match.group(1), match.group(1))
79 
80     def sub_mail(self, match):
81         return '<a href="mailto:%s">%s</a>' % (match.group(1), match.group(1))
82 
83     def feed(self, data):
84         print data
85 
86 # end HTMLHandler }

handles.py

 1 import sys
 2 import re
 3 from handlers import *
 4 from util import *
 5 from rules import *
 6 
 7 
 8 # start Parser {
 9 class Parser:
10     """
11     A Parser reads a text file, applying rules and controlling a
12     handler.
13     """
14     def __init__(self, handler):
15         self.handler = handler
16         self.rules = []
17         self.filters = []
18 
19     def addRule(self, rule):
20         self.rules.append(rule)
21 
22     def addFilter(self, pattern, name):
23         def filter(block, handler):
24             return re.sub(pattern, handler.sub(name), block)
25         self.filters.append(filter)
26 
27     def parse(self, file):
28         self.handler.start('document')
29 
30         for block in blocks(file):
31             for filter in self.filters:
32                 block = filter(block, self.handler)
33 
34             for rule in self.rules:
35                 if rule.condition(block):
36                     last = rule.action(block, self.handler)
37                     if last:
38                         break
39         self.handler.end('document')
40 # end Parser }
41 
42 
43 # start BaseTextParser {
44 class BasicTextParser(Parser):
45     """
46     A specific Parser that adds rules and filters in its
47     constructor.
48     """
49     def __init__(self, handler):
50         Parser.__init__(self, handler)
51         self.addRule(ListRule())
52         self.addRule(ListItemRule())
53         self.addRule(TitleRule())
54         self.addRule(HeadingRule())
55         self.addRule(ParagraphRule())
56 
57         self.addFilter(r'\*(.+?)\*', 'emphasis')
58         self.addFilter(r'(http://[\.a-zA-Z/]+)', 'url')
59         self.addFilter(r'([\.a-zA-Z]+@[\.a-zA-Z]+[a-zA-Z]+)', 'mail')
60 # end BaseTextParser }
61 
62 handler = HTMLHandler()
63 parser = BasicTextParser(handler)
64 
65 parser.parse(sys.stdin)

markup.py

文本如下

Welcome to World Wide Spam, Inc.


These are the corporate web pages of *World Wide Spam*, Inc. We hope
you find your stay enjoyable, and that you will sample many of our
products.

A short history of the company

World Wide Spam was started in the summer of 2000. The business
concept was to ride the dot-com wave and to make money both through
bulk email and by selling canned meat online.

After receiving several complaints from customers who weren't
satisfied by their bulk email, World Wide Spam altered their profile,
and focused 100% on canned goods. Today, they rank as the world's
13,892nd online supplier of SPAM.

Destinations

From this page you may visit several of our interesting web pages:

  - What is SPAM? (http://wwspam.fu/whatisspam)

  - How do they make it? (http://wwspam.fu/howtomakeit)

  - Why should I eat it? (http://wwspam.fu/whyeatit)

How to get in touch with us

You can get in touch with us in *many* ways: By phone (555-1234), by
email (wwspam@wwspam.fu) or by visiting our customer feedback page
(http://wwspam.fu/feedback).

test_input.txt

使用命令行 python markup.py < test_input.txt > out.html 即可將文件轉化爲有格式的html文件

上面代碼有幾點不足之處:

rules.py代碼和handles.py代碼緊密耦合，rules.py,handles.py一起來實現根據規則來生成轉化文本。rules.py中各種rule中定義了'heading', 'listitem'等，而handles.py中有各種start_heading(), end_heading()來響應對應的類型方法。
對文本中特殊格式進行轉化的Filter功能分佈在markup.py和handles.py中。markup.py 57-59行中定義了匹配模式，而替換的方法又在handles.py 74-81行。
...

3、改進

下面是本人改進後的代碼

 1 from __future__ import generators
 2 
 3 
 4 def lines(file):
 5     for line in file:
 6         yield line
 7     yield '\n'
 8 
 9 
10 def lines2(file):
11     for line in file:
12         s = line.strip()
13         if s:
14             yield s
15     yield '\n'
16 
17 
18 def blocks(file):
19     block = []
20     for line in lines(file):
21         if line.strip():
22             block.append(line)
23         elif block:
24             yield ''.join(block).strip()
25             block = []

util.py

 1 import re
 2 
 3 
 4 def createFilter(pattern, fun):
 5     def filter(line):
 6         return re.sub(pattern, fun, line)
 7     return filter
 8 
 9 
10 def filterEm():
11     def subEm(match):
12         return '<em>%s</em>' % match.group(1)
13     return createFilter(r'\*(.+?)\*', subEm)
14 
15 
16 def filterUrl():
17     def subUrl(match):
18         return '<a href="%s">%s</a>' % (match.group(1), match.group(1))
19     return createFilter(r'(http://[\.a-zA-Z/]+)', subUrl)
20 
21 
22 def filterMail():
23     def subMail(match):
24         return '<a href="mailto:%s">%s</a>' % (match.group(1), match.group(1))
25     return createFilter(r'([\.a-zA-Z]+@[\.a-zA-Z]+[a-zA-Z]+)', subMail)
26 
27 
28 def createFilters():
29     filters = []
30     filters.append(filterEm())
31     filters.append(filterUrl())
32     filters.append(filterMail())
33     return filters

filters.py

  1 # This Python file uses the following encoding: utf-8
  2 class Rule:
  3     def action(self, line):
  4         self.start(line)
  5         self.feed(line)
  6         self.end(line)
  7         return True
  8 
  9     def start(self, line):
 10         pass
 11 
 12     def end(self, line):
 13         pass
 14 
 15     def feed(self, line):
 16         print line
 17 
 18     def endDoc(self):
 19         pass
 20 
 21 
 22 class HeadingRule(Rule):  # {{{
 23     def condition(self, line):
 24         return '\n' not in line and len(line) <= 30 and not line[-1] == ':'
 25 
 26     def start(self, line):
 27         print '<h2>'
 28 
 29     def end(self, line):
 30         print '</h2>'
 31 
 32 
 33 class TitleRule(HeadingRule):
 34     first = True
 35 
 36     def condition(self, line):
 37         if not self.first:
 38             return False
 39         self.first = False
 40         return HeadingRule.condition(self, line)
 41 
 42     def start(self, line):
 43         print '<h1>'
 44 
 45     def end(self, line):
 46         print '</h1>'  # }}}
 47 
 48 
 49 class ListItemRule(Rule):  # {{{
 50     def condition(self, line):
 51         return line[0] == '-'
 52 
 53     def feed(self, line):
 54         print line[1:].strip()
 55 
 56     def start(self, line):
 57         print '<li>'
 58 
 59     def end(self, line):
 60         print '</li>'
 61 
 62 
 63 class ListRule(ListItemRule):
 64     inside = False
 65     firstIn = False
 66     firstOut = False
 67 
 68     def condition(self, line):
 69         return True
 70 
 71     def action(self, line):
 72         if not self.inside and ListItemRule.condition(self, line):
 73             self.start(line)
 74             self.inside = True
 75         elif self.inside and not ListItemRule.condition(self, line):
 76             self.end(line)
 77             self.inside = False
 78         return False
 79 
 80     def start(self, line):
 81         print '<ul>'
 82 
 83     def end(self, line):
 84         print '</ul>'
 85 
 86     def feed(self, line):
 87         pass  # }}}
 88 
 89 
 90 class ParagraphRule(Rule):
 91 
 92     def condition(self, line):
 93         return True
 94 
 95     def start(self, line):
 96         print '<p>'
 97 
 98     def end(self, line):
 99         print '</p>'
100 
101 
102 class DocumentRule(Rule):
103     first = True
104     isStart = False
105 
106     def condition(self, line):
107         if self.first:
108             self.first = False
109             self.isStart = True
110             return True
111         return False
112 
113     def action(self, line):
114         if self.isStart:
115             self.start(line)
116             self.isStart = False
117         return False
118 
119     def start(self, line):
120         print '<html><head><title>...</title></head><body>'
121 
122     def end(self, line):
123         print '</body></html>'
124 
125     def endDoc(self):
126         self.end('')

rules.py

 1 # This Python file uses the following encoding: utf-8
 2 from util import *
 3 from rules import *
 4 import re
 5 import sys
 6 
 7 
 8 class MyParser:
 9     def __init__(self):
10         self.rules = []
11         self.filters = []
12 
13     def addRule(self, rule):
14         self.rules.append(rule)
15 
16     def setFilters(self, filters):
17         self.filters = filters
18 
19     def parse(self, file):
20         for line in lines2(file):
21 
22             for filter in self.filters:
23                 line = filter(line)
24 
25             for rule in self.rules:
26                 if rule.condition(line):
27                     last = rule.action(line)
28                     if last:
29                         break
30 
31         # 文檔結束後調用，以處理收尾工做
32         for rule in self.rules:
33             rule.endDoc()

parsers.py

 1 from parsers import *
 2 from util import *
 3 from rules import *
 4 from filters import *
 5 import sys
 6 
 7 
 8 p = MyParser()
 9 p.addRule(DocumentRule())
10 p.addRule(ListRule())
11 p.addRule(ListItemRule())
12 p.addRule(TitleRule())
13 p.addRule(HeadingRule())
14 p.addRule(ParagraphRule())
15 p.setFilters(createFilters())
16 
17 p.parse(sys.stdin)

main.py

使用命令 python main.py < test_input.txt > out.html 運行

有以下幾點改動:

rules和handles功能合在一塊兒都放在rules.py中實現。
將Filter都放在filters.py中，並且可以看到匹配模式和替換函數寫在一起，文本過濾這個功能容易一眼就看出如何實現。
添加了一個DocumentRule規則用來處理文檔的開始和結束。並且在parsers.py 32行循環調用每一個rule類的endDoc()用以文檔結束時的處理。當然現在只有DocumentRule類纔會響應這個調用。
util.py 中添加了lines2()函數，並且在parsers.py中使用這個函數來讀取文本行。

最後，代碼應該寫得容易讓人看得懂 (尤其是在一本初始教程中)。

ps: 本人接下來將用上面的框架用python寫個markdown轉html的算法，然後再將代碼轉化成c++代碼。最後完善自己的筆記軟件並且用Qt寫個跨windows/mac平臺的markdown的編輯器。