python即時標記

時間 2019-11-08

標籤 python 即時標記欄目 Python 简体版

原文原文鏈接

將純文本文件改寫成標記語言格式的文件（如 HTML）

Python 版本：3.6.2

準備文檔 test_input.txt

Welcome to World Wide Spam. Inc.



These are the corporate web pages of *World Wide Spam*,Inc.We hope
you find your stay enjoyable,and that you will sample many of our
products.

A short history if the company

World Wide Spam was started in the summer of 2000.The business
concept was to ride the dot-com wave ande to make money both through
bulk email and by selling canned meat online.

After receiving several complaints from customers who weren't
satisfied by their bulk email.World Wide Spam altered their profile,
and focused 100%on canned goods.Today,they rank as the world's
13,892nd online supplier of SPAM.

Destinations

From this page you may visit several of our intersting web pages:

-What is SPAM?(http://wwspam.fu/whatisspam)

-How do they make it?(http://wwspam.fu/howtomakeit)

-Why should I eat it?(http://wwspam.fu/whyeatif)

How to get in touch with us

You can get in touch with us in *many* ways: By phone (555-1234),by
email (wwspam@wwspam.fu) or by visiting our customer feedback page
(http://wwspam.fu/feedback).

初次實現：找出文本塊

找出塊的一個簡單的方法就是收集行，直到遇到一個空行，就返回之前收集的行，這些返回的行就是一個塊，然後接着收集。不要收集空行，也不需要返回空塊。

文本塊生成器 util.py

#util.py文本塊生成器

def lines(file):
    """Yield every line of *file*, followed by one extra newline line.

    The trailing '\\n' acts as a sentinel blank line so that a consumer
    which flushes on blank lines also flushes the final group of lines.
    """
    yield from file
    yield '\n'
    
def blocks(file):
    """Yield the blocks of *file*: runs of non-blank lines joined together.

    Lines are collected until a blank (whitespace-only) line is seen; the
    collected lines are then joined, stripped of surrounding whitespace and
    yielded as one string. Blank lines are never collected and empty blocks
    are never yielded.
    """
    pending = []
    for line in lines(file):
        if line.strip():
            pending.append(line)
            continue
        # Blank line: flush whatever has been collected so far, if anything.
        if pending:
            yield ''.join(pending).strip()
            pending = []

lines 生成器只是在文件的最後追加一個空行，blocks 生成器基本完成了塊的收集。

向收集到的代表塊的字符串添加標記（例如：頁面開始標記、段落標記）

simple_markup.py

#! /usr/bin/env python
#coding=utf-8
#simple_markup.py

import sys, re
from util import *

# Document preamble. The <head> element is closed before <body> opens so
# that the emitted markup is well formed.
print('<html><head><title>...</title></head><body>')

# The first block read from standard input is rendered as the page title
# (<h1>); every later block is rendered as a paragraph (<p>).
title = True
for block in blocks(sys.stdin):
    # Turn *emphasised* spans into <em> elements (non-greedy match).
    block = re.sub(r'\*(.+?)\*', r'<em>\1</em>', block)
    if title:
        print('<h1>')
        print(block)
        print('</h1>')
        title = False
    else:
        print('<p>')
        print(block)
        print('</p>')

# Close the elements opened by the preamble.
print('</body></html>')

我是在 Windows 下運行的，在控制檯執行：

python mark.py < test_input.txt > test_input.html

接下來進一步實現更具體的功能，對其進行擴展。

handlers.py

#! /usr/bin/env python
#coding=utf-8
#handlers.py

class Handler:
    """
    Object whose methods are invoked by name through callback().

    start() and end() are meant to be called at the beginning and end of
    each block, with the block's type name as argument. sub() is meant for
    regular-expression substitution: called with a name such as
    'emphasis', it returns a suitable replacement function.
    """

    def callback(self, prefix, name, *args):
        """Call the method named ``prefix + name`` with *args*, if present.

        Returns that method's result, or None when no callable attribute
        of that name exists on this object.
        """
        target = getattr(self, prefix + name, None)
        if not callable(target):
            return None
        return target(*args)

    def start(self, name):
        """Invoke the ``start_<name>`` method, if any; returns None."""
        self.callback('start_', name)

    def end(self, name):
        """Invoke the ``end_<name>`` method, if any; returns None."""
        self.callback('end_', name)

    def sub(self, name):
        """Return a replacement function usable as the repl of re.sub().

        The returned function forwards the match object to the
        ``sub_<name>`` method; when that yields None (including when the
        method does not exist) the matched text is returned unchanged.
        """
        def substitution(match):
            replacement = self.callback('sub_', name, match)
            return match.group(0) if replacement is None else replacement
        return substitution
    
class HTMLRenderer(Handler):
    """
    Concrete handler that renders HTML by printing to standard output.

    The methods defined here are reached through the start(), end() and
    sub() methods of the Handler superclass, which look them up by the
    names ``start_<type>``, ``end_<type>`` and ``sub_<name>``. They
    implement the basic tags of an HTML document.
    """

    def start_document(self):
        """Print the document preamble up to the opening <body> tag."""
        print('<html><head><title>...</title></head><body>')

    def end_document(self):
        """Print the closing </body> and </html> tags."""
        print('</body></html>')

    def start_paragraph(self):
        print('<p>')

    def end_paragraph(self):
        print('</p>')

    def start_heading(self):
        print('<h2>')

    def end_heading(self):
        print('</h2>')

    def start_list(self):
        print('<ul>')

    def end_list(self):
        # Must be named end_list: the name is built as 'end_' + 'list'.
        # Defining it under the name end_heading would replace the real
        # end_heading and leave lists without a closing tag.
        print('</ul>')

    def start_listitem(self):
        print('<li>')

    def end_listitem(self):
        print('</li>')

    def start_title(self):
        print('<h1>')

    def end_title(self):
        print('</h1>')

    def sub_emphasis(self, match):
        """Return group 1 of *match* wrapped in an <em> element."""
        return '<em>%s</em>' % match.group(1)

    def sub_url(self, match):
        """Return a hyperlink whose target and text are group 1 of *match*."""
        return '<a href="%s">%s</a>' % (match.group(1), match.group(1))

    def sub_mail(self, match):
        """Return a mailto: hyperlink built from group 1 of *match*."""
        return '<a href="mailto:%s">%s</a>' % (match.group(1), match.group(1))

    def feed(self, data):
        """Print *data* (the block text) as-is."""
        print(data)

rules.py

#! /usr/bin/env python
#coding=utf-8
#rules.py

class Rule:
    """
    Base class for all rules.

    Subclasses are expected to provide a ``type`` attribute, which names
    the kind of block the rule produces.
    """

    def action(self, block, handler):
        """Emit *block* through *handler*, wrapped in start/end of this type.

        Always returns True.
        """
        kind = self.type
        handler.start(kind)
        handler.feed(block)
        handler.end(kind)
        return True
    
class HeadingRule(Rule):
    """
    A heading is a single line of at most 70 characters that does not
    end with a colon.
    """

    type = 'heading'

    def condition(self, block):
        """Return True when *block* satisfies the heading criteria."""
        single_line = '\n' not in block
        return single_line and len(block) <= 70 and block[-1] != ':'
    
class TitleRule(HeadingRule):
    """
    The title is the first block of the document, provided that block
    qualifies as a heading.
    """

    type = 'title'
    # True until condition() has been called once on this instance.
    first = True

    def condition(self, block):
        """Return the heading test for the first block seen, else False."""
        if self.first:
            # Only the very first call can ever match; remember that on
            # the instance so later calls are rejected outright.
            self.first = False
            return HeadingRule.condition(self, block)
        return False
    
class ListItemRule(Rule):
    """
    A list item is a paragraph that begins with a hyphen.

    As part of the formatting, the leading hyphen is removed.
    """

    type = 'listitem'

    def condition(self, block):
        """Return True when *block* starts with a hyphen.

        An empty block is simply not a list item (returns False) instead
        of raising IndexError.
        """
        return block.startswith('-')

    def action(self, block, handler):
        """Emit *block* without its first character through *handler*.

        The text after the leading hyphen is stripped of surrounding
        whitespace before being fed. Always returns True.
        """
        handler.start(self.type)
        handler.feed(block[1:].strip())
        handler.end(self.type)
        return True
    
class ListRule(ListItemRule):
    """
    A list begins between a block that is not a list item and a following
    list item, and ends after the last consecutive list item.
    """

    type = 'list'
    # Whether a list has been started and not yet ended.
    inside = False

    def condition(self, block):
        """Always True: every block is inspected by action()."""
        return True

    def action(self, block, handler):
        """Start or end the list when crossing a list boundary.

        Always returns False.
        """
        is_item = ListItemRule.condition(self, block)
        if is_item and not self.inside:
            # First list item after a non-item block: open the list.
            handler.start(self.type)
            self.inside = True
        elif self.inside and not is_item:
            # First non-item block after a run of items: close the list.
            handler.end(self.type)
            self.inside = False
        return False
    
class ParagraphRule(Rule):
    """
    A paragraph is simply a block that is not covered by any other rule.
    """

    type = 'paragraph'

    def condition(self, block):
        """Accept every block unconditionally."""
        return True

mark.py

#! /usr/bin/env python
#coding=utf-8
#mark.py
#控制檯執行命令   python mark.py <test_input.txt> test_input.html

import sys,re
from handlers import *
from util import *
from rules import *

class Parser:
    """
    Reads a text file, applies rules and filters, and drives a handler.
    """

    def __init__(self, handler):
        """Store *handler* and start with no rules and no filters."""
        self.handler = handler
        self.rules = []
        self.filters = []

    def addRule(self, rule):
        """Append *rule* to the list of rules; rules are tried in order."""
        self.rules.append(rule)

    def addFilter(self, pattern, name):
        """Register a regex filter.

        The stored filter takes (block, handler) and returns *block* with
        every match of *pattern* replaced via ``handler.sub(name)``.
        """
        def apply_filter(block, handler):
            replacer = handler.sub(name)
            return re.sub(pattern, replacer, block)
        self.filters.append(apply_filter)

    def parse(self, file):
        """Process every block of *file* between document start and end.

        Each block is passed through all filters in order, then offered
        to the rules in order; a rule whose condition holds has its action
        run, and a truthy action result stops rule processing for that
        block.
        """
        self.handler.start('document')
        for block in blocks(file):
            for apply_filter in self.filters:
                block = apply_filter(block, self.handler)
            for rule in self.rules:
                if not rule.condition(block):
                    continue
                if rule.action(block, self.handler):
                    break
        self.handler.end('document')
        
class BasicTextParser(Parser):
    """
    Concrete parser that registers its rules and filters in the constructor.
    """

    def __init__(self, handler):
        """Set up the base parser, then add the rules and filters in order."""
        Parser.__init__(self, handler)

        # Order matters: rules are tried in the order they are added.
        rule_classes = (
            ListRule,
            ListItemRule,
            TitleRule,
            HeadingRule,
            ParagraphRule,
        )
        for rule_class in rule_classes:
            self.addRule(rule_class())

        # (pattern, substitution name) pairs, applied in this order.
        filter_specs = (
            (r'\*(.+?)\*', 'emphasis'),
            (r'(http://[\.a-zA-Z/]+)', 'url'),
            (r'([\.a-zA-Z]+@[\.a-zA-Z]+[a-zA-Z]+)', 'mail'),
        )
        for pattern, name in filter_specs:
            self.addFilter(pattern, name)
        
# Script entry point: render the text read from standard input as HTML on
# standard output. Guarded so that importing this module does not start
# parsing (and block waiting on stdin).
if __name__ == '__main__':
    handler = HTMLRenderer()
    parser = BasicTextParser(handler)
    parser.parse(sys.stdin)