Markup添加標記

本文內容來自《Python基礎教程(第二版)》上的項目。

Markup要做的就是為純文本添加一些格式:以一個文本文件作為輸入,然後在瀏覽器中查看輸出的結果,或者直接檢查新增的標籤。

首先,我們要做的是將文本分塊:不斷地讀入行,直到遇到空行,再將前面的所有行合在一起算作一個塊。

util.py 包含兩個工具性的函數。第一個函數 lines 只是在文件的末尾加了一個空行,目的是讓 blocks 函數能找到最後一個塊的結束標誌。

def lines(file):
	'''
	Yield every line of file unchanged, followed by one extra newline.

	The trailing blank line is a sentinel: a consumer that splits on
	blank lines will see the last block terminated even when the input
	does not end with an empty line.
	'''
	for text in file:
		yield text
	# Sentinel blank line marking the end of the final block.
	yield '\n'

def blocks(file):
	'''
	Yield the blocks of file, one string per block.

	A block is a run of consecutive lines that contain something other
	than whitespace. The lines are concatenated and the result is
	stripped of leading and trailing whitespace. Blank lines separate
	blocks and are never part of the output.
	'''
	pending = []
	for line in lines(file):
		if not line.strip():
			# Blank line: flush whatever has been collected so far.
			if pending:
				yield ''.join(pending).strip()
				pending = []
			continue
		pending.append(line)

後面的 handlers、rules 和主程序 markup 我不太好解釋,可能是自己理解上還不到位。

handlers.py 用來處理具體的標記添加和文本替換。

class Handler:
	'''
	An object that handles method calls from the Parser.

	start(name) and end(name) dispatch to a method called start_<name>
	or end_<name> on the instance, if one exists; names without a
	matching method are silently ignored. sub(name) returns a function
	usable as the replacement argument of re.sub, backed by a
	sub_<name> method.
	'''
	def callback(self, prefix, name, *args):
		'''
		Call the method named prefix + name with args, if it exists.

		Returns the result of that method, or None when the instance has
		no callable attribute of that name.
		'''
		method = getattr(self, prefix + name, None)
		if callable(method):
			return method(*args)
		return None

	def start(self, name):
		'''Dispatch to start_<name>; the result is discarded.'''
		self.callback('start_', name)

	def end(self, name):
		'''Dispatch to end_<name>; the result is discarded.'''
		self.callback('end_', name)

	def sub(self, name):
		'''
		Return a one-argument function for use with re.sub.

		The returned function passes the match object to sub_<name> and
		returns its result. When there is no such method, or it returns
		None, the matched text is returned unchanged.
		'''
		def substitution(match):
			result = self.callback('sub_', name, match)
			if result is None:
				# Keep the original text. Returning None here would make
				# re.sub drop the match from the output.
				return match.group(0)
			return result
		return substitution

class HTMLRenderer(Handler):
	'''
	A specific handler used for rendering HTML.

	The start_*, end_* and sub_* methods below are looked up by name
	through Handler.start(), Handler.end() and Handler.sub(), so each
	method name must match the type string of the rule, or the filter
	name, that triggers it. All output is written with print.
	'''
	# print is always called with exactly one parenthesised argument, so
	# the same source produces the same output on Python 2 and Python 3.
	def start_document(self):
		print('<html><head><title>...</title></head><body>')

	def end_document(self):
		print('</body></html>')

	def start_paragraph(self):
		print('<p>')

	def end_paragraph(self):
		print('</p>')

	def start_heading(self):
		print('<h2>')

	def end_heading(self):
		print('</h2>')

	def start_list(self):
		print('<ul>')

	def end_list(self):
		print('</ul>')

	def start_listitem(self):
		print('<li>')

	def end_listitem(self):
		print('</li>')

	def start_title(self):
		print('<h1>')

	def end_title(self):
		print('</h1>')

	# Earlier misspelled names, kept as aliases so that any code calling
	# them directly keeps working. Name-based dispatch uses the corrected
	# names above.
	end_paragragh = end_paragraph
	start_tile = start_title
	end_tile = end_title

	def sub_emphasis(self, match):
		'''Wrap the first captured group in an <em> element.'''
		return '<em>%s</em>' % match.group(1)

	def sub_url(self, match):
		'''Turn the first captured group into a link to itself.'''
		return '<a href="%s">%s</a>' % (match.group(1), match.group(1))

	def sub_mail(self, match):
		'''Turn the first captured group into a mailto: link.'''
		return '<a href="mailto:%s">%s</a>' % (match.group(1), match.group(1))

	def feed(self, data):
		'''Print data as-is; no HTML escaping is applied.'''
		print(data)

rules.py 用來識別文本塊中的標題、段落、列表等格式。

class Rule:
	'''
	Common base for all rules.

	Subclasses provide a type attribute (the name passed to the
	handler) and a condition(block) method.
	'''
	def action(self, block, handler):
		'''
		Emit block wrapped in start/end events for this rule's type.

		Always returns True, which tells the caller that no further
		rules should be applied to this block.
		'''
		handler.start(self.type)   # opening markup
		handler.feed(block)        # the block text itself
		handler.end(self.type)     # closing markup
		return True

class HeadingRule(Rule):
	'''
	A heading is a single line that is at most 70 characters and
	that doesn't end with a colon.
	'''
	type = 'heading'

	def condition(self, block):
		'''
		Return True if block qualifies as a heading.

		An empty block is never a heading. It is rejected up front
		instead of raising IndexError on the last-character check.
		'''
		if not block:
			return False
		return '\n' not in block and len(block) <= 70 and not block.endswith(':')

class TitleRule(HeadingRule):
	'''
	The title is the first block in the document, provided that it is
	a heading.
	'''
	type = 'title'
	# True until condition() has been called once on this instance.
	first = True

	def condition(self, block):
		'''
		Apply the heading test to the first block only.

		The first call clears the first flag whether or not the block
		turns out to be a heading; every later call returns False.
		'''
		if self.first:
			self.first = False
			return HeadingRule.condition(self, block)
		return False

class ListItemRule(Rule):
	'''
	A list item is a paragraph that begins with a hyphen. As part of
	the formatting, the hyphen is removed.
	'''
	type = 'listitem'

	def condition(self, block):
		'''
		Return True if block starts with a hyphen.

		startswith is used instead of indexing so that an empty block
		gives False rather than raising IndexError.
		'''
		return block.startswith('-')

	def action(self, block, handler):
		'''
		Emit block as a list item, without its leading hyphen.

		The first character is dropped and the remainder is stripped of
		surrounding whitespace before being fed to the handler. Always
		returns True, so no further rules are applied to this block.
		'''
		handler.start(self.type)
		handler.feed(block[1:].strip())
		handler.end(self.type)
		return True

class ListRule(ListItemRule):
	'''
	A list begins between a block that is not a list item and a
	subsequent list item. It ends after the last consecutive list
	item.

	This rule only emits the list start/end events; it never feeds the
	block itself. The list end is emitted when a non-item block arrives
	while inside a list, so a list that is still open when the input
	runs out is not closed by this rule.
	'''
	type = 'list'
	# Whether the previous block seen by this instance was inside a list.
	inside = False

	def condition(self, block):
		'''Always True: every block must be inspected to track list state.'''
		return True

	def action(self, block, handler):
		'''
		Open or close the list as the block stream enters or leaves it.

		Always returns False so the remaining rules still process the
		block.
		'''
		is_item = ListItemRule.condition(self, block)
		if is_item and not self.inside:
			# First item after a non-item block: open the list.
			handler.start(self.type)
			self.inside = True
		elif self.inside and not is_item:
			# First non-item block after a run of items: close the list.
			handler.end(self.type)
			self.inside = False
		return False

class ParagraphRule(Rule):
	'''
	Catch-all rule: any block is accepted as a paragraph.

	Its condition always holds, so it only takes effect for blocks that
	no earlier terminating rule has already handled.
	'''
	type = 'paragraph'

	def condition(self, block):
		'''Accept every block.'''
		return True

markup 主程序:一個簡單的文本分析器。

import sys, re
from handlers import *
from util import *
from rules import *

class Parser:
	'''
	Drive a handler over the blocks of a text file.

	Filters rewrite the text of each block; rules then decide which
	handler events the block produces.
	'''
	def __init__(self, handler):
		self.handler = handler
		self.rules = []
		self.filters = []

	def addRule(self, rule):
		'''Append rule to the list of rules, which are tried in order.'''
		self.rules.append(rule)

	def addFilter(self, pattern, name):
		'''
		Register a regular-expression filter.

		Every match of pattern in a block is replaced using the
		substitution function that handler.sub(name) returns.
		'''
		def apply_pattern(block, handler):
			return re.sub(pattern, handler.sub(name), block)
		self.filters.append(apply_pattern)

	def parse(self, file):
		'''
		Process file block by block between document start/end events.

		For each block, all filters run first, in registration order.
		Then each rule whose condition holds has its action called, and
		the first action that returns a true value stops rule processing
		for that block.
		'''
		self.handler.start('document')
		for block in blocks(file):
			for transform in self.filters:
				block = transform(block, self.handler)
			for rule in self.rules:
				if rule.condition(block) and rule.action(block, self.handler):
					break
		self.handler.end('document')

class BasicTextParser(Parser):
	'''
	A Parser preloaded with the standard rules and filters.
	'''
	def __init__(self, handler):
		Parser.__init__(self, handler)

		# Rules are tried in this order. ListRule comes first because it
		# only tracks list state and never terminates rule processing;
		# ParagraphRule comes last because it accepts every block.
		for rule_class in (ListRule, ListItemRule, TitleRule,
				HeadingRule, ParagraphRule):
			self.addRule(rule_class())

		# (pattern, handler sub-method name) pairs, applied in order.
		for pattern, name in (
				(r'\*(.+?)\*', 'emphasis'),
				(r'(http://[\.a-zA-Z/]+)', 'url'),
				(r'([\.a-zA-Z]+@[\.a-zA-Z]+[a-zA-Z]+)', 'mail'),
		):
			self.addFilter(pattern, name)

if __name__ == '__main__':
	# Read plain text from standard input and print the HTML rendering.
	parser = BasicTextParser(HTMLRenderer())
	parser.parse(sys.stdin)
相關文章
相關標籤/搜索