作者:嵇智
我們在上一篇 ParserCore 講到了,首先調用 normalize 這個 rule 函數,將 Linux("\n")與 Windows("\r\n")的換行符統一處理成 "\n"。接着就走到 ParserBlock.parse 的流程。這一步主要是產出 block 爲 true 的 token。如下所示:
module.exports = function block(state) {
var token;
if (state.inlineMode) {
token = new state.Token('inline', '', 0);
token.content = state.src;
token.map = [ 0, 1 ];
token.children = [];
state.tokens.push(token);
} else {
state.md.block.parse(state.src, state.md, state.env, state.tokens);
}
};
複製代碼
parse 函數傳入了四個參數:
state.src:待解析的源字符串。
state.md:當前的 md 實例。
state.env:額外注入的數據,默認是 {},一般不會需要它,除非你要做一些定製化的開發。
state.tokens:用來存放輸出 token 的數組。
我們再來聚焦 ParserBlock 內部的邏輯。位於 lib/parser_block.js。
// Built-in block rules, listed in the order the ParserBlock constructor
// registers them. Each entry is
//   [ name, rule function, optional list of alternative chain names ].
// Entries without a third element get an empty alt list in the constructor.
var _rules = [
[ 'table', require('./rules_block/table'), [ 'paragraph', 'reference' ] ],
[ 'code', require('./rules_block/code') ],
[ 'fence', require('./rules_block/fence'), [ 'paragraph', 'reference', 'blockquote', 'list' ] ],
[ 'blockquote', require('./rules_block/blockquote'), [ 'paragraph', 'reference', 'blockquote', 'list' ] ],
[ 'hr', require('./rules_block/hr'), [ 'paragraph', 'reference', 'blockquote', 'list' ] ],
[ 'list', require('./rules_block/list'), [ 'paragraph', 'reference', 'blockquote' ] ],
[ 'reference', require('./rules_block/reference') ],
[ 'heading', require('./rules_block/heading'), [ 'paragraph', 'reference', 'blockquote' ] ],
[ 'lheading', require('./rules_block/lheading') ],
[ 'html_block', require('./rules_block/html_block'), [ 'paragraph', 'reference', 'blockquote' ] ],
[ 'paragraph', require('./rules_block/paragraph') ]
];
/**
 * Block-level parser. Creates a Ruler and registers every entry of `_rules`
 * on it, in array order.
 */
function ParserBlock() {
this.ruler = new Ruler();
for (var i = 0; i < _rules.length; i++) {
// Pass a copy of the alt list (an empty array when the entry has none) so the
// ruler never shares the array stored in `_rules`.
this.ruler.push(_rules[i][0], _rules[i][1], { alt: (_rules[i][2] || []).slice() });
}
}
/**
 * Runs the block rules over the lines [startLine, endLine).
 *
 * @param {Object} state - block state; `line` and `tight` are updated here
 * @param {number} startLine - first line to process
 * @param {number} endLine - line index at which processing stops (exclusive)
 */
ParserBlock.prototype.tokenize = function (state, startLine, endLine) {
// `rules` holds the rule functions registered on the '' chain.
var ok, i,
rules = this.ruler.getRules(''),
len = rules.length,
line = startLine,
hasEmptyLines = false,
maxNesting = state.md.options.maxNesting;
while (line < endLine) {
// Move to the line returned by state.skipEmptyLines and record it as current.
state.line = line = state.skipEmptyLines(line);
if (line >= endLine) { break; }
// Stop once a line's sCount falls below the current block indent.
if (state.sCount[line] < state.blkIndent) { break; }
// Nesting limit reached: mark the remaining lines as consumed and stop.
if (state.level >= maxNesting) {
state.line = endLine;
break;
}
// Try the rules in order with silent = false; the first truthy result wins.
for (i = 0; i < len; i++) {
ok = rules[i](state, line, endLine, false);
if (ok) { break; }
}
// tight is true only while no empty line has been recorded in this call.
state.tight = !hasEmptyLines;
// The line just before the new position is empty: remember it for the next pass.
if (state.isEmpty(state.line - 1)) {
hasEmptyLines = true;
}
// Continue from wherever state.line now points.
line = state.line;
// Step over one empty line that directly follows.
if (line < endLine && state.isEmpty(line)) {
hasEmptyLines = true;
line++;
state.line = line;
}
}
};
/**
 * Entry point of the block parser: builds a state for `src` and tokenizes
 * all of its lines. Does nothing when `src` is falsy (e.g. an empty string).
 *
 * @param {string} src - source text
 * @param {Object} md - parser instance, handed to the state
 * @param {Object} env - environment object, handed to the state
 * @param {Array} outTokens - array handed to the state as its token list
 */
ParserBlock.prototype.parse = function (src, md, env, outTokens) {
var state;
if (!src) { return; }
state = new this.State(src, md, env, outTokens);
this.tokenize(state, state.line, state.lineMax);
};
// State constructor instantiated by `parse`.
ParserBlock.prototype.State = require('./rules_block/state_block');
複製代碼
從構造函數可以看出,ParserBlock 有 11 種 rule,分別爲 table、code、fence、blockquote、hr、list、reference、heading、lheading、html_block、paragraph。經過由這些 rule 組成的 rules chain 之後,就能輸出 type 爲對應類型的 tokens,這也就是 ParserBlock 的作用所在。ruler 用來管理所有的 rule 以及 rule 所屬的 chain。
// Excerpt from the ParserBlock constructor: register each [ name, fn, alt ]
// entry on the ruler, passing a copy of the alt list (empty when absent).
for (var i = 0; i < _rules.length; i++) {
this.ruler.push(_rules[i][0], _rules[i][1], { alt: (_rules[i][2] || []).slice() });
}
複製代碼
_rules 是一個二維數組,它的元素也是一個數組,先暫稱之爲 ruleConfig。ruleConfig 的第一個元素是 rule 的 name,第二個是 rule 的 fn,第三個是 rule 的 alt,也就是所屬的職責鏈。假如 alt 爲 ['paragraph', 'reference'],那麼調用 ruler.getRules('paragraph') 返回的數組裏會包含 fn,調用 ruler.getRules('reference') 返回的數組裏同樣會包含 fn,因爲 fn 的 alt 數組包含了這兩種職責鏈。
再來看 parse 方法。
/**
 * Entry point of the block parser: builds a state for `src` and tokenizes
 * all of its lines. Does nothing when `src` is falsy (e.g. an empty string).
 *
 * @param {string} src - source text
 * @param {Object} md - parser instance, handed to the state
 * @param {Object} env - environment object, handed to the state
 * @param {Array} outTokens - array handed to the state as its token list
 */
ParserBlock.prototype.parse = function (src, md, env, outTokens) {
var state;
if (!src) { return; }
state = new this.State(src, md, env, outTokens);
this.tokenize(state, state.line, state.lineMax);
};
// State constructor instantiated by `parse`.
ParserBlock.prototype.State = require('./rules_block/state_block');
複製代碼
先了解 ParserBlock 的 State,記得之前 ParserCore 的 State 麼?也就是在每一個 Parser 的過程中都有一個 State 實例,用來管理它們在 parse 時的一些狀態。ParserBlock 的 State 位於 lib/rules_block/state_block.js。
/**
 * Parsing state shared by the block rules.
 *
 * The constructor scans `src` once and records, for every line, where it
 * starts, where it ends and how far it is indented. One extra sentinel entry
 * (positioned at `src.length`) is appended after the last scanned line.
 *
 * Relies on an `isSpace(charCode)` helper being in scope.
 *
 * @param {string} src - source text to scan
 * @param {Object} md - parser instance (stored as-is)
 * @param {Object} env - environment object (stored as-is)
 * @param {Array} tokens - token list (stored as-is, not copied)
 */
function StateBlock(src, md, env, tokens) {
  var ch, s, start, pos, len, indent, offset, indent_found;

  this.src = src;
  this.md = md;
  this.env = env;
  this.tokens = tokens;

  // Per-line tables, all indexed by line number.
  this.bMarks = [];   // offset of the first character of the line
  this.eMarks = [];   // offset of the line's "\n" (src length for an unterminated last line)
  this.tShift = [];   // number of leading whitespace characters
  this.sCount = [];   // indent width in columns; a tab advances to the next multiple of 4
  this.bsCount = [];  // initialised to 0 for every line here

  // Scalar state, set to its initial values.
  this.blkIndent = 0;
  this.line = 0;
  this.lineMax = 0;   // overwritten below with the number of scanned lines
  this.tight = false;
  this.ddIndent = -1;
  this.parentType = 'root';
  this.level = 0;
  this.result = '';

  s = this.src;
  indent_found = false;

  for (start = pos = indent = offset = 0, len = s.length; pos < len; pos++) {
    ch = s.charCodeAt(pos);

    // Still inside the leading whitespace of the current line.
    if (!indent_found) {
      if (isSpace(ch)) {
        indent++;

        if (ch === 0x09) {
          offset += 4 - offset % 4;
        } else {
          offset++;
        }
        continue;
      } else {
        indent_found = true;
      }
    }

    // End of line: either a "\n" or the very last character of the source.
    if (ch === 0x0A || pos === len - 1) {
      // No trailing newline: move past the last character so eMarks is exclusive.
      if (ch !== 0x0A) { pos++; }
      this.bMarks.push(start);
      this.eMarks.push(pos);
      this.tShift.push(indent);
      this.sCount.push(offset);
      this.bsCount.push(0);

      indent_found = false;
      indent = 0;
      offset = 0;
      start = pos + 1;
    }
  }

  // Sentinel entry after the last line.
  this.bMarks.push(s.length);
  this.eMarks.push(s.length);
  this.tShift.push(0);
  this.sCount.push(0);
  this.bsCount.push(0);

  // The sentinel is not counted as a line.
  this.lineMax = this.bMarks.length - 1;
}
複製代碼
理解 State 上的屬性的作用,是很關鍵的。因爲這些屬性都是接下來 tokenize 所依賴的信息。重點關注如下的屬性:
tokens
tokenize 以後的 token 組成的數組
bMarks
存儲每一行的起始位置,由於 parse 的過程是根據換行符逐行掃描
eMarks
存儲每一行的終止位置
tShift
存儲每一行行首空白字符的個數,也就是第一個非空白字符相對行首的偏移(製表符只算作 1)
sCount
存儲每一行的縮進寬度(製表符會補齊到下一個 4 的倍數列,而不是固定算作 4)
bsCount
通常爲 0
blkIndent
通常爲 0
line
當前所在行數。tokenize 的時候逐行掃描會用到
lineMax
src 被分割成了多少行
以上都是在 tokenize 過程中非常有用的屬性。接下來看一下 tokenize 的過程,之後就生成了 block 爲 true 的 token。
/**
 * Runs the block rules over the lines [startLine, endLine).
 *
 * @param {Object} state - block state; `line` and `tight` are updated here
 * @param {number} startLine - first line to process
 * @param {number} endLine - line index at which processing stops (exclusive)
 */
ParserBlock.prototype.tokenize = function (state, startLine, endLine) {
// `rules` holds the rule functions registered on the '' chain.
var ok, i,
rules = this.ruler.getRules(''),
len = rules.length,
line = startLine,
hasEmptyLines = false,
maxNesting = state.md.options.maxNesting;
while (line < endLine) {
// Move to the line returned by state.skipEmptyLines and record it as current.
state.line = line = state.skipEmptyLines(line);
if (line >= endLine) { break; }
// Stop once a line's sCount falls below the current block indent.
if (state.sCount[line] < state.blkIndent) { break; }
// Nesting limit reached: mark the remaining lines as consumed and stop.
if (state.level >= maxNesting) {
state.line = endLine;
break;
}
// Try the rules in order with silent = false; the first truthy result wins.
for (i = 0; i < len; i++) {
ok = rules[i](state, line, endLine, false);
if (ok) { break; }
}
// tight is true only while no empty line has been recorded in this call.
state.tight = !hasEmptyLines;
// The line just before the new position is empty: remember it for the next pass.
if (state.isEmpty(state.line - 1)) {
hasEmptyLines = true;
}
// Continue from wherever state.line now points.
line = state.line;
// Step over one empty line that directly follows.
if (line < endLine && state.isEmpty(line)) {
hasEmptyLines = true;
line++;
state.line = line;
}
}
}
複製代碼
函數的執行流程如下:
獲取 ParserBlock 構造函數聲明的所有 rule 函數,因爲在 Ruler 類裏面規定,內部的 rule 函數一定屬於名字爲空字符串的 rule chain。當然構造函數還有很多其他的 rule chain,比如 paragraph、reference、blockquote、list,暫時還未用到。同時,聲明了很多初始變量。
然後走到一個 while 循環,因爲 state_block 存放的信息都是以 src 字符串每一行作爲維度區分的,比如每一行的起始位置、每一行的終止位置、每一行第一個非空白字符的位置。這些信息都是特定 rule 所需要的。while 語句的前面部分就是跳過空行、是否達到最大嵌套等級的判斷,重點關注下面這段代碼。
// Excerpt from tokenize: call each rule with (state, line, endLine, silent = false)
// and stop at the first one that returns a truthy value.
for (i = 0; i < len; i++) {
ok = rules[i](state, line, endLine, false);
if (ok) { break; }
}
複製代碼
這裏的循環,也就是會從當前行開始依次執行 rule chain,進而產出 token,如果其中一個 rule 返回 true,就跳出循環,然後從該 rule 停下的那一行(state.line)繼續 tokenize。那我們來看下這些 rules 的作用。它們都位於 lib/rules_block 文件夾下面。
/**
 * Block rule `table`: recognises a header row followed by a delimiter row and
 * any number of body rows, and pushes the table_open ... table_close token
 * sequence.
 *
 * Relies on `getLine`, `escapedSplit` and `isSpace`, which are defined outside
 * this snippet.
 *
 * @param {Object} state - block state
 * @param {number} startLine - line of the candidate header row
 * @param {number} endLine - end of the range to inspect (exclusive)
 * @param {boolean} silent - when truthy, only report whether a table starts here
 * @returns {boolean} true when a table was recognised, false otherwise
 */
module.exports = function table(state, startLine, endLine, silent) {
var ch, lineText, pos, i, nextLine, columns, columnCount, token,
aligns, t, tableLines, tbodyLines;
// A table needs at least two lines: header and delimiter.
if (startLine + 2 > endLine) { return false; }
nextLine = startLine + 1;
// The delimiter line must not sit below the block indent, nor 4+ columns deeper.
if (state.sCount[nextLine] < state.blkIndent) { return false; }
if (state.sCount[nextLine] - state.blkIndent >= 4) { return false; }
// The delimiter line must be non-empty, start with '|', '-' or ':' and contain
// only those characters and whitespace.
pos = state.bMarks[nextLine] + state.tShift[nextLine];
if (pos >= state.eMarks[nextLine]) { return false; }
ch = state.src.charCodeAt(pos++);
if (ch !== 0x7C/* | */ && ch !== 0x2D/* - */ && ch !== 0x3A/* : */) { return false; }
while (pos < state.eMarks[nextLine]) {
ch = state.src.charCodeAt(pos);
if (ch !== 0x7C/* | */ && ch !== 0x2D/* - */ && ch !== 0x3A/* : */ && !isSpace(ch)) { return false; }
pos++;
}
// Split the delimiter line on '|' and derive one alignment per column.
lineText = getLine(state, startLine + 1);
columns = lineText.split('|');
aligns = [];
for (i = 0; i < columns.length; i++) {
t = columns[i].trim();
if (!t) {
// An empty cell is tolerated only at either edge (leading/trailing pipe).
if (i === 0 || i === columns.length - 1) {
continue;
} else {
return false;
}
}
// Each cell must be a run of dashes with an optional colon on either side.
if (!/^:?-+:?$/.test(t)) { return false; }
// Colon on both sides -> center, right only -> right, left only -> left, none -> ''.
if (t.charCodeAt(t.length - 1) === 0x3A/* : */) {
aligns.push(t.charCodeAt(0) === 0x3A/* : */ ? 'center' : 'right');
} else if (t.charCodeAt(0) === 0x3A/* : */) {
aligns.push('left');
} else {
aligns.push('');
}
}
// Header row: must contain a pipe and must not be indented 4+ columns.
lineText = getLine(state, startLine).trim();
if (lineText.indexOf('|') === -1) { return false; }
if (state.sCount[startLine] - state.blkIndent >= 4) { return false; }
// Drop one leading and one trailing pipe, then split into cells.
// NOTE(review): escapedSplit is not shown here; presumably it splits on
// unescaped pipes - confirm against its definition.
columns = escapedSplit(lineText.replace(/^\||\|$/g, ''));
columnCount = columns.length;
// The header may not have more cells than the delimiter row.
if (columnCount > aligns.length) { return false; }
// Validation mode: report the match without pushing tokens.
if (silent) { return true; }
token = state.push('table_open', 'table', 1);
// The end line (index 1) is filled in once the table's extent is known.
token.map = tableLines = [ startLine, 0 ];
token = state.push('thead_open', 'thead', 1);
token.map = [ startLine, startLine + 1 ];
token = state.push('tr_open', 'tr', 1);
token.map = [ startLine, startLine + 1 ];
// One th_open / inline / th_close triple per header cell.
for (i = 0; i < columns.length; i++) {
token = state.push('th_open', 'th', 1);
token.map = [ startLine, startLine + 1 ];
if (aligns[i]) {
token.attrs = [ [ 'style', 'text-align:' + aligns[i] ] ];
}
token = state.push('inline', '', 0);
token.content = columns[i].trim();
token.map = [ startLine, startLine + 1 ];
token.children = [];
token = state.push('th_close', 'th', -1);
}
token = state.push('tr_close', 'tr', -1);
token = state.push('thead_close', 'thead', -1);
token = state.push('tbody_open', 'tbody', 1);
token.map = tbodyLines = [ startLine + 2, 0 ];
// Body rows: stop at the first line that is under-indented, has no pipe,
// or is indented 4+ columns.
for (nextLine = startLine + 2; nextLine < endLine; nextLine++) {
if (state.sCount[nextLine] < state.blkIndent) { break; }
lineText = getLine(state, nextLine).trim();
if (lineText.indexOf('|') === -1) { break; }
if (state.sCount[nextLine] - state.blkIndent >= 4) { break; }
columns = escapedSplit(lineText.replace(/^\||\|$/g, ''));
token = state.push('tr_open', 'tr', 1);
// Always emit columnCount cells: extra cells are ignored, missing ones are ''.
for (i = 0; i < columnCount; i++) {
token = state.push('td_open', 'td', 1);
if (aligns[i]) {
token.attrs = [ [ 'style', 'text-align:' + aligns[i] ] ];
}
token = state.push('inline', '', 0);
token.content = columns[i] ? columns[i].trim() : '';
token.children = [];
token = state.push('td_close', 'td', -1);
}
token = state.push('tr_close', 'tr', -1);
}
token = state.push('tbody_close', 'tbody', -1);
token = state.push('table_close', 'table', -1);
// Fill in the end lines left open above and advance past the table.
tableLines[1] = tbodyLines[1] = nextLine;
state.line = nextLine;
return true;
}
複製代碼
table 這個 rule 就是用來生成 table HTML 字符串的。內部的解析都是根據 markdown 寫 table 的規範而來的,詳細邏輯這裏就不展開,如果感興趣,可以寫個 demo 自己打斷點試試。
module.exports = function code(state, startLine, endLine/*, silent*/) {
var nextLine, last, token;
if (state.sCount[startLine] - state.blkIndent < 4) { return false; }
last = nextLine = startLine + 1;
while (nextLine < endLine) {
if (state.isEmpty(nextLine)) {
nextLine++;
continue;
}
if (state.sCount[nextLine] - state.blkIndent >= 4) {
nextLine++;
last = nextLine;
continue;
}
break;
}
state.line = last;
token = state.push('code_block', 'code', 0);
token.content = state.getLines(startLine, last, 4 + state.blkIndent, true);
token.map = [ startLine, state.line ];
return true;
};
複製代碼
code rule 的作用也很簡單,它認爲只要你每行的起始位置多於 3 個空格,那就是一個 code_block。比如下面的。
    我現在就是一個 code_block
複製代碼
/**
 * Block rule `fence`: a fenced code block opened by a run of at least three
 * '~' or '`' characters.
 *
 * @param {Object} state - block state
 * @param {number} startLine - line of the candidate opening fence
 * @param {number} endLine - end of the range to inspect (exclusive)
 * @param {boolean} silent - when truthy, only report whether a fence opens here
 * @returns {boolean} true when a fence was recognised, false otherwise
 */
module.exports = function fence(state, startLine, endLine, silent) {
var marker, len, params, nextLine, mem, token, markup,
haveEndMarker = false,
pos = state.bMarks[startLine] + state.tShift[startLine],
max = state.eMarks[startLine];
// Indented 4+ columns past the block indent: not a fence.
if (state.sCount[startLine] - state.blkIndent >= 4) { return false; }
// The line must have room for at least three marker characters.
if (pos + 3 > max) { return false; }
marker = state.src.charCodeAt(pos);
if (marker !== 0x7E/* ~ */ && marker !== 0x60 /* ` */) {
return false;
}
// Measure the opening run; state.skipChars is expected to return the position
// just past the run of `marker` characters.
mem = pos;
pos = state.skipChars(pos, marker);
len = pos - mem;
if (len < 3) { return false; }
// markup is the opening run itself, params the rest of the opening line.
markup = state.src.slice(mem, pos);
params = state.src.slice(pos, max);
// The rest of the opening line must not contain the marker character.
if (params.indexOf(String.fromCharCode(marker)) >= 0) { return false; }
// Since start is found, we can report success here in validation mode
if (silent) { return true; }
// search end of block
nextLine = startLine;
for (;;) {
nextLine++;
// Ran out of lines without a closing fence: the block extends to endLine.
if (nextLine >= endLine) {
break;
}
pos = mem = state.bMarks[nextLine] + state.tShift[nextLine];
max = state.eMarks[nextLine];
// A non-empty line indented less than the block indent ends the fence
// without a closing marker.
if (pos < max && state.sCount[nextLine] < state.blkIndent) {
break;
}
// Lines not starting with the marker are content.
if (state.src.charCodeAt(pos) !== marker) { continue; }
// A marker indented 4+ columns is content, not a closing fence.
if (state.sCount[nextLine] - state.blkIndent >= 4) {
continue;
}
// The closing run must be at least as long as the opening run.
pos = state.skipChars(pos, marker);
if (pos - mem < len) { continue; }
// Anything left on the line after state.skipSpaces means this is content.
pos = state.skipSpaces(pos);
if (pos < max) { continue; }
haveEndMarker = true;
// found!
break;
}
// len is reused: it now holds the opening line's sCount, passed to getLines below.
len = state.sCount[startLine];
// Consume the closing fence line as well when one was found.
state.line = nextLine + (haveEndMarker ? 1 : 0);
token = state.push('fence', 'code', 0);
token.info = params;
token.content = state.getLines(startLine + 1, nextLine, len, true);
token.markup = markup;
token.map = [ startLine, state.line ];
return true;
};
複製代碼
fence rule 類似於 code rule。它代表具有語言類型的 code_block。比如 javascript、shell、css、stylus 等等。舉個栗子:
```shell
echo 'done'
```
複製代碼
上面就會解析生成一個 type 爲 fence,info 爲 shell,markup 爲 "```" 的 token。
代碼太長,就不貼代碼了,blockquote 的作用就是生成 markup 爲 > 的 token。下面就是一個 blockquote。
> i am a blockquote
module.exports = function hr(state, startLine, endLine, silent) {
var marker, cnt, ch, token,
pos = state.bMarks[startLine] + state.tShift[startLine],
max = state.eMarks[startLine];
// if it's indented more than 3 spaces, it should be a code block
if (state.sCount[startLine] - state.blkIndent >= 4) { return false; }
marker = state.src.charCodeAt(pos++);
// Check hr marker
if (marker !== 0x2A/* * */ &&
marker !== 0x2D/* - */ &&
marker !== 0x5F/* _ */) {
return false;
}
// markers can be mixed with spaces, but there should be at least 3 of them
cnt = 1;
while (pos < max) {
ch = state.src.charCodeAt(pos++);
if (ch !== marker && !isSpace(ch)) { return false; }
if (ch === marker) { cnt++; }
}
if (cnt < 3) { return false; }
if (silent) { return true; }
state.line = startLine + 1;
token = state.push('hr', 'hr', 0);
token.map = [ startLine, state.line ];
token.markup = Array(cnt + 1).join(String.fromCharCode(marker));
return true;
};
複製代碼
hr rule 也很簡單,就是生成 type 爲 hr 的 token。它的 markup 是 ***、---、___,也就是在 md 文件寫這三種語法,都能解析出 <hr> 標籤。
list 的作用是解析有序列表以及無序列表。詳細的邏輯比較複雜,需要了解的可以自己通過 demo 斷點調試。
reference 的作用是解析鏈接的引用定義。它在 md 裏的語法類似於 [reference]: http://www.baidu.com 這種。
/**
 * Block rule `heading`: a heading introduced by one to six '#' characters,
 * with an optional closing run of '#'.
 *
 * Relies on an `isSpace(charCode)` helper defined outside this snippet.
 *
 * @param {Object} state - block state
 * @param {number} startLine - line to test
 * @param {number} endLine - unused by this rule
 * @param {boolean} silent - when truthy, only report whether the line matches
 * @returns {boolean} true when the line is a heading
 */
module.exports = function heading(state, startLine, endLine, silent) {
var ch, level, tmp, token,
pos = state.bMarks[startLine] + state.tShift[startLine],
max = state.eMarks[startLine];
// if it's indented more than 3 spaces, it should be a code block
if (state.sCount[startLine] - state.blkIndent >= 4) { return false; }
// The first non-space character must be '#', and the line must not be empty.
ch = state.src.charCodeAt(pos);
if (ch !== 0x23/* # */ || pos >= max) { return false; }
// count heading level
level = 1;
ch = state.src.charCodeAt(++pos);
// `level <= 6` lets level reach 7 so that the check below can reject it.
while (ch === 0x23/* # */ && pos < max && level <= 6) {
level++;
ch = state.src.charCodeAt(++pos);
}
// Reject more than six '#', or a '#' run followed by a non-space character.
if (level > 6 || (pos < max && !isSpace(ch))) { return false; }
if (silent) { return true; }
// Let's cut tails like ' ### ' from the end of string
max = state.skipSpacesBack(max, pos);
tmp = state.skipCharsBack(max, 0x23, pos); // #
// The closing run is dropped only when a space precedes it.
if (tmp > pos && isSpace(state.src.charCodeAt(tmp - 1))) {
max = tmp;
}
state.line = startLine + 1;
token = state.push('heading_open', 'h' + String(level), 1);
// markup is `level` '#' characters.
token.markup = '########'.slice(0, level);
token.map = [ startLine, state.line ];
// The heading text becomes an inline token with an empty children list.
token = state.push('inline', '', 0);
token.content = state.src.slice(pos, max).trim();
token.map = [ startLine, state.line ];
token.children = [];
token = state.push('heading_close', 'h' + String(level), -1);
token.markup = '########'.slice(0, level);
return true;
};
複製代碼
heading 的作用是解析標題標籤(h1 - h6)。它的語法主要是 #、## 等等。
lheading 是解析自帶分隔符的標題,比如下面
這是一個標題
========
// 上面會渲染成
<h1>這是一個標題</h1>
複製代碼
html_block 是解析 HTML,如果你在 md 裏面寫 HTML 標籤,那麼最後還是會得到 HTML 字符串,比如你寫如下字符串:
let src = "<p>234</p>"
// 獲得以下token
let token = [
{
"type": "html_block",
"tag": "",
"attrs": null,
"map": [
0,
1
],
"nesting": 0,
"level": 0,
"children": null,
"content": "<p>234</p>",
"markup": "",
"info": "",
"meta": null,
"block": true,
"hidden": false
}
]
// 最後輸出的字符串也是 `<p>234</p>`
複製代碼
/**
 * Block rule `paragraph`: collects consecutive lines into one paragraph and
 * pushes paragraph_open / inline / paragraph_close tokens. Always returns true.
 *
 * The scan runs up to `state.lineMax`; the `endLine` argument is not used.
 *
 * @param {Object} state - block state
 * @param {number} startLine - first line of the paragraph
 * @returns {boolean} always true
 */
module.exports = function paragraph(state, startLine/*, endLine*/) {
// terminatorRules are the rules registered on the 'paragraph' chain.
var content, terminate, i, l, token, oldParentType,
nextLine = startLine + 1,
terminatorRules = state.md.block.ruler.getRules('paragraph'),
endLine = state.lineMax;
// parentType is switched to 'paragraph' for the scan and restored before returning.
oldParentType = state.parentType;
state.parentType = 'paragraph';
// jump line-by-line until empty one or EOF
for (; nextLine < endLine && !state.isEmpty(nextLine); nextLine++) {
// this would be a code block normally, but after paragraph
// it's considered a lazy continuation regardless of what's there
if (state.sCount[nextLine] - state.blkIndent > 3) { continue; }
// quirk for blockquotes, this line should already be checked by that rule
if (state.sCount[nextLine] < 0) { continue; }
// Some tags can terminate paragraph without empty line.
// Each terminator rule is called in silent mode (last argument true).
terminate = false;
for (i = 0, l = terminatorRules.length; i < l; i++) {
if (terminatorRules[i](state, nextLine, endLine, true)) {
terminate = true;
break;
}
}
if (terminate) { break; }
}
// The paragraph text is the lines [startLine, nextLine), trimmed.
content = state.getLines(startLine, nextLine, state.blkIndent, false).trim();
state.line = nextLine;
token = state.push('paragraph_open', 'p', 1);
token.map = [ startLine, state.line ];
// The text becomes an inline token with an empty children list.
token = state.push('inline', '', 0);
token.content = content;
token.map = [ startLine, state.line ];
token.children = [];
token = state.push('paragraph_close', 'p', -1);
state.parentType = oldParentType;
return true;
};
複製代碼
paragraph 很簡單,也是經常用到的,就是生成 p 標籤。
綜上,可以看出 ParserBlock 的流程還是非常複雜與繁瑣的。首先它擁有自己的 block_state,block_state 存儲了 ParserBlock 在 tokenize 過程中需要的很多信息,它的處理是以 src 換行符爲維度。接着在 tokenize 的過程中逐行對字符串運用不同的 rule 函數,生成對應類型的 token,這樣就完成了 ParserBlock 的 parse 過程。
在 ParserBlock 處理之後,可能會生成一種 type 爲 inline 的 token。這種 token 屬於未完全解析的 token。舉個栗子:
const src = '__ad__'
// 通過 parse 處理以後
const generatedTokens = [
{
"type": "paragraph_open",
"tag": "p",
......
},
{
"type": "inline",
"tag": "",
"attrs": null,
"map": [
0,
1
],
"nesting": 0,
"level": 1,
"children": [
{
"type": "text",
"tag": "",
......
},
{
"type": "strong_open",
"tag": "strong",
......
},
{
"type": "text",
"tag": "",
......
},
{
"type": "strong_close",
"tag": "strong",
......
},
{
"type": "text",
"tag": "",
......
}
],
"content": "__ad__",
"markup": "",
"info": "",
"meta": null,
"block": true,
"hidden": false
},
{
"type": "paragraph_close",
......
}
]
// 數組的第二個 token 的 type 爲 inline,注意它有個 children 屬性
// children 屬性上的 token是怎麼來的呢?
複製代碼
本來由 ParserBlock 處理之後,children 爲空,但這樣的話,第二個 token 的 content 屬性是 "__ad__",說明加粗的語法還未解析,因此 ParserBlock 的處理還不夠,我們還需要更細粒度的 token,這就是 ParserInline 的由來。它的作用就是編譯 type 爲 inline 的 token,並將更細粒度的 token 放在它的 children 屬性上,這也就是 generatedTokens 第二項的 children 屬性值的由來。