Please support the original (paid) course:
https://time.geekbang.org/col...
Character stream -> state machine -> tokens -> stack -> DOM
The DOM is built from parent to child and in document order: nodes are constructed one by one and mounted onto the DOM tree.
The character stream obtained from the response is fed through a state machine and parsed into individual tokens.
How tokens are split
For example, given the following code, the browser splits it into these tokens:
<p class="a">text text text</p>
  <p              the beginning of the start tag;
  class="a"       an attribute;
  >               the end of the start tag;
  text text text  text content;
  </p>            the end tag.
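In terms of the token classes used later in this article, that breakdown corresponds roughly to the following (a sketch for intuition only; the lexer implemented below actually emits the text portion one character at a time):

// StartTagToken { name: 'p', class: 'a' }   <- "<p class=\"a\">"
// 't', 'e', 'x', 't', ' ', ...              <- the text, emitted character by character
// EndTagToken { name: 'p' }                 <- "</p>"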
The state machine
Every node is an instance of a Node class, and different HTML node types correspond to different Node subclasses. In this implementation we simplify things and split Node into just Element and Text:
function Element() { this.childNodes = []; }
function Text(value) { this.value = value || ""; }
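As a rough illustration of the parent-to-child, one-node-at-a-time construction described above, here is a small hand-written sketch (our own example, not part of the original code) that assembles <p class="a">text text text</p> out of these two constructors:

// Build the parent first, then create the child and mount it onto the parent.
var p = new Element();
p.name = 'p';      // plain properties standing in for the tag name and attribute
p.class = 'a';

var text = new Text('text text text');
p.childNodes.push(text);   // mount the child node onto the (sub)tree

console.log(p.childNodes[0].value);   // "text text text"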
Rules (these are exactly what the syntactic parser below implements):
the element on top of the stack is the current node;
a start-tag token creates a new Element, appends it to the current node's childNodes, and pushes it onto the stack;
a text token is merged into the Text node on top of the stack, or wrapped in a new Text node that is appended and pushed;
an end-tag token pops the current node off the stack.
Construction process:
(Assumption: the source code strictly follows XHTML. Real HTML has very strong error tolerance, and the subtle part is what to do when an end tag does not match the start tag on top of the stack; we leave that aside for now.)
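To make the construction process concrete, here is a hand-written trace (our own illustration, matching the syntactic parser further below) of how the stack and the tree evolve for <p class="a">text text text</p>:

// stack: [Document]
// StartTagToken <p class="a">   -> create Element(p), append it to Document.childNodes, push it
// stack: [Document, Element(p)]
// text "text text text"         -> create a Text node, append it to Element(p).childNodes, push it
// stack: [Document, Element(p), Text]
// EndTagToken </p>              -> pop the Text node, then pop Element(p)
// stack: [Document]
// result: Document -> Element(p) -> Text("text text text")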
Complete lexical and syntactic analysis code
Lexical analysis
Each state is a function that uses if/else on the next character to decide the state transition. A state transition here simply means that the current state function returns the next state function.
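A toy sketch of this pattern (our own example, separate from the HTML lexer below): two state functions that spot the substring "ab" in a character stream, where each call returns the next state.

function start(c) {
  if (c === 'a') return sawA   // saw a possible beginning of "ab"
  return start
}

function sawA(c) {
  if (c === 'b') {
    console.log('found "ab"')
    return start
  }
  return c === 'a' ? sawA : start
}

// The driver never branches on state names; it just calls the current state function.
let state = start
for (const c of 'xaaby') {
  state = state(c)
}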
const EOF = void 0

// The lexical parser receives characters one at a time
function HTMLLexicalParser(syntaxer) {
  let state = data
  let token = null
  let attribute = null
  let characterReference = ''

  this.receiveInput = function (char) {
    if (state == null) {
      throw new Error('there is an error')
    } else {
      // handle the incoming character stream through the current state
      state = state(char)
    }
  }

  this.reset = function () {
    state = data
  }

  // The state machine; c is the current character
  function data(c) {
    switch (c) {
      case '&':
        return characterReferenceInData
      // tagOpen is the state entered after a '<' character; it decides the tag type
      case '<':
        return tagOpen
      // perhaps will not encounter in javascript?
      // case '\0':
      //   error()
      //   emitToken(c)
      //   return data
      // can be handled by the default case
      // case EOF:
      //   emitToken(EOF)
      //   return data
      default:
        emitToken(c)
        return data
    }
  }

  // only handles a well-formed character reference
  function characterReferenceInData(c) {
    if (c === ';') {
      characterReference += c
      emitToken(characterReference)
      characterReference = ''
      return data
    } else {
      characterReference += c
      return characterReferenceInData
    }
  }

  function tagOpen(c) {
    if (c === '/') {
      return endTagOpen
    }
    if (/[a-zA-Z]/.test(c)) {
      token = new StartTagToken()
      token.name = c.toLowerCase()
      return tagName
    }
    // no need to handle this
    // if (c === '?') {
    //   return bogusComment
    // }
    return error(c)
  }

  function tagName(c) {
    if (c === '/') {
      return selfClosingTag
    }
    if (/[\t \f\n]/.test(c)) {
      return beforeAttributeName
    }
    if (c === '>') {
      emitToken(token)
      return data
    }
    if (/[a-zA-Z]/.test(c)) {
      token.name += c.toLowerCase()
      return tagName
    }
  }

  function beforeAttributeName(c) {
    if (/[\t \f\n]/.test(c)) {
      return beforeAttributeName
    }
    if (c === '/') {
      return selfClosingTag
    }
    if (c === '>') {
      emitToken(token)
      return data
    }
    if (/["'<]/.test(c)) {
      return error(c)
    }
    attribute = new Attribute()
    attribute.name = c.toLowerCase()
    attribute.value = ''
    return attributeName
  }

  function attributeName(c) {
    if (c === '/') {
      token[attribute.name] = attribute.value
      return selfClosingTag
    }
    if (c === '=') {
      return beforeAttributeValue
    }
    if (/[\t \f\n]/.test(c)) {
      return beforeAttributeName
    }
    attribute.name += c.toLowerCase()
    return attributeName
  }

  function beforeAttributeValue(c) {
    if (c === '"') {
      return attributeValueDoubleQuoted
    }
    if (c === "'") {
      return attributeValueSingleQuoted
    }
    if (/[\t \f\n]/.test(c)) {
      return beforeAttributeValue
    }
    attribute.value += c
    return attributeValueUnquoted
  }

  function attributeValueDoubleQuoted(c) {
    if (c === '"') {
      token[attribute.name] = attribute.value
      return beforeAttributeName
    }
    attribute.value += c
    return attributeValueDoubleQuoted
  }

  function attributeValueSingleQuoted(c) {
    if (c === "'") {
      token[attribute.name] = attribute.value
      return beforeAttributeName
    }
    attribute.value += c
    return attributeValueSingleQuoted
  }

  function attributeValueUnquoted(c) {
    if (/[\t \f\n]/.test(c)) {
      token[attribute.name] = attribute.value
      return beforeAttributeName
    }
    attribute.value += c
    return attributeValueUnquoted
  }

  function selfClosingTag(c) {
    if (c === '>') {
      emitToken(token)
      const endToken = new EndTagToken()
      endToken.name = token.name
      emitToken(endToken)
      return data
    }
  }

  function endTagOpen(c) {
    if (/[a-zA-Z]/.test(c)) {
      token = new EndTagToken()
      token.name = c.toLowerCase()
      return tagName
    }
    if (c === '>') {
      return error(c)
    }
  }

  // Emit a parsed token to the syntaxer
  function emitToken(token) {
    syntaxer.receiveInput(token)
  }

  function error(c) {
    console.log(`warn: unexpected char '${c}'`)
  }
}

class StartTagToken {}
class EndTagToken {}
class Attribute {}

module.exports = {
  HTMLLexicalParser,
  StartTagToken,
  EndTagToken
}

// Usage
const { HTMLLexicalParser } = require('./lexer')

const testHTML = `<html maaa=a >
  <head>
    <title>cool</title>
  </head>
  <body>
    <img src="a" />
  </body>
</html>`

const dummySyntaxer = {
  receiveInput: (token) => {
    if (typeof token === 'string') {
      console.log(`String(${token.replace(/\n/, '\\n').replace(/ /, '<whitespace>')})`)
    } else {
      console.log(token)
    }
  }
}

const lexer = new HTMLLexicalParser(dummySyntaxer)
for (let c of testHTML) {
  lexer.receiveInput(c)
}

// For intuition, the state-transition loop boils down to this
// (getInput() stands for reading the next character):
var state = data
var char
while (char = getInput()) {
  state = state(char)
}
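As a quick sanity check (our own example, assuming the lexer above is saved as ./lexer as in the usage snippet), feeding just "<p>" through it should hand the syntaxer a single StartTagToken whose name is 'p':

const { HTMLLexicalParser, StartTagToken } = require('./lexer')

const collected = []
const recordingSyntaxer = { receiveInput: (token) => collected.push(token) }

const miniLexer = new HTMLLexicalParser(recordingSyntaxer)
for (const c of '<p>') {
  miniLexer.receiveInput(c)
}

console.log(collected.length)                      // 1
console.log(collected[0] instanceof StartTagToken) // true
console.log(collected[0].name)                     // 'p'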
Syntactic analysis
// A minimal skeleton first (pseudocode):
function HTMLSyntaticalParser() {
  var stack = [new HTMLDocument];
  this.receiveInput = function (token) {
    //……
  }
  this.getOutput = function () {
    return stack[0];
  }
}

The full implementation:

const { StartTagToken, EndTagToken } = require('./lexer')

class HTMLDocument {
  constructor () {
    this.isDocument = true
    this.childNodes = []
  }
}

// Node is simplified into just Element and Text
class Node {}

class Element extends Node {
  constructor (token) {
    super(token)
    for (const key in token) {
      this[key] = token[key]
    }
    this.childNodes = []
  }
  [Symbol.toStringTag] () {
    return `Element<${this.name}>`
  }
}

class Text extends Node {
  constructor (value) {
    super(value)
    this.value = value || ''
  }
}

function HTMLSyntaticalParser () {
  const stack = [new HTMLDocument]

  // receiveInput takes the tokens produced by the lexer and builds the DOM tree
  this.receiveInput = function (token) {
    // If the top of the stack is a Text node, merge consecutive text into it
    if (typeof token === 'string') {
      if (getTop(stack) instanceof Text) {
        getTop(stack).value += token
      } else {
        let t = new Text(token)
        getTop(stack).childNodes.push(t)
        stack.push(t)
      }
    } else if (getTop(stack) instanceof Text) {
      stack.pop()
    }

    // Match start and end tags
    if (token instanceof StartTagToken) {
      let e = new Element(token)
      getTop(stack).childNodes.push(e)
      return stack.push(e)
    }
    if (token instanceof EndTagToken) {
      return stack.pop()
    }
  }

  this.getOutput = () => stack[0]
}

function getTop (stack) {
  return stack[stack.length - 1]
}

module.exports = {
  HTMLSyntaticalParser
}

// Usage
const { HTMLSyntaticalParser } = require('./syntaxer')
const { HTMLLexicalParser } = require('./lexer')

const syntaxer = new HTMLSyntaticalParser()
const lexer = new HTMLLexicalParser(syntaxer)

const testHTML = `<html maaa=a >
  <head>
    <title>cool</title>
  </head>
  <body>
    <img src="a" />
  </body>
</html>`

for (let c of testHTML) {
  lexer.receiveInput(c)
}

console.log(JSON.stringify(syntaxer.getOutput(), null, 2))
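For intuition about the resulting tree shape, here is a smaller end-to-end run (our own example, reusing the ./lexer and ./syntaxer modules from above); for "<p>hi</p>" the printed structure should look roughly like this:

const { HTMLSyntaticalParser } = require('./syntaxer')
const { HTMLLexicalParser } = require('./lexer')

const miniSyntaxer = new HTMLSyntaticalParser()
const miniLexer = new HTMLLexicalParser(miniSyntaxer)

for (const c of '<p>hi</p>') {
  miniLexer.receiveInput(c)
}

console.log(JSON.stringify(miniSyntaxer.getOutput(), null, 2))
// {
//   "isDocument": true,
//   "childNodes": [
//     { "name": "p", "childNodes": [ { "value": "hi" } ] }
//   ]
// }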
Further reading: How the browser builds the DOM tree, as seen from Chrome's source code (從Chrome源碼看瀏覽器如何構建DOM樹)
https://zhuanlan.zhihu.com/p/...