【11】winter重學前端筆記 - 瀏覽器：一個瀏覽器是如何工作的？（階段二）DOM樹構建

時間 2019-11-11

標籤 winter 前端筆記瀏覽器一個如何階段 dom 構建欄目瀏覽器简体版

原文原文鏈接

請支持正版
https://time.geekbang.org/col...

總體流程

字符流 -> 狀態機 -> 詞（token）-> 棧 -> DOM
構建 DOM 的過程是：從父到子，從先到後，一個一個節點構造，並且掛載到 DOM 樹上

如何解析請求回來的 HTML 代碼

把 response 拿到的字符流通過狀態機解析成一個個的詞

詞（token）是如何被拆分的
- 拆分：最小有意義單元 - token(詞)
- 詞的種類大約只有標籤開始、屬性、標籤結束、註釋、CDATA 節點...
- eg:
```
<p class="a">text text text</p>
```
  - `<p`：「標籤開始」的開始；
  - `class="a"`：屬性；
  - `>`：「標籤開始」的結束；
  - `text text text`：文本；
  - `</p>`：標籤結束。
狀態機
- 爲何使用：我們每讀入一個字符，其實都要做一次決策，並且這些決策是跟「當前狀態」有關的
- 定義：把每一個詞的「特徵字符」逐個拆開成獨立狀態，然後再把所有詞的特徵字符鏈合併起來，形成一個連通圖結構。
- 絕大多數語言的詞法部分都是用狀態機實現的，HTML 官方文檔規定了 80 個狀態
- eg: 見下文「詞法分析」部分的代碼

DOM 樹又是如何構建的

用棧來實現：當接收完所有輸入，棧頂就是最後的根節點，我們 DOM 樹的產出，就是這個 stack 的第一項
Node 類：所有的節點都會是這個 Node 類的實例。不同的 HTML 節點對應了不同的 Node 子類，此處的實現我們進行簡化，只把 Node 分爲 Element 和 Text
```
/**
 * Simplified element node: a container that owns an ordered list of children.
 * Intended to be called with `new`.
 */
function Element() {
    // Every instance gets its own, initially empty, child list.
    const children = [];
    this.childNodes = children;
}
/**
 * Simplified text node holding its character data.
 * Intended to be called with `new`.
 *
 * @param {string} [value] - Initial text; any falsy value is stored as "".
 */
function Text(value) {
    this.value = value ? value : "";
}
```
規則：
- token 中 tag start 和 tag end 需要成對出現，使用棧正是用於匹配開始和結束標籤的方案（編譯原理技巧）
- Text 節點：把相鄰的 Text 節點合併起來，當詞（token）入棧時，檢查棧頂是不是 Text 節點，如果是的話就合併 Text 節點
構建過程：
（默認：源代碼完全遵循 XHTML。HTML 具備很強的容錯能力，奧妙在於當 tag end 跟棧頂的 tag start 不匹配的時候如何處理，此處暫時不考慮）
- 棧頂元素就是當前節點；
- 遇到屬性，就添加到當前節點；
- 遇到文本節點，如果當前節點是文本節點，則跟文本節點合併，否則入棧成爲當前節點的子節點；
- 遇到註釋節點，作爲當前節點的子節點；
- 遇到 tag start 就入棧一個節點，當前節點就是這個節點的父節點
- 遇到 tag end 就出棧一個節點（還能夠檢查是否匹配）

完整的語法和詞法分析代碼

詞法分析
每一個狀態是一個函數，通過「if else」來區分下一個字符做狀態遷移。這裏所謂的狀態遷移，就是當前狀態函數返回下一個狀態函數。

// Sentinel meant to stand for "end of input"; its value is `undefined`.
const EOF = undefined

/**
 * Character-at-a-time HTML tokenizer implemented as a state machine.
 *
 * Every state is a function that receives one character and returns the
 * function representing the next state. Plain text is emitted one character
 * at a time as a string; a character reference is emitted as one string
 * (for example "&amp;", not decoded); tags are emitted as StartTagToken /
 * EndTagToken instances whose `name` holds the lower-cased tag name and whose
 * attributes are stored as additional own properties.
 *
 * NOTE(review): attributes are written straight onto the token, so an
 * attribute literally called `name` overwrites the tag name. The syntax
 * parser copies token properties as-is, so the storage layout is kept.
 *
 * @param {{receiveInput: function(*): *}} syntaxer - Consumer that is handed
 *     every token through `syntaxer.receiveInput(token)`.
 */
function HTMLLexicalParser(syntaxer) {
    const WHITESPACE = /[\t \f\n]/
    const LETTER = /[a-zA-Z]/

    let state = data
    let token = null
    let attribute = null
    let characterReference = ''

    /**
     * Feeds one character to the current state.
     *
     * @param {string} char - The next character of the input.
     * @throws {Error} If an earlier character left the machine without a
     *     next state (see `error`).
     */
    this.receiveInput = function (char) {
        if (state == null) {
            throw new Error('there is an error')
        }
        // The current state consumes the character and names its successor.
        state = state(char)
    }

    /**
     * Returns the machine to the initial state and drops any half-built
     * token, attribute or character reference.
     */
    this.reset = function () {
        state = data
        token = null
        attribute = null
        characterReference = ''
    }

    // Initial state: plain text between tags.
    function data(c) {
        switch (c) {
            case '&':
                // Keep the ampersand so the emitted text matches the source.
                characterReference = c
                return characterReferenceInData

            case '<':
                return tagOpen

            default:
                emitToken(c)
                return data
        }
    }

    // Collects everything up to and including ';'. Only well-formed
    // references are supported.
    function characterReferenceInData(c) {
        characterReference += c
        if (c !== ';') {
            return characterReferenceInData
        }
        emitToken(characterReference)
        characterReference = ''
        return data
    }

    // Entered after '<': decides between a start tag and an end tag.
    function tagOpen(c) {
        if (c === '/') {
            return endTagOpen
        }
        if (LETTER.test(c)) {
            token = new StartTagToken()
            token.name = c.toLowerCase()
            return tagName
        }
        return error(c)
    }

    // Shared by start and end tags once the first letter has been read.
    function tagName(c) {
        if (c === '/') {
            return selfClosingTag
        }
        if (WHITESPACE.test(c)) {
            return beforeAttributeName
        }
        if (c === '>') {
            return emitCurrentTag()
        }
        // Any other character (digits in "h1", '-' in custom elements, ...)
        // belongs to the name.
        token.name += c.toLowerCase()
        return tagName
    }

    function beforeAttributeName(c) {
        if (WHITESPACE.test(c)) {
            return beforeAttributeName
        }
        if (c === '/') {
            return selfClosingTag
        }
        if (c === '>') {
            return emitCurrentTag()
        }
        if (/["'<]/.test(c)) {
            return error(c)
        }
        return startAttribute(c)
    }

    function attributeName(c) {
        if (c === '/') {
            commitAttribute()
            return selfClosingTag
        }
        if (c === '=') {
            return beforeAttributeValue
        }
        if (c === '>') {
            // Value-less attribute directly before the end of the tag.
            commitAttribute()
            return emitCurrentTag()
        }
        if (WHITESPACE.test(c)) {
            return afterAttributeName
        }
        attribute.name += c.toLowerCase()
        return attributeName
    }

    // Whitespace followed an attribute name: either '=' is still to come, or
    // the attribute has no value and something else starts here.
    function afterAttributeName(c) {
        if (WHITESPACE.test(c)) {
            return afterAttributeName
        }
        if (c === '=') {
            return beforeAttributeValue
        }
        commitAttribute()
        if (c === '/') {
            return selfClosingTag
        }
        if (c === '>') {
            return emitCurrentTag()
        }
        return startAttribute(c)
    }

    function beforeAttributeValue(c) {
        if (c === '"') {
            return attributeValueDoubleQuoted
        }
        if (c === "'") {
            return attributeValueSingleQuoted
        }
        if (WHITESPACE.test(c)) {
            return beforeAttributeValue
        }
        if (c === '>') {
            // "name=>" : the value is missing, keep the empty string.
            commitAttribute()
            return emitCurrentTag()
        }
        attribute.value += c
        return attributeValueUnquoted
    }

    function attributeValueDoubleQuoted(c) {
        if (c === '"') {
            commitAttribute()
            return beforeAttributeName
        }
        attribute.value += c
        return attributeValueDoubleQuoted
    }

    function attributeValueSingleQuoted(c) {
        if (c === "'") {
            commitAttribute()
            return beforeAttributeName
        }
        attribute.value += c
        return attributeValueSingleQuoted
    }

    function attributeValueUnquoted(c) {
        if (WHITESPACE.test(c)) {
            commitAttribute()
            return beforeAttributeName
        }
        if (c === '>') {
            commitAttribute()
            return emitCurrentTag()
        }
        attribute.value += c
        return attributeValueUnquoted
    }

    // Entered after '/' inside a tag; only '>' is accepted here.
    function selfClosingTag(c) {
        if (c === '>') {
            emitToken(token)
            // A self-closing tag is reported as a start tag immediately
            // followed by the matching end tag.
            const endToken = new EndTagToken()
            endToken.name = token.name
            emitToken(endToken)
            return data
        }
        return error(c)
    }

    // Entered after '</'.
    function endTagOpen(c) {
        if (LETTER.test(c)) {
            token = new EndTagToken()
            token.name = c.toLowerCase()
            return tagName
        }
        return error(c)
    }

    // Begins a new attribute whose name starts with `c`.
    function startAttribute(c) {
        attribute = new Attribute()
        attribute.name = c.toLowerCase()
        attribute.value = ''
        return attributeName
    }

    // Stores the attribute being read on the current tag token.
    function commitAttribute() {
        token[attribute.name] = attribute.value
    }

    // Emits the current tag token and goes back to reading text.
    function emitCurrentTag() {
        emitToken(token)
        return data
    }

    // Hands a finished token to the syntax parser.
    function emitToken(token) {
        syntaxer.receiveInput(token)
    }

    // Logs a warning. Returns undefined, so the machine is left without a
    // next state and the following receiveInput call throws.
    function error(c) {
        console.log(`warn: unexpected char '${c}'`)
    }
}

// Token for an opening tag. HTMLLexicalParser assigns `name` (lower-cased tag
// name) and one own property per attribute to each instance.
class StartTagToken {}

// Token for a closing tag. HTMLLexicalParser also creates one right after the
// start token of a self-closing tag, copying that tag's name.
class EndTagToken {}

// Scratch record (`name`, `value`) used by HTMLLexicalParser while it reads a
// single attribute. It is not part of the module's exports.
class Attribute {}

// Public API of the lexer module: the tokenizer plus the two token classes
// that consumers test with `instanceof`. Attribute is not exported.
module.exports = {
    HTMLLexicalParser,
    StartTagToken,
    EndTagToken
}

// 使用

const { HTMLLexicalParser } = require('./lexer')

// Sample markup for the demo: an unquoted attribute, nested elements,
// a quoted attribute and a self-closing tag.
const testHTML = `<html maaa=a >
    <head>
        <title>cool</title>
    </head>
    <body>
        <img src="a" />
    </body>
</html>`

// Stand-in for the syntax parser: it just prints every token it is given.
// String tokens are wrapped in "String(...)" with the first newline shown as
// "\n" and the first space shown as "<whitespace>"; other tokens are logged
// unchanged.
const dummySyntaxer = {
  receiveInput: (token) => {
    if (typeof token !== 'string') {
      console.log(token)
      return
    }
    const visible = token.replace(/\n/, '\\n').replace(/ /, '<whitespace>')
    console.log(`String(${visible})`)
  }
}

// Run the sample markup through the lexer one character at a time; every
// token it produces is printed by dummySyntaxer.
const lexer = new HTMLLexicalParser(dummySyntaxer)

Array.from(testHTML).forEach((ch) => {
  lexer.receiveInput(ch)
})

// For illustration only: the bare state-transition loop.
// `state` holds the current state function; calling it with one character
// yields the function for the next state. The loop stops as soon as
// getInput() returns a falsy value.
// NOTE(review): getInput() is not defined in this excerpt — presumably it
// returns the next input character; confirm before reusing this snippet.
var state = data;
var char
while(char = getInput())
    state = state(char);

語法分析

// Simplified sketch (pseudo-code) of the syntax parser's shape.
function HTMLSyntaticalParser(){
    // The stack is seeded with a document node.
    var stack = [new HTMLDocument];
    // Receives one token at a time from the lexer.
    this.receiveInput = function(token) {
        // ... token handling is elided in this sketch
    }
    // Returns the bottom entry of the stack.
    this.getOutput = function(){
        return stack[0];
    }
}


const { StartTagToken, EndTagToken } = require('./lexer')

/**
 * Root of the tree produced by the syntax parser.
 * Carries an `isDocument` marker and the list of top-level nodes.
 */
class HTMLDocument {
  // Marker that distinguishes the root from ordinary nodes.
  isDocument = true

  // Top-level nodes, in document order; a fresh array per instance.
  childNodes = []
}

// Only two kinds of node are modelled here: Element and Text.

/** Common base class so both node kinds can be recognised with `instanceof Node`. */
class Node {}

/** Element node built from a start-tag token. */
class Element extends Node {
  /**
   * @param {object} [token] - Start-tag token. Every enumerable property
   *     (the tag `name` and its attributes) is copied onto the element.
   */
  constructor (token) {
    super()
    for (const key in token) {
      this[key] = token[key]
    }
    // Assigned last so a token property called `childNodes` cannot replace
    // the child list.
    this.childNodes = []
  }

  // Must be a getter: Object.prototype.toString only honours
  // Symbol.toStringTag when the property value is a string, so defining it
  // as a method leaves instances reported as "[object Object]".
  get [Symbol.toStringTag] () {
    return `Element<${this.name}>`
  }
}

/** Text node holding a run of character data. */
class Text extends Node {
  /**
   * @param {string} [value] - Initial text; any falsy value is stored as ''.
   */
  constructor (value) {
    super()
    this.value = value || ''
  }
}

/**
 * Builds a DOM-like tree from the tokens produced by the lexer.
 *
 * A stack tracks the chain of currently open nodes: stack[0] is the document
 * and the top of the stack is the node new children are attached to.
 * Tag names of start and end tags are not compared; an end tag simply closes
 * whatever element is open.
 */
function HTMLSyntaticalParser () {
  const stack = [new HTMLDocument()]

  /**
   * Consumes one token and updates the tree.
   *
   * @param {string|StartTagToken|EndTagToken} token - A text fragment or a
   *     tag token.
   * @returns {number|object|undefined} The new stack length for a start tag,
   *     the closed node for an end tag, otherwise undefined. The lexer
   *     ignores this value.
   */
  this.receiveInput = function (token) {
    if (typeof token === 'string') {
      // Adjacent text fragments are merged into the Text node on top of the
      // stack; otherwise a new Text node is opened.
      if (getTop(stack) instanceof Text) {
        getTop(stack).value += token
      } else {
        const text = new Text(token)
        getTop(stack).childNodes.push(text)
        stack.push(text)
      }
      return undefined
    }

    // Any non-text token ends the current run of text.
    if (getTop(stack) instanceof Text) {
      stack.pop()
    }

    if (token instanceof StartTagToken) {
      const element = new Element(token)
      getTop(stack).childNodes.push(element)
      return stack.push(element)
    }

    if (token instanceof EndTagToken) {
      // Never pop the document itself: a stray end tag would otherwise empty
      // the stack, making getOutput() return undefined and the next token
      // fail on an undefined parent.
      if (stack.length > 1) {
        return stack.pop()
      }
    }
    return undefined
  }

  /** @returns {HTMLDocument} The document node at the bottom of the stack. */
  this.getOutput = () => stack[0]
}

/**
 * Peeks at the top of a stack without removing it.
 *
 * @param {Array} stack - Array used as a stack (top is the last element).
 * @returns {*} The last element, or undefined when the stack is empty.
 */
function getTop (stack) {
  const topIndex = stack.length - 1
  return stack[topIndex]
}

// Public API of the syntaxer module: only the parser is exported; the node
// classes and getTop stay private to this file.
module.exports = {
  HTMLSyntaticalParser
}

// 使用
const { HTMLSyntaticalParser } = require('./syntaxer')
const { HTMLLexicalParser } = require('./lexer')

// End-to-end demo: the lexer pushes its tokens straight into the syntax
// parser, and the resulting tree is printed as indented JSON.
const syntaxer = new HTMLSyntaticalParser()
const lexer = new HTMLLexicalParser(syntaxer)

const testHTML = `<html maaa=a >
    <head>
        <title>cool</title>
    </head>
    <body>
        <img src="a" />
    </body>
</html>`

// Feed the markup one character at a time.
Array.from(testHTML).forEach((ch) => {
  lexer.receiveInput(ch)
})

const tree = syntaxer.getOutput()
console.log(JSON.stringify(tree, null, 2))

擴展閱讀：從Chrome源碼看瀏覽器如何構建DOM樹
https://zhuanlan.zhihu.com/p/...