手寫編譯器--破產版詞法分析器

時間 2019-11-07

原文鏈接

忽然感覺到了編譯原理的重要性，並且也是算法的落地方向之一，好記性不如爛代碼，今天我就穿起萬里長征的第一隻鞋，先手寫個破產版的玩玩。
先來一個小目標，至少要認出我寫的東西吧......
詞法分析器通常好像都是基於有限狀態機的，此處的「好像」體現了說明文的語言準確性，反正無非就是讀字符串，看看當前的字符組合是否滿足某個狀態，滿足就保持，不滿足就切換，切換的時候把當前字符組合所形成的狀態保存爲一個Token。其實蠻簡單的，整一臺先：

// DfaState identifies a state of the lexer's finite automaton.
type DfaState int

const (
	// The zero value is skipped, so Initial is 1.
	_ DfaState = iota
	Initial             // initial state
	Identifier          // identifier
	IntLiteral          // integer literal, e.g. 123
	SemiColon           // semicolon ;
	LeftParen           // left parenthesis (
	RightParen          // right parenthesis )
	Assignment          // assignment sign =
	Plus                // plus sign +
	Minus               // minus sign -
	Star                // multiplication sign *
	Slash               // division sign /
	// Keyword states are listed separately so they are easier to test for.
	Int
	Id_Int1 // the 'i' of the int keyword
	Id_Int2 // the 'n' of the int keyword
	Id_Int3 // the 't' of the int keyword
)
複製代碼

下邊就是核心的東西了，目前只能識別諸如int age = 45 這種的，以後會逐漸地完善，至少要有個計算器功能嘛，代碼裏見，輕噴哦

// Lexer is the interface returned by New: it turns source text into tokens
// and prints them.
type Lexer interface {
	// Tokenize scans code and records the tokens it finds.
	Tokenize(code string)
	// Print outputs the recorded tokens.
	Print()
}

// tokenS is a single token: its type together with the source text it covers.
type tokenS struct {
	// tokenType is in effect the automaton state the token was built in; per
	// the original author's note it is defined as a string for now.
	tokenType token.TokenType
	// text is the exact source text of the token.
	text      string
}

// lexer is the Lexer implementation returned by New.
type lexer struct {
	// tmpToken is the token currently being accumulated.
	tmpToken tokenS
	// tokens holds the completed tokens in the order they were finished.
	tokens   []tokenS
}

// New returns a Lexer backed by a zero-valued lexer, i.e. with no pending
// token and no recorded tokens.
func New() Lexer {
	return new(lexer)
}
// Print writes every recorded token to standard output, one per line, as the
// token text followed by a space and the token type.
func (l *lexer) Print() {
	for i := range l.tokens {
		tok := l.tokens[i]
		// Println separates its operands with a single space.
		fmt.Println(tok.text, string(tok.tokenType))
	}
}
// Tokenize scans code one byte at a time and appends every token it recognises
// to l.tokens. Tokens recorded by earlier calls are kept.
//
// The scanner is a small state machine. Multi-byte tokens (identifiers,
// integer literals and the int keyword) stay in their state while the next
// byte still belongs to them; any other byte is handed to initToken, which
// stores the finished token and picks the next state. Bytes initToken does not
// recognise (for example whitespace) only terminate the current token.
//
// Input is processed as bytes, so only ASCII letters and digits are treated as
// identifier or number characters.
func (l *lexer) Tokenize(code string) {
	state := Initial
	for i := 0; i < len(code); i++ {
		ch := code[i]
		switch state {
		case Identifier:
			if isAlpha(ch) || isNumber(ch) {
				l.tmpToken.text += string(ch)
			} else {
				state = l.initToken(ch)
			}
		case IntLiteral:
			if isNumber(ch) {
				l.tmpToken.text += string(ch)
			} else {
				state = l.initToken(ch)
			}
		case Id_Int1:
			if ch == 'n' { // still a prefix of "int"
				l.tmpToken.text += string(ch)
				state = Id_Int2
			} else if isAlpha(ch) || isNumber(ch) { // not "int", but still an identifier
				l.tmpToken.text += string(ch)
				state = Identifier
			} else {
				state = l.initToken(ch)
			}
		case Id_Int2:
			if ch == 't' { // still a prefix of "int"
				l.tmpToken.text += string(ch)
				state = Id_Int3
			} else if isAlpha(ch) || isNumber(ch) { // not "int", but still an identifier
				l.tmpToken.text += string(ch)
				state = Identifier
			} else {
				state = l.initToken(ch)
			}
		case Id_Int3:
			if isAlpha(ch) || isNumber(ch) {
				// "int" followed by another identifier character (e.g. "inta",
				// "int1") is an ordinary identifier.
				l.tmpToken.text += string(ch)
				state = Identifier
			} else {
				// Any non-identifier byte, not only whitespace, ends the
				// keyword: "int=" and "int(" must not swallow the operator.
				l.tmpToken.tokenType = token.Int
				state = l.initToken(ch)
			}
		default:
			// Initial and every single-character token state (operators,
			// parentheses, semicolon): the next byte always starts afresh.
			// Handling them here keeps a state without its own case from
			// silently discarding the rest of the input.
			state = l.initToken(ch)
		}
	}

	// Store the last token, which no following byte has terminated.
	if len(l.tmpToken.text) > 0 {
		if state == Id_Int3 {
			// The input ended right after "int": it is the keyword.
			l.tmpToken.tokenType = token.Int
		}
		// Feed a blank rather than the last byte again: initToken stores the
		// pending token and, since a space starts no new token, leaves
		// tmpToken empty so nothing leaks into a later Tokenize call.
		l.initToken(' ')
	}
}

// initToken is the entry point for every state change: it stores the token in
// progress, if there is one, then starts a new token from ch and returns the
// state to switch to. When ch starts no token (no case below matches, e.g. a
// space), nothing new is started and Initial is returned.
//
// NOTE(review): ';' has no case here although a SemiColon state is declared,
// so semicolons are currently skipped like any other unrecognised byte.
func (l *lexer) initToken(ch byte) DfaState {
	// Store the pending token before beginning another.
	if l.tmpToken.text != "" {
		l.tokens = append(l.tokens, l.tmpToken)
		l.tmpToken = tokenS{}
	}

	var next DfaState
	switch {
	case isAlpha(ch):
		next = Identifier
		if ch == 'i' {
			// Possibly the start of the int keyword.
			next = Id_Int1
		}
		l.tmpToken.tokenType = token.Identifier
	case isNumber(ch):
		next = IntLiteral
		l.tmpToken.tokenType = token.IntLiteral
	case ch == '+':
		next = Plus
		l.tmpToken.tokenType = token.Plus
	case ch == '-':
		next = Minus
		l.tmpToken.tokenType = token.Minus
	case ch == '*':
		next = Star
		l.tmpToken.tokenType = token.Star
	case ch == '/':
		next = Slash
		l.tmpToken.tokenType = token.Slash
	case ch == '(':
		next = LeftParen
		l.tmpToken.tokenType = token.LeftParen
	case ch == ')':
		next = RightParen
		l.tmpToken.tokenType = token.RightParen
	case ch == '=':
		next = Assignment
		l.tmpToken.tokenType = token.Assignment
	default:
		return Initial
	}

	// Every recognised byte becomes the first character of the new token.
	l.tmpToken.text += string(ch)
	return next
}
// isAlpha reports whether ch is an ASCII letter (a-z or A-Z).
func isAlpha(ch byte) bool {
	switch {
	case 'a' <= ch && ch <= 'z':
		return true
	case 'A' <= ch && ch <= 'Z':
		return true
	}
	return false
}
// isNumber reports whether ch is an ASCII decimal digit (0-9).
func isNumber(ch byte) bool {
	// Byte subtraction wraps around, so anything below '0' becomes a large
	// value and fails the comparison as well.
	return ch-'0' < 10
}
// isBlank reports whether ch is a blank character: a space, a tab or a
// newline.
func isBlank(ch byte) bool {
	// The space test must be an equality; ">=" would classify every byte from
	// 0x20 upwards (letters, digits, operators) as blank.
	return ch == ' ' || ch == '\t' || ch == '\n'
}

複製代碼

測試一下哈哈

// main tokenizes a sample declaration and prints the resulting tokens.
func main() {
	const sample = "int age = 45"

	lx := lexer.New()
	lx.Tokenize(sample)
	lx.Print()
}
複製代碼

課程週期半年，爭取一步步完成一個解釋器，一個編譯器，增刪改查雖然解餓，可是真的不好吃啊。

算法夢想家，來跟我一塊兒玩算法，玩音樂，聊聊文學創作，我們一塊兒天馬行空！

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。