一個小Demo, 用於分析的源文件比較簡單, 主要的部分都有, 擴展比較容易. 將正則表達式表示的模式構造爲狀態轉換圖. 在本文中只列舉狀態轉換圖.
雙緩衝區(代碼中的Buffer類): http://www.javashuo.com/article/p-adlgftxv-md.html
1、詞素模式 2、打印Token 3、StateTransition類 4、StateTransition的構造與析構函數 5、StateTransition普通函數的實現 6、運算符的狀態轉換 7、數字的狀態轉換 8、保留字和ID的狀態轉換 9、空格, 製表符, 換行符設置 10、調用
下文將使用 <~文件名> 標記代碼片段來自哪一個文件.
正則表達式
// <~Token.h>
namespace Lexical_Analysis {

    /// Top-level token category. Numbering starts at 256.
    enum Tag {
        RESERVE_WORD = 256,
        ID,
        METHOD,
        RELOP,
        NUM,
    };

    /// Which relational operator a RelopToken stands for.
    enum RelopTag {
        LT,  // <
        LE,  // <=
        EQ,  // =
        NE,  // <>
        GT,  // >
        GE,  // >=
    };

    /// Which reserved word a ReserveWordToken stands for.
    enum ReserveWordTag {
        INT,
        IF,
        ELSE,
        THEN
    };

    /// Polymorphic base of every token; carries only the category tag.
    class Token {
    public:
        Tag tag;

        explicit Token(Tag category) : tag{category} { }

        virtual ~Token() = default;
    };

    /// Relational operator token (tag RELOP) holding the operator kind.
    class RelopToken : public Token {
    public:
        RelopTag relop;

        explicit RelopToken(RelopTag op) : Token{RELOP}, relop{op} { }
    };

    /// Reserved word token (tag RESERVE_WORD) holding its kind and its lexeme.
    class ReserveWordToken : public Token {
    public:
        ReserveWordTag reserveWordTag;
        std::string lexeme;

        ReserveWordToken(ReserveWordTag word, std::string text)
            : Token{RESERVE_WORD}, reserveWordTag{word}, lexeme(text) { }
    };

    /// Identifier token (tag ID) holding the identifier text.
    class IdToken : public Token {
    public:
        std::string value;

        explicit IdToken(std::string text) : Token{ID}, value(text) { }
    };

    /// Integer literal token (tag NUM) holding an int value.
    class IntToken : public Token {
    public:
        int value;

        explicit IntToken(int number) : Token{NUM}, value{number} { }
    };

    /// Floating-point literal token (tag NUM) holding a double value.
    class DoubleToken : public Token {
    public:
        double value;

        explicit DoubleToken(double number) : Token{NUM}, value{number} { }
    };

}
void Lexical_Analysis::StateTransition::print_token(Lexical_Analysis::Token *token) { RelopToken* relopToken; ReserveWordToken* reserveWordToken; IdToken* idToken; IntToken* intToken; DoubleToken* doubleToken; if ((relopToken = dynamic_cast<RelopToken*>(token))) { std::cout << "<" << getRelopTagStr(relopToken->relop) << ">" << std::endl; } else if ((reserveWordToken = dynamic_cast<ReserveWordToken*>(token))) { std::cout << "<" << getTagStr(reserveWordToken->tag) << ", " << getReserveWordTagStr(reserveWordToken->reserveWordTag) << ">" << std::endl; } else if ((idToken = dynamic_cast<IdToken*>(token))) { std::cout << "<id, " << idToken->value << ">" << std::endl; } else if ((intToken = dynamic_cast<IntToken*>(token))) { std::cout << "<int, " << intToken->value << ">" << std::endl; } else if ((doubleToken = dynamic_cast<DoubleToken*>(token))) { std::cout << "<double, " << doubleToken->value << ">" << std::endl; } } std::string Lexical_Analysis::StateTransition::getTagStr(Lexical_Analysis::Tag val) { if (val == Tag::RESERVE_WORD) return "RESERVE_WORD"; else if (val == Tag::ID) return "ID"; else if (val == Tag::RELOP) return "RELOP"; else if (val == Tag::METHOD) return "METHOD"; else if (val == Tag::NUM) return "NUM"; return std::__cxx11::string(); } std::string Lexical_Analysis::StateTransition::getRelopTagStr(Lexical_Analysis::RelopTag val) { if (val == RelopTag::LT) return "<"; else if (val == RelopTag::LE) return "<="; else if (val == RelopTag::EQ) return "="; else if (val == RelopTag::NE) return "<>"; else if (val == RelopTag::GT) return ">"; else if (val == RelopTag::GE) return ">="; return std::__cxx11::string(); } std::string Lexical_Analysis::StateTransition::getReserveWordTagStr(Lexical_Analysis::ReserveWordTag val) { if (val == ReserveWordTag::INT) return "INT"; else if (val == ReserveWordTag::IF) return "IF"; else if (val == ReserveWordTag::ELSE) return "ELSE"; else if (val == ReserveWordTag::THEN) return "THEN"; return std::__cxx11::string(); }
<~StateTransition.h> namespace Lexical_Analysis { class StateTransition { public: explicit StateTransition(Buffer<>* _buffer); ~StateTransition(); private: Buffer<>* buffer; // 緩衝區 std::map<std::string, ReserveWordToken*> reserveWords; // 記號表, 見構造函數初始化 private: /** * 將token存入記號表 * @param token */ void reserve(ReserveWordToken* token); public: /** * 運算符的狀態轉換實現 */ Token* getRelop(); /** * 數字的狀態轉換實現 */ Token* getNumber(); /** * 字符序列的狀態轉換實現 */ Token* getCharacterSequence(); /** * 更新空格 製表符 換行 */ void getBlankTabNewline(); /** * 全局恢復策略, 將forward值重置爲lexemeBegin的值 * 使用另外一個狀態圖從還沒有處理的輸入部分的真實位置開始識別 */ void fail(); /** * 回退一個字符 */ void retract(); public: void print_token(Token* token); std::string getTagStr(Tag val); std::string getRelopTagStr(RelopTag val); std::string getReserveWordTagStr(ReserveWordTag val); }; }
// <~StateTransition.cpp>
/**
 * Stores the buffer pointer and fills the reserved-word table with the four
 * reserved words "int", "if", "else" and "then".
 *
 * @param _buffer input buffer the recognisers will read from
 */
Lexical_Analysis::StateTransition::StateTransition(Lexical_Analysis::Buffer<> *_buffer)
    : buffer(_buffer) {
    reserve(new ReserveWordToken(ReserveWordTag::INT, "int"));
    reserve(new ReserveWordToken(ReserveWordTag::IF, "if"));
    reserve(new ReserveWordToken(ReserveWordTag::ELSE, "else"));
    reserve(new ReserveWordToken(ReserveWordTag::THEN, "then"));
}

/**
 * Releases the reserved-word tokens allocated by the constructor, which were
 * previously never freed.
 *
 * Any pointer to a reserved-word token obtained from the table is invalid
 * once this destructor has run. The table stores raw pointers, so a
 * StateTransition object must not be copied: two copies would delete the
 * same tokens.
 */
Lexical_Analysis::StateTransition::~StateTransition() {
    for (auto& entry : reserveWords) {
        delete entry.second;
    }
    reserveWords.clear();
}
<~StateTransition.cpp> void Lexical_Analysis::StateTransition::fail() { buffer->forward = buffer->lexemeBegin; } void Lexical_Analysis::StateTransition::retract() { buffer->pre(); } void Lexical_Analysis::StateTransition::reserve(Lexical_Analysis::ReserveWordToken *token) { reserveWords[token->lexeme] = token; }
<~StateTransition.cpp> Lexical_Analysis::Token* Lexical_Analysis::StateTransition::getRelop() { int state = 0; char c = buffer->cur(); bool is_loop = (c == '<' || c == '=' || c == '>'); if (!is_loop) return nullptr; while (is_loop) { switch (state) { case 0: c = buffer->next(); if (c == '<') state = 1; else if (c == '=') state = 5; else if (c == '>') state = 6; break; case 1: c = buffer->next(); if (c == '=') state = 2; else if (c == '>') state = 3; else state = 4; break; case 2: // 須要將lexemeBegin指針設置爲當前詞素以後的第一個字符, // 所以使forward向前移動, 更改lexemeBegin指針後在回退 buffer->next(); buffer->lexemeBegin = buffer->forward; retract(); return new RelopToken(RelopTag::LE); case 3: buffer->next(); buffer->lexemeBegin = buffer->forward; retract(); return new RelopToken(RelopTag::NE); case 4: buffer->lexemeBegin = buffer->forward; retract(); return new RelopToken(RelopTag::LT); case 5: buffer->next(); buffer->lexemeBegin = buffer->forward; retract(); return new RelopToken(RelopTag::EQ); case 6: c = buffer->next(); if (c == '=') state = 7; else state = 8; break; case 7: buffer->next(); buffer->lexemeBegin = buffer->forward; retract(); return new RelopToken(RelopTag::GE); case 8: // 當前forward指針已經指向當前詞素以後的第一個位置, 不用更改 buffer->lexemeBegin = buffer->forward; retract(); return new RelopToken(RelopTag::GT); } } }
// <~StateTransition.cpp>
/**
 * Appends one decimal digit to an accumulator: result = result * 10 + val.
 *
 * @param result accumulator, updated in place
 * @param val    digit value to append
 */
void calculation(int& result, int val) {
    result = result * 10 + val;
}

/**
 * Recognises an unsigned number: digits, an optional ".digits" fraction and
 * an optional exponent introduced by an upper-case 'E' with an optional sign.
 *
 * States (numbering follows the transition diagram):
 *   12 start, 13 integer digits, 14 just after '.', 15 fraction digits,
 *   16 just after 'E', 17 just after the exponent sign, 18 exponent digits,
 *   19 accept.
 *
 * Changes from the earlier version of this function:
 *  - States 16 and 17 now go to the accepting state on an unexpected
 *    character. They used to stay put and keep reading, so "12Ex 7" consumed
 *    everything up to the next digit and used it as the exponent. A dangling
 *    'E' or sign is still consumed as part of the lexeme.
 *  - The fractional divisor is a double; as an int it overflowed after nine
 *    fractional digits.
 *  - A negative exponent always yields a DoubleToken; integer division used
 *    to truncate e.g. 5E-1 to 0.
 *  - Digits are classified with is_digit throughout, matching the entry
 *    guard, instead of mixing in isdigit on a plain char.
 *
 * Integer literals are accumulated in an int with no overflow check.
 *
 * @return a newly allocated IntToken or DoubleToken, or nullptr (without
 *         touching the buffer) when the current character is not a digit
 */
Lexical_Analysis::Token* Lexical_Analysis::StateTransition::getNumber() {
    if (!is_digit(buffer->cur())) {
        return nullptr;
    }

    int val = 0;               // integer part
    double dVal = 0.0;         // fractional part
    double place = 10.0;       // divisor of the next fractional digit: 10, 100, ...
    bool isDecimal = false;    // a '.' was seen
    int ePos = 0;              // magnitude of the exponent (the 6 in E06)
    bool isNegative = false;   // the exponent carries a '-' (E-03)

    int state = 12;
    while (state != 19) {
        const char c = buffer->next();
        switch (state) {
            case 12:
                // The entry guard saw a digit, so this is expected to match
                // on the first read (assuming next() returns the character
                // cur() reported). On a mismatch the state is kept.
                if (is_digit(c)) {
                    calculation(val, to_digit_10(c));
                    state = 13;
                }
                break;

            case 13:
                if (is_digit(c)) {
                    calculation(val, to_digit_10(c));
                } else if (c == '.') {
                    isDecimal = true;
                    state = 14;
                } else if (c == 'E') {
                    state = 16;
                } else {
                    state = 19;
                }
                break;

            case 14:
                if (is_digit(c)) {
                    dVal += 1.0 * to_digit_10(c) / place;
                    place *= 10.0;
                    state = 15;
                } else {
                    state = 19;
                }
                break;

            case 15:
                if (is_digit(c)) {
                    dVal += 1.0 * to_digit_10(c) / place;
                    place *= 10.0;
                } else if (c == 'E') {
                    state = 16;
                } else {
                    state = 19;
                }
                break;

            case 16:
                if (is_digit(c)) {
                    calculation(ePos, to_digit_10(c));
                    state = 18;
                } else if (c == '+' || c == '-') {
                    isNegative = (c == '-');
                    state = 17;
                } else {
                    state = 19;  // 'E' without an exponent: stop instead of scanning on
                }
                break;

            case 17:
                if (is_digit(c)) {
                    calculation(ePos, to_digit_10(c));
                    state = 18;
                } else {
                    state = 19;  // sign without digits: stop instead of scanning on
                }
                break;

            case 18:
                if (is_digit(c)) {
                    calculation(ePos, to_digit_10(c));
                } else {
                    state = 19;
                }
                break;
        }
    }

    // lexemeBegin takes the forward position reached after the look-ahead
    // character; forward itself steps back by one.
    char* const afterLexeme = buffer->forward;
    retract();
    buffer->lexemeBegin = afterLexeme;

    if (isDecimal || isNegative) {
        double value = dVal + val;
        for (int i = 0; i < ePos; i++) {
            if (isNegative) {
                value /= 10;
            } else {
                value *= 10;
            }
        }
        return new DoubleToken(value);
    }

    for (int i = 0; i < ePos; i++) {
        val *= 10;
    }
    return new IntToken(val);
}
<~StateTransition.cpp> Lexical_Analysis::Token* Lexical_Analysis::StateTransition::getCharacterSequence() { int state = 9; bool is_loop = is_letter(buffer->cur()); /* 直接返回 */ if (!is_loop) return nullptr; char c; while (is_loop) { switch (state) { case 9: c = buffer->next(); if (is_underline(c) || is_letter(c)) state = 10; break; case 10: c = buffer->next(); if (is_letter(c) || is_digit(c) || is_underline(c)) state = 10; else state = 11; break; case 11: is_loop = false; break; } } // 使lexemeBegin指向剛纔找到的詞素以後的第一個字符, 處理完詞素後, forward回退一個位置 char* tmp = buffer->forward; if (!buffer->is_end) { // 若是沒有讀到結尾, 執行回退 retract(); } std::string result = buffer->getString(); buffer->lexemeBegin = tmp; // 從符號表中查找是否爲關鍵詞(保留字), 若是是則返回關鍵詞token, 不然返回id token auto it = reserveWords.find(result); if (it != reserveWords.end()) { return (*it).second; } return new IdToken(result); }
<~StateTransition.cpp> void Lexical_Analysis::StateTransition::getBlankTabNewline() { bool is_loop = is_blank_tab_newline(buffer->cur()); if (!is_loop) return ; while (is_blank_tab_newline(buffer->cur())) { buffer->next(); } buffer->lexemeBegin = buffer->forward; }
<~main.cpp> int main() { string fileStr = "/home/yanuas/CompilingPrinciple/LexicalAnalysis/code.src"; Buffer<1024> buffer(fileStr); StateTransition transition(&buffer); while (1) { Token* token = transition.getRelop(); if (token != nullptr) transition.print_token(token); token = transition.getCharacterSequence(); if (token != nullptr) transition.print_token(token); token = transition.getNumber(); if (token != nullptr) transition.print_token(token); transition.getBlankTabNewline(); if (transition.buffer->is_end) break; } return 0; }