编译器的 Scanner 还有很多复杂的特性,如注释、换行符、转义字符等等,如何用 C++ 代码实现?
时间: 2023-12-03 11:42:34 浏览: 112
扫描器 C++源码编写
3星 · 编辑精心推荐
Scanner 是编译器的一个重要组成部分,它用于将源代码转换为词法单元。对于 C++ 这种语言,Scanner 需要支持注释、换行符、转义字符等特性。下面是一个简单的 C++ Scanner 的实现,支持单行注释、多行注释、换行符(\n)、回车符(\r)、水平制表符(\t)、垂直制表符(\v)、退格符(\b)、反斜杠转义字符(\) 以及单引号(')和双引号(")。
```c++
#include <cctype>
#include <fstream>
#include <iostream>
#include <regex>
#include <sstream>
#include <string>
#include <vector>
using namespace std;
// Category attached to every token the Scanner records.
// The enum is unscoped, so streaming a TokenType prints its integer value
// (0 for IDENTIFIER through 5 for NEWLINE, in declaration order).
enum TokenType {
IDENTIFIER, // name that did not match the keyword list in scanIdentifier
KEYWORD, // name that matched the keyword list in scanIdentifier
OPERATOR, // token recorded by scanOperator
LITERAL, // token recorded by scanLiteral
COMMENT, // token recorded by scanComment
NEWLINE // line-break marker; recorded with an empty value
};
// One lexical token recorded by the Scanner.
struct Token {
TokenType type; // category of the token
string value; // token text; empty for NEWLINE tokens
int line; // value of the scanner's line counter when the token was recorded
};
// Lexical scanner: reads the file named at construction and turns its text
// into a flat list of Token values.
class Scanner {
public:
// Stores the file name, opens the file, and sets the line counter to 1.
Scanner(string filename);
// Closes the input file.
~Scanner();
// Reads the file line by line, dispatches on the next character to the
// scan* helpers, and returns a copy of the accumulated token list.
vector<Token> scan();
private:
string filename; // path passed to the constructor
ifstream file; // input stream opened on `filename`
vector<Token> tokens; // tokens accumulated so far
int line; // current line number, starting at 1
// Helpers invoked while scanning; each one except scanKeyword reads from
// the front of `ss` and appends to `tokens`.
void scanIdentifier(stringstream& ss); // identifiers and keywords
void scanKeyword(stringstream& ss); // defined with an empty body
void scanOperator(stringstream& ss); // operator tokens
void scanLiteral(stringstream& ss); // literal tokens
void scanComment(stringstream& ss); // comment tokens
void scanNewline(stringstream& ss); // NEWLINE tokens; also advances `line`
};
// Remembers `filename`, opens it for reading, and starts the line counter
// at 1. A failed open is not reported here: the stream is simply left in a
// failed state.
Scanner::Scanner(std::string filename)
    : filename(filename), file(filename), line(1) {}
// Explicitly closes the input stream when the Scanner goes away.
Scanner::~Scanner() {
    file.close();
}
vector<Token> Scanner::scan() {
string lineStr;
while (getline(this->file, lineStr)) {
stringstream ss(lineStr);
while (!ss.eof()) {
char c = ss.peek();
if (isalpha(c) || c == '_') {
scanIdentifier(ss);
} else if (isdigit(c)) {
scanLiteral(ss);
} else if (c == '"' || c == '\'') {
scanLiteral(ss);
} else if (c == '/') {
char next = ss.get();
if (next == '/') {
scanComment(ss);
} else if (next == '*') {
scanComment(ss);
} else {
ss.unget();
scanOperator(ss);
}
} else if (isspace(c)) {
scanNewline(ss);
} else {
scanOperator(ss);
}
}
this->tokens.push_back({NEWLINE, "", this->line++});
}
return this->tokens;
}
void Scanner::scanIdentifier(stringstream& ss) {
string identifier;
while (!ss.eof()) {
char c = ss.peek();
if (isalnum(c) || c == '_') {
identifier += ss.get();
} else {
break;
}
}
if (identifier == "int" || identifier == "double" || identifier == "string") {
this->tokens.push_back({KEYWORD, identifier, this->line});
} else {
this->tokens.push_back({IDENTIFIER, identifier, this->line});
}
}
// Placeholder with no effect: it neither reads from `ss` nor records a token.
void Scanner::scanKeyword(std::stringstream& ss) {
    static_cast<void>(ss);  // parameter intentionally unused
}
void Scanner::scanOperator(stringstream& ss) {
string op;
while (!ss.eof()) {
char c = ss.peek();
if (c == '+' || c == '-' || c == '*' || c == '/' || c == '%' ||
c == '=' || c == '>' || c == '<' || c == '!' || c == '&' ||
c == '|' || c == '^' || c == '~' || c == '?' || c == ':') {
op += ss.get();
} else {
break;
}
}
this->tokens.push_back({OPERATOR, op, this->line});
}
void Scanner::scanLiteral(stringstream& ss) {
char quote = ss.get();
string literal;
literal += quote;
while (!ss.eof()) {
char c = ss.get();
if (c == '\\') {
literal += c;
c = ss.get();
literal += c;
} else if (c == quote) {
literal += c;
break;
} else {
literal += c;
}
}
this->tokens.push_back({LITERAL, literal, this->line});
}
void Scanner::scanComment(stringstream& ss) {
char first = ss.get();
char second = ss.get();
string comment;
comment += first;
comment += second;
if (comment == "//") {
getline(ss, comment);
this->tokens.push_back({COMMENT, comment, this->line});
} else {
while (!ss.eof()) {
char c = ss.get();
comment += c;
if (c == '*' && ss.peek() == '/') {
comment += ss.get();
break;
}
}
this->tokens.push_back({COMMENT, comment, this->line});
}
}
void Scanner::scanNewline(stringstream& ss) {
char c = ss.get();
if (c == '\r' && ss.peek() == '\n') {
c = ss.get();
}
this->tokens.push_back({NEWLINE, "", this->line++});
}
int main() {
Scanner scanner("test.cpp");
vector<Token> tokens = scanner.scan();
for (auto token : tokens) {
cout << token.type << " : " << token.value << " : " << token.line << endl;
}
return 0;
}
```
上面的代码实现了一个简单的 Scanner,覆盖了标识符、关键字、运算符、字面量和注释等基本词法单元。实现采用逐行读取、逐字符判断的手写扫描方式;代码虽然包含了 <regex> 头文件,但并没有实际使用正则表达式,也只支持 C++ 词法中的一小部分特性。
阅读全文