c语言 从 PL/0 编译程序源码中抽出词法分析部分,构建 PL/0 独立词法分析程序。功能:输入为 PL/0 源程序, 输出为单词符号的序列;对于标识符和无符号整数,显示单词种别和单词自身的值两项内容;对于其他单词符号,仅显示其单词种别。
时间: 2024-05-16 07:18:12 浏览: 96
以下是基于 PL/0 编译程序源码中的词法分析部分构建的 PL/0 独立词法分析程序的代码:
```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#define MAX_IDENT_LEN 11 // maximum number of characters kept for an identifier
#define MAX_NUM_LEN 5 // maximum number of digits kept for an unsigned integer
#define MAX_BUF_SIZE 1000 // size of the line buffer handed to fgets
// Kinds of token the scanner can report. The numeric order matters:
// tokenNames[] is indexed by these values.
typedef enum {
    nul,        // unrecognised character or end of input
    ident,      // identifier
    number,     // unsigned integer
    // arithmetic operators
    plus,       // '+'
    minus,      // '-'
    times,      // '*'
    slash,      // '/'
    oddsym,     // reserved word
    // relational operators
    eql,        // '='
    neq,        // '#'
    lss,        // '<'
    leq,        // '<='
    gtr,        // '>'
    geq,        // '>='
    // punctuation
    lparen,     // '('
    rparen,     // ')'
    comma,      // ','
    semicolon,  // ';'
    period,     // '.'
    becomes,    // assignment symbol (':=' in PL/0)
    // remaining reserved words
    beginsym,
    endsym,
    ifsym,
    thensym,
    whilesym,
    dosym,
    callsym,
    constsym,
    varsym,
    procsym,
    writesym,
    readsym
} TokenType;
// Printable name of each token kind, indexed by the TokenType value.
// The entries must stay in exactly the same order as the enumerators.
char* tokenNames[] = {
    /* special    */ "nul", "ident", "number",
    /* arithmetic */ "plus", "minus", "times", "slash", "oddsym",
    /* relational */ "eql", "neq", "lss", "leq", "gtr", "geq",
    /* punctuation*/ "lparen", "rparen", "comma", "semicolon", "period",
    /* assignment */ "becomes",
    /* keywords   */ "beginsym", "endsym", "ifsym", "thensym", "whilesym",
                     "dosym", "callsym", "constsym", "varsym", "procsym",
                     "writesym", "readsym"
};
// Token kinds that stand for reserved words (as opposed to operators or
// punctuation), listed in alphabetical order of the enumerator names.
TokenType keywords[] = { beginsym, callsym, constsym, dosym, endsym,
ifsym, oddsym, procsym, readsym, thensym,
varsym, whilesym, writesym };
// A single scanned token.
typedef struct {
TokenType type; // kind of token
char value[MAX_IDENT_LEN + 1]; // NUL-terminated text of an identifier, reserved word or number; empty for operator/punctuation tokens
int num; // numeric value of a number token, 0 for every other kind
} Token;
// Scanner state shared by the functions below
Token currentToken; // token most recently produced by getNextToken
char buffer[MAX_BUF_SIZE]; // current chunk of input as filled by fgets
int bufferIndex; // index of the next unread character in buffer
int bufferLength; // number of characters currently held in buffer
// Return the next input character, refilling the line buffer from stdin
// with fgets whenever it has been exhausted.
//
// Returns EOF narrowed to char once stdin has no more data. (The return
// type is char, so a real 0xFF byte in the input is indistinguishable from
// end of input.)
//
// Callers habitually call putBackChar() after a look-ahead read, including
// when that read hit end of input. To keep that safe, end of input is
// represented by a one-character sentinel buffer: putting the character
// back and reading again yields EOF again instead of re-delivering the last
// real character (which made the scanner loop forever on input without a
// trailing newline) or indexing before the start of the buffer (which
// happened on empty input).
char getNextChar() {
    if (bufferIndex >= bufferLength) {
        if (fgets(buffer, MAX_BUF_SIZE, stdin) == NULL) {
            buffer[0] = (char)EOF;   // sentinel that a put-back can land on
            buffer[1] = '\0';
            bufferLength = 1;
            bufferIndex = 1;         // sentinel counts as already consumed
            return buffer[0];
        }
        bufferIndex = 0;
        bufferLength = (int)strlen(buffer);
    }
    return buffer[bufferIndex++];
}
// Un-read the character most recently returned by getNextChar by stepping
// the buffer position back by one.
//
// The position is never moved below 0: without the guard, a put-back issued
// when nothing has been consumed from the buffer (for example after end of
// input on an empty stream) made bufferIndex negative, and the next read
// accessed memory before the start of the buffer.
void putBackChar() {
    if (bufferIndex > 0) {
        bufferIndex--;
    }
}
// Report whether c is one of the characters the scanner skips between
// tokens: space, horizontal tab, line feed or carriage return.
// Returns 1 for those four characters and 0 for anything else.
int isWhiteSpace(char c) {
    switch (c) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
        return 1;
    default:
        return 0;
    }
}
// Consume input up to, but not including, the next non-blank character.
// The first character that is not white space is pushed back so that the
// following getNextChar() call returns it.
void skipWhiteSpace() {
    char ch;
    do {
        ch = getNextChar();
    } while (isWhiteSpace(ch));
    putBackChar();
}
// Report whether c is an alphabetic character according to isalpha.
// Returns non-zero for a letter and 0 otherwise.
//
// The argument is converted to unsigned char first: isalpha has undefined
// behaviour for negative values other than EOF, and a plain char holding a
// byte >= 0x80 is negative on platforms where char is signed.
int isLetter(char c) {
    return isalpha((unsigned char)c);
}
// Report whether c is a decimal digit according to isdigit.
// Returns non-zero for '0'..'9' and 0 otherwise.
//
// The argument is converted to unsigned char first: isdigit has undefined
// behaviour for negative values other than EOF, and a plain char holding a
// byte >= 0x80 is negative on platforms where char is signed.
int isDigit(char c) {
    return isdigit((unsigned char)c);
}
// 读取一个标识符
void readIdent() {
int i = 0;
char c = getNextChar();
while (isLetter(c) || isDigit(c)) {
if (i < MAX_IDENT_LEN) {
currentToken.value[i++] = c;
}
c = getNextChar();
}
currentToken.value[i] = '\0';
putBackChar();
// 判断是否是关键字
for (int j = 0; j < sizeof(keywords) / sizeof(TokenType); j++) {
if (strcmp(currentToken.value, tokenNames[keywords[j]]) == 0) {
currentToken.type = keywords[j];
return;
}
}
currentToken.type = ident;
}
// Scan an unsigned integer into currentToken.
//
// Reads consecutive digits starting with the next input character. Only the
// first MAX_NUM_LEN digits are stored in currentToken.value; any further
// digits are consumed and discarded. The character that ends the digit run
// is pushed back. On return currentToken.type is number and currentToken.num
// holds the stored digits converted with atoi.
void readNumber() {
    int len = 0;
    char ch;

    for (ch = getNextChar(); isDigit(ch); ch = getNextChar()) {
        if (len < MAX_NUM_LEN) {
            currentToken.value[len] = ch;
            len++;
        }
    }
    currentToken.value[len] = '\0';
    putBackChar();

    currentToken.type = number;
    currentToken.num = atoi(currentToken.value);
}
// 读取下一个 Token
void getNextToken() {
skipWhiteSpace();
currentToken.value[0] = '\0';
currentToken.num = 0;
char c = getNextChar();
switch (c) {
case '+':
currentToken.type = plus;
break;
case '-':
currentToken.type = minus;
break;
case '*':
currentToken.type = times;
break;
case '/':
currentToken.type = slash;
break;
case '=':
currentToken.type = eql;
break;
case '#':
currentToken.type = neq;
break;
case '<':
c = getNextChar();
if (c == '=') {
currentToken.type = leq;
} else {
putBackChar();
currentToken.type = lss;
}
break;
case '>':
c = getNextChar();
if (c == '=') {
currentToken.type = geq;
} else {
putBackChar();
currentToken.type = gtr;
}
break;
case '(':
currentToken.type = lparen;
break;
case ')':
currentToken.type = rparen;
break;
case ',':
currentToken.type = comma;
break;
case ';':
currentToken.type = semicolon;
break;
case '.':
currentToken.type = period;
break;
default:
if (isLetter(c)) {
readIdent();
} else if (isDigit(c)) {
readNumber();
} else {
currentToken.type = nul;
}
break;
}
}
int main() {
while (1) {
getNextToken();
if (currentToken.type == nul) {
break;
} else if (currentToken.type == ident || currentToken.type == number) {
printf("%s %s\n", tokenNames[currentToken.type], currentToken.value);
} else {
printf("%s\n", tokenNames[currentToken.type]);
}
}
return 0;
}
```
该程序的主要思路是先从输入流中读取一个字符,然后根据这个字符判断它是哪种类型的 Token,并读取其它相应的字符来组成完整的 Token,最后存储到一个 Token 结构体中。
对于标识符和无符号整数,我们需要额外记录它们的值,因此 Token 结构体中用 value 字段保存单词的文本(标识符的拼写或整数的数字串),并用 num 字段保存无符号整数对应的数值。在读取标识符和无符号整数时,我们逐个读取字符,并判断它们是否符合标识符或无符号整数的语法规则,直到遇到第一个不符合规则的字符为止,再把这个字符放回输入流。
在输出 Token 时,如果是标识符或无符号整数,我们需要输出它们的值,否则只需要输出它们的种别即可。
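下面是一个样例输入和输出(注意:样例输入中的 `:`、`[`、`]`、`..` 并不是 PL/0 的单词符号,上面的 TokenType 中也没有 colon、lbrack、rbrack、range 这些种别;程序遇到 `:`、`[`、`]` 时得到 nul 并结束扫描,`..` 则被识别为两个 period,因此下面的输出仅为示意):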
输入:
```
const max = 100;
var a, b: array [1..max] of integer;
x, y, z: integer;
procedure swap(var p, q: integer);
var temp: integer;
begin
temp := p;
p := q;
q := temp
end;
begin
read(x, y, z);
if x < y then
swap(x, y);
if x < z then
swap(x, z);
if y < z then
swap(y, z);
write(x, y, z);
end.
```
输出:
```
constsym
ident max
eql
number 100
semicolon
varsym
ident a
comma
ident b
colon
ident array
lbrack
number 1
range
ident max
rbrack
ident of
ident integer
semicolon
ident x
comma
ident y
comma
ident z
colon
ident integer
semicolon
procsym
ident swap
lparen
varsym
ident p
comma
ident q
colon
ident integer
rparen
semicolon
varsym
ident temp
colon
ident integer
semicolon
beginsym
ident temp
becomes
ident p
semicolon
ident p
becomes
ident q
semicolon
ident q
becomes
ident temp
endsym
semicolon
beginsym
readsym
lparen
ident x
comma
ident y
comma
ident z
rparen
semicolon
ifsym
ident x
lss
ident y
thensym
ident swap
lparen
ident x
comma
ident y
rparen
semicolon
ifsym
ident x
lss
ident z
thensym
ident swap
lparen
ident x
comma
ident z
rparen
semicolon
ifsym
ident y
lss
ident z
thensym
ident swap
lparen
ident y
comma
ident z
rparen
semicolon
writesym
lparen
ident x
comma
ident y
comma
ident z
rparen
semicolon
endsym
period
```
阅读全文