93:c4编译器回顾(3)

我们完成了对词法分析的获取函数next()，现在我们尝试根据token分析，和我们的语法规则，进行简单的语法分析。

解析变量

我们的解释器的语法结构，可以用下面的EBNF的表示法直观的体现：

program ::= {global_declaration}+

global_declaration ::= enum_decl | variable_decl | function_decl

enum_decl ::= 'enum' [id] '{' id ['=' 'num'] {',' id ['=' 'num'} '}'

variable_decl ::= type {'*'} id { ',' {'*'} id } ';'

function_decl ::= type {'*'} id '(' parameter_decl ')' '{' body_decl '}'

parameter_decl ::= type {'*'} id {',' type {'*'} id}

body_decl ::= {variable_decl}, {statement}

statement ::= non_empty_statement | empty_statement

non_empty_statement ::= if_statement | while_statement | '{' statement '}'
| 'return' expression | expression ';'

if_statement ::= 'if' '(' expression ')' statement ['else' non_empty_statement]

while_statement ::= 'while' '(' expression ')' non_empty_statement

对于我们的程序而言，一切都是从对global_delartion开始：

变量定义
类型定义（目前只支持enum）
函数定义

我们的program()作为语法解析函数，框架如下:

void program() {
    next();
    while(token > 0){
        global_declaration();
    }
}

global_declaration

即全局定义的语句，我们通过递归下降的方法，来判断当前的定义类型，具体实现如下：

void global_declaration(){
    int type;
    int i;
    basetype = INT;
    if(token == Enum){
        match(Enum);
        if(token != '{') match(Id);
        else{
            match('{');
            enum_declaration();
            match('}');
        } 
        match(';');
        return;
    }else if(token==Int){
        match(Int);
    }else if(token==Char){
        match(Char);
        basetype = CHAR;
    }

    while(token != ';' && token != '{' && token != '}'){
        type = basetype;
        // 指针处理
        while(token==Mul){
            match(Mul);
            type = type + PTR;
        }
        if (token!=Id){
            printf("%d: bad global declaration\n", line);
            exit(-1);
        }
        if (current_id->class != 0){
            printf("%d: duplicate global declaration\n", line);
            exit(-1);
        }
        match(Id);
        current_id->type = type;

        if (token=='('){
            current_id->class = Fun;
            current_id->value = (int)(text);
            function_declaration();
        }else{
            current_id->class = Glo;
            current_id->value = (int)(data);
            data += sizeof(int);
        }
        if(token==','){
            match(',');
        }
    }
    next();
}

这里我们使用的依旧是lookahead的思想，我们无法根据一个token信息就实现对这一部分的语法功能解析，所以我们需要结合之后的信息，来对定义进行判断。

我们的解释器同时也支持指针类型，我们这里使用type = type+PTR的方式，来表示其包含指针的信息。

enum_declaration

主要用来解析用,分隔的变量，这里我们需要注意编译器对枚举变量的实现：

void enum_declaration(){
    int i=0;
    while(token!='}'){
        if(token!=Id){
            printf("%d: 错误的枚举变量声明\n", line);
            exit(-1);
        }
        next();
        if(token=='='){
            match('=');
            if(token!=Num){
                printf("%d: 枚举变量赋值错误\n", line);
                exit(-1);
            }
            i = token_val;
            next();
        }
        current_id->class = Num;
        current_id->type = INT;
        current_id->value = i++;
        if(token==',') next();
    }
}

辅助函数match

这里我们使用match来匹配并获取下一个token:

void match(int tk){
    if(token == tk){
        next();
    }else{
        printf("%d: 语法错误, 缺少 %c\n", line, tk);
        exit(-1);
    }
}

函数定义的解析

我们在先前的函数Id判断中：

...
if (token == '(') {
current_id[Class] = Fun;
current_id[Value] = (int)(text); 
function_declaration();
} else {
...

在确定是函数类型之后，我们开始对函数结构的解析，我们对函数体的解析结构如下：

void function_declaration(){
    match('(');
    function_parameters();
    match(')');
    match('{');
    function_body();
    // match('}');
    while(current_id->token){
        if(current_id->class==Loc){
            current_id->class=current_id->Bclass;
            current_id->type=current_id->Btype;
            current_id->value=current_id->Bvalue;
        }
        current_id++;
    }
}

这里需要注意两个点：

一是在对函数体的分析结束之后，不需要match("}")，因为函数体解析的函数会匹配他
二是在函数解析完毕之后，我们需要将用到的token信息还原。这是因为在函数体解析的过程中，同名的局部变量会把原来定义的全局变量信息覆盖。

进一步的实现，则分别是对参数和函数体的内容的解析，首先是对参数的解析：

void function_parameters(){
    int type;
    int params = 0;
    type = basetype = INT;
    while(token!=')'){
       if(token==Int) match(Int);
       else if(token==Char){
            match(Char);
            type = CHAR;
       }
       while(token==Mul){
            match(Mul);
            type = type + PTR;
       }
       if(token!=Id){
            printf("%d: 错误的函数参数声明\n", line);
            exit(-1);
        }
        if (current_id->class != 0){
            printf("%d: 重复定义了局部变量-^-\n", line);
            exit(-1);
        }
        match(Id);

        current_id->Bclass = current_id->class;
        current_id->Btype = current_id->type;
        current_id->Bvalue = current_id->value;
        current_id->class = Loc;
        current_id->type = type;
        current_id->value = ++params;
        
        if(token==',') match(',');
    }
}

这里我们需要注意两点，首先是index_of_bp，这个变量用于确定参数在栈上的偏移值，因为从bp到函数传参之间隔着一个返回地址，所以在计算时需要将index_of_bp加一。

其次是，当我们遇到一个Id类型的token，我们将其所有的信息都备份一遍，并且将值赋值为参数的索引

然后是对函数体的解析，我们可以进一步分成局部变量声明和语句部分：

void function_body(){
    int pos_local = 0;
    int type;

    while(token==Int||token==Char){
        basetype = (token==Int ? INT : CHAR);
        match(token);
        while(token!= ';'){
            type = basetype;
            while(token==Mul){
                match(Mul);
                type = type + PTR;
            }
            if (token!=Id){
                printf("%d: 错误的局部变量声明\n", line);
                exit(-1);
            }
            if (current_id->class != 0){
                printf("%d: 重复定义了局部变量-^-\n", line);
                exit(-1);
            }
            match(Id);
            current_id->Bclass = current_id->class;
            current_id->Btype = current_id->type;
            current_id->Bvalue = current_id->value;
            current_id->class = Loc;
            current_id->type = type;
            current_id->value = ++pos_local;
            if(token==',') match(',');
        }
        match(';');
    }
    *++text = ENT;  // 函数入口指令
    *++text = pos_local; // 局部变量大小
    // 语句处理
    while(token != '}'){
        statement();
    }
    *++text = LEV;  // 函数返回指令
}

这一部分需要注意的就是*++text的含义，相当于我们在栈上为局部变量预留了空间，后面的LEV则是用来返回函数调用。

至此我们对全局变量和函数定义的解析完成了，之后我们将着重实现对语句的实现，并为他们生成可用的字节码指令。