Lex和Yacc应用教程(四).语法树的应用

guitarhua 2014-10-30

展开全文

Lex和Yacc应用方法(四).语法树的应用

草木瓜 20070515

一、序

不论什么语言，语法结构总是那几种，可以想象任何程序体都可以解释成一棵语法
树，语法树的本质是递归，很显然Yacc文法的核心思想也是递归。本文就通过具体实例，
使用Yacc构建递归的语法树来解决实际问题。
比较遗憾的是，在总结的过程中想表达清楚并不容易，估且三分言传，七分会意吧。
关键在于个人去思考。

二、递归的一些思想

    我们先看一个简化的C语言示例段：

    i=0;
    while(i<=10) {
    print(i);
    i=i+1;
    }
    print(i+i);

    首先，我们将()+/* print while之类的组合称为expr(expression)，仅表示基本的表
达式，可以理解通过递归可以进行任意的运算符组合。如下面每一行都可称为expr:

    i=0
    while(i<=10)
    print(i)
    i=i+1
    print(i+i)

    再把expr + ;的语句行称为stmt(statement)，表示一条语句的结束。把{}引起来的多个stmt
称为stmt_list。如此，原示例段可表示为：

  stmt
  expr stmt_list
  stmt

  这样显然不符合递归法则，倘若stmt也可由expr stmt_list组合，程序则可以递归到最顶级

  stmt
  stmt
  stmt

    这也要求yacc文法定义必须可以递归到最顶级，即如上所示。


三、内存结构

编译过程必须在内存中形成一定规则且可递归的树形结构，比较容易理解对每一语句stmt
需要构建一棵语法树。

    以下是我们准备采用的语法树实际示例：

  Graph 0:

      [=]
       |
     |----|
     |    |
   idx(i) c(0)


  Graph 1:

                 while
                   |
        |----------------|
        |                |
      [<=]              [;]
        |                |
     |-----|     |----------|
     |     |     |          |
   idx(i) c(10) print      [=]
                 |          |
                 |     |-------|
                 |     |       |
               idx(i) idx(i) [+]
                               |
                             |----|
                             |    |
                           idx(i) c(1)


  Graph 2:

      print
        |
        |
        |
       [+]
        |
     |-----|
     |     |
   idx(i) idx(i)


    细心查看以上三张图，会发现每个stmt即对应一张树形图，树结点包含三种类型：操作符
(如 + = ; )，变量索引(如 idx(i))和值(如 c(10) c(1) )。对于每个操作符，需要保证一
个可递归的规则。

四、具体实例

A. node.h <树结点的定义头文件>

/* 定义树结点的权举类型 */
typedef enum { TYPE_CONTENT, TYPE_INDEX, TYPE_OP } NodeEnum;

  /* 操作符 */
  typedef struct {
  int name; /* 操作符名称 */
  int num; /* 操作元个数 */
  struct NodeTag * node[1]; /* 操作元地址可扩展 */
  } OpNode;

  typedef struct NodeTag {
  NodeEnum type; /* 树结点类型 */
  /* Union 必须是最后一个成员 */
  union {
  int content; /* 内容 */
  int index; /* 索引 */
  OpNode op; /* 操作符对象 */
  };

  } Node;

    extern int Var[26];


    [说明] Node即为定义的树形结点，结点可以是三种类型(CONTENT,INDEX,OP)。结点
    如果是操作符对象(OpNode)的话，结点可继续递归结点。操作符结点包括了名称，个
    数和子结点三个要素，其中子结点可以为多个。


    B.lexya_e.l <lex文件>

  %{
  #include <stdlib.h>
  #include "node.h"
  #include "lexya_e.tab.h"
  void yyerror(char *);
  %}
  %%

  [a-z] {
   yylval.sIndex = *yytext -'a';
   return VARIABLE;
  }

  [0-9]+ {
   yylval.iValue = atoi(yytext);
   return INTEGER;
  }

  [()<>=+*/;{}.] {
   return *yytext;
  }

  ">=" return GE;
  "<=" return LE;
  "==" return EQ;
  "!=" return NE;

  "&&" return AND;
  "||" return OR;

  "while" return WHILE;
  "if" return IF;
  "else" return ELSE;
  "print" return PRINT;

  [/t/n]+ ; /* 去除空格，回车 */
  . printf("unknow symbol:[%s]/n",yytext);
  %%
  int yywrap(void) {
  return 1;
  }

    [说明] 这里的Lex文件比较简单，没啥好说的。


    C.lexya_e.y <yacc文件>

  001 %{
  002
  003 #include <stdio.h>
  004 #include <stdlib.h>
  005 #include <stdarg.h>
  006 #include "node.h"
  007
  008 /* 属性操作类型 */
  009 Node *opr(int name, int num, ...);
  010
  011 Node *set_index(int value);
  012 Node *set_content(int value);
  013
  014 void freeNode(Node *p);
  015 int exeNode(Node *p);
  016
  017 int yylexeNode(void);
  018 void yyerror(char *s);
  019
  020 int Var[26]; /* 变量数组 */
  021
  022 %}
  023
  024 %union {
  025 int iValue; /* 变量值 */
  026 char sIndex; /* 变量数组索引 */
  027 Node *nPtr; /* 结点地址 */
  028 };
  029
  030 %token <iValue> VARIABLE
  031 %token <sIndex> INTEGER
  032 %token WHILE IF PRINT
  033 %nonassoc IFX
  034 %nonassoc ELSE
  035 %left AND OR GE LE EQ NE '>' '<'
  036 %left '+' '-'
  037 %left '*' '/'
  038 %nonassoc UMINUS
  039 %type <nPtr> stmt expr stmt_list
  040 %%
  041 program:
  042 function { exit(0); }
  043 ;
  044 function:
  045 function stmt { exeNode($2); freeNode($2); }
  046 | /* NULL */
  047 ;
  048 stmt:
  049 ';' { $$ = opr(';', 2, NULL, NULL); }
  050 | expr ';' { $$ = $1; }
  051 | PRINT expr ';' { $$ = opr(PRINT, 1, $2); }
  052 | VARIABLE '=' expr ';' { $$ = opr('=', 2, set_index($1), $3); }
  053 | WHILE '(' expr ')' stmt { $$ = opr(WHILE, 2, $3, $5); }
  054 | IF '(' expr ')' stmt %prec IFX { $$ = opr(IF, 2, $3, $5); }
  055 | IF '(' expr ')' stmt ELSE stmt %prec ELSE { $$ = opr(IF, 3, $3, $5, $7); }
  056 | '{' stmt_list '}' { $$ = $2; }
  057 ;
  058 stmt_list:
  059 stmt { $$ = $1; }
  060 | stmt_list stmt { $$ = opr(';', 2, $1, $2); }
  061 ;
  062 expr:
  063 INTEGER { $$ = set_content($1); }
  064 | VARIABLE { $$ = set_index($1); }
  065 | '-' expr %prec UMINUS { $$ = opr(UMINUS, 1, $2); }
  066 | expr '+' expr { $$ = opr('+', 2, $1, $3); }
  067 | expr '-' expr { $$ = opr('-', 2, $1, $3); }
  068 | expr '*' expr { $$ = opr('*', 2, $1, $3); }
  069 | expr '/' expr { $$ = opr('/', 2, $1, $3); }
  070 | expr '<' expr { $$ = opr('<', 2, $1, $3); }
  071 | expr '>' expr { $$ = opr('>', 2, $1, $3); }
  072 | expr GE expr { $$ = opr(GE, 2, $1, $3); }
  073 | expr LE expr { $$ = opr(LE, 2, $1, $3); }
  074 | expr NE expr { $$ = opr(NE, 2, $1, $3); }
  075 | expr EQ expr { $$ = opr(EQ, 2, $1, $3); }
  076 | expr AND expr { $$ = opr(AND, 2, $1, $3); }
  077 | expr OR expr { $$ = opr(OR, 2, $1, $3); }
  078 | '(' expr ')' { $$ = $2; }
  079 ;
  080 %%
  081 #define SIZE_OF_NODE ((char *)&p->content - (char *)p)
  082
  083 Node *set_content(int value) {
  084
  085 Node *p;
  086
  087 size_t sizeNode;
  088 /* 分配结点空间 */
  089 sizeNode = SIZE_OF_NODE + sizeof(int);
  090
  091 if ((p = malloc(sizeNode)) == NULL)
  092   yyerror("out of memory");
  093
  094 /* 复制内容 */
  095 p->type = TYPE_CONTENT;
  096 p->content = value;
  097
  098 return p;
  099
  100 }
  101
  102 Node *set_index(int value) {
  103
  104 Node *p;
  105 size_t sizeNode;
  106 /* 分配结点空间 */
  107 sizeNode = SIZE_OF_NODE + sizeof(int);
  108
  109 if ((p = malloc(sizeNode)) == NULL)
  110   yyerror("out of memory");
  111
  112 /* 复制内容 */
  113 p->type = TYPE_INDEX;
  114 p->index = value;
  115
  116 return p;
  117 }
  118
  119 Node *opr(int name, int num, ...) {
  120
  121 va_list valist;
  122 Node *p;
  123 size_t sizeNode;
  124 int i;
  125 /* 分配结点空间 */
  126 sizeNode = SIZE_OF_NODE + sizeof(OpNode) + (num - 1) * sizeof(Node*);
  127
  128 if ((p = malloc(sizeNode)) == NULL)
  129   yyerror("out of memory");
  130
  131 /* 复制内容 */
  132
  133 p->type = TYPE_OP;
  134 p->op.name = name;
  135 p->op.num = num;
  136
  137 va_start(valist, num);
  138
  139 for (i = 0; i < num; i++)
  140 p->op.node[i] = va_arg(valist, Node*);
  141
  142 va_end(valist);
  143 return p;
  144 }
  145 void freeNode(Node *p) {
  146 int i;
  147 if (!p) return;
  148 if (p->type == TYPE_OP) {
  149   for (i = 0; i < p->op.num; i++)
  150   freeNode(p->op.node[i]);
  151 }
  152 free (p);
  153 }
  154 void yyerror(char *s) {
  155 fprintf(stdout, "%s/n", s);
  156 }
  157 int main(void) {
  158 yyparse();
  159 return 0;
  160 }

    [说明] 这个文件是核心所在，大体分为Yacc预定义文法，BNF递归文法和扩展实现函数
    三个部分。

      Yacc预定义文法的解释：

      (1).(024-031)(039)

      %union {
    int iValue; /* 变量值 */
    char sIndex; /* 变量数组索引 */
    Node *nPtr; /* 结点地址 */
    };

    %token <iValue> INTEGER
    %token <sIndex> VARIABLE

    (024-031)这一段扩充了yystype的内容，默认yystype只是int类型，编译之后会
    生成如下代码：

    #ifndef YYSTYPE
    typedef union {
    int iValue; /* 变量值 */
    char sIndex; /* 变量数组索引 */
    Node *nPtr; /* 结点地址 */
    } yystype;
    # define YYSTYPE yystype
    # define YYSTYPE_IS_TRIVIAL 1
    #endif

    并将<iValue>与INTEGER，<sIndex>与VARIABLE绑定，表示对lex返回的值自动
    进行类型转换。

    (039)将<nPtr>与Union的指针类型绑定。

    即在剖析器的内容栈中，常量、变量和节点都可以由yylval表示。yylval既可以是int,
    char，也可以是Node *。具体可以查看lexya_e.tab.c生成文件部分，可有更深刻的
    理解。(switch (yyn) 下面)


    (2).(032-039)

    这段方法主要定义了操作符的优先级。nonassoc，意味着没有依赖关系。它经常在连接
    词中和 %prec一起使用，用于指定一个规则的优先级。如下面的规则存在IF ELSE的二义
    性，需要使用nonassoc指定规则。054对应IFX，055对应ELSE，055高于054。

    054 | IF '(' expr ')' stmt %prec IFX { $$ = opr(IF, 2, $3, $5); }
    055 | IF '(' expr ')' stmt ELSE stmt %prec ELSE { $$ = opr(IF, 3, $3, $5, $7); }

       (039)type的关键字语句，表示后面的返回值是<ptr>类型。


       BNF递归文法(040-080)：

       这个递归文法看起来容易，自个设计起来还是有点难度的，相关的递归思想可以参见本文
       最前面的“递归的一些思想”。可以考虑一下(056)-(060)的语法定义。


       扩展实现函数：

       本例扩展了set_index,set_value两个赋值语句，其操作实质是在内存空间分配index
       和value的两种树结点。opr这个扩展函数很重要，而且使用了动态参数，主要考虑操作
       符的操作元个数是可变的，这个也与头文件“struct NodeTag * node[1];”的定义思
       想一致。opr主要在内存空间中分配操作符相关的树结点。

       set_index,set_value,opr从概念上是完全一致，目的就是在内存中构造一棵可递归的
       语法树。

    D.parser.c

  #include <stdio.h>
  #include "node.h"
  #include "lexya_e.tab.h"

  int exeNode(Node *p) {
   if (!p) return 0;
   switch(p->type) {
    case TYPE_CONTENT: return p->content;
    case TYPE_INDEX:   return Var[p->index];
    case TYPE_OP:
     switch(p->op.name) {

      case WHILE: while(exeNode(p->op.node[0]))exeNode(p->op.node[1]);
                  return 0;

      case IF:     if (exeNode(p->op.node[0]))
                    exeNode(p->op.node[1]);
                  else
                  if (p->op.num>2)
                  exeNode(p->op.node[2]);
                  return 0;

      case PRINT: printf("%d/n", exeNode(p->op.node[0]));
                  return 0;

      case ';':    exeNode(p->op.node[0]);
                  return exeNode(p->op.node[1]);

      case '=':    return Var[p->op.node[0]->index] = exeNode(p->op.node[1]);
      case UMINUS: return exeNode(p->op.node[0]);
      case '+':    return exeNode(p->op.node[0]) + exeNode(p->op.node[1]);
      case '-':    return exeNode(p->op.node[0]) - exeNode(p->op.node[1]);
      case '*':    return exeNode(p->op.node[0]) * exeNode(p->op.node[1]);
      case '/':    return exeNode(p->op.node[0]) / exeNode(p->op.node[1]);
      case '<':    return exeNode(p->op.node[0]) < exeNode(p->op.node[1]);
      case '>':    return exeNode(p->op.node[0]) > exeNode(p->op.node[1]);
      case GE:     return exeNode(p->op.node[0]) >= exeNode(p->op.node[1]);
      case LE:     return exeNode(p->op.node[0]) <= exeNode(p->op.node[1]);
      case NE:     return exeNode(p->op.node[0]) != exeNode(p->op.node[1]);
      case EQ:     return exeNode(p->op.node[0]) == exeNode(p->op.node[1]);
      case AND:    return exeNode(p->op.node[0]) && exeNode(p->op.node[1]);
      case OR:     return exeNode(p->op.node[0]) || exeNode(p->op.node[1]);
     }

   }

   return 0;

  }

    这个文件是对语法树的解释分析。只包含一个递归函数exeNode。首先分树结点类型，
    再根据操作符一一判定动作。

五、结束

    bison -d lexya_e.y
  lex lexya_e.l
  gcc -g -o parser lex.yy.c lexya_e.tab.c parser.c

  编译即可测试执行。

    多说无益，个人感觉自已领会的东西才是最要的。本示例包含了一些C语法，不过也只是
一小部分，在后续的文章中将逐步扩展本例的功能，从而发掘yacc和lex的强大功能。

<注：本文示例参考Tom Niemann的《A Compact Guide to Lex & Yacc》
http:///lexandyacc/index.html>