Lexical Analysis-barryhu-ChinaUnix博客

/*
 * Token type codes.
 *
 * Single-character tokens use their own ASCII value as the type code;
 * every multi-character token code is numbered from 257 upward so it
 * cannot collide with a single-character value.
 */
#define T_SEMICOLON ';' /* use ASCII values for single char tokens */
#define T_LPAREN '('
#define T_RPAREN ')'
#define T_ASSIGN '='
#define T_DIVIDE '/'
/* ... further single-character tokens are elided in the original listing ... */

#define T_WHILE 257 /* reserved words */
#define T_IF 258
#define T_RETURN 259
/* ... further reserved words are elided in the original listing ... */

#define T_IDENTIFIER 268 /* identifiers, constants, etc. */
#define T_INTEGER 269
#define T_DOUBLE 270
#define T_STRING 271

#define T_END 349     /* code used when at end of file */
#define T_UNKNOWN 350 /* token was unrecognized by scanner */

struct token_t { int type; // one of the token codes from above union { // 注意这里union的使用目的。 char stringValue[256]; // holds lexeme value if string/identi int intValue; // holds lexeme value if integer double doubleValue; // holds lexeme value if double } val; }; int main(int argc, char *argv[]) { struct token_t token; InitScanner(); // 说明这里是一边分析文本得到token，一边对其进行处理。 while (ScanOneToken(stdin, &token) != T_END) ; // here is where you would process each token return 0; } // 将所有的保留字存入表中（表的设计？），当然对于这个表需要查询的功能。在这段代码中假设所有的保留字都是大写的。 static void InitScanner() { create_reserved_table(); // table maps reserved words to to insert_reserved("WHILE", T_WHILE) insert_reserved("IF", T_IF) insert_reserved("RETURN", T_RETURN) .... } // 分析输入文件，得到下一个token。token的值以及类型都存在结构token中，此函数同时返回分析出的token的类型，这样方便程序的编写（考虑如果没有返回token类型的话，main函数应该怎么写？）。 static int ScanOneToken(FILE *fp, struct token_t *token){ int i, ch, nextch; ch = getc(fp); // read next char from input stream while (isspace(ch)) // if necessary, keep reading til non-space char ch = getc(fp); // (discard any white space) switch(ch) { case '/': // could either begin comment or T_DIVIDE op nextch = getc(fp); if (nextch == '/' || nextch == '*') ; // here you would skip over the comment else ungetc(nextch, fp); // fall-through to single-char token case case ';': case ',': case '=': // ... and other single char tokens token->type = ch; // ASCII value is used as token type

return ch; // ASCII value used as token type case 'A': case 'B': case 'C': // ... and other upper letters token->val.stringValue[0] = ch; for (i = 1; isupper(ch = getc(fp)); i++) // gather uppercase token->val.stringValue[i] = ch; ungetc(ch, fp); token->val.stringValue[i] = '\0'; // lookup reserved word token->type = lookup_reserved(token->val.stringValue); return token->type; case 'a': case 'b': case 'c': // ... and other lower letters token->type = T_IDENTIFIER; token->val.stringValue[0] = ch; for (i = 1; islower(ch = getc(fp)); i++) token->val.stringValue[i] = ch; // gather lowercase ungetc(ch, fp); token->val.stringValue[i] = '\0'; if (lookup_symtab(token->val.stringValue) == NULL) add_symtab(token->val.stringValue); // get symbol for ident return T_IDENTIFIER; case '0': case '1': case '2': case '3': //.... and other digits token->type = T_INTEGER; token->val.intValue = ch - '0'; while (isdigit(ch = getc(fp))) // convert digit char to number token->val.intValue = token->val.intValue * 10 + ch - '0'; ungetc(ch, fp); return T_INTEGER; case EOF: return T_END; default: // anything else is not recognized token->val.intValue = ch; token->type = T_UNKNOWN; return T_UNKNOWN; } }

这段代码很有用。因为当我们需要处理文本文件的时候，常常也会经过这样的一个处理过程。了解了这段代码的工作原理，我们就可以在自己的文本处理程序中加以运用。需要好好地研究研究，比如define的使用目的、union的使用目的，以及最后对各种类型的token的处理等。

阅读(677) | 评论(0) | 转发(0) |

上一篇：介绍

下一篇：介绍

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6