查看文章
 
RegExp DEMO version -- [1]正则表达式 TO NFA表格 -- 词法分析代码
2007年06月02日 星期六 上午 10:10

最近想做一个正则表达式的解析工具,
目前已经完成了正则表达式向NFA(不确定有穷自动机)表格的转换代码。
源代码包在这里下载:
======================================
       源代码zip包请到这里下载http://www.yuyuyouer.cn/down/regexp.zip
======================================

我再跟几个帖子把核心代码贴出来一部分:供各位赏玩 。。。

readME文件如下
===================================
作者:王旭华
泰山学院
E-MAIL:qfhuazi@163.com
QQ: 471600163

文件:regexp-win.exe -- windows的编译
                             cfree3.5+win2003
           regexp-linux        -- linux下的编译
                             gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7)

功能:将正则表达式转化为一个NFA表格
DEMO支持的运算符:
         重复运算:+ * ? {m} {m, } {m,n}
         连接运算:. |
         字母表运算: [] [^]
         转义字符: \xff \377 \77 \\ \r \n \f \v \t ...
                   \d \D \s \S \w \W


                             2007年6月1日
PS:这个日子很有纪念意义啊 。。。
         ^-^!

=====================================

测试的屏幕拷贝[linux平台]
=====================================

[xuhua@linux xuhua]$ ./regexp.linux
Example: Shell>regexp "x(a[^\S]b)+y"
debug: JUST A TEST BEGIN...
debug: start scan the regexp:
             "x(a[^\S]b)+y"
debug: (A+)ONCE_MORE
debug: end scan the regexp:
             "x(a[^\S]b)+y"
debug: ################### there is the NFA table basic info: #######################
debug: StateCount = 11
head->842d020        cur->842d090         tail->842d090
debug: ################### there is the NFA table: #######################
#842d020->[256]842d070 |
#842d070->[120]842d0c0 |
#842d0c0->[256]842d110 |
#842d110->[97]842d160 |
#842d160->[256]842d1b0 |
#842d1b0->[32]842d1d0 | [13]842d1d0 | [12]842d1d0 | [11]842d1d0 | [10]842d1d0 | [9]842d1d0 |
#842d1d0->[98]842d270 | [256]842d1d0 |
#842d270->[256]842d130 |
#842d130->[121]842d2c0 | [256]842d110 |
#842d2c0->[256]842d090 |
#842d090->
debug: ################### the NFA table ended #######################
debug: StateCount = 11
head->842d020        cur->842d090         tail->842d090
debug: ################### the NFA table basic info ended #######################
debug: JUST A TEST END...

=====================================

测试的屏幕拷贝[win平台]
=====================================

D:\My_Prj\RegExp>RegExp-release.exe "\xcd\xf5王旭华"
debug: JUST A TEST BEGIN...
debug: start scan the regexp:
             "\xcd\xf5王旭华"
debug: end scan the regexp:
             "\xcd\xf5王旭华"
debug: ################### there is the NFA table basic info: ##################
#####
debug: StateCount = 11
head->272480         cur->272518          tail->272518
debug: ################### there is the NFA table: #######################
#272480->[256]2724f8 |
#2724f8->[205]272550 |
#272550->[245]272588 |
#272588->[205]2725c0 |
#2725c0->[245]2725f8 |
#2725f8->[208]272630 |
#272630->[241]272668 |
#272668->[187]2726a0 |
#2726a0->[170]2726d8 |
#2726d8->[256]272518 |
#272518->
debug: ################### the NFA table ended #######################
debug: StateCount = 11
head->272480         cur->272518          tail->272518
debug: ################### the NFA table basic info ended ######################
#
debug: JUST A TEST END...

==============================================

[regexp.c]中主要函数声明和源代码

下面是主要的函数声明
======================================
SymbolType getSymbol();        /*词法分析函数,返回一个符号的类型和它的值*/
int dealState(SymbolType symboltype);       /*处理输入状态*/
void repeat_needed(int m, int eleindex);       /*{m,n}生成必选的m次循环状态表*/
void repeat_optional(int n, int eleindex);         /*{m,n}生成可选的n次循环状态表*/
pStateTable repeatMachine_needed(int m, pMachine pm);       /*A{m,n}生成必选的m次循环状态表*/
pStateTable repeatMachine_optional(int n, pMachine pm);         /*A{m,n}生成可选的n次循环状态表*/
int getASCII(char c); /*得到c的ASCII码*/

====================================

下面是语法解析的驱动部分,用了一个while来驱动这个自动机来运做,
里面对状态的具体处理都放在了函数dealState()中,注释很详细就不多说了 。。

========================================
/*核心解析函数: 语法分析部分*/
int regexp_main() {
         /*初始化 g_st & g_mstk*/
         g_st = newStateTable(256);       /*0-255, 256是epsilon(空)*/
         appendStateTable(g_st, newState()); /*开始状态*/
         g_st->curState = g_st->head;
         g_mstk = newMachineStack();       /*堆栈空*/

         g_scan_pos = -1;
         g_symbol_type = START_REGEXP;
         while_notfinish = 1; /*表达式扫描循环条件,在END_REGEXP中会置0*/
         while (while_notfinish) {
             if (dealState(g_symbol_type) == 1) {
                 /*下一个符号,在前面的switch()中,
                           有些预先读取一个词的分枝会用continue语句会跳过这里*/
                 getSymbol();
             }
         }

         /*输出StateTable,测试用*/
         showStateTable(g_st);
         destroyStateTable(g_st);
         destroyMachineStack(g_mstk);
         /*
         printf("debug: press anykey to continue ...\n");
         getch();
         */
         return 0;
}


=======================================

这里是词法分析的函数,他从一个正则表达式中读区一个特定的词,并返回类型,
具体的功能各位看代码吧,不算复杂,我的文档还没有整理完毕。
这里面的枚举类型:SymbolType(在[regexp.h]中)的定义也一块贴出来了,以便各位参考。

=======================================

/*终结符类型*/
typedef enum SymbolType_enum {
         UNKNOWN = 0,
         END_REGEXP,
         START_REGEXP,
         INPUT_ELE,
         REPEAT_ZERO_MORE, REPEAT_ZERO_ONCE, REPEAT_ONCE_MORE,
         REPEAT_RANGE_MN, REPEAT_RANGE_M, REPEAT_RANGE_M_MORE,
         AND_MACHINE_BEGIN, AND_MACHINE_END,
         OR_MACHINE_BEGIN, OR_MACHINE_END,
         NOT_OP, DOT, BACKTRACE,
         NUMBER, NOT_NUMBER,
         ALL_SPACE, NOT_ALL_SPACE,
         AZaz09_, NOT_AZaz09_,
         LETTER_RANGE
}SymbolType;

=======================================

/*词法分析,取得一个符号*/
SymbolType getSymbol() {
         int *cc = &g_symbol_charvalue;
         char tmp; /*临时变量*/
    
         *cc = getASCII(g_strRegExp[++g_scan_pos]);
         /*正则表达式结尾*/
         if (*cc == NULL) {return g_symbol_type=END_REGEXP;}
         if (*cc == '(') {return g_symbol_type=AND_MACHINE_BEGIN;}
         if (*cc == ')') {return g_symbol_type=AND_MACHINE_END;}
         if (*cc == '[') {return g_symbol_type=OR_MACHINE_BEGIN;}
         if (*cc == ']') {return g_symbol_type=OR_MACHINE_END;}
         if (*cc == '|') {return g_symbol_type=BACKTRACE;}
         if (*cc == '^') {return g_symbol_type=NOT_OP;}
         if (*cc == '.') {return g_symbol_type=DOT;}
         if (*cc == '-') {return g_symbol_type=LETTER_RANGE;}
         /*转义字符*/
         if (*cc == '\\') {
             *cc = getASCII(g_strRegExp[++g_scan_pos]);
             if (*cc == 'd') {return g_symbol_type=NUMBER;}
             if (*cc == 'D') {return g_symbol_type=NOT_NUMBER;}
             if (*cc == 'f') {*cc = '\f'; return g_symbol_type=INPUT_ELE;}
             if (*cc == 'n') {*cc = '\n'; return g_symbol_type=INPUT_ELE;}
             if (*cc == 'r') {*cc = '\r'; return g_symbol_type=INPUT_ELE;}
             if (*cc == 't') {*cc = '\t'; return g_symbol_type=INPUT_ELE;}
             if (*cc == 'v') {*cc = '\v'; return g_symbol_type=INPUT_ELE;}
             if (*cc == 's') {return g_symbol_type=ALL_SPACE;}
             if (*cc == 'S') {return g_symbol_type=NOT_ALL_SPACE;}
             if (*cc == 'w') {return g_symbol_type=AZaz09_;}
             if (*cc == 'W') {return g_symbol_type=NOT_AZaz09_;}
             /*16进制*/
             if (*cc == 'x') {
                 tmp = getASCII(g_strRegExp[++g_scan_pos]);
                 if (tmp>='0' && tmp<='9') {
                     *cc = tmp - '0';
                 } else if (tmp>='a' && tmp<='f') {
                     *cc = tmp - 'a' + 10;
                 } else if (tmp>='A' && tmp<='F') {
                     *cc = tmp - 'A' + 10;
                 } else {
                     /*不是16进制数字,返回原字符'x'*/
                     g_scan_pos--;
                     return g_symbol_type=INPUT_ELE;
                 }
                 tmp = getASCII(g_strRegExp[++g_scan_pos]);
                 if (tmp>='0' && tmp<='9') {
                     *cc *= 16;
                     *cc += tmp - '0';
                 } else if (tmp>='a' && tmp<='f') {
                     *cc *= 16;
                     *cc += tmp - 'a' + 10;
                 } else if (tmp>='A' && tmp<='F') {
                     *cc *= 16;
                     *cc += tmp - 'A' + 10;
                 } else {
                     /*这个不是16进制数字,只有一位:\xF*/
                     g_scan_pos--;
                     return g_symbol_type=INPUT_ELE;
                 }

                 return g_symbol_type=INPUT_ELE;
             }
             /*8进制*/
             /*以0-3开头的8进制数可以有3位*/
             if (*cc>='0' && *cc<='3') {
                 *cc -= '0';
                 tmp = getASCII(g_strRegExp[++g_scan_pos]);
                 if (tmp>='0' && tmp<='7') {
                     *cc *= 8;
                     *cc += tmp - '0';
                 } else {
                     /*只有一位8进制数字:\7*/
                     g_scan_pos--;
                     return g_symbol_type=INPUT_ELE;
                 }
                 tmp = getASCII(g_strRegExp[++g_scan_pos]);
                 if (tmp>='0' && tmp<='7') {
                     *cc *= 8;
                     *cc += tmp - '0';
                 } else {
                     /*只有两位8进制数字:\77*/
                     g_scan_pos--;
                     return g_symbol_type=INPUT_ELE;
                 }
                 return g_symbol_type=INPUT_ELE;
             }
             /*以4-7开头的8进制可以有2位*/
             if (*cc>='4' && *cc<='7') {
                 *cc -= '0';
                 tmp = getASCII(g_strRegExp[++g_scan_pos]);
                 if (tmp>='0' && tmp<='7') {
                     *cc *= 8;
                     *cc += tmp - '0';
                 } else {
                     /*只有一位8进制数字:\7*/
                     g_scan_pos--;
                     return g_symbol_type=INPUT_ELE;
                 }
                 return g_symbol_type=INPUT_ELE;
             } else {
                 /*[^x0-7dDfnrtvsSwW]*/
                 return g_symbol_type=INPUT_ELE;
             }
         }
         /*重复运算*/
         if (*cc == '*') {return g_symbol_type=REPEAT_ZERO_MORE;}
         if (*cc == '+') {return g_symbol_type=REPEAT_ONCE_MORE;}
         if (*cc == '?') {return g_symbol_type=REPEAT_ZERO_ONCE;}
         if (*cc == '{') {
             *cc = getASCII(g_strRegExp[++g_scan_pos]);
             if (*cc>'9' || *cc<'0') {return g_symbol_type=UNKNOWN;}
             g_repeat_m = 0;        /*取{m,n}的m*/
             while (*cc>='0' && *cc<='9') {
                 g_repeat_m *= 10;
                 g_repeat_m += *cc-'0';
                 *cc = getASCII(g_strRegExp[++g_scan_pos]);
             }
             /*{m}*/
             if (*cc == '}') {return g_symbol_type=REPEAT_RANGE_M;}
             if (*cc != ',') {return g_symbol_type=UNKNOWN;}

             *cc = getASCII(g_strRegExp[++g_scan_pos]);
             /*{m,}*/
             if (*cc == '}') {return g_symbol_type=REPEAT_RANGE_M_MORE;}
             /*{m,n}*/
             if (*cc>'9' || *cc<'0') {return g_symbol_type=UNKNOWN;}
             g_repeat_n = 0;        /*取{m,n}的n*/
             while (*cc>='0' && *cc<='9') {
                 g_repeat_n *= 10;
                 g_repeat_n += *cc-'0';
                 *cc = getASCII(g_strRegExp[++g_scan_pos]);
             }
             if (*cc == '}') {
                 return g_symbol_type=REPEAT_RANGE_MN;
             } else {
                 return g_symbol_type=UNKNOWN;
             }
         }
    
         /*其他的普通字符*/
         return g_symbol_type=INPUT_ELE;

         return g_symbol_type=UNKNOWN;
}

/*得到c的ASCII码*/
int getASCII(char c) {
         int ascii;
         ascii = (int)c;
         if (ascii <0) {
             ascii += 256;
         }
         return ascii;
}


类别:c/c++||添加到搜藏 |分享到i贴吧|浏览(3534)|评论 (0)
 
最近读者:
 
网友评论:
发表评论:
姓 名:
网址或邮箱: (选填)
内 容:
     

   
帮助中心 | 空间客服 | 投诉中心 | 空间协议
©2012 Baidu