第三次作业前后向匹配结合统计方法进行中文切分-nothing3618-ChinaUnix博客

雪落随风

首页　| 　博文目录　| 　关于我

nothing3618

博客访问： 262159
博文数量： 54
博客积分： 2915
博客等级：少校
技术积分： 486
用户组：普通用户
注册时间： 2009-05-21 12:20

个人简介

这个人很懒，什么都没有留下

文章分类

全部博文（54）

flash（1）
OS（1）
每天进步一点点（6）
MFC（3）
linux（2）
c（16）
firefox插件开发（1）
html（1）
javascript（4）
机器学习学习日（1）
信息与知识获取学（1）
自然语言处理学习（4）
c++学习日记（8）
perl学习日记（3）
唠叨唠叨（1）
未分配的博文（1）

文章存档

2013年（1）

2012年（6）

2011年（11）

2010年（16）

2009年（20）

我的朋友

最近访客

推荐博文

第三次作业前后向匹配结合统计方法进行中文切分

分类：

2009-05-21 15:42:47

由于加入了linux家族，改用ubuntu了，所以这些程序都是在linux下编写和运行的。如果要在windows下使用，需要对代码作些修改，具体说明见附件里的readme。

（如果不粘点代码上来，感觉这文章太空了，虽然附件里有，但还是粘粘吧。咳咳，其实我是很低调的人）

文件:	作业三.rar
大小:	2222KB
下载:	下载

new_vocabulary.pl

#!/usr/bin/perl
# new_vocabulary.pl
#
# Builds an extended vocabulary: every entry of the base dictionary
# (vocabulary.Dic) plus every word that occurs in the segmented training
# corpus (1998-01-qiefen-file.txt). The result is written to
# new_vocabulary.dic, one word per line.
use strict;
use warnings;

my %seen;    # word => '1' for every word that belongs in the new vocabulary

# NOTE: "or" (not "||") is required here. "||" binds tighter than the list
# operator and would attach to the filename string, so a failed open would
# never reach die.
open my $dict_fh, '<', "vocabulary.Dic"
    or die "Cannot open file: $!";
open my $corpus_fh, '<', "1998-01-qiefen-file.txt"
    or die "Cannot open file: $!";
open my $out_fh, '>', "new_vocabulary.dic"
    or die "Cannot open file: $!";

# Load the base dictionary. Each line is "<number> <word>", so the word is
# the second whitespace-separated field.
while (my $entry = <$dict_fh>) {
    chomp $entry;
    my @fields = split ' ', $entry;
    next unless defined $fields[1];    # skip blank or malformed lines
    $seen{ $fields[1] } = '1';
}

# Add every word of the space-separated training corpus.
while (my $sentence = <$corpus_fh>) {
    chomp $sentence;

    # Consecutive spaces produce empty fields; those are not words.
    for my $word (grep { length } split / /, $sentence) {
        $seen{$word} = '1';
    }
}

# Sorted so that repeated runs produce an identical file.
for my $word (sort keys %seen) {
    print {$out_fh} $word, "\n";
}

close($dict_fh);
close($corpus_fh);
close($out_fh) or die "Cannot open file: $!";

statistics.pl

#!/usr/bin/perl
# statistics.pl
#
# Counts unigram and bigram frequencies in the segmented training corpus
# (1998-01-qiefen-file.txt) and writes them to statistics.txt.
#
# Output format, one record per line:
#     <word>::<count>             unigram
#     <word1> <word2>::<count>    bigram (the two words joined by one space)
use strict;
use warnings;

my %count_of;    # unigram or "w1 w2" bigram => number of occurrences

# NOTE: "or" (not "||") is required here. "||" would bind to the filename
# string and a failed open would go unnoticed.
open my $corpus_fh, '<', "1998-01-qiefen-file.txt"
    or die "Cannot open file: $!";
open my $stats_fh, '>', "statistics.txt"
    or die "Cannot open file: $!";

while (my $sentence = <$corpus_fh>) {
    chomp $sentence;

    # Drop the empty fields produced by consecutive spaces; otherwise they
    # would be counted as words and would break up real adjacent pairs.
    my @words = grep { length } split / /, $sentence;

    # Unigram counts.
    $count_of{$_}++ for @words;

    # Bigram counts over each pair of adjacent words in the line.
    for my $idx (0 .. $#words - 1) {
        $count_of{ $words[$idx] . ' ' . $words[ $idx + 1 ] }++;
    }
}

# Sorted so that repeated runs produce an identical file.
for my $key (sort keys %count_of) {
    print {$stats_fh} $key, "::", $count_of{$key}, "\n";
}

close($stats_fh) or die "Cannot open file: $!";
close($corpus_fh);

# Report the user CPU time consumed (times in scalar context).
my $time = times;
print $time, "\n";

ngram.pl

#!/usr/bin/perl
# ngram.pl
#
# Chinese word segmentation that combines forward maximum matching (FMM) and
# backward maximum matching (BMM) with a bigram model:
#   1. vocabulary()      loads the dictionary (new_vocabulary.dic).
#   2. read_statistics() loads unigram/bigram counts (statistics.txt).
#   3. fmm_bmm()         segments every *.txt file under $dir with FMM and
#                        BMM, writing intermediate results to fmm/ and bmm/.
#   4. mixture()         compares both results line by line; wherever they
#                        disagree, keeps the variant with the higher add-one
#                        smoothed bigram log-probability and writes to out/.
# The fmm/, bmm/ and out/ directories must already exist.
use strict;
use warnings;
use Encode;

################################### Globals ###################################
my %hash;           # dictionary: decoded word => '1'
my %hash1;          # training statistics: unigram or "w1 w2" => count (raw bytes)
my $V   = 84292;    # number of entries in the extended vocabulary; used as
                    # the smoothing term, keep in sync with new_vocabulary.dic
my $dir = "text";   # directory holding the texts to segment

my $MAX_WORD_LEN = 4;    # longest candidate (in characters) tried by FMM/BMM

################################## Functions ##################################

# Load every dictionary entry into %hash (decoded from UTF-8).
sub vocabulary {
    # "or", not "||": "||" binds to the filename and die could never fire.
    open my $fh, '<', "new_vocabulary.dic" or die "Cannot open file: $!";
    while (my $entry = <$fh>) {
        chomp $entry;
        $hash{ decode("utf8", $entry) } = '1';
    }
    close($fh);
    return;
}

# Load the "<key>::<count>" records of statistics.txt into %hash1.
sub read_statistics {
    open my $fh, '<', "statistics.txt" or die "Cannot open file: $!";
    while (my $record = <$fh>) {
        chomp $record;

        # Split on the LAST "::" so a key that itself contains "::" still
        # parses correctly; malformed lines are skipped.
        next unless $record =~ /\A(.*)::(\d+)\z/s;
        $hash1{$1} = $2;
    }
    close($fh);
    return;
}

# Return the names of the *.txt files inside $dir.
sub _text_files {
    opendir(my $dh, $dir) or die "Cannot open text: $!";

    # The dot is escaped so that only a real ".txt" extension matches.
    my @names = grep { /\.txt$/i } readdir $dh;
    closedir($dh);
    return @names;
}

# Forward maximum matching: repeatedly take the longest dictionary word (at
# most $MAX_WORD_LEN characters) from the left; fall back to one character.
# Works by position, so text such as "0" is not mistaken for "end of input".
sub _forward_max_match {
    my ($text) = @_;
    my @words;
    my $pos = 0;
    my $end = length $text;
    while ($pos < $end) {
        my $candidate = substr $text, $pos, $MAX_WORD_LEN;
        while (length($candidate) > 1 && !exists $hash{$candidate}) {
            $candidate = substr $candidate, 0, length($candidate) - 1;
        }
        push @words, $candidate;
        $pos += length $candidate;
    }
    return @words;
}

# Backward maximum matching: the same idea, scanning from the right and
# dropping the leftmost character of the candidate on a miss.
sub _backward_max_match {
    my ($text) = @_;
    my @words;
    my $end = length $text;
    while ($end > 0) {
        my $start = $end > $MAX_WORD_LEN ? $end - $MAX_WORD_LEN : 0;
        my $candidate = substr $text, $start, $end - $start;
        while (length($candidate) > 1 && !exists $hash{$candidate}) {
            $candidate = substr $candidate, 1;
        }
        unshift @words, $candidate;
        $end -= length $candidate;
    }
    return @words;
}

# Segment every input file with FMM and BMM and store the intermediate
# results (each word followed by one space) under fmm/ and bmm/.
sub fmm_bmm {
    for my $filename (_text_files()) {
        open my $in, '<', "$dir/$filename"
            or die "Cannot open this file: $!";
        open my $fmm_out, '>', "fmm/$filename"
            or die "Cannot open this file: $!";
        open my $bmm_out, '>', "bmm/$filename"
            or die "Cannot open this file: $!";

        while (my $raw = <$in>) {
            chomp $raw;
            my $text = decode("utf8", $raw);    # match by character, not byte

            my $fs = join('', map { $_ . " " } _forward_max_match($text));
            my $bs = join('', map { $_ . " " } _backward_max_match($text));

            print {$fmm_out} encode("utf8", $fs . "\n");
            print {$bmm_out} encode("utf8", $bs . "\n");
        }

        close($in);
        close($fmm_out) or die "Cannot open this file: $!";
        close($bmm_out) or die "Cannot open this file: $!";
    }
    return;
}

# Sum of add-one smoothed bigram log-probabilities over adjacent word pairs:
#     log( (count(w1 w2) + 1) / (count(w1) + V) )
# Missing counts are treated as 0.
sub _bigram_log_prob {
    my @words = @_;
    my $log_p = 0;
    for my $idx (0 .. $#words - 1) {
        my $prev       = $words[$idx];
        my $pair       = $prev . ' ' . $words[ $idx + 1 ];
        my $pair_count = exists $hash1{$pair} ? $hash1{$pair} : 0;
        my $prev_count = exists $hash1{$prev} ? $hash1{$prev} : 0;
        $log_p += log(($pair_count + 1) / ($prev_count + $V));
    }
    return $log_p;
}

# Merge one line: $fmm and $bmm are array refs holding the FMM and BMM words
# of the same text. Returns the merged line, each word followed by a space.
sub _merge_segmentations {
    my ($fmm, $bmm) = @_;
    my $merged = "";
    my ($i, $j) = (0, 0);

    while ($i < @$fmm && $j < @$bmm) {
        if ($fmm->[$i] eq $bmm->[$j]) {    # both agree: emit as is
            $merged .= $fmm->[$i] . ' ';
            $i++;
            $j++;
            next;
        }

        # The word before the disagreement serves as bigram context. At the
        # start of a line there is none; index -1 must not be used, as it
        # would silently pick the LAST word of the line.
        my @context1 = $i > 0 ? ($fmm->[ $i - 1 ]) : ();
        my @context2 = $j > 0 ? ($bmm->[ $j - 1 ]) : ();

        my @span1 = ($fmm->[$i]);
        my @span2 = ($bmm->[$j]);
        my $len1  = length $fmm->[$i];
        my $len2  = length $bmm->[$j];

        # The two spans contain different numbers of words, so the end of
        # the disagreement is found by total length: advance whichever side
        # is shorter until both cover the same stretch of text.
        while ($len1 != $len2) {
            if ($len1 > $len2) {
                last if $j >= $#$bmm;    # inconsistent input: do not overrun
                $j++;
                push @span2, $bmm->[$j];
                $len2 += length $bmm->[$j];
            }
            else {
                last if $i >= $#$fmm;
                $i++;
                push @span1, $fmm->[$i];
                $len1 += length $fmm->[$i];
            }
        }

        my $p1 = _bigram_log_prob(@context1, @span1);
        my $p2 = _bigram_log_prob(@context2, @span2);

        # Ties go to the forward result. The context word was already
        # emitted, so only the span itself is appended.
        my @chosen = $p1 >= $p2 ? @span1 : @span2;
        $merged .= $_ . ' ' for @chosen;

        # Step past the last word of the disagreement on both sides so the
        # next iteration starts where the segmentations line up again.
        $i++;
        $j++;
    }
    return $merged;
}

# Combine the FMM and BMM results of every file into out/, resolving each
# disagreement with the bigram model.
sub mixture {
    # Both passes segmented the same files, so one listing is enough.
    for my $filename (_text_files()) {
        open my $fmm_in, '<', "fmm/$filename"
            or die "Cannot open this file: $!";
        open my $bmm_in, '<', "bmm/$filename"
            or die "Cannot open this file: $!";
        open my $out, '>', "out/$filename"
            or die "Cannot open this file: $!";

        chomp(my @fmm_lines = <$fmm_in>);
        chomp(my @bmm_lines = <$bmm_in>);

        for my $k (0 .. $#fmm_lines) {
            my $bmm_line = defined $bmm_lines[$k] ? $bmm_lines[$k] : "";
            my @words1   = split(/ /, $fmm_lines[$k]);
            my @words2   = split(/ /, $bmm_line);
            print {$out} _merge_segmentations(\@words1, \@words2), "\n";
        }

        close($fmm_in);
        close($bmm_in);
        close($out) or die "Cannot open this file: $!";
    }
    return;
}

#################################### Main ####################################
vocabulary();
read_statistics();
fmm_bmm();
mixture();

############################## Report run time ###############################
# times in scalar context yields the user CPU time consumed so far.
my $time = times;
print $time, "\n";

阅读(993) | 评论(0) | 转发(0) |

上一篇：第二次作业──前向最大匹配算法进行中文切分

下一篇：朴素贝叶斯-中文文本分类

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6