2009-05-21 15:42:47

Having joined the Linux camp and switched to Ubuntu, I did all of this programming under Linux. If you want to run it under Windows, the code needs a few changes; the details are covered in the readme inside the attachment.

(Without pasting at least some code this post would feel too empty, so even though everything is in the attachment, I'll paste it here anyway. Ahem, I'm actually a very low-key person.)
Attachment: 作业三.rar (2222 KB)

new_vocabulary.pl


#!/usr/bin/perl

use strict;
use warnings;

my @vocabulary;
my $line;
my @line;
my @text;
my %hash;

open FILE,"vocabulary.Dic" || die "Cannot open file: $!";
open FILE1, "<","1998-01-qiefen-file.txt" || die "Cannot open file: $!";
open FILE2,">","new_vocabulary.dic" || die "Cannot open file: $!";



while (<FILE>) {            # read the vocabulary into a hash

    chomp;

    @vocabulary=split(' ',$_); # each entry in my vocabulary is preceded by an index number, so split and take the second field


    $hash{$vocabulary[1]}='1';

}

chomp(@text=<FILE1>);

foreach    $line (@text) {
    @line=split(/ /,$line);
    foreach $line (@line) {    # add every token from the training set to the vocabulary

        $hash{$line}='1';
    }
}
foreach $line (keys %hash) {
    print FILE2 $line,"\n";
}



close(FILE);
close(FILE1);
close(FILE2);    
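
As a quick sanity check (an illustrative sketch only, not part of the assignment), the number of lines written to new_vocabulary.dic is the same count that $V in ngram.pl below is meant to hold (84292 is the value hard-coded there), so counting the lines is an easy way to verify it:

#!/usr/bin/perl
# Illustration: count the entries in new_vocabulary.dic; the result should
# match the $V hard-coded in ngram.pl.
use strict;
use warnings;

open my $fh, "<", "new_vocabulary.dic" or die "Cannot open file: $!";
my $n = 0;
$n++ while <$fh>;
close $fh;
print "$n entries\n";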

statistics.pl


#!/usr/bin/perl

use strict;
use warnings;

my @text;
my @line;
my $line;
my $cache1;
my $cache2;
my $cache3;
my %hash;

open FILE, "<","1998-01-qiefen-file.txt" || die "Cannot open file: $!";
open FILE1,">","statistics.txt" || die "Cannot open file: $!";

chomp(@text=<FILE>);

foreach    $line (@text) {
    @line=split(/ /,$line);        # split each line into words

    foreach $cache1 (@line) {        # store each word in the hash

        if (!(exists $hash{$cache1})) {
            $hash{$cache1}=1;    # initialize its count

        }
        else {
            $hash{$cache1}+=1;    # increment its count

        }
    }
    while (@line > 1) {    # store every pair of adjacent words in the hash as well

        $cache1=shift @line;
        $cache2=shift @line;
        $cache3=$cache1.' '.$cache2;    # join the two adjacent words with a space to form the bigram key

        if (!(exists $hash{$cache3})) {    # count the bigram frequency

            $hash{$cache3}=1;
        }
        else {
            $hash{$cache3}+=1;
        }
        unshift @line, $cache2;
    }
}

foreach $cache1 (keys %hash) {
    print FILE1 $cache1,"::",$hash{$cache1},"\n";        # write the frequency data to a text file, one "token::count" entry per line

}

close(FILE1);
close(FILE);

my $time=times;
print $time,"\n";

ngram.pl


#!/usr/bin/perl

use strict;
use warnings;
use Encode;

################################### variable declarations ################################


my %hash;        # hash holding the vocabulary

my %hash1;        # hash holding the frequency statistics gathered from the training set

my $V=84292;    # total number of entries in the new vocabulary after adding the words segmented from the training set

my $dir="text";    # name of the directory containing the texts to be segmented


my $line;        
my $filename;
my $f1;    
my $b1;
my $fs;
my $bs;
my $s;
my $fw;
my $bw;
my $k;
my $i;
my $j;
my $m;
my $n;
my $num1;
my $num2;
my $p1;
my $p2;
my $cache;

my @line;
my @FText;
my @BText;
my @words1;
my @words2;
my @cache1;
my @cache2;

#######################################        subroutines        ###################################


sub vocabulary {        # read the vocabulary entries into the hash

    open FILE,"<","new_vocabulary.dic" or die "Cannot open file: $!";

    while (<FILE>) {
        chomp;
        $line=decode("utf8",$_);    # decode from UTF-8


        $hash{$line}='1';

    }
    close(FILE);
}

sub read_statistics {        # read the frequency statistics into the hash

    open FILE,"<","statistics.txt" or die "Cannot open file: $!";
    while (<FILE>) {
        chomp;
        @line=split(/::/,$_);    # in the statistics file, token and count are joined by "::" on each line

        $hash1{$line[0]}=$line[1];
    }
    close(FILE);
}

sub fmm_bmm {        # segment the input texts with both forward and backward maximum matching

    opendir(DH,$dir) || die "Cannot open text: $!";

    foreach $filename (grep(/\.txt$/i,readdir DH)) {        # process each .txt file in the directory in turn


        open FILE1,"<","$dir/$filename" or die "Cannot open this file: $!";

        open FILE2,">","fmm/$filename" or die "Cannot open this file: $!";

        open FILE3,">","bmm/$filename" or die "Cannot open this file: $!";

        while (<FILE1>) {        # read one line of the text to be segmented

            chomp;

            $f1=decode("utf8",$_);        # decode from UTF-8

            $b1=$f1;                # forward and backward matching work on the same line

            $fs="";            # holds the forward-segmented result for this line

            $bs="";            # same, for the backward result

            while ($f1) {

                $fw=substr $f1,0,4;     # take the first four characters (punctuation included) as the candidate string $fw


                while (!(exists $hash{$fw})&&(length $fw)!=1) { # while $fw is not in the vocabulary and is longer than one character


                    $fw=substr $fw,0,((length $fw)-1);        # drop the rightmost character of $fw


                }
                $fs=$fs.$fw." ";

                $f1=substr $f1,(length $fw),(length $f1);    # remove the matched candidate from the front of the remaining string


            }            

            while ($b1) {

                $bw=substr $b1,-4;     # take the last four characters (punctuation included) as the candidate string $bw


                while (!(exists $hash{$bw})&&(length $bw)!=1) { # while $bw is not in the vocabulary and is longer than one character


                    $bw=substr $bw,1,(length $bw);        # drop the leftmost character of $bw


                }
                $bs=$bw." ".$bs;

                $b1=substr $b1,0,((length $b1)-(length $bw));    # remove the matched candidate from the end of the remaining string


            }
            $fs=$fs."\n";
            $bs=$bs."\n";
            print FILE2 encode("utf8",$fs);    # write the forward maximum matching result to an intermediate file

            print FILE3 encode("utf8",$bs);    # write the backward maximum matching result to an intermediate file

        }            

        close(FILE1);

        close(FILE2);
        close(FILE3);    

    }
    close(DH);
}

sub mixture {        # find where the forward and backward results differ and use the bigram model to decide which one to keep

    opendir(DH,$dir) || die "Cannot open text: $!";        # both passes segmented the same files, so the file names only need to be read from one directory


    foreach $filename (grep(/\.txt$/i,readdir DH)) {

        open FILE1,"<","fmm/$filename" or die "Cannot open this file: $!";

        open FILE2,"<","bmm/$filename" or die "Cannot open this file: $!";

        open FILE3,">","out/$filename" or die "Cannot open this file: $!";

        chomp(@FText=<FILE1>);    # read the text, one line per array element

        chomp(@BText=<FILE2>);
        $k=0;
        foreach $line (@FText) {        # process each line, splitting it into words

            $s="";
            @words1=split(/ /,$line);        # the segmented words go into arrays

            @words2=split(/ /,$BText[$k]);
            $k++;
            $i=0;$j=0;
            while ($i<@words1&&$j<@words2) {    # walk both segmentations of the line, looking for places where they differ

                @cache1=();@cache2=();
                if ($words1[$i] ne $words2[$j]) {    # the two segmentations disagree here


                    push(@cache1,$words1[$i-1]);    # push the word before the disagreement into the "buffer"

                    push(@cache2,$words2[$j-1]);
                    push(@cache1,$words1[$i]);        # push the first differing word into the "buffer"

                    push(@cache2,$words2[$j]);

                    $num1=length($words1[$i]);        # string lengths of the differing words, used to realign the two sides

                    $num2=length($words2[$j]);

                    until ($num1==$num2) {            # the two sides contain different numbers of words, so the end of the disagreement is found by matching total string length

                        if ( $num1 > $num2 ) {
                            $j++;
                            push(@cache2, $words2[$j]);    # whichever side is longer waits while the other side advances one word

                            $num2=$num2+length($words2[$j]);
                        }
                        if ( $num1 < $num2 ) {
                            $i++;
                            push(@cache1,$words1[$i]);    # likewise

                            $num1=$num1+length($words1[$i]);
                        }
                    }
                    
                    $p1=0;$p2=0;
                    for ($m=0;$m<@cache1-1;$m++) {
                        $cache=$cache1[$m].' '.$cache1[$m+1];    
                        if (exists $hash1{$cache}) {    # the bigram occurs in the training data

                            $p1+=log(($hash1{$cache}+1)/($hash1{$cache1[$m]}+$V));    # accumulate the add-one smoothed log conditional probability

                        }
                        else {    # the bigram was never seen

                            if (exists $hash1{$cache1[$m]}) {        # the word in the denominator has a nonzero count

                                $p1+=log(1/($hash1{$cache1[$m]}+$V));
                            }
                            else {    # neither the bigram nor the unigram was seen

                                $p1+=log(1/$V);
                            }
                        }
                    }
                    
                    for ($n=0;$n<@cache2-1;$n++) {
                        $cache=$cache2[$n].' '.$cache2[$n+1];
                        if (exists $hash1{$cache}) {
                            $p2+=log(($hash1{$cache}+1)/($hash1{$cache2[$n]}+$V));
                        }
                        else {
                            if (exists $hash1{$cache2[$n]}) {                            
                                $p2+=log(1/($hash1{$cache2[$n]}+$V));
                            }
                            else {
                                $p2+=log(1/$V);
                            }
                        }
                    }
                        #########################        compare the probabilities of the two segmentations        #################

                    if ($p1>=$p2) {    
                        shift @cache1;
                        foreach $cache (@cache1) {
                            $s=$s.$cache.' ';
                        }
                    }
                    else {
                        shift @cache2;
                        foreach $cache (@cache2) {
                            $s=$s.$cache.' ';
                        }        
                    }            
                    $i++;        # these two increments are essential; it took a long debugging session to find that they were needed

                    $j++;        # they skip to the first word after the disagreement, i.e. the start of the next matching stretch

                    
                }
                else {
                    $s=$s.$words1[$i].' ';    # the two segmentations agree here, so output the word as is

                    $i++;
                    $j++;
                }
            }
            $s=$s."\n";
            print FILE3 $s;    # write the merged segmentation to the output file

        }
        close(FILE1);
        close(FILE2);
        close(FILE3);
    }
    close(DH);
}

#############################        main        #########################################


vocabulary();
read_statistics();
fmm_bmm();
mixture();

###########################        print the running time        ################################


my $time=times;
print $time,"\n";
