Chinaunix首页 | 论坛 | 博客
  • 博客访问: 257941
  • 博文数量: 54
  • 博客积分: 2915
  • 博客等级: 少校
  • 技术积分: 486
  • 用 户 组: 普通用户
  • 注册时间: 2009-05-21 12:20
个人简介

这个人很懒,什么都没有留下

文章分类
文章存档

2013年(1)

2012年(6)

2011年(11)

2010年(16)

2009年(20)

我的朋友

分类:

2009-05-21 15:54:02

这里我用的是已经分好词的训练语料和测试语料。
算回来的准确率和召回率都80%多,因为是朴素贝叶斯,对文本的特征向量没作降维处理。留待下次作业再改进。


#!/usr/bin/perl

use strict;
use warnings;

# Script-wide state shared by the training, classification and report
# sections below. (The never-used loop counter $i has been dropped.)
my $class;           # name of the class (sub-directory) being processed
my $filename;        # name of the document file being processed
my %vocabulary;      # set of distinct tokens seen during training (token => 1)
my $vector;          # token currently being processed
my %hash;            # per-class data keyed by class name: word counts (later
                     # probabilities), document/word totals, A/B/C counters
my $totalNum = 0;    # number of training documents over all classes, |examples|
my $v;               # candidate class while scoring a test document
my $p;               # log-probability contribution of one token
my $k;               # best score found so far during the argmax
my $j;               # class that achieved the best score

#################################    Train the model on the training corpus    ##########################################

# Expected layout: train/<class>/<document>, where documents end in .seg or
# .txt and contain whitespace-separated tokens.
#
# For every class directory this section fills:
#   $hash{$class}{"text_of_$class"}    number of training documents, |docs|
#   $hash{$class}{"vector_of_$class"}  total number of word positions, n
#   $hash{$class}{$word}               occurrences of $word in the class, nk
# and across all classes:
#   %vocabulary                        set of distinct tokens
#   $totalNum                          total number of documents, |examples|

opendir(my $train_dh, "train") or die "Cannot open path: $!";    # training corpus root
my @train_entries = readdir $train_dh;
closedir($train_dh);

foreach my $class (@train_entries) {
    next if $class eq "." || $class eq "..";

    # Forward slashes work on Windows as well as Unix; the former "\\"
    # separator only worked on Windows.
    my $class_dir = "train/$class";
    next unless -d $class_dir;    # only sub-directories name a class

    opendir(my $class_dh, $class_dir) or die "Cannot open path: $!";
    # Escaped dot and \z anchor: accept only a real ".seg"/".txt" extension.
    my @documents = grep { /\.(?:seg|txt)\z/i } readdir $class_dh;
    closedir($class_dh);

    $hash{$class}{"vector_of_$class"} = 0;    # n
    $hash{$class}{"text_of_$class"}   = 0;    # |docs|

    foreach my $filename (@documents) {
        ++$hash{$class}{"text_of_$class"};
        ++$totalNum;

        # "or" (not "||") so that the die applies to open() itself rather
        # than to the always-true path string.
        open(my $doc_fh, "<", "$class_dir/$filename") or die "Cannot open file: $!";
        while (my $line = <$doc_fh>) {
            # split ' ' splits on runs of whitespace and drops leading
            # whitespace, so repeated spaces and line endings never
            # produce empty or newline-carrying tokens.
            foreach my $word (split ' ', $line) {
                $vocabulary{$word} = 1;               # member of |vocabulary|
                ++$hash{$class}{"vector_of_$class"};  # n
                ++$hash{$class}{$word};               # nk
            }
        }
        close($doc_fh);
    }
}

#################################################    Compute the probability terms p(v) and p(wk|v)    #################################

# Converts the raw counts gathered during training into probabilities, in place:
#   $hash{$class}{$word}             becomes p(wk|v) = (nk + 1) / (n + |vocabulary|)
#   $hash{$class}{"text_of_$class"}  becomes p(v)    = |docs| / |examples|
{
    # Fail with a readable message instead of "Illegal division by zero"
    # when class directories exist but contain no training documents.
    die "No training documents found, cannot estimate p(v)\n"
        if keys(%hash) && $totalNum == 0;

    # The vocabulary does not change in this section, so count it once
    # rather than once per word per class.
    my $vocabulary_size = keys %vocabulary;

    foreach my $class (keys %hash) {
        my $model       = $hash{$class};
        my $denominator = $model->{"vector_of_$class"} + $vocabulary_size;

        # Every vocabulary word gets an entry for every class; words the
        # class never saw start from a count of 0 (add-one smoothing).
        foreach my $word (keys %vocabulary) {
            my $count = exists $model->{$word} ? $model->{$word} : 0;
            $model->{$word} = ($count + 1) / $denominator;
        }

        $model->{"text_of_$class"} = $model->{"text_of_$class"} / $totalNum;
    }
}

#################################################    Classify the test documents with naive Bayes    ##########################################

# Expected layout: test/<class>/<document>, mirroring the training corpus.
#
# Each document is assigned the class v maximising
#   log p(v) + sum over its tokens of log p(wk|v)
# and the per-class counters are updated:
#   A_of_<class>  documents of the class that were assigned to it
#   B_of_<class>  documents of other classes that were assigned to it
#   C_of_<class>  documents of the class that were assigned elsewhere

# Snapshot the classes that can be scored BEFORE touching %hash below:
# initialising counters for a class that exists only in the test corpus
# creates a %hash entry without a prior, which must never become a
# candidate. Classes with a zero prior are skipped because log(0) is
# fatal. Sorting makes tie-breaking reproducible.
my @trained_classes = sort grep {
    exists $hash{$_}{"text_of_$_"} && $hash{$_}{"text_of_$_"} > 0
} keys %hash;
die "No trained classes available for classification\n" unless @trained_classes;

# DH is left open on purpose: the closedir(DH) after the report section
# closes it.
opendir(DH, "test") or die "Cannot open path: $!";

foreach my $class (readdir DH) {
    next if $class eq "." || $class eq "..";

    # Forward slashes work on Windows as well as Unix.
    my $class_dir = "test/$class";
    next unless -d $class_dir;    # only sub-directories name a class

    # B may already have been incremented while classifying another class.
    $hash{$class}{"B_of_$class"} = 0 unless exists $hash{$class}{"B_of_$class"};
    $hash{$class}{"A_of_$class"} = 0;
    $hash{$class}{"C_of_$class"} = 0;

    opendir(my $class_dh, $class_dir) or die "Cannot open path: $!";
    my @documents = grep { /\.(?:seg|txt)\z/i } readdir $class_dh;
    closedir($class_dh);

    foreach my $filename (@documents) {
        # Start every candidate at log p(v) before reading any token, so
        # the first token is scored like all others and an empty
        # document still has a defined score. Scores live in a local
        # hash instead of being written into the model.
        my %score = map { $_ => log($hash{$_}{"text_of_$_"}) } @trained_classes;

        # "or" (not "||") so that the die applies to open() itself.
        open(my $doc_fh, "<", "$class_dir/$filename") or die "Cannot open file: $!";
        while (my $line = <$doc_fh>) {
            foreach my $word (split ' ', $line) {
                foreach my $candidate (@trained_classes) {
                    # Tokens never seen in training contribute nothing.
                    next unless exists $hash{$candidate}{$word};
                    $score{$candidate} += log($hash{$candidate}{$word});
                }
            }
        }
        close($doc_fh);

        # argmax over the candidates; no numeric sentinel is needed
        # because $best starts undefined.
        my $best;
        foreach my $candidate (@trained_classes) {
            $best = $candidate
                if !defined $best || $score{$candidate} > $score{$best};
        }

        if ($best eq $class) {
            ++$hash{$class}{"A_of_$class"};    # correctly assigned
        }
        else {
            ++$hash{$class}{"C_of_$class"};    # missed by its own class
            ++$hash{$best}{"B_of_$best"};      # wrongly claimed by $best
        }
    }
}

###################################################    Compute recall and precision and print the results    ################################

# Reports every class that had a test directory (its A and C counters exist):
#   recall    = A / (A + C)
#   precision = A / (A + B)
# Both values are also stored back into %hash under "recall_of_<class>" and
# "precision_of_<class>". Classes are printed in sorted order so that the
# output is reproducible between runs.
foreach my $class (sort keys %hash) {
    my $stats = $hash{$class};
    next unless exists $stats->{"C_of_$class"} && exists $stats->{"A_of_$class"};

    my $hits         = $stats->{"A_of_$class"};
    my $false_alarms = exists $stats->{"B_of_$class"} ? $stats->{"B_of_$class"} : 0;
    my $misses       = $stats->{"C_of_$class"};

    print $class, "\n";
    print "A: ", $hits, "\n";
    print "B: ", $false_alarms, "\n";
    print "C: ", $misses, "\n";

    # A test class directory without any document has A + C == 0; report
    # that instead of dying on a division by zero.
    if (($hits + $misses) != 0) {
        $stats->{"recall_of_$class"} = $hits / ($hits + $misses);
        print "\tRecall: ", $stats->{"recall_of_$class"}, "\n";
    }
    else {
        print "\tRecall: none(测试语料中没有该类的文本。)\n";
    }

    if (($hits + $false_alarms) != 0) {
        $stats->{"precision_of_$class"} = $hits / ($hits + $false_alarms);
        print "\tPrecision: ", $stats->{"precision_of_$class"}, "\n";
    }
    else {
        print "\tPrecision: none(分类结果中没有该类的文本。)\n";
    }
}
closedir(DH);    # directory handle opened on "test" by the classification section

###########################        Print the run time    ################################


# times() returns (user, system, child-user, child-system) CPU seconds;
# only the first element, this process's user CPU time, is kept and printed.
my ($time) = times;
print "$time\n";

阅读(1930) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~