Chinaunix首页 | 论坛 | 博客
  • 博客访问: 13320
  • 博文数量: 6
  • 博客积分: 295
  • 博客等级: 二等列兵
  • 技术积分: 70
  • 用 户 组: 普通用户
  • 注册时间: 2008-05-12 00:03
文章分类

全部博文(6)

文章存档

2012年(2)

2009年(1)

2008年(3)

我的朋友
最近访客

分类: Python/Ruby

2012-09-05 21:30:25

中文二元切词
保留中文单字
英文单词
去除其他符号
忽略字母数小于三个的英文单词
英文统一转成小写
调用时传入的字符串必须是尚未 decode 的 UTF-8 字节串(utf8 flag 没有打开);函数内部会自行 decode,返回的每个词同样是 UTF-8 字节串。

调用示例:

点击(此处)折叠或打开

  1. use Hukaa::Search;
  2. my @results = cut_words("中华人民共和国 hel-HHHlo呵呵[]哈哈wor\\ld下What不为例啊");
  3. print join("\n", @results), "\n";

结果:

中华
华人
人民
民共
共和
和国
hel
hhhlo
呵呵
哈哈
wor
下
what
不为
为例
例啊

源代码:

点击(此处)折叠或打开

  1. package Hukaa::Search;

  2. use strict;
  3. use warnings;

  4. use Encode qw(decode_utf8 encode_utf8);

  5. use base qw(Exporter);

  6. our @EXPORT_OK = qw(cut_words);
  7. our @EXPORT = @EXPORT_OK;

  # cut_words($octets)
  #
  # Split a UTF-8 encoded byte string into search tokens.
  #
  # Parameters:
  #   $octets - the text as undecoded UTF-8 bytes (utf8 flag off). It is
  #             decoded here; an undefined value is treated as an empty string.
  #
  # Returns a list of tokens, each a UTF-8 encoded byte string, in the order
  # they occur in the input:
  #   * a run of wide word characters (code point above 255, e.g. Chinese)
  #     yields every overlapping two-character pair; a run consisting of a
  #     single character yields that character on its own;
  #   * a run of word characters in the 0-255 range (letters, digits and
  #     underscore, per \w) yields the lower-cased run, but only when it is
  #     at least three characters long;
  #   * everything else (ASCII and full-width punctuation, whitespace,
  #     symbols) only separates runs and is never emitted.
  sub cut_words {
      my $str = shift;
      $str = '' if !defined $str;
      $str = decode_utf8($str);

      # The (?=\w) look-ahead restricts both alternatives to word characters,
      # so wide punctuation cannot start or extend a run.
      my $run_re = qr{
          ( (?: (?=\w) [^\x00-\xff] )+ )    # group 1: wide (CJK) run
        | ( (?: (?=\w) [\x00-\xff] )+ )     # group 2: Latin-1 range run
      }x;

      my @words;
      while ($str =~ /$run_re/g) {
          my ($wide, $latin) = ($1, $2);

          if (defined $wide) {
              my $len = length $wide;
              if ($len == 1) {
                  # A lone wide character has no neighbour to pair with.
                  push @words, encode_utf8($wide);
              }
              else {
                  # Sliding window of width two over the run.
                  push @words,
                      map { encode_utf8(substr($wide, $_, 2)) } 0 .. $len - 2;
              }
          }
          elsif (length($latin) > 2) {
              # Encode as well, so every returned token is a byte string even
              # when the word contains accented Latin-1 letters.
              push @words, encode_utf8(lc $latin);
          }
      }

      return @words;
  }

  63. 1;


阅读(380) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~