Chinaunix首页 | 论坛 | 博客
  • 博客访问: 94489
  • 博文数量: 40
  • 博客积分: 651
  • 博客等级: 上士
  • 技术积分: 356
  • 用 户 组: 普通用户
  • 注册时间: 2011-08-08 22:31
文章分类

全部博文(40)

文章存档

2013年(6)

2012年(3)

2011年(31)

我的朋友

分类: Java

2013-03-22 16:51:51

 【主要代码】



点击(此处)折叠或打开

  1. @Override
  2.     public final boolean incrementToken() throws IOException {
  3.         // Advances the stream by one token; returns false only when input is exhausted and nothing was buffered.
  4.          clearAttributes();
  5.          int posIncr = 1;
  6.          
  7.          
  8.         // Buffer that accumulates the characters of the token being built
  9.         StringBuffer resultBuffer = new StringBuffer();
  10.         // Token type: "single" = single-byte characters, "double" = double-byte characters
  11.         String tokenType = "";
  12.         Token token = null; // NOTE(review): assigned in the branches below but never read in this method
  13.         if (tokens != null && tokens.size() > 0) {
  14.             token = tokens.get(0);
  15.             tokens.remove(0);
  16.         } else if (cnBuffer != null && cnBuffer.length() > 0) {
  17.             // Handle the pending remainder of a Chinese segment
  18.             tokenType = "double";
  19.             if (nextToken != null && nextToken.length() > 0) {
  20.                 if (nextToken.length() < cnBuffer.length()) {
  21.                     cnBuffer = new StringBuffer(cnBuffer.substring(nextToken.length(),
  22.                             cnBuffer.length()));
  23.                 } else {
  24.                     cnBuffer = null;
  25.                 }
  26.             
  27.                 token = new Token(nextToken.toString(), cnStartOffset, cnStartOffset
  28.                         + nextToken.length(), tokenType);
  29.                 tokens = getAllMatchTokens(nextToken, cnStartOffset);
  30.                 cnOffset += nextToken.length();
  31.                 nextToken = null;
  32.             } else {
  33.                 StringBuffer str = new StringBuffer(cnBuffer.toString());
  34.                 String tokenResult = getTokenResult(str);
  35.                 //token = new Token(tokenResult, cnStartOffset, cnStartOffset     + tokenResult.length(), tokenType);
  36.                     
  37.                 tokens = getAllMatchTokens(new StringBuffer(tokenResult), cnStartOffset);
  38.                 
  39.                 offsetAtt.setOffset(cnStartOffset, cnStartOffset + tokenResult.length());
  40.                 //termAtt.setLength(arg0);
  41.                 //Appends the specified String to this character sequence.
  42.                 termAtt.append(tokenResult);
  43.                 typeAtt.setType(tokenType);
  44.                 
  45.             }
  46.             cnStartOffset = startOffset + cnOffset;
  47.         }{ // NOTE(review): there is no 'else' here, so this block runs on every call, even after a branch above was taken; confirm whether "} else {" was intended
  48.             startOffset = currentOffset;
  49.             // Collect the next segment to process
  50.             while (true) {

  51.                  posIncrAtt.setPositionIncrement(posIncr);
  52.                 
  53.                 // Buffer exhausted: load the next chunk of input
  54.                 if (bufferIndex >= bufferLen) {
  55.                     bufferLen = input.read(buffer);
  56.                     bufferIndex = 0;
  57.                 }
  58.                 if (bufferLen == -1) { // a read result of -1 is treated as end of input
  59.                     if (resultBuffer.length() > 0) {
  60.                         break;
  61.                     } else {
  62.                         return false;
  63.                     }
  64.                 }
  65.                 // Take the current character from the buffer
  66.                 currentChar = buffer[bufferIndex];
  67.                 if (Character.isLetterOrDigit(currentChar) && !isUnUseWord(currentChar)) {
  68.                     // Indexable character
  69.                     Character.UnicodeBlock ub = Character.UnicodeBlock.of(currentChar);
  70.                     if ((ub == Character.UnicodeBlock.BASIC_LATIN)
  71.                             || (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)) {
  72.                         // Current character is non-Chinese or a full-width form
  73.                         // If the result buffer is non-empty and holds double-byte data, leave the loop to process it
  74.                         if (resultBuffer.length() > 0 && "double".equals(tokenType)) {
  75.                             break;
  76.                         }
  77.                         // Latin character handling
  78.                         // Convert full-width to half-width
  79.                         if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
  80.                             int i1 = (int) currentChar;
  81.                             i1 = i1 - 65248; // 65248 == 0xFEE0, the distance between a full-width form and its ASCII counterpart
  82.                             currentChar = (char) i1;
  83.                         }
  84.                         // Lower-case the character and append it to the result buffer
  85.                         resultBuffer.append(Character.toLowerCase(currentChar));
  86.                         currentOffset++;
  87.                         bufferIndex++;
  88.                         tokenType = "single";
  89.                     } else {
  90.                         // Current character is Chinese
  91.                         // If the result buffer is non-empty and holds single-byte data, leave the loop to process it
  92.                         if (resultBuffer.length() > 0 && "single".equals(tokenType)) {
  93.                             break;
  94.                         }
  95.                         resultBuffer.append(currentChar);
  96.                         currentOffset++;
  97.                         bufferIndex++;
  98.                         tokenType = "double";
  99.                     }
  100.                 } else {
  101.                     // Character that is not indexed
  102.                     if (resultBuffer.length() > 0) {
  103.                         // Result buffer has content: consume the separator and finish this token
  104.                         currentOffset++;
  105.                         bufferIndex++;
  106.                         break;
  107.                     } else {
  108.                         // Result buffer is empty: skip the character and move the token start forward
  109.                         bufferIndex++;
  110.                         currentOffset++;
  111.                         startOffset++;
  112.                         continue;
  113.                     }
  114.                 }
  115.             }
  116.             // A Chinese segment is segmented further; a Latin word is emitted as-is
  117.             if (resultBuffer.length() > 0 && "double".equals(tokenType)) {
  118.                 cnStartOffset = startOffset;
  119.                 cnOffset = 0;
  120.                 resultBuffer = new StringBuffer(getTokenResult(resultBuffer));
  121.                 cnStartOffset = startOffset + cnOffset;
  122.                 tokens = getAllMatchTokens(resultBuffer, startOffset);
  123.             }
  124.              
  125.             
  126.             offsetAtt.setOffset(startOffset, startOffset
  127.                     + resultBuffer.length());
  128.             termAtt.append(resultBuffer);
  129.             
  130.             typeAtt.setType(tokenType);
  131.              return true;
  132.         }
  133.     }

代码解析,其实代码中有一部分功能其实TokenStreamImpl都已经提供了,这里列出来是为了巩固下,吼吼。。。
首先会通过TokenFilter来调用 input.incrementToken() 通过分词器,获取下一个token,如果在构造createTokenStreamComponent时候没有传入过滤器,默认是调用CachingTokenFilter的incrementToken方法。
首先会进入while(true)循环,其中 posIncrAtt.setPositionIncrement(posIncr); 用来设置当前token与前一个token在实际的原文本中相隔词的数量,前面也具体提过。
并进一步判断,如果缓存的字符串已经完成读取,则读取新的字符串,或跳出循环。
从缓存中如([我, 一,个, ,, 四, 级, ,, b, i, b, i......])取出第一个字符,判断是否是有效字符,
如果是有效字符,经过一系列操作,如全角转半角等,放入结果集中,当前字符的位置,缓存指针都+1,
------------------------------------------------------------------------------------------
然后从待处理内容缓存中读取第二个字符,判断字符的有效性,(这里的'double','single'是楼主参考2.0lucene时候定义的,不符合现在的定义。)
放入结果集中,当前字符的位置,缓存指针都+1,
------------------------------------------------------------------------------------------
.......
------------------------------------------------------------------------------------------
读入第四个字符,不是有效字符,当前字符的位置,缓存指针都+1,并且跳出循环,
进入下面构建token步骤,

这里主要介绍两个方法
getTokenResult(StringBuffer str),用来对中文字符串进行处理并得到以字符串第一个字符起始的Token
getMatchWords(StringBuffer str),用来返回和字库匹配的token字符串。


'我一个'三个字会首先入getTokenResult方法,

获取字符串第一个字,'我' ,判断是否是连词,如果是连词,则先判断下一个字符开始是否有词,获得匹配直接返回;如果不是,
则调用如下getMatchWords(StringBuffer str)方法,将其结果作为最后匹配结果返回;如果没有匹配,则采用二分法(取前两个字)获取token字符串,
同时将剩余字符串如'个'放入cnBuffer中,然后返回主程序,
填充offsetAtt,termAtt,typeAtt,一个token形成,回到过滤器方法中。


getMatchWords(StringBuffer str) ,字典匹配,采用正向最大匹配法
------
当传入'我一个'三个字后,根据第一个关键字,从字典中获取包含'我'的所有词语,将这些词语按照字符从多到少排列,然后按照
正向最大匹配法匹配,例如 字典中查出'我一',和 '我一个'这三个字比较,截取相同长度,结果匹配,返回该匹配字符到getTokenResult()中




在token流获取下一个token的时候,首先会判断是否中文字符段落处理完成,如上一步还遗留一个'个', 交由getTokenResult(str)方法返回token匹配字符串,加入attribute中。
进入while(true)循环中,继续从待处理字符串中获取字符,直到待处理字符串中的字符处理完毕返回‘false’到过滤器中。

贴出两个主要方法

  1. /**
  2.      * 对中文字符串进行处理并得到以字符串第一个字符起始的Token
  3.      *
  4.      *
  5.      * @param str
  6.      * @return
  7.      */
  8.     private String getTokenResult(StringBuffer str) {
  9.         if (str.length() <= 2) {
  10.             cnBuffer = null;
  11.             return str.toString();
  12.         }
  13.         char currChar = str.charAt(0);
  14.         String token = null;
  15.         // 临时存放查找内容
  16.         StringBuffer tmpBuffer = new StringBuffer();
  17.         if (isConj(currChar)) {
  18.             // 如果是连词,则先判断下一个字符开始是否有词
  19.             tmpBuffer.append(str.substring(1, str.length()));
  20.             token = getMatchWords(tmpBuffer);
  21.             if (token != null) {
  22.                 // 有词的话则单独返回该连词
  23.                 nextToken = new StringBuffer(token);
  24.                 cnBuffer = tmpBuffer;
  25.                 cnOffset++;
  26.                 return String.valueOf(currChar);
  27.             }
  28.         }
  29.         tmpBuffer = str;
  30.         token = getMatchWords(tmpBuffer);
  31.         if (token != null) {
  32.             if (token.length() < str.length())
  33.                 cnBuffer = new StringBuffer(str.substring(token.length(), str.length()));
  34.             else
  35.                 cnBuffer = null;
  36.             cnOffset += token.length();
  37.             return token;
  38.         } else {
  39.             // 没有匹配的词,则采用二分法
  40.             cnBuffer = new StringBuffer(str.substring(1, str.length()));
  41.             cnOffset++;
  42.             token = str.substring(0, 2);
  43.             return token;
  44.         }
  45.     }

  46.     /**
  47.      * 字典匹配,采用逆向最大匹配法
  48.      *
  49.      * @param str
  50.      * @return
  51.      */
  52.     @SuppressWarnings("unchecked")
  53.     private String getMatchWords(StringBuffer str) {
  54.         char keyChar = str.charAt(0);
  55.         List<String> list = (List) FMMAnalyzer.dictManager.getWordMap()
  56.                 .get(keyChar);
  57.         // 对字典进行排序,长度长的在前面
  58.         Collections.sort(list, new java.util.Comparator<String>() {
  59.             public int compare(String o1, String o2) {
  60.                 if (o1.length() == o2.length())
  61.                     return 0;
  62.                 if (o1.length() >= o2.length())
  63.                     return -1;
  64.                 return 1;
  65.             }
  66.         });
  67.         for (String word : list) {
  68.             if (word.length() <= str.length()) {
  69.                 String strWord = str.substring(0, word.length());
  70.                 if (word.equals(strWord))
  71.                     return word;
  72.             }
  73.         }
  74.         return null;
  75.     }




阅读(1757) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~