(转载)ICTCLAS分词系统研究（五）--N最短路径-pamire-ChinaUnix博客

进步每一天pamire.blog.chinaunix.net

首页　| 　博文目录　| 　关于我

pamire

博客访问： 2753826
博文数量： 416
博客积分： 10220
博客等级：上将
技术积分： 4193
用户组：普通用户
注册时间： 2006-12-15 09:47

文章分类

全部博文（416）

math（1）
android（6）
QT4（37）
邮件系统（6）
财经（1）
多媒体（59）

H264（5）

MPEG（6）

DirectShow（44）
数据库（3）

mongo（0）

ORACLE（2）

MYSQL（0）
程序语言（242）

脚本（22）

MFC（15）

COM（7）

PHP（9）

JAVA（5）

C++（78）

C语言（12）

VC（89）
杂谈（24）
算法（2）
搜索（4）

clucene（2）

lucene（2）
分词（21）

产品（4）

编码（3）
LINUX（4）
未分配的博文（6）

文章存档

2022年（1）

2021年（1）

2020年（1）

2019年（5）

2018年（7）

2017年（6）

2016年（7）

2015年（11）

2014年（1）

2012年（5）

2011年（7）

2010年（35）

2009年（64）

2008年（48）

2007年（177）

2006年（40）

我的朋友

jpwang09

最近访客

推荐博文

(转载)ICTCLAS分词系统研究（五）--N最短路径

分类：

2006-12-15 10:03:37

ICTCLAS和别的分司系统不一样的地方就是于--N最短路径分词算法。所谓N最短路径其实就是最短路径和最大路径的折中，保留前N个最优路径。这样做的目的就是对这两种方法取长补短，既能达到一个比较理解的分词不达意效果，又能保证分词不达意速度。在此处，我们中国人的中庸思想被完美体现：）。

在源程序中，N最短路径是在CNShortPath类里里面实现的。

bool CSegment::BiSegment(char *sSentence, double dSmoothingPara, CDictionary &dictCore, CDictionary &dictBinary, unsigned int nResultCount)
{

......

//调用构造函数,生成一个二维链表,如下图一所示。每个链表节点是一个队列，数据结构如下图二所示
CNShortPath sp(&aBiwordsNet,nResultCount);

//最短路径算法实现
sp.ShortPath();

//输出最短路径
sp.Output(nSegRoute,false,&m_nSegmentCount);

.....

}

图一

图二

对NShortPath的构造函数分进一步分析：

CNShortPath::CNShortPath(CDynamicArray *apCost,unsigned int nValueKind)
{
//研究（四）中图五所示的链表

m_apCost=apCost;//Set the cost
m_nValueKind=nValueKind;//Set the value kind

    //顶点数
    m_nVertex=apCost->m_nCol+1;
     if(m_nVertexm_nRow+1)
              m_nVertex=apCost->m_nRow+1;//Get the vertex numbers

m_pParent=new CQueue*[m_nVertex-1];//not including the first node
m_pWeight=new ELEMENT_TYPE *[m_nVertex-1];

  for(unsigned int i=0;i  {
   m_pParent[i]=new CQueue[nValueKind];
   m_pWeight[i]=new ELEMENT_TYPE[nValueKind];

  }
}

int CNShortPath::ShortPath()
{
unsigned int nCurNode=1,nPreNode,i,nIndex;
ELEMENT_TYPE eWeight;
PARRAY_CHAIN pEdgeList;

    for(;nCurNode {
    CQueue queWork;
    eWeight=m_apCost->GetElement(-1,nCurNode,0,&pEdgeList);//Get all the edges

     //对研究（四）中的图六所示的表，按列优进行遍历，把每个列的数据放到一个临时工作队列中
       while(pEdgeList!=0 && pEdgeList->col==nCurNode)
    {
     nPreNode=pEdgeList->row;
     eWeight=pEdgeList->value;//Get the value of edges
           for(i=0;i     {
      if(nPreNode>0)//Push the weight and the pre node infomation
      {
         if(m_pWeight[nPreNode-1][i]==INFINITE_VALUE)
                break;

//把行值（ROW）放入队列中，并且保证队列的排序是按Weight值从小到大排列

           queWork.Push(nPreNode,i,eWeight+m_pWeight[nPreNode-1][i]);
      }
      else
      {
       queWork.Push(nPreNode,i,eWeight);
       break;
      }
     }//end for
           pEdgeList=pEdgeList->next;

    }//end while

    //Now get the result queue which sort as weight.
    //Set the current node information
    for(i=0;i    {
   m_pWeight[nCurNode-1][i]=INFINITE_VALUE;
    }
    //memset((void *),(int),sizeof(ELEMENT_TYPE)*);
       //init the weight
    i=0;

   //临时工作队列中取出前面的一个，并记路路径长度的和
       while(i    {//Set the current node weight and parent
     if(m_pWeight[nCurNode-1][i]==INFINITE_VALUE)
      m_pWeight[nCurNode-1][i]=eWeight;
     else if(m_pWeight[nCurNode-1][i]     {
      i++;//Go next queue and record next weight
      if(i==m_nValueKind)//Get the last position
       break;
      m_pWeight[nCurNode-1][i]=eWeight;
     }
           m_pParent[nCurNode-1][i].Push(nPreNode,nIndex);
    }
}//end for

return 1;
}

m_nParent的最终结果如下图三所示，它在当前位置记录该词的父节点位置（对应于研究（四）中的图四）

图三

int CNShortPath::Output(int **nResult,bool bBest,int *npCount)
{//sResult is a string array

......
GetPaths(m_nVertex-1,i,nResult,bBest);

}

//获取分词输出路径。指研究（四）中的图四

void CNShortPath::GetPaths(unsigned int nNode,unsigned int nIndex,int **nResult,bool bBest)
{
    CQueue queResult;
unsigned int nCurNode,nCurIndex,nParentNode,nParentIndex,nResultIndex=0;

if(m_nResultCount>=MAX_SEGMENT_NUM)//Only need 10 result
  return ;
nResult[m_nResultCount][nResultIndex]=-1;//Init the result
queResult.Push(nNode,nIndex);
    nCurNode=nNode;
nCurIndex=nIndex;
    bool bFirstGet;
    while(!queResult.IsEmpty())
{
  while(nCurNode>0)//
  {//Get its parent and store them in nParentNode,nParentIndex
   if(m_pParent[nCurNode-1][nCurIndex].Pop(&nParentNode,&nParentIndex,0,false,true)!=-1)
   {
      nCurNode=nParentNode;
      nCurIndex=nParentIndex;
   }
   if(nCurNode>0)
                queResult.Push(nCurNode,nCurIndex);
  }
  if(nCurNode==0)
  { //Get a path and output
     nResult[m_nResultCount][nResultIndex++]=nCurNode;//Get the first node
     bFirstGet=true;
     nParentNode=nCurNode;
     while(queResult.Pop(&nCurNode,&nCurIndex,0,false,bFirstGet)!=-1)
     {
      nResult[m_nResultCount][nResultIndex++]=nCurNode;
            bFirstGet=false;
      nParentNode=nCurNode;
     }
     nResult[m_nResultCount][nResultIndex]=-1;//Set the end
     m_nResultCount+=1;//The number of result add by 1
     if(m_nResultCount>=MAX_SEGMENT_NUM)//Only need 10 result
    return ;
     nResultIndex=0;
     nResult[m_nResultCount][nResultIndex]=-1;//Init the result

     if(bBest)//Return the best result, ignore others
      return ;
  }
  queResult.Pop(&nCurNode,&nCurIndex,0,false,true);//Read the top node
        while(queResult.IsEmpty()==false&&(m_pParent[nCurNode-1][nCurIndex].IsSingle()||m_pParent[nCurNode-1][nCurIndex].IsEmpty(true)))
  {
        queResult.Pop(&nCurNode,&nCurIndex,0);//Get rid of it
     queResult.Pop(&nCurNode,&nCurIndex,0,false,true);//Read the top node
  }
        if(queResult.IsEmpty()==false&&m_pParent[nCurNode-1][nCurIndex].IsEmpty(true)==false)
  {
      m_pParent[nCurNode-1][nCurIndex].Pop(&nParentNode,&nParentIndex,0,false,false);
      nCurNode=nParentNode;
      nCurIndex=nParentIndex;
      if(nCurNode>0)
          queResult.Push(nCurNode,nCurIndex);
  }
}
}

最终得到最短路么（0，1，2，3，6，9，11，12），里面的数值分别对应研究（四）中图四的下标，到此分词的第一大步就结束了，并形成最终结果：始##始/他/说/的/确实/在/理/末##末

Trackback: http://tb.blog.csdn.net/TrackBack.aspx?PostId=745498

阅读(1162) | 评论(0) | 转发(0) |

上一篇：(转载)ICTCLAS分词系统研究（四）--初次切分

下一篇：(转载)ICTCLAS分词系统研究（六）-- 词性标注

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6