分类:
2006-12-15 10:03:37
ICTCLAS和别的分司系统不一样的地方就是于--N最短路径分词算法。所谓N最短路径其实就是最短路径和最大路径的折中,保留前N个最优路径。这样做的目的就是对这两种方法取长补短,既能达到一个比较理解的分词不达意效果,又能保证分词不达意速度。在此处,我们中国人的中庸思想被完美体现:)。
在源程序中,N最短路径是在CNShortPath类里里面实现的。
bool CSegment::BiSegment(char *sSentence, double dSmoothingPara, CDictionary &dictCore, CDictionary &dictBinary, unsigned int nResultCount)
{
......
//调用构造函数,生成一个二维链表,如下图一所示。每个链表节点是一个队列,数据结构如下图二所示
CNShortPath sp(&aBiwordsNet,nResultCount);
//最短路径算法实现
sp.ShortPath();
//输出最短路径
sp.Output(nSegRoute,false,&m_nSegmentCount);
.....
}
图一
图二
对NShortPath的构造函数分进一步分析:
CNShortPath::CNShortPath(CDynamicArray *apCost,unsigned int nValueKind)
{
//研究(四)中图五所示的链表
m_apCost=apCost;//Set the cost
m_nValueKind=nValueKind;//Set the value kind
//顶点数
m_nVertex=apCost->m_nCol+1;
if(m_nVertex
m_nVertex=apCost->m_nRow+1;//Get the vertex numbers
m_pParent=new CQueue*[m_nVertex-1];//not including the first node
m_pWeight=new ELEMENT_TYPE *[m_nVertex-1];
for(unsigned int i=0;i
m_pParent[i]=new CQueue[nValueKind];
m_pWeight[i]=new ELEMENT_TYPE[nValueKind];
}
}
int CNShortPath::ShortPath()
{
unsigned int nCurNode=1,nPreNode,i,nIndex;
ELEMENT_TYPE eWeight;
PARRAY_CHAIN pEdgeList;
for(;nCurNode
CQueue queWork;
eWeight=m_apCost->GetElement(-1,nCurNode,0,&pEdgeList);//Get all the edges
//对研究(四)中的图六所示的表,按列优进行遍历,把每个列的数据放到一个临时工作队列中
while(pEdgeList!=0 && pEdgeList->col==nCurNode)
{
nPreNode=pEdgeList->row;
eWeight=pEdgeList->value;//Get the value of edges
for(i=0;i
if(nPreNode>0)//Push the weight and the pre node infomation
{
if(m_pWeight[nPreNode-1][i]==INFINITE_VALUE)
break;
//把行值(ROW)放入队列中,并且保证队列的排序是按Weight值从小到大排列
queWork.Push(nPreNode,i,eWeight+m_pWeight[nPreNode-1][i]);
}
else
{
queWork.Push(nPreNode,i,eWeight);
break;
}
}//end for
pEdgeList=pEdgeList->next;
}//end while
//Now get the result queue which sort as weight.
//Set the current node information
for(i=0;i
m_pWeight[nCurNode-1][i]=INFINITE_VALUE;
}
//memset((void *),(int),sizeof(ELEMENT_TYPE)*);
//init the weight
i=0;
//临时工作队列中取出前面的一个,并记路路径长度的和
while(i
if(m_pWeight[nCurNode-1][i]==INFINITE_VALUE)
m_pWeight[nCurNode-1][i]=eWeight;
else if(m_pWeight[nCurNode-1][i]
i++;//Go next queue and record next weight
if(i==m_nValueKind)//Get the last position
break;
m_pWeight[nCurNode-1][i]=eWeight;
}
m_pParent[nCurNode-1][i].Push(nPreNode,nIndex);
}
}//end for
return 1;
}
m_nParent的最终结果如下图三所示,它在当前位置记录该词的父节点位置(对应于研究(四)中的图四)
图三
int CNShortPath::Output(int **nResult,bool bBest,int *npCount)
{//sResult is a string array
......
GetPaths(m_nVertex-1,i,nResult,bBest);
}
//获取分词输出路径。指研究(四)中的图四
void CNShortPath::GetPaths(unsigned int nNode,unsigned int nIndex,int **nResult,bool bBest)
{
CQueue queResult;
unsigned int nCurNode,nCurIndex,nParentNode,nParentIndex,nResultIndex=0;
if(m_nResultCount>=MAX_SEGMENT_NUM)//Only need 10 result
return ;
nResult[m_nResultCount][nResultIndex]=-1;//Init the result
queResult.Push(nNode,nIndex);
nCurNode=nNode;
nCurIndex=nIndex;
bool bFirstGet;
while(!queResult.IsEmpty())
{
while(nCurNode>0)//
{//Get its parent and store them in nParentNode,nParentIndex
if(m_pParent[nCurNode-1][nCurIndex].Pop(&nParentNode,&nParentIndex,0,false,true)!=-1)
{
nCurNode=nParentNode;
nCurIndex=nParentIndex;
}
if(nCurNode>0)
queResult.Push(nCurNode,nCurIndex);
}
if(nCurNode==0)
{ //Get a path and output
nResult[m_nResultCount][nResultIndex++]=nCurNode;//Get the first node
bFirstGet=true;
nParentNode=nCurNode;
while(queResult.Pop(&nCurNode,&nCurIndex,0,false,bFirstGet)!=-1)
{
nResult[m_nResultCount][nResultIndex++]=nCurNode;
bFirstGet=false;
nParentNode=nCurNode;
}
nResult[m_nResultCount][nResultIndex]=-1;//Set the end
m_nResultCount+=1;//The number of result add by 1
if(m_nResultCount>=MAX_SEGMENT_NUM)//Only need 10 result
return ;
nResultIndex=0;
nResult[m_nResultCount][nResultIndex]=-1;//Init the result
if(bBest)//Return the best result, ignore others
return ;
}
queResult.Pop(&nCurNode,&nCurIndex,0,false,true);//Read the top node
while(queResult.IsEmpty()==false&&(m_pParent[nCurNode-1][nCurIndex].IsSingle()||m_pParent[nCurNode-1][nCurIndex].IsEmpty(true)))
{
queResult.Pop(&nCurNode,&nCurIndex,0);//Get rid of it
queResult.Pop(&nCurNode,&nCurIndex,0,false,true);//Read the top node
}
if(queResult.IsEmpty()==false&&m_pParent[nCurNode-1][nCurIndex].IsEmpty(true)==false)
{
m_pParent[nCurNode-1][nCurIndex].Pop(&nParentNode,&nParentIndex,0,false,false);
nCurNode=nParentNode;
nCurIndex=nParentIndex;
if(nCurNode>0)
queResult.Push(nCurNode,nCurIndex);
}
}
}
最终得到最短路么(0,1,2,3,6,9,11,12),里面的数值分别对应研究(四)中图四的下标,到此分词的第一大步就结束了,并形成最终结果:始##始/他/说/的/确实/在/理/末##末
Trackback: http://tb.blog.csdn.net/TrackBack.aspx?PostId=745498