1. 信息检索导论中的倒排索引
P7
-
#include <stdio.h>
-
#include <string.h>
-
#include <ctype.h>
-
#include <stdlib.h>
-
using namespace std;
-
-
typedef struct node
-
{
-
char *data;//结点的数据域,为一个字符串
-
int freq; //单词的出现的频率
-
int index; //一位代表一个index
-
struct node *next;//结点的指针域
-
}linknode;//定义单链表结点结构
-
-
linknode* initlink()
-
{
-
//建立单链表
-
linknode *p;
-
p=(linknode *)malloc(sizeof(linknode));
-
//建立一个只含头结点的空链表,头指针为head
-
p->next=NULL;
-
return p;
-
}
-
-
void insertlink(linknode* head, char* str, int index)
-
{
-
linknode *p;
-
p=head;//在建好的单链表中,以p为扫描指针,从头开始查找有无数据域与t相同的结点
-
while ((p->next)&&(strcmp(str,p->next->data)))
-
p=p->next;
-
if (p->next)//如果存在数据域与t相同的结点,则输出存在信息
-
{
-
printf("\nstring %s existed\r\n",str);
-
p->next->freq++;
-
p->next->index |= 1<<(index-1);
-
}
-
else{//若不存在数据域与t相同结点,则做以下操作
-
p->next=(linknode *)malloc(sizeof(linknode));
-
//在单链表表尾申请一个新结点
-
p=p->next;//p指针指向新的表尾结点
-
p->data = (char*)malloc(sizeof(char)*strlen(str));
-
strcpy(p->data,str);//将str的值复制到*p结点的数据域中
-
p->freq = 1;
-
p->index |= 1<<(index-1);
-
p->next=NULL;//将单链表的表尾结点的next指针置为空
-
}
-
}
-
-
void sort_strings(char *strs[],int count)
-
{
-
char *p;
-
int i,j,k;
-
for(i=0;i<count;i++){
-
for(j=i+1; j<count; j++)
-
{
-
if(strcmp(strs[i],strs[j])>0)
-
{
-
p=strs[i];
-
strs[i]=strs[j];
-
strs[j]=p;
-
}
-
}
-
}
-
}
-
void swap( linknode *s1, linknode *s2 ) //字符串交换
-
{
-
linknode temp1,temp2;
-
temp1 = *s1;
-
temp1.next = s2->next;
-
temp2 = *s2;
-
temp2.next = s1->next;
-
*s1 = temp2;
-
*s2 = temp1;
-
}
-
-
void sort_link(linknode* head)
-
{
-
linknode* p1, *p2;
-
p1 = head->next;
-
while(p1 != NULL)
-
{
-
p2 = p1->next;
-
while(p2 != NULL)
-
{
-
if(strcmp(p1->data, p2->data) > 0 )
-
{
-
swap(p1, p2); //字符串交换
-
}
-
p2 = p2->next;
-
}
-
p1 = p1->next;
-
}
-
}
-
-
void printing(linknode* head){
-
//输出head单链表
-
linknode *p;
-
p=head->next;
-
while(p){
-
printf("%s,%d, %d\r\n",p->data, p->freq, p->index);//输出结点的数据——字符串
-
p=p->next;
-
}
-
}//end of printing
-
-
void deletenode(linknode* head,char *t){
-
//若head单链表中有数据为t的结点,删除之
-
linknode *p,*s;
-
p=head;
-
while ((p->next)&&(strcmp(p->next->data,t)))
-
//以p为扫描指针对head链表进行查找数据域值为*t结点,
-
//为了能方便删除操作,p指向待查结点的前趋
-
p=p->next;
-
if (p->next){//若查找到有,则做删除操作
-
s=p->next;
-
p->next=s->next;
-
free(s);
-
printf("\ndelete successful!");
-
}
-
else//若head链表中没有数据域的值为*t的结点,则输出删除失败的信息
-
printf("\ndelete failure!");
-
}
-
-
int upwords2low(char* str, int len)
-
{
-
int i;
-
for(i=0;i<len;i++)
-
str[i]=tolower(str[i]);
-
}
-
-
char delete_char(char* str, char del)
-
{
-
int i;
-
for (i=0; i<strlen(str); i++) {
-
if (str[i] == del) {
-
str[i] = ' ';
-
}
-
}
-
return del;
-
}
-
-
int process_string(char* str, int len)
-
{
-
int words1_num;
-
//1. turn upercase to lowercase and then remove specail characters
-
upwords2low(str, len);
-
delete_char(str, ',');
-
delete_char(str, '.');
-
delete_char(str, ';');
-
delete_char(str, ':');
-
delete_char(str, '\'');
-
}
-
int get_words(char *sentence,char *words[])
-
{
-
int i=0;
-
char *p;
-
p=strtok(sentence," ,.");
-
while(p!=NULL)
-
{
-
words[i]=p;
-
i++;
-
p=strtok(NULL," ,.");
-
-
}
-
return i;
-
}
-
-
int main ( int argc, char *argv[] )
-
{
-
char str1[]= "I did enact Julius Caesar: I was killed i' the Capitol; Brutus killed me.";
-
char str2[]="So let it be with Caesar. The nobel Brutus hath told you Caesar was ambitions:";
-
int words1_num = 0;
-
int words2_num = 0;
-
char *words1[20];
-
char *words2[20];
-
int i;
-
//建立数据域为字符串类型,且结点无重复的单链表。对单链表做删除结点操作
-
linknode* head = initlink();//建立单链表
-
//1. process string
-
process_string(str1, strlen(str1));
-
process_string(str2, strlen(str2));
-
printf("process1: %s\n", str1);
-
printf("process2: %s\n", str2);
-
//2. splite words form article
-
words1_num = get_words(str1, words1);
-
words2_num = get_words(str2, words2);
-
printf("words1_num=%d\n",words1_num);
-
printf("words2_num=%d\n",words2_num);
-
//3. insert words to list
-
for(i=0; i<words1_num; i++)
-
{
-
insertlink(head, words1[i], 1);
-
}
-
for(i=0; i<words2_num; i++)
-
{
-
insertlink(head, words2[i], 2);
-
}
-
printing(head);//输出单链表
-
printf("\r\n");
-
//4. sort the list
-
sort_link(head);
-
printing(head);//输出单链表
-
printf("\r\n");
-
-
//5.
-
printf("output1:\r\n");
-
for(i=0; i<words1_num; i++)
-
{
-
printf("%s ",words1[i]);
-
}
-
printf("\r\n");
-
printf("output2: \r\n");
-
for(i=0; i<words2_num; i++)
-
{
-
printf("%s ",words2[i]);
-
}
-
printf("\r\n");
-
return 0;
-
}
2. 输出结果
-
ambitions,1, 2
-
be,1, 2
-
brutus,2, 3
-
caesar,3, 3
-
capitol,1, 1
-
did,1, 1
-
enact,1, 1
-
hath,1, 2
-
i,3, 1
-
it,1, 2
-
julius,1, 1
-
killed,2, 1
-
let,1, 2
-
me,1, 1
-
nobel,1, 2
-
so,1, 2
-
the,2, 3
-
told,1, 2
-
was,2, 3
-
with,1, 2
-
you,1, 2
3.思路
a. 处理字符串,把其中的 , 。! :等这样不属于单词的特殊字符去掉
b. 将字符串中的大写字母转为小写字母
c. 从字符串中分离出一个个单词
d. 把这一个个的单词插入到链表中,插入时有重复的单词时freq++
docID在数据结构中用index表示, index是一个32位数,其每一位代表一个docID,
例: did只在doc1中出现,则其index=1
例: it 只在doc1中出现,则其index=2
例如: brutus既在doc1也在doc2中出现index=3
这就是代码中:
p->index |= 1<<(index-1);
则第三篇文章呢?
如abc只在doc3中出现,则index=4
如abc在doc3与doc2中出现,则index=6
以此类推
但是由于index是一个32位的数,所以最大只能表示32篇文章
阅读(1412) | 评论(0) | 转发(0) |