有一段文本,由英文字母、阿拉伯数字、GB2312编码的中文字符和一些常用标点符号(假设只包含全/半角的逗号和句号)组成。请写出程序,统计这段文本中每个字的出现次数,对“字”的定义如下:
1,连续的英文字母或者阿拉伯数字,例如ab3或123,但最长不超过32个字符;
2,包含不超过一个半角句点的两段连续数字,例如2.34,但最长不超过32个字符
3,单个汉字
汉字编码范围
名称 |
第一字节 |
第二字节 |
GB2312 |
0xB0-0xF7(176-247) |
0xA1-0xFE(161-254) |
GBK |
0x81-0xFE(129-254) |
0x40-0xFE(64-254) |
Big5 |
0x81-0xFE(129-254) |
0x40-0x7E(64-126)
0xA1-0xFE(161-254) |
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * Each preprocessor directive must sit on its own line: when both were on a
 * single line, MAX_LEN expanded to "100 #define MAX_WORD 32" and MAX_WORD was
 * never defined.
 */

/* Capacity, in bytes, of a buffer holding one line of input. */
#define MAX_LEN 100

/* Longest word, in characters, allowed by the problem statement. */
#define MAX_WORD 32
/*
 * Decide whether an ASCII string is a "word" as the problem statement defines
 * it:
 *   1. a non-empty run of letters and/or digits, e.g. "ab3" or "123"; or
 *   2. two runs of digits joined by exactly one '.', e.g. "2.34".
 *
 * The length limit is not checked here; the caller bounds it through the size
 * of the buffer it passes in.
 *
 * str: NUL-terminated candidate string (NULL is treated as "not a word").
 * Returns 1 when str is a word, 0 otherwise.
 */
int is_words(char *str)
{
    const char *p = str;
    int dots = 0;
    int has_alpha = 0;

    if (str == NULL || *str == '\0') {
        return 0;
    }

    for (; *p != '\0'; p++) {
        unsigned char c = (unsigned char)*p;

        if (isdigit(c)) {
            continue;
        }
        if (isalpha(c)) {
            has_alpha = 1;
            continue;
        }
        if (c == '.') {
            /* Only one dot is allowed, and it needs a digit on both sides. */
            if (dots > 0 || p == str ||
                !isdigit((unsigned char)p[-1]) ||
                !isdigit((unsigned char)p[1])) {
                return 0;
            }
            dots++;
            continue;
        }
        /* Punctuation, whitespace or a non-ASCII byte never belongs to a word. */
        return 0;
    }

    /* A decimal number may not contain letters ("a1.2" is not a word). */
    return !(dots > 0 && has_alpha);
}
/*
 * Print the pending ASCII token, if any, as the next counted word, then reset
 * the token to empty.
 *
 * word:  token buffer; must have room for *len characters plus a terminator.
 * len:   number of characters currently held in word; set to 0 on return.
 * count: running word counter, incremented when the token is accepted.
 */
static void flush_word(char *word, int *len, int *count)
{
    if (*len == 0) {
        return;
    }
    word[*len] = '\0';
    if (is_words(word)) {
        (*count)++;
        printf("number words %d is %s\n", *count, word);
    }
    *len = 0;
}

/*
 * Read "words.txt" (GB2312 text) and print every word found, each with a
 * running index. A word is a single GB2312 Chinese character, a run of ASCII
 * letters/digits, or a decimal number such as "2.34". Any other byte
 * (punctuation, whitespace, line ends) separates words.
 *
 * The input is consumed one byte at a time so that neither a two-byte
 * character nor an ASCII token can be cut in half by a fixed-size line buffer.
 *
 * Returns 0 on success, EXIT_FAILURE when the input file cannot be opened.
 */
int main(int argc, char *argv[])
{
    int count = 0;
    int len = 0;          /* characters currently held in words[] */
    int has_dot = 0;      /* the pending token already contains its '.' */
    int digits_only = 1;  /* the pending token consists of digits only */
    char words[MAX_WORD + 1];
    int c;
    FILE *fp;

    (void)argc;
    (void)argv;

    fp = fopen("words.txt", "r");
    if (fp == NULL) {
        perror("words.txt");
        return EXIT_FAILURE;
    }

    while ((c = fgetc(fp)) != EOF) {
        if (c & 0x80) {
            /* Lead byte of a two-byte character: it ends any ASCII token. */
            int c2;

            flush_word(words, &len, &count);
            c2 = fgetc(fp);
            if (c2 == EOF) {
                break; /* truncated character at end of file */
            }
            if (!(c2 & 0x80)) {
                /* Malformed pair: re-read the byte as ordinary ASCII. */
                ungetc(c2, fp);
                continue;
            }
            /* GB2312 Chinese characters: lead 0xB0-0xF7, trail 0xA1-0xFE. */
            if (c >= 0xB0 && c <= 0xF7 && c2 >= 0xA1 && c2 <= 0xFE) {
                char chinese[3];

                chinese[0] = (char)c;
                chinese[1] = (char)c2;
                chinese[2] = '\0';
                count++;
                printf("chinese words %d is %s\n", count, chinese);
            }
            continue;
        }

        if (isalnum(c)) {
            /*
             * Start a new token when the current one is full, or when it is a
             * decimal number and this character is not a digit.
             */
            if (len == MAX_WORD || (has_dot && !isdigit(c))) {
                flush_word(words, &len, &count);
            }
            if (len == 0) {
                has_dot = 0;
                digits_only = 1;
            }
            if (!isdigit(c)) {
                digits_only = 0;
            }
            words[len++] = (char)c;
        } else if (c == '.' && len > 0 && digits_only && !has_dot &&
                   len + 2 <= MAX_WORD) {
            /*
             * A '.' joins the token only as a decimal point: digits before it,
             * a digit right after it, and room left for both.
             */
            int next = fgetc(fp);

            if (next != EOF) {
                ungetc(next, fp);
            }
            if (next != EOF && isdigit(next)) {
                words[len++] = '.';
                has_dot = 1;
            } else {
                flush_word(words, &len, &count);
            }
        } else {
            /* Any other ASCII byte is a separator. */
            flush_word(words, &len, &count);
        }
    }

    /* The input may end without a trailing separator. */
    flush_word(words, &len, &count);
    fclose(fp);

    /* Windows-only convenience kept from the original: hold the console open. */
    system("PAUSE");
    return 0;
}
|
阅读(801) | 评论(0) | 转发(0) |