#!/usr/bin/env python
# -*- coding: gbk -*-
#2010-3-25
#python 测试与应用:41357415
#深圳IT招聘求职:105095215
#武冈深圳高级群:66250781
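#Converts the wordids found in "notfound.txt" into the Chinese words they stand for.
#The file is processed line by line; processing the whole text at once was the
#original plan, but it ran into character-transcoding problems.
#That problem is left for a later optimization.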
#gtalk: xurongzhong#gmail.com
import re
import urllib2
import sys
import time
# Set the process-wide default string encoding to GBK (Python 2 only:
# setdefaultencoding is only reachable after reload(sys)).
reload(sys)
sys.setdefaultencoding('gbk')

# A wordid is a run of 10-25 digits that directly follows a space.
WORDID_RE = re.compile(r'\ (\d{10,25})')

# NOTE(review): the scheme/host part of this URL is not present in the
# source (it appears to have been lost when the script was published).
# It must be restored before the lookup can work.
QUERY_URL = "%2312345&WordID="

# Copy notfound.txt to new_notfound.txt line by line, replacing every
# wordid with the word returned by the query system, wrapped in 【】.
# The output file is opened first, as in the original script.
with open("new_notfound.txt", 'w') as f_out:
    with open("notfound.txt") as f_in:
        for item in f_in:
            new = item
            for word in WORDID_RE.findall(item):
                # Placeholder used when the query page yields no match.
                words = "None"

                # Ask the query system for the page describing this wordid.
                response = urllib2.urlopen(QUERY_URL + word)
                try:
                    html = response.read()
                finally:
                    response.close()
                # Flatten the page to one line so '.' can span what used
                # to be line breaks.
                html = html.replace("\n", " ")

                # NOTE(review): the tail of this pattern was lost in the
                # source (the literal was split across two lines, most
                # likely an HTML tag swallowed by the page it was copied
                # from). Ending the capture at the next '<' is an
                # assumption - confirm against the real result page.
                pattern = ('查询结果.*?' + re.escape(word)
                           + r'.*?center">(.*?)<')
                result = re.search(pattern, html, re.IGNORECASE)
                if result:
                    words = result.group(1).strip()
                print(words)

                new = new.replace(word, "【" + words + "】")
            f_out.write(new)
# (blog page footer, not part of the script) Views (30101) | Comments (0) | Reposts (0) |