要做国际化的版本,需要把中文字符串都提取出来翻译,写了这个python脚本。
#!/usr/bin/python
# -*- coding: UTF-8 -*-
'''
##
# @file match-chinese.py
# @brief 利用正则表达式提取中文字符串
# @author Jesse
# @version 1.0
# @date 2009-11-20
'''
import os,string
import re
# Root of the tree to scan; main() passes it to walk_dir().
directory = "."
# Path of the report file that main() opens for writing.
# NOTE(review): with these defaults the report is created inside the scanned
# tree, so the walk will also visit the report file itself.
output = "chinese.txt"
# A double-quoted literal that contains at least three consecutive bytes in
# the 0x80-0xff range (e.g. one three-byte UTF-8 encoded CJK character).
# Compiled once here instead of on every call: match_chinese() runs once per
# scanned line.  Escaped quotes inside a literal are not understood.
_QUOTED_CHINESE_RE = re.compile(r'"[^"]*[\x80-\xff]{3}[^"]*"')


def match_chinese(s, f, i):
    """Report every double-quoted literal in *s* that contains high bytes.

    Each hit is written to the module-level ``fd_output`` file object as
    ``"<f> ( <i> ): <literal>\\n"``.

    Args:
        s: One line of text to search (a byte string under Python 2).
        f: Name of the file the line came from; used only in the report.
        i: 1-based line number of *s*; used only in the report.

    Returns:
        None.  The only effect is the text written to ``fd_output``.
    """
    for literal in _QUOTED_CHINESE_RE.findall(s):
        fd_output.write("%s ( %d ): %s\n" % (f, i, literal))
def istextfile(filename, blocksize = 512):
    """Guess whether *filename* is a text file by sniffing its beginning.

    Args:
        filename: Path of the file to inspect.
        blocksize: How much of the start of the file to read for the check.

    Returns:
        Whatever istext() returns for the sampled data (1 for text, 0
        otherwise).

    Raises:
        IOError: If the file cannot be opened or read.
    """
    # The context manager closes the handle right away instead of leaving
    # it to the garbage collector; the walk may visit a lot of files.
    with open(filename) as fd:
        return istext(fd.read(blocksize))
def istext(s):
    """Heuristically decide whether the sample *s* looks like text.

    Rules, in order:
      * any NUL character means "not text";
      * an empty sample counts as text;
      * otherwise the sample is text unless more than 30% of its characters
        are control characters other than newline, carriage return, tab and
        backspace (DEL, 0x7f, is counted as a control character).

    Characters at or above 0x80 are counted as text.  This script exists to
    find multi-byte (Chinese) strings, so a file whose beginning is mostly
    such bytes must not be classified as binary.

    Args:
        s: The sampled data, as a string.

    Returns:
        1 if the sample looks like text, 0 otherwise.
    """
    if "\0" in s:
        return 0
    if not s:
        return 1
    allowed_controls = "\n\r\t\b"
    suspicious = 0
    for ch in s:
        code = ord(ch)
        if code == 127 or (code < 32 and ch not in allowed_controls):
            suspicious += 1
    # float() forces true division: with Python 2 integer division the ratio
    # would truncate to 0 and the threshold would never trigger.
    if float(suspicious) / len(s) > 0.30:
        return 0
    return 1
def read_file(f):
    """Scan one file line by line and report the Chinese string literals.

    Files that istextfile() rejects are skipped with a message on stdout.
    Every line of an accepted file is handed to match_chinese() together
    with the file name and its 1-based line number.

    Args:
        f: Path of the file to scan.

    Raises:
        IOError: If the file cannot be opened or read.
    """
    if not istextfile(f):
        print("%s is NOT a text file" % (f))
        return
    # To restrict the scan to C sources and headers, filter on the file
    # extension here, e.g. skip f unless re.match(r".*\.[ch]$", f).
    with open(f, 'r') as fd:
        # Take a snapshot of the lines up front: with the default settings
        # the report file lives inside the scanned tree and keeps growing
        # while the scan is running.
        lines = fd.readlines()
    for lineno, line in enumerate(lines, 1):
        match_chinese(line, f, lineno)
def _try_read_file(path):
    """Run read_file() on *path*, reporting I/O errors instead of raising."""
    try:
        read_file(path)
    except (IOError, OSError) as err:
        print("Exception occurred while reading %s: %s" % (path, err))


def walk_dir(dir):
    """Recursively scan *dir*, calling read_file() on every regular entry.

    If *dir* does not exist a message is printed and nothing else happens.
    If *dir* is not a directory it is scanned as a single file.  Otherwise
    each entry is visited: sub-directories are announced on stdout and
    walked recursively, everything else is passed to read_file().

    I/O errors are reported per entry, with the offending path, so one
    unreadable file or directory does not stop the rest of the walk.  Other
    exceptions (including KeyboardInterrupt) propagate to the caller.

    Args:
        dir: Path of the directory (or single file) to scan.
    """
    if not os.path.exists(dir):
        print("%s : No such file or directory." % (dir,))
        return
    if not os.path.isdir(dir):
        _try_read_file(dir)
        return
    try:
        names = os.listdir(dir)
    except OSError as err:
        print("Exception occurred while listing %s: %s" % (dir, err))
        return
    for name in names:
        path = os.path.join(dir, name)
        if os.path.isdir(path):
            print("%s is ...DIR..." % (path,))
            walk_dir(path)
        else:
            _try_read_file(path)
def main():
    """Scan ``directory`` and write the report to the ``output`` file.

    Opens the report file, publishes it as the module-level ``fd_output``
    that match_chinese() writes to, walks the tree, and closes the report.
    """
    global fd_output
    fd_output = open(output, 'w')
    try:
        walk_dir(directory)
    finally:
        # Close (and flush) the report even if the walk is interrupted.
        fd_output.close()


if __name__ == "__main__":
    main()
|
参考资料:
编码相关:
http://blog.zol.com.cn/1356/article_1355079.html
ftp://ftp.astron.com/pub/file/file-5.03.tar.gz
python相关:
http://www.blogjava.net/Skynet/archive/2009/05/02/268628.html
http://iregex.org/blog/regex-to-match-chinese.html
http://hi.baidu.com/sinomazing/blog/item/cba18400f4473519738b6508.html
http://blog.donews.com/limodou/archive/2004/08/30/83538.aspx
阅读(4461) | 评论(0) | 转发(0) |