读取 Word 中的文本后，有时需要做文本运算，这就要先清除格式。这次做的作业提交系统涉及文本相似度计算，所以先把格式清除掉，再做文本运算。下面同样是从网上找的一点代码——有时候并不是自己不会写，主要是为了节约时间：做应用开发不同于做研究，能用为上。与各位分享。
/// <summary>
/// Entry point. Reads the HTML file named by the first command-line argument,
/// runs it through <c>CleanWordHtml</c> and <c>FixEntities</c>, and writes the
/// result to a file named after the input with a ".modified.htm" suffix.
/// </summary>
/// <param name="args">
/// Command-line arguments; <c>args[0]</c> is the path of the HTML file to clean.
/// </param>
static void Main(string[] args)
{
    if (args.Length == 0 || String.IsNullOrEmpty(args[0]))
    {
        Console.WriteLine("No filename provided.");
        return;
    }

    // A bare file name (no directory component) is resolved against the
    // current directory.
    string filepath = args[0];
    if (Path.GetFileName(filepath) == args[0])
    {
        filepath = Path.Combine(Environment.CurrentDirectory, filepath);
    }

    // Check the resolved path (the one that is actually read below) and stop
    // here when it is missing, instead of falling through to ReadAllText.
    if (!File.Exists(filepath))
    {
        Console.WriteLine("File doesn't exist.");
        return;
    }

    string html = File.ReadAllText(filepath);
    Console.WriteLine("input html is " + html.Length + " chars");

    html = CleanWordHtml(html);
    html = FixEntities(html);

    // GetFileNameWithoutExtension drops the directory part, so the output path
    // is relative and the file is written to the current working directory.
    filepath = Path.GetFileNameWithoutExtension(filepath) + ".modified.htm";
    File.WriteAllText(filepath, html);
    Console.WriteLine("cleaned html is " + html.Length + " chars");
}
/// <summary>
/// Removes markup that Word-exported HTML carries but plain HTML does not need:
/// comments, the title element, class/style attributes, wrapper and
/// Office-namespace tags, paragraphs containing only a non-breaking space,
/// and <c>v:</c> attributes.
/// </summary>
/// <param name="html">The HTML text to clean.</param>
/// <returns>
/// The cleaned HTML. A null or empty input is returned unchanged.
/// </returns>
static string CleanWordHtml(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return html;
    }

    // Patterns are applied in this order, case-insensitively; every match is
    // replaced with the empty string.
    string[] patterns =
    {
        // get rid of unnecessary tag spans (comments and title)
        @"<!--(\w|\W)+?-->",
        @"<title>(\w|\W)+?</title>",
        // get rid of classes and styles
        @"\s?class=\w+",
        @"\s+style='[^']+'",
        // get rid of unnecessary tags
        @"<(meta|link|/?o:|/?style|/?div|/?st\d|/?head|/?html|body|/?body|/?span|!\[)[^>]*?>",
        // get rid of empty paragraph tags (elements whose only content is &nbsp;)
        @"(<[^>]+>)+&nbsp;(</\w+>)+",
        // remove bizarre v: attribute attached to the <img> tag
        @"\s+v:\w+=""[^""]+""",
        // remove extra lines
        // NOTE(review): this matches LF+CR pairs, not CR+LF. On CRLF text it
        // collapses a run of three or more line breaks into a single one; a
        // single blank line and LF-only text are left untouched. Kept as-is
        // because the intended behaviour is ambiguous -- confirm before changing.
        @"(\n\r){2,}",
    };

    foreach (string pattern in patterns)
    {
        html = Regex.Replace(html, pattern, "", RegexOptions.IgnoreCase);
    }

    return html;
}
/// <summary>
/// Replaces typographic punctuation characters with HTML named entities:
/// left and right double quotation marks become <c>&amp;ldquo;</c> and
/// <c>&amp;rdquo;</c>, and the en dash becomes <c>&amp;mdash;</c>.
/// </summary>
/// <param name="html">The HTML text to process.</param>
/// <returns>
/// The text with the characters above replaced. A null or empty input is
/// returned unchanged.
/// </returns>
static string FixEntities(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return html;
    }

    // The characters are written as \u escapes so the mapping survives any
    // re-encoding of this source file.
    // NOTE(review): the en dash (U+2013) maps to the em dash entity, as in the
    // original table -- confirm whether &ndash; was intended.
    return html
        .Replace("\u201C", "&ldquo;")
        .Replace("\u201D", "&rdquo;")
        .Replace("\u2013", "&mdash;");
}
阅读(660) | 评论(0) | 转发(0) |