Author:水如烟
引言:信息在流淌,我仅取一瓢
摘要:本文尝试提供一种简便的方法,它从文本字符串中提取特定的信息生成类实例。
关键词:文件、字符串、正则匹配、反射、xml和序列化、抽象和继承、类实例。
场景:我关心一些信息,这些信息来源于网页,我把这些网页下载下来存为文件。我所关心的信息在文件中按一定的规律表现着。我要把这些信息提取出来。这样的事,我现在要做,以后可能还是要做。
哦:因为不时地会这样做,所以我想把“过程规律化”,形成可应用的“作业流程”。
MatchBase.vb
Imports System.Text, System.Text.RegularExpressions
Imports System.IO
Imports System.Reflection
Namespace LzmTW.uSystem.uText.uRegex
''' <summary>
''' Base class for regex-driven extraction. A derived class supplies a
''' <see cref="Pattern"/> and declares writable properties; every match found in
''' the input produces a new instance of the derived class whose properties are
''' filled from the named groups of that match.
''' </summary>
Public MustInherit Class MatchBase
    Implements IDisposable

    ''' <summary>
    ''' Regular expression used to locate items. Each named group is copied into
    ''' the writable property with the same name (the name lookup is case-sensitive).
    ''' </summary>
    Public MustOverride ReadOnly Property Pattern() As String

    ' Items captured so far. Never set to Nothing so that Items/Count/Clear stay
    ' usable (and the instance stays serializable) even after Dispose.
    Private ReadOnly gItems As New List(Of MatchBase)

    ''' <summary>
    ''' The items captured by the Read calls made so far.
    ''' </summary>
    Public ReadOnly Property Items() As List(Of MatchBase)
        Get
            Return gItems
        End Get
    End Property

    ''' <summary>
    ''' Removes every captured item.
    ''' </summary>
    Public Sub Clear()
        gItems.Clear()
    End Sub

    ''' <summary>
    ''' Number of captured items.
    ''' </summary>
    Public ReadOnly Property Count() As Integer
        Get
            Return gItems.Count
        End Get
    End Property

    ''' <summary>
    ''' Reads a text file and captures items from its content.
    ''' </summary>
    ''' <param name="file">Path of the file to read.</param>
    ''' <param name="encoding">Encoding used to decode the file.</param>
    ''' <returns>True if at least one match was found; False if the file does
    ''' not exist or nothing matched.</returns>
    Public Function Read(ByVal file As String, ByVal encoding As Encoding) As Boolean
        Dim info As New FileInfo(file)
        If Not info.Exists Then Return False

        Dim text As String
        Using stream As FileStream = info.Open(FileMode.Open, FileAccess.Read)
            Using reader As New StreamReader(stream, encoding)
                text = reader.ReadToEnd()
            End Using
        End Using
        Return Read(text)
    End Function

    ''' <summary>
    ''' Captures items from a string.
    ''' </summary>
    ''' <param name="content">Text to search.</param>
    ''' <returns>True if at least one match was found.</returns>
    ''' <remarks>
    ''' The pattern is first applied with <c>RegexOptions.None</c>; only if that
    ''' finds nothing is it retried with <c>RegexOptions.Singleline</c>.
    ''' </remarks>
    Public Function Read(ByVal content As String) As Boolean
        If InternalRead(content, RegexOptions.None) Then Return True
        Return InternalRead(content, RegexOptions.Singleline)
    End Function

    ''' <summary>
    ''' Captures items from a string using caller-supplied regex options.
    ''' </summary>
    ''' <param name="content">Text to search.</param>
    ''' <param name="options">Options passed to the regex engine.</param>
    ''' <returns>True if at least one match was found.</returns>
    Public Function Read(ByVal content As String, ByVal options As RegexOptions) As Boolean
        Return InternalRead(content, options)
    End Function

    ''' <summary>
    ''' Runs <see cref="Pattern"/> over the text and appends one new instance of
    ''' the current runtime type to the item list for every match.
    ''' </summary>
    ''' <returns>True if at least one match was found.</returns>
    Private Function InternalRead(ByVal content As String, ByVal options As RegexOptions) As Boolean
        Dim matches As MatchCollection = Regex.Matches(content, Pattern, options)
        If matches.Count = 0 Then Return False

        For Each found As Match In matches
            ' The derived type is instantiated through its parameterless constructor.
            Dim item As MatchBase = CType(System.Activator.CreateInstance(Me.GetType()), MatchBase)
            item.InternalUpdateFromMatch(found.Groups)
            gItems.Add(item)
        Next
        Return True
    End Function

    ''' <summary>
    ''' Copies each successful named group into the writable property of the same name.
    ''' </summary>
    Private Sub InternalUpdateFromMatch(ByVal collection As GroupCollection)
        For Each prop As PropertyInfo In Me.GetType().GetProperties()
            ' Read-only members (Pattern, Items, Count) and indexers cannot be assigned.
            If Not prop.CanWrite Then Continue For
            If prop.GetIndexParameters().Length > 0 Then Continue For

            Dim captured As Group = collection.Item(prop.Name)
            If captured.Success Then
                prop.SetValue(Me, Convert(captured.Value, prop.PropertyType), Nothing)
            End If
        Next
    End Sub

    ''' <summary>
    ''' Serializes this instance to XML and writes it to a file.
    ''' </summary>
    ''' <param name="file">Path of the file to write.</param>
    ''' <param name="append">True to append to the file, False to overwrite it.</param>
    ''' <param name="encoding">Encoding used to write the file.</param>
    Public Sub SaveXml(ByVal file As String, ByVal append As Boolean, ByVal encoding As System.Text.Encoding)
        Using writer As New StreamWriter(file, append, encoding)
            writer.Write(ToXmlString())
        End Using
    End Sub

    ''' <summary>
    ''' Serializes this instance with an XmlSerializer built for its runtime type.
    ''' </summary>
    Private Function ToXmlString() As String
        Dim serializer As New System.Xml.Serialization.XmlSerializer(Me.GetType())
        Using writer As New StringWriter()
            serializer.Serialize(writer, Me)
            Return writer.ToString()
        End Using
    End Function

    ''' <summary>
    ''' Reads XML from a file and deserializes it into a new instance of the
    ''' current runtime type.
    ''' </summary>
    ''' <param name="filename">Path of the XML file.</param>
    ''' <param name="encoding">Encoding used to decode the file.</param>
    ''' <returns>The deserialized instance.</returns>
    Public Function GetMatchItemFromXml(ByVal filename As String, ByVal encoding As System.Text.Encoding) As MatchBase
        Using reader As New StreamReader(filename, encoding)
            Return XmlStringToItem(reader.ReadToEnd())
        End Using
    End Function

    ''' <summary>
    ''' Deserializes an XML string into an instance of the current runtime type.
    ''' </summary>
    Private Function XmlStringToItem(ByVal xml As String) As MatchBase
        Dim serializer As New System.Xml.Serialization.XmlSerializer(Me.GetType())
        Using reader As New StringReader(xml)
            Return CType(serializer.Deserialize(reader), MatchBase)
        End Using
    End Function

    ''' <summary>
    ''' Converts a captured value to the type of the property it is assigned to.
    ''' </summary>
    ''' <param name="value">The captured text.</param>
    ''' <param name="type">The property type to convert to.</param>
    ''' <returns>The converted value, or Nothing when an empty capture is
    ''' assigned to a Nullable property.</returns>
    Private Function Convert(ByVal value As Object, ByVal type As Type) As Object
        Dim target As Type = type
        Dim underlying As Type = Nullable.GetUnderlyingType(type)
        If underlying IsNot Nothing Then
            ' Nullable(Of T): an empty capture means "no value"; otherwise convert to T.
            If value Is Nothing OrElse value.ToString().Length = 0 Then Return Nothing
            target = underlying
        End If

        ' Convert.ChangeType cannot turn a name into an enum member; parse it instead.
        If target.IsEnum Then Return [Enum].Parse(target, value.ToString())

        Try
            Return System.Convert.ChangeType(value, target)
        Catch ex As Exception When TypeOf ex Is FormatException OrElse TypeOf ex Is InvalidCastException
            ' Fall back to the Visual Basic runtime conversion, which is aimed at
            ' the requested type (the earlier code aimed at System.Type here).
            Return Microsoft.VisualBasic.CompilerServices.Conversions.ChangeType(value, target)
        End Try
    End Function

    ''' <summary>
    ''' Returns the XML serialization of this instance.
    ''' </summary>
    Public Overrides Function ToString() As String
        Return Me.ToXmlString()
    End Function

    Private disposedValue As Boolean = False ' Guards against redundant Dispose calls.

    ''' <summary>
    ''' Releases the captured items. The list itself is kept so that members
    ''' used after disposal do not fail with a NullReferenceException.
    ''' </summary>
    ''' <param name="disposing">True when called from <see cref="Dispose"/>.</param>
    Protected Overridable Sub Dispose(ByVal disposing As Boolean)
        If Not Me.disposedValue Then
            If disposing Then
                gItems.Clear()
            End If
        End If
        Me.disposedValue = True
    End Sub

#Region " IDisposable Support "
    ''' <summary>
    ''' Disposes this instance; cleanup logic lives in Dispose(Boolean).
    ''' </summary>
    Public Sub Dispose() Implements IDisposable.Dispose
        Dispose(True)
        GC.SuppressFinalize(Me)
    End Sub
#End Region
End Class
End Namespace
MatchHelper.vb
Namespace LzmTW.uSystem.uText.uRegex
''' <summary>
''' Thin generic wrapper that owns one matcher of type <typeparamref name="T"/>
''' and forwards XML save/load and ToString to it.
''' </summary>
Public Class MatchHelper(Of T As MatchBase)

    ' The matcher currently held; replaced by LoadXml.
    Private gMatcher As T

    ''' <summary>
    ''' Creates the helper together with a fresh matcher instance.
    ''' </summary>
    Public Sub New()
        Dim created As Object = System.Activator.CreateInstance(GetType(T))
        gMatcher = DirectCast(created, T)
    End Sub

    ''' <summary>
    ''' The matcher currently held by this helper.
    ''' </summary>
    Public ReadOnly Property Current() As T
        Get
            Return gMatcher
        End Get
    End Property

    ''' <summary>
    ''' Writes the current matcher to a file as XML.
    ''' </summary>
    ''' <param name="file">Path of the file to write.</param>
    ''' <param name="append">True to append, False to overwrite.</param>
    ''' <param name="encoding">Encoding used to write the file.</param>
    Public Sub SaveXml(ByVal file As String, ByVal append As Boolean, ByVal encoding As System.Text.Encoding)
        gMatcher.SaveXml(file, append, encoding)
    End Sub

    ''' <summary>
    ''' Replaces the current matcher with the instance deserialized from a file.
    ''' </summary>
    ''' <param name="filename">Path of the XML file.</param>
    ''' <param name="encoding">Encoding used to decode the file.</param>
    Public Sub LoadXml(ByVal filename As String, ByVal encoding As System.Text.Encoding)
        Dim loaded As MatchBase = gMatcher.GetMatchItemFromXml(filename, encoding)
        gMatcher = DirectCast(loaded, T)
    End Sub

    ''' <summary>
    ''' Returns the string form of the current matcher.
    ''' </summary>
    Public Overrides Function ToString() As String
        Return gMatcher.ToString
    End Function
End Class
End Namespace
应用
我想提取CSDN BLOG首页里头的专家分类与已注册专家信息。为此,我将该网页的代码(浏览器的源文件)存为CsdnBlogDefault.txt文件以待分析提取。
MatchExpertsLei.vb
''' <summary>
''' Matcher for expert category names; each match fills <see cref="Name"/>.
''' </summary>
Public Class MatchExpertsLei
    Inherits LzmTW.uSystem.uText.uRegex.MatchBase

    ''' <summary>
    ''' Pattern whose named group "Name" is copied into the Name property.
    ''' </summary>
    Public Overrides ReadOnly Property Pattern() As String
        Get
            ' NOTE(review): the published listing lost everything written between
            ' angle brackets, which left the invalid construct "(?\w+)" and a
            ' literal broken across two lines. The group name is restored here,
            ' but the HTML markup that originally surrounded the group could not
            ' be recovered. TODO: re-add that markup around the group; without
            ' it this pattern matches every run of word characters.
            Return "(?<Name>\w+)"
        End Get
    End Property

    Private gName As String

    ''' <summary>
    ''' The captured category name.
    ''' </summary>
    Public Property Name() As String
        Get
            Return gName
        End Get
        Set(ByVal value As String)
            gName = value
        End Set
    End Property
End Class
MatchExperts.vb
''' <summary>
''' Matcher for registered experts; each match fills <see cref="UserID"/> and
''' <see cref="UserName"/>.
''' </summary>
Public Class MatchExperts
    Inherits LzmTW.uSystem.uText.uRegex.MatchBase

    ''' <summary>
    ''' Pattern whose named groups "UserID" and "UserName" are copied into the
    ''' properties of the same names.
    ''' </summary>
    Public Overrides ReadOnly Property Pattern() As String
        Get
            ' NOTE(review): the published listing lost everything written between
            ' angle brackets, leaving "\w+)"">(?.*?)" (unbalanced parenthesis and
            ' an invalid group). The group names are restored on the assumption
            ' that the \w+ inside the quoted attribute is the user id and the
            ' text after the tag is the display name - confirm against the page.
            ' The trailing "<" stops the lazy capture at the next tag; the
            ' original terminator is presumed to have been a closing tag.
            ' TODO: re-add the opening tag/attribute prefix that preceded UserID.
            Return "(?<UserID>\w+)"">(?<UserName>.*?)<"
        End Get
    End Property

    Private gUserName As String

    ''' <summary>
    ''' The captured display name.
    ''' </summary>
    Public Property UserName() As String
        Get
            Return gUserName
        End Get
        Set(ByVal value As String)
            gUserName = value
        End Set
    End Property

    Private gUserId As String

    ''' <summary>
    ''' The captured user id.
    ''' </summary>
    Public Property UserID() As String
        Get
            Return gUserId
        End Get
        Set(ByVal value As String)
            gUserId = value
        End Set
    End Property
End Class
现在可以试试看了
Private Sub Button1_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles Button1.Click
    ' Demo: extract the expert categories from the saved page source,
    ' round-trip them through an XML file, and print what was loaded back.
    Const sourceFile As String = "CsdnBlogDefault.txt"
    Const xmlFile As String = "ExpertsLei.xml"
    Dim utf8 As System.Text.Encoding = System.Text.Encoding.UTF8
    Dim categories As New LzmTW.uSystem.uText.uRegex.MatchHelper(Of MatchExpertsLei)

    ' Read and analyse the file; captured items accumulate on categories.Current.
    categories.Current.Read(sourceFile, utf8)

    ' Store the result (overwriting any existing file), then load it back,
    ' which replaces categories.Current with the deserialized instance.
    categories.SaveXml(xmlFile, False, utf8)
    categories.LoadXml(xmlFile, utf8)

    ' Show what was extracted.
    Console.WriteLine("专家分成{0}大类,如下:", categories.Current.Count)
    Console.WriteLine(categories.ToString)

    ' Clean up.
    categories.Current.Dispose()
End Sub
Private Sub Button2_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles Button2.Click
    ' Demo: extract the registered experts from the saved page source,
    ' round-trip them through an XML file, and print what was loaded back.
    Const sourceFile As String = "CsdnBlogDefault.txt"
    Const xmlFile As String = "Experts.xml"
    Dim utf8 As System.Text.Encoding = System.Text.Encoding.UTF8
    Dim experts As New LzmTW.uSystem.uText.uRegex.MatchHelper(Of MatchExperts)

    ' Read and analyse the file; captured items accumulate on experts.Current.
    experts.Current.Read(sourceFile, utf8)

    ' Store the result (overwriting any existing file), then load it back,
    ' which replaces experts.Current with the deserialized instance.
    experts.SaveXml(xmlFile, False, utf8)
    experts.LoadXml(xmlFile, utf8)

    ' Show what was extracted.
    Console.WriteLine("已注册{0}个专家,如下:", experts.Current.Count)
    Console.WriteLine(experts.ToString)

    ' Clean up.
    experts.Current.Dispose()
End Sub
输出结果呢
专家分成14大类,如下:
<?xml version="1.0" encoding="utf-16"?>
<MatchExpertsLei xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
<Items>
<MatchBase xsi:type="MatchExpertsLei">
<Items />
<Name>业界</Name>
</MatchBase>
<MatchBase xsi:type="MatchExpertsLei">
<Items />
<Name>软件工程</Name>
</MatchBase>
<MatchBase xsi:type="MatchExpertsLei">
<Items />
<Name>项目管理</Name>
</MatchBase>
<MatchBase xsi:type="MatchExpertsLei">
<Items />
<Name>JAVA</Name>
</MatchBase>
<MatchBase xsi:type="MatchExpertsLei">
<Items />
<Name>Delphi</Name>
</MatchBase>
<MatchBase xsi:type="MatchExpertsLei">
<Items />
<Name>WEB开发</Name>
</MatchBase>
<MatchBase xsi:type="MatchExpertsLei">
<Items />
<Name>数据库</Name>
</MatchBase>
<MatchBase xsi:type="MatchExpertsLei">
<Items />
<Name>移动开发</Name>
</MatchBase>
<MatchBase xsi:type="MatchExpertsLei">
<Items />
<Name>开源</Name>
</MatchBase>
<MatchBase xsi:type="MatchExpertsLei">
<Items />
<Name>游戏开发</Name>
</MatchBase>
<MatchBase xsi:type="MatchExpertsLei">
<Items />
<Name>企业开发</Name>
</MatchBase>
<MatchBase xsi:type="MatchExpertsLei">
<Items />
<Name>综合</Name>
</MatchBase>
<MatchBase xsi:type="MatchExpertsLei">
<Items />
<Name>网络管理</Name>
</MatchBase>
<MatchBase xsi:type="MatchExpertsLei">
<Items />
<Name>IT媒体</Name>
</MatchBase>
</Items>
</MatchExpertsLei>
再有
阅读(2265) | 评论(0) | 转发(0) |