用于匹配 VB.net 中特殊模式的正则表达式

RegEx for matching a special pattern in VB.net

我有一段代码,通过文件实体引用(例如 &Ch1;)把不同 SGM 文件中的文本提取到主文件中。这段代码对这类引用工作良好,但现在需求扩展了:还需要处理形如 &Ch1-1; 的实体引用,以便获取分节文件的内容。将来还可能进一步扩展到 &Ch1-1-1;。

我需要扩展代码以接受这些新实体,以便可以将这些文件内容添加到主文件中。

我认为问题是使用的正则表达式,所以我将其更改为

Dim rx = New Regex("&Ch(?<EntityNumber>\d+?[-\d+]?)?")

这不会产生错误,但也不会将文件内容带入主文档。我习惯了正则表达式,但我从未使用过命名捕获组,并且发现网络上的解释有点混乱。

' Assembles the final SGM deliverable from a master document and its chapter files.
'
' Steps, in order:
'   1. For every *.sgm file in the chosen directory, create a ".bak" file that holds
'      only the lines found between the start-marker line and the stop-marker line.
'   2. Replace each chapter entity reference in the master document
'      (&Ch1;  &Ch2-1;  &Ch1-1-1;  ...) with the contents of the matching
'      "<prefix>_Ch<number>.sgm.bak" file, when that file exists.
'   3. Remove every NOTATION / ENTITY declaration line from the merged text and
'      re-insert the distinct ones right after the first line.
'   4. Write "<prefix>_FinalDeliverable.sgm", then delete the master document,
'      the intermediate master document and all ".bak" files in the directory.
'
' Reads txtDirectory.Text (working directory) and txtBxUnique.Text (file-name prefix),
' assigns searchDir, and calls Close() once the user dismisses the message box.
Sub runProgram()
    Dim DirFolder As String = txtDirectory.Text
    Dim Directory As New IO.DirectoryInfo(DirFolder)
    Dim allFiles As IO.FileInfo() = Directory.GetFiles("*.sgm")
    Dim Response As MsgBoxResult

    Dim Prefix As String
    Dim newMasterFilePath As String
    Dim masterFileName As String
    Dim newMasterFileName As String
    Dim startMark As String = "<!--#start#-->"
    Dim stopMark As String = "<!--#stop#-->"
    searchDir = txtDirectory.Text
    Prefix = txtBxUnique.Text

    ' --- Step 1: extract the marked block of every .sgm file into a .bak file ---
    For Each singleFile As IO.FileInfo In allFiles
        If File.Exists(singleFile.FullName) Then
            Dim fileName = singleFile.FullName
            Debug.Print("file name : " & fileName)

            ' Copy the source first; the copy is then overwritten with the extracted block,
            ' so the original .sgm file is never modified.
            Dim backup As String = fileName & ".bak"
            File.Copy(fileName, backup, True)

            ' Load all lines in memory before the .bak file is re-created below.
            Dim lines() As String = File.ReadAllLines(backup)

            ' NOTE(review): the marker comparison is exact, so a marker line carrying
            ' leading or trailing whitespace is not recognised as a marker.
            Dim insideBlock As Boolean = False
            Using sw As StreamWriter = File.CreateText(backup)
                For Each line As String In lines
                    If line = startMark Then
                        ' Start writing at the line below the marker.
                        insideBlock = True
                    ElseIf line = stopMark Then
                        ' Stop writing; the marker itself is not written.
                        insideBlock = False
                    ElseIf insideBlock Then
                        sw.WriteLine(line)
                    End If
                Next
            End Using
        End If
    Next

    masterFileName = Prefix & "_Master_Document.sgm"
    newMasterFileName = Prefix & "_New_Master_Document.sgm"
    newMasterFilePath = IO.Path.Combine(searchDir, newMasterFileName)

    Dim existingMasterFilePath = IO.Path.Combine(searchDir, masterFileName)

    ' --- Step 2: expand the chapter entities of the master document ---
    ' All replacements are done on a StringBuilder holding a copy of the master text.
    Dim strMasterDoc = File.ReadAllText(existingMasterFilePath)
    Dim newMasterFileBuilder As New StringBuilder(strMasterDoc)

    ' 'EntityNumber' captures the digits of the entity, optionally followed by any
    ' number of "-digits" groups: 1, 2-1, 1-1-1, ...
    ' The terminating ";" is part of the match (but not of the group) so that
    '   * the ";" is removed together with the entity instead of being left behind, and
    '   * replacing "&Ch1;" can never touch the prefix of "&Ch1-1;" or "&Ch10;".
    Dim rx = New Regex("&Ch(?<EntityNumber>\d+(?:-\d+)*);")
    Dim rxMatches = rx.Matches(strMasterDoc)

    For Each match As Match In rxMatches
        Dim entity = match.Value
        ' Build the file name from the captured number, e.g. "<prefix>_Ch2-1.sgm.bak".
        Dim entityFileName = Prefix & $"_Ch{match.Groups("EntityNumber").Value}.sgm.bak"
        Dim entityFilePath = Path.Combine(searchDir, entityFileName)
        ' Entities without a matching file are left untouched in the output.
        If File.Exists(entityFilePath) Then
            Dim entityFileContents As String = File.ReadAllText(entityFilePath)
            newMasterFileBuilder.Replace(entity, entityFileContents)
        End If
    Next

    ' Write the expanded master text to a separate, intermediate file.
    File.WriteAllText(newMasterFilePath, newMasterFileBuilder.ToString)

    ' --- Step 3: hoist the NOTATION / ENTITY declarations ---
    Dim largeFilePath As String = newMasterFilePath
    Dim lines1 = File.ReadLines(largeFilePath).ToList 'don't use ReadAllLines
    Dim reg = New Regex("\<\!NOTATION.*$|\<\!ENTITY.*$", RegexOptions.IgnoreCase)
    Dim entities = From line In lines1
                   Where reg.IsMatch(line)

    ' Key: index of the declaration line in lines1; value: the trimmed line.
    ' Searching from idx + 1 gives repeated, identical lines distinct indices.
    Dim dictionary As New Dictionary(Of Integer, String)
    Dim idx = -1
    For Each s In entities
        idx = lines1.IndexOf(s, idx + 1)
        dictionary.Add(idx, s.Trim)
    Next

    ' Remove the declaration lines; each removal shifts the following indices down by one.
    ' NOTE(review): this relies on the dictionary being enumerated in insertion
    ' (ascending index) order, which Dictionary does not formally guarantee.
    Dim deletedItems = 0
    For Each itm In dictionary
        lines1.RemoveAt(itm.Key - deletedItems)
        deletedItems += 1
    Next

    ' Keep only the first occurrence of each distinct declaration text.
    Dim uniqueDict = dictionary.GroupBy(Function(itm) itm.Value).
        Select(Function(group) group.First()).
        ToDictionary(Function(itm) itm.Key, Function(itm) itm.Value)

    ' Each declaration is inserted at index 1, i.e. directly after the first line,
    ' so a later declaration ends up above an earlier one.
    For Each s In uniqueDict.Values
        lines1.Insert(1, s)
    Next

    ' --- Step 4: write the deliverable and clean up ---
    Dim builtMaster As String = Prefix & "_FinalDeliverable.sgm"
    Dim newBuiltMasterFilePath = IO.Path.Combine(searchDir, builtMaster)
    Dim builtMasterDoc As String = newBuiltMasterFilePath
    ' End Using flushes and closes the writer; no explicit Flush/Close is needed.
    Using sw As New System.IO.StreamWriter(builtMasterDoc)
        For Each line As String In lines1
            sw.WriteLine(line)
        Next
    End Using

    ' Delete the master document and the intermediate master document.
    If System.IO.File.Exists(existingMasterFilePath) Then
        System.IO.File.Delete(existingMasterFilePath)
    End If

    If System.IO.File.Exists(newMasterFilePath) Then
        System.IO.File.Delete(newMasterFilePath)
    End If

    ' Delete every .bak file in the working directory.
    For Each filename As String In IO.Directory.GetFiles(searchDir, "*.bak")
        IO.File.Delete(filename)
    Next

    Response = MsgBox("File 'FinalDeliverable.sgm' has been created.", vbOKOnly, "SGM Status")
    ' The box only offers OK; close the form once it has been acknowledged.
    If Response = vbOK Then
        Close()
    End If
End Sub

我期望的结果是:名为 Ch1-1.sgm 的文件中位于 <!--#start#--> 和 <!--#stop#--> 之间的内容会被添加到主文件中。

这对 &Ch1; 文件实体有效。它正确抓取 Ch1.sgm 内容。

感谢您的帮助, 玛克辛

示例代码: Master_Document.sgm

<!DOCTYPE DOC PUBLIC "-//USA-DOD//DTD 38784STD-BV7//EN"[
]>
&Ch1;
<body numcols="2">
&Ch2-1;
&Ch2-2;
&Ch2-3;
&Ch2-4;
&Ch2-5;
&Ch2-6;
&Ch2-7;
&Ch2-8;
&Ch2-9;
&Ch3;
</body></doc>

示例 SGM 文件

 <?Pub /_gtinsert>                     
    <body numcols="2">                    
    <!--#start#-->                        
    <chapter id="Chapter_4__Procedures">  
    <title>Procedures</title>             
    <section>                             
    <title>Introduction</title>           
    <!--#stop#-->                         
    <para0 verdate="7 Never 2012" verstatu
    <title>Description</title>            
    <para>This chapterfor the following:  

问题在于,&Ch(?<EntityNumber>\d+?[-\d+]?)? 先匹配 &Ch,然后匹配一个或多个但尽可能少的数字(\d+?),最后匹配一个可选的单个字符:-、数字或 + 符号([-\d+]?)。也就是说,在 &Ch 之后只会匹配到一个数字(你的情况下那里总是有一个数字),如果其后紧跟一个 -,这个 - 会被匹配进去,随后匹配就结束了。例如,对于 &Ch2-1;,捕获组得到的是 2-,而不是 2-1,因此找不到对应的文件。

使用

Dim rx = New Regex("&Ch(?<EntityNumber>\d+(?:-\d+)*);")

查看 regex demo 和正则表达式图: