The Excel VBA code is using a regular expression to extract section numbers from HTML files. However, the regex includes a negative lookbehind which is not supported in VBA regex. "(?<!tbl"")>(\d(\.\d)+)<"
Sub GetAllSectionNumbers()
LRb = Cells(Rows.Count, "B").End(xlUp).Row
Range("B7:C" & LRb).ClearContents
Dim fileDialog As fileDialog
Set fileDialog = Application.fileDialog(msoFileDialogOpen)
fileDialog.AllowMultiSelect = True
fileDialog.Title = "Select HTML files"
fileDialog.Filters.Clear
fileDialog.Filters.Add "HTML files", "*.htm;*.html", 1
If fileDialog.Show <> -1 Then Exit Sub
Dim file As Variant
For Each file In fileDialog.SelectedItems
Dim fileContents As String
Open file For Input As #1
fileContents = Input$(LOF(1), 1)
Close #1
Dim regex As Object
Set regex = CreateObject("VBScript.RegExp")
regex.Pattern = "(?<!tbl"")>(\d(\.\d)+)<"
regex.Global = True
regex.IgnoreCase = True
regex.MultiLine = True
TRET = regex.Pattern
filePath = file
fileFolder = Left(filePath, InStrRev(filePath, "\"))
fileNameSource = Mid(filePath, InStrRev(filePath, "\") + 1, 100)
Dim match As Object
Set match = regex.Execute(fileContents)
Dim i As Long
For i = 0 To match.Count - 1
LRb = Cells(Rows.Count, "B").End(xlUp).Row + 1
Range("B" & LRb).Value = match.Item(i).SubMatches(0)
Range("C" & LRb).Value = fileNameSource
Next i
Next file
MsgBox "Done!"
End Sub
Is there an alternative regex solution for handling this?
>Solution :
As you are extracting, the traditional approach is to use the "best regex trick" that consists in matching what you do not need and matching and capturing what you need.
The regex in this specific situation will look like
tbl">\d(?:\.\d)+<|>(\d(?:\.\d)+)<
In the code, it will look like
regex.Pattern = "tbl"">\d(?:\.\d)+<|>(\d(?:\.\d)+)<"
Next, in your code, you should check if the match.SubMatches(0)
value is actually present, and if yes, take it since it is what you need.
See the regex demo.