VB 2017使用iText 7。我正在寻找一个方法来搜索PDF中的关键文本。当我找到关键文本时,我想返回它所在的框中的所有文本。
例如,在这个PDF中,我查找关键短语“usable length”,并希望返回找到它的框中的文本“Rwy 33 PAPI-L,usable length,notes”。
以下是我目前的实现(基于此示例),希望您对这个思路提出建议或意见:
''' <summary>
''' Searches every page of a PDF file for a piece of text and returns each
''' extracted text line that contains it.
''' </summary>
''' <param name="fileName">Path of the PDF file to search.</param>
''' <param name="searchText">Literal text to look for. It is escaped before being
''' compiled into a regular expression, so characters such as "(" or "." are
''' matched as-is rather than interpreted as regex syntax.</param>
''' <param name="IsCaseSensitive">True for a case-sensitive match; False to ignore case.</param>
''' <returns>
''' The matching lines in page order (possibly an empty list), or Nothing when
''' <paramref name="fileName"/> or <paramref name="searchText"/> is blank or the
''' file does not exist.
''' </returns>
Public Function FindTextInPdfFile(ByVal fileName As String, ByVal searchText As String, ByVal IsCaseSensitive As Boolean) As List(Of String)
    'basic checks: nothing to search, or nothing to search in
    If String.IsNullOrWhiteSpace(fileName) Then Return Nothing
    If String.IsNullOrWhiteSpace(searchText) Then Return Nothing
    If Not File.Exists(fileName) Then Return Nothing

    'build the matcher; the search text is treated as literal text, so escape it
    'to keep regex metacharacters from throwing or matching the wrong thing
    Dim matchOptions As RegexOptions = If(IsCaseSensitive, RegexOptions.None, RegexOptions.IgnoreCase)
    Dim matcher As New Regex(System.Text.RegularExpressions.Regex.Escape(searchText), matchOptions)

    'the extracted text is split into lines on LF
    Dim lineFeed As Char = Microsoft.VisualBasic.ControlChars.Lf
    Dim linesMatch As New List(Of String)

    Using pdfReader As PdfReader = New PdfReader(fileName)
        Using pdfDocument As PdfDocument = New PdfDocument(pdfReader)
            For pageNum As Integer = 1 To pdfDocument.GetNumberOfPages()
                Dim page As PdfPage = pdfDocument.GetPage(pageNum)

                'a fresh strategy per page: the strategy object keeps the text it has
                'already received, so reusing one instance makes later pages return
                'the text of earlier pages too and yields duplicate matches
                Dim strategy As ITextExtractionStrategy = New SimpleTextExtractionStrategy()
                Dim currentPageText As String = PdfTextExtractor.GetTextFromPage(page, strategy)
                If String.IsNullOrEmpty(currentPageText) Then Continue For

                'keep only the lines of this page that contain the search text
                For Each pageLine As String In currentPageText.Split(lineFeed)
                    If matcher.IsMatch(pageLine) Then linesMatch.Add(pageLine)
                Next pageLine
            Next pageNum
        End Using
    End Using

    Debug.Print("match count={0}", linesMatch.Count)
    For Each line In linesMatch
        Debug.Print("line={0}", line)
    Next line
    Return linesMatch
End Function
在示例PDF上测试此函数会产生以下输出:
FindTextInPdfFile(pdf, "usable length", True)
match count=1
line=Rwy 33 PAPI-L, usable length, notes.
1条答案
按热度按时间mwngjboj1#
其中 page 是一个 PdfPage 对象。