The samples below show how to find text in a PDF document using regular expressions with ByteScout PDF Extractor SDK, first in C# and then in VBScript.
using System;
using System.Drawing;
using Bytescout.PDFExtractor;

namespace FindText
{
    /// <summary>
    /// Console sample that searches every page of a PDF document with a regular
    /// expression and prints the location and font details of each match.
    /// </summary>
    class Program
    {
        // Matches text starting with the word LABORIS and ending with the word VELIT.
        // Complete regular expressions reference:
        // https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
        private const string RegexPattern = "LABORIS.*VELIT";

        /// <summary>
        /// Loads "sample1.pdf", runs the regex search page by page and writes every
        /// match (bounds plus per-element text and font information) to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // The extractor is wrapped in a using block so it is released even if
            // loading or searching throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample1.pdf");

                int pageCount = extractor.GetPageCount();

                // Turn on the regular expression search
                extractor.RegexSearch = true;

                // Search through pages (page indexes are zero-based in this loop)
                for (int i = 0; i < pageCount; i++)
                {
                    // Search the page for the pattern; the last argument (false) is the
                    // case-sensitivity flag. Skip pages without any match.
                    if (!extractor.Find(i, RegexPattern, false))
                    {
                        continue;
                    }

                    // Find positioned the extractor on the first match; FindNext advances
                    // to each following match and returns false when there are no more.
                    do
                    {
                        Console.WriteLine("");
                        Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());
                        Console.WriteLine("");

                        // Iterate through each element in the found text
                        foreach (SearchResultElement element in extractor.FoundText.Elements)
                        {
                            Console.WriteLine("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height);
                            Console.WriteLine("Text: " + element.Text);
                            Console.WriteLine("Font is bold: " + element.FontIsBold);
                            Console.WriteLine("Font is italic:" + element.FontIsItalic);
                            Console.WriteLine("Font name: " + element.FontName);
                            Console.WriteLine("Font size:" + element.FontSize);
                            Console.WriteLine("Font color:" + element.FontColor);
                        }
                    }
                    while (extractor.FindNext());
                }
            }

            // Keep the console window open until the user presses Enter.
            Console.WriteLine();
            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }
    }
}
' Sample script: searches every page of a PDF document with a regular
' expression and shows each match in a message box.
Dim extractor, searchPattern, totalPages, pageIndex, matchText

' Search for numbers in a format like 12,123,000 or 12.10.2010
' Complete regular expressions patterns reference:
' https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
searchPattern = "\d{1,3}[\.,]+\d{2,3}[\.,]+\d{2,4}"

' Instantiate the Bytescout.PDFExtractor.TextExtractor object
Set extractor = CreateObject("Bytescout.PDFExtractor.TextExtractor")
extractor.RegistrationName = "demo"
extractor.RegistrationKey = "demo"

' Open the sample PDF document
extractor.LoadDocumentFromFile("sample.pdf")

' Turn on the regex search
extractor.RegexSearch = True

' Number of pages to scan
totalPages = extractor.GetPageCount()

For pageIndex = 0 To totalPages - 1
    ' Find parameters are: page index, string to find, case sensitivity
    If extractor.Find(pageIndex, searchPattern, false) Then
        Do
            ' Report the text of the current match
            matchText = extractor.FoundText.Text
            MsgBox "Found match on page #" & CStr(pageIndex) & ": " & matchText

            ' NOTE(review): the extraction area is reset after every match;
            ' confirm this is required before calling FindNext.
            extractor.ResetExtractionArea()
        Loop While extractor.FindNext
    End If
Next

MsgBox "Done"

' Release the extractor object
Set extractor = Nothing