Check the samples below to learn how to find specific text in a PDF document in ASP.NET, C#, C#-WPF, VB.NET and VBScript using ByteScout PDF Extractor SDK.
If you need to find text with hyphens in your PDF, check this tutorial.
Select your programming language:
using System;
using System.Drawing;
using Bytescout.PDFExtractor;

namespace FindText
{
    /// <summary>
    /// ASP.NET page that searches every page of "sample1.pdf" for the word
    /// "ipsum" and writes each match, element by element, to the response as HTML.
    /// </summary>
    public partial class _Default : System.Web.UI.Page
    {
        /// <summary>
        /// Runs the search when the page loads and ends the response afterwards.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            // This test file will be copied to the project directory on the pre-build event (see the project properties).
            String inputFile = Server.MapPath("sample1.pdf");

            // Create Bytescout.PDFExtractor.TextExtractor instance
            TextExtractor extractor = new TextExtractor();

            try
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(inputFile);

                Response.Clear();
                Response.ContentType = "text/html";

                Response.Write("Searching for \"ipsum\" string:<br><br>");

                // Search every page, not just the first one, so the reported
                // page number is the page the match was actually found on.
                int pageCount = extractor.GetPageCount();

                for (int pageIndex = 0; pageIndex < pageCount; pageIndex++)
                {
                    // Search the current page for "ipsum" string
                    if (!extractor.Find(pageIndex, "ipsum"))
                    {
                        continue;
                    }

                    do
                    {
                        Response.Write("<br/>");
                        Response.Write("Found on page " + pageIndex + " at location " + extractor.FoundText.Bounds.ToString() + "<br/>");
                        Response.Write("<br/>");

                        // iterate through each element in the found text
                        foreach (SearchResultElement element in extractor.FoundText.Elements)
                        {
                            Response.Write("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height + "<br/>");
                            // Text and font name come from the document, so they are
                            // HTML-encoded before being written into the page.
                            Response.Write("Text: " + Server.HtmlEncode(element.Text) + "<br/>");
                            Response.Write("Font is bold: " + element.FontIsBold + "<br/>");
                            Response.Write("Font is italic:" + element.FontIsItalic + "<br/>");
                            Response.Write("Font name: " + Server.HtmlEncode(element.FontName) + "<br/>");
                            Response.Write("Font size:" + element.FontSize + "<br/>");
                            Response.Write("Font color:" + element.FontColor + "<br/>");
                        }
                    }
                    while (extractor.FindNext());
                }
            }
            finally
            {
                // A new extractor is created per request; release it explicitly.
                extractor.Dispose();
            }

            Response.End();
        }
    }
}
using System;
using System.Drawing;
using Bytescout.PDFExtractor;

namespace FindText
{
    /// <summary>
    /// Console sample: looks for the word "ipsum" on every page of "sample1.pdf"
    /// and prints each occurrence together with its constituent elements.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Entry point. Loads the document, scans each page and waits for Enter.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // Set up the Bytescout.PDFExtractor.TextExtractor with the demo registration
            TextExtractor extractor = new TextExtractor();
            extractor.RegistrationName = "demo";
            extractor.RegistrationKey = "demo";

            // Open the sample PDF document
            extractor.LoadDocumentFromFile("sample1.pdf");

            int totalPages = extractor.GetPageCount();
            int i = 0;

            while (i < totalPages)
            {
                // Find(...) yields the first hit on the page, FindNext() each following one
                for (bool hit = extractor.Find(i, "ipsum", false); hit; hit = extractor.FindNext())
                {
                    PrintMatch(extractor, i);
                }

                i++;
            }

            Console.WriteLine();
            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }

        // Prints the header for the current match and then every element it consists of.
        private static void PrintMatch(TextExtractor extractor, int i)
        {
            Console.WriteLine("");
            Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());
            Console.WriteLine("");

            foreach (SearchResultElement part in extractor.FoundText.Elements)
            {
                PrintElement(part);
            }
        }

        // Prints position, text and font details of a single search result element.
        private static void PrintElement(SearchResultElement part)
        {
            Console.WriteLine("Element #" + part.Index + " at left=" + part.Left + "; top=" + part.Top + "; width=" + part.Width + "; height=" + part.Height);
            Console.WriteLine("Text: " + part.Text);
            Console.WriteLine("Font is bold: " + part.FontIsBold);
            Console.WriteLine("Font is italic:" + part.FontIsItalic);
            Console.WriteLine("Font name: " + part.FontName);
            Console.WriteLine("Font size:" + part.FontSize);
            Console.WriteLine("Font color:" + part.FontColor);
        }
    }
}
using System;
using System.Drawing;
using System.Text;
using System.Windows;
using Bytescout.PDFExtractor;

namespace WpfApplication1
{
    /// <summary>
    /// WPF sample window: lets the user load a PDF, extract the text of its
    /// first page, and search the whole document for a phrase.
    /// </summary>
    public partial class MainWindow : Window
    {
        // Path of the currently loaded PDF; null until a document has been loaded.
        private string _pdfFile;

        // Single extractor instance reused by all button handlers.
        private readonly TextExtractor extractor;

        /// <summary>
        /// Initializes the window and creates the text extractor.
        /// </summary>
        public MainWindow()
        {
            InitializeComponent();
            extractor = new TextExtractor();
        }

        /// <summary>
        /// Shows an open-file dialog and loads the chosen PDF into the extractor.
        /// Any exception raised while loading is shown in a message box.
        /// </summary>
        private void Button_Load(object sender, RoutedEventArgs e)
        {
            Microsoft.Win32.OpenFileDialog dlg = new Microsoft.Win32.OpenFileDialog();
            dlg.DefaultExt = ".pdf";
            dlg.Filter = "PDF documents (.pdf)|*.pdf";

            bool? result = dlg.ShowDialog();

            if (result == true)
            {
                try
                {
                    extractor.LoadDocumentFromFile(dlg.FileName);
                    // Remember the file only after it loaded successfully.
                    _pdfFile = dlg.FileName;
                    Title = _pdfFile;
                }
                catch (Exception exception)
                {
                    MessageBox.Show(exception.ToString());
                }
            }
        }

        /// <summary>
        /// Puts the text of the first page into the output text box.
        /// Does nothing when no document is loaded.
        /// </summary>
        private void Button_Extract(object sender, RoutedEventArgs e)
        {
            if (_pdfFile != null)
            {
                string text = extractor.GetText(0, 0); // extract from the first page only (for demonstration purposes)
                textBox1.Text = text;
            }
        }

        /// <summary>
        /// Searches every page of the loaded document for the text entered in
        /// the find box and writes a report of all matches to the output text box.
        /// Does nothing when the find box is empty or no document is loaded.
        /// </summary>
        private void Button_Find(object sender, RoutedEventArgs e)
        {
            // Same guard as Button_Extract: searching requires a loaded document.
            if (_pdfFile == null || textBoxFind.Text.Length == 0)
            {
                return;
            }

            StringBuilder builder = new StringBuilder();

            builder.AppendLine("Searching for \"" + textBoxFind.Text + "\"");

            // Walk all pages so matches beyond the first page are found and the
            // reported page index is the one the match belongs to.
            int pageCount = extractor.GetPageCount();

            for (int i = 0; i < pageCount; i++)
            {
                if (!extractor.Find(i, textBoxFind.Text, false))
                {
                    continue;
                }

                do
                {
                    builder.AppendLine("");
                    builder.AppendLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());
                    builder.AppendLine("");

                    // iterate through each element in the found text
                    foreach (SearchResultElement element in extractor.FoundText.Elements)
                    {
                        builder.AppendLine("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height);
                        builder.AppendLine("Text: " + element.Text);
                        builder.AppendLine("Font is bold: " + element.FontIsBold);
                        builder.AppendLine("Font is italic:" + element.FontIsItalic);
                        builder.AppendLine("Font name: " + element.FontName);
                        builder.AppendLine("Font size:" + element.FontSize);
                        builder.AppendLine("Font color:" + element.FontColor);
                    }
                }
                while (extractor.FindNext());
            }

            builder.AppendLine("Finished.");

            textBox1.Text = builder.ToString();
        }
    }
}
Imports System.Drawing
Imports Bytescout.PDFExtractor

''' <summary>
''' Console sample: looks for the word "ipsum" on every page of "sample1.pdf"
''' and prints each occurrence together with its constituent elements.
''' </summary>
Class Program

    ''' <summary>
    ''' Entry point. Loads the document, scans each page and waits for Enter.
    ''' </summary>
    Friend Shared Sub Main(args As String())

        ' Set up the Bytescout.PDFExtractor.TextExtractor with the demo registration
        Dim extractor As New TextExtractor()
        extractor.RegistrationName = "demo"
        extractor.RegistrationKey = "demo"

        ' Open the sample PDF document
        extractor.LoadDocumentFromFile("sample1.pdf")

        Dim lastPage As Integer = extractor.GetPageCount() - 1

        For i As Integer = 0 To lastPage

            ' Find(...) yields the first hit on the page, FindNext() each following one
            Dim hit As Boolean = extractor.Find(i, "ipsum", False)

            While hit
                Console.WriteLine("")
                Console.WriteLine("Found on page " & i & " at location " & extractor.FoundText.Bounds.ToString())
                Console.WriteLine("")

                ' Report every element the current match consists of
                For Each part As SearchResultElement In extractor.FoundText.Elements
                    PrintElement(part)
                Next

                hit = extractor.FindNext()
            End While

        Next

        Console.WriteLine()
        Console.WriteLine("Press any key to continue...")
        Console.ReadLine()

    End Sub

    ' Prints position, text and font details of a single search result element.
    Private Shared Sub PrintElement(part As SearchResultElement)
        Dim position As String = "Element #" & part.Index.ToString() &
            " at left=" & part.Left.ToString() &
            "; top=" & part.Top.ToString() &
            "; width=" & part.Width.ToString() &
            "; height=" & part.Height.ToString()

        Console.WriteLine(position)
        Console.WriteLine("Text: " & part.Text)
        Console.WriteLine("Font is bold: " & part.FontIsBold.ToString())
        Console.WriteLine("Font is italic:" & part.FontIsItalic.ToString())
        Console.WriteLine("Font name: " & part.FontName)
        Console.WriteLine("Font size:" & part.FontSize.ToString())
        Console.WriteLine("Font color:" & part.FontColor.ToString())
    End Sub

End Class
' Searches every page of sample1.pdf for the word "ipsum" and shows one
' message box per match, listing the match bounds and all of its elements.

' Create Bytescout.PDFExtractor.TextExtractor object
Set extractor = CreateObject("Bytescout.PDFExtractor.TextExtractor")
extractor.RegistrationName = "demo"
extractor.RegistrationKey = "demo"

' Load sample PDF document
extractor.LoadDocumentFromFile("..\..\sample1.pdf")

' Get page count
pageCount = extractor.GetPageCount()

foundMessage = ""

For i = 0 To pageCount - 1

    If extractor.Find(i, "ipsum", false) Then ' parameters are: page index, string to find, case sensitivity
        Do
            foundMessage = "Found word 'ipsum' on page #" & CStr(i) & " at left=" & CStr(extractor.FoundText.Left) & "; top=" & CStr(extractor.FoundText.Top) & "; width=" & CStr(extractor.FoundText.Width) & "; height=" & CStr(extractor.FoundText.Height)

            ' Start with an empty report for this match so nothing from a
            ' previous match is carried over.
            elementInfo = ""

            ' iterate through each element in the found text
            For j = 0 To extractor.FoundText.ElementCount - 1

                ' get search result element number j (not always the first one)
                Set element = extractor.FoundText.GetElement(j)

                ' Separate consecutive elements with a blank line
                If elementInfo <> "" Then
                    elementInfo = elementInfo & vbCRLF & vbCRLF
                End If

                ' Append (rather than overwrite) so every element is reported
                elementInfo = elementInfo & "Element #" & CStr(j) & " at left=" & CStr(element.Left) & "; top=" & CStr(element.Top) & "; width=" & CStr(element.Width) & "; height=" & CStr(element.Height) & vbCRLF
                elementInfo = elementInfo & "Text: " & CStr(element.Text) & vbCRLF
                elementInfo = elementInfo & "Font is bold: " & CStr(element.FontIsBold) & vbCRLF
                elementInfo = elementInfo & "Font is italic:" & CStr(element.FontIsItalic) & vbCRLF
                elementInfo = elementInfo & "Font name: " & CStr(element.FontName) & vbCRLF
                elementInfo = elementInfo & "Font size:" & CStr(element.FontSize) & vbCRLF
                elementInfo = elementInfo & "Font color (as Ole Color):" & CStr(element.FontColorAsOleColor)

            Next

            MsgBox foundMessage & vbCRLF & vbCRLF & elementInfo

        Loop While extractor.FindNext
    End If

Next

MsgBox "Done"

Set extractor = Nothing