How to find text in PDF using PDF Extractor SDK in ASP.NET, C#, C#-WPF, VB.NET and VBScript - ByteScout

How to find text in PDF using PDF Extractor SDK in ASP.NET, C#, C#-WPF, VB.NET and VBScript

  • Home
  • /
  • Articles
  • /
  • How to find text in PDF using PDF Extractor SDK in ASP.NET, C#, C#-WPF, VB.NET and VBScript

Check the samples below to learn how to find specific text in a PDF document in ASP.NET, C#, C#-WPF, VB.NET and VBScript using ByteScout PDF Extractor SDK.

If you need to find text with hyphens in your PDF, check this tutorial.

Select your programming language:

How to find text in PDF in ASP.NET

using System;
using System.Drawing;
using Bytescout.PDFExtractor;

namespace FindText
{
	/// <summary>
	/// Demo page that searches sample1.pdf for the word "ipsum" and writes every match,
	/// together with the position and font details of each matched element, to the
	/// HTTP response as HTML.
	/// </summary>
	public partial class _Default : System.Web.UI.Page
	{
		/// <summary>
		/// Runs the search when the page loads and replaces the page output with the results.
		/// </summary>
		/// <param name="sender">Event source (not used).</param>
		/// <param name="e">Event data (not used).</param>
		protected void Page_Load(object sender, EventArgs e)
		{
			// This test file will be copied to the project directory on the pre-build event (see the project properties).
			String inputFile = Server.MapPath("sample1.pdf");

			// Create Bytescout.PDFExtractor.TextExtractor instance
			TextExtractor extractor = new TextExtractor();
			extractor.RegistrationName = "demo";
			extractor.RegistrationKey = "demo";

			// Load sample PDF document
			extractor.LoadDocumentFromFile(inputFile);

			Response.Clear();
			Response.ContentType = "text/html";

			Response.Write("Searching for \"ipsum\" string:<br><br>");

			int pageCount = extractor.GetPageCount();

			// Search every page; "i" is the page index reported in the output below
			for (int i = 0; i < pageCount; i++)
			{
				// Search for "ipsum" string
				if (extractor.Find(i, "ipsum"))
				{
					do
					{
						Response.Write("<br/>");
						Response.Write("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString() + "<br/>");
						Response.Write("<br/>");

						// iterate through each element in the found text
						foreach (SearchResultElement element in extractor.FoundText.Elements)
						{
							Response.Write("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height + "<br/>");
							// Text and font name come from the PDF and are emitted into HTML, so encode them
							Response.Write("Text: " + Server.HtmlEncode(element.Text) + "<br/>");
							Response.Write("Font is bold: " + element.FontIsBold + "<br/>");
							Response.Write("Font is italic:" + element.FontIsItalic + "<br/>");
							Response.Write("Font name: " + Server.HtmlEncode(element.FontName) + "<br/>");
							Response.Write("Font size:" + element.FontSize + "<br/>");
							Response.Write("Font color:" + element.FontColor + "<br/>");
						}
					}
					while (extractor.FindNext());
				}
			}

			Response.End();
		}
	}
}

How to find text in PDF in C#

using System;
using System.Drawing;
using Bytescout.PDFExtractor;

namespace FindText
{
	/// <summary>
	/// Console sample: looks for the word "ipsum" on every page of sample1.pdf and prints
	/// each match along with the position and font details of its elements.
	/// </summary>
	class Program
	{
		static void Main(string[] args)
		{
			// Set up the extractor with the demo registration
			TextExtractor pdfExtractor = new TextExtractor();
			pdfExtractor.RegistrationName = "demo";
			pdfExtractor.RegistrationKey = "demo";

			// Open the sample document
			pdfExtractor.LoadDocumentFromFile("sample1.pdf");

			int totalPages = pdfExtractor.GetPageCount();

			for (int pageIndex = 0; pageIndex < totalPages; pageIndex++)
			{
				// Find tells us whether the page has a match; FindNext whether another one follows
				bool hasMatch = pdfExtractor.Find(pageIndex, "ipsum", false);

				while (hasMatch)
				{
					Console.WriteLine("");
					Console.WriteLine("Found on page " + pageIndex + " at location " + pdfExtractor.FoundText.Bounds.ToString());
					Console.WriteLine("");

					// Report every element that makes up the current match
					foreach (SearchResultElement part in pdfExtractor.FoundText.Elements)
					{
						PrintElement(part);
					}

					hasMatch = pdfExtractor.FindNext();
				}
			}

			Console.WriteLine();
			Console.WriteLine("Press any key to continue...");
			Console.ReadLine();
		}

		/// <summary>
		/// Writes the position, text and font details of a single matched element to the console.
		/// </summary>
		private static void PrintElement(SearchResultElement part)
		{
			string position = "Element #" + part.Index + " at left=" + part.Left + "; top=" + part.Top + "; width=" + part.Width + "; height=" + part.Height;

			Console.WriteLine(position);
			Console.WriteLine("Text: " + part.Text);
			Console.WriteLine("Font is bold: " + part.FontIsBold);
			Console.WriteLine("Font is italic:" + part.FontIsItalic);
			Console.WriteLine("Font name: " + part.FontName);
			Console.WriteLine("Font size:" + part.FontSize);
			Console.WriteLine("Font color:" + part.FontColor);
		}
	}
}

How to find text in PDF in C#-WPF

using System;
using System.Drawing;
using System.Text;
using System.Windows;
using Bytescout.PDFExtractor;

namespace WpfApplication1
{
	/// <summary>
	/// Main window of the WPF sample: loads a PDF chosen by the user, extracts the text of
	/// its first page, and searches the document for the text typed into the find box.
	/// </summary>
	public partial class MainWindow : Window
	{
		// Path of the currently loaded PDF; null until a document has been loaded successfully
		private string _pdfFile;
		private TextExtractor extractor;

		public MainWindow()
		{
			InitializeComponent();

			extractor = new TextExtractor();
		}

		/// <summary>
		/// Lets the user pick a PDF file and loads it into the extractor.
		/// Any loading error is shown in a message box.
		/// </summary>
		private void Button_Load(object sender, RoutedEventArgs e)
		{
			Microsoft.Win32.OpenFileDialog dlg = new Microsoft.Win32.OpenFileDialog();
			dlg.DefaultExt = ".pdf";
			dlg.Filter = "PDF documents (.pdf)|*.pdf";

			bool? result = dlg.ShowDialog();

			if (result == true)
			{
				try
				{
					extractor.LoadDocumentFromFile(dlg.FileName);
					// Remember the file only after it loaded without throwing
					_pdfFile = dlg.FileName;
					Title = _pdfFile;
				}
				catch (Exception exception)
				{
					MessageBox.Show(exception.ToString());
				}
			}
		}

		/// <summary>
		/// Shows the text of the first page of the loaded document in the text box.
		/// Does nothing when no document is loaded.
		/// </summary>
		private void Button_Extract(object sender, RoutedEventArgs e)
		{
			if (_pdfFile != null)
			{
				string text = extractor.GetText(0, 0); // extract from the first page only (for demonstration purposes)

				textBox1.Text = text;
			}
		}

		/// <summary>
		/// Searches every page of the loaded document for the text in the find box and
		/// lists each match, with the details of its elements, in the text box.
		/// Does nothing when no document is loaded or the find box is empty.
		/// </summary>
		private void Button_Find(object sender, RoutedEventArgs e)
		{
			// Same guard as Button_Extract: searching requires a loaded document
			if (_pdfFile == null || textBoxFind.Text.Length == 0)
			{
				return;
			}

			StringBuilder builder = new StringBuilder();

			builder.AppendLine("Searching for \"" + textBoxFind.Text + "\"");

			int pageCount = extractor.GetPageCount();

			// Search every page; "i" is the page index reported in the output below
			for (int i = 0; i < pageCount; i++)
			{
				if (extractor.Find(i, textBoxFind.Text, false))
				{
					do
					{
						builder.AppendLine("");
						builder.AppendLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());
						builder.AppendLine("");

						// iterate through each element in the found text
						foreach (SearchResultElement element in extractor.FoundText.Elements)
						{
							builder.AppendLine("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height);
							builder.AppendLine("Text: " + element.Text);
							builder.AppendLine("Font is bold: " + element.FontIsBold);
							builder.AppendLine("Font is italic:" + element.FontIsItalic);
							builder.AppendLine("Font name: " + element.FontName);
							builder.AppendLine("Font size:" + element.FontSize);
							builder.AppendLine("Font color:" + element.FontColor);
						}
					}
					while (extractor.FindNext());
				}
			}

			builder.AppendLine("Finished.");

			textBox1.Text = builder.ToString();
		}
	}
}

How to find text in PDF in Visual Basic .NET


Imports System.Drawing
Imports Bytescout.PDFExtractor

Class Program
	''' <summary>
	''' Console sample: looks for the word "ipsum" on every page of sample1.pdf and prints
	''' each match along with the position and font details of its elements.
	''' </summary>
	Friend Shared Sub Main(args As String())

		' Set up the extractor with the demo registration
		Dim pdfExtractor As New TextExtractor()
		pdfExtractor.RegistrationName = "demo"
		pdfExtractor.RegistrationKey = "demo"

		' Open the sample document
		pdfExtractor.LoadDocumentFromFile("sample1.pdf")

		Dim totalPages As Integer = pdfExtractor.GetPageCount()

		For pageIndex As Integer = 0 To totalPages - 1
			' Find tells us whether the page has a match; FindNext whether another one follows
			Dim hasMatch As Boolean = pdfExtractor.Find(pageIndex, "ipsum", False)

			While hasMatch
				Console.WriteLine("")
				Console.WriteLine("Found on page " & pageIndex & " at location " & pdfExtractor.FoundText.Bounds.ToString())
				Console.WriteLine("")

				' Report every element that makes up the current match
				For Each part As SearchResultElement In pdfExtractor.FoundText.Elements
					Dim position As String = "Element #" & part.Index.ToString() & " at left=" & part.Left.ToString() & "; top=" & part.Top.ToString() & "; width=" & part.Width.ToString() & "; height=" & part.Height.ToString()

					Console.WriteLine(position)
					Console.WriteLine("Text: " & part.Text)
					Console.WriteLine("Font is bold: " & part.FontIsBold.ToString())
					Console.WriteLine("Font is italic:" & part.FontIsItalic.ToString())
					Console.WriteLine("Font name: " & part.FontName)
					Console.WriteLine("Font size:" & part.FontSize.ToString())
					Console.WriteLine("Font color:" & part.FontColor.ToString())
				Next

				hasMatch = pdfExtractor.FindNext()
			End While
		Next

		Console.WriteLine()
		Console.WriteLine("Press any key to continue...")
		Console.ReadLine()
	End Sub
End Class

How to find text in PDF in VBScript (Visual Basic 6)


' Searches every page of sample1.pdf for the word "ipsum" and shows one message box
' per match, listing the position and font details of every element in that match.

' Create Bytescout.PDFExtractor.TextExtractor object
Set extractor = CreateObject("Bytescout.PDFExtractor.TextExtractor")
extractor.RegistrationName = "demo"
extractor.RegistrationKey = "demo"

' Load sample PDF document
extractor.LoadDocumentFromFile("..\..\sample1.pdf")

' Get page count
pageCount = extractor.GetPageCount()

foundMessage = ""

For i = 0 To pageCount - 1

	If extractor.Find(i, "ipsum", false) Then ' parameters are: page index, string to find, case sensitivity
		Do

			foundMessage = "Found word 'ipsum' on page #" & CStr(i) & " at left=" & CStr(extractor.FoundText.Left) & "; top=" & CStr(extractor.FoundText.Top) & "; width=" & CStr(extractor.FoundText.Width) & "; height=" & CStr(extractor.FoundText.Height)

			' Start empty for every match so the details of all its elements can be accumulated
			elementInfo = ""

			' iterate through each element in the found text
			For j = 0 To extractor.FoundText.ElementCount - 1

				' get the j-th search result element (index must follow the loop counter)
				Set element = extractor.FoundText.GetElement(j)

				' separate consecutive elements with a blank line
				If elementInfo <> "" Then elementInfo = elementInfo & vbCRLF & vbCRLF

				elementInfo = elementInfo & "Element #" & CStr(j) & " at left=" & CStr(element.Left) & "; top=" & CStr(element.Top) & "; width=" & CStr(element.Width) & "; height=" & CStr(element.Height) & vbCRLF
				elementInfo = elementInfo & "Text: " & CStr(element.Text) & vbCRLF
				elementInfo = elementInfo & "Font is bold: " & CStr(element.FontIsBold) & vbCRLF
				elementInfo = elementInfo & "Font is italic:" & CStr(element.FontIsItalic) & vbCRLF
				elementInfo = elementInfo & "Font name: " & CStr(element.FontName) & vbCRLF
				elementInfo = elementInfo & "Font size:" & CStr(element.FontSize) & vbCRLF
				elementInfo = elementInfo & "Font color (as Ole Color):" & CStr(element.FontColorAsOleColor)
			Next

			MsgBox foundMessage & vbCRLF & vbCRLF & elementInfo

		Loop While extractor.FindNext
	End If

Next

MsgBox "Done"

Set extractor = Nothing

Tutorials:

prev
next