How to search in PDF using regular expressions in C# and VBScript with PDF Extractor SDK - ByteScout
Announcement
Our ByteScout SDK products are sunsetting as we focus on expanding new solutions.
Learn More Open modal
Close modal
Announcement Important Update
ByteScout SDK Sunsetting Notice
Our ByteScout SDK products are sunsetting as we focus on our new & improved solutions. Thank you for being part of our journey, and we look forward to supporting you in this next chapter!

How to search in PDF using regular expressions in C# and VBScript with PDF Extractor SDK

  • Home
  • /
  • Articles
  • /
  • How to search in PDF using regular expressions in C# and VBScript with PDF Extractor SDK

The samples below show how to find text in a PDF document using regular expressions with ByteScout PDF Extractor SDK in C# and VBScript.

How to find text matching a regular expression in a PDF in C#

using System;
using System.Drawing;
using Bytescout.PDFExtractor;

namespace FindText
{
	/// <summary>
	/// Console sample that searches every page of a PDF document for text
	/// matching a regular expression and prints details of each match.
	/// </summary>
	class Program
	{
		/// <summary>
		/// Loads "sample1.pdf", runs a regular expression search on each page and
		/// writes the location and font details of every match to the console.
		/// </summary>
		/// <param name="args">Command-line arguments (not used).</param>
		static void Main(string[] args)
		{
			// Searches for the text starting from LABORIS and ending with VELIT words.
			// The pattern is the same for every page, so it is defined once up front.
			// See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
			const string regexPattern = "LABORIS.*VELIT";

			// Create Bytescout.PDFExtractor.TextExtractor instance.
			// The using block disposes the extractor (releasing the loaded document)
			// even if an exception is thrown while searching.
			using (TextExtractor extractor = new TextExtractor())
			{
				extractor.RegistrationName = "demo";
				extractor.RegistrationKey = "demo";

				// Load sample PDF document
				extractor.LoadDocumentFromFile("sample1.pdf");

				int pageCount = extractor.GetPageCount();

				// Turn on the regular expression search
				extractor.RegexSearch = true;

				// Search through pages (page indexes passed to Find start at 0)
				for (int i = 0; i < pageCount; i++)
				{
					// Search the page for the pattern.
					// Parameters are: page index, string to find, case sensitivity.
					if (extractor.Find(i, regexPattern, false))
					{
						do
						{
							Console.WriteLine("");
							Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());
							Console.WriteLine("");

							// Iterate through each element in the found text
							foreach (SearchResultElement element in extractor.FoundText.Elements)
							{
								Console.WriteLine("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height);
								Console.WriteLine("Text: " + element.Text);
								Console.WriteLine("Font is bold: " + element.FontIsBold);
								Console.WriteLine("Font is italic:" + element.FontIsItalic);
								Console.WriteLine("Font name: " + element.FontName);
								Console.WriteLine("Font size:" + element.FontSize);
								Console.WriteLine("Font color:" + element.FontColor);
							}
						}
						while (extractor.FindNext()); // advance to the next match on the same page
					}
				}
			}

			Console.WriteLine();
			Console.WriteLine("Press any key to continue...");
			Console.ReadLine();
		}
	}
}

How to find text matching a regular expression in a PDF in VBScript (Visual Basic 6)

' Regular expression to look for: numbers in a format like 12,123,000 or 12.10.2010
' Full pattern syntax reference: https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
numberPattern = "\d{1,3}[\.,]+\d{2,3}[\.,]+\d{2,4}"

' Instantiate the Bytescout.PDFExtractor.TextExtractor COM object and register it
Set extractor = CreateObject("Bytescout.PDFExtractor.TextExtractor")
extractor.RegistrationName = "demo"
extractor.RegistrationKey = "demo"

' Open the sample PDF document
extractor.LoadDocumentFromFile "sample.pdf"

' Switch the extractor into regular expression search mode
extractor.RegexSearch = True

' Number of pages in the loaded document
totalPages = extractor.GetPageCount()

' Walk every page; page indexes passed to Find start at 0
For pageIndex = 0 To totalPages - 1

	' Find arguments: page index, string to find, case sensitivity
	If extractor.Find(pageIndex, numberPattern, False) Then
		Do
			' Report the current match, then reset the extraction area before moving on
			matchText = extractor.FoundText.Text
			MsgBox "Found match on page #" & CStr(pageIndex) & ": " & matchText
			extractor.ResetExtractionArea
		Loop While extractor.FindNext
	End If

Next

MsgBox "Done"

' Release the COM object
Set extractor = Nothing

Tutorials:

prev
next