How to search in PDF using regular expressions in C# and VBScript with PDF Extractor SDK - ByteScout

How to search in PDF using regular expressions in C# and VBScript with PDF Extractor SDK

  • Home
  • /
  • Articles
  • /
  • How to search in PDF using regular expressions in C# and VBScript with PDF Extractor SDK

The samples below will show how to find text in PDF using regular expressions with ByteScout PDF Extractor SDK in C# and VBScript.

How to find regexp in PDF in C#

using System;
using System.Drawing;
using Bytescout.PDFExtractor;

namespace FindText
{
	/// <summary>
	/// Console sample: searches every page of a PDF document for a regular
	/// expression and prints each match together with the text elements
	/// (position, size and font details) that make up the match.
	/// </summary>
	class Program
	{
		/// <summary>
		/// Loads "sample1.pdf", runs a regular expression search on each page
		/// and writes the results to the console.
		/// </summary>
		/// <param name="args">Command-line arguments (not used).</param>
		static void Main(string[] args)
		{
			// Searches for the text starting with the word LABORIS and ending with the word VELIT.
			// The pattern does not change between pages, so it is defined once, outside the loop.
			// Complete regular expressions reference:
			// https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
			const string regexPattern = "LABORIS.*VELIT";

			// Create the Bytescout.PDFExtractor.TextExtractor instance.
			// The using statement disposes the extractor (and the document it has
			// loaded) when the block is left, even if an exception is thrown.
			using (TextExtractor extractor = new TextExtractor())
			{
				extractor.RegistrationName = "demo";
				extractor.RegistrationKey = "demo";

				// Load sample PDF document
				extractor.LoadDocumentFromFile("sample1.pdf");

				int pageCount = extractor.GetPageCount();

				// Turn on the regular expression search
				extractor.RegexSearch = true;

				// Search through pages; i is the zero-based page index passed to Find
				for (int i = 0; i < pageCount; i++)
				{
					// Arguments: page index, string to find, case sensitivity
					if (!extractor.Find(i, regexPattern, false))
					{
						// No match on this page
						continue;
					}

					// Find positioned the extractor on the first match;
					// FindNext advances to the following ones until it returns false.
					do
					{
						Console.WriteLine();
						Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());
						Console.WriteLine();

						// Iterate through each element in the found text
						foreach (SearchResultElement element in extractor.FoundText.Elements)
						{
							Console.WriteLine("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height);
							Console.WriteLine("Text: " + element.Text);
							Console.WriteLine("Font is bold: " + element.FontIsBold);
							Console.WriteLine("Font is italic:" + element.FontIsItalic);
							Console.WriteLine("Font name: " + element.FontName);
							Console.WriteLine("Font size:" + element.FontSize);
							Console.WriteLine("Font color:" + element.FontColor);
						}
					}
					while (extractor.FindNext());
				}
			}

			Console.WriteLine();
			Console.WriteLine("Press any key to continue...");
			Console.ReadLine();
		}
	}
}

How to find regexp in PDF in VBScript (Visual Basic 6)

' Searches every page of a PDF document for a regular expression and
' reports each match in a message box.
Dim extractor, searchPattern, totalPages, pageIndex, matchText

' Instantiate the Bytescout.PDFExtractor.TextExtractor COM object and register it
Set extractor = CreateObject("Bytescout.PDFExtractor.TextExtractor")
extractor.RegistrationName = "demo"
extractor.RegistrationKey = "demo"

' Open the sample PDF document
extractor.LoadDocumentFromFile "sample.pdf"

' Interpret the search string as a regular expression
extractor.RegexSearch = True

' Matches numbers in a format like 12,123,000 or 12.10.2010
' Complete regular expressions patterns reference:
' https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
searchPattern = "\d{1,3}[\.,]+\d{2,3}[\.,]+\d{2,4}"

' Number of pages in the loaded document
totalPages = extractor.GetPageCount()

' Page indexes are zero-based
For pageIndex = 0 To totalPages - 1

	' Find arguments: page index, string to find, case sensitivity
	If extractor.Find(pageIndex, searchPattern, False) Then
		Do
			' Show the text of the current match
			matchText = extractor.FoundText.Text
			MsgBox "Found match on page #" & CStr(pageIndex) & ": " & matchText
			extractor.ResetExtractionArea
		' Keep going while the page has further matches
		Loop While extractor.FindNext
	End If

Next

MsgBox "Done"

' Release the COM object
Set extractor = Nothing

Tutorials:

prev
next