How to search in PDF using regular expressions in C# and VBScript with PDF Extractor SDK - ByteScout

How to search in PDF using regular expressions in C# and VBScript with PDF Extractor SDK

  • Home
  • /
  • Articles
  • /
  • How to search in PDF using regular expressions in C# and VBScript with PDF Extractor SDK

The samples below show how to find text in a PDF document using regular expressions with ByteScout PDF Extractor SDK in C# and VBScript.

How to find text in a PDF using a regular expression in C#

using System;
using System.Drawing;
using Bytescout.PDFExtractor;

namespace FindText
{
    /// <summary>
    /// Console sample: searches every page of a PDF document for text matching
    /// a regular expression and prints each match with its text elements.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Loads the sample document, runs the regular-expression search page by
        /// page and writes the results to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance
            TextExtractor extractor = new TextExtractor();
            extractor.RegistrationName = "demo";
            extractor.RegistrationKey = "demo";

            // Load sample PDF document.
            // NOTE(review): "sample.pdf" is a placeholder -- point this at your own file.
            extractor.LoadDocumentFromFile("sample.pdf");

            int pageCount = extractor.GetPageCount();

            // Turn on the regular expression search
            extractor.RegexSearch = true;

            // Matches text starting with the word LABORIS and ending with the word VELIT.
            // The pattern syntax presumably follows .NET regular expressions -- see the
            // .NET regular expression language reference for the complete syntax.
            string regexPattern = "LABORIS.*VELIT";

            // Search through pages
            for (int i = 0; i < pageCount; i++)
            {
                // Search the page for the pattern; the last argument is case sensitivity
                if (extractor.Find(i, regexPattern, false))
                {
                    // Find() positions on the first match, FindNext() advances to the
                    // following one, hence a do/while loop.
                    do
                    {
                        Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());

                        // Iterate through each element in the found text
                        foreach (SearchResultElement element in extractor.FoundText.Elements)
                        {
                            Console.WriteLine("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height);
                            Console.WriteLine("Text: " + element.Text);
                            Console.WriteLine("Font is bold: " + element.FontIsBold);
                            Console.WriteLine("Font is italic:" + element.FontIsItalic);
                            Console.WriteLine("Font name: " + element.FontName);
                            Console.WriteLine("Font size:" + element.FontSize);
                            Console.WriteLine("Font color:" + element.FontColor);
                        }
                    }
                    while (extractor.FindNext());
                }
            }

            Console.WriteLine("Press any key to continue...");
            // Keep the console window open until the user presses a key
            Console.ReadKey();
        }
    }
}

How to find text in a PDF using a regular expression in VBScript (Visual Basic 6)

' Searches every page of a PDF document for text matching a regular
' expression and shows each match in a message box.

' Create Bytescout.PDFExtractor.TextExtractor object
Set extractor = CreateObject("Bytescout.PDFExtractor.TextExtractor")
extractor.RegistrationName = "demo"
extractor.RegistrationKey = "demo"

' Load sample PDF document
' NOTE(review): "sample.pdf" is a placeholder -- point this at your own file.
extractor.LoadDocumentFromFile "sample.pdf"

extractor.RegexSearch = True ' turn on the regex search
pattern = "\d{1,3}[\.,]+\d{2,3}[\.,]+\d{2,4}" ' search for numbers in format like 12,123,000 or 12.10.2010

' Get page count
pageCount = extractor.GetPageCount()

For i = 0 To pageCount - 1
	If extractor.Find(i, pattern, False) Then ' parameters are: page index, string to find, case sensitivity
		' Find positions on the first match; FindNext advances to the following one
		Do
			extractedString = extractor.FoundText.Text
			MsgBox "Found match on page #" & CStr(i) & ": " & extractedString
		Loop While extractor.FindNext
	End If
Next

MsgBox "Done"

' Release the COM object
Set extractor = Nothing

