How to search in PDF using regular expressions in C# and VBScript with PDF Extractor SDK - ByteScout

How to search in PDF using regular expressions in C# and VBScript with PDF Extractor SDK

  • Home
  • /
  • Articles
  • /
  • How to search in PDF using regular expressions in C# and VBScript with PDF Extractor SDK

The samples below show how to find text in a PDF document using regular expressions with ByteScout PDF Extractor SDK in C# and VBScript.

How to search a PDF using a regular expression in C#

[vb]
using System;
using System.Drawing;
using Bytescout.PDFExtractor;

namespace FindText
{
    /// <summary>
    /// Console sample that searches every page of "sample1.pdf" for a regular
    /// expression and prints the location and font details of each match.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Entry point: loads the sample PDF, runs the regex search page by page,
        /// writes every match to the console, then waits for the user to press Enter.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Matches text that starts with LABORIS and ends with VELIT.
            // The pattern is the same for every page, so it is defined once here
            // rather than inside the page loop.
            // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
            const string regexPattern = "LABORIS.*VELIT";

            // Create Bytescout.PDFExtractor.TextExtractor instance.
            // The using block guarantees the extractor is disposed even if
            // loading or searching throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample1.pdf");

                int pageCount = extractor.GetPageCount();

                // Turn on the regular expression search so that Find() treats
                // its text argument as a pattern.
                extractor.RegexSearch = true;

                // Search through pages (page indexes are zero-based)
                for (int i = 0; i < pageCount; i++)
                {
                    // Find() arguments: page index, pattern, case sensitivity.
                    // Skip pages without any match.
                    if (!extractor.Find(i, regexPattern, false))
                    {
                        continue;
                    }

                    // At least one match exists on this page; FindNext() advances
                    // to the following match and returns false when there are no more.
                    do
                    {
                        Console.WriteLine("");
                        Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());
                        Console.WriteLine("");

                        // Iterate through each element in the found text
                        foreach (SearchResultElement element in extractor.FoundText.Elements)
                        {
                            Console.WriteLine("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height);
                            Console.WriteLine("Text: " + element.Text);
                            Console.WriteLine("Font is bold: " + element.FontIsBold);
                            Console.WriteLine("Font is italic:" + element.FontIsItalic);
                            Console.WriteLine("Font name: " + element.FontName);
                            Console.WriteLine("Font size:" + element.FontSize);
                            Console.WriteLine("Font color:" + element.FontColor);
                        }
                    }
                    while (extractor.FindNext());
                }
            }

            // Keep the console window open until the user presses Enter.
            Console.WriteLine();
            Console.WriteLine("Press any key to continue…");
            Console.ReadLine();
        }
    }
}
[/vb]

How to search a PDF using a regular expression in VBScript (Visual Basic 6)

[vb]
' Searches every page of "sample.pdf" for a regular expression and shows each
' match in a message box.

' Create Bytescout.PDFExtractor.TextExtractor object
Set extractor = CreateObject("Bytescout.PDFExtractor.TextExtractor")
extractor.RegistrationName = "demo"
extractor.RegistrationKey = "demo"

' Load sample PDF document
extractor.LoadDocumentFromFile("sample.pdf")

' Turn on the regex search so that Find treats its text argument as a pattern
extractor.RegexSearch = True

' Search for numbers in a format like 12,123,000 or 12.10.2010
' Complete regular expressions patterns reference: https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
pattern = "\d{1,3}[\.,]+\d{2,3}[\.,]+\d{2,4}"

' Get page count
pageCount = extractor.GetPageCount()

' Page indexes are zero-based
For i = 0 To pageCount - 1

    ' Parameters are: page index, string to find, case sensitivity
    If extractor.Find(i, pattern, False) Then
        ' At least one match exists on this page; FindNext moves to the next
        ' match and returns False when there are no more.
        Do
            extractedString = extractor.FoundText.Text
            MsgBox "Found match on page #" & CStr(i) & ": " & extractedString
            extractor.ResetExtractionArea()
        Loop While extractor.FindNext
    End If

Next

MsgBox "Done"

' Release the COM object
Set extractor = Nothing
[/vb]

Tutorials:

prev
next