With PDF Extractor SDK, you can extract text from a specific rectangular area of a PDF document defined by coordinates. Check the samples below to learn how to extract text by coordinates in C#, VB.NET and VBScript using ByteScout PDF Extractor SDK.
Also, check this tutorial to learn how to extract text from PDF by keyword.
Select your programming language:
using System;
using System.IO;
using System.Text;
using Bytescout.PDFExtractor;
using System.Drawing;
using System.Diagnostics;
namespace Example
{
    /// <summary>
    /// Console sample: prints the text found inside a fixed rectangular
    /// area of every page of a PDF document.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Loads "../../sample2.pdf", extracts the text bounded by the
        /// rectangle (0, 0, 200, 200) from each page, writes it to the
        /// console, then waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // The extraction rectangle is identical for every page, so build it once.
            RectangleF location = new RectangleF(0, 0, 200, 200);

            // Dispose the extractor deterministically instead of leaving the
            // resources it holds to the finalizer.
            using (TextExtractor extractor = new TextExtractor("demo", "demo"))
            {
                // load the document
                extractor.LoadDocumentFromFile("../../sample2.pdf");

                // get page count
                int pageCount = extractor.GetPageCount();

                // iterate through pages (page indexes are zero-based here)
                for (int i = 0; i < pageCount; i++)
                {
                    // restrict extraction to the rectangle
                    extractor.SetExtractionArea(location);

                    // extract text bounded by the extraction area
                    string extractedString = extractor.GetTextFromPage(i);
                    Console.WriteLine("Extracted from page #" + i + ":\r\n" + extractedString);

                    // reset extraction area to full page (by default)
                    extractor.ResetExtractionArea();
                    Console.WriteLine("\r\n");
                }
            }

            Console.WriteLine("Press any key to exit...");
            Console.ReadKey();
        }
    }
}
Imports System.Drawing
Imports System.IO
Imports Bytescout.PDFExtractor
''' <summary>
''' Console sample: prints the text found inside a fixed rectangular
''' area of the first page of a PDF document.
''' </summary>
Class Program
    ''' <summary>
    ''' Loads "sample2.pdf", extracts the text bounded by the rectangle
    ''' (0, 0, 200, 200) from page #0 and writes it to the console.
    ''' </summary>
    ''' <param name="args">Command-line arguments (not used).</param>
    Friend Shared Sub Main(args As String())
        ' Create Bytescout.PDFExtractor.TextExtractor instance; the Using block
        ' disposes it deterministically once extraction is finished.
        Using extractor As New TextExtractor()
            extractor.RegistrationName = "demo"
            extractor.RegistrationKey = "demo"

            ' Load sample PDF document
            extractor.LoadDocumentFromFile("sample2.pdf")

            ' Define the rectangle to read from: located at 0,0 with width and height of 200x200
            Dim location As RectangleF = New RectangleF(0, 0, 200, 200)

            ' Restrict the extractor to this rectangle
            extractor.SetExtractionArea(location)

            ' Get the text inside the rectangle from page #0
            Dim extractedString As String = extractor.GetTextFromPage(0)

            ' Write the extracted text to the console.
            ' VB.NET string literals have no backslash escapes, so "\r\n" would be
            ' printed verbatim; Environment.NewLine emits a real line break.
            Console.WriteLine("Extracted from page #0:" & Environment.NewLine & extractedString)
        End Using
    End Sub
End Class
' Instantiate the text extractor through COM and register it
Set extractor = CreateObject("Bytescout.PDFExtractor.TextExtractor")
extractor.RegistrationName = "demo"
extractor.RegistrationKey = "demo"

' Open the sample PDF document
extractor.LoadDocumentFromFile("..\..\sample1.pdf")

' Walk every page (zero-based index) looking for the phrase
lastPage = extractor.GetPageCount() - 1

For pageIndex = 0 To lastPage

    ' Start the search on this page, then keep going while matches remain
    hasMatch = extractor.Find(pageIndex, "ALIQUIP EX EA COMMODO", False)

    Do While hasMatch

        ' Bounding rectangle of the current match
        foundLeft = extractor.GetFoundTextRectangle_Left
        foundTop = extractor.GetFoundTextRectangle_Top
        foundWidth = extractor.GetFoundTextRectangle_Width
        foundHeight = extractor.GetFoundTextRectangle_Height

        WScript.Echo "Found on page #" & CStr(pageIndex) & " at left=" & CStr(foundLeft) & "; top=" & CStr(foundTop) & "; width=" & CStr(foundWidth) & "; height=" & CStr(foundHeight)

        ' Extract from the reported rectangle to check it yields the same text
        extractor.SetExtractionArea foundLeft, foundTop, foundWidth, foundHeight
        WScript.Echo "Extracted text: " & extractor.GetTextFromPage(pageIndex)
        extractor.ResetExtractionArea

        ' Advance to the next occurrence, if any
        hasMatch = extractor.FindNext

    Loop

Next

' Release the COM object
Set extractor = Nothing