How to extract text from a specific area by coordinates in PDF in C#, VB.NET and VBScript using ByteScout PDF Extractor SDK

  • Home
  • /
  • Articles
  • /
  • How to extract text from a specific area by coordinates in PDF in C#, VB.NET and VBScript using ByteScout PDF Extractor SDK

With PDF Extractor SDK, you can extract text from a specific rectangular area of a PDF document defined by coordinates. Check the samples below to learn how to extract text by coordinates in C#, VB.NET and VBScript using ByteScout PDF Extractor SDK.

Also, check this tutorial to learn how to extract text from PDF by keyword.

Select your programming language:

How to extract text from PDF by coordinates in C#

using System;
using System.IO;
using System.Text;
using Bytescout.PDFExtractor;
using System.Drawing;
using System.Diagnostics;

namespace Example
{
    /// <summary>
    /// Console sample that prints the text found inside a fixed rectangular
    /// area on every page of a PDF document.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Loads "../../sample2.pdf", extracts the text bounded by a 200x200
        /// rectangle at (0, 0) from each page, and writes it to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // The extraction rectangle (x, y, width, height) is identical for
            // every page, so it is built once rather than on each iteration.
            RectangleF location = new RectangleF(0, 0, 200, 200);

            // The two "demo" arguments are presumably the registration name and key.
            // The using statement releases the extractor even if extraction throws.
            using (TextExtractor extractor = new TextExtractor("demo", "demo"))
            {
                // load the document
                extractor.LoadDocumentFromFile("../../sample2.pdf");

                // get page count
                int pageCount = extractor.GetPageCount();

                // iterate through pages (page indexes are zero-based here)
                for (int i = 0; i < pageCount; i++)
                {
                    // restrict extraction to the rectangle
                    extractor.SetExtractionArea(location);

                    // extract text bounded by the extraction area
                    string extractedString = extractor.GetTextFromPage(i);

                    Console.WriteLine("Extracted from page #" + i + ":\r\n" + extractedString);

                    // reset extraction area before moving on to the next page
                    extractor.ResetExtractionArea();

                    Console.WriteLine("\r\n");
                }
            }

            Console.WriteLine("Press any key to exit...");
            Console.ReadKey();
        }
    }
}

How to extract text from PDF by coordinates in Visual Basic .NET


Imports System.Drawing
Imports System.IO
Imports Bytescout.PDFExtractor



Class Program
	''' <summary>
	''' Console sample: extracts the text bounded by a 200x200 rectangle at (0, 0)
	''' from page #0 of "sample2.pdf" and writes it to the console.
	''' </summary>
	''' <param name="args">Command-line arguments (not used).</param>
	Friend Shared Sub Main(args As String())

		' Create Bytescout.PDFExtractor.TextExtractor instance.
		' The Using block releases the extractor even if extraction throws.
		Using extractor As New TextExtractor()
			extractor.RegistrationName = "demo"
			extractor.RegistrationKey = "demo"

			' Load sample PDF document
			extractor.LoadDocumentFromFile("sample2.pdf")

			' Define the rectangle to read from: x = 0, y = 0, width = 200, height = 200
			Dim location As New RectangleF(0, 0, 200, 200)

			' Restrict the extractor to this rectangle
			extractor.SetExtractionArea(location)

			' Get the text inside the rectangle on page #0
			Dim extractedString As String = extractor.GetTextFromPage(0)

			' VB string literals have no backslash escapes, so "\r\n" would be printed
			' literally; Environment.NewLine emits a real line break instead.
			Console.WriteLine("Extracted from page #0:" & Environment.NewLine & extractedString)
		End Using

	End Sub
End Class

How to extract text from PDF by coordinates in VBScript (Visual Basic 6)

' Instantiate the TextExtractor COM object and set the registration details
Set extractor = CreateObject("Bytescout.PDFExtractor.TextExtractor")
extractor.RegistrationName = "demo"
extractor.RegistrationKey = "demo"

' Open the sample PDF document
Call extractor.LoadDocumentFromFile("..\..\sample1.pdf")

' Number of pages to scan
totalPages = extractor.GetPageCount()

For pageIndex = 0 To totalPages - 1

	' Search the current page for the phrase; the body runs once per occurrence
	If extractor.Find(pageIndex, "ALIQUIP EX EA COMMODO", false) Then

		Do
			' Read the rectangle of the occurrence that was just found
			foundLeft = extractor.GetFoundTextRectangle_Left
			foundTop = extractor.GetFoundTextRectangle_Top
			foundWidth = extractor.GetFoundTextRectangle_Width
			foundHeight = extractor.GetFoundTextRectangle_Height

			' Report where the phrase was found
			foundMessage = "Found on page #" & CStr(pageIndex) & " at left=" & CStr(foundLeft) & "; top=" & CStr(foundTop) & "; width=" & CStr(foundWidth) & "; height=" & CStr(foundHeight)
			Wscript.echo foundMessage

			' Extract the text from the reported rectangle so it can be compared with the phrase
			extractor.SetExtractionArea foundLeft, foundTop, foundWidth, foundHeight
			areaText = extractor.GetTextFromPage(pageIndex)
			Wscript.echo "Extracted text: " & areaText
			extractor.ResetExtractionArea

		' Keep going while further occurrences are reported
		Loop Until Not extractor.FindNext

	End If
Next

' Release the COM object
Set extractor = Nothing

prev
next