With PDF Extractor SDK, you can extract text from a specific rectangular area of a PDF document defined by coordinates. Check the samples below to learn how to extract text by coordinates in C#, VB.NET, and VBScript using ByteScout PDF Extractor SDK.
Also, check this tutorial to learn how to extract text from PDF by keyword.
Select your programming language:
using System;
using System.IO;
using System.Text;
using Bytescout.PDFExtractor;
using System.Drawing;
using System.Diagnostics;

namespace Example
{
    /// <summary>
    /// Console sample: prints the text found inside a fixed rectangular area
    /// on every page of a PDF document.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Loads "../../sample2.pdf", extracts the text bounded by the rectangle
        /// (0, 0, 200, 200) from each page, and writes it to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // The using statement releases the extractor (and the loaded document)
            // even if loading or extraction throws.
            using (TextExtractor extractor = new TextExtractor("demo", "demo"))
            {
                // load the document
                extractor.LoadDocumentFromFile("../../sample2.pdf");

                // get page count
                int pageCount = extractor.GetPageCount();

                // The extraction rectangle is the same for every page, so build it once.
                // NOTE(review): coordinate units are presumably PDF points — confirm in SDK docs.
                RectangleF location = new RectangleF(0, 0, 200, 200);

                // iterate through pages (the loop passes zero-based page indices)
                for (int i = 0; i < pageCount; i++)
                {
                    // restrict extraction to the rectangle
                    extractor.SetExtractionArea(location);

                    // extract text bounded by the extraction area
                    string extractedString = extractor.GetTextFromPage(i);

                    Console.WriteLine("Extracted from page #" + i + ":\r\n" + extractedString);

                    // reset extraction area to full page (by default)
                    extractor.ResetExtractionArea();

                    Console.WriteLine("\r\n");
                }
            }

            Console.WriteLine("Press any key to exit...");
            Console.ReadKey();
        }
    }
}
Imports System.Drawing
Imports System.IO
Imports Bytescout.PDFExtractor

''' <summary>
''' Console sample: prints the text found inside a fixed rectangular area
''' on the first page (index 0) of a PDF document.
''' </summary>
Class Program

    ''' <summary>
    ''' Loads "sample2.pdf", extracts the text bounded by the rectangle
    ''' (0, 0, 200, 200) from page #0, and writes it to the console.
    ''' </summary>
    ''' <param name="args">Command-line arguments (not used).</param>
    Friend Shared Sub Main(args As String())

        ' Create the Bytescout.PDFExtractor.TextExtractor instance.
        ' The Using block releases it even if loading or extraction throws.
        Using extractor As New TextExtractor()
            extractor.RegistrationName = "demo"
            extractor.RegistrationKey = "demo"

            ' Load sample PDF document
            extractor.LoadDocumentFromFile("sample2.pdf")

            ' Define the rectangle to read text from: located at 0,0 with width and height of 200x200
            Dim location As RectangleF = New RectangleF(0, 0, 200, 200)

            ' Restrict the extractor to this rectangle
            extractor.SetExtractionArea(location)

            ' Get the text inside the rectangle from page #0
            Dim extractedString As String = extractor.GetTextFromPage(0)

            ' Write the extracted text to the console.
            ' VB string literals have no backslash escapes, so "\r\n" would be printed
            ' verbatim; Environment.NewLine produces an actual line break.
            Console.WriteLine("Extracted from page #0:" & Environment.NewLine & extractedString)
        End Using

    End Sub

End Class
' Searches every page of the sample PDF for a phrase, reports the bounding
' rectangle of each match, and then re-extracts the text from that rectangle
' to show that extraction by coordinates returns the matched text.

Const SEARCH_TEXT = "ALIQUIP EX EA COMMODO"

' Create and register the TextExtractor COM object
Set extractor = CreateObject("Bytescout.PDFExtractor.TextExtractor")
extractor.RegistrationName = "demo"
extractor.RegistrationKey = "demo"

' Load sample PDF document
extractor.LoadDocumentFromFile("..\..\sample1.pdf")

' Walk through all pages (the loop passes zero-based page indices)
lastPage = extractor.GetPageCount() - 1

For pageIndex = 0 To lastPage

    ' Look for the first occurrence on this page, then keep asking for the next one
    hasMatch = extractor.Find(pageIndex, SEARCH_TEXT, False)

    Do While hasMatch

        ' Bounding rectangle of the current match
        matchLeft = extractor.GetFoundTextRectangle_Left
        matchTop = extractor.GetFoundTextRectangle_Top
        matchWidth = extractor.GetFoundTextRectangle_Width
        matchHeight = extractor.GetFoundTextRectangle_Height

        report = "Found on page #" & CStr(pageIndex)
        report = report & " at left=" & CStr(matchLeft)
        report = report & "; top=" & CStr(matchTop)
        report = report & "; width=" & CStr(matchWidth)
        report = report & "; height=" & CStr(matchHeight)
        WScript.Echo report

        ' Check that the same text is extracted from the returned coordinates
        extractor.SetExtractionArea matchLeft, matchTop, matchWidth, matchHeight
        WScript.Echo "Extracted text: " & extractor.GetTextFromPage(pageIndex)

        ' Restore the extraction area before searching for the next match
        extractor.ResetExtractionArea

        hasMatch = extractor.FindNext
    Loop

Next

' Release the COM object
Set extractor = Nothing