How to extract text from columns in PDF by coordinates in ASP.NET, C#, VB.NET and VBScript using ByteScout PDF Extractor SDK

  • Home
  • /
  • Articles
  • /
  • How to extract text from columns in PDF by coordinates in ASP.NET, C#, VB.NET and VBScript using ByteScout PDF Extractor SDK

With PDF Extractor SDK, you can extract text from individual columns of a PDF document by setting the extraction area to each column's coordinates. This sample code shows how to extract text from three equal-width columns of a PDF page in ASP.NET, C#, VB.NET, and VBScript.

Select your programming language:

How to extract text from columns in PDF by coordinates in ASP.NET

using System;
using System.Data;
using System.Configuration;
using System.Collections;
using System.IO;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using Bytescout.PDFExtractor;

namespace ExtractAllText
{
	/// <summary>
	/// Sample page that extracts the text of the first page of "columns.pdf"
	/// one column at a time and writes it to the HTTP response.
	/// </summary>
	public partial class _Default : System.Web.UI.Page
	{
		/// <summary>
		/// Splits the first PDF page into equal-width vertical columns and streams
		/// the text of each column, left to right, to the response output stream.
		/// </summary>
		/// <param name="sender">Event source (unused).</param>
		/// <param name="e">Event data (unused).</param>
		protected void Page_Load(object sender, EventArgs e)
		{
			// The sample assumes the page holds this many columns, equally distributed.
			const int columnCount = 3;

			// This test file will be copied to the project directory on the pre-build event (see the project properties).
			String inputFile = Server.MapPath("columns.pdf");

			// Create the extractor inside a using block so it is released
			// even if extraction throws.
			using (TextExtractor extractor = new TextExtractor())
			{
				extractor.RegistrationName = "demo";
				extractor.RegistrationKey = "demo";

				// Load sample PDF document
				extractor.LoadDocumentFromFile(inputFile);

				// Read the size of the very first page (zero index).
				// Kept as float so the column width is not truncated by integer division.
				float pageWidth = extractor.GetPageRect_Width(0);
				float pageHeight = extractor.GetPageRect_Height(0);

				// Width of one column: page width divided by the number of columns.
				float columnWidth = pageWidth / columnCount;

				Response.Clear();
				// NOTE(review): the extracted content is plain text but is served as
				// "text/html"; confirm whether "text/plain" is the intended type.
				Response.ContentType = "text/html";

				// Iterate through the columns from left to right.
				for (int i = 0; i < columnCount; i++)
				{
					// Set the extraction area to column #i (full page height).
					extractor.SetExtractionArea(i * columnWidth, 0, columnWidth, pageHeight);

					// Save extracted text to output stream
					extractor.SavePageTextToStream(0, Response.OutputStream);
				}
			}

			Response.End();
		}
	}
}

How to extract text from columns in PDF by coordinates in C#

using System;
using Bytescout.PDFExtractor;

namespace ExtractAllText
{
	/// <summary>
	/// Console sample that extracts the text of the first page of "columns.pdf"
	/// one column at a time, saving each column to its own text file.
	/// </summary>
	class Program
	{
		/// <summary>
		/// Splits the first PDF page into equal-width vertical columns, writes the text
		/// of column i to "columns-column{i}.txt" and opens each file in the default
		/// associated application.
		/// </summary>
		/// <param name="args">Command-line arguments (unused).</param>
		static void Main(string[] args)
		{
			// The sample assumes the page holds this many columns, equally distributed.
			const int columnCount = 3;

			// Create the extractor inside a using block so it is released
			// even if extraction throws.
			using (TextExtractor extractor = new TextExtractor())
			{
				extractor.RegistrationName = "demo";
				extractor.RegistrationKey = "demo";

				// Load sample PDF document
				extractor.LoadDocumentFromFile("columns.pdf");

				// Read the size of the very first page (zero index).
				float pageWidth = extractor.GetPageRect_Width(0);
				float pageHeight = extractor.GetPageRect_Height(0);

				// Width of one column: page width divided by the number of columns.
				float columnWidth = pageWidth / columnCount;

				// Iterate through the columns from left to right.
				for (int i = 0; i < columnCount; i++)
				{
					// Set the extraction area to column #i (full page height).
					extractor.SetExtractionArea(i * columnWidth, 0, columnWidth, pageHeight);

					string outFileName = "columns-column" + i + ".txt";
					extractor.SavePageTextToFile(0, outFileName);

					// Open output file in default associated application
					System.Diagnostics.Process.Start(outFileName);
				}
			}
		}
	}
}

How to extract text from columns in PDF by coordinates in Visual Basic .NET

Imports Bytescout.PDFExtractor
Imports System.Drawing

''' <summary>
''' Console sample that extracts the text of the first page of "columns.pdf"
''' one column at a time, saving each column to its own text file.
''' </summary>
Class Program

	''' <summary>
	''' Splits the first PDF page into equal-width vertical columns, writes the text
	''' of column i to "columns-column{i}.txt" and opens each file in the default viewer.
	''' </summary>
	''' <param name="args">Command-line arguments (unused).</param>
	Friend Shared Sub Main(args As String())

		' The sample assumes the page holds this many columns, equally distributed.
		Const ColumnCount As Integer = 3

		' Create the extractor in a Using block so it is released
		' even if extraction throws.
		Using extractor As New TextExtractor()
			extractor.RegistrationName = "demo"
			extractor.RegistrationKey = "demo"

			' Load sample PDF document
			extractor.LoadDocumentFromFile("columns.pdf")

			' Read the size of the very first page (zero index).
			' Kept as Single so the column width is not rounded to a whole number.
			Dim pageWidth As Single = extractor.GetPageRect_Width(0)
			Dim pageHeight As Single = extractor.GetPageRect_Height(0)

			' Width of one column: page width divided by the number of columns.
			Dim columnWidth As Single = pageWidth / ColumnCount

			' Iterate through the columns from left to right.
			For i As Integer = 0 To ColumnCount - 1

				' Set the extraction area to column #i (full page height).
				extractor.SetExtractionArea(i * columnWidth, 0, columnWidth, pageHeight)

				' Save column to file. Use & (not +) so the Integer index is
				' concatenated as text instead of being treated as arithmetic.
				Dim outFileName As String = "columns-column" & i.ToString() & ".txt"
				extractor.SavePageTextToFile(0, outFileName)

				' Open generated file in default text viewer
				System.Diagnostics.Process.Start(outFileName)

			Next

		End Using

	End Sub
End Class

How to extract text from columns in PDF by coordinates in VBScript (Visual Basic 6)

' Extracts the text of the first page of columns.pdf one column at a time,
' saves each column to "columns-column<i>.txt" and opens the resulting files.

' The sample assumes the page holds this many columns, equally distributed.
Const COLUMN_COUNT = 3

Dim extractor, shell
Dim pageWidth, pageHeight, columnWidth
Dim outFileName, i

' Create Bytescout.PDFExtractor.TextExtractor object
Set extractor = CreateObject("Bytescout.PDFExtractor.TextExtractor")
extractor.RegistrationName = "demo"
extractor.RegistrationKey = "demo"

' Load sample PDF document
extractor.LoadDocumentFromFile "..\..\columns.pdf"

' Read the size of the very first page (zero index)
pageWidth = extractor.GetPageRect_Width(0)
pageHeight = extractor.GetPageRect_Height(0)

' Width of one column: page width divided by the number of columns
columnWidth = pageWidth / COLUMN_COUNT

' Create the shell once; it is reused to open every output file
Set shell = CreateObject("WScript.Shell")

' Iterate through the columns from left to right
For i = 0 To COLUMN_COUNT - 1

    ' Set the extraction area to column #i (full page height)
    extractor.SetExtractionArea i * columnWidth, 0, columnWidth, pageHeight

    outFileName = "columns-column" & CStr(i) & ".txt"
    extractor.SavePageTextToFile 0, outFileName

    ' Open output file in default associated application (do not wait for it to close)
    shell.Run outFileName, 1, false

Next

Set shell = Nothing
Set extractor = Nothing

MsgBox "Done"
prev
next