With PDF Extractor SDK, you can extract text from individual columns of a PDF document by restricting the extraction area to each column's width. The sample code below shows how to extract text from a PDF page laid out in 3 equal-width columns, in ASP.NET, C#, VB.NET, and VBScript.
Select your programming language:
using System;
using System.Data;
using System.Configuration;
using System.Collections;
using System.IO;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using Bytescout.PDFExtractor;
namespace ExtractAllText
{
    /// <summary>
    /// Web Forms page that extracts the text of the first page of "columns.pdf"
    /// column by column and writes it to the HTTP response.
    /// </summary>
    public partial class _Default : System.Web.UI.Page
    {
        /// <summary>
        /// Splits the first PDF page into equal-width vertical strips and streams
        /// the text found in each strip, left to right, to the response body.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            // The page is assumed to hold this many equally distributed columns.
            const int columnCount = 3;

            // This test file will be copied to the project directory on the pre-build event (see the project properties).
            String inputFile = Server.MapPath("columns.pdf");

            // The using block releases the extractor (and the loaded document)
            // even if extraction throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(inputFile);

                // Read the size of the very first page (zero index). Kept as float,
                // like the console sample, so the column width is not truncated.
                float pageWidth = extractor.GetPageRect_Width(0);
                float pageHeight = extractor.GetPageRect_Height(0);

                // Width of one column = page width / number of columns.
                float columnWidth = pageWidth / columnCount;

                Response.Clear();
                Response.ContentType = "text/html";

                for (int column = 0; column < columnCount; column++)
                {
                    // Restrict extraction to the current column: full page height,
                    // one column wide, offset by the columns to its left.
                    extractor.SetExtractionArea(column * columnWidth, 0, columnWidth, pageHeight);

                    // Append the column's text to the response body.
                    extractor.SavePageTextToStream(0, Response.OutputStream);
                }
            }

            Response.End();
        }
    }
}
using System;
using Bytescout.PDFExtractor;
namespace ExtractAllText
{
    /// <summary>
    /// Console sample: extracts the text of the first page of "columns.pdf"
    /// into one text file per column and opens each file.
    /// </summary>
    class Program
    {
        // The page is assumed to hold this many equally distributed columns.
        private const int ColumnCount = 3;

        /// <summary>
        /// Splits the first PDF page into equal-width vertical strips, saves the
        /// text of each strip to "columns-column{i}.txt" and opens that file in
        /// its default associated application.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // The using block releases the extractor (and the loaded document)
            // even if extraction throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("columns.pdf");

                // Read the size of the very first page (zero index).
                float pageWidth = extractor.GetPageRect_Width(0);
                float pageHeight = extractor.GetPageRect_Height(0);

                // Width of one column = page width / number of columns.
                float columnWidth = pageWidth / ColumnCount;

                for (int i = 0; i < ColumnCount; i++)
                {
                    // Restrict extraction to column #i: full page height,
                    // one column wide, offset by the columns to its left.
                    extractor.SetExtractionArea(i * columnWidth, 0, columnWidth, pageHeight);

                    string outFileName = "columns-column" + i + ".txt";
                    extractor.SavePageTextToFile(0, outFileName);

                    // Open output file in default associated application
                    System.Diagnostics.Process.Start(outFileName);
                }
            }
        }
    }
}
Imports Bytescout.PDFExtractor
Imports Sysem.Drawing
''' <summary>
''' Console sample: extracts the text of the first page of "columns.pdf"
''' into one text file per column and opens each file.
''' </summary>
Class Program
    ''' <summary>
    ''' Splits the first PDF page into equal-width vertical strips, saves the
    ''' text of each strip to "columns-column{i}.txt" and opens that file in
    ''' the default text viewer.
    ''' </summary>
    ''' <param name="args">Command-line arguments (unused).</param>
    Friend Shared Sub Main(args As String())
        ' The page is assumed to hold this many equally distributed columns.
        Const columnCount As Integer = 3

        ' The Using block releases the extractor (and the loaded document)
        ' even if extraction throws.
        Using extractor As New TextExtractor()
            extractor.RegistrationName = "demo"
            extractor.RegistrationKey = "demo"

            ' Load sample PDF document
            extractor.LoadDocumentFromFile("columns.pdf")

            ' Read the size of the very first page (zero index). Kept as Single so
            ' the column width is not rounded by an implicit narrowing conversion.
            Dim pageWidth As Single = extractor.GetPageRect_Width(0)
            Dim pageHeight As Single = extractor.GetPageRect_Height(0)

            ' Width of one column = page width / number of columns.
            Dim columnWidth As Single = pageWidth / columnCount

            For i As Integer = 0 To columnCount - 1
                ' Restrict extraction to column #i: full page height,
                ' one column wide, offset by the columns to its left.
                extractor.SetExtractionArea(i * columnWidth, 0, columnWidth, pageHeight)

                ' Build the file name with "&": "+" between a String and an Integer
                ' attempts numeric addition and fails on the non-numeric text.
                Dim outFileName As String = "columns-column" & i.ToString() & ".txt"
                extractor.SavePageTextToFile(0, outFileName)

                ' Open generated file in default text viewer
                System.Diagnostics.Process.Start(outFileName)
            Next
        End Using
    End Sub
End Class
' Extracts the text of the first page of columns.pdf into one text file per
' column (columns-column0.txt .. columns-column2.txt) and opens each file.

' The page is assumed to hold this many equally distributed columns.
Const COLUMN_COUNT = 3

' Create Bytescout.PDFExtractor.TextExtractor object
Set extractor = CreateObject("Bytescout.PDFExtractor.TextExtractor")
extractor.RegistrationName = "demo"
extractor.RegistrationKey = "demo"

' Load sample PDF document
extractor.LoadDocumentFromFile("..\..\columns.pdf")

' Read the size of the very first page (zero index)
pageWidth = extractor.GetPageRect_Width (0)
pageHeight = extractor.GetPageRect_Height (0)

' Width of one column = page width / number of columns
columnWidth = pageWidth / COLUMN_COUNT

' Create the shell once; it is only used to open the generated files,
' so there is no need to re-create it on every iteration.
Set shell = CreateObject("WScript.Shell")

' Iterate through the columns
For i = 0 To COLUMN_COUNT - 1
    ' Restrict extraction to column #i: full page height,
    ' one column wide, offset by the columns to its left.
    extractor.SetExtractionArea i * columnWidth, 0, columnWidth, pageHeight

    outFileName = "columns-column" & CStr(i) & ".txt"
    extractor.SavePageTextToFile 0, outFileName

    ' Open output file in default associated application
    ' (window style 1 = normal window; False = do not wait for it to close)
    shell.Run outFileName, 1, false
Next

Set shell = Nothing
Set extractor = Nothing

MsgBox "Done"