With PDF Extractor SDK, you can extract text from individual columns of a PDF document by restricting the extraction area to one column at a time, where each area is defined by the column width. This sample code shows how to extract text from a PDF page laid out in 3 equal-width columns in ASP.NET, C#, VB.NET, and VBScript.
Select your programming language:
using System;
using System.Data;
using System.Configuration;
using System.Collections;
using System.IO;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using Bytescout.PDFExtractor;

namespace ExtractAllText
{
    /// <summary>
    /// Sample page that extracts the text of the first page of "columns.pdf"
    /// column by column and writes it straight into the HTTP response.
    /// </summary>
    public partial class _Default : System.Web.UI.Page
    {
        /// <summary>
        /// Splits the first PDF page into three equal-width vertical strips and
        /// streams the text found in each strip, left to right, to the response.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            // Number of equally sized columns assumed on the page.
            const int columnCount = 3;

            // This test file will be copied to the project directory on the
            // pre-build event (see the project properties).
            String inputFile = Server.MapPath("columns.pdf");

            // Create the extractor inside a using statement so it is released
            // even if loading or extraction throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(inputFile);

                // Read width and height of the very first page (zero index).
                // Kept as float (like the console sample) so the column width
                // is not truncated by integer division.
                float pageWidth = extractor.GetPageRect_Width(0);
                float pageHeight = extractor.GetPageRect_Height(0);

                // Width of one column = page width / number of columns.
                float columnWidth = pageWidth / columnCount;

                Response.Clear();
                // NOTE(review): the extracted text is written verbatim under an
                // HTML content type; consider "text/plain" or HTML-encoding the
                // output - confirm the intended rendering.
                Response.ContentType = "text/html";

                // Iterate through the columns, left to right.
                for (int i = 0; i < columnCount; i++)
                {
                    // Restrict extraction to the full-height strip of column #i.
                    extractor.SetExtractionArea(i * columnWidth, 0, columnWidth, pageHeight);

                    // Save extracted text to output stream
                    extractor.SavePageTextToStream(0, Response.OutputStream);
                }
            }

            Response.End();
        }
    }
}
using System;
using Bytescout.PDFExtractor;

namespace ExtractAllText
{
    /// <summary>
    /// Console sample: extracts the text of the first page of "columns.pdf"
    /// into one text file per column and opens each file.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Splits the first page into three equal-width vertical strips, saves the
        /// text of each strip to "columns-column{i}.txt" and opens the file in the
        /// application associated with it.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // Number of equally sized columns assumed on the page.
            const int columnCount = 3;

            // Create the extractor inside a using statement so it is released
            // even if loading or extraction throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("columns.pdf");

                // Read width and height of the very first page (zero index).
                float pageWidth = extractor.GetPageRect_Width(0);
                float pageHeight = extractor.GetPageRect_Height(0);

                // Width of one column = page width / number of columns.
                float columnWidth = pageWidth / columnCount;

                // Iterate through the columns, left to right.
                for (int i = 0; i < columnCount; i++)
                {
                    // Restrict extraction to the full-height strip of column #i.
                    extractor.SetExtractionArea(i * columnWidth, 0, columnWidth, pageHeight);

                    string outFileName = "columns-column" + i + ".txt";
                    extractor.SavePageTextToFile(0, outFileName);

                    // Open output file in default associated application
                    System.Diagnostics.Process.Start(outFileName);
                }
            }
        }
    }
}
Imports Bytescout.PDFExtractor
Imports System.Drawing

''' <summary>
''' Console sample: extracts the text of the first page of "columns.pdf"
''' into one text file per column and opens each file.
''' </summary>
Class Program

    ''' <summary>
    ''' Splits the first page into three equal-width vertical strips, saves the
    ''' text of each strip to "columns-column{i}.txt" and opens the file in the
    ''' default viewer.
    ''' </summary>
    ''' <param name="args">Command-line arguments (unused).</param>
    Friend Shared Sub Main(args As String())

        ' Number of equally sized columns assumed on the page.
        Const ColumnCount As Integer = 3

        ' Create the extractor inside a Using block so it is released
        ' even if loading or extraction throws.
        Using extractor As New TextExtractor()
            extractor.RegistrationName = "demo"
            extractor.RegistrationKey = "demo"

            ' Load sample PDF document
            extractor.LoadDocumentFromFile("columns.pdf")

            ' Read width and height of the very first page (zero index).
            ' Kept as Single so the values are not rounded to whole numbers.
            Dim pageWidth As Single = extractor.GetPageRect_Width(0)
            Dim pageHeight As Single = extractor.GetPageRect_Height(0)

            ' Width of one column = page width / number of columns.
            Dim columnWidth As Single = pageWidth / CSng(ColumnCount)

            ' Iterate through the columns, left to right.
            For i As Integer = 0 To ColumnCount - 1

                ' Restrict extraction to the full-height strip of column #i.
                extractor.SetExtractionArea(i * columnWidth, 0, columnWidth, pageHeight)

                ' Save column to file. Use & for concatenation: "+" between a
                ' String and an Integer attempts numeric addition and fails.
                Dim outFileName As String = "columns-column" & i.ToString() & ".txt"
                extractor.SavePageTextToFile(0, outFileName)

                ' Open generated file in default text viewer
                System.Diagnostics.Process.Start(outFileName)
            Next
        End Using

    End Sub

End Class
' Extracts the text of the first page of columns.pdf into one text file per
' column (three equal-width columns assumed) and opens each file.

' Instantiate the text extractor COM object and register it.
Set extractor = CreateObject("Bytescout.PDFExtractor.TextExtractor")
extractor.RegistrationName = "demo"
extractor.RegistrationKey = "demo"

' Open the sample PDF document.
extractor.LoadDocumentFromFile("..\..\columns.pdf")

' Dimensions of the very first page (zero index).
firstPageWidth = extractor.GetPageRect_Width (0)
firstPageHeight = extractor.GetPageRect_Height (0)

' The page is assumed to hold 3 equally distributed columns, so a single
' column is one third of the page width.
stripWidth = firstPageWidth / 3

' Walk through the 3 columns, left to right.
For columnIndex = 0 To 2
    ' Limit extraction to the full-height strip of the current column.
    stripLeft = columnIndex * stripWidth
    extractor.SetExtractionArea stripLeft, 0, stripWidth, firstPageHeight

    ' Write the column text to its own file and show it.
    targetFile = "columns-column"& CStr(columnIndex) & ".txt"
    extractor.SavePageTextToFile 0, targetFile
    OpenInAssociatedApp targetFile
Next

Set extractor = Nothing

MsgBox "Done"

' Opens the given file in its default associated application without
' waiting for that application to exit.
Sub OpenInAssociatedApp(path)
    Dim launcher
    Set launcher = CreateObject("WScript.Shell")
    launcher.Run path, 1, false
    Set launcher = Nothing
End Sub