How to find PDF tables and extract them to CSV in C# and VBScript using PDF Extractor SDK

  • Home
  • /
  • Articles
  • /
  • How to find PDF tables and extract them to CSV in C# and VBScript using PDF Extractor SDK

Use the sample source code below to detect tables in PDF files and convert each detected table to a CSV file in C# and VBScript using PDF Extractor SDK.

C#

using System;
using Bytescout.PDFExtractor;

namespace ExtractTextByPages
{
	/// <summary>
	/// Console sample that detects tables on every page of "sample3.pdf" and saves
	/// each detected table to its own CSV file.
	/// </summary>
	class Program
	{
		/// <summary>
		/// Walks all pages of the sample document, asks the table detector for each
		/// table rectangle, restricts the CSV extractor to that rectangle and writes
		/// the result to "page-{pageIndex}-table-{n}.csv". The first file written
		/// (if any) is then opened in its associated application.
		/// </summary>
		/// <param name="args">Command-line arguments (unused).</param>
		static void Main(string[] args)
		{
			// Name of the first CSV actually written; stays null when no table is found.
			string firstOutputFile = null;

			// Dispose both SDK objects when done.
			// NOTE(review): relies on CSVExtractor and TableDetector implementing
			// IDisposable, as described in the SDK documentation.
			using (CSVExtractor extractor = new CSVExtractor())
			using (TableDetector tdetector = new TableDetector())
			{
				extractor.RegistrationName = "demo";
				extractor.RegistrationKey = "demo";

				tdetector.RegistrationName = "demo";
				tdetector.RegistrationKey = "demo";

				// Load the same sample PDF document into both objects
				extractor.LoadDocumentFromFile("sample3.pdf");
				tdetector.LoadDocumentFromFile("sample3.pdf");

				// Get page count
				int pageCount = tdetector.GetPageCount();

				for (int i = 0; i < pageCount; i++)
				{
					// Nothing to extract when the page has no table
					if (!tdetector.FindTable(i))
					{
						continue;
					}

					// 1-based table counter, restarted for every page
					int j = 1;

					do
					{
						// Set extraction area for CSV extractor to the rectangle given by table detector
						extractor.SetExtractionArea(
							tdetector.GetFoundTableRectangle_Left(),
							tdetector.GetFoundTableRectangle_Top(),
							tdetector.GetFoundTableRectangle_Width(),
							tdetector.GetFoundTableRectangle_Height());

						// Save the table into a CSV file
						string outputFile = "page-" + i + "-table-" + j + ".csv";
						extractor.SavePageCSVToFile(i, outputFile);

						if (firstOutputFile == null)
						{
							firstOutputFile = outputFile;
						}

						j++;
					} while (tdetector.FindNextTable()); // search next table on the same page
				}
			}

			if (firstOutputFile == null)
			{
				// Previously the sample tried to open "page-0-table-1.csv" unconditionally,
				// which throws when the first page contains no table.
				Console.WriteLine("No tables were found in sample3.pdf.");
				return;
			}

			// Open first output file in default associated application.
			// UseShellExecute must be true to launch a document (rather than an executable);
			// it is the default on .NET Framework but not on .NET Core / .NET 5+.
			System.Diagnostics.ProcessStartInfo startInfo = new System.Diagnostics.ProcessStartInfo(firstOutputFile);
			startInfo.UseShellExecute = true;
			System.Diagnostics.Process.Start(startInfo);
		}
	}
}

VBScript

' Detects tables on every page of the sample PDF and saves each detected table
' into its own CSV file, reporting progress with message boxes.

' Create Bytescout.PDFExtractor.TableDetector object
Set tdetector = CreateObject("Bytescout.PDFExtractor.TableDetector")
tdetector.RegistrationName = "demo"
tdetector.RegistrationKey = "demo"

' Create Bytescout.PDFExtractor.CSVExtractor object
Set extractor = CreateObject("Bytescout.PDFExtractor.CSVExtractor")
extractor.RegistrationName = "demo"
extractor.RegistrationKey = "demo"

' Load sample PDF document into table detector
' NOTE(review): path restored to "..\..\sample3.pdf"; the published "....sample3.pdf"
' appears to have lost its backslashes - confirm against your folder layout.
tdetector.LoadDocumentFromFile "..\..\sample3.pdf"

' Load sample PDF document into CSV extractor
extractor.LoadDocumentFromFile "..\..\sample3.pdf"

' Get page count
pageCount = tdetector.GetPageCount()

For i = 0 To pageCount - 1

	' Look for the first table on page i (the only parameter is the page index)
	If tdetector.FindTable(i) Then
		Do
			MsgBox "Found a table on page #" & CStr(i) & " at left=" & CStr(tdetector.GetFoundTableRectangle_Left) & "; top=" & CStr(tdetector.GetFoundTableRectangle_Top) & "; width=" & CStr(tdetector.GetFoundTableRectangle_Width) & "; height=" & CStr(tdetector.GetFoundTableRectangle_Height)

			' set extraction area to extract table data as CSV
			extractor.SetExtractionArea tdetector.GetFoundTableRectangle_Left, tdetector.GetFoundTableRectangle_Top, tdetector.GetFoundTableRectangle_Width, tdetector.GetFoundTableRectangle_Height

			' define filename to save CSV (page index plus the table's top coordinate)
			CSVFileName = "page-" & CStr(i) & "-table-at-" & CStr(tdetector.GetFoundTableRectangle_Top) & ".csv"

			' save CSV from this page (bounded by extraction area) into file
			extractor.SavePageCSVToFile i, CSVFileName

			MsgBox "Table saved into CSV as " & CSVFileName

			' reset extraction area on the page before handling the next table
			extractor.ResetExtractionArea

		' search for the next table on the same page
		Loop While tdetector.FindNextTable
	End If

Next

MsgBox "Done"

' Release both COM objects
Set extractor = Nothing
Set tdetector = Nothing


prev
next