How to find table in PDF and extract it as XML in C# and VBScript using PDF Extractor SDK - ByteScout

How to find table in PDF and extract it as XML in C# and VBScript using PDF Extractor SDK

  • Home
  • /
  • Articles
  • /
  • How to find table in PDF and extract it as XML in C# and VBScript using PDF Extractor SDK

ByteScout PDF Extractor SDK can be used to find a table in a PDF document and extract it into XML format. Use the sample source code below to find tables in a PDF document and extract them as XML in C# and VBScript.

If you need to extract your PDF as CSV, check this tutorial.

How to find table in PDF and extract it as XML in C#

[vb]
using System;
using Bytescout.PDFExtractor;

namespace ExtractTextByPages
{
    /// <summary>
    /// Sample program: finds every table on every page of "sample3.pdf" and
    /// saves each one to its own XML file named "page-{page}-table-{n}.XML".
    /// </summary>
    class Program
    {
        /// <summary>
        /// Entry point. Detects tables with <c>TableDetector</c>, restricts the
        /// <c>XMLExtractor</c> to each detected rectangle, writes the table as XML,
        /// and finally opens the first generated file (if any) in the default viewer.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            const string inputFile = "sample3.pdf";

            // Name of the first XML file actually written; stays null when no table is found.
            string firstOutputFile = null;

            // Both SDK objects hold the loaded document, so dispose them deterministically.
            using (XMLExtractor extractor = new XMLExtractor())
            using (TableDetector tdetector = new TableDetector())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey = "demo";

                tdetector.RegistrationKey = "demo";
                tdetector.RegistrationName = "demo";

                // Define what counts as a table:
                // require at least 3 columns...
                tdetector.DetectionMinNumberOfColumns = 3;

                // ...and at least 3 rows
                tdetector.DetectionMinNumberOfRows = 3;

                // Load the same sample PDF document into both objects
                extractor.LoadDocumentFromFile(inputFile);
                tdetector.LoadDocumentFromFile(inputFile);

                // Get page count
                int pageCount = tdetector.GetPageCount();

                for (int i = 0; i < pageCount; i++)
                {
                    // Skip pages without any table
                    if (!tdetector.FindTable(i))
                    {
                        continue;
                    }

                    // 1-based index of the table within the current page
                    int j = 1;

                    do
                    {
                        // Restrict the XML extractor to the rectangle reported by the table detector
                        extractor.SetExtractionArea(
                            tdetector.GetFoundTableRectangle_Left(),
                            tdetector.GetFoundTableRectangle_Top(),
                            tdetector.GetFoundTableRectangle_Width(),
                            tdetector.GetFoundTableRectangle_Height());

                        // Save the table into its own XML file
                        string outputFile = "page-" + i + "-table-" + j + ".XML";
                        extractor.SavePageXMLToFile(i, outputFile);

                        if (firstOutputFile == null)
                        {
                            firstOutputFile = outputFile;
                        }

                        // Reset the extraction area so it does not leak into later extractions
                        extractor.ResetExtractionArea();

                        j++;
                    } while (tdetector.FindNextTable()); // search next table on this page
                }
            }

            // Open the first output file in the default associated application.
            // Guarded: when no table was detected there is nothing to open, and
            // starting a non-existent file would throw.
            if (firstOutputFile != null && System.IO.File.Exists(firstOutputFile))
            {
                // UseShellExecute must be true to open a document (not an executable)
                // on .NET Core / .NET 5+; it is already the default on .NET Framework.
                System.Diagnostics.ProcessStartInfo startInfo =
                    new System.Diagnostics.ProcessStartInfo(firstOutputFile);
                startInfo.UseShellExecute = true;
                System.Diagnostics.Process.Start(startInfo);
            }
            else
            {
                Console.WriteLine("No tables were found in " + inputFile + ".");
            }
        }
    }
}
[/vb]

How to find table in PDF and extract it as XML in VBScript (Visual Basic 6)

[vb]
' Finds every table on every page of sample3.pdf and saves each one
' as a separate XML file, reporting progress through message boxes.

' Create Bytescout.PDFExtractor.TableDetector object
Set tdetector = CreateObject("Bytescout.PDFExtractor.TableDetector")
tdetector.RegistrationName = "demo"
tdetector.RegistrationKey = "demo"

' Create Bytescout.PDFExtractor.XMLExtractor object
Set extractor = CreateObject("Bytescout.PDFExtractor.XMLExtractor")
extractor.RegistrationName = "demo"
extractor.RegistrationKey = "demo"

' Define what counts as a table:
' require at least 3 columns...
tdetector.DetectionMinNumberOfColumns = 3

' ...and at least 3 rows
tdetector.DetectionMinNumberOfRows = 3

' Load sample PDF document into table detector
tdetector.LoadDocumentFromFile "..\..\sample3.pdf"

' Load sample PDF document into XML extractor
extractor.LoadDocumentFromFile "..\..\sample3.pdf"

' Get page count
pageCount = tdetector.GetPageCount()

' Page indexes are zero-based
For i = 0 To pageCount - 1

    ' FindTable takes the page index and reports whether a table was found on it
    If tdetector.FindTable(i) Then
        Do
            MsgBox "Found a table on page #" & CStr(i) & " at left=" & CStr(tdetector.GetFoundTableRectangle_Left) & "; top=" & CStr(tdetector.GetFoundTableRectangle_Top) & "; width=" & CStr(tdetector.GetFoundTableRectangle_Width) & "; height=" & CStr(tdetector.GetFoundTableRectangle_Height)

            ' Set extraction area to the found table rectangle to extract table data as XML
            extractor.SetExtractionArea tdetector.GetFoundTableRectangle_Left, tdetector.GetFoundTableRectangle_Top, tdetector.GetFoundTableRectangle_Width, tdetector.GetFoundTableRectangle_Height

            ' Define filename to save XML (page index plus the table's top coordinate)
            XMLFileName = "page-" & CStr(i) & "-table-at-" & CStr(tdetector.GetFoundTableRectangle_Top) & ".xml"

            ' Save XML from this page (bounded by extraction area) into file
            extractor.SavePageXMLToFile i, XMLFileName

            MsgBox "Table saved into XML as " & XMLFileName

            ' Reset extraction area on the page
            extractor.ResetExtractionArea

        ' Continue with the next table on the same page, if any
        Loop While tdetector.FindNextTable
    End If

Next

MsgBox "Done"

' Release both COM objects
Set extractor = Nothing
Set tdetector = Nothing
[/vb]

Tutorials:

prev
next