How to find table in PDF and extract it as XML in C# and VBScript using PDF Extractor SDK - ByteScout
Announcement
Our ByteScout SDK products are sunsetting as we focus on expanding new solutions.
Learn More Open modal
Close modal
Announcement Important Update
ByteScout SDK Sunsetting Notice
Our ByteScout SDK products are sunsetting as we focus on our new & improved solutions. Thank you for being part of our journey, and we look forward to supporting you in this next chapter!

How to find table in PDF and extract it as XML in C# and VBScript using PDF Extractor SDK

  • Home
  • /
  • Articles
  • /
  • How to find table in PDF and extract it as XML in C# and VBScript using PDF Extractor SDK

ByteScout PDF Extractor SDK can be used to find tables in a PDF document and extract them into XML format. Use the sample source code below to find tables in a PDF and extract them as XML in C# and VBScript.

If you need to extract your PDF as CSV, check this tutorial.

How to find table in PDF and extract it as XML in C#

using System;
using Bytescout.PDFExtractor;

namespace ExtractTextByPages
{
    /// <summary>
    /// Console sample: detects tables on every page of a PDF document and saves
    /// each detected table into its own XML file.
    /// </summary>
    class Program
    {
        // PDF document that is scanned for tables.
        private const string InputFile = "sample3.pdf";

        // Name the loop below gives to the first table found on the first page.
        private const string FirstOutputFile = "page-0-table-1.XML";

        /// <summary>
        /// Loads the sample PDF into a table detector and an XML extractor, then for
        /// every table the detector reports writes "page-{page}-table-{n}.XML".
        /// Opens the first output file afterwards if it was actually produced.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // Dispose both SDK objects when done; the original sample never released them.
            using (XMLExtractor extractor = new XMLExtractor())
            using (TableDetector tdetector = new TableDetector())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey = "demo";

                tdetector.RegistrationName = "demo";
                tdetector.RegistrationKey = "demo";

                // Define what counts as a table: at least 3 columns...
                tdetector.DetectionMinNumberOfColumns = 3;

                // ...and at least 3 rows.
                tdetector.DetectionMinNumberOfRows = 3;

                // Load the same document into both objects: the detector locates
                // tables, the extractor produces the XML.
                extractor.LoadDocumentFromFile(InputFile);
                tdetector.LoadDocumentFromFile(InputFile);

                int pageCount = tdetector.GetPageCount();

                for (int pageIndex = 0; pageIndex < pageCount; pageIndex++)
                {
                    // Nothing to do for pages without a table.
                    if (!tdetector.FindTable(pageIndex))
                    {
                        continue;
                    }

                    // Table numbering restarts at 1 on every page.
                    int tableNumber = 1;

                    do
                    {
                        // Restrict the XML extractor to the rectangle reported by the detector.
                        extractor.SetExtractionArea(
                            tdetector.GetFoundTableRectangle_Left(),
                            tdetector.GetFoundTableRectangle_Top(),
                            tdetector.GetFoundTableRectangle_Width(),
                            tdetector.GetFoundTableRectangle_Height());

                        // Save the table into its own XML file.
                        extractor.SavePageXMLToFile(
                            pageIndex,
                            "page-" + pageIndex + "-table-" + tableNumber + ".XML");

                        tableNumber++;
                    } while (tdetector.FindNextTable()); // search next table on this page
                }
            }

            // Open the first output file in the default associated application, but only
            // if it exists: when page 0 has no table the file is never written and
            // Process.Start would throw.
            if (System.IO.File.Exists(FirstOutputFile))
            {
                // UseShellExecute must be true to open a document by file association;
                // it defaults to false on .NET Core and later.
                System.Diagnostics.Process.Start(
                    new System.Diagnostics.ProcessStartInfo(FirstOutputFile) { UseShellExecute = true });
            }
        }
    }
}

How to find table in PDF and extract it as XML in VBScript (Visual Basic 6)

' Finds tables on every page of a PDF document and saves each one as XML.
' One XML file is written per detected table, named after the page index and
' the table's top coordinate.

' Create Bytescout.PDFExtractor.TableDetector object
Set tdetector = CreateObject("Bytescout.PDFExtractor.TableDetector")
tdetector.RegistrationName = "demo"
tdetector.RegistrationKey = "demo"

' Create Bytescout.PDFExtractor.XMLExtractor object
Set extractor = CreateObject("Bytescout.PDFExtractor.XMLExtractor")
extractor.RegistrationName = "demo"
extractor.RegistrationKey = "demo"

' Define what kind of tables should be detected:
' set the minimum required number of columns to 3
tdetector.DetectionMinNumberOfColumns = 3

' and set the minimum required number of rows to 3
tdetector.DetectionMinNumberOfRows = 3

' Load sample PDF document into table detector
tdetector.LoadDocumentFromFile "..\..\sample3.pdf"

' Load sample PDF document into XML extractor
extractor.LoadDocumentFromFile "..\..\sample3.pdf"

' Get page count
pageCount = tdetector.GetPageCount()

For i = 0 To pageCount - 1

	' Look for the first table on page i (the only parameter is the page index)
	If tdetector.FindTable(i) Then
		Do
			MsgBox "Found a table on page #" & CStr(i) & " at left=" & CStr(tdetector.GetFoundTableRectangle_Left) & "; top=" & CStr(tdetector.GetFoundTableRectangle_Top) & "; width=" & CStr(tdetector.GetFoundTableRectangle_Width) & "; height=" & CStr(tdetector.GetFoundTableRectangle_Height)

			' set extraction area to extract table data as XML
			extractor.SetExtractionArea tdetector.GetFoundTableRectangle_Left, tdetector.GetFoundTableRectangle_Top, tdetector.GetFoundTableRectangle_Width, tdetector.GetFoundTableRectangle_Height

			' define filename to save XML
			XMLFileName = "page-" & CStr(i) & "-table-at-" & CStr(tdetector.GetFoundTableRectangle_Top) & ".xml"

			' save XML from this page (bounded by extraction area) into file
			extractor.SavePageXMLToFile i, XMLFileName

			MsgBox "Table saved into XML as " & XMLFileName

			' reset extraction area on the page
			extractor.ResetExtractionArea

		' keep going while the detector finds another table on the same page
		Loop While tdetector.FindNextTable
	End If

Next

MsgBox "Done"

' Release both COM objects (the original released only the table detector)
Set extractor = Nothing
Set tdetector = Nothing

Tutorials:

prev
next