ByteScout Data Extraction Suite – C# – PDF Invoice Parsing with PDF Extractor SDK

Home
/
Articles
/
ByteScout Data Extraction Suite – C# – PDF Invoice Parsing with PDF Extractor SDK

printable version:
ByteScout-Data-Extraction-Suite-C-sharp-PDF-Invoice-Parsing-with-PDF-Extractor-SDK.pdf

PDF invoice parsing with PDF extractor SDK in C# and ByteScout Data Extraction Suite

PDF invoice parsing with PDF extractor SDK in C#

This documentation is designed to help you apply these features in your own projects easily. ByteScout Data Extraction Suite can be used for PDF invoice parsing with PDF Extractor SDK in C#. ByteScout Data Extraction Suite is a set of three SDK products for extracting data from PDFs, scans, images, and spreadsheets: PDF Extractor SDK, Data Extraction SDK, and Barcode Reader SDK.

This rich C# sample source code for ByteScout Data Extraction Suite shows the various functions and options you need when calling the API to implement PDF invoice parsing with PDF Extractor SDK. Follow the step-by-step instructions to start from scratch, then copy and paste the C# code into your editor. Using ByteScout Data Extraction Suite in C# is also described in the documentation included with the product.

If you want to try other C# samples, a free trial version of ByteScout Data Extraction Suite is available on our website.

On-demand (REST Web API) version:
Web API (on-demand version)

On-premise offline SDK for Windows:
60 Day Free Trial (on-premise)

Program.cs

      using System;
using System.Drawing;
using Bytescout.PDFExtractor;

namespace InvoiceParsing
{
	/// <summary>
	/// This example demonstrates parsing and data extraction from a typical invoice:
	/// it locates the "Invoice No.", "Invoice Date" and "TOTAL" labels, reads the text
	/// to the right of each label, and extracts the tabular data between the "Quantity"
	/// header and the "TOTAL" line as XML.
	/// </summary>
	class Program
	{
		/// <summary>
		/// Loads "Invoice.pdf", extracts the invoice number, date, total and table data
		/// from its pages, prints them to the console and waits for a key press.
		/// </summary>
		/// <param name="args">Command-line arguments (not used).</param>
		static void Main(string[] args)
		{
			// Results
			string invoiceNo = string.Empty;
			string invoiceDate = string.Empty;
			string total = string.Empty;
			string tableData = string.Empty;

			// The "using" statements guarantee that both extractors are disposed
			// even if loading the document or extracting data throws.
			using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
			using (XMLExtractor xmlExtractor = new XMLExtractor("demo", "demo"))
			{
				// Set exact search (default is SmartSearch that works like in Adobe Reader)
				textExtractor.WordMatchingMode = WordMatchingMode.ExactMatch;

				// Load document
				textExtractor.LoadDocumentFromFile("Invoice.pdf");
				xmlExtractor.LoadDocumentFromFile("Invoice.pdf");

				// The page count does not change while iterating, so query it once
				int pageCount = textExtractor.GetPageCount();

				// Iterate pages
				for (int i = 0; i < pageCount; i++)
				{
					RectangleF pageRectangle = textExtractor.GetPageRectangle(i);
					// Table region for this page: full page width; top and height are filled in below
					RectangleF tableRect = new RectangleF(0, 0, pageRectangle.Width, 0);

					// Search for "Invoice No." and assume the text at right is the invoice number
					if (textExtractor.Find(i, "Invoice No.", false))
					{
						invoiceNo = ExtractTextRightOf(textExtractor, i, textExtractor.FoundText.Bounds, pageRectangle);
					}

					// Search for "Invoice Date" and extract text at right
					if (textExtractor.Find(i, "Invoice Date", false))
					{
						invoiceDate = ExtractTextRightOf(textExtractor, i, textExtractor.FoundText.Bounds, pageRectangle);
					}

					// Search for "Quantity" keyword to detect the top of the tabular data rectangle
					if (textExtractor.Find(i, "Quantity", false))
					{
						// Keep the top table coordinate
						tableRect.Y = textExtractor.FoundText.Bounds.Top; // use Bounds.Bottom if you want to skip column headers
					}

					// Search for "TOTAL" (it will be also the bottom of tabular data rectangle)
					if (textExtractor.Find(i, "TOTAL", true /* case sensitive! */))
					{
						RectangleF totalLabelRect = textExtractor.FoundText.Bounds;
						total = ExtractTextRightOf(textExtractor, i, totalLabelRect, pageRectangle);

						// Calculate the table height: from the table top down to the top of the "TOTAL" label
						tableRect.Height = totalLabelRect.Top - tableRect.Top;
					}

					// Extract tabular data using XMLExtractor (only when a non-empty region was detected)
					if (tableRect.Height > 0)
					{
						xmlExtractor.SetExtractionArea(tableRect);
						tableData = xmlExtractor.GetXMLFromPage(i);
					}
				}
			}

			// Display extracted data
			Console.WriteLine("Invoice No.: " + invoiceNo);
			Console.WriteLine("Invoice Date: " + invoiceDate);
			Console.WriteLine("TOTAL: " + total);
			Console.WriteLine("Table Data: ");
			Console.WriteLine(tableData);

			Console.WriteLine();
			Console.WriteLine("Press any key...");
			Console.ReadKey();
		}

		/// <summary>
		/// Extracts the text located to the right of a label on the given page.
		/// The extraction region starts at the label's right edge, keeps the label's
		/// vertical position and height, and extends to the right edge of the page.
		/// Note: the extraction area stays set on <paramref name="textExtractor"/> afterwards.
		/// </summary>
		/// <param name="textExtractor">Extractor with the document already loaded.</param>
		/// <param name="pageIndex">Zero-based index of the page to read from.</param>
		/// <param name="labelBounds">Bounds of the found label text.</param>
		/// <param name="pageRectangle">Rectangle of the page, used to find its right edge.</param>
		/// <returns>The extracted text with leading and trailing whitespace removed.</returns>
		private static string ExtractTextRightOf(TextExtractor textExtractor, int pageIndex, RectangleF labelBounds, RectangleF pageRectangle)
		{
			// RectangleF is a value type, so this is a local copy that is safe to modify
			RectangleF valueRect = labelBounds;
			// Shift the rectangle to the right of the label...
			valueRect.X = labelBounds.Right;
			// ...and stretch it up to the right page edge
			valueRect.Width = pageRectangle.Right - valueRect.Left;

			// Set the extraction region and extract the text
			textExtractor.SetExtractionArea(valueRect);
			return textExtractor.GetTextFromPage(pageIndex).Trim();
		}
	}
}