ByteScout PDF Extractor SDK – C# – PDF Invoice Parsing

Home
/
Articles
/
ByteScout PDF Extractor SDK – C# – PDF Invoice Parsing

printable version:
ByteScout-PDF-Extractor-SDK-C-sharp-PDF-Invoice-Parsing.pdf

PDF invoice parsing in C# using ByteScout PDF Extractor SDK

Tutorial: how to do PDF invoice parsing in C#

Today you are going to learn how to do PDF invoice parsing in C#. ByteScout PDF Extractor SDK was made to help with PDF invoice parsing in C#. ByteScout PDF Extractor SDK is an SDK that helps developers extract data from unstructured documents, PDFs, images, and scanned and electronic forms. It includes AI functions such as automatic table detection, automatic table extraction and restructuring, and text recognition and text restoration from PDFs and scanned documents. It also includes PDF to CSV, PDF to XML, PDF to JSON, and PDF to searchable PDF functions, as well as methods for low-level data extraction.

A C# code snippet like this one for ByteScout PDF Extractor SDK works best when you need to quickly implement PDF invoice parsing in your C# application. Copy and paste this C# sample code into your application's code editor, then compile and run it to see how it works. The use of ByteScout PDF Extractor SDK in C# is also explained in the documentation included with the product.

A free trial version of ByteScout PDF Extractor SDK is available on our website. Get it to try other samples for C#.

On-demand (REST Web API) version:
Web API (on-demand version)

On-premise offline SDK for Windows:
60 Day Free Trial (on-premise)

Program.cs

      using System;
using System.Drawing;
using Bytescout.PDFExtractor;

namespace InvoiceParsing
{
	/// <summary>
	/// This example demonstrates parsing and data extraction from a typical invoice.
	/// It locates the "Invoice No.", "Invoice Date" and "TOTAL" labels, reads the text
	/// to the right of each label, and extracts the region between the "Quantity"
	/// header and the "TOTAL" line as XML.
	/// </summary>
	class Program
	{
		// Input document; loaded by both the text extractor and the XML extractor.
		private const string InputFile = "Invoice.pdf";

		/// <summary>
		/// Parses the invoice and prints the extracted values to the console.
		/// </summary>
		/// <param name="args">Command line arguments (not used).</param>
		static void Main(string[] args)
		{
			// Results. Every page is scanned, so a value found on a later page
			// overwrites the value found on an earlier one.
			string invoiceNo = string.Empty;
			string invoiceDate = string.Empty;
			string total = string.Empty;
			string tableData = string.Empty;

			// "using" guarantees both extractors are disposed even when loading
			// or parsing throws.
			using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
			using (XMLExtractor xmlExtractor = new XMLExtractor("demo", "demo"))
			{
				// Set exact search (default is SmartSearch that works like in Adobe Reader)
				textExtractor.WordMatchingMode = WordMatchingMode.ExactMatch;

				// Load document
				textExtractor.LoadDocumentFromFile(InputFile);
				xmlExtractor.LoadDocumentFromFile(InputFile);

				// Iterate pages
				int pageCount = textExtractor.GetPageCount();
				for (int i = 0; i < pageCount; i++)
				{
					RectangleF pageRectangle = textExtractor.GetPageRectangle(i);
					// Full page width; Y and Height are filled in below once the
					// table boundaries are known. Height stays 0 if "TOTAL" is not found.
					RectangleF tableRect = new RectangleF(0, 0, pageRectangle.Width, 0);

					// Search for "Invoice No." and assume the text at right is the invoice number
					if (textExtractor.Find(i, "Invoice No.", false))
					{
						invoiceNo = ExtractTextRightOf(textExtractor, i, textExtractor.FoundText.Bounds, pageRectangle);
					}

					// Search for "Invoice Date" and extract text at right
					if (textExtractor.Find(i, "Invoice Date", false))
					{
						invoiceDate = ExtractTextRightOf(textExtractor, i, textExtractor.FoundText.Bounds, pageRectangle);
					}

					// Search for "Quantity" keyword to detect the top of the tabular data rectangle
					if (textExtractor.Find(i, "Quantity", false))
					{
						// Keep the top table coordinate
						// (use FoundText.Bounds.Bottom instead if you want to skip column headers)
						tableRect.Y = textExtractor.FoundText.Bounds.Top;
					}

					// Search for "TOTAL" (it will be also the bottom of tabular data rectangle)
					if (textExtractor.Find(i, "TOTAL", true /* case sensitive! */))
					{
						// Capture the label bounds before extracting; its top edge
						// is the bottom of the table.
						RectangleF totalBounds = textExtractor.FoundText.Bounds;
						total = ExtractTextRightOf(textExtractor, i, totalBounds, pageRectangle);

						// Calculate the table height
						tableRect.Height = totalBounds.Top - tableRect.Top;
					}

					// Extract tabular data using XMLExtractor
					if (tableRect.Height > 0)
					{
						xmlExtractor.SetExtractionArea(tableRect);
						tableData = xmlExtractor.GetXMLFromPage(i);
					}
				}
			}

			// Display extracted data
			Console.WriteLine("Invoice No.: " + invoiceNo);
			Console.WriteLine("Invoice Date: " + invoiceDate);
			Console.WriteLine("TOTAL: " + total);
			Console.WriteLine("Table Data: ");
			Console.WriteLine(tableData);

			Console.WriteLine();
			Console.WriteLine("Press any key...");
			Console.ReadKey();
		}

		/// <summary>
		/// Extracts the trimmed text located to the right of a label on the given page.
		/// The extraction area keeps the label's vertical position and height, starts at
		/// the label's right edge and extends to the right edge of the page.
		/// </summary>
		/// <param name="extractor">Extractor with the document loaded; its extraction area is changed by this call.</param>
		/// <param name="pageIndex">Zero-based index of the page to read from.</param>
		/// <param name="labelBounds">Bounds of the label that was found on the page.</param>
		/// <param name="pageRectangle">Rectangle of the page, used for its right edge.</param>
		/// <returns>The text found in the area to the right of the label, trimmed.</returns>
		private static string ExtractTextRightOf(TextExtractor extractor, int pageIndex, RectangleF labelBounds, RectangleF pageRectangle)
		{
			// RectangleF is a struct, so this is a copy and the caller's value is untouched.
			RectangleF area = labelBounds;
			// Shift the rectangle to the right of the label...
			area.X = labelBounds.Right;
			// ...and stretch it to the right edge of the page.
			area.Width = pageRectangle.Right - area.Left;

			// Set the extraction region and extract the text
			extractor.SetExtractionArea(area);
			return extractor.GetTextFromPage(pageIndex).Trim();
		}
	}
}