ByteScout Data Extraction Suite – C# – OCR Analyser with PDF Extractor SDK

Home
/
Articles
/
ByteScout Data Extraction Suite – C# – OCR Analyser with PDF Extractor SDK

printable version:
ByteScout-Data-Extraction-Suite-C-sharp-OCR-Analyser-with-PDF-Extractor-SDK.pdf

OCR analyser with PDF extractor SDK in C# and ByteScout Data Extraction Suite

Learn to code OCR analyser with PDF extractor SDK in C#: How-To tutorial

An easy-to-understand guide to using the OCR analyser with PDF Extractor SDK in C#. ByteScout Data Extraction Suite helps you work with the OCR analyser and PDF Extractor SDK in C#. ByteScout Data Extraction Suite is a bundle that includes three SDK tools for data extraction from PDFs, scans, images and spreadsheets: PDF Extractor SDK, Data Extraction SDK, and Barcode Reader SDK.

A C# code snippet like this for ByteScout Data Extraction Suite works best when you need to quickly implement the OCR analyser with PDF Extractor SDK in your C# application. Follow the step-by-step instructions to get it working from scratch, then copy and paste the C# code into your editor. These C# samples can be used in one or many applications.

The trial version can be downloaded from our website for free. It contains this and other source code samples for C#.

On-demand (REST Web API) version:
Web API (on-demand version)

On-premise offline SDK for Windows:
60 Day Free Trial (on-premise)

Program.cs

      using System;
using System.Drawing;
using System.Diagnostics;
using Bytescout.PDFExtractor;

// This example demonstrates the use of OCR Analyser - a tooling class for analysis of scanned documents
// in PDF or raster image formats to find best parameters for Optical Character Recognition (OCR) that
// provide highest recognition quality.

// To make OCR work you should add the following references to your project:
// 'Bytescout.PDFExtractor.dll', 'Bytescout.PDFExtractor.OCRExtension.dll'.

namespace OCRAnalyser
{
    /// <summary>
    /// Console sample: analyses one page of a document with <c>OCRAnalyzer</c>, applies the
    /// analysis results to a <c>TextExtractor</c>, and saves the extracted text to a file.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Runs the sample end to end: analysis, text extraction, and opening the result file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Document to analyse and the text file the extraction is written to.
            const string sourcePath = @".\sample_ocr.pdf";
            const string resultPath = @".\result.txt";

            // Folder holding the OCR language data files.
            const string languageDataDir = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

            // OCR language code: "eng" for English, "deu" for German, "fra" for French,
            // "spa" for Spanish etc. - matching the files in the "ocrdata" folder.
            // More language files are available at https://github.com/bytescout/ocrdata
            const string languageCode = "eng";

            // Index of the page to analyse.
            int pageIndex = 0;

            // Optional page area for the analysis; RectangleF.Empty means the full page.
            // Example of a restricted area: new RectangleF(100, 50, 350, 250)
            RectangleF analysisArea = RectangleF.Empty;

            // Create the analyzer and activate it with your registration information.
            using (var analyzer = new OCRAnalyzer("demo", "demo"))
            {
                // Print each progress message reported during the analysis.
                analyzer.ProgressChanged += (object source, string text, double fraction, ref bool abort) =>
                    Console.WriteLine(text);

                // Load the document, then configure the analyzer.
                analyzer.LoadDocumentFromFile(sourcePath);
                analyzer.OCRLanguage = languageCode;
                analyzer.OCRLanguageDataFolder = languageDataDir;

                // Restrict the analysis to the chosen page area (optional).
                analyzer.SetExtractionArea(analysisArea);

                // Run the analysis and keep the results for the extraction step below.
                OCRAnalysisResults detectedSettings = analyzer.AnalyzeByOCRConfidence(pageIndex);

                // Extract the text using the detected OCR parameters.
                using (var extractor = new TextExtractor("demo", "demo"))
                {
                    // Load the same document and configure the extractor.
                    extractor.LoadDocumentFromFile(sourcePath);
                    extractor.OCRMode = OCRMode.Auto;
                    extractor.OCRLanguageDataFolder = languageDataDir;
                    extractor.OCRLanguage = languageCode;

                    // Transfer the analysis results to the extractor.
                    analyzer.ApplyResults(detectedSettings, extractor);

                    // Use the same page area for extraction (optional).
                    extractor.SetExtractionArea(analysisArea);

                    // Write the extracted text to the output file.
                    extractor.SaveTextToFile(resultPath);

                    // Open the result in its default associated application (for demo purposes).
                    var launchInfo = new ProcessStartInfo(resultPath) { UseShellExecute = true };
                    Process.Start(launchInfo);
                }
            }
        }
    }
}