ByteScout PDF Suite – C# – OCR Analyser with PDF Extractor SDK

Home
/
Articles
/
ByteScout PDF Suite – C# – OCR Analyser with PDF Extractor SDK

printable version:
ByteScout-PDF-Suite-C-sharp-OCR-Analyser-with-PDF-Extractor-SDK.pdf

OCR analyser with PDF extractor SDK in C# with ByteScout PDF Suite

Learn how to use the OCR analyser from PDF Extractor SDK in C# with this simple how-to tutorial

Developers of any level can write C# code that uses the OCR analyser from PDF Extractor SDK with the help of ByteScout PDF Suite. ByteScout PDF Suite is a bundle that provides six different SDK libraries for working with PDF, from generating rich PDF reports to extracting data from PDF documents and converting them to HTML. The bundle includes PDF (Generator) SDK, PDF Renderer SDK, PDF Extractor SDK, PDF to HTML SDK, PDF Viewer SDK and PDF Generator SDK for Javascript.

Save time on writing and testing code by using the code below in your application. To implement this functionality, copy and paste the code below into your app using a code editor, then compile and run your application. If you want to see how it works with your data, testing the code will let you verify that the function works properly.

A trial version, along with the source code samples for C#, can be downloaded from our website.

On-demand (REST Web API) version:
Web API (on-demand version)

On-premise offline SDK for Windows:
60 Day Free Trial (on-premise)

Program.cs

      using System;
using System.Drawing;
using System.Diagnostics;
using Bytescout.PDFExtractor;

// This example demonstrates the use of OCR Analyser - a tooling class for analysis of scanned documents
// in PDF or raster image formats to find best parameters for Optical Character Recognition (OCR) that
// provide highest recognition quality.

// To make OCR work you should add the following references to your project:
// 'Bytescout.PDFExtractor.dll', 'Bytescout.PDFExtractor.OCRExtension.dll'.

namespace OCRAnalyser
{
    /// <summary>
    /// Console sample: analyses one page of a scanned document with OCRAnalyzer, applies the
    /// analysis results to a TextExtractor, and saves the extracted text to a file.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Runs the OCR analysis on the sample document, extracts its text using the
        /// analysis results, and opens the resulting text file.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // Document to process and the file that receives the extracted text
            const string sourcePath = @".\sample_ocr.pdf";
            const string resultPath = @".\result.txt";

            // Index of the document page passed to the analyser
            const int page = 0;

            // Folder holding the OCR language data files
            const string languageDataDir = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

            // OCR language: "eng" for English, "deu" for German, "fra" for French, "spa" for Spanish etc -
            // according to the files in the "ocrdata" folder.
            // More language files: https://github.com/bytescout/ocrdata
            const string language = "eng";

            // Optional page area to analyse and extract from; RectangleF.Empty means the full page.
            // Example of a restricted area: new RectangleF(100, 50, 350, 250)
            RectangleF area = RectangleF.Empty;

            // The analyser is created with the registration name and key
            using (var analyzer = new OCRAnalyzer("demo", "demo"))
            {
                // Print analysis progress messages to the console
                analyzer.ProgressChanged += ReportProgress;

                // Load the document, then configure language, data folder and page area
                analyzer.LoadDocumentFromFile(sourcePath);
                analyzer.OCRLanguage = language;
                analyzer.OCRLanguageDataFolder = languageDataDir;
                analyzer.SetExtractionArea(area);

                // Run the analysis for the chosen page and keep the results
                OCRAnalysisResults detected = analyzer.AnalyzeByOCRConfidence(page);

                // Extract the text using the parameters found by the analysis.
                // The extractor is created with the registration name and key as well.
                using (var extractor = new TextExtractor("demo", "demo"))
                {
                    extractor.LoadDocumentFromFile(sourcePath);

                    // Basic OCR setup of the extractor
                    extractor.OCRMode = OCRMode.Auto;
                    extractor.OCRLanguageDataFolder = languageDataDir;
                    extractor.OCRLanguage = language;

                    // Hand the analysis results over to the extractor
                    analyzer.ApplyResults(detected, extractor);

                    // Same optional area as used for the analysis
                    extractor.SetExtractionArea(area);

                    // Write the extracted text to the result file
                    extractor.SaveTextToFile(resultPath);

                    // For demo purposes, show the result file
                    OpenWithShell(resultPath);
                }
            }
        }

        /// <summary>
        /// Progress handler for the analyser: writes the progress message to the console.
        /// </summary>
        /// <param name="sender">Event source; not used.</param>
        /// <param name="message">Progress message to print.</param>
        /// <param name="progress">Progress value; not used.</param>
        /// <param name="cancel">Cancellation flag; left unchanged.</param>
        static void ReportProgress(object sender, string message, double progress, ref bool cancel)
        {
            Console.WriteLine(message);
        }

        /// <summary>
        /// Starts a process for the given file with UseShellExecute enabled, so that the
        /// file is opened by the operating system shell.
        /// </summary>
        /// <param name="path">Path of the file to open.</param>
        static void OpenWithShell(string path)
        {
            Process.Start(new ProcessStartInfo(path) { UseShellExecute = true });
        }
    }
}