ByteScout PDF Extractor SDK – ASP.NET C# – OCR (Optical Character Recognition) in PDF

  • Home
  • /
  • Articles
  • /
  • ByteScout PDF Extractor SDK – ASP.NET C# – OCR (Optical Character Recognition) in PDF

ByteScout PDF Extractor SDK – ASP.NET C# – OCR (Optical Character Recognition) in PDF

Default.aspx.cs

using System;
using Bytescout.PDFExtractor;

// Before running the example, copy any missing .traineddata files from the "Redistributable" folder to the "tessdata" project folder,
// or download them from https://github.com/tesseract-ocr/tessdata/tree/3.04.00
// Make sure "Copy to Output Directory" property of each added language file is set to "Copy always".
// Note: Do not rename the "tessdata" folder - its name is hardcoded in OCR engine.

namespace OpticalCharacterRecognition
{
	/// <summary>
	/// Sample page that extracts text from a PDF document with OCR enabled
	/// and writes the result to the HTTP response as a preformatted HTML block.
	/// </summary>
	public partial class _Default : System.Web.UI.Page
	{
		/// <summary>
		/// Loads <c>sample_ocr.pdf</c> from the site's <c>bin</c> folder, extracts its text
		/// (with OCR in Auto mode) and sends it to the client wrapped in a <c>&lt;pre&gt;</c> element.
		/// </summary>
		/// <param name="sender">Event source (not used).</param>
		/// <param name="e">Event data (not used).</param>
		protected void Page_Load(object sender, EventArgs e)
		{
			string inputFile = Server.MapPath(@".\bin\sample_ocr.pdf");

			// Location of language files
			string ocrLanguageDataFolder = Server.MapPath(@".\bin\tessdata");

			string extractedText;

			// Create Bytescout.PDFExtractor.TextExtractor instance
			using (TextExtractor extractor = new TextExtractor())
			{
				extractor.RegistrationName = "demo";
				extractor.RegistrationKey = "demo";

				// Enable Optical Character Recognition (OCR)
				// in .Auto mode (the SDK automatically checks whether OCR is needed)
				extractor.OCRMode = OCRMode.Auto;
				// Set the location of "tessdata" folder containing language data file
				extractor.OCRLanguageDataFolder = ocrLanguageDataFolder;
				// Set OCR language
				extractor.OCRLanguage = "eng"; // "eng" for English, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in /tessdata
				// Set PDF document rendering resolution
				extractor.OCRResolution = 300;


				// You can also apply various preprocessing filters
				// to improve the recognition on low-quality scans.

				// Automatically deskew skewed scans
				//extractor.OCRImagePreprocessingFilters.AddDeskew();

				// Remove vertical or horizontal lines (sometimes helps to avoid OCR engine's page segmentation errors)
				//extractor.OCRImagePreprocessingFilters.AddVerticalLinesRemover();
				//extractor.OCRImagePreprocessingFilters.AddHorizontalLinesRemover();

				// Repair broken letters
				//extractor.OCRImagePreprocessingFilters.AddDilate();

				// Remove noise
				//extractor.OCRImagePreprocessingFilters.AddMedian();

				// Apply Gamma Correction
				//extractor.OCRImagePreprocessingFilters.AddGammaCorrection();

				// Add Contrast
				//extractor.OCRImagePreprocessingFilters.AddContrast(20);


				// (!) You can use new OCRAnalyser class to find an optimal set of image preprocessing
				// filters for your specific document.
				// See "OCR Analyser" example.


				// Load PDF document
				extractor.LoadDocumentFromFile(inputFile);

				// Pull the text out while the extractor is alive; the response is
				// written after the extractor has been disposed.
				extractedText = extractor.GetText();
			}

			// Write extracted text to output stream
			Response.Clear();
			Response.ContentType = "text/html";

			Response.Write("<pre>");
			// The response is HTML, so the extracted text must be encoded: characters such as
			// '<', '>' and '&' coming from the PDF would otherwise be interpreted as markup.
			Response.Write(Server.HtmlEncode(extractedText));
			Response.Write("</pre>");

			Response.End();
		}
	}
}

Web.config

<?xml version="1.0"?>

<configuration>
  
    <appSettings/>
    <connectionStrings/>
  
    <system.web>
        <!-- 
            Set compilation debug="true" to insert debugging 
            symbols into the compiled page. Because this 
            affects performance, set this value to true only 
            during development.
        -->
        <compilation debug="true" />
        <!--
            The <authentication> section enables configuration 
            of the security authentication mode used by 
            ASP.NET to identify an incoming user. 
        -->
        <authentication mode="Windows" />
        <!--
            The <customErrors> section enables configuration 
            of what to do if/when an unhandled error occurs 
            during the execution of a request. Specifically, 
            it enables developers to configure html error pages 
            to be displayed in place of an error stack trace.

        <customErrors mode="RemoteOnly" defaultRedirect="GenericErrorPage.htm">
            <error statusCode="403" redirect="NoAccess.htm" />
            <error statusCode="404" redirect="FileNotFound.htm" />
        </customErrors>
        -->
    </system.web>
</configuration>


  Click here to get your Free Trial version of the SDK

Tutorials:

prev
next