ByteScout PDF Extractor SDK - ASP.NET - OCR (Optical Character Recognition) - ByteScout

ByteScout PDF Extractor SDK – ASP.NET – OCR (Optical Character Recognition)

  • Home
  • /
  • Articles
  • /
  • ByteScout PDF Extractor SDK – ASP.NET – OCR (Optical Character Recognition)

ByteScout PDF Extractor SDK – ASP.NET – OCR (Optical Character Recognition)


using System;
using Bytescout.PDFExtractor;

// Before running the example, copy any missing .traineddata files from the "Redistributable" folder to the "tessdata" project folder,
// or download them (the download link is missing from this copy of the sample).
// Make sure the "Copy to Output Directory" property of each added language file is set to "Copy always".
// Note: Do not rename the "tessdata" folder - its name is hardcoded in the OCR engine.

namespace OpticalCharacterRecognition
{
    /// <summary>
    /// Web Forms page that extracts text from a sample PDF with OCR enabled
    /// and writes the result to the HTTP response.
    /// </summary>
    public partial class _Default : System.Web.UI.Page
    {
        /// <summary>
        /// Configures a <see cref="TextExtractor"/> for OCR, loads the sample PDF
        /// and writes the extracted text to the response as preformatted HTML.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            string inputFile = Server.MapPath(@".\bin\sample_ocr.pdf");
            // Location of language files
            string ocrLanguageDataFolder = Server.MapPath(@".\bin\tessdata");

            // Create Bytescout.PDFExtractor.TextExtractor instance
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey = "demo";

                // Enable Optical Character Recognition (OCR)
                // in .Auto mode (SDK automatically checks if needs to use OCR or not)
                extractor.OCRMode = OCRMode.Auto;
                // Set the location of "tessdata" folder containing language data file
                extractor.OCRLanguageDataFolder = ocrLanguageDataFolder;
                // Set OCR language
                extractor.OCRLanguage = "eng"; // "eng" for English, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in /tessdata
                // Set PDF document rendering resolution
                extractor.OCRResolution = 300;

                // You can also apply various preprocessing filters
                // to improve the recognition on low-quality scans:
                //   - automatically deskew skewed scans
                //   - remove vertical or horizontal lines (sometimes helps to avoid
                //     OCR engine's page segmentation errors)
                //   - repair broken letters
                //   - remove noise
                //   - apply gamma correction
                //   - add contrast
                // No filters are applied in this sample.

                // (!) You can use new OCRAnalyser class to find an optimal set of image preprocessing
                // filters for your specific document.
                // See "OCR Analyser" example.

                // Load PDF document
                extractor.LoadDocumentFromFile(inputFile);

                Response.ContentType = "text/html";

                // Write extracted text to output stream.
                // The response is HTML, so the text is encoded (it may contain '<' or '&')
                // and wrapped in <pre> to keep its line breaks and spacing.
                Response.Write("<pre>");
                Response.Write(Server.HtmlEncode(extractor.GetText()));
                Response.Write("</pre>");
            }
        }
    }
}



<?xml version="1.0"?>

        <!--
            Set compilation debug="true" to insert debugging 
            symbols into the compiled page. Because this 
            affects performance, set this value to true only 
            during development.
        -->
        <compilation debug="true" />
        <!--
            The <authentication> section enables configuration 
            of the security authentication mode used by 
            ASP.NET to identify an incoming user. 
        -->
        <authentication mode="Windows" />
        <!--
            The <customErrors> section enables configuration 
            of what to do if/when an unhandled error occurs 
            during the execution of a request. Specifically, 
            it enables developers to configure html error pages 
            to be displayed in place of an error stack trace.

        <customErrors mode="RemoteOnly" defaultRedirect="GenericErrorPage.htm">
            <error statusCode="403" redirect="NoAccess.htm" />
            <error statusCode="404" redirect="FileNotFound.htm" />
        </customErrors>
        -->

  Click here to get your Free Trial version of the SDK