Default.aspx.cs
using System;
using Bytescout.PDFExtractor;
// Before running the example, copy any missing .traineddata files from the "Redistributable" folder to the "tessdata" project folder,
// or download them from https://github.com/tesseract-ocr/tessdata/tree/3.04.00
// Make sure "Copy to Output Directory" property of each added language file is set to "Copy always".
// Note: Do not rename the "tessdata" folder - its name is hardcoded in OCR engine.
namespace OpticalCharacterRecognition
{
/// <summary>
/// Code-behind for the sample page. Extracts the text of a sample PDF document
/// (with OCR enabled in automatic mode) and writes it to the HTTP response.
/// </summary>
public partial class _Default : System.Web.UI.Page
{
    /// <summary>
    /// Extracts text from "sample_ocr.pdf" located in the site's "bin" folder and
    /// sends it to the client as an HTML-encoded, preformatted block.
    /// </summary>
    /// <param name="sender">Event source (not used).</param>
    /// <param name="e">Event data (not used).</param>
    protected void Page_Load(object sender, EventArgs e)
    {
        String inputFile = Server.MapPath(@".\bin\sample_ocr.pdf");

        // Location of language files
        String ocrLanguageDataFolder = Server.MapPath(@".\bin\tessdata");

        String extractedText;

        // Create Bytescout.PDFExtractor.TextExtractor instance
        using (TextExtractor extractor = new TextExtractor())
        {
            extractor.RegistrationName = "demo";
            extractor.RegistrationKey = "demo";

            // Enable Optical Character Recognition (OCR)
            // in .Auto mode (SDK automatically checks if needs to use OCR or not)
            extractor.OCRMode = OCRMode.Auto;

            // Set the location of "tessdata" folder containing language data file
            extractor.OCRLanguageDataFolder = ocrLanguageDataFolder;

            // Set OCR language
            extractor.OCRLanguage = "eng"; // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in /tessdata

            // Set PDF document rendering resolution
            extractor.OCRResolution = 300;

            // You can also apply various preprocessing filters
            // to improve the recognition on low-quality scans.

            // Automatically deskew skewed scans
            //extractor.OCRImagePreprocessingFilters.AddDeskew();

            // Remove vertical or horizontal lines (sometimes helps to avoid OCR engine's page segmentation errors)
            //extractor.OCRImagePreprocessingFilters.AddVerticalLinesRemover();
            //extractor.OCRImagePreprocessingFilters.AddHorizontalLinesRemover();

            // Repair broken letters
            //extractor.OCRImagePreprocessingFilters.AddDilate();

            // Remove noise
            //extractor.OCRImagePreprocessingFilters.AddMedian();

            // Apply Gamma Correction
            //extractor.OCRImagePreprocessingFilters.AddGammaCorrection();

            // Add Contrast
            //extractor.OCRImagePreprocessingFilters.AddContrast(20);

            // (!) You can use new OCRAnalyser class to find an optimal set of image preprocessing
            // filters for your specific document.
            // See "OCR Analyser" example.

            // Load PDF document
            extractor.LoadDocumentFromFile(inputFile);

            // Pull the text out while the extractor is alive; the response is
            // written after the extractor has been disposed.
            extractedText = extractor.GetText();
        }

        // Write extracted text to output stream
        Response.Clear();
        Response.ContentType = "text/html";
        Response.Write("<pre>");
        // The text comes from document content and may contain '<', '>' or '&';
        // encode it so it is displayed literally instead of being parsed as HTML.
        Response.Write(Server.HtmlEncode(extractedText));
        Response.Write("</pre>");
        Response.End();
    }
}
}
Web.config
<?xml version="1.0"?>
<!-- ASP.NET configuration for this sample web application. -->
<configuration>
<appSettings/>
<connectionStrings/>
<system.web>
<!--
Set compilation debug="true" to insert debugging
symbols into the compiled page. Because this
affects performance, set this value to true only
during development.
NOTE: debug is enabled below; switch it off before deploying.
-->
<compilation debug="true" />
<!--
The <authentication> section enables configuration
of the security authentication mode used by
ASP.NET to identify an incoming user.
-->
<authentication mode="Windows" />
<!--
The <customErrors> section enables configuration
of what to do if/when an unhandled error occurs
during the execution of a request. Specifically,
it enables developers to configure HTML error pages
to be displayed in place of an error stack trace.
<customErrors mode="RemoteOnly" defaultRedirect="GenericErrorPage.htm">
<error statusCode="403" redirect="NoAccess.htm" />
<error statusCode="404" redirect="FileNotFound.htm" />
</customErrors>
-->
</system.web>
</configuration>
Click here to get your Free Trial version of the SDK
also available as: