Default.aspx.cs
using System; using Bytescout.PDFExtractor; // Before running the example copy missing .traineddata files from "Redistributable" folder to "tessdata" project folder. // or download from https://github.com/tesseract-ocr/tessdata/tree/3.04.00 // Make sure "Copy to Output Directory" property of each added language file is set to "Copy always". // Note: Do not rename the "tessdata" folder - its name is hardcoded in OCR engine. namespace OpticalCharacterRecognition { public partial class _Default : System.Web.UI.Page { protected void Page_Load(object sender, EventArgs e) { String inputFile = Server.MapPath(@".\bin\sample_ocr.pdf"); // Location of language files String ocrLanguageDataFolder = Server.MapPath(@".\bin\tessdata"); // Create Bytescout.PDFExtractor.TextExtractor instance using (TextExtractor extractor = new TextExtractor()) { extractor.RegistrationName = "demo"; extractor.RegistrationKey = "demo"; // Enable Optical Character Recognition (OCR) // in .Auto mode (SDK automatically checks if needs to use OCR or not) extractor.OCRMode = OCRMode.Auto; // Set the location of "tessdata" folder containing language data file extractor.OCRLanguageDataFolder = ocrLanguageDataFolder; // Set OCR language extractor.OCRLanguage = "eng"; // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in /tessdata // Set PDF document rendering resolution extractor.OCRResolution = 300; // You can also apply various preprocessing filters // to improve the recognition on low-quality scans. // Automatically deskew skewed scans //extractor.OCRImagePreprocessingFilters.AddDeskew(); // Remove vertical or horizontal lines (sometimes helps to avoid OCR engine's page segmentation errors) //extractor.OCRImagePreprocessingFilters.AddVerticalLinesRemover(); //extractor.OCRImagePreprocessingFilters.AddHorizontalLinesRemover(); // Repair broken letters //extractor.OCRImagePreprocessingFilters.AddDilate(); // Remove noise //extractor.OCRImagePreprocessingFilters.AddMedian(); // Apply Gamma Correction //extractor.OCRImagePreprocessingFilters.AddGammaCorrection(); // Add Contrast //extractor.OCRImagePreprocessingFilters.AddContrast(20); // (!) You can use new OCRAnalyser class to find an optimal set of image preprocessing // filters for your specific document. // See "OCR Analyser" example. // Load PDF document extractor.LoadDocumentFromFile(inputFile); // Write extracted text to output stream Response.Clear(); Response.ContentType = "text/html"; Response.Write("<pre>"); // Write extracted text to output stream Response.Write(extractor.GetText()); Response.Write("</pre>"); Response.End(); } } } }
Web.config
<?xml version="1.0"?> <configuration> <appSettings/> <connectionStrings/> <system.web> <!-- Set compilation debug="true" to insert debugging symbols into the compiled page. Because this affects performance, set this value to true only during development. --> <compilation debug="true" /> <!-- The <authentication> section enables configuration of the security authentication mode used by ASP.NET to identify an incoming user. --> <authentication mode="Windows" /> <!-- The <customErrors> section enables configuration of what to do if/when an unhandled error occurs during the execution of a request. Specifically, it enables developers to configure html error pages to be displayed in place of a error stack trace. <customErrors mode="RemoteOnly" defaultRedirect="GenericErrorPage.htm"> <error statusCode="403" redirect="NoAccess.htm" /> <error statusCode="404" redirect="FileNotFound.htm" /> </customErrors> --> </system.web> </configuration>
Click here to get your Free Trial version of the SDK
also available as: