Program.cs
using System;
using Bytescout.PDFExtractor;

namespace RepairText
{
    /// <summary>
    /// Console sample that loads a PDF, switches the extractor into its
    /// font-repairing OCR mode and prints the extracted text.
    /// </summary>
    class Program
    {
        // PDF document handed to the extractor.
        private const string InputDocument = "sample.pdf";

        // Location of the "tessdata" folder containing language data files.
        private const string LanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\Redistributable\net2.00\tessdata\";

        // "eng" for English, "deu" for German, "fra" for French, "spa" for Spanish etc. -
        // according to the files in /tessdata.
        // Find more language files at https://github.com/tesseract-ocr/tessdata/tree/3.04.00
        private const string LanguageCode = "eng";

        // PDF document rendering resolution used for OCR.
        private const int RenderingResolution = 300;

        /// <summary>
        /// Entry point: extracts all text from the input document and writes it to the console.
        /// Any exception raised along the way is reported by printing its message.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            try
            {
                using (TextExtractor pdfReader = new TextExtractor())
                {
                    // The document is loaded first, then the OCR options are applied.
                    pdfReader.LoadDocumentFromFile(InputDocument);
                    ApplyOcrSettings(pdfReader);

                    // Read all text and print it while the extractor is still open.
                    string documentText = pdfReader.GetText();
                    Console.WriteLine("Extracted Text: \n\n" + documentText);
                }
            }
            catch (Exception failure)
            {
                Console.WriteLine(failure.Message);
            }

            // Block until a line is read from standard input before exiting.
            Console.ReadLine();
        }

        /// <summary>
        /// Applies the OCR options to the given extractor: font-repairing mode,
        /// language data folder, language and rendering resolution (in that order).
        /// </summary>
        /// <param name="pdfReader">Extractor to configure.</param>
        private static void ApplyOcrSettings(TextExtractor pdfReader)
        {
            // Font-repairing OCR mode.
            pdfReader.OCRMode = OCRMode.TextFromImagesAndVectorsAndRepairedFonts;
            pdfReader.OCRLanguageDataFolder = LanguageDataFolder;
            pdfReader.OCRLanguage = LanguageCode;
            pdfReader.OCRResolution = RenderingResolution;
        }
    }
}
Click here to get your Free Trial version of the SDK
also available as: