These code samples demonstrate how to use OCR (Optical Character Recognition) to extract text from a PDF document in ASP.NET, C#, C++, VB.NET and VBScript using ByteScout PDF Extractor SDK.

With PDF Extractor SDK you may also extract text from PDF by keyword or by coordinates.

Select your programming language:

How to use OCR to extract text from PDF in ASP.NET

using System;
using Bytescout.PDFExtractor;

// To compile the example, copy the missing .traineddata files from the REDISTRIBUTABLE folder to the "tessdata" project folder,
// or download them from http://code.google.com/p/tesseract-ocr/downloads/list
// Make sure "Copy to Output Directory" property of each added language file is set to "Copy always".
// Note: Do not rename the "tessdata" folder - its name is hardcoded in OCR engine.

namespace WebApplication1
{
	/// <summary>
	/// Page that extracts text from a sample PDF (with OCR enabled) and writes it to the HTTP response.
	/// </summary>
	public partial class Default : System.Web.UI.Page
	{
		/// <summary>
		/// Loads "sample_ocr.pdf" from the site folder, configures OCR on the extractor,
		/// and sends the extracted text back as the response body.
		/// </summary>
		/// <param name="sender">Event source (not used).</param>
		/// <param name="e">Event arguments (not used).</param>
		protected void Page_Load(object sender, EventArgs e)
		{
			String inputFile = Server.MapPath("sample_ocr.pdf");
			// Set the location of the "tessdata" folder containing the OCR language data files
			String ocrLanguageDataFolder = Server.MapPath(@"tessdata");

			using (TextExtractor extractor = new TextExtractor())
			{
				extractor.RegistrationName = "demo";
				extractor.RegistrationKey = "demo";

				// Set up OCR before the document is loaded
				extractor.OCRMode = OCRMode.Auto;
				extractor.OCRLanguageDataFolder = ocrLanguageDataFolder;
				extractor.OCRLanguage = "eng";
				extractor.OCRResolution = 300;

				extractor.LoadDocumentFromFile(inputFile);

				// Write extracted text to output stream.
				// The response is served as text/html, so the text is HTML-encoded first:
				// characters such as '<', '>' and '&' coming from the PDF must be shown
				// literally instead of being interpreted as markup by the browser.
				Response.Clear();
				Response.ContentType = "text/html";
				Response.Write(Server.HtmlEncode(extractor.GetText()));

				Response.End();
			}
		}
	}
}

How to use OCR to extract text from PDF in C#

using Bytescout.PDFExtractor;

// To make OCR work, add references to Bytescout.PDFExtractor.dll and Bytescout.PDFExtractor.OCRExtension.dll to your project.

namespace OCRExample
{
	/// <summary>
	/// Console sample: extracts text from "sample_ocr.pdf" with OCR enabled and saves it to "output.txt".
	/// </summary>
	class Program
	{
		/// <summary>
		/// Configures the extractor, saves the extracted text to a file and opens that file.
		/// </summary>
		/// <param name="args">Command-line arguments (not used).</param>
		static void Main(string[] args)
		{
			// Create Bytescout.PDFExtractor.TextExtractor instance.
			// The using block disposes the extractor once the text has been saved.
			using (TextExtractor extractor = new TextExtractor())
			{
				extractor.RegistrationName = "demo";
				extractor.RegistrationKey = "demo";

				// Load sample PDF document
				extractor.LoadDocumentFromFile("sample_ocr.pdf");

				// Enable Optical Character Recognition (OCR)
				// in .Auto mode (SDK automatically checks if needs to use OCR or not)
				extractor.OCRMode = OCRMode.Auto;

				// Set the location of "tessdata" folder containing language data files
				extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\Redistributable\net2.00\tessdata\";

				// Set OCR language
				extractor.OCRLanguage = "eng"; // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in /tessdata

				// Set PDF document rendering resolution
				extractor.OCRResolution = 300;

				// Save extracted text to file
				extractor.SaveTextToFile("output.txt");
			}

			// Open output file in default associated application
			System.Diagnostics.Process.Start("output.txt");
		}
	}
}

How to use OCR to extract text from PDF in C++

#include "stdafx.h"
	#include "comip.h"

	#import "c:\\Program Files\\Bytescout PDF Extractor SDK\\net4.00\\Bytescout.PDFExtractor.tlb" raw_interfaces_only

	using namespace Bytescout_PDFExtractor;

	int _tmain(int argc, _TCHAR* argv[])
	{
		// Initialize COM.
		HRESULT hr = CoInitializeEx(NULL, COINIT_APARTMENTTHREADED);

		// Create the interface pointer.
		_TextExtractorPtr pITextExtractor(__uuidof(TextExtractor));

		// Set the registration name and key
		// Note: You should use _bstr_t or BSTR to pass string to the library because of COM requirements
		_bstr_t bstrRegName(L"DEMO"); 
		pITextExtractor->put_RegistrationName(bstrRegName);
		
		_bstr_t bstrRegKey(L"DEMO");
		pITextExtractor->put_RegistrationKey(bstrRegKey);

		// Load sample PDF document
		_bstr_t bstrPath(L"..\\..\\sample_ocr.pdf");
		pITextExtractor->LoadDocumentFromFile(bstrPath);

		// Enable Optical Character Recognition (OCR)
		// in .Auto mode (SDK automatically checks if needs to use OCR or not)
		pITextExtractor->put_OCRMode(OCRMode_Auto);
		
		// Set the location of "tessdata" folder containing language data files
		_bstr_t bstrOCRLangDataPath(L"c:\\Program Files\\Bytescout PDF Extractor SDK\\net4.00\\tessdata");
		pITextExtractor->put_OCRLanguageDataFolder(bstrOCRLangDataPath);

		// Set OCR language
		_bstr_t bstrOCRLanguage(L"eng");
		pITextExtractor->put_OCRLanguage(bstrOCRLanguage);

		// Set PDF document rendering resolution
		pITextExtractor->put_OCRResolution(300);

		// Save extracted text to file
		_bstr_t bstrOutputFile(L"output.txt");
		pITextExtractor->SaveTextToFile(bstrOutputFile);

		pITextExtractor->Release();

		CoUninitialize();

		return 0;
	}

How to use OCR to extract text from PDF in Visual Basic .NET

Imports Bytescout.PDFExtractor

' To make OCR work, add references to Bytescout.PDFExtractor.dll and Bytescout.PDFExtractor.OCRExtension.dll to your project.

''' <summary>
''' Console sample: extracts text from "sample_ocr.pdf" with OCR enabled and saves it to "output.txt".
''' </summary>
Class Program
	''' <summary>
	''' Configures the extractor, saves the extracted text to a file and opens that file.
	''' </summary>
	''' <param name="args">Command-line arguments (not used).</param>
	Friend Shared Sub Main(args As String())
		' Create Bytescout.PDFExtractor.TextExtractor instance.
		' The Using block disposes the extractor once the text has been saved.
		Using extractor As New TextExtractor()
			extractor.RegistrationName = "demo"
			extractor.RegistrationKey = "demo"

			' Load sample PDF document
			extractor.LoadDocumentFromFile("sample_ocr.pdf")

			' Enable Optical Character Recognition (OCR)
			' in .Auto mode (SDK automatically checks if needs to use OCR or not)
			extractor.OCRMode = OCRMode.Auto

			' Set the location of "tessdata" folder containing language data files
			extractor.OCRLanguageDataFolder = "c:\Program Files\Bytescout PDF Extractor SDK\Redistributable\net2.00\tessdata\"

			' Set OCR language
			extractor.OCRLanguage = "eng"  ' "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in /tessdata

			' Set PDF document rendering resolution
			extractor.OCRResolution = 300

			' Save extracted text to file
			extractor.SaveTextToFile("output.txt")
		End Using

		' Open output file in default associated application
		System.Diagnostics.Process.Start("output.txt")
	End Sub
End Class

How to use OCR to extract text from PDF in VBScript (Visual Basic 6)

' Extracts text from sample_ocr.pdf with OCR enabled and saves it to output.txt.

' Create TextExtractor object
Set extractor = CreateObject("Bytescout.PDFExtractor.TextExtractor")
extractor.RegistrationName = "demo"
extractor.RegistrationKey = "demo"

' Load sample PDF document
extractor.LoadDocumentFromFile("..\..\sample_ocr.pdf")


' Enable Optical Character Recognition (OCR)
extractor.OCRMode = 1 ' OCRMode.Auto = 1

' Set the location of "tessdata" folder containing language data files.
' This must point into the PDF Extractor SDK installation (not another Bytescout product);
' adjust the path if the SDK is installed elsewhere.
extractor.OCRLanguageDataFolder = "c:\Program Files\Bytescout PDF Extractor SDK\Redistributable\net2.00\tessdata"

' Set OCR language
' "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "tessdata" folder.
extractor.OCRLanguage = "eng"

' Set PDF document rendering resolution
extractor.OCRResolution = 300

' Save extracted text to file
extractor.SaveTextToFile("output.txt")

MsgBox "Text was extracted to output.txt"

' Release the COM object
Set extractor = Nothing