The sample source code below shows how to extract text from a PDF in ASP.NET, Classic ASP, C++, C#, VB.NET and VBScript with ByteScout PDF Extractor SDK.
Also, here is an article that shows how to extract text from PDF by pages.
Select your programming language:
using System;
using System.Data;
using System.Configuration;
using System.Collections;
using System.IO;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using Bytescout.PDFExtractor;

namespace ExtractAllText
{
    /// <summary>
    /// ASP.NET page that extracts all text from a sample PDF and writes it
    /// straight into the HTTP response.
    /// </summary>
    public partial class _Default : System.Web.UI.Page
    {
        /// <summary>
        /// Loads "sample2.pdf" from the site directory, extracts its text and
        /// streams the result to the client as plain text.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            // This test file will be copied to the project directory on the pre-build event (see the project properties).
            string inputFile = Server.MapPath("sample2.pdf");

            // The using block guarantees the extractor is disposed even if
            // loading or extraction throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(inputFile);

                Response.Clear();
                // The payload is raw extracted text, not markup: serving it as
                // text/plain keeps the browser from interpreting PDF-derived
                // characters such as '<' as HTML.
                Response.ContentType = "text/plain";

                // Save extracted text to output stream
                extractor.SaveTextToStream(Response.OutputStream);
            }

            // End the response only after the extractor has been cleaned up.
            Response.End();
        }
    }
}
using System;
using Bytescout.PDFExtractor;

namespace ExtractAllText
{
    /// <summary>
    /// Console sample: extracts all text from "sample2.pdf" into "output.txt"
    /// and opens the result in the default associated application.
    /// </summary>
    class Program
    {
        /// <summary>Program entry point.</summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // The using block guarantees the extractor is disposed even if
            // loading or saving throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample2.pdf");

                // Save extracted text to file
                extractor.SaveTextToFile("output.txt");
            }

            // Open output file in default associated application.
            // Opening a document (rather than an executable) requires shell
            // execution; request it explicitly instead of relying on the
            // runtime's default for UseShellExecute.
            System.Diagnostics.ProcessStartInfo startInfo =
                new System.Diagnostics.ProcessStartInfo("output.txt");
            startInfo.UseShellExecute = true;
            System.Diagnostics.Process.Start(startInfo);
        }
    }
}
#include "stdafx.h"
#include "comip.h"

#import "c:\\Program Files\\Bytescout PDF Extractor SDK\\net4.00\\Bytescout.PDFExtractor.tlb" raw_interfaces_only

using namespace Bytescout_PDFExtractor;

// Extracts all text from a sample PDF into "output.txt" through the
// ByteScout PDF Extractor COM interface.
// Returns 0 on success and 1 if COM initialization or any SDK call fails.
int _tmain(int argc, _TCHAR* argv[])
{
    // Initialize COM.
    HRESULT hr = CoInitializeEx(NULL, COINIT_APARTMENTTHREADED);
    if (FAILED(hr))
    {
        return 1;
    }

    {
        // The smart pointer lives in its own scope so that its destructor
        // releases the interface exactly once, and does so BEFORE
        // CoUninitialize(). Do not call pITextExtractor->Release(): that goes
        // through the raw interface and leaves the smart pointer holding a
        // reference it will release a second time.
        _TextExtractorPtr pITextExtractor;

        // Create the interface pointer.
        hr = pITextExtractor.CreateInstance(__uuidof(TextExtractor));

        // Set the registration name and key
        // Note: You should use _bstr_t or BSTR to pass string to the library because of COM requirements
        if (SUCCEEDED(hr))
        {
            _bstr_t bstrRegName(L"DEMO");
            hr = pITextExtractor->put_RegistrationName(bstrRegName);
        }

        if (SUCCEEDED(hr))
        {
            _bstr_t bstrRegKey(L"DEMO");
            hr = pITextExtractor->put_RegistrationKey(bstrRegKey);
        }

        // Load sample PDF document
        if (SUCCEEDED(hr))
        {
            _bstr_t bstrPath(L"..\\..\\sample3.pdf");
            hr = pITextExtractor->LoadDocumentFromFile(bstrPath);
        }

        // Save extracted text to file
        if (SUCCEEDED(hr))
        {
            _bstr_t bstrOutputFile(L"output.txt");
            hr = pITextExtractor->SaveTextToFile(bstrOutputFile);
        }
    } // pITextExtractor is released here.

    CoUninitialize();

    return SUCCEEDED(hr) ? 0 : 1;
}
Imports Bytescout.PDFExtractor

''' <summary>
''' Console sample: extracts all text from "sample2.pdf" into "output.txt"
''' and opens the result in the default associated application.
''' </summary>
Class Program

    ''' <summary>Program entry point.</summary>
    ''' <param name="args">Command-line arguments (unused).</param>
    Friend Shared Sub Main(args As String())

        ' The Using block guarantees the extractor is disposed even if
        ' loading or saving throws.
        Using extractor As New TextExtractor()
            extractor.RegistrationName = "demo"
            extractor.RegistrationKey = "demo"

            ' Load sample PDF document
            extractor.LoadDocumentFromFile("sample2.pdf")

            ' Save extracted text to file
            extractor.SaveTextToFile("output.txt")
        End Using

        ' Open output file in default associated application.
        ' Opening a document (rather than an executable) requires shell
        ' execution; request it explicitly instead of relying on the
        ' runtime's default for UseShellExecute.
        Dim startInfo As New System.Diagnostics.ProcessStartInfo("output.txt")
        startInfo.UseShellExecute = True
        System.Diagnostics.Process.Start(startInfo)

    End Sub

End Class
' Extracts all text from a sample PDF into a text file through the
' ByteScout PDF Extractor COM object, then opens the result.

Const SOURCE_PDF = "..\..\sample2.pdf"
Const RESULT_FILE = "output.txt"
Const WINDOW_NORMAL = 1

' Instantiate the extractor and supply the registration details
Set pdfExtractor = CreateObject("Bytescout.PDFExtractor.TextExtractor")
pdfExtractor.RegistrationName = "demo"
pdfExtractor.RegistrationKey = "demo"

' Read the sample document, then write its text out
pdfExtractor.LoadDocumentFromFile SOURCE_PDF
pdfExtractor.SaveTextToFile RESULT_FILE

' Launch the result in whatever application is associated with it,
' without waiting for that application to exit
Set wshShell = CreateObject("WScript.Shell")
Call wshShell.Run(RESULT_FILE, WINDOW_NORMAL, False)

' Drop the COM references
Set wshShell = Nothing
Set pdfExtractor = Nothing
<%
' Extracts all text from a sample PDF through the ByteScout PDF Extractor
' COM object and writes it to the HTTP response.

' In case of "Server.CreateObject Failed", "Server object error "ASP 0177 : 8000ffff" or similar errors:
' Please try the following:
' - Open IIS
' - Find application pools (DefaultAppPool is used by default)
' - Open its properties and check .NET CLR version selected:
' - if you have .NET 1.1 then change to .NET CLR 2.00
' - if you have .NET CLR 2.00 then try to change to .NET CLR 4.0

Set extractor = CreateObject("Bytescout.PDFExtractor.TextExtractor")
extractor.RegistrationName = "demo"
extractor.RegistrationKey = "demo"

' Load sample PDF document
extractor.LoadDocumentFromFile("..\..\sample2.pdf")

' Get the extracted text as a string
outputText = extractor.GetText()

' Release the extractor now: Response.End stops script processing, so any
' cleanup placed after it would never run.
Set extractor = Nothing

' Set the content type once through the ContentType property. AddHeader
' appends rather than replaces, so also adding "Content-Type" through it
' would send the header twice.
' NOTE(review): "application/text" is not a standard MIME type; consider
' "text/plain" if the text should render inline in the browser.
Response.ContentType = "application/text"

' set the content disposition
Response.AddHeader "Content-Disposition", "inline;filename=HelloWorld.text"

' write the output text
Response.Write outputText
Response.End
%>