The sample source code below shows how to extract text from a PDF file in ASP.NET, Classic ASP, C++, C#, VB.NET, and VBScript using the ByteScout PDF Extractor SDK.
Also, here is an article that shows how to extract text from PDF by pages.
Select your programming language:
using System;
using System.Data;
using System.Configuration;
using System.Collections;
using System.IO;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using Bytescout.PDFExtractor;
namespace ExtractAllText
{
    /// <summary>
    /// ASP.NET page that extracts all text from a sample PDF document and
    /// writes it directly to the HTTP response.
    /// </summary>
    public partial class _Default : System.Web.UI.Page
    {
        /// <summary>
        /// Loads "sample2.pdf" from the application directory, extracts its text
        /// and streams the result to the client as plain text.
        /// </summary>
        /// <param name="sender">The event source (unused).</param>
        /// <param name="e">The event data (unused).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            // This test file will be copied to the project directory on the pre-build event (see the project properties).
            String inputFile = Server.MapPath("sample2.pdf");

            // The using block guarantees the extractor is disposed even if
            // loading or extraction throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(inputFile);

                Response.Clear();
                // The extracted text is raw text, not markup: serve it as text/plain
                // so the browser never interprets PDF content as HTML.
                Response.ContentType = "text/plain";

                // Save extracted text to output stream
                extractor.SaveTextToStream(Response.OutputStream);
            }

            // End the response only after the extractor has been released.
            Response.End();
        }
    }
}
using System;
using Bytescout.PDFExtractor;
namespace ExtractAllText
{
    /// <summary>
    /// Console sample: extracts all text from "sample2.pdf" into "output.txt"
    /// and opens the result in the default associated application.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Application entry point.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // The using block releases the extractor before the output file
            // is handed to another application, and also on failure.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample2.pdf");

                // Save extracted text to file
                extractor.SaveTextToFile("output.txt");
            }

            // Open output file in default associated application.
            // UseShellExecute is set explicitly because opening a document
            // (rather than an executable) requires the OS shell, and the
            // default value of this property differs between .NET runtimes.
            System.Diagnostics.ProcessStartInfo startInfo = new System.Diagnostics.ProcessStartInfo("output.txt");
            startInfo.UseShellExecute = true;
            System.Diagnostics.Process.Start(startInfo);
        }
    }
}
#include "stdafx.h"
#include "comip.h"
#import "c:\\Program Files\\Bytescout PDF Extractor SDK\\net4.00\\Bytescout.PDFExtractor.tlb" raw_interfaces_only
using namespace Bytescout_PDFExtractor;
int _tmain(int argc, _TCHAR* argv[])
{
// Initialize COM.
HRESULT hr = CoInitializeEx(NULL, COINIT_APARTMENTTHREADED);
// Create the interface pointer.
_TextExtractorPtr pITextExtractor(__uuidof(TextExtractor));
// Set the registration name and key
// Note: You should use _bstr_t or BSTR to pass string to the library because of COM requirements
_bstr_t bstrRegName(L"DEMO");
pITextExtractor->put_RegistrationName(bstrRegName);
_bstr_t bstrRegKey(L"DEMO");
pITextExtractor->put_RegistrationKey(bstrRegKey);
// Load sample PDF document
_bstr_t bstrPath(L"..\\..\\sample3.pdf");
pITextExtractor->LoadDocumentFromFile(bstrPath);
// Save extracted text to file
_bstr_t bstrOutputFile(L"output.txt");
pITextExtractor->SaveTextToFile(bstrOutputFile);
pITextExtractor->Release();
CoUninitialize();
return 0;
}
Imports Bytescout.PDFExtractor
''' <summary>
''' Console sample: extracts all text from "sample2.pdf" into "output.txt"
''' and opens the result in the default associated application.
''' </summary>
Class Program
    ''' <summary>
    ''' Application entry point.
    ''' </summary>
    ''' <param name="args">Command-line arguments (unused).</param>
    Friend Shared Sub Main(args As String())
        ' The Using block releases the extractor before the output file is
        ' handed to another application, and also on failure.
        Using extractor As New TextExtractor()
            extractor.RegistrationName = "demo"
            extractor.RegistrationKey = "demo"

            ' Load sample PDF document
            extractor.LoadDocumentFromFile("sample2.pdf")

            ' Save extracted text to file
            extractor.SaveTextToFile("output.txt")
        End Using

        ' Open output file in default associated application.
        ' UseShellExecute is set explicitly because opening a document (rather
        ' than an executable) requires the OS shell, and the default value of
        ' this property differs between .NET runtimes.
        Dim startInfo As New System.Diagnostics.ProcessStartInfo("output.txt")
        startInfo.UseShellExecute = True
        System.Diagnostics.Process.Start(startInfo)
    End Sub
End Class
' Extracts all text from a sample PDF into "output.txt" and opens the result.
Dim pdfExtractor, wshShell

' Instantiate the Bytescout text extractor COM object and register it
Set pdfExtractor = CreateObject("Bytescout.PDFExtractor.TextExtractor")
With pdfExtractor
    .RegistrationName = "demo"
    .RegistrationKey = "demo"
End With

' Read the sample PDF document, then write its text to the output file
Call pdfExtractor.LoadDocumentFromFile("..\..\sample2.pdf")
Call pdfExtractor.SaveTextToFile("output.txt")

' Launch the output file in its associated application without waiting for it
Set wshShell = CreateObject("WScript.Shell")
wshShell.Run "output.txt", 1, False

' Drop the COM references
Set wshShell = Nothing
Set pdfExtractor = Nothing
<%
' Extracts all text from a sample PDF document and returns it to the client
' as a plain-text response.
'
' In case of "Server.CreateObject Failed", "Server object error 'ASP 0177 : 8000ffff'" or similar errors,
' please try the following:
' - Open IIS
' - Find application pools (DefaultAppPool is used by default)
' - Open its properties and check .NET CLR version selected:
' - if you have .NET 1.1 then change to .NET CLR 2.00
' - if you have .NET CLR 2.00 then try to change to .NET CLR 4.0
Set extractor = CreateObject("Bytescout.PDFExtractor.TextExtractor")
extractor.RegistrationName = "demo"
extractor.RegistrationKey = "demo"
' Load sample PDF document
' NOTE(review): this relative path is presumably resolved against the process
' working directory rather than the page location - confirm for your deployment.
extractor.LoadDocumentFromFile("..\..\sample2.pdf")
' Get the extracted text as a string
outputText = extractor.GetText()
' Release the extractor now: nothing placed after response.End is executed
Set extractor = Nothing
' set the content type once (a second "Content-Type" header would be a duplicate)
response.ContentType = "text/plain"
' set the content disposition
response.AddHeader "Content-Disposition", "inline;filename=HelloWorld.text"
' write the output text
response.Write outputText
response.End
%>