This example demonstrates how to batch process PDF files using Bytescout PDF Extractor SDK. Five code snippets are listed below (ASP.NET, C#, Delphi, VB.NET, and VBScript).
You may also find this article helpful: it shows how to reduce memory usage (when processing huge PDF files) by disabling page data caching in C#, VB.NET, and VBScript.
Select your programming language:
using System;
using System.IO;
using Bytescout.PDFExtractor;
namespace BatchProcessing
{
    /// <summary>
    /// ASP.NET page that extracts the text of every PDF file found in the
    /// application's <c>bin</c> folder and writes it to the HTTP response.
    /// </summary>
    public partial class _Default : System.Web.UI.Page
    {
        /// <summary>
        /// Handles the page Load event: enumerates the <c>*.pdf</c> files in the
        /// mapped <c>.\bin</c> folder, writes the extracted text of each one to
        /// the response output stream, then ends the response.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            // Directory containing test files
            string inputFolder = Server.MapPath(@".\bin");

            // The extractor is wrapped in a using statement so that it is
            // disposed deterministically on every request, including when a
            // document fails to load or the response is aborted.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey = "demo";

                Response.Clear();
                Response.ContentType = "text/html";

                // Get PDF files
                string[] pdfFiles = Directory.GetFiles(inputFolder, "*.pdf");

                foreach (string file in pdfFiles)
                {
                    // Load document
                    extractor.LoadDocumentFromFile(file);

                    // Save extracted text to output stream
                    extractor.SaveTextToStream(Response.OutputStream);

                    // Reset the extractor before loading another file
                    extractor.Reset();
                }
            }

            // End the response only after the extractor has been released.
            Response.End();
        }
    }
}
using System.IO;
using Bytescout.PDFExtractor;
namespace BatchProcessing
{
    /// <summary>
    /// Console sample that converts every PDF file in the current directory
    /// to a text file with the same name and a <c>.txt</c> extension.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Entry point: enumerates <c>*.pdf</c> files in the current directory
        /// and saves the extracted text of each one next to the source file.
        /// </summary>
        static void Main()
        {
            // The using statement guarantees the extractor is disposed even if
            // loading or saving one of the documents throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey = "demo";

                // Get PDF files
                string[] pdfFiles = Directory.GetFiles(".", "*.pdf");

                foreach (string file in pdfFiles)
                {
                    // Load document
                    extractor.LoadDocumentFromFile(file);

                    // Save extracted text to .txt file
                    extractor.SaveTextToFile(Path.ChangeExtension(file, ".txt"));

                    // Reset the extractor before loading another file
                    extractor.Reset();
                }
            }
        }
    }
}
program Project1;
{$APPTYPE CONSOLE}
{
IMPORTANT:
To work with Bytescout PDF Extractor SDK you need to import this as a component into Delphi
To import Bytescout PDF Extractor SDK into Delphi 2006 or higher do the following:
1) Click Component | Import Component..
2) Select Type Library and click Next
3) Find and select Bytescout PDF Extractor SDK in the list of available type libraries
4) Click Next
5) Click Next on next screen
6) Select "Add Bytescout_PDFExtractor_TLB.pas into Project" and click Finish
This will add Bytescout_PDFExtractor_TLB.pas into your project and now you can use TextExtractor, InfoExtractor, CSVExtractor, XMLExtractor, ImageExtractor object interfaces (_TextExtractor, _InfoExtractor, _CSVExtractor, _XMLExtractor, _ImageExtractor classes)
For Delphi 5,6,7,8 / C++ Builder 5,6,7,8 (for 2006 or higher versions please see above)
1) Start Delphi (or C++ Builder)
2) Select Component menu and "Import ActiveX control.."
3) Find the library in the list of available ActiveX/COM objects
4) Select this library and click "Install"
5) Create a new package for this library imported (for example, TPDFExtractorSDKActiveX)
6) Click OK
7) Answer "Yes" when Delphi (or C++ Builder) asks to rebuild the package
8) The IDE will rebuild the package and will inform that the control has been installed. Close the package and answer "Yes" to save changes
9) The library object is now available on the "ActiveX" tab of the Tool Palette. You can simply drag and drop it onto the form in your application and use it
}
uses
SysUtils,
ActiveX,
Bytescout_PDFExtractor_TLB in 'c:\program files\borland\bds\4.0\Imports\Bytescout_PDFExtractor_TLB.pas';
var
  extractor: _CSVExtractor;

// Main program: converts two sample PDF documents to CSV files using a single
// CSVExtractor COM object, resetting it between documents.
begin
  CoInitialize(nil);
  try
    // Create Bytescout.PDFExtractor.CSVExtractor object using CoCSVExtractor class
    extractor := CoCSVExtractor.Create();
    extractor.RegistrationName := 'demo';
    extractor.RegistrationKey := 'demo';

    // Load sample PDF document
    extractor.LoadDocumentFromFile('../../sample3.pdf');

    // extractor.CSVSeparatorSymbol := ','; // you can change the CSV separator symbol from "," to another one if needed for non-US locales

    extractor.SaveCSVToFile('output.csv');

    // Reset the extractor so that another file can be loaded
    extractor.Reset();

    // Now load another sample PDF document
    extractor.LoadDocumentFromFile('../../sample2.pdf');
    extractor.SaveCSVToFile('output2.csv');
  finally
    // Release the COM interface first, then balance the CoInitialize call.
    // This runs even if loading or saving a document raised an exception.
    extractor := nil;
    CoUninitialize;
  end;
end.
Imports Bytescout.PDFExtractor
Imports System.IO
''' <summary>
''' Console sample that converts every PDF file in the current directory
''' to a text file with the same name and a .txt extension.
''' </summary>
Module Module1

    ''' <summary>
    ''' Entry point: enumerates *.pdf files in the current directory and saves
    ''' the extracted text of each one next to the source file.
    ''' </summary>
    Sub Main()

        ' The Using block guarantees the extractor is disposed even if
        ' loading or saving one of the documents throws.
        Using extractor As New TextExtractor()
            extractor.RegistrationName = "demo"
            extractor.RegistrationKey = "demo"

            ' Get PDF files
            Dim pdfFiles As String() = Directory.GetFiles(".", "*.pdf")

            For Each file As String In pdfFiles
                ' Load document
                extractor.LoadDocumentFromFile(file)

                ' Save extracted text to .txt file
                extractor.SaveTextToFile(Path.ChangeExtension(file, ".txt"))

                ' Reset the extractor before loading another file
                extractor.Reset()
            Next
        End Using

    End Sub

End Module
' Converts every PDF file found in the "..\.." folder to a text file named
' after the source file with a .txt extension.

' Create Bytescout.PDFExtractor.TextExtractor object
Set extractor = CreateObject("Bytescout.PDFExtractor.TextExtractor")
extractor.RegistrationName = "demo"
extractor.RegistrationKey = "demo"

' Get all files in folder
Set objFSO = CreateObject("Scripting.FileSystemObject")
Set objFolder = objFSO.GetFolder("..\..")
Set colFiles = objFolder.Files

' Convert every PDF file to text
For Each objFile In colFiles
    ' Compare the extension case-insensitively so that files such as
    ' "REPORT.PDF" are processed too
    If LCase(objFSO.GetExtensionName(objFile.Name)) = "pdf" Then
        ' Load PDF file
        extractor.LoadDocumentFromFile objFile.Path

        ' Save extracted text to .txt file. GetBaseName strips only the final
        ' extension, so a name like "a.pdf.copy.pdf" becomes "a.pdf.copy.txt"
        ' instead of having every ".pdf" occurrence replaced. The output is
        ' still a bare file name, as in the original script.
        extractor.SaveTextToFile objFSO.GetBaseName(objFile.Name) & ".txt"

        ' Reset the extractor before loading another file
        extractor.Reset
    End If
Next