How to index all PDF files in a directory in C#, VB.NET and VBScript using PDF Extractor SDK - ByteScout

How to index all PDF files in a directory in C#, VB.NET and VBScript using PDF Extractor SDK

  • Home
  • /
  • Articles
  • /
  • How to index all PDF files in a directory in C#, VB.NET and VBScript using PDF Extractor SDK

ByteScout PDF Extractor SDK can be used to index all PDF files in a directory. You can include various metadata into the index, such as File Name, Page Count, Author, Title, Producer and others.

Use the sample source code below to index all PDF files in a directory and output their metadata in C#, VB.NET and VBScript.

Select your programming language:

How to index PDF files in C#

[vb]
using System;
using System.IO;
using Bytescout.PDFExtractor;

namespace IndexPDFFiles
{
    /// <summary>
    /// Console sample: for every *.pdf file returned by
    /// Directory.GetFiles(@"..\..\..\..", "*.pdf") it prints the document
    /// metadata followed by the first two text lines of the first page.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Entry point. Lists the PDF files, prints their info, then waits for Enter.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Both extractors are created once and reused for every file.
            // The using statements guarantee they are disposed (and the last
            // loaded document released) even if an exception is thrown.

            // Create Bytescout.PDFExtractor.InfoExtractor instance
            using (InfoExtractor infoExtractor = new InfoExtractor())
            // Create Bytescout.PDFExtractor.TextExtractor instance
            using (TextExtractor textExtractor = new TextExtractor())
            {
                infoExtractor.RegistrationName = "demo";
                infoExtractor.RegistrationKey = "demo";

                textExtractor.RegistrationName = "demo";
                textExtractor.RegistrationKey = "demo";

                // List all PDF files in directory
                foreach (string file in Directory.GetFiles(@"..\..\..\..", "*.pdf"))
                {
                    infoExtractor.LoadDocumentFromFile(file);

                    Console.WriteLine("File Name: " + Path.GetFileName(file));
                    Console.WriteLine("Page Count: " + infoExtractor.GetPageCount());
                    Console.WriteLine("Author: " + infoExtractor.Author);
                    Console.WriteLine("Title: " + infoExtractor.Title);
                    Console.WriteLine("Producer: " + infoExtractor.Producer);
                    Console.WriteLine("Subject: " + infoExtractor.Subject);
                    Console.WriteLine("CreationDate: " + infoExtractor.CreationDate);
                    Console.WriteLine("Text (2 lines): ");

                    textExtractor.LoadDocumentFromFile(file);

                    // Page index 0 is the first page; ReadLine() returns null when
                    // the text has fewer than two lines, which prints an empty line.
                    using (StringReader stringReader = new StringReader(textExtractor.GetTextFromPage(0)))
                    {
                        Console.WriteLine(stringReader.ReadLine());
                        Console.WriteLine(stringReader.ReadLine());
                    }

                    Console.WriteLine();
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press any key to continue…");
            Console.ReadLine();
        }
    }
}
[/vb]

How to index PDF files in Visual Basic .NET

[vb]
Imports System.IO
Imports Bytescout.PDFExtractor

''' <summary>
''' Console sample: for every *.pdf file returned by
''' Directory.GetFiles("..\..\..\..", "*.pdf") it prints the document
''' metadata followed by the first two text lines of the first page.
''' </summary>
Class Program
    ''' <summary>
    ''' Entry point. Lists the PDF files, prints their info, then waits for Enter.
    ''' </summary>
    ''' <param name="args">Command-line arguments (not used).</param>
    Friend Shared Sub Main(ByVal args As String())

        ' Both extractors are created once and reused for every file.
        ' The Using blocks guarantee they are disposed (and the last loaded
        ' document released) even if an exception is thrown.

        ' Create Bytescout.PDFExtractor.InfoExtractor instance
        Using infoExtractor As New InfoExtractor()
            infoExtractor.RegistrationName = "demo"
            infoExtractor.RegistrationKey = "demo"

            ' Create Bytescout.PDFExtractor.TextExtractor instance
            Using textExtractor As New TextExtractor()
                textExtractor.RegistrationName = "demo"
                textExtractor.RegistrationKey = "demo"

                ' List all PDF files in directory
                For Each file As String In Directory.GetFiles("..\..\..\..", "*.pdf")
                    infoExtractor.LoadDocumentFromFile(file)

                    Console.WriteLine("File Name: " & Path.GetFileName(file))
                    Console.WriteLine("Page Count: " & infoExtractor.GetPageCount())
                    Console.WriteLine("Author: " & infoExtractor.Author)
                    Console.WriteLine("Title: " & infoExtractor.Title)
                    Console.WriteLine("Producer: " & infoExtractor.Producer)
                    Console.WriteLine("Subject: " & infoExtractor.Subject)
                    Console.WriteLine("CreationDate: " & infoExtractor.CreationDate)
                    Console.WriteLine("Text (2 lines): ")

                    textExtractor.LoadDocumentFromFile(file)

                    ' Page index 0 is the first page; ReadLine() returns Nothing when
                    ' the text has fewer than two lines, which prints an empty line.
                    Using stringReader As New StringReader(textExtractor.GetTextFromPage(0))
                        Console.WriteLine(stringReader.ReadLine())
                        Console.WriteLine(stringReader.ReadLine())
                    End Using

                    Console.WriteLine()
                Next
            End Using
        End Using

        Console.WriteLine()
        Console.WriteLine("Press any key to continue…")
        Console.ReadLine()
    End Sub
End Class
[/vb]

How to index PDF files in VBScript (Visual Basic 6)

[vb]
' Indexes the PDF files found in the folder "..\.." and writes, for each one,
' its metadata plus up to the first 200 characters of the first page's text
' to "output.txt".
'
' NOTE: comments must start with the plain ASCII apostrophe ('). The
' typographic quote that web pages often substitute for it is not a comment
' marker in VBScript and makes the script fail to parse.

' Create Bytescout.PDFExtractor.InfoExtractor object
Set infoExtractor = CreateObject("Bytescout.PDFExtractor.InfoExtractor")
infoExtractor.RegistrationName = "demo"
infoExtractor.RegistrationKey = "demo"

' Create Bytescout.PDFExtractor.TextExtractor object
Set textExtractor = CreateObject("Bytescout.PDFExtractor.TextExtractor")
textExtractor.RegistrationName = "demo"
textExtractor.RegistrationKey = "demo"

' Create File System object
Set FSO = CreateObject("Scripting.FileSystemObject")

' Get folder object
Set objFolder = FSO.GetFolder("..\..")
' Get file list
Set files = objFolder.Files

' Create output file
Set TS = FSO.CreateTextFile("output.txt")

For Each file In files

    ' Compare the extension case-insensitively so ".pdf" and ".PDF" both match
    ext = UCase(FSO.GetExtensionName(file.Path))
    If ext = "PDF" Then

        ' Pass the path string explicitly instead of relying on the
        ' File object's default property
        infoExtractor.LoadDocumentFromFile file.Path
        TS.WriteLine "File Name: " & FSO.GetFileName(file.Path)
        TS.WriteLine "Page Count: " & infoExtractor.GetPageCount()
        TS.WriteLine "Author: " & infoExtractor.Author
        TS.WriteLine "Title: " & infoExtractor.Title
        TS.WriteLine "Producer: " & infoExtractor.Producer
        TS.WriteLine "Subject: " & infoExtractor.Subject
        TS.WriteLine "CreationDate: " & infoExtractor.CreationDate

        ' Page index 0 is the first page
        textExtractor.LoadDocumentFromFile file.Path
        text = textExtractor.GetTextFromPage(0)

        ' Only emit the text section when the page actually produced text
        If Len(text) > 0 Then
            TS.WriteLine "Text (a bit): "
            TS.WriteLine Mid(text, 1, 200)
        End If

        TS.WriteBlankLines 2
    End If

Next

TS.Close

' Release all object references
Set TS = Nothing
Set files = Nothing
Set objFolder = Nothing
Set infoExtractor = Nothing
Set textExtractor = Nothing
Set FSO = Nothing

Tutorials:

prev
next