How to index all PDF files in a directory in C#, VB.NET and VBScript using PDF Extractor SDK - ByteScout

How to index all PDF files in a directory in C#, VB.NET and VBScript using PDF Extractor SDK

  • Home
  • /
  • Articles
  • /
  • How to index all PDF files in a directory in C#, VB.NET and VBScript using PDF Extractor SDK

ByteScout PDF Extractor SDK can be used to index all PDF files in a directory. You can include various metadata in the index, such as the file name, page count, author, title, producer, and more.

Use the sample source code below to list all PDF files in a directory in C#, VB.NET and VBScript.

Select your programming language:

How to index PDF files in C#

using System;
using System.IO;
using Bytescout.PDFExtractor;

namespace IndexPDFFiles
{
	/// <summary>
	/// Console sample that lists every PDF file in a directory and prints its
	/// document metadata followed by the first two lines of text from page index 0.
	/// </summary>
	class Program
	{
		/// <summary>
		/// Enumerates "*.pdf" files in the directory four levels above the working
		/// directory and writes an index entry for each one to the console.
		/// </summary>
		/// <param name="args">Command-line arguments (not used).</param>
		static void Main(string[] args)
		{
			// The extractors keep the loaded document open, so they are wrapped in
			// using blocks to guarantee they are released even if a file fails to load.
			using (InfoExtractor infoExtractor = new InfoExtractor())
			using (TextExtractor textExtractor = new TextExtractor())
			{
				infoExtractor.RegistrationName = "demo";
				infoExtractor.RegistrationKey = "demo";

				textExtractor.RegistrationName = "demo";
				textExtractor.RegistrationKey = "demo";

				// List all PDF files in directory
				foreach (string file in Directory.GetFiles(@"..\..\..\..", "*.pdf"))
				{
					// Document-level metadata
					infoExtractor.LoadDocumentFromFile(file);

					Console.WriteLine("File Name:      " + Path.GetFileName(file));
					Console.WriteLine("Page Count:     " + infoExtractor.GetPageCount());
					Console.WriteLine("Author:         " + infoExtractor.Author);
					Console.WriteLine("Title:          " + infoExtractor.Title);
					Console.WriteLine("Producer:       " + infoExtractor.Producer);
					Console.WriteLine("Subject:        " + infoExtractor.Subject);
					Console.WriteLine("CreationDate:   " + infoExtractor.CreationDate);
					Console.WriteLine("Text (2 lines): ");

					// Short text preview taken from page index 0
					textExtractor.LoadDocumentFromFile(file);
					using (StringReader stringReader = new StringReader(textExtractor.GetTextFromPage(0)))
					{
						// ReadLine() returns null when the text has fewer lines;
						// Console.WriteLine then just emits an empty line.
						Console.WriteLine(stringReader.ReadLine());
						Console.WriteLine(stringReader.ReadLine());
					}
					Console.WriteLine();
				}
			}

			// Keep the console window open until the user confirms
			Console.WriteLine();
			Console.WriteLine("Press any key to continue...");
			Console.ReadLine();
		}
	}
}

How to index PDF files in Visual Basic .NET

Imports System.IO
Imports Bytescout.PDFExtractor

''' <summary>
''' Console sample that lists every PDF file in a directory and prints its
''' document metadata followed by the first two lines of text from page index 0.
''' </summary>
Class Program
    ''' <summary>
    ''' Enumerates "*.pdf" files in the directory four levels above the working
    ''' directory and writes an index entry for each one to the console.
    ''' </summary>
    ''' <param name="args">Command-line arguments (not used).</param>
    Friend Shared Sub Main(ByVal args As String())

        ' The extractors keep the loaded document open, so they are wrapped in
        ' Using blocks to guarantee they are released even if a file fails to load.
        Using infoExtractor As New InfoExtractor()
            infoExtractor.RegistrationName = "demo"
            infoExtractor.RegistrationKey = "demo"

            Using textExtractor As New TextExtractor()
                textExtractor.RegistrationName = "demo"
                textExtractor.RegistrationKey = "demo"

                ' List all PDF files in directory
                For Each file As String In Directory.GetFiles("..\..\..\..", "*.pdf")
                    ' Document-level metadata
                    infoExtractor.LoadDocumentFromFile(file)

                    Console.WriteLine("File Name:      " & Path.GetFileName(file))
                    Console.WriteLine("Page Count:     " & infoExtractor.GetPageCount())
                    Console.WriteLine("Author:         " & infoExtractor.Author)
                    Console.WriteLine("Title:          " & infoExtractor.Title)
                    Console.WriteLine("Producer:       " & infoExtractor.Producer)
                    Console.WriteLine("Subject:        " & infoExtractor.Subject)
                    Console.WriteLine("CreationDate:   " & infoExtractor.CreationDate)
                    Console.WriteLine("Text (2 lines): ")

                    ' Short text preview taken from page index 0
                    textExtractor.LoadDocumentFromFile(file)
                    Using stringReader As New StringReader(textExtractor.GetTextFromPage(0))
                        ' ReadLine() returns Nothing when the text has fewer lines;
                        ' Console.WriteLine then just emits an empty line.
                        Console.WriteLine(stringReader.ReadLine())
                        Console.WriteLine(stringReader.ReadLine())
                    End Using
                    Console.WriteLine()
                Next
            End Using
        End Using

        ' Keep the console window open until the user confirms
        Console.WriteLine()
        Console.WriteLine("Press any key to continue...")
        Console.ReadLine()
    End Sub
End Class

How to index PDF files in VBScript (Visual Basic 6)

' Writes an index of the PDF files found in the "..\.." folder to output.txt:
' document metadata for each file plus up to 200 characters of text from page index 0.

' Metadata reader, registered with the demo credentials
Set infoExtractor = CreateObject("Bytescout.PDFExtractor.InfoExtractor")
infoExtractor.RegistrationName = "demo"
infoExtractor.RegistrationKey = "demo"

' Text reader, registered with the demo credentials
Set textExtractor = CreateObject("Bytescout.PDFExtractor.TextExtractor")
textExtractor.RegistrationName = "demo"
textExtractor.RegistrationKey = "demo"

' File system helper used for folder listing, name parsing and the report file
Set fileSystem = CreateObject("Scripting.FileSystemObject")

' Collect the files of the folder to be indexed
Set sourceFolder = fileSystem.GetFolder("..\..")
Set folderEntries = sourceFolder.Files

' Report file that receives the index
Set report = fileSystem.CreateTextFile("output.txt")

For Each entry In folderEntries

	' Only PDF files are indexed; the extension check ignores letter case
	If UCase(fileSystem.GetExtensionName(entry.Path)) = "PDF" Then

		' Document-level metadata
		infoExtractor.LoadDocumentFromFile entry.Path
		report.WriteLine "File Name:    " & fileSystem.GetFileName(entry.Path)
		report.WriteLine "Page Count:   " & infoExtractor.GetPageCount()
		report.WriteLine "Author:       " & infoExtractor.Author
		report.WriteLine "Title:        " & infoExtractor.Title
		report.WriteLine "Producer:     " & infoExtractor.Producer
		report.WriteLine "Subject:      " & infoExtractor.Subject
		report.WriteLine "CreationDate: " & infoExtractor.CreationDate

		' Short text preview, written only when page index 0 yields any text
		textExtractor.LoadDocumentFromFile entry.Path
		pageText = textExtractor.GetTextFromPage(0)

		If Len(pageText) > 0 Then
			report.WriteLine "Text (a bit): "
			report.WriteLine Left(pageText, 200)
		End If

		' Two blank lines separate consecutive index entries
		report.WriteBlankLines 2
	End If

Next

report.Close

' Release the COM objects
Set infoExtractor = Nothing
Set textExtractor = Nothing
Set fileSystem = Nothing

Tutorials:

prev
next