Quickly learn how to index pdf documents in folder with pdf extractor sdk in VB.NET with this sample source code. ByteScout Data Extraction Suite is the set that includes 3 SDK products for data extraction from PDF, scans, images and from spreadsheets: PDF Extractor SDK, Data Extraction SDK, Barcode Reader SDK. It can index pdf documents in folder with pdf extractor sdk in VB.NET.
The following code snippet for ByteScout Data Extraction Suite works best when you need to quickly index pdf documents in folder with pdf extractor sdk in your VB.NET application. Simply copy and paste in your VB.NET project or application you and then run your app! If you want to use these VB.NET sample examples in one or many applications then they can be used easily.
You can download free trial version of ByteScout Data Extraction Suite from our website to see and try many others source code samples for VB.NET.
On-demand (REST Web API) version:
Web API (on-demand version)
On-premise offline SDK for Windows:
60 Day Free Trial (on-premise)
Imports System.IO
Imports Bytescout.PDFExtractor
Module Program
Sub Main()
Try
' Output file list
Dim lstAllFilesInfo = New List(Of FileIndexOutput)()
' Get all files inside directory
Dim allFiles = Directory.GetFiles(".\Files", "*.*")
' Iterate all files, and get details
For Each itmFile In allFiles
' Get basic file information
Dim fileInfo As FileInfo = New FileInfo(itmFile)
' Check whether file is supported
If _IsFileSupported(fileInfo) Then
' Fill file index model
Dim oFileIndex = New FileIndexOutput()
oFileIndex.fileName = fileInfo.Name
oFileIndex.fileDate = fileInfo.CreationTime
oFileIndex.content = _GetFileContent(fileInfo)
' Add to final list
lstAllFilesInfo.Add(oFileIndex)
End If
Next
' Print all output
Console.WriteLine("Total {0} files indexed" & vbLf, lstAllFilesInfo.Count)
For Each itmFileInfo In lstAllFilesInfo
Console.WriteLine("fileName: {0}", itmFileInfo.fileName)
Console.WriteLine("fileDate: {0}", itmFileInfo.fileDate.ToString("MMM dd yyyy hh:mm:ss"))
Console.WriteLine("content: {0}", itmFileInfo.content.Trim())
Console.WriteLine(vbLf)
Next
Catch ex As Exception
Console.WriteLine(("ERROR:" + ex.Message))
End Try
Console.WriteLine("Press any key to exit...")
Console.ReadLine()
End Sub
''' <summary>
''' Get File COntent
''' </summary>
Private Function _GetFileContent(ByVal fileInfo As FileInfo) As String
Dim fileExtension As String = System.IO.Path.GetExtension(fileInfo.FullName)
If fileExtension = ".pdf" Then
Return _GetPdfFileContent(fileInfo)
ElseIf fileExtension = ".png" OrElse fileExtension = ".jpg" Then
Return _GetImageContet(fileInfo)
End If
Throw New Exception("File not supported.")
End Function
''' <summary>
''' Get PDF File Content
''' </summary>
Private Function _GetPdfFileContent(ByVal fileInfo As FileInfo) As String
' Read all file content...
Using textExtractor As TextExtractor = New TextExtractor("demo", "demo")
' Load Document
textExtractor.LoadDocumentFromFile(fileInfo.FullName)
Return textExtractor.GetText()
End Using
End Function
''' <summary>
''' Get Image Contents
''' </summary>
Private Function _GetImageContet(ByVal fileInfo As FileInfo) As String
' Read all file content...
Using extractor As TextExtractor = New TextExtractor()
' Load document
extractor.LoadDocumentFromFile(fileInfo.FullName)
' Set option to repair text
extractor.OCRMode = OCRMode.TextFromImagesAndVectorsAndRepairedFonts
' Enable Optical Character Recognition (OCR)
' in .Auto mode (SDK automatically checks if needs to use OCR or not)
extractor.OCRMode = OCRMode.Auto
' Set the location of OCR language data files
extractor.OCRLanguageDataFolder = "c:\Program Files\Bytescout PDF Extractor SDK\ocrdata\"
' Set OCR language
extractor.OCRLanguage = "eng" '"eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder
' Find more language files at https://github.com/bytescout/ocrdata
' Set PDF document rendering resolution
extractor.OCRResolution = 300
' Read all text
Return extractor.GetText()
End Using
End Function
''' <summary>
''' Check whether file is valid
''' </summary>
Private Function _IsFileSupported(ByVal fileInfo As FileInfo) As Boolean
' Get File Extension
Dim fileExtension As String = Path.GetExtension(fileInfo.Name)
' Check whether file extension is valid
Return (fileExtension = ".pdf" OrElse fileExtension = ".png" OrElse fileExtension = ".jpg")
End Function
''' <summary>
''' FileIndexOutput class
''' </summary>
Public Class FileIndexOutput
Public Property fileName As String
Public Property fileDate As DateTime
Public Property content As String
End Class
End Module
60 Day Free Trial or Visit ByteScout Data Extraction Suite Home Page
Explore ByteScout Data Extraction Suite Documentation
Explore Samples
Sign Up for ByteScout Data Extraction Suite Online Training
Get Your API Key
Explore Web API Docs
Explore Web API Samples
60 Day Free Trial or Visit ByteScout Data Extraction Suite Home Page
Explore ByteScout Data Extraction Suite Documentation
Explore Samples
Sign Up for ByteScout Data Extraction Suite Online Training
Get Your API Key
Explore Web API Docs
Explore Web API Samples
also available as: