ByteScout PDF Suite – VB.NET – Make Large PDF Document Searchable Using Parallel Processing

Home
/
Articles
/
ByteScout PDF Suite – VB.NET – Make Large PDF Document Searchable Using Parallel Processing

printable version:
ByteScout-PDF-Suite-VB-NET-Make-Large-PDF-Document-Searchable-Using-Parallel-Processing.pdf

How to make a large PDF document searchable using parallel processing in VB.NET and ByteScout PDF Suite

This how-to tutorial provides VB.NET code that shows how to make a large PDF document searchable using parallel processing.

Source code documentation samples provide a simple and easy way to add a needed feature to your application. ByteScout PDF Suite is a bundle that provides six different SDK libraries for working with PDF, from generating rich PDF reports to extracting data from PDF documents and converting them to HTML. This bundle includes PDF (Generator) SDK, PDF Renderer SDK, PDF Extractor SDK, PDF to HTML SDK, PDF Viewer SDK and PDF Generator SDK for Javascript. It can make a large PDF document searchable using parallel processing in VB.NET.

The following code snippet for ByteScout PDF Suite works best when you need to quickly make a large PDF document searchable using parallel processing in your VB.NET application. Just copy and paste the code into your VB.NET application’s code and follow the instructions. Complete and detailed tutorials and documentation are available along with the installed ByteScout PDF Suite if you’d like to learn more about the topic and the details of the API.

All these programming tutorials along with source code samples and ByteScout free trial version are available for download from our website.

On-demand (REST Web API) version:
Web API (on-demand version)

On-premise offline SDK for Windows:
60 Day Free Trial (on-premise)

Module1.vb

      Imports System.IO
Imports System.Threading
Imports Bytescout.PDFExtractor

''' <summary>
''' Console sample that makes a large PDF document searchable by splitting it into
''' fixed-size page ranges, running OCR on each range on a thread-pool thread, and
''' merging the processed pieces back into a single output file.
''' </summary>
Module Module1

    ' Number of pages handed to each worker.
    Private Const PagesPerPiece As Integer = 10

    ' Limit to 4 threads in queue.
    ' Set this value to number of your processor cores for max performance.
    Dim ThreadLimiter As Semaphore = New Semaphore(4, 4)

    ''' <summary>
    ''' Entry point: reads the page count of "sample.pdf", queues one worker per
    ''' page range, waits for all workers, merges the resulting pieces into
    ''' "result.pdf" and removes the temporary piece files.
    ''' </summary>
    <MTAThread>
    Sub Main()

        Const inputFile As String = "sample.pdf"
        Const resultFile As String = "result.pdf"

        Dim pageCount As Integer

        ' Get document page count
        Using infoExtractor As New InfoExtractor()
            infoExtractor.LoadDocumentFromFile(inputFile)
            pageCount = infoExtractor.GetPageCount()
        End Using

        ' Nothing to split, process or merge for an empty document.
        If pageCount <= 0 Then
            Console.WriteLine("The document contains no pages; nothing to process.")
            Return
        End If

        ' Process the document by fixed-size pieces (ceiling division).
        Dim pieceCount As Integer = (pageCount + PagesPerPiece - 1) \ PagesPerPiece

        Dim doneEvents(pieceCount - 1) As ManualResetEvent
        Dim pieces(pieceCount - 1) As String
        Dim stopwatch As Stopwatch = Stopwatch.StartNew()

        ' Run threads
        For i As Integer = 0 To pieceCount - 1

            ' Wait for the queue: blocks until one of the semaphore slots is free.
            ThreadLimiter.WaitOne()

            ' Zero-based, inclusive page range of this piece.
            Dim startPage As Integer = i * PagesPerPiece
            Dim endPage As Integer = Math.Min(pageCount - 1, startPage + PagesPerPiece - 1)

            ' NOTE(review): when the whole document fits in a single piece the last
            ' page is excluded from the range (and a one-page document ends up with
            ' endPage = -1). Kept as in the original sample because the reason is not
            ' visible here - confirm against DocumentSplitter.ExtractPageRange.
            If pieceCount = 1 Then
                endPage = endPage - 1
            End If

            doneEvents(i) = New ManualResetEvent(False)
            pieces(i) = String.Format("temp-{0}-{1}.pdf", startPage, endPage)

            ThreadPool.QueueUserWorkItem(
                New WaitCallback(AddressOf ThreadProc),
                New Object() {i, doneEvents(i), inputFile, pieces(i), startPage, endPage})

        Next

        ' Wait for all threads. The events are awaited one by one because
        ' WaitHandle.WaitAll accepts at most 64 handles, which would fail for
        ' documents that are split into more than 64 pieces.
        For Each doneEvent As ManualResetEvent In doneEvents
            doneEvent.WaitOne()
            doneEvent.Close()
        Next

        Try
            ' Merge pieces
            Using merger As New DocumentMerger
                merger.Merge(pieces, resultFile)
            End Using
        Finally
            ' Delete temp files even when merging fails.
            ' File.Delete does not throw if a file does not exist.
            For Each tempFile As String In pieces
                File.Delete(tempFile)
            Next
        End Try

        Console.WriteLine("All done in {0}.", stopwatch.Elapsed)
        Console.WriteLine()

        Console.WriteLine("Press any key to exit...")
        Console.ReadKey()

    End Sub

    ''' <summary>
    ''' Thread-pool callback that extracts one page range from the input document,
    ''' runs OCR on it and writes the searchable result to the given output file.
    ''' </summary>
    ''' <param name="stateInfo">
    ''' An Object array with six items: thread index (Integer), completion event
    ''' (ManualResetEvent), input file path (String), output file path (String),
    ''' zero-based start page (Integer) and zero-based end page (Integer).
    ''' </param>
    Sub ThreadProc(ByVal stateInfo As Object)

        Dim args As Object() = DirectCast(stateInfo, Object())

        Dim threadIndex As Integer = CInt(args(0))
        Dim waitEvent As ManualResetEvent = DirectCast(args(1), ManualResetEvent)
        Dim inputFile As String = CStr(args(2))
        Dim outputFile As String = CStr(args(3))
        Dim startPage As Integer = CInt(args(4))
        Dim endPage As Integer = CInt(args(5))

        Try

            Console.WriteLine("Thread #{0} started with the page range from {1} to {2}.", threadIndex, startPage, endPage)

            Dim stopwatch As Stopwatch = Stopwatch.StartNew()

            ' Intermediate file holding the raw extracted page range.
            ' It has no extension, so it does not clash with the ".pdf" output file.
            Dim chunk As String = String.Format("temp-{0}-{1}", startPage, endPage)

            Try

                ' Extract a piece of document.
                ' The +1 shifts the zero-based range to the values passed to the splitter.
                Using splitter As New DocumentSplitter
                    splitter.ExtractPageRange(inputFile, chunk, startPage + 1, endPage + 1)
                End Using

                ' Process the piece
                Using searchablePdfMaker As New SearchablePDFMaker("demo", "demo")

                    searchablePdfMaker.OCRDetectPageRotation = True
                    searchablePdfMaker.OCRLanguageDataFolder = "C:\Program Files\Bytescout PDF Extractor SDK\net4.00\tessdata"
                    searchablePdfMaker.LoadDocumentFromFile(chunk)

                    ' 300 DPI resolution is recommended.
                    ' Higher values slow down the processing but do not guarantee higher quality.
                    searchablePdfMaker.OCRResolution = 300

                    searchablePdfMaker.MakePDFSearchable(outputFile)

                End Using

            Finally

                ' Remove the intermediate chunk whether or not processing succeeded.
                File.Delete(chunk)

            End Try

            Console.WriteLine("Thread #{0} finished in {1}.", threadIndex, stopwatch.Elapsed)

        Finally

            ' Signal the thread is finished
            waitEvent.Set()

            ' Release semaphore so the main thread can queue the next piece
            ThreadLimiter.Release()

        End Try

    End Sub

End Module