RPA Robotic Process Automation - Make Large PDF Document Searchable Using Parallel Processing - C# - ByteScout
Announcement
Our ByteScout SDK products are sunsetting as we focus on expanding new solutions.
Learn More Open modal
Close modal
Announcement Important Update
ByteScout SDK Sunsetting Notice
Our ByteScout SDK products are sunsetting as we focus on our new & improved solutions. Thank you for being part of our journey, and we look forward to supporting you in this next chapter!

RPA Robotic Process Automation – Make Large PDF Document Searchable Using Parallel Processing – C#

  • Home
  • /
  • Articles
  • /
  • RPA Robotic Process Automation – Make Large PDF Document Searchable Using Parallel Processing – C#

How to make large PDF document searchable using parallel processing in C# with ByteScout Robotic Process Automation

What is ByteScout Robotic Process Automation? It is set of integrated APIs for quick replaccement of manual data processing with robotic process automations.

On-demand (REST Web API) version:
 Web API (on-demand version)

On-premise offline SDK for Windows:
 60 Day Free Trial (on-premise)

MultithreadProcessing.csproj

      
<?xml version="1.0" encoding="utf-8"?> <Project ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> <Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" /> <PropertyGroup> <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration> <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform> <ProjectGuid>{0B102DA4-C143-481D-A076-1F56E3CB1CF5}</ProjectGuid> <OutputType>Exe</OutputType> <RootNamespace>MultithreadProcessing</RootNamespace> <AssemblyName>MultithreadProcessing</AssemblyName> <TargetFrameworkVersion>v4.0</TargetFrameworkVersion> <FileAlignment>512</FileAlignment> </PropertyGroup> <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "> <PlatformTarget>AnyCPU</PlatformTarget> <DebugSymbols>true</DebugSymbols> <DebugType>full</DebugType> <Optimize>false</Optimize> <OutputPath>bin\Debug\</OutputPath> <DefineConstants>DEBUG;TRACE</DefineConstants> <ErrorReport>prompt</ErrorReport> <WarningLevel>4</WarningLevel> </PropertyGroup> <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' "> <PlatformTarget>AnyCPU</PlatformTarget> <DebugType>pdbonly</DebugType> <Optimize>true</Optimize> <OutputPath>bin\Release\</OutputPath> <DefineConstants>TRACE</DefineConstants> <ErrorReport>prompt</ErrorReport> <WarningLevel>4</WarningLevel> </PropertyGroup> <ItemGroup> <Reference Include="Bytescout.PDFExtractor, Version=8.6.0.2917, Culture=neutral, PublicKeyToken=f7dd1bd9d40a50eb, processorArchitecture=MSIL"> <SpecificVersion>False</SpecificVersion> <HintPath>C:\Program Files\Bytescout PDF Extractor SDK\net4.00\Bytescout.PDFExtractor.dll</HintPath> </Reference> <Reference Include="Bytescout.PDFExtractor.OCRExtension, Version=8.6.0.2917, Culture=neutral, PublicKeyToken=f7dd1bd9d40a50eb, processorArchitecture=MSIL"> <SpecificVersion>False</SpecificVersion> <HintPath>C:\Program Files\Bytescout PDF Extractor SDK\net4.00\Bytescout.PDFExtractor.OCRExtension.dll</HintPath> </Reference> <Reference Include="System" /> <Reference Include="System.Core" /> <Reference Include="System.Xml.Linq" /> <Reference Include="System.Data" /> <Reference Include="System.Xml" /> </ItemGroup> <ItemGroup> <Compile Include="Program.cs" /> <Compile Include="Properties\AssemblyInfo.cs" /> </ItemGroup> <ItemGroup> <Content Include="sample.pdf"> <CopyToOutputDirectory>Always</CopyToOutputDirectory> </Content> </ItemGroup> <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" /> </Project>

MultithreadProcessing.sln

      
Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio 15 VisualStudioVersion = 15.0.26730.10 MinimumVisualStudioVersion = 10.0.40219.1 Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MultithreadProcessing", "MultithreadProcessing.csproj", "{0B102DA4-C143-481D-A076-1F56E3CB1CF5}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU Release|Any CPU = Release|Any CPU EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution {0B102DA4-C143-481D-A076-1F56E3CB1CF5}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {0B102DA4-C143-481D-A076-1F56E3CB1CF5}.Debug|Any CPU.Build.0 = Debug|Any CPU {0B102DA4-C143-481D-A076-1F56E3CB1CF5}.Release|Any CPU.ActiveCfg = Release|Any CPU {0B102DA4-C143-481D-A076-1F56E3CB1CF5}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {50466307-7059-438B-8545-42FDA71BC1A6} EndGlobalSection EndGlobal

Program.cs

      
using System; using System.Diagnostics; using System.IO; using System.Threading; using Bytescout.PDFExtractor; namespace MultithreadProcessing { class Program { // Limit to 4 threads in queue. // Set this value to number of your processor cores for max performance. private static readonly Semaphore ThreadLimiter = new Semaphore(4, 4); static void Main(string[] args) { const string inputFile = "sample.pdf"; const string resultFile = "result.pdf"; int pageCount; // Get document page count using (var infoExtractor = new InfoExtractor("demo", "demo")) { infoExtractor.LoadDocumentFromFile(inputFile); pageCount = infoExtractor.GetPageCount(); } // Process the document by 10-page pieces int numberOfThreads = pageCount / 10; if (pageCount - numberOfThreads * 10 > 0) numberOfThreads += 1; WaitHandle[] doneEvents = new WaitHandle[numberOfThreads]; Stopwatch stopwatch = Stopwatch.StartNew(); int startPage, endPage; string[] pieces = new string[numberOfThreads]; for (int i = 0; i < numberOfThreads; i++) { // Wait for the queue ThreadLimiter.WaitOne(); doneEvents[i] = new ManualResetEvent(false); startPage = i * 10; endPage = Math.Min(pageCount - 1, (i + 1) * 10 - 1); pieces[i] = string.Format("temp-{0}-{1}.pdf", startPage, endPage); ThreadPool.QueueUserWorkItem(new WaitCallback(ThreadProc), new object[] { i, doneEvents[i], inputFile, pieces[i], startPage, endPage }); } // Wait for all threads WaitHandle.WaitAll(doneEvents); // Merge pieces using (DocumentMerger merger = new DocumentMerger("demo", "demo")) merger.Merge(pieces, resultFile); // Delete temp files foreach (string tempFile in pieces) File.Delete(tempFile); Console.WriteLine("All done in {0}.", stopwatch.Elapsed); Console.WriteLine(); Console.WriteLine("Press any key to exit..."); Console.ReadKey(); } private static void ThreadProc(object stateInfo) { int threadIndex = (int) ((object[]) stateInfo)[0]; ManualResetEvent doneEvent = (ManualResetEvent) ((object[]) stateInfo)[1]; string inputFile = (string) ((object[]) stateInfo)[2]; string outputFile = (string)((object[])stateInfo)[3]; int startPage = (int)((object[])stateInfo)[4]; int endPage = (int)((object[])stateInfo)[5]; try { Console.WriteLine("Thread #{0} started with the page range from {1} to {2}.", threadIndex, startPage, endPage); Stopwatch stopwatch = Stopwatch.StartNew(); // Extract a piece of document string chunk = string.Format("temp-{0}-{1}", startPage, endPage); using (DocumentSplitter splitter = new DocumentSplitter("demo", "demo")) splitter.ExtractPageRange(inputFile, chunk, startPage + 1, endPage + 1); // Process the piece using (SearchablePDFMaker searchablePdfMaker = new SearchablePDFMaker("demo", "demo")) { searchablePdfMaker.OCRDetectPageRotation = true; searchablePdfMaker.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\"; searchablePdfMaker.LoadDocumentFromFile(chunk); // 300 DPI resolution is recommended. // Using of higher values will slow down the processing but does not guarantee the higher quality. searchablePdfMaker.OCRResolution = 300; searchablePdfMaker.MakePDFSearchable(outputFile); } File.Delete(chunk); Console.WriteLine("Thread #{0} finished in {1}.", threadIndex, stopwatch.Elapsed); } finally { // Signal the thread is finished doneEvent.Set(); // Release semaphore ThreadLimiter.Release(); } } } }

VIDEO

ON-PREMISE OFFLINE SDK

Get 60 Day Free Trial

See also:

ON-DEMAND REST WEB API

Get Your API Key

See also:

Tutorials:

prev
next