ByteScout PDF Extractor SDK - C# - Index Documents In Folder - ByteScout
Announcement
Our ByteScout SDK products are sunsetting as we focus on expanding new solutions.
Learn More Open modal
Close modal
Announcement Important Update
ByteScout SDK Sunsetting Notice
Our ByteScout SDK products are sunsetting as we focus on our new & improved solutions. Thank you for being part of our journey, and we look forward to supporting you in this next chapter!

ByteScout PDF Extractor SDK – C# – Index Documents In Folder

  • Home
  • /
  • Articles
  • /
  • ByteScout PDF Extractor SDK – C# – Index Documents In Folder

ByteScout PDF Extractor SDK – C# – Index Documents In Folder

Program.cs

using Bytescout.PDFExtractor;
using System;
using System.Collections.Generic;
using System.IO;

namespace IndexDocsInFolder
{
    /// <summary>
    /// Console sample that builds a simple in-memory index of the documents found in
    /// a folder: for every supported file (PDF, PNG, JPG) it records the file name,
    /// the creation time and the text extracted with the ByteScout TextExtractor,
    /// then prints the result to the console.
    /// </summary>
    class Program
    {
        // Folder scanned for documents, relative to the current working directory.
        private const string InputFolder = @".\Files";

        // Location of the "tessdata" folder containing the OCR language data files.
        private const string OcrLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\Redistributable\net2.00\tessdata\";

        // Registration name/key passed to every TextExtractor instance.
        private const string RegistrationName = "demo";
        private const string RegistrationKey = "demo";

        /// <summary>
        /// Entry point: indexes every supported file in <see cref="InputFolder"/> and
        /// prints the collected entries.
        /// </summary>
        /// <param name="args">Command line arguments (not used).</param>
        static void Main(string[] args)
        {
            try
            {
                // Output file list
                var lstAllFilesInfo = new List<FileIndexOutput>();

                // Iterate all files inside the directory, and get details
                foreach (var itmFile in Directory.GetFiles(InputFolder, "*.*"))
                {
                    // Get basic file information
                    FileInfo fileInfo = new FileInfo(itmFile);

                    // Skip anything we cannot extract text from
                    if (!_IsFileSupported(fileInfo))
                    {
                        continue;
                    }

                    try
                    {
                        // Fill file index model
                        var oFileIndex = new FileIndexOutput();
                        oFileIndex.fileName = fileInfo.Name;
                        oFileIndex.fileDate = fileInfo.CreationTime;
                        oFileIndex.content = _GetFileContent(fileInfo);

                        // Add to final list
                        lstAllFilesInfo.Add(oFileIndex);
                    }
                    catch (Exception ex)
                    {
                        // A single unreadable/corrupt document must not discard the
                        // index entries of all the other files: report it and go on.
                        Console.Error.WriteLine("Skipping {0}: {1}", fileInfo.Name, ex.Message);
                    }
                }

                // Print all output
                Console.WriteLine("Total {0} files indexed\n", lstAllFilesInfo.Count);
                foreach (var itmFileInfo in lstAllFilesInfo)
                {
                    Console.WriteLine("fileName: {0}", itmFileInfo.fileName);

                    // "HH" (24-hour clock): the format has no AM/PM designator, so the
                    // 12-hour "hh" specifier would print ambiguous times.
                    Console.WriteLine("fileDate: {0}", itmFileInfo.fileDate.ToString("MMM dd yyyy HH:mm:ss"));

                    // Guard against a null extraction result before trimming.
                    Console.WriteLine("content: {0}", (itmFileInfo.content ?? string.Empty).Trim());
                    Console.WriteLine();
                }
            }
            catch (Exception ex)
            {
                // Folder-level failures (e.g. the input folder does not exist) end up here.
                Console.WriteLine(ex.Message);
            }

            // Console.ReadLine() returns only after Enter is pressed.
            Console.WriteLine("Press Enter to exit...");
            Console.ReadLine();
        }

        /// <summary>
        /// Extracts the text content of a supported file, choosing the extraction
        /// strategy from the file extension (compared case-insensitively).
        /// </summary>
        /// <param name="fileInfo">File to read.</param>
        /// <returns>The text returned by the extractor for the file.</returns>
        /// <exception cref="NotSupportedException">
        /// The extension is not one of .pdf, .png or .jpg.
        /// </exception>
        private static string _GetFileContent(FileInfo fileInfo)
        {
            string fileExtension = Path.GetExtension(fileInfo.FullName);

            if (_IsPdfExtension(fileExtension))
            {
                return _GetPdfFileContent(fileInfo);
            }

            if (_IsImageExtension(fileExtension))
            {
                return _GetImageContent(fileInfo);
            }

            throw new NotSupportedException("File not supported.");
        }

        /// <summary>
        /// Extracts the text of a PDF document.
        /// </summary>
        /// <param name="fileInfo">PDF file to read.</param>
        /// <returns>The text returned by the extractor's GetText().</returns>
        private static string _GetPdfFileContent(FileInfo fileInfo)
        {
            // Read all file content...
            using (TextExtractor textExtractor = new TextExtractor(RegistrationName, RegistrationKey))
            {
                // Load document
                textExtractor.LoadDocumentFromFile(fileInfo.FullName);

                return textExtractor.GetText();
            }
        }

        /// <summary>
        /// Extracts the text of an image file with OCR enabled.
        /// </summary>
        /// <param name="fileInfo">Image file (.png or .jpg) to read.</param>
        /// <returns>The text returned by the extractor's GetText().</returns>
        private static string _GetImageContent(FileInfo fileInfo)
        {
            // Same registration as the PDF path, so both extraction routes behave alike.
            using (TextExtractor extractor = new TextExtractor(RegistrationName, RegistrationKey))
            {
                // Load document
                extractor.LoadDocumentFromFile(fileInfo.FullName);

                // Enable Optical Character Recognition (OCR)
                // in .Auto mode (SDK automatically checks if needs to use OCR or not).
                // This is the only OCRMode assignment: an earlier assignment that was
                // immediately overwritten by this one has been removed.
                extractor.OCRMode = OCRMode.Auto;

                // Set the location of "tessdata" folder containing language data files
                extractor.OCRLanguageDataFolder = OcrLanguageDataFolder;

                // Set OCR language
                // "eng" for English, "deu" for German, "fra" for French, "spa" for Spanish etc.
                // - according to files in /tessdata
                // Find more language files at https://github.com/tesseract-ocr/tessdata/tree/3.04.00
                extractor.OCRLanguage = "eng";

                // Set PDF document rendering resolution
                extractor.OCRResolution = 300;

                // Read all text
                return extractor.GetText();
            }
        }

        /// <summary>
        /// Checks whether the file has one of the extensions this sample can index.
        /// </summary>
        /// <param name="fileInfo">File to check.</param>
        /// <returns>
        /// <c>true</c> for .pdf, .png and .jpg (in any letter case); otherwise <c>false</c>.
        /// </returns>
        private static bool _IsFileSupported(FileInfo fileInfo)
        {
            // Get file extension
            string fileExtension = Path.GetExtension(fileInfo.Name);

            // Check whether file extension is valid
            return _IsPdfExtension(fileExtension) || _IsImageExtension(fileExtension);
        }

        /// <summary>
        /// Tells whether the extension denotes a PDF document, ignoring letter case.
        /// </summary>
        private static bool _IsPdfExtension(string fileExtension)
        {
            return string.Equals(fileExtension, ".pdf", StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>
        /// Tells whether the extension denotes a supported image, ignoring letter case.
        /// </summary>
        private static bool _IsImageExtension(string fileExtension)
        {
            return string.Equals(fileExtension, ".png", StringComparison.OrdinalIgnoreCase)
                || string.Equals(fileExtension, ".jpg", StringComparison.OrdinalIgnoreCase);
        }

    }

    /// <summary>
    /// One entry of the document index: identifies a file and carries the text
    /// that was extracted from it.
    /// </summary>
    public class FileIndexOutput
    {
        /// <summary>
        /// Name of the indexed file.
        /// </summary>
        public string fileName
        {
            get;
            set;
        }

        /// <summary>
        /// Timestamp associated with the indexed file.
        /// </summary>
        public DateTime fileDate
        {
            get;
            set;
        }

        /// <summary>
        /// Text extracted from the file.
        /// </summary>
        public string content
        {
            get;
            set;
        }
    }

}


  Click here to get your Free Trial version of the SDK

Tutorials:

prev
next