How to extract data from tables with superscript from PDF in C# using ByteScout PDF Extractor SDK - ByteScout

How to extract data from tables with superscript from PDF in C# using ByteScout PDF Extractor SDK

  • Home
  • /
  • Articles
  • /
  • How to extract data from tables with superscript from PDF in C# using ByteScout PDF Extractor SDK

This sample code shows how to extract data from PDF tables containing superscript values in C# using ByteScout PDF Extractor SDK.

You may also find it useful to check how to extract filled form data from PDF with PDF Extractor SDK.

C#

using System;
using System.Collections.Generic;
using System.Text;
using Bytescout.PDFExtractor;
using System.Diagnostics;

/*
This sample code shows how to extract data from tables in which numbers have superscript parts.
The code does the following with Prices.pdf:
 - uses the StructuredExtractor class to iterate through rows
 - finds the header of the table by checking the first columns in the header's row
 - then iterates through the rows, looking for rows with no date in the first column
 - if there is no date, the row contains superscript values for the row below, and we should
 merge the values from this row with the cells in the row below
 - we save this row with superscript values
 - and merge it with the next row below
 
 Each resulting row is saved as a comma-delimited string;
 these strings are collected and finally saved as a single .csv file.
*/


namespace ConsoleApplication1
{
    /// <summary>
    /// Console sample: reads a table from a PDF page with StructuredExtractor, merges
    /// "superscript" rows (rows whose first two cells are empty) into the data row that
    /// follows them, and saves the result as a CSV file.
    /// </summary>
    class Program
    {
        // string that delimits cells in a CSV row
        private const string DelimChar = ",";

        // string inserted between a cell value and its superscript part (acts as decimal separator)
        private const string PrecChar = ".";

        // input PDF document
        private const string InputPdfFileName = "Prices.pdf";

        // output filename
        private const string OutputCsvFileName = "outputTable.csv";

        /// <summary>
        /// Entry point: extracts the table from the first page (zero index) of the input PDF,
        /// prints the CSV to the console, writes it to <c>outputTable.csv</c> and opens that file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            string csv = ExtractTableAsCsv(InputPdfFileName, 0);

            if (csv == null)
            {
                // Without a header row there is no reference column count, so nothing can be exported.
                Console.Error.WriteLine("Could not find the table header (\"Date\" / \"Product\") in " + InputPdfFileName + ".");
                return;
            }

            // write the generated csv into the console
            Console.WriteLine(csv);

            // save the generated csv text into a file
            System.IO.File.WriteAllText(OutputCsvFileName, csv);

            Console.WriteLine("Done! Press any key to exit...");
            Console.ReadKey();

            // Open the CSV with its associated application. UseShellExecute is set explicitly
            // because its default differs between .NET Framework (true) and .NET Core/5+ (false).
            ProcessStartInfo startInfo = new ProcessStartInfo(OutputCsvFileName);
            startInfo.UseShellExecute = true;
            Process.Start(startInfo);
        }

        /// <summary>
        /// Loads the PDF, locates the table header on the given page, echoes the header to the
        /// console and converts every row below the header into a CSV line.
        /// </summary>
        /// <param name="pdfFileName">Path of the PDF document to load.</param>
        /// <param name="pageIndex">Zero-based index of the page to process.</param>
        /// <returns>The CSV text, or <c>null</c> when no header row was found.</returns>
        private static string ExtractTableAsCsv(string pdfFileName, int pageIndex)
        {
            // Create Bytescout.PDFExtractor.StructuredExtractor object
            StructuredExtractor structuredExtractor = new StructuredExtractor();

            try
            {
                // set the registration key
                structuredExtractor.RegistrationName = "demo";
                structuredExtractor.RegistrationKey = "demo";

                // Load sample PDF document
                structuredExtractor.LoadDocumentFromFile(pdfFileName);

                // prepare structure of the requested page
                structuredExtractor.PrepareStructure(pageIndex);

                // get total number of rows on the page
                int rowCount = structuredExtractor.GetRowCount(pageIndex);

                int headerRowIndex = FindHeaderRowIndex(structuredExtractor, pageIndex, rowCount);
                if (headerRowIndex < 0)
                {
                    return null;
                }

                // the header row is the reference for the column count of the whole table
                int headerColumnCount = structuredExtractor.GetColumnCount(pageIndex, headerRowIndex);

                // writing the header to the console
                for (int column = 0; column < headerColumnCount; column++)
                {
                    Console.Write(structuredExtractor.GetCellValue(pageIndex, headerRowIndex, column) + " | ");
                }
                Console.Write("\n"); // add line break

                // Last superscript row seen that has not yet been merged into a data row;
                // null when the previous row was a regular data row.
                string[] pendingSuperscriptRow = null;

                StringBuilder finalOutput = new StringBuilder();

                // iterate through all rows after the header
                for (int y = headerRowIndex + 1; y < rowCount; y++)
                {
                    // fill up row array with values
                    // NOTE(review): assumes GetCellValue accepts every column index below the
                    // header's column count for every row - confirm against the SDK.
                    string[] currentRow = new string[headerColumnCount];
                    for (int x = 0; x < headerColumnCount; x++)
                    {
                        currentRow[x] = structuredExtractor.GetCellValue(pageIndex, y, x);
                    }

                    // No values in the 1st and 2nd column: this row holds the superscript
                    // values that belong to the cells of the row below. Remember it and move on.
                    if (string.IsNullOrEmpty(currentRow[0]) && string.IsNullOrEmpty(currentRow[1]))
                    {
                        pendingSuperscriptRow = currentRow;
                        continue;
                    }

                    // if previous row was superscript then append its values to the current row
                    if (pendingSuperscriptRow != null)
                    {
                        for (int i = 0; i < headerColumnCount; i++)
                        {
                            if (!string.IsNullOrEmpty(pendingSuperscriptRow[i]))
                            {
                                currentRow[i] = currentRow[i] + PrecChar + pendingSuperscriptRow[i];
                            }
                        }

                        // a superscript row applies only to the row directly below it
                        pendingSuperscriptRow = null;
                    }

                    finalOutput.AppendLine(ToCsvLine(currentRow));
                }

                return finalOutput.ToString();
            }
            finally
            {
                // release the document and resources held by the extractor
                structuredExtractor.Dispose();
            }
        }

        /// <summary>
        /// Finds the table header: the first row whose 1st cell starts with "Date" and whose
        /// 2nd cell starts with "Product".
        /// </summary>
        /// <param name="structuredExtractor">Extractor with the page structure already prepared.</param>
        /// <param name="pageIndex">Zero-based index of the page to search.</param>
        /// <param name="rowCount">Number of rows on the page.</param>
        /// <returns>Index of the header row, or -1 when no such row exists.</returns>
        private static int FindHeaderRowIndex(StructuredExtractor structuredExtractor, int pageIndex, int rowCount)
        {
            for (int y = 0; y < rowCount; y++)
            {
                // a header candidate needs at least the two columns inspected below
                if (structuredExtractor.GetColumnCount(pageIndex, y) < 2)
                {
                    continue;
                }

                string firstCell = structuredExtractor.GetCellValue(pageIndex, y, 0);
                string secondCell = structuredExtractor.GetCellValue(pageIndex, y, 1);

                // ordinal comparison: the header captions are fixed text, not culture-dependent
                if (firstCell != null && secondCell != null &&
                    firstCell.StartsWith("Date", StringComparison.Ordinal) &&
                    secondCell.StartsWith("Product", StringComparison.Ordinal))
                {
                    return y;
                }
            }

            return -1;
        }

        /// <summary>
        /// Formats cells as one CSV line: every cell is wrapped in double quotes, embedded
        /// double quotes are doubled, and cells are joined with the delimiter (no trailing delimiter).
        /// </summary>
        /// <param name="cells">Cell values of one row; a null cell is written as an empty field.</param>
        /// <returns>The CSV line without a line terminator.</returns>
        private static string ToCsvLine(string[] cells)
        {
            string[] quotedCells = new string[cells.Length];

            for (int i = 0; i < cells.Length; i++)
            {
                string value = cells[i] ?? "";
                quotedCells[i] = "\"" + value.Replace("\"", "\"\"") + "\"";
            }

            return string.Join(DelimChar, quotedCells);
        }
    }
}

Tutorials:

prev
next