How to extract data from tables with superscript from PDF in C# using ByteScout PDF Extractor SDK - ByteScout
Announcement
Our ByteScout SDK products are sunsetting as we focus on expanding new solutions.
Learn More Open modal
Close modal
Announcement Important Update
ByteScout SDK Sunsetting Notice
Our ByteScout SDK products are sunsetting as we focus on our new & improved solutions. Thank you for being part of our journey, and we look forward to supporting you in this next chapter!

How to extract data from tables with superscript from PDF in C# using ByteScout PDF Extractor SDK

  • Home
  • /
  • Articles
  • /
  • How to extract data from tables with superscript from PDF in C# using ByteScout PDF Extractor SDK

This sample code shows how to extract data from PDF tables containing superscript values in C# using ByteScout PDF Extractor SDK.

You may also find it useful to check how to extract filled form data from PDF with PDF Extractor SDK.

C#

using System;
using System.Collections.Generic;
using System.Text;
using Bytescout.PDFExtractor;
using System.Diagnostics;

/*
This sample code shows how to extract data from tables in which numbers have superscript parts.
The code does the following with Prices.pdf:
 - uses the StructuredExtractor class to iterate through rows
 - finds the header of the table by checking the first columns in the header's row
 - then iterates through the rows and looks for rows with no date in the first column
 - if there is no date, the row contains superscript values for the row below, and we should
 merge the values from this row with the cells in the row below
 - we save this row with the superscript values
 - and merge it with the next row below

 Each row is saved as a comma-delimited string;
 these strings are collected into a single text and finally saved as a .csv file.
*/


namespace ConsoleApplication1
{
    /// <summary>
    /// Console sample that reads a table from "Prices.pdf" with
    /// <see cref="StructuredExtractor"/>, merges "superscript" rows into the data
    /// row that follows them, and writes the result to a CSV file.
    /// </summary>
    class Program
    {
        // String placed between cells in a CSV row.
        private const string DelimChar = ",";

        // String inserted between a cell value and its superscript part
        // (serves as the decimal separator).
        private const string PrecChar = ".";

        // Output file name.
        private const string OutputCsvFileName = "outputTable.csv";

        // Input PDF document.
        private const string InputFileName = "Prices.pdf";

        // Page to process (zero-based index).
        private const int PageIndex = 0;

        /// <summary>
        /// Extracts the table, prints it to the console, saves it as CSV and opens the file.
        /// </summary>
        /// <param name="args">Command line arguments (not used).</param>
        static void Main(string[] args)
        {
            StringBuilder finalOutput = new StringBuilder();

            // Dispose the extractor as soon as extraction is done so the document is released.
            using (StructuredExtractor structuredExtractor = new StructuredExtractor())
            {
                // set the registration key
                structuredExtractor.RegistrationName = "demo";
                structuredExtractor.RegistrationKey = "demo";

                // load sample PDF document
                structuredExtractor.LoadDocumentFromFile(InputFileName);

                // prepare structure of the page
                structuredExtractor.PrepareStructure(PageIndex);

                // total number of rows detected on the page
                int rowCount = structuredExtractor.GetRowCount(PageIndex);

                // locate the header row; without it there is no table to read
                int headerRowIndex = FindHeaderRowIndex(structuredExtractor, PageIndex, rowCount);
                if (headerRowIndex < 0)
                {
                    Console.Error.WriteLine("Table header (\"Date\", \"Product\") was not found in " + InputFileName + ".");
                    return;
                }

                // the header row is the reference for the number of columns in the whole table
                int headerColumnCount = structuredExtractor.GetColumnCount(PageIndex, headerRowIndex);

                // write the header to the console
                for (int col = 0; col < headerColumnCount; col++)
                {
                    Console.Write(structuredExtractor.GetCellValue(PageIndex, headerRowIndex, col) + " | ");
                }
                Console.Write("\n"); // add line break

                // Most recent superscript row; null while there is nothing pending to merge.
                string[] pendingSuperscripts = null;

                // iterate through the rows below the header
                for (int y = headerRowIndex + 1; y < rowCount; y++)
                {
                    string[] currentRow = ReadRow(structuredExtractor, PageIndex, y, headerColumnCount);

                    // No values in the 1st and 2nd columns means this row holds superscript
                    // values that belong to the cells of the row below: remember it and move on.
                    // (headerColumnCount >= 2 is guaranteed by the header detection.)
                    if (currentRow[0].Length == 0 && currentRow[1].Length == 0)
                    {
                        pendingSuperscripts = currentRow;
                        continue;
                    }

                    // if the previous row was a superscript row, append its non-empty values
                    // to the matching cells of the current row, separated by the decimal string
                    if (pendingSuperscripts != null)
                    {
                        for (int i = 0; i < headerColumnCount; i++)
                        {
                            if (pendingSuperscripts[i].Length != 0)
                            {
                                currentRow[i] = currentRow[i] + PrecChar + pendingSuperscripts[i];
                            }
                        }

                        // a superscript row applies only to the row directly below it
                        pendingSuperscripts = null;
                    }

                    finalOutput.AppendLine(ToCsvLine(currentRow));
                }
            }

            string csv = finalOutput.ToString();

            // write the generated csv into the console
            Console.WriteLine(csv);

            // save the generated csv text into a file
            System.IO.File.WriteAllText(OutputCsvFileName, csv);

            Console.WriteLine("Done! Press any key to exit...");
            Console.ReadKey();

            // Open the result with the associated application. UseShellExecute is set
            // explicitly because opening a document (not an executable) requires it and
            // its default differs between .NET Framework and .NET Core / .NET 5+.
            ProcessStartInfo startInfo = new ProcessStartInfo(OutputCsvFileName);
            startInfo.UseShellExecute = true;
            Process.Start(startInfo);
        }

        /// <summary>
        /// Finds the first row whose 1st cell starts with "Date" and whose 2nd cell starts with "Product".
        /// </summary>
        /// <returns>Zero-based row index of the header, or -1 when no such row exists.</returns>
        private static int FindHeaderRowIndex(StructuredExtractor extractor, int pageIndex, int rowCount)
        {
            for (int y = 0; y < rowCount; y++)
            {
                // rows with fewer than two columns cannot be the header
                if (extractor.GetColumnCount(pageIndex, y) < 2)
                {
                    continue;
                }

                string first = extractor.GetCellValue(pageIndex, y, 0);
                string second = extractor.GetCellValue(pageIndex, y, 1);

                if (first != null && first.StartsWith("Date", StringComparison.Ordinal) &&
                    second != null && second.StartsWith("Product", StringComparison.Ordinal))
                {
                    return y;
                }
            }

            return -1;
        }

        /// <summary>
        /// Reads <paramref name="columnCount"/> cells of a row. Cells the row does not have,
        /// and null cell values, are returned as empty strings.
        /// </summary>
        private static string[] ReadRow(StructuredExtractor extractor, int pageIndex, int rowIndex, int columnCount)
        {
            string[] cells = new string[columnCount];

            // a row may report fewer columns than the header; treat the missing cells as empty
            int available = extractor.GetColumnCount(pageIndex, rowIndex);

            for (int x = 0; x < columnCount; x++)
            {
                string value = x < available ? extractor.GetCellValue(pageIndex, rowIndex, x) : null;
                cells[x] = value ?? string.Empty;
            }

            return cells;
        }

        /// <summary>
        /// Formats cells as one CSV line: every cell is wrapped in double quotes, embedded
        /// double quotes are doubled, and cells are joined without a trailing delimiter.
        /// </summary>
        private static string ToCsvLine(string[] cells)
        {
            string[] quoted = new string[cells.Length];
            for (int i = 0; i < cells.Length; i++)
            {
                quoted[i] = "\"" + cells[i].Replace("\"", "\"\"") + "\"";
            }

            return string.Join(DelimChar, quoted);
        }
    }
}

Tutorials:

prev
next