How to extract data from tables with superscript from PDF in C# using ByteScout PDF Extractor SDK - ByteScout

How to extract data from tables with superscript from PDF in C# using ByteScout PDF Extractor SDK

  • Home
  • /
  • Articles
  • /
  • How to extract data from tables with superscript from PDF in C# using ByteScout PDF Extractor SDK

This sample code shows how to extract data from PDF tables containing superscript values in C# using ByteScout PDF Extractor SDK.

You may also find it useful to check how to extract filled form data from PDF with PDF Extractor SDK.


using System;
using System.Collections.Generic;
using System.Text;
using Bytescout.PDFExtractor;
using System.Diagnostics;

This sample code shows how to extract data from tables in which numbers contain superscript values.
The code is doing the following with Prices.pdf:
 - uses StructuredExtractor class to iterate through rows
 - finds the header of the table by checking first columns in the header's row 
 - then iterates through the rows, checking for rows that have no date in the first column
 - if no date then it means that this row contains superscript values for the row below and we should
 merge values from this row with cells in the row below
 - we are saving this row with superscript values
 - merging with the next row below
 - saving each row as a comma-delimited string
 - collecting these strings together and finally saving them as a .csv file

namespace ConsoleApplication1
    class Program
        // Entry point of the sample: reads table cells from one page through StructuredExtractor,
        // folds each superscript-only row into the data row that follows it, and writes the
        // accumulated text to a CSV file.
        // NOTE(review): this listing appears to have lost its brace-only lines and several
        // statements (see the notes below); restore them from the original sample before compiling.
        static void Main(string[] args)
            // string used to delimit cells in a row
            // NOTE(review): delimChar is not referenced anywhere in the visible code; presumably it
            // was used when building rowString - confirm against the original sample
            const string delimChar = ",";
            // string used as the decimal separator between a cell value and its superscript value
            const string precChar = ".";

            // output filename
            const string outputCSVFileName = "outputTable.csv";

            // Create Bytescout.PDFExtractor.StructuredExtractor object
            StructuredExtractor structuredExtractor = new StructuredExtractor();

            // set the registration name and key
            structuredExtractor.RegistrationName = "demo";
            structuredExtractor.RegistrationKey = "demo";

            // Load sample PDF document
            // NOTE(review): no load statement is visible here; it appears to be missing from this listing

            // page to process
            int PageIndex = 0;

            // prepare structure of the first page (zero index)
            // NOTE(review): no prepare statement is visible here; it appears to be missing from this listing

            // get total number of rows in a table
            int iRowCount = structuredExtractor.GetRowCount(PageIndex);

            // search for the header row

            // index of the row with the header (-1 until a matching row is found)
            int iHeaderRowIndex = -1;

            // iterate through rows to find the header row
            for (int y = 0; y < iRowCount; y++)

                // the header row is the one whose 1st column starts with "Date"
                // and whose 2nd column starts with "Product"
                if (structuredExtractor.GetCellValue(PageIndex, y, 0).IndexOf("Date") == 0 && // if 1st column starts with "Date"
                    structuredExtractor.GetCellValue(PageIndex, y, 1).IndexOf("Product") == 0 // if 2nd column starts with "Product"
                    // found row with the header, saving its index
                    iHeaderRowIndex = y;

            } // end for

            // save number of columns in the header (so we are using the header row as a reference for the whole table)
            // NOTE(review): if no row matched above, iHeaderRowIndex is still -1 when passed here
            int iHeaderColumnCount = structuredExtractor.GetColumnCount(PageIndex, iHeaderRowIndex);

            // writing the header to the console, cells separated by " | "
            for (int jj = 0; jj < iHeaderColumnCount; jj++)
                Console.Write(structuredExtractor.GetCellValue(PageIndex, iHeaderRowIndex, jj) + " | ");
            Console.Write("\n"); // add line break

            // we found a header i.e. we have a table below
            // now reading row by row
            // if the first two cells in a row are empty, the row holds superscript values for the row below
            // so we should save it and process it along with the next row

            // array to store superscript row
            string[] superScriptRow = new string[iHeaderColumnCount];
            // true when the row handled in the previous iteration was a superscript row
            bool PreviousRowWasSuperscript = false;

            // accumulates the CSV text that is written to the output file at the end
            StringBuilder finalOutput = new StringBuilder();

            // now iterate through rows from the row after the header up to the last row (iRowCount)
            // NOTE(review): the original comment mentioned a "stop text" ending the table, but no such check is visible
            for (int y = iHeaderRowIndex + 1; y < iRowCount; y++)

                // current row
                string[] currentRow = new string[iHeaderColumnCount];

                // fill up row array with values
                for (int x = 0; x < iHeaderColumnCount; x++)
                    currentRow[x] = structuredExtractor.GetCellValue(PageIndex, y, x);

                // if no values in 1st and 2nd column for the current row then it means we have
                // superscript values in this row. These superscript values belong to cells below
                // (the non-short-circuit & evaluates both comparisons; the result equals && here)
                if (currentRow[0] == "" & currentRow[1] == "")
                    // copy current row to superscriptRow, overwriting any previously saved values
                    for (int i = 0; i < iHeaderColumnCount; i++)
                        superScriptRow[i] = currentRow[i];
                    PreviousRowWasSuperscript = true;
                    // jump to next iteration
                    // NOTE(review): no jump statement is visible here; it appears to be missing from this listing

                } // end if

                // if previous row was superscript then add it to current row
                if (PreviousRowWasSuperscript)
                    for (int i = 0; i < iHeaderColumnCount; i++)
                        // if previous row (with superscript values) is NOT empty then add superscript values
                        // to values in the current row separated by decimal char
                        if (superScriptRow[i] != "")
                            currentRow[i] = currentRow[i] + precChar + superScriptRow[i];

                // reset the flag so the next row is not merged with a stale superscript row
                PreviousRowWasSuperscript = false;

                // get current row as a string, each cell wrapped in double quotes
                StringBuilder rowString = new StringBuilder();
                foreach (string cc in currentRow)
                    rowString.Append("\"" + cc + "\"");

                // add to final output string
                // NOTE(review): no statement appending rowString to finalOutput is visible here


            // write the generated csv into the console
            // NOTE(review): no console write of the CSV text is visible here

            // save the generated csv text into a file
            System.IO.File.WriteAllText(outputCSVFileName, finalOutput.ToString());

            Console.WriteLine("Done! Press any key to exit...");