This sample code shows how to extract data from PDF tables containing superscript values in C# using the ByteScout PDF Extractor SDK.
You may also find it useful to check how to extract filled form data from a PDF with the PDF Extractor SDK.
using System;
using System.Collections.Generic;
using System.Text;
using Bytescout.PDFExtractor;
using System.Diagnostics;
/*
This sample code shows how to extract data from tables in which numbers have superscript parts.
The code does the following with Prices.pdf:
- uses the StructuredExtractor class to iterate through rows
- finds the header of the table by checking the first columns in the header's row
- then iterates through the rows, looking for rows that have no date in the first column
  (and no value in the second column)
- if there is no date, the row contains superscript values for the row below, and we should
  merge the values from this row with the cells in the row below:
  - we save this row with superscript values
  - we merge it with the next row below
- each resulting row is saved as a comma-delimited string
- these strings are collected into a single text and finally saved as a .csv file
*/
namespace ConsoleApplication1
{
/// <summary>
/// Console sample: reads the table on the first page of "Prices.pdf", merges rows that
/// hold only superscript values into the data row below them, prints the result and
/// saves it to "outputTable.csv".
/// </summary>
class Program
{
    // String used to delimit cells in a CSV row.
    private const string CellDelimiter = ",";

    // String placed between a cell value and the superscript value that belongs to it.
    private const string DecimalSeparator = ".";

    // Sample PDF document to read.
    private const string InputFileName = "Prices.pdf";

    // File the generated CSV text is written to.
    private const string OutputCsvFileName = "outputTable.csv";

    /// <summary>
    /// Entry point. Extracts the table, writes it to the console and to the CSV file,
    /// then waits for a key press and opens the CSV file.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static void Main(string[] args)
    {
        // page to process (zero-based index)
        const int pageIndex = 0;

        string csvText;

        // The using statement releases the extractor even when extraction throws.
        using (StructuredExtractor structuredExtractor = new StructuredExtractor())
        {
            // set the registration key
            structuredExtractor.RegistrationName = "demo";
            structuredExtractor.RegistrationKey = "demo";

            // load the sample PDF document and prepare the structure of the page
            structuredExtractor.LoadDocumentFromFile(InputFileName);
            structuredExtractor.PrepareStructure(pageIndex);

            // total number of rows on the page
            int rowCount = structuredExtractor.GetRowCount(pageIndex);

            int headerRowIndex = FindHeaderRowIndex(structuredExtractor, pageIndex, rowCount);
            if (headerRowIndex < 0)
            {
                // Without a header there is no reference row for the column count,
                // so stop here instead of querying the extractor with row index -1.
                Console.Error.WriteLine("Table header (\"Date\", \"Product\") was not found in " + InputFileName);
                return;
            }

            // the header row is the reference for the number of columns in the whole table
            int columnCount = structuredExtractor.GetColumnCount(pageIndex, headerRowIndex);

            // write the header to the console
            for (int x = 0; x < columnCount; x++)
            {
                Console.Write(structuredExtractor.GetCellValue(pageIndex, headerRowIndex, x) + " | ");
            }
            Console.Write("\n"); // add line break

            // process every row below the header
            csvText = BuildCsv(structuredExtractor, pageIndex, headerRowIndex + 1, rowCount, columnCount);
        }

        // write the generated csv into the console
        Console.WriteLine(csvText);

        // save the generated csv text into a file
        System.IO.File.WriteAllText(OutputCsvFileName, csvText);

        Console.WriteLine("Done! Press any key to exit...");
        Console.ReadKey();

        // UseShellExecute is required to open a document (rather than an executable);
        // it is set explicitly because its default differs between .NET runtimes.
        Process.Start(new ProcessStartInfo(OutputCsvFileName) { UseShellExecute = true });
    }

    /// <summary>
    /// Finds the first row whose 1st cell starts with "Date" and whose 2nd cell starts with "Product".
    /// </summary>
    /// <returns>Zero-based index of the header row, or -1 when no such row exists.</returns>
    private static int FindHeaderRowIndex(StructuredExtractor extractor, int pageIndex, int rowCount)
    {
        for (int y = 0; y < rowCount; y++)
        {
            if (StartsWithOrdinal(extractor.GetCellValue(pageIndex, y, 0), "Date") &&
                StartsWithOrdinal(extractor.GetCellValue(pageIndex, y, 1), "Product"))
            {
                return y;
            }
        }

        return -1;
    }

    /// <summary>Null-safe, culture-independent "starts with" check.</summary>
    private static bool StartsWithOrdinal(string value, string prefix)
    {
        return value != null && value.StartsWith(prefix, StringComparison.Ordinal);
    }

    /// <summary>
    /// Converts rows firstRowIndex..rowCount-1 into CSV text, merging every superscript-only
    /// row into the first data row that follows it.
    /// </summary>
    /// <returns>CSV text with one line per data row.</returns>
    private static string BuildCsv(StructuredExtractor extractor, int pageIndex, int firstRowIndex, int rowCount, int columnCount)
    {
        StringBuilder csv = new StringBuilder();

        // Superscript row waiting to be merged into the next data row; null when there is none.
        string[] pendingSuperscripts = null;

        for (int y = firstRowIndex; y < rowCount; y++)
        {
            string[] row = ReadRow(extractor, pageIndex, y, columnCount);

            if (IsSuperscriptRow(row))
            {
                // Each row is a freshly allocated array, so keeping the reference is safe.
                pendingSuperscripts = row;
                continue;
            }

            if (pendingSuperscripts != null)
            {
                MergeSuperscripts(row, pendingSuperscripts);
                pendingSuperscripts = null;
            }

            csv.AppendLine(FormatCsvRow(row));
        }

        return csv.ToString();
    }

    /// <summary>Reads the first columnCount cells of the given row into a new array.</summary>
    private static string[] ReadRow(StructuredExtractor extractor, int pageIndex, int rowIndex, int columnCount)
    {
        string[] cells = new string[columnCount];
        for (int x = 0; x < columnCount; x++)
        {
            cells[x] = extractor.GetCellValue(pageIndex, rowIndex, x);
        }

        return cells;
    }

    /// <summary>
    /// A row whose 1st and 2nd cells are both empty is treated as a row of superscript
    /// values that belong to the cells of the row below.
    /// </summary>
    private static bool IsSuperscriptRow(string[] row)
    {
        // Rows with fewer than two columns cannot be classified this way.
        return row.Length >= 2 &&
               string.IsNullOrEmpty(row[0]) &&
               string.IsNullOrEmpty(row[1]);
    }

    /// <summary>
    /// Appends each non-empty superscript value to the cell in the same column of
    /// <paramref name="row"/>, separated by the decimal separator. Modifies the row in place.
    /// </summary>
    private static void MergeSuperscripts(string[] row, string[] superscripts)
    {
        for (int i = 0; i < row.Length; i++)
        {
            if (!string.IsNullOrEmpty(superscripts[i]))
            {
                row[i] = row[i] + DecimalSeparator + superscripts[i];
            }
        }
    }

    /// <summary>
    /// Formats cells as one CSV record: every cell is wrapped in double quotes, embedded
    /// double quotes are doubled, and cells are joined without a trailing delimiter.
    /// </summary>
    private static string FormatCsvRow(string[] cells)
    {
        string[] quoted = new string[cells.Length];
        for (int i = 0; i < cells.Length; i++)
        {
            string cell = cells[i] ?? string.Empty;
            quoted[i] = "\"" + cell.Replace("\"", "\"\"") + "\"";
        }

        return string.Join(CellDelimiter, quoted);
    }
}
}