Split PDF based on keyword tutorial – C# and VB.NET samples

  • Home
  • /
  • Articles
  • /
  • Split PDF based on keyword tutorial – C# and VB.NET samples

This "Split PDF based on keyword" tutorial shows how to split a PDF file by keyword in C# or Visual Basic .NET using PDF Extractor SDK. Use the sample source code below to split a PDF document into separate one-page files, one for each page that contains the keyword.

C#

using Bytescout.PDFExtractor;
using System.IO;

namespace FindAndExtractPageExample
{
    /// <summary>
    /// Searches every page of a PDF document for a keyword and saves each page
    /// that contains it as a separate one-page PDF file named "page&lt;index&gt;.pdf".
    /// </summary>
    class Program
    {
        // Names of the temporary files produced by intermediate split operations.
        private const string WasteFile = "waste";
        private const string PartFile = "part";

        /// <summary>
        /// Entry point: loads "sample.pdf", looks for the keyword on each page and
        /// extracts every matching page into its own file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            string inputFile = "sample.pdf";
            string keyword = "demographic";

            // Dispose the extractor deterministically instead of leaving it to the finalizer.
            using (TextExtractor extractor = new TextExtractor("demo", "demo"))
            {
                extractor.LoadDocumentFromFile(inputFile);

                // The page count does not change while searching, so query it only once.
                int pageCount = extractor.GetPageCount();

                // Search each page for the keyword (page indices are zero-based here).
                for (int i = 0; i < pageCount; i++)
                {
                    if (extractor.Find(i, keyword, false))
                    {
                        // Extract the page containing the keyword.
                        ExtractPage(inputFile, i, "page" + i + ".pdf");
                    }
                }
            }
        }

        /// <summary>
        /// Saves a single page of <paramref name="inputFile"/> as a one-page PDF document.
        /// </summary>
        /// <param name="inputFile">Path of the source PDF document.</param>
        /// <param name="pageIndex">Zero-based index of the page to extract.</param>
        /// <param name="outputFile">Path of the one-page PDF file to create.</param>
        private static void ExtractPage(string inputFile, int pageIndex, string outputFile)
        {
            // NOTE(review): if DocumentSplitter implements IDisposable in your SDK version,
            // wrap it in a using statement as well - confirm against the SDK reference.
            DocumentSplitter splitter = new DocumentSplitter("demo", "demo");

            int pageCount = splitter.GetPageCount(inputFile);

            if (pageIndex == 0)
            {
                if (pageCount == 1)
                {
                    // No splitting is required for a single-page document.
                    // Overwrite an existing output so that re-running the sample
                    // does not fail with an IOException.
                    File.Copy(inputFile, outputFile, true);
                }
                else
                {
                    // Split at the second page (page numeration starts from 1 in this function).
                    // The first part is the sought-for one-page document.
                    splitter.Split(inputFile, outputFile, WasteFile, 2);
                    File.Delete(WasteFile); // delete the waste part
                }
            }
            else if (pageIndex == pageCount - 1)
            {
                // This is the last page, so just split on it.
                // The second part is the sought-for one-page document.
                splitter.Split(inputFile, WasteFile, outputFile, pageIndex + 1);
                File.Delete(WasteFile); // delete the waste part
            }
            else
            {
                // The required page is in the middle of the document, so two split
                // operations are needed: first cut off everything before the page,
                // then cut off everything after it.
                splitter.Split(inputFile, WasteFile, PartFile, pageIndex + 1);
                File.Delete(WasteFile);
                splitter.Split(PartFile, outputFile, WasteFile, 2);
                File.Delete(PartFile);
                File.Delete(WasteFile);
            }
        }
    }
}

VB.NET

Imports Bytescout.PDFExtractor
Imports System.IO

''' <summary>
''' Searches every page of a PDF document for a keyword and saves each page
''' that contains it as a separate one-page PDF file named "page&lt;index&gt;.pdf".
''' </summary>
Class Program
    ' Names of the temporary files produced by intermediate split operations.
    Private Const WasteFile As String = "waste"
    Private Const PartFile As String = "part"

    ''' <summary>
    ''' Entry point: loads "sample.pdf", looks for the keyword on each page and
    ''' extracts every matching page into its own file.
    ''' </summary>
    ''' <param name="args">Command-line arguments (not used).</param>
    Friend Shared Sub Main(args As String())
        Dim inputFile As String = "sample.pdf"
        Dim keyword As String = "demographic"

        ' Dispose the extractor deterministically instead of leaving it to the finalizer.
        Using extractor As New TextExtractor("demo", "demo")
            extractor.LoadDocumentFromFile(inputFile)

            ' The page count does not change while searching, so query it only once.
            Dim pageCount As Integer = extractor.GetPageCount()

            ' Search each page for the keyword (page indices are zero-based here).
            For i As Integer = 0 To pageCount - 1
                If extractor.Find(i, keyword, False) Then
                    ' Extract the page containing the keyword.
                    ExtractPage(inputFile, i, "page" & i & ".pdf")
                End If
            Next
        End Using
    End Sub

    ''' <summary>
    ''' Saves a single page of <paramref name="inputFile"/> as a one-page PDF document.
    ''' </summary>
    ''' <param name="inputFile">Path of the source PDF document.</param>
    ''' <param name="pageIndex">Zero-based index of the page to extract.</param>
    ''' <param name="outputFile">Path of the one-page PDF file to create.</param>
    Private Shared Sub ExtractPage(inputFile As String, pageIndex As Integer, outputFile As String)
        ' NOTE(review): if DocumentSplitter implements IDisposable in your SDK version,
        ' wrap it in a Using block as well - confirm against the SDK reference.
        Dim splitter As New DocumentSplitter("demo", "demo")

        Dim pageCount As Integer = splitter.GetPageCount(inputFile)

        If pageIndex = 0 Then
            If pageCount = 1 Then
                ' No splitting is required for a single-page document.
                ' Overwrite an existing output so that re-running the sample
                ' does not fail with an IOException.
                File.Copy(inputFile, outputFile, True)
            Else
                ' Split at the second page (page numeration starts from 1 in this function).
                ' The first part is the sought-for one-page document.
                splitter.Split(inputFile, outputFile, WasteFile, 2)
                ' Delete the waste part.
                File.Delete(WasteFile)
            End If
        ElseIf pageIndex = pageCount - 1 Then
            ' This is the last page, so just split on it.
            ' The second part is the sought-for one-page document.
            splitter.Split(inputFile, WasteFile, outputFile, pageIndex + 1)
            ' Delete the waste part.
            File.Delete(WasteFile)
        Else
            ' The required page is in the middle of the document, so two split
            ' operations are needed: first cut off everything before the page,
            ' then cut off everything after it.
            splitter.Split(inputFile, WasteFile, PartFile, pageIndex + 1)
            File.Delete(WasteFile)
            splitter.Split(PartFile, outputFile, WasteFile, 2)
            File.Delete(PartFile)
            File.Delete(WasteFile)
        End If
    End Sub
End Class

prev
next