Split PDF Based on Keyword - Tutorial with C# and VB.NET Samples - ByteScout

Split PDF Based on Keyword – Tutorial with C# and VB.NET Samples

  • Home
  • /
  • Articles
  • /
  • Split PDF Based on Keyword – Tutorial with C# and VB.NET Samples

This tutorial shows how to split a PDF file by keyword in C# or Visual Basic .NET using the PDF Extractor SDK. The sample source code below searches every page of a PDF document for a keyword and saves each page that contains it as a separate one-page PDF file.

C# Code Snippet

using Bytescout.PDFExtractor;
using System.IO;

namespace FindAndExtractPageExample
{
    /// <summary>
    /// Searches every page of a PDF for a keyword and saves each matching page
    /// as its own one-page PDF file.
    /// </summary>
    class Program
    {
        // Scratch files produced by the splitter; created in the current working directory.
        private const string WasteFile = "waste";
        private const string PartFile = "part";

        /// <summary>
        /// Entry point: scans "sample.pdf" for the keyword and writes "page{index}.pdf"
        /// for every page (zero-based index) on which the keyword is found.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            string inputFile = "sample.pdf";
            string keyword = "demographic";

            TextExtractor extractor = new TextExtractor("demo", "demo");

            // The extractor must be given the document before pages can be counted or searched.
            extractor.LoadDocumentFromFile(inputFile);

            // The page count does not change while searching, so query it once.
            int pageCount = extractor.GetPageCount();

            // Search each page for keyword
            for (int i = 0; i < pageCount; i++)
            {
                if (extractor.Find(i, keyword, false))
                {
                    // extract the page containing the keyword
                    ExtractPage(inputFile, i, "page" + i + ".pdf");
                }
            }
        }

        /// <summary>
        /// Writes the single page at <paramref name="pageIndex"/> of
        /// <paramref name="inputFile"/> to <paramref name="outputFile"/>.
        /// </summary>
        /// <param name="inputFile">Path of the source PDF document.</param>
        /// <param name="pageIndex">Zero-based index of the page to extract.</param>
        /// <param name="outputFile">Path of the one-page PDF to create.</param>
        private static void ExtractPage(string inputFile, int pageIndex, string outputFile)
        {
            DocumentSplitter splitter = new DocumentSplitter("demo", "demo");
            int pageCount = splitter.GetPageCount(inputFile);

            if (pageIndex == 0)
            {
                if (pageCount == 1)
                {
                    // no splitting required if there is the only page
                    File.Copy(inputFile, outputFile);
                }
                else
                {
                    // split at the second page (page numeration starts from 1 in this function).
                    // the first part will be our sought-for 1-page document.
                    splitter.Split(inputFile, outputFile, WasteFile, 2);
                    File.Delete(WasteFile); // delete the waste part
                }
            }
            else if (pageIndex == pageCount - 1)
            {
                // if this is the last page, just split on it.
                // the second part will be our sought-for 1-page document.
                splitter.Split(inputFile, WasteFile, outputFile, pageIndex + 1);
                File.Delete(WasteFile); // delete the waste part
            }
            else
            {
                // if the required page is in the middle of the document, we need two split operations:
                // first cut off everything before the page, then cut off everything after it.
                splitter.Split(inputFile, WasteFile, PartFile, pageIndex + 1);
                splitter.Split(PartFile, outputFile, WasteFile, 2);

                // remove both scratch files so nothing is left behind
                File.Delete(PartFile);
                File.Delete(WasteFile);
            }
        }
    }
}

VB.NET Code Snippet

Imports Bytescout.PDFExtractor
Imports System.IO

''' <summary>
''' Searches every page of a PDF for a keyword and saves each matching page
''' as its own one-page PDF file.
''' </summary>
Class Program
    ' Scratch files produced by the splitter; created in the current working directory.
    Private Const WasteFile As String = "waste"
    Private Const PartFile As String = "part"

    ''' <summary>
    ''' Entry point: scans "sample.pdf" for the keyword and writes "page{index}.pdf"
    ''' for every page (zero-based index) on which the keyword is found.
    ''' </summary>
    ''' <param name="args">Command-line arguments (not used).</param>
    Friend Shared Sub Main(args As String())
        Dim inputFile As String = "sample.pdf"
        Dim keyword As String = "demographic"

        Dim extractor As New TextExtractor("demo", "demo")

        ' The extractor must be given the document before pages can be counted or searched.
        extractor.LoadDocumentFromFile(inputFile)

        ' Search each page for keyword
        For i As Integer = 0 To extractor.GetPageCount() - 1
            If extractor.Find(i, keyword, False) Then
                ' extract the page containing the keyword
                ExtractPage(inputFile, i, "page" & i & ".pdf")
            End If
        Next
    End Sub

    ''' <summary>
    ''' Writes the single page at <paramref name="pageIndex"/> of
    ''' <paramref name="inputFile"/> to <paramref name="outputFile"/>.
    ''' </summary>
    ''' <param name="inputFile">Path of the source PDF document.</param>
    ''' <param name="pageIndex">Zero-based index of the page to extract.</param>
    ''' <param name="outputFile">Path of the one-page PDF to create.</param>
    Private Shared Sub ExtractPage(inputFile As String, pageIndex As Integer, outputFile As String)
        Dim splitter As New DocumentSplitter("demo", "demo")
        Dim pageCount As Integer = splitter.GetPageCount(inputFile)

        If pageIndex = 0 Then
            If pageCount = 1 Then
                ' no splitting required if there is the only page
                File.Copy(inputFile, outputFile)
            Else
                ' split at the second page (page numeration starts from 1 in this function).
                ' the first part will be our sought-for 1-page document.
                splitter.Split(inputFile, outputFile, WasteFile, 2)
                ' delete the waste part
                File.Delete(WasteFile)
            End If
        ElseIf pageIndex = pageCount - 1 Then
            ' if this is the last page, just split on it.
            ' the second part will be our sought-for 1-page document.
            splitter.Split(inputFile, WasteFile, outputFile, pageIndex + 1)
            ' delete the waste part
            File.Delete(WasteFile)
        Else
            ' if the required page is in the middle of the document, we need two split operations:
            ' first cut off everything before the page, then cut off everything after it.
            splitter.Split(inputFile, WasteFile, PartFile, pageIndex + 1)
            splitter.Split(PartFile, outputFile, WasteFile, 2)

            ' remove both scratch files so nothing is left behind
            File.Delete(PartFile)
            File.Delete(WasteFile)
        End If
    End Sub
End Class

