This tutorial shows how to split a PDF file by keyword in C# or Visual Basic .NET using the PDF Extractor SDK. Use the sample source code below to find the pages of a PDF document that contain a keyword and save each of those pages as a separate PDF file.
using Bytescout.PDFExtractor;
using System.IO;

namespace FindAndExtractPageExample
{
    /// <summary>
    /// Sample program: searches every page of a PDF for a keyword and saves each
    /// page that contains it as a separate one-page PDF file.
    /// </summary>
    class Program
    {
        // Names of the intermediate files produced by the split operations.
        // They are created in the current directory and deleted after use.
        private const string WasteFile = "waste";
        private const string PartFile = "part";

        /// <summary>
        /// Loads "sample.pdf", searches each page for the keyword and writes every
        /// matching page to "page{index}.pdf" (index is zero-based).
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            string inputFile = "sample.pdf";
            string keyword = "demographic";

            // Dispose the extractor deterministically so the loaded document is released.
            using (TextExtractor extractor = new TextExtractor("demo", "demo"))
            {
                extractor.LoadDocumentFromFile(inputFile);

                // The document does not change while searching, so read the page count once.
                int pageCount = extractor.GetPageCount();

                // Search each page for keyword
                for (int i = 0; i < pageCount; i++)
                {
                    // NOTE(review): the third argument is presumably the case-sensitivity
                    // flag (false = ignore case) - confirm against the SDK documentation.
                    if (extractor.Find(i, keyword, false))
                    {
                        // extract the page containing the keyword
                        ExtractPage(inputFile, i, "page" + i + ".pdf");
                    }
                }
            }
        }

        /// <summary>
        /// Saves a single page of <paramref name="inputFile"/> as a one-page PDF.
        /// </summary>
        /// <param name="inputFile">Path of the source PDF document.</param>
        /// <param name="pageIndex">Zero-based index of the page to extract.</param>
        /// <param name="outputFile">Path of the one-page PDF to create.</param>
        private static void ExtractPage(string inputFile, int pageIndex, string outputFile)
        {
            using (DocumentSplitter splitter = new DocumentSplitter("demo", "demo"))
            {
                // Read the page count once instead of re-querying it in each branch.
                int pageCount = splitter.GetPageCount(inputFile);

                if (pageIndex == 0)
                {
                    if (pageCount == 1)
                    {
                        // No splitting required if this is the only page.
                        // Overwrite an existing output so that re-running the sample
                        // does not fail with an IOException.
                        File.Copy(inputFile, outputFile, true);
                    }
                    else
                    {
                        // Split at the second page (page numeration starts from 1 in this function).
                        // The first part will be our sought-for 1-page document.
                        splitter.Split(inputFile, outputFile, WasteFile, 2);
                        File.Delete(WasteFile); // delete the waste part
                    }
                }
                else if (pageIndex == pageCount - 1)
                {
                    // If this is the last page, just split on it.
                    // The second part will be our sought-for 1-page document.
                    splitter.Split(inputFile, WasteFile, outputFile, pageIndex + 1);
                    File.Delete(WasteFile); // delete the waste part
                }
                else
                {
                    // If the required page is in the middle of the document, we need two split operations:
                    // 1) cut off everything before the page; the page becomes the first page of the second part.
                    splitter.Split(inputFile, WasteFile, PartFile, pageIndex + 1);
                    File.Delete(WasteFile);

                    // 2) cut off everything after that first page.
                    splitter.Split(PartFile, outputFile, WasteFile, 2);
                    File.Delete(PartFile);
                    File.Delete(WasteFile);
                }
            }
        }
    }
}
Imports Bytescout.PDFExtractor
Imports System.IO

''' <summary>
''' Sample program: searches every page of a PDF for a keyword and saves each
''' page that contains it as a separate one-page PDF file.
''' </summary>
Class Program

    ' Names of the intermediate files produced by the split operations.
    ' They are created in the current directory and deleted after use.
    Private Const WasteFile As String = "waste"
    Private Const PartFile As String = "part"

    ''' <summary>
    ''' Loads "sample.pdf", searches each page for the keyword and writes every
    ''' matching page to "page{index}.pdf" (index is zero-based).
    ''' </summary>
    ''' <param name="args">Command-line arguments (not used).</param>
    Friend Shared Sub Main(args As String())
        Dim inputFile As String = "sample.pdf"
        Dim keyword As String = "demographic"

        ' Dispose the extractor deterministically so the loaded document is released.
        Using extractor As New TextExtractor("demo", "demo")
            extractor.LoadDocumentFromFile(inputFile)

            ' Search each page for keyword
            For i As Integer = 0 To extractor.GetPageCount() - 1
                ' NOTE(review): the third argument is presumably the case-sensitivity
                ' flag (False = ignore case) - confirm against the SDK documentation.
                If extractor.Find(i, keyword, False) Then
                    ' extract the page containing the keyword
                    ExtractPage(inputFile, i, "page" & i & ".pdf")
                End If
            Next
        End Using
    End Sub

    ''' <summary>
    ''' Saves a single page of <paramref name="inputFile"/> as a one-page PDF.
    ''' </summary>
    ''' <param name="inputFile">Path of the source PDF document.</param>
    ''' <param name="pageIndex">Zero-based index of the page to extract.</param>
    ''' <param name="outputFile">Path of the one-page PDF to create.</param>
    Private Shared Sub ExtractPage(inputFile As String, pageIndex As Integer, outputFile As String)
        Using splitter As New DocumentSplitter("demo", "demo")
            ' Read the page count once instead of re-querying it in each branch.
            Dim pageCount As Integer = splitter.GetPageCount(inputFile)

            If pageIndex = 0 Then
                If pageCount = 1 Then
                    ' No splitting required if this is the only page.
                    ' Overwrite an existing output so that re-running the sample
                    ' does not fail with an IOException.
                    File.Copy(inputFile, outputFile, True)
                Else
                    ' Split at the second page (page numeration starts from 1 in this function).
                    ' The first part will be our sought-for 1-page document.
                    splitter.Split(inputFile, outputFile, WasteFile, 2)
                    ' delete the waste part
                    File.Delete(WasteFile)
                End If
            ElseIf pageIndex = pageCount - 1 Then
                ' If this is the last page, just split on it.
                ' The second part will be our sought-for 1-page document.
                splitter.Split(inputFile, WasteFile, outputFile, pageIndex + 1)
                ' delete the waste part
                File.Delete(WasteFile)
            Else
                ' If the required page is in the middle of the document, we need two split operations:
                ' 1) cut off everything before the page; the page becomes the first page of the second part.
                splitter.Split(inputFile, WasteFile, PartFile, pageIndex + 1)
                File.Delete(WasteFile)

                ' 2) cut off everything after that first page.
                splitter.Split(PartFile, outputFile, WasteFile, 2)
                File.Delete(PartFile)
                File.Delete(WasteFile)
            End If
        End Using
    End Sub

End Class